Coverage for install/scipp/io/csv.py: 33%

1# SPDX-License-Identifier: BSD-3-Clause

4"""Load CSV files.

6Note

7----

8CSV support requires `pandas <https://pandas.pydata.org/>`_ to be installed.

9You need to do this manually as it is not declared as a dependency of Scipp.

10You can use either ``pip install pandas`` or ``conda install -c conda-forge pandas``.

13CSV ('comma separated values') files store a simple table of data as a string.

14There are many different forms of this format.

15So check whether the loaded data is what you expect for your files.

16See :func:`scipp.io.csv.load_csv` for examples.

18See Also

19--------

20pandas.read_csv:

21 More details on the underlying parser.

22"""

24from collections.abc import Iterable

25from io import BytesIO, StringIO

26from os import PathLike

27from typing import Any

29from ..compat.pandas_compat import HeaderParserArg, from_pandas_dataframe

30from ..core import Dataset

33# The typehint of filepath_or_buffer is less generic than in pd.read_csv

34# because the definitions of protocols are private in pandas.

35def _load_dataframe(

36 filepath_or_buffer: str | PathLike[str] | StringIO | BytesIO,

37 sep: str | None,

38) -> Any:

39 try:

40 import pandas as pd

41 except ImportError:

42 raise ImportError(

43 "Pandas is required to load CSV files but not install. "

44 "Install it with `pip install pandas` or "

45 "`conda install -c conda-forge pandas`."

46 ) from None

47 return pd.read_csv(filepath_or_buffer, sep=sep)

def load_csv(
    filename: str | PathLike[str] | StringIO | BytesIO,
    *,
    sep: str | None = ',',
    data_columns: str | Iterable[str] | None = None,
    header_parser: HeaderParserArg = None,
) -> Dataset:
    """Load a CSV file as a dataset.

    The file is currently read with Pandas and the resulting data frame is
    converted into a :class:`scipp.Dataset`.
    Pandas is not a hard dependency of Scipp, so it is not installed
    automatically and you need to install it manually.

    ``load_csv`` is a convenience function for simple CSV files.
    If a file cannot be loaded directly, consider using Pandas directly.
    For example, use :func:`pandas.read_csv` to load the file into a data frame and
    :func:`scipp.compat.pandas_compat.from_pandas` to convert
    the data frame into a dataset.

    Parameters
    ----------
    filename:
        Path or URL of file to load or buffer to load from.
    sep:
        Column separator.
        Automatically deduced if ``sep is None``.
        See :func:`pandas.read_csv` for details.
    data_columns:
        Select which columns to assign as data.
        The rest are returned as coordinates.
        If ``None``, all columns are assigned as data.
        Use an empty list to assign all columns as coordinates.
    header_parser:
        Parser for column headers.
        See :func:`scipp.compat.pandas_compat.from_pandas` for details.

    Returns
    -------
    :
        The loaded data as a dataset.

    Examples
    --------
    Given the following CSV 'file':

    >>> from io import StringIO
    >>> csv_content = '''a [m],b [s],c
    ... 1,5,9
    ... 2,6,10
    ... 3,7,11
    ... 4,8,12'''

    By default, it will be loaded as

    >>> sc.io.load_csv(StringIO(csv_content))
    <scipp.Dataset>
    Dimensions: Sizes[row:4, ]
    Data:
      a [m]                      int64  [dimensionless]  (row)  [1, 2, 3, 4]
      b [s]                      int64  [dimensionless]  (row)  [5, 6, 7, 8]
      c                          int64  [dimensionless]  (row)  [9, 10, 11, 12]

    In this example, the column headers encode units.
    They can be parsed into actual units:

    >>> sc.io.load_csv(StringIO(csv_content), header_parser='bracket')
    <scipp.Dataset>
    Dimensions: Sizes[row:4, ]
    Data:
      a                          int64              [m]  (row)  [1, 2, 3, 4]
      b                          int64              [s]  (row)  [5, 6, 7, 8]
      c                          int64        <no unit>  (row)  [9, 10, 11, 12]

    It is possible to select which columns are stored as data:

    >>> sc.io.load_csv(
    ...     StringIO(csv_content),
    ...     header_parser='bracket',
    ...     data_columns='a',
    ... )
    <scipp.Dataset>
    Dimensions: Sizes[row:4, ]
    Coordinates:
    * b                          int64              [s]  (row)  [5, 6, 7, 8]
    * c                          int64        <no unit>  (row)  [9, 10, 11, 12]
    Data:
      a                          int64              [m]  (row)  [1, 2, 3, 4]
    """
    # Read the raw table first, then hand it to the pandas compatibility layer.
    # The index pandas generates is not included in the resulting dataset.
    return from_pandas_dataframe(
        _load_dataframe(filename, sep=sep),
        data_columns=data_columns,
        include_trivial_index=False,
        header_parser=header_parser,
    )