Coverage for install/scipp/io/csv.py: 33%

1# SPDX-License-Identifier: BSD-3-Clause

4"""Load CSV files.

6Note

7----

8CSV support requires `pandas <https://pandas.pydata.org/>`_ to be installed.

9You need to do this manually as it is not declared as a dependency of Scipp.

10You can use either ``pip install pandas`` or ``conda install -c conda-forge pandas``.

13CSV ('comma separated values') files store a simple table of data as a string.

14There are many different forms of this format.

15So check whether the loaded data is what you expect for your files.

16See :func:`scipp.io.csv.load_csv` for examples.

18See Also

19--------

20pandas.read_csv:

21 More details on the underlying parser.

22"""

24from io import BytesIO, StringIO

25from os import PathLike

26from typing import Iterable, Optional, Union

28from ..compat.pandas_compat import HeaderParserArg, from_pandas

29from ..core import Dataset

32# The typehint of filepath_or_buffer is less generic than in pd.read_csv

33# because the definitions of protocols are private in pandas.

def _load_dataframe(
    filepath_or_buffer: Union[str, PathLike, StringIO, BytesIO],
    sep: Optional[str],
):
    """Read a CSV file or buffer into a pandas data frame.

    Pandas is imported inside this function so that it is only needed
    when a CSV file is actually loaded.

    Parameters
    ----------
    filepath_or_buffer:
        Path of the file to read, or an in-memory text or binary buffer.
    sep:
        Column separator, forwarded unchanged to :func:`pandas.read_csv`.
        ``None`` is allowed because :func:`load_csv` accepts it and passes
        it through to request automatic detection of the separator.

    Returns
    -------
    :
        The data frame returned by :func:`pandas.read_csv`.

    Raises
    ------
    ImportError
        If pandas cannot be imported.
    """
    try:
        # Deferred import: pandas is an optional dependency.
        import pandas as pd
    except ImportError:
        # ``from None`` drops the chained import traceback; the message below
        # already tells the user everything needed to resolve the problem.
        raise ImportError(
            "Pandas is required to load CSV files but not installed. "
            "Install it with `pip install pandas` or "
            "`conda install -c conda-forge pandas`."
        ) from None
    return pd.read_csv(filepath_or_buffer, sep=sep)

def load_csv(
    filename: Union[str, PathLike, StringIO, BytesIO],
    *,
    sep: Optional[str] = ',',
    data_columns: Optional[Union[str, Iterable[str]]] = None,
    header_parser: HeaderParserArg = None,
) -> Dataset:
    """Read a CSV file and return its contents as a dataset.

    The file is parsed by Pandas and the resulting data frame is converted
    into a :class:`scipp.Dataset`.
    Since Pandas is not a hard dependency of Scipp, it is not installed
    automatically and has to be installed manually.

    ``load_csv`` is a convenience for simple CSV files.
    For files that it cannot handle, use Pandas directly:
    read the file with :func:`pandas.read_csv` and convert the data frame
    with :func:`scipp.compat.pandas_compat.from_pandas`.

    Parameters
    ----------
    filename:
        Path or URL of the file to load, or a buffer to read from.
    sep:
        Column separator.
        If ``sep is None``, the separator is deduced automatically.
        See :func:`pandas.read_csv` for details.
    data_columns:
        Names of the columns to assign as data.
        All remaining columns are returned as coordinates.
        With ``None``, every column is assigned as data.
        With an empty list, every column is assigned as a coordinate.
    header_parser:
        Parser for the column headers.
        See :func:`scipp.compat.pandas_compat.from_pandas` for details.

    Returns
    -------
    :
        The loaded data as a dataset.

    Examples
    --------
    Given the following CSV 'file':

      >>> from io import StringIO
      >>> csv_content = '''a [m],b [s],c
      ... 1,5,9
      ... 2,6,10
      ... 3,7,11
      ... 4,8,12'''

    By default, it will be loaded as

      >>> sc.io.load_csv(StringIO(csv_content))
      <scipp.Dataset>
      Dimensions: Sizes[row:4, ]
      Data:
        a [m]                       int64  [dimensionless]  (row)  [1, 2, 3, 4]
        b [s]                       int64  [dimensionless]  (row)  [5, 6, 7, 8]
        c                           int64  [dimensionless]  (row)  [9, 10, 11, 12]

    In this example, the column headers encode units.
    They can be parsed into actual units:

      >>> sc.io.load_csv(StringIO(csv_content), header_parser='bracket')
      <scipp.Dataset>
      Dimensions: Sizes[row:4, ]
      Data:
        a                           int64              [m]  (row)  [1, 2, 3, 4]
        b                           int64              [s]  (row)  [5, 6, 7, 8]
        c                           int64        <no unit>  (row)  [9, 10, 11, 12]

    It is possible to select which columns are stored as data:

      >>> sc.io.load_csv(
      ...     StringIO(csv_content),
      ...     header_parser='bracket',
      ...     data_columns='a',
      ... )
      <scipp.Dataset>
      Dimensions: Sizes[row:4, ]
      Coordinates:
      * b                           int64              [s]  (row)  [5, 6, 7, 8]
      * c                           int64        <no unit>  (row)  [9, 10, 11, 12]
      Data:
        a                           int64              [m]  (row)  [1, 2, 3, 4]
    """
    # Parse first, then convert; the conversion options are collected
    # separately so the hand-off to ``from_pandas`` reads as a single call.
    frame = _load_dataframe(filename, sep=sep)
    conversion_options = {
        'data_columns': data_columns,
        'include_trivial_index': False,
        'header_parser': header_parser,
    }
    return from_pandas(frame, **conversion_options)