Coverage for install/scipp/io/csv.py: 33%

15 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-12-01 01:59 +0000

1# SPDX-License-Identifier: BSD-3-Clause 

2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp) 

3 

4"""Load CSV files. 

5 

6Note 

7---- 

8CSV support requires `pandas <https://pandas.pydata.org/>`_ to be installed. 

9You need to do this manually as it is not declared as a dependency of Scipp. 

10You can use either ``pip install pandas`` or ``conda install -c conda-forge pandas``. 

11 

12 

13CSV ('comma separated values') files store a simple table of data as a string. 

14There are many different forms of this format. 

15So check whether the loaded data is what you expect for your files. 

16See :func:`scipp.io.csv.load_csv` for examples. 

17 

18See Also 

19-------- 

20pandas.read_csv: 

21 More details on the underlying parser. 

22""" 

23 

24from collections.abc import Iterable 

25from io import BytesIO, StringIO 

26from os import PathLike 

27from typing import Any 

28 

29from ..compat.pandas_compat import HeaderParserArg, from_pandas_dataframe 

30from ..core import Dataset 

31 

32 

33# The typehint of filepath_or_buffer is less generic than in pd.read_csv 

34# because the definitions of protocols are private in pandas. 

35def _load_dataframe( 

36 filepath_or_buffer: str | PathLike[str] | StringIO | BytesIO, 

37 sep: str | None, 

38) -> Any: 

39 try: 

40 import pandas as pd 

41 except ImportError: 

42 raise ImportError( 

43 "Pandas is required to load CSV files but not install. " 

44 "Install it with `pip install pandas` or " 

45 "`conda install -c conda-forge pandas`." 

46 ) from None 

47 return pd.read_csv(filepath_or_buffer, sep=sep) 

48 

49 

50def load_csv( 

51 filename: str | PathLike[str] | StringIO | BytesIO, 

52 *, 

53 sep: str | None = ',', 

54 data_columns: str | Iterable[str] | None = None, 

55 header_parser: HeaderParserArg = None, 

56) -> Dataset: 

57 """Load a CSV file as a dataset. 

58 

59 This function currently uses Pandas to load the file and converts the result 

60 into a :class:`scipp.Dataset`. 

61 Pandas is not a hard dependency of Scipp and will thus not be installed 

62 automatically, so you need to install it manually. 

63 

64 ``load_csv`` exists to conveniently load simple CSV files. 

65 If a file cannot be loaded directly, consider using Pandas directly. 

66 For example, use :func:`pandas.read_csv` to load the file into a data frame and 

67 :func:`scipp.compat.pandas_compat.from_pandas` to convert 

68 the data frame into a dataset. 

69 

70 Parameters 

71 ---------- 

72 filename: 

73 Path or URL of file to load or buffer to load from. 

74 sep: 

75 Column separator. 

76 Automatically deduced if ``sep is None``. 

77 See :func:`pandas.read_csv` for details. 

78 data_columns: 

79 Select which columns to assign as data. 

80 The rest are returned as coordinates. 

81 If ``None``, all columns are assigned as data. 

82 Use an empty list to assign all columns as coordinates. 

83 header_parser: 

84 Parser for column headers. 

85 See :func:`scipp.compat.pandas_compat.from_pandas` for details. 

86 

87 Returns 

88 ------- 

89 : 

90 The loaded data as a dataset. 

91 

92 Examples 

93 -------- 

94 Given the following CSV 'file': 

95 

96 >>> from io import StringIO 

97 >>> csv_content = '''a [m],b [s],c 

98 ... 1,5,9 

99 ... 2,6,10 

100 ... 3,7,11 

101 ... 4,8,12''' 

102 

103 By default, it will be loaded as 

104 

105 >>> sc.io.load_csv(StringIO(csv_content)) 

106 <scipp.Dataset> 

107 Dimensions: Sizes[row:4, ] 

108 Data: 

109 a [m] int64 [dimensionless] (row) [1, 2, 3, 4] 

110 b [s] int64 [dimensionless] (row) [5, 6, 7, 8] 

111 c int64 [dimensionless] (row) [9, 10, 11, 12] 

112 

113 In this example, the column headers encode units. 

114 They can be parsed into actual units: 

115 

116 >>> sc.io.load_csv(StringIO(csv_content), header_parser='bracket') 

117 <scipp.Dataset> 

118 Dimensions: Sizes[row:4, ] 

119 Data: 

120 a int64 [m] (row) [1, 2, 3, 4] 

121 b int64 [s] (row) [5, 6, 7, 8] 

122 c int64 <no unit> (row) [9, 10, 11, 12] 

123 

124 It is possible to select which columns are stored as data: 

125 

126 >>> sc.io.load_csv( 

127 ... StringIO(csv_content), 

128 ... header_parser='bracket', 

129 ... data_columns='a', 

130 ... ) 

131 <scipp.Dataset> 

132 Dimensions: Sizes[row:4, ] 

133 Coordinates: 

134 * b int64 [s] (row) [5, 6, 7, 8] 

135 * c int64 <no unit> (row) [9, 10, 11, 12] 

136 Data: 

137 a int64 [m] (row) [1, 2, 3, 4] 

138 """ 

139 df = _load_dataframe(filename, sep=sep) 

140 return from_pandas_dataframe( 

141 df, 

142 data_columns=data_columns, 

143 include_trivial_index=False, 

144 header_parser=header_parser, 

145 )