Coverage for install/scipp/io/csv.py: 33%

15 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-04-28 01:28 +0000

1# SPDX-License-Identifier: BSD-3-Clause 

2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp) 

3 

4"""Load CSV files. 

5 

6Note 

7---- 

8CSV support requires `pandas <https://pandas.pydata.org/>`_ to be installed. 

9You need to do this manually as it is not declared as a dependency of Scipp. 

10You can use either ``pip install pandas`` or ``conda install -c conda-forge pandas``. 

11 

12 

13CSV ('comma separated values') files store a simple table of data as a string. 

14There are many different forms of this format. 

15So check whether the loaded data is what you expect for your files. 

16See :func:`scipp.io.csv.load_csv` for examples. 

17 

18See Also 

19-------- 

20pandas.read_csv: 

21 More details on the underlying parser. 

22""" 

23 

24from io import BytesIO, StringIO 

25from os import PathLike 

26from typing import Iterable, Optional, Union 

27 

28from ..compat.pandas_compat import HeaderParserArg, from_pandas 

29from ..core import Dataset 

30 

31 

32# The typehint of filepath_or_buffer is less generic than in pd.read_csv 

33# because the definitions of protocols are private in pandas. 

34def _load_dataframe( 

35 filepath_or_buffer: Union[str, PathLike, StringIO, BytesIO], 

36 sep: str, 

37): 

38 try: 

39 import pandas as pd 

40 except ImportError: 

41 raise ImportError( 

42 "Pandas is required to load CSV files but not install. " 

43 "Install it with `pip install pandas` or " 

44 "`conda install -c conda-forge pandas`." 

45 ) from None 

46 return pd.read_csv(filepath_or_buffer, sep=sep) 

47 

48 

def load_csv(
    filename: Union[str, PathLike, StringIO, BytesIO],
    *,
    sep: Optional[str] = ',',
    data_columns: Optional[Union[str, Iterable[str]]] = None,
    header_parser: HeaderParserArg = None,
) -> Dataset:
    """Read a CSV file and return its contents as a dataset.

    The file is currently parsed with Pandas and the resulting data frame is
    then converted into a :class:`scipp.Dataset`.
    Because Pandas is not a hard dependency of Scipp, it is not installed
    automatically and has to be installed manually.

    ``load_csv`` is a convenience function for simple CSV files.
    When a file cannot be loaded with it, consider working with Pandas directly:
    read the file into a data frame with :func:`pandas.read_csv` and turn that
    data frame into a dataset with
    :func:`scipp.compat.pandas_compat.from_pandas`.

    Parameters
    ----------
    filename:
        Path or URL of the file to load, or a buffer to load from.
    sep:
        Column separator.
        It is deduced automatically if ``sep is None``.
        See :func:`pandas.read_csv` for details.
    data_columns:
        Selects the columns that are assigned as data.
        All remaining columns are returned as coordinates.
        With ``None``, every column is assigned as data.
        With an empty list, every column is assigned as a coordinate.
    header_parser:
        Parser for the column headers.
        See :func:`scipp.compat.pandas_compat.from_pandas` for details.

    Returns
    -------
    :
        A dataset holding the loaded data.

    Examples
    --------
    Given the following CSV 'file':

      >>> from io import StringIO
      >>> csv_content = '''a [m],b [s],c
      ... 1,5,9
      ... 2,6,10
      ... 3,7,11
      ... 4,8,12'''

    By default, it will be loaded as

      >>> sc.io.load_csv(StringIO(csv_content))
      <scipp.Dataset>
      Dimensions: Sizes[row:4, ]
      Data:
        a [m]                       int64  [dimensionless]  (row)  [1, 2, 3, 4]
        b [s]                       int64  [dimensionless]  (row)  [5, 6, 7, 8]
        c                           int64  [dimensionless]  (row)  [9, 10, 11, 12]

    In this example, the column headers encode units.
    They can be parsed into actual units:

      >>> sc.io.load_csv(StringIO(csv_content), header_parser='bracket')
      <scipp.Dataset>
      Dimensions: Sizes[row:4, ]
      Data:
        a                           int64              [m]  (row)  [1, 2, 3, 4]
        b                           int64              [s]  (row)  [5, 6, 7, 8]
        c                           int64        <no unit>  (row)  [9, 10, 11, 12]

    It is possible to select which columns are stored as data:

      >>> sc.io.load_csv(
      ...     StringIO(csv_content),
      ...     header_parser='bracket',
      ...     data_columns='a',
      ... )
      <scipp.Dataset>
      Dimensions: Sizes[row:4, ]
      Coordinates:
      * b                           int64              [s]  (row)  [5, 6, 7, 8]
      * c                           int64        <no unit>  (row)  [9, 10, 11, 12]
      Data:
        a                           int64              [m]  (row)  [1, 2, 3, 4]
    """
    # Parse with pandas first, then hand the data frame straight to the
    # converter; the trivial (default) index is never included.
    return from_pandas(
        _load_dataframe(filename, sep=sep),
        header_parser=header_parser,
        data_columns=data_columns,
        include_trivial_index=False,
    )