Coverage for install/scipp/io/csv.py: 33%
15 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-04-28 01:28 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-04-28 01:28 +0000
1# SPDX-License-Identifier: BSD-3-Clause
2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
4"""Load CSV files.
6Note
7----
8CSV support requires `pandas <https://pandas.pydata.org/>`_ to be installed.
9You need to do this manually as it is not declared as a dependency of Scipp.
10You can use either ``pip install pandas`` or ``conda install -c conda-forge pandas``.
13CSV ('comma separated values') files store a simple table of data as a string.
14There are many different forms of this format.
15So check whether the loaded data is what you expect for your files.
16See :func:`scipp.io.csv.load_csv` for examples.
18See Also
19--------
20pandas.read_csv:
21 More details on the underlying parser.
22"""
24from io import BytesIO, StringIO
25from os import PathLike
26from typing import Iterable, Optional, Union
28from ..compat.pandas_compat import HeaderParserArg, from_pandas
29from ..core import Dataset
32# The typehint of filepath_or_buffer is less generic than in pd.read_csv
33# because the definitions of protocols are private in pandas.
def _load_dataframe(
    filepath_or_buffer: Union[str, PathLike, StringIO, BytesIO],
    sep: Optional[str],
):
    """Read a CSV file or buffer into a pandas data frame.

    Pandas is imported inside this function, so a missing installation
    only raises once CSV loading is actually attempted.

    Parameters
    ----------
    filepath_or_buffer:
        Path of the file, or buffer, to read.
        Passed on unchanged to :func:`pandas.read_csv`.
    sep:
        Column separator, passed on unchanged to :func:`pandas.read_csv`.
        ``None`` is accepted because ``load_csv`` forwards its optional
        ``sep`` argument as is.

    Returns
    -------
    :
        The result of :func:`pandas.read_csv`.

    Raises
    ------
    ImportError
        If pandas cannot be imported.
    """
    try:
        import pandas as pd
    except ImportError:
        # ``from None`` drops the original exception context so that the
        # user only sees the installation hint below.
        raise ImportError(
            "Pandas is required to load CSV files but not installed. "
            "Install it with `pip install pandas` or "
            "`conda install -c conda-forge pandas`."
        ) from None
    return pd.read_csv(filepath_or_buffer, sep=sep)
def load_csv(
    filename: Union[str, PathLike, StringIO, BytesIO],
    *,
    sep: Optional[str] = ',',
    data_columns: Optional[Union[str, Iterable[str]]] = None,
    header_parser: HeaderParserArg = None,
) -> Dataset:
    """Read a CSV file and return its contents as a dataset.

    The file is parsed by Pandas and the resulting data frame is then
    converted into a :class:`scipp.Dataset`.
    Because Pandas is only an optional dependency of Scipp, it is not
    installed automatically and has to be installed by hand.

    ``load_csv`` is a convenience for simple CSV files.
    For files it cannot handle, use Pandas directly:
    call :func:`pandas.read_csv` to obtain a data frame and pass that to
    :func:`scipp.compat.pandas_compat.from_pandas` to get a dataset.

    Parameters
    ----------
    filename:
        Path or URL of the file to load, or a buffer to load from.
    sep:
        Column separator.
        With ``sep=None``, the separator is deduced automatically.
        See :func:`pandas.read_csv` for details.
    data_columns:
        Columns to assign as data; all other columns become coordinates.
        ``None`` assigns every column as data.
        An empty list assigns every column as a coordinate.
    header_parser:
        Parser for column headers.
        See :func:`scipp.compat.pandas_compat.from_pandas` for details.

    Returns
    -------
    :
        The loaded data as a dataset.

    Examples
    --------
    Given the following CSV 'file':

      >>> import scipp as sc
      >>> from io import StringIO
      >>> csv_content = '''a [m],b [s],c
      ... 1,5,9
      ... 2,6,10
      ... 3,7,11
      ... 4,8,12'''

    By default, it will be loaded as

      >>> sc.io.load_csv(StringIO(csv_content))
      <scipp.Dataset>
      Dimensions: Sizes[row:4, ]
      Data:
        a [m] int64 [dimensionless] (row) [1, 2, 3, 4]
        b [s] int64 [dimensionless] (row) [5, 6, 7, 8]
        c int64 [dimensionless] (row) [9, 10, 11, 12]

    In this example, the column headers encode units.
    They can be parsed into actual units:

      >>> sc.io.load_csv(StringIO(csv_content), header_parser='bracket')
      <scipp.Dataset>
      Dimensions: Sizes[row:4, ]
      Data:
        a int64 [m] (row) [1, 2, 3, 4]
        b int64 [s] (row) [5, 6, 7, 8]
        c int64 <no unit> (row) [9, 10, 11, 12]

    It is possible to select which columns are stored as data:

      >>> sc.io.load_csv(
      ...     StringIO(csv_content),
      ...     header_parser='bracket',
      ...     data_columns='a',
      ... )
      <scipp.Dataset>
      Dimensions: Sizes[row:4, ]
      Coordinates:
      * b int64 [s] (row) [5, 6, 7, 8]
      * c int64 <no unit> (row) [9, 10, 11, 12]
      Data:
        a int64 [m] (row) [1, 2, 3, 4]
    """
    # Parse with Pandas first, then hand the frame straight to the converter.
    # The index is never included (include_trivial_index=False).
    return from_pandas(
        _load_dataframe(filename, sep=sep),
        data_columns=data_columns,
        include_trivial_index=False,
        header_parser=header_parser,
    )