Coverage for install/scipp/io/csv.py: 33%
15 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-11-17 01:51 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-11-17 01:51 +0000
1# SPDX-License-Identifier: BSD-3-Clause
2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
4"""Load CSV files.
6Note
7----
8CSV support requires `pandas <https://pandas.pydata.org/>`_ to be installed.
9You need to do this manually as it is not declared as a dependency of Scipp.
10You can use either ``pip install pandas`` or ``conda install -c conda-forge pandas``.
13CSV ('comma separated values') files store a simple table of data as a string.
14There are many different forms of this format.
15So check whether the loaded data is what you expect for your files.
16See :func:`scipp.io.csv.load_csv` for examples.
18See Also
19--------
20pandas.read_csv:
21 More details on the underlying parser.
22"""
24from collections.abc import Iterable
25from io import BytesIO, StringIO
26from os import PathLike
27from typing import Any
29from ..compat.pandas_compat import HeaderParserArg, from_pandas_dataframe
30from ..core import Dataset
33# The typehint of filepath_or_buffer is less generic than in pd.read_csv
34# because the definitions of protocols are private in pandas.
35def _load_dataframe(
36 filepath_or_buffer: str | PathLike[str] | StringIO | BytesIO,
37 sep: str | None,
38) -> Any:
39 try:
40 import pandas as pd
41 except ImportError:
42 raise ImportError(
43 "Pandas is required to load CSV files but not install. "
44 "Install it with `pip install pandas` or "
45 "`conda install -c conda-forge pandas`."
46 ) from None
47 return pd.read_csv(filepath_or_buffer, sep=sep)
def load_csv(
    filename: str | PathLike[str] | StringIO | BytesIO,
    *,
    sep: str | None = ',',
    data_columns: str | Iterable[str] | None = None,
    header_parser: HeaderParserArg = None,
) -> Dataset:
    """Read a CSV file and return its contents as a dataset.

    The file is parsed with Pandas and the resulting data frame is then
    converted into a :class:`scipp.Dataset`.
    Because Pandas is not a hard dependency of Scipp, it is not installed
    automatically; you have to install it yourself.

    ``load_csv`` is a convenience function for simple CSV files.
    When a file cannot be loaded with it, consider using Pandas directly:
    read the file into a data frame with :func:`pandas.read_csv` and convert
    that data frame into a dataset with
    :func:`scipp.compat.pandas_compat.from_pandas`.

    Parameters
    ----------
    filename:
        Path or URL of the file to load, or a buffer to load from.
    sep:
        Column separator.
        It is deduced automatically if ``sep is None``.
        See :func:`pandas.read_csv` for details.
    data_columns:
        Names of the columns to assign as data.
        All remaining columns are returned as coordinates.
        If ``None``, every column is assigned as data.
        Pass an empty list to turn every column into a coordinate.
    header_parser:
        Parser for the column headers.
        See :func:`scipp.compat.pandas_compat.from_pandas` for details.

    Returns
    -------
    :
        The loaded data as a dataset.

    Examples
    --------
    Given the following CSV 'file':

      >>> from io import StringIO
      >>> csv_content = '''a [m],b [s],c
      ... 1,5,9
      ... 2,6,10
      ... 3,7,11
      ... 4,8,12'''

    By default, it will be loaded as

      >>> sc.io.load_csv(StringIO(csv_content))
      <scipp.Dataset>
      Dimensions: Sizes[row:4, ]
      Data:
        a [m] int64 [dimensionless] (row) [1, 2, 3, 4]
        b [s] int64 [dimensionless] (row) [5, 6, 7, 8]
        c int64 [dimensionless] (row) [9, 10, 11, 12]

    In this example, the column headers encode units.
    They can be parsed into actual units:

      >>> sc.io.load_csv(StringIO(csv_content), header_parser='bracket')
      <scipp.Dataset>
      Dimensions: Sizes[row:4, ]
      Data:
        a int64 [m] (row) [1, 2, 3, 4]
        b int64 [s] (row) [5, 6, 7, 8]
        c int64 <no unit> (row) [9, 10, 11, 12]

    It is possible to select which columns are stored as data:

      >>> sc.io.load_csv(
      ...     StringIO(csv_content),
      ...     header_parser='bracket',
      ...     data_columns='a',
      ... )
      <scipp.Dataset>
      Dimensions: Sizes[row:4, ]
      Coordinates:
      * b int64 [s] (row) [5, 6, 7, 8]
      * c int64 <no unit> (row) [9, 10, 11, 12]
      Data:
        a int64 [m] (row) [1, 2, 3, 4]
    """
    # Parsing is delegated to pandas; the conversion to a dataset happens in
    # the pandas compatibility layer.
    table = _load_dataframe(filename, sep=sep)
    dataset = from_pandas_dataframe(
        table,
        data_columns=data_columns,
        include_trivial_index=False,
        header_parser=header_parser,
    )
    return dataset