Line data Source code
1 : // SPDX-License-Identifier: BSD-3-Clause
2 : // Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
3 : /// @file
4 : /// @author Simon Heybrock
5 : #include <sstream>
6 :
7 : #include "scipp/core/except.h"
8 : #include "scipp/dataset/dataset.h"
9 : #include "scipp/dataset/dataset_util.h"
10 : #include "scipp/dataset/except.h"
11 : #include "scipp/units/unit.h"
12 :
13 : namespace scipp::dataset {
14 :
15 : namespace {
16 11105 : template <class T> void expect_writable(const T &dict) {
17 11105 : if (dict.is_readonly())
18 3 : throw except::DatasetError(
19 : "Read-only flag is set, cannot insert new or erase existing items.");
20 11102 : }
21 :
22 11064 : void expect_valid(const Dataset &ds) {
23 11064 : if (!ds.is_valid())
24 2 : throw except::DatasetError(
25 : "Dataset is not valid. This is an internal error stemming from an "
26 4 : "improperly initialized dataset.");
27 11062 : }
28 :
29 : template <class T>
30 4322 : void expect_matching_item_dims(const Dataset &dset, const std::string_view key,
31 : const T &to_insert) {
32 4322 : if (dset.sizes() != to_insert.dims()) {
33 12 : std::ostringstream msg;
34 12 : msg << "Cannot add item '" << key << "' with dims " << to_insert.dims()
35 12 : << " to dataset with dims " << to_string(dset.sizes()) << ".";
36 12 : throw except::DimensionError(msg.str());
37 12 : }
38 4310 : }
39 : } // namespace
40 :
41 : /// Make an invalid dataset.
42 : ///
43 : /// Such a dataset is intended to be filled using setDataInit and must
44 : /// never be exposed to Python!
45 41083 : Dataset::Dataset() : m_valid{false} {}
46 :
47 1612 : Dataset::Dataset(const Dataset &other)
48 1612 : : m_coords(other.m_coords), m_data(other.m_data), m_readonly(false),
49 1612 : m_valid{other.m_valid} {}
50 :
51 23 : Dataset::Dataset(const DataArray &data) {
52 23 : m_coords.setSizes(data.dims());
53 23 : setData(data.name(), data);
54 23 : }
55 :
56 28 : Dataset &Dataset::operator=(const Dataset &other) {
57 28 : return *this = Dataset(other);
58 : }
59 :
60 188 : Dataset &Dataset::operator=(Dataset &&other) {
61 188 : if (this == &other) {
62 0 : return *this;
63 : }
64 188 : check_nested_in_assign(*this, other);
65 186 : m_coords = std::move(other.m_coords);
66 186 : m_data = std::move(other.m_data);
67 186 : m_readonly = other.m_readonly;
68 186 : m_valid = other.m_valid;
69 186 : return *this;
70 : }
71 :
72 : /// Removes all data items from the Dataset.
73 : ///
74 : /// Coordinates are not modified.
75 3 : void Dataset::clear() {
76 3 : expect_writable(*this);
77 3 : m_data.clear();
78 3 : }
79 :
80 5 : void Dataset::setCoords(Coords other) {
81 5 : expect_valid(*this);
82 5 : scipp::expect::includes(other.sizes(), m_coords.sizes());
83 1 : m_coords = std::move(other);
84 1 : }
85 : /// Return a const view to all coordinates of the dataset.
86 16937 : const Coords &Dataset::coords() const noexcept { return m_coords; }
87 :
88 : /// Return a view to all coordinates of the dataset.
89 2356 : Coords &Dataset::coords() noexcept { return m_coords; }
90 :
91 : /// Return a Dataset without the given coordinate names.
92 8 : Dataset Dataset::drop_coords(const scipp::span<const Dim> coord_names) const {
93 8 : Dataset result = *this;
94 19 : for (const auto &name : coord_names)
95 11 : result.coords().erase(name);
96 8 : return result;
97 0 : }
98 :
99 : /// Alias for coords().
100 41 : const Coords &Dataset::meta() const noexcept { return coords(); }
101 : /// Alias for coords().
102 0 : Coords &Dataset::meta() noexcept { return coords(); }
103 :
104 9471 : bool Dataset::contains(const std::string &name) const noexcept {
105 9471 : return m_data.contains(name);
106 : }
107 :
108 : /// Removes a data item from the Dataset
109 : ///
110 : /// Coordinates are not modified.
111 41 : void Dataset::erase(const std::string &name) {
112 41 : expect_writable(*this);
113 40 : scipp::expect::contains(*this, name);
114 38 : m_data.erase(std::string(name));
115 38 : }
116 :
117 : /// Extract a data item from the Dataset, returning a DataArray
118 : ///
119 : /// Coordinates are not modified.
120 13 : DataArray Dataset::extract(const std::string &name) {
121 13 : auto extracted = operator[](name);
122 12 : erase(name);
123 12 : return extracted;
124 0 : }
125 :
126 : /// Return a data item with coordinates with given name.
127 3681 : DataArray Dataset::operator[](const std::string &name) const {
128 3681 : scipp::expect::contains(*this, name);
129 7336 : return *find(name);
130 : }
131 :
132 : /// Set (insert or replace) the coordinate for the given dimension.
133 6736 : void Dataset::setCoord(const Dim dim, Variable coord) {
134 6736 : expect_writable(*this);
135 6736 : expect_valid(*this);
136 6735 : m_coords.set(dim, std::move(coord));
137 6728 : }
138 :
139 : /// Set (insert or replace) data (values, optional variances) with given name.
140 : ///
141 : /// Throws if the provided values bring the dataset into an inconsistent state
142 : /// (mismatching dimensions). The default is to drop existing attributes, unless
143 : /// AttrPolicy::Keep is specified.
144 4325 : void Dataset::setData(const std::string &name, Variable data,
145 : const AttrPolicy attrPolicy) {
146 4325 : expect_writable(*this);
147 4323 : expect_valid(*this);
148 4322 : expect_matching_item_dims(*this, name, data);
149 4310 : const auto replace = contains(name);
150 4310 : if (replace && attrPolicy == AttrPolicy::Keep)
151 73 : m_data.insert_or_assign(
152 219 : name, DataArray(std::move(data), {}, m_data[name].masks().items(),
153 73 : m_data[name].attrs().items(), name));
154 : else
155 4237 : m_data.insert_or_assign(name, DataArray(std::move(data)));
156 4310 : }
157 :
158 : // See docs of overload for data arrays.
159 486 : void Dataset::setDataInit(const std::string &name, Variable data,
160 : const AttrPolicy attrPolicy) {
161 486 : if (!is_valid()) {
162 418 : m_coords.setSizes(data.dims());
163 418 : m_valid = true;
164 : }
165 486 : setData(name, std::move(data), attrPolicy);
166 486 : }
167 :
168 : namespace {
169 3248 : auto coords_to_skip(const Dataset &dst, const DataArray &src) {
170 3248 : std::vector<Dim> to_skip;
171 6017 : for (auto &&[dim, coord] : src.coords())
172 2770 : if (const auto it = dst.coords().find(dim); it != dst.coords().end()) {
173 1080 : if (it->second.is_aligned() == coord.is_aligned())
174 1080 : expect::matching_coord(dim, coord, it->second, "set coord");
175 0 : else if (it->second.is_aligned())
176 : // Aligned coords overwrite unaligned.
177 0 : to_skip.push_back(dim);
178 : }
179 3247 : return to_skip;
180 1 : }
181 : } // namespace
182 :
183 : /// Set (insert or replace) data from a DataArray with a given name.
184 : ///
185 : /// Coordinates, masks, and attributes of the data array are added to the
186 : /// dataset. Throws if there are existing but mismatching coords, masks, or
187 : /// attributes. Throws if the provided data brings the dataset into an
188 : /// inconsistent state (mismatching dimensions).
189 3255 : void Dataset::setData(const std::string &name, const DataArray &data) {
190 : // Return early on self assign to avoid exceptions from Python inplace ops
191 3255 : if (const auto it = find(name); it != end()) {
192 76 : if (it->data().is_same(data.data()) && it->masks() == data.masks() &&
193 76 : it->attrs() == data.attrs() && it->coords() == data.coords())
194 7 : return;
195 : }
196 3248 : const auto to_skip = coords_to_skip(*this, data);
197 :
198 3250 : setData(name, data.data());
199 3244 : auto &item = m_data[name];
200 6012 : for (auto &&[dim, coord] : data.coords())
201 4458 : if (m_coords.find(dim) == m_coords.end() &&
202 4458 : std::find(to_skip.begin(), to_skip.end(), dim) == to_skip.end())
203 1690 : setCoord(dim, coord);
204 : // Attrs might be shadowed by a coord, but this cannot be prevented in
205 : // general, so instead of failing here we proceed (and may fail later if
206 : // meta() is called).
207 3244 : item.attrs() = data.attrs();
208 3244 : item.masks() = data.masks();
209 3247 : }
210 :
211 : /// Like setData but allow inserting into a default-initialized dataset.
212 : ///
213 : /// A default-constructed dataset cannot be filled using setData or setCoord
214 : /// as the dataset's dimensions are unknown and the input cannot be validated.
215 : /// setDataInit sets the sizes when called with a default-initialized dataset.
216 : /// It can be used for creating a new dataset and filling it step by step.
217 : ///
218 : /// When using this, always make sure to ultimately produce a valid dataset.
219 : /// setDataInit is often called in a loop.
220 : /// Keep in mind that the loop might not run when the input dataset is empty!
221 834 : void Dataset::setDataInit(const std::string &name, const DataArray &data) {
222 834 : if (!is_valid()) {
223 670 : m_coords.setSizes(data.dims());
224 670 : m_valid = true;
225 : }
226 834 : setData(name, data);
227 834 : }
228 :
229 : /// Return slice of the dataset along given dimension with given extents.
230 1294 : Dataset Dataset::slice(const Slice &s) const {
231 2588 : Dataset out(slice_map(m_coords.sizes(), m_data, s));
232 1291 : out.m_coords = m_coords.slice_coords(s);
233 1290 : out.m_readonly = true;
234 1290 : return out;
235 1 : }
236 :
237 12 : Dataset &Dataset::setSlice(const Slice &s, const Dataset &data) {
238 : // Validate slice on all items as a dry-run
239 12 : expect::coords_are_superset(slice(s).coords(), data.coords(), "");
240 27 : for (const auto &[name, item] : m_data)
241 17 : item.validateSlice(s, data.m_data.at(name));
242 : // Only if all items checked for dry-run does modification go-ahead
243 24 : for (auto &&[name, item] : m_data)
244 14 : item.setSlice(s, data.m_data.at(name));
245 10 : return *this;
246 : }
247 :
248 1 : Dataset &Dataset::setSlice(const Slice &s, const DataArray &data) {
249 : // Validate slice on all items as a dry-run
250 1 : expect::coords_are_superset(slice(s).coords(), data.coords(), "");
251 1 : for (const auto &item : m_data)
252 1 : item.second.validateSlice(s, data);
253 : // Only if all items checked for dry-run does modification go-ahead
254 0 : for (auto &&[_, val] : m_data)
255 0 : val.setSlice(s, data);
256 0 : return *this;
257 : }
258 :
259 2 : Dataset &Dataset::setSlice(const Slice &s, const Variable &data) {
260 5 : for (auto &&item : *this)
261 3 : item.setSlice(s, data);
262 2 : return *this;
263 : }
264 :
265 : /// Rename dimension `from` to `to`.
266 : Dataset
267 53 : Dataset::rename_dims(const std::vector<std::pair<Dim, Dim>> &names) const {
268 104 : Dataset out({}, m_coords.rename_dims(names));
269 114 : for (const auto &[name, da] : m_data)
270 64 : out.setData(name, da.rename_dims(names, false));
271 50 : return out;
272 1 : }
273 :
274 : /// Return true if the datasets have identical content.
275 1199 : bool Dataset::operator==(const Dataset &other) const {
276 1199 : if (size() != other.size())
277 80 : return false;
278 1119 : if (coords() != other.coords())
279 240 : return false;
280 1559 : for (const auto &data : *this)
281 930 : if (!other.contains(data.name()) || data != other[data.name()])
282 930 : return false;
283 629 : return true;
284 : }
285 :
286 : /// Return true if the datasets have mismatching content./
287 354 : bool Dataset::operator!=(const Dataset &other) const {
288 354 : return !operator==(other);
289 : }
290 :
291 356 : bool equals_nan(const Dataset &a, const Dataset &b) {
292 356 : if (a.size() != b.size())
293 40 : return false;
294 316 : if (!equals_nan(a.coords(), b.coords()))
295 112 : return false;
296 296 : for (const auto &data : a)
297 204 : if (!b.contains(data.name()) || !equals_nan(data, b[data.name()]))
298 204 : return false;
299 92 : return true;
300 : }
301 :
302 5294 : const Sizes &Dataset::sizes() const { return m_coords.sizes(); }
303 453 : const Sizes &Dataset::dims() const { return sizes(); }
304 14 : Dim Dataset::dim() const {
305 14 : core::expect::ndim_is(sizes(), 1);
306 12 : return *sizes().begin();
307 : }
308 238 : scipp::index Dataset::ndim() const { return scipp::size(m_coords.sizes()); }
309 :
310 18624 : bool Dataset::is_readonly() const noexcept { return m_readonly; }
311 :
312 14340 : bool Dataset::is_valid() const noexcept { return m_valid; }
313 :
314 247 : typename Masks::holder_type union_or(const Masks ¤tMasks,
315 : const Masks &otherMasks) {
316 247 : typename Masks::holder_type out;
317 :
318 337 : for (const auto &[key, item] : currentMasks)
319 90 : out.insert_or_assign(key, copy(item));
320 341 : for (const auto &[key, item] : otherMasks) {
321 97 : if (!currentMasks.contains(key)) {
322 19 : out.insert_or_assign(key, copy(item));
323 153 : } else if (item.dtype() != core::dtype<bool> ||
324 153 : out[key].dtype() != core::dtype<bool>) {
325 3 : throw except::TypeError(" Cannot combine non-boolean mask '" + key +
326 6 : "' in operation");
327 150 : } else if (item.unit() != scipp::units::none ||
328 75 : out[key].unit() != scipp::units::none) {
329 0 : throw except::UnitError(" Cannot combine a unit-specified mask '" + key +
330 0 : "' in operation");
331 75 : } else if (out[key].dims().includes(item.dims())) {
332 67 : out[key] |= item;
333 : } else {
334 8 : out[key] = out[key] | item;
335 : }
336 : }
337 244 : return out;
338 3 : }
339 :
340 291 : void union_or_in_place(Masks ¤tMasks, const Masks &otherMasks) {
341 : using core::to_string;
342 : using units::to_string;
343 :
344 449 : for (const auto &[key, item] : otherMasks) {
345 163 : const auto it = currentMasks.find(key);
346 163 : if (it == currentMasks.end() && currentMasks.is_readonly()) {
347 1 : throw except::NotFoundError("Cannot insert new mask '" + to_string(key) +
348 2 : "' via a slice.");
349 164 : } else if (it != currentMasks.end() && it->second.is_readonly() &&
350 164 : it->second != (it->second | item)) {
351 1 : throw except::DimensionError("Cannot update mask '" + to_string(key) +
352 : "' via slice since the mask is implicitly "
353 2 : "broadcast along the slice dimension.");
354 464 : } else if (it != currentMasks.end() &&
355 303 : (item.dtype() != core::dtype<bool> ||
356 311 : currentMasks[key].dtype() != core::dtype<bool>)) {
357 3 : throw except::TypeError(" Cannot combine non-boolean mask '" + key +
358 6 : "' in operation");
359 308 : } else if (it != currentMasks.end() &&
360 150 : (item.unit() != scipp::units::none ||
361 308 : currentMasks[key].unit() != scipp::units::none)) {
362 0 : throw except::UnitError(" Cannot combine a unit-specified mask '" + key +
363 0 : "' in operation");
364 : }
365 : }
366 :
367 442 : for (const auto &[key, item] : otherMasks) {
368 156 : const auto it = currentMasks.find(key);
369 156 : if (it == currentMasks.end()) {
370 8 : currentMasks.set(key, copy(item));
371 148 : } else if (!it->second.is_readonly()) {
372 147 : it->second |= item;
373 : }
374 : }
375 286 : }
376 :
377 : } // namespace scipp::dataset
|