Line data Source code
1 : // SPDX-License-Identifier: BSD-3-Clause
2 : // Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
3 : /// @file
4 : /// @author Simon Heybrock
5 : #include <numeric>
6 :
7 : #include "scipp/core/bucket.h"
8 : #include "scipp/core/element/comparison.h"
9 : #include "scipp/core/element/logical.h"
10 : #include "scipp/core/histogram.h"
11 : #include "scipp/core/parallel.h"
12 : #include "scipp/core/tag_util.h"
13 :
14 : #include "scipp/variable/accumulate.h"
15 : #include "scipp/variable/cumulative.h"
16 : #include "scipp/variable/operations.h"
17 : #include "scipp/variable/util.h"
18 : #include "scipp/variable/variable_factory.h"
19 :
20 : #include "scipp/dataset/bins.h"
21 : #include "scipp/dataset/except.h"
22 : #include "scipp/dataset/groupby.h"
23 : #include "scipp/dataset/shape.h"
24 : #include "scipp/dataset/util.h"
25 :
26 : #include "../variable/operations_common.h"
27 : #include "bin_common.h"
28 : #include "dataset_operations_common.h"
29 :
30 : using namespace scipp::variable;
31 :
32 : namespace scipp::dataset {
33 :
34 : namespace {
35 77 : auto resize_array(const DataArray &da, const Dim reductionDim,
36 : const scipp::index size, const FillValue fill) {
37 77 : if (!is_bins(da))
38 72 : return resize(da, reductionDim, size, fill);
39 5 : if (variableFactory().has_masks(da.data()))
40 1 : throw except::NotImplementedError(
41 : "Reduction operations for binned data with "
42 2 : "event masks not supported yet.");
43 4 : DataArray dense_dummy(da);
44 4 : dense_dummy.setData(empty(da.dims(), variableFactory().elem_unit(da.data()),
45 4 : variableFactory().elem_dtype(da.data()),
46 4 : variableFactory().has_variances(da.data())));
47 4 : return resize_array(dense_dummy, reductionDim, size, fill);
48 4 : }
49 : } // namespace
50 :
51 : /// Helper for creating output for "combine" step for "apply" steps that reduce
52 : /// a dimension.
53 : ///
54 : /// - Delete anything (but data) that depends on the reduction dimension.
55 : /// - Default-init data.
56 : template <class T>
57 61 : T GroupBy<T>::makeReductionOutput(const Dim reductionDim,
58 : const FillValue fill) const {
59 61 : T out;
60 : if constexpr (std::is_same_v<T, Dataset>) {
61 39 : out = apply_to_items(m_data, resize_array, reductionDim, size(), fill);
62 : } else {
63 22 : out = resize_array(m_data, reductionDim, size(), fill);
64 : }
65 60 : out = out.rename_dims({{reductionDim, dim()}});
66 60 : out.coords().set(dim(), key());
67 60 : return out;
68 1 : }
69 :
70 : namespace {
71 : template <class Op, class Groups>
72 72 : void reduce_(Op op, const Dim reductionDim, const Variable &out_data,
73 : const DataArray &data, const Dim dim, const Groups &groups,
74 : const FillValue fill) {
75 72 : const auto mask_replacement =
76 144 : special_like(Variable(data.data(), Dimensions{}), fill);
77 72 : auto mask = irreducible_mask(data.masks(), reductionDim);
78 72 : const auto process = [&](const auto &range) {
79 : // Apply to each group, storing result in output slice
80 434 : for (scipp::index group = range.begin(); group != range.end(); ++group) {
81 362 : auto out_slice = out_data.slice({dim, group});
82 737 : for (const auto &slice : groups[group]) {
83 375 : const auto data_slice = data.data().slice(slice);
84 375 : if (mask.is_valid())
85 33 : op(out_slice, where(mask.slice(slice), mask_replacement, data_slice));
86 : else
87 342 : op(out_slice, data_slice);
88 : }
89 : }
90 : };
91 72 : core::parallel::parallel_for(core::parallel::blocked_range(0, groups.size()),
92 : process);
93 72 : }
94 : } // namespace
95 :
96 : template <class T>
97 : template <class Op>
98 61 : T GroupBy<T>::reduce(Op op, const Dim reductionDim,
99 : const FillValue fill) const {
100 61 : auto out = makeReductionOutput(reductionDim, fill);
101 : if constexpr (std::is_same_v<T, Dataset>) {
102 90 : for (const auto &item : m_data)
103 51 : reduce_(op, reductionDim, out[item.name()].data(), item, dim(), groups(),
104 : fill);
105 : } else {
106 21 : reduce_(op, reductionDim, out.data(), m_data, dim(), groups(), fill);
107 : }
108 60 : return out;
109 0 : }
110 :
111 : /// Reduce each group by concatenating elements and return combined data.
112 : ///
113 : /// This only supports binned data.
114 7 : template <class T> T GroupBy<T>::concat(const Dim reductionDim) const {
115 31 : const auto conc = [&](const auto &data) {
116 8 : if (key().dims().volume() == scipp::size(groups()))
117 8 : return groupby_concat_bins(data, {}, key(), reductionDim);
118 : else
119 0 : return groupby_concat_bins(data, key(), {}, reductionDim);
120 : };
121 : if constexpr (std::is_same_v<T, DataArray>) {
122 12 : return conc(m_data);
123 : } else {
124 4 : return apply_to_items(m_data, [&](auto &&..._) { return conc(_...); });
125 : }
126 : }
127 :
128 : /// Reduce each group using `sum` and return combined data.
129 40 : template <class T> T GroupBy<T>::sum(const Dim reductionDim) const {
130 40 : return reduce(variable::sum_into, reductionDim, FillValue::ZeroNotBool);
131 : }
132 :
133 : /// Reduce each group using `nansum` and return combined data.
134 0 : template <class T> T GroupBy<T>::nansum(const Dim reductionDim) const {
135 0 : return reduce(variable::nansum_into, reductionDim, FillValue::ZeroNotBool);
136 : }
137 :
138 : /// Reduce each group using `all` and return combined data.
139 4 : template <class T> T GroupBy<T>::all(const Dim reductionDim) const {
140 4 : return reduce(variable::all_into, reductionDim, FillValue::True);
141 : }
142 :
143 : /// Reduce each group using `any` and return combined data.
144 4 : template <class T> T GroupBy<T>::any(const Dim reductionDim) const {
145 4 : return reduce(variable::any_into, reductionDim, FillValue::False);
146 : }
147 :
148 : /// Reduce each group using `max` and return combined data.
149 8 : template <class T> T GroupBy<T>::max(const Dim reductionDim) const {
150 8 : return reduce(variable::max_into, reductionDim, FillValue::Lowest);
151 : }
152 :
153 : /// Reduce each group using `nanmax` and return combined data.
154 0 : template <class T> T GroupBy<T>::nanmax(const Dim reductionDim) const {
155 0 : return reduce(variable::nanmax_into, reductionDim, FillValue::Lowest);
156 : }
157 :
158 : /// Reduce each group using `min` and return combined data.
159 5 : template <class T> T GroupBy<T>::min(const Dim reductionDim) const {
160 5 : return reduce(variable::min_into, reductionDim, FillValue::Max);
161 : }
162 :
163 : /// Reduce each group using `nanmin` and return combined data.
164 0 : template <class T> T GroupBy<T>::nanmin(const Dim reductionDim) const {
165 0 : return reduce(variable::nanmin_into, reductionDim, FillValue::Max);
166 : }
167 :
168 : /// Apply mean to groups and return combined data.
169 13 : template <class T> T GroupBy<T>::mean(const Dim reductionDim) const {
170 : // 1. Sum into output slices
171 13 : auto out = sum(reductionDim);
172 :
173 : // 2. Compute number of slices N contributing to each out slice
174 183 : const auto get_scale = [&](const auto &data) {
175 : // TODO Supporting binned data requires generalized approach to compute
176 : // scale factor.
177 17 : if (is_bins(data))
178 1 : throw except::NotImplementedError(
179 : "groupby.mean does not support binned data yet.");
180 32 : auto scale = makeVariable<double>(Dims{dim()}, Shape{size()});
181 16 : const auto scaleT = scale.template values<double>();
182 16 : const auto mask = irreducible_mask(data.masks(), reductionDim);
183 51 : for (scipp::index group = 0; group < size(); ++group)
184 72 : for (const auto &slice : groups()[group]) {
185 : // N contributing to each slice
186 37 : scaleT[group] += slice.end() - slice.begin();
187 : // N masks for each slice, that need to be subtracted
188 37 : if (mask.is_valid()) {
189 19 : const auto masks_sum = variable::sum(mask.slice(slice), reductionDim);
190 19 : scaleT[group] -= masks_sum.template value<int64_t>();
191 19 : }
192 : }
193 32 : return reciprocal(std::move(scale));
194 16 : };
195 :
196 : // 3. sum/N -> mean
197 : if constexpr (std::is_same_v<T, Dataset>) {
198 20 : for (auto &&item : out) {
199 12 : if (is_int(item.data().dtype()))
200 0 : out.setData(item.name(), item.data() * get_scale(m_data[item.name()]),
201 : AttrPolicy::Keep);
202 : else
203 12 : item *= get_scale(m_data[item.name()]);
204 : }
205 : } else {
206 5 : if (is_int(out.data().dtype()))
207 2 : out.setData(out.data() * get_scale(m_data));
208 : else
209 3 : out *= get_scale(m_data);
210 : }
211 :
212 24 : return out;
213 1 : }
214 :
215 : namespace {
216 : template <class T> struct NanSensitiveLess {
217 : // Compare two values such that x < NaN for all x != NaN.
218 : // Note: if changing this in future, ensure it meets the requirements from
219 : // https://en.cppreference.com/w/cpp/named_req/Compare, as it is used as
220 : // the comparator for keys in a map.
221 1440 : bool operator()(const T &a, const T &b) const {
222 : if constexpr (std::is_floating_point_v<T>)
223 1409 : if (std::isnan(b))
224 0 : return !std::isnan(a);
225 1440 : return a < b;
226 : }
227 : };
228 :
229 : template <class T> struct nan_sensitive_equal {
230 115857 : bool operator()(const T &a, const T &b) const {
231 : if constexpr (std::is_floating_point_v<T>)
232 1028 : return a == b || (std::isnan(a) && std::isnan(b));
233 : else
234 114829 : return a == b;
235 : }
236 : };
237 : } // namespace
238 :
239 : template <class T> struct MakeGroups {
240 59 : static GroupByGrouping apply(const Variable &key, const Dim targetDim) {
241 59 : expect::is_key(key);
242 55 : const auto &values = key.values<T>();
243 :
244 55 : const auto dim = key.dim();
245 : std::unordered_map<T, GroupByGrouping::group, std::hash<T>,
246 : nan_sensitive_equal<T>>
247 55 : indices;
248 55 : const auto end = values.end();
249 55 : scipp::index i = 0;
250 391 : for (auto it = values.begin(); it != end;) {
251 : // Use contiguous (thick) slices if possible to avoid overhead of slice
252 : // handling in follow-up "apply" steps.
253 336 : const auto begin = i;
254 336 : const auto &group_value = *it;
255 115380 : while (it != end && nan_sensitive_equal<T>()(*it, group_value)) {
256 115044 : ++it;
257 115044 : ++i;
258 : }
259 336 : indices[group_value].emplace_back(dim, begin, i);
260 : }
261 :
262 55 : std::vector<T> keys;
263 55 : std::vector<GroupByGrouping::group> groups;
264 55 : keys.reserve(scipp::size(indices));
265 55 : groups.reserve(scipp::size(indices));
266 378 : for (auto &item : indices)
267 323 : keys.emplace_back(item.first);
268 55 : core::parallel::parallel_sort(keys.begin(), keys.end(),
269 0 : NanSensitiveLess<T>());
270 378 : for (const auto &k : keys) {
271 : // false positive, fixed in https://github.com/danmar/cppcheck/pull/4230
272 : // cppcheck-suppress containerOutOfBounds
273 323 : groups.emplace_back(std::move(indices.at(k)));
274 : }
275 :
276 55 : const Dimensions dims{targetDim, scipp::size(indices)};
277 110 : auto keys_ = makeVariable<T>(Dimensions{dims}, Values(std::move(keys)));
278 55 : keys_.setUnit(key.unit());
279 110 : return {dim, std::move(keys_), std::move(groups)};
280 55 : }
281 : };
282 :
283 : template <class T> struct MakeBinGroups {
284 14 : static GroupByGrouping apply(const Variable &key, const Variable &bins) {
285 14 : expect::is_key(key);
286 14 : if (bins.dims().ndim() != 1)
287 0 : throw except::DimensionError("Group-by bins must be 1-dimensional");
288 14 : if (key.unit() != bins.unit())
289 0 : throw except::UnitError("Group-by key must have same unit as bins");
290 14 : const auto &values = key.values<T>();
291 14 : const auto &edges = bins.values<T>();
292 14 : core::expect::histogram::sorted_edges(edges);
293 :
294 14 : const auto dim = key.dim();
295 14 : std::vector<GroupByGrouping::group> groups(edges.size() - 1);
296 54 : for (scipp::index i = 0; i < scipp::size(values);) {
297 : // Use contiguous (thick) slices if possible to avoid overhead of slice
298 : // handling in follow-up "apply" steps.
299 40 : const auto value = values[i];
300 40 : const auto begin = i++;
301 40 : auto right = std::upper_bound(edges.begin(), edges.end(), value);
302 40 : if (right != edges.end() && right != edges.begin()) {
303 30 : auto left = right - 1;
304 85 : while (i < scipp::size(values) && (*left <= values[i]) &&
305 37 : (values[i] < *right))
306 18 : ++i;
307 30 : groups[std::distance(edges.begin(), left)].emplace_back(dim, begin, i);
308 : }
309 : }
310 28 : return {dim, bins, std::move(groups)};
311 14 : }
312 : };
313 :
314 : template <class T>
315 14 : GroupBy<T> call_groupby(const T &array, const Variable &key,
316 : const Variable &bins) {
317 : return {
318 : array,
319 : core::CallDType<double, float, int64_t, int32_t>::apply<MakeBinGroups>(
320 14 : key.dtype(), key, bins)};
321 : }
322 :
323 : template <class T>
324 59 : GroupBy<T> call_groupby(const T &array, const Variable &key, const Dim &dim) {
325 : return {array,
326 : core::CallDType<double, float, int64_t, int32_t, bool, std::string,
327 : core::time_point>::apply<MakeGroups>(key.dtype(), key,
328 59 : dim)};
329 : }
330 :
331 : /// Create GroupBy<DataArray> object as part of "split-apply-combine" mechanism.
332 : ///
333 : /// Groups the slices of `array` according to values in given by a coord.
334 : /// Grouping will create a new coordinate for the dimension of the grouping
335 : /// coord in a later apply/combine step.
336 30 : GroupBy<DataArray> groupby(const DataArray &array, const Dim dim) {
337 31 : const auto &key = array.meta()[dim];
338 56 : return call_groupby(array, key, dim);
339 29 : }
340 :
341 : /// Create GroupBy<DataArray> object as part of "split-apply-combine" mechanism.
342 : ///
343 : /// Groups the slices of `array` according to values in given by a coord.
344 : /// Grouping of a coord is according to given `bins`, which will be added as a
345 : /// new coordinate to the output in a later apply/combine step.
346 2 : GroupBy<DataArray> groupby(const DataArray &array, const Dim dim,
347 : const Variable &bins) {
348 2 : const auto &key = array.meta()[dim];
349 4 : return groupby(array, key, bins);
350 2 : }
351 :
352 : /// Create GroupBy<DataArray> object as part of "split-apply-combine" mechanism.
353 : ///
354 : /// Groups the slices of `array` according to values in given by a coord.
355 : /// Grouping of a coord is according to given `bins`, which will be added as a
356 : /// new coordinate to the output in a later apply/combine step.
357 4 : GroupBy<DataArray> groupby(const DataArray &array, const Variable &key,
358 : const Variable &bins) {
359 4 : if (!array.dims().includes(key.dims()))
360 1 : throw except::DimensionError("Size of Group-by key is incorrect.");
361 :
362 3 : return call_groupby(array, key, bins);
363 : }
364 :
365 : /// Create GroupBy<Dataset> object as part of "split-apply-combine" mechanism.
366 : ///
367 : /// Groups the slices of `dataset` according to values in given by a coord.
368 : /// Grouping will create a new coordinate for the dimension of the grouping
369 : /// coord in a later apply/combine step.
370 31 : GroupBy<Dataset> groupby(const Dataset &dataset, const Dim dim) {
371 31 : const auto &key = dataset.meta()[dim];
372 30 : return call_groupby(dataset, key, dim);
373 : }
374 :
375 : /// Create GroupBy<Dataset> object as part of "split-apply-combine" mechanism.
376 : ///
377 : /// Groups the slices of `dataset` according to values in given by a coord.
378 : /// Grouping of a coord is according to given `bins`, which will be added as a
379 : /// new coordinate to the output in a later apply/combine step.
380 10 : GroupBy<Dataset> groupby(const Dataset &dataset, const Dim dim,
381 : const Variable &bins) {
382 10 : const auto &key = dataset.meta()[dim];
383 10 : return groupby(dataset, key, bins);
384 : }
385 :
386 : /// Create GroupBy<Dataset> object as part of "split-apply-combine" mechanism.
387 : ///
388 : /// Groups the slices of `dataset` according to values in given by a coord.
389 : /// Grouping of a coord is according to given `bins`, which will be added as a
390 : /// new coordinate to the output in a later apply/combine step.
391 12 : GroupBy<Dataset> groupby(const Dataset &dataset, const Variable &key,
392 : const Variable &bins) {
393 17 : for (const auto &dim : dataset.sizes()) {
394 16 : Dimensions dims(dim, dataset.sizes()[dim]);
395 16 : if (dims.includes(key.dims()))
396 : // Found compatible Dimension.
397 22 : return call_groupby(dataset, key, bins);
398 16 : }
399 : // No Dimension contains the key - throw.
400 1 : throw except::DimensionError("Size of Group-by key is incorrect.");
401 : }
402 :
403 : template class GroupBy<DataArray>;
404 : template class GroupBy<Dataset>;
405 :
406 : } // namespace scipp::dataset
|