Line data Source code
1 : // SPDX-License-Identifier: BSD-3-Clause
2 : // Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
3 : /// @file
4 : /// @author Simon Heybrock
5 : #include <numeric>
6 : #include <set>
7 :
8 : #include "scipp/variable/astype.h"
9 : #include "scipp/variable/bin_detail.h"
10 : #include "scipp/variable/bin_util.h"
11 : #include "scipp/variable/bins.h"
12 : #include "scipp/variable/cumulative.h"
13 : #include "scipp/variable/reduction.h"
14 : #include "scipp/variable/shape.h"
15 : #include "scipp/variable/subspan_view.h"
16 : #include "scipp/variable/variable_factory.h"
17 :
18 : #include "scipp/dataset/bin.h"
19 : #include "scipp/dataset/bins.h"
20 : #include "scipp/dataset/bins_view.h"
21 : #include "scipp/dataset/except.h"
22 :
23 : #include "bin_detail.h"
24 : #include "bins_util.h"
25 : #include "dataset_operations_common.h"
26 :
27 : using namespace scipp::variable::bin_detail;
28 : using namespace scipp::dataset::bin_detail;
29 :
30 : namespace scipp::dataset {
31 :
32 : namespace {
33 :
34 : template <class T>
35 8477 : Variable bins_from_sizes(T &&content, const Variable &bin_sizes) {
36 8477 : const auto end = cumsum(bin_sizes);
37 8477 : const auto buffer_dim = content.dims().inner();
38 : return make_bins(zip(end - bin_sizes, end), buffer_dim,
39 16954 : std::forward<T>(content));
40 8477 : }
41 :
42 8459 : template <class Builder> bool use_two_stage_remap(const Builder &bld) {
43 16269 : return bld.nbin().dims().empty() &&
44 7810 : bld.nbin().template value<scipp::index>() == bld.dims().volume() &&
45 : // empirically determined crossover point (approx.)
46 7798 : bld.nbin().template value<scipp::index>() > 16 * 1024 &&
47 16287 : bld.offsets().dims().empty() &&
48 8477 : bld.offsets().template value<scipp::index>() == 0;
49 : }
50 : class Mapper {
51 : public:
52 8531 : virtual ~Mapper() = default;
53 8477 : template <class T> T apply(const Variable &data) {
54 69427 : const auto maybe_bin = [this](const auto &var) {
55 30484 : return is_bins(var) ? apply_to_variable(var) : copy(var);
56 : };
57 : if constexpr (std::is_same_v<T, Variable>)
58 38 : return maybe_bin(data);
59 : else
60 8458 : return dataset::transform(bins_view<T>(data), maybe_bin);
61 : }
62 :
63 : virtual Variable bin_sizes(
64 : const std::optional<Dimensions> &dims_override = std::nullopt) const = 0;
65 : virtual Variable apply_to_variable(const Variable &var,
66 : Variable &&out = {}) const = 0;
67 : };
68 :
69 : class SingleStageMapper : public Mapper {
70 : public:
71 8477 : SingleStageMapper(const Dimensions &dims, const Variable &indices,
72 : const Variable &output_bin_sizes)
73 8477 : : m_dims(dims), m_indices(indices), m_output_bin_sizes(output_bin_sizes) {
74 : // Setup offsets within output bins, for every input bin. If rebinning
75 : // occurs along a dimension each output bin sees contributions from all
76 : // input bins along that dim.
77 8477 : m_offsets = copy(m_output_bin_sizes);
78 8477 : fill_zeros(m_offsets);
79 : // Not using cumsum along *all* dims, since some outer dims may be left
80 : // untouched (no rebin).
81 8477 : std::vector<std::pair<Dim, scipp::index>> strategy;
82 16488 : for (const auto dim : m_indices.dims())
83 8011 : if (dims.contains(dim))
84 7012 : strategy.emplace_back(dim, m_indices.dims()[dim]);
85 : // To avoid excessive memory consumption in intermediate results for
86 : // `output_bin_sizes` (in the loop below, computing sums and cumsums) we
87 : // need to ensure to handle the longest dimensions first,
88 8477 : std::sort(strategy.begin(), strategy.end(),
89 190 : [](auto &&a, auto &&b) { return a.second > b.second; });
90 15489 : for (const auto &item : strategy) {
91 7012 : const auto dim = item.first;
92 7012 : subbin_sizes_add_intersection(
93 7012 : m_offsets, subbin_sizes_cumsum_exclusive(m_output_bin_sizes, dim));
94 7012 : m_output_bin_sizes = sum(m_output_bin_sizes, dim);
95 : }
96 : // cumsum with bin dimension is last, since this corresponds to different
97 : // output bins, whereas the cumsum above handled different subbins of same
98 : // output bin, i.e., contributions of different input bins to some output
99 : // bin.
100 8477 : subbin_sizes_add_intersection(
101 8477 : m_offsets, cumsum_exclusive_subbin_sizes(m_output_bin_sizes));
102 8477 : const auto filtered_input_bin_size = sum_subbin_sizes(m_output_bin_sizes);
103 8477 : auto end = cumsum(filtered_input_bin_size);
104 8477 : m_total_size = end.dims().volume() > 0
105 8477 : ? end.values<scipp::index>().as_span().back()
106 : : 0;
107 16954 : end = broadcast(end,
108 8477 : m_indices.dims()); // required for some cases of rebinning
109 8477 : m_filtered_input_bin_ranges = zip(end - filtered_input_bin_size, end);
110 8477 : }
111 :
112 30516 : Variable apply_to_variable(const Variable &var,
113 : Variable &&out = {}) const override {
114 30516 : const auto &[input_indices, dim, content] = var.constituents<Variable>();
115 : static_cast<void>(input_indices);
116 : // The optional `out` argument is used to avoid creating a temporary buffer
117 : // when TwoStageMapper is applied to a series of columns of matching dtype.
118 30548 : if (!out.is_valid() || out.dtype() != content.dtype() ||
119 32 : out.has_variances() != content.has_variances())
120 30484 : out = resize_default_init(content, dim, m_total_size);
121 : else
122 32 : out.setUnit(content.unit());
123 30516 : auto out_subspans = subspan_view(out, dim, m_filtered_input_bin_ranges);
124 30516 : map_to_bins(out_subspans, as_subspan_view(var), m_offsets,
125 61032 : as_subspan_view(std::as_const(m_indices)));
126 61032 : return out;
127 30516 : }
128 :
129 8477 : Variable bin_sizes(const std::optional<Dimensions> &dims_override =
130 : std::nullopt) const override {
131 : // During mapping of values to the output layout, the output was viewed with
132 : // same bin index ranges as input. Now setup the desired final bin indices.
133 8477 : const auto dims = dims_override.value_or(m_dims);
134 8477 : auto output_dims = merge(m_output_bin_sizes.dims(), dims);
135 : return makeVariable<scipp::index>(
136 : output_dims, units::none,
137 25431 : Values(flatten_subbin_sizes(m_output_bin_sizes, dims.volume())));
138 8477 : }
139 :
140 : Dimensions m_dims;
141 : Variable m_indices;
142 :
143 : private:
144 : Variable m_output_bin_sizes;
145 : Variable m_offsets;
146 : Variable m_filtered_input_bin_ranges;
147 : scipp::index m_total_size;
148 : };
149 :
150 : class TwoStageMapper : public Mapper {
151 : public:
152 18 : TwoStageMapper(SingleStageMapper &&stage1_mapper,
153 : SingleStageMapper &&stage2_mapper)
154 18 : : m_stage1_mapper(std::move(stage1_mapper)),
155 36 : m_stage2_mapper(std::move(stage2_mapper)) {}
156 :
157 50 : Variable apply_to_variable(const Variable &var,
158 : Variable && = {}) const override {
159 : // Note how by having the virtual call on the Variable level we avoid
160 : // making the temporary buffer for the whole content buffer (typically a
161 : // DataArray), but instead just for one of the content buffer's columns
162 : // at a time.
163 : // As a further optimization we can reuse the content buffer. With the
164 : // current implementation of handling dtype this will only work if the dtype
165 : // is the same as that of the previously processed column. Otherwise a new
166 : // buffer is created.
167 50 : m_buffer = m_stage1_mapper.apply_to_variable(var, std::move(m_buffer));
168 50 : Variable indices = m_stage2_mapper.m_indices.bin_indices();
169 : return m_stage2_mapper.apply_to_variable(
170 150 : make_bins_no_validate(indices, m_buffer.dims().inner(), m_buffer));
171 50 : }
172 :
173 18 : Variable bin_sizes(const std::optional<Dimensions> &dims_override =
174 : std::nullopt) const override {
175 18 : const auto dims = dims_override.value_or(m_stage1_mapper.m_dims);
176 18 : return fold(
177 54 : flatten(m_stage2_mapper.bin_sizes(),
178 36 : std::vector<Dim>{Dim::InternalBinCoarse, Dim::InternalBinFine},
179 : Dim::InternalSubbin)
180 36 : .slice({Dim::InternalSubbin, 0, dims.volume()}),
181 36 : Dim::InternalSubbin, dims);
182 18 : }
183 :
184 : private:
185 : SingleStageMapper m_stage1_mapper;
186 : SingleStageMapper m_stage2_mapper;
187 : mutable Variable m_buffer;
188 : };
189 :
190 : template <class Builder>
191 8459 : std::unique_ptr<Mapper> make_mapper(Variable &&indices,
192 : const Builder &builder) {
193 8459 : const auto dims = builder.dims();
194 8459 : if (use_two_stage_remap(builder)) {
195 : // There are many output bins. Mapping directly would lead to excessive
196 : // number of cache misses as well as potential false-sharing problems
197 : // between threads. We therefore map in two stages. This requires an
198 : // additional temporary buffer with size given by the number of events,
199 : // but has proven to be faster in practice.
200 18 : scipp::index chunk_size =
201 18 : floor(sqrt(builder.nbin().template value<scipp::index>()));
202 18 : const auto chunk = astype(scipp::index{chunk_size} * units::none,
203 18 : indices.bin_buffer<Variable>().dtype());
204 18 : Variable fine_indices(std::move(indices));
205 18 : auto indices_ = floor_divide(fine_indices, chunk);
206 18 : fine_indices %= chunk;
207 18 : const auto n_coarse_bin = dims.volume() / chunk_size + 1;
208 :
209 18 : Variable output_bin_sizes = bin_detail::bin_sizes(
210 : indices_, builder.offsets(), n_coarse_bin * units::none);
211 18 : SingleStageMapper stage1_mapper(dims, indices_, output_bin_sizes);
212 :
213 18 : Dimensions stage1_out_dims(Dim::InternalBinCoarse, n_coarse_bin);
214 18 : fine_indices = bins_from_sizes(stage1_mapper.apply<Variable>(fine_indices),
215 : stage1_mapper.bin_sizes(stage1_out_dims));
216 18 : Dimensions fine_dims(Dim::InternalBinFine, chunk_size);
217 36 : const auto fine_output_bin_sizes =
218 : bin_detail::bin_sizes(fine_indices, scipp::index{0} * units::none,
219 : fine_dims.volume() * units::none);
220 18 : SingleStageMapper stage2_mapper(fine_dims, fine_indices,
221 : fine_output_bin_sizes);
222 :
223 18 : return std::make_unique<TwoStageMapper>(std::move(stage1_mapper),
224 36 : std::move(stage2_mapper));
225 18 : } else {
226 8441 : const auto output_bin_sizes =
227 : bin_detail::bin_sizes(indices, builder.offsets(), builder.nbin());
228 8441 : return std::make_unique<SingleStageMapper>(dims, indices, output_bin_sizes);
229 8441 : }
230 8459 : }
231 :
232 : template <class T, class Mapping>
233 25353 : auto extract_unbinned(T &array, Mapping &map) {
234 25353 : const auto dim = array.dims().inner();
235 : using Key = typename Mapping::key_type;
236 25353 : std::vector<Key> to_extract;
237 : // WARNING: Do not use `map` while extracting, `extract` invalidates it!
238 25353 : std::copy_if(map.keys_begin(), map.keys_end(), std::back_inserter(to_extract),
239 22000 : [&](const auto &key) { return !map[key].dims().contains(dim); });
240 25353 : core::Dict<Key, Variable> extracted;
241 25371 : for (const auto &key : to_extract)
242 18 : extracted.insert_or_assign(key, map.extract(key));
243 50706 : return extracted;
244 25353 : }
245 :
246 : /// Combine meta data from buffer and input data array and create final output
247 : /// data array with binned data.
248 : /// - Meta data that does not depend on the buffer dim is lifted to the output
249 : /// array.
250 : /// - Any meta data depending on rebinned dimensions is dropped since it becomes
251 : /// meaningless. Note that rebinned masks have been applied before the binning
252 : /// step.
253 : /// - If rebinning, existing meta data along unchanged dimensions is preserved.
254 : template <class Coords, class Masks, class Attrs>
255 8451 : DataArray add_metadata(const Variable &data, std::unique_ptr<Mapper> mapper,
256 : const Coords &coords, const Masks &masks,
257 : const Attrs &attrs, const std::vector<Variable> &edges,
258 : const std::vector<Variable> &groups,
259 : const std::vector<Dim> &erase) {
260 8451 : auto bin_sizes = mapper->bin_sizes();
261 8451 : auto buffer = mapper->template apply<DataArray>(data);
262 8451 : bin_sizes = squeeze(bin_sizes, erase);
263 8451 : const auto buffer_dim = buffer.dims().inner();
264 8451 : std::set<Dim> dims(erase.begin(), erase.end());
265 48962 : const auto rebinned = [&](const auto &var) {
266 17555 : for (const auto &dim : var.dims().labels())
267 16861 : if (dims.count(dim) || var.dims().contains(buffer_dim))
268 16205 : return true;
269 694 : return false;
270 : };
271 8451 : auto out_coords = extract_unbinned(buffer, buffer.coords());
272 42255 : for (const auto &c : {edges, groups})
273 31283 : for (const auto &coord : c) {
274 14381 : dims.emplace(coord.dims().inner());
275 14381 : Variable to_insert(coord);
276 14381 : to_insert.set_aligned(true);
277 14381 : out_coords.insert_or_assign(coord.dims().inner(), std::move(to_insert));
278 : }
279 25235 : for (const auto &[dim_, coord] : coords)
280 16784 : if (!rebinned(coord) && !out_coords.contains(dim_))
281 633 : out_coords.insert_or_assign(dim_, coord);
282 8451 : auto out_masks = extract_unbinned(buffer, buffer.masks());
283 8508 : for (const auto &[name, mask] : masks)
284 57 : if (!rebinned(mask))
285 6 : out_masks.insert_or_assign(name, copy(mask));
286 8451 : auto out_attrs = extract_unbinned(buffer, buffer.attrs());
287 8509 : for (const auto &[dim_, coord] : attrs)
288 58 : if (!rebinned(coord) && !out_coords.contains(dim_))
289 9 : out_attrs.insert_or_assign(dim_, coord);
290 8451 : return DataArray{bins_from_sizes(std::move(buffer), bin_sizes),
291 16902 : std::move(out_coords), std::move(out_masks),
292 33804 : std::move(out_attrs)};
293 8451 : }
294 :
295 : class TargetBinBuilder {
296 : enum class AxisAction { Group, Bin, Existing, Join };
297 :
298 : public:
299 30886 : [[nodiscard]] const Dimensions &dims() const noexcept { return m_dims; }
300 8495 : [[nodiscard]] const Variable &offsets() const noexcept { return m_offsets; }
301 32508 : [[nodiscard]] const Variable &nbin() const noexcept { return m_nbin; }
302 :
303 : /// `bin_coords` may optionally be used to provide bin-based coords, e.g., for
304 : /// data that has prior grouping but did not retain the original group coord
305 : /// for every event.
306 : template <class CoordsT, class BinCoords = Coords>
307 8464 : void build(Variable &indices, CoordsT &&coords, BinCoords &&bin_coords = {}) {
308 37234 : const auto get_coord = [&](const Dim dim) {
309 14385 : return coords.count(dim) ? coords[dim] : bin_coords.at(dim);
310 : };
311 8464 : m_offsets = makeVariable<scipp::index>(Values{0}, units::none);
312 8464 : m_nbin = dims().volume() * units::none;
313 22883 : for (const auto &[action, dim, key] : m_actions) {
314 14424 : if (action == AxisAction::Group)
315 4908 : update_indices_by_grouping(indices, get_coord(dim), key);
316 9516 : else if (action == AxisAction::Bin) {
317 9477 : const auto linspace = all(islinspace(key, dim)).template value<bool>();
318 : // When binning along an existing dim with a coord (may be edges or
319 : // not), not all input bins can map to all output bins. The array of
320 : // subbin sizes that is normally created thus contains mainly zero
321 : // entries, e.g.,:
322 : // ---1
323 : // --11
324 : // --4-
325 : // 111-
326 : // 2---
327 : //
328 : // each row corresponds to an input bin
329 : // each column corresponds to an output bin
330 : // the example is for a single rebinned dim
331 : // `-` is 0
332 : //
333 : // In practice this array of sizes can become very large (many GByte of
334 : // memory) and has to be avoided. This is not just a performance issue.
335 : // We detect this case, pre select relevant output bins, and store the
336 : // sparse array in a specialized packed format, using the helper type
337 : // SubbinSizes.
338 : // Note that there is another source of memory consumption in the
339 : // algorithm, `indices`, containing the index of the target bin for
340 : // every input event. This is unrelated and varies independently,
341 : // depending on parameters of the input.
342 27774 : if (key.ndim() == 1 && // index setup not implemented for this case
343 9457 : bin_coords.count(dim) && m_offsets.dims().empty() &&
344 25759 : bin_coords.at(dim).dims().contains(dim) &&
345 6825 : allsorted(bin_coords.at(dim), dim)) {
346 726 : const auto &bin_coord = bin_coords.at(dim);
347 726 : const bool histogram =
348 726 : bin_coord.dims()[dim] ==
349 726 : (indices.dims().contains(dim) ? indices.dims()[dim] : 1) + 1;
350 726 : auto begin =
351 726 : begin_edge(histogram ? left_edge(bin_coord) : bin_coord, key);
352 996 : auto end = histogram ? end_edge(right_edge(bin_coord), key)
353 : : begin + 2 * units::none;
354 : // When we have bin edges (of length 2) for a dimension that is not
355 : // a dimension of the input it needs to be squeezed to avoid problems
356 : // in various places later on.
357 726 : begin = squeeze(begin, std::nullopt);
358 726 : end = squeeze(end, std::nullopt);
359 726 : const auto indices_ = zip(begin, end);
360 726 : const auto inner_volume = dims().volume() / dims()[dim] * units::none;
361 : // Number of non-zero entries (per "row" above)
362 726 : m_nbin = (end - begin - 1 * units::none) * inner_volume;
363 : // Offset to first non-zero entry (in "row" above)
364 726 : m_offsets = begin * inner_volume;
365 : // Mask out any output bin edges that need not be considered since
366 : // there is no overlap between given input and output bin.
367 1452 : const auto masked_key = make_bins_no_validate(indices_, dim, key);
368 731 : update_indices_by_binning(indices, get_coord(dim), masked_key,
369 : linspace);
370 746 : } else {
371 8751 : update_indices_by_binning(indices, get_coord(dim), key, linspace);
372 : }
373 39 : } else if (action == AxisAction::Existing) {
374 : // Similar to binning along an existing dim, if a dimension is simply
375 : // kept unchanged there is a 1:1 mapping from input to output dims. We
376 : // can thus avoid storing and processing a lot of length-0 contributions
377 : // to bins.
378 : // Note that this is only possible (in this simple manner) if there are
379 : // no other actions affecting output dimensions.
380 38 : if (m_offsets.dims().empty() && m_dims[dim] == m_dims.volume()) {
381 : // Offset to output bin tracked using base offset for input bins
382 0 : m_nbin = scipp::index{1} * units::none;
383 0 : m_offsets = make_range(0, m_dims[dim], 1, dim);
384 : } else {
385 : // Offset to output bin tracked in indices for individual events
386 38 : update_indices_from_existing(indices, dim);
387 : }
388 1 : } else if (action == AxisAction::Join) {
389 : ; // target bin 0 for all
390 : }
391 : }
392 8459 : }
393 :
394 8451 : [[nodiscard]] auto edges() const noexcept {
395 8451 : std::vector<Variable> vars;
396 22870 : for (const auto &[action, dim, key] : m_actions) {
397 : static_cast<void>(dim);
398 14419 : if (action == AxisAction::Bin || action == AxisAction::Join)
399 9473 : vars.emplace_back(key);
400 : }
401 8451 : return vars;
402 : }
403 :
404 8451 : [[nodiscard]] auto groups() const noexcept {
405 8451 : std::vector<Variable> vars;
406 22870 : for (const auto &[action, dim, key] : m_actions) {
407 : static_cast<void>(dim);
408 14419 : if (action == AxisAction::Group)
409 4908 : vars.emplace_back(key);
410 : }
411 8451 : return vars;
412 : }
413 :
414 4908 : void group(const Variable &groups) {
415 4908 : const auto dim = groups.dims().inner();
416 4908 : m_dims.addInner(dim, groups.dims()[dim]);
417 4908 : m_actions.emplace_back(AxisAction::Group, dim, groups);
418 4908 : }
419 :
420 9492 : void bin(const Variable &edges) {
421 9492 : const auto dim = edges.dims().inner();
422 9492 : m_dims.addInner(dim, edges.dims()[dim] - 1);
423 9487 : m_actions.emplace_back(AxisAction::Bin, dim, edges);
424 9487 : }
425 :
426 38 : void existing(const Dim dim, const scipp::index size) {
427 38 : m_dims.addInner(dim, size);
428 38 : m_actions.emplace_back(AxisAction::Existing, dim, Variable{});
429 38 : }
430 :
431 1 : void join(const Dim dim, const Variable &coord) {
432 1 : m_dims.addInner(dim, 1);
433 3 : m_joined.emplace_back(concat(std::vector{min(coord), max(coord)}, dim));
434 1 : m_actions.emplace_back(AxisAction::Join, dim, m_joined.back());
435 1 : }
436 :
437 : // All input bins mapped to same output bin => "add" 0 everywhere
438 2706 : void erase(const Dim dim) { m_dims.addInner(dim, 1); }
439 :
440 : private:
441 : Dimensions m_dims;
442 : Variable m_offsets;
443 : Variable m_nbin;
444 : std::vector<std::tuple<AxisAction, Dim, Variable>> m_actions;
445 : std::vector<Variable> m_joined;
446 : };
447 :
448 : // Order is defined as:
449 : // 1. Erase binning from any dimensions listed in erase
450 : // 2. Any rebinned dim and dims inside the first rebinned dim, in the order of
451 : // appearance in array.
452 : // 3. All new grouped dims.
453 : // 4. All new binned dims.
454 : template <class Coords>
455 8458 : auto axis_actions(const Variable &data, const Coords &coords,
456 : const std::vector<Variable> &edges,
457 : const std::vector<Variable> &groups,
458 : const std::vector<Dim> &erase) {
459 8458 : TargetBinBuilder builder;
460 11143 : for (const auto dim : erase) {
461 2685 : builder.erase(dim);
462 : }
463 :
464 16916 : constexpr auto get_dims = [](const auto &coords_) {
465 16916 : Dimensions dims;
466 31308 : for (const auto &coord : coords_)
467 14392 : dims.addInner(coord.dims().inner(), 1);
468 16916 : return dims;
469 0 : };
470 8458 : auto edges_dims = get_dims(edges);
471 8458 : auto groups_dims = get_dims(groups);
472 : // If we rebin a dimension that is not the inner dimension of the input, we
473 : // also need to handle bin contents from all dimensions inside the rebinned
474 : // one, even if the grouping/binning along this dimension is unchanged.
475 8458 : bool rebin = false;
476 8458 : const auto dims = data.dims();
477 16437 : for (const auto dim : dims.labels()) {
478 7994 : if (edges_dims.contains(dim) || groups_dims.contains(dim))
479 831 : rebin = true;
480 7994 : if (groups_dims.contains(dim)) {
481 6 : builder.group(groups[groups_dims.index(dim)]);
482 7988 : } else if (edges_dims.contains(dim)) {
483 825 : builder.bin(edges[edges_dims.index(dim)]);
484 7163 : } else if (rebin && !builder.dims().contains(dim)) {
485 : // If the dim is erased then the builder contains it and we skip 'rebin'
486 47 : if (coords.count(dim) && coords.at(dim).dims().ndim() != 1)
487 20 : throw except::DimensionError(
488 10 : "2-D coordinate " + to_string(coords.at(dim)) +
489 : " conflicting with (re)bin of outer dimension. Try specifying new "
490 : "aligned (1-D) edges for dimension '" +
491 : to_string(dim) + "' with the `edges` option of `bin`.");
492 37 : builder.existing(dim, data.dims()[dim]);
493 : }
494 : }
495 13343 : for (const auto &group : groups)
496 4900 : if (!dims.contains(group.dims().inner()))
497 4894 : builder.group(group);
498 17920 : for (const auto &edge : edges)
499 9477 : if (!dims.contains(edge.dims().inner()))
500 8667 : builder.bin(edge);
501 16886 : return builder;
502 8503 : }
503 :
504 : template <class T> class TargetBins {
505 : public:
506 2331 : TargetBins(const Variable &var, const Dimensions &dims) {
507 : // In some cases all events in an input bin map to the same output, but
508 : // right now bin<> cannot handle this and requires target bin indices for
509 : // every bin element.
510 2331 : const auto &[begin_end, dim, buffer] = var.constituents<T>();
511 2331 : m_target_bins_buffer =
512 2331 : (dims.volume() > std::numeric_limits<int32_t>::max())
513 4662 : ? makeVariable<int64_t>(buffer.dims(), units::none)
514 : : makeVariable<int32_t>(buffer.dims(), units::none);
515 2331 : m_target_bins = make_bins_no_validate(begin_end, dim, m_target_bins_buffer);
516 2331 : }
517 2331 : auto &operator*() noexcept { return m_target_bins; }
518 2326 : Variable &&release() noexcept { return std::move(m_target_bins); }
519 :
520 : private:
521 : Variable m_target_bins_buffer;
522 : Variable m_target_bins;
523 : };
524 :
525 : } // namespace
526 :
527 : /// Reduce a dimension by concatenating bin contents of all bins along a
528 : /// dimension.
529 : ///
530 : /// This is used to implement `concatenate(var, dim)`.
531 8 : template <class T> Variable concat_bins(const Variable &var, const Dim dim) {
532 8 : TargetBinBuilder builder;
533 8 : builder.erase(dim);
534 8 : TargetBins<T> target_bins(var, builder.dims());
535 :
536 8 : builder.build(*target_bins, std::map<Dim, Variable>{});
537 8 : auto mapper = make_mapper(target_bins.release(), builder);
538 8 : auto buffer = mapper->template apply<T>(var);
539 8 : auto bin_sizes = mapper->bin_sizes();
540 8 : bin_sizes = squeeze(bin_sizes, scipp::span{&dim, 1});
541 16 : return bins_from_sizes(std::move(buffer), bin_sizes);
542 8 : }
543 : template Variable concat_bins<Variable>(const Variable &, const Dim);
544 : template Variable concat_bins<DataArray>(const Variable &, const Dim);
545 :
546 : /// Implementation of groupby.bins.concatenate
547 : ///
548 : /// If `array` has unaligned, i.e., not 1-D, coords conflicting with the
549 : /// reduction dimension, any binning along the dimensions of the conflicting
550 : /// coords is removed. It is replaced by a single bin along that dimension, with
551 : /// bin edges given my min and max of the old coord.
552 13 : DataArray groupby_concat_bins(const DataArray &array, const Variable &edges,
553 : const Variable &groups, const Dim reductionDim) {
554 13 : TargetBinBuilder builder;
555 13 : if (edges.is_valid())
556 0 : builder.bin(edges);
557 13 : if (groups.is_valid())
558 8 : builder.group(groups);
559 13 : builder.erase(reductionDim);
560 13 : const auto dims = array.dims();
561 28 : for (const auto &dim : dims.labels())
562 15 : if (array.meta().contains(dim)) {
563 17 : if (array.meta()[dim].dims().ndim() != 1 &&
564 9 : array.meta()[dim].dims().contains(reductionDim))
565 1 : builder.join(dim, array.meta()[dim]);
566 7 : else if (dim != reductionDim)
567 1 : builder.existing(dim, array.dims()[dim]);
568 : }
569 :
570 : const auto masked =
571 13 : hide_masked(array.data(), array.masks(), builder.dims().labels());
572 13 : TargetBins<DataArray> target_bins(masked, builder.dims());
573 13 : builder.build(*target_bins, array.meta());
574 : // Note: Unlike in the other cases below we do not call
575 : // `drop_grouped_event_coords` here. Grouping is based on a bin-coord rather
576 : // than event-coord so we do not touch the latter.
577 26 : return add_metadata(masked, make_mapper(target_bins.release(), builder),
578 13 : array.coords(), array.masks(), array.attrs(),
579 52 : builder.edges(), builder.groups(), {reductionDim});
580 13 : }
581 :
582 : namespace {
583 8470 : void validate_bin_args(const DataArray &array,
584 : const std::vector<Variable> &edges,
585 : const std::vector<Variable> &groups) {
586 2325 : if ((is_bins(array) &&
587 16940 : std::get<2>(array.data().constituents<DataArray>()).dims().ndim() > 1) ||
588 8470 : (!is_bins(array) && array.dims().ndim() > 1)) {
589 2 : throw except::BinnedDataError(
590 : "Binning is only implemented for 1-dimensional data. Consider using "
591 4 : "groupby, it might be able to do what you need.");
592 : }
593 8468 : if (edges.empty() && groups.empty())
594 5 : throw std::invalid_argument(
595 : "Arguments 'edges' and 'groups' of scipp.bin are "
596 10 : "both empty. At least one must be set.");
597 17955 : for (const auto &edge : edges) {
598 9497 : const auto dim = edge.dims().inner();
599 9497 : if (edge.dims()[dim] < 2)
600 10 : throw except::BinEdgeError("Not enough bin edges in dim " +
601 15 : to_string(dim) + ". Need at least 2.");
602 9492 : if (!allsorted(edge, dim))
603 0 : throw except::BinEdgeError("Bin edges in dim " + to_string(dim) +
604 0 : " must be sorted.");
605 : }
606 8458 : }
607 :
608 8438 : auto drop_grouped_event_coords(const Variable &data,
609 : const std::vector<Variable> &groups) {
610 8438 : auto [indices, dim, buffer] = data.constituents<DataArray>();
611 : // Do not preserve event coords used for grouping since this is redundant
612 : // information and leads to waste of memory and compute in follow-up
613 : // operations.
614 13338 : for (const auto &var : groups)
615 4900 : if (buffer.coords().contains(var.dims().inner()))
616 4898 : buffer.coords().erase(var.dims().inner());
617 16876 : return make_bins_no_validate(indices, dim, buffer);
618 8438 : }
619 :
620 : } // namespace
621 :
622 8470 : DataArray bin(const DataArray &array, const std::vector<Variable> &edges,
623 : const std::vector<Variable> &groups,
624 : const std::vector<Dim> &erase) {
625 8470 : validate_bin_args(array, edges, groups);
626 8458 : const auto &data = array.data();
627 8458 : const auto &coords = array.coords();
628 8458 : const auto &meta = array.meta();
629 8458 : const auto &masks = array.masks();
630 8458 : const auto &attrs = array.attrs();
631 8458 : if (data.dtype() == dtype<core::bin<DataArray>>) {
632 2325 : return bin(data, coords, masks, attrs, edges, groups, erase);
633 : } else {
634 : // Pretend existing binning along outermost binning dim to enable threading
635 : const auto tmp = pretend_bins_for_threading(
636 10822 : array, groups.empty() ? edges.front().dims().inner()
637 10822 : : groups.front().dims().inner());
638 : auto target_bins_buffer =
639 6133 : (data.dims().volume() > std::numeric_limits<int32_t>::max())
640 : ? makeVariable<int64_t>(data.dims(), units::none)
641 6133 : : makeVariable<int32_t>(data.dims(), units::none);
642 6133 : auto builder = axis_actions(data, meta, edges, groups, erase);
643 6133 : builder.build(target_bins_buffer, meta);
644 : auto target_bins = make_bins_no_validate(
645 12266 : tmp.bin_indices(), data.dims().inner(), target_bins_buffer);
646 12266 : return add_metadata(drop_grouped_event_coords(tmp, groups),
647 12266 : make_mapper(std::move(target_bins), builder), coords,
648 18399 : masks, attrs, builder.edges(), builder.groups(), erase);
649 6133 : }
650 8458 : }
651 :
652 : /// Implementation of a generic binning algorithm.
653 : ///
654 : /// The overall approach of this is as follows:
655 : /// 1. Find target bin index for every input event (bin entry)
656 : /// 2. Next, we conceptually want to do
657 : /// for(i < events.size())
658 : /// target_bin[bin_index[i]].push_back(events[i])
659 : /// However, scipp's data layout for event data is a single 1-D array, and
660 : /// not a list of vector, i.e., the conceptual line above does not work
661 : /// directly. We need to obtain offsets into the 1-D array first, roughly:
662 : /// bin_sizes = count(bin_index) // number of events per target bin
663 : /// bin_offset = cumsum(bin_sizes) - bin_sizes
664 : /// 3. Copy from input to output bin, based on offset
665 : template <class Coords, class Masks, class Attrs>
666 2325 : DataArray bin(const Variable &data, const Coords &coords, const Masks &masks,
667 : const Attrs &attrs, const std::vector<Variable> &edges,
668 : const std::vector<Variable> &groups,
669 : const std::vector<Dim> &erase) {
670 2325 : const auto meta = attrs.merge_from(coords);
671 2325 : auto builder = axis_actions(data, meta, edges, groups, erase);
672 2310 : const auto masked = hide_masked(data, masks, builder.dims().labels());
673 2310 : TargetBins<DataArray> target_bins(masked, builder.dims());
674 2325 : builder.build(*target_bins, bins_view<DataArray>(masked).meta(), meta);
675 : return add_metadata(drop_grouped_event_coords(masked, groups),
676 : make_mapper(target_bins.release(), builder), coords,
677 4610 : masks, attrs, builder.edges(), builder.groups(), erase);
678 2340 : }
679 :
680 : } // namespace scipp::dataset
|