LCOV - code coverage report
Current view: top level - dataset - bin.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 339 345 98.3 %
Date: 2024-04-28 01:25:40 Functions: 54 55 98.2 %

          Line data    Source code
       1             : // SPDX-License-Identifier: BSD-3-Clause
       2             : // Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
       3             : /// @file
       4             : /// @author Simon Heybrock
       5             : #include <numeric>
       6             : #include <set>
       7             : 
       8             : #include "scipp/variable/astype.h"
       9             : #include "scipp/variable/bin_detail.h"
      10             : #include "scipp/variable/bin_util.h"
      11             : #include "scipp/variable/bins.h"
      12             : #include "scipp/variable/cumulative.h"
      13             : #include "scipp/variable/reduction.h"
      14             : #include "scipp/variable/shape.h"
      15             : #include "scipp/variable/subspan_view.h"
      16             : #include "scipp/variable/variable_factory.h"
      17             : 
      18             : #include "scipp/dataset/bin.h"
      19             : #include "scipp/dataset/bins.h"
      20             : #include "scipp/dataset/bins_view.h"
      21             : #include "scipp/dataset/except.h"
      22             : 
      23             : #include "bin_detail.h"
      24             : #include "bins_util.h"
      25             : #include "dataset_operations_common.h"
      26             : 
      27             : using namespace scipp::variable::bin_detail;
      28             : using namespace scipp::dataset::bin_detail;
      29             : 
      30             : namespace scipp::dataset {
      31             : 
      32             : namespace {
      33             : 
      34             : template <class T>
      35        8377 : Variable bins_from_sizes(T &&content, const Variable &bin_sizes) {
      36        8377 :   const auto end = cumsum(bin_sizes);
      37        8377 :   const auto buffer_dim = content.dims().inner();
      38             :   return make_bins(zip(end - bin_sizes, end), buffer_dim,
      39       16754 :                    std::forward<T>(content));
      40        8377 : }
      41             : 
      42        8359 : template <class Builder> bool use_two_stage_remap(const Builder &bld) {
      43       16069 :   return bld.nbin().dims().empty() &&
      44        7710 :          bld.nbin().template value<scipp::index>() == bld.dims().volume() &&
      45             :          // empirically determined crossover point (approx.)
      46        7698 :          bld.nbin().template value<scipp::index>() > 16 * 1024 &&
      47       16087 :          bld.offsets().dims().empty() &&
      48        8377 :          bld.offsets().template value<scipp::index>() == 0;
      49             : }
      50             : class Mapper {
      51             : public:
      52        8431 :   virtual ~Mapper() = default;
      53        8377 :   template <class T> T apply(const Variable &data) {
      54       68587 :     const auto maybe_bin = [this](const auto &var) {
      55       30114 :       return is_bins(var) ? apply_to_variable(var) : copy(var);
      56             :     };
      57             :     if constexpr (std::is_same_v<T, Variable>)
      58          38 :       return maybe_bin(data);
      59             :     else
      60        8358 :       return dataset::transform(bins_view<T>(data), maybe_bin);
      61             :   }
      62             : 
      63             :   virtual Variable bin_sizes(
      64             :       const std::optional<Dimensions> &dims_override = std::nullopt) const = 0;
      65             :   virtual Variable apply_to_variable(const Variable &var,
      66             :                                      Variable &&out = {}) const = 0;
      67             : };
      68             : 
      69             : class SingleStageMapper : public Mapper {
      70             : public:
      71        8377 :   SingleStageMapper(const Dimensions &dims, const Variable &indices,
      72             :                     const Variable &output_bin_sizes)
      73        8377 :       : m_dims(dims), m_indices(indices), m_output_bin_sizes(output_bin_sizes) {
      74             :     // Setup offsets within output bins, for every input bin. If rebinning
      75             :     // occurs along a dimension each output bin sees contributions from all
      76             :     // input bins along that dim.
      77        8377 :     m_offsets = copy(m_output_bin_sizes);
      78        8377 :     fill_zeros(m_offsets);
      79             :     // Not using cumsum along *all* dims, since some outer dims may be left
      80             :     // untouched (no rebin).
      81        8377 :     std::vector<std::pair<Dim, scipp::index>> strategy;
      82       24654 :     for (const auto dim : m_indices.dims())
      83        7900 :       if (dims.contains(dim))
      84        6955 :         strategy.emplace_back(dim, m_indices.dims()[dim]);
      85             :     // To avoid excessive memory consumption in intermediate results for
      86             :     // `output_bin_sizes` (in the loop below, computing sums and cumsums) we
      87             :     // need to ensure to handle the longest dimensions first,
      88        8377 :     std::sort(strategy.begin(), strategy.end(),
      89         192 :               [](auto &&a, auto &&b) { return a.second > b.second; });
      90       15332 :     for (const auto &item : strategy) {
      91        6955 :       const auto dim = item.first;
      92        6955 :       subbin_sizes_add_intersection(
      93        6955 :           m_offsets, subbin_sizes_cumsum_exclusive(m_output_bin_sizes, dim));
      94        6955 :       m_output_bin_sizes = sum(m_output_bin_sizes, dim);
      95             :     }
      96             :     // cumsum with bin dimension is last, since this corresponds to different
      97             :     // output bins, whereas the cumsum above handled different subbins of same
      98             :     // output bin, i.e., contributions of different input bins to some output
      99             :     // bin.
     100        8377 :     subbin_sizes_add_intersection(
     101        8377 :         m_offsets, cumsum_exclusive_subbin_sizes(m_output_bin_sizes));
     102        8377 :     const auto filtered_input_bin_size = sum_subbin_sizes(m_output_bin_sizes);
     103        8377 :     auto end = cumsum(filtered_input_bin_size);
     104        8377 :     m_total_size = end.dims().volume() > 0
     105        8377 :                        ? end.values<scipp::index>().as_span().back()
     106             :                        : 0;
     107       16754 :     end = broadcast(end,
     108        8377 :                     m_indices.dims()); // required for some cases of rebinning
     109        8377 :     m_filtered_input_bin_ranges = zip(end - filtered_input_bin_size, end);
     110        8377 :   }
     111             : 
     112       30146 :   Variable apply_to_variable(const Variable &var,
     113             :                              Variable &&out = {}) const override {
     114       30146 :     const auto &[input_indices, dim, content] = var.constituents<Variable>();
     115             :     static_cast<void>(input_indices);
     116             :     // The optional `out` argument is used to avoid creating a temporary buffer
     117             :     // when TwoStageMapper is applied to a series of columns of matching dtype.
     118       30178 :     if (!out.is_valid() || out.dtype() != content.dtype() ||
     119          32 :         out.has_variances() != content.has_variances())
     120       30114 :       out = resize_default_init(content, dim, m_total_size);
     121             :     else
     122          32 :       out.setUnit(content.unit());
     123       30146 :     auto out_subspans = subspan_view(out, dim, m_filtered_input_bin_ranges);
     124       30146 :     map_to_bins(out_subspans, as_subspan_view(var), m_offsets,
     125       60292 :                 as_subspan_view(std::as_const(m_indices)));
     126       60292 :     return out;
     127       30146 :   }
     128             : 
     129        8377 :   Variable bin_sizes(const std::optional<Dimensions> &dims_override =
     130             :                          std::nullopt) const override {
     131             :     // During mapping of values to the output layout, the output was viewed with
     132             :     // same bin index ranges as input. Now setup the desired final bin indices.
     133        8377 :     const auto dims = dims_override.value_or(m_dims);
     134        8377 :     auto output_dims = merge(m_output_bin_sizes.dims(), dims);
     135             :     return makeVariable<scipp::index>(
     136             :         output_dims, units::none,
     137       25131 :         Values(flatten_subbin_sizes(m_output_bin_sizes, dims.volume())));
     138        8377 :   }
     139             : 
     140             :   Dimensions m_dims;
     141             :   Variable m_indices;
     142             : 
     143             : private:
     144             :   Variable m_output_bin_sizes;
     145             :   Variable m_offsets;
     146             :   Variable m_filtered_input_bin_ranges;
     147             :   scipp::index m_total_size;
     148             : };
     149             : 
     150             : class TwoStageMapper : public Mapper {
     151             : public:
     152          18 :   TwoStageMapper(SingleStageMapper &&stage1_mapper,
     153             :                  SingleStageMapper &&stage2_mapper)
     154          18 :       : m_stage1_mapper(std::move(stage1_mapper)),
     155          36 :         m_stage2_mapper(std::move(stage2_mapper)) {}
     156             : 
     157          50 :   Variable apply_to_variable(const Variable &var,
     158             :                              Variable && = {}) const override {
     159             :     // Note how by having the virtual call on the Variable level we avoid
     160             :     // making the temporary buffer for the whole content buffer (typically a
     161             :     // DataArray), but instead just for one of the content buffer's columns
     162             :     // at a time.
     163             :     // As a further optimization we can reuse the content buffer. With the
     164             :     // current implementation of handling dtype this will only work if the dtype
     165             :     // is the same as that of the previously processed column. Otherwise a new
     166             :     // buffer is created.
     167          50 :     m_buffer = m_stage1_mapper.apply_to_variable(var, std::move(m_buffer));
     168          50 :     Variable indices = m_stage2_mapper.m_indices.bin_indices();
     169             :     return m_stage2_mapper.apply_to_variable(
     170         150 :         make_bins_no_validate(indices, m_buffer.dims().inner(), m_buffer));
     171          50 :   }
     172             : 
     173          18 :   Variable bin_sizes(const std::optional<Dimensions> &dims_override =
     174             :                          std::nullopt) const override {
     175          18 :     const auto dims = dims_override.value_or(m_stage1_mapper.m_dims);
     176          18 :     return fold(
     177          54 :         flatten(m_stage2_mapper.bin_sizes(),
     178          36 :                 std::vector<Dim>{Dim::InternalBinCoarse, Dim::InternalBinFine},
     179             :                 Dim::InternalSubbin)
     180          36 :             .slice({Dim::InternalSubbin, 0, dims.volume()}),
     181          36 :         Dim::InternalSubbin, dims);
     182          18 :   }
     183             : 
     184             : private:
     185             :   SingleStageMapper m_stage1_mapper;
     186             :   SingleStageMapper m_stage2_mapper;
     187             :   mutable Variable m_buffer;
     188             : };
     189             : 
     190             : template <class Builder>
     191        8359 : std::unique_ptr<Mapper> make_mapper(Variable &&indices,
     192             :                                     const Builder &builder) {
     193        8359 :   const auto dims = builder.dims();
     194        8359 :   if (use_two_stage_remap(builder)) {
     195             :     // There are many output bins. Mapping directly would lead to excessive
     196             :     // number of cache misses as well as potential false-sharing problems
     197             :     // between threads. We therefore map in two stages. This requires an
     198             :     // additional temporary buffer with size given by the number of events,
     199             :     // but has proven to be faster in practice.
     200          18 :     scipp::index chunk_size =
     201          18 :         floor(sqrt(builder.nbin().template value<scipp::index>()));
     202          18 :     const auto chunk = astype(scipp::index{chunk_size} * units::none,
     203          18 :                               indices.bin_buffer<Variable>().dtype());
     204          18 :     Variable fine_indices(std::move(indices));
     205          18 :     auto indices_ = floor_divide(fine_indices, chunk);
     206          18 :     fine_indices %= chunk;
     207          18 :     const auto n_coarse_bin = dims.volume() / chunk_size + 1;
     208             : 
     209          18 :     Variable output_bin_sizes = bin_detail::bin_sizes(
     210             :         indices_, builder.offsets(), n_coarse_bin * units::none);
     211          18 :     SingleStageMapper stage1_mapper(dims, indices_, output_bin_sizes);
     212             : 
     213          18 :     Dimensions stage1_out_dims(Dim::InternalBinCoarse, n_coarse_bin);
     214          18 :     fine_indices = bins_from_sizes(stage1_mapper.apply<Variable>(fine_indices),
     215             :                                    stage1_mapper.bin_sizes(stage1_out_dims));
     216          18 :     Dimensions fine_dims(Dim::InternalBinFine, chunk_size);
     217          36 :     const auto fine_output_bin_sizes =
     218             :         bin_detail::bin_sizes(fine_indices, scipp::index{0} * units::none,
     219             :                               fine_dims.volume() * units::none);
     220          18 :     SingleStageMapper stage2_mapper(fine_dims, fine_indices,
     221             :                                     fine_output_bin_sizes);
     222             : 
     223          18 :     return std::make_unique<TwoStageMapper>(std::move(stage1_mapper),
     224          36 :                                             std::move(stage2_mapper));
     225          18 :   } else {
     226        8341 :     const auto output_bin_sizes =
     227             :         bin_detail::bin_sizes(indices, builder.offsets(), builder.nbin());
     228        8341 :     return std::make_unique<SingleStageMapper>(dims, indices, output_bin_sizes);
     229        8341 :   }
     230        8359 : }
     231             : 
     232             : template <class T, class Mapping>
     233       25053 : auto extract_unbinned(T &array, Mapping &map) {
     234       25053 :   const auto dim = array.dims().inner();
     235             :   using Key = typename Mapping::key_type;
     236       25053 :   std::vector<Key> to_extract;
     237             :   // WARNING: Do not use `map` while extracting, `extract` invalidates it!
     238       25053 :   std::copy_if(map.keys_begin(), map.keys_end(), std::back_inserter(to_extract),
     239       21730 :                [&](const auto &key) { return !map[key].dims().contains(dim); });
     240       25053 :   core::Dict<Key, Variable> extracted;
     241       25071 :   for (const auto &key : to_extract)
     242          18 :     extracted.insert_or_assign(key, map.extract(key));
     243       50106 :   return extracted;
     244       25053 : }
     245             : 
     246             : /// Combine meta data from buffer and input data array and create final output
     247             : /// data array with binned data.
     248             : /// - Meta data that does not depend on the buffer dim is lifted to the output
     249             : ///   array.
     250             : /// - Any meta data depending on rebinned dimensions is dropped since it becomes
     251             : ///   meaningless. Note that rebinned masks have been applied before the binning
     252             : ///   step.
     253             : /// - If rebinning, existing meta data along unchanged dimensions is preserved.
     254             : template <class Coords, class Masks, class Attrs>
     255        8351 : DataArray add_metadata(const Variable &data, std::unique_ptr<Mapper> mapper,
     256             :                        const Coords &coords, const Masks &masks,
     257             :                        const Attrs &attrs, const std::vector<Variable> &edges,
     258             :                        const std::vector<Variable> &groups,
     259             :                        const std::vector<Dim> &erase) {
     260        8351 :   auto bin_sizes = mapper->bin_sizes();
     261        8351 :   auto buffer = mapper->template apply<DataArray>(data);
     262        8351 :   bin_sizes = squeeze(bin_sizes, erase);
     263        8351 :   const auto buffer_dim = buffer.dims().inner();
     264        8351 :   std::set<Dim> dims(erase.begin(), erase.end());
     265       57380 :   const auto rebinned = [&](const auto &var) {
     266       17273 :     for (const auto &dim : var.dims().labels())
     267       16641 :       if (dims.count(dim) || var.dims().contains(buffer_dim))
     268       16046 :         return true;
     269         632 :     return false;
     270             :   };
     271        8351 :   auto out_coords = extract_unbinned(buffer, buffer.coords());
     272       41755 :   for (const auto &c : {edges, groups})
     273       30933 :     for (const auto &coord : c) {
     274       14231 :       dims.emplace(coord.dims().inner());
     275       14231 :       Variable to_insert(coord);
     276       14231 :       to_insert.set_aligned(true);
     277       14231 :       out_coords.insert_or_assign(coord.dims().inner(), std::move(to_insert));
     278             :     }
     279       24908 :   for (const auto &[dim_, coord] : coords)
     280       16557 :     if (!rebinned(coord) && !out_coords.contains(dim_))
     281         583 :       out_coords.insert_or_assign(dim_, coord);
     282        8351 :   auto out_masks = extract_unbinned(buffer, buffer.masks());
     283        8414 :   for (const auto &[name, mask] : masks)
     284          63 :     if (!rebinned(mask))
     285           6 :       out_masks.insert_or_assign(name, copy(mask));
     286        8351 :   auto out_attrs = extract_unbinned(buffer, buffer.attrs());
     287        8409 :   for (const auto &[dim_, coord] : attrs)
     288          58 :     if (!rebinned(coord) && !out_coords.contains(dim_))
     289           9 :       out_attrs.insert_or_assign(dim_, coord);
     290        8351 :   return DataArray{bins_from_sizes(std::move(buffer), bin_sizes),
     291       16702 :                    std::move(out_coords), std::move(out_masks),
     292       33404 :                    std::move(out_attrs)};
     293        8351 : }
     294             : 
     295             : class TargetBinBuilder {
     296             :   enum class AxisAction { Group, Bin, Existing, Join };
     297             : 
     298             : public:
     299       30481 :   [[nodiscard]] const Dimensions &dims() const noexcept { return m_dims; }
     300        8395 :   [[nodiscard]] const Variable &offsets() const noexcept { return m_offsets; }
     301       32108 :   [[nodiscard]] const Variable &nbin() const noexcept { return m_nbin; }
     302             : 
     303             :   /// `bin_coords` may optionally be used to provide bin-based coords, e.g., for
     304             :   /// data that has prior grouping but did not retain the original group coord
     305             :   /// for every event.
     306             :   template <class CoordsT, class BinCoords = Coords>
     307        8364 :   void build(Variable &indices, CoordsT &&coords, BinCoords &&bin_coords = {}) {
     308       36834 :     const auto get_coord = [&](const Dim dim) {
     309       14235 :       return coords.count(dim) ? coords[dim] : bin_coords.at(dim);
     310             :     };
     311        8364 :     m_offsets = makeVariable<scipp::index>(Values{0}, units::none);
     312        8364 :     m_nbin = dims().volume() * units::none;
     313       22634 :     for (const auto &[action, dim, key] : m_actions) {
     314       14275 :       if (action == AxisAction::Group)
     315        4884 :         update_indices_by_grouping(indices, get_coord(dim), key);
     316        9391 :       else if (action == AxisAction::Bin) {
     317        9351 :         const auto linspace = all(islinspace(key, dim)).template value<bool>();
     318             :         // When binning along an existing dim with a coord (may be edges or
     319             :         // not), not all input bins can map to all output bins. The array of
     320             :         // subbin sizes that is normally created thus contains mainly zero
     321             :         // entries, e.g.,:
     322             :         // ---1
     323             :         // --11
     324             :         // --4-
     325             :         // 111-
     326             :         // 2---
     327             :         //
     328             :         // each row corresponds to an input bin
     329             :         // each column corresponds to an output bin
     330             :         // the example is for a single rebinned dim
     331             :         // `-` is 0
     332             :         //
     333             :         // In practice this array of sizes can become very large (many GByte of
     334             :         // memory) and has to be avoided. This is not just a performance issue.
     335             :         // We detect this case, pre select relevant output bins, and store the
     336             :         // sparse array in a specialized packed format, using the helper type
     337             :         // SubbinSizes.
     338             :         // Note that there is another source of memory consumption in the
     339             :         // algorithm, `indices`, containing the index of the target bin for
     340             :         // every input event. This is unrelated and varies independently,
     341             :         // depending on parameters of the input.
     342       27372 :         if (key.ndim() == 1 && // index setup not implemented for this case
     343        9331 :             bin_coords.count(dim) && m_offsets.dims().empty() &&
     344       25415 :             bin_coords.at(dim).dims().contains(dim) &&
     345        6733 :             allsorted(bin_coords.at(dim), dim)) {
     346         726 :           const auto &bin_coord = bin_coords.at(dim);
     347         726 :           const bool histogram =
     348         726 :               bin_coord.dims()[dim] ==
     349         726 :               (indices.dims().contains(dim) ? indices.dims()[dim] : 1) + 1;
     350         726 :           auto begin =
     351         726 :               begin_edge(histogram ? left_edge(bin_coord) : bin_coord, key);
     352         998 :           auto end = histogram ? end_edge(right_edge(bin_coord), key)
     353             :                                : begin + 2 * units::none;
     354             :           // When we have bin edges (of length 2) for a dimension that is not
     355             :           // a dimension of the input it needs to be squeezed to avoid problems
     356             :           // in various places later on.
     357         726 :           begin = squeeze(begin, std::nullopt);
     358         726 :           end = squeeze(end, std::nullopt);
     359         726 :           const auto indices_ = zip(begin, end);
     360         726 :           const auto inner_volume = dims().volume() / dims()[dim] * units::none;
     361             :           // Number of non-zero entries (per "row" above)
     362         726 :           m_nbin = (end - begin - 1 * units::none) * inner_volume;
     363             :           // Offset to first non-zero entry (in "row" above)
     364         726 :           m_offsets = begin * inner_volume;
     365             :           // Mask out any output bin edges that need not be considered since
     366             :           // there is no overlap between given input and output bin.
     367        1452 :           const auto masked_key = make_bins_no_validate(indices_, dim, key);
     368         731 :           update_indices_by_binning(indices, get_coord(dim), masked_key,
     369             :                                     linspace);
     370         746 :         } else {
     371        8625 :           update_indices_by_binning(indices, get_coord(dim), key, linspace);
     372             :         }
     373          40 :       } else if (action == AxisAction::Existing) {
     374             :         // Similar to binning along an existing dim, if a dimension is simply
     375             :         // kept unchanged there is a 1:1 mapping from input to output dims. We
     376             :         // can thus avoid storing and processing a lot of length-0 contributions
     377             :         // to bins.
     378             :         // Note that this is only possible (in this simple manner) if there are
     379             :         // no other actions affecting output dimensions.
     380          39 :         if (m_offsets.dims().empty() && m_dims[dim] == m_dims.volume()) {
     381             :           // Offset to output bin tracked using base offset for input bins
     382           0 :           m_nbin = scipp::index{1} * units::none;
     383           0 :           m_offsets = make_range(0, m_dims[dim], 1, dim);
     384             :         } else {
     385             :           // Offset to output bin tracked in indices for individual events
     386          39 :           update_indices_from_existing(indices, dim);
     387             :         }
     388           1 :       } else if (action == AxisAction::Join) {
     389             :         ; // target bin 0 for all
     390             :       }
     391             :     }
     392        8359 :   }
     393             : 
     394        8351 :   [[nodiscard]] auto edges() const noexcept {
     395        8351 :     std::vector<Variable> vars;
     396       22621 :     for (const auto &[action, dim, key] : m_actions) {
     397             :       static_cast<void>(dim);
     398       14270 :       if (action == AxisAction::Bin || action == AxisAction::Join)
     399        9347 :         vars.emplace_back(key);
     400             :     }
     401        8351 :     return vars;
     402             :   }
     403             : 
     404        8351 :   [[nodiscard]] auto groups() const noexcept {
     405        8351 :     std::vector<Variable> vars;
     406       22621 :     for (const auto &[action, dim, key] : m_actions) {
     407             :       static_cast<void>(dim);
     408       14270 :       if (action == AxisAction::Group)
     409        4884 :         vars.emplace_back(key);
     410             :     }
     411        8351 :     return vars;
     412             :   }
     413             : 
     414        4884 :   void group(const Variable &groups) {
     415        4884 :     const auto dim = groups.dims().inner();
     416        4884 :     m_dims.addInner(dim, groups.dims()[dim]);
     417        4884 :     m_actions.emplace_back(AxisAction::Group, dim, groups);
     418        4884 :   }
     419             : 
     420        9366 :   void bin(const Variable &edges) {
     421        9366 :     const auto dim = edges.dims().inner();
     422        9366 :     m_dims.addInner(dim, edges.dims()[dim] - 1);
     423        9361 :     m_actions.emplace_back(AxisAction::Bin, dim, edges);
     424        9361 :   }
     425             : 
     426          39 :   void existing(const Dim dim, const scipp::index size) {
     427          39 :     m_dims.addInner(dim, size);
     428          39 :     m_actions.emplace_back(AxisAction::Existing, dim, Variable{});
     429          39 :   }
     430             : 
     431           1 :   void join(const Dim dim, const Variable &coord) {
     432           1 :     m_dims.addInner(dim, 1);
     433           3 :     m_joined.emplace_back(concat(std::vector{min(coord), max(coord)}, dim));
     434           1 :     m_actions.emplace_back(AxisAction::Join, dim, m_joined.back());
     435           1 :   }
     436             : 
     437             :   // All input bins mapped to same output bin => "add" 0 everywhere
     438          70 :   void erase(const Dim dim) { m_dims.addInner(dim, 1); }
     439             : 
     440             : private:
     441             :   Dimensions m_dims;
     442             :   Variable m_offsets;
     443             :   Variable m_nbin;
     444             :   std::vector<std::tuple<AxisAction, Dim, Variable>> m_actions;
     445             :   std::vector<Variable> m_joined;
     446             : };
     447             : 
     448             : // Order is defined as:
     449             : // 1. Erase binning from any dimensions listed in erase
     450             : // 2. Any rebinned dim and dims inside the first rebinned dim, in the order of
     451             : // appearance in array.
     452             : // 3. All new grouped dims.
     453             : // 4. All new binned dims.
     454             : template <class Coords>
     455        8358 : auto axis_actions(const Variable &data, const Coords &coords,
     456             :                   const std::vector<Variable> &edges,
     457             :                   const std::vector<Variable> &groups,
     458             :                   const std::vector<Dim> &erase) {
     459        8358 :   TargetBinBuilder builder;
     460        8407 :   for (const auto dim : erase) {
     461          49 :     builder.erase(dim);
     462             :   }
     463             : 
     464       16716 :   constexpr auto get_dims = [](const auto &coords_) {
     465       16716 :     Dimensions dims;
     466       30958 :     for (const auto &coord : coords_)
     467       14242 :       dims.addInner(coord.dims().inner(), 1);
     468       16716 :     return dims;
     469           0 :   };
     470        8358 :   auto edges_dims = get_dims(edges);
     471        8358 :   auto groups_dims = get_dims(groups);
     472             :   // If we rebin a dimension that is not the inner dimension of the input, we
     473             :   // also need to handle bin contents from all dimensions inside the rebinned
     474             :   // one, even if the grouping/binning along this dimension is unchanged.
     475        8358 :   bool rebin = false;
     476        8358 :   const auto dims = data.dims();
     477       16226 :   for (const auto dim : dims.labels()) {
     478        7883 :     if (edges_dims.contains(dim) || groups_dims.contains(dim))
     479         824 :       rebin = true;
     480        7883 :     if (groups_dims.contains(dim)) {
     481           6 :       builder.group(groups[groups_dims.index(dim)]);
     482        7877 :     } else if (edges_dims.contains(dim)) {
     483         818 :       builder.bin(edges[edges_dims.index(dim)]);
     484        7059 :     } else if (rebin) {
     485          48 :       if (coords.count(dim) && coords.at(dim).dims().ndim() != 1)
     486          20 :         throw except::DimensionError(
     487          10 :             "2-D coordinate " + to_string(coords.at(dim)) +
     488             :             " conflicting with (re)bin of outer dimension. Try specifying new "
     489             :             "aligned (1-D) edges for dimension '" +
     490             :             to_string(dim) + "' with the `edges` option of `bin`.");
     491          38 :       builder.existing(dim, data.dims()[dim]);
     492             :     }
     493             :   }
     494       13219 :   for (const auto &group : groups)
     495        4876 :     if (!dims.contains(group.dims().inner()))
     496        4870 :       builder.group(group);
     497       17694 :   for (const auto &edge : edges)
     498        9351 :     if (!dims.contains(edge.dims().inner()))
     499        8548 :       builder.bin(edge);
     500       16686 :   return builder;
     501        8403 : }
     502             : 
     503             : template <class T> class TargetBins {
     504             : public:
     505        2302 :   TargetBins(const Variable &var, const Dimensions &dims) {
     506             :     // In some cases all events in an input bin map to the same output, but
     507             :     // right now bin<> cannot handle this and requires target bin indices for
     508             :     // every bin element.
     509        2302 :     const auto &[begin_end, dim, buffer] = var.constituents<T>();
     510        2302 :     m_target_bins_buffer =
     511        2302 :         (dims.volume() > std::numeric_limits<int32_t>::max())
     512        4604 :             ? makeVariable<int64_t>(buffer.dims(), units::none)
     513             :             : makeVariable<int32_t>(buffer.dims(), units::none);
     514        2302 :     m_target_bins = make_bins_no_validate(begin_end, dim, m_target_bins_buffer);
     515        2302 :   }
     516        2302 :   auto &operator*() noexcept { return m_target_bins; }
     517        2297 :   Variable &&release() noexcept { return std::move(m_target_bins); }
     518             : 
     519             : private:
     520             :   Variable m_target_bins_buffer;
     521             :   Variable m_target_bins;
     522             : };
     523             : 
     524             : } // namespace
     525             : 
     526             : /// Reduce a dimension by concatenating bin contents of all bins along a
     527             : /// dimension.
     528             : ///
     529             : /// This is used to implement `concatenate(var, dim)`.
     530           8 : template <class T> Variable concat_bins(const Variable &var, const Dim dim) {
     531           8 :   TargetBinBuilder builder;
     532           8 :   builder.erase(dim);
     533           8 :   TargetBins<T> target_bins(var, builder.dims());
     534             : 
     535           8 :   builder.build(*target_bins, std::map<Dim, Variable>{});
     536           8 :   auto mapper = make_mapper(target_bins.release(), builder);
     537           8 :   auto buffer = mapper->template apply<T>(var);
     538           8 :   auto bin_sizes = mapper->bin_sizes();
     539           8 :   bin_sizes = squeeze(bin_sizes, scipp::span{&dim, 1});
     540          16 :   return bins_from_sizes(std::move(buffer), bin_sizes);
     541           8 : }
     542             : template Variable concat_bins<Variable>(const Variable &, const Dim);
     543             : template Variable concat_bins<DataArray>(const Variable &, const Dim);
     544             : 
     545             : /// Implementation of groupby.bins.concatenate
     546             : ///
     547             : /// If `array` has unaligned, i.e., not 1-D, coords conflicting with the
     548             : /// reduction dimension, any binning along the dimensions of the conflicting
     549             : /// coords is removed. It is replaced by a single bin along that dimension, with
     550             : /// bin edges given by min and max of the old coord.
     551          13 : DataArray groupby_concat_bins(const DataArray &array, const Variable &edges,
     552             :                               const Variable &groups, const Dim reductionDim) {
     553          13 :   TargetBinBuilder builder;
     554          13 :   if (edges.is_valid())
     555           0 :     builder.bin(edges);
     556          13 :   if (groups.is_valid())
     557           8 :     builder.group(groups);
     558          13 :   builder.erase(reductionDim);
     559          13 :   const auto dims = array.dims();
     560          28 :   for (const auto &dim : dims.labels())
     561          15 :     if (array.meta().contains(dim)) {
     562          17 :       if (array.meta()[dim].dims().ndim() != 1 &&
     563           9 :           array.meta()[dim].dims().contains(reductionDim))
     564           1 :         builder.join(dim, array.meta()[dim]);
     565           7 :       else if (dim != reductionDim)
     566           1 :         builder.existing(dim, array.dims()[dim]);
     567             :     }
     568             : 
     569             :   const auto masked =
     570          13 :       hide_masked(array.data(), array.masks(), builder.dims().labels());
     571          13 :   TargetBins<DataArray> target_bins(masked, builder.dims());
     572          13 :   builder.build(*target_bins, array.meta());
     573             :   // Note: Unlike in the other cases below we do not call
     574             :   // `drop_grouped_event_coords` here. Grouping is based on a bin-coord rather
     575             :   // than event-coord so we do not touch the latter.
     576          26 :   return add_metadata(masked, make_mapper(target_bins.release(), builder),
     577          13 :                       array.coords(), array.masks(), array.attrs(),
     578          52 :                       builder.edges(), builder.groups(), {reductionDim});
     579          13 : }
     580             : 
     581             : namespace {
     582        8370 : void validate_bin_args(const DataArray &array,
     583             :                        const std::vector<Variable> &edges,
     584             :                        const std::vector<Variable> &groups) {
     585        2296 :   if ((is_bins(array) &&
     586       16740 :        std::get<2>(array.data().constituents<DataArray>()).dims().ndim() > 1) ||
     587        8370 :       (!is_bins(array) && array.dims().ndim() > 1)) {
     588           2 :     throw except::BinnedDataError(
     589             :         "Binning is only implemented for 1-dimensional data. Consider using "
     590           4 :         "groupby, it might be able to do what you need.");
     591             :   }
     592        8368 :   if (edges.empty() && groups.empty())
     593           5 :     throw std::invalid_argument(
     594             :         "Arguments 'edges' and 'groups' of scipp.bin are "
     595          10 :         "both empty. At least one must be set.");
     596       17729 :   for (const auto &edge : edges) {
     597        9371 :     const auto dim = edge.dims().inner();
     598        9371 :     if (edge.dims()[dim] < 2)
     599          10 :       throw except::BinEdgeError("Not enough bin edges in dim " +
     600          15 :                                  to_string(dim) + ". Need at least 2.");
     601        9366 :     if (!allsorted(edge, dim))
     602           0 :       throw except::BinEdgeError("Bin edges in dim " + to_string(dim) +
     603           0 :                                  " must be sorted.");
     604             :   }
     605        8358 : }
     606             : 
     607        8338 : auto drop_grouped_event_coords(const Variable &data,
     608             :                                const std::vector<Variable> &groups) {
     609        8338 :   auto [indices, dim, buffer] = data.constituents<DataArray>();
     610             :   // Do not preserve event coords used for grouping since this is redundant
     611             :   // information and leads to waste of memory and compute in follow-up
     612             :   // operations.
     613       13214 :   for (const auto &var : groups)
     614        4876 :     if (buffer.coords().contains(var.dims().inner()))
     615        4874 :       buffer.coords().erase(var.dims().inner());
     616       16676 :   return make_bins_no_validate(indices, dim, buffer);
     617        8338 : }
     618             : 
     619             : } // namespace
     620             : 
     621        8370 : DataArray bin(const DataArray &array, const std::vector<Variable> &edges,
     622             :               const std::vector<Variable> &groups,
     623             :               const std::vector<Dim> &erase) {
     624        8370 :   validate_bin_args(array, edges, groups);
     625        8358 :   const auto &data = array.data();
     626        8358 :   const auto &coords = array.coords();
     627        8358 :   const auto &meta = array.meta();
     628        8358 :   const auto &masks = array.masks();
     629        8358 :   const auto &attrs = array.attrs();
     630        8358 :   if (data.dtype() == dtype<core::bin<DataArray>>) {
     631        2296 :     return bin(data, coords, masks, attrs, edges, groups, erase);
     632             :   } else {
     633             :     // Pretend existing binning along outermost binning dim to enable threading
     634             :     const auto tmp = pretend_bins_for_threading(
     635       10703 :         array, groups.empty() ? edges.front().dims().inner()
     636       10703 :                               : groups.front().dims().inner());
     637             :     auto target_bins_buffer =
     638        6062 :         (data.dims().volume() > std::numeric_limits<int32_t>::max())
     639             :             ? makeVariable<int64_t>(data.dims(), units::none)
     640        6062 :             : makeVariable<int32_t>(data.dims(), units::none);
     641        6062 :     auto builder = axis_actions(data, meta, edges, groups, erase);
     642        6062 :     builder.build(target_bins_buffer, meta);
     643             :     auto target_bins = make_bins_no_validate(
     644       12124 :         tmp.bin_indices(), data.dims().inner(), target_bins_buffer);
     645       12124 :     return add_metadata(drop_grouped_event_coords(tmp, groups),
     646       12124 :                         make_mapper(std::move(target_bins), builder), coords,
     647       18186 :                         masks, attrs, builder.edges(), builder.groups(), erase);
     648        6062 :   }
     649        8358 : }
     650             : 
     651             : /// Implementation of a generic binning algorithm.
     652             : ///
     653             : /// The overall approach of this is as follows:
     654             : /// 1. Find target bin index for every input event (bin entry)
     655             : /// 2. Next, we conceptually want to do
     656             : ///        for(i < events.size())
     657             : ///          target_bin[bin_index[i]].push_back(events[i])
     658             : ///    However, scipp's data layout for event data is a single 1-D array, and
     659             : ///    not a list of vector, i.e., the conceptual line above does not work
     660             : ///    directly. We need to obtain offsets into the 1-D array first, roughly:
     661             : ///        bin_sizes = count(bin_index) // number of events per target bin
     662             : ///        bin_offset = cumsum(bin_sizes) - bin_sizes
     663             : /// 3. Copy from input to output bin, based on offset
     664             : template <class Coords, class Masks, class Attrs>
     665        2296 : DataArray bin(const Variable &data, const Coords &coords, const Masks &masks,
     666             :               const Attrs &attrs, const std::vector<Variable> &edges,
     667             :               const std::vector<Variable> &groups,
     668             :               const std::vector<Dim> &erase) {
     669        2296 :   const auto meta = attrs.merge_from(coords);
     670        2296 :   auto builder = axis_actions(data, meta, edges, groups, erase);
     671        2281 :   const auto masked = hide_masked(data, masks, builder.dims().labels());
     672        2281 :   TargetBins<DataArray> target_bins(masked, builder.dims());
     673        2296 :   builder.build(*target_bins, bins_view<DataArray>(masked).meta(), meta);
     674             :   return add_metadata(drop_grouped_event_coords(masked, groups),
     675             :                       make_mapper(target_bins.release(), builder), coords,
     676        4552 :                       masks, attrs, builder.edges(), builder.groups(), erase);
     677        2311 : }
     678             : 
     679             : } // namespace scipp::dataset

Generated by: LCOV version 1.14