LCOV - code coverage report
Current view: top level - dataset - groupby.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 165 178 92.7 %
Date: 2024-12-01 01:56:34 Functions: 44 67 65.7 %

          Line data    Source code
       1             : // SPDX-License-Identifier: BSD-3-Clause
       2             : // Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
       3             : /// @file
       4             : /// @author Simon Heybrock
       5             : #include <numeric>
       6             : 
       7             : #include "scipp/core/bucket.h"
       8             : #include "scipp/core/element/comparison.h"
       9             : #include "scipp/core/element/logical.h"
      10             : #include "scipp/core/histogram.h"
      11             : #include "scipp/core/parallel.h"
      12             : #include "scipp/core/tag_util.h"
      13             : 
      14             : #include "scipp/variable/accumulate.h"
      15             : #include "scipp/variable/cumulative.h"
      16             : #include "scipp/variable/operations.h"
      17             : #include "scipp/variable/util.h"
      18             : #include "scipp/variable/variable_factory.h"
      19             : 
      20             : #include "scipp/dataset/bins.h"
      21             : #include "scipp/dataset/except.h"
      22             : #include "scipp/dataset/groupby.h"
      23             : #include "scipp/dataset/shape.h"
      24             : #include "scipp/dataset/util.h"
      25             : 
      26             : #include "../variable/operations_common.h"
      27             : #include "bin_common.h"
      28             : #include "dataset_operations_common.h"
      29             : 
      30             : using namespace scipp::variable;
      31             : 
      32             : namespace scipp::dataset {
      33             : 
      34             : namespace {
      35          77 : auto resize_array(const DataArray &da, const Dim reductionDim,
      36             :                   const scipp::index size, const FillValue fill) {
      37          77 :   if (!is_bins(da))
      38          72 :     return resize(da, reductionDim, size, fill);
      39           5 :   if (variableFactory().has_masks(da.data()))
      40           1 :     throw except::NotImplementedError(
      41             :         "Reduction operations for binned data with "
      42           2 :         "event masks not supported yet.");
      43           4 :   DataArray dense_dummy(da);
      44           4 :   dense_dummy.setData(empty(da.dims(), variableFactory().elem_unit(da.data()),
      45           4 :                             variableFactory().elem_dtype(da.data()),
      46           4 :                             variableFactory().has_variances(da.data())));
      47           4 :   return resize_array(dense_dummy, reductionDim, size, fill);
      48           4 : }
      49             : } // namespace
      50             : 
      51             : /// Helper for creating output for "combine" step for "apply" steps that reduce
      52             : /// a dimension.
      53             : ///
      54             : /// - Delete anything (but data) that depends on the reduction dimension.
      55             : /// - Default-init data.
      56             : template <class T>
      57          61 : T GroupBy<T>::makeReductionOutput(const Dim reductionDim,
      58             :                                   const FillValue fill) const {
      59          61 :   T out;
      60             :   if constexpr (std::is_same_v<T, Dataset>) {
      61          39 :     out = apply_to_items(m_data, resize_array, reductionDim, size(), fill);
      62             :   } else {
      63          22 :     out = resize_array(m_data, reductionDim, size(), fill);
      64             :   }
      65          60 :   out = out.rename_dims({{reductionDim, dim()}});
      66          60 :   out.coords().set(dim(), key());
      67          60 :   return out;
      68           1 : }
      69             : 
      70             : namespace {
      71             : template <class Op, class Groups>
      72          72 : void reduce_(Op op, const Dim reductionDim, const Variable &out_data,
      73             :              const DataArray &data, const Dim dim, const Groups &groups,
      74             :              const FillValue fill) {
      75          72 :   const auto mask_replacement =
      76         144 :       special_like(Variable(data.data(), Dimensions{}), fill);
      77          72 :   auto mask = irreducible_mask(data.masks(), reductionDim);
      78          72 :   const auto process = [&](const auto &range) {
      79             :     // Apply to each group, storing result in output slice
      80         434 :     for (scipp::index group = range.begin(); group != range.end(); ++group) {
      81         362 :       auto out_slice = out_data.slice({dim, group});
      82         737 :       for (const auto &slice : groups[group]) {
      83         375 :         const auto data_slice = data.data().slice(slice);
      84         375 :         if (mask.is_valid())
      85          33 :           op(out_slice, where(mask.slice(slice), mask_replacement, data_slice));
      86             :         else
      87         342 :           op(out_slice, data_slice);
      88             :       }
      89             :     }
      90             :   };
      91          72 :   core::parallel::parallel_for(core::parallel::blocked_range(0, groups.size()),
      92             :                                process);
      93          72 : }
      94             : } // namespace
      95             : 
      96             : template <class T>
      97             : template <class Op>
      98          61 : T GroupBy<T>::reduce(Op op, const Dim reductionDim,
      99             :                      const FillValue fill) const {
     100          61 :   auto out = makeReductionOutput(reductionDim, fill);
     101             :   if constexpr (std::is_same_v<T, Dataset>) {
     102          90 :     for (const auto &item : m_data)
     103          51 :       reduce_(op, reductionDim, out[item.name()].data(), item, dim(), groups(),
     104             :               fill);
     105             :   } else {
     106          21 :     reduce_(op, reductionDim, out.data(), m_data, dim(), groups(), fill);
     107             :   }
     108          60 :   return out;
     109           0 : }
     110             : 
     111             : /// Reduce each group by concatenating elements and return combined data.
     112             : ///
     113             : /// This only supports binned data.
     114           7 : template <class T> T GroupBy<T>::concat(const Dim reductionDim) const {
     115          31 :   const auto conc = [&](const auto &data) {
     116           8 :     if (key().dims().volume() == scipp::size(groups()))
     117           8 :       return groupby_concat_bins(data, {}, key(), reductionDim);
     118             :     else
     119           0 :       return groupby_concat_bins(data, key(), {}, reductionDim);
     120             :   };
     121             :   if constexpr (std::is_same_v<T, DataArray>) {
     122          12 :     return conc(m_data);
     123             :   } else {
     124           4 :     return apply_to_items(m_data, [&](auto &&..._) { return conc(_...); });
     125             :   }
     126             : }
     127             : 
     128             : /// Reduce each group using `sum` and return combined data.
     129          40 : template <class T> T GroupBy<T>::sum(const Dim reductionDim) const {
     130          40 :   return reduce(variable::sum_into, reductionDim, FillValue::ZeroNotBool);
     131             : }
     132             : 
     133             : /// Reduce each group using `nansum` and return combined data.
     134           0 : template <class T> T GroupBy<T>::nansum(const Dim reductionDim) const {
     135           0 :   return reduce(variable::nansum_into, reductionDim, FillValue::ZeroNotBool);
     136             : }
     137             : 
     138             : /// Reduce each group using `all` and return combined data.
     139           4 : template <class T> T GroupBy<T>::all(const Dim reductionDim) const {
     140           4 :   return reduce(variable::all_into, reductionDim, FillValue::True);
     141             : }
     142             : 
     143             : /// Reduce each group using `any` and return combined data.
     144           4 : template <class T> T GroupBy<T>::any(const Dim reductionDim) const {
     145           4 :   return reduce(variable::any_into, reductionDim, FillValue::False);
     146             : }
     147             : 
     148             : /// Reduce each group using `max` and return combined data.
     149           8 : template <class T> T GroupBy<T>::max(const Dim reductionDim) const {
     150           8 :   return reduce(variable::max_into, reductionDim, FillValue::Lowest);
     151             : }
     152             : 
     153             : /// Reduce each group using `nanmax` and return combined data.
     154           0 : template <class T> T GroupBy<T>::nanmax(const Dim reductionDim) const {
     155           0 :   return reduce(variable::nanmax_into, reductionDim, FillValue::Lowest);
     156             : }
     157             : 
     158             : /// Reduce each group using `min` and return combined data.
     159           5 : template <class T> T GroupBy<T>::min(const Dim reductionDim) const {
     160           5 :   return reduce(variable::min_into, reductionDim, FillValue::Max);
     161             : }
     162             : 
     163             : /// Reduce each group using `nanmin` and return combined data.
     164           0 : template <class T> T GroupBy<T>::nanmin(const Dim reductionDim) const {
     165           0 :   return reduce(variable::nanmin_into, reductionDim, FillValue::Max);
     166             : }
     167             : 
     168             : /// Apply mean to groups and return combined data.
     169          13 : template <class T> T GroupBy<T>::mean(const Dim reductionDim) const {
     170             :   // 1. Sum into output slices
     171          13 :   auto out = sum(reductionDim);
     172             : 
     173             :   // 2. Compute number of slices N contributing to each out slice
     174         183 :   const auto get_scale = [&](const auto &data) {
     175             :     // TODO Supporting binned data requires generalized approach to compute
     176             :     // scale factor.
     177          17 :     if (is_bins(data))
     178           1 :       throw except::NotImplementedError(
     179             :           "groupby.mean does not support binned data yet.");
     180          32 :     auto scale = makeVariable<double>(Dims{dim()}, Shape{size()});
     181          16 :     const auto scaleT = scale.template values<double>();
     182          16 :     const auto mask = irreducible_mask(data.masks(), reductionDim);
     183          51 :     for (scipp::index group = 0; group < size(); ++group)
     184          72 :       for (const auto &slice : groups()[group]) {
     185             :         // N contributing to each slice
     186          37 :         scaleT[group] += slice.end() - slice.begin();
     187             :         // N masks for each slice, that need to be subtracted
     188          37 :         if (mask.is_valid()) {
     189          19 :           const auto masks_sum = variable::sum(mask.slice(slice), reductionDim);
     190          19 :           scaleT[group] -= masks_sum.template value<int64_t>();
     191          19 :         }
     192             :       }
     193          32 :     return reciprocal(std::move(scale));
     194          16 :   };
     195             : 
     196             :   // 3. sum/N -> mean
     197             :   if constexpr (std::is_same_v<T, Dataset>) {
     198          20 :     for (auto &&item : out) {
     199          12 :       if (is_int(item.data().dtype()))
     200           0 :         out.setData(item.name(), item.data() * get_scale(m_data[item.name()]),
     201             :                     AttrPolicy::Keep);
     202             :       else
     203          12 :         item *= get_scale(m_data[item.name()]);
     204             :     }
     205             :   } else {
     206           5 :     if (is_int(out.data().dtype()))
     207           2 :       out.setData(out.data() * get_scale(m_data));
     208             :     else
     209           3 :       out *= get_scale(m_data);
     210             :   }
     211             : 
     212          24 :   return out;
     213           1 : }
     214             : 
     215             : namespace {
     216             : template <class T> struct NanSensitiveLess {
     217             :   // Compare two values such that x < NaN for all x != NaN.
     218             :   // Note: if changing this in future, ensure it meets the requirements from
     219             :   // https://en.cppreference.com/w/cpp/named_req/Compare, as it is used as
     220             :   // the comparator for keys in a map.
     221        1440 :   bool operator()(const T &a, const T &b) const {
     222             :     if constexpr (std::is_floating_point_v<T>)
     223        1409 :       if (std::isnan(b))
     224           0 :         return !std::isnan(a);
     225        1440 :     return a < b;
     226             :   }
     227             : };
     228             : 
     229             : template <class T> struct nan_sensitive_equal {
     230      115857 :   bool operator()(const T &a, const T &b) const {
     231             :     if constexpr (std::is_floating_point_v<T>)
     232        1028 :       return a == b || (std::isnan(a) && std::isnan(b));
     233             :     else
     234      114829 :       return a == b;
     235             :   }
     236             : };
     237             : } // namespace
     238             : 
     239             : template <class T> struct MakeGroups {
     240          59 :   static GroupByGrouping apply(const Variable &key, const Dim targetDim) {
     241          59 :     expect::is_key(key);
     242          55 :     const auto &values = key.values<T>();
     243             : 
     244          55 :     const auto dim = key.dim();
     245             :     std::unordered_map<T, GroupByGrouping::group, std::hash<T>,
     246             :                        nan_sensitive_equal<T>>
     247          55 :         indices;
     248          55 :     const auto end = values.end();
     249          55 :     scipp::index i = 0;
     250         391 :     for (auto it = values.begin(); it != end;) {
     251             :       // Use contiguous (thick) slices if possible to avoid overhead of slice
     252             :       // handling in follow-up "apply" steps.
     253         336 :       const auto begin = i;
     254         336 :       const auto &group_value = *it;
     255      115380 :       while (it != end && nan_sensitive_equal<T>()(*it, group_value)) {
     256      115044 :         ++it;
     257      115044 :         ++i;
     258             :       }
     259         336 :       indices[group_value].emplace_back(dim, begin, i);
     260             :     }
     261             : 
     262          55 :     std::vector<T> keys;
     263          55 :     std::vector<GroupByGrouping::group> groups;
     264          55 :     keys.reserve(scipp::size(indices));
     265          55 :     groups.reserve(scipp::size(indices));
     266         378 :     for (auto &item : indices)
     267         323 :       keys.emplace_back(item.first);
     268          55 :     core::parallel::parallel_sort(keys.begin(), keys.end(),
     269           0 :                                   NanSensitiveLess<T>());
     270         378 :     for (const auto &k : keys) {
     271             :       // false positive, fixed in https://github.com/danmar/cppcheck/pull/4230
     272             :       // cppcheck-suppress containerOutOfBounds
     273         323 :       groups.emplace_back(std::move(indices.at(k)));
     274             :     }
     275             : 
     276          55 :     const Dimensions dims{targetDim, scipp::size(indices)};
     277         110 :     auto keys_ = makeVariable<T>(Dimensions{dims}, Values(std::move(keys)));
     278          55 :     keys_.setUnit(key.unit());
     279         110 :     return {dim, std::move(keys_), std::move(groups)};
     280          55 :   }
     281             : };
     282             : 
     283             : template <class T> struct MakeBinGroups {
     284          14 :   static GroupByGrouping apply(const Variable &key, const Variable &bins) {
     285          14 :     expect::is_key(key);
     286          14 :     if (bins.dims().ndim() != 1)
     287           0 :       throw except::DimensionError("Group-by bins must be 1-dimensional");
     288          14 :     if (key.unit() != bins.unit())
     289           0 :       throw except::UnitError("Group-by key must have same unit as bins");
     290          14 :     const auto &values = key.values<T>();
     291          14 :     const auto &edges = bins.values<T>();
     292          14 :     core::expect::histogram::sorted_edges(edges);
     293             : 
     294          14 :     const auto dim = key.dim();
     295          14 :     std::vector<GroupByGrouping::group> groups(edges.size() - 1);
     296          54 :     for (scipp::index i = 0; i < scipp::size(values);) {
     297             :       // Use contiguous (thick) slices if possible to avoid overhead of slice
     298             :       // handling in follow-up "apply" steps.
     299          40 :       const auto value = values[i];
     300          40 :       const auto begin = i++;
     301          40 :       auto right = std::upper_bound(edges.begin(), edges.end(), value);
     302          40 :       if (right != edges.end() && right != edges.begin()) {
     303          30 :         auto left = right - 1;
     304          85 :         while (i < scipp::size(values) && (*left <= values[i]) &&
     305          37 :                (values[i] < *right))
     306          18 :           ++i;
     307          30 :         groups[std::distance(edges.begin(), left)].emplace_back(dim, begin, i);
     308             :       }
     309             :     }
     310          28 :     return {dim, bins, std::move(groups)};
     311          14 :   }
     312             : };
     313             : 
     314             : template <class T>
     315          14 : GroupBy<T> call_groupby(const T &array, const Variable &key,
     316             :                         const Variable &bins) {
     317             :   return {
     318             :       array,
     319             :       core::CallDType<double, float, int64_t, int32_t>::apply<MakeBinGroups>(
     320          14 :           key.dtype(), key, bins)};
     321             : }
     322             : 
     323             : template <class T>
     324          59 : GroupBy<T> call_groupby(const T &array, const Variable &key, const Dim &dim) {
     325             :   return {array,
     326             :           core::CallDType<double, float, int64_t, int32_t, bool, std::string,
     327             :                           core::time_point>::apply<MakeGroups>(key.dtype(), key,
     328          59 :                                                                dim)};
     329             : }
     330             : 
     331             : /// Create GroupBy<DataArray> object as part of "split-apply-combine" mechanism.
     332             : ///
     333             : /// Groups the slices of `array` according to values in given by a coord.
     334             : /// Grouping will create a new coordinate for the dimension of the grouping
     335             : /// coord in a later apply/combine step.
     336          30 : GroupBy<DataArray> groupby(const DataArray &array, const Dim dim) {
     337          31 :   const auto &key = array.meta()[dim];
     338          56 :   return call_groupby(array, key, dim);
     339          29 : }
     340             : 
     341             : /// Create GroupBy<DataArray> object as part of "split-apply-combine" mechanism.
     342             : ///
     343             : /// Groups the slices of `array` according to values in given by a coord.
     344             : /// Grouping of a coord is according to given `bins`, which will be added as a
     345             : /// new coordinate to the output in a later apply/combine step.
     346           2 : GroupBy<DataArray> groupby(const DataArray &array, const Dim dim,
     347             :                            const Variable &bins) {
     348           2 :   const auto &key = array.meta()[dim];
     349           4 :   return groupby(array, key, bins);
     350           2 : }
     351             : 
     352             : /// Create GroupBy<DataArray> object as part of "split-apply-combine" mechanism.
     353             : ///
     354             : /// Groups the slices of `array` according to values in given by a coord.
     355             : /// Grouping of a coord is according to given `bins`, which will be added as a
     356             : /// new coordinate to the output in a later apply/combine step.
     357           4 : GroupBy<DataArray> groupby(const DataArray &array, const Variable &key,
     358             :                            const Variable &bins) {
     359           4 :   if (!array.dims().includes(key.dims()))
     360           1 :     throw except::DimensionError("Size of Group-by key is incorrect.");
     361             : 
     362           3 :   return call_groupby(array, key, bins);
     363             : }
     364             : 
     365             : /// Create GroupBy<Dataset> object as part of "split-apply-combine" mechanism.
     366             : ///
     367             : /// Groups the slices of `dataset` according to values in given by a coord.
     368             : /// Grouping will create a new coordinate for the dimension of the grouping
     369             : /// coord in a later apply/combine step.
     370          31 : GroupBy<Dataset> groupby(const Dataset &dataset, const Dim dim) {
     371          31 :   const auto &key = dataset.meta()[dim];
     372          30 :   return call_groupby(dataset, key, dim);
     373             : }
     374             : 
     375             : /// Create GroupBy<Dataset> object as part of "split-apply-combine" mechanism.
     376             : ///
     377             : /// Groups the slices of `dataset` according to values in given by a coord.
     378             : /// Grouping of a coord is according to given `bins`, which will be added as a
     379             : /// new coordinate to the output in a later apply/combine step.
     380          10 : GroupBy<Dataset> groupby(const Dataset &dataset, const Dim dim,
     381             :                          const Variable &bins) {
     382          10 :   const auto &key = dataset.meta()[dim];
     383          10 :   return groupby(dataset, key, bins);
     384             : }
     385             : 
     386             : /// Create GroupBy<Dataset> object as part of "split-apply-combine" mechanism.
     387             : ///
     388             : /// Groups the slices of `dataset` according to values in given by a coord.
     389             : /// Grouping of a coord is according to given `bins`, which will be added as a
     390             : /// new coordinate to the output in a later apply/combine step.
     391          12 : GroupBy<Dataset> groupby(const Dataset &dataset, const Variable &key,
     392             :                          const Variable &bins) {
     393          17 :   for (const auto &dim : dataset.sizes()) {
     394          16 :     Dimensions dims(dim, dataset.sizes()[dim]);
     395          16 :     if (dims.includes(key.dims()))
     396             :       // Found compatible Dimension.
     397          22 :       return call_groupby(dataset, key, bins);
     398          16 :   }
     399             :   // No Dimension contains the key - throw.
     400           1 :   throw except::DimensionError("Size of Group-by key is incorrect.");
     401             : }
     402             : 
     403             : template class GroupBy<DataArray>;
     404             : template class GroupBy<Dataset>;
     405             : 
     406             : } // namespace scipp::dataset

Generated by: LCOV version 1.14