LCOV - code coverage report
Current view: top level - dataset - bins.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 218 241 90.5 %
Date: 2024-11-24 01:48:31 Functions: 34 39 87.2 %

          Line data    Source code
       1             : // SPDX-License-Identifier: BSD-3-Clause
       2             : // Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
       3             : /// @file
       4             : /// @author Simon Heybrock
       5             : #include <algorithm>
       6             : #include <limits>
       7             : 
       8             : #include "scipp/core/bucket.h"
       9             : #include "scipp/core/element/event_operations.h"
      10             : #include "scipp/core/element/histogram.h"
      11             : #include "scipp/core/except.h"
      12             : 
      13             : #include "scipp/variable/arithmetic.h"
      14             : #include "scipp/variable/bins.h"
      15             : #include "scipp/variable/creation.h"
      16             : #include "scipp/variable/cumulative.h"
      17             : #include "scipp/variable/reduction.h"
      18             : #include "scipp/variable/subspan_view.h"
      19             : #include "scipp/variable/transform.h"
      20             : #include "scipp/variable/transform_subspan.h"
      21             : #include "scipp/variable/util.h"
      22             : #include "scipp/variable/variable.h"
      23             : #include "scipp/variable/variable_factory.h"
      24             : 
      25             : #include "scipp/dataset/bins.h"
      26             : #include "scipp/dataset/bins_view.h"
      27             : #include "scipp/dataset/dataset.h"
      28             : #include "scipp/dataset/histogram.h"
      29             : 
      30             : #include "../variable/operations_common.h"
      31             : #include "bin_common.h"
      32             : #include "bin_detail.h"
      33             : #include "dataset_operations_common.h"
      34             : 
      35             : namespace scipp::dataset {
      36             : namespace {
      37        9606 : constexpr auto copy_or_match = [](const auto &a, auto &&b, const Dim dim,
      38             :                                   const Variable &srcIndices,
      39             :                                   const Variable &dstIndices) {
      40        9606 :   if (a.dims().contains(dim))
      41        9508 :     copy_slices(a, b, dim, srcIndices, dstIndices);
      42             :   else
      43          98 :     core::expect::equals(a, b);
      44        9606 : };
      45             : 
      46        7302 : constexpr auto expect_matching_keys = [](const auto &a, const auto &b) {
      47        7302 :   bool ok = true;
      48       19261 :   constexpr auto key = [](const auto &x_) {
      49             :     if constexpr (std::is_base_of_v<DataArray, std::decay_t<decltype(x_)>>)
      50         160 :       return x_.name();
      51             :     else
      52       19101 :       return x_.first;
      53             :   };
      54       16930 :   for (const auto &x : a)
      55        9628 :     ok &= b.contains(key(x));
      56       16935 :   for (const auto &x : b)
      57        9633 :     ok &= a.contains(key(x));
      58        7302 :   if (!ok)
      59          21 :     throw std::runtime_error("Mismatching keys in\n" + to_string(a) + " and\n" +
      60             :                              to_string(b));
      61        7281 : };
      62             : 
      63         172 : auto make_fill(const DataArray &function,
      64             :                const std::optional<Variable> &fill_value) {
      65         172 :   Variable fill = fill_value.value_or(zero_like(function.data()));
      66         172 :   if (fill_value) {
      67         138 :     if (fill.dtype() != function.dtype())
      68           0 :       throw except::TypeError(
      69           0 :           "The fill_value (dtype=" + to_string(fill.dtype()) +
      70           0 :           ") must have the same dtype as the function values (dtype=" +
      71           0 :           to_string(function.dtype()) + ").");
      72          34 :   } else if (fill.dtype() == dtype<double>) {
      73          10 :     fill.value<double>() = std::numeric_limits<double>::quiet_NaN();
      74          24 :   } else if (fill.dtype() == dtype<float>) {
      75           6 :     fill.value<float>() = std::numeric_limits<float>::quiet_NaN();
      76             :   }
      77         172 :   return fill;
      78           0 : }
      79             : 
      80             : } // namespace
      81             : 
      82        3525 : void copy_slices(const DataArray &src, DataArray dst, const Dim dim,
      83             :                  const Variable &srcIndices, const Variable &dstIndices) {
      84        3529 :   copy_slices(src.data(), dst.data(), dim, srcIndices, dstIndices);
      85        3545 :   expect_matching_keys(src.meta(), dst.meta());
      86        3509 :   expect_matching_keys(src.masks(), dst.masks());
      87       12821 :   for (const auto &[name, coord] : src.meta())
      88       12821 :     copy_or_match(coord, dst.meta()[name], dim, srcIndices, dstIndices);
      89        3593 :   for (const auto &[name, mask] : src.masks())
      90          88 :     copy_or_match(mask, dst.masks()[name], dim, srcIndices, dstIndices);
      91        3505 : }
      92             : 
      93          61 : void copy_slices(const Dataset &src, Dataset dst, const Dim dim,
      94             :                  const Variable &srcIndices, const Variable &dstIndices) {
      95         154 :   for (const auto &[name, var] : src.coords())
      96          94 :     copy_or_match(var, dst.coords()[name], dim, srcIndices, dstIndices);
      97          60 :   expect_matching_keys(src.coords(), dst.coords());
      98          59 :   expect_matching_keys(src, dst);
      99         132 :   for (const auto &item : src) {
     100          77 :     const auto &dst_ = dst[item.name()];
     101          77 :     expect_matching_keys(item.attrs(), dst_.attrs());
     102          76 :     expect_matching_keys(item.masks(), dst_.masks());
     103          75 :     copy_or_match(item.data(), dst_.data(), dim, srcIndices, dstIndices);
     104          94 :     for (const auto &[name, var] : item.masks())
     105          19 :       copy_or_match(var, dst_.masks()[name], dim, srcIndices, dstIndices);
     106          90 :     for (const auto &[name, var] : item.attrs())
     107          15 :       copy_or_match(var, dst_.attrs()[name], dim, srcIndices, dstIndices);
     108          79 :   }
     109          55 : }
     110             : 
     111             : namespace {
     112       12995 : constexpr auto copy_or_resize = [](const auto &var, const Dim dim,
     113             :                                    const scipp::index size) {
     114       12995 :   auto dims = var.dims();
     115       12995 :   if (dims.contains(dim))
     116       12896 :     dims.resize(dim, size);
     117             :   // Using variableFactory instead of variable::resize for creating
     118             :   // _uninitialized_ variable.
     119       12995 :   return var.dims().contains(dim)
     120       12896 :              ? variable::variableFactory().create(var.dtype(), dims, var.unit(),
     121       12896 :                                                   var.has_variances())
     122       38886 :              : copy(var);
     123       12995 : };
     124             : } // namespace
     125             : 
     126             : // TODO These functions are an unfortunate near-duplicate of `resize`. However,
     127             : // the latter drops coords along the resized dimension. Is there a way to unify
     128             : // this? Can the need to drop coords in resize be avoided?
     129        3472 : DataArray resize_default_init(const DataArray &parent, const Dim dim,
     130             :                               const scipp::index size) {
     131        6944 :   DataArray buffer(copy_or_resize(parent.data(), dim, size));
     132       12718 :   for (const auto &[name, var] : parent.coords())
     133        9246 :     buffer.coords().set(name, copy_or_resize(var, dim, size));
     134        3563 :   for (const auto &[name, var] : parent.masks())
     135          91 :     buffer.masks().set(name, copy_or_resize(var, dim, size));
     136        3486 :   for (const auto &[name, var] : parent.attrs())
     137          14 :     buffer.attrs().set(name, copy_or_resize(var, dim, size));
     138        3472 :   return buffer;
     139           0 : }
     140             : 
     141          49 : Dataset resize_default_init(const Dataset &parent, const Dim dim,
     142             :                             const scipp::index size) {
     143          49 :   auto new_sizes = parent.sizes();
     144          49 :   if (new_sizes.contains(dim))
     145          49 :     new_sizes.resize(dim, size);
     146             : 
     147          98 :   Dataset buffer({}, Coords(new_sizes, {}));
     148         128 :   for (const auto &[name, var] : parent.coords())
     149          79 :     buffer.setCoord(name, copy_or_resize(var, dim, size));
     150         114 :   for (const auto &item : parent) {
     151          65 :     buffer.setData(item.name(), copy_or_resize(item.data(), dim, size));
     152          80 :     for (const auto &[name, var] : item.masks())
     153          15 :       buffer[item.name()].masks().set(name, copy_or_resize(var, dim, size));
     154          78 :     for (const auto &[name, var] : item.attrs())
     155          13 :       buffer[item.name()].attrs().set(name, copy_or_resize(var, dim, size));
     156          65 :   }
     157          98 :   return buffer;
     158          49 : }
     159             : 
     160             : /// Construct a bin-variable over a data array.
     161             : ///
     162             : /// Each bin is represented by a Variable slice. `indices` defines the array of
     163             : /// bins as slices of `buffer` along `dim`.
     164        8739 : Variable make_bins(Variable indices, const Dim dim, DataArray buffer) {
     165        8739 :   expect_valid_bin_indices(indices, dim, buffer.dims());
     166        8739 :   return make_bins_no_validate(std::move(indices), dim, std::move(buffer));
     167             : }
     168             : 
     169             : /// Construct a bin-variable over a data array without index validation.
     170             : ///
     171             : /// Must be used only when it is guaranteed that indices are valid or overlap of
     172             : /// bins is acceptable.
     173       34695 : Variable make_bins_no_validate(Variable indices, const Dim dim,
     174             :                                DataArray buffer) {
     175       34695 :   return variable::make_bins_impl(std::move(indices), dim, std::move(buffer));
     176             : }
     177             : 
     178             : /// Construct a bin-variable over a dataset.
     179             : ///
     180             : /// Each bin is represented by a Variable slice. `indices` defines the array of
     181             : /// bins as slices of `buffer` along `dim`.
     182          32 : Variable make_bins(Variable indices, const Dim dim, Dataset buffer) {
     183          32 :   expect_valid_bin_indices(indices, dim, buffer.sizes());
     184          32 :   return make_bins_no_validate(std::move(indices), dim, std::move(buffer));
     185             : }
     186             : 
     187             : /// Construct a bin-variable over a dataset without index validation.
     188             : ///
     189             : /// Must be used only when it is guaranteed that indices are valid or overlap of
     190             : /// bins is acceptable.
     191         103 : Variable make_bins_no_validate(Variable indices, const Dim dim,
     192             :                                Dataset buffer) {
     193         103 :   return variable::make_bins_impl(std::move(indices), dim, std::move(buffer));
     194             : }
     195             : 
     196       75485 : bool is_bins(const DataArray &array) { return is_bins(array.data()); }
     197             : 
     198           0 : bool is_bins(const Dataset &dataset) {
     199           0 :   return std::any_of(dataset.begin(), dataset.end(),
     200           0 :                      [](const auto &item) { return is_bins(item); });
     201             : }
     202             : 
     203          40 : Variable lookup_previous(const DataArray &function, const Variable &x, Dim dim,
     204             :                          const std::optional<Variable> &fill_value) {
     205          40 :   const auto fill = make_fill(function, fill_value);
     206          40 :   const auto &coord = function.meta()[dim];
     207          40 :   const auto data = masked_data(function, dim, fill);
     208          40 :   const auto weights = subspan_view(data, dim);
     209          40 :   if (!allsorted(coord, dim))
     210           0 :     throw except::DataArrayError(
     211           0 :         "Coordinate of lookup function must be sorted.");
     212             :   // Note that we could do a linspace optimization similar to buckets::map here.
     213             :   // Add this if we have real world application that would benefit.
     214          80 :   return variable::transform(x, subspan_view(coord, dim), weights, fill,
     215             :                              core::element::event::lookup_previous,
     216         120 :                              "lookup_previous");
     217          40 : }
     218             : 
     219        6143 : Variable pretend_bins_for_threading(const DataArray &da, Dim bin_dim) {
     220        6143 :   const auto dim = da.dims().inner();
     221        6143 :   const auto size = std::max(scipp::index(1), da.dims()[dim]);
     222       12286 :   const auto nthread = size > 10000000  ? 24
     223       12282 :                        : size > 1000000 ? 4
     224        6139 :                        : size > 100000  ? 2
     225             :                                         : 1;
     226             : 
     227        6143 :   const auto stride = std::max(scipp::index(1), size / nthread);
     228        6143 :   auto begin = bin_detail::make_range(0, size, stride, bin_dim);
     229        6143 :   auto end = begin + stride * units::none;
     230        6143 :   end.values<scipp::index>().as_span().back() = da.dims()[dim];
     231        6143 :   const auto indices = zip(begin, end);
     232       12286 :   return make_bins_no_validate(indices, dim, da);
     233        6143 : }
     234             : 
     235             : } // namespace scipp::dataset
     236             : 
     237             : namespace scipp::dataset::buckets {
     238             : namespace {
     239             : 
     240          28 : template <class T> auto combine(const Variable &var0, const Variable &var1) {
     241          28 :   const auto &[indices0, dim0, buffer0] = var0.constituents<T>();
     242          28 :   const auto &[indices1, dim1, buffer1] = var1.constituents<T>();
     243             :   static_cast<void>(buffer1);
     244             :   static_cast<void>(dim1);
     245          28 :   const Dim dim = dim0;
     246          28 :   const auto [begin0, end0] = unzip(indices0);
     247          28 :   const auto [begin1, end1] = unzip(indices1);
     248          28 :   const auto sizes0 = end0 - begin0;
     249          28 :   const auto sizes1 = end1 - begin1;
     250          28 :   const auto sizes = sizes0 + sizes1;
     251          28 :   const auto end = cumsum(sizes);
     252          28 :   const auto begin = end - sizes;
     253          28 :   const auto total_size =
     254          28 :       end.dims().volume() > 0
     255          28 :           ? end.template values<scipp::index>().as_span().back()
     256             :           : 0;
     257          28 :   auto buffer = resize_default_init(buffer0, dim, total_size);
     258          28 :   copy_slices(buffer0, buffer, dim, indices0, zip(begin, end - sizes1));
     259          46 :   copy_slices(buffer1, buffer, dim, indices1, zip(begin + sizes0, end));
     260          44 :   return make_bins_no_validate(zip(begin, end), dim, std::move(buffer));
     261          82 : }
     262             : 
     263             : template <class T>
     264          21 : auto concatenate_impl(const Variable &var0, const Variable &var1) {
     265          21 :   return combine<T>(var0, var1);
     266             : }
     267             : 
     268             : } // namespace
     269             : 
     270          21 : Variable concatenate(const Variable &var0, const Variable &var1) {
     271          21 :   if (var0.dtype() == dtype<bucket<Variable>>)
     272           0 :     return concatenate_impl<Variable>(var0, var1);
     273          21 :   else if (var0.dtype() == dtype<bucket<DataArray>>)
     274           9 :     return concatenate_impl<DataArray>(var0, var1);
     275             :   else
     276          12 :     return concatenate_impl<Dataset>(var0, var1);
     277             : }
     278             : 
     279           7 : DataArray concatenate(const DataArray &a, const DataArray &b) {
     280          14 :   return DataArray{buckets::concatenate(a.data(), b.data()),
     281          14 :                    union_(a.coords(), b.coords(), "concatenate"),
     282          14 :                    union_or(a.masks(), b.masks()),
     283          14 :                    intersection(a.attrs(), b.attrs())};
     284             : }
     285             : 
     286             : /// Reduce a dimension by concatenating all elements along the dimension.
     287             : ///
     288             : /// This is the analogue to summing non-bucket data.
     289           8 : Variable concatenate(const Variable &var, const Dim dim) {
     290           8 :   if (var.dtype() == dtype<bucket<Variable>>)
     291           1 :     return concat_bins<Variable>(var, dim);
     292             :   else
     293           7 :     return concat_bins<DataArray>(var, dim);
     294             : }
     295             : 
     296             : /// Reduce a dimension by concatenating all elements along the dimension.
     297             : ///
     298             : /// This is the analogue to summing non-bucket data.
     299           5 : DataArray concatenate(const DataArray &array, const Dim dim) {
     300           5 :   return groupby_concat_bins(array, {}, {}, {dim});
     301             : }
     302             : 
     303           7 : void append(Variable &var0, const Variable &var1) {
     304           7 :   if (var0.dtype() == dtype<bucket<Variable>>)
     305           0 :     var0.setDataHandle(combine<Variable>(var0, var1).data_handle());
     306           7 :   else if (var0.dtype() == dtype<bucket<DataArray>>)
     307           9 :     var0.setDataHandle(combine<DataArray>(var0, var1).data_handle());
     308             :   else
     309           0 :     var0.setDataHandle(combine<Dataset>(var0, var1).data_handle());
     310           6 : }
     311             : 
     312           0 : void append(Variable &&var0, const Variable &var1) { append(var0, var1); }
     313             : 
     314           4 : void append(DataArray &a, const DataArray &b) {
     315           4 :   expect::coords_are_superset(a, b, "bins.append");
     316           4 :   union_or_in_place(a.masks(), b.masks());
     317           4 :   auto data = a.data();
     318           4 :   append(data, b.data());
     319           4 :   a.setData(data);
     320           4 : }
     321             : 
     322         128 : Variable histogram(const Variable &data, const Variable &binEdges) {
     323             :   using namespace scipp::core;
     324         128 :   auto hist_dim = binEdges.dims().inner();
     325         128 :   auto &&[indices, dim, buffer] = data.constituents<DataArray>();
     326             :   // `hist_dim` may be the same as a dim of data if there is existing binning.
     327             :   // We rename to a dummy to avoid duplicate dimensions, perform histogramming,
     328             :   // and then sum over the dummy dimensions, i.e., sum contributions from all
     329             :   // inputs bins to the same output histogram. This also allows for threading of
     330             :   // 1-D histogramming provided that the input has multiple bins along
     331             :   // `hist_dim`.
     332         128 :   const Dim dummy = Dim::InternalHistogram;
     333         128 :   const auto nbin = binEdges.dims()[hist_dim] - 1;
     334         128 :   if (indices.dims().contains(hist_dim)) {
     335             :     // With large existing dim matching the new dim, we would create a large
     336             :     // intermediate histogrammed result, which leads to performance and memory
     337             :     // issues. This is a suboptimal (since it concatenates first) but simple way
     338             :     // to avoid the problem.
     339          47 :     if (indices.dims().volume() * nbin > 100000000) { // about 1 GByte
     340           0 :       const auto tmp = concatenate(data, hist_dim);
     341           0 :       if (tmp.ndim() == 0) // Operate on buffer so we get multi-threading
     342           0 :         return histogram(tmp.bin_buffer<DataArray>(), binEdges).data();
     343             :       else
     344           0 :         return histogram(tmp, binEdges);
     345           0 :     }
     346          47 :     indices = indices.rename_dims({{hist_dim, dummy}});
     347             :   }
     348             : 
     349         128 :   const auto masked = masked_data(buffer, dim);
     350         128 :   const auto coord = buffer.meta()[hist_dim];
     351         128 :   const auto dt = common_type(binEdges, coord);
     352         128 :   const auto promoted_coord = astype(coord, dt, CopyPolicy::TryAvoid);
     353         128 :   const auto promoted_edges = astype(binEdges, dt, CopyPolicy::TryAvoid);
     354             :   auto hist = variable::transform_subspan(
     355             :       buffer.dtype(), hist_dim, nbin,
     356         256 :       subspan_view(promoted_coord, dim, indices),
     357         129 :       subspan_view(masked, dim, indices), promoted_edges, element::histogram,
     358         257 :       "histogram");
     359         127 :   if (hist.dims().contains(dummy))
     360          57 :     return sum(hist, dummy);
     361             :   else
     362          70 :     return hist;
     363         132 : }
     364             : 
     365         132 : Variable map(const DataArray &function, const Variable &x, Dim dim,
     366             :              const std::optional<Variable> &fill_value) {
     367         132 :   const auto fill = make_fill(function, fill_value);
     368         132 :   if (dim == Dim::Invalid)
     369           0 :     dim = edge_dimension(function);
     370         132 :   const auto &edges = function.meta()[dim];
     371         132 :   if (!is_edges(function.dims(), edges.dims(), dim))
     372           1 :     throw except::BinEdgeError(
     373           2 :         "Function used as lookup table in map operation must be a histogram");
     374         131 :   const auto data = masked_data(function, dim, fill);
     375         131 :   const auto weights = subspan_view(data, dim);
     376         131 :   if (all(islinspace(edges, dim)).value<bool>()) {
     377         231 :     return variable::transform(x, subspan_view(edges, dim), weights, fill,
     378         345 :                                core::element::event::map_linspace, "map");
     379             :   } else {
     380          16 :     if (!allsorted(edges, dim))
     381           0 :       throw except::BinEdgeError("Bin edges of histogram must be sorted.");
     382          32 :     return variable::transform(x, subspan_view(edges, dim), weights, fill,
     383          48 :                                core::element::event::map_sorted_edges, "map");
     384             :   }
     385         136 : }
     386             : 
     387         122 : void scale(DataArray &array, const DataArray &histogram, Dim dim) {
     388         122 :   if (dim == Dim::Invalid)
     389          11 :     dim = edge_dimension(histogram);
     390             :   // Coords along dim are ignored since "binning" is dynamic for buckets.
     391         119 :   expect::coords_are_superset(array, histogram.slice({dim, 0}), "bins.scale");
     392             :   // scale applies masks along dim but others are kept
     393         119 :   union_or_in_place(array.masks(), histogram.slice({dim, 0}).masks());
     394         238 :   auto data = bins_view<DataArray>(array.data()).data();
     395         119 :   const auto &coord = bins_view<DataArray>(array.data()).meta()[dim];
     396         119 :   const auto &edges = histogram.meta()[dim];
     397         119 :   const auto masked = masked_data(histogram, dim);
     398         119 :   const auto weights = subspan_view(masked, dim);
     399         119 :   if (all(islinspace(edges, dim)).value<bool>()) {
     400         120 :     transform_in_place(data, coord, subspan_view(edges, dim), weights,
     401             :                        core::element::event::map_and_mul_linspace,
     402             :                        "bins.scale");
     403             :   } else {
     404           1 :     if (!allsorted(edges, dim))
     405           0 :       throw except::BinEdgeError("Bin edges of histogram must be sorted.");
     406           1 :     transform_in_place(data, coord, subspan_view(edges, dim), weights,
     407             :                        core::element::event::map_and_mul_sorted_edges,
     408             :                        "bins.scale");
     409             :   }
     410         127 : }
     411             : } // namespace scipp::dataset::buckets

Generated by: LCOV version 1.14