Line data Source code
1 : // SPDX-License-Identifier: BSD-3-Clause
2 : // Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
3 : /// @file
4 : /// @author Simon Heybrock
5 : #include <algorithm>
6 :
7 : #include "scipp/core/element/histogram.h"
8 : #include "scipp/dataset/bins.h"
9 : #include "scipp/dataset/dataset.h"
10 : #include "scipp/dataset/except.h"
11 : #include "scipp/dataset/groupby.h"
12 : #include "scipp/dataset/histogram.h"
13 : #include "scipp/variable/arithmetic.h"
14 : #include "scipp/variable/astype.h"
15 : #include "scipp/variable/reduction.h"
16 : #include "scipp/variable/shape.h"
17 : #include "scipp/variable/transform_subspan.h"
18 : #include "scipp/variable/util.h"
19 :
20 : #include "bins_util.h"
21 : #include "dataset_operations_common.h"
22 :
23 : using namespace scipp::core;
24 : using namespace scipp::variable;
25 :
26 : namespace scipp::dataset {
27 :
28 47595 : DataArray histogram(const DataArray &events, const Variable &binEdges) {
29 : using namespace scipp::core;
30 47595 : auto dim = binEdges.dims().inner();
31 47595 : auto con_bin_edges = as_contiguous(binEdges, dim);
32 :
33 47595 : DataArray result;
34 47595 : if (events.dtype() == dtype<bucket<DataArray>>) {
35 : // TODO Histogram data from buckets. Is this the natural choice for the API,
36 : // i.e., does it make sense that histograming considers bucket contents?
37 : // Should we instead have a separate named function for this case?
38 229 : result = apply_and_drop_dim(
39 : events,
40 115 : [](const DataArray &events_, const Dim dim_,
41 : const Variable &binEdges_) {
42 : return buckets::histogram(
43 231 : hide_masked(events_.data(), events_.masks(),
44 : scipp::span<const Dim>{&dim_, 1}),
45 229 : binEdges_);
46 : },
47 114 : dim, con_bin_edges);
48 47480 : } else if (!is_histogram(events, dim)) {
49 47478 : const auto event_dim = events.coords().dim_of(dim);
50 94956 : result = apply_and_drop_dim(
51 : events,
52 47478 : [dim](const DataArray &events_, const Dim event_dim_,
53 : const Variable &binEdges_) {
54 47478 : const auto data = masked_data(events_, event_dim_);
55 : // Warning: Don't try to move the `as_contiguous` into `subspan_view`
56 : // without special care: It may return a new variable which will go
57 : // out of scope, leading to subtle bugs. Here on the other hand the
58 : // returned temporary is kept alive until the end of the
59 : // full-expression.
60 47478 : const auto cont_data = as_contiguous(data, event_dim_);
61 : const auto cont_coord =
62 47478 : as_contiguous(events_.coords()[dim], event_dim_);
63 47478 : if (data.ndim() == 1 && data.dims().volume() > 100000) {
64 40 : const DataArray content(cont_data, {{dim, cont_coord}});
65 : const auto binned =
66 10 : pretend_bins_for_threading(content, Dim::InternalHistogram);
67 : // This sums automatically over Dim::InternalHistogram
68 10 : return buckets::histogram(binned, binEdges_);
69 10 : }
70 : // Due to the combinatoric explosion of dtype combinations, we promote
71 : // either the bin edges or the coord in case their dtypes mismatch.
72 : // This is less efficient, but hopefully an edge case. If performance
73 : // matters, users should ensure that they use bin edges and coord with
74 : // the same dtype.
75 47468 : const auto dt = common_type(binEdges_, cont_coord);
76 : const auto promoted_coord =
77 47468 : astype(cont_coord, dt, CopyPolicy::TryAvoid);
78 : const auto promoted_edges =
79 47468 : astype(binEdges_, dt, CopyPolicy::TryAvoid);
80 : return transform_subspan(
81 47468 : events_.dtype(), dim, binEdges_.dims()[dim] - 1,
82 94936 : subspan_view(promoted_coord, event_dim_),
83 94936 : subspan_view(cont_data, event_dim_), promoted_edges,
84 94936 : element::histogram, "histogram");
85 47478 : },
86 47478 : event_dim, con_bin_edges);
87 : } else {
88 2 : throw except::BinEdgeError(
89 : "Data is already histogrammed. Expected event data or dense point "
90 4 : "data, got data with bin edges.");
91 : }
92 47592 : result.coords().set(dim, binEdges);
93 95184 : return result;
94 47598 : }
95 :
96 1 : Dataset histogram(const Dataset &dataset, const Variable &binEdges) {
97 : return apply_to_items(
98 : dataset,
99 2 : [](const auto &item, const Dim, const auto &binEdges_) {
100 2 : return histogram(item, binEdges_);
101 : },
102 2 : binEdges.dims().inner(), binEdges);
103 : }
104 :
105 : /// Return the dimensions of the given data array that have an "bin edge"
106 : /// coordinate.
107 18 : std::set<Dim> edge_dimensions(const DataArray &a) {
108 18 : std::set<Dim> dims;
109 39 : for (const auto &[d, coord] : a.coords())
110 40 : if (a.dims().contains(d) && coord.dims().contains(d) &&
111 19 : coord.dims()[d] == a.dims()[d] + 1)
112 15 : dims.insert(d);
113 18 : return dims;
114 0 : }
115 :
116 : /// Return the Dim of the given data array that has an "bin edge" coordinate.
117 : ///
118 : /// Throws if there is not exactly one such dimension.
119 18 : Dim edge_dimension(const DataArray &a) {
120 18 : const auto &dims = edge_dimensions(a);
121 18 : if (dims.size() != 1)
122 7 : throw except::BinEdgeError("Expected bin edges in only one dimension.");
123 22 : return *dims.begin();
124 18 : }
125 :
126 : namespace {
127 47491 : template <typename T> bool is_histogram_impl(const T &a, const Dim dim) {
128 47491 : const auto dims = a.dims();
129 47491 : const auto coords = a.coords();
130 47691 : return dims.contains(dim) && coords.contains(dim) &&
131 47889 : coords[dim].dims().contains(dim) &&
132 47689 : coords[dim].dims()[dim] == dims.at(dim) + 1;
133 47491 : }
134 : } // namespace
135 : /// Return true if the data array represents a histogram for given dim.
136 47489 : bool is_histogram(const DataArray &a, const Dim dim) {
137 47489 : return is_histogram_impl(a, dim);
138 : }
139 :
140 : /// Return true if the dataset represents a histogram for given dim.
141 2 : bool is_histogram(const Dataset &a, const Dim dim) {
142 2 : return is_histogram_impl(a, dim);
143 : }
144 :
145 : } // namespace scipp::dataset
|