Line data Source code
1 : // SPDX-License-Identifier: BSD-3-Clause
2 : // Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
3 : /// @file
4 : /// @author Simon Heybrock
5 : #pragma once
6 : #include <limits>
7 :
8 : #include "scipp/common/overloaded.h"
9 : #include "scipp/core/eigen.h"
10 : #include "scipp/core/element/arg_list.h"
11 : #include "scipp/core/element/util.h"
12 : #include "scipp/core/histogram.h"
13 : #include "scipp/core/subbin_sizes.h"
14 : #include "scipp/core/time_point.h"
15 : #include "scipp/core/transform_common.h"
16 :
17 : namespace scipp::core::element {
18 :
19 293918 : auto map_to_bins_direct = [](auto &binned, auto &bins, const auto &data,
20 : const auto &bin_indices) {
21 293918 : const auto size = scipp::size(bin_indices);
22 : using T = std::decay_t<decltype(data)>;
23 176759895 : for (scipp::index i = 0; i < size; ++i) {
24 176465977 : const auto i_bin = bin_indices[i];
25 176465977 : if (i_bin < 0)
26 854691 : continue;
27 : if constexpr (is_ValueAndVariance_v<T>) {
28 242179 : binned.variance[bins[i_bin]] = data.variance[i];
29 242179 : binned.value[bins[i_bin]++] = data.value[i];
30 : } else {
31 175369107 : binned[bins[i_bin]++] = data[i];
32 : }
33 : }
34 293918 : };
35 :
36 : constexpr bool is_powerof2(int v) { return v && ((v & (v - 1)) == 0); }
37 :
38 : template <int chunksize>
39 40 : auto map_to_bins_chunkwise = [](auto &binned, auto &bins, const auto &data,
40 : const auto &bin_indices) {
41 : // compiler is smart for div or mod 2**N, otherwise this would be too slow
42 : static_assert(is_powerof2(chunksize));
43 : using InnerIndex = int16_t;
44 : static_assert(chunksize <= std::numeric_limits<InnerIndex>::max());
45 40 : const auto size = scipp::size(bin_indices);
46 : using T = std::decay_t<decltype(data)>;
47 :
48 : using Val =
49 : std::conditional_t<is_ValueAndVariance_v<T>, typename T::value_type, T>;
50 : // TODO Ideally this buffer would be reused (on a per-thread basis)
51 : // for every application of the kernel.
52 : std::vector<std::tuple<std::vector<typename Val::value_type>,
53 : std::vector<InnerIndex>>>
54 40 : chunks((bins.size() - 1) / chunksize + 1);
55 3932 : for (scipp::index i = 0; i < size;) {
56 : // We operate in blocks so the size of the map of buffers, i.e.,
57 : // additional memory use of the algorithm, is bounded. This also
58 : // avoids costly allocations from resize operations.
59 3892 : const scipp::index max = std::min(size, i + scipp::size(bins) * 8);
60 : // 1. Map to chunks
61 37107278 : for (; i < max; ++i) {
62 37103386 : const auto i_bin = bin_indices[i];
63 37103386 : if (i_bin < 0)
64 6009138 : continue;
65 31094248 : const InnerIndex j = i_bin % chunksize;
66 31094248 : const auto i_chunk = i_bin / chunksize;
67 31094248 : auto &[vals, ind] = chunks[i_chunk];
68 : if constexpr (is_ValueAndVariance_v<T>) {
69 0 : vals.emplace_back(data.value[i]);
70 0 : vals.emplace_back(data.variance[i]);
71 : } else {
72 31094248 : vals.emplace_back(data[i]);
73 : }
74 31094248 : ind.emplace_back(j);
75 : }
76 : // 2. Map chunks to bins
77 42804 : for (scipp::index i_chunk = 0; i_chunk < scipp::size(chunks); ++i_chunk) {
78 38912 : auto &[vals, ind] = chunks[i_chunk];
79 31133160 : for (scipp::index j = 0; j < scipp::size(ind); ++j) {
80 31094248 : const auto i_bin = chunksize * i_chunk + ind[j];
81 : if constexpr (is_ValueAndVariance_v<T>) {
82 0 : binned.value[bins[i_bin]] = vals[2 * j];
83 0 : binned.variance[bins[i_bin]++] = vals[2 * j + 1];
84 : } else {
85 31094248 : binned[bins[i_bin]++] = vals[j];
86 : }
87 : }
88 38912 : vals.clear();
89 38912 : ind.clear();
90 : }
91 : }
92 40 : };
93 :
94 : // - Each span covers an *input* bin.
95 : // - `offsets` Start indices of the output bins
96 : // - `bin_indices` Target output bin index (within input bin)
97 : template <class T, class Index>
98 : using bin_arg = std::tuple<scipp::span<T>, SubbinSizes, scipp::span<const T>,
99 : scipp::span<const Index>>;
100 : static constexpr auto bin = overloaded{
101 : element::arg_list<
102 : bin_arg<double, int64_t>, bin_arg<double, int32_t>,
103 : bin_arg<float, int64_t>, bin_arg<float, int32_t>,
104 : bin_arg<int64_t, int64_t>, bin_arg<int64_t, int32_t>,
105 : bin_arg<int32_t, int64_t>, bin_arg<int32_t, int32_t>,
106 : bin_arg<bool, int64_t>, bin_arg<bool, int32_t>,
107 : bin_arg<Eigen::Vector3d, int64_t>, bin_arg<Eigen::Vector3d, int32_t>,
108 : bin_arg<std::string, int64_t>, bin_arg<std::string, int32_t>,
109 : bin_arg<time_point, int64_t>, bin_arg<time_point, int32_t>>,
110 : transform_flags::expect_in_variance_if_out_variance,
111 30146 : [](units::Unit &binned, const units::Unit &, const units::Unit &data,
112 30146 : const units::Unit &) { binned = data; },
113 293958 : [](const auto &binned, const auto &offsets, const auto &data,
114 : const auto &bin_indices) {
115 293958 : auto bins(offsets.sizes());
116 : // If there are many bins, we have two performance issues:
117 : // 1. `bins` is large and will not fit into L1, L2, or L3 cache.
118 : // 2. Writes to output are very random, implying a cache miss for every
119 : // event.
120 : // We can avoid some of this issue by first sorting into chunks, then
121 : // chunks into bins. For example, instead of mapping directly to 65536
122 : // bins, we may map to 256 chunks, and each chunk to 256 bins.
123 293958 : const bool many_bins = bins.size() > 512;
124 293958 : const bool multiple_events_per_bin = bins.size() * 4 < bin_indices.size();
125 293958 : if (many_bins && multiple_events_per_bin) { // avoid overhead
126 40 : if (bins.size() <= 128 * 128)
127 40 : map_to_bins_chunkwise<128>(binned, bins, data, bin_indices);
128 0 : else if (bins.size() <= 256 * 256)
129 0 : map_to_bins_chunkwise<256>(binned, bins, data, bin_indices);
130 0 : else if (bins.size() <= 512 * 512)
131 0 : map_to_bins_chunkwise<512>(binned, bins, data, bin_indices);
132 : else
133 0 : map_to_bins_chunkwise<1024>(binned, bins, data, bin_indices);
134 : } else {
135 293918 : map_to_bins_direct(binned, bins, data, bin_indices);
136 : }
137 293958 : }};
138 :
139 : } // namespace scipp::core::element
|