Coverage for install/scipp/core/bin_remapping.py: 74%
66 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-04-28 01:28 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-04-28 01:28 +0000
1# SPDX-License-Identifier: BSD-3-Clause
2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
3# @author Simon Heybrock
4import itertools
5import uuid
6from math import prod
7from typing import Dict, List
9from .._scipp import core as _cpp
10from ..typing import Dims, VariableLikeType
11from .concepts import concrete_dims, irreducible_mask, rewrap_reduced_data
12from .cpp_classes import DataArray, Variable
13from .cumulative import cumsum
14from .operations import where
15from .variable import index
def hide_masked(da: DataArray, dim: Dims) -> Variable:
    """Return the binned data of ``da`` with bins that an irreducible mask
    covers made empty (or dropped), so masked content is excluded from
    subsequent bin combination.

    Note: the fix here is the return annotation — both branches return a
    Variable (``da.data`` or the result of ``_bins_no_validate``), never a
    DataArray.
    """
    if (mask := irreducible_mask(da, dim)) is not None:
        # Avoid using boolean indexing since it would result in (partial) content
        # buffer copy. Instead index just begin/end and reuse content buffer.
        comps = da.bins.constituents
        # If the mask is 1-D we can drop entire "rows" or "columns". This can
        # drastically reduce the number of bins to handle in some cases for better
        # performance. For 2-D or higher masks we fall back to making bins "empty" by
        # setting end=begin.
        if mask.ndim == 1:
            select = ~mask
            comps['begin'] = comps['begin'][select]
            comps['end'] = comps['end'][select]
        else:
            # end=begin yields zero-size bins, i.e., masked content is hidden.
            comps['end'] = where(mask, comps['begin'], comps['end'])
        return _cpp._bins_no_validate(**comps)
    else:
        # No irreducible mask: the data can be used as-is.
        return da.data
def _with_bin_sizes(var: Variable, sizes: Variable) -> Variable:
    """Re-bin ``var``'s content buffer according to ``sizes``.

    Begin/end indices are derived from the cumulative sum of ``sizes``; the
    underlying content buffer is reused without copying.
    """
    stop = cumsum(sizes)
    start = stop - sizes
    if var.bins is None:
        buffer = var
        buffer_dim = var.dim
    else:
        constituents = var.bins.constituents
        buffer = constituents['data']
        buffer_dim = constituents['dim']
    return _cpp._bins_no_validate(data=buffer, dim=buffer_dim, begin=start, end=stop)
def _concat_bins(var: Variable, dim: Dims) -> Variable:
    """Concatenate the bins of ``var`` over the dims given by ``dim``.

    The annotation of ``dim`` is ``Dims`` (not ``List[str]``): the caller
    ``combine_bins`` forwards its ``dim: Dims`` argument unchanged, and it is
    resolved via ``concrete_dims``.
    """
    # To concat bins, two things need to happen:
    # 1. Data needs to be written to a contiguous chunk.
    # 2. New bin begin/end indices need to be setup.
    # If the dims to concatenate are the *inner* dims a call to `copy()` performs 1.
    # Otherwise, we first transpose and then `copy()`.
    # For step 2. we simply sum the (transposed) input bin sizes over the concat dims,
    # which `_with_bin_sizes` can use to compute new begin/end indices.
    changed_dims = list(concrete_dims(var, dim))
    unchanged_dims = [d for d in var.dims if d not in changed_dims]
    # TODO It would be possible to support a copy=False parameter, to skip the copy if
    # the copy would not result in any moving or reordering.
    out = var.transpose(unchanged_dims + changed_dims).copy()
    sizes = out.bins.size().sum(changed_dims)
    return _with_bin_sizes(out, sizes)
def _combine_bins(
    var: Variable,
    coords: Dict[str, Variable],
    edges: List[Variable],
    groups: List[Variable],
    dim: Dims,
) -> Variable:
    """Combine bins of ``var`` over the dims in ``dim``, grouped/binned by the
    given ``coords`` with ``edges`` and ``groups``.

    Parameters
    ----------
    var:
        Binned variable whose bins are to be combined.
    coords:
        Per-bin coordinates (one value per input bin) used for binning/grouping.
    edges:
        Bin edges passed on to ``make_binned``.
    groups:
        Group labels passed on to ``make_binned``.
    dim:
        Dims to erase by combining bins.

    Returns
    -------
    :
        A binned variable with the combined bins. (Return annotation corrected:
        the function returns the result of ``_with_bin_sizes``, a Variable, not
        a dict.)
    """
    # Local import to avoid a circular import at module load time.
    from .binning import make_binned

    # Overview
    # --------
    # The purpose of this code is to combine existing bins, but in a more general
    # manner than `concat`, which combines all bins along a dimension. Here we operate
    # more like `groupby`, which combines selected subsets and creates a new output dim.
    #
    # Approach
    # --------
    # The algorithm works conceptually similar to `_concat_bins`, but with an additional
    # step, calling `make_binned` for grouping within the erased dims. For the final
    # output binning, instead of summing the input bin sizes over all erased dims, we
    # sum only within the groups created by `make_binned`.

    # Preserve subspace dim order of input data, instead of the one given by `dim`
    concrete_dims_ = concrete_dims(var, dim)
    changed_dims = [d for d in var.dims if d in concrete_dims_]
    unchanged_dims = [d for d in var.dims if d not in changed_dims]
    changed_shape = [var.sizes[d] for d in changed_dims]
    unchanged_shape = [var.sizes[d] for d in unchanged_dims]
    changed_volume = prod(changed_shape)

    # Move modified dims to innermost. Below this enables us to keep other dims
    # (listed in unchanged_dims) untouched by creating pseudo bins that wrap the entire
    # changed subspaces. make_binned below will thus only operate within each pseudo
    # bins, without mixing contents from different unchanged bins.
    var = var.transpose(unchanged_dims + changed_dims)
    # `params` carries, per input bin: its size and its begin/end indices into the
    # content buffer, plus the coords used for binning/grouping.
    params = DataArray(var.bins.size(), coords=coords)
    params.coords['begin'] = var.bins.constituents['begin'].copy()
    params.coords['end'] = var.bins.constituents['end'].copy()

    # Sizes and begin/end indices of changed subspace
    sub_sizes = index(changed_volume).broadcast(
        dims=unchanged_dims, shape=unchanged_shape
    )
    # Flatten to a throwaway dim name (uuid avoids clashing with existing dims).
    params = params.flatten(to=uuid.uuid4().hex)
    # Setup pseudo binning for unchanged subspace. All further reordering (for grouping
    # and binning) will then occur *within* those pseudo bins (by splitting them).
    params = _with_bin_sizes(params, sub_sizes)
    # Apply desired binning/grouping to sizes and begin/end, splitting the pseudo bins.
    params = make_binned(params, edges=edges, groups=groups)

    # Setup view of source content with desired target bin order
    source = _cpp._bins_no_validate(
        data=var.bins.constituents['data'],
        dim=var.bins.constituents['dim'],
        begin=params.bins.constituents['data'].coords['begin'],
        end=params.bins.constituents['data'].coords['end'],
    )
    # Call `copy()` to reorder data. This is based on the underlying behavior of `copy`
    # for binned data: It computes a new contiguous and ordered mapping of bin contents
    # to the content buffer. The main purpose of that mechanism is to deal, e.g., with
    # copies of slices, but here we can leverage the same mechanism.
    # Then we call `_with_bin_sizes` to put in place new indices, "merging" the
    # reordered input bins to desired output bins.
    return _with_bin_sizes(source.copy(), sizes=params.data.bins.sum())
def combine_bins(
    da: DataArray, edges: List[Variable], groups: List[Variable], dim: Dims
) -> DataArray:
    """Combine existing bins of ``da`` over ``dim``.

    Without ``edges`` and ``groups`` this is a plain concatenation of bins;
    otherwise bins are combined according to the requested binning/grouping.
    Masked bins (via an irreducible mask over ``dim``) are excluded.
    """
    masked = hide_masked(da, dim)
    if edges or groups:
        coord_names = [var.dim for var in itertools.chain(edges, groups)]
        selected_coords = {name: da.coords[name] for name in coord_names}
        data = _combine_bins(
            masked, coords=selected_coords, edges=edges, groups=groups, dim=dim
        )
    else:
        data = _concat_bins(masked, dim=dim)
    out = rewrap_reduced_data(da, data, dim=dim)
    # Attach the binning/grouping coords to the output.
    for var in itertools.chain(edges, groups):
        out.coords[var.dim] = var
    return out
def concat_bins(obj: VariableLikeType, dim: Dims = None) -> VariableLikeType:
    """Concatenate bins of ``obj`` along ``dim``.

    Accepts either a DataArray or a Variable; the return type matches the
    input type.
    """
    if isinstance(obj, DataArray):
        return combine_bins(obj, edges=[], groups=[], dim=dim)
    # Wrap the Variable so combine_bins can operate, then unwrap the result.
    return combine_bins(DataArray(obj), edges=[], groups=[], dim=dim).data