Coverage for install/scipp/core/bin_remapping.py: 70%

76 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-11-17 01:51 +0000

1# SPDX-License-Identifier: BSD-3-Clause 

2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp) 

3# @author Simon Heybrock 

4import itertools 

5import uuid 

6from collections.abc import Sequence 

7from math import prod 

8from typing import TYPE_CHECKING, TypeVar 

9 

10from .._scipp import core as _cpp 

11from ..typing import Dims 

12from .concepts import concrete_dims, irreducible_mask, rewrap_reduced_data 

13from .cpp_classes import DataArray, Variable 

14from .cumulative import cumsum 

15from .operations import where 

16from .variable import index 

17 

18if TYPE_CHECKING: 

19 from .bins import Bins 

20 

21 

def hide_masked(da: DataArray, dim: Dims) -> Variable:
    """Return the binned data of ``da`` with bins that are masked in ``dim`` hidden.

    Raises ``ValueError`` if the input is not binned.
    """
    if da.bins is None:
        raise ValueError("Input must be binned")
    mask = irreducible_mask(da, dim)
    if mask is None:
        return da.data
    # Index only begin/end instead of boolean-indexing the data, so the
    # content buffer is reused rather than (partially) copied.
    parts = da.bins.constituents
    if mask.ndim == 1:
        # A 1-D mask lets us drop entire "rows" or "columns" of bins, which
        # can drastically reduce the number of bins to handle in some cases
        # for better performance.
        keep = ~mask
        parts['begin'] = parts['begin'][keep]
        parts['end'] = parts['end'][keep]
    else:
        # For 2-D or higher masks, fall back to making masked bins "empty"
        # by setting end=begin.
        parts['end'] = where(mask, parts['begin'], parts['end'])
    return _cpp._bins_no_validate(**parts)  # type: ignore[no-any-return]

42 

43 

def _with_bin_sizes(var: Variable | DataArray, sizes: Variable) -> Variable:
    """Rebin ``var`` with begin/end indices computed from the given bin ``sizes``."""
    stop = cumsum(sizes)
    start = stop - sizes
    if var.bins is None:
        data = var
        dim = var.dim
    else:
        parts = var.bins.constituents
        data = parts['data']
        dim = parts['dim']
    return _cpp._bins_no_validate(data=data, dim=dim, begin=start, end=stop)  # type: ignore[no-any-return]

50 

51 

def _concat_bins(var: Variable, dim: Dims) -> Variable:
    """Concatenate bins of ``var`` along the given dims, erasing those dims.

    Two things need to happen: (1) data is written to a contiguous chunk and
    (2) new bin begin/end indices are set up. If the dims to concatenate are
    the *inner* dims, a call to ``copy()`` performs (1); otherwise we first
    transpose and then ``copy()``. For (2) we simply sum the (transposed)
    input bin sizes over the concat dims, which ``_with_bin_sizes`` uses to
    compute new begin/end indices.
    """
    erased = list(concrete_dims(var, dim))
    kept = [d for d in var.dims if d not in erased]
    # TODO It would be possible to support a copy=False parameter, to skip the
    # copy if the copy would not result in any moving or reordering.
    contiguous = var.transpose(kept + erased).copy()
    contiguous_bins: Bins[Variable] = contiguous.bins  # type: ignore[assignment]
    return _with_bin_sizes(contiguous, contiguous_bins.size().sum(erased))

68 

69 

def _combine_bins(
    var: Variable,
    coords: dict[str, Variable],
    edges: Sequence[Variable],
    groups: Sequence[Variable],
    dim: Dims,
) -> Variable:
    """Combine existing bins within the erased dims via binning/grouping.

    :param var: Binned variable whose bins are to be combined.
    :param coords: Coordinates (one per entry in ``edges``/``groups``) used to
        decide which input bins end up in which output bin.
    :param edges: Bin-edge variables for binning within the erased dims.
    :param groups: Group variables for grouping within the erased dims.
    :param dim: Dims to erase; all other dims are left untouched.
    :return: Binned variable with the requested output binning.
    """
    from .binning import make_binned

    # Overview
    # --------
    # The purpose of this code is to combine existing bins, but in a more general
    # manner than `concat`, which combines all bins along a dimension. Here we operate
    # more like `groupby`, which combines selected subsets and creates a new output dim.
    #
    # Approach
    # --------
    # The algorithm works conceptually similar to `_concat_bins`, but with an additional
    # step, calling `make_binned` for grouping within the erased dims. For the final
    # output binning, instead of summing the input bin sizes over all erased dims, we
    # sum only within the groups created by `make_binned`.
    # Preserve subspace dim order of input data, instead of the one given by `dim`
    concrete_dims_ = concrete_dims(var, dim)
    changed_dims = [d for d in var.dims if d in concrete_dims_]
    unchanged_dims = [d for d in var.dims if d not in changed_dims]
    changed_shape = [var.sizes[d] for d in changed_dims]
    unchanged_shape = [var.sizes[d] for d in unchanged_dims]
    changed_volume = prod(changed_shape)

    # Move modified dims to innermost. Below this enables us to keep other dims
    # (listed in unchanged_dims) untouched by creating pseudo bins that wrap the entire
    # changed subspaces. make_binned below will thus only operate within each pseudo
    # bins, without mixing contents from different unchanged bins.
    var = var.transpose(unchanged_dims + changed_dims)
    var_bins: Bins[Variable] = var.bins  # type: ignore[assignment]
    # `params` carries per-input-bin metadata (size, begin, end) through the
    # binning/grouping step below, in place of the actual bin contents.
    params = DataArray(var_bins.size(), coords=coords)
    params.coords['begin'] = var_bins.constituents['begin'].copy()
    params.coords['end'] = var_bins.constituents['end'].copy()

    # Sizes and begin/end indices of changed subspace
    sub_sizes = index(changed_volume).broadcast(
        dims=unchanged_dims, shape=unchanged_shape
    )
    # Flatten to 1-D; uuid avoids clashing with an existing dim name.
    params = params.flatten(to=uuid.uuid4().hex)
    # Setup pseudo binning for unchanged subspace. All further reordering (for grouping
    # and binning) will then occur *within* those pseudo bins (by splitting them).
    params_data = _with_bin_sizes(params, sub_sizes)
    # Apply desired binning/grouping to sizes and begin/end, splitting the pseudo bins.
    params = make_binned(params_data, edges=edges, groups=groups)

    # Setup view of source content with desired target bin order
    source = _cpp._bins_no_validate(
        data=var_bins.constituents['data'],
        dim=var_bins.constituents['dim'],
        begin=params.bins.constituents['data'].coords['begin'],  # type: ignore[union-attr]
        end=params.bins.constituents['data'].coords['end'],  # type: ignore[union-attr]
    )
    # Call `copy()` to reorder data. This is based on the underlying behavior of `copy`
    # for binned data: It computes a new contiguous and ordered mapping of bin contents
    # to the content buffer. The main purpose of that mechanism is to deal, e.g., with
    # copies of slices, but here we can leverage the same mechanism.
    # Then we call `_with_bin_sizes` to put in place new indices, "merging" the
    # reordered input bins to desired output bins.
    return _with_bin_sizes(
        source.copy(),
        sizes=params.data.bins.sum(),  # type: ignore[union-attr]
    )

137 

138 

def combine_bins(
    da: DataArray, edges: Sequence[Variable], groups: Sequence[Variable], dim: Dims
) -> DataArray:
    """Combine the bins of a binned data array along the given dims.

    With neither ``edges`` nor ``groups`` this concatenates all bins in the
    erased dims; otherwise bins are combined according to the requested
    binning/grouping. Masked bins are excluded from the output. The coords
    given by ``edges`` and ``groups`` are set on the result.

    Raises ``ValueError`` if the input is not binned.
    """
    if da.bins is None:
        raise ValueError("Input must be binned")
    masked = hide_masked(da, dim)
    if edges or groups:
        names = [coord.dim for coord in itertools.chain(edges, groups)]
        data = _combine_bins(
            masked,
            coords={name: da.coords[name] for name in names},
            edges=edges,
            groups=groups,
            dim=dim,
        )
    else:
        data = _concat_bins(masked, dim=dim)
    out = rewrap_reduced_data(da, data, dim=dim)
    for coord in itertools.chain(edges, groups):
        out.coords[coord.dim] = coord
    return out

155 

156 

_VarDa = TypeVar('_VarDa', Variable, DataArray)


def concat_bins(obj: _VarDa, dim: Dims = None) -> _VarDa:
    """Concatenate the bins of a binned variable or data array along ``dim``.

    Returns the same type (Variable or DataArray) as the input.
    """
    is_data_array = isinstance(obj, DataArray)
    da = obj if is_data_array else DataArray(obj)  # type: ignore[arg-type, redundant-expr]
    combined = combine_bins(da, edges=[], groups=[], dim=dim)
    return combined if is_data_array else combined.data  # type: ignore[redundant-expr]