Coverage for install/scipp/core/bin_remapping.py: 74%

66 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-04-28 01:28 +0000

1# SPDX-License-Identifier: BSD-3-Clause 

2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp) 

3# @author Simon Heybrock 

4import itertools 

5import uuid 

6from math import prod 

7from typing import Dict, List 

8 

9from .._scipp import core as _cpp 

10from ..typing import Dims, VariableLikeType 

11from .concepts import concrete_dims, irreducible_mask, rewrap_reduced_data 

12from .cpp_classes import DataArray, Variable 

13from .cumulative import cumsum 

14from .operations import where 

15from .variable import index 

16 

17 

def hide_masked(da: DataArray, dim: Dims) -> DataArray:
    """Return the data of ``da`` with bins affected by an irreducible mask hidden.

    The content buffer is never copied: only the begin/end index arrays are
    modified, so the result aliases the input's event data.
    """
    mask = irreducible_mask(da, dim)
    if mask is None:
        return da.data
    # Avoid boolean indexing of bin *content*, which would (partially) copy the
    # content buffer. Only the begin/end indices are touched; the buffer is reused.
    comps = da.bins.constituents
    if mask.ndim == 1:
        # A 1-D mask lets us drop entire "rows" or "columns" of bins, which can
        # drastically reduce the number of bins to handle downstream.
        keep = ~mask
        comps['begin'] = comps['begin'][keep]
        comps['end'] = comps['end'][keep]
    else:
        # For 2-D or higher masks, fall back to emptying masked bins by
        # setting end = begin.
        comps['end'] = where(mask, comps['begin'], comps['end'])
    return _cpp._bins_no_validate(**comps)

36 

37 

def _with_bin_sizes(var: Variable, sizes: Variable) -> Variable:
    """Return a binned variable over ``var``'s content with bin boundaries
    computed from the given per-bin ``sizes`` (via a cumulative sum).
    """
    stop = cumsum(sizes)
    start = stop - sizes
    if var.bins is None:
        buffer = var
        buffer_dim = var.dim
    else:
        constituents = var.bins.constituents
        buffer = constituents['data']
        buffer_dim = constituents['dim']
    return _cpp._bins_no_validate(data=buffer, dim=buffer_dim, begin=start, end=stop)

44 

45 

def _concat_bins(var: Variable, dim: List[str]) -> Variable:
    """Concatenate the bins of ``var`` over the given dims into single bins.

    Two things need to happen to concat bins:

    1. Data is written to a contiguous chunk.
    2. New bin begin/end indices are set up.

    If the dims to concatenate are the *inner* dims, a plain ``copy()`` performs
    step 1; otherwise we transpose first and then copy. For step 2 we sum the
    (transposed) input bin sizes over the concat dims and let
    ``_with_bin_sizes`` compute the new begin/end indices.
    """
    to_concat = list(concrete_dims(var, dim))
    to_keep = [d for d in var.dims if d not in to_concat]
    # TODO It would be possible to support a copy=False parameter, to skip the
    # copy if the copy would not result in any moving or reordering.
    contiguous = var.transpose(to_keep + to_concat).copy()
    new_sizes = contiguous.bins.size().sum(to_concat)
    return _with_bin_sizes(contiguous, new_sizes)

61 

62 

def _combine_bins(
    var: Variable,
    coords: Dict[str, Variable],
    edges: List[Variable],
    groups: List[Variable],
    dim: Dims,
) -> Variable:
    """Combine existing bins of ``var`` by binning/grouping within erased dims.

    Parameters
    ----------
    var:
        Binned variable whose bins are to be combined.
    coords:
        Coordinates (one per entry in ``edges``/``groups``) used by
        ``make_binned`` to decide which bins to combine.
    edges:
        Bin edges passed on to ``make_binned``.
    groups:
        Groups passed on to ``make_binned``.
    dim:
        Dims to erase; bins are combined only within this subspace.

    Returns
    -------
    :
        Binned variable with combined bins and a contiguous, reordered
        content buffer.
    """
    from .binning import make_binned

    # Overview
    # --------
    # The purpose of this code is to combine existing bins, but in a more general
    # manner than `concat`, which combines all bins along a dimension. Here we operate
    # more like `groupby`, which combines selected subsets and creates a new output dim.
    #
    # Approach
    # --------
    # The algorithm works conceptually similar to `_concat_bins`, but with an additional
    # step, calling `make_binned` for grouping within the erased dims. For the final
    # output binning, instead of summing the input bin sizes over all erased dims, we
    # sum only within the groups created by `make_binned`.
    # Preserve subspace dim order of input data, instead of the one given by `dim`
    concrete_dims_ = concrete_dims(var, dim)
    changed_dims = [d for d in var.dims if d in concrete_dims_]
    unchanged_dims = [d for d in var.dims if d not in changed_dims]
    changed_shape = [var.sizes[d] for d in changed_dims]
    unchanged_shape = [var.sizes[d] for d in unchanged_dims]
    changed_volume = prod(changed_shape)

    # Move modified dims to innermost. Below this enables us to keep other dims
    # (listed in unchanged_dims) untouched by creating pseudo bins that wrap the entire
    # changed subspaces. make_binned below will thus only operate within each pseudo
    # bins, without mixing contents from different unchanged bins.
    var = var.transpose(unchanged_dims + changed_dims)
    # `params` carries the per-bin metadata (sizes plus original begin/end
    # indices) through the flatten/bin/group steps below.
    params = DataArray(var.bins.size(), coords=coords)
    params.coords['begin'] = var.bins.constituents['begin'].copy()
    params.coords['end'] = var.bins.constituents['end'].copy()

    # Sizes and begin/end indices of changed subspace
    sub_sizes = index(changed_volume).broadcast(
        dims=unchanged_dims, shape=unchanged_shape
    )
    # Flatten to a throwaway dim name (uuid avoids clashing with existing dims).
    params = params.flatten(to=uuid.uuid4().hex)
    # Setup pseudo binning for unchanged subspace. All further reordering (for grouping
    # and binning) will then occur *within* those pseudo bins (by splitting them).
    params = _with_bin_sizes(params, sub_sizes)
    # Apply desired binning/grouping to sizes and begin/end, splitting the pseudo bins.
    params = make_binned(params, edges=edges, groups=groups)

    # Setup view of source content with desired target bin order
    source = _cpp._bins_no_validate(
        data=var.bins.constituents['data'],
        dim=var.bins.constituents['dim'],
        begin=params.bins.constituents['data'].coords['begin'],
        end=params.bins.constituents['data'].coords['end'],
    )
    # Call `copy()` to reorder data. This is based on the underlying behavior of `copy`
    # for binned data: It computes a new contiguous and ordered mapping of bin contents
    # to the content buffer. The main purpose of that mechanism is to deal, e.g., with
    # copies of slices, but here we can leverage the same mechanism.
    # Then we call `_with_bin_sizes` to put in place new indices, "merging" the
    # reordered input bins to desired output bins.
    return _with_bin_sizes(source.copy(), sizes=params.data.bins.sum())

126 

127 

def combine_bins(
    da: DataArray, edges: List[Variable], groups: List[Variable], dim: Dims
) -> DataArray:
    """Combine bins of ``da`` over ``dim``, either by plain concatenation or by
    binning/grouping with the given ``edges``/``groups``.

    Bins hidden by an irreducible mask are excluded before combining. The
    coords corresponding to ``edges`` and ``groups`` are set on the output.
    """
    masked = hide_masked(da, dim)
    if edges or groups:
        names = [c.dim for c in itertools.chain(edges, groups)]
        coords = {name: da.coords[name] for name in names}
        data = _combine_bins(
            masked, coords=coords, edges=edges, groups=groups, dim=dim
        )
    else:
        # No edges/groups given: simply concatenate all bins along `dim`.
        data = _concat_bins(masked, dim=dim)
    out = rewrap_reduced_data(da, data, dim=dim)
    for c in itertools.chain(edges, groups):
        out.coords[c.dim] = c
    return out

142 

143 

def concat_bins(obj: VariableLikeType, dim: Dims = None) -> VariableLikeType:
    """Concatenate bins of a variable or data array over ``dim``, returning the
    same type as the input (a plain variable input yields a plain variable).
    """
    is_data_array = isinstance(obj, DataArray)
    da = obj if is_data_array else DataArray(obj)
    result = combine_bins(da, edges=[], groups=[], dim=dim)
    return result if is_data_array else result.data