Coverage for install/scipp/core/binning.py: 71%

283 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-11-17 01:51 +0000

1# SPDX-License-Identifier: BSD-3-Clause 

2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp) 

3# @author Simon Heybrock 

4import itertools 

5import uuid 

6from collections.abc import Iterable, Sequence 

7from typing import Any, SupportsIndex, TypeVar, overload 

8 

9from .._scipp import core as _cpp 

10from .bin_remapping import combine_bins 

11from .bins import Bins 

12from .cpp_classes import BinEdgeError, CoordError, DataArray, Dataset, DType, Variable 

13from .data_group import DataGroup, data_group_overload 

14from .math import round as round_ 

15from .shape import concat 

16from .variable import arange, array, epoch, linspace, scalar 

17 

# Type variable for helpers that accept and return the same kind of container,
# restricted (via ``bound``) to DataArray or Dataset.
_DaDs = TypeVar('_DaDs', bound=DataArray | Dataset)

19 

20 

@overload
def make_histogrammed(
    x: Variable | DataArray, *, edges: Variable, erase: Iterable[str] = ()
) -> DataArray: ...


@overload
def make_histogrammed(
    x: Dataset, *, edges: Variable, erase: Iterable[str] = ()
) -> Dataset: ...


def make_histogrammed(
    x: Variable | DataArray | Dataset, *, edges: Variable, erase: Iterable[str] = ()
) -> DataArray | Dataset:
    """Histogram data into the given bins, producing dense data.

    Existing binning dimensions of binned input are kept, except that
    histogramming along a dimension that is already binned replaces that
    binning.

    In most cases :py:func:`scipp.hist` is the better entry point.

    Parameters
    ----------
    x:
        Input data. A plain variable is used as the coordinate to histogram,
        with every element contributing a weight of 1.0 in unit 'counts'.
    edges:
        Bin edges. The inner (last) dimension of the edges is the dimension
        that is histogrammed.
    erase:
        Names of dimensions to erase from the input.

    Returns
    -------
    :
        DataArray / Dataset whose values are the sums of the input values
        falling into each bin.

    Raises
    ------
    BinEdgeError
        If the input is binned, has no event coordinate for the histogrammed
        dimension, and its outer coordinate for that dimension is bin edges.
    ValueError
        If ``erase`` shares a dimension with ``edges``.

    See Also
    --------
    scipp.hist:
        Recommended interface for histogramming data.
    scipp.bin:
        For binning data.
    """
    if isinstance(x, Variable):
        # Wrap the bare variable: it becomes the coordinate, the data is all ones.
        counts = scalar(1.0, unit='counts').broadcast(sizes=x.sizes)
        x = DataArray(counts, coords={edges.dim: x})
    elif isinstance(x, DataArray) and x.bins is not None:
        event_dim = edges.dims[-1]
        if event_dim not in x.bins.coords:
            # No event coordinate to histogram by: reduce the bins first and
            # histogram the resulting dense data instead.
            # The second `event_dim` is necessary in case the coord is
            # multi-dimensional.
            if not x.coords.is_edges(event_dim, event_dim):
                return make_histogrammed(x.bins.sum(), edges=edges, erase=erase)
            raise BinEdgeError(
                "Cannot histogram data with existing bin edges "
                "unless event data coordinate for histogramming is available."
            )
    _check_erase_dimension_clash(erase, edges)
    # The C++ implementation uses an older heuristic histogramming a single
    # dimension. The input is therefore transposed and flattened to match it.
    hist_dim = edges.dims[-1]
    flatten_dims = [d for d in x.dims if d in erase]
    if hist_dim in x.dims:
        flatten_dims.append(hist_dim)
    if not flatten_dims:
        return _cpp.histogram(x, edges)  # type: ignore[no-any-return]
    x = _drop_coords_for_hist(x, flatten_dims, keep=(hist_dim,))
    x = _transpose_and_flatten_for_hist(x, flatten_dims, to=hist_dim)
    return _cpp.histogram(x, edges)  # type: ignore[no-any-return]

90 

91 

def _drop_coords_for_hist(x: _DaDs, dims: Iterable[str], keep: Iterable[str]) -> _DaDs:
    """Remove coords that would only make a later flatten/bin step expensive.

    For binned input the coords of the bin contents (``x.bins``) are inspected
    and dropped; otherwise the coords of ``x`` itself are used. A coord is
    dropped when its name is not in ``keep`` and it depends on at least one of
    ``dims``.
    """
    target = x if x.bins is None else x.bins
    unwanted = [
        label
        for label, coord in target.coords.items()
        if (label not in keep) and (set(coord.dims) & set(dims))
    ]
    return target.drop_coords(unwanted)  # type: ignore[return-value]

100 

101 

def _transpose_and_flatten_for_hist(x: _DaDs, dims: Sequence[str], to: str) -> _DaDs:
    """Move ``dims`` to the end of ``x`` and flatten them into dimension ``to``.

    The dimensions not listed in ``dims`` keep their relative order and come
    first; ``dims`` follow in the order given, so they can be flattened.
    """
    leading = [d for d in x.dims if d not in dims]
    new_order = leading + list(dims)
    # `make_histogrammed` does not fully support `Dataset`.
    # This needs to be fixed, but for now, we just ignore the type error here.
    reordered = x.transpose(new_order)  # type: ignore[union-attr]
    return reordered.flatten(dims=dims, to=to)  # type: ignore[return-value]

109 

110 

def make_binned(
    x: Variable | DataArray,
    *,
    edges: Sequence[Variable] | None = None,
    groups: Sequence[Variable] | None = None,
    erase: Sequence[str] = (),
) -> DataArray:
    """Create binned data by binning input along all dimensions given by edges or
    groups.

    Usually :py:func:`scipp.bin` or :py:func:`scipp.group` should be preferred,
    unless the more precise control over which dimensions should be erased is required,
    or unless grouping and binning at the same time is required.

    This does not histogram the data, each output bin will contain a "list" of
    input values.

    At least one argument of ``edges`` and ``groups`` is required.

    If the input is binned and certain bins are masked then changing the binning
    will apply the masks, i.e., masked bins are treated as empty.

    Warning
    -------

    When there is existing binning or grouping, the algorithm assumes that coordinates
    of the binned data are correct, i.e., compatible with the corresponding
    coordinate values in the individual bins. If this is not the case then the behavior
    is UNSPECIFIED. That is, the algorithm may or may not ignore the existing
    coordinates. If you encounter such a case, remove the conflicting coordinate,
    e.g., using :py:func:`scipp.DataArray.drop_coords`.

    Parameters
    ----------
    x:
        Input data.
    edges:
        Bin edges, one per dimension to bin in.
    groups:
        Keys to group input by one per dimension to group in.
    erase:
        Dimension labels to remove from output.

    Returns
    -------
    :
        Binned ``x``.

    Raises
    ------
    ValueError
        If ``erase`` shares a dimension with any of ``edges`` or ``groups``, or
        if ``x`` is a dense variable and ``edges`` plus ``groups`` do not
        contain exactly one entry.

    See Also
    --------
    scipp.hist:
        For histogramming data.
    scipp.bin:
        Recommended interface for binning data.
    scipp.group:
        Recommended interface for grouping data.
    scipp.bins:
        For creating binned data based on explicitly given index ranges.
    """
    # Normalize the optional arguments so they can be unpacked and concatenated.
    if groups is None:
        groups = []
    if edges is None:
        edges = []
    _check_erase_dimension_clash(erase, *edges, *groups)

    if isinstance(x, Variable) and x.bins is not None:
        # Binned variable: wrap it so the DataArray code paths below apply.
        x = DataArray(x)
    elif isinstance(x, Variable):
        # Dense variable: it becomes the coordinate of a data array of ones, so
        # exactly one edge/group variable is needed to name that coordinate.
        coords = [*edges, *groups]
        if len(coords) != 1:
            raise ValueError(
                "Edges for exactly one dimension must be specified when "
                "binning or histogramming a variable."
            )
        data = scalar(1.0, unit='counts').broadcast(sizes=x.sizes).copy()
        x = DataArray(data, coords={coords[0].dim: x})
    # If all requested coords are outer (per-bin) coords of binned data and their
    # dims are all erased, whole bins can be combined without touching events.
    if _can_operate_on_bins(x, edges, groups, erase):
        return combine_bins(x, edges=edges, groups=groups, dim=erase)
    # Many-to-many mapping is expensive, concat first is generally cheaper,
    # despite extra copies. If some coords are dense, perform binning in two steps,
    # since concat is not possible then (without mapping dense coords to binned coords,
    # which might bypass some other optimizations).
    if erase and x.bins is not None:
        # "Dense" here means: no event coordinate of that name inside the bins.
        dense_edges = [var for var in edges if var.dims[-1] not in x.bins.coords]
        dense_groups = [var for var in groups if var.dims[-1] not in x.bins.coords]
        if len(dense_edges) + len(dense_groups) == 0:
            # Everything is event-based: concatenate the erased dims up front.
            x = x.bins.concat(erase)
            erase = ()
        elif len(dense_edges) + len(dense_groups) < len(edges) + len(groups):
            # Mixed case: first bin by the dense coords (erasing dims), then
            # continue below with only the event-based edges/groups.
            x = make_binned(x, edges=dense_edges, groups=dense_groups, erase=erase)
            b: Bins[DataArray] = x.bins  # type: ignore[assignment]
            edges = [var for var in edges if var.dims[-1] in b.coords]
            groups = [var for var in groups if var.dims[-1] in b.coords]
            erase = ()
    if x.ndim == 0:
        # Scalar input: bin the value held by the scalar and re-attach the
        # coords and masks of `x` to the result.
        return (  # type: ignore[no-any-return]
            _cpp.bin(x.value, edges, groups, erase)
            .assign_coords(x.coords)
            .assign_masks(x.masks)
        )
    x = _prepare_multi_dim_dense(x, *edges, *groups)
    return _cpp.bin(x, edges, groups, erase)  # type: ignore[no-any-return]

213 

214 

def _prepare_multi_dim_dense(x: DataArray, *edges_or_groups: Variable) -> DataArray:
    """Prepare data for binning or grouping.

    This function is a workaround for the C++ implementation not being able to deal with
    multi-dimensional dense input data. The workaround is to flatten the data along the
    auxiliary dimensions and regroup.

    In case the ultimate operation is histogramming, this leads to the desired
    higher-dimensional histogram. In case of binning or grouping, we obtain binned data
    with one additional dimension, whereas conceptually we might expect only the
    requested dimensions, with the auxiliary dimensions inside the bin content. As this
    case is likely rare and extra dimensions in bin content are barely supported in
    scipp, we consider this acceptable for now.

    Parameters
    ----------
    x:
        Input data. Binned or 1-D input is returned unchanged.
    edges_or_groups:
        The edge and group variables of the pending operation; each must be 1-D.

    Returns
    -------
    :
        ``x`` itself, or ``x`` flattened and regrouped by its auxiliary
        dimensions. The auxiliary dimensions are regrouped in the order in which
        they appear in ``x.dims``.

    Raises
    ------
    ValueError
        If any edge/group variable is not 1-D, or if the operation does not act
        along exactly one dimension of ``x``.
    """
    if x.bins is not None or x.ndim == 1:
        return x
    if any(var.ndim != 1 for var in edges_or_groups):
        raise ValueError("Cannot bin multi-dimensional dense data with ragged edges.")
    op_dims = _get_op_dims(x, *edges_or_groups)
    if len(op_dims) != 1:
        raise ValueError("Cannot bin multi-dimensional dense data along multiple dims.")
    (op_dim,) = op_dims
    # Keep the auxiliary dims in an ordered list that follows `x.dims`. A set
    # would make the order of the helper group coords depend on string hash
    # randomization.
    extra = [dim for dim in x.dims if dim != op_dim]
    # Coords depending only on auxiliary dims are re-attached after regrouping.
    original_coords = {
        name: coord
        for name, coord in x.coords.items()
        if set(coord.dims).issubset(extra)
    }
    # Integer index coords along each auxiliary dim, used as group keys.
    helper_coords = {dim: arange(dim, x.sizes[dim]) for dim in extra}
    return (
        x.assign_coords(helper_coords)
        # A random UUID is used as the name of the temporary flattened dim.
        .flatten(to=str(uuid.uuid4()))
        .group(*helper_coords.values())
        .drop_coords(tuple(extra))
        .assign_coords(original_coords)
    )

250 

251 

def _check_erase_dimension_clash(
    erase: Iterable[str], *edges_or_groups: Variable
) -> None:
    """Raise if a dimension to erase is also a dimension of an edge/group variable.

    Raises
    ------
    ValueError
        If ``erase`` and the union of the dims of ``edges_or_groups`` overlap.
    """
    new_dims: set[str] = set().union(*(var.dims for var in edges_or_groups))
    if new_dims.isdisjoint(set(erase)):
        return
    raise ValueError(
        f"Clash of dimension(s) to reduce {erase} with dimensions defined by "
        f"edges or groups: {new_dims}."
    )

263 

264 

def _can_operate_on_bins(
    x: DataArray,
    edges: Iterable[Variable],
    groups: Iterable[Variable],
    erase: Iterable[str],
) -> bool:
    """Tell whether the operation can act on whole bins of ``x``.

    This requires binned input and, for every edge/group variable, that it is
    1-D, that the bin contents have no coordinate of that name, and that ``x``
    has an outer coordinate of that name. In addition, all dimensions of those
    outer coordinates must be listed in ``erase``.
    """
    events = x.bins
    if events is None:
        return False
    touched: set[str] = set()
    for var in itertools.chain(edges, groups):
        if var.ndim != 1:
            return False
        name = var.dim
        if name in events.coords or name not in x.coords:
            return False
        touched |= set(x.coords[name].dims)
    return touched.issubset(set(erase))

283 

284 

def _require_coord(name: str, coord: object) -> None:
    """Raise ``CoordError`` naming ``name`` if ``coord`` is ``None``."""
    if coord is not None:
        return
    raise CoordError(f"Coordinate '{name}' not found.")

288 

289 

def _get_coord(x: Variable | DataArray | Dataset, name: str) -> Variable:
    """Return the coordinate ``name`` of ``x`` used to derive bin bounds.

    - A variable is returned as-is.
    - For a data array the coordinate is looked up in ``x.deprecated_meta``,
      falling back to the coordinate of the bin contents if ``x`` is binned.
    - For a dataset, a two-element variable (dimension ``'dummy'``) holding the
      overall minimum and maximum of the coordinate across all items is
      returned.

    Raises
    ------
    ValueError
        If ``x`` is an empty dataset.
    CoordError
        If no coordinate ``name`` is found.
    """
    if isinstance(x, Variable):
        return x
    if isinstance(x, Dataset):
        if not x.values():
            raise ValueError("Dataset is empty")
        cmin: Variable | None = None
        cmax: Variable | None = None
        for da in x.values():
            c = _get_coord(da, name)
            cmin = c.min() if cmin is None else min(cmin, c.min())  # type: ignore[call-overload]
            # Compare against the running maximum. Comparing against `cmin`
            # would discard the largest value seen so far and could yield a
            # range that does not cover all items.
            cmax = c.max() if cmax is None else max(cmax, c.max())  # type: ignore[call-overload]
        coord = concat([cmin, cmax], dim='dummy')  # type: ignore[type-var]
    else:
        # A coordinate on the data array itself takes precedence over the
        # coordinate of the bin contents.
        event_coord = x.bins.deprecated_meta.get(name) if x.bins is not None else None
        coord = x.deprecated_meta.get(name, event_coord)
    _require_coord(name, coord)
    return coord  # type: ignore[return-value]

308 

309 

def _upper_bound(x: Variable) -> Variable:
    """Return a value just above the largest non-NaN element of ``x``.

    For dtypes int32, int64 and datetime64 this is the maximum plus 1;
    otherwise it is the next representable value after the maximum in the
    direction of maximum + 1, as given by ``numpy.nextafter``.
    """
    import numpy as np

    upper = x.nanmax()
    if upper.dtype in ('int32', 'int64', 'datetime64'):
        upper.value += 1
        return upper
    one = scalar(1, unit=upper.unit, dtype=upper.dtype)
    direction = (upper + one).value
    upper.value = np.nextafter(upper.value, direction)
    return upper

321 

322 

def _parse_coords_arg(
    x: Variable | DataArray | Dataset, name: str, arg: SupportsIndex | Variable
) -> Variable:
    """Turn one binning argument into a variable of bin edges for ``name``.

    - A variable that has dimension ``name`` is returned unchanged.
    - Any other variable is used as the step of an ``arange`` from the
      coordinate's minimum up to its upper bound plus one step.
    - A non-variable argument is used as the number of bins (via
      ``__index__``): a ``linspace`` with that many bins spanning the
      coordinate's range is created.

    The upper end of the range is the coordinate's maximum if ``x`` already
    has bin edges for ``name``; otherwise it is extended by ``_upper_bound``.

    Raises
    ------
    ValueError
        If the coordinate's range is empty, or if the step is 0.
    """
    if isinstance(arg, Variable) and name in arg.dims:
        return arg
    coord = _get_coord(x, name)
    start = coord.nanmin()
    has_edges = (
        not isinstance(x, Variable)
        and (name in x.coords)
        and x.coords.is_edges(name, name)
    )
    # existing bin-edges, do not extend
    stop = coord.nanmax() if has_edges else _upper_bound(coord)
    if start > stop:
        raise ValueError(
            'Empty data range, cannot automatically determine bounds. '
            'Must provide concrete bin edges.'
        )
    if isinstance(arg, Variable):
        step = arg.to(dtype=start.dtype, unit=start.unit)
        if step.value == 0:
            raise ValueError("Step size cannot be 0.")
        return arange(name, start, stop + step, step=step)
    if start.dtype == DType.datetime64:
        # Compute the linspace on offsets from the epoch, round and convert to
        # int64, then shift back by the epoch.
        base = epoch(unit=start.unit)
        offsets = linspace(name, start - base, stop - base, num=arg.__index__() + 1)
        return base + round_(offsets).to(dtype='int64')
    spaced = linspace(name, start, stop, num=arg.__index__() + 1)
    return spaced.to(dtype=start.dtype, copy=False)

356 

357 

def _make_edges(
    x: Variable | DataArray | Dataset,
    arg_dict: dict[str, SupportsIndex | Variable] | None,
    kwargs: dict[str, SupportsIndex | Variable],
) -> dict[str, Variable]:
    """Build the edges for every binning argument, keyed by dimension label.

    ``arg_dict`` (if given) and ``kwargs`` are merged with
    ``dict(**arg_dict, **kwargs)``, so a label present in both raises
    ``TypeError``. Each merged value is converted by ``_parse_coords_arg``.
    """
    merged = kwargs if arg_dict is None else dict(**arg_dict, **kwargs)
    edges: dict[str, Variable] = {}
    for label, spec in merged.items():
        edges[label] = _parse_coords_arg(x, label, spec)
    return edges

366 

367 

def _find_replaced_dims(
    x: Variable | DataArray | Dataset,
    *,
    dims: Iterable[str],
    dim: str | tuple[str, ...] | None,
) -> list[str]:
    """Return the dimensions of ``x`` that the operation removes.

    Parameters
    ----------
    x:
        Input data.
    dims:
        Labels of the dimensions being created (the keys of the edges).
    dim:
        Explicit dimension(s) to remove, or None to use the dimensions of the
        coordinates of ``x`` named in ``dims``. Ignored if ``x`` is a variable,
        in which case all dimensions of ``x`` are candidates.

    Returns
    -------
    :
        The candidate dimensions that are not themselves in ``dims``, in the
        order in which they appear in ``x.dims``.
    """
    # `dims` is only required to be iterable but is traversed more than once
    # below, so materialize it to also support one-shot iterators.
    dims = tuple(dims)
    if isinstance(x, Variable):
        replaced = set(x.dims)
    elif dim is None:
        replaced = set()
        for name in dims:
            if name in x.coords:
                replaced.update(x.coords[name].dims)
    else:
        replaced = {dim} if isinstance(dim, str) else set(dim)
    # Computed once here rather than for every element of `x.dims`.
    erased = replaced - set(dims)
    return [d for d in x.dims if d in erased]

384 

385 

@overload
def hist(
    x: Variable | DataArray,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray: ...


@overload
def hist(
    x: Dataset,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Dataset: ...


@overload
def hist(
    x: DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataGroup[Any]: ...


@data_group_overload
def hist(
    x: Variable | DataArray | Dataset | DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray | Dataset | DataGroup[Any]:
    """Compute a histogram.

    Bin edges can be specified in three ways:

    1. When an integer is provided, a 'linspace' with this requested number of
       bins is created, based on the min and max of the corresponding coordinate.
    2. A scalar Scipp variable (a value with a unit) is interpreted as a target
       bin width, and an 'arange' covering the min and max of the corresponding
       coordinate is created.
    3. A custom coordinate, given as a Scipp variable with compatible unit.
       Typically this should have a single dimension matching the target dimension.

    The `dim` argument controls which dimensions are summed over and which are
    preserved. The default `dim=None` means that the dimensions of the coordinate
    used for histogramming are summed over. In case of an input that is binned-data
    there may be no such coordinate, in which case `dim=None` is equivalent to `dim=()`,
    resulting in a new dimension in the output. In many cases this default yields the
    desired behavior, there are two classes of exceptions where specifying `dim`
    explicitly can be useful:

    1. Given input data with an N-D coordinate, where N>1, we can use `dim` to restrict
       the sum to a subset of M dimensions, resulting in an (N-M)-D "array" of histograms.
       This can be of particular importance when the input is binned data: Frequently
       we may want to bin to add an additional dimension, but if there is a dense
       coordinate present the default `dim=None` would result in removal of the
       coordinate's dimensions. This can be prevented by setting `dim=()`, which will
       always add a new dimension.
    2. Given M-D input data with an N-D coordinate, where N<M, we can specify `dim` to
       sum over, e.g., the remaining M-N dimensions while histogramming. This is often
       equivalent to not specifying `dim` and a call to `sum` after histogramming but
       is more memory efficient.

    If the dimensions of the input coordinate are not known, using an explicit `dim`
    argument can be useful to obtain predictable behavior in generic code.

    Parameters
    ----------
    x:
        Input data.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    dim:
        Dimension(s) to sum over when histogramming. If None (the default), the
        dimensions of the coordinate used for histogramming are summed over.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Histogrammed data.

    Raises
    ------
    ValueError
        If ``x`` is a variable and edges are not given for exactly one dimension.
    TypeError
        If no edges are given and ``x`` is not binned.

    See Also
    --------
    scipp.nanhist:
        Like :py:func:`scipp.hist`, but NaN values are skipped.
    scipp.bin:
        Creating binned data by binning instead of summing all contributions.
    scipp.binning.make_histogrammed:
        Lower level function for histogramming.

    Examples
    --------

    Histogram a table by one of its coord columns, specifying (1) number of bins, (2)
    bin width, or (3) actual binning:

      >>> from numpy.random import default_rng
      >>> rng = default_rng(seed=1234)
      >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
      >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
      >>> table.hist(x=2)
      <scipp.DataArray>
      Dimensions: Sizes[x:2, ]
      Coordinates:
      * x float64 [m] (x [bin-edge]) [0.00313229, 0.497696, 0.992259]
      Data:
       float64 [K] (x) [53, 47]

      >>> table.hist(x=sc.scalar(0.2, unit='m')).sizes
      {'x': 5}

      >>> table.hist(x=sc.linspace('x', 0.2, 0.8, num=10, unit='m')).sizes
      {'x': 9}

    Histogram a table by two of its coord columns:

      >>> table.hist(x=4, y=6).sizes
      {'x': 4, 'y': 6}

    Histogram binned data, using existing bins:

      >>> binned = table.bin(x=10)
      >>> binned.hist().sizes
      {'x': 10}

    Histogram binned data, using new bins along existing dimension:

      >>> binned = table.bin(x=10)
      >>> binned.hist(x=20).sizes
      {'x': 20}

    Histogram binned data along an additional dimension:

      >>> binned = table.bin(x=10)
      >>> binned.hist(y=5).sizes
      {'x': 10, 'y': 5}

    The `dim` argument controls which dimensions are summed over and which are
    preserved. Given 3-D data with a 2-D coordinate, the default `dim=None` results in:

      >>> xyz = sc.data.table_xyz(100).bin(x=4, y=5, z=6)
      >>> xyz.coords['t'] = sc.array(dims=['x', 'y'], unit='s', values=rng.random((4, 5)))
      >>> xyz.hist(t=3).sizes
      {'z': 6, 't': 3}

    Specifying `dim=('x', 'y', 'z')` or equivalently `dim=xyz.dims` will additionally
    sum over the z-dimension, resulting in a 1-D histogram:

      >>> xyz.hist(t=3, dim=('x', 'y', 'z')).sizes
      {'t': 3}

    To preserve a dimension of the input's t-coordinate, we can drop this dimension
    from the tuple of dimensions to sum over:

      >>> xyz.hist(t=4, dim='y').sizes
      {'x': 4, 'z': 6, 't': 4}
    """  # noqa: E501
    if isinstance(x, DataGroup):
        # Only to make mypy happy because we have `DataGroup` in annotation of `x`
        # so that Sphinx shows it.
        raise TypeError("Internal error: input should not be a DataGroup")
    # Turn every binning argument (count, width, or explicit edges) into a
    # variable of bin edges, keyed by dimension label.
    edges = _make_edges(x, arg_dict, kwargs)
    # Dimensions of the input that are summed over.
    erase = _find_replaced_dims(x, dims=edges, dim=dim)
    if isinstance(x, Variable) and len(edges) != 1:
        raise ValueError(
            "Edges for exactly one dimension must be specified when "
            "binning or histogramming a variable."
        )
    if len(edges) == 0:
        # No edges requested: only possible for binned input, whose existing
        # bins are summed.
        if x.bins is None:
            raise TypeError("Data is not binned so bin edges must be provided.")
        return x.bins.sum()
    if len(edges) == 1:
        # TODO Note that this may swap dims, is that ok?
        out = make_histogrammed(x, edges=next(iter(edges.values())), erase=erase)
    else:
        # Several edges: bin by all but the last, then histogram by the last.
        # Drop coords that would disappear by histogramming, to avoid costly handling
        # in intermediate binning step.
        if isinstance(x, DataArray):
            x = _drop_coords_for_hist(x, dims=erase, keep=edges)
        elif isinstance(x, Dataset):
            x = Dataset(
                {
                    k: _drop_coords_for_hist(v, dims=erase, keep=edges)
                    for k, v in x.items()
                }
            )
        edge_values = list(edges.values())
        # If histogramming by the final edges needs to use a non-event coord then we
        # must not erase that dim, since it removes the coord required for histogramming
        remaining_erase = set(erase)
        if isinstance(x, DataArray) and x.bins is not None:
            hist_dim = edge_values[-1].dims[-1]
            if hist_dim not in x.bins.coords:
                erase = [e for e in erase if e not in x.coords[hist_dim].dims]
        # What the binning step erases is removed from what the final
        # histogramming step still has to erase.
        remaining_erase -= set(erase)
        out = make_histogrammed(
            make_binned(
                x,  # type: ignore[arg-type]
                edges=edge_values[:-1],
                erase=erase,
            ),
            edges=edge_values[-1],
            erase=remaining_erase,
        )
    return out

607 

608 

def _get_op_dims(x: DataArray, *edges_or_groups: Variable) -> set[str]:
    """Return the inner dims of the coords of ``x`` targeted by the edges/groups.

    Each edge/group variable targets the coordinate named after its last
    dimension. Targets without a coordinate in ``x`` and 0-D coordinates are
    ignored.
    """
    op_dims: set[str] = set()
    for target in {var.dims[-1] for var in edges_or_groups}:
        if target not in x.coords:
            continue
        coord = x.coords[target]
        if coord.ndim > 0:
            op_dims.add(coord.dims[-1])
    return op_dims

613 

614 

@overload
def nanhist(
    x: Variable | DataArray,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray: ...


@overload
def nanhist(
    x: Dataset,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Dataset: ...


@overload
def nanhist(
    x: DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataGroup[Any]: ...


@data_group_overload
def nanhist(
    x: Variable | DataArray | Dataset | DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray | Dataset | DataGroup[Any]:
    """Compute a histogram while skipping NaN values.

    This mirrors :py:func:`scipp.hist`, except that NaN values do not
    contribute. It is implemented by binning the input (if any edges are
    requested) and taking the NaN-skipping sum of every bin. See
    :py:func:`scipp.hist` for details and examples.

    Parameters
    ----------
    x:
        Input data.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    dim:
        Dimension(s) to sum over when histogramming. If None (the default), the
        dimensions of the coordinate used for histogramming are summed over.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Histogrammed data.

    Raises
    ------
    TypeError
        If no edges are given and ``x`` is not binned.
    """
    if isinstance(x, DataGroup):
        # Only to make mypy happy because we have `DataGroup` in annotation of `x`
        # so that Sphinx shows it.
        raise TypeError("Internal error: input should not be a DataGroup")
    edges: dict[str, SupportsIndex | Variable] = _make_edges(x, arg_dict, kwargs)  # type: ignore[assignment]
    if edges:
        x = x.bin(edges, dim=dim)  # type: ignore[union-attr]
    events = x.bins
    if events is None:
        raise TypeError("Data is not binned so bin edges must be provided.")
    return events.nansum()

689 

690 

@overload
def bin(
    x: Variable | DataArray,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataArray: ...


@overload
def bin(
    x: DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataGroup[Any]: ...


@data_group_overload
def bin(
    x: Variable | DataArray | DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataArray | DataGroup[Any]:
    """Create binned data by binning input along all dimensions given by edges.

    There are three ways of specifying the bin edges of a dimension:

    1. An integer requests that number of bins; a 'linspace' based on the min
       and max of the corresponding coordinate is created.
    2. A scalar Scipp variable (a value with a unit) is taken as the target bin
       width; an 'arange' covering the min and max of the corresponding
       coordinate is created.
    3. A Scipp variable with compatible unit is used directly as the edges.
       Typically, this should have a single dimension matching the target
       dimension.

    Which dimensions are concatenated and which are preserved is controlled by
    `dim`. With the default `dim=None` the dimensions of the coordinate used for
    binning are concatenated. Binned input may have no such coordinate; then
    `dim=None` is equivalent to `dim=()` and the output gains a new dimension.
    This default is usually what is wanted, but there are two classes of
    exceptions where passing `dim` explicitly can be useful:

    1. Given input data with an N-D coordinate, where N>1, `dim` can restrict
       the binning to a subset of M dimensions, resulting in an (N-M)-D "array"
       of bins. This can be of particular importance when the input is binned
       data: Frequently we may want to bin to add an additional dimension, but
       if there is a dense coordinate present the default `dim=None` would
       result in removal of the coordinate's dimensions. Setting `dim=()`
       prevents this and always adds a new dimension.
    2. Given M-D input data with an N-D coordinate, where N<M, `dim` can be used
       to concatenate, e.g., the remaining M-N dimensions while binning. This is
       often equivalent to not specifying `dim` and a call to
       `da.bins.concat()` after binning but is more memory efficient.

    If the dimensions of the input coordinate are not known, using an explicit
    `dim` argument can be useful to obtain predictable behavior in generic code.

    Warning
    -------

    When there is existing binning or grouping, the algorithm assumes that coordinates
    of the binned data are correct, i.e., compatible with the corresponding
    coordinate values in the individual bins. If this is not the case then the behavior
    is UNSPECIFIED. That is, the algorithm may or may not ignore the existing
    coordinates. If you encounter such a case, remove the conflicting coordinate,
    e.g., using :py:func:`scipp.DataArray.drop_coords`.

    Parameters
    ----------
    x:
        Input data.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    dim:
        Dimension(s) to concatenate into a single bin. If None (the default), the
        dimensions of the coordinate used for binning are concatenated.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Binned data.

    See Also
    --------
    scipp.hist:
        For histogramming data.
    scipp.group:
        Creating binned data by grouping, instead of binning based on edges.
    scipp.binning.make_binned:
        Lower level function that can bin and group.

    Examples
    --------

    Bin a table by one of its coord columns, specifying (1) number of bins, (2)
    bin width, or (3) actual binning:

      >>> from numpy.random import default_rng
      >>> rng = default_rng(seed=1234)
      >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
      >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
      >>> table.bin(x=2).sizes
      {'x': 2}

      >>> table.bin(x=sc.scalar(0.2, unit='m')).sizes
      {'x': 5}

      >>> table.bin(x=sc.linspace('x', 0.2, 0.8, num=10, unit='m')).sizes
      {'x': 9}

    Bin a table by two of its coord columns:

      >>> table.bin(x=4, y=6).sizes
      {'x': 4, 'y': 6}

    Bin binned data, using new bins along existing dimension:

      >>> binned = table.bin(x=10)
      >>> binned.bin(x=20).sizes
      {'x': 20}

    Bin binned data along an additional dimension:

      >>> binned = table.bin(x=10)
      >>> binned.bin(y=5).sizes
      {'x': 10, 'y': 5}

    The `dim` argument controls which dimensions are concatenated and which are
    preserved. Given 3-D data with a 2-D coordinate, the default `dim=None` results in:

      >>> xyz = sc.data.table_xyz(100).bin(x=4, y=5, z=6)
      >>> values = rng.random((4, 5))
      >>> xyz.coords['t'] = sc.array(dims=['x', 'y'], unit='s', values=values)
      >>> xyz.bin(t=3).sizes
      {'z': 6, 't': 3}

    Specifying `dim=('x', 'y', 'z')` or equivalently `dim=xyz.dims` will additionally
    concatenate along the z-dimension, resulting in a 1-D array of bins:

      >>> xyz.bin(t=3, dim=('x', 'y', 'z')).sizes
      {'t': 3}

    To preserve a dimension of the input's t-coordinate, we can drop this dimension
    from the tuple of dimensions to concatenate:

      >>> xyz.bin(t=4, dim='y').sizes
      {'x': 4, 'z': 6, 't': 4}

    Finally, we can add a new dimension without touching the existing dimensions:

      >>> xyz.bin(t=4, dim=()).sizes
      {'x': 4, 'y': 5, 'z': 6, 't': 4}

    Note that this is generally only useful if the input is binned data with a binned
    t-coordinate.
    """
    if isinstance(x, DataGroup):
        # Only to make mypy happy because we have `DataGroup` in annotation of `x`
        # so that Sphinx shows it.
        raise TypeError("Internal error: input should not be a DataGroup")
    bin_edges = _make_edges(x, arg_dict, kwargs)
    return make_binned(
        x,
        edges=list(bin_edges.values()),
        erase=_find_replaced_dims(x, dims=bin_edges, dim=dim),
    )

867 

868 

# Typing overload: Variable or DataArray input is annotated as returning a DataArray.
@overload
def rebin(
    x: Variable | DataArray,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    **kwargs: SupportsIndex | Variable,
) -> DataArray: ...

876 

877 

# Typing overload: Dataset input is annotated as returning a Dataset.
@overload
def rebin(
    x: Dataset,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    **kwargs: SupportsIndex | Variable,
) -> Dataset: ...

885 

886 

# Typing overload: DataGroup input is annotated as returning a DataGroup.
@overload
def rebin(
    x: DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    **kwargs: SupportsIndex | Variable,
) -> DataGroup[Any]: ...

894 

895 

@data_group_overload
def rebin(
    x: Variable | DataArray | Dataset | DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray | Dataset | DataGroup[Any]:
    """Change the bin edges of histogrammed data.

    Rebinning requires that the input's coordinate for each rebinned dimension
    holds bin edges, i.e., the input must be histogrammed.

    Masks of the input that depend on a rebinned dimension are applied to the
    data before rebinning. In other words, masked values are treated as zero.

    Parameters
    ----------
    x:
        Data to rebin.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Data rebinned according to the new bin edges.

    See Also
    --------
    scipp.bin:
        For changing the binning of binned (as opposed to dense, histogrammed) data.
    scipp.hist:
        For histogramming data.

    Examples
    --------

    Rebin a data array along one of its dimensions, specifying (1) number of bins, (2)
    bin width, or (3) actual binning:

      >>> from numpy.random import default_rng
      >>> rng = default_rng(seed=1234)
      >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
      >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
      >>> da = table.hist(x=100, y=100)
      >>> da.rebin(x=2).sizes
      {'x': 2, 'y': 100}

      >>> da.rebin(x=sc.scalar(0.2, unit='m')).sizes
      {'x': 5, 'y': 100}

      >>> da.rebin(x=sc.linspace('x', 0.2, 0.8, num=10, unit='m')).sizes
      {'x': 9, 'y': 100}

    Rebin a data array along two of its dimensions:

      >>> da = table.hist(x=100, y=100)
      >>> da.rebin(x=4, y=6).sizes
      {'x': 4, 'y': 6}
    """
    if isinstance(x, DataGroup):
        # `DataGroup` appears in the annotation of `x` only so that Sphinx shows
        # it; this check exists to keep mypy happy.
        raise TypeError("Internal error: input should not be a DataGroup")
    rebinned = x
    # Dimensions are rebinned one after another; each step consumes the result
    # of the previous one.
    for label, new_edges in _make_edges(x, arg_dict, kwargs).items():
        rebinned = _cpp.rebin(rebinned, label, new_edges)
    return rebinned

970 

971 

def _make_groups(x: DataArray, arg: str | Variable) -> Variable:
    """Return the variable that defines the groups for one grouping argument.

    A ``Variable`` argument is returned unchanged. For a string argument, the
    coordinate of that name is looked up, first among the bin coordinates (if
    ``x`` is binned data) and then among ``x.coords``. The unique values of that
    coordinate are returned as a variable with the single dimension ``arg`` and
    the unit of the coordinate.
    """
    import numpy as np

    if isinstance(arg, Variable):
        return arg

    coord: Variable | None = None
    if x.bins is not None:
        coord = x.bins.coords.get(arg)
    if coord is None:
        coord = x.coords.get(arg)
    # `_require_coord` is defined elsewhere in this module; presumably it raises
    # if no coordinate was found -- confirm against its definition.
    _require_coord(arg, coord)
    if coord.bins is not None:
        # Work on the underlying buffer of a copy of the binned coordinate.
        # NOTE(review): the copy presumably drops buffer content that is not
        # referenced by any bin -- confirm.
        coord = coord.copy().bins.constituents['data']  # type: ignore[assignment, union-attr]

    if 0 in coord.shape:
        # Empty input: there are no groups; an empty slice keeps the dtype.
        return array(dims=[arg], values=coord.values[0:0], unit=coord.unit)

    if coord.dtype not in (DType.int32, DType.int64):
        return array(dims=[arg], values=np.unique(coord.values), unit=coord.unit)

    # np.unique is used to find all unique groups, which can be very slow for large
    # inputs. Integer groups often cover a bounded, contiguous range, so the full
    # call can sometimes be bypassed: once a leading sub-range already contains
    # every integer between min and max, no further values can appear.
    min_ = coord.min().value
    max_ = coord.max().value
    n_possible = max_ - min_ + 1
    values = coord.values
    found = values[0:0]
    for divisor in (1000, 100, 10, 1):
        if len(found) == n_possible:
            break
        # The last divisor (1) covers the entire input.
        found = np.unique(values[: len(values) // divisor])
    return array(dims=[arg], values=found, unit=coord.unit)

1001 

1002 

# Typing overload: DataArray input is annotated as returning a DataArray.
@overload
def group(
    x: DataArray,
    /,
    *args: str | Variable,
    dim: str | tuple[str, ...] | None = None,
) -> DataArray: ...

1010 

1011 

# Typing overload: DataGroup input is annotated as returning a DataGroup.
@overload
def group(
    x: DataGroup[Any],
    /,
    *args: str | Variable,
    dim: str | tuple[str, ...] | None = None,
) -> DataGroup[Any]: ...

1019 

1020 

@data_group_overload
def group(
    x: DataArray | DataGroup[Any],
    /,
    *args: str | Variable,
    dim: str | tuple[str, ...] | None = None,
) -> DataArray | DataGroup[Any]:
    """Create binned data by grouping input by one or more coordinates.

    A grouping can be given in two forms: (1) As a string, in which case the unique
    values of the coordinate with that name are used as groups. (2) As a Scipp
    variable, in which case the values of the variable are used as groups.

    Note that option (1) may be very slow if the input is very large.

    The `dim` argument controls which dimensions are concatenated and which are
    preserved. With the default `dim=None`, the dimensions of the coordinate
    used for binning are concatenated. If the input is binned data there may be
    no such coordinate, in which case `dim=None` is equivalent to `dim=()`,
    resulting in a new dimension in the output. In many cases this default yields
    the desired behavior. There are two classes of exceptions where specifying
    `dim` explicitly can be useful:

    1. Given input data with an N-D coordinate, where N>1, we can use `dim` to restrict
       the grouping to a subset of M dimensions, resulting in an (N-M)-D array of bins.
       This can be of particular importance when the input is binned data: Frequently
       we may want to group to add an additional dimension, but if there is a dense
       coordinate present the default `dim=None` would result in removal of the
       coordinate's dimensions. This can be prevented by setting `dim=()`, which will
       always add a new dimension.
    2. Given M-D input data with an N-D coordinate, where N<M, we can specify `dim` to
       concatenate, e.g., the remaining M-N dimensions while grouping. This is often
       equivalent to not specifying `dim` and a call to `da.bins.concat()` after
       grouping but is more memory efficient.

    If the dimensions of the input coordinate are not known, using an explicit `dim`
    argument can be useful to obtain predictable behavior in generic code.

    Warning
    -------

    When there is existing binning or grouping, the algorithm assumes that coordinates
    of the binned data are correct, i.e., compatible with the corresponding
    coordinate values in the individual bins. If this is not the case then the behavior
    is UNSPECIFIED. That is, the algorithm may or may not ignore the existing
    coordinates. If you encounter such a case, remove the conflicting coordinate,
    e.g., using :py:func:`scipp.DataArray.drop_coords`.

    Parameters
    ----------
    x:
        Input data.
    *args:
        Dimension labels or grouping variables.
    dim:
        Dimension(s) to concatenate into a single bin. If None (the default), the
        dimensions of the coordinate used for grouping are concatenated.

    Returns
    -------
    :
        Binned data.

    See Also
    --------
    scipp.bin:
        Creating binned data by binning based on edges, instead of grouping.
    scipp.binning.make_binned:
        Lower level function that can bin and group.

    Examples
    --------

    Group a table by one of its coord columns, specifying (1) a coord name or (2)
    an actual grouping:

      >>> from numpy.random import default_rng
      >>> rng = default_rng(seed=1234)
      >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
      >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
      >>> table.coords['label'] = (table.coords['x'] * 10).to(dtype='int64')
      >>> table.group('label').sizes
      {'label': 10}

      >>> groups = sc.array(dims=['label'], values=[1, 3, 5], unit='m')
      >>> table.group(groups).sizes
      {'label': 3}

    Group a table by two of its coord columns:

      >>> table.coords['a'] = (table.coords['x'] * 10).to(dtype='int64')
      >>> table.coords['b'] = (table.coords['y'] * 10).to(dtype='int64')
      >>> table.group('a', 'b').sizes
      {'a': 10, 'b': 10}

      >>> groups = sc.array(dims=['a'], values=[1, 3, 5], unit='m')
      >>> table.group(groups, 'b').sizes
      {'a': 3, 'b': 10}

    Group binned data along an additional dimension:

      >>> table.coords['a'] = (table.coords['y'] * 10).to(dtype='int64')
      >>> binned = table.bin(x=10)
      >>> binned.group('a').sizes
      {'x': 10, 'a': 10}

    The `dim` argument controls which dimensions are concatenated and which are
    preserved. Given 3-D data with a 2-D coordinate, the default `dim=None` results in:

      >>> xyz = sc.data.table_xyz(100).bin(x=4, y=5, z=6)
      >>> times = rng.integers(low=1, high=3, size=(4, 5))
      >>> xyz.coords['t'] = sc.array(dims=['x', 'y'], unit='s', values=times)
      >>> xyz.group('t').sizes
      {'z': 6, 't': 2}

    Specifying `dim=('x', 'y', 'z')` or equivalently `dim=xyz.dims` will additionally
    concatenate along the z-dimension, resulting in a 1-D array of bins:

      >>> xyz.group('t', dim=('x', 'y', 'z')).sizes
      {'t': 2}

    To preserve a dimension of the input's t-coordinate, we can drop this dimension
    from the tuple of dimensions to concatenate:

      >>> xyz.group('t', dim='y').sizes
      {'x': 4, 'z': 6, 't': 2}

    Finally, we can add a new dimension without touching the existing dimensions:

      >>> xyz.group('t', dim=()).sizes
      {'x': 4, 'y': 5, 'z': 6, 't': 2}

    Note that this is generally only useful if the input is binned data with a binned
    t-coordinate.
    """
    if isinstance(x, DataGroup):
        # `DataGroup` appears in the annotation of `x` only so that Sphinx shows
        # it; this check exists to keep mypy happy.
        raise TypeError("Internal error: input should not be a DataGroup")
    # One grouping variable per positional argument, in the order given.
    grouping_vars = [_make_groups(x, spec) for spec in args]
    group_dims = [var.dim for var in grouping_vars]
    replaced = _find_replaced_dims(x, dims=group_dims, dim=dim)
    return make_binned(x, groups=grouping_vars, erase=replaced)

1164 return make_binned(x, groups=groups, erase=erase)