Coverage for install/scipp/core/binning.py: 71%
283 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-12-01 01:59 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-12-01 01:59 +0000
1# SPDX-License-Identifier: BSD-3-Clause
2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
3# @author Simon Heybrock
4import itertools
5import uuid
6from collections.abc import Iterable, Sequence
7from typing import Any, SupportsIndex, TypeVar, overload
9from .._scipp import core as _cpp
10from .bin_remapping import combine_bins
11from .bins import Bins
12from .cpp_classes import BinEdgeError, CoordError, DataArray, Dataset, DType, Variable
13from .data_group import DataGroup, data_group_overload
14from .math import round as round_
15from .shape import concat
16from .variable import arange, array, epoch, linspace, scalar
# Type variable bounded by ``DataArray | Dataset``; used by the private helpers
# below so that their return annotation matches the type of their input.
_DaDs = TypeVar('_DaDs', bound=DataArray | Dataset)
@overload
def make_histogrammed(
    x: Variable | DataArray, *, edges: Variable, erase: Iterable[str] = ()
) -> DataArray: ...


@overload
def make_histogrammed(
    x: Dataset, *, edges: Variable, erase: Iterable[str] = ()
) -> Dataset: ...


def make_histogrammed(
    x: Variable | DataArray | Dataset, *, edges: Variable, erase: Iterable[str] = ()
) -> DataArray | Dataset:
    """Histogram data into the given bins, producing dense data.

    For binned input the existing binning dimensions are kept, except that
    histogramming along a dimension that is already binned replaces that binning.

    Usually :py:func:`scipp.hist` should be preferred.

    Parameters
    ----------
    x:
        Input data.
    edges:
        Bin edges. If these have more than one dimension, binning occurs along
        the inner dimension.
    erase:
        Names of dimensions to erase from the input.

    Returns
    -------
    :
        DataArray / Dataset with values equal to the sum
        of values in each given bin.

    Raises
    ------
    BinEdgeError
        If the input is binned, has no event coordinate for the histogramming
        dimension, and its dense coordinate for that dimension holds bin edges.
    ValueError
        If a dimension in ``erase`` is also a dimension of ``edges``.

    See Also
    --------
    scipp.hist:
        Recommended interface for histogramming data.
    scipp.bin:
        For binning data.
    """
    if isinstance(x, Variable):
        # A bare variable has no data to sum: count every element once and use the
        # variable itself as the coordinate that is histogrammed.
        unit_counts = scalar(1.0, unit='counts').broadcast(sizes=x.sizes)
        x = DataArray(unit_counts, coords={edges.dim: x})
    elif isinstance(x, DataArray) and x.bins is not None:
        inner = edges.dims[-1]
        if inner not in x.bins.coords:
            # No event coordinate to histogram by: sum the bin contents and
            # histogram the resulting dense data instead.
            # `inner` is passed twice because the coord may be multi-dimensional.
            if x.coords.is_edges(inner, inner):
                raise BinEdgeError(
                    "Cannot histogram data with existing bin edges "
                    "unless event data coordinate for histogramming is available."
                )
            return make_histogrammed(x.bins.sum(), edges=edges, erase=erase)
    _check_erase_dimension_clash(erase, edges)
    # The C++ implementation uses an older heuristic histogramming a single dimension.
    # We therefore transpose and flatten the input to match this.
    hist_dim = edges.dims[-1]
    flatten_dims = [name for name in x.dims if name in erase]
    if hist_dim in x.dims:
        flatten_dims.append(hist_dim)
    if flatten_dims:
        x = _drop_coords_for_hist(x, flatten_dims, keep=(hist_dim,))
        x = _transpose_and_flatten_for_hist(x, flatten_dims, to=hist_dim)
    return _cpp.histogram(x, edges)  # type: ignore[no-any-return]
92def _drop_coords_for_hist(x: _DaDs, dims: Iterable[str], keep: Iterable[str]) -> _DaDs:
93 """Drop unnecessary coords from a DataArray making flatten/bin expensive."""
94 data = x if x.bins is None else x.bins
95 to_drop = []
96 for name, coord in data.coords.items():
97 if (name not in keep) and (set(coord.dims) & set(dims)):
98 to_drop.append(name)
99 return data.drop_coords(to_drop) # type: ignore[return-value]
102def _transpose_and_flatten_for_hist(x: _DaDs, dims: Sequence[str], to: str) -> _DaDs:
103 """Transpose and flatten a DataArray to prepare for histogram."""
104 new_order = [*(dim for dim in x.dims if dim not in dims), *dims]
105 # `make_histogrammed` does not fully support `Dataset`.
106 # This needs to be fixed, but for now, we just ignore the type error here.
107 transposed = x.transpose(new_order) # type: ignore[union-attr]
108 return transposed.flatten(dims=dims, to=to) # type: ignore[return-value]
def make_binned(
    x: Variable | DataArray,
    *,
    edges: Sequence[Variable] | None = None,
    groups: Sequence[Variable] | None = None,
    erase: Sequence[str] = (),
) -> DataArray:
    """Create binned data by binning input along all dimensions given by edges or
    groups.

    Usually :py:func:`scipp.bin` or :py:func:`scipp.group` should be preferred,
    unless the more precise control over which dimensions should be erased is required,
    or unless grouping and binning at the same time is required.

    This does not histogram the data, each output bin will contain a "list" of
    input values.

    At least one argument of ``edges`` and ``groups`` is required.

    If the input is binned and certain bins are masked then changing the binning
    will apply the masks, i.e., masked bins are treated as empty.

    Warning
    -------

    When there is existing binning or grouping, the algorithm assumes that coordinates
    of the binned data are correct, i.e., compatible with the corresponding
    coordinate values in the individual bins. If this is not the case then the behavior
    is UNSPECIFIED. That is, the algorithm may or may not ignore the existing
    coordinates. If you encounter such a case, remove the conflicting coordinate,
    e.g., using :py:func:`scipp.DataArray.drop_coords`.

    Parameters
    ----------
    x:
        Input data.
    edges:
        Bin edges, one per dimension to bin in.
    groups:
        Keys to group input by one per dimension to group in.
    erase:
        Dimension labels to remove from output.

    Returns
    -------
    :
        Binned ``x``.

    Raises
    ------
    ValueError
        If a dimension in ``erase`` is also a dimension of an edge or group
        variable, or if ``x`` is a dense variable and the number of edge and group
        variables is not exactly one.

    See Also
    --------
    scipp.hist:
        For histogramming data.
    scipp.bin:
        Recommended interface for binning data.
    scipp.group:
        Recommended interface for grouping data.
    scipp.bins:
        For creating binned data based on explicitly given index ranges.
    """
    edges = [] if edges is None else edges
    groups = [] if groups is None else groups
    _check_erase_dimension_clash(erase, *edges, *groups)

    if isinstance(x, Variable):
        if x.bins is not None:
            x = DataArray(x)
        else:
            keys = [*edges, *groups]
            if len(keys) != 1:
                raise ValueError(
                    "Edges for exactly one dimension must be specified when "
                    "binning or histogramming a variable."
                )
            # Dense variable: each element counts once, with the variable itself
            # serving as the coordinate that is binned or grouped by.
            unit_counts = scalar(1.0, unit='counts').broadcast(sizes=x.sizes).copy()
            x = DataArray(unit_counts, coords={keys[0].dim: x})
    if _can_operate_on_bins(x, edges, groups, erase):
        return combine_bins(x, edges=edges, groups=groups, dim=erase)
    # Many-to-many mapping is expensive, concat first is generally cheaper,
    # despite extra copies. If some coords are dense, perform binning in two steps,
    # since concat is not possible then (without mapping dense coords to binned coords,
    # which might bypass some other optimizations).
    if erase and x.bins is not None:
        event_coords = x.bins.coords
        dense_edges = [var for var in edges if var.dims[-1] not in event_coords]
        dense_groups = [var for var in groups if var.dims[-1] not in event_coords]
        n_dense = len(dense_edges) + len(dense_groups)
        if n_dense == 0:
            x = x.bins.concat(erase)
            erase = ()
        elif n_dense < len(edges) + len(groups):
            # First step handles the dense coords; what is left for the second step
            # are the edges and groups backed by event coords.
            x = make_binned(x, edges=dense_edges, groups=dense_groups, erase=erase)
            b: Bins[DataArray] = x.bins  # type: ignore[assignment]
            edges = [var for var in edges if var.dims[-1] in b.coords]
            groups = [var for var in groups if var.dims[-1] in b.coords]
            erase = ()
    if x.ndim == 0:
        rebinned = _cpp.bin(x.value, edges, groups, erase)
        return (  # type: ignore[no-any-return]
            rebinned.assign_coords(x.coords).assign_masks(x.masks)
        )
    x = _prepare_multi_dim_dense(x, *edges, *groups)
    return _cpp.bin(x, edges, groups, erase)  # type: ignore[no-any-return]
def _prepare_multi_dim_dense(x: DataArray, *edges_or_groups: Variable) -> DataArray:
    """Prepare data for binning or grouping.

    This function is a workaround for the C++ implementation not being able to deal with
    multi-dimensional dense input data. The workaround is to flatten the data along the
    auxiliary dimensions and regroup.

    In case the ultimate operation is histogramming, this leads to the desired
    higher-dimensional histogram. In case of binning or grouping, we obtain binned data
    with one additional dimension, whereas conceptually we might expect only the
    requested dimensions, with the auxiliary dimensions inside the bin content. As this
    case is likely rare and extra dimensions in bin content are barely supported in
    scipp, we consider this acceptable for now.

    Parameters
    ----------
    x:
        Input data. Returned unchanged if it is binned or one-dimensional.
    *edges_or_groups:
        Edge or group variables of the requested operation.

    Returns
    -------
    :
        ``x`` itself, or ``x`` flattened and regrouped by its auxiliary dimensions.

    Raises
    ------
    ValueError
        If any of ``edges_or_groups`` is not 1-D, or if the operation does not
        act along exactly one dimension of ``x``.
    """
    if x.bins is not None or x.ndim == 1:
        return x
    if any(var.ndim != 1 for var in edges_or_groups):
        raise ValueError("Cannot bin multi-dimensional dense data with ragged edges.")
    op_dims = _get_op_dims(x, *edges_or_groups)
    if len(op_dims) != 1:
        raise ValueError("Cannot bin multi-dimensional dense data along multiple dims.")
    (op_dim,) = op_dims
    # Keep the auxiliary dims in the order of `x.dims`. Collecting them in a set
    # would make the order of the group keys below depend on set iteration order,
    # which for strings varies with hash randomization.
    extra = [dim for dim in x.dims if dim != op_dim]
    original_coords = {
        name: coord
        for name, coord in x.coords.items()
        if set(coord.dims).issubset(extra)
    }
    # Index coords that let us recover the auxiliary dims by grouping after flatten.
    helper_coords = {dim: arange(dim, x.sizes[dim]) for dim in extra}
    return (
        x.assign_coords(helper_coords)
        # Random name so the temporary flat dim cannot clash with an existing one.
        .flatten(to=str(uuid.uuid4()))
        .group(*helper_coords.values())
        .drop_coords(tuple(extra))
        .assign_coords(original_coords)
    )
252def _check_erase_dimension_clash(
253 erase: Iterable[str], *edges_or_groups: Variable
254) -> None:
255 new_dims: set[str] = set()
256 for var in edges_or_groups:
257 new_dims.update(var.dims)
258 if set(erase) & new_dims:
259 raise ValueError(
260 f"Clash of dimension(s) to reduce {erase} with dimensions defined by "
261 f"edges or groups: {new_dims}."
262 )
265def _can_operate_on_bins(
266 x: DataArray,
267 edges: Iterable[Variable],
268 groups: Iterable[Variable],
269 erase: Iterable[str],
270) -> bool:
271 if x.bins is None:
272 return False
273 dims: set[str] = set()
274 for coord in itertools.chain(edges, groups):
275 if coord.ndim != 1:
276 return False
277 if coord.dim in x.bins.coords:
278 return False
279 if coord.dim not in x.coords:
280 return False
281 dims.update(x.coords[coord.dim].dims)
282 return dims <= set(erase)
285def _require_coord(name: str, coord: object) -> None:
286 if coord is None:
287 raise CoordError(f"Coordinate '{name}' not found.")
def _get_coord(x: Variable | DataArray | Dataset, name: str) -> Variable:
    """Return the coordinate values of ``x`` used to derive bounds for ``name``.

    - For a ``Variable``, the variable itself is returned.
    - For a ``Dataset``, a two-element variable ``[min, max]`` (along a dimension
      called ``'dummy'``) is returned, holding the overall minimum and maximum of
      the coordinate across all items of the dataset.
    - Otherwise the coordinate is looked up in ``x.deprecated_meta``, falling back
      to ``x.bins.deprecated_meta`` if ``x`` is binned.

    Raises
    ------
    ValueError
        If ``x`` is an empty dataset.
    CoordError
        If no coordinate called ``name`` is found.
    """
    if isinstance(x, Variable):
        return x
    if isinstance(x, Dataset):
        if not x.values():
            raise ValueError("Dataset is empty")
        cmin: Variable | None = None
        cmax: Variable | None = None
        for da in x.values():
            c = _get_coord(da, name)
            cmin = c.min() if cmin is None else min(cmin, c.min())  # type: ignore[call-overload]
            # Compare against the running maximum. Comparing against `cmin` would
            # always yield the current item's maximum and thus forget larger
            # maxima of earlier items.
            cmax = c.max() if cmax is None else max(cmax, c.max())  # type: ignore[call-overload]
        coord = concat([cmin, cmax], dim='dummy')  # type: ignore[type-var]
    else:
        event_coord = x.bins.deprecated_meta.get(name) if x.bins is not None else None
        coord = x.deprecated_meta.get(name, event_coord)
    _require_coord(name, coord)
    return coord  # type: ignore[return-value]
def _upper_bound(x: Variable) -> Variable:
    """Return a value just above the NaN-ignoring maximum of ``x``.

    For dtypes ``int32``, ``int64`` and ``datetime64`` the maximum is incremented by
    one. For any other dtype ``numpy.nextafter`` is used to step from the maximum
    towards ``maximum + 1``.
    """
    import numpy as np

    bound = x.nanmax()
    if bound.dtype not in ('int32', 'int64', 'datetime64'):
        one = scalar(1, unit=bound.unit, dtype=bound.dtype)
        bound.value = np.nextafter(bound.value, (bound + one).value)
    else:
        bound.value += 1
    return bound
def _parse_coords_arg(
    x: Variable | DataArray | Dataset, name: str, arg: SupportsIndex | Variable
) -> Variable:
    """Turn a binning parameter into concrete bin edges for dimension ``name``.

    - A variable that has ``name`` among its dims is returned as-is.
    - An integer-like ``arg`` yields ``arg + 1`` linearly spaced edges between the
      lower and upper bound of the coordinate.
    - Any other variable is taken as a step size, yielding an ``arange`` from the
      lower bound with that step.

    The upper bound is the coordinate's maximum if ``x`` already has bin edges for
    ``name``; otherwise it is the value returned by ``_upper_bound``.

    Raises
    ------
    ValueError
        If the lower bound exceeds the upper bound, or if the step size is zero.
    """
    if isinstance(arg, Variable) and name in arg.dims:
        return arg
    coord = _get_coord(x, name)
    start = coord.nanmin()
    has_edges = (
        not isinstance(x, Variable)
        and (name in x.coords)
        and x.coords.is_edges(name, name)
    )
    # Existing bin-edges must not be extended.
    stop = coord.nanmax() if has_edges else _upper_bound(coord)
    if start > stop:
        raise ValueError(
            'Empty data range, cannot automatically determine bounds. '
            'Must provide concrete bin edges.'
        )
    if isinstance(arg, Variable):
        step = arg.to(dtype=start.dtype, unit=start.unit)
        if step.value == 0:
            raise ValueError("Step size cannot be 0.")
        return arange(name, start, stop + step, step=step)
    n_edges = arg.__index__() + 1
    if start.dtype == DType.datetime64:
        # Space the offsets from the epoch, then round them to integers.
        base = epoch(unit=start.unit)
        offsets = linspace(name, start - base, stop - base, num=n_edges)
        return base + round_(offsets).to(dtype='int64')
    spaced = linspace(name, start, stop, num=n_edges)
    return spaced.to(dtype=start.dtype, copy=False)
358def _make_edges(
359 x: Variable | DataArray | Dataset,
360 arg_dict: dict[str, SupportsIndex | Variable] | None,
361 kwargs: dict[str, SupportsIndex | Variable],
362) -> dict[str, Variable]:
363 if arg_dict is not None:
364 kwargs = dict(**arg_dict, **kwargs)
365 return {name: _parse_coords_arg(x, name, arg) for name, arg in kwargs.items()}
def _find_replaced_dims(
    x: Variable | DataArray | Dataset,
    *,
    dims: Iterable[str],
    dim: str | tuple[str, ...] | None,
) -> list[str]:
    """Return the dims of ``x`` that the operation replaces, in the order of ``x.dims``.

    Candidates are all dims of ``x`` if it is a variable, the dims given by ``dim``
    if that is not ``None``, and otherwise the dims of those coords of ``x`` that
    are named in ``dims``. Names listed in ``dims`` are never part of the result.
    """
    if isinstance(x, Variable):
        replaced = set(x.dims)
    elif dim is not None:
        replaced = {dim} if isinstance(dim, str) else set(dim)
    else:
        replaced = {
            coord_dim
            for name in dims
            if name in x.coords
            for coord_dim in x.coords[name].dims
        }
    return [d for d in x.dims if d in replaced.difference(dims)]
@overload
def hist(
    x: Variable | DataArray,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray: ...


@overload
def hist(
    x: Dataset,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Dataset: ...


@overload
def hist(
    x: DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataGroup[Any]: ...


@data_group_overload
def hist(
    x: Variable | DataArray | Dataset | DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray | Dataset | DataGroup[Any]:
    """Compute a histogram.

    Bin edges can be specified in three ways:

    1. When an integer is provided, a 'linspace' with this requested number of
       bins is created, based on the min and max of the corresponding coordinate.
    2. A scalar Scipp variable (a value with a unit) is interpreted as a target
       bin width, and an 'arange' covering the min and max of the corresponding
       coordinate is created.
    3. A custom coordinate, given as a Scipp variable with compatible unit.
       Typically this should have a single dimension matching the target dimension.

    The `dim` argument controls which dimensions are summed over and which are
    preserved. The default `dim=None` means that the dimensions of the coordinate
    used for histogramming are summed over. In case of an input that is binned-data
    there may be no such coordinate, in which case `dim=None` is equivalent to `dim=()`,
    resulting in a new dimension in the output. In many cases this default yields the
    desired behavior, there are two classes of exceptions where specifying `dim`
    explicitly can be useful:

    1. Given input data with an N-D coordinate, where N>1, we can use `dim` to restrict
       the sum to a subset of M dimensions, resulting in an (N-M)-D "array" of histograms.
       This can be of particular importance when the input is binned data: Frequently
       we may want to bin to add an additional dimension, but if there is a dense
       coordinate present the default `dim=None` would result in removal of the
       coordinate's dimensions. This can be prevented by setting `dim=()`, which will
       always add a new dimension.
    2. Given M-D input data with an N-D coordinate, where N<M, we can specify `dim` to
       sum over, e.g., the remaining M-N dimensions while histogramming. This is often
       equivalent to not specifying `dim` and a call to `sum` after histogramming but
       is more memory efficient.

    If the dimensions of the input coordinate are not known, using an explicit `dim`
    argument can be useful to obtain predictable behavior in generic code.

    Parameters
    ----------
    x:
        Input data.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    dim:
        Dimension(s) to sum over when histogramming. If None (the default), the
        dimensions of the coordinate used for histogramming are summed over.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Histogrammed data.

    See Also
    --------
    scipp.nanhist:
        Like :py:func:`scipp.hist`, but NaN values are skipped.
    scipp.bin:
        Creating binned data by binning instead of summing all contributions.
    scipp.binning.make_histogrammed:
        Lower level function for histogramming.

    Examples
    --------

    Histogram a table by one of its coord columns, specifying (1) number of bins, (2)
    bin width, or (3) actual binning:

    >>> from numpy.random import default_rng
    >>> rng = default_rng(seed=1234)
    >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
    >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
    >>> table.hist(x=2)
    <scipp.DataArray>
    Dimensions: Sizes[x:2, ]
    Coordinates:
    * x float64 [m] (x [bin-edge]) [0.00313229, 0.497696, 0.992259]
    Data:
    float64 [K] (x) [53, 47]

    >>> table.hist(x=sc.scalar(0.2, unit='m')).sizes
    {'x': 5}

    >>> table.hist(x=sc.linspace('x', 0.2, 0.8, num=10, unit='m')).sizes
    {'x': 9}

    Histogram a table by two of its coord columns:

    >>> table.hist(x=4, y=6).sizes
    {'x': 4, 'y': 6}

    Histogram binned data, using existing bins:

    >>> binned = table.bin(x=10)
    >>> binned.hist().sizes
    {'x': 10}

    Histogram binned data, using new bins along existing dimension:

    >>> binned = table.bin(x=10)
    >>> binned.hist(x=20).sizes
    {'x': 20}

    Histogram binned data along an additional dimension:

    >>> binned = table.bin(x=10)
    >>> binned.hist(y=5).sizes
    {'x': 10, 'y': 5}

    The `dim` argument controls which dimensions are summed over and which are
    preserved. Given 3-D data with a 2-D coordinate, the default `dim=None` results in:

    >>> xyz = sc.data.table_xyz(100).bin(x=4, y=5, z=6)
    >>> xyz.coords['t'] = sc.array(dims=['x', 'y'], unit='s', values=rng.random((4, 5)))
    >>> xyz.hist(t=3).sizes
    {'z': 6, 't': 3}

    Specifying `dim=('x', 'y', 'z')` or equivalently `dim=xyz.dims` will additionally
    sum over the z-dimension, resulting in a 1-D histogram:

    >>> xyz.hist(t=3, dim=('x', 'y', 'z')).sizes
    {'t': 3}

    To preserve a dimension of the input's t-coordinate, we can drop this dimension
    from the tuple of dimensions to sum over:

    >>> xyz.hist(t=4, dim='y').sizes
    {'x': 4, 'z': 6, 't': 4}
    """  # noqa: E501
    if isinstance(x, DataGroup):
        # Only to make mypy happy because we have `DataGroup` in annotation of `x`
        # so that Sphinx shows it.
        raise TypeError("Internal error: input should not be a DataGroup")
    edges = _make_edges(x, arg_dict, kwargs)
    erase = _find_replaced_dims(x, dims=edges, dim=dim)
    n_edges = len(edges)
    if isinstance(x, Variable) and n_edges != 1:
        raise ValueError(
            "Edges for exactly one dimension must be specified when "
            "binning or histogramming a variable."
        )
    if n_edges == 0:
        # Without new edges the only thing left to do is to sum existing bins.
        if x.bins is None:
            raise TypeError("Data is not binned so bin edges must be provided.")
        return x.bins.sum()

    *outer_edges, last_edge = edges.values()
    if not outer_edges:
        # TODO Note that this may swap dims, is that ok?
        return make_histogrammed(x, edges=last_edge, erase=erase)

    # Drop coords that would disappear by histogramming, to avoid costly handling
    # in intermediate binning step.
    if isinstance(x, DataArray):
        x = _drop_coords_for_hist(x, dims=erase, keep=edges)
    elif isinstance(x, Dataset):
        x = Dataset(
            {
                key: _drop_coords_for_hist(item, dims=erase, keep=edges)
                for key, item in x.items()
            }
        )
    # If histogramming by the final edges needs to use a non-event coord then we
    # must not erase that dim, since it removes the coord required for histogramming
    remaining_erase = set(erase)
    if isinstance(x, DataArray) and x.bins is not None:
        hist_dim = last_edge.dims[-1]
        if hist_dim not in x.bins.coords:
            erase = [e for e in erase if e not in x.coords[hist_dim].dims]
    # Whatever the binning step erases must not be erased again when histogramming.
    remaining_erase -= set(erase)
    binned = make_binned(
        x,  # type: ignore[arg-type]
        edges=outer_edges,
        erase=erase,
    )
    return make_histogrammed(binned, edges=last_edge, erase=remaining_erase)
609def _get_op_dims(x: DataArray, *edges_or_groups: Variable) -> set[str]:
610 edge_dims = {edge.dims[-1] for edge in edges_or_groups}
611 coords = [x.coords[dim] for dim in edge_dims if dim in x.coords]
612 return {coord.dims[-1] for coord in coords if coord.ndim > 0}
@overload
def nanhist(
    x: Variable | DataArray,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray: ...


@overload
def nanhist(
    x: Dataset,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Dataset: ...


@overload
def nanhist(
    x: DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataGroup[Any]: ...


@data_group_overload
def nanhist(
    x: Variable | DataArray | Dataset | DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray | Dataset | DataGroup[Any]:
    """Compute a histogram, skipping NaN values.

    Like :py:func:`scipp.hist`, but NaN values are skipped. See there for details and
    examples.

    Parameters
    ----------
    x:
        Input data.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    dim:
        Dimension(s) to sum over when histogramming. If None (the default), the
        dimensions of the coordinate used for histogramming are summed over.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Histogrammed data.
    """
    if isinstance(x, DataGroup):
        # Only to make mypy happy because we have `DataGroup` in annotation of `x`
        # so that Sphinx shows it.
        raise TypeError("Internal error: input should not be a DataGroup")
    edges: dict[str, SupportsIndex | Variable] = _make_edges(x, arg_dict, kwargs)  # type: ignore[assignment]
    # Bin first (if edges were requested), then sum each bin while ignoring NaNs.
    if edges:
        x = x.bin(edges, dim=dim)  # type: ignore[union-attr]
    if x.bins is None:
        raise TypeError("Data is not binned so bin edges must be provided.")
    return x.bins.nansum()
@overload
def bin(
    x: Variable | DataArray,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataArray: ...


@overload
def bin(
    x: DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataGroup[Any]: ...


@data_group_overload
def bin(
    x: Variable | DataArray | DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataArray | DataGroup[Any]:
    """Create binned data by binning input along all dimensions given by edges.

    Bin edges can be specified in three ways:

    1. When an integer is provided, a 'linspace' with this requested number of
       bins is created, based on the min and max of the corresponding coordinate.
    2. A scalar Scipp variable (a value with a unit) is interpreted as a target
       bin width, and an 'arange' covering the min and max of the corresponding
       coordinate is created.
    3. A custom coordinate, given as a Scipp variable with compatible unit.
       Typically, this should have a single dimension matching the target dimension.

    The `dim` argument controls which dimensions are concatenated and which are
    preserved. The default `dim=None` means that the dimensions of the coordinate
    used for binning are concatenated. In case of an input that is binned-data
    there may be no such coordinate, in which case `dim=None` is equivalent to `dim=()`,
    resulting in a new dimension in the output. In many cases this default yields the
    desired behavior, there are two classes of exceptions where specifying `dim`
    explicitly can be useful:

    1. Given input data with an N-D coordinate, where N>1, we can use `dim` to restrict
       the binning to a subset of M dimensions, resulting in an (N-M)-D "array" of bins.
       This can be of particular importance when the input is binned data: Frequently
       we may want to bin to add an additional dimension, but if there is a dense
       coordinate present the default `dim=None` would result in removal of the
       coordinate's dimensions. This can be prevented by setting `dim=()`, which will
       always add a new dimension.
    2. Given M-D input data with an N-D coordinate, where N<M, we can specify `dim` to
       concatenate, e.g., the remaining M-N dimensions while binning. This is often
       equivalent to not specifying `dim` and a call to `da.bins.concat()` after
       binning but is more memory efficient.

    If the dimensions of the input coordinate are not known, using an explicit `dim`
    argument can be useful to obtain predictable behavior in generic code.

    Warning
    -------

    When there is existing binning or grouping, the algorithm assumes that coordinates
    of the binned data are correct, i.e., compatible with the corresponding
    coordinate values in the individual bins. If this is not the case then the behavior
    is UNSPECIFIED. That is, the algorithm may or may not ignore the existing
    coordinates. If you encounter such a case, remove the conflicting coordinate,
    e.g., using :py:func:`scipp.DataArray.drop_coords`.

    Parameters
    ----------
    x:
        Input data.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    dim:
        Dimension(s) to concatenate into a single bin. If None (the default), the
        dimensions of the coordinate used for binning are concatenated.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Binned data.

    See Also
    --------
    scipp.hist:
        For histogramming data.
    scipp.group:
        Creating binned data by grouping, instead of binning based on edges.
    scipp.binning.make_binned:
        Lower level function that can bin and group.

    Examples
    --------

    Bin a table by one of its coord columns, specifying (1) number of bins, (2)
    bin width, or (3) actual binning:

    >>> from numpy.random import default_rng
    >>> rng = default_rng(seed=1234)
    >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
    >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
    >>> table.bin(x=2).sizes
    {'x': 2}

    >>> table.bin(x=sc.scalar(0.2, unit='m')).sizes
    {'x': 5}

    >>> table.bin(x=sc.linspace('x', 0.2, 0.8, num=10, unit='m')).sizes
    {'x': 9}

    Bin a table by two of its coord columns:

    >>> table.bin(x=4, y=6).sizes
    {'x': 4, 'y': 6}

    Bin binned data, using new bins along existing dimension:

    >>> binned = table.bin(x=10)
    >>> binned.bin(x=20).sizes
    {'x': 20}

    Bin binned data along an additional dimension:

    >>> binned = table.bin(x=10)
    >>> binned.bin(y=5).sizes
    {'x': 10, 'y': 5}

    The `dim` argument controls which dimensions are concatenated and which are
    preserved. Given 3-D data with a 2-D coordinate, the default `dim=None` results in:

    >>> xyz = sc.data.table_xyz(100).bin(x=4, y=5, z=6)
    >>> values = rng.random((4, 5))
    >>> xyz.coords['t'] = sc.array(dims=['x', 'y'], unit='s', values=values)
    >>> xyz.bin(t=3).sizes
    {'z': 6, 't': 3}

    Specifying `dim=('x', 'y', 'z')` or equivalently `dim=xyz.dims` will additionally
    concatenate along the z-dimension, resulting in a 1-D array of bins:

    >>> xyz.bin(t=3, dim=('x', 'y', 'z')).sizes
    {'t': 3}

    To preserve a dimension of the input's t-coordinate, we can drop this dimension
    from the tuple of dimensions to concatenate:

    >>> xyz.bin(t=4, dim='y').sizes
    {'x': 4, 'z': 6, 't': 4}

    Finally, we can add a new dimension without touching the existing dimensions:

    >>> xyz.bin(t=4, dim=()).sizes
    {'x': 4, 'y': 5, 'z': 6, 't': 4}

    Note that this is generally only useful if the input is binned data with a binned
    t-coordinate.
    """
    if isinstance(x, DataGroup):
        # Only to make mypy happy because we have `DataGroup` in annotation of `x`
        # so that Sphinx shows it.
        raise TypeError("Internal error: input should not be a DataGroup")
    edges = _make_edges(x, arg_dict, kwargs)
    replaced_dims = _find_replaced_dims(x, dims=edges, dim=dim)
    return make_binned(x, edges=[*edges.values()], erase=replaced_dims)
@overload
def rebin(
    x: Variable | DataArray,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    **kwargs: SupportsIndex | Variable,
) -> DataArray: ...


@overload
def rebin(
    x: Dataset,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    **kwargs: SupportsIndex | Variable,
) -> Dataset: ...


@overload
def rebin(
    x: DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    **kwargs: SupportsIndex | Variable,
) -> DataGroup[Any]: ...


@data_group_overload
def rebin(
    x: Variable | DataArray | Dataset | DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray | Dataset | DataGroup[Any]:
    """Rebin a data array or dataset.

    The coordinate of the input for the dimension to be rebinned must contain bin edges,
    i.e., the data must be histogrammed.

    If the input has masks that contain the dimension being rebinned then those
    masks are applied to the data before rebinning. That is, masked values are treated
    as zero.

    Parameters
    ----------
    x:
        Data to rebin.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Data rebinned according to the new bin edges.

    See Also
    --------
    scipp.bin:
        For changing the binning of binned (as opposed to dense, histogrammed) data.
    scipp.hist:
        For histogramming data.

    Examples
    --------

    Rebin a data array along one of its dimensions, specifying (1) number of bins, (2)
    bin width, or (3) actual binning:

    >>> from numpy.random import default_rng
    >>> rng = default_rng(seed=1234)
    >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
    >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
    >>> da = table.hist(x=100, y=100)
    >>> da.rebin(x=2).sizes
    {'x': 2, 'y': 100}

    >>> da.rebin(x=sc.scalar(0.2, unit='m')).sizes
    {'x': 5, 'y': 100}

    >>> da.rebin(x=sc.linspace('x', 0.2, 0.8, num=10, unit='m')).sizes
    {'x': 9, 'y': 100}

    Rebin a data array along two of its dimensions:

    >>> da = table.hist(x=100, y=100)
    >>> da.rebin(x=4, y=6).sizes
    {'x': 4, 'y': 6}
    """
    if isinstance(x, DataGroup):
        # Only to make mypy happy because we have `DataGroup` in annotation of `x`
        # so that Sphinx shows it.
        raise TypeError("Internal error: input should not be a DataGroup")
    result = x
    # Rebin one dimension at a time, feeding each result into the next step.
    for name, new_edges in _make_edges(x, arg_dict, kwargs).items():
        result = _cpp.rebin(result, name, new_edges)
    return result
def _make_groups(x: DataArray, arg: str | Variable) -> Variable:
    """Return the variable that defines the groups for ``arg``.

    A :class:`Variable` is returned unchanged. For a string, the coordinate with
    that name is looked up, first in ``x.bins.coords`` (if ``x`` is binned) and then
    in ``x.coords``. Its unique values are returned as a variable with dimension
    ``arg`` and the unit of the coordinate.
    """
    import numpy as np

    if isinstance(arg, Variable):
        return arg

    coord: Variable | None = None
    if x.bins is not None:
        coord = x.bins.coords.get(arg)
    if coord is None:
        coord = x.coords.get(arg)
    _require_coord(arg, coord)
    if coord.bins is not None:
        # For a binned coord, use the 'data' constituent of a copy.
        coord = coord.copy().bins.constituents['data']  # type: ignore[assignment, union-attr]

    values = coord.values
    if 0 in coord.shape:
        # Empty input: an empty slice gives an empty result.
        unique = values[0:0]
    elif coord.dtype not in (DType.int32, DType.int64):
        unique = np.unique(values)
    else:
        # np.unique can be very slow for large inputs. Integer groups often cover a
        # bounded, contiguous range, so progressively larger leading slices are
        # tried first. Once a slice contains every integer between the overall
        # minimum and maximum, its unique values equal those of the full input and
        # the remaining (more expensive) calls are skipped. The final divisor of 1
        # falls back to the full input.
        lowest = coord.min().value
        highest = coord.max().value
        n_contiguous = highest - lowest + 1
        unique = values[0:0]
        for divisor in (1000, 100, 10, 1):
            if len(unique) == n_contiguous:
                break
            unique = np.unique(values[: len(values) // divisor])
    return array(dims=[arg], values=unique, unit=coord.unit)
@overload
def group(
    x: DataArray,
    /,
    *args: str | Variable,
    dim: str | tuple[str, ...] | None = None,
) -> DataArray: ...


@overload
def group(
    x: DataGroup[Any],
    /,
    *args: str | Variable,
    dim: str | tuple[str, ...] | None = None,
) -> DataGroup[Any]: ...


@data_group_overload
def group(
    x: DataArray | DataGroup[Any],
    /,
    *args: str | Variable,
    dim: str | tuple[str, ...] | None = None,
) -> DataArray | DataGroup[Any]:
    """Create binned data by grouping input by one or more coordinates.

    Each positional argument defines one grouping, in one of two ways: (1) When a
    string is provided, the unique values of the corresponding coordinate are used as
    groups. (2) When a Scipp variable is provided, the variable's values are used as
    groups.

    Note that option (1) may be very slow if the input is very large.

    The `dim` argument controls which dimensions are concatenated and which are
    preserved. The default `dim=None` means that the dimensions of the coordinate
    used for binning are concatenated. In case of an input that is binned-data
    there may be no such coordinate, in which case `dim=None` is equivalent to `dim=()`,
    resulting in a new dimension in the output. In many cases this default yields the
    desired behavior, but there are two classes of exceptions where specifying `dim`
    explicitly can be useful:

    1. Given input data with an N-D coordinate, where N>1, we can use `dim` to restrict
       the grouping to a subset of M dimensions, resulting in an (N-M)-D array of bins.
       This can be of particular importance when the input is binned data: Frequently
       we may want to group to add an additional dimension, but if there is a dense
       coordinate present the default `dim=None` would result in removal of the
       coordinate's dimensions. This can be prevented by setting `dim=()`, which will
       always add a new dimension.
    2. Given M-D input data with an N-D coordinate, where N<M, we can specify `dim` to
       concatenate, e.g., the remaining M-N dimensions while grouping. This is often
       equivalent to not specifying `dim` and a call to `da.bins.concat()` after
       grouping but is more memory efficient.

    If the dimensions of the input coordinate are not known, using an explicit `dim`
    argument can be useful to obtain predictable behavior in generic code.

    Warning
    -------

    When there is existing binning or grouping, the algorithm assumes that coordinates
    of the binned data are correct, i.e., compatible with the corresponding
    coordinate values in the individual bins. If this is not the case then the behavior
    is UNSPECIFIED. That is, the algorithm may or may not ignore the existing
    coordinates. If you encounter such a case, remove the conflicting coordinate,
    e.g., using :py:func:`scipp.DataArray.drop_coords`.

    Parameters
    ----------
    x:
        Input data.
    *args:
        Dimension labels or grouping variables.
    dim:
        Dimension(s) to concatenate into a single bin. If None (the default), the
        dimensions of the coordinate used for grouping are concatenated.

    Returns
    -------
    :
        Binned data.

    See Also
    --------
    scipp.bin:
        Creating binned data by binning based on edges, instead of grouping.
    scipp.binning.make_binned:
        Lower level function that can bin and group.

    Examples
    --------

    Group a table by one of its coord columns, specifying (1) a coord name or (2)
    an actual grouping:

      >>> from numpy.random import default_rng
      >>> rng = default_rng(seed=1234)
      >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
      >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
      >>> table.coords['label'] = (table.coords['x'] * 10).to(dtype='int64')
      >>> table.group('label').sizes
      {'label': 10}

      >>> groups = sc.array(dims=['label'], values=[1, 3, 5], unit='m')
      >>> table.group(groups).sizes
      {'label': 3}

    Group a table by two of its coord columns:

      >>> table.coords['a'] = (table.coords['x'] * 10).to(dtype='int64')
      >>> table.coords['b'] = (table.coords['y'] * 10).to(dtype='int64')
      >>> table.group('a', 'b').sizes
      {'a': 10, 'b': 10}

      >>> groups = sc.array(dims=['a'], values=[1, 3, 5], unit='m')
      >>> table.group(groups, 'b').sizes
      {'a': 3, 'b': 10}

    Group binned data along an additional dimension:

      >>> table.coords['a'] = (table.coords['y'] * 10).to(dtype='int64')
      >>> binned = table.bin(x=10)
      >>> binned.group('a').sizes
      {'x': 10, 'a': 10}

    The `dim` argument controls which dimensions are concatenated and which are
    preserved. Given 3-D data with a 2-D coordinate, the default `dim=None` results in:

      >>> xyz = sc.data.table_xyz(100).bin(x=4, y=5, z=6)
      >>> times = rng.integers(low=1, high=3, size=(4, 5))
      >>> xyz.coords['t'] = sc.array(dims=['x', 'y'], unit='s', values=times)
      >>> xyz.group('t').sizes
      {'z': 6, 't': 2}

    Specifying `dim=('x', 'y', 'z')` or equivalently `dim=xyz.dims` will additionally
    concatenate along the z-dimension, resulting in a 1-D array of bins:

      >>> xyz.group('t', dim=('x', 'y', 'z')).sizes
      {'t': 2}

    To preserve a dimension of the input's t-coordinate, we can drop this dimension
    from the tuple of dimensions to concatenate:

      >>> xyz.group('t', dim='y').sizes
      {'x': 4, 'z': 6, 't': 2}

    Finally, we can add a new dimension without touching the existing dimensions:

      >>> xyz.group('t', dim=()).sizes
      {'x': 4, 'y': 5, 'z': 6, 't': 2}

    Note that this is generally only useful if the input is binned data with a binned
    t-coordinate.
    """
    if isinstance(x, DataGroup):
        # This check only exists to satisfy mypy: `DataGroup` is part of the
        # annotation of `x` so that Sphinx shows it.
        raise TypeError("Internal error: input should not be a DataGroup")
    # Resolve every positional argument to a grouping variable first; only then
    # collect the group dimensions to decide which input dims get replaced.
    grouping = [_make_groups(x, key) for key in args]
    replaced = _find_replaced_dims(
        x, dims=[variable.dim for variable in grouping], dim=dim
    )
    return make_binned(x, groups=grouping, erase=replaced)