Coverage for install/scipp/core/bins.py: 56%
235 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-12-01 01:59 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-12-01 01:59 +0000
1# SPDX-License-Identifier: BSD-3-Clause
2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
3# @author Simon Heybrock
4from __future__ import annotations
6from collections.abc import Callable
7from typing import Generic, Literal, Sequence, TypedDict, TypeVar
9from .._scipp import core as _cpp
10from ..typing import Dims, MetaDataMap, VariableLike
11from ._cpp_wrapper_util import call_func as _call_cpp_func
12from .bin_remapping import concat_bins
13from .cpp_classes import DataArray, Dataset, DType, Unit, Variable
14from .data_group import DataGroup
15from .deprecation import _warn_attr_removal
16from .domains import merge_equal_adjacent
17from .math import midpoints
18from .operations import islinspace
19from .shape import concat
20from .variable import scalar
class Lookup:
    """Callable lookup table.

    Instances of this class are not meant to be constructed by hand;
    obtain them from :func:`scipp.lookup` instead.

    Usage examples can be found in the documentation of :func:`scipp.lookup`.
    """

    def __init__(
        self,
        op: Callable[[DataArray, Variable, str, Variable | None], Variable],
        func: DataArray,
        dim: str,
        fill_value: Variable | None = None,
    ):
        # Only unmasked, non-empty, 1-D tables of bool/integer values are compacted.
        compactable = (
            not func.masks
            and func.ndim == 1
            and len(func) > 0
            and func.dtype in (DType.bool, DType.int32, DType.int64)
        )
        if compactable:
            # Significant speedup if `func` is large but mostly constant.
            if op == _cpp.buckets.map:
                coord_is_linspace = islinspace(func.coords[dim], dim).value
                if not coord_is_linspace:
                    func = merge_equal_adjacent(func)
            else:
                # In this case the C++ implementation currently uses no linspace
                # optimization, so the extra check is skipped.
                # Keep the first entry plus every entry whose value differs from
                # the one directly before it.
                transition = func.data[:-1] != func.data[1:]
                first = func[0]
                changed = func[1:][transition]
                func = concat([first, changed], dim)
        self.__transform_coords_input_keys__ = (dim,)  # for transform_coords
        self.fill_value = fill_value
        self.dim = dim
        self.func = func
        self.op = op

    def __call__(self, var: Variable) -> Variable:
        """Look up and return the table values at the given points."""
        values = self.op(self.func, var, self.dim, self.fill_value)
        return values

    def __getitem__(self, var: Variable) -> Variable:
        """Look up and return the table values at the given points.

        Equivalent to calling the lookup table with ``var``.
        """
        return self.__call__(var)
def lookup(
    func: DataArray,
    dim: str | None = None,
    *,
    mode: Literal['previous', 'nearest'] | None = None,
    fill_value: Variable | None = None,
) -> Lookup:
    """Create a "lookup table" from a histogram (data array with bin-edge coord)
    or from a data array with a point coord.

    A typical use is mapping, e.g., time-stamps to the matching values of a
    time-series log.

    Parameters
    ----------
    func:
        Data array defining the lookup table.
    dim:
        Dimension along which the lookup occurs. If ``None``, ``func.dim`` is used.
    mode:
        Mode used for looking up function values. Must be ``None`` when ``func`` is a
        histogram. Otherwise this defaults to 'nearest'.
    fill_value:
        Value to use for points outside the range of the function as well as points in
        masked regions of the function. If set to None (the default) this will use NaN
        for floating point types and 0 for integral types. Must have the same dtype and
        unit as the function values.

    Returns
    -------
    :
        The created lookup table.

    Raises
    ------
    ValueError
        If ``mode`` is given for a histogram input, or if ``mode`` is neither
        'previous' nor 'nearest'.

    Examples
    --------

      >>> x = sc.linspace(dim='x', start=0.0, stop=1.0, num=4)
      >>> vals = sc.array(dims=['x'], values=[3, 2, 1])
      >>> hist = sc.DataArray(data=vals, coords={'x': x})
      >>> sc.lookup(hist, 'x')[sc.array(dims=['event'], values=[0.1,0.4,0.1,0.6,0.9])]
      <scipp.Variable> (event: 5)      int64  [dimensionless]  [3, 2, ..., 2, 1]
    """
    dim = func.dim if dim is None else dim
    # Work on a fresh data array holding only the lookup coord and the masks.
    table = DataArray(func.data, coords={dim: func.coords[dim]}, masks=func.masks)
    if table.dims[-1] != dim:
        # We automatically transpose the data so that `dim` is the inner dimension to
        # ensure contiguous memory access.
        order = tuple(d for d in table.dims if d != dim) + (dim,)
        table.data = table.data.transpose(order).copy()
        table.coords[dim] = table.coords[dim].transpose(order).copy()
        for name, mask in table.masks.items():
            # Masks potentially have fewer dims than the data.
            mask_order = [d for d in order if d in mask.dims]
            table.masks[name] = mask.transpose(mask_order or None).copy()

    if table.coords.is_edges(dim, dim):
        if mode is not None:
            raise ValueError("Input is a histogram, 'mode' must not be set.")
        return Lookup(_cpp.buckets.map, table, dim, fill_value)

    if mode is None:
        mode = 'nearest'
    elif mode not in ('previous', 'nearest'):
        raise ValueError(f"Mode must be one of ['previous', 'nearest'], got '{mode}'")

    if mode == 'nearest' and table.sizes[dim] != 0:
        # 'nearest' is expressed as a 'previous' lookup on a coord that starts at the
        # lowest representable value followed by the midpoints of the original coord.
        coord = table.coords[dim]
        lowest = coord[dim, 0:0].max()  # trick to get lowest representable value
        parts = [lowest]
        if coord.sizes[dim] >= 2:
            parts.append(midpoints(coord, dim))
        table.coords[dim] = concat(parts, dim)
    return Lookup(_cpp.lookup_previous, table, dim, fill_value)
class Constituents(TypedDict):
    """Typed dict describing the parts that make up binned data.

    This is the type returned by :py:attr:`Bins.constituents`.
    """

    data: Variable | DataArray | Dataset
    """Content (the underlying buffer) of the bins."""
    begin: Variable
    """Begin index of each bin."""
    end: Variable
    """End index of each bin."""
    dim: str
    """Dimension of 'data' along which the bins are defined."""
# Constrained type variable for the parent object wrapped by ``Bins``:
# one of Variable, DataArray, or Dataset.
_O = TypeVar("_O", Variable, DataArray, Dataset)
class Bins(Generic[_O]):
    """Proxy giving access to the bin contents of, and bin-wise operations on,
    a binned object.

    Instances are returned from the `bins` property of variables and should
    generally not be created directly.

    ``Bins`` is generic over the parent type, *not* the event type.
    That is, ``Variable.bins`` always returns ``Bins[Variable]`` regardless of whether
    the event list is a variable or data array.
    """

    def __init__(self, obj: _O) -> None:
        self._obj: _O = obj

    def _data(self) -> Variable:
        """Return the variable of the wrapped object that holds the bins."""
        if not isinstance(self._obj, DataArray | Dataset):
            return self._obj
        # Raises AttributeError for datasets as it should.
        return self._obj.data  # type: ignore[attr-defined, no-any-return]

    def __mul__(self, lut: Lookup) -> _O:
        """Return a copy with the events scaled by the values of a lookup table."""
        if isinstance(self._obj, Dataset):
            raise NotImplementedError(
                "Multiplication of events in a dataset is not implemented"
            )
        # Determine the result dtype from a product of scalars of both dtypes.
        event_one = scalar(1, dtype=self.dtype)
        table_one = scalar(1, dtype=lut.func.dtype)
        scaled = self._obj.to(dtype=(event_one * table_one).dtype)
        _cpp.buckets.scale(scaled, lut.func, lut.dim)
        return scaled

    def __truediv__(self, lut: Lookup) -> _O:
        """Return a copy with the events divided by the values of a lookup table."""
        if isinstance(self._obj, Dataset):
            raise NotImplementedError(
                "Division of events in a dataset is not implemented"
            )
        # Determine the result dtype from a quotient of scalars of both dtypes.
        event_one = scalar(1, dtype=self.dtype)
        table_one = scalar(1, dtype=lut.func.dtype)
        scaled = self._obj.to(dtype=(event_one / table_one).dtype)
        inverse = _cpp.reciprocal(lut.func)
        _cpp.buckets.scale(scaled, inverse, lut.dim)
        return scaled

    def __imul__(self, lut: Lookup) -> Bins[_O]:  # noqa: PYI034
        """Scale the events in-place by the values of a lookup table."""
        table = lut.func
        _cpp.buckets.scale(self._obj, table, lut.dim)
        return self

    def __itruediv__(self, lut: Lookup) -> Bins[_O]:  # noqa: PYI034
        """Divide the events in-place by the values of a lookup table."""
        inverse = _cpp.reciprocal(lut.func)
        _cpp.buckets.scale(self._obj, inverse, lut.dim)
        return self

    def __getitem__(self, key: tuple[str, Variable | slice]) -> DataArray:
        """
        Extract events from bins based on labels or label ranges and return a copy.

        This is similar to regular label-based indexing, but considers the event-coords,
        i.e., the coord values of individual bin entries. Unlike normal label-based
        indexing this returns a copy, as a subset of events is extracted.

        ``key`` is a pair of a dimension label and either a 0-D variable or a
        slice whose start and stop are variables (or ``None``).
        """
        if isinstance(self._obj, Dataset):
            raise NotImplementedError(
                "Extracting events from Datasets is not implemented."
            )

        dim, index = key

        if isinstance(index, Variable) and index.ndim == 0:
            if not isinstance(self._obj, DataArray):
                raise NotImplementedError(
                    "Getting events by label is only implemented for DataArrays."
                )
            label = index.flatten(to=dim)
            return self._obj.group(label).squeeze(dim)

        if isinstance(index, slice):
            from .binning import _upper_bound

            if index.step is not None:
                raise ValueError(
                    "Label-based indexing with step (stride) is not "
                    f"supported. Got '{key}'"
                )
            # Open ends are filled in from the range of the event coord.
            start = self.coords[dim].min() if index.start is None else index.start
            stop = (
                _upper_bound(self.coords[dim].max())
                if index.stop is None
                else index.stop
            )

            if not isinstance(start, Variable) or not isinstance(stop, Variable):
                raise ValueError(
                    "Bins can only by sliced using label-based indexing. Expected "
                    f"start and stop to be scipp.Variable, got '{start}' and '{stop}'."
                )

            if start > stop:
                # An automatically determined end that lies on the wrong side of
                # the user-provided end is collapsed onto it.
                if index.start is None:
                    start = stop
                elif index.stop is None:
                    stop = start

            edges = concat([start, stop], dim)
            return self._obj.bin({dim: edges}).squeeze(dim)

        raise ValueError(
            f"Unsupported key '{key}'. Expected a dimension label and "
            "a 0-D variable or a dimension label and a slice object with start "
            "and stop given by a 0-D variable."
        )

    @property
    def coords(self) -> MetaDataMap:
        """Coords of the bins"""
        view = _cpp._bins_view(self._data())
        return view.coords  # type: ignore[no-any-return]

    def drop_coords(self, coords: str | Sequence[str]) -> _O:
        """Return a shallow copy with the given coords removed from the bin content"""
        if isinstance(self._obj, Dataset):
            raise NotImplementedError("bins.drop_coords does not support datasets")
        parts = self.constituents
        parts['data'] = parts['data'].drop_coords(coords)  # type: ignore[union-attr]
        binned: Variable = _cpp._bins_no_validate(**parts)
        if not isinstance(self._obj, DataArray):
            return binned
        result = self._obj.copy(deep=False)
        result.data = binned
        return result

    @property
    def meta(self) -> MetaDataMap:
        """Coords and attrs of the bins

        .. deprecated:: 23.9.0
           Use :py:attr:`coords` with unset alignment flag instead, or
           store attributes in higher-level data structures.
        """
        _warn_attr_removal()
        return self.deprecated_meta

    @property
    def attrs(self) -> MetaDataMap:
        """Attrs of the bins

        .. deprecated:: 23.9.0
           Use :py:attr:`coords` with unset alignment flag instead, or
           store attributes in higher-level data structures.
        """
        _warn_attr_removal()
        return self.deprecated_attrs

    @property
    def deprecated_meta(self) -> MetaDataMap:
        view = _cpp._bins_view(self._data())
        return view.deprecated_meta  # type: ignore[no-any-return]

    @property
    def deprecated_attrs(self) -> MetaDataMap:
        view = _cpp._bins_view(self._data())
        return view.deprecated_attrs  # type: ignore[no-any-return]

    @property
    def masks(self) -> MetaDataMap:
        """Masks of the bins"""
        view = _cpp._bins_view(self._data())
        return view.masks  # type: ignore[no-any-return]

    def drop_masks(self, masks: str | Sequence[str]) -> _O:
        """Return a shallow copy with the given masks removed from the bin content"""
        if isinstance(self._obj, Dataset):
            raise NotImplementedError("bins.drop_masks does not support datasets")
        parts = self.constituents
        parts['data'] = parts['data'].drop_masks(masks)  # type: ignore[union-attr]
        binned: Variable = _cpp._bins_no_validate(**parts)
        if not isinstance(self._obj, DataArray):
            return binned
        result = self._obj.copy(deep=False)
        result.data = binned
        return result

    @property
    def data(self) -> Variable:
        """Data of the bins"""
        view = _cpp._bins_view(self._data())
        return view.data  # type: ignore[no-any-return]

    @data.setter
    def data(self, data: Variable) -> None:
        """Set data of the bins"""
        view = _cpp._bins_view(self._data())
        view.data = data

    @property
    def unit(self) -> Unit | None:
        """Unit of the bin elements"""
        content = self.constituents['data']
        return content.unit  # type: ignore[union-attr]

    @unit.setter
    def unit(self, unit: Unit | str | None) -> None:
        """Set unit of the bin elements"""
        content = self.constituents['data']
        content.unit = unit  # type: ignore[union-attr]

    @property
    def dtype(self) -> DType:
        """Data type of the bin elements."""
        content = self.constituents['data']
        return content.dtype  # type: ignore[union-attr]

    @property
    def aligned(self) -> bool:
        """Alignment flag for coordinates of bin elements."""
        content = self.constituents['data']
        return content.aligned  # type: ignore[union-attr]

    @property
    def constituents(self) -> Constituents:
        """Constituents of binned data, as supported by :py:func:`sc.bins`."""
        return _call_cpp_func(_cpp.bins_constituents, self._data())  # type: ignore[return-value]

    def sum(self) -> _O:
        """Sum of events in each bin.

        Returns
        -------
        :
            The sum of each of the input bins.

        See Also
        --------
        scipp.sum:
            For summing non-bin data or summing bins.
        """
        return _call_cpp_func(_cpp.bins_sum, self._obj)  # type: ignore[return-value]

    def nansum(self) -> _O:
        """Sum of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The sum of each of the input bins without NaN's.

        See Also
        --------
        scipp.nansum:
            For summing non-bin data or summing bins.
        """
        return _call_cpp_func(_cpp.bins_nansum, self._obj)  # type: ignore[return-value]

    def mean(self) -> _O:
        """Arithmetic mean of events in each bin.

        Returns
        -------
        :
            The mean of each of the input bins.

        See Also
        --------
        scipp.mean:
            For calculating the mean of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_mean, self._obj)  # type: ignore[return-value]

    def nanmean(self) -> _O:
        """Arithmetic mean of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The mean of each of the input bins without NaN's.

        See Also
        --------
        scipp.nanmean:
            For calculating the mean of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_nanmean, self._obj)  # type: ignore[return-value]

    def max(self) -> _O:
        """Maximum of events in each bin.

        Returns
        -------
        :
            The maximum of each of the input bins.

        See Also
        --------
        scipp.max:
            For calculating the maximum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_max, self._obj)  # type: ignore[return-value]

    def nanmax(self) -> _O:
        """Maximum of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The maximum of each of the input bins without NaN's.

        See Also
        --------
        scipp.nanmax:
            For calculating the maximum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_nanmax, self._obj)  # type: ignore[return-value]

    def min(self) -> _O:
        """Minimum of events in each bin.

        Returns
        -------
        :
            The minimum of each of the input bins.

        See Also
        --------
        scipp.min:
            For calculating the minimum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_min, self._obj)  # type: ignore[return-value]

    def nanmin(self) -> _O:
        """Minimum of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The minimum of each of the input bins without NaN's.

        See Also
        --------
        scipp.nanmin:
            For calculating the minimum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_nanmin, self._obj)  # type: ignore[return-value]

    def all(self) -> _O:
        """Logical AND of events in each bin.

        Returns
        -------
        :
            The AND of each of the input bins.

        See Also
        --------
        scipp.all:
            For performing an AND of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_all, self._obj)  # type: ignore[return-value]

    def any(self) -> _O:
        """Logical OR of events in each bin.

        Returns
        -------
        :
            The OR of each of the input bins.

        See Also
        --------
        scipp.any:
            For performing an OR of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_any, self._obj)  # type: ignore[return-value]

    def size(self) -> Variable:
        """Number of events or elements in a bin.

        Returns
        -------
        :
            The number of elements in each of the input bins.
        """
        return _call_cpp_func(_cpp.bin_sizes, self._obj)  # type: ignore[return-value]

    def concat(self, dim: Dims = None) -> _O:
        """Concatenate bins element-wise by concatenating bin contents along
        their internal bin dimension.

        This is a reduction operation similar to :py:func:`scipp.sum` but operates on
        binned data. Elements (bins) are concatenated along their internal dimension.

        Parameters
        ----------
        dim:
            Reduction dimension.

        Returns
        -------
        :
            All bins along `dim` concatenated into a single bin.

        Raises
        ------
        NotImplementedError
            If the wrapped object is a dataset.
        """
        if isinstance(self._obj, Dataset):
            raise NotImplementedError(
                "Concatenating bins is not implemented for datasets"
            )
        return concat_bins(self._obj, dim)

    def concatenate(
        self,
        other: Variable | DataArray,
        *,
        out: DataArray | None = None,
    ) -> Variable | DataArray:
        """Concatenate bins element-wise by concatenating bin contents along
        their internal bin dimension.

        The bins to concatenate are obtained element-wise from `self` and `other`.

        Parameters
        ----------
        other:
            Other input containing bins.
        out:
            Optional output buffer.

        Returns
        -------
        :
            The bins of the two inputs merged.

        Raises
        ------
        scipp.DTypeError
            If `other` is not binned data.
        """
        if out is None:
            return _call_cpp_func(_cpp.buckets.concatenate, self._obj, other)  # type: ignore[return-value]
        if self._obj is out:
            # Output aliases the wrapped object: append to it in place.
            _call_cpp_func(_cpp.buckets.append, self._obj, other)
            return out
        # NOTE(review): a distinct `out` is not written to; a newly created
        # object is returned instead. Confirm whether this is intended.
        return _call_cpp_func(_cpp.buckets.concatenate, self._obj, other)  # type: ignore[return-value]
def _bins(obj: _O) -> Bins[_O] | None:
    """
    Return a :py:class:`scipp.Bins` helper for bin-wise operations on ``obj``,
    or `None` if ``obj`` is not binned data.
    """
    return Bins(obj) if _cpp.is_bins(obj) else None
def _set_bins(obj: _O, bins: Bins[_O]) -> None:
    """Setter counterpart of the ``bins`` property; it performs no assignment.

    Should only be used by __iadd__ and friends, where ``bins`` already wraps
    ``obj`` itself.

    Raises
    ------
    ValueError
        If ``bins`` wraps an object other than ``obj``.
    """
    if bins._obj is obj:
        return
    raise ValueError("Cannot set bins with a new object")
def bins(
    *,
    data: VariableLike,
    dim: str,
    begin: Variable | None = None,
    end: Variable | None = None,
) -> Variable:
    """Create a binned variable from bin indices.

    Each element of the returned variable is a "bin", i.e., a view into
    ``data``. The returned variable keeps and manages a copy of ``data``
    internally.

    ``begin`` and ``end`` must agree in dims and shape and must have
    ``dtype=sc.DType.int64``. The output dims and shape are given by ``begin``.
    If only ``begin`` is given, each bucket is a slice containing a non-range
    slice of ``data`` at the given indices. If neither ``begin`` nor ``end``
    are given, the output has ``dims=[dim]`` and contains all non-range slices
    along that dimension.

    Parameters
    ----------
    begin:
        Optional begin indices of bins, used for slicing ``data``.
        If not provided each row of ``data`` is mapped to a different bin.
    end:
        Optional end indices of bins, used for slicing ``data``. If not
        provided, ``begin`` is used as starting offsets for each bin, i.e., the end of
        the Nth bin is set to the begin of the N+1st bin.
    dim:
        Dimension of ``data`` that will be sliced to obtain data for
        any given bin.
    data:
        A variable, data array, or dataset containing combined data of all bins.

    Returns
    -------
    :
        Variable containing data in bins.

    Raises
    ------
    ValueError
        If any of ``begin``, ``end``, or ``data`` is a DataGroup.

    See Also
    --------
    scipp.bin:
        For creating DataArrays based on binning of coord value
        instead of explicitly given index ranges.
    """
    for argument in (begin, end, data):
        if isinstance(argument, DataGroup):
            raise ValueError("`scipp.bins` does not support DataGroup arguments.")
    return _call_cpp_func(_cpp.bins, begin, end, dim, data)  # type: ignore[return-value]
def bins_like(x: VariableLike, fill_value: Variable) -> Variable:
    """Create a binned variable by "broadcasting" fill values to bins of given sizes.

    ``fill_value`` must have dimensions and shape that can be broadcast to those
    of ``x``. Every element of ``fill_value`` provides the value of all the bin
    elements of the corresponding bin. The output shares the bin indices of ``x``.

    Parameters
    ----------
    x:
        Binned variable or data array serving as prototype for bin sizes.
    fill_value:
        Fill values to use for the bins.

    Returns
    -------
    :
        Variable containing fill value in bins.

    Raises
    ------
    ValueError
        If ``x`` or ``fill_value`` is a DataGroup or a Dataset.
    """
    arguments = (x, fill_value)
    if any(isinstance(arg, DataGroup) for arg in arguments):  # type: ignore[unreachable]
        raise ValueError("`scipp.bins_like` does not support DataGroup arguments.")
    if any(isinstance(arg, Dataset) for arg in arguments):  # type: ignore[unreachable]
        raise ValueError("`scipp.bins_like` does not support Dataset arguments.")
    # For a data array only its data variable serves as the prototype.
    prototype = x.data if isinstance(x, DataArray) else x
    return _call_cpp_func(_cpp.bins_like, prototype, fill_value)  # type: ignore[return-value]