Coverage for install/scipp/core/bins.py: 55%
195 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-04-28 01:28 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-04-28 01:28 +0000
1# SPDX-License-Identifier: BSD-3-Clause
2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
3# @author Simon Heybrock
4import warnings
5from typing import Callable, Dict, Literal, Optional, Tuple, Union
7from .._scipp import core as _cpp
8from ..typing import Dims, MetaDataMap, VariableLike
9from ._cpp_wrapper_util import call_func as _call_cpp_func
10from .bin_remapping import concat_bins
11from .cpp_classes import Variable
12from .data_group import DataGroup
13from .deprecation import _warn_attr_removal
14from .domains import merge_equal_adjacent
15from .math import midpoints
16from .operations import islinspace
17from .shape import concat
18from .variable import scalar
class Lookup:
    """Lookup table.

    Do not create instances of this class by hand.
    Use :func:`scipp.lookup` instead.

    Usage examples can be found in the documentation of :func:`scipp.lookup`.
    """

    def __init__(
        self,
        op: Callable,
        func: _cpp.DataArray,
        dim: str,
        fill_value: Optional[Variable] = None,
    ):
        """Store the lookup operation together with its table.

        Parameters
        ----------
        op:
            Callable invoked as ``op(func, var, dim, fill_value)`` on lookup.
        func:
            Data array providing the table values and the coordinate for ``dim``.
        dim:
            Dimension along which the lookup occurs.
        fill_value:
            Forwarded unchanged to ``op`` on every lookup.
        """
        # Only unmasked, non-empty, 1-D tables with bool/int32/int64 values are
        # compacted before being stored.
        # Significant speedup if `func` is large but mostly constant.
        compactable = (
            not func.masks
            and func.ndim == 1
            and len(func) > 0
            and func.dtype in [_cpp.DType.bool, _cpp.DType.int32, _cpp.DType.int64]
        )
        if compactable and op == _cpp.buckets.map:
            coord_is_linspace = islinspace(func.coords[dim], dim).value
            if not coord_is_linspace:
                func = merge_equal_adjacent(func)
        elif compactable:
            # In this case the C++ implementation currently uses no linspace
            # optimization, so the extra check is skipped.
            # Keep the first entry plus every entry whose value differs from
            # its predecessor.
            value_changes = func.data[:-1] != func.data[1:]
            func = concat([func[0], func[1:][value_changes]], dim)

        self.op = op
        self.func = func
        self.dim = dim
        self.fill_value = fill_value
        self.__transform_coords_input_keys__ = (dim,)  # for transform_coords

    def __call__(self, var: Variable) -> Variable:
        """Return table values for the given points."""
        operation = self.op
        return operation(self.func, var, self.dim, self.fill_value)

    def __getitem__(self, var: Variable) -> Variable:
        """Return table values for the given points (same as calling the table)."""
        return self(var)
def lookup(
    func: _cpp.DataArray,
    dim: Optional[str] = None,
    *,
    mode: Optional[Literal['previous', 'nearest']] = None,
    fill_value: Optional[_cpp.Variable] = None,
) -> Lookup:
    """Create a "lookup table" from a histogram (data array with bin-edge coord).

    A typical use of the lookup table is mapping, e.g., time-stamps to the
    corresponding values given by a time-series log.

    Parameters
    ----------
    func:
        Data array defining the lookup table.
    dim:
        Dimension along which the lookup occurs. If omitted, ``func.dim`` is used.
    mode:
        Mode used for looking up function values. Must be ``None`` when ``func`` is a
        histogram. Otherwise this defaults to 'nearest'.
    fill_value:
        Value to use for points outside the range of the function as well as points in
        masked regions of the function. If set to None (the default) this will use NaN
        for floating point types and 0 for integral types. Must have the same dtype and
        unit as the function values.

    Returns
    -------
    :
        The created lookup table.

    Raises
    ------
    ValueError
        If ``mode`` is set although ``func`` is a histogram, or if ``mode`` is
        neither 'previous' nor 'nearest'.

    Examples
    --------

      >>> x = sc.linspace(dim='x', start=0.0, stop=1.0, num=4)
      >>> vals = sc.array(dims=['x'], values=[3, 2, 1])
      >>> hist = sc.DataArray(data=vals, coords={'x': x})
      >>> sc.lookup(hist, 'x')[sc.array(dims=['event'], values=[0.1,0.4,0.1,0.6,0.9])]
      <scipp.Variable> (event: 5) int64 [dimensionless] [3, 2, ..., 2, 1]
    """
    if dim is None:
        dim = func.dim
    # Rebuild the table with only the data, the lookup coordinate and the masks.
    table = _cpp.DataArray(
        func.data, coords={dim: func.coords[dim]}, masks=func.masks
    )

    if table.coords.is_edges(dim):
        if mode is not None:
            raise ValueError("Input is a histogram, 'mode' must not be set.")
        return Lookup(_cpp.buckets.map, table, dim, fill_value)

    selected_mode = 'nearest' if mode is None else mode
    if selected_mode not in ['previous', 'nearest']:
        raise ValueError(f"Mode must be one of ['previous', 'nearest'], got '{mode}'")

    if selected_mode == 'nearest' and table.sizes[dim] != 0:
        # 'nearest' is expressed through the 'previous' lookup by replacing the
        # coordinate with the lowest value followed by the midpoints.
        coord = table.coords[dim]
        lowest = coord[dim, 0:0].max()  # trick to get lowest representable value
        if coord.sizes[dim] < 2:
            parts = [lowest]
        else:
            parts = [lowest, midpoints(coord, dim)]
        table.coords[dim] = concat(parts, dim)
    return Lookup(_cpp.lookup_previous, table, dim, fill_value)
class Bins:
    """Proxy for access to bin contents and operations on bins of a variable.

    This class is returned from the `bins` property of variables and should
    generally not be created directly.
    """

    def __init__(self, obj):
        # The wrapped binned object. It may or may not expose a ``data``
        # attribute, see :py:meth:`_data`.
        self._obj = obj

    def _data(self):
        """Return ``self._obj.data`` if that attribute exists, else ``self._obj``."""
        try:
            return self._obj.data
        except AttributeError:
            return self._obj

    def __mul__(self, lut: Lookup):
        """Return the wrapped object converted and scaled by a lookup table.

        The wrapped object is converted to the dtype resulting from multiplying
        a bin element by a table value, ``_cpp.buckets.scale`` is applied to the
        converted object, and the converted object is returned.
        """
        target_dtype = (
            scalar(1, dtype=self.dtype) * scalar(1, dtype=lut.func.dtype)
        ).dtype
        copy = self._obj.to(dtype=target_dtype)
        _cpp.buckets.scale(copy, lut.func, lut.dim)
        return copy

    def __truediv__(self, lut: Lookup):
        """Return the wrapped object converted and scaled by the reciprocal table.

        Same as :py:meth:`__mul__` except that the target dtype is that of a
        division and the reciprocal of the table values is used for scaling.
        """
        target_dtype = (
            scalar(1, dtype=self.dtype) / scalar(1, dtype=lut.func.dtype)
        ).dtype
        copy = self._obj.to(dtype=target_dtype)
        _cpp.buckets.scale(copy, _cpp.reciprocal(lut.func), lut.dim)
        return copy

    def __imul__(self, lut: Lookup):
        """Apply ``_cpp.buckets.scale`` with the table to the wrapped object itself."""
        _cpp.buckets.scale(self._obj, lut.func, lut.dim)
        return self

    def __itruediv__(self, lut: Lookup):
        """Apply ``_cpp.buckets.scale`` with the reciprocal table to the wrapped object."""
        _cpp.buckets.scale(self._obj, _cpp.reciprocal(lut.func), lut.dim)
        return self

    def __getitem__(self, key: Tuple[str, Union[_cpp.Variable, slice]]):
        """
        Extract events from bins based on labels or label ranges and return a copy.

        This is similar to regular label-based indexing, but considers the event-coords,
        i.e., the coord values of individual bin entries. Unlike normal label-based
        indexing this returns a copy, as a subset of events is extracted.

        Parameters
        ----------
        key:
            Tuple of a dimension label and either a 0-D variable or a slice whose
            ``start`` and ``stop`` are variables or ``None``. A missing ``start``
            defaults to the minimum of the event coord, a missing ``stop`` to
            ``_upper_bound`` of its maximum.

        Raises
        ------
        ValueError
            If the slice has a step, if ``start`` or ``stop`` is not a variable,
            or if the index is neither a 0-D variable nor a slice.
        """
        dim, index = key
        if isinstance(index, _cpp.Variable):
            if index.ndim == 0:
                return self._obj.group(index.flatten(to=dim)).squeeze(dim)
            # Variables with ndim != 0 fall through to the error at the end.
        elif isinstance(index, slice):
            # Imported locally, presumably to avoid a circular import between
            # this module and .binning -- TODO confirm.
            from .binning import _upper_bound

            if index.step is not None:
                raise ValueError(
                    "Label-based indexing with step (stride) is not "
                    f"supported. Got '{key}'"
                )
            start = index.start
            stop = index.stop
            if start is None:
                start = self._obj.bins.coords[dim].min()
            if stop is None:
                stop = _upper_bound(self._obj.bins.coords[dim].max())

            if not (
                isinstance(start, _cpp.Variable) and isinstance(stop, _cpp.Variable)
            ):
                raise ValueError(
                    "Bins can only be sliced using label-based indexing. Expected "
                    f"start and stop to be scipp.Variable, got '{start}' and '{stop}'."
                )

            # If a defaulted bound ended up on the wrong side of the explicit
            # one, collapse the range onto the explicit bound.
            if start > stop:
                if index.start is None:
                    start = stop
                elif index.stop is None:
                    stop = start

            return self._obj.bin({dim: concat([start, stop], dim)}).squeeze(dim)
        raise ValueError(
            f"Unsupported key '{key}'. Expected a dimension label and "
            "a 0-D variable or a dimension label and a slice object with start "
            "and stop given by a 0-D variable."
        )

    @property
    def coords(self) -> MetaDataMap:
        """Coords of the bins"""
        return _cpp._bins_view(self._data()).coords

    @property
    def meta(self) -> MetaDataMap:
        """Coords and attrs of the bins

        .. deprecated:: 23.9.0
           Use :py:attr:`coords` with unset alignment flag instead, or
           store attributes in higher-level data structures.
        """
        _warn_attr_removal()
        return self.deprecated_meta

    @property
    def attrs(self) -> MetaDataMap:
        """Attrs of the bins

        .. deprecated:: 23.9.0
           Use :py:attr:`coords` with unset alignment flag instead, or
           store attributes in higher-level data structures.
        """
        _warn_attr_removal()
        return self.deprecated_attrs

    @property
    def deprecated_meta(self) -> MetaDataMap:
        """Same value as :py:attr:`meta`, but without the removal warning."""
        return _cpp._bins_view(self._data()).deprecated_meta

    @property
    def deprecated_attrs(self) -> MetaDataMap:
        """Same value as :py:attr:`attrs`, but without the removal warning."""
        return _cpp._bins_view(self._data()).deprecated_attrs

    @property
    def masks(self) -> MetaDataMap:
        """Masks of the bins"""
        return _cpp._bins_view(self._data()).masks

    @property
    def data(self) -> _cpp.Variable:
        """Data of the bins"""
        return _cpp._bins_view(self._data()).data

    @data.setter
    def data(self, data: _cpp.Variable):
        """Set data of the bins"""
        _cpp._bins_view(self._data()).data = data

    @property
    def unit(self) -> _cpp.Unit:
        """Unit of the bin elements"""
        return self.constituents['data'].unit

    @unit.setter
    def unit(self, unit: Union[_cpp.Unit, str]):
        """Set unit of the bin elements"""
        self.constituents['data'].unit = unit

    @property
    def dtype(self) -> _cpp.DType:
        """Data type of the bin elements."""
        return self.constituents['data'].dtype

    @property
    def aligned(self) -> bool:
        """Alignment flag for coordinates of bin elements."""
        return self.constituents['data'].aligned

    @property
    def constituents(self) -> Dict[str, Union[str, _cpp.Variable, _cpp.DataArray]]:
        """Constituents of binned data, as supported by :py:func:`sc.bins`."""
        return _call_cpp_func(_cpp.bins_constituents, self._data())

    def sum(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Sum of events in each bin.

        Returns
        -------
        :
            The sum of each of the input bins.

        See Also
        --------
        scipp.sum:
            For summing non-bin data or summing bins.
        """
        return _call_cpp_func(_cpp.bins_sum, self._obj)

    def nansum(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Sum of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The sum of each of the input bins without NaN's.

        See Also
        --------
        scipp.nansum:
            For summing non-bin data or summing bins.
        """
        return _call_cpp_func(_cpp.bins_nansum, self._obj)

    def mean(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Arithmetic mean of events in each bin.

        Returns
        -------
        :
            The mean of each of the input bins.

        See Also
        --------
        scipp.mean:
            For calculating the mean of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_mean, self._obj)

    def nanmean(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Arithmetic mean of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The mean of each of the input bins without NaN's.

        See Also
        --------
        scipp.nanmean:
            For calculating the mean of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_nanmean, self._obj)

    def max(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Maximum of events in each bin.

        Returns
        -------
        :
            The maximum of each of the input bins.

        See Also
        --------
        scipp.max:
            For calculating the maximum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_max, self._obj)

    def nanmax(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Maximum of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The maximum of each of the input bins without NaN's.

        See Also
        --------
        scipp.nanmax:
            For calculating the maximum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_nanmax, self._obj)

    def min(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Minimum of events in each bin.

        Returns
        -------
        :
            The minimum of each of the input bins.

        See Also
        --------
        scipp.min:
            For calculating the minimum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_min, self._obj)

    def nanmin(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Minimum of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The minimum of each of the input bins without NaN's.

        See Also
        --------
        scipp.nanmin:
            For calculating the minimum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_nanmin, self._obj)

    def all(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Logical AND of events in each bin.

        Returns
        -------
        :
            The AND of each of the input bins.

        See Also
        --------
        scipp.all:
            For performing an AND of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_all, self._obj)

    def any(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Logical OR of events in each bin.

        Returns
        -------
        :
            The OR of each of the input bins.

        See Also
        --------
        scipp.any:
            For performing an OR of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_any, self._obj)

    def size(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Number of events or elements in a bin.

        Returns
        -------
        :
            The number of elements in each of the input bins.
        """
        return _call_cpp_func(_cpp.bin_sizes, self._obj)

    def concat(self, dim: Dims = None) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Concatenate bins element-wise by concatenating bin contents along
        their internal bin dimension.

        This is a reduction operation similar to :py:func:`scipp.sum` but operates on
        binned data. Elements (bins) are concatenated along their internal dimension.

        Parameters
        ----------
        dim:
            Reduction dimension.

        Returns
        -------
        :
            All bins along `dim` concatenated into a single bin.
        """
        return concat_bins(self._obj, dim)

    def concatenate(
        self,
        other: Union[_cpp.Variable, _cpp.DataArray],
        *,
        out: Optional[_cpp.DataArray] = None,
    ) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Concatenate bins element-wise by concatenating bin contents along
        their internal bin dimension.

        The bins to concatenate are obtained element-wise from `self` and `other`.

        Parameters
        ----------
        other:
            Other input containing bins.
        out:
            Optional output buffer. Only honoured in place when it is the very
            object wrapped by this proxy, in which case ``other`` is appended
            to it. Any other ``out`` is not written to; a newly created result
            is returned instead.

        Returns
        -------
        :
            The bins of the two inputs merged.

        Raises
        ------
        scipp.DTypeError
            If `other` is not binned data.
        """
        if out is None:
            return _call_cpp_func(_cpp.buckets.concatenate, self._obj, other)
        if self._obj is out:
            _call_cpp_func(_cpp.buckets.append, self._obj, other)
        else:
            # NOTE(review): this only rebinds the local name; the buffer passed
            # by the caller is left untouched.
            out = _call_cpp_func(_cpp.buckets.concatenate, self._obj, other)
        return out
class GroupbyBins:
    """Proxy for operations on bins of a groupby object."""

    def __init__(self, obj):
        # The wrapped groupby object; `concat` is forwarded to it.
        self._obj = obj

    def concat(self, dim):
        """Emit a deprecation warning, then forward to ``concat`` of the wrapped object.

        Parameters
        ----------
        dim:
            Passed through unchanged to the wrapped object's ``concat``.

        Returns
        -------
        :
            Whatever the wrapped object's ``concat(dim)`` returns.
        """
        message = (
            "groupby(...).bins.concat(dim) is deprecated. "
            "Use `group` or `bin` instead"
        )
        # stacklevel=2 attributes the warning to the caller of this method.
        warnings.warn(message, category=UserWarning, stacklevel=2)
        wrapped = self._obj
        return wrapped.concat(dim)
def _bins(obj):
    """
    Return a :py:class:`scipp.Bins` helper for bin-wise operations on ``obj``,
    or `None` if ``obj`` is not binned data.
    """
    return Bins(obj) if _cpp.is_bins(obj) else None
def _set_bins(obj, bins: Bins):
    """Reject assignment of a ``Bins`` proxy that wraps a different object.

    Should only be used by __iadd__ and friends.

    Raises
    ------
    ValueError
        If ``bins`` does not wrap ``obj`` itself.
    """
    wraps_same_object = obj is bins._obj
    if not wraps_same_object:
        raise ValueError("Cannot set bins with a new object")
def _groupby_bins(obj):
    """Wrap ``obj`` in a :py:class:`GroupbyBins` proxy."""
    proxy = GroupbyBins(obj)
    return proxy
def bins(
    *,
    data: VariableLike,
    dim: str,
    begin: Optional[_cpp.Variable] = None,
    end: Optional[_cpp.Variable] = None,
) -> _cpp.Variable:
    """Create a binned variable from bin indices.

    Each element of the returned variable is a "bin", defined as a view into
    ``data``. A copy of ``data`` is kept and managed internally by the
    returned variable.

    ``begin`` and ``end`` must agree in dims and shape and must have
    ``dtype=sc.DType.int64``. The dims and shape of the output are those of
    ``begin``. If only ``begin`` is given, each bucket is a slice containing a
    non-range slice of ``data`` at the given indices. If neither ``begin`` nor
    ``end`` are given, the output has ``dims=[dim]`` and contains all non-range
    slices along that dimension.

    Parameters
    ----------
    begin:
        Optional begin indices of bins, used for slicing ``data``.
        If not provided each row of ``data`` is mapped to a different bin.
    end:
        Optional end indices of bins, used for slicing ``data``. If not
        provided, ``begin`` is used as starting offsets for each bin, i.e., the end of
        the Nth bin is set to the begin of the N+1st bin.
    dim:
        Dimension of ``data`` that will be sliced to obtain data for
        any given bin.
    data:
        A variable, data array, or dataset containing combined data of all bins.

    Returns
    -------
    :
        Variable containing data in bins.

    Raises
    ------
    ValueError
        If any of ``begin``, ``end`` or ``data`` is a DataGroup.

    See Also
    --------
    scipp.bin:
        For creating DataArrays based on binning of coord value
        instead of explicitly given index ranges.
    """
    has_data_group = (
        isinstance(begin, DataGroup)
        or isinstance(end, DataGroup)
        or isinstance(data, DataGroup)
    )
    if has_data_group:
        raise ValueError("`scipp.bins` does not support DataGroup arguments.")
    return _call_cpp_func(_cpp.bins, begin, end, dim, data)
def bins_like(x: VariableLike, fill_value: _cpp.Variable) -> _cpp.Variable:
    """Create a binned variable by "broadcasting" fill values to bins of given sizes.

    ``fill_value`` must have dimensions and a shape that can be broadcast to
    those of ``x``. Every element of ``fill_value`` provides the value for all
    bin elements of the corresponding bin. The bin indices of the output are
    shared with ``x``.

    Parameters
    ----------
    x:
        Binned variable or data array serving as prototype for bin sizes.
    fill_value:
        Fill values to use for the bins.

    Returns
    -------
    :
        Variable containing fill value in bins.

    Raises
    ------
    ValueError
        If ``x`` or ``fill_value`` is a DataGroup.
    """
    if any(isinstance(arg, DataGroup) for arg in (x, fill_value)):
        raise ValueError("`scipp.bins_like` does not support DataGroup arguments.")
    # Anything that is not a plain variable contributes its ``data`` as prototype.
    prototype = x if isinstance(x, _cpp.Variable) else x.data
    return _call_cpp_func(_cpp.bins_like, prototype, fill_value)