Coverage for install/scipp/core/bins.py: 56%

235 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-12-01 01:59 +0000

1# SPDX-License-Identifier: BSD-3-Clause 

2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp) 

3# @author Simon Heybrock 

4from __future__ import annotations 

5 

6from collections.abc import Callable 

7from typing import Generic, Literal, Sequence, TypedDict, TypeVar 

8 

9from .._scipp import core as _cpp 

10from ..typing import Dims, MetaDataMap, VariableLike 

11from ._cpp_wrapper_util import call_func as _call_cpp_func 

12from .bin_remapping import concat_bins 

13from .cpp_classes import DataArray, Dataset, DType, Unit, Variable 

14from .data_group import DataGroup 

15from .deprecation import _warn_attr_removal 

16from .domains import merge_equal_adjacent 

17from .math import midpoints 

18from .operations import islinspace 

19from .shape import concat 

20from .variable import scalar 

21 

22 

class Lookup:
    """Callable lookup table.

    This class should never be instantiated manually.
    Instead, use :func:`scipp.lookup`.

    See :func:`scipp.lookup` also for usage examples.
    """

    def __init__(
        self,
        op: Callable[[DataArray, Variable, str, Variable | None], Variable],
        func: DataArray,
        dim: str,
        fill_value: Variable | None = None,
    ):
        """Store the lookup operation together with its (possibly compacted) table.

        Parameters
        ----------
        op:
            Callable performing the actual lookup. It is invoked as
            ``op(func, var, dim, fill_value)``.
        func:
            Data array defining the table.
        dim:
            Dimension along which the lookup occurs.
        fill_value:
            Passed through unchanged to ``op`` on every lookup.
        """
        # Only unmasked, non-empty, 1-D tables of bool/integer dtype are compacted.
        # The order of the checks matters: later ones are skipped once one fails.
        can_compact = (
            not func.masks
            and func.ndim == 1
            and len(func) > 0
            and func.dtype in [DType.bool, DType.int32, DType.int64]
        )
        if can_compact:
            # Significant speedup if `func` is large but mostly constant.
            is_histogram_op = op == _cpp.buckets.map
            if is_histogram_op:
                coord_is_linspace = islinspace(func.coords[dim], dim).value
                if not coord_is_linspace:
                    func = merge_equal_adjacent(func)
            else:
                # In this case the C++ implementation currently uses no linspace
                # optimization, so the extra check is skipped.
                # Keep the first entry plus every entry whose value differs from
                # its predecessor.
                changed = func.data[:-1] != func.data[1:]
                tail = func[1:]
                func = concat([func[0], tail[changed]], dim)

        self.op = op
        self.func = func
        self.dim = dim
        self.fill_value = fill_value
        self.__transform_coords_input_keys__ = (dim,)  # for transform_coords

    def __call__(self, var: Variable) -> Variable:
        """Return table values for the given points."""
        lookup_op = self.op
        return lookup_op(self.func, var, self.dim, self.fill_value)

    def __getitem__(self, var: Variable) -> Variable:
        """Return table values for the given points (same as calling the table)."""
        return self.__call__(var)

67 

68 

def lookup(
    func: DataArray,
    dim: str | None = None,
    *,
    mode: Literal['previous', 'nearest'] | None = None,
    fill_value: Variable | None = None,
) -> Lookup:
    """Create a "lookup table" from a histogram (data array with bin-edge coord).

    The lookup table can be used to map, e.g., time-stamps to corresponding values
    given by a time-series log.

    Parameters
    ----------
    func:
        Data array defining the lookup table. Only its data, its masks, and the
        coordinate for ``dim`` are used; the input itself is not modified.
    dim:
        Dimension along which the lookup occurs. Defaults to ``func.dim``.
    mode:
        Mode used for looking up function values. Must be ``None`` when ``func`` is a
        histogram. Otherwise this defaults to 'nearest'.
    fill_value:
        Value to use for points outside the range of the function as well as points in
        masked regions of the function. If set to None (the default) this will use NaN
        for floating point types and 0 for integral types. Must have the same dtype and
        unit as the function values.

    Returns
    -------
    :
        The created lookup table.

    Raises
    ------
    ValueError
        If ``mode`` is set although ``func`` is a histogram, or if ``mode`` is not
        one of the supported values.

    Examples
    --------

      >>> x = sc.linspace(dim='x', start=0.0, stop=1.0, num=4)
      >>> vals = sc.array(dims=['x'], values=[3, 2, 1])
      >>> hist = sc.DataArray(data=vals, coords={'x': x})
      >>> sc.lookup(hist, 'x')[sc.array(dims=['event'], values=[0.1,0.4,0.1,0.6,0.9])]
      <scipp.Variable> (event: 5)      int64  [dimensionless]  [3, 2, ..., 2, 1]
    """
    if dim is None:
        dim = func.dim
    # Work on a fresh data array holding only what the lookup needs.
    func = DataArray(func.data, coords={dim: func.coords[dim]}, masks=func.masks)
    if func.dims[-1] != dim:
        # We automatically transpose the data so that `dim` is the inner dimension to
        # ensure contiguous memory access.
        dims = (*[d for d in func.dims if d != dim], dim)
        func.data = func.data.transpose(dims).copy()
        coord = func.coords[dim]
        func.coords[dim] = coord.transpose(
            # The coord potentially has fewer dims than the data (e.g., 1-D bin
            # edges for multi-dimensional data), just like the masks below.
            [d for d in dims if d in coord.dims] or None
        ).copy()
        # Snapshot the items because entries are reassigned inside the loop.
        for key, mask in list(func.masks.items()):
            func.masks[key] = mask.transpose(
                # Masks potentially have fewer dims than the data.
                [d for d in dims if d in mask.dims] or None
            ).copy()
    if func.coords.is_edges(dim, dim):
        if mode is not None:
            raise ValueError("Input is a histogram, 'mode' must not be set.")
        return Lookup(_cpp.buckets.map, func, dim, fill_value)
    if mode is None:
        mode = 'nearest'
    elif mode not in ['previous', 'nearest']:
        raise ValueError(f"Mode must be one of ['previous', 'nearest'], got '{mode}'")
    if mode == 'nearest' and func.sizes[dim] != 0:
        # 'nearest' is implemented on top of 'previous': the coord is replaced by
        # the lowest representable value followed by the midpoints between
        # neighboring coord values.
        coord = func.coords[dim]
        lowest = coord[dim, 0:0].max()  # trick to get lowest representable value
        parts = [lowest] if coord.sizes[dim] < 2 else [lowest, midpoints(coord, dim)]
        func.coords[dim] = concat(parts, dim)
    return Lookup(_cpp.lookup_previous, func, dim, fill_value)

138 

139 

class Constituents(TypedDict):
    """Typed dictionary describing the constituents of binned data.

    This is the return type of :py:attr:`Bins.constituents`.
    """

    # Buffer holding the content of all bins.
    data: Variable | DataArray | Dataset
    """Data content."""

    # Per-bin index ranges into the buffer.
    begin: Variable
    """Begin indices for each bin."""
    end: Variable
    """End indices for each bin."""

    # Name of the buffer dimension that is cut into bins.
    dim: str
    """Dimension in 'data' that the binning applies to."""

151 

152 

# Type of the object that owns the bins (the "parent"), see ``Bins``.
_O = TypeVar("_O", Variable, DataArray, Dataset)


class Bins(Generic[_O]):
    """Proxy for access to bin contents and operations on bins of a variable.

    This class is returned from the `bins` property of variables and should
    generally not be created directly.

    ``Bins`` is generic over the parent type, *not* the event type.
    That is, ``Variable.bins`` always returns ``Bins[Variable]`` regardless of whether
    the event list is a variable or data array.
    """

    def __init__(self, obj: _O) -> None:
        """Wrap ``obj``, the parent object whose bins are accessed."""
        self._obj: _O = obj

    def _data(self) -> Variable:
        """Return the variable holding the bins of the wrapped object."""
        if isinstance(self._obj, DataArray | Dataset):
            # Raises AttributeError for datasets as it should.
            return self._obj.data  # type: ignore[attr-defined, no-any-return]
        else:
            return self._obj

    def __mul__(self, lut: Lookup) -> _O:
        """Return a copy of the parent scaled with the lookup table ``lut``.

        The copy is converted to the dtype that results from multiplying the bin
        element dtype with the dtype of the lookup function.

        Raises
        ------
        NotImplementedError
            If the parent is a dataset.
        """
        if isinstance(self._obj, Dataset):
            raise NotImplementedError(
                "Multiplication of events in a dataset is not implemented"
            )
        # Multiply two dummy scalars to find the dtype of the result.
        target_dtype = (
            scalar(1, dtype=self.dtype) * scalar(1, dtype=lut.func.dtype)
        ).dtype
        copy = self._obj.to(dtype=target_dtype)
        _cpp.buckets.scale(copy, lut.func, lut.dim)
        return copy

    def __truediv__(self, lut: Lookup) -> _O:
        """Return a copy of the parent scaled with the reciprocal of ``lut``.

        The copy is converted to the dtype that results from dividing the bin
        element dtype by the dtype of the lookup function.

        Raises
        ------
        NotImplementedError
            If the parent is a dataset.
        """
        if isinstance(self._obj, Dataset):
            raise NotImplementedError(
                "Division of events in a dataset is not implemented"
            )
        # Divide two dummy scalars to find the dtype of the result.
        target_dtype = (
            scalar(1, dtype=self.dtype) / scalar(1, dtype=lut.func.dtype)
        ).dtype
        copy = self._obj.to(dtype=target_dtype)
        _cpp.buckets.scale(copy, _cpp.reciprocal(lut.func), lut.dim)
        return copy

    def __imul__(self, lut: Lookup) -> Bins[_O]:  # noqa: PYI034
        """Scale the parent in place with the lookup table ``lut``."""
        _cpp.buckets.scale(self._obj, lut.func, lut.dim)
        return self

    def __itruediv__(self, lut: Lookup) -> Bins[_O]:  # noqa: PYI034
        """Scale the parent in place with the reciprocal of ``lut``."""
        _cpp.buckets.scale(self._obj, _cpp.reciprocal(lut.func), lut.dim)
        return self

    def __getitem__(self, key: tuple[str, Variable | slice]) -> DataArray:
        """
        Extract events from bins based on labels or label ranges and return a copy.

        This is similar to regular label-based indexing, but considers the event-coords,
        i.e., the coord values of individual bin entries. Unlike normal label-based
        indexing this returns a copy, as a subset of events is extracted.

        Parameters
        ----------
        key:
            Pair of a dimension label and either a 0-D variable (select events by
            label) or a slice whose start and stop are variables or ``None``
            (select events by label range).

        Raises
        ------
        NotImplementedError
            If the parent is a dataset, or if events are selected by a single label
            and the parent is not a data array.
        ValueError
            If the slice has a step, if start or stop are not variables, or if the
            key is of an unsupported form.
        """
        if isinstance(self._obj, Dataset):
            raise NotImplementedError(
                "Extracting events from Datasets is not implemented."
            )

        dim, index = key
        if isinstance(index, Variable):
            if index.ndim == 0:
                if not isinstance(self._obj, DataArray):
                    raise NotImplementedError(
                        "Getting events by label is only implemented for DataArrays."
                    )
                # Group by the single label, then drop the length-1 dimension.
                return self._obj.group(index.flatten(to=dim)).squeeze(dim)
            # Variables that are not 0-D fall through to the error at the end.
        elif isinstance(index, slice):
            from .binning import _upper_bound

            if index.step is not None:
                raise ValueError(
                    "Label-based indexing with step (stride) is not "
                    f"supported. Got '{key}'"
                )
            start = index.start
            stop = index.stop
            # Open ends default to the range covered by the event coord.
            if start is None:
                start = self.coords[dim].min()
            if stop is None:
                stop = _upper_bound(self.coords[dim].max())

            if not (isinstance(start, Variable) and isinstance(stop, Variable)):
                raise ValueError(
                    "Bins can only be sliced using label-based indexing. Expected "
                    f"start and stop to be scipp.Variable, got '{start}' and '{stop}'."
                )

            if start > stop:
                # A defaulted bound on the wrong side of the explicit one is
                # collapsed onto the explicit bound.
                if index.start is None:
                    start = stop
                elif index.stop is None:
                    stop = start

            # Bin into the single interval [start, stop], then drop the dimension.
            return self._obj.bin({dim: concat([start, stop], dim)}).squeeze(dim)
        raise ValueError(
            f"Unsupported key '{key}'. Expected a dimension label and "
            "a 0-D variable or a dimension label and a slice object with start "
            "and stop given by a 0-D variable."
        )

    @property
    def coords(self) -> MetaDataMap:
        """Coords of the bins"""
        return _cpp._bins_view(self._data()).coords  # type: ignore[no-any-return]

    def drop_coords(self, coords: str | Sequence[str]) -> _O:
        """Drop coords from bin content

        Returns a new object; for a data array parent this is a shallow copy of the
        parent with its data replaced.

        Raises
        ------
        NotImplementedError
            If the parent is a dataset.
        """
        if isinstance(self._obj, Dataset):
            raise NotImplementedError("bins.drop_coords does not support datasets")
        content = self.constituents
        content['data'] = content['data'].drop_coords(coords)  # type: ignore[union-attr]
        data: Variable = _cpp._bins_no_validate(**content)
        if isinstance(self._obj, DataArray):
            out = self._obj.copy(deep=False)
            out.data = data
            return out
        return data

    @property
    def meta(self) -> MetaDataMap:
        """Coords and attrs of the bins

        .. deprecated:: 23.9.0
           Use :py:attr:`coords` with unset alignment flag instead, or
           store attributes in higher-level data structures.
        """
        _warn_attr_removal()
        return self.deprecated_meta

    @property
    def attrs(self) -> MetaDataMap:
        """Attrs of the bins

        .. deprecated:: 23.9.0
           Use :py:attr:`coords` with unset alignment flag instead, or
           store attributes in higher-level data structures.
        """
        _warn_attr_removal()
        return self.deprecated_attrs

    @property
    def deprecated_meta(self) -> MetaDataMap:
        # Same as `meta` but without emitting the deprecation warning.
        return _cpp._bins_view(self._data()).deprecated_meta  # type: ignore[no-any-return]

    @property
    def deprecated_attrs(self) -> MetaDataMap:
        # Same as `attrs` but without emitting the deprecation warning.
        return _cpp._bins_view(self._data()).deprecated_attrs  # type: ignore[no-any-return]

    @property
    def masks(self) -> MetaDataMap:
        """Masks of the bins"""
        return _cpp._bins_view(self._data()).masks  # type: ignore[no-any-return]

    def drop_masks(self, masks: str | Sequence[str]) -> _O:
        """Drop masks from bin content

        Returns a new object; for a data array parent this is a shallow copy of the
        parent with its data replaced.

        Raises
        ------
        NotImplementedError
            If the parent is a dataset.
        """
        if isinstance(self._obj, Dataset):
            raise NotImplementedError("bins.drop_masks does not support datasets")
        content = self.constituents
        content['data'] = content['data'].drop_masks(masks)  # type: ignore[union-attr]
        data: Variable = _cpp._bins_no_validate(**content)
        if isinstance(self._obj, DataArray):
            out = self._obj.copy(deep=False)
            out.data = data
            return out
        return data

    @property
    def data(self) -> Variable:
        """Data of the bins"""
        return _cpp._bins_view(self._data()).data  # type: ignore[no-any-return]

    @data.setter
    def data(self, data: Variable) -> None:
        """Set data of the bins"""
        _cpp._bins_view(self._data()).data = data

    @property
    def unit(self) -> Unit | None:
        """Unit of the bin elements"""
        return self.constituents['data'].unit  # type: ignore[union-attr]

    @unit.setter
    def unit(self, unit: Unit | str | None) -> None:
        """Set unit of the bin elements"""
        self.constituents['data'].unit = unit  # type: ignore[union-attr]

    @property
    def dtype(self) -> DType:
        """Data type of the bin elements."""
        return self.constituents['data'].dtype  # type: ignore[union-attr]

    @property
    def aligned(self) -> bool:
        """Alignment flag for coordinates of bin elements."""
        return self.constituents['data'].aligned  # type: ignore[union-attr]

    @property
    def constituents(self) -> Constituents:
        """Constituents of binned data, as supported by :py:func:`sc.bins`."""
        return _call_cpp_func(_cpp.bins_constituents, self._data())  # type: ignore[return-value]

    def sum(self) -> _O:
        """Sum of events in each bin.

        Returns
        -------
        :
            The sum of each of the input bins.

        See Also
        --------
        scipp.sum:
            For summing non-bin data or summing bins.
        """
        return _call_cpp_func(_cpp.bins_sum, self._obj)  # type: ignore[return-value]

    def nansum(self) -> _O:
        """Sum of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The sum of each of the input bins without NaN's.

        See Also
        --------
        scipp.nansum:
            For summing non-bin data or summing bins.
        """
        return _call_cpp_func(_cpp.bins_nansum, self._obj)  # type: ignore[return-value]

    def mean(self) -> _O:
        """Arithmetic mean of events in each bin.

        Returns
        -------
        :
            The mean of each of the input bins.

        See Also
        --------
        scipp.mean:
            For calculating the mean of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_mean, self._obj)  # type: ignore[return-value]

    def nanmean(self) -> _O:
        """Arithmetic mean of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The mean of each of the input bins without NaN's.

        See Also
        --------
        scipp.nanmean:
            For calculating the mean of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_nanmean, self._obj)  # type: ignore[return-value]

    def max(self) -> _O:
        """Maximum of events in each bin.

        Returns
        -------
        :
            The maximum of each of the input bins.

        See Also
        --------
        scipp.max:
            For calculating the maximum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_max, self._obj)  # type: ignore[return-value]

    def nanmax(self) -> _O:
        """Maximum of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The maximum of each of the input bins without NaN's.

        See Also
        --------
        scipp.nanmax:
            For calculating the maximum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_nanmax, self._obj)  # type: ignore[return-value]

    def min(self) -> _O:
        """Minimum of events in each bin.

        Returns
        -------
        :
            The minimum of each of the input bins.

        See Also
        --------
        scipp.min:
            For calculating the minimum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_min, self._obj)  # type: ignore[return-value]

    def nanmin(self) -> _O:
        """Minimum of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The minimum of each of the input bins without NaN's.

        See Also
        --------
        scipp.nanmin:
            For calculating the minimum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_nanmin, self._obj)  # type: ignore[return-value]

    def all(self) -> _O:
        """Logical AND of events in each bin.

        Returns
        -------
        :
            The AND of each of the input bins.

        See Also
        --------
        scipp.all:
            For performing an AND of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_all, self._obj)  # type: ignore[return-value]

    def any(self) -> _O:
        """Logical OR of events in each bin.

        Returns
        -------
        :
            The OR of each of the input bins.

        See Also
        --------
        scipp.any:
            For performing an OR of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_any, self._obj)  # type: ignore[return-value]

    def size(self) -> Variable:
        """Number of events or elements in a bin.

        Returns
        -------
        :
            The number of elements in each of the input bins.
        """
        return _call_cpp_func(_cpp.bin_sizes, self._obj)  # type: ignore[return-value]

    def concat(self, dim: Dims = None) -> _O:
        """Concatenate bins element-wise by concatenating bin contents along
        their internal bin dimension.

        This is a reduction operation similar to :py:func:`scipp.sum` but operates on
        binned data. Elements (bins) are concatenated along their internal dimension.

        Parameters
        ----------
        dim:
            Reduction dimension.

        Returns
        -------
        :
            All bins along `dim` concatenated into a single bin.

        Raises
        ------
        NotImplementedError
            If the parent is a dataset.
        """
        if isinstance(self._obj, Dataset):
            raise NotImplementedError(
                "Concatenating bins is not implemented for datasets"
            )
        return concat_bins(self._obj, dim)

    def concatenate(
        self,
        other: Variable | DataArray,
        *,
        out: DataArray | None = None,
    ) -> Variable | DataArray:
        """Concatenate bins element-wise by concatenating bin contents along
        their internal bin dimension.

        The bins to concatenate are obtained element-wise from `self` and `other`.

        Parameters
        ----------
        other:
            Other input containing bins.
        out:
            Optional output. If this is the object whose bins are accessed, the bins
            of `other` are appended to it in place. Any other object passed here is
            not written to; a newly created result is returned instead.

        Returns
        -------
        :
            The bins of the two inputs merged.

        Raises
        ------
        scipp.DTypeError
            If `other` is not binned data.
        """
        if out is None:
            return _call_cpp_func(_cpp.buckets.concatenate, self._obj, other)  # type: ignore[return-value]
        else:
            if self._obj is out:
                # In-place: append the bins of `other` to the parent.
                _call_cpp_func(_cpp.buckets.append, self._obj, other)
            else:
                # NOTE: this rebinds the local name only; the object passed as
                # `out` is left untouched.
                out = _call_cpp_func(_cpp.buckets.concatenate, self._obj, other)  # type: ignore[assignment]
            return out

584 

585 

def _bins(obj: _O) -> Bins[_O] | None:
    """
    Return a :py:class:`scipp.Bins` helper wrapping ``obj`` for bin-wise
    operations, or `None` if ``obj`` is not binned data.
    """
    return Bins(obj) if _cpp.is_bins(obj) else None

595 

596 

597def _set_bins(obj: _O, bins: Bins[_O]) -> None: 

598 # Should only be used by __iadd__ and friends 

599 if obj is not bins._obj: 

600 raise ValueError("Cannot set bins with a new object") 

601 

602 

def bins(
    *,
    data: VariableLike,
    dim: str,
    begin: Variable | None = None,
    end: Variable | None = None,
) -> Variable:
    """Create a binned variable from bin indices.

    The elements of the returned variable are "bins", defined as views into
    ``data``. The returned variable keeps and manages a copy of ``data``
    internally.

    The variables ``begin`` and ``end`` must have the same dims and shape and
    ``dtype=sc.DType.int64``. The output dims and shape are given by ``begin``.
    If only ``begin`` is given, each bucket is a slice containing a non-range
    slice of ``data`` at the given indices. If neither ``begin`` nor ``end``
    are given, the output has ``dims=[dim]`` and contains all non-range slices
    along that dimension.

    Parameters
    ----------
    begin:
        Optional begin indices of bins, used for slicing ``data``.
        If not provided each row of ``data`` is mapped to a different bin.
    end:
        Optional end indices of bins, used for slicing ``data``. If not
        provided, ``begin`` is used as starting offsets for each bin, i.e., the end of
        the Nth bin is set to the begin of the N+1st bin.
    dim:
        Dimension of ``data`` that will be sliced to obtain data for
        any given bin.
    data:
        A variable, data array, or dataset containing combined data of all bins.

    Returns
    -------
    :
        Variable containing data in bins.

    Raises
    ------
    ValueError
        If any of ``begin``, ``end``, or ``data`` is a ``DataGroup``.

    See Also
    --------
    scipp.bin:
        For creating DataArrays based on binning of coord value
        instead of explicitly given index ranges.
    """
    # Data groups are rejected up front, before handing over to the C++ layer.
    for argument in (begin, end, data):
        if isinstance(argument, DataGroup):
            raise ValueError("`scipp.bins` does not support DataGroup arguments.")
    return _call_cpp_func(_cpp.bins, begin, end, dim, data)  # type: ignore[return-value]

652 

653 

def bins_like(x: VariableLike, fill_value: Variable) -> Variable:
    """Create a binned variable by "broadcasting" fill values to bins of given sizes.

    The dimensions and shape of ``fill_value`` must be such that they can be broadcast
    to those of ``x``. Each element of ``fill_value`` defines the values of all the bin
    elements of the corresponding bin. The output shares the bin indices of ``x``.

    Parameters
    ----------
    x:
        Binned variable or data array serving as prototype for bin sizes.
    fill_value:
        Fill values to use for the bins.

    Returns
    -------
    :
        Variable containing fill value in bins.

    Raises
    ------
    ValueError
        If ``x`` or ``fill_value`` is a ``DataGroup`` or a ``Dataset``.
    """
    # Checked in this order so that data groups are reported before datasets.
    rejected = (
        (DataGroup, "`scipp.bins_like` does not support DataGroup arguments."),
        (Dataset, "`scipp.bins_like` does not support Dataset arguments."),
    )
    for kind, message in rejected:
        if isinstance(x, kind) or isinstance(fill_value, kind):
            raise ValueError(message)
    # For a data array only its data serves as the prototype.
    if isinstance(x, DataArray):
        prototype = x.data
    else:
        prototype = x
    return _call_cpp_func(_cpp.bins_like, prototype, fill_value)  # type: ignore[return-value]