Coverage for install/scipp/core/bins.py: 55%

195 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-04-28 01:28 +0000

1# SPDX-License-Identifier: BSD-3-Clause 

2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp) 

3# @author Simon Heybrock 

4import warnings 

5from typing import Callable, Dict, Literal, Optional, Tuple, Union 

6 

7from .._scipp import core as _cpp 

8from ..typing import Dims, MetaDataMap, VariableLike 

9from ._cpp_wrapper_util import call_func as _call_cpp_func 

10from .bin_remapping import concat_bins 

11from .cpp_classes import Variable 

12from .data_group import DataGroup 

13from .deprecation import _warn_attr_removal 

14from .domains import merge_equal_adjacent 

15from .math import midpoints 

16from .operations import islinspace 

17from .shape import concat 

18from .variable import scalar 

19 

20 

class Lookup:
    """Lookup table.

    This class should never be instantiated manually.
    Instead, use :func:`scipp.lookup`.

    See :func:`scipp.lookup` also for usage examples.
    """

    def __init__(
        self,
        op: Callable,
        func: _cpp.DataArray,
        dim: str,
        fill_value: Optional[Variable] = None,
    ):
        """Store the lookup operation together with its table.

        Parameters
        ----------
        op:
            Callable invoked as ``op(func, var, dim, fill_value)`` on lookup.
        func:
            Data array providing the table values and the coord for ``dim``.
            Unmasked, non-empty 1-D tables of bool/int32/int64 dtype are
            compacted before being stored.
        dim:
            Dimension along which the lookup occurs.
        fill_value:
            Passed through unchanged to ``op`` on every lookup.
        """
        compactable_dtypes = (_cpp.DType.bool, _cpp.DType.int32, _cpp.DType.int64)
        unmasked_1d = not func.masks and func.ndim == 1
        if unmasked_1d and len(func) > 0 and func.dtype in compactable_dtypes:
            # Significant speedup if `func` is large but mostly constant.
            if op == _cpp.buckets.map:
                is_uniform = islinspace(func.coords[dim], dim).value
                if not is_uniform:
                    func = merge_equal_adjacent(func)
            else:
                # In this case the C++ implementation currently uses no linspace
                # optimization, so the extra check is skipped.
                changed = func.data[:-1] != func.data[1:]
                head, tail = func[0], func[1:]
                func = concat([head, tail[changed]], dim)
        self.op = op
        self.func = func
        self.dim = dim
        self.fill_value = fill_value
        self.__transform_coords_input_keys__ = (dim,)  # for transform_coords

    def __call__(self, var: Variable) -> Variable:
        """Return table values for the given points."""
        return self.op(self.func, var, self.dim, self.fill_value)

    def __getitem__(self, var: Variable) -> Variable:
        """Return table values for the given points."""
        return self.__call__(var)

65 

66 

def lookup(
    func: _cpp.DataArray,
    dim: Optional[str] = None,
    *,
    mode: Optional[Literal['previous', 'nearest']] = None,
    fill_value: Optional[_cpp.Variable] = None,
) -> Lookup:
    """Create a "lookup table" from a histogram (data array with bin-edge coord).

    The lookup table can be used to map, e.g., time-stamps to corresponding values
    given by a time-series log.

    Parameters
    ----------
    func:
        Data array defining the lookup table.
    dim:
        Dimension along which the lookup occurs. If omitted, ``func.dim`` is used.
    mode:
        Mode used for looking up function values. Must be ``None`` when ``func`` is a
        histogram. Otherwise this defaults to 'nearest'.
    fill_value:
        Value to use for points outside the range of the function as well as points in
        masked regions of the function. If set to None (the default) this will use NaN
        for floating point types and 0 for integral types. Must have the same dtype and
        unit as the function values.

    Returns
    -------
    :
        The created lookup table.

    Raises
    ------
    ValueError
        If ``mode`` is set although ``func`` is a histogram, or if ``mode`` is
        neither 'previous' nor 'nearest'.

    Examples
    --------

      >>> x = sc.linspace(dim='x', start=0.0, stop=1.0, num=4)
      >>> vals = sc.array(dims=['x'], values=[3, 2, 1])
      >>> hist = sc.DataArray(data=vals, coords={'x': x})
      >>> sc.lookup(hist, 'x')[sc.array(dims=['event'], values=[0.1,0.4,0.1,0.6,0.9])]
      <scipp.Variable> (event: 5)      int64  [dimensionless]  [3, 2, ..., 2, 1]
    """
    dim = func.dim if dim is None else dim
    # Rebuild the table from data, the single relevant coord, and the masks only.
    func = _cpp.DataArray(func.data, coords={dim: func.coords[dim]}, masks=func.masks)

    if func.coords.is_edges(dim):
        if mode is None:
            return Lookup(_cpp.buckets.map, func, dim, fill_value)
        raise ValueError("Input is a histogram, 'mode' must not be set.")

    mode = 'nearest' if mode is None else mode
    if mode not in ('previous', 'nearest'):
        raise ValueError(f"Mode must be one of ['previous', 'nearest'], got '{mode}'")

    if mode == 'nearest' and func.sizes[dim] != 0:
        # 'nearest' is expressed as a 'previous' lookup on a coord made of the
        # lowest representable value followed by the midpoints of the points.
        coord = func.coords[dim]
        lowest = coord[dim, 0:0].max()  # trick to get lowest representable value
        parts = [lowest]
        if coord.sizes[dim] >= 2:
            parts.append(midpoints(coord, dim))
        func.coords[dim] = concat(parts, dim)
    return Lookup(_cpp.lookup_previous, func, dim, fill_value)

125 

126 

class Bins:
    """Proxy for access to bin contents and operations on bins of a variable.

    This class is returned from the `bins` property of variables and should
    generally not be created directly.
    """

    def __init__(self, obj):
        self._obj = obj

    def _data(self):
        """Return ``self._obj.data`` if that attribute exists, else ``self._obj``."""
        try:
            return self._obj.data
        except AttributeError:
            return self._obj

    def __mul__(self, lut: Lookup):
        """Return a copy scaled via ``buckets.scale`` with the table of ``lut``.

        The copy is converted to the dtype resulting from multiplying scalars of
        the bin-element dtype and the table dtype.
        """
        target_dtype = (
            scalar(1, dtype=self.dtype) * scalar(1, dtype=lut.func.dtype)
        ).dtype
        copy = self._obj.to(dtype=target_dtype)
        _cpp.buckets.scale(copy, lut.func, lut.dim)
        return copy

    def __truediv__(self, lut: Lookup):
        """Return a copy scaled via ``buckets.scale`` with the reciprocal table.

        The copy is converted to the dtype resulting from dividing scalars of
        the bin-element dtype and the table dtype.
        """
        target_dtype = (
            scalar(1, dtype=self.dtype) / scalar(1, dtype=lut.func.dtype)
        ).dtype
        copy = self._obj.to(dtype=target_dtype)
        _cpp.buckets.scale(copy, _cpp.reciprocal(lut.func), lut.dim)
        return copy

    def __imul__(self, lut: Lookup):
        """Apply ``buckets.scale`` with the table of ``lut`` to the wrapped object."""
        _cpp.buckets.scale(self._obj, lut.func, lut.dim)
        return self

    def __itruediv__(self, lut: Lookup):
        """Apply ``buckets.scale`` with the reciprocal table to the wrapped object."""
        _cpp.buckets.scale(self._obj, _cpp.reciprocal(lut.func), lut.dim)
        return self

    def __getitem__(self, key: Tuple[str, Union[_cpp.Variable, slice]]):
        """
        Extract events from bins based on labels or label ranges and return a copy.

        This is similar to regular label-based indexing, but considers the event-coords,
        i.e., the coord values of individual bin entries. Unlike normal label-based
        indexing this returns a copy, as a subset of events is extracted.

        Raises
        ------
        ValueError
            If the slice has a step, if start or stop of the slice are not
            variables, or if the key is otherwise unsupported (this includes
            variables that are not 0-D).
        """
        dim, index = key
        if isinstance(index, _cpp.Variable):
            if index.ndim == 0:
                return self._obj.group(index.flatten(to=dim)).squeeze(dim)
            # Variables with ndim != 0 fall through to the error at the end.
        elif isinstance(index, slice):
            from .binning import _upper_bound

            if index.step is not None:
                raise ValueError(
                    "Label-based indexing with step (stride) is not "
                    f"supported. Got '{key}'"
                )
            start = index.start
            stop = index.stop
            # Open-ended slices are bounded by the extent of the event coord.
            if start is None:
                start = self._obj.bins.coords[dim].min()
            if stop is None:
                stop = _upper_bound(self._obj.bins.coords[dim].max())

            if not (
                isinstance(start, _cpp.Variable) and isinstance(stop, _cpp.Variable)
            ):
                raise ValueError(
                    "Bins can only by sliced using label-based indexing. Expected "
                    f"start and stop to be scipp.Variable, got '{start}' and '{stop}'."
                )

            # If a bound derived from the data ends up on the wrong side of the
            # user-provided bound, collapse the range onto the user-provided one.
            if start > stop:
                if index.start is None:
                    start = stop
                elif index.stop is None:
                    stop = start

            return self._obj.bin({dim: concat([start, stop], dim)}).squeeze(dim)
        raise ValueError(
            f"Unsupported key '{key}'. Expected a dimension label and "
            "a 0-D variable or a dimension label and a slice object with start "
            "and stop given by a 0-D variable."
        )

    @property
    def coords(self) -> MetaDataMap:
        """Coords of the bins"""
        return _cpp._bins_view(self._data()).coords

    @property
    def meta(self) -> MetaDataMap:
        """Coords and attrs of the bins

        .. deprecated:: 23.9.0
           Use :py:attr:`coords` with unset alignment flag instead, or
           store attributes in higher-level data structures.
        """
        _warn_attr_removal()
        return self.deprecated_meta

    @property
    def attrs(self) -> MetaDataMap:
        """Attrs of the bins

        .. deprecated:: 23.9.0
           Use :py:attr:`coords` with unset alignment flag instead, or
           store attributes in higher-level data structures.
        """
        _warn_attr_removal()
        return self.deprecated_attrs

    @property
    def deprecated_meta(self) -> MetaDataMap:
        # Same as `meta` but without emitting the removal warning.
        return _cpp._bins_view(self._data()).deprecated_meta

    @property
    def deprecated_attrs(self) -> MetaDataMap:
        # Same as `attrs` but without emitting the removal warning.
        return _cpp._bins_view(self._data()).deprecated_attrs

    @property
    def masks(self) -> MetaDataMap:
        """Masks of the bins"""
        return _cpp._bins_view(self._data()).masks

    @property
    def data(self) -> _cpp.Variable:
        """Data of the bins"""
        return _cpp._bins_view(self._data()).data

    @data.setter
    def data(self, data: _cpp.Variable):
        """Set data of the bins"""
        _cpp._bins_view(self._data()).data = data

    @property
    def unit(self) -> _cpp.Unit:
        """Unit of the bin elements"""
        return self.constituents['data'].unit

    @unit.setter
    def unit(self, unit: Union[_cpp.Unit, str]):
        """Set unit of the bin elements"""
        self.constituents['data'].unit = unit

    @property
    def dtype(self) -> _cpp.DType:
        """Data type of the bin elements."""
        return self.constituents['data'].dtype

    @property
    def aligned(self) -> bool:
        """Alignment flag for coordinates of bin elements."""
        return self.constituents['data'].aligned

    @property
    def constituents(self) -> Dict[str, Union[str, _cpp.Variable, _cpp.DataArray]]:
        """Constituents of binned data, as supported by :py:func:`sc.bins`."""
        return _call_cpp_func(_cpp.bins_constituents, self._data())

    def sum(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Sum of events in each bin.

        Returns
        -------
        :
            The sum of each of the input bins.

        See Also
        --------
        scipp.sum:
            For summing non-bin data or summing bins.
        """
        return _call_cpp_func(_cpp.bins_sum, self._obj)

    def nansum(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Sum of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The sum of each of the input bins without NaN's.

        See Also
        --------
        scipp.nansum:
            For summing non-bin data or summing bins.
        """
        return _call_cpp_func(_cpp.bins_nansum, self._obj)

    def mean(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Arithmetic mean of events in each bin.

        Returns
        -------
        :
            The mean of each of the input bins.

        See Also
        --------
        scipp.mean:
            For calculating the mean of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_mean, self._obj)

    def nanmean(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Arithmetic mean of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The mean of each of the input bins without NaN's.

        See Also
        --------
        scipp.nanmean:
            For calculating the mean of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_nanmean, self._obj)

    def max(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Maximum of events in each bin.

        Returns
        -------
        :
            The maximum of each of the input bins.

        See Also
        --------
        scipp.max:
            For calculating the maximum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_max, self._obj)

    def nanmax(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Maximum of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The maximum of each of the input bins without NaN's.

        See Also
        --------
        scipp.nanmax:
            For calculating the maximum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_nanmax, self._obj)

    def min(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Minimum of events in each bin.

        Returns
        -------
        :
            The minimum of each of the input bins.

        See Also
        --------
        scipp.min:
            For calculating the minimum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_min, self._obj)

    def nanmin(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Minimum of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The minimum of each of the input bins without NaN's.

        See Also
        --------
        scipp.nanmin:
            For calculating the minimum of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_nanmin, self._obj)

    def all(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Logical AND of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The AND of each of the input bins without NaN's.

        See Also
        --------
        scipp.all:
            For performing an AND of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_all, self._obj)

    def any(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Logical OR of events in each bin ignoring NaN's.

        Returns
        -------
        :
            The OR of each of the input bins without NaN's.

        See Also
        --------
        scipp.any:
            For performing an OR of non-bin data or across bins.
        """
        return _call_cpp_func(_cpp.bins_any, self._obj)

    def size(self) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Number of events or elements in a bin.

        Returns
        -------
        :
            The number of elements in each of the input bins.
        """
        return _call_cpp_func(_cpp.bin_sizes, self._obj)

    def concat(self, dim: Dims = None) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Concatenate bins element-wise by concatenating bin contents along
        their internal bin dimension.

        This is a reduction operation similar to :py:func:`scipp.sum` but operates on
        binned data. Elements (bins) are concatenated along their internal dimension.

        Parameters
        ----------
        dim:
            Reduction dimension.

        Returns
        -------
        :
            All bins along `dim` concatenated into a single bin.
        """
        return concat_bins(self._obj, dim)

    def concatenate(
        self,
        other: Union[_cpp.Variable, _cpp.DataArray],
        *,
        out: Optional[_cpp.DataArray] = None,
    ) -> Union[_cpp.Variable, _cpp.DataArray]:
        """Concatenate bins element-wise by concatenating bin contents along
        their internal bin dimension.

        The bins to concatenate are obtained element-wise from `self` and `other`.

        Parameters
        ----------
        other:
            Other input containing bins.
        out:
            Optional output buffer. Only when this is the very object wrapped by
            this proxy is it modified in place (by appending ``other``).

        Returns
        -------
        :
            The bins of the two inputs merged.

        Raises
        ------
        scipp.DTypeError
            If `other` is not binned data.
        """
        if out is None:
            return _call_cpp_func(_cpp.buckets.concatenate, self._obj, other)
        if self._obj is out:
            _call_cpp_func(_cpp.buckets.append, self._obj, other)
        else:
            # NOTE: this rebinds the local name only; the object passed in as
            # `out` is not written to and a new object is returned instead.
            out = _call_cpp_func(_cpp.buckets.concatenate, self._obj, other)
        return out

505 

506 

class GroupbyBins:
    """Proxy for operations on bins of a groupby object."""

    def __init__(self, obj):
        self._obj = obj

    def concat(self, dim):
        """Warn about the deprecation, then delegate to ``concat`` of the wrapped object.

        Parameters
        ----------
        dim:
            Forwarded unchanged to ``self._obj.concat``.

        Returns
        -------
        :
            Whatever ``self._obj.concat(dim)`` returns.
        """
        message = (
            "groupby(...).bins.concat(dim) is deprecated. "
            "Use `group` or `bin` instead"
        )
        # stacklevel=2 attributes the warning to the caller of this method.
        warnings.warn(message, category=UserWarning, stacklevel=2)
        wrapped = self._obj
        return wrapped.concat(dim)

520 

521 

def _bins(obj):
    """
    Returns helper :py:class:`scipp.Bins` allowing bin-wise operations
    to be performed or `None` if not binned data.
    """
    # `_cpp.is_bins` decides whether `obj` holds binned data.
    return Bins(obj) if _cpp.is_bins(obj) else None

531 

532 

def _set_bins(obj, bins: Bins):
    """Validate that ``bins`` wraps exactly ``obj``; nothing is assigned.

    Should only be used by __iadd__ and friends.

    Raises
    ------
    ValueError
        If ``bins`` wraps an object other than ``obj``.
    """
    wraps_same_object = bins._obj is obj
    if not wraps_same_object:
        raise ValueError("Cannot set bins with a new object")

537 

538 

def _groupby_bins(obj):
    """Wrap ``obj`` in a :py:class:`GroupbyBins` proxy."""
    proxy = GroupbyBins(obj)
    return proxy

541 

542 

def bins(
    *,
    data: VariableLike,
    dim: str,
    begin: Optional[_cpp.Variable] = None,
    end: Optional[_cpp.Variable] = None,
) -> _cpp.Variable:
    """Create a binned variable from bin indices.

    The elements of the returned variable are "bins", defined as views into
    ``data``. The returned variable keeps and manages a copy of ``data``
    internally.

    The variables ``begin`` and ``end`` must have the same dims and shape and
    ``dtype=sc.DType.int64``. The output dims and shape are given by ``begin``.
    If only ``begin`` is given, each bucket is a slice containing a non-range
    slice of ``data`` at the given indices. If neither ``begin`` nor ``end``
    are given, the output has ``dims=[dim]`` and contains all non-range slices
    along that dimension.

    Parameters
    ----------
    begin:
        Optional begin indices of bins, used for slicing ``data``.
        If not provided each row of ``data`` is mapped to a different bin.
    end:
        Optional end indices of bins, used for slicing ``data``. If not
        provided, ``begin`` is used as starting offsets for each bin, i.e., the end of
        the Nth bin is set to the begin of the N+1st bin.
    dim:
        Dimension of ``data`` that will be sliced to obtain data for
        any given bin.
    data:
        A variable, data array, or dataset containing combined data of all bins.

    Returns
    -------
    :
        Variable containing data in bins.

    Raises
    ------
    ValueError
        If any of ``begin``, ``end`` or ``data`` is a ``DataGroup``.

    See Also
    --------
    scipp.bin:
        For creating DataArrays based on binning of coord value
        instead of explicitly given index ranges.
    """
    # Data groups are rejected up front, before anything reaches the C++ layer.
    for candidate in (begin, end, data):
        if isinstance(candidate, DataGroup):
            raise ValueError("`scipp.bins` does not support DataGroup arguments.")
    return _call_cpp_func(_cpp.bins, begin, end, dim, data)

592 

593 

def bins_like(x: VariableLike, fill_value: _cpp.Variable) -> _cpp.Variable:
    """Create a binned variable by "broadcasting" fill values to bins of given sizes.

    The dimensions and shape of ``fill_value`` must be such that they can be broadcast
    to those of ``x``. Each element of ``fill_value`` defines the values of all the bin
    elements of the corresponding bin. The output shares the bin indices of ``x``.

    Parameters
    ----------
    x:
        Binned variable or data array serving as prototype for bin sizes.
    fill_value:
        Fill values to use for the bins.

    Returns
    -------
    :
        Variable containing fill value in bins.

    Raises
    ------
    ValueError
        If ``x`` or ``fill_value`` is a ``DataGroup``.
    """
    if any(isinstance(arg, DataGroup) for arg in (x, fill_value)):
        raise ValueError("`scipp.bins_like` does not support DataGroup arguments.")
    # Anything that is not a plain variable contributes its `data` as prototype.
    prototype = x if isinstance(x, _cpp.Variable) else x.data
    return _call_cpp_func(_cpp.bins_like, prototype, fill_value)