Coverage for install/scipp/core/binning.py: 71%

224 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-04-28 01:28 +0000

1# SPDX-License-Identifier: BSD-3-Clause 

2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp) 

3# @author Simon Heybrock 

4import itertools 

5import warnings 

6from numbers import Integral 

7from typing import Dict, Optional, Sequence, Union, overload 

8 

9from .._scipp import core as _cpp 

10from .bin_remapping import combine_bins 

11from .cpp_classes import BinEdgeError, CoordError, DataArray, Dataset, DType, Variable 

12from .data_group import DataGroup, data_group_overload 

13from .math import round as round_ 

14from .shape import concat 

15from .util import VisibleDeprecationWarning 

16from .variable import arange, array, epoch, linspace, scalar 

17 

18 

@overload
def make_histogrammed(
    x: Union[Variable, DataArray], *, edges: Variable
) -> DataArray: ...


@overload
def make_histogrammed(x: Dataset, *, edges: Variable) -> Dataset: ...


def make_histogrammed(x, *, edges):
    """Histogram data into the given bins, producing dense data.

    For binned input, binning dimensions that already exist are kept, except
    that histogramming along an existing binned dimension replaces that binning.

    In most cases :py:func:`scipp.hist` is the better choice.

    Parameters
    ----------
    x:
        Input data.
    edges:
        Bin edges. If these have more than one dimension, binning occurs along
        the inner dimension.

    Returns
    -------
    :
        DataArray / Dataset with values equal to the sum
        of values in each given bin.

    Raises
    ------
    BinEdgeError
        If ``x`` is binned, has no event coordinate for the histogramming
        dimension, and its coordinate for that dimension holds bin edges.

    See Also
    --------
    scipp.hist:
        Recommended interface for histogramming data.
    scipp.bin:
        For binning data.
    """
    if isinstance(x, Variable):
        # A plain variable carries no weights: give every element one count and
        # use the variable itself as the coordinate to histogram by.
        counts = scalar(1.0, unit='counts').broadcast(sizes=x.sizes)
        return _cpp.histogram(DataArray(counts, coords={edges.dim: x}), edges)
    if not (isinstance(x, DataArray) and x.bins is not None):
        return _cpp.histogram(x, edges)
    hist_dim = edges.dims[-1]
    if hist_dim in x.bins.coords:
        return _cpp.histogram(x, edges)
    # No event coordinate for this dimension: sum the bin contents and
    # histogram the resulting dense data instead.
    if x.coords.is_edges(hist_dim):
        raise BinEdgeError(
            "Cannot histogram data with existing bin edges "
            "unless event data coordinate for histogramming is available."
        )
    return make_histogrammed(x.bins.sum(), edges=edges)

71 

72 

def make_binned(
    x: Union[Variable, DataArray],
    *,
    edges: Optional[Sequence[Variable]] = None,
    groups: Optional[Sequence[Variable]] = None,
    erase: Optional[Sequence[str]] = None,
) -> DataArray:
    """Bin the input along every dimension given by ``edges`` or ``groups``.

    Prefer :py:func:`scipp.bin` or :py:func:`scipp.group` unless you need precise
    control over which dimensions are erased, or need to group and bin in a
    single call.

    The data is not histogrammed: each output bin holds a "list" of the input
    values that fall into it.

    At least one argument of ``edges`` and ``groups`` is required.

    If the input is binned and some bins are masked, changing the binning
    applies the masks, i.e., masked bins are treated as empty.

    Warning
    -------

    When there is existing binning or grouping, the algorithm assumes that
    coordinates of the binned data are correct, i.e., compatible with the
    corresponding coordinate values in the individual bins. If this is not the
    case then the behavior is UNSPECIFIED. That is, the algorithm may or may not
    ignore the existing coordinates. If you encounter such a case, remove the
    conflicting coordinate, e.g., using :py:func:`scipp.DataArray.drop_coords`.

    Parameters
    ----------
    x:
        Input data.
    edges:
        Bin edges, one per dimension to bin in.
    groups:
        Keys to group input by one per dimension to group in.
    erase:
        Dimension labels to remove from output.

    Returns
    -------
    :
        Binned ``x``.

    Raises
    ------
    ValueError
        If ``x`` is a non-binned variable and ``edges`` plus ``groups`` do not
        contain exactly one entry.

    See Also
    --------
    scipp.hist:
        For histogramming data.
    scipp.bin:
        Recommended interface for binning data.
    scipp.group:
        Recommended interface for grouping data.
    scipp.bins:
        For creating binned data based on explicitly given index ranges.
    """
    erase = [] if erase is None else erase
    groups = [] if groups is None else groups
    edges = [] if edges is None else edges
    if isinstance(x, Variable):
        if x.bins is not None:
            x = DataArray(x)
        else:
            keys = [*edges, *groups]
            if len(keys) != 1:
                raise ValueError(
                    "Edges for exactly one dimension must be specified when "
                    "binning or histogramming a variable."
                )
            # A dense variable has no weights: attach one count per element and
            # use the variable itself as the coordinate to bin/group by.
            weights = scalar(1.0, unit='counts').broadcast(sizes=x.sizes).copy()
            x = DataArray(weights, coords={keys[0].dim: x})
    if not _can_operate_on_bins(x, edges, groups, erase):
        return _cpp.bin(x, edges, groups, erase)
    return combine_bins(x, edges=edges, groups=groups, dim=erase)

152 

153 

def _can_operate_on_bins(x, edges, groups, erase) -> bool:
    """Decide whether ``make_binned`` may use ``combine_bins`` instead of ``_cpp.bin``.

    Returns True only if ``x`` is binned, every edge/group variable is 1-D and
    names a coordinate of ``x`` that is not also a coordinate of the bin
    contents, and all dimensions of those coordinates are listed in ``erase``.
    """
    if x.bins is None:
        return False
    touched = set()
    for key in itertools.chain(edges, groups):
        usable = (
            key.ndim == 1
            and key.dim not in x.bins.coords
            and key.dim in x.coords
        )
        if not usable:
            return False
        touched.update(x.coords[key.dim].dims)
    return touched <= set(erase)

167 

168 

def _require_coord(name, coord):
    """Raise :class:`CoordError` naming ``name`` when ``coord`` is ``None``."""
    if coord is not None:
        return
    raise CoordError(f"Coordinate '{name}' not found.")

172 

173 

def _get_coord(x, name):
    """Return the coordinate of ``x`` used to derive automatic bin bounds.

    Parameters
    ----------
    x:
        A Variable (returned unchanged), a DataArray, or a Dataset.
    name:
        Name of the coordinate to look up.

    Returns
    -------
    :
        For a Variable, ``x`` itself. For a Dataset, a length-2 variable along a
        throwaway dimension ``'dummy'`` holding the overall minimum and maximum
        of the coordinate across all items. Otherwise the coordinate found in
        ``x.deprecated_meta``, falling back to the event-level entry in
        ``x.bins.deprecated_meta`` when ``x`` is binned.

    Raises
    ------
    CoordError
        If no coordinate named ``name`` is found.
    """
    if isinstance(x, Variable):
        return x
    if isinstance(x, Dataset):
        cmin = None
        cmax = None
        for da in x.values():
            c = _get_coord(da, name)
            cmin = c.min() if cmin is None else min(cmin, c.min())
            # Track the running maximum against ``cmax``; comparing against
            # ``cmin`` would let a later item with a smaller maximum shrink
            # the overall upper bound.
            cmax = c.max() if cmax is None else max(cmax, c.max())
        # Only the extent matters to callers, so pack min and max into a
        # two-element variable.
        coord = concat([cmin, cmax], dim='dummy')
    else:
        event_coord = x.bins.deprecated_meta.get(name) if x.bins is not None else None
        coord = x.deprecated_meta.get(name, event_coord)
    _require_coord(name, coord)
    return coord

190 

191 

def _upper_bound(x: Variable) -> Variable:
    """Return a value just above the maximum of ``x``.

    For dtypes ``int32``, ``int64`` and ``datetime64`` the maximum is incremented
    by one; for every other dtype it is moved to the next representable value
    in the direction of ``max + 1`` using ``numpy.nextafter``.
    """
    import numpy as np

    upper = x.max()
    if upper.dtype not in ('int32', 'int64', 'datetime64'):
        # Direction argument for nextafter: any value larger than the maximum.
        toward = (upper + scalar(1, unit=upper.unit, dtype=upper.dtype)).value
        upper.value = np.nextafter(upper.value, toward)
    else:
        upper.value += 1
    return upper

203 

204 

def _parse_coords_arg(
    x: Union[Variable, DataArray, Dataset], name: str, arg: Union[int, Variable]
) -> Variable:
    """Turn one binning parameter into a concrete bin-edge variable.

    Parameters
    ----------
    x:
        Data whose coordinate ``name`` determines the automatic bounds.
    name:
        Dimension label of the edges to create.
    arg:
        Either edges to use as-is (a variable that has dimension ``name``),
        an integer bin count, or a variable interpreted as the bin width.

    Returns
    -------
    :
        Bin edges along ``name``.

    Raises
    ------
    ValueError
        If the coordinate's minimum exceeds the computed upper bound, or if a
        bin width of zero is given.
    """
    if isinstance(arg, Variable) and name in arg.dims:
        return arg
    coord = _get_coord(x, name)
    start = coord.min()
    has_edges = (
        not isinstance(x, Variable)
        and (name in x.coords)
        and x.coords.is_edges(name, name)
    )
    # Existing bin-edges must not be extended; otherwise nudge the stop value
    # just above the maximum.
    stop = coord.max() if has_edges else _upper_bound(coord)
    if start > stop:
        raise ValueError(
            (
                'Empty data range, cannot automatically determine bounds. '
                'Must provide concrete bin edges.'
            )
        )
    if not isinstance(arg, Integral):
        # ``arg`` is a bin width.
        step = arg.to(dtype=start.dtype, unit=start.unit)
        if step.value == 0:
            raise ValueError("Step size cannot be 0.")
        return arange(name, start, stop + step, step=step)
    if start.dtype == DType.datetime64:
        # Build the edges as offsets from the epoch, round them, convert to
        # int64, and add the epoch back.
        base = epoch(unit=start.unit)
        offsets = linspace(name, start - base, stop - base, num=arg + 1)
        return base + round_(offsets).to(dtype='int64')
    return linspace(name, start, stop, num=arg + 1).to(
        dtype=start.dtype, copy=False
    )

240 

241 

def _make_edges(
    x: Union[Variable, DataArray, Dataset],
    arg_dict: Optional[Dict[str, Union[int, Variable]]],
    kwargs: Dict[str, Union[int, Variable]],
) -> Dict[str, Variable]:
    """Build a mapping from dimension label to bin edges.

    ``arg_dict`` (if given) and ``kwargs`` are merged with
    ``dict(**arg_dict, **kwargs)``, so a label present in both raises
    ``TypeError``. Each entry is then resolved by ``_parse_coords_arg``.
    """
    params = kwargs if arg_dict is None else dict(**arg_dict, **kwargs)
    edges = {}
    for name, arg in params.items():
        edges[name] = _parse_coords_arg(x, name, arg)
    return edges

250 

251 

def _find_replaced_dims(x, dims):
    """List dimensions of ``x`` that are replaced when binning by ``dims``.

    Only binned input is considered; for dense input the result is empty.
    For every label in ``dims`` with a coordinate in ``x`` that does not itself
    depend on that label, the coordinate's dimensions are collected. Labels
    that are in ``dims`` are excluded from the result.
    """
    if x.bins is None:
        return []
    replaced = set()
    for dim in dims:
        coord = x.coords.get(dim)
        if coord is not None and dim not in coord.dims:
            replaced = replaced.union(coord.dims)
    return [candidate for candidate in replaced if candidate not in dims]

261 

262 

@overload
def hist(
    x: Union[Variable, DataArray],
    arg_dict: Optional[Dict[str, Union[int, Variable]]] = None,
    /,
    **kwargs: Union[int, Variable],
) -> DataArray: ...


@overload
def hist(
    x: Dataset,
    arg_dict: Optional[Dict[str, Union[int, Variable]]] = None,
    /,
    **kwargs: Union[int, Variable],
) -> Dataset: ...


@overload
def hist(
    x: DataGroup,
    arg_dict: Optional[Dict[str, Union[int, Variable]]] = None,
    /,
    **kwargs: Union[int, Variable],
) -> DataGroup: ...


@data_group_overload
def hist(x, arg_dict=None, /, **kwargs):
    """Compute a histogram.

    There are three ways to specify the bin edges:

    1. An integer requests that number of bins; the edges are a 'linspace'
       based on the min and max of the corresponding coordinate.
    2. A scalar Scipp variable (a value with a unit) is interpreted as a target
       bin width; the edges are an 'arange' covering the min and max of the
       corresponding coordinate.
    3. A custom coordinate, given as a Scipp variable with compatible unit.
       Typically this should have a single dimension matching the target
       dimension.

    Histogramming a dimension that already has a dimension-coord modifies the
    binning of that dimension, i.e., input and output have the same dimension
    labels.

    Histogramming by non-dimension-coords yields new output dimensions named
    after these coordinates. These new dimensions replace the dimensions the
    input coordinates depend on.

    Parameters
    ----------
    x:
        Input data.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Histogrammed data.

    Raises
    ------
    ValueError
        If ``x`` is a variable and edges are not given for exactly one dimension.
    TypeError
        If no edges are given and ``x`` is not binned.

    See Also
    --------
    scipp.bin:
        Creating binned data by binning instead of summing all contributions.
    scipp.binning.make_histogrammed:
        Lower level function for histogramming that does not automatically
        replace/erase dimensions.

    Examples
    --------

    Histogram a table by one of its coord columns, specifying (1) number of bins, (2)
    bin width, or (3) actual binning:

      >>> from numpy.random import default_rng
      >>> rng = default_rng(seed=1234)
      >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
      >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
      >>> table.hist(x=2)
      <scipp.DataArray>
      Dimensions: Sizes[x:2, ]
      Coordinates:
      * x                         float64              [m]  (x [bin-edge])  [0.00313229, 0.497696, 0.992259]
      Data:
                                  float64              [K]  (x)  [53, 47]

      >>> table.hist(x=sc.scalar(0.2, unit='m')).sizes
      {'x': 5}

      >>> table.hist(x=sc.linspace('x', 0.2, 0.8, num=10, unit='m')).sizes
      {'x': 9}

    Histogram a table by two of its coord columns:

      >>> table.hist(x=4, y=6).sizes
      {'x': 4, 'y': 6}

    Histogram binned data, using existing bins:

      >>> binned = table.bin(x=10)
      >>> binned.hist().sizes
      {'x': 10}

    Histogram binned data, using new bins along existing dimension:

      >>> binned = table.bin(x=10)
      >>> binned.hist(x=20).sizes
      {'x': 20}

    Histogram binned data along an additional dimension:

      >>> binned = table.bin(x=10)
      >>> binned.hist(y=5).sizes
      {'x': 10, 'y': 5}
    """  # noqa: E501
    edge_map = _make_edges(x, arg_dict, kwargs)
    erase = _find_replaced_dims(x, edge_map)
    n_edges = len(edge_map)
    if isinstance(x, Variable) and n_edges != 1:
        raise ValueError(
            "Edges for exactly one dimension must be specified when "
            "binning or histogramming a variable."
        )
    if n_edges == 0:
        if x.bins is None:
            raise TypeError("Data is not binned so bin edges must be provided.")
        return x.bins.sum()
    *leading, last = edge_map.values()
    if not leading:
        # TODO Note that this may swap dims, is that ok?
        out = make_histogrammed(x, edges=last)
    else:
        # If histogramming by the final edges needs to use a non-event coord then
        # we must not erase that dim, since it removes the coord required for
        # histogramming.
        if isinstance(x, DataArray) and x.bins is not None:
            hist_dim = last.dims[-1]
            if hist_dim not in x.bins.coords:
                hist_coord_dim = x.coords[hist_dim].dims[-1]
                erase = [dim for dim in erase if dim != hist_coord_dim]
        # Bin by all but the last edges, then histogram along the last.
        binned = make_binned(x, edges=leading, erase=erase)
        out = make_histogrammed(binned, edges=last)
    # Sum over any replaced dimensions that are still present in the output.
    for dim in erase:
        if dim in out.dims:
            out = out.sum(dim)
    return out

413 

414 

@overload
def nanhist(
    x: Union[Variable, DataArray],
    arg_dict: Optional[Dict[str, Union[int, Variable]]] = None,
    /,
    **kwargs: Union[int, Variable],
) -> DataArray: ...


@overload
def nanhist(
    x: DataGroup,
    arg_dict: Optional[Dict[str, Union[int, Variable]]] = None,
    /,
    **kwargs: Union[int, Variable],
) -> DataGroup: ...


@data_group_overload
def nanhist(x, arg_dict=None, /, **kwargs):
    """Compute a histogram, skipping NaN values.

    Works like :py:func:`scipp.hist` except that NaN values are skipped. Refer
    to that function for details and examples.

    Parameters
    ----------
    x:
        Input data.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Histogrammed data.

    Raises
    ------
    TypeError
        If no edges are given and ``x`` is not binned.
    """
    edge_map = _make_edges(x, arg_dict, kwargs)
    # Bin first (when edges are requested), then reduce each bin with nansum.
    binned = x.bin(edge_map) if edge_map else x
    if binned.bins is None:
        raise TypeError("Data is not binned so bin edges must be provided.")
    return binned.bins.nansum()

460 

461 

@overload
def bin(
    x: Union[Variable, DataArray],
    arg_dict: Optional[Dict[str, Union[int, Variable]]] = None,
    /,
    **kwargs: Union[int, Variable],
) -> DataArray: ...


@overload
def bin(
    x: DataGroup,
    arg_dict: Optional[Dict[str, Union[int, Variable]]] = None,
    /,
    **kwargs: Union[int, Variable],
) -> DataGroup: ...


@data_group_overload
def bin(x, arg_dict=None, /, **kwargs):
    """Create binned data by binning input along all dimensions given by edges.

    There are three ways to specify the bin edges:

    1. An integer requests that number of bins; the edges are a 'linspace'
       based on the min and max of the corresponding coordinate.
    2. A scalar Scipp variable (a value with a unit) is interpreted as a target
       bin width; the edges are an 'arange' covering the min and max of the
       corresponding coordinate.
    3. A custom coordinate, given as a Scipp variable with compatible unit.
       Typically, this should have a single dimension matching the target
       dimension.

    Binning a dimension that already has a dimension-coord modifies the binning
    of that dimension, i.e., input and output have the same dimension labels.

    Binning by non-dimension-coords yields new output dimensions named after
    these coordinates. These new dimensions replace the dimensions the input
    coordinates depend on.

    Warning
    -------

    When there is existing binning or grouping, the algorithm assumes that
    coordinates of the binned data are correct, i.e., compatible with the
    corresponding coordinate values in the individual bins. If this is not the
    case then the behavior is UNSPECIFIED. That is, the algorithm may or may not
    ignore the existing coordinates. If you encounter such a case, remove the
    conflicting coordinate, e.g., using :py:func:`scipp.DataArray.drop_coords`.

    Parameters
    ----------
    x:
        Input data.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Binned data.

    See Also
    --------
    scipp.hist:
        For histogramming data.
    scipp.group:
        Creating binned data by grouping, instead of binning based on edges.
    scipp.binning.make_binned:
        Lower level function that can bin and group, and does not automatically
        replace/erase dimensions.

    Examples
    --------

    Bin a table by one of its coord columns, specifying (1) number of bins, (2)
    bin width, or (3) actual binning:

      >>> from numpy.random import default_rng
      >>> rng = default_rng(seed=1234)
      >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
      >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
      >>> table.bin(x=2).sizes
      {'x': 2}

      >>> table.bin(x=sc.scalar(0.2, unit='m')).sizes
      {'x': 5}

      >>> table.bin(x=sc.linspace('x', 0.2, 0.8, num=10, unit='m')).sizes
      {'x': 9}

    Bin a table by two of its coord columns:

      >>> table.bin(x=4, y=6).sizes
      {'x': 4, 'y': 6}

    Bin binned data, using new bins along existing dimension:

      >>> binned = table.bin(x=10)
      >>> binned.bin(x=20).sizes
      {'x': 20}

    Bin binned data along an additional dimension:

      >>> binned = table.bin(x=10)
      >>> binned.bin(y=5).sizes
      {'x': 10, 'y': 5}
    """
    # Legacy call style: 'edges'/'groups'/'erase' passed as lists via keywords.
    # Such calls are forwarded unchanged to make_binned after a warning.
    uses_legacy_keywords = arg_dict is None and any(
        name in ('edges', 'groups', 'erase') and isinstance(item, list)
        for name, item in kwargs.items()
    )
    if uses_legacy_keywords:
        warnings.warn(
            "The 'edges', 'groups', and 'erase' keyword arguments "
            "are deprecated. Use, e.g., 'sc.bin(da, x=x_edges)' or "
            "'sc.group(da, groups)'. See the documentation for details.",
            UserWarning,
            stacklevel=2,
        )
        return make_binned(x, **kwargs)
    edge_map = _make_edges(x, arg_dict, kwargs)
    replaced = _find_replaced_dims(x, edge_map)
    return make_binned(x, edges=list(edge_map.values()), erase=replaced)

588 

589 

@overload
def rebin(
    x: DataArray,
    arg_dict: Optional[dict[str, Union[int, Variable]]] = None,
    deprecated=None,
    /,
    **kwargs: Union[int, Variable],
) -> DataArray: ...


@overload
def rebin(
    x: Dataset,
    arg_dict: Optional[dict[str, Union[int, Variable]]] = None,
    deprecated=None,
    /,
    **kwargs: Union[int, Variable],
) -> Dataset: ...


@overload
def rebin(
    x: DataGroup,
    arg_dict: Optional[dict[str, Union[int, Variable]]] = None,
    deprecated=None,
    /,
    **kwargs: Union[int, Variable],
) -> DataGroup: ...


@data_group_overload
def rebin(x, arg_dict=None, deprecated=None, /, **kwargs):
    """Rebin a data array or dataset.

    The input coordinate for the dimension being rebinned must contain bin
    edges, i.e., the data must be histogrammed.

    Masks of the input that contain the dimension being rebinned are applied to
    the data before rebinning. That is, masked values are treated as zero.

    Parameters
    ----------
    x:
        Data to rebin.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    deprecated:
        Bin edges for the deprecated positional syntax, in which ``arg_dict``
        is a dimension label given as a string.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Data rebinned according to the new bin edges.

    See Also
    --------
    scipp.bin:
        For changing the binning of binned (as opposed to dense, histogrammed) data.
    scipp.hist:
        For histogramming data.

    Examples
    --------

    Rebin a data array along one of its dimensions, specifying (1) number of bins, (2)
    bin width, or (3) actual binning:

      >>> from numpy.random import default_rng
      >>> rng = default_rng(seed=1234)
      >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
      >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
      >>> da = table.hist(x=100, y=100)
      >>> da.rebin(x=2).sizes
      {'x': 2, 'y': 100}

      >>> da.rebin(x=sc.scalar(0.2, unit='m')).sizes
      {'x': 5, 'y': 100}

      >>> da.rebin(x=sc.linspace('x', 0.2, 0.8, num=10, unit='m')).sizes
      {'x': 9, 'y': 100}

    Rebin a data array along two of its dimensions:

      >>> da = table.hist(x=100, y=100)
      >>> da.rebin(x=4, y=6).sizes
      {'x': 4, 'y': 6}
    """
    # Legacy call style: dimension label as a string plus edges given either
    # positionally or through the 'bins' keyword.
    legacy_call = isinstance(arg_dict, str) and (
        deprecated is not None or 'bins' in kwargs
    )
    if legacy_call:
        warnings.warn(
            "The 'bins' keyword argument and positional syntax for setting bin "
            "edges is deprecated. Use, e.g., 'sc.rebin(da, x=x_edges)'. See the "
            "documentation for details.",
            UserWarning,
            stacklevel=2,
        )
        return _cpp.rebin(x, arg_dict, **{'bins': deprecated, **kwargs})
    # Rebin one dimension after another, feeding each result into the next step.
    result = x
    for dim, edge in _make_edges(x, arg_dict, kwargs).items():
        result = _cpp.rebin(result, dim, edge)
    return result

696 

697 

def _make_groups(x, arg):
    """Return the variable of group keys for one grouping argument.

    A Variable is returned unchanged. Otherwise ``arg`` is a coordinate name:
    the coordinate is looked up in the bin contents first (if ``x`` is binned),
    then in ``x.coords``, and its unique values become the groups along a
    dimension named ``arg``.

    Raises
    ------
    CoordError
        If no coordinate named ``arg`` is found.
    """
    import numpy as np

    if isinstance(arg, Variable):
        return arg
    coord = None if x.bins is None else x.bins.coords.get(arg)
    if coord is None:
        coord = x.coords.get(arg)
    _require_coord(arg, coord)
    if coord.bins is not None:
        coord = coord.copy().bins.constituents['data']

    values = coord.values
    if values.size == 0:
        unique = values[0:0]
    elif coord.dtype in (DType.int32, DType.int64):
        # np.unique can be very slow for large inputs. Integer groups often lie
        # in a bounded range, so try growing prefixes of the data and stop as
        # soon as every integer between min and max has been seen.
        lowest = coord.min().value
        highest = coord.max().value
        unique = values[0:0]
        for pivot in (1000, 100, 10, 1):
            if len(unique) == highest - lowest + 1:
                break
            unique = np.unique(values[: len(values) // pivot])
    else:
        unique = np.unique(values)
    return array(dims=[arg], values=unique, unit=coord.unit)

727 

728 

@overload
def group(x: DataArray, /, *args: Union[str, Variable]) -> DataArray: ...


@overload
def group(x: DataGroup, /, *args: Union[str, Variable]) -> DataGroup: ...


@data_group_overload
def group(x, /, *args: Union[str, Variable]):
    """Create binned data by grouping input by one or more coordinates.

    A grouping can be given in two ways: (1) as a string, in which case the
    unique values of the coordinate with that name are used as groups, or (2)
    as a Scipp variable, whose values are then used as groups.

    Note that option (1) may be very slow if the input is very large.

    Grouping a dimension that already has a dimension-coord modifies the
    binning of that dimension, i.e., input and output have the same dimension
    labels.

    Grouping by non-dimension-coords yields new output dimensions named after
    these coordinates. These new dimensions replace the dimensions the input
    coordinates depend on.

    Warning
    -------

    When there is existing binning or grouping, the algorithm assumes that
    coordinates of the binned data are correct, i.e., compatible with the
    corresponding coordinate values in the individual bins. If this is not the
    case then the behavior is UNSPECIFIED. That is, the algorithm may or may not
    ignore the existing coordinates. If you encounter such a case, remove the
    conflicting coordinate, e.g., using :py:func:`scipp.DataArray.drop_coords`.

    Parameters
    ----------
    x:
        Input data.
    *args:
        Dimension labels or grouping variables.

    Returns
    -------
    :
        Binned data.

    See Also
    --------
    scipp.bin:
        Creating binned data by binning based on edges, instead of grouping.
    scipp.binning.make_binned:
        Lower level function that can bin and group, and does not automatically
        replace/erase dimensions.

    Examples
    --------

    Group a table by one of its coord columns, specifying (1) a coord name or (2)
    an actual grouping:

      >>> from numpy.random import default_rng
      >>> rng = default_rng(seed=1234)
      >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
      >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
      >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
      >>> table.coords['label'] = (table.coords['x'] * 10).to(dtype='int64')
      >>> table.group('label').sizes
      {'label': 10}

      >>> groups = sc.array(dims=['label'], values=[1, 3, 5], unit='m')
      >>> table.group(groups).sizes
      {'label': 3}

    Group a table by two of its coord columns:

      >>> table.coords['a'] = (table.coords['x'] * 10).to(dtype='int64')
      >>> table.coords['b'] = (table.coords['y'] * 10).to(dtype='int64')
      >>> table.group('a', 'b').sizes
      {'a': 10, 'b': 10}

      >>> groups = sc.array(dims=['a'], values=[1, 3, 5], unit='m')
      >>> table.group(groups, 'b').sizes
      {'a': 3, 'b': 10}

    Group binned data along an additional dimension:

      >>> table.coords['a'] = (table.coords['y'] * 10).to(dtype='int64')
      >>> binned = table.bin(x=10)
      >>> binned.group('a').sizes
      {'x': 10, 'a': 10}
    """
    group_keys = []
    for spec in args:
        group_keys.append(_make_groups(x, spec))
    group_dims = [key.dim for key in group_keys]
    replaced = _find_replaced_dims(x, group_dims)
    return make_binned(x, groups=group_keys, erase=replaced)

826 

827 

def histogram(
    x: Union[DataArray, Dataset], *, bins: Variable
) -> Union[DataArray, Dataset]:
    """Deprecated. See :py:func:`scipp.hist`.

    Emits the deprecation message twice, once as ``UserWarning`` and once as
    ``VisibleDeprecationWarning``, then forwards to ``make_histogrammed`` with
    ``bins`` as the edges.
    """
    message = "'histogram' is deprecated. Use 'hist' instead."
    for category in (UserWarning, VisibleDeprecationWarning):
        warnings.warn(message, category, stacklevel=2)
    return make_histogrammed(x, edges=bins)