Coverage for install/scipp/core/binning.py: 71%
283 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-11-17 01:51 +0000
1# SPDX-License-Identifier: BSD-3-Clause
2# Copyright (c) 2023 Scipp contributors (https://github.com/scipp)
3# @author Simon Heybrock
4import itertools
5import uuid
6from collections.abc import Iterable, Sequence
7from typing import Any, SupportsIndex, TypeVar, overload
9from .._scipp import core as _cpp
10from .bin_remapping import combine_bins
11from .bins import Bins
12from .cpp_classes import BinEdgeError, CoordError, DataArray, Dataset, DType, Variable
13from .data_group import DataGroup, data_group_overload
14from .math import round as round_
15from .shape import concat
16from .variable import arange, array, epoch, linspace, scalar
# Type variable so helpers preserve the concrete input type (DataArray vs Dataset).
_DaDs = TypeVar('_DaDs', bound=DataArray | Dataset)
@overload
def make_histogrammed(
    x: Variable | DataArray, *, edges: Variable, erase: Iterable[str] = ()
) -> DataArray: ...


@overload
def make_histogrammed(
    x: Dataset, *, edges: Variable, erase: Iterable[str] = ()
) -> Dataset: ...


def make_histogrammed(
    x: Variable | DataArray | Dataset, *, edges: Variable, erase: Iterable[str] = ()
) -> DataArray | Dataset:
    """Create dense data by histogramming data into given bins.

    If the input is binned data, then existing binning dimensions are preserved.
    Histogramming along an existing binned dimension will replace this binning.

    Usually :py:func:`scipp.hist` should be preferred.

    Parameters
    ----------
    x:
        Input data.
    edges:
        Bin edges. If these have more than one dimension, binning occurs along
        the inner dimension.
    erase:
        Names of dimensions to erase from the input.

    Returns
    -------
    :
        DataArray / Dataset with values equal to the sum
        of values in each given bin.

    See Also
    --------
    scipp.hist:
        Recommended interface for histogramming data.
    scipp.bin:
        For binning data.
    """
    if isinstance(x, Variable):
        # A bare variable is treated as the coordinate to histogram by; each
        # input element contributes a weight of 1 count.
        data = scalar(1.0, unit='counts').broadcast(sizes=x.sizes)
        x = DataArray(data, coords={edges.dim: x})
    elif isinstance(x, DataArray) and x.bins is not None:
        dim = edges.dims[-1]
        if dim not in x.bins.coords:
            # The second `dim` is necessary in case the coord is multi-dimensional.
            if x.coords.is_edges(dim, dim):
                raise BinEdgeError(
                    "Cannot histogram data with existing bin edges "
                    "unless event data coordinate for histogramming is available."
                )
            # No event coord for `dim`: histogramming uses a dense coord, so the
            # bin contents can be summed up front and histogrammed as dense data.
            return make_histogrammed(x.bins.sum(), edges=edges, erase=erase)
    _check_erase_dimension_clash(erase, edges)
    # The C++ implementation uses an older heuristic histogramming a single dimension.
    # We therefore transpose and flatten the input to match this.
    hist_dim = edges.dims[-1]
    to_flatten = [dim for dim in x.dims if dim in erase]
    if hist_dim in x.dims:
        to_flatten.append(hist_dim)
    if to_flatten:
        x = _drop_coords_for_hist(x, to_flatten, keep=(hist_dim,))
        x = _transpose_and_flatten_for_hist(x, to_flatten, to=hist_dim)
    return _cpp.histogram(x, edges)  # type: ignore[no-any-return]
92def _drop_coords_for_hist(x: _DaDs, dims: Iterable[str], keep: Iterable[str]) -> _DaDs:
93 """Drop unnecessary coords from a DataArray making flatten/bin expensive."""
94 data = x if x.bins is None else x.bins
95 to_drop = []
96 for name, coord in data.coords.items():
97 if (name not in keep) and (set(coord.dims) & set(dims)):
98 to_drop.append(name)
99 return data.drop_coords(to_drop) # type: ignore[return-value]
102def _transpose_and_flatten_for_hist(x: _DaDs, dims: Sequence[str], to: str) -> _DaDs:
103 """Transpose and flatten a DataArray to prepare for histogram."""
104 new_order = [*(dim for dim in x.dims if dim not in dims), *dims]
105 # `make_histogrammed` does not fully support `Dataset`.
106 # This needs to be fixed, but for now, we just ignore the type error here.
107 transposed = x.transpose(new_order) # type: ignore[union-attr]
108 return transposed.flatten(dims=dims, to=to) # type: ignore[return-value]
def make_binned(
    x: Variable | DataArray,
    *,
    edges: Sequence[Variable] | None = None,
    groups: Sequence[Variable] | None = None,
    erase: Sequence[str] = (),
) -> DataArray:
    """Create binned data by binning input along all dimensions given by edges or
    groups.

    Usually :py:func:`scipp.bin` or :py:func:`scipp.group` should be preferred,
    unless the more precise control over which dimensions should be erased is required,
    or unless grouping and binning at the same time is required.

    This does not histogram the data, each output bin will contain a "list" of
    input values.

    At least one argument of ``edges`` and ``groups`` is required.

    If the input is binned and certain bins are masked then changing the binning
    will apply the masks, i.e., masked bins are treated as empty.

    Warning
    -------

    When there is existing binning or grouping, the algorithm assumes that coordinates
    of the binned data are correct, i.e., compatible with the corresponding
    coordinate values in the individual bins. If this is not the case then the behavior
    is UNSPECIFIED. That is, the algorithm may or may not ignore the existing
    coordinates. If you encounter such a case, remove the conflicting coordinate,
    e.g., using :py:func:`scipp.DataArray.drop_coords`.

    Parameters
    ----------
    x:
        Input data.
    edges:
        Bin edges, one per dimension to bin in.
    groups:
        Keys to group input by one per dimension to group in.
    erase:
        Dimension labels to remove from output.

    Returns
    -------
    :
        Binned ``x``.

    See Also
    --------
    scipp.hist:
        For histogramming data.
    scipp.bin:
        Recommended interface for binning data.
    scipp.group:
        Recommended interface for grouping data.
    scipp.bins:
        For creating binned data based on explicitly given index ranges.
    """
    if groups is None:
        groups = []
    if edges is None:
        edges = []
    _check_erase_dimension_clash(erase, *edges, *groups)

    if isinstance(x, Variable) and x.bins is not None:
        x = DataArray(x)
    elif isinstance(x, Variable):
        coords = [*edges, *groups]
        if len(coords) != 1:
            raise ValueError(
                "Edges for exactly one dimension must be specified when "
                "binning or histogramming a variable."
            )
        # Dense variable input: treat it as the coordinate to bin by, with each
        # element carrying a weight of 1 count.
        data = scalar(1.0, unit='counts').broadcast(sizes=x.sizes).copy()
        x = DataArray(data, coords={coords[0].dim: x})
    if _can_operate_on_bins(x, edges, groups, erase):
        # Fast path: all requested coords are 1-D non-event coords, so existing
        # bins can be combined without touching individual events.
        return combine_bins(x, edges=edges, groups=groups, dim=erase)
    # Many-to-many mapping is expensive, concat first is generally cheaper,
    # despite extra copies. If some coords are dense, perform binning in two steps,
    # since concat is not possible then (without mapping dense coords to binned coords,
    # which might bypass some other optimizations).
    if erase and x.bins is not None:
        dense_edges = [var for var in edges if var.dims[-1] not in x.bins.coords]
        dense_groups = [var for var in groups if var.dims[-1] not in x.bins.coords]
        if len(dense_edges) + len(dense_groups) == 0:
            # All coords are event coords: concatenating bins first is cheaper.
            x = x.bins.concat(erase)
            erase = ()
        elif len(dense_edges) + len(dense_groups) < len(edges) + len(groups):
            # Mixed case: bin by the dense coords first, then by the remaining
            # event coords below.
            x = make_binned(x, edges=dense_edges, groups=dense_groups, erase=erase)
            b: Bins[DataArray] = x.bins  # type: ignore[assignment]
            edges = [var for var in edges if var.dims[-1] in b.coords]
            groups = [var for var in groups if var.dims[-1] in b.coords]
            erase = ()
    if x.ndim == 0:
        # The C++ layer cannot handle 0-D input: bin the bin *content* and
        # re-attach the outer coords and masks afterwards.
        return (  # type: ignore[no-any-return]
            _cpp.bin(x.value, edges, groups, erase)
            .assign_coords(x.coords)
            .assign_masks(x.masks)
        )
    x = _prepare_multi_dim_dense(x, *edges, *groups)
    return _cpp.bin(x, edges, groups, erase)  # type: ignore[no-any-return]
def _prepare_multi_dim_dense(x: DataArray, *edges_or_groups: Variable) -> DataArray:
    """Prepare data for binning or grouping.

    This function is a workaround for the C++ implementation not being able to deal with
    multi-dimensional dense input data. The workaround is to flatten the data along the
    auxiliary dimensions and regroup.

    In case the ultimate operation is histogramming, this leads to the desired
    higher-dimensional histogram. In case of binning or grouping, we obtain binned data
    with one additional dimension, whereas conceptually we might expect only the
    requested dimensions, with the auxiliary dimensions inside the bin content. As this
    case is likely rare and extra dimensions in bin content are barely supported in
    scipp, we consider this acceptable for now.
    """
    if x.bins is not None or x.ndim == 1:
        # Binned or 1-D dense input is supported natively; nothing to do.
        return x
    if any(var.ndim != 1 for var in edges_or_groups):
        raise ValueError("Cannot bin multi-dimensional dense data with ragged edges.")
    op_dims = _get_op_dims(x, *edges_or_groups)
    if len(op_dims) != 1:
        raise ValueError("Cannot bin multi-dimensional dense data along multiple dims.")
    extra = {dim for dim in x.dims if dim != next(iter(op_dims))}
    # Remember coords living purely on the auxiliary dims; they are dropped by
    # the flatten/group round-trip and restored at the end.
    original_coords = {
        name: coord
        for name, coord in x.coords.items()
        if set(coord.dims).issubset(extra)
    }
    # Temporary index coords let `group` reconstruct the auxiliary dims after
    # flattening; a uuid avoids clashing with any real dimension name.
    helper_coords = {dim: arange(dim, x.sizes[dim]) for dim in extra}
    return (
        x.assign_coords(helper_coords)
        .flatten(to=str(uuid.uuid4()))
        .group(*helper_coords.values())
        .drop_coords(tuple(extra))
        .assign_coords(original_coords)
    )
252def _check_erase_dimension_clash(
253 erase: Iterable[str], *edges_or_groups: Variable
254) -> None:
255 new_dims: set[str] = set()
256 for var in edges_or_groups:
257 new_dims.update(var.dims)
258 if set(erase) & new_dims:
259 raise ValueError(
260 f"Clash of dimension(s) to reduce {erase} with dimensions defined by "
261 f"edges or groups: {new_dims}."
262 )
265def _can_operate_on_bins(
266 x: DataArray,
267 edges: Iterable[Variable],
268 groups: Iterable[Variable],
269 erase: Iterable[str],
270) -> bool:
271 if x.bins is None:
272 return False
273 dims: set[str] = set()
274 for coord in itertools.chain(edges, groups):
275 if coord.ndim != 1:
276 return False
277 if coord.dim in x.bins.coords:
278 return False
279 if coord.dim not in x.coords:
280 return False
281 dims.update(x.coords[coord.dim].dims)
282 return dims <= set(erase)
285def _require_coord(name: str, coord: object) -> None:
286 if coord is None:
287 raise CoordError(f"Coordinate '{name}' not found.")
def _get_coord(x: Variable | DataArray | Dataset, name: str) -> Variable:
    """Return the coordinate ``name`` of ``x``.

    For a Variable the variable itself acts as the coord. For a Dataset a
    2-element dummy variable spanning the overall min/max of the coord across
    all items is returned, which suffices for automatic edge determination.
    For a DataArray, event coords of binned data take precedence over dense
    coords.

    Raises
    ------
    CoordError
        If the coordinate is not found.
    ValueError
        If ``x`` is an empty Dataset.
    """
    if isinstance(x, Variable):
        return x
    if isinstance(x, Dataset):
        if not x.values():
            raise ValueError("Dataset is empty")
        cmin: Variable | None = None
        cmax: Variable | None = None
        for da in x.values():
            c = _get_coord(da, name)
            cmin = c.min() if cmin is None else min(cmin, c.min())  # type: ignore[call-overload]
            # Fix: fold the running maximum with `cmax`; previously `cmin` was
            # used here, yielding a wrong (too small) upper bound.
            cmax = c.max() if cmax is None else max(cmax, c.max())  # type: ignore[call-overload]
        coord = concat([cmin, cmax], dim='dummy')  # type: ignore[type-var]
    else:
        event_coord = x.bins.deprecated_meta.get(name) if x.bins is not None else None
        coord = x.deprecated_meta.get(name, event_coord)
    _require_coord(name, coord)
    return coord  # type: ignore[return-value]
def _upper_bound(x: "Variable") -> "Variable":
    """Return an exclusive upper bound just above ``x.nanmax()``.

    Integer-like dtypes (including datetime64) are bumped by one unit; floats
    are bumped to the next representable value, so the maximum element still
    falls inside the last bin.
    """
    import numpy as np

    bound = x.nanmax()
    if bound.dtype not in ('int32', 'int64', 'datetime64'):
        # Float case: smallest possible increment via nextafter.
        target = bound + scalar(1, unit=bound.unit, dtype=bound.dtype)
        bound.value = np.nextafter(bound.value, target.value)
    else:
        bound.value += 1
    return bound
def _parse_coords_arg(
    x: Variable | DataArray | Dataset, name: str, arg: SupportsIndex | Variable
) -> Variable:
    """Resolve a single binning parameter into a concrete bin-edge variable.

    ``arg`` may be an integer (bin count), a scalar variable (target bin
    width), or an explicit edge variable (returned unchanged).
    """
    if isinstance(arg, Variable) and name in arg.dims:
        # Explicit bin edges provided by the caller; use as-is.
        return arg
    coord = _get_coord(x, name)
    start = coord.nanmin()
    if (
        not isinstance(x, Variable)
        and (name in x.coords)
        and x.coords.is_edges(name, name)
    ):
        stop = coord.nanmax()  # existing bin-edges, do not extend
    else:
        # Extend slightly past the max so the largest value lands inside a bin.
        stop = _upper_bound(coord)
    if start > stop:
        raise ValueError(
            'Empty data range, cannot automatically determine bounds. '
            'Must provide concrete bin edges.'
        )
    if not isinstance(arg, Variable):
        # Integer bin count: build a linspace of edges between start and stop.
        if start.dtype == DType.datetime64:
            # linspace does not support datetime64; compute relative to epoch
            # and round back to integer nanoseconds/seconds.
            base = epoch(unit=start.unit)
            return base + round_(
                linspace(name, start - base, stop - base, num=arg.__index__() + 1)
            ).to(dtype='int64')
        return linspace(name, start, stop, num=arg.__index__() + 1).to(
            dtype=start.dtype, copy=False
        )
    # Scalar variable: interpret as a target bin width for an arange of edges.
    step = arg.to(dtype=start.dtype, unit=start.unit)
    if step.value == 0:
        raise ValueError("Step size cannot be 0.")
    return arange(name, start, stop + step, step=step)
def _make_edges(
    x: "Variable | DataArray | Dataset",
    arg_dict: "dict[str, SupportsIndex | Variable] | None",
    kwargs: "dict[str, SupportsIndex | Variable]",
) -> "dict[str, Variable]":
    """Merge dict and keyword binning args, resolving each into edge variables.

    A dimension given both positionally and as a keyword raises TypeError
    (via ``dict(**a, **b)`` duplicate-keyword detection).
    """
    if arg_dict is not None:
        kwargs = dict(**arg_dict, **kwargs)
    return {name: _parse_coords_arg(x, name, spec) for name, spec in kwargs.items()}
def _find_replaced_dims(
    x: "Variable | DataArray | Dataset",
    *,
    dims: "Iterable[str]",
    dim: "str | tuple[str, ...] | None",
) -> "list[str]":
    """Determine which dims of *x* the new binning dims replace (to erase)."""
    if isinstance(x, Variable):
        replaced = set(x.dims)
    elif dim is None:
        # Default: erase the dims spanned by the coords used for binning.
        replaced = set()
        for name in dims:
            if name in x.coords:
                replaced.update(x.coords[name].dims)
    else:
        replaced = {dim} if isinstance(dim, str) else set(dim)
    # Preserve input dim order; never erase one of the new binning dims.
    new_dims = set(dims)
    return [d for d in x.dims if d in replaced and d not in new_dims]
@overload
def hist(
    x: Variable | DataArray,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray: ...


@overload
def hist(
    x: Dataset,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Dataset: ...


@overload
def hist(
    x: DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataGroup[Any]: ...


@data_group_overload
def hist(
    x: Variable | DataArray | Dataset | DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray | Dataset | DataGroup[Any]:
    """Compute a histogram.

    Bin edges can be specified in three ways:

    1. When an integer is provided, a 'linspace' with this requested number of
       bins is created, based on the min and max of the corresponding coordinate.
    2. A scalar Scipp variable (a value with a unit) is interpreted as a target
       bin width, and an 'arange' covering the min and max of the corresponding
       coordinate is created.
    3. A custom coordinate, given as a Scipp variable with compatible unit.
       Typically this should have a single dimension matching the target dimension.

    The `dim` argument controls which dimensions are summed over and which are
    preserved. The default `dim=None` means that the dimensions of the coordinate
    used for histogramming are summed over. In case of an input that is binned-data
    there may be no such coordinate, in which case `dim=None` is equivalent to `dim=()`,
    resulting in a new dimension in the output. In many cases this default yields the
    desired behavior, there are two classes of exceptions where specifying `dim`
    explicitly can be useful:

    1. Given input data with an N-D coordinate, where N>1, we can use `dim` to restrict
       the sum to a subset of M dimensions, resulting in an (N-M)-D "array" of histograms.
       This can be of particular importance when the input is binned data: Frequently
       we may want to bin to add an additional dimension, but if there is a dense
       coordinate present the default `dim=None` would result in removal of the
       coordinate's dimensions. This can be prevented by setting `dim=()`, which will
       always add a new dimension.
    2. Given M-D input data with an N-D coordinate, where N<M, we can specify `dim` to
       sum over, e.g., the remaining M-N dimensions while histogramming. This is often
       equivalent to not specifying `dim` and a call to `sum` after histogramming but
       is more memory efficient.

    If the dimensions of the input coordinate are not known, using an explicit `dim`
    argument can be useful to obtain predictable behavior in generic code.

    Parameters
    ----------
    x:
        Input data.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    dim:
        Dimension(s) to sum over when histogramming. If None (the default), the
        dimensions of the coordinate used for histogramming are summed over.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Histogrammed data.

    See Also
    --------
    scipp.nanhist:
        Like :py:func:`scipp.hist`, but NaN values are skipped.
    scipp.bin:
        Creating binned data by binning instead of summing all contributions.
    scipp.binning.make_histogrammed:
        Lower level function for histogramming.

    Examples
    --------

    Histogram a table by one of its coord columns, specifying (1) number of bins, (2)
    bin width, or (3) actual binning:

    >>> from numpy.random import default_rng
    >>> rng = default_rng(seed=1234)
    >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
    >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
    >>> table.hist(x=2)
    <scipp.DataArray>
    Dimensions: Sizes[x:2, ]
    Coordinates:
    * x float64 [m] (x [bin-edge]) [0.00313229, 0.497696, 0.992259]
    Data:
      float64 [K] (x) [53, 47]

    >>> table.hist(x=sc.scalar(0.2, unit='m')).sizes
    {'x': 5}

    >>> table.hist(x=sc.linspace('x', 0.2, 0.8, num=10, unit='m')).sizes
    {'x': 9}

    Histogram a table by two of its coord columns:

    >>> table.hist(x=4, y=6).sizes
    {'x': 4, 'y': 6}

    Histogram binned data, using existing bins:

    >>> binned = table.bin(x=10)
    >>> binned.hist().sizes
    {'x': 10}

    Histogram binned data, using new bins along existing dimension:

    >>> binned = table.bin(x=10)
    >>> binned.hist(x=20).sizes
    {'x': 20}

    Histogram binned data along an additional dimension:

    >>> binned = table.bin(x=10)
    >>> binned.hist(y=5).sizes
    {'x': 10, 'y': 5}

    The `dim` argument controls which dimensions are summed over and which are
    preserved. Given 3-D data with a 2-D coordinate, the default `dim=None` results in:

    >>> xyz = sc.data.table_xyz(100).bin(x=4, y=5, z=6)
    >>> xyz.coords['t'] = sc.array(dims=['x', 'y'], unit='s', values=rng.random((4, 5)))
    >>> xyz.hist(t=3).sizes
    {'z': 6, 't': 3}

    Specifying `dim=('x', 'y', 'z')` or equivalently `dim=xyz.dims` will additionally
    sum over the z-dimension, resulting in a 1-D histogram:

    >>> xyz.hist(t=3, dim=('x', 'y', 'z')).sizes
    {'t': 3}

    To preserve a dimension of the input's t-coordinate, we can drop this dimension
    from the tuple of dimensions to sum over:

    >>> xyz.hist(t=4, dim='y').sizes
    {'x': 4, 'z': 6, 't': 4}
    """  # noqa: E501
    if isinstance(x, DataGroup):
        # Only to make mypy happy because we have `DataGroup` in annotation of `x`
        # so that Sphinx shows it.
        raise TypeError("Internal error: input should not be a DataGroup")
    edges = _make_edges(x, arg_dict, kwargs)
    erase = _find_replaced_dims(x, dims=edges, dim=dim)
    if isinstance(x, Variable) and len(edges) != 1:
        raise ValueError(
            "Edges for exactly one dimension must be specified when "
            "binning or histogramming a variable."
        )
    if len(edges) == 0:
        # No edges: only valid for binned input, where the bins are summed.
        if x.bins is None:
            raise TypeError("Data is not binned so bin edges must be provided.")
        return x.bins.sum()
    if len(edges) == 1:
        # TODO Note that this may swap dims, is that ok?
        out = make_histogrammed(x, edges=next(iter(edges.values())), erase=erase)
    else:
        # Drop coords that would disappear by histogramming, to avoid costly handling
        # in intermediate binning step.
        if isinstance(x, DataArray):
            x = _drop_coords_for_hist(x, dims=erase, keep=edges)
        elif isinstance(x, Dataset):
            x = Dataset(
                {
                    k: _drop_coords_for_hist(v, dims=erase, keep=edges)
                    for k, v in x.items()
                }
            )
        # Multi-dim histogram: bin by all but the last edges, then histogram the
        # innermost dimension.
        edge_values = list(edges.values())
        # If histogramming by the final edges needs to use a non-event coord then we
        # must not erase that dim, since it removes the coord required for histogramming
        remaining_erase = set(erase)
        if isinstance(x, DataArray) and x.bins is not None:
            hist_dim = edge_values[-1].dims[-1]
            if hist_dim not in x.bins.coords:
                erase = [e for e in erase if e not in x.coords[hist_dim].dims]
        remaining_erase -= set(erase)
        out = make_histogrammed(
            make_binned(
                x,  # type: ignore[arg-type]
                edges=edge_values[:-1],
                erase=erase,
            ),
            edges=edge_values[-1],
            erase=remaining_erase,
        )
    return out
609def _get_op_dims(x: DataArray, *edges_or_groups: Variable) -> set[str]:
610 edge_dims = {edge.dims[-1] for edge in edges_or_groups}
611 coords = [x.coords[dim] for dim in edge_dims if dim in x.coords]
612 return {coord.dims[-1] for coord in coords if coord.ndim > 0}
@overload
def nanhist(
    x: Variable | DataArray,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray: ...


@overload
def nanhist(
    x: Dataset,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Dataset: ...


@overload
def nanhist(
    x: DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataGroup[Any]: ...


@data_group_overload
def nanhist(
    x: Variable | DataArray | Dataset | DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray | Dataset | DataGroup[Any]:
    """Compute a histogram, skipping NaN values.

    Like :py:func:`scipp.hist`, but NaN values are skipped. See there for details and
    examples.

    Parameters
    ----------
    x:
        Input data.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    dim:
        Dimension(s) to sum over when histogramming. If None (the default), the
        dimensions of the coordinate used for histogramming are summed over.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Histogrammed data.
    """
    if isinstance(x, DataGroup):
        # Only to make mypy happy because we have `DataGroup` in annotation of `x`
        # so that Sphinx shows it.
        raise TypeError("Internal error: input should not be a DataGroup")
    edges: dict[str, SupportsIndex | Variable] = _make_edges(x, arg_dict, kwargs)  # type: ignore[assignment]
    if len(edges) > 0:
        # Bin first so that NaN events can be skipped when summing bin contents.
        x = x.bin(edges, dim=dim)  # type: ignore[union-attr]
    if x.bins is None:
        raise TypeError("Data is not binned so bin edges must be provided.")
    return x.bins.nansum()
@overload
def bin(
    x: Variable | DataArray,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataArray: ...


@overload
def bin(
    x: DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataGroup[Any]: ...


@data_group_overload
def bin(
    x: Variable | DataArray | DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    *,
    dim: str | tuple[str, ...] | None = None,
    **kwargs: SupportsIndex | Variable,
) -> DataArray | DataGroup[Any]:
    """Create binned data by binning input along all dimensions given by edges.

    Bin edges can be specified in three ways:

    1. When an integer is provided, a 'linspace' with this requested number of
       bins is created, based on the min and max of the corresponding coordinate.
    2. A scalar Scipp variable (a value with a unit) is interpreted as a target
       bin width, and an 'arange' covering the min and max of the corresponding
       coordinate is created.
    3. A custom coordinate, given as a Scipp variable with compatible unit.
       Typically, this should have a single dimension matching the target dimension.

    The `dim` argument controls which dimensions are concatenated and which are
    preserved. The default `dim=None` means that the dimensions of the coordinate
    used for binning are concatenated. In case of an input that is binned-data
    there may be no such coordinate, in which case `dim=None` is equivalent to `dim=()`,
    resulting in a new dimension in the output. In many cases this default yields the
    desired behavior, there are two classes of exceptions where specifying `dim`
    explicitly can be useful:

    1. Given input data with an N-D coordinate, where N>1, we can use `dim` to restrict
       the binning to a subset of M dimensions, resulting in an (N-M)-D "array" of bins.
       This can be of particular importance when the input is binned data: Frequently
       we may want to bin to add an additional dimension, but if there is a dense
       coordinate present the default `dim=None` would result in removal of the
       coordinate's dimensions. This can be prevented by setting `dim=()`, which will
       always add a new dimension.
    2. Given M-D input data with an N-D coordinate, where N<M, we can specify `dim` to
       concatenate, e.g., the remaining M-N dimensions while binning. This is often
       equivalent to not specifying `dim` and a call to `da.bins.concat()` after
       binning but is more memory efficient.

    If the dimensions of the input coordinate are not known, using an explicit `dim`
    argument can be useful to obtain predictable behavior in generic code.

    Warning
    -------

    When there is existing binning or grouping, the algorithm assumes that coordinates
    of the binned data are correct, i.e., compatible with the corresponding
    coordinate values in the individual bins. If this is not the case then the behavior
    is UNSPECIFIED. That is, the algorithm may or may not ignore the existing
    coordinates. If you encounter such a case, remove the conflicting coordinate,
    e.g., using :py:func:`scipp.DataArray.drop_coords`.

    Parameters
    ----------
    x:
        Input data.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    dim:
        Dimension(s) to concatenate into a single bin. If None (the default), the
        dimensions of the coordinate used for binning are concatenated.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Binned data.

    See Also
    --------
    scipp.hist:
        For histogramming data.
    scipp.group:
        Creating binned data by grouping, instead of binning based on edges.
    scipp.binning.make_binned:
        Lower level function that can bin and group.

    Examples
    --------

    Bin a table by one of its coord columns, specifying (1) number of bins, (2)
    bin width, or (3) actual binning:

    >>> from numpy.random import default_rng
    >>> rng = default_rng(seed=1234)
    >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
    >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
    >>> table.bin(x=2).sizes
    {'x': 2}

    >>> table.bin(x=sc.scalar(0.2, unit='m')).sizes
    {'x': 5}

    >>> table.bin(x=sc.linspace('x', 0.2, 0.8, num=10, unit='m')).sizes
    {'x': 9}

    Bin a table by two of its coord columns:

    >>> table.bin(x=4, y=6).sizes
    {'x': 4, 'y': 6}

    Bin binned data, using new bins along existing dimension:

    >>> binned = table.bin(x=10)
    >>> binned.bin(x=20).sizes
    {'x': 20}

    Bin binned data along an additional dimension:

    >>> binned = table.bin(x=10)
    >>> binned.bin(y=5).sizes
    {'x': 10, 'y': 5}

    The `dim` argument controls which dimensions are concatenated and which are
    preserved. Given 3-D data with a 2-D coordinate, the default `dim=None` results in:

    >>> xyz = sc.data.table_xyz(100).bin(x=4, y=5, z=6)
    >>> values = rng.random((4, 5))
    >>> xyz.coords['t'] = sc.array(dims=['x', 'y'], unit='s', values=values)
    >>> xyz.bin(t=3).sizes
    {'z': 6, 't': 3}

    Specifying `dim=('x', 'y', 'z')` or equivalently `dim=xyz.dims` will additionally
    concatenate along the z-dimension, resulting in a 1-D array of bins:

    >>> xyz.bin(t=3, dim=('x', 'y', 'z')).sizes
    {'t': 3}

    To preserve a dimension of the input's t-coordinate, we can drop this dimension
    from the tuple of dimensions to concatenate:

    >>> xyz.bin(t=4, dim='y').sizes
    {'x': 4, 'z': 6, 't': 4}

    Finally, we can add a new dimension without touching the existing dimensions:

    >>> xyz.bin(t=4, dim=()).sizes
    {'x': 4, 'y': 5, 'z': 6, 't': 4}

    Note that this is generally only useful if the input is binned data with a binned
    t-coordinate.
    """
    if isinstance(x, DataGroup):
        # Only to make mypy happy because we have `DataGroup` in annotation of `x`
        # so that Sphinx shows it.
        raise TypeError("Internal error: input should not be a DataGroup")
    edges = _make_edges(x, arg_dict, kwargs)
    erase = _find_replaced_dims(x, dims=edges, dim=dim)
    return make_binned(x, edges=list(edges.values()), erase=erase)
@overload
def rebin(
    x: Variable | DataArray,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    **kwargs: SupportsIndex | Variable,
) -> DataArray: ...


@overload
def rebin(
    x: Dataset,
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    **kwargs: SupportsIndex | Variable,
) -> Dataset: ...


@overload
def rebin(
    x: DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    **kwargs: SupportsIndex | Variable,
) -> DataGroup[Any]: ...


@data_group_overload
def rebin(
    x: Variable | DataArray | Dataset | DataGroup[Any],
    arg_dict: dict[str, SupportsIndex | Variable] | None = None,
    /,
    **kwargs: SupportsIndex | Variable,
) -> Variable | DataArray | Dataset | DataGroup[Any]:
    """Rebin a data array or dataset.

    The coordinate of the input for the dimension to be rebinned must contain bin edges,
    i.e., the data must be histogrammed.

    If the input has masks that contain the dimension being rebinned then those
    masks are applied to the data before rebinning. That is, masked values are treated
    as zero.

    Parameters
    ----------
    x:
        Data to rebin.
    arg_dict:
        Dictionary mapping dimension labels to binning parameters.
    **kwargs:
        Mapping of dimension label to corresponding binning parameters.

    Returns
    -------
    :
        Data rebinned according to the new bin edges.

    See Also
    --------
    scipp.bin:
        For changing the binning of binned (as opposed to dense, histogrammed) data.
    scipp.hist:
        For histogramming data.

    Examples
    --------

    Rebin a data array along one of its dimensions, specifying (1) number of bins, (2)
    bin width, or (3) actual binning:

    >>> from numpy.random import default_rng
    >>> rng = default_rng(seed=1234)
    >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
    >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
    >>> da = table.hist(x=100, y=100)
    >>> da.rebin(x=2).sizes
    {'x': 2, 'y': 100}

    >>> da.rebin(x=sc.scalar(0.2, unit='m')).sizes
    {'x': 5, 'y': 100}

    >>> da.rebin(x=sc.linspace('x', 0.2, 0.8, num=10, unit='m')).sizes
    {'x': 9, 'y': 100}

    Rebin a data array along two of its dimensions:

    >>> da = table.hist(x=100, y=100)
    >>> da.rebin(x=4, y=6).sizes
    {'x': 4, 'y': 6}
    """
    if isinstance(x, DataGroup):
        # Only to make mypy happy because we have `DataGroup` in annotation of `x`
        # so that Sphinx shows it.
        raise TypeError("Internal error: input should not be a DataGroup")
    edges = _make_edges(x, arg_dict, kwargs)
    out = x
    # Rebin one dimension at a time; the C++ kernel handles a single dim per call.
    for dim, edge in edges.items():
        out = _cpp.rebin(out, dim, edge)
    return out
def _make_groups(x: DataArray, arg: str | Variable) -> Variable:
    """Return the group labels to use when grouping ``x`` by ``arg``.

    If ``arg`` is already a variable it is used as-is. Otherwise ``arg``
    names a coordinate of ``x`` (an event coord of binned data takes
    precedence over a dense coord of the same name) and the unique values
    of that coordinate define the groups.
    """
    import numpy as np

    if isinstance(arg, Variable):
        return arg
    # Prefer event coords of binned data over dense coords of the same name.
    coord: Variable | None = x.bins.coords.get(arg) if x.bins is not None else None
    if coord is None:
        coord = x.coords.get(arg)
    _require_coord(arg, coord)
    if coord.bins is not None:
        # Flatten the event coord into a plain variable over all events.
        coord = coord.copy().bins.constituents['data']  # type: ignore[assignment, union-attr]

    if 0 in coord.shape:
        # Empty input: slice instead of using `[]` to preserve the dtype.
        unique = coord.values[0:0]
    # We are currently using np.unique to find all unique groups. This can be very slow
    # for large inputs. In many cases groups are in a bounded range of integers, and we
    # can sometimes bypass a full call to np.unique by checking a sub-range first
    elif coord.dtype in (DType.int32, DType.int64):
        # Convert to Python int: `coord.min().value` is a fixed-width numpy
        # scalar, and `max_ - min_ + 1` could otherwise wrap around for
        # extreme int32/int64 ranges, breaking the early-exit check below.
        min_ = int(coord.min().value)
        max_ = int(coord.max().value)
        values = coord.values
        unique = values[0:0]
        # Try progressively larger prefixes (1/1000, 1/100, ...). If a prefix
        # already yields every integer in [min_, max_] we can stop early;
        # pivot=1 is the full-array fallback, so the result is always complete.
        for pivot in [1000, 100, 10, 1]:
            if len(unique) == max_ - min_ + 1:
                break
            unique = np.unique(values[: len(values) // pivot])
    else:
        unique = np.unique(coord.values)
    return array(dims=[arg], values=unique, unit=coord.unit)
# Typing-only overloads for `group`: the decorated implementation below is the
# single runtime entry point; these exist so type checkers map each input type
# to the matching output type.
@overload
def group(
    x: DataArray,
    /,
    *args: str | Variable,
    dim: str | tuple[str, ...] | None = None,
) -> DataArray: ...


@overload
def group(
    x: DataGroup[Any],
    /,
    *args: str | Variable,
    dim: str | tuple[str, ...] | None = None,
) -> DataGroup[Any]: ...
@data_group_overload
def group(
    x: DataArray | DataGroup[Any],
    /,
    *args: str | Variable,
    dim: str | tuple[str, ...] | None = None,
) -> DataArray | DataGroup[Any]:
    """Create binned data by grouping input by one or more coordinates.

    Grouping can be specified in two ways: (1) When a string is provided the unique
    values of the corresponding coordinate are used as groups. (2) When a Scipp variable
    is provided then the variable's values are used as groups.

    Note that option (1) may be very slow if the input is very large.

    The `dim` argument controls which dimensions are concatenated and which are
    preserved. The default `dim=None` means that the dimensions of the coordinate
    used for binning are concatenated. In case of an input that is binned-data
    there may be no such coordinate, in which case `dim=None` is equivalent to `dim=()`,
    resulting in a new dimension in the output. In many cases this default yields the
    desired behavior, but there are two classes of exceptions where specifying `dim`
    explicitly can be useful:

    1. Given input data with an N-D coordinate, where N>1, we can use `dim` to restrict
       the grouping to a subset of M dimensions, resulting in an (N-M)-D array of bins.
       This can be of particular importance when the input is binned data: Frequently
       we may want to group to add an additional dimension, but if there is a dense
       coordinate present the default `dim=None` would result in removal of the
       coordinate's dimensions. This can be prevented by setting `dim=()`, which will
       always add a new dimension.
    2. Given M-D input data with an N-D coordinate, where N<M, we can specify `dim` to
       concatenate, e.g., the remaining M-N dimensions while grouping. This is often
       equivalent to not specifying `dim` and a call to `da.bins.concat()` after
       grouping but is more memory efficient.

    If the dimensions of the input coordinate are not known, using an explicit `dim`
    argument can be useful to obtain predictable behavior in generic code.

    Warning
    -------

    When there is existing binning or grouping, the algorithm assumes that coordinates
    of the binned data are correct, i.e., compatible with the corresponding
    coordinate values in the individual bins. If this is not the case then the behavior
    is UNSPECIFIED. That is, the algorithm may or may not ignore the existing
    coordinates. If you encounter such a case, remove the conflicting coordinate,
    e.g., using :py:func:`scipp.DataArray.drop_coords`.

    Parameters
    ----------
    x:
        Input data.
    *args:
        Dimension labels or grouping variables.
    dim:
        Dimension(s) to concatenate into a single bin. If None (the default), the
        dimensions of the coordinate used for grouping are concatenated.

    Returns
    -------
    :
        Binned data.

    See Also
    --------
    scipp.bin:
        Creating binned data by binning based on edges, instead of grouping.
    scipp.binning.make_binned:
        Lower level function that can bin and group.

    Examples
    --------

    Group a table by one of its coord columns, specifying (1) a coord name or (2)
    an actual grouping:

    >>> from numpy.random import default_rng
    >>> rng = default_rng(seed=1234)
    >>> x = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> y = sc.array(dims=['row'], unit='m', values=rng.random(100))
    >>> data = sc.ones(dims=['row'], unit='K', shape=[100])
    >>> table = sc.DataArray(data=data, coords={'x': x, 'y': y})
    >>> table.coords['label'] = (table.coords['x'] * 10).to(dtype='int64')
    >>> table.group('label').sizes
    {'label': 10}

    >>> groups = sc.array(dims=['label'], values=[1, 3, 5], unit='m')
    >>> table.group(groups).sizes
    {'label': 3}

    Group a table by two of its coord columns:

    >>> table.coords['a'] = (table.coords['x'] * 10).to(dtype='int64')
    >>> table.coords['b'] = (table.coords['y'] * 10).to(dtype='int64')
    >>> table.group('a', 'b').sizes
    {'a': 10, 'b': 10}

    >>> groups = sc.array(dims=['a'], values=[1, 3, 5], unit='m')
    >>> table.group(groups, 'b').sizes
    {'a': 3, 'b': 10}

    Group binned data along an additional dimension:

    >>> table.coords['a'] = (table.coords['y'] * 10).to(dtype='int64')
    >>> binned = table.bin(x=10)
    >>> binned.group('a').sizes
    {'x': 10, 'a': 10}

    The `dim` argument controls which dimensions are concatenated and which are
    preserved. Given 3-D data with a 2-D coordinate, the default `dim=None` results in:

    >>> xyz = sc.data.table_xyz(100).bin(x=4, y=5, z=6)
    >>> times = rng.integers(low=1, high=3, size=(4, 5))
    >>> xyz.coords['t'] = sc.array(dims=['x', 'y'], unit='s', values=times)
    >>> xyz.group('t').sizes
    {'z': 6, 't': 2}

    Specifying `dim=('x', 'y', 'z')` or equivalently `dim=xyz.dims` will additionally
    concatenate along the z-dimension, resulting in a 1-D array of bins:

    >>> xyz.group('t', dim=('x', 'y', 'z')).sizes
    {'t': 2}

    To preserve a dimension of the input's t-coordinate, we can drop this dimension
    from the tuple of dimensions to concatenate:

    >>> xyz.group('t', dim='y').sizes
    {'x': 4, 'z': 6, 't': 2}

    Finally, we can add a new dimension without touching the existing dimensions:

    >>> xyz.group('t', dim=()).sizes
    {'x': 4, 'y': 5, 'z': 6, 't': 2}

    Note that this is generally only useful if the input is binned data with a binned
    t-coordinate.
    """
    if isinstance(x, DataGroup):
        # Only to make mypy happy because we have `DataGroup` in annotation of `x`
        # so that Sphinx shows it.
        raise TypeError("Internal error: input should not be a DataGroup")
    groups = [_make_groups(x, name) for name in args]
    erase = _find_replaced_dims(x, dims=[g.dim for g in groups], dim=dim)
    return make_binned(x, groups=groups, erase=erase)