# Copyright 2020 Akamai Technologies, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from distimate.stats import CDF, PDF, Quantile, mean
class Distribution:
    """
    Statistical distribution represented by its histogram.

    Provides an object interface on top of a histogram array.
    Supports distribution merging and comparison.
    Implements approximation of common statistical functions.

    For N edges the histogram has N + 1 buckets. Samples are assigned
    to buckets by ``numpy.searchsorted`` with its default (left) side,
    so bucket ``i`` collects samples ``x`` with
    ``edges[i - 1] < x <= edges[i]``; the first bucket is unbounded from
    below and the last bucket is unbounded from above.

    The histogram values are copied on construction, so in-place
    operations (:meth:`add`, :meth:`update`, ``+=``) never modify
    the array passed by the caller. The edges are stored without copying
    and their ordering is not verified.

    :param edges: 1-D array-like, ordered histogram edges
    :param values: 1-D array-like, histogram, one item longer than *edges*;
        defaults to an empty (all zeros) histogram
    :raises ValueError: if *edges* or *values* are not 1-D, if *values*
        does not have ``len(edges) + 1`` items, or if any item of *values*
        is not greater than or equal to zero
    """

    __slots__ = ("_edges", "_values")

    # Histogram buckets are always stored as floats, so fractional
    # weights can be accumulated.
    _dtype = np.float64

    def __init__(self, edges, values=None):
        edges = np.asarray(edges)
        if edges.ndim != 1:
            raise ValueError("Histogram edges must be 1-D array-like.")
        size = len(edges) + 1
        if values is None:
            values = np.zeros(size, dtype=self._dtype)
        else:
            # np.array() always copies. np.asarray() would alias a float64
            # array passed by the caller, and the in-place operations
            # of this class would then silently modify the caller's data.
            values = np.array(values, dtype=self._dtype)
            if values.ndim != 1:
                raise ValueError("Histogram must be 1-D array-like.")
            if len(values) != size:
                raise ValueError("Histogram must have len(edges) + 1 items.")
            # Written as "not all(>= 0)" so that NaN items are rejected too.
            if not np.all(values >= 0):
                raise ValueError("Histogram values must not be negative.")
        self._edges = edges
        self._values = values

    def __repr__(self):
        name = type(self).__name__
        return f"<{name}: weight={self.weight:.0f}, mean={self.mean:.2f}>"

    def __eq__(self, other):
        """
        Return whether distribution histograms are equal.

        :param other: object to compare with
        :return: bool, or ``NotImplemented`` if *other*
            is not a :class:`Distribution`
        :raises ValueError: if *other* is a :class:`Distribution`
            with different edges
        """
        if isinstance(other, Distribution):
            self._check_compatibility(other)
            return np.array_equal(self._values, other._values)
        return NotImplemented

    # Instances are mutable and compare by content, so they are unhashable.
    # Python already implies this for a class defining __eq__ only;
    # it is spelled out here to make the intent explicit.
    __hash__ = None

    def __add__(self, other):
        """
        Combine this distribution with other distribution.

        :param other: a :class:`Distribution` with the same edges
        :return: a new :class:`Distribution`, or ``NotImplemented``
            if *other* is not a :class:`Distribution`
        :raises ValueError: if the distributions have different edges
        """
        if isinstance(other, Distribution):
            self._check_compatibility(other)
            values = self._values + other._values
            return Distribution(self._edges, values)
        return NotImplemented

    def __iadd__(self, other):
        """
        Combine this distribution with other distribution inplace.

        :param other: a :class:`Distribution` with the same edges
        :return: this distribution, or ``NotImplemented``
            if *other* is not a :class:`Distribution`
        :raises ValueError: if the distributions have different edges
        """
        if isinstance(other, Distribution):
            self._check_compatibility(other)
            self._values += other._values
            return self
        return NotImplemented

    @property
    def edges(self):
        """
        Edges of the underlying histogram.

        :return: 1-D :class:`numpy.array`, ordered histogram edges
        """
        return self._edges

    @property
    def values(self):
        """
        Values of the underlying histogram.

        This is the internal array, not a copy; it reflects later
        in-place updates of this distribution.
        Use :meth:`to_histogram` to get an independent copy.

        :return: 1-D :class:`numpy.array`, histogram values
        """
        return self._values

    @classmethod
    def from_samples(cls, edges, samples, weights=None):
        """
        Create a distribution from a list of values.

        :param edges: 1-D array-like, ordered histogram edges
        :param samples: 1-D array-like
        :param weights: optional scalar
            or 1-D array-like with same length as samples.
        :return: a new :class:`Distribution`
        :raises ValueError: if *edges* or *samples* are not 1-D
        """
        dist = cls(edges)
        dist.update(samples, weights)
        return dist

    @classmethod
    def from_histogram(cls, edges, histogram):
        """
        Create a distribution from a histogram.

        :param edges: 1-D array-like, ordered histogram edges
        :param histogram: 1-D array-like, one item longer than edges
        :return: a new :class:`Distribution`
        :raises ValueError: if the histogram is rejected by the constructor
        """
        return cls(edges, histogram)

    @classmethod
    def from_cumulative(cls, edges, cumulative):
        """
        Create a distribution from a cumulative histogram.

        :param edges: 1-D array-like, ordered histogram edges
        :param cumulative: 1-D array-like, one item longer than edges
        :return: a new :class:`Distribution`
        :raises ValueError: if the derived histogram is rejected
            by the constructor (for example, a decreasing cumulative
            histogram yields negative values)
        """
        # prepend=0 keeps the first bucket: diff of [c0, c1, ...]
        # then starts with c0 - 0 instead of c1 - c0.
        values = np.diff(cumulative, prepend=0)
        return cls(edges, values)

    def to_histogram(self):
        """
        Return a histogram of this distribution as a NumPy array.

        :return: 1-D :class:`numpy.array`, a copy of the histogram values
        """
        return self._values.copy()

    def to_cumulative(self):
        """
        Return a cumulative histogram of this distribution as a NumPy array.

        :return: 1-D :class:`numpy.array`
        """
        return np.cumsum(self._values)

    def add(self, value, weight=None):
        """
        Add a new item to this distribution.

        The weight is not validated.

        :param value: item to add, a scalar
        :param weight: optional item weight, defaults to 1
        :raises ValueError: if *value* is not a scalar
        """
        if np.ndim(value) != 0:
            raise ValueError("Value must be a scalar.")
        if weight is None:
            weight = 1
        index = self._edges.searchsorted(value)
        self._values[index] += weight

    def update(self, values, weights=None):
        """
        Add multiple items to this distribution.

        The weights are not validated.

        :param values: items to add, 1-D array-like
        :param weights: optional scalar or 1-D array-like
            with same length as values, defaults to 1
        :raises ValueError: if *values* is not 1-D
        """
        values = np.asarray(values)
        if values.ndim != 1:
            raise ValueError("Values must be 1-D array-like.")
        if weights is None:
            weights = 1
        index = self._edges.searchsorted(values)
        # Cannot use self._values[index] += weights because it does
        # not accumulate if index contains duplicate values.
        np.add.at(self._values, index, weights)

    @property
    def weight(self):
        """
        Return a total weight of samples in this distribution.

        :return: float number
        """
        return self._values.sum()

    @property
    def mean(self):
        """
        Estimate mean of this distribution.

        The approximated mean is for sanity checks only,
        it is inefficient and imprecise to estimate mean from a histogram.

        See :func:`.mean` for details.

        :return: float number
        """
        return mean(self._edges, self._values)

    @property
    def pdf(self):
        """
        Probability density function (PDF) of this distribution.

        See :class:`.PDF` for details.

        :return: a :class:`.PDF` instance
        """
        return PDF(self._edges, self._values)

    @property
    def cdf(self):
        """
        Cumulative distribution function (CDF) of this distribution.

        See :class:`.CDF` for details.

        :return: a :class:`.CDF` instance
        """
        return CDF(self._edges, self._values)

    @property
    def quantile(self):
        """
        Quantile function of this distribution.

        See :class:`.Quantile` for details.

        :return: a :class:`.Quantile` instance
        """
        return Quantile(self._edges, self._values)

    def _check_compatibility(self, dist):
        """
        Ensure that *dist* uses the same histogram edges as this distribution.

        :param dist: a :class:`Distribution` to check
        :raises ValueError: if the edges differ
        """
        if not np.array_equal(dist._edges, self._edges):
            raise ValueError("Distributions have different edges.")