# Copyright 2020 Akamai Technologies, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from distimate.types import DistributionType
try:
import pandas as pd
except ImportError:
pd = None
def _format_number(v):
if round(v) == v:
return str(int(v))
return str(v)
[docs]class DistributionAccessor(object):
"""
Implements ``.dist`` accessor on :class:`pandas.Series`.
Allows to easily call :class:`.Distribution` methods
on all instances in Pandas Series:
.. code-block:: python
df[col] = pd.Series.dist.from_histogram(dist_type, histograms)
median = df[col].dist.quantile(0.5)
"""
def __init__(self, series):
self._series = series
[docs] @staticmethod
def from_histogram(dist_type, histograms, *, name=None):
"""
Construct a new :class:`pandas.Series` from histograms.
This is a static method that can be accessed
as ``pd.Series.dist.from_histogram()``.
:param dist_type: :class:`.DistributionType` or
1-D array-like with histogram edges
:param histograms: :class:`pandas.DataFrame` or 2-D array-like
:param name: optional name of the series.
:return: :class:`pandas.Series`
"""
if not isinstance(dist_type, DistributionType):
dist_type = DistributionType(dist_type)
index = None
if isinstance(histograms, pd.DataFrame):
index = histograms.index
histograms = histograms.values
dists = [dist_type.from_histogram(histogram) for histogram in histograms]
return pd.Series(dists, index=index, name=name)
[docs] @staticmethod
def from_cumulative(dist_type, cumulatives, *, name=None):
"""
Construct a new :class:`pandas.Series` from cumulative histograms.
This is a static method that can be accessed
as ``pd.Series.dist.from_cumulative()``.
:param dist_type: :class:`.DistributionType` or
1-D array-like with histogram edges
:param histograms: :class:`pandas.DataFrame` or 2-D array-like
:param name: Optional name of the series.
:return: :class:`pandas.Series`
"""
if not isinstance(dist_type, DistributionType):
dist_type = DistributionType(dist_type)
index = None
if isinstance(cumulatives, pd.DataFrame):
index = cumulatives.index
cumulatives = cumulatives.values
histograms = np.diff(cumulatives, prepend=0)
dists = [dist_type.from_histogram(histogram) for histogram in histograms]
return pd.Series(dists, index=index, name=name)
[docs] def to_histogram(self):
"""
Convert :class:`pandas.Series` of :class:`.Distribution`
instances to histograms.
:return: :class:`pandas.DataFrame` with histogram values.
"""
data = self.values
columns = [self._get_name(f"histogram{i}") for i in range(data.shape[-1])]
return pd.DataFrame(data, index=self._series.index, columns=columns)
[docs] def to_cumulative(self):
"""
Convert :class:`pandas.Series` of :class:`.Distribution` instances
to cumulative histograms.
:return: :class:`pandas.DataFrame` with cumulative values
"""
data = np.cumsum(self.values, axis=1)
columns = [self._get_name(f"cumulative{i}") for i in range(data.shape[-1])]
return pd.DataFrame(data, index=self._series.index, columns=columns)
[docs] def pdf(self, v):
"""
Compute PDF for :class:`pandas.Series` of :class:`.Distribution` instances.
:param v: input value, or list of them
:return: :class:`pandas.Series`
"""
return self._compute(self._pdf, v)
[docs] def cdf(self, v):
"""
Compute CDF for series of distribution instances.
:param v: input value, or list of them
:return: :class:`pandas.Series`
"""
return self._compute(self._cdf, v)
[docs] def quantile(self, v):
"""
Compute quantile function :class:`pandas.Series`
of :class:`.Distribution` intances.
:param v: input value, or list of them
:return: :class:`pandas.Series`
"""
return self._compute(self._quantile, v)
@property
def values(self):
"""
Values of the underlying histograms.
:return: 2-D :class:`numpy.array`
"""
if self._series.empty:
return np.zeros((0, 0))
return np.array([dist.values for dist in self._series])
def _compute(self, meth, v):
if isinstance(v, (tuple, list)):
columns = [meth(i) for i in v]
return pd.concat(columns, axis=1)
return meth(v)
def _pdf(self, v):
name = self._get_name(f"pdf{_format_number(v)}")
data = [dist.pdf(v) if pd.notna(dist) else np.nan for dist in self._series]
return pd.Series(data, index=self._series.index, name=name)
def _cdf(self, v):
name = self._get_name(f"cdf{_format_number(v)}")
data = [dist.cdf(v) if pd.notna(dist) else np.nan for dist in self._series]
return pd.Series(data, index=self._series.index, name=name)
def _quantile(self, v):
name = self._get_name(f"q{_format_number(100 * v)}")
data = [dist.quantile(v) if pd.notna(dist) else np.nan for dist in self._series]
return pd.Series(data, index=self._series.index, name=name)
def _get_name(self, name):
if self._series.name is None:
return name
return f"{self._series.name}_{name}"
def register_to_pandas():
if pd is None:
return # Pandas are not installed
pd.api.extensions.register_series_accessor("dist")(DistributionAccessor)