import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
[docs]class TolData:
"""
Class to process statistic data
.. warning::
This class is under heavy construction and not consistently documented so far. No standardized way of using this
class for postprocessing tolerance data or using it for creation of a online stop criterion for tolerance analysis
is defined so far!
"""
[docs] def __init__(self, data, label=None):
self.data = None
self.size = None
self.lim = None
self.label = label
self.update_data(data)
[docs] def update_data(self, data):
"""
overwrite previous data
"""
data = np.array(data)
if 1 in data.shape:
"""shape==(1,x) or (x,1)"""
data = data.flatten()
if data.ndim == 1:
self.data = data
self.size = len(data)
self.lim = (np.min(data), np.max(data))
else:
print(f"\n+++'data' has shape {data.shape}, but only one-dimensional arrays are allowed so far!+++\n")
[docs] def get_worst_case(self):
"""
get the worst case value and its index within the :attr:`data` attribute of the :class:`TolData` class.
Returns
-------
(int, float):
(index, value) of worst case entity
"""
wc_index = np.argsort(self.data)[-1]
wc_value = self.data[wc_index]
return wc_index, wc_value
[docs] def get_ecdf(self):
"""
calculate the empiric cumulative distribution function (ECDF) :math:`\widehat{F}` of the available data.
Returns
-------
(np.ndarray, np.ndarray):
`(ecdf_q, ecdf)` quantile values `ecdf_q` and corresponding probability values `ecdf` of available data
"""
ecdf = (np.ones(self.size) / self.size).cumsum()
ecdf_q = np.sort(self.data)
return ecdf_q, ecdf
[docs] def fit_gauss(self):
"""
fit gauss (norm) distribution to self.data
Returns
-------
(mu,std): mean and standard deviation of fitted normal distribution
"""
mu, std = st.norm.fit(self.data)
return mu, std
[docs] def fit_kde(self):
"""
fits a probability density function to self.data using kernel density estimation as implemented in the
`scipy.stats module <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html#scipy.stats.gaussian_kde>`_
. This uses a Gaussian kernel and Scott's Rule (of thumb) for bandwidth selection.
Returns
-------
kernel: scipy.stats.gaussian_kde object
"""
kernel = st.gaussian_kde(self.data)
return kernel
[docs] def get_fit_data(self, method='kde', fit_q=None, q_steps=100, verbose=1):
"""
calculate probability density function (PDF) and cumulative distribution function (CDF) based on fitted
distribution.
Parameters
----------
method: str
method which is used to fit the distribution ['gauss','kde'], default 'kde'
fit_q: array_like or None
quantile values where PDF/CDF shall be evaluated, default 'None' --> automatic definition
q_steps: int
if fit_q==None, quantile range is chosen based on the available data and divided into `q_steps` steps
verbose: int
if verbose>0, info message can be printed, default 1
Returns
-------
"""
if fit_q is None:
fit_q = np.linspace(self.lim[0], self.lim[1], q_steps)
dyn_x_range = True
else:
# fit_q defined -> no dynamic adaption (for kde)
dyn_x_range = False
if method == "gauss":
mu, std = self.fit_gauss()
pdf = st.norm.pdf(fit_q, mu, std)
cdf = st.norm.cdf(fit_q, mu, std)
elif method == "kde":
kern = self.fit_kde()
datarange = self.lim[1]-self.lim[0]
if dyn_x_range:
adapt_cnt = 0
while kern.integrate_box_1d(fit_q[0], fit_q[-1]) < 0.999:
fit_q = np.linspace(fit_q[0] - datarange / 20, fit_q[-1] + datarange / 20, q_steps)
adapt_cnt += 1
if (adapt_cnt > 0) and (verbose > 0):
print(f"fitted kde overlaps original data by approx. {adapt_cnt*datarange/20:.3e}!\n Evaluated on extended range!")
pdf = kern.pdf(fit_q)
cs = pdf.cumsum()
cdf = cs/cs.max()
else:
print(f"method {method} not implemented")
pdf = None
cdf = None
return fit_q, pdf, cdf
[docs] def run_batch_test(self, split=0.99, tests=1000):
test_size = int(np.ceil(self.size * split))
# TODO: check if tests is lower than number of possible test_size out of self.size
# -> if not - reduce split!
#rng = np.random.default_rng() # numpy >= 1.17
# use np.random.choice() # numpy < 1.17 (SyMSpace)
x_lst, cdf_lst, err_lst = [], [], []
rcdf_full_x, rcdf_full = self.get_ecdf()
full_data = self.data
for test in range(tests):
#self.update_data(rng.choice(full_data, test_size))
self.update_data(np.random.choice(full_data, test_size))
x, _, c = self.get_fit_data(verbose=0)
err, _ = TolData.get_cdf_diff_area([x, rcdf_full_x], [c, rcdf_full])
x_lst.append(x)
cdf_lst.append(c)
err_lst.append(err)
self.update_data(full_data)
c_trust_area, _ = TolData.get_cdf_diff_area(x_lst, cdf_lst)
return x_lst, cdf_lst, c_trust_area, err_lst
[docs] def run_batch_test_conf(self, confidence=0.9, split=0.99, tests=1000):
# returns confidence value and number of samples for which the value holds true
test_size = int(np.ceil(self.size * split))
# TODO: check if tests is lower than number of possible test_size out of self.size
# -> if not - reduce split!
#rng = np.random.default_rng() # numpy >= 1.17
# use np.random.choice() # numpy < 1.17 (SyMSpace)
err_lst = []
# cdf_full_x, cdf_full = self.get_ecdf()
cdf_full_x, _, cdf_full = self.get_fit_data(verbose=0)
full_data = self.data
for test in range(tests):
#self.update_data(rng.choice(full_data, test_size))
self.update_data(np.random.choice(full_data, test_size))
x, _, c = self.get_fit_data(verbose=0)
err, _ = TolData.get_cdf_diff_area([x, cdf_full_x], [c, cdf_full])
err_lst.append(err)
self.update_data(full_data)
conv_td = TolData(np.array(err_lst))
return conv_td.get_quantile(confidence), test_size
[docs] def get_quantile(self, cdf_val, method='kde'):
x,_,c = self.get_fit_data(method=method, verbose=0)
return np.interp(cdf_val, c, x)
[docs] def get_F(self, quantile_val, method='kde'):
x,_,c = self.get_fit_data(method=method, verbose=0)
return np.interp(quantile_val, x, c)
[docs] def get_slope(self, p0=0.2, p1=0.8):
q0 = self.get_quantile(p0)
q1 = self.get_quantile(p1)
slope = (p1-p0)/(q1-q0)
return 1/slope, (q0, q1), (p0, p1)
[docs] @staticmethod
def get_quantile_from_data(x,F,p):
# todo: maybe add some checks about sorting, monotony, ...
return np.interp(p, F, x)
# modified version from https://www.statsmodels.org/dev/_modules/statsmodels/distributions/empirical_distribution.html
[docs] def get_dkw_conf_set(self, method='empiric', alpha=0.05):
r"""
Modified version from https://www.statsmodels.org/dev/_modules/statsmodels/distributions/empirical_distribution.html
Constructs a Dvoretzky-Kiefer-Wolfowitz confidence band for the eCDF.
Parameters
----------
F : array_like
The empirical distributions
alpha : float
Set alpha for a (1 - alpha) % confidence band, p=(1-alpha)
Notes
-----
Based on the DKW inequality.
.. math:: P \left( \sup_x \left| F(x) - \hat(F)_n(X) \right| > \epsilon \right) \leq 2e^{-2n\epsilon^2}
References
----------
Wasserman, L. 2006. `All of Nonparametric Statistics`. Springer.
"""
nobs = self.size # use number of empirical samples in any way!
epsilon = TolData.get_dkw_epsilon(nobs, alpha)
if method[:3].lower() =='emp':
x,F = self.get_ecdf()
else:
x,_,F = self.get_fit_data(method=method, verbose=0)
lower = np.clip(F - epsilon, 0, 1)
upper = np.clip(F + epsilon, 0, 1)
return lower, upper, x
[docs] def get_dkw_conf_quantiles(self, p=0.95, alpha=0.05, method='gauss_kde'):
lower, upper, x = self.get_dkw_conf_set(method=method, alpha=alpha)
q_lower = TolData.get_quantile_from_data(x, lower, p)
q_upper = TolData.get_quantile_from_data(x, upper, p)
q_F = self.get_quantile(p, method=method)
return q_upper, q_F, q_lower
# def get_quantile(self, cdf_val, method='gauss_kde'):
# x,_,c = self.get_fit_data(method=method, verbose=0)
# return np.interp(cdf_val, c, x)
[docs] @staticmethod
def get_dkw_epsilon(nobs, alpha):
"""
Dvoretzky - Kiefer - Wolfowitz confidence value
:param nobs:
:param alpha: float
Set alpha for a (1 - alpha) % confidence band.
:return: epsilon
"""
return np.sqrt(np.log(2. / alpha) / (2 * nobs))
[docs] @staticmethod
def get_cdf_diff_area(x_lst, cdf_lst):
# min/max Kurven berechnen
# max-min aufintegrieren -> Vertrauensparameter!
#x_interp = np.sort(np.array(x_lst).flatten())
x_interp = np.sort(np.concatenate(x_lst))
c_interp = [np.interp(x_interp, x, c) for x, c in zip(x_lst, cdf_lst)]
c_interp = np.array(c_interp)
c_max = c_interp.max(axis=0)
c_min = c_interp.min(axis=0)
c_trust = c_max - c_min
c_trust_area = np.trapz(c_trust, x=x_interp)
return c_trust_area, (x_interp, c_min, c_max)
[docs] @staticmethod
def get_cdf_diff(x_lst, cdf_lst, area=True):
# min/max Kurven berechnen
# max-min aufintegrieren -> Vertrauensparameter!
# x_interp = np.sort(np.array(x_lst).flatten())
if (len(x_lst) == 2) and area:
fill_up_pts = 10
x0 = x_lst[0]
x1 = x_lst[1]
y0 = cdf_lst[0]
y1 = cdf_lst[1]
if x0[0] > x1[0]:
# fill up x0 from left
x0 = np.concatenate((np.linspace(x1[0], x0[0] - 1e-10, fill_up_pts), x0))
y0 = np.concatenate((np.zeros(fill_up_pts), y0))
else:
x1 = np.concatenate((np.linspace(x0[0], x1[0] - 1e-10, fill_up_pts), x1))
y1 = np.concatenate((np.zeros(fill_up_pts), y1))
if x0[-1] < x1[-1]:
# fill up x0 on right end
x0 = np.concatenate((x0, np.linspace(x0[-1] + 1e-10, x1[-1], fill_up_pts)))
y0 = np.concatenate((y0, np.ones(fill_up_pts)))
else:
x1 = np.concatenate((x1, np.linspace(x1[-1] + 1e-10, x0[-1], fill_up_pts)))
y1 = np.concatenate((y1, np.ones(fill_up_pts)))
x_lst = [x0, x1]
cdf_lst = [y0, y1]
x_interp = np.sort(np.concatenate(x_lst))
c_interp = [np.interp(x_interp, x, c) for x, c in zip(x_lst, cdf_lst)]
c_interp = np.array(c_interp)
c_max = c_interp.max(axis=0)
c_min = c_interp.min(axis=0)
c_trust = c_max - c_min
c_trust_area = np.trapz(c_trust, x=x_interp)
c_0 = c_interp[0, :]
c_1 = c_interp[1, :]
c_err = c_0 - c_1
return x_interp, c_err, c_trust_area
[docs] def plot_cdf(self, method=('ecdf', 'kde'), xlabel=None, ylabel='CDF', nominal_line=False, ax=None, **plotargs):
"""
Plot the cumulative distribution function of the underlying data :attr:`TolData.data` using matplotlib.
Parameters
----------
method : array-like
define the method which is used to evaluate the CDF. Possible values are `'ecdf'` to plot the empiric
cumulative distribution function or one of the methods available in the :func:`get_fit_data` method.
Default is `('ecdf', 'kde')`
xlabel : str
define custom xlabel, default :code:`None`. If default and :attr:`TolData.label` is defined, this label
is used.
ylabel : str
define custom ylabel, default :code:`'CDF'`
ax : matplotlib.axes
define axes handle to force plotting to a certain axes
plotargs
kwargs forwarded to :func:`matplotlib.axes.plot` function
Returns
-------
matplotlib.axes
handle to matplotlib axes for further customization
Examples
--------
.. plot::
:include-source:
import numpy as np
from symtolerance2.toldata import TolData
import matplotlib.pyplot as plt
# create normal distributed random values
data = np.random.normal(size=100)
td = TolData(data, label='random values')
td.plot_cdf()
"""
if (xlabel is None) and (self.label is not None):
xlabel = self.label
legend_label = None
if "label" in plotargs:
legend_label = plotargs.pop('label')
title_str = 'cumulative distribution function (CDF)'
if "title" in plotargs:
title_str = plotargs.pop('title')
if ax is None:
fig, ax = plt.subplots()
if 'ecdf' in method:
x, y = self.get_ecdf()
ax.plot(x, y, label=legend_label if legend_label is not None else 'ECDF', **plotargs)
if 'kde' in method:
x, _, y = self.get_fit_data()
ax.plot(x, y, label=legend_label if legend_label is not None else 'KDE', **plotargs)
if 'gauss' in method:
x, _, y = self.get_fit_data(method='gauss')
ax.plot(x, y, label=legend_label if legend_label is not None else 'Gauss', **plotargs)
if nominal_line:
ax.plot(np.ones(2)*self.data[0], [0, 1], color=ax.get_lines()[-1].get_color())
ax.set_title(title_str)
ax.legend()
ax.grid(True)
if xlabel is not None:
ax.set_xlabel(xlabel, labelpad=-2)
if ylabel is not None:
ax.set_ylabel(ylabel)
return ax
# def fit_density_function(data, show_plots=False, bins=10):
# # data as falttened np.array
# # bins has no influence on fitting, for fitting raw data vector is used!
# data_len = len(data)
# y, x = np.histogram(data, bins=bins)
# xmin, xmax = x[0], x[-1]
# # normalize histogram to have an area integral of one
# density_fact = data_len / (1 / ((xmax - xmin) / bins))
# y = y / density_fact
# # center coordinates of histogram bars
# x_c = (x + np.roll(x, -1))[:-1] / 2
#
# # fit normal distribution
# mu, std = st.norm.fit(data)
#
# # evaluate fitted distribution
# x_fit = np.linspace(xmin, xmax, 100)
# pdf = st.norm.pdf(x_fit, mu, std)
# cdf = st.norm.cdf(x_fit, mu, std)
#
# # calculate cummulated density function histogram data
# y_cs = np.cumsum(y)
#
# # calculate cummulated density function from raw data
# y_cs2 = (np.ones(data_len) / data_len).cumsum()
# x_cs2 = np.sort(data)
#
# # TODO: error should be based on x_values, not on y_values!
# # cdf_err = np.sum((st.norm.cdf(x_cs2, mu, std)-y_cs2)**2)/data_len
# cdf_err = np.sum(np.abs(st.norm.cdf(x_cs2, mu, std) - y_cs2)) / data_len
#
# # plot all data:
# if show_plots:
# fig, (ax, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
# bar_width = (xmax - xmin) / bins * 0.9
# ax.bar(x_c, y, width=bar_width, alpha=0.5)
# ax.grid(True)
# ax.set_xlim([xmin, xmax])
# ax.plot(x_fit, pdf, 'g', linewidth=2)
#
# ax2.bar(x_c, y_cs / y_cs.max(), width=bar_width, alpha=0.5)
# ax2.plot(x_cs2, y_cs2, 'b', alpha=0.35, linewidth=3)
# ax2.plot(x_fit, cdf, 'g', linewidth=2)
# ax2.grid(True)
#
# ax.set_title(f"Fit results: mu = {mu:.4f}, std = {std:.4f}")
# ax.set_ylabel('PDF')
# ax.set_xlabel('T_cogg_pp in Nm')
#
# ax2.set_title(f"CDF me: {cdf_err:.2e}, samples used: {data_len:.0f}")
# ax2.set_ylabel('CDF')
# ax2.set_xlabel('T_cogg_pp in Nm')
#
# return x_fit, pdf, cdf, cdf_err