Source code for symtolerance2.toldata

import numpy as np

import scipy.stats as st
import matplotlib.pyplot as plt


[docs]class TolData: """ Class to process statistic data .. warning:: This class is under heavy construction and not consistently documented so far. No standardized way of using this class for postprocessing tolerance data or using it for creation of a online stop criterion for tolerance analysis is defined so far! """
[docs] def __init__(self, data, label=None): self.data = None self.size = None self.lim = None self.label = label self.update_data(data)
[docs] def update_data(self, data): """ overwrite previous data """ data = np.array(data) if 1 in data.shape: """shape==(1,x) or (x,1)""" data = data.flatten() if data.ndim == 1: self.data = data self.size = len(data) self.lim = (np.min(data), np.max(data)) else: print(f"\n+++'data' has shape {data.shape}, but only one-dimensional arrays are allowed so far!+++\n")
[docs] def get_worst_case(self): """ get the worst case value and its index within the :attr:`data` attribute of the :class:`TolData` class. Returns ------- (int, float): (index, value) of worst case entity """ wc_index = np.argsort(self.data)[-1] wc_value = self.data[wc_index] return wc_index, wc_value
[docs] def get_ecdf(self): """ calculate the empiric cumulative distribution function (ECDF) :math:`\widehat{F}` of the available data. Returns ------- (np.ndarray, np.ndarray): `(ecdf_q, ecdf)` quantile values `ecdf_q` and corresponding probability values `ecdf` of available data """ ecdf = (np.ones(self.size) / self.size).cumsum() ecdf_q = np.sort(self.data) return ecdf_q, ecdf
[docs] def fit_gauss(self): """ fit gauss (norm) distribution to self.data Returns ------- (mu,std): mean and standard deviation of fitted normal distribution """ mu, std = st.norm.fit(self.data) return mu, std
[docs] def fit_kde(self): """ fits a probability density function to self.data using kernel density estimation as implemented in the `scipy.stats module <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html#scipy.stats.gaussian_kde>`_ . This uses a Gaussian kernel and Scott's Rule (of thumb) for bandwidth selection. Returns ------- kernel: scipy.stats.gaussian_kde object """ kernel = st.gaussian_kde(self.data) return kernel
[docs] def get_fit_data(self, method='kde', fit_q=None, q_steps=100, verbose=1): """ calculate probability density function (PDF) and cumulative distribution function (CDF) based on fitted distribution. Parameters ---------- method: str method which is used to fit the distribution ['gauss','kde'], default 'kde' fit_q: array_like or None quantile values where PDF/CDF shall be evaluated, default 'None' --> automatic definition q_steps: int if fit_q==None, quantile range is chosen based on the available data and divided into `q_steps` steps verbose: int if verbose>0, info message can be printed, default 1 Returns ------- """ if fit_q is None: fit_q = np.linspace(self.lim[0], self.lim[1], q_steps) dyn_x_range = True else: # fit_q defined -> no dynamic adaption (for kde) dyn_x_range = False if method == "gauss": mu, std = self.fit_gauss() pdf = st.norm.pdf(fit_q, mu, std) cdf = st.norm.cdf(fit_q, mu, std) elif method == "kde": kern = self.fit_kde() datarange = self.lim[1]-self.lim[0] if dyn_x_range: adapt_cnt = 0 while kern.integrate_box_1d(fit_q[0], fit_q[-1]) < 0.999: fit_q = np.linspace(fit_q[0] - datarange / 20, fit_q[-1] + datarange / 20, q_steps) adapt_cnt += 1 if (adapt_cnt > 0) and (verbose > 0): print(f"fitted kde overlaps original data by approx. {adapt_cnt*datarange/20:.3e}!\n Evaluated on extended range!") pdf = kern.pdf(fit_q) cs = pdf.cumsum() cdf = cs/cs.max() else: print(f"method {method} not implemented") pdf = None cdf = None return fit_q, pdf, cdf
[docs] def run_batch_test(self, split=0.99, tests=1000): test_size = int(np.ceil(self.size * split)) # TODO: check if tests is lower than number of possible test_size out of self.size # -> if not - reduce split! #rng = np.random.default_rng() # numpy >= 1.17 # use np.random.choice() # numpy < 1.17 (SyMSpace) x_lst, cdf_lst, err_lst = [], [], [] rcdf_full_x, rcdf_full = self.get_ecdf() full_data = self.data for test in range(tests): #self.update_data(rng.choice(full_data, test_size)) self.update_data(np.random.choice(full_data, test_size)) x, _, c = self.get_fit_data(verbose=0) err, _ = TolData.get_cdf_diff_area([x, rcdf_full_x], [c, rcdf_full]) x_lst.append(x) cdf_lst.append(c) err_lst.append(err) self.update_data(full_data) c_trust_area, _ = TolData.get_cdf_diff_area(x_lst, cdf_lst) return x_lst, cdf_lst, c_trust_area, err_lst
[docs] def run_batch_test_conf(self, confidence=0.9, split=0.99, tests=1000): # returns confidence value and number of samples for which the value holds true test_size = int(np.ceil(self.size * split)) # TODO: check if tests is lower than number of possible test_size out of self.size # -> if not - reduce split! #rng = np.random.default_rng() # numpy >= 1.17 # use np.random.choice() # numpy < 1.17 (SyMSpace) err_lst = [] # cdf_full_x, cdf_full = self.get_ecdf() cdf_full_x, _, cdf_full = self.get_fit_data(verbose=0) full_data = self.data for test in range(tests): #self.update_data(rng.choice(full_data, test_size)) self.update_data(np.random.choice(full_data, test_size)) x, _, c = self.get_fit_data(verbose=0) err, _ = TolData.get_cdf_diff_area([x, cdf_full_x], [c, cdf_full]) err_lst.append(err) self.update_data(full_data) conv_td = TolData(np.array(err_lst)) return conv_td.get_quantile(confidence), test_size
[docs] def get_quantile(self, cdf_val, method='kde'): x,_,c = self.get_fit_data(method=method, verbose=0) return np.interp(cdf_val, c, x)
[docs] def get_F(self, quantile_val, method='kde'): x,_,c = self.get_fit_data(method=method, verbose=0) return np.interp(quantile_val, x, c)
[docs] def get_slope(self, p0=0.2, p1=0.8): q0 = self.get_quantile(p0) q1 = self.get_quantile(p1) slope = (p1-p0)/(q1-q0) return 1/slope, (q0, q1), (p0, p1)
[docs] @staticmethod def get_quantile_from_data(x,F,p): # todo: maybe add some checks about sorting, monotony, ... return np.interp(p, F, x)
# modified version from https://www.statsmodels.org/dev/_modules/statsmodels/distributions/empirical_distribution.html
[docs] def get_dkw_conf_set(self, method='empiric', alpha=0.05): r""" Modified version from https://www.statsmodels.org/dev/_modules/statsmodels/distributions/empirical_distribution.html Constructs a Dvoretzky-Kiefer-Wolfowitz confidence band for the eCDF. Parameters ---------- F : array_like The empirical distributions alpha : float Set alpha for a (1 - alpha) % confidence band, p=(1-alpha) Notes ----- Based on the DKW inequality. .. math:: P \left( \sup_x \left| F(x) - \hat(F)_n(X) \right| > \epsilon \right) \leq 2e^{-2n\epsilon^2} References ---------- Wasserman, L. 2006. `All of Nonparametric Statistics`. Springer. """ nobs = self.size # use number of empirical samples in any way! epsilon = TolData.get_dkw_epsilon(nobs, alpha) if method[:3].lower() =='emp': x,F = self.get_ecdf() else: x,_,F = self.get_fit_data(method=method, verbose=0) lower = np.clip(F - epsilon, 0, 1) upper = np.clip(F + epsilon, 0, 1) return lower, upper, x
[docs] def get_dkw_conf_quantiles(self, p=0.95, alpha=0.05, method='gauss_kde'): lower, upper, x = self.get_dkw_conf_set(method=method, alpha=alpha) q_lower = TolData.get_quantile_from_data(x, lower, p) q_upper = TolData.get_quantile_from_data(x, upper, p) q_F = self.get_quantile(p, method=method) return q_upper, q_F, q_lower
# def get_quantile(self, cdf_val, method='gauss_kde'): # x,_,c = self.get_fit_data(method=method, verbose=0) # return np.interp(cdf_val, c, x)
[docs] @staticmethod def get_dkw_epsilon(nobs, alpha): """ Dvoretzky - Kiefer - Wolfowitz confidence value :param nobs: :param alpha: float Set alpha for a (1 - alpha) % confidence band. :return: epsilon """ return np.sqrt(np.log(2. / alpha) / (2 * nobs))
[docs] @staticmethod def get_cdf_diff_area(x_lst, cdf_lst): # min/max Kurven berechnen # max-min aufintegrieren -> Vertrauensparameter! #x_interp = np.sort(np.array(x_lst).flatten()) x_interp = np.sort(np.concatenate(x_lst)) c_interp = [np.interp(x_interp, x, c) for x, c in zip(x_lst, cdf_lst)] c_interp = np.array(c_interp) c_max = c_interp.max(axis=0) c_min = c_interp.min(axis=0) c_trust = c_max - c_min c_trust_area = np.trapz(c_trust, x=x_interp) return c_trust_area, (x_interp, c_min, c_max)
[docs] @staticmethod def get_cdf_diff(x_lst, cdf_lst, area=True): # min/max Kurven berechnen # max-min aufintegrieren -> Vertrauensparameter! # x_interp = np.sort(np.array(x_lst).flatten()) if (len(x_lst) == 2) and area: fill_up_pts = 10 x0 = x_lst[0] x1 = x_lst[1] y0 = cdf_lst[0] y1 = cdf_lst[1] if x0[0] > x1[0]: # fill up x0 from left x0 = np.concatenate((np.linspace(x1[0], x0[0] - 1e-10, fill_up_pts), x0)) y0 = np.concatenate((np.zeros(fill_up_pts), y0)) else: x1 = np.concatenate((np.linspace(x0[0], x1[0] - 1e-10, fill_up_pts), x1)) y1 = np.concatenate((np.zeros(fill_up_pts), y1)) if x0[-1] < x1[-1]: # fill up x0 on right end x0 = np.concatenate((x0, np.linspace(x0[-1] + 1e-10, x1[-1], fill_up_pts))) y0 = np.concatenate((y0, np.ones(fill_up_pts))) else: x1 = np.concatenate((x1, np.linspace(x1[-1] + 1e-10, x0[-1], fill_up_pts))) y1 = np.concatenate((y1, np.ones(fill_up_pts))) x_lst = [x0, x1] cdf_lst = [y0, y1] x_interp = np.sort(np.concatenate(x_lst)) c_interp = [np.interp(x_interp, x, c) for x, c in zip(x_lst, cdf_lst)] c_interp = np.array(c_interp) c_max = c_interp.max(axis=0) c_min = c_interp.min(axis=0) c_trust = c_max - c_min c_trust_area = np.trapz(c_trust, x=x_interp) c_0 = c_interp[0, :] c_1 = c_interp[1, :] c_err = c_0 - c_1 return x_interp, c_err, c_trust_area
[docs] def plot_cdf(self, method=('ecdf', 'kde'), xlabel=None, ylabel='CDF', nominal_line=False, ax=None, **plotargs): """ Plot the cumulative distribution function of the underlying data :attr:`TolData.data` using matplotlib. Parameters ---------- method : array-like define the method which is used to evaluate the CDF. Possible values are `'ecdf'` to plot the empiric cumulative distribution function or one of the methods available in the :func:`get_fit_data` method. Default is `('ecdf', 'kde')` xlabel : str define custom xlabel, default :code:`None`. If default and :attr:`TolData.label` is defined, this label is used. ylabel : str define custom ylabel, default :code:`'CDF'` ax : matplotlib.axes define axes handle to force plotting to a certain axes plotargs kwargs forwarded to :func:`matplotlib.axes.plot` function Returns ------- matplotlib.axes handle to matplotlib axes for further customization Examples -------- .. plot:: :include-source: import numpy as np from symtolerance2.toldata import TolData import matplotlib.pyplot as plt # create normal distributed random values data = np.random.normal(size=100) td = TolData(data, label='random values') td.plot_cdf() """ if (xlabel is None) and (self.label is not None): xlabel = self.label legend_label = None if "label" in plotargs: legend_label = plotargs.pop('label') title_str = 'cumulative distribution function (CDF)' if "title" in plotargs: title_str = plotargs.pop('title') if ax is None: fig, ax = plt.subplots() if 'ecdf' in method: x, y = self.get_ecdf() ax.plot(x, y, label=legend_label if legend_label is not None else 'ECDF', **plotargs) if 'kde' in method: x, _, y = self.get_fit_data() ax.plot(x, y, label=legend_label if legend_label is not None else 'KDE', **plotargs) if 'gauss' in method: x, _, y = self.get_fit_data(method='gauss') ax.plot(x, y, label=legend_label if legend_label is not None else 'Gauss', **plotargs) if nominal_line: ax.plot(np.ones(2)*self.data[0], [0, 1], color=ax.get_lines()[-1].get_color()) ax.set_title(title_str) ax.legend() ax.grid(True) if xlabel is not None: ax.set_xlabel(xlabel, labelpad=-2) if ylabel is not None: ax.set_ylabel(ylabel) return ax
# def fit_density_function(data, show_plots=False, bins=10): # # data as falttened np.array # # bins has no influence on fitting, for fitting raw data vector is used! # data_len = len(data) # y, x = np.histogram(data, bins=bins) # xmin, xmax = x[0], x[-1] # # normalize histogram to have an area integral of one # density_fact = data_len / (1 / ((xmax - xmin) / bins)) # y = y / density_fact # # center coordinates of histogram bars # x_c = (x + np.roll(x, -1))[:-1] / 2 # # # fit normal distribution # mu, std = st.norm.fit(data) # # # evaluate fitted distribution # x_fit = np.linspace(xmin, xmax, 100) # pdf = st.norm.pdf(x_fit, mu, std) # cdf = st.norm.cdf(x_fit, mu, std) # # # calculate cummulated density function histogram data # y_cs = np.cumsum(y) # # # calculate cummulated density function from raw data # y_cs2 = (np.ones(data_len) / data_len).cumsum() # x_cs2 = np.sort(data) # # # TODO: error should be based on x_values, not on y_values! # # cdf_err = np.sum((st.norm.cdf(x_cs2, mu, std)-y_cs2)**2)/data_len # cdf_err = np.sum(np.abs(st.norm.cdf(x_cs2, mu, std) - y_cs2)) / data_len # # # plot all data: # if show_plots: # fig, (ax, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5)) # bar_width = (xmax - xmin) / bins * 0.9 # ax.bar(x_c, y, width=bar_width, alpha=0.5) # ax.grid(True) # ax.set_xlim([xmin, xmax]) # ax.plot(x_fit, pdf, 'g', linewidth=2) # # ax2.bar(x_c, y_cs / y_cs.max(), width=bar_width, alpha=0.5) # ax2.plot(x_cs2, y_cs2, 'b', alpha=0.35, linewidth=3) # ax2.plot(x_fit, cdf, 'g', linewidth=2) # ax2.grid(True) # # ax.set_title(f"Fit results: mu = {mu:.4f}, std = {std:.4f}") # ax.set_ylabel('PDF') # ax.set_xlabel('T_cogg_pp in Nm') # # ax2.set_title(f"CDF me: {cdf_err:.2e}, samples used: {data_len:.0f}") # ax2.set_ylabel('CDF') # ax2.set_xlabel('T_cogg_pp in Nm') # # return x_fit, pdf, cdf, cdf_err