/*
  STATISTIC.hpp - helpers for generic statistics calculations

  Copyright (C)
    2022,            Hermann Vosseler <Ichthyostega@web.de>

  **Lumiera** is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
  Free Software Foundation; either version 2 of the License, or (at your
  option) any later version. See the file COPYING for further details.

*/


#ifndef LIB_STAT_STATISTIC_H
#define LIB_STAT_STATISTIC_H


#include "lib/error.hpp"
#include "lib/nocopy.hpp"
#include "lib/iter-adapter.hpp"
#include "lib/format-string.hpp"
#include "lib/util.hpp"

#include <utility>
#include <vector>
#include <array>
#include <tuple>
#include <cmath>

namespace lib {
namespace stat {

  namespace error = lumiera::error;

  using std::fabs;
  using std::array;
  using std::tuple;
  using std::make_tuple;
  using std::forward;
  using std::move;
  using util::min;
  using util::max;
  using util::isnil;
  using util::_Fmt;

  using VecD = std::vector<double>;


  template<typename TUP>
  constexpr auto
  array_from_tuple (TUP&& tuple)
  {
    constexpr auto makeArray = [](auto&& ... x)
                                  {
                                    return std::array{forward<decltype(x)> (x) ...};
                                  };
    return std::apply (makeArray, forward<TUP> (tuple));
  }
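  /* Usage sketch (illustrative, not part of the original header): all tuple
   * elements must share one common type, since std::array class template
   * argument deduction is used.
   *
   *     auto arr = array_from_tuple (std::make_tuple (1.0, 2.0, 3.0));
   *     static_assert (std::is_same_v<decltype(arr), std::array<double,3>>);
   */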

  template<size_t places>
  inline double
  round (double val)
  {
    constexpr double shift{pow (10.0, places)};
    return std::round (val*shift) / shift;
  }
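  /* Usage sketch (illustrative): rounding to a fixed number of decimal places,
   * e.g. round<3>(2.0/3) yields 0.667 and round<0>(3.7) yields 4.0.
   * Note that evaluating pow() in a constexpr initialiser relies on a
   * compiler extension (standard C++ does not declare std::pow constexpr).
   */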



  /** Read-only view into a segment within a sequence of data */
  template<typename D>
  class DataSpan
    {
      const D* const b_{nullptr};
      const D* const e_{nullptr};

    public:
      DataSpan() = default;
      DataSpan (D const& begin, D const& end)
        : b_{&begin}
        , e_{&end}
        {
          if (e_ < b_)
            throw error::Invalid{"End point before begin."};
        }

      template<class CON>
      DataSpan (CON const& container)
        : DataSpan{*std::begin(container), *std::end(container)}
        { }


      using iterator = const D*;
      using const_iterator = iterator;

      size_t size()  const { return e_ - b_; }
      bool   empty() const { return b_ == e_; }

      iterator begin() const { return b_; }
      iterator end()   const { return e_; }
      friend const_iterator begin (DataSpan const& span) { return span.begin(); }
      friend const_iterator end   (DataSpan const& span) { return span.end();   }

      D const& operator[] (size_t i)  const { return *(b_ + i); }
      D const& at (size_t i)  const
        {
          if (i >= size())
            throw error::Invalid{_Fmt{"Index %d beyond size=%d"}
                                     % i % size()};
          return this->operator[] (i);
        }
    };

  /** Deduction guide: infer the element type from a container argument. */
  template<class CON>
  DataSpan (CON const& container) -> DataSpan<typename CON::value_type>;

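  /* Usage sketch (illustrative): a DataSpan is a lightweight, non-owning view;
   * the referred storage must outlive the span.
   *
   *     std::vector<double> data{1.0, 2.0, 3.0, 4.0};
   *     DataSpan span{data};                 // deduced as DataSpan<double>
   *     double first = span[0];              // unchecked access
   *     double last  = span.at(3);           // bounds-checked access
   */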




  template<typename... NUMS>
  inline double
  errorSum (NUMS ...vals)
  {
    auto sqr = [](auto val){ return val*val; };
    return sqrt ((sqr(vals) + ... + 0.0));
  }
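  /* Usage sketch (illustrative): combines individual error contributions in
   * quadrature, i.e. errorSum(e1,e2,…) = √(e1² + e2² + …),
   * so errorSum(3.0, 4.0) yields 5.0.
   */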



  template<typename D>
  inline double
  average (DataSpan<D> const& data)
  {
    if (isnil (data)) return 0.0;
    double sum = 0.0;
    for (auto val : data)
      sum += val;
    return sum / data.size();
  }

  template<typename D>
  inline double
  sdev (DataSpan<D> const& data, D mean)
  {
    if (isnil (data)) return 0.0;
    double sdev = 0.0;
    for (auto val : data)
      {
        D offset = val - mean;
        sdev += offset*offset;
      }
    size_t n = data.size();
    sdev /= n<2? 1 : n-1;
    return sqrt (sdev);
  }

  inline double
  sdev (VecD const& data, double mean)
  {
    return sdev (DataSpan<double>{data}, mean);
  }
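  /* Usage sketch (illustrative): sdev() applies Bessel's correction (divide
   * by n-1), i.e. it yields the sample standard deviation around a given mean.
   *
   *     VecD data{1.0, 2.0, 3.0, 4.0, 5.0};
   *     double mean = average (DataSpan<double>{data});   // == 3.0
   *     double dev  = sdev (data, mean);                  // == sqrt(10.0/4) ≈ 1.58
   */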



  inline DataSpan<double>
  lastN (VecD const& data, size_t n)
  {
    n = min (n, data.size());
    size_t oldest = data.size() - n;
    return DataSpan<double>{data[oldest], *data.end()};
  }

  inline double
  averageLastN (VecD const& data, size_t n)
  {
    return average (lastN (data,n));
  }

  inline double
  sdevLastN (VecD const& data, size_t n, double mean)
  {
    return sdev (lastN (data,n), mean);
  }
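  /* Usage sketch (illustrative): these helpers operate on a window over the
   * most recent n values of a data series.
   *
   *     VecD series{1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
   *     double tailAvg = averageLastN (series, 3);   // average of {4,5,6} == 5.0
   */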


  template<typename D>
  inline auto
  computeStatSums (DataSpan<D> const& series)
  {
    double ysum  = 0.0;
    double yysum = 0.0;
    double xysum = 0.0;
    size_t x = 0;
    for (auto& y : series)
      {
        ysum  += y;
        yysum += y*y;
        xysum += x*y;
        ++x;
      }
    return make_tuple (ysum, yysum, xysum);
  }


  /** Single data point used for linear regression */
  struct RegressionPoint
    {
      double x;
      double y;
      double w;

      RegressionPoint (double vx, double vy, double vw =1.0)
        : x{vx}
        , y{vy}
        , w{vw}
        { }
    };

  using RegressionData = std::vector<RegressionPoint>;
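  /* Usage sketch (illustrative): each measurement point may carry an optional
   * weight; the default weight is 1.0.
   *
   *     RegressionData points;
   *     points.emplace_back (0.0, 1.1);          // (x,y) with weight 1.0
   *     points.emplace_back (1.0, 2.9, 2.0);     // double weight for this point
   */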


  inline auto
  computeWeightedStatSums (DataSpan<RegressionPoint> const& points)
  {
    std::array<double,6> sums;
    sums.fill (0.0);
    auto& [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = sums;
    for (auto& p : points)
      {
        wsum   += p.w;
        wxsum  += p.w * p.x;
        wysum  += p.w * p.y;
        wxxsum += p.w * p.x*p.x;
        wyysum += p.w * p.y*p.y;
        wxysum += p.w * p.x*p.y;
      }
    return sums;
  }

  inline auto
  computeLinearRegression (DataSpan<RegressionPoint> const& points)
  {
    auto [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = computeWeightedStatSums (points);

    double xm = wxsum / wsum;                                      // weighted mean x = 1/Σw · Σwx
    double ym = wysum / wsum;
    double varx = wxxsum + xm*xm * wsum - 2*xm * wxsum;            // Σw · x-Variance = Σw(x-xm)²
    double vary = wyysum + ym*ym * wsum - 2*ym * wysum;
    double cova = wxysum + xm*ym * wsum - ym * wxsum - xm * wysum; // Σw · Covariance = Σw(x-xm)(y-ym)

    // Linear Regression minimising σ²
    double gradient = cova / varx;                                 // gradient = correlation · σy / σx ; σ = √Variance
    double socket   = ym - gradient * xm;                          // regression line: Y-ym = gradient · (x-xm) ; set x≔0 yields socket

    // Correlation (Pearson's r)
    double correlation = wyysum==0.0? 1.0 : gradient * sqrt (varx/vary);

    // calculate error Δ for all measurement points
    size_t n = points.size();
    VecD predicted; predicted.reserve (n);
    VecD deltas;    deltas.reserve (n);
    double maxDelta = 0.0;
    double variance = 0.0;
    for (auto& p : points)
      {
        double y_pred = socket + gradient * p.x;
        double delta  = p.y - y_pred;
        predicted.push_back (y_pred);
        deltas.push_back (delta);
        maxDelta = max (maxDelta, fabs (delta));
        variance += p.w * delta*delta;
      }
    variance /= wsum * (n<=2? 1 : (n-2)/double(n));                // N-2 because it's an estimation,
                                                                   // based on 2 other estimated values (socket,gradient)
    return make_tuple (socket, gradient
                      ,move(predicted)
                      ,move(deltas)
                      ,correlation
                      ,maxDelta
                      ,sqrt(variance)
                      );
  }

  inline auto
  computeLinearRegression (RegressionData const& points)
  {
    return computeLinearRegression (DataSpan<RegressionPoint>{points});
  }
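  /* Usage sketch (illustrative): the result tuple holds socket (y-intercept),
   * gradient, the predicted values, the per-point deltas, Pearson's correlation,
   * the maximum absolute delta and the standard deviation of the residuals.
   * For the exactly linear points (0,1),(1,3),(2,5) this yields socket==1,
   * gradient==2 and correlation==1.
   *
   *     RegressionData points{{0,1},{1,3},{2,5}};
   *     auto [socket,gradient,predicted,deltas,correlation,maxDelta,stdev]
   *            = computeLinearRegression (points);
   */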



  template<typename D>
  inline auto
  computeTimeSeriesLinearRegression (DataSpan<D> const& series)
  {
    if (series.size() < 2) return make_tuple (0.0,0.0,0.0);

    auto [ysum, yysum, xysum] = computeStatSums (series);

    size_t n = series.size();
    double im = (n-1)/2.0;                   // mean of zero-based indices i ∈ {0 … n-1}
    double ym = ysum / n;                    // mean y
    double varx = (n-1)*(n+1)/12.0;          // variance of zero-based indices Σ(i-im)² / n
    double vary = yysum/n - ym*ym;           // variance of data values Σ(y-ym)² / n
    double cova = xysum - ysum*(n-1)/2;      // time series covariance = Σ(i-im)(y-ym) = Σiy + im·ym·n - ym·Σi - im·Σy ; use n·ym ≙ Σy

    // Linear Regression minimising σ²
    double gradient = cova / (n*varx);       // gradient = correlation · σy / σx ; σ = √Variance ; correlation = covariance / (√Σ(i-im)² · √Σ(y-ym)²)
    double socket   = ym - gradient * im;    // regression line: Y-ym = gradient · (i-im) ; set i≔0 yields socket

    // Correlation (Pearson's r)
    double correlation = yysum==0.0? 1.0 : gradient * sqrt (varx/vary);
    return make_tuple (socket, gradient, correlation);
  }

  inline auto
  computeTimeSeriesLinearRegression (VecD const& series)
  {
    return computeTimeSeriesLinearRegression (DataSpan<double>{series});
  }
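  /* Usage sketch (illustrative): the x values are the implicit indices 0…n-1
   * of the data points, e.g. consecutive measurements over time.
   * For the exactly linear series {1,3,5} this yields socket==1, gradient==2
   * and correlation==1.
   *
   *     VecD series{1.0, 3.0, 5.0};
   *     auto [socket,gradient,correlation] = computeTimeSeriesLinearRegression (series);
   */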

}} // namespace lib::stat
#endif /*LIB_STAT_STATISTIC_H*/