Lumiera  0.pre.03
»edit your freedom«
statistic.hpp
1 /*
2  STATISTIC.hpp - helpers for generic statistics calculations
3 
4  Copyright (C) Lumiera.org
5  2022, Hermann Vosseler <Ichthyostega@web.de>
6 
7  This program is free software; you can redistribute it and/or
8  modify it under the terms of the GNU General Public License as
9  published by the Free Software Foundation; either version 2 of
10  the License, or (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with this program; if not, write to the Free Software
19  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 
21 */
22 
23 
34 #ifndef LIB_STAT_STATISTIC_H
35 #define LIB_STAT_STATISTIC_H
36 
37 
38 #include "lib/error.hpp"
39 #include "lib/nocopy.hpp"
40 #include "lib/iter-adapter.hpp"
41 #include "lib/format-string.hpp"
42 #include "lib/util.hpp"
43 
44 #include <utility>
45 #include <vector>
46 #include <array>
47 #include <tuple>
48 #include <cmath>
49 
50 namespace lib {
51 namespace stat{
52 
53  namespace error = lumiera::error;
54 
55  using std::fabs;
56  using std::array;
57  using std::tuple;
58  using std::make_tuple;
59  using std::forward;
60  using std::move;
61  using util::min;
62  using util::max;
63  using util::isnil;
64  using util::_Fmt;
65 
66  using VecD = std::vector<double>;
67 
68 
69 
71  template<typename TUP>
72  constexpr auto
73  array_from_tuple (TUP&& tuple)
74  {
75  constexpr auto makeArray = [](auto&& ... x)
76  {
77  return std::array{forward<decltype(x)> (x) ...};
78  };
79  return std::apply (makeArray, forward<TUP> (tuple));
80  }
81 
82  template<size_t places>
83  inline double
84  round (double val)
85  {
86  constexpr double shift{pow(10.0, places)};
87  return std::round(val*shift) / shift;
88  }
89 
90 
91 
92 
99  template<typename D>
100  class DataSpan
102  {
103  const D* const b_{nullptr};
104  const D* const e_{nullptr};
105 
106  public:
107  DataSpan() = default;
108  DataSpan (D const& begin, D const& end)
109  : b_{&begin}
110  , e_{&end}
111  {
112  if (e_ < b_)
113  throw error::Invalid{"End point before begin."};
114  }
115 
116  template<class CON>
117  DataSpan (CON const& container)
118  : DataSpan{*std::begin(container), *std::end(container)}
119  { }
120 
121 
122  using iterator = const D*;
123  using const_iterator = iterator;
124 
125  size_t size() const { return e_ - b_; }
126  bool empty() const { return b_ == e_;}
127 
128  iterator begin() const { return b_; }
129  iterator end() const { return e_; }
130  friend const_iterator begin (DataSpan const& span){ return span.begin();}
131  friend const_iterator end (DataSpan const& span){ return span.end(); }
132 
133  D const& operator[](size_t i) const { return *(b_ + i); }
134  D const& at(size_t i) const
135  {
136  if (i >= size())
137  throw error::Invalid{_Fmt{"Index %d beyond size=%d"}
138  % i % size()};
139  return this->operator[](i);
140  }
141  };
142 
144  template<class CON>
146 
147 
148 
149 
150 
152  template<typename... NUMS>
153  inline double
154  errorSum (NUMS ...vals)
155  {
156  auto sqr = [](auto val){ return val*val; };
157  return sqrt((sqr(vals)+ ... + 0.0));
158  }
159 
160 
161 
162  template<typename D>
163  inline double
164  average (DataSpan<D> const& data)
165  {
166  if (isnil(data)) return 0.0;
167  double sum = 0.0;
168  for (auto val : data)
169  sum += val;
170  return sum / data.size();
171  }
172 
173  template<typename D>
174  inline double
175  sdev (DataSpan<D> const& data, D mean)
176  {
177  if (isnil(data)) return 0.0;
178  double sdev = 0.0;
179  for (auto val : data)
180  {
181  D offset = val - mean;
182  sdev += offset*offset;
183  }
184  size_t n = data.size();
185  sdev /= n<2? 1: n-1;
186  return sqrt (sdev);
187  }
188 
189  inline double
190  sdev (VecD const& data, double mean)
191  {
192  return sdev(DataSpan<double>{data}, mean);
193  }
194 
195 
196 
197  inline DataSpan<double>
198  lastN (VecD const& data, size_t n)
199  {
200  n = min (n, data.size());
201  size_t oldest = data.size() - n;
202  return DataSpan<double>{data[oldest], *data.end()};
203  }
204 
205  inline double
206  averageLastN (VecD const& data, size_t n)
207  {
208  return average (lastN (data,n));
209  }
210 
211  inline double
212  sdevLastN (VecD const& data, size_t n, double mean)
213  {
214  return sdev (lastN (data,n), mean);
215  }
216 
217 
219  template<typename D>
220  inline auto
221  computeStatSums (DataSpan<D> const& series)
222  {
223  double ysum = 0.0;
224  double yysum = 0.0;
225  double xysum = 0.0;
226  size_t x = 0;
227  for (auto& y : series)
228  {
229  ysum += y;
230  yysum += y*y;
231  xysum += x*y;
232  ++x;
233  }
234  return make_tuple (ysum,yysum, xysum);
235  }
236 
237 
244  {
245  double x;
246  double y;
247  double w;
248 
249  RegressionPoint (double vx, double vy, double vw=1.0)
250  : x{vx}
251  , y{vy}
252  , w{vw}
253  { }
254  };
255 
256  using RegressionData = std::vector<RegressionPoint>;
257 
258 
260  inline auto
261  computeWeightedStatSums (DataSpan<RegressionPoint> const& points)
262  {
263  std::array<double,6> sums;
264  sums.fill(0.0);
265  auto& [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = sums;
266  for (auto& p : points)
267  {
268  wsum += p.w;
269  wxsum += p.w * p.x;
270  wysum += p.w * p.y;
271  wxxsum += p.w * p.x*p.x;
272  wyysum += p.w * p.y*p.y;
273  wxysum += p.w * p.x*p.y;
274  }
275  return sums;
276  }
277 
290  inline auto
291  computeLinearRegression (DataSpan<RegressionPoint> const& points)
292  {
293  auto [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = computeWeightedStatSums(points);
294 
295  double xm = wxsum / wsum; // weighted mean x = 1/Σw · Σwx
296  double ym = wysum / wsum;
297  double varx = wxxsum + xm*xm * wsum - 2*xm * wxsum; // Σw · x-Variance = Σw(x-xm)²
298  double vary = wyysum + ym*ym * wsum - 2*ym * wysum;
299  double cova = wxysum + xm*ym * wsum - ym * wxsum - xm * wysum; // Σw · Covariance = Σw(x-xm)(y-ym)
300 
301  // Linear Regression minimising σ²
302  double gradient = cova / varx; // gradient = correlation · σy / σx ; σ = √Variance
303  double socket = ym - gradient * xm; // Regression line: Y-ym = gradient · (x-xm) ; set x≔0 yields socket
304 
305  // Correlation (Pearson's r)
306  double correlation = wyysum==0.0? 1.0 : gradient * sqrt(varx/vary);
307 
308  // calculate error Δ for all measurement points
309  size_t n = points.size();
310  VecD predicted; predicted.reserve(n);
311  VecD deltas; deltas.reserve(n);
312  double maxDelta = 0.0;
313  double variance = 0.0;
314  for (auto& p : points)
315  {
316  double y_pred = socket + gradient * p.x;
317  double delta = p.y - y_pred;
318  predicted.push_back (y_pred);
319  deltas.push_back (delta);
320  maxDelta = max (maxDelta, fabs(delta));
321  variance += p.w * delta*delta;
322  }
323  variance /= wsum * (n<=2? 1 : (n-2)/double(n)); // N-2 because it's an estimation,
324  // based on 2 other estimated values (socket,gradient)
325  return make_tuple (socket,gradient
326  ,move(predicted)
327  ,move(deltas)
328  ,correlation
329  ,maxDelta
330  ,sqrt(variance)
331  );
332  }
333 
334  inline auto
335  computeLinearRegression (RegressionData const& points)
336  {
337  return computeLinearRegression (DataSpan<RegressionPoint>{points});
338  }
339 
340 
341 
351  template<typename D>
352  inline auto
353  computeTimeSeriesLinearRegression (DataSpan<D> const& series)
354  {
355  if (series.size() < 2) return make_tuple(0.0,0.0,0.0);
356 
357  auto [ysum,yysum, xysum] = computeStatSums(series);
358 
359  size_t n = series.size();
360  double im = (n-1)/2.0; // mean of zero-based indices i ∈ {0 … n-1}
361  double ym = ysum / n; // mean y
362  double varx = (n-1)*(n+1)/12.0; // variance of zero-based indices Σ(i-im)² / n
363  double vary = yysum/n - ym*ym; // variance of data values Σ(y-ym)² / n
364  double cova = xysum - ysum *(n-1)/2; // Time series Covariance = Σ(i-im)(y-ym) = Σiy + im·ym·n - ym·Σi - im·Σy; use n·ym ≙ Σy
365 
366  // Linear Regression minimising σ²
367  double gradient = cova / (n*varx); // Gradient = Correlation · σy / σx ; σ = √Variance; Correlation = Covariance /(√Σx √Σy)
368  double socket = ym - gradient * im; // Regression line: Y-ym = Gradient · (i-im) ; set i≔0 yields socket
369 
370  // Correlation (Pearson's r)
371  double correlation = yysum==0.0? 1.0 : gradient * sqrt(varx/vary);
372  return make_tuple (socket,gradient,correlation);
373  }
374 
375  inline auto
376  computeTimeSeriesLinearRegression (VecD const& series)
377  {
378  return computeTimeSeriesLinearRegression (DataSpan<double>{series});
379  }
380 
381 }} // namespace lib::stat
382 #endif /*LIB_STAT_STATISTIC_H*/
Helper template(s) for creating Lumiera Forward Iterators.
Front-end for printf-style string template interpolation.
A front-end for using printf-style formatting.
Implementation namespace for support and library code.
Read-only view into a segment within a sequence of data.
Definition: statistic.hpp:100
Derived specific exceptions within Lumiera&#39;s exception hierarchy.
Definition: error.hpp:199
Mix-Ins to allow or prohibit various degrees of copying and cloning.
Tiny helper functions and shortcuts to be used everywhere Consider this header to be effectively incl...
Types marked with this mix-in may be created by copy-construction (or move construction), but may be not reassigned thereafter.
Definition: nocopy.hpp:91
Lumiera error handling (C++ interface).
Single data point used for linear regression.
Definition: statistic.hpp:243