/*
  STATISTIC.hpp - helpers for generic statistics calculations

  Copyright (C)
    2022,            Hermann Vosseler <Ichthyostega@web.de>

  **Lumiera** is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
  Free Software Foundation; either version 2 of the License, or (at your
  option) any later version. See the file COPYING for further details.

*/


#ifndef LIB_STAT_STATISTIC_H
#define LIB_STAT_STATISTIC_H


#include "lib/error.hpp"
#include "lib/nocopy.hpp"
#include "lib/iter-adapter.hpp"
#include "lib/format-string.hpp"
#include "lib/util.hpp"

#include <utility>
#include <vector>
#include <array>
#include <tuple>
#include <cmath>

namespace lib {
namespace stat {

  namespace error = lumiera::error;

  using std::fabs;
  using std::array;
  using std::tuple;
  using std::make_tuple;
  using std::forward;
  using std::move;
  using util::min;
  using util::max;
  using util::isnil;
  using util::_Fmt;

  using VecD = std::vector<double>;


  template<typename TUP>
  constexpr auto
  array_from_tuple (TUP&& tuple)
  {
    constexpr auto makeArray = [](auto&& ... x)
                                  {
                                    return std::array{forward<decltype(x)> (x) ...};
                                  };
    return std::apply (makeArray, forward<TUP> (tuple));
  }
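  /* Usage sketch (illustrative, not part of the original header): all tuple
   * elements must share one common type, since std::array class template
   * argument deduction is used.
   *
   *     auto arr = array_from_tuple (std::make_tuple (1.0, 2.0, 3.0));
   *     static_assert (std::is_same_v<decltype(arr), std::array<double,3>>);
   */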

  template<size_t places>
  inline double
  round (double val)
  {
    constexpr double shift{pow (10.0, places)};
    return std::round (val*shift) / shift;
  }
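  /* Usage sketch (illustrative): rounding to a fixed number of decimal places,
   * e.g. round<3>(2.0/3) yields 0.667 and round<0>(3.7) yields 4.0.
   * Note that evaluating pow() in a constexpr initialiser relies on a
   * compiler extension (standard C++ does not declare std::pow constexpr).
   */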



  /** Read-only view into a segment within a sequence of data */
  template<typename D>
  class DataSpan
    {
      const D* const b_{nullptr};
      const D* const e_{nullptr};

    public:
      DataSpan() = default;
      DataSpan (D const& begin, D const& end)
        : b_{&begin}
        , e_{&end}
        {
          if (e_ < b_)
            throw error::Invalid{"End point before begin."};
        }

      template<class CON>
      DataSpan (CON const& container)
        : DataSpan{*std::begin(container), *std::end(container)}
        { }


      using iterator = const D*;
      using const_iterator = iterator;

      size_t size()  const { return e_ - b_; }
      bool   empty() const { return b_ == e_; }

      iterator begin() const { return b_; }
      iterator end()   const { return e_; }
      friend const_iterator begin (DataSpan const& span) { return span.begin(); }
      friend const_iterator end   (DataSpan const& span) { return span.end();   }

      D const& operator[] (size_t i)  const { return *(b_ + i); }
      D const& at (size_t i)  const
        {
          if (i >= size())
            throw error::Invalid{_Fmt{"Index %d beyond size=%d"}
                                     % i % size()};
          return this->operator[] (i);
        }
    };

  /** Deduction guide: infer the element type from a container argument. */
  template<class CON>
  DataSpan (CON const& container) -> DataSpan<typename CON::value_type>;

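  /* Usage sketch (illustrative): a DataSpan is a lightweight, non-owning view;
   * the referred storage must outlive the span.
   *
   *     std::vector<double> data{1.0, 2.0, 3.0, 4.0};
   *     DataSpan span{data};                 // deduced as DataSpan<double>
   *     double first = span[0];              // unchecked access
   *     double last  = span.at(3);           // bounds-checked access
   */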




  template<typename... NUMS>
  inline double
  errorSum (NUMS ...vals)
  {
    auto sqr = [](auto val){ return val*val; };
    return sqrt ((sqr(vals) + ... + 0.0));
  }
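  /* Usage sketch (illustrative): combines individual error contributions in
   * quadrature, i.e. errorSum(e1,e2,…) = √(e1² + e2² + …),
   * so errorSum(3.0, 4.0) yields 5.0.
   */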



  template<typename D>
  inline double
  average (DataSpan<D> const& data)
  {
    if (isnil (data)) return 0.0;
    double sum = 0.0;
    for (auto val : data)
      sum += val;
    return sum / data.size();
  }

  template<typename D>
  inline double
  sdev (DataSpan<D> const& data, D mean)
  {
    if (isnil (data)) return 0.0;
    double sdev = 0.0;
    for (auto val : data)
      {
        D offset = val - mean;
        sdev += offset*offset;
      }
    size_t n = data.size();
    sdev /= n<2? 1 : n-1;
    return sqrt (sdev);
  }

  inline double
  sdev (VecD const& data, double mean)
  {
    return sdev (DataSpan<double>{data}, mean);
  }
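  /* Usage sketch (illustrative): sdev() applies Bessel's correction (divide
   * by n-1), i.e. it yields the sample standard deviation around a given mean.
   *
   *     VecD data{1.0, 2.0, 3.0, 4.0, 5.0};
   *     double mean = average (DataSpan<double>{data});   // == 3.0
   *     double dev  = sdev (data, mean);                  // == sqrt(10.0/4) ≈ 1.58
   */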



  inline DataSpan<double>
  lastN (VecD const& data, size_t n)
  {
    n = min (n, data.size());
    size_t oldest = data.size() - n;
    return DataSpan<double>{data[oldest], *data.end()};
  }

  inline double
  averageLastN (VecD const& data, size_t n)
  {
    return average (lastN (data,n));
  }

  inline double
  sdevLastN (VecD const& data, size_t n, double mean)
  {
    return sdev (lastN (data,n), mean);
  }
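  /* Usage sketch (illustrative): these helpers operate on a window over the
   * most recent n values of a data series.
   *
   *     VecD series{1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
   *     double tailAvg = averageLastN (series, 3);   // average of {4,5,6} == 5.0
   */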


  template<typename D>
  inline auto
  computeStatSums (DataSpan<D> const& series)
  {
    double ysum  = 0.0;
    double yysum = 0.0;
    double xysum = 0.0;
    size_t x = 0;
    for (auto& y : series)
      {
        ysum  += y;
        yysum += y*y;
        xysum += x*y;
        ++x;
      }
    return make_tuple (ysum, yysum, xysum);
  }


  /** Single data point used for linear regression */
  struct RegressionPoint
    {
      double x;
      double y;
      double w;

      RegressionPoint (double vx, double vy, double vw =1.0)
        : x{vx}
        , y{vy}
        , w{vw}
        { }
    };

  using RegressionData = std::vector<RegressionPoint>;
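  /* Usage sketch (illustrative): each measurement point may carry an optional
   * weight; the default weight is 1.0.
   *
   *     RegressionData points;
   *     points.emplace_back (0.0, 1.1);          // (x,y) with weight 1.0
   *     points.emplace_back (1.0, 2.9, 2.0);     // double weight for this point
   */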


  inline auto
  computeWeightedStatSums (DataSpan<RegressionPoint> const& points)
  {
    std::array<double,6> sums;
    sums.fill (0.0);
    auto& [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = sums;
    for (auto& p : points)
      {
        wsum   += p.w;
        wxsum  += p.w * p.x;
        wysum  += p.w * p.y;
        wxxsum += p.w * p.x*p.x;
        wyysum += p.w * p.y*p.y;
        wxysum += p.w * p.x*p.y;
      }
    return sums;
  }

  inline auto
  computeLinearRegression (DataSpan<RegressionPoint> const& points)
  {
    auto [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = computeWeightedStatSums (points);

    double xm = wxsum / wsum;                                      // weighted mean x = 1/Σw · Σwx
    double ym = wysum / wsum;
    double varx = wxxsum + xm*xm * wsum - 2*xm * wxsum;            // Σw · x-Variance = Σw(x-xm)²
    double vary = wyysum + ym*ym * wsum - 2*ym * wysum;
    double cova = wxysum + xm*ym * wsum - ym * wxsum - xm * wysum; // Σw · Covariance = Σw(x-xm)(y-ym)

    // Linear Regression minimising σ²
    double gradient = cova / varx;                                 // gradient = correlation · σy / σx ; σ = √Variance
    double socket   = ym - gradient * xm;                          // regression line: Y-ym = gradient · (x-xm) ; set x≔0 yields socket

    // Correlation (Pearson's r)
    double correlation = wyysum==0.0? 1.0 : gradient * sqrt (varx/vary);

    // calculate error Δ for all measurement points
    size_t n = points.size();
    VecD predicted; predicted.reserve (n);
    VecD deltas;    deltas.reserve (n);
    double maxDelta = 0.0;
    double variance = 0.0;
    for (auto& p : points)
      {
        double y_pred = socket + gradient * p.x;
        double delta  = p.y - y_pred;
        predicted.push_back (y_pred);
        deltas.push_back (delta);
        maxDelta = max (maxDelta, fabs (delta));
        variance += p.w * delta*delta;
      }
    variance /= wsum * (n<=2? 1 : (n-2)/double(n));                // N-2 because it's an estimation,
                                                                   // based on 2 other estimated values (socket,gradient)
    return make_tuple (socket, gradient
                      ,move(predicted)
                      ,move(deltas)
                      ,correlation
                      ,maxDelta
                      ,sqrt(variance)
                      );
  }

  inline auto
  computeLinearRegression (RegressionData const& points)
  {
    return computeLinearRegression (DataSpan<RegressionPoint>{points});
  }
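  /* Usage sketch (illustrative): the result tuple holds socket (y-intercept),
   * gradient, the predicted values, the per-point deltas, Pearson's correlation,
   * the maximum absolute delta and the standard deviation of the residuals.
   * For the exactly linear points (0,1),(1,3),(2,5) this yields socket==1,
   * gradient==2 and correlation==1.
   *
   *     RegressionData points{{0,1},{1,3},{2,5}};
   *     auto [socket,gradient,predicted,deltas,correlation,maxDelta,stdev]
   *            = computeLinearRegression (points);
   */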



  template<typename D>
  inline auto
  computeTimeSeriesLinearRegression (DataSpan<D> const& series)
  {
    if (series.size() < 2) return make_tuple (0.0,0.0,0.0);

    auto [ysum, yysum, xysum] = computeStatSums (series);

    size_t n = series.size();
    double im = (n-1)/2.0;                   // mean of zero-based indices i ∈ {0 … n-1}
    double ym = ysum / n;                    // mean y
    double varx = (n-1)*(n+1)/12.0;          // variance of zero-based indices Σ(i-im)² / n
    double vary = yysum/n - ym*ym;           // variance of data values Σ(y-ym)² / n
    double cova = xysum - ysum*(n-1)/2;      // time series covariance = Σ(i-im)(y-ym) = Σiy + im·ym·n - ym·Σi - im·Σy ; use n·ym ≙ Σy

    // Linear Regression minimising σ²
    double gradient = cova / (n*varx);       // gradient = correlation · σy / σx ; σ = √Variance ; correlation = covariance / (√Σ(i-im)² · √Σ(y-ym)²)
    double socket   = ym - gradient * im;    // regression line: Y-ym = gradient · (i-im) ; set i≔0 yields socket

    // Correlation (Pearson's r)
    double correlation = yysum==0.0? 1.0 : gradient * sqrt (varx/vary);
    return make_tuple (socket, gradient, correlation);
  }

  inline auto
  computeTimeSeriesLinearRegression (VecD const& series)
  {
    return computeTimeSeriesLinearRegression (DataSpan<double>{series});
  }
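  /* Usage sketch (illustrative): the x values are the implicit indices 0…n-1
   * of the data points, e.g. consecutive measurements over time.
   * For the exactly linear series {1,3,5} this yields socket==1, gradient==2
   * and correlation==1.
   *
   *     VecD series{1.0, 3.0, 5.0};
   *     auto [socket,gradient,correlation] = computeTimeSeriesLinearRegression (series);
   */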

}} // namespace lib::stat
#endif /*LIB_STAT_STATISTIC_H*/