Module: Philiprehberger::MathKit::Stats

Defined in:
lib/philiprehberger/math_kit/stats.rb

Class Method Summary collapse

Class Method Details

.confidence_interval(values, level: 0.95) ⇒ Array(Float, Float)

Confidence interval for the mean using t-distribution critical values

Parameters:

  • values (Array<Numeric>)

    the input values

  • level (Float) (defaults to: 0.95)

    confidence level (0.90, 0.95, or 0.99)

Returns:

  • (Array(Float, Float))

    lower and upper bounds

Raises:

  • (ArgumentError)

    if fewer than 2 values or unsupported level



164
165
166
167
168
169
170
171
172
173
174
# File 'lib/philiprehberger/math_kit/stats.rb', line 164

# Two-sided confidence interval around the sample mean.
#
# The half-width is the critical value returned by t_critical (defined
# elsewhere in this module; called with n - 1 and the requested level)
# multiplied by the standard error of the mean.
#
# @param values [Array<Numeric>] the observations
# @param level [Float] confidence level, forwarded unchanged to t_critical
# @return [Array(Float, Float)] lower and upper bound, in that order
# @raise [ArgumentError] if fewer than 2 values are given
def confidence_interval(values, level: 0.95)
  count = values.size
  raise ArgumentError, 'confidence interval requires at least 2 values' unless count >= 2

  critical = t_critical(count - 1, level)
  center = mean(values)
  # Standard error uses the sample (n - 1) standard deviation.
  standard_error = stddev(values, population: false) / Math.sqrt(count)
  half_width = critical * standard_error

  [center - half_width, center + half_width]
end

.correlation(xs, ys) ⇒ Float

Pearson correlation coefficient between two datasets

Parameters:

  • xs (Array<Numeric>)

    first dataset

  • ys (Array<Numeric>)

    second dataset

Returns:

  • (Float)

    the Pearson correlation coefficient (-1 to 1)

Raises:

  • (ArgumentError)

    if datasets differ in size or have fewer than 2 values



182
183
184
185
186
187
188
189
190
191
192
# File 'lib/philiprehberger/math_kit/stats.rb', line 182

# Pearson correlation coefficient of two equally sized datasets.
#
# Computed as the sample covariance divided by the product of the two
# sample standard deviations. When either dataset has zero spread the
# ratio is undefined, so 0.0 is returned instead of dividing by zero.
#
# @param xs [Array<Numeric>] first dataset
# @param ys [Array<Numeric>] second dataset
# @return [Float] the correlation coefficient
# @raise [ArgumentError] if the sizes differ or fewer than 2 values are given
def correlation(xs, ys)
  raise ArgumentError, 'datasets must have the same size' unless xs.size == ys.size
  raise ArgumentError, 'correlation requires at least 2 values' unless xs.size >= 2

  joint = covariance(xs, ys)
  spread_x = stddev(xs, population: false)
  spread_y = stddev(ys, population: false)

  if spread_x.zero? || spread_y.zero?
    0.0
  else
    joint / (spread_x * spread_y)
  end
end

.covariance(xs, ys) ⇒ Float

Sample covariance between two datasets

Parameters:

  • xs (Array<Numeric>)

    first dataset

  • ys (Array<Numeric>)

    second dataset

Returns:

  • (Float)

    the sample covariance

Raises:

  • (ArgumentError)

    if datasets differ in size or have fewer than 2 values



200
201
202
203
204
205
206
207
208
# File 'lib/philiprehberger/math_kit/stats.rb', line 200

# Sample covariance (n - 1 denominator) of two equally sized datasets.
#
# @param xs [Array<Numeric>] first dataset
# @param ys [Array<Numeric>] second dataset
# @return [Float] the sample covariance
# @raise [ArgumentError] if the sizes differ or fewer than 2 values are given
def covariance(xs, ys)
  raise ArgumentError, 'datasets must have the same size' unless xs.size == ys.size
  raise ArgumentError, 'covariance requires at least 2 values' unless xs.size >= 2

  center_x = mean(xs)
  center_y = mean(ys)
  # Product of the paired deviations from each dataset's own mean.
  cross_products = xs.zip(ys).map { |x, y| (x - center_x) * (y - center_y) }
  cross_products.sum / (xs.size - 1).to_f
end

.describe(values) ⇒ Hash

Summary statistics for a dataset

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Hash)

    with :count, :mean, :median, :min, :max, :stddev, :variance, :p25, :p50, :p75

Raises:

  • (ArgumentError)

    if values is empty



299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
# File 'lib/philiprehberger/math_kit/stats.rb', line 299

# Builds a hash of summary statistics for a dataset.
#
# Keys, in insertion order: :count, :mean, :median, :min, :max, :stddev,
# :variance, :p25, :p50, :p75. The spread measures are the sample
# (n - 1) versions; with a single value they are reported as 0.0 because
# the sample variance is undefined there.
#
# @param values [Array<Numeric>] the input values
# @return [Hash] the summary statistics
# @raise [ArgumentError] if values is empty
def describe(values)
  raise ArgumentError, 'values must not be empty' if values.empty?

  size = values.size
  summary = {
    count: size,
    mean: mean(values),
    median: median(values),
    min: values.min.to_f,
    max: values.max.to_f
  }

  if size >= 2
    summary[:stddev] = stddev(values, population: false)
    summary[:variance] = variance(values, population: false)
  else
    summary[:stddev] = 0.0
    summary[:variance] = 0.0
  end

  { p25: 25, p50: 50, p75: 75 }.each { |key, pct| summary[key] = percentile(values, pct) }
  summary
end

.histogram(values, bins: 10) ⇒ Array<Hash>

Frequency distribution (histogram)

Parameters:

  • values (Array<Numeric>)

    the input values

  • bins (Integer) (defaults to: 10)

    number of bins (default: 10)

Returns:

  • (Array<Hash>)

    array of { min:, max:, count: } hashes

Raises:

  • (ArgumentError)

    if values is empty or bins < 1



322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
# File 'lib/philiprehberger/math_kit/stats.rb', line 322

# Frequency distribution over equally wide bins.
#
# The bins start at the smallest value; each is (max - min) / bins wide.
# When every value is identical the width falls back to 1.0 so the bins
# stay well defined and all values are counted in the first one.
#
# @param values [Array<Numeric>] the input values
# @param bins [Integer] number of bins
# @return [Array<Hash>] one { min:, max:, count: } hash per bin, in order
# @raise [ArgumentError] if values is empty or bins < 1
def histogram(values, bins: 10)
  raise ArgumentError, 'values must not be empty' if values.empty?
  raise ArgumentError, 'bins must be at least 1' if bins < 1

  low = values.min.to_f
  high = values.max.to_f
  step = high == low ? 1.0 : (high - low) / bins.to_f

  tallies = Array.new(bins, 0)
  values.each do |value|
    slot = step.zero? ? 0 : ((value - low) / step).floor
    # The maximum lands exactly on the upper edge; fold it into the last bin.
    slot = bins - 1 if slot >= bins
    tallies[slot] += 1
  end

  tallies.each_with_index.map do |count, i|
    { min: low + (i * step), max: low + ((i + 1) * step), count: count }
  end
end

.kurtosis(values) ⇒ Float

Sample excess kurtosis (Fisher definition, normal = 0)

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Float)

    the sample excess kurtosis

Raises:

  • (ArgumentError)

    if fewer than 4 values



142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/philiprehberger/math_kit/stats.rb', line 142

# Bias-adjusted sample excess kurtosis (a normal distribution scores 0).
#
# Evaluates
#   n(n+1) / ((n-1)(n-2)(n-3)) * sum((x - mean)^4) / s^4
#     - 3(n-1)^2 / ((n-2)(n-3))
# where s^2 is the sample variance. A dataset with zero variance
# returns 0.0 rather than dividing by zero.
#
# @param values [Array<Numeric>] the input values
# @return [Float] the sample excess kurtosis
# @raise [ArgumentError] if fewer than 4 values are given
def kurtosis(values)
  count = values.size
  raise ArgumentError, 'kurtosis requires at least 4 values' unless count >= 4

  center = mean(values)
  sample_var = variance(values, population: false)
  return 0.0 if sample_var.zero?

  fourth_moment = values.sum { |v| (v - center)**4 } / count.to_f
  standardized = fourth_moment / (sample_var**2)

  # (n - 2)(n - 3) appears in both the scale factor and the offset.
  tail = (count - 2) * (count - 3)
  scale = (count.to_f * (count + 1)) / ((count - 1) * tail)
  offset = (3.0 * ((count - 1)**2)) / tail

  # fourth_moment was divided by n, so multiply it back to recover the raw sum.
  (scale * count * standardized) - offset
end

.mean(values) ⇒ Float

Arithmetic mean of values

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Float)

    the arithmetic mean

Raises:

  • (ArgumentError)

    if values is empty



12
13
14
15
16
# File 'lib/philiprehberger/math_kit/stats.rb', line 12

# Arithmetic mean: the total from sum(values) divided by the element count.
#
# @param values [Array<Numeric>] the input values
# @return [Float] the arithmetic mean
# @raise [ArgumentError] if values is empty
def mean(values)
  raise ArgumentError, 'values must not be empty' if values.empty?

  total = sum(values)
  # Convert before dividing so integer input does not truncate.
  total.to_f / values.size
end

.median(values) ⇒ Float

Median (middle value or average of two middle values)

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Float)

    the median

Raises:

  • (ArgumentError)

    if values is empty



23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/philiprehberger/math_kit/stats.rb', line 23

# Median of a dataset: the middle element of the sorted values, or the
# average of the two middle elements when the count is even.
#
# @param values [Array<Numeric>] the input values (not modified)
# @return [Float] the median
# @raise [ArgumentError] if values is empty
def median(values)
  raise ArgumentError, 'values must not be empty' if values.empty?

  ordered = values.sort
  upper_middle = ordered.size / 2
  return (ordered[upper_middle - 1] + ordered[upper_middle]).to_f / 2 if ordered.size.even?

  ordered[upper_middle].to_f
end

.median_absolute_deviation(values) ⇒ Float

Median absolute deviation

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Float)

    the MAD

Raises:

  • (ArgumentError)

    if values is empty



246
247
248
249
250
251
252
# File 'lib/philiprehberger/math_kit/stats.rb', line 246

# Median absolute deviation: the median of each value's absolute
# distance from the dataset's median.
#
# @param values [Array<Numeric>] the input values
# @return [Float] the MAD
# @raise [ArgumentError] if values is empty
def median_absolute_deviation(values)
  raise ArgumentError, 'values must not be empty' if values.empty?

  center = median(values)
  median(values.map { |value| (value - center).abs })
end

.mode(values) ⇒ Array<Numeric>

Mode(s) — most frequently occurring value(s)

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Array<Numeric>)

    the mode(s) as an array

Raises:

  • (ArgumentError)

    if values is empty



41
42
43
44
45
46
47
# File 'lib/philiprehberger/math_kit/stats.rb', line 41

# Most frequently occurring value(s).
#
# Every value tied for the highest frequency is returned, in order of
# first appearance in the input.
#
# @param values [Array<Numeric>] the input values
# @return [Array<Numeric>] the mode(s)
# @raise [ArgumentError] if values is empty
def mode(values)
  raise ArgumentError, 'values must not be empty' if values.empty?

  counts = values.tally
  highest = counts.values.max
  counts.each_with_object([]) do |(value, occurrences), winners|
    winners << value if occurrences == highest
  end
end

.normalize(values) ⇒ Array<Float>

Min-max normalization to 0..1 range

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Array<Float>)

    normalized values

Raises:

  • (ArgumentError)

    if values is empty



215
216
217
218
219
220
221
222
223
224
# File 'lib/philiprehberger/math_kit/stats.rb', line 215

# Min-max normalization: rescales the values linearly so the smallest
# maps to 0.0 and the largest to 1.0.
#
# When all values are equal there is no span to scale by, so every
# element maps to 0.0.
#
# @param values [Array<Numeric>] the input values
# @return [Array<Float>] the normalized values, in input order
# @raise [ArgumentError] if values is empty
def normalize(values)
  raise ArgumentError, 'values must not be empty' if values.empty?

  lowest = values.min.to_f
  width = values.max.to_f - lowest

  if width.zero?
    Array.new(values.size, 0.0)
  else
    values.map { |value| (value - lowest) / width }
  end
end

.percentile(values, p) ⇒ Float

Percentile (0-100) using linear interpolation

Parameters:

  • values (Array<Numeric>)

    the input values

  • p (Numeric)

    the percentile (0-100)

Returns:

  • (Float)

    the percentile value

Raises:

  • (ArgumentError)

    if values is empty or p is out of range



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/philiprehberger/math_kit/stats.rb', line 82

# Percentile of a dataset using linear interpolation between the two
# nearest ranks of the sorted values.
#
# The fractional rank is (p / 100) * (n - 1); p = 0 and p = 100 return
# the minimum and maximum directly.
#
# @param values [Array<Numeric>] the input values
# @param p [Numeric] the percentile, from 0 to 100 inclusive
# @return [Float] the percentile value
# @raise [ArgumentError] if values is empty or p is outside 0..100
def percentile(values, p)
  raise ArgumentError, 'values must not be empty' if values.empty?
  raise ArgumentError, 'percentile must be between 0 and 100' if p.negative? || p > 100

  ordered = values.sort
  return ordered.first.to_f if p.zero?
  return ordered.last.to_f if p == 100

  position = (p / 100.0) * (ordered.size - 1)
  below = position.floor
  above = position.ceil
  anchor = ordered[below]
  # A whole-number rank lands exactly on an element; nothing to interpolate.
  return anchor.to_f if below == above

  weight = position - below
  (anchor + (weight * (ordered[above] - anchor))).to_f
end

.range(values) ⇒ Numeric

Range (max - min)

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Numeric)

    the difference between the largest and smallest value

Raises:

  • (ArgumentError)

    if values is empty



113
114
115
116
117
# File 'lib/philiprehberger/math_kit/stats.rb', line 113

# Spread of the dataset: largest value minus smallest value.
#
# @param values [Array<Numeric>] the input values
# @return [Numeric] max - min (no Float conversion is applied)
# @raise [ArgumentError] if values is empty
def range(values)
  raise ArgumentError, 'values must not be empty' if values.empty?

  largest = values.max
  smallest = values.min
  largest - smallest
end

.skewness(values) ⇒ Float

Sample skewness (Fisher-Pearson)

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Float)

    the sample skewness

Raises:

  • (ArgumentError)

    if fewer than 3 values



124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/philiprehberger/math_kit/stats.rb', line 124

# Sample skewness (adjusted Fisher-Pearson standardized moment coefficient).
#
# Evaluates
#   G1 = n / ((n - 1)(n - 2)) * sum(((x - mean) / s)^3)
# where s is the sample (n - 1) standard deviation. This is the
# bias-adjusted counterpart of the kurtosis formula used in this module.
#
# The previous implementation scaled the same sum by (n - 1) / (n - 2),
# which overstates the result by a factor of (n - 1)^2 / n and can exceed
# the largest skewness a sample of size n can attain.
#
# A dataset with zero spread returns 0.0 rather than dividing by zero.
#
# @param values [Array<Numeric>] the input values
# @return [Float] the sample skewness
# @raise [ArgumentError] if fewer than 3 values are given
def skewness(values)
  n = values.size
  raise ArgumentError, 'skewness requires at least 3 values' if n < 3

  avg = mean(values)
  s = stddev(values, population: false)
  return 0.0 if s.zero?

  # Sum of cubed z-scores, standardized with the sample standard deviation.
  cubed_z_sum = values.sum { |v| ((v - avg) / s)**3 }
  (n.to_f / ((n - 1) * (n - 2))) * cubed_z_sum
end

.standardize(values) ⇒ Array<Float>

Z-score standardization (mean=0, stddev=1)

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Array<Float>)

    standardized values

Raises:

  • (ArgumentError)

    if fewer than 2 values



231
232
233
234
235
236
237
238
239
# File 'lib/philiprehberger/math_kit/stats.rb', line 231

# Z-score standardization: subtracts the mean from each value and divides
# by the sample (n - 1) standard deviation.
#
# When the standard deviation is zero every value already equals the
# mean, so each z-score is reported as 0.0.
#
# @param values [Array<Numeric>] the input values
# @return [Array<Float>] the standardized values, in input order
# @raise [ArgumentError] if fewer than 2 values are given
def standardize(values)
  raise ArgumentError, 'standardize requires at least 2 values' unless values.size >= 2

  center = mean(values)
  scale = stddev(values, population: false)
  flat = scale.zero?

  values.map { |value| flat ? 0.0 : (value - center) / scale }
end

.stddev(values, population: true) ⇒ Float

Standard deviation

Parameters:

  • values (Array<Numeric>)

    the input values

  • population (Boolean) (defaults to: true)

    true for population stddev, false for sample

Returns:

  • (Float)

    the standard deviation



72
73
74
# File 'lib/philiprehberger/math_kit/stats.rb', line 72

# Standard deviation: the square root of variance(values, population:).
#
# Input validation is left to variance, so any ArgumentError it raises
# propagates from here as well.
#
# @param values [Array<Numeric>] the input values
# @param population [Boolean] true for population stddev, false for sample
# @return [Float] the standard deviation
def stddev(values, population: true)
  spread = variance(values, population: population)
  Math.sqrt(spread)
end

.sum(values) ⇒ Numeric

Sum of values

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Numeric)

    the sum of the values (0 for an empty array)



104
105
106
# File 'lib/philiprehberger/math_kit/stats.rb', line 104

def sum(values)
  values.sum
end

.trimmed_mean(values, trim: 0.1) ⇒ Float

Trimmed mean (removes a fraction from each end before averaging)

Parameters:

  • values (Array<Numeric>)

    the input values

  • trim (Float) (defaults to: 0.1)

    fraction to trim from each end (0.0 to 0.5 exclusive)

Returns:

  • (Float)

    the trimmed mean

Raises:

  • (ArgumentError)

    if values is empty or trim is out of range



260
261
262
263
264
265
266
267
268
269
270
271
# File 'lib/philiprehberger/math_kit/stats.rb', line 260

# Trimmed mean: drops floor(n * trim) of the smallest and the same number
# of the largest values, then averages what remains.
#
# Because trim is strictly below 0.5, at least one value always survives.
#
# @param values [Array<Numeric>] the input values
# @param trim [Float] fraction removed from each end (0.0 up to, not including, 0.5)
# @return [Float] the trimmed mean
# @raise [ArgumentError] if values is empty or trim is out of range
def trimmed_mean(values, trim: 0.1)
  raise ArgumentError, 'values must not be empty' if values.empty?
  raise ArgumentError, 'trim must be between 0.0 and 0.5 (exclusive)' if trim.negative? || trim >= 0.5

  ordered = values.sort
  total = ordered.size
  cut = (total * trim).floor
  kept = cut.zero? ? ordered : ordered[cut...(total - cut)]

  mean(kept)
end

.variance(values, population: true) ⇒ Float

Population or sample variance

Parameters:

  • values (Array<Numeric>)

    the input values

  • population (Boolean) (defaults to: true)

    true for population variance, false for sample

Returns:

  • (Float)

    the variance

Raises:

  • (ArgumentError)

    if values is empty or sample variance with fewer than 2 values



55
56
57
58
59
60
61
62
63
64
65
# File 'lib/philiprehberger/math_kit/stats.rb', line 55

# Variance: the sum of squared deviations from the mean, divided by n
# (population) or by n - 1 (sample).
#
# @param values [Array<Numeric>] the input values
# @param population [Boolean] true for population variance, false for sample
# @return [Float] the variance
# @raise [ArgumentError] if values is empty, or if the sample variance is
#   requested with fewer than 2 values
def variance(values, population: true)
  raise ArgumentError, 'values must not be empty' if values.empty?

  count = values.size
  raise ArgumentError, 'sample variance requires at least 2 values' unless population || count >= 2

  center = mean(values)
  squared_deviations = values.map { |value| (value - center)**2 }
  denominator = population ? count : count - 1

  squared_deviations.sum.to_f / denominator
end

.weighted_mean(values, weights:) ⇒ Float

Weighted arithmetic mean

Parameters:

  • values (Array<Numeric>)

    the input values

  • weights (Array<Numeric>)

    the corresponding weights

Returns:

  • (Float)

    the weighted mean

Raises:

  • (ArgumentError)

    if arrays differ in size, are empty, or weights sum to zero



349
350
351
352
353
354
355
356
357
# File 'lib/philiprehberger/math_kit/stats.rb', line 349

# Weighted arithmetic mean: sum(value * weight) / sum(weights).
#
# @param values [Array<Numeric>] the input values
# @param weights [Array<Numeric>] one weight per value, in the same order
# @return [Float] the weighted mean
# @raise [ArgumentError] if values is empty, the sizes differ, or the
#   weights sum to zero
def weighted_mean(values, weights:)
  raise ArgumentError, 'values must not be empty' if values.empty?
  raise ArgumentError, 'values and weights must have the same size' unless values.size == weights.size

  weight_total = weights.sum.to_f
  raise ArgumentError, 'weights must not sum to zero' if weight_total.zero?

  weighted_values = values.zip(weights).map { |value, weight| value * weight }
  weighted_values.sum / weight_total
end

.winsorized_mean(values, trim: 0.1) ⇒ Float

Winsorized mean (replaces extremes with boundary values before averaging)

Parameters:

  • values (Array<Numeric>)

    the input values

  • trim (Float) (defaults to: 0.1)

    fraction to winsorize from each end (0.0 to 0.5 exclusive)

Returns:

  • (Float)

    the winsorized mean

Raises:

  • (ArgumentError)

    if values is empty or trim is out of range



279
280
281
282
283
284
285
286
287
288
289
290
291
292
# File 'lib/philiprehberger/math_kit/stats.rb', line 279

# Winsorized mean: instead of discarding the floor(n * trim) smallest and
# largest values, pulls them in to the nearest retained value and then
# averages all n values.
#
# @param values [Array<Numeric>] the input values
# @param trim [Float] fraction winsorized at each end (0.0 up to, not including, 0.5)
# @return [Float] the winsorized mean
# @raise [ArgumentError] if values is empty or trim is out of range
def winsorized_mean(values, trim: 0.1)
  raise ArgumentError, 'values must not be empty' if values.empty?
  raise ArgumentError, 'trim must be between 0.0 and 0.5 (exclusive)' if trim.negative? || trim >= 0.5

  ordered = values.sort
  cut = (ordered.size * trim).floor
  return mean(ordered) if cut.zero?

  # Boundary values: the first and last elements that are kept as-is.
  floor_value = ordered[cut]
  ceiling_value = ordered[-(cut + 1)]

  capped = ordered.map do |value|
    raised = floor_value > value ? floor_value : value
    ceiling_value < raised ? ceiling_value : raised
  end
  mean(capped)
end