Module: Philiprehberger::MathKit::Stats

Defined in:
lib/philiprehberger/math_kit/stats.rb

Class Method Summary collapse

Class Method Details

.confidence_interval(values, level: 0.95) ⇒ Array(Float, Float)

Confidence interval for the mean using t-distribution critical values

Parameters:

  • values (Array<Numeric>)

    the input values

  • level (Float) (defaults to: 0.95)

    confidence level (0.90, 0.95, or 0.99)

Returns:

  • (Array(Float, Float))

    lower and upper bounds

Raises:

  • (ArgumentError)

    if fewer than 2 values or unsupported level



177
178
179
180
181
182
183
184
185
186
187
# File 'lib/philiprehberger/math_kit/stats.rb', line 177

# Two-sided confidence interval around the sample mean.
#
# The half-width is the critical value returned by +t_critical+ (called with
# n - 1 and the requested level) multiplied by the standard error of the
# mean, i.e. the sample standard deviation divided by sqrt(n).
#
# @param values [Array<Numeric>] the input values
# @param level [Float] confidence level, forwarded to +t_critical+
# @return [Array(Float, Float)] lower and upper bounds
# @raise [ArgumentError] if fewer than 2 values are given
def confidence_interval(values, level: 0.95)
  count = values.size
  raise ArgumentError, 'confidence interval requires at least 2 values' unless count >= 2

  critical = t_critical(count - 1, level)
  center = mean(values)
  standard_error = stddev(values, population: false) / Math.sqrt(count)
  half_width = critical * standard_error

  [center - half_width, center + half_width]
end

.correlation(xs, ys) ⇒ Float

Pearson correlation coefficient between two datasets

Parameters:

  • xs (Array<Numeric>)

    first dataset

  • ys (Array<Numeric>)

    second dataset

Returns:

  • (Float)

    the Pearson correlation coefficient (-1 to 1)

Raises:

  • (ArgumentError)

    if datasets differ in size or have fewer than 2 values



195
196
197
198
199
200
201
202
203
204
205
# File 'lib/philiprehberger/math_kit/stats.rb', line 195

# Pearson correlation coefficient of two equally sized datasets.
#
# Computed as the sample covariance divided by the product of the two
# sample standard deviations. When either dataset has zero spread the
# coefficient is undefined, and 0.0 is returned instead.
#
# @param xs [Array<Numeric>] first dataset
# @param ys [Array<Numeric>] second dataset
# @return [Float] the correlation coefficient
# @raise [ArgumentError] if the sizes differ or fewer than 2 values are given
def correlation(xs, ys)
  raise ArgumentError, 'datasets must have the same size' unless xs.size == ys.size
  raise ArgumentError, 'correlation requires at least 2 values' unless xs.size >= 2

  joint = covariance(xs, ys)
  spread_x = stddev(xs, population: false)
  spread_y = stddev(ys, population: false)

  if spread_x.zero? || spread_y.zero?
    0.0
  else
    joint / (spread_x * spread_y)
  end
end

.covariance(xs, ys) ⇒ Float

Sample covariance between two datasets

Parameters:

  • xs (Array<Numeric>)

    first dataset

  • ys (Array<Numeric>)

    second dataset

Returns:

  • (Float)

    the sample covariance

Raises:

  • (ArgumentError)

    if datasets differ in size or have fewer than 2 values



213
214
215
216
217
218
219
220
221
# File 'lib/philiprehberger/math_kit/stats.rb', line 213

# Sample covariance of two equally sized datasets.
#
# Sums the products of paired deviations from each dataset's mean and
# divides by n - 1.
#
# @param xs [Array<Numeric>] first dataset
# @param ys [Array<Numeric>] second dataset
# @return [Float] the sample covariance
# @raise [ArgumentError] if the sizes differ or fewer than 2 values are given
def covariance(xs, ys)
  raise ArgumentError, 'datasets must have the same size' unless xs.size == ys.size
  raise ArgumentError, 'covariance requires at least 2 values' unless xs.size >= 2

  center_x = mean(xs)
  center_y = mean(ys)
  cross_products = xs.zip(ys).map { |x, y| (x - center_x) * (y - center_y) }
  degrees_of_freedom = xs.size - 1

  cross_products.sum / degrees_of_freedom.to_f
end

.describe(values) ⇒ Hash

Summary statistics for a dataset

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Hash)

    with :count, :mean, :median, :min, :max, :stddev, :variance, :p25, :p50, :p75

Raises:

  • (ArgumentError)

    if values is empty



312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# File 'lib/philiprehberger/math_kit/stats.rb', line 312

# Builds a hash of summary statistics for a dataset.
#
# Keys, in order: :count, :mean, :median, :min, :max, :stddev, :variance,
# :p25, :p50, :p75. The spread figures are sample statistics; for a single
# value they are reported as 0.0 rather than raising.
#
# @param values [Array<Numeric>] the input values
# @return [Hash] the summary statistics
# @raise [ArgumentError] if values is empty
def describe(values)
  raise ArgumentError, 'values must not be empty' if values.empty?

  summary = {
    count: values.size,
    mean: mean(values),
    median: median(values),
    min: values.min.to_f,
    max: values.max.to_f
  }

  # Sample spread needs at least two observations.
  if values.size >= 2
    summary[:stddev] = stddev(values, population: false)
    summary[:variance] = variance(values, population: false)
  else
    summary[:stddev] = 0.0
    summary[:variance] = 0.0
  end

  summary[:p25] = percentile(values, 25)
  summary[:p50] = percentile(values, 50)
  summary[:p75] = percentile(values, 75)
  summary
end

.histogram(values, bins: 10) ⇒ Array<Hash>

Frequency distribution (histogram)

Parameters:

  • values (Array<Numeric>)

    the input values

  • bins (Integer) (defaults to: 10)

    number of bins (default: 10)

Returns:

  • (Array<Hash>)

    array of { min:, max:, count: } hashes

Raises:

  • (ArgumentError)

    if values is empty or bins < 1



335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
# File 'lib/philiprehberger/math_kit/stats.rb', line 335

# Frequency distribution over equal-width bins.
#
# The bins span from the smallest to the largest value. When all values are
# identical a width of 1.0 is used, so every value lands in the first bin.
# A value equal to the maximum is counted in the last bin.
#
# @param values [Array<Numeric>] the input values
# @param bins [Integer] number of bins
# @return [Array<Hash>] one { min:, max:, count: } hash per bin
# @raise [ArgumentError] if values is empty or bins < 1
def histogram(values, bins: 10)
  raise ArgumentError, 'values must not be empty' if values.empty?
  raise ArgumentError, 'bins must be at least 1' if bins < 1

  lowest = values.min.to_f
  highest = values.max.to_f
  bin_width =
    if highest == lowest
      1.0
    else
      (highest - lowest) / bins.to_f
    end

  buckets = Array.new(bins) do |slot|
    { min: lowest + (slot * bin_width), max: lowest + ((slot + 1) * bin_width), count: 0 }
  end

  values.each_with_object(buckets) do |value, acc|
    slot = bin_width.zero? ? 0 : ((value - lowest) / bin_width).floor
    # The maximum value computes to index == bins; fold it into the last bin.
    slot = [slot, bins - 1].min
    acc[slot][:count] += 1
  end
end

.kurtosis(values) ⇒ Float

Sample excess kurtosis (Fisher definition, normal = 0)

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Float)

    the sample excess kurtosis

Raises:

  • (ArgumentError)

    if fewer than 4 values



155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/philiprehberger/math_kit/stats.rb', line 155

# Sample excess kurtosis (bias-adjusted Fisher definition, normal = 0).
#
# Evaluates
#   n(n+1) / ((n-1)(n-2)(n-3)) * sum((x - mean)^4) / s^4
#     - 3(n-1)^2 / ((n-2)(n-3))
# where s^2 is the sample variance. Returns 0.0 when the variance is zero.
#
# @param values [Array<Numeric>] the input values
# @return [Float] the sample excess kurtosis
# @raise [ArgumentError] if fewer than 4 values are given
def kurtosis(values)
  count = values.size
  raise ArgumentError, 'kurtosis requires at least 4 values' unless count >= 4

  center = mean(values)
  sample_variance = variance(values, population: false)
  return 0.0 if sample_variance.zero?

  fourth_moment = values.sum { |v| (v - center)**4 } / count.to_f
  standardized = fourth_moment / (sample_variance**2)

  # Small-sample scale and offset terms of the adjusted estimator.
  scale = (count.to_f * (count + 1)) / ((count - 1) * (count - 2) * (count - 3))
  offset = (3.0 * ((count - 1)**2)) / ((count - 2) * (count - 3))

  (scale * count * standardized) - offset
end

.mean(values) ⇒ Float

Arithmetic mean of values

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Float)

    the arithmetic mean

Raises:

  • (ArgumentError)

    if values is empty



12
13
14
15
16
# File 'lib/philiprehberger/math_kit/stats.rb', line 12

# Arithmetic mean: the total of the values divided by how many there are.
#
# @param values [Array<Numeric>] the input values
# @return [Float] the arithmetic mean
# @raise [ArgumentError] if values is empty
def mean(values)
  if values.empty?
    raise ArgumentError, 'values must not be empty'
  end

  total = sum(values)
  total.to_f / values.size
end

.median(values) ⇒ Float

Median (middle value or average of two middle values)

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Float)

    the median

Raises:

  • (ArgumentError)

    if values is empty



23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/philiprehberger/math_kit/stats.rb', line 23

# Median of a dataset: the middle element of the sorted values, or the
# average of the two middle elements when the count is even.
#
# @param values [Array<Numeric>] the input values
# @return [Float] the median
# @raise [ArgumentError] if values is empty
def median(values)
  raise ArgumentError, 'values must not be empty' if values.empty?

  ordered = values.sort
  half, remainder = ordered.size.divmod(2)
  return ordered[half].to_f if remainder == 1

  (ordered[half - 1] + ordered[half]).to_f / 2
end

.median_absolute_deviation(values) ⇒ Float

Median absolute deviation

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Float)

    the MAD

Raises:

  • (ArgumentError)

    if values is empty



259
260
261
262
263
264
265
# File 'lib/philiprehberger/math_kit/stats.rb', line 259

# Median absolute deviation: the median of each value's absolute distance
# from the dataset's median.
#
# @param values [Array<Numeric>] the input values
# @return [Float] the MAD
# @raise [ArgumentError] if values is empty
def median_absolute_deviation(values)
  raise ArgumentError, 'values must not be empty' if values.empty?

  center = median(values)
  distances = values.map do |value|
    (value - center).abs
  end

  median(distances)
end

.mode(values) ⇒ Array<Numeric>

Mode(s) — most frequently occurring value(s)

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Array<Numeric>)

    the mode(s) as an array

Raises:

  • (ArgumentError)

    if values is empty



41
42
43
44
45
46
47
# File 'lib/philiprehberger/math_kit/stats.rb', line 41

# Most frequently occurring value(s). Every value that reaches the highest
# occurrence count is returned, in order of first appearance.
#
# @param values [Array<Numeric>] the input values
# @return [Array<Numeric>] the mode(s)
# @raise [ArgumentError] if values is empty
def mode(values)
  raise ArgumentError, 'values must not be empty' if values.empty?

  occurrences = values.tally
  highest = occurrences.each_value.max

  occurrences.each_with_object([]) do |(value, seen), winners|
    winners << value if seen == highest
  end
end

.normalize(values) ⇒ Array<Float>

Min-max normalization to 0..1 range

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Array<Float>)

    normalized values

Raises:

  • (ArgumentError)

    if values is empty



228
229
230
231
232
233
234
235
236
237
# File 'lib/philiprehberger/math_kit/stats.rb', line 228

# Min-max normalization: rescales values so the smallest maps to 0.0 and
# the largest to 1.0. When all values are equal, every entry becomes 0.0.
#
# @param values [Array<Numeric>] the input values
# @return [Array<Float>] normalized values
# @raise [ArgumentError] if values is empty
def normalize(values)
  raise ArgumentError, 'values must not be empty' if values.empty?

  lowest = values.min.to_f
  highest = values.max.to_f
  spread = highest - lowest

  if spread.zero?
    Array.new(values.size, 0.0)
  else
    values.map { |value| (value - lowest) / spread }
  end
end

.percentile(values, p) ⇒ Float

Percentile (0-100) using linear interpolation

Parameters:

  • values (Array<Numeric>)

    the input values

  • p (Numeric)

    the percentile (0-100)

Returns:

  • (Float)

    the percentile value

Raises:

  • (ArgumentError)

    if values is empty or p is out of range



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/philiprehberger/math_kit/stats.rb', line 82

# Percentile by linear interpolation between the two nearest ranks.
#
# The fractional rank is (p / 100) * (n - 1) over the sorted values; when it
# falls between two positions the result is interpolated between them.
#
# @param values [Array<Numeric>] the input values
# @param p [Numeric] the percentile (0-100)
# @return [Float] the percentile value
# @raise [ArgumentError] if values is empty or p is outside 0..100
def percentile(values, p)
  raise ArgumentError, 'values must not be empty' if values.empty?
  raise ArgumentError, 'percentile must be between 0 and 100' if p.negative? || p > 100

  ordered = values.sort
  return ordered.first.to_f if p.zero?
  return ordered.last.to_f if p == 100

  position = (p / 100.0) * (ordered.size - 1)
  below = position.floor
  above = position.ceil

  if below == above
    # The rank lands exactly on an element; no interpolation is needed.
    ordered[below].to_f
  else
    weight = position - below
    (ordered[below] + (weight * (ordered[above] - ordered[below]))).to_f
  end
end

.range(values) ⇒ Numeric

Range (max - min)

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Numeric)

    the difference between the largest and smallest value

Raises:

  • (ArgumentError)

    if values is empty



126
127
128
129
130
# File 'lib/philiprehberger/math_kit/stats.rb', line 126

# Range of a dataset: the largest value minus the smallest.
#
# @param values [Array<Numeric>] the input values
# @return [Numeric] max - min
# @raise [ArgumentError] if values is empty
def range(values)
  raise ArgumentError, 'values must not be empty' if values.empty?

  smallest, largest = values.minmax
  largest - smallest
end

.skewness(values) ⇒ Float

Sample skewness (Fisher-Pearson)

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Float)

    the sample skewness

Raises:

  • (ArgumentError)

    if fewer than 3 values



137
138
139
140
141
142
143
144
145
146
147
148
# File 'lib/philiprehberger/math_kit/stats.rb', line 137

# Sample skewness (adjusted Fisher-Pearson standardized moment coefficient).
#
# Evaluates
#   G1 = n / ((n - 1)(n - 2)) * sum(((x - mean) / s)^3)
# where s is the sample standard deviation. Returns 0.0 when s is zero
# (all values identical), since the coefficient is undefined there.
#
# @param values [Array<Numeric>] the input values
# @return [Float] the sample skewness
# @raise [ArgumentError] if fewer than 3 values are given
def skewness(values)
  n = values.size
  raise ArgumentError, 'skewness requires at least 3 values' if n < 3

  avg = mean(values)
  s = stddev(values, population: false)
  return 0.0 if s.zero?

  # The previous scaling reduced to (n - 1) / (n - 2) * sum(z^3), which
  # overstates G1 by (n - 1)^2 / n and grows with the sample size.
  cubed_z_total = values.sum { |v| ((v - avg) / s)**3 }
  (n.to_f / ((n - 1) * (n - 2))) * cubed_z_total
end

.standardize(values) ⇒ Array<Float>

Z-score standardization (mean=0, stddev=1)

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Array<Float>)

    standardized values

Raises:

  • (ArgumentError)

    if fewer than 2 values



244
245
246
247
248
249
250
251
252
# File 'lib/philiprehberger/math_kit/stats.rb', line 244

# Z-score standardization: subtracts the mean from each value and divides
# by the sample standard deviation. When the standard deviation is zero,
# every entry becomes 0.0.
#
# @param values [Array<Numeric>] the input values
# @return [Array<Float>] standardized values
# @raise [ArgumentError] if fewer than 2 values are given
def standardize(values)
  raise ArgumentError, 'standardize requires at least 2 values' unless values.size >= 2

  center = mean(values)
  scale = stddev(values, population: false)

  if scale.zero?
    Array.new(values.size, 0.0)
  else
    values.map { |value| (value - center) / scale }
  end
end

.stddev(values, population: true) ⇒ Float

Standard deviation

Parameters:

  • values (Array<Numeric>)

    the input values

  • population (Boolean) (defaults to: true)

    true for population stddev, false for sample

Returns:

  • (Float)

    the standard deviation



72
73
74
# File 'lib/philiprehberger/math_kit/stats.rb', line 72

# Standard deviation: the square root of the variance computed with the
# same +population+ setting.
#
# @param values [Array<Numeric>] the input values
# @param population [Boolean] true for population stddev, false for sample
# @return [Float] the standard deviation
def stddev(values, population: true)
  spread = variance(values, population: population)
  Math.sqrt(spread)
end

.sum(values) ⇒ Numeric

Sum of values

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Numeric)

    the sum of the values



104
105
106
# File 'lib/philiprehberger/math_kit/stats.rb', line 104

# Sum of the values, delegating to the collection's own +sum+ starting
# from 0.
#
# @param values [Array<Numeric>] the input values
# @return [Numeric] the total
def sum(values)
  total = values.sum(0)
  total
end

.sum_of_squares(values) ⇒ Float

Sum of squared deviations from the mean: sum_i (x_i - mean)^2. Building block for variance, regression residuals, ANOVA, etc. Returns 0.0 for empty or single-element inputs.

Parameters:

  • values (Array<Numeric>)

    the input values

Returns:

  • (Float)

    the sum of squares



114
115
116
117
118
119
# File 'lib/philiprehberger/math_kit/stats.rb', line 114

# Sum of squared deviations from the mean: sum_i (x_i - mean)^2.
#
# Inputs with fewer than two elements have no spread, so 0.0 is returned
# without computing a mean.
#
# @param values [Array<Numeric>] the input values
# @return [Float] the sum of squares
def sum_of_squares(values)
  if values.size >= 2
    center = mean(values)
    values.sum(0.0) { |value| (value - center)**2 }
  else
    0.0
  end
end

.trimmed_mean(values, trim: 0.1) ⇒ Float

Trimmed mean (removes a fraction from each end before averaging)

Parameters:

  • values (Array<Numeric>)

    the input values

  • trim (Float) (defaults to: 0.1)

    fraction to trim from each end (0.0 to 0.5 exclusive)

Returns:

  • (Float)

    the trimmed mean

Raises:

  • (ArgumentError)

    if values is empty or trim is out of range



273
274
275
276
277
278
279
280
281
282
283
284
# File 'lib/philiprehberger/math_kit/stats.rb', line 273

# Trimmed mean: sorts the values, drops floor(n * trim) elements from each
# end, and averages what remains. When nothing would be dropped the plain
# mean of all values is returned.
#
# @param values [Array<Numeric>] the input values
# @param trim [Float] fraction to drop from each end (0.0 up to, not including, 0.5)
# @return [Float] the trimmed mean
# @raise [ArgumentError] if values is empty or trim is out of range
def trimmed_mean(values, trim: 0.1)
  raise ArgumentError, 'values must not be empty' if values.empty?
  raise ArgumentError, 'trim must be between 0.0 and 0.5 (exclusive)' if trim.negative? || trim >= 0.5

  ordered = values.sort
  drop = (ordered.size * trim).floor
  kept = drop.zero? ? ordered : ordered[drop...(ordered.size - drop)]

  mean(kept)
end

.variance(values, population: true) ⇒ Float

Population or sample variance

Parameters:

  • values (Array<Numeric>)

    the input values

  • population (Boolean) (defaults to: true)

    true for population variance, false for sample

Returns:

  • (Float)

    the variance

Raises:

  • (ArgumentError)

    if values is empty or sample variance with fewer than 2 values



55
56
57
58
59
60
61
62
63
64
65
# File 'lib/philiprehberger/math_kit/stats.rb', line 55

# Variance of a dataset: the sum of squared deviations from the mean,
# divided by n (population) or n - 1 (sample).
#
# @param values [Array<Numeric>] the input values
# @param population [Boolean] true for population variance, false for sample
# @return [Float] the variance
# @raise [ArgumentError] if values is empty, or sample variance is requested
#   with fewer than 2 values
def variance(values, population: true)
  raise ArgumentError, 'values must not be empty' if values.empty?

  count = values.size
  raise ArgumentError, 'sample variance requires at least 2 values' if count < 2 && !population

  center = mean(values)
  squared_total = values.sum { |value| (value - center)**2 }
  denominator = population ? count : count - 1

  squared_total.to_f / denominator
end

.weighted_mean(values, weights:) ⇒ Float

Weighted arithmetic mean

Parameters:

  • values (Array<Numeric>)

    the input values

  • weights (Array<Numeric>)

    the corresponding weights

Returns:

  • (Float)

    the weighted mean

Raises:

  • (ArgumentError)

    if arrays differ in size, are empty, or weights sum to zero



362
363
364
365
366
367
368
369
370
# File 'lib/philiprehberger/math_kit/stats.rb', line 362

# Weighted arithmetic mean: the sum of value * weight over all pairs,
# divided by the sum of the weights.
#
# @param values [Array<Numeric>] the input values
# @param weights [Array<Numeric>] the corresponding weights
# @return [Float] the weighted mean
# @raise [ArgumentError] if values is empty, the sizes differ, or the
#   weights sum to zero
def weighted_mean(values, weights:)
  raise ArgumentError, 'values must not be empty' if values.empty?
  raise ArgumentError, 'values and weights must have the same size' unless values.size == weights.size

  weight_total = weights.sum.to_f
  raise ArgumentError, 'weights must not sum to zero' if weight_total.zero?

  weighted_values = values.zip(weights).map { |value, weight| value * weight }
  weighted_values.sum / weight_total
end

.winsorized_mean(values, trim: 0.1) ⇒ Float

Winsorized mean (replaces extremes with boundary values before averaging)

Parameters:

  • values (Array<Numeric>)

    the input values

  • trim (Float) (defaults to: 0.1)

    fraction to winsorize from each end (0.0 to 0.5 exclusive)

Returns:

  • (Float)

    the winsorized mean

Raises:

  • (ArgumentError)

    if values is empty or trim is out of range



292
293
294
295
296
297
298
299
300
301
302
303
304
305
# File 'lib/philiprehberger/math_kit/stats.rb', line 292

# Winsorized mean: sorts the values, then replaces the floor(n * trim)
# smallest and largest entries with the nearest remaining value before
# averaging. When nothing would be replaced the plain mean is returned.
#
# @param values [Array<Numeric>] the input values
# @param trim [Float] fraction to winsorize from each end (0.0 up to, not including, 0.5)
# @return [Float] the winsorized mean
# @raise [ArgumentError] if values is empty or trim is out of range
def winsorized_mean(values, trim: 0.1)
  raise ArgumentError, 'values must not be empty' if values.empty?
  raise ArgumentError, 'trim must be between 0.0 and 0.5 (exclusive)' if trim.negative? || trim >= 0.5

  ordered = values.sort
  cut = (ordered.size * trim).floor
  return mean(ordered) if cut.zero?

  # Boundary values that the extremes are pulled in to.
  floor_value = ordered[cut]
  ceiling_value = ordered[-(cut + 1)]
  capped = ordered.map { |value| value.clamp(floor_value, ceiling_value) }

  mean(capped)
end