Class: Iriq::PositionStats

Inherits:
Object
  • Object
show all
Defined in:
lib/iriq/position_stats.rb

Overview

Rolling frequency counts for a single (host, prefix-shape, position). Value cardinality is capped so a high-entropy position (UUIDs, timestamps) doesn’t grow memory without bound — `total` keeps growing accurately, but only the first `max_values` distinct values are tracked individually. Existing tracked values still receive increments after the cap is hit; only NEW distinct values are dropped.

Constant Summary collapse

DEFAULT_MAX_VALUES =
5_000
NUMERIC_TYPES =
%i[integer float].freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(max_values: DEFAULT_MAX_VALUES) ⇒ PositionStats

Returns a new instance of PositionStats.



16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/iriq/position_stats.rb', line 16

# Builds an empty accumulator.
#
# @param max_values [Integer] cap on how many distinct values are
#   counted individually in the value table
def initialize(max_values: DEFAULT_MAX_VALUES)
  @max_values = max_values
  @total      = 0

  # Both tallies answer 0 for unseen keys, so increments need no
  # existence check.
  @value_counts = Hash.new(0)
  @type_counts  = Hash.new(0)

  # Numeric-only range tracking. The corpus uses it to promote
  # positions such as /articles/2024 to :year when every value falls
  # inside 1900..2100, and ParamSummary exposes min/max/avg for
  # numeric params in general.
  @numeric_count = 0
  @numeric_sum   = 0.0
  @numeric_min = @numeric_max = nil
end

Instance Attribute Details

#max_values ⇒ Object (readonly)

Returns the value of attribute max_values.



11
12
13
# File 'lib/iriq/position_stats.rb', line 11

# Cap on the number of distinct values that are counted individually
# in the value table; set once by the constructor.
def max_values
  @max_values
end

#numeric_count ⇒ Object (readonly)

Returns the value of attribute numeric_count.



11
12
13
# File 'lib/iriq/position_stats.rb', line 11

# Counter belonging to the numeric range stats; starts at 0 and is the
# divisor used by #numeric_avg.
# NOTE(review): presumably the number of numeric observations seen —
# the code that updates it is not visible here; confirm.
def numeric_count
  @numeric_count
end

#numeric_max ⇒ Object (readonly)

Returns the value of attribute numeric_max.



11
12
13
# File 'lib/iriq/position_stats.rb', line 11

# Upper end of the numeric range stats; nil on a fresh instance.
# NOTE(review): presumably the largest numeric value observed so far —
# the updating code is not visible here; confirm.
def numeric_max
  @numeric_max
end

#numeric_min ⇒ Object (readonly)

Returns the value of attribute numeric_min.



11
12
13
# File 'lib/iriq/position_stats.rb', line 11

# Lower end of the numeric range stats; nil on a fresh instance.
# NOTE(review): presumably the smallest numeric value observed so far —
# the updating code is not visible here; confirm.
def numeric_min
  @numeric_min
end

#numeric_sum ⇒ Object (readonly)

Returns the value of attribute numeric_sum.



11
12
13
# File 'lib/iriq/position_stats.rb', line 11

# Running sum belonging to the numeric range stats; starts at 0.0 and
# is the numerator used by #numeric_avg.
def numeric_sum
  @numeric_sum
end

#total ⇒ Object (readonly)

Returns the value of attribute total.



11
12
13
# File 'lib/iriq/position_stats.rb', line 11

# Number of observations recorded. #observe increments it on every
# call, including calls whose value is not added to the value table
# because the cap was reached.
def total
  @total
end

#type_counts ⇒ Object (readonly)

Returns the value of attribute type_counts.



11
12
13
# File 'lib/iriq/position_stats.rb', line 11

# Hash of type => number of observations with that type (missing keys
# read as 0). Returns the live internal hash, not a copy.
def type_counts
  @type_counts
end

#value_counts ⇒ Object (readonly)

Returns the value of attribute value_counts.



11
12
13
# File 'lib/iriq/position_stats.rb', line 11

# Hash of value => number of times it was observed (missing keys read
# as 0). Holds only values that were admitted before the max_values cap
# was reached. Returns the live internal hash, not a copy.
def value_counts
  @value_counts
end

Class Method Details

.from_dump(h) ⇒ Object



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/iriq/position_stats.rb', line 114

# Rebuilds an instance from the string-keyed Hash produced by #dump.
#
# @param h [Hash] dump structure with "value_counts", "type_counts",
#   "total", optionally "max_values", and optionally the four
#   "numeric_*" entries
# @return [PositionStats] a new instance carrying the dumped state
def self.from_dump(h)
  # A dump without "max_values" falls back to the constructor default.
  # Forwarding nil would leave the cap unset and make the next #observe
  # raise when it compares the table size against nil.
  cap   = h["max_values"]
  stats = cap ? new(max_values: cap) : new
  stats.instance_variable_set(:@total, h["total"])

  # Merging into Hash.new(0) keeps the zero default that #observe
  # relies on for `+= 1`, and copies the input so the dump structure is
  # not aliased. Type keys are dumped as strings and go back to symbols.
  value_table = Hash.new(0).merge(h["value_counts"])
  type_table  = Hash.new(0).merge(h["type_counts"].transform_keys(&:to_sym))
  stats.instance_variable_set(:@value_counts, value_table)
  stats.instance_variable_set(:@type_counts, type_table)

  # The numeric entries are only written by #dump when at least one
  # numeric observation exists, so they are optional here.
  if h["numeric_count"]
    stats.instance_variable_set(:@numeric_count, h["numeric_count"])
    stats.instance_variable_set(:@numeric_min, h["numeric_min"])
    stats.instance_variable_set(:@numeric_max, h["numeric_max"])
    # Keep the sum a Float, as the constructor does. A serializer that
    # writes whole-number floats as integers would otherwise hand back
    # an Integer here.
    stats.instance_variable_set(:@numeric_sum, h["numeric_sum"].to_f)
  end
  stats
end

Instance Method Details

#cardinality ⇒ Object



62
63
64
# File 'lib/iriq/position_stats.rb', line 62

# Number of distinct values currently held in the value table.
#
# @return [Integer]
def cardinality
  @value_counts.length
end

#dominant_type ⇒ Object

Most common type. On count ties, breaks lexicographically by type symbol name so the result is deterministic and matches Go’s DominantType (Go’s map iteration is randomized).



84
85
86
87
88
89
90
91
92
93
94
# File 'lib/iriq/position_stats.rb', line 84

# Type with the highest observation count, or nil when no type has
# been recorded. Equal counts are resolved by the lexicographically
# smaller type name, so the answer never depends on hash order.
#
# @return [Object, nil] the winning key of the type table
def dominant_type
  # Fold over the table carrying [leader, leader_count]; the -1 seed
  # lets the first entry with a non-negative count take the lead.
  winner, = @type_counts.reduce([nil, -1]) do |(lead, lead_n), (type, n)|
    strictly_ahead = n > lead_n
    wins_tie       = n == lead_n && type.to_s < lead.to_s
    (strictly_ahead || wins_tie) ? [type, n] : [lead, lead_n]
  end
  winner
end

#dump ⇒ Object



96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/iriq/position_stats.rb', line 96

# Serializes the current state into a plain string-keyed Hash.
#
# The tables are copied (type keys become strings), so the caller may
# freely mutate the result without touching live state. The four
# "numeric_*" entries are present only when the numeric counter is
# positive.
#
# @return [Hash]
def dump
  snapshot = {
    "value_counts" => @value_counts.dup,
    "type_counts"  => @type_counts.transform_keys(&:to_s),
    "total"        => @total,
    "max_values"   => @max_values,
  }
  return snapshot unless @numeric_count.positive?

  snapshot.merge!(
    "numeric_count" => @numeric_count,
    "numeric_min"   => @numeric_min,
    "numeric_max"   => @numeric_max,
    "numeric_sum"   => @numeric_sum
  )
end

#numeric_avg ⇒ Object



40
41
42
43
44
# File 'lib/iriq/position_stats.rb', line 40

# Mean of the numeric range stats: the running sum divided by the
# numeric counter.
#
# @return [Float, nil] the average, or nil when the counter is zero
def numeric_avg
  return nil if @numeric_count.zero?

  # Force float division. The constructor seeds the sum as 0.0, but
  # .from_dump restores whatever the dump holds, and an Integer sum
  # would make `/` truncate (7 / 2 => 3).
  @numeric_sum.to_f / @numeric_count
end

#observe(value, type) ⇒ Object



31
32
33
34
35
36
37
38
# File 'lib/iriq/position_stats.rb', line 31

# Records one observation.
#
# The total and the per-type tally always advance. The per-value tally
# advances only when the value is already tracked or the table still
# has room under max_values; otherwise the value is dropped. The
# observation is then passed to record_numeric.
#
# @param value [Object] the observed value
# @param type [Object] the type assigned to the value
# @return [Object] whatever record_numeric returns
def observe(value, type)
  @total += 1
  @type_counts[type] += 1

  has_room = @value_counts.size < @max_values
  @value_counts[value] += 1 if has_room || @value_counts.key?(value)

  record_numeric(value, type)
end

#value_fraction(value) ⇒ Object



75
76
77
78
79
# File 'lib/iriq/position_stats.rb', line 75

# Share of all observations that carried exactly +value+.
#
# @param value [Object] the value to look up in the value table
# @return [Float] count for +value+ divided by the total; 0.0 when
#   nothing has been observed
def value_fraction(value)
  if @total.zero?
    0.0
  else
    hits = @value_counts[value] || 0
    hits.to_f / @total
  end
end

#variable_fraction(classifier) ⇒ Object

Fraction of observations whose type was variable (i.e. classifier said not :literal).



68
69
70
71
72
73
# File 'lib/iriq/position_stats.rb', line 68

# Share of all observations whose type the classifier reports as
# variable.
#
# @param classifier [#variable?] object queried once per recorded type
# @return [Float] variable observations divided by the total; 0.0 when
#   nothing has been observed
def variable_fraction(classifier)
  return 0.0 if @total.zero?

  variable_types = @type_counts.select { |type, _| classifier.variable?(type) }
  variable_types.values.sum.to_f / @total
end