Module: ToyTap

Defined in:
lib/toy/dev/toy_tap.rb

Class Method Summary collapse

Class Method Details

.emit(sess, region, tensor, layer, head, step, t_now, n_heads) ⇒ Object

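Emits a single tap event. Pass a negative layer / head (e.g. -1) to write that field as null in the JSON. n_heads>0 enables per_head_l2; the field is emitted only when the element count n is divisible by n_heads (and n_heads <= n), and is left out of the event otherwise.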



31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/toy/dev/toy_tap.rb', line 31

# Emits one "tap" event (kind "tap", phase "train") describing +tensor+.
#
# The event carries "t", "step", "region", "layer", "head", "shape"
# ([ne0, ne1]), "dtype", "l2", "abs_mean" and "nan_count". The scalar
# stats come from the TinyNN.tnn_scratch_*_f32 helpers, called right
# after TinyNN.tnn_download(sess, tensor).
#
# sess    - session handle, passed through to the TinyNN calls.
# region  - value written to the "region" field.
# tensor  - tensor handle; nil or TinyNN.tnn_null_ptr makes this a no-op.
# layer   - written as a number when >= 0, as JSON null otherwise.
# head    - written as a number when >= 0, as JSON null otherwise.
# step    - value written to the "step" field.
# t_now   - value written to the "t" field.
# n_heads - when > 0, <= n and dividing the element count n evenly, a
#           "per_head_l2" array with one L2 norm per contiguous chunk
#           of n / n_heads elements is added; otherwise it is left out.
#
# Nothing is emitted for a nil/null tensor or for a tensor with no
# elements (abs_mean would be 0/0 for the latter).
def self.emit(sess, region, tensor, layer, head, step, t_now, n_heads)
  if tensor == nil || tensor == TinyNN.tnn_null_ptr
    return
  end
  n = TinyNN.tnn_tensor_nelements(tensor)
  # An empty tensor has no meaningful stats and abs_mean would divide
  # by zero, so skip the event entirely.
  if n <= 0
    return
  end
  TinyNN.tnn_download(sess, tensor)
  sum_sq  = TinyNN.tnn_scratch_sum_sq_f32(sess, n)
  sum_abs = TinyNN.tnn_scratch_sum_abs_f32(sess, n)
  nan_n   = TinyNN.tnn_scratch_nan_count_f32(sess, n)
  l2       = sum_sq ** 0.5
  abs_mean = sum_abs / n.to_f
  ne0 = TinyNN.tnn_tensor_ne0(tensor)
  ne1 = TinyNN.tnn_tensor_ne1(tensor)
  dtype = TinyNN.tnn_tensor_dtype(tensor)
  # Numeric dtype code -> name; any code not listed is reported as "f32".
  dn = "f32"
  if dtype == 1
    dn = "f16"
  elsif dtype == 8
    dn = "q8_0"
  elsif dtype == 26
    dn = "i32"
  elsif dtype == 30
    dn = "bf16"
  end

  ev = SpinelKit::Json::Builder.new
  ev.add_str("kind",  "tap")
  ev.add_str("phase", "train")
  ev.add_num("t",      t_now)
  ev.add_num("step",   step)
  ev.add_str("region", region)
  if layer >= 0
    ev.add_num("layer", layer)
  else
    ev.add_raw("layer", "null")
  end
  if head >= 0
    ev.add_num("head", head)
  else
    ev.add_raw("head", "null")
  end
  ev.add_raw("shape",     "[" + ne0.to_s + "," + ne1.to_s + "]")
  ev.add_str("dtype",     dn)
  ev.add_num("l2",        l2)
  ev.add_num("abs_mean",  abs_mean)
  ev.add_num("nan_count", nan_n)

  # per_head_l2: download to f64 Mat, split into n_heads contiguous
  # chunks, L2 each. Costs n*8 bytes of host traffic + an O(n) loop
  # — comparable to the scratch-stat path; we avoid adding a
  # tnn_scratch_*_range C primitive to keep the FFI surface minimal.
  if n_heads > 0 && n_heads <= n && (n % n_heads) == 0
    per = n / n_heads
    buf = Mat.new(1, n)
    TinyNN.tnn_download_to_f64_array(sess, tensor, buf.flat, n)
    phl2 = "["
    h = 0
    while h < n_heads
      base = h * per
      sq = 0.0
      i = 0
      while i < per
        v = buf.flat[base + i]
        sq = sq + v * v
        i = i + 1
      end
      l2h = sq ** 0.5
      # This array is spliced in as raw JSON, and JSON has no literal
      # for NaN or Infinity. (x - x) is 0.0 only for finite x, so any
      # non-finite norm is written as null instead of "NaN"/"Infinity".
      l2s = "null"
      if (l2h - l2h) == 0.0
        l2s = l2h.to_s
      end
      phl2 = phl2 + (h == 0 ? "" : ",") + l2s
      h = h + 1
    end
    phl2 = phl2 + "]"
    ev.add_raw("per_head_l2", phl2)
  end

  TinyNN.tnn_events_emit(ev.dump)
end

.emit_cka(sess, region, tensor, layer, head, step, t_now) ⇒ Object

GH#15 — Activation-Gram tap for linear CKA. For an activation A of shape [d, T] in ggml column-major layout, computes G = Aᵀ·A (T×T) and emits it as a `gram` field on the tap event alongside the standard scalar stats. T×T fits easily in JSON for typical context lengths (e.g. T=32 → 32×32 = 1024 floats = a few KB per event).

Region naming convention is the issue’s: attn_q_post_rope, ffn_out, resid_post_block, etc. — user-chosen + stable across runs.



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/toy/dev/toy_tap.rb', line 115

# Emits one "tap" event that also carries the activation Gram matrix
# used for linear CKA.
#
# The tensor is read as a column-major [d, t] matrix A (d = ne0,
# t = ne1) and G = Aᵀ·A (t×t) is attached as the raw-JSON "gram" field,
# an array of t rows with t numbers each. The event also carries "kind",
# "phase", "t", "step", "region", "layer", "head", "shape" ([d, t]),
# "dtype", "l2", "abs_mean" and "nan_count".
#
# sess   - session handle, passed through to the TinyNN calls.
# region - value written to the "region" field.
# tensor - tensor handle; nil or TinyNN.tnn_null_ptr makes this a no-op.
# layer  - written as a number when >= 0, as JSON null otherwise.
# head   - written as a number when >= 0, as JSON null otherwise.
# step   - value written to the "step" field.
# t_now  - value written to the "t" field.
#
# Nothing is emitted when the tensor is nil/null or d <= 0 or t <= 0.
# "dtype" is always reported as "f32" by this method.
def self.emit_cka(sess, region, tensor, layer, head, step, t_now)
  if tensor == nil || tensor == TinyNN.tnn_null_ptr
    return
  end
  d = TinyNN.tnn_tensor_ne0(tensor)
  t = TinyNN.tnn_tensor_ne1(tensor)
  if d <= 0 || t <= 0
    return
  end
  n = d * t
  TinyNN.tnn_download(sess, tensor)
  sum_sq  = TinyNN.tnn_scratch_sum_sq_f32(sess, n)
  sum_abs = TinyNN.tnn_scratch_sum_abs_f32(sess, n)
  nan_n   = TinyNN.tnn_scratch_nan_count_f32(sess, n)
  l2       = sum_sq ** 0.5
  abs_mean = sum_abs / n.to_f

  # Gram G = Aᵀ·A. A is column-major [d, T] → flat[i + j*d] = A[i,j].
  # G[i, j] = sum_k A[k, i] * A[k, j] = sum_k flat[k + i*d] * flat[k + j*d]
  # Cost: T*T*d ops. For T=32, d=64: 65k. For T=128, d=1024: 16M.
  buf = Mat.new(1, n)
  TinyNN.tnn_download_to_f64_array(sess, tensor, buf.flat, n)
  flat = buf.flat

  ev = SpinelKit::Json::Builder.new
  ev.add_str("kind",  "tap")
  ev.add_str("phase", "train")
  ev.add_num("t",      t_now)
  ev.add_num("step",   step)
  ev.add_str("region", region)
  if layer >= 0
    ev.add_num("layer", layer)
  else
    ev.add_raw("layer", "null")
  end
  if head >= 0
    ev.add_num("head", head)
  else
    ev.add_raw("head", "null")
  end
  ev.add_raw("shape",     "[" + d.to_s + "," + t.to_s + "]")
  ev.add_str("dtype",     "f32")
  ev.add_num("l2",        l2)
  ev.add_num("abs_mean",  abs_mean)
  ev.add_num("nan_count", nan_n)

  # Each row is built in its own short string and appended to the
  # result once, so the growing T×T text is copied once per row rather
  # than once per cell.
  gram = "["
  i = 0
  while i < t
    col_i = i * d
    row = "["
    j = 0
    while j < t
      col_j = j * d
      s = 0.0
      k = 0
      while k < d
        s = s + flat[col_i + k] * flat[col_j + k]
        k = k + 1
      end
      # The matrix is spliced in as raw JSON, and JSON has no literal
      # for NaN or Infinity. (x - x) is 0.0 only for finite x, so any
      # non-finite entry is written as null.
      cell = "null"
      if (s - s) == 0.0
        cell = s.to_s
      end
      row = row + (j == 0 ? "" : ",") + cell
      j = j + 1
    end
    gram = gram + (i == 0 ? "" : ",") + row + "]"
    i = i + 1
  end
  gram = gram + "]"
  ev.add_raw("gram", gram)

  TinyNN.tnn_events_emit(ev.dump)
end