Class: Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/toy/io/tokenizer.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(vocab, merges, bos_id, eos_id, pad_id, unk_id, model_name = "", add_bos = false) ⇒ Tokenizer

Returns a new instance of Tokenizer.



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/toy/io/tokenizer.rb', line 28

# Build a tokenizer from already-loaded vocabulary data.
#
# vocab      - token strings, indexed by token id.
# merges     - BPE merge rules as "a b" strings; the array index is the rank
#              (lower index = applied first). Empty for unigram SPM vocabs.
# bos_id, eos_id, pad_id, unk_id - special-token ids, stored as given.
# model_name - tokenizer kind string; "llama" selects the SPM path.
# add_bos    - stored as given; encode consults it on the SPM-unigram path.
def initialize(vocab, merges, bos_id, eos_id, pad_id, unk_id,
               model_name = "", add_bos = false)
  n_vocab  = vocab.length
  n_merges = merges.length

  @vocab      = vocab
  @merges     = merges
  @vocab_size = n_vocab
  @present    = (n_vocab > 0)

  @bos_id  = bos_id
  @eos_id  = eos_id
  @pad_id  = pad_id
  @unk_id  = unk_id
  @add_bos = add_bos

  # Tokenizer family: SPM (SentencePiece, marker U+2581 ▁) versus GPT-2
  # byte-level BPE. Two independent signals are OR-ed together.
  #
  # Signal 1, vocab shape: historic SPM models put <0x00> at index 3,
  # right after <unk>/<s>/</s>. Gemma 2 has a longer special-token band,
  # so <0x00> sits further in and this signal is false for it.
  #
  # Signal 2, model_name == "llama": this is what picks up Gemma 2 (and
  # any later SPM model the vocab-shape test misses).
  #
  # model_name is deliberately not trusted on its own: older project
  # converters wrote "gpt2" for SPM models (Mistral's tokenizer GGUF is
  # the canonical example), so relying on it alone would send Mistral
  # down the wrong path. Either signal saying SPM is enough.
  looks_spm = false
  if n_vocab > 3
    looks_spm = (vocab[3] == "<0x00>")
  end
  named_spm = (model_name == "llama")
  @spm = looks_spm || named_spm

  # T-Gemma (#117): SPM has two encoding paths, told apart by whether
  # any merge rules were supplied:
  #   - BPE+scores (Mistral, Llama-1/2, TinyLlama): merges present, encoded
  #     with the merge-loop algorithm.
  #   - Unigram (Gemma 2, newer SPM models): no merges; the vocab itself is
  #     the model and encoding is greedy longest-match (an approximation
  #     of a proper Viterbi decode).
  @spm_unigram = @spm && (n_merges == 0)

  # token string -> id (inverse of vocab). A repeated token string keeps
  # the id of its last occurrence.
  inv = {}
  tid = 0
  while tid < n_vocab
    inv[vocab[tid]] = tid
    tid = tid + 1
  end
  @vocab_inv = inv

  # "a b" -> rank, where rank is the rule's position in merges.
  ranks = {}
  rank = 0
  while rank < n_merges
    ranks[merges[rank]] = rank
    rank = rank + 1
  end
  @merge_rank = ranks

  # The GPT-2 byte<->char tables are left unset here and built on first
  # use by build_byte_tables. (Building them in the constructor alongside
  # the large vocab/merge hashes used to segfault under Spinel.)
  @byte_to_char = nil
  @char_to_byte = nil

  # Set once the encode paths have reported a piece missing from the
  # vocab; they print that warning only for the first miss so long
  # prompts do not flood the output.
  @warned_unk = false
end

Instance Attribute Details

#add_bosObject

Returns the value of attribute add_bos.



24
25
26
# File 'lib/toy/io/tokenizer.rb', line 24

# Reader for @add_bos, which the constructor stores from its add_bos argument.
def add_bos
  @add_bos
end

#bos_idObject (readonly)

Returns the value of attribute bos_id.



24
25
26
# File 'lib/toy/io/tokenizer.rb', line 24

# Reader for @bos_id, which the constructor stores from its bos_id argument.
def bos_id
  @bos_id
end

#eos_idObject (readonly)

Returns the value of attribute eos_id.



24
25
26
# File 'lib/toy/io/tokenizer.rb', line 24

# Reader for @eos_id, which the constructor stores from its eos_id argument.
def eos_id
  @eos_id
end

#pad_idObject (readonly)

Returns the value of attribute pad_id.



24
25
26
# File 'lib/toy/io/tokenizer.rb', line 24

# Reader for @pad_id, which the constructor stores from its pad_id argument.
def pad_id
  @pad_id
end

#presentObject (readonly)

Returns the value of attribute present.



24
25
26
# File 'lib/toy/io/tokenizer.rb', line 24

# Reader for @present, which the constructor sets to true when the vocab
# it was given is non-empty.
def present
  @present
end

#spmObject (readonly)

Returns the value of attribute spm.



24
25
26
# File 'lib/toy/io/tokenizer.rb', line 24

# Reader for @spm, which the constructor sets to true when vocab[3] is
# "<0x00>" or model_name is "llama".
def spm
  @spm
end

#spm_unigramObject (readonly)

Returns the value of attribute spm_unigram.



24
25
26
# File 'lib/toy/io/tokenizer.rb', line 24

# Reader for @spm_unigram, which the constructor sets to true when @spm is
# true and the merges array is empty.
def spm_unigram
  @spm_unigram
end

#unk_idObject (readonly)

Returns the value of attribute unk_id.



24
25
26
# File 'lib/toy/io/tokenizer.rb', line 24

# Reader for @unk_id, which the constructor stores from its unk_id argument.
def unk_id
  @unk_id
end

#vocab_sizeObject (readonly)

Returns the value of attribute vocab_size.



24
25
26
# File 'lib/toy/io/tokenizer.rb', line 24

# Reader for @vocab_size, which the constructor sets to the length of the
# vocab array.
def vocab_size
  @vocab_size
end

Class Method Details

.cp_to_utf8(c) ⇒ Object

Codepoint → UTF-8 string. Used only for codepoints < 0x800 (the GPT-2 mapping maxes at 0x143). Spinel-friendly: no Encoding::UTF_8.



139
140
141
142
143
144
145
146
147
148
149
# File 'lib/toy/io/tokenizer.rb', line 139

# Encode a Unicode codepoint as its UTF-8 byte sequence.
#
# The result is assembled from single-byte Integer#chr strings, with no use
# of Encoding::UTF_8, to stay Spinel-friendly.
#
# c - Integer codepoint.
#
# Returns a string of 1 to 4 bytes for codepoints 0..0x10FFFF. Returns "?"
# for values above 0x10FFFF and for the surrogate range 0xD800..0xDFFF,
# which has no valid UTF-8 encoding.
def self.cp_to_utf8(c)
  if c < 0x80
    return c.chr
  end
  if c < 0x800
    b1 = (0xC0 | (c >> 6)).chr
    b2 = (0x80 | (c & 0x3F)).chr
    return b1 + b2
  end
  if c >= 0xD800 && c <= 0xDFFF
    return "?"
  end
  if c < 0x10000
    b1 = (0xE0 | (c >> 12)).chr
    b2 = (0x80 | ((c >> 6) & 0x3F)).chr
    b3 = (0x80 | (c & 0x3F)).chr
    return b1 + b2 + b3
  end
  if c <= 0x10FFFF
    b1 = (0xF0 | (c >> 18)).chr
    b2 = (0x80 | ((c >> 12) & 0x3F)).chr
    b3 = (0x80 | ((c >> 6) & 0x3F)).chr
    b4 = (0x80 | (c & 0x3F)).chr
    return b1 + b2 + b3 + b4
  end
  "?"
end

.from_gguf(path) ⇒ Object

Build from a GGUF file with embedded tokenizer metadata.



555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
# File 'lib/toy/io/tokenizer.rb', line 555

# Build a Tokenizer from the tokenizer metadata embedded in the GGUF file
# at `path`.
#
# When the loader returns nil, the result is a Tokenizer with an empty
# vocab, no merges and -1 for every special-token id. Otherwise the handle
# is released with tnn_gguf_free once all keys have been read.
def self.from_gguf(path)
  gguf = GgufKV.tnn_gguf_load(path)
  if gguf == nil
    # Seed-and-pop gives an empty array that started life holding a
    # String (presumably for Spinel's element-type inference — confirm).
    none = [""]
    none.pop
    return Tokenizer.new(none, none, -1, -1, -1, -1)
  end

  bos_tok = GgufKV.tnn_gguf_get_u32(gguf, "tokenizer.ggml.bos_token_id")
  eos_tok = GgufKV.tnn_gguf_get_u32(gguf, "tokenizer.ggml.eos_token_id")
  pad_tok = GgufKV.tnn_gguf_get_u32(gguf, "tokenizer.ggml.padding_token_id")
  unk_tok = GgufKV.tnn_gguf_get_u32(gguf, "tokenizer.ggml.unknown_token_id")

  # T-Gemma (#117): tokenizer.ggml.model names the tokenizer kind
  # ("llama" = SPM, "gpt2" = byte-level BPE). Older Llama-1/2 GGUFs may
  # omit it; an empty string is passed on in that case and the Tokenizer
  # constructor's vocab heuristic still applies.
  kind = GgufKV.tnn_gguf_get_str(gguf, "tokenizer.ggml.model")
  if kind == nil
    kind = ""
  end

  # add_bos_token is a per-arch flag (Gemma 2 sets it). Only a value of 1
  # counts as true; the getter is noted to return -1 for a missing key.
  bos_flag = GgufKV.tnn_gguf_get_bool(gguf, "tokenizer.ggml.add_bos_token")
  prepend_bos = (bos_flag == 1)

  tok_count   = GgufKV.tnn_gguf_arr_n(gguf, "tokenizer.ggml.tokens")
  merge_count = GgufKV.tnn_gguf_arr_n(gguf, "tokenizer.ggml.merges")

  # Token strings, in id order. A nil entry is stored as "" so the ids
  # of later tokens do not shift.
  tokens = [""]
  tokens.pop
  if tok_count > 0
    ti = 0
    while ti < tok_count
      tok = GgufKV.tnn_gguf_arr_str(gguf, "tokenizer.ggml.tokens", ti)
      if tok != nil
        tokens.push(tok)
      else
        tokens.push("")
      end
      ti = ti + 1
    end
  end

  # Merge rules, in file order. A nil entry is likewise stored as "".
  rules = [""]
  rules.pop
  if merge_count > 0
    mi = 0
    while mi < merge_count
      rule = GgufKV.tnn_gguf_arr_str(gguf, "tokenizer.ggml.merges", mi)
      if rule != nil
        rules.push(rule)
      else
        rules.push("")
      end
      mi = mi + 1
    end
  end

  GgufKV.tnn_gguf_free(gguf)
  Tokenizer.new(tokens, rules, bos_tok, eos_tok, pad_tok, unk_tok, kind, prepend_bos)
end

Instance Method Details

#build_byte_tablesObject



94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/toy/io/tokenizer.rb', line 94

# Build the GPT-2 byte <-> char lookup tables on first use.
#
# Sets @byte_to_char (a 256-entry array indexed by byte value) and
# @char_to_byte (a hash mapping each of those strings back to its byte).
# Returns immediately when @byte_to_char is already set, so callers can
# invoke it unconditionally before encoding or decoding.
def build_byte_tables
  return if @byte_to_char != nil
  # Seed-and-pop, then pre-fill 256 empty strings so the main loop below
  # can assign by index. (The seed is presumably there for Spinel's
  # element-type inference — confirm.)
  btc = [""]
  btc.pop
  j = 0
  while j < 256
    btc.push("")
    j = j + 1
  end
  # GPT-2 bytes_to_unicode in ONE pass with an inline "kept" boolean.
  # Bytes in these three ranges map to their own codepoint; every other
  # byte maps to 256, 257, … in order (so the space 0x20 → U+0120 = Ġ).
  #
  # SPINEL LANDMINE (the #34 root cause): the previous version used an
  # `is_kept[]` Array<bool> seeded with `false` + a separate `if !is_kept[b]`
  # pass. Under Spinel that else-branch NEVER ran (n_mapped stayed 0), so the
  # mapped chars — including Ġ for 0x20 — were never built. `@byte_to_char[32]`
  # came out empty, encode dropped every leading space and selected the
  # space-less token (`upon`=25705 instead of `Ġupon`=1980), and decode had no
  # marker to restore. Inline the test as a plain boolean and branch on `k`
  # (no bool array, no `!`) — verified to produce btc[0x20]=Ġ.
  n_mapped = 0   # how many non-kept bytes have been assigned so far
  b = 0
  while b < 256
    k = (b >= 0x21 && b <= 0x7E) || (b >= 0xA1 && b <= 0xAC) || (b >= 0xAE && b <= 0xFF)
    if k
      btc[b] = Tokenizer.cp_to_utf8(b)
    else
      btc[b] = Tokenizer.cp_to_utf8(256 + n_mapped)
      n_mapped = n_mapped + 1
    end
    b = b + 1
  end
  @byte_to_char = btc
  # Inverse table, filled in byte order from the finished array.
  ctb = {}
  i = 0
  while i < btc.length
    ctb[btc[i]] = i
    i = i + 1
  end
  @char_to_byte = ctb
end

#decode(ids) ⇒ Object

Decode IDs → text. Walks token byte-chars, maps each back to its original byte, returns the concatenated UTF-8 string.



160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# File 'lib/toy/io/tokenizer.rb', line 160

# Decode token ids back to text.
#
# ids - array of Integer token ids.
#
# With no vocab loaded, prints a notice and returns "". SPM tokenizers are
# delegated to decode_spm. Otherwise (GPT-2 byte-level BPE) the token
# strings are concatenated, skipping BOS/EOS/PAD ids, and every byte-char
# of the result is mapped back to its original byte through @char_to_byte.
# A char with no entry in that table is emitted as "?".
def decode(ids)
  if !@present
    puts "Tokenizer.decode: vocab not loaded (re-convert with --with-tokenizer)"
    return ""
  end
  if @spm
    return decode_spm(ids)
  end
  build_byte_tables
  chained = ""
  i = 0
  while i < ids.length
    tok_id = ids[i]
    if tok_id == @bos_id || tok_id == @eos_id || tok_id == @pad_id
      i = i + 1
      next
    end
    chained = chained + token_at(tok_id)
    i = i + 1
  end
  out = ""
  chars = chained.chars
  j = 0
  while j < chars.length
    c = chars[j]
    # Test membership with has_key?, as the encode paths do: under Spinel
    # Hash#[] yields 0 rather than nil for a missing key, so a
    # `@char_to_byte[c] == nil` check never fires there and an unknown
    # char would come out as byte 0 instead of "?".
    if @char_to_byte.has_key?(c)
      out = out + @char_to_byte[c].chr
    else
      out = out + "?"
    end
    j = j + 1
  end
  out
end

#decode_spm(ids) ⇒ Object

T1.3: SentencePiece decode. Concatenate token strings; replace ▁ with a space; collapse byte-fallback <0xHH> sequences into UTF-8 bytes. Llama-1/2 / Mistral / TinyLlama use this path.

SPM tokenizers prepend a leading ▁ to encode the first word’s boundary (Llama-2 / Mistral convention — encoding “X” gives [“▁X”]). On decode, we strip exactly one leading ▁ at the start of the output so the round-trip is lossless. After the first piece, ▁ in the middle of a token (e.g. “▁the”) becomes a regular space.



206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# File 'lib/toy/io/tokenizer.rb', line 206

# SentencePiece decode: turn token ids into text.
#
# ids - array of Integer token ids. BOS/EOS/PAD ids are skipped.
#
# Each remaining piece is handled in one of two ways:
#   - a byte-fallback piece "<0xHH>" (HH = two hex digits) becomes the
#     single byte 0xHH;
#   - any other piece is copied byte by byte, with each ▁ (U+2581, UTF-8
#     0xE2 0x96 0x81) turned into an ASCII space.
# The very first thing emitted is special-cased: if it is a ▁, it is
# dropped rather than turned into a space.
#
# Returns the decoded string.
def decode_spm(ids)
  out = ""
  first_emit = true
  i = 0
  while i < ids.length
    tid = ids[i]
    if tid == @bos_id || tid == @eos_id || tid == @pad_id
      i = i + 1
      next
    end
    piece = token_at(tid)
    # Byte-fallback token: "<0xHH>". Inspected via byte indexing because
    # Spinel's String#[Range] can mis-slice on multi-char ranges.
    pb = piece.bytes
    is_byte_tok = false
    if pb.length == 6 && pb[0] == 60 && pb[1] == 48 && pb[2] == 120 && pb[5] == 62
      # The "<0x" ... ">" frame alone is not enough: hex_digit_value maps
      # any non-hex byte to 0, so a literal piece such as "<0xZZ>" would
      # otherwise decode to byte 0. Require two real hex digits.
      hi = pb[3]
      lo = pb[4]
      hi_ok = (hi >= 48 && hi <= 57) || (hi >= 65 && hi <= 70) || (hi >= 97 && hi <= 102)
      lo_ok = (lo >= 48 && lo <= 57) || (lo >= 65 && lo <= 70) || (lo >= 97 && lo <= 102)
      is_byte_tok = hi_ok && lo_ok
    end
    if is_byte_tok
      out = out + ((hex_digit_value(pb[3]) << 4) | hex_digit_value(pb[4])).chr
      first_emit = false
    else
      # Walk UTF-8 bytes; collapse 0xE2 0x96 0x81 (▁) into an ASCII
      # space, but skip the very first ▁ if it's the leading-space
      # encoding marker.
      bi = 0
      while bi < pb.length
        if bi + 2 < pb.length && pb[bi] == 226 && pb[bi + 1] == 150 && pb[bi + 2] == 129
          if first_emit
            # Drop the leading ▁
          else
            out = out + " "
          end
          first_emit = false
          bi = bi + 3
        else
          out = out + pb[bi].chr
          first_emit = false
          bi = bi + 1
        end
      end
    end
    i = i + 1
  end
  out
end

#encode(text) ⇒ Object

Encode text → IDs. Pre-tokenize via regex; for each chunk, run the byte→char map then BPE merge loop; lookup pieces in vocab.



261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
# File 'lib/toy/io/tokenizer.rb', line 261

# Encode text into token ids.
#
# text - the String to tokenize.
#
# Dispatch, in order:
#   1. no vocab loaded        -> prints a notice, returns [].
#   2. SPM unigram tokenizer  -> encode_spm_unigram, optionally preceded
#                                by BOS.
#   3. SPM (BPE) tokenizer    -> encode_spm.
#   4. otherwise              -> GPT-2 byte-level BPE, done here: regex
#      pre-tokenization, byte -> byte-char lifting, merge loop, vocab
#      lookup.
#
# Returns an array of Integer ids.
def encode(text)
  if !@present
    puts "Tokenizer.encode: vocab not loaded (re-convert with --with-tokenizer)"
    return []
  end

  # T-Gemma (#117): BOS is prepended on the SPM-unigram path only. Gemma 2
  # needs it (without BOS at position 0 the model output degenerates).
  # The byte-level BPE path (SmolLM2/Qwen3) and the BPE-SPM path
  # (Mistral/TinyLlama) are left as they were so canonical prompts keep
  # tokenizing bit-identically.
  if @spm_unigram
    out = [0]
    out.pop
    if @add_bos && @bos_id != nil && @bos_id >= 0
      out.push(@bos_id)
    end
    tail = encode_spm_unigram(text)
    t = 0
    while t < tail.length
      out.push(tail[t])
      t = t + 1
    end
    return out
  end

  return encode_spm(text) if @spm

  build_byte_tables
  out = [0]
  out.pop

  # Pre-tokenizer regex (Llama-3 / cl100k_base style, ASCII fallback).
  splitter = /'s|'t|'re|'ve|'m|'ll|'d|'S|'T|'RE|'VE|'M|'LL|'D|[^\r\na-zA-Z0-9]?[a-zA-Z]+|[0-9]{1,3}| ?[^\sa-zA-Z0-9]+[\r\n]*|\s+/
  words = text.scan(splitter)

  w = 0
  while w < words.length
    # Lift the chunk's raw bytes into GPT-2 byte-chars.
    raw = words[w].bytes
    lifted = ""
    r = 0
    while r < raw.length
      lifted = lifted + @byte_to_char[raw[r]]
      r = r + 1
    end

    # BPE merge loop. Begin with one piece per byte-char; on every pass
    # fuse the adjacent pair with the lowest merge rank (leftmost on a
    # tie), and stop once a pass finds no pair with a merge rule.
    parts = lifted.chars
    merging = true
    while merging
      low_rank = 999999999
      low_at = -1
      last = parts.length - 1
      at = 0
      while at < last
        pair = parts[at] + " " + parts[at + 1]
        # IMPORTANT: under Spinel `Hash#[missing_key]` yields the integer
        # 0, not nil. Without this has_key? guard every absent pair would
        # look like rank 0 (top priority), BPE would apply merges that do
        # not exist, and the resulting pieces would not be in the vocab.
        # The symptom shows on SmolLM2 (sparser merges), but the hazard
        # is the same for every model.
        if @merge_rank.has_key?(pair)
          rank = @merge_rank[pair]
          if rank < low_rank
            low_rank = rank
            low_at = at
          end
        end
        at = at + 1
      end
      if low_at >= 0
        parts[low_at] = parts[low_at] + parts[low_at + 1]
        parts.delete_at(low_at + 1)
      else
        merging = false
      end
    end

    # Vocab lookup, with the same has_key? rule. Without it a missing
    # piece would silently resolve to id 0 (whatever vocab[0] is, usually
    # a special token such as <|endoftext|>), which decode then strips:
    # the text would round-trip with characters silently dropped.
    q = 0
    while q < parts.length
      part = parts[q]
      if @vocab_inv.has_key?(part)
        out.push(@vocab_inv[part])
      else
        # Report only the first miss.
        if !@warned_unk
          puts "WARN: tokenizer: piece " + part.inspect +
               " not in vocab — emitting UNK (this prompt may decode lossy)"
          @warned_unk = true
        end
        # With no usable UNK id the piece contributes nothing.
        if @unk_id != nil && @unk_id >= 0
          out.push(@unk_id)
        end
      end
      q = q + 1
    end

    w = w + 1
  end
  out
end

#encode_spm(text) ⇒ Object

T1.3: SentencePiece encode. Llama-1/2 / Mistral / TinyLlama. Differs from GPT-2 byte-level BPE in two ways:

- leading space is encoded as ▁ (U+2581), not Ġ
- chars not in vocab fall back to per-UTF-8-byte <0xHH> tokens
  instead of going through a fixed byte-to-char map

Algorithm: prepend ▁; replace each space with ▁; split into chars; byte-fallback any char missing from vocab; then run the BPE merge loop (identical to the GPT-2 path, same has_key? rule for Spinel).



368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
# File 'lib/toy/io/tokenizer.rb', line 368

# SentencePiece (BPE) encode.
#
# text - the String to tokenize.
#
# Steps: put a ▁ in front of the text and swap every ASCII space for ▁;
# split into characters; replace any character missing from the vocab with
# one "<0xHH>" piece per UTF-8 byte; run the BPE merge loop; look each
# final piece up in the vocab.
#
# Returns an array of Integer ids. A final piece missing from the vocab
# becomes @unk_id (or nothing, when @unk_id is nil or negative), and the
# first such miss prints a warning.
def encode_spm(text)
  # ▁ = U+2581 = 0xE2 0x96 0x81 in UTF-8. The text is rebuilt byte by
  # byte to sidestep encoding concerns under Spinel.
  marker = "\xE2\x96\x81"
  src = text.bytes
  shaped = marker + ""   # leading ▁
  s = 0
  while s < src.length
    byte = src[s]
    if byte != 0x20
      shaped = shaped + byte.chr
    else
      shaped = shaped + marker   # ASCII space → ▁
    end
    s = s + 1
  end

  # Character split, with byte-fallback for characters the vocab lacks:
  # such a character is broken into "<0xHH>" pieces (uppercase hex, two
  # digits), one per UTF-8 byte. Those byte pieces are expected to be in
  # the vocab.
  glyphs = shaped.chars
  parts = [""]
  parts.pop
  g = 0
  while g < glyphs.length
    glyph = glyphs[g]
    if @vocab_inv.has_key?(glyph)
      parts.push(glyph)
    else
      raw = glyph.bytes
      r = 0
      while r < raw.length
        digits = raw[r].to_s(16).upcase
        if digits.length == 1
          digits = "0" + digits
        end
        parts.push("<0x" + digits + ">")
        r = r + 1
      end
    end
    g = g + 1
  end

  # BPE merge loop, same shape as the GPT-2 path: merge keys are
  # space-delimited "a b" strings, and has_key? is required because
  # Spinel's Hash#[] returns 0 for a missing key. Each pass fuses the
  # lowest-ranked adjacent pair (leftmost on a tie).
  merging = true
  while merging
    low_rank = 999999999
    low_at = -1
    last = parts.length - 1
    at = 0
    while at < last
      pair = parts[at] + " " + parts[at + 1]
      if @merge_rank.has_key?(pair)
        rank = @merge_rank[pair]
        if rank < low_rank
          low_rank = rank
          low_at = at
        end
      end
      at = at + 1
    end
    if low_at >= 0
      parts[low_at] = parts[low_at] + parts[low_at + 1]
      parts.delete_at(low_at + 1)
    else
      merging = false
    end
  end

  # Vocab lookup. A miss is never mapped silently: the first one is
  # reported, and UNK is pushed only when a usable id exists.
  out = [0]
  out.pop
  q = 0
  while q < parts.length
    part = parts[q]
    if @vocab_inv.has_key?(part)
      out.push(@vocab_inv[part])
    else
      if !@warned_unk
        puts "WARN: tokenizer(spm): piece " + part.inspect +
             " not in vocab — emitting UNK"
        @warned_unk = true
      end
      if @unk_id != nil && @unk_id >= 0
        out.push(@unk_id)
      end
    end
    q = q + 1
  end
  out
end

#encode_spm_unigram(text) ⇒ Object

T-Gemma (#117): SPM Unigram encode (Gemma 2 and similar models whose GGUF carries `tokenizer.ggml.tokens` + `tokenizer.ggml.scores` but NO `tokenizer.ggml.merges`). The vocab itself IS the model — there are no merge rules to apply.

Algorithm: greedy longest-match over the prefixed string. For each cursor position, find the longest substring (up to `max_piece` = 64 characters) that exists in the vocab; emit its id; advance past it. When nothing matches, fall back to per-UTF-8-byte <0xHH> tokens for the single character at the cursor.

This is an APPROXIMATION of the proper Unigram tokenizer (which does Viterbi over piece scores to find the maximum-score segmentation). For Gemma 2’s vocab=256000, greedy longest-match produces sensible tokenization for prose; rare-character cases may differ from the canonical SentencePiece library output by a token here or there.

Spinel constraint: use Hash#has_key? rather than Hash#[]; the latter returns 0 for missing keys (landmine #9). Candidate pieces are assembled by concatenating entries of `String#chars`, not by `String#[Range]` slicing, which the decode_spm comments flag as able to mis-slice under Spinel.



475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
# File 'lib/toy/io/tokenizer.rb', line 475

# SPM unigram encode by greedy longest-match (an approximation of the
# score-based Viterbi segmentation a real unigram tokenizer performs).
#
# text - the String to tokenize.
#
# The text gets a leading ▁ and every ASCII space becomes ▁. Then, at each
# cursor position, the longest vocab entry of at most 64 characters is
# emitted. If nothing matches, the single character at the cursor is
# emitted as one "<0xHH>" byte token per UTF-8 byte.
#
# Returns an array of Integer ids. A byte token missing from the vocab
# becomes @unk_id (or nothing, when @unk_id is nil or negative), and the
# first such miss prints a warning.
def encode_spm_unigram(text)
  ids = [0]; ids.pop
  # Prepend ▁ + replace each space with ▁. Same prefix shape as the
  # BPE-SPM path; ▁ = U+2581 = 0xE2 0x96 0x81 in UTF-8.
  sp = "\xE2\x96\x81"
  text_bytes = text.bytes
  prepared = sp + ""
  tb = 0
  while tb < text_bytes.length
    b = text_bytes[tb]
    if b == 0x20         # ASCII space → ▁
      prepared = prepared + sp
    else
      prepared = prepared + b.chr
    end
    tb = tb + 1
  end

  # Walk character-by-character (NOT byte-by-byte: multi-byte UTF-8 such
  # as ▁ must stay intact for the vocab lookup). The 64-char cap is meant
  # to cover the longest pieces in known SPM vocabs.
  chars      = prepared.chars
  n          = chars.length
  max_piece  = 64
  pos        = 0
  while pos < n
    jmax = pos + max_piece
    if jmax > n; jmax = n; end
    # Grow the candidate one character at a time and remember the LAST
    # hit: that is the longest match inside the window. This selects the
    # same piece as shrinking from the longest candidate, but builds each
    # prefix once instead of re-concatenating the whole candidate for
    # every length tried.
    hit_id  = -1
    hit_len = 0
    piece   = ""
    j       = pos
    while j < jmax
      piece = piece + chars[j]
      j = j + 1
      # has_key?, not Hash#[]: under Spinel the latter yields 0 for a
      # missing key.
      if @vocab_inv.has_key?(piece)
        hit_id  = @vocab_inv[piece]
        hit_len = j - pos
      end
    end
    if hit_id >= 0
      ids.push(hit_id)
      pos = pos + hit_len
    else
      # Byte-fallback: decompose the single character at `pos` into
      # per-byte <0xHH> tokens (uppercase hex, two digits). SPM vocabs are
      # expected to carry all 256 byte tokens for exactly this case.
      ch = chars[pos]
      cbytes = ch.bytes
      cbi = 0
      while cbi < cbytes.length
        hex = cbytes[cbi].to_s(16).upcase
        if hex.length == 1; hex = "0" + hex; end
        tag = "<0x" + hex + ">"
        if @vocab_inv.has_key?(tag)
          ids.push(@vocab_inv[tag])
        else
          if !@warned_unk
            puts "WARN: tokenizer(spm-unigram): byte-fallback " + tag +
                 " not in vocab — emitting UNK"
            @warned_unk = true
          end
          if @unk_id != nil && @unk_id >= 0
            ids.push(@unk_id)
          end
        end
        cbi = cbi + 1
      end
      pos = pos + 1
    end
  end
  ids
end

#hex_digit_value(b) ⇒ Object

ASCII hex char → 0..15. Accepts '0'..'9', 'A'..'F' and 'a'..'f'; any other byte yields 0, so a caller that must tell an invalid digit from '0' has to validate the byte itself first.



252
253
254
255
256
257
# File 'lib/toy/io/tokenizer.rb', line 252

# Numeric value (0..15) of an ASCII hex digit given as a byte.
#
# b - Integer byte value.
#
# Returns 0 for any byte that is not '0'..'9', 'A'..'F' or 'a'..'f'.
def hex_digit_value(b)
  value = 0
  if b >= 97 && b <= 102        # 'a'..'f'
    value = b - 87
  elsif b >= 65 && b <= 70      # 'A'..'F'
    value = b - 55
  elsif b >= 48 && b <= 57      # '0'..'9'
    value = b - 48
  end
  value
end

#token_at(id) ⇒ Object



151
152
153
154
155
156
# File 'lib/toy/io/tokenizer.rb', line 151

# Token string for a token id.
#
# id - Integer token id.
#
# Returns @vocab[id], or "" when id is negative or not below @vocab_size.
def token_at(id)
  in_range = (id >= 0 && id < @vocab_size)
  if in_range
    return @vocab[id]
  end
  ""
end