Class: Ignis::AI::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/nnw/ai/tokenizer.rb

Overview

Hybrid tokenizer: tries native HuggingFace tokenizers DLL first, falls back to pure Ruby BPE. Best of both worlds.

Native DLL: tokenizers_ruby.dll / libtokenizers.so / libtokenizers.dylib. Search paths: model dir, Ignis lib root, system PATH, vcpkg.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(config_path) ⇒ Tokenizer

Returns a new instance of Tokenizer.

Parameters:

  • config_path (String)

    path to tokenizer.json

Raises:

  • (ArgumentError)


32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/nnw/ai/tokenizer.rb', line 32

# Builds a tokenizer from a HuggingFace tokenizer.json file.
#
# A native tokenizers library is searched for (starting from the directory
# that holds the config) and loaded when found. Any failure while loading it
# is logged and the pure Ruby BPE path is used instead. The JSON config is
# parsed in both cases because it supplies the metadata.
#
# @param config_path [String] path to tokenizer.json
# @raise [ArgumentError] if config_path does not point to a regular file
def initialize(config_path)
  # File.file? rather than File.exist?: a directory "exists" but would only
  # fail later with Errno::EISDIR when the config is read.
  raise ArgumentError, "tokenizer.json not found: #{config_path}" unless File.file?(config_path)

  @config_path = config_path
  @native_backend = false
  @native_handle = nil

  # Try native DLL first
  native_dll = find_native_dll(File.dirname(config_path))
  if native_dll
    begin
      load_native_backend(native_dll, config_path)
      @native_backend = true
      Ignis.logger.info("Tokenizer: using native backend (#{File.basename(native_dll)})")
    rescue StandardError, LoadError => e
      # LoadError is a ScriptError, not a StandardError, so a bare `rescue`
      # would let a failed library load abort construction instead of
      # falling back.
      Ignis.logger.warn("Tokenizer: native backend failed (#{e.message}), falling back to Ruby BPE")
      @native_backend = false
    end
  end

  # Always load Ruby config for metadata (special tokens, vocab size)
  # even when native backend is in use.
  # Read explicitly as UTF-8 so the content is never transcoded from the
  # locale encoding (which would corrupt non-ASCII vocabulary entries).
  config = JSON.parse(File.read(config_path, encoding: "UTF-8"))
  load_from_config(config)
end

Instance Attribute Details

#id_to_token ⇒ Hash{Integer => String} (readonly)

Returns id to token.

Returns:

  • (Hash{Integer => String})

    id to token



23
24
25
# File 'lib/nnw/ai/tokenizer.rb', line 23

# Read-only accessor for the id-to-token lookup table.
#
# @return [Hash{Integer => String}] id to token
def id_to_token
  @id_to_token
end

#native_backend ⇒ Boolean (readonly)

Returns whether using native backend.

Returns:

  • (Boolean)

    whether using native backend



15
16
17
# File 'lib/nnw/ai/tokenizer.rb', line 15

# Read-only accessor for the backend flag. The constructor sets it to true
# only after the native library has loaded without raising.
#
# @return [Boolean] whether using native backend
def native_backend
  @native_backend
end

#special_token_ids ⇒ Set<Integer> (readonly)

Returns ids of special tokens (used by inference EOS checks).

Returns:

  • (Set<Integer>)

    ids of special tokens (used by inference EOS checks)



29
30
31
# File 'lib/nnw/ai/tokenizer.rb', line 29

# Read-only accessor for the set of special-token ids. #decode consults this
# set when skip_special_tokens is enabled.
#
# @return [Set<Integer>] ids of special tokens (used by inference EOS checks)
def special_token_ids
  @special_token_ids
end

#special_tokens ⇒ Hash{String => Integer} (readonly)

Returns special tokens.

Returns:

  • (Hash{String => Integer})

    special tokens



26
27
28
# File 'lib/nnw/ai/tokenizer.rb', line 26

# Read-only accessor for the special-token table.
#
# @return [Hash{String => Integer}] special tokens
def special_tokens
  @special_tokens
end

#token_to_id ⇒ Hash{String => Integer} (readonly)

Returns token to id.

Returns:

  • (Hash{String => Integer})

    token to id



20
21
22
# File 'lib/nnw/ai/tokenizer.rb', line 20

# Read-only accessor for the token-to-id lookup table.
#
# @return [Hash{String => Integer}] token to id
def token_to_id
  @token_to_id
end

#vocab_size ⇒ Integer (readonly)

Returns vocabulary size.

Returns:

  • (Integer)

    vocabulary size



17
18
19
# File 'lib/nnw/ai/tokenizer.rb', line 17

# Read-only accessor for the vocabulary size.
#
# @return [Integer] vocabulary size
def vocab_size
  @vocab_size
end

Class Method Details

.from_pretrained(dir) ⇒ Tokenizer

Load from HuggingFace model directory.

Parameters:

  • dir (String)

    directory containing tokenizer.json

Returns:

  • (Tokenizer)



61
62
63
64
# File 'lib/nnw/ai/tokenizer.rb', line 61

# Convenience constructor: builds a tokenizer from the "tokenizer.json" file
# located directly inside +dir+.
#
# @param dir [String] directory containing tokenizer.json
# @return [Tokenizer]
def self.from_pretrained(dir)
  new(File.join(dir, "tokenizer.json"))
end

Instance Method Details

#decode(ids, skip_special_tokens: true) ⇒ String

Decode token ids to text.

Parameters:

  • ids (Array<Integer>)
  • skip_special_tokens (Boolean) (defaults to: true)

Returns:

  • (String)


83
84
85
86
87
88
89
90
# File 'lib/nnw/ai/tokenizer.rb', line 83

# Turn a sequence of token ids back into text.
#
# Ids without an entry in the id-to-token table are dropped. Special-token
# ids are dropped as well unless +skip_special_tokens+ is false. The
# remaining token strings are concatenated and, for byte-level vocabularies,
# passed through decode_byte_level.
#
# @param ids [Array<Integer>]
# @param skip_special_tokens [Boolean]
# @return [String]
def decode(ids, skip_special_tokens: true)
  pieces = []
  ids.each do |token_id|
    next if skip_special_tokens && @special_token_ids.include?(token_id)

    piece = @id_to_token[token_id]
    pieces << piece if piece
  end

  joined = pieces.join("")
  return joined unless @byte_level

  decode_byte_level(joined)
end

#decode_batch(id_sequences, skip_special_tokens: true) ⇒ Array<String>

Batch decode.

Parameters:

  • id_sequences (Array<Array<Integer>>)
  • skip_special_tokens (Boolean) (defaults to: true)

Returns:

  • (Array<String>)


104
105
106
# File 'lib/nnw/ai/tokenizer.rb', line 104

# Decode several id sequences, one result string per input sequence.
#
# @param id_sequences [Array<Array<Integer>>]
# @param skip_special_tokens [Boolean] forwarded unchanged to #decode
# @return [Array<String>]
def decode_batch(id_sequences, skip_special_tokens: true)
  id_sequences.map do |sequence|
    decode(sequence, skip_special_tokens: skip_special_tokens)
  end
end

#encode(text, add_special_tokens: true) ⇒ Array<Integer>

Encode text to token ids. Uses native backend if available, else pure Ruby BPE.

Parameters:

  • text (String)
  • add_special_tokens (Boolean) (defaults to: true)

Returns:

  • (Array<Integer>)


71
72
73
74
75
76
77
# File 'lib/nnw/ai/tokenizer.rb', line 71

# Encode text to token ids.
#
# The native encoder is used only when the native backend is active and
# @native_encode is set; every other case goes through the Ruby BPE encoder.
#
# @param text [String]
# @param add_special_tokens [Boolean]
# @return [Array<Integer>]
def encode(text, add_special_tokens: true)
  use_native = @native_backend && @native_encode
  return ruby_encode(text, add_special_tokens) unless use_native

  native_encode(text, add_special_tokens)
end

#encode_batch(texts, add_special_tokens: true) ⇒ Array<Array<Integer>>

Batch encode.

Parameters:

  • texts (Array<String>)
  • add_special_tokens (Boolean) (defaults to: true)

Returns:

  • (Array<Array<Integer>>)


96
97
98
# File 'lib/nnw/ai/tokenizer.rb', line 96

# Encode several texts, one id array per input text.
#
# @param texts [Array<String>]
# @param add_special_tokens [Boolean] forwarded unchanged to #encode
# @return [Array<Array<Integer>>]
def encode_batch(texts, add_special_tokens: true)
  texts.map do |text|
    encode(text, add_special_tokens: add_special_tokens)
  end
end

#encode_to_tensor(text, device_id: 0) ⇒ Tensor

Encode and return GPU Tensor (int32).

Parameters:

  • text (String)
  • device_id (Integer) (defaults to: 0)

Returns:

  • (Tensor)


112
113
114
115
116
117
# File 'lib/nnw/ai/tokenizer.rb', line 112

# Encode text and wrap the ids in an int32 tensor of shape [1, ids.length].
#
# @param text [String]
# @param device_id [Integer] passed through to the NvArray constructor
# @param add_special_tokens [Boolean] forwarded to #encode; the default of
#   true matches what this method did before the option existed
# @return [Tensor] tensor created with requires_grad: false
def encode_to_tensor(text, device_id: 0, add_special_tokens: true)
  ids = encode(text, add_special_tokens: add_special_tokens)
  nv = Ignis::Shared::NvArray.new(shape: [1, ids.length], dtype: :int32, device_id: device_id)
  nv.from_host(ids)
  Tensor.new(data: nv, requires_grad: false)
end