Class: Ignis::AI::Tokenizer

Inherits:

Object

Object
Ignis::AI::Tokenizer

show all

Defined in: lib/nnw/ai/tokenizer.rb

Overview

Hybrid tokenizer: tries the native HuggingFace tokenizers library first and falls back to a pure-Ruby BPE implementation when the native backend is unavailable or fails to load. Best of both worlds.

Native DLL: tokenizers_ruby.dll / libtokenizers.so / libtokenizers.dylib.

Search paths: model dir, Ignis lib root, system PATH, vcpkg.

Instance Attribute Summary collapse

#id_to_token ⇒ Hash{Integer => String} readonly

Id to token.
#native_backend ⇒ Boolean readonly

Whether using native backend.
#special_token_ids ⇒ Set<Integer> readonly

Ids of special tokens (used by inference EOS checks).
#special_tokens ⇒ Hash{String => Integer} readonly

Special tokens.
#token_to_id ⇒ Hash{String => Integer} readonly

Token to id.
#vocab_size ⇒ Integer readonly

Vocabulary size.

Class Method Summary collapse

.from_pretrained(dir) ⇒ Tokenizer

Load from HuggingFace model directory.

Instance Method Summary collapse

#decode(ids, skip_special_tokens: true) ⇒ String

Decode token ids to text.
#decode_batch(id_sequences, skip_special_tokens: true) ⇒ Array<String>

Batch decode.
#encode(text, add_special_tokens: true) ⇒ Array<Integer>

Encode text to token ids.
#encode_batch(texts, add_special_tokens: true) ⇒ Array<Array<Integer>>

Batch encode.
#encode_to_tensor(text, device_id: 0) ⇒ Tensor

Encode and return GPU Tensor (int32).
#initialize(config_path) ⇒ Tokenizer constructor

A new instance of Tokenizer.

Constructor Details

#initialize(config_path) ⇒ `Tokenizer`

Returns a new instance of Tokenizer.

Parameters:

config_path (String) —

path to tokenizer.json

Raises:

(ArgumentError)

# File 'lib/nnw/ai/tokenizer.rb', line 32

# Build a tokenizer from a tokenizer.json file.
#
# A native backend is attempted first when a native library can be located
# next to the config; any failure while loading it is logged as a warning and
# the instance stays on the Ruby BPE path. The JSON config is parsed in every
# case because it supplies metadata (special tokens, vocab size) regardless
# of which backend ends up encoding.
#
# @param config_path [String] path to tokenizer.json
# @raise [ArgumentError] if nothing exists at +config_path+
def initialize(config_path)
  unless File.exist?(config_path)
    raise ArgumentError, "tokenizer.json not found: #{config_path}"
  end

  @config_path = config_path
  @native_handle = nil
  @native_backend = false

  # Probe for the native library, starting from the config's directory.
  dll_path = find_native_dll(File.dirname(config_path))
  if dll_path
    begin
      load_native_backend(dll_path, config_path)
      @native_backend = true
      Ignis.logger.info("Tokenizer: using native backend (#{File.basename(dll_path)})")
    rescue StandardError => error
      Ignis.logger.warn("Tokenizer: native backend failed (#{error.message}), falling back to Ruby BPE")
      @native_backend = false
    end
  end

  # Metadata always comes from the JSON config, native backend or not.
  load_from_config(JSON.parse(File.read(config_path)))
end

Instance Attribute Details

#id_to_token ⇒ `Hash{Integer => String}` (readonly)

Returns id to token.

Returns:

(Hash{Integer => String}) —

id to token



23
24
25

# File 'lib/nnw/ai/tokenizer.rb', line 23

# Read-only accessor for the id-to-token table.
#
# @return [Hash{Integer => String}] id to token
def id_to_token
  @id_to_token
end

#native_backend ⇒ `Boolean` (readonly)

Returns whether using native backend.

Returns:

(Boolean) —

whether using native backend



15
16
17

# File 'lib/nnw/ai/tokenizer.rb', line 15

# Read-only accessor for the native-backend flag.
#
# @return [Boolean] whether using native backend
def native_backend
  @native_backend
end

#special_token_ids ⇒ `Set<Integer>` (readonly)

Returns ids of special tokens (used by inference EOS checks).

Returns:

(Set<Integer>) —

ids of special tokens (used by inference EOS checks)



29
30
31

# File 'lib/nnw/ai/tokenizer.rb', line 29

# Read-only accessor for the set of special token ids.
#
# @return [Set<Integer>] ids of special tokens (used by inference EOS checks)
def special_token_ids
  @special_token_ids
end

#special_tokens ⇒ `Hash{String => Integer}` (readonly)

Returns special tokens.

Returns:

(Hash{String => Integer}) —

special tokens



26
27
28

# File 'lib/nnw/ai/tokenizer.rb', line 26

# Read-only accessor for the special-token table.
#
# @return [Hash{String => Integer}] special tokens
def special_tokens
  @special_tokens
end

#token_to_id ⇒ `Hash{String => Integer}` (readonly)

Returns token to id.

Returns:

(Hash{String => Integer}) —

token to id



20
21
22

# File 'lib/nnw/ai/tokenizer.rb', line 20

# Read-only accessor for the token-to-id table.
#
# @return [Hash{String => Integer}] token to id
def token_to_id
  @token_to_id
end

#vocab_size ⇒ `Integer` (readonly)

Returns vocabulary size.

Returns:

(Integer) —

vocabulary size



17
18
19

# File 'lib/nnw/ai/tokenizer.rb', line 17

# Read-only accessor for the vocabulary size.
#
# @return [Integer] vocabulary size
def vocab_size
  @vocab_size
end

Class Method Details

.from_pretrained(dir) ⇒ `Tokenizer`

Load from HuggingFace model directory.

Parameters:

dir (String) —

directory containing tokenizer.json

Returns:

(Tokenizer)

# File 'lib/nnw/ai/tokenizer.rb', line 61

# Load from a HuggingFace model directory.
#
# Joins +dir+ with the fixed file name "tokenizer.json" and hands the
# resulting path to the constructor.
#
# @param dir [String] directory containing tokenizer.json
# @return [Tokenizer]
def self.from_pretrained(dir)
  new(File.join(dir, "tokenizer.json"))
end

Instance Method Details

#decode(ids, skip_special_tokens: true) ⇒ `String`

Decode token ids to text.

Parameters:

ids (Array<Integer>)
skip_special_tokens (Boolean) (defaults to: true)

Returns:

(String)

# File 'lib/nnw/ai/tokenizer.rb', line 83

# Decode token ids to text.
#
# Ids contained in the special-token id set are dropped when
# +skip_special_tokens+ is true, and ids with no entry in the id-to-token
# table are skipped. The remaining token strings are concatenated; when the
# byte-level flag is set the result is passed through +decode_byte_level+.
#
# @param ids [Array<Integer>]
# @param skip_special_tokens [Boolean] (defaults to: true)
# @return [String]
def decode(ids, skip_special_tokens: true)
  pieces = ids.each_with_object([]) do |id, kept|
    next if skip_special_tokens && @special_token_ids.include?(id)

    piece = @id_to_token[id]
    kept << piece if piece
  end

  joined = pieces.join("")
  return joined unless @byte_level

  decode_byte_level(joined)
end

#decode_batch(id_sequences, skip_special_tokens: true) ⇒ `Array<String>`

Batch decode.

Parameters:

id_sequences (Array<Array<Integer>>)
skip_special_tokens (Boolean) (defaults to: true)

Returns:

(Array<String>)



104
105
106

# File 'lib/nnw/ai/tokenizer.rb', line 104

# Batch decode: runs +decode+ on each id sequence, forwarding the
# +skip_special_tokens+ flag unchanged.
#
# @param id_sequences [Array<Array<Integer>>]
# @param skip_special_tokens [Boolean] (defaults to: true)
# @return [Array<String>] one decoded string per input sequence, in order
def decode_batch(id_sequences, skip_special_tokens: true)
  id_sequences.map do |sequence|
    decode(sequence, skip_special_tokens: skip_special_tokens)
  end
end

#encode(text, add_special_tokens: true) ⇒ `Array<Integer>`

Encode text to token ids. Uses native backend if available, else pure Ruby BPE.

Parameters:

text (String)
add_special_tokens (Boolean) (defaults to: true)

Returns:

(Array<Integer>)

# File 'lib/nnw/ai/tokenizer.rb', line 71

# Encode text to token ids.
#
# Dispatches to +native_encode+ only when both the native-backend flag and
# the native encode handle are set; otherwise the pure Ruby BPE path
# (+ruby_encode+) is used.
#
# @param text [String]
# @param add_special_tokens [Boolean] (defaults to: true)
# @return [Array<Integer>]
def encode(text, add_special_tokens: true)
  native_ready = @native_backend && @native_encode
  return native_encode(text, add_special_tokens) if native_ready

  ruby_encode(text, add_special_tokens)
end

#encode_batch(texts, add_special_tokens: true) ⇒ `Array<Array<Integer>>`

Batch encode.

Parameters:

texts (Array<String>)
add_special_tokens (Boolean) (defaults to: true)

Returns:

(Array<Array<Integer>>)



96
97
98

# File 'lib/nnw/ai/tokenizer.rb', line 96

# Batch encode: runs +encode+ on each text, forwarding the
# +add_special_tokens+ flag unchanged.
#
# @param texts [Array<String>]
# @param add_special_tokens [Boolean] (defaults to: true)
# @return [Array<Array<Integer>>] one id list per input text, in order
def encode_batch(texts, add_special_tokens: true)
  texts.map do |text|
    encode(text, add_special_tokens: add_special_tokens)
  end
end

#encode_to_tensor(text, device_id: 0) ⇒ `Tensor`

Encode and return GPU Tensor (int32).

Parameters:

text (String)
device_id (Integer) (defaults to: 0)

Returns:

(Tensor)

# File 'lib/nnw/ai/tokenizer.rb', line 112

# Encode and return GPU Tensor (int32).
#
# The text is encoded with the default +encode+ options, the ids are copied
# into an int32 NvArray of shape [1, number_of_ids] on the requested device,
# and the array is wrapped in a Tensor with gradients disabled.
#
# @param text [String]
# @param device_id [Integer] (defaults to: 0)
# @return [Tensor]
def encode_to_tensor(text, device_id: 0)
  token_ids = encode(text)

  device_array = Ignis::Shared::NvArray.new(
    shape: [1, token_ids.length],
    dtype: :int32,
    device_id: device_id
  )
  device_array.from_host(token_ids)

  Tensor.new(data: device_array, requires_grad: false)
end

Class: Ignis::AI::Tokenizer

Overview

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(config_path) ⇒ Tokenizer

Instance Attribute Details

#id_to_token ⇒ Hash{Integer => String} (readonly)

#native_backend ⇒ Boolean (readonly)

#special_token_ids ⇒ Set<Integer> (readonly)

#special_tokens ⇒ Hash{String => Integer} (readonly)

#token_to_id ⇒ Hash{String => Integer} (readonly)

#vocab_size ⇒ Integer (readonly)

Class Method Details

.from_pretrained(dir) ⇒ Tokenizer

Instance Method Details

#decode(ids, skip_special_tokens: true) ⇒ String

#decode_batch(id_sequences, skip_special_tokens: true) ⇒ Array<String>

#encode(text, add_special_tokens: true) ⇒ Array<Integer>

#encode_batch(texts, add_special_tokens: true) ⇒ Array<Array<Integer>>

#encode_to_tensor(text, device_id: 0) ⇒ Tensor

#initialize(config_path) ⇒ `Tokenizer`

#id_to_token ⇒ `Hash{Integer => String}` (readonly)

#native_backend ⇒ `Boolean` (readonly)

#special_token_ids ⇒ `Set<Integer>` (readonly)

#special_tokens ⇒ `Hash{String => Integer}` (readonly)

#token_to_id ⇒ `Hash{String => Integer}` (readonly)

#vocab_size ⇒ `Integer` (readonly)

.from_pretrained(dir) ⇒ `Tokenizer`

#decode(ids, skip_special_tokens: true) ⇒ `String`

#decode_batch(id_sequences, skip_special_tokens: true) ⇒ `Array<String>`

#encode(text, add_special_tokens: true) ⇒ `Array<Integer>`

#encode_batch(texts, add_special_tokens: true) ⇒ `Array<Array<Integer>>`

#encode_to_tensor(text, device_id: 0) ⇒ `Tensor`