Class: RubyLLM::Tokenizer::Backend::SentencePiece

Inherits:
Base
  • Object
show all
Defined in:
lib/ruby_llm/tokenizer/backend/sentencepiece.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from Base

#count, #truncate

Constructor Details

#initialize(model_file: nil, model_file_env: nil) ⇒ SentencePiece

Returns a new instance of SentencePiece.



11
12
13
14
15
16
17
18
# File 'lib/ruby_llm/tokenizer/backend/sentencepiece.rb', line 11

# Sets up the backend: resolves which model file to use, then instantiates
# the SentencePiece processor class with it.
#
# @param model_file [Object, nil] passed through to +resolve_model_file+
# @param model_file_env [Object, nil] passed through to +resolve_model_file+
#   (presumably the name of an environment variable to consult — confirm
#   against that helper)
# @raise [BackendError] whenever a StandardError is raised anywhere in the
#   body; the original error's message is appended to the new one
def initialize(model_file: nil, model_file_env: nil)
  super()
  @model_file = resolve_model_file(model_file, model_file_env)
  @tokenizer = load_sentencepiece_processor_class.new(model_file: @model_file)
rescue StandardError => error
  # @model_file is still nil here when resolution itself failed, so the
  # message then reports "nil" as the model.
  raise BackendError, "Failed to load SentencePiece model #{@model_file.inspect}: #{error.message}"
end

Instance Attribute Details

#model_file ⇒ Object (readonly)

Returns the value of attribute model_file.



9
10
11
# File 'lib/ruby_llm/tokenizer/backend/sentencepiece.rb', line 9

# Read-only accessor for the value stored in +@model_file+.
#
# @return [Object, nil] whatever was assigned to +@model_file+, or nil when
#   it has not been set
def model_file = @model_file

Instance Method Details

#analyze(text) ⇒ Object



28
29
30
31
32
33
# File 'lib/ruby_llm/tokenizer/backend/sentencepiece.rb', line 28

# Encodes +text+ twice -- once as ids and once with +out_type: "str"+ -- and
# bundles both results with this backend's identifier.
#
# @param text [#to_s] input; coerced with +to_s+ before being encoded
# @return [Analysis] built with +tokens:+ (the "str" encoding), +ids:+ (the
#   id encoding) and +model:+ (the value of +identifier+)
def analyze(text)
  input = text.to_s
  token_ids = @tokenizer.encode_as_ids(input)
  pieces = @tokenizer.encode(input, out_type: "str")
  Analysis.new(tokens: pieces, ids: token_ids, model: identifier)
end

#decode(ids) ⇒ Object



24
25
26
# File 'lib/ruby_llm/tokenizer/backend/sentencepiece.rb', line 24

# Hands the given ids to the underlying tokenizer's +decode+.
#
# @param ids [Array, Object, nil] ids to decode; normalised with
#   +Kernel#Array+, so a lone value becomes a one-element array and nil
#   becomes an empty array
# @return [Object] whatever the tokenizer's +decode+ returns
def decode(ids)
  id_list = Array(ids)
  @tokenizer.decode(id_list)
end

#encode(text) ⇒ Object



20
21
22
# File 'lib/ruby_llm/tokenizer/backend/sentencepiece.rb', line 20

# Encodes +text+ through the underlying tokenizer's +encode_as_ids+.
#
# @param text [#to_s] input; coerced with +to_s+ first (so nil is encoded
#   as the empty string)
# @return [Object] whatever the tokenizer's +encode_as_ids+ returns
def encode(text)
  input = text.to_s
  @tokenizer.encode_as_ids(input)
end

#identifier ⇒ Object



35
36
37
# File 'lib/ruby_llm/tokenizer/backend/sentencepiece.rb', line 35

# Label for this backend: the literal prefix "sentencepiece:" followed by
# the interpolated value of +model_file+.
#
# @return [String]
def identifier
  source = model_file
  "sentencepiece:#{source}"
end