Class: Candle::LLM
- Inherits:
-
Object
- Object
- Candle::LLM
- Defined in:
- lib/candle/llm.rb
Constant Summary collapse
# Tokenizer registry for automatic detection.
#
# String keys are exact GGUF repository IDs mapped to the repository whose
# tokenizer should be used. The :patterns key holds an ordered list of
# [Regexp, tokenizer_id] pairs used as fallbacks: the first matching pattern
# wins, so a more specific pattern must always precede a more general one.
#
# The hash is deliberately not frozen because register_tokenizer adds entries
# to it at runtime.
TOKENIZER_REGISTRY = {
  # Exact model matches
  "TheBloke/Mistral-7B-Instruct-v0.2-GGUF" => "mistralai/Mistral-7B-Instruct-v0.2",
  "TheBloke/Mistral-7B-v0.1-GGUF" => "mistralai/Mistral-7B-v0.1",
  "TheBloke/Llama-2-7B-Chat-GGUF" => "meta-llama/Llama-2-7b-chat-hf",
  "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" => "TinyLlama/TinyLlama-1.1B-Chat-v1.0",

  # Qwen official GGUF models
  "Qwen/Qwen3-8B-GGUF" => "Qwen/Qwen3-8B",
  "Qwen/Qwen3-4B-GGUF" => "Qwen/Qwen3-4B",
  "Qwen/Qwen3-14B-GGUF" => "Qwen/Qwen3-14B",
  "Qwen/Qwen3-32B-GGUF" => "Qwen/Qwen3-32B",
  "Qwen/Qwen3-72B-GGUF" => "Qwen/Qwen3-72B",

  # Phi GGUF models
  "TheBloke/phi-2-GGUF" => "microsoft/phi-2",
  "microsoft/phi-4-gguf" => "microsoft/phi-4",
  "bartowski/Phi-3.5-mini-instruct-GGUF" => "microsoft/Phi-3.5-mini-instruct",

  # Pattern-based fallbacks (evaluated in order)
  :patterns => [
    # Mistral models
    [/mistral.*?7b.*?instruct.*?v0\.2/i, "mistralai/Mistral-7B-Instruct-v0.2"],
    [/mistral.*?7b.*?instruct.*?v0\.1/i, "mistralai/Mistral-7B-Instruct-v0.1"],
    [/mistral.*?7b/i, "mistralai/Mistral-7B-v0.1"],

    # Llama models
    [/llama.*?3.*?8b/i, "meta-llama/Meta-Llama-3-8B"],
    [/llama.*?3.*?70b/i, "meta-llama/Meta-Llama-3-70B"],
    [/llama.*?2.*?7b.*?chat/i, "meta-llama/Llama-2-7b-chat-hf"],
    [/llama.*?2.*?13b.*?chat/i, "meta-llama/Llama-2-13b-chat-hf"],
    [/llama.*?2.*?70b.*?chat/i, "meta-llama/Llama-2-70b-chat-hf"],
    [/tinyllama/i, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"],

    # Gemma models
    [/gemma.*?2.*?9b/i, "google/gemma-2-9b"],
    [/gemma.*?2.*?2b/i, "google/gemma-2-2b"],
    [/gemma.*?7b/i, "google/gemma-7b"],
    [/gemma.*?2b/i, "google/gemma-2b"],

    # Qwen models
    [/qwen.*?3.*?72b/i, "Qwen/Qwen3-72B"],
    [/qwen.*?3.*?32b/i, "Qwen/Qwen3-32B"],
    [/qwen.*?3.*?14b/i, "Qwen/Qwen3-14B"],
    # The fractional sizes must precede the 8b pattern: "1.8b" ends in "8b",
    # so /qwen.*?3.*?8b/ would otherwise claim Qwen3-1.8B model IDs.
    [/qwen.*?3.*?1\.8b/i, "Qwen/Qwen3-1.8B"],
    [/qwen.*?3.*?0\.5b/i, "Qwen/Qwen3-0.5B"],
    [/qwen.*?3.*?8b/i, "Qwen/Qwen3-8B"],
    [/qwen.*?3.*?4b/i, "Qwen/Qwen3-4B"],
    [/qwen.*?2\.5/i, "Qwen/Qwen2.5-0.5B"],
    [/qwen.*?2/i, "Qwen/Qwen2-1.5B"],
    [/qwen/i, "Qwen/Qwen-1_8B"],

    # Phi models (order matters - more specific patterns first)
    [/phi.*?3\.5.*?mini/i, "microsoft/Phi-3.5-mini-instruct"],
    [/phi.*?3.*?mini.*?4k/i, "microsoft/Phi-3-mini-4k-instruct"],
    [/phi.*?3.*?medium/i, "microsoft/Phi-3-medium-4k-instruct"],
    [/phi.*?3.*?small/i, "microsoft/Phi-3-small-8k-instruct"],
    [/phi.*?3.*?mini/i, "microsoft/Phi-3-mini-4k-instruct"],
    [/phi.*?3/i, "microsoft/Phi-3-mini-4k-instruct"],
    [/phi-4/i, "microsoft/phi-4"],
    [/phi.*?2/i, "microsoft/phi-2"],
    [/phi.*?1\.5/i, "microsoft/phi-1_5"],
    [/phi/i, "microsoft/phi-2"]
  ]
}
Class Method Summary collapse
- .from_pretrained(model_id, device: Candle::Device.best, gguf_file: nil, tokenizer: nil) ⇒ Object
-
.guess_tokenizer(model_id) ⇒ Object
Guess the tokenizer for a model.
-
.register_tokenizer(model_pattern, tokenizer_id) ⇒ Object
Allow users to register custom tokenizer mappings.
Instance Method Summary collapse
-
#cached_eos_token ⇒ Object
Cache for EOS token to avoid repeated calls.
-
#chat(messages, **options) ⇒ Object
Chat interface — always returns a String.
-
#chat_stream(messages, **options, &block) ⇒ Object
Streaming chat interface.
-
#chat_with_tools(messages, tools:, execute: false, **options) ⇒ Object
Chat with tool calling — always returns a ToolCallResult. Set execute: true to automatically run the tools (default: false).
-
#constraint_from_regex(pattern) ⇒ Object
Create a structured constraint from a regex pattern. Uses the model’s vocabulary with proper byte encoding handling.
-
#constraint_from_schema(schema) ⇒ Object
Create a structured constraint from a JSON schema. Uses the model’s vocabulary with proper byte encoding handling.
- #generate(prompt, config: GenerationConfig.balanced, reset_cache: true) ⇒ Object
-
#generate_regex(prompt, pattern:, stop_on_match: true, **options) ⇒ Object
Generate with regex constraint.
- #generate_stream(prompt, config: GenerationConfig.balanced, reset_cache: true, &block) ⇒ Object
-
#generate_structured(prompt, schema:, **options) ⇒ Object
Generate and parse structured output from a JSON schema.
-
#inspect ⇒ Object
Inspect method for debugging and exploration.
-
#model_eos_tokens ⇒ Object
Get model-specific EOS tokens.
Class Method Details
.from_pretrained(model_id, device: Candle::Device.best, gguf_file: nil, tokenizer: nil) ⇒ Object
363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 |
# File 'lib/candle/llm.rb', line 363
#
# Load a model, resolving a tokenizer for GGUF repositories when necessary.
#
# The underlying loader receives a single spec string of the form
# "model_id[@gguf_file][@@tokenizer]".
#
# @param model_id [String] model repository ID
# @param device [Object] passed unchanged to _from_pretrained
#   (defaults to Candle::Device.best)
# @param gguf_file [String, nil] when given, appended to the spec after "@"
# @param tokenizer [String, nil] when given, appended to the spec after "@@"
# @return [Object] whatever _from_pretrained returns
# @raise [StandardError] any loader error other than "No tokenizer found"
def self.from_pretrained(model_id, device: Candle::Device.best, gguf_file: nil, tokenizer: nil)
  model_str = gguf_file ? "#{model_id}@#{gguf_file}" : model_id

  # Tokenizer auto-detection applies only to GGUF specs without an explicit tokenizer.
  unless model_str.downcase.include?("gguf") && tokenizer.nil?
    # User-specified tokenizer, or a non-GGUF model.
    model_str = "#{model_str}@@#{tokenizer}" if tokenizer
    return _from_pretrained(model_str, device)
  end

  begin
    # Try to load without a tokenizer first: the GGUF repo may ship one.
    _from_pretrained(model_str, device)
  rescue => e
    # Only the missing-tokenizer failure is recoverable; re-raise everything else.
    raise unless e.message.include?("No tokenizer found")

    detected_tokenizer = guess_tokenizer(model_id)
    Candle.logger.info "No tokenizer found in GGUF repo. Using tokenizer from: #{detected_tokenizer}"
    _from_pretrained("#{model_str}@@#{detected_tokenizer}", device)
  end
end
.guess_tokenizer(model_id) ⇒ Object
Guess the tokenizer for a model
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 |
# File 'lib/candle/llm.rb', line 239
#
# Guess the tokenizer repository for a model.
#
# Resolution order: exact TOKENIZER_REGISTRY entry, then the first matching
# entry of TOKENIZER_REGISTRY[:patterns], then the model ID with common GGUF
# suffixes ("-gguf", or a trailing "-q<digits>_<word>") stripped.
#
# @param model_id [String] model repository ID
# @return [String] tokenizer repository ID
def self.guess_tokenizer(model_id)
  # Check exact matches first
  exact = TOKENIZER_REGISTRY[model_id]
  return exact if exact

  # Check patterns in registration order; first match wins
  patterns = TOKENIZER_REGISTRY[:patterns] || []
  matched = patterns.find { |pattern, _tokenizer| model_id.match?(pattern) }
  return matched.last if matched

  # Default: try removing common GGUF suffixes
  model_id.gsub(/-gguf|-q\d+_\w+$/i, "")
end
.register_tokenizer(model_pattern, tokenizer_id) ⇒ Object
Allow users to register custom tokenizer mappings
227 228 229 230 231 232 233 234 235 236 |
# File 'lib/candle/llm.rb', line 227
#
# Allow users to register custom tokenizer mappings.
#
# A String is stored as an exact-match key in TOKENIZER_REGISTRY. A Regexp is
# prepended to TOKENIZER_REGISTRY[:patterns], so it is tried before the
# patterns already registered.
#
# @param model_pattern [String, Regexp] exact model ID or matching pattern
# @param tokenizer_id [String] tokenizer repository to map to
# @raise [ArgumentError] if model_pattern is neither a String nor a Regexp
def self.register_tokenizer(model_pattern, tokenizer_id)
  case model_pattern
  when String
    TOKENIZER_REGISTRY[model_pattern] = tokenizer_id
  when Regexp
    (TOKENIZER_REGISTRY[:patterns] ||= []).unshift([model_pattern, tokenizer_id])
  else
    raise ArgumentError, "model_pattern must be a String or Regexp"
  end
end
Instance Method Details
#cached_eos_token ⇒ Object
Cache for EOS token to avoid repeated calls
6 7 8 9 10 11 12 |
# File 'lib/candle/llm.rb', line 6
#
# Cache for EOS token to avoid repeated calls.
#
# The result is memoized even when it is nil (no eos_token method, or
# eos_token raised), so the lookup runs at most once per instance.
#
# @return [Object, nil] the value of eos_token, or nil when unavailable
def cached_eos_token
  # `||=` would re-run the lookup on every call whenever the result is nil.
  return @cached_eos_token if defined?(@cached_eos_token)

  @cached_eos_token =
    if respond_to?(:eos_token)
      begin
        eos_token
      rescue StandardError
        # Best effort: a failing lookup is treated as "no EOS token".
        nil
      end
    end
end
#chat(messages, **options) ⇒ Object
Chat interface — always returns a String
256 257 258 259 |
# File 'lib/candle/llm.rb', line 256
#
# Chat interface — always returns a String.
#
# Renders the messages through apply_chat_template and forwards the resulting
# prompt, together with all keyword options, to generate.
#
# @param messages [Object] conversation passed to apply_chat_template
# @param options [Hash] keyword options forwarded unchanged to generate
# @return [Object] the result of generate
def chat(messages, **options)
  prompt = apply_chat_template(messages)
  generate(prompt, **options)
end
#chat_stream(messages, **options, &block) ⇒ Object
Streaming chat interface
262 263 264 265 |
# File 'lib/candle/llm.rb', line 262
#
# Streaming chat interface.
#
# Renders the messages through apply_chat_template and forwards the resulting
# prompt, all keyword options and the given block to generate_stream.
#
# @param messages [Object] conversation passed to apply_chat_template
# @param options [Hash] keyword options forwarded unchanged to generate_stream
# @yield forwarded to generate_stream
# @return [Object] the result of generate_stream
def chat_stream(messages, **options, &block)
  prompt = apply_chat_template(messages)
  generate_stream(prompt, **options, &block)
end
#chat_with_tools(messages, tools:, execute: false, **options) ⇒ Object
Chat with tool calling — always returns a ToolCallResult. Set execute: true to automatically run the tools (default: false).
269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 |
# File 'lib/candle/llm.rb', line 269
#
# Chat with tool calling — always returns a ToolCallResult.
# Set execute: true to automatically run the tools (default: false).
#
# @param messages [Object] conversation; tool instructions are injected into
#   it via inject_tool_instructions before calling chat
# @param tools [Array] tool objects responding to #name and #call(arguments)
# @param execute [Boolean] run the parsed tool calls when true
# @param options [Hash] keyword options forwarded unchanged to chat
# @return [ToolCallResult] when tools are executed, tool_results holds one
#   { tool_call:, result:, error: } hash per parsed call; otherwise it is empty
def chat_with_tools(messages, tools:, execute: false, **options)
  tool_prompt = build_tool_system_prompt(tools)
  augmented = inject_tool_instructions(messages, tool_prompt)

  raw_response = chat(augmented, **options)
  result = ToolCallParser.parse(raw_response, available_tools: tools)

  unless result.has_tool_calls? && execute
    return ToolCallResult.new(
      tool_calls: result.tool_calls,
      tool_results: [],
      # Without any tool call the whole raw response is the text answer.
      text_response: result.has_tool_calls? ? result.text_response : raw_response,
      raw_response: raw_response
    )
  end

  tool_results = result.tool_calls.map do |tool_call|
    tool = tools.find { |t| t.name == tool_call.name }
    unless tool
      next { tool_call: tool_call, result: nil, error: "Unknown tool: #{tool_call.name}" }
    end

    begin
      output = tool.call(tool_call.arguments)
      { tool_call: tool_call, result: output, error: nil }
    rescue StandardError, ScriptError => e
      # Tool failures (including NotImplementedError) are reported per call;
      # Interrupt, SystemExit and the like are deliberately left to propagate.
      { tool_call: tool_call, result: nil, error: e.message }
    end
  end

  ToolCallResult.new(
    tool_calls: result.tool_calls,
    tool_results: tool_results,
    text_response: result.text_response,
    raw_response: raw_response
  )
end
#constraint_from_regex(pattern) ⇒ Object
Create a structured constraint from a regex pattern. Uses the model’s vocabulary with proper byte encoding handling.
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/candle/llm.rb', line 61
#
# Create a structured constraint from a regex pattern.
# Uses the model's vocabulary with proper byte encoding handling.
#
# @param pattern [Regexp, #to_s] a Regexp contributes its source; anything
#   else is converted with to_s
# @return [Object] the constraint built by StructuredConstraint
# @raise [RuntimeError] model-based construction errors other than
#   "UnsupportedTokenizer"
def constraint_from_regex(pattern)
  pattern_str = pattern.is_a?(Regexp) ? pattern.source : pattern.to_s

  # Extract the tokenizer source model ID for proper vocabulary loading
  tokenizer_model = tokenizer_source_model

  # Fall back to legacy method if we can't determine the model
  return StructuredConstraint.from_regex(pattern_str, tokenizer) unless tokenizer_model

  begin
    StructuredConstraint.from_regex_with_model(pattern_str, tokenizer_model)
  rescue RuntimeError => e
    # Fall back to legacy method only for the unsupported-tokenizer failure
    raise unless e.message.include?("UnsupportedTokenizer")

    StructuredConstraint.from_regex(pattern_str, tokenizer)
  end
end
#constraint_from_schema(schema) ⇒ Object
Create a structured constraint from a JSON schema. Uses the model’s vocabulary with proper byte encoding handling.
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/candle/llm.rb', line 36
#
# Create a structured constraint from a JSON schema.
# Uses the model's vocabulary with proper byte encoding handling.
#
# @param schema [String, Object] a String is used as-is; anything else is
#   serialized with JSON.generate
# @return [Object] the constraint built by StructuredConstraint
# @raise [RuntimeError] model-based construction errors other than
#   "UnsupportedTokenizer"
def constraint_from_schema(schema)
  schema_str = schema.is_a?(String) ? schema : JSON.generate(schema)

  # Extract the tokenizer source model ID for proper vocabulary loading
  tokenizer_model = tokenizer_source_model

  # Fall back to legacy method if we can't determine the model
  return StructuredConstraint.from_schema(schema_str, tokenizer) unless tokenizer_model

  begin
    StructuredConstraint.from_schema_with_model(schema_str, tokenizer_model)
  rescue RuntimeError => e
    # Fall back to legacy method only for the unsupported-tokenizer failure
    # (e.g., tokenizer doesn't have EOS token in expected format)
    raise unless e.message.include?("UnsupportedTokenizer")

    StructuredConstraint.from_schema(schema_str, tokenizer)
  end
end
#generate(prompt, config: GenerationConfig.balanced, reset_cache: true) ⇒ Object
347 348 349 350 351 352 353 |
# File 'lib/candle/llm.rb', line 347
#
# Generate text for a prompt via _generate.
#
# @param prompt [Object] passed unchanged to _generate
# @param config [Object] generation config passed unchanged to _generate
#   (defaults to GenerationConfig.balanced)
# @param reset_cache [Boolean] when truthy, clear_cache runs after generation,
#   whether it succeeded or raised
# @return [Object] the result of _generate
def generate(prompt, config: GenerationConfig.balanced, reset_cache: true)
  _generate(prompt, config)
ensure
  # Runs on both the success and the error path.
  clear_cache if reset_cache
end
#generate_regex(prompt, pattern:, stop_on_match: true, **options) ⇒ Object
Generate with regex constraint
121 122 123 124 125 126 127 128 129 130 131 132 133 |
# File 'lib/candle/llm.rb', line 121
#
# Generate with regex constraint.
#
# @param prompt [Object] passed unchanged to generate
# @param pattern [Regexp, String] pattern handed to constraint_from_regex
# @param stop_on_match [Boolean] default for :stop_on_constraint_satisfaction
#   and also passed on as :stop_on_match
# @param options [Hash] extra keyword options. :config replaces the generated
#   config entirely; :reset_cache (default true) is forwarded to generate; all
#   options are also merged into the arguments of GenerationConfig.balanced
# @return [Object] the result of generate
def generate_regex(prompt, pattern:, stop_on_match: true, **options)
  constraint = constraint_from_regex(pattern)

  # Configure generation with early stopping by default
  config_opts = options.merge(
    constraint: constraint,
    stop_on_constraint_satisfaction: options.fetch(:stop_on_constraint_satisfaction, stop_on_match),
    stop_on_match: stop_on_match
  )

  # NOTE(review): a caller-supplied :config is used as-is, so the constraint
  # built above is not attached to it here — confirm callers expect that.
  config = options[:config] || GenerationConfig.balanced(**config_opts)
  generate(prompt, config: config, reset_cache: options.fetch(:reset_cache, true))
end
#generate_stream(prompt, config: GenerationConfig.balanced, reset_cache: true, &block) ⇒ Object
355 356 357 358 359 360 361 |
# File 'lib/candle/llm.rb', line 355
#
# Streaming variant of generation: delegates to _generate_stream and passes
# the given block along.
#
# @param prompt [Object] passed unchanged to _generate_stream
# @param config [Object] generation config passed unchanged to
#   _generate_stream (defaults to GenerationConfig.balanced)
# @param reset_cache [Boolean] when truthy, clear_cache runs after generation,
#   whether it succeeded or raised
# @yield forwarded to _generate_stream
# @return [Object] the result of _generate_stream
def generate_stream(prompt, config: GenerationConfig.balanced, reset_cache: true, &block)
  _generate_stream(prompt, config, &block)
ensure
  # Runs on both the success and the error path.
  clear_cache if reset_cache
end
#generate_structured(prompt, schema:, **options) ⇒ Object
Generate and parse structured output from a JSON schema
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
# File 'lib/candle/llm.rb', line 136
#
# Generate and parse structured output from a JSON schema.
#
# @param prompt [Object] passed unchanged to generate
# @param schema [String, Hash] schema handed to constraint_from_schema
# @param options [Hash] extra keyword options. :config replaces the generated
#   config entirely; :reset_cache (default true) is forwarded to generate;
#   :warn_on_parse_error enables a logger warning when parsing fails; all
#   options are also merged into the arguments of GenerationConfig.balanced
# @return [Object] the parsed JSON value, or the raw generated output when it
#   is not valid JSON
def generate_structured(prompt, schema:, **options)
  constraint = constraint_from_schema(schema)

  # Configure generation with early stopping by default
  config_opts = options.merge(
    constraint: constraint,
    stop_on_constraint_satisfaction: options.fetch(:stop_on_constraint_satisfaction, true)
  )
  config = options[:config] || GenerationConfig.balanced(**config_opts)

  result = generate(prompt, config: config, reset_cache: options.fetch(:reset_cache, true))

  begin
    # Extract the JSON portion first, in case there's content after stop tokens
    JSON.parse(extract_json_content(result))
  rescue JSON::ParserError => e
    # Return the raw string if parsing fails
    Candle.logger.warn "Generated output is not valid JSON: #{e.message}" if options[:warn_on_parse_error]
    result
  end
end
#inspect ⇒ Object
Inspect method for debugging and exploration
309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 |
# File 'lib/candle/llm.rb', line 309
#
# Inspect method for debugging and exploration.
#
# Builds "#<Candle::LLM model=… gguf=… device=… type=… arch=…>" from the hash
# returned by options; optional fields are omitted when absent.
#
# @return [String] human-readable summary of the model
def inspect
  # Best effort: an instance whose options cannot be read still inspects.
  opts =
    begin
      options
    rescue StandardError
      {}
    end

  # Extract key information
  model_type = opts["model_type"] || "Unknown"
  device_label =
    begin
      opts["device"] || device.to_s
    rescue StandardError
      "unknown"
    end

  # Build the inspect string
  parts = ["#<Candle::LLM"]

  # Add base model or model_id
  if opts["base_model"]
    parts << "model=#{opts["base_model"]}"
  elsif opts["model_id"]
    parts << "model=#{opts["model_id"]}"
  elsif respond_to?(:model_id)
    parts << "model=#{model_id}"
  end

  # Add GGUF file if present
  parts << "gguf=#{opts["gguf_file"]}" if opts["gguf_file"]

  parts << "device=#{device_label}"
  parts << "type=#{model_type}"

  # Add architecture for GGUF models
  parts << "arch=#{opts["architecture"]}" if opts["architecture"]

  parts.join(" ") + ">"
end
#model_eos_tokens ⇒ Object
Get model-specific EOS tokens
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/candle/llm.rb', line 15
#
# Get model-specific EOS tokens.
#
# Uses cached_eos_token when it is available and otherwise falls back to a
# list of common EOS tokens. The result is memoized per instance.
#
# @return [Array<String>] de-duplicated EOS tokens
def model_eos_tokens
  @model_eos_tokens ||= begin
    model_eos = cached_eos_token
    tokens =
      if model_eos
        # For Gemma, also include end_of_turn for chat scenarios and </s>
        # Even though </s> is technically an HTML tag in Gemma's vocabulary,
        # it seems to use it as a generation boundary in practice
        if model_name.downcase.include?("gemma")
          [model_eos, "<end_of_turn>", "</s>"]
        else
          [model_eos]
        end
      else
        # Fallback to common tokens only if model doesn't provide one
        ["</s>", "<|endoftext|>", "<|im_end|>", "<end>"]
      end
    tokens.uniq
  end
end