Class: LLaMACpp::Client
- Inherits: Object
- Defined in: lib/llama_cpp/client.rb
Overview
Client provides a high-level interface to the LLM model.
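A minimal usage sketch (the model path, thread count, and prompt below are placeholders, not values required by the library):

require 'llama_cpp'

# Point model_path at a local quantized model file supported by llama.cpp (placeholder path).
client = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin', n_threads: 4)
puts client.completions('Hello, my name is')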
Instance Method Summary
- #completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1) ⇒ String
  Generates completions for a given prompt.
- #embeddings(text) ⇒ Array<Float>
  Obtains the embedding for a given text.
- #initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil, n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false, embedding: false, n_threads: 1, seed: 0) ⇒ Client (constructor)
  Creates a new client.
Constructor Details
#initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil, n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false, embedding: false, n_threads: 1, seed: 0) ⇒ Client
Creates a new client.
rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
# File 'lib/llama_cpp/client.rb', line 21

def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
               n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
               embedding: false, n_threads: 1, seed: 0)
  @params = {
    model_path: model_path,
    lora_adapter_path: lora_adapter_path,
    lora_base_path: lora_base_path,
    n_ctx: n_ctx,
    n_parts: n_parts,
    memory_f16: memory_f16,
    use_mmap: use_mmap,
    use_mlock: use_mlock,
    embedding: embedding,
    n_threads: n_threads,
    seed: seed
  }
  @context_params = ContextParams.new
  @context_params.n_ctx = n_ctx
  @context_params.n_parts = n_parts
  @context_params.f16_kv = memory_f16
  @context_params.use_mmap = use_mmap
  @context_params.use_mlock = use_mlock
  @context_params.embedding = embedding
  @context_params.seed = seed
  @context = Context.new(model_path: model_path, params: @context_params)

  return unless lora_adapter_path.is_a?(String)

  if lora_base_path.is_a?(String)
    @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
  else
    @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
  end
end
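As the source shows, when lora_adapter_path is a String the client applies the LoRA weights via Context#apply_lora_from_file right after the context is created. A construction sketch with placeholder file paths and illustrative option values:

# All file paths are placeholders; adjust them to local files.
client = LLaMACpp::Client.new(
  model_path: '/path/to/ggml-model-q4_0.bin',
  lora_adapter_path: '/path/to/lora-adapter.bin',
  n_ctx: 1024,
  seed: 42
)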
Instance Method Details
#completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1) ⇒ String
Generates completions for a given prompt.
rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
# File 'lib/llama_cpp/client.rb', line 74

def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
                frequency: 0.0, presence: 0.0,
                top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
  embd_input = tokenize_prompt(prompt)

  n_ctx = @context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_past = 0
  n_remain = max_tokens
  n_vocab = @context.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        n_left = n_past - n_keep
        n_past = n_keep
        embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      logits = @context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      @context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
      @context.sample_frequency_and_presence_penalties(
        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
      )

      # temperature sampling
      @context.sample_top_k(candidates, k: top_k)
      @context.sample_tail_free(candidates, z: tfs_z)
      @context.sample_typical(candidates, prob: typical_p)
      @context.sample_top_p(candidates, prob: top_p)
      @context.sample_temperature(candidates, temperature: temperature)
      id = @context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << @context.token_to_str(token) }

    break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
  end

  output.join.delete_prefix(" #{prompt}").strip
end
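A usage sketch for completions; the prompt and sampling values below are illustrative, not recommendations from the library:

# Assumes `client` was built as in the constructor example above.
text = client.completions(
  'Write a haiku about the ocean.',
  max_tokens: 128,
  temperature: 0.7,
  top_p: 0.9,
  repeat_penalty: 1.1
)
puts text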
#embeddings(text) ⇒ Array<Float>
Obtains the embedding for a given text.
# File 'lib/llama_cpp/client.rb', line 158

def embeddings(text)
  raise 'The embedding option is set to false' unless @params[:embedding]

  embd_input = tokenize_prompt(text)
  raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?

  @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
  @context.embeddings
end
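Note that #embeddings raises unless the client was created with embedding: true. A sketch with a placeholder model path:

# embedding: true is required, otherwise #embeddings raises.
client = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin', embedding: true)
vector = client.embeddings('llama.cpp bindings for Ruby')
vector.is_a?(Array) # => true, one Float per embedding dimension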