Class: LLaMACpp::Client

Inherits:

Object

Object
LLaMACpp::Client

show all

Defined in: lib/llama_cpp/client.rb

Overview

Client provides a high-level interface to the LLM model.

Instance Method Summary collapse

#completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1) ⇒ String

Generates completions for a given prompt.
#embeddings(text) ⇒ Array<Float>

Obtains the embedding for a given text.
#initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil, n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false, embedding: false, n_threads: 1, seed: 0) ⇒ Client constructor

Creates a new client.

Constructor Details

#initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil, n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false, embedding: false, n_threads: 1, seed: 0) ⇒ `Client`

Creates a new client.

rubocop:disable Metrics/MethodLength, Metrics/ParameterLists

Parameters:

model_path (String) —

The path to the model file.
lora_adapter_path (String) (defaults to: nil) —

The path to the LoRA adapter file.
lora_base_path (String) (defaults to: nil) —

The path to the LoRA base model file.
n_ctx (Integer) (defaults to: 512) —

The context size.
n_parts (Integer) (defaults to: -1) —

The amount of model parts (-1 = determine from model dimensions).
memory_f16 (Boolean) (defaults to: false) —

The flag whether to use f16 instead of f32 for memory kv.
use_mmap (Boolean) (defaults to: true) —

The flag whether to use mmap.
use_mlock (Boolean) (defaults to: false) —

The flag whether to use mlock.
embedding (Boolean) (defaults to: false) —

The flag whether to calculate embedding.
n_threads (Integer) (defaults to: 1) —

The number of threads to use.
seed (Integer) (defaults to: 0) —

The seed for the random number generator.

# File 'lib/llama_cpp/client.rb', line 21

# Creates a new client.
#
# Stores every constructor argument in @params, copies the context-related
# ones onto a fresh ContextParams, builds the Context from the model file and,
# when a LoRA adapter path is given as a String, applies that adapter.
#
# @param model_path [String] The path to the model file.
# @param lora_adapter_path [String] The path to the LoRA adapter file.
# @param lora_base_path [String] The path to the LoRA base model file.
# @param n_ctx [Integer] The context size.
# @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
# @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for memory kv.
# @param use_mmap [Boolean] The flag whether to use mmap.
# @param use_mlock [Boolean] The flag whether to use mlock.
# @param embedding [Boolean] The flag whether to calculate embedding.
# @param n_threads [Integer] The number of threads to use.
# @param seed [Integer] The seed for the random number generator.
def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
               n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
               embedding: false,
               n_threads: 1, seed: 0)
  # Keep the raw arguments around; other methods read :n_threads and :embedding from here.
  @params = {
    model_path: model_path, lora_adapter_path: lora_adapter_path, lora_base_path: lora_base_path,
    n_ctx: n_ctx, n_parts: n_parts,
    memory_f16: memory_f16, use_mmap: use_mmap, use_mlock: use_mlock,
    embedding: embedding, n_threads: n_threads, seed: seed
  }

  # Attribute name on ContextParams => value to assign (memory_f16 is exposed there as f16_kv).
  @context_params = ContextParams.new
  {
    n_ctx: n_ctx,
    n_parts: n_parts,
    f16_kv: memory_f16,
    use_mmap: use_mmap,
    use_mlock: use_mlock,
    embedding: embedding,
    seed: seed
  }.each { |attribute, value| @context_params.public_send(:"#{attribute}=", value) }

  @context = Context.new(model_path: model_path, params: @context_params)

  # A LoRA adapter is applied only when its path is a String; the base model path is optional.
  if lora_adapter_path.is_a?(String)
    lora_options = { lora_path: lora_adapter_path }
    lora_options[:base_model_path] = lora_base_path if lora_base_path.is_a?(String)
    lora_options[:n_threads] = n_threads
    @context.apply_lora_from_file(**lora_options)
  end
end

Instance Method Details

#completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1) ⇒ `String`

Generates completions for a given prompt.

rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity

Parameters:

prompt (String) —

The prompt to generate completions for.
max_tokens (Integer) (defaults to: 64) —

The maximum number of tokens to generate.
n_keep (Integer) (defaults to: 10) —

The number of tokens to keep from the initial prompt.
repeat_last_n (Integer) (defaults to: 64) —

The number of tokens to use for repeat penalty.
n_batch (Integer) (defaults to: 512) —

The batch size.
frequency (Float) (defaults to: 0.0) —

The frequency penalty value.
presence (Float) (defaults to: 0.0) —

The presence penalty value.
top_k (Integer) (defaults to: 40) —

The top-k value.
top_p (Float) (defaults to: 0.95) —

The top-p value.
tfs_z (Float) (defaults to: 1.0) —

The tail free sampling parameter.
typical_p (Float) (defaults to: 1.0) —

The typical probability value.
temperature (Float) (defaults to: 0.8) —

The temperature value.
repeat_penalty (Float) (defaults to: 1.1) —

The repeat penalty value.

Returns:

(String)

Raises:

(ArgumentError)

# File 'lib/llama_cpp/client.rb', line 74

# Generates completions for a given prompt.
#
# The prompt is tokenized and fed to the context in batches of at most
# +n_batch+ tokens; afterwards one token is sampled per iteration until
# +max_tokens+ tokens have been generated or the end-of-sequence token is
# produced. Because the loop only stops when the remaining count reaches
# exactly zero, a negative +max_tokens+ keeps generating until end-of-sequence.
#
# @param prompt [String] The prompt to generate completions for.
# @param max_tokens [Integer] The maximum number of tokens to generate.
# @param n_keep [Integer] The number of tokens to keep from the initial prompt.
# @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
# @param n_batch [Integer] The batch size.
# @param frequency [Float] The frequency penalty value.
# @param presence [Float] The presence penalty value.
# @param top_k [Integer] The top-k value.
# @param top_p [Float] The top-p value.
# @param tfs_z [Float] The tail free sampling parameter.
# @param typical_p [Float] The typical probability value.
# @param temperature [Float] The temperature value.
# @param repeat_penalty [Float] The repeat penalty value.
# @return [String] The decoded tokens joined together, with a leading " #{prompt}" removed and
#   surrounding whitespace stripped.
# @raise [ArgumentError] If the tokenized prompt is longer than the context size minus 4.
def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
                frequency: 0.0, presence: 0.0,
                top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
  embd_input = tokenize_prompt(prompt)

  n_ctx = @context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  # Fixed-size sliding window (always n_ctx entries) of the most recent tokens.
  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_past = 0
  n_remain = max_tokens
  n_vocab = @context.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        # Context is full: restart after the first n_keep tokens and re-feed half of the
        # dropped tokens taken from the history window.
        n_left = n_past - n_keep
        n_past = n_keep
        recent = last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size]
        # Splat so the tokens are prepended individually; inserting the slice itself
        # would nest an Array inside the token list handed to eval.
        embd.unshift(*recent)
      end

      @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      # The whole prompt has been consumed: sample the next token.
      logits = @context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      penalty_window = last_n_tokens[-last_n_repeat..]
      @context.sample_repetition_penalty(candidates, penalty_window, penalty: repeat_penalty)
      @context.sample_frequency_and_presence_penalties(
        candidates, penalty_window, frequency: frequency, presence: presence
      )

      # temperature sampling
      @context.sample_top_k(candidates, k: top_k)
      @context.sample_tail_free(candidates, z: tfs_z)
      @context.sample_typical(candidates, prob: typical_p)
      @context.sample_top_p(candidates, prob: top_p)
      @context.sample_temperature(candidates, temperature: temperature)
      id = @context.sample_token(candidates)

      # Record the sampled token exactly once so the history window mirrors the real sequence.
      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      # Still feeding the prompt: queue up to n_batch prompt tokens for the next eval.
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << @context.token_to_str(token) }

    break if !embd.empty? && embd.last == LLaMACpp.token_eos
  end

  output.join.delete_prefix(" #{prompt}").strip
end

#embeddings(text) ⇒ `Array<Float>`

Obtains the embedding for a given text.

Parameters:

text (String) —

The text to obtain the embedding for.

Returns:

(Array<Float>)

# File 'lib/llama_cpp/client.rb', line 158

# Obtains the embedding for a given text.
#
# The text is tokenized, evaluated from an empty past (n_past: 0) with the
# configured number of threads, and the context's embeddings are returned.
#
# @param text [String] The text to obtain the embedding for.
# @return [Array<Float>]
# @raise [RuntimeError] If the client was created without the embedding option,
#   or if tokenizing the text yields no tokens.
def embeddings(text)
  embedding_enabled = @params[:embedding]
  raise 'The embedding option is set to false' unless embedding_enabled

  tokens = tokenize_prompt(text)
  raise 'The result of tokenizing the input text is empty' if tokens.size < 1

  thread_count = @params[:n_threads]
  @context.eval(tokens: tokens, n_past: 0, n_threads: thread_count)
  @context.embeddings
end

Class: LLaMACpp::Client

Overview

Instance Method Summary collapse

Constructor Details

#initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil, n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false, embedding: false, n_threads: 1, seed: 0) ⇒ Client

Instance Method Details

#completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1) ⇒ String

#embeddings(text) ⇒ Array<Float>

#initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil, n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false, embedding: false, n_threads: 1, seed: 0) ⇒ `Client`

#completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1) ⇒ `String`

#embeddings(text) ⇒ `Array<Float>`