Module: LLaMACpp

Defined in:: lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp

Overview

llama_cpp.rb provides Ruby bindings for llama.cpp.

Constant Summary collapse

VERSION = The version of llama_cpp.rb that is installed.

'0.15.3'

LLAMA_CPP_VERSION = The version of llama.cpp bundled with llama_cpp.rb.

'b2988'

LLAMA_VOCAB_TYPE_NONE =

INT2NUM(LLAMA_VOCAB_TYPE_NONE)

LLAMA_VOCAB_TYPE_SPM =

INT2NUM(LLAMA_VOCAB_TYPE_SPM)

LLAMA_VOCAB_TYPE_BPE =

INT2NUM(LLAMA_VOCAB_TYPE_BPE)

LLAMA_VOCAB_TYPE_WPM =

INT2NUM(LLAMA_VOCAB_TYPE_WPM)

LLAMA_VOCAB_PRE_TYPE_DEFAULT =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEFAULT)

LLAMA_VOCAB_PRE_TYPE_LLAMA3 =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_LLAMA3)

LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM)

LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER)

LLAMA_VOCAB_PRE_TYPE_FALCON =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_FALCON)

LLAMA_VOCAB_PRE_TYPE_MPT =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT)

LLAMA_VOCAB_PRE_TYPE_STARCODER =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER)

LLAMA_VOCAB_PRE_TYPE_GPT2 =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2)

LLAMA_VOCAB_PRE_TYPE_REFACT =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT)

LLAMA_VOCAB_PRE_TYPE_COMMAND_R =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R)

LLAMA_VOCAB_PRE_TYPE_STABLELM2 =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_STABLELM2)

LLAMA_VOCAB_PRE_TYPE_QWEN2 =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2)

LLAMA_VOCAB_PRE_TYPE_OLMO =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO)

LLAMA_VOCAB_PRE_TYPE_DBRX =

INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX)

LLAMA_TOKEN_TYPE_UNDEFINED =

INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED)

LLAMA_TOKEN_TYPE_NORMAL =

INT2NUM(LLAMA_TOKEN_TYPE_NORMAL)

LLAMA_TOKEN_TYPE_UNKNOWN =

INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN)

LLAMA_TOKEN_TYPE_CONTROL =

INT2NUM(LLAMA_TOKEN_TYPE_CONTROL)

LLAMA_TOKEN_TYPE_USER_DEFINED =

INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED)

LLAMA_TOKEN_TYPE_UNUSED =

INT2NUM(LLAMA_TOKEN_TYPE_UNUSED)

LLAMA_TOKEN_TYPE_BYTE =

INT2NUM(LLAMA_TOKEN_TYPE_BYTE)

LLAMA_FTYPE_ALL_F32 =

INT2NUM(LLAMA_FTYPE_ALL_F32)

LLAMA_FTYPE_MOSTLY_F16 =

INT2NUM(LLAMA_FTYPE_MOSTLY_F16)

LLAMA_FTYPE_MOSTLY_Q4_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)

LLAMA_FTYPE_MOSTLY_Q4_1 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)

LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)

LLAMA_FTYPE_MOSTLY_Q8_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)

LLAMA_FTYPE_MOSTLY_Q5_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)

LLAMA_FTYPE_MOSTLY_Q5_1 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)

LLAMA_FTYPE_MOSTLY_Q2_K =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K)

LLAMA_FTYPE_MOSTLY_Q3_K_S =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S)

LLAMA_FTYPE_MOSTLY_Q3_K_M =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M)

LLAMA_FTYPE_MOSTLY_Q3_K_L =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L)

LLAMA_FTYPE_MOSTLY_Q4_K_S =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S)

LLAMA_FTYPE_MOSTLY_Q4_K_M =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M)

LLAMA_FTYPE_MOSTLY_Q5_K_S =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S)

LLAMA_FTYPE_MOSTLY_Q5_K_M =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M)

LLAMA_FTYPE_MOSTLY_Q6_K =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K)

LLAMA_FTYPE_MOSTLY_IQ2_XXS =

INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS)

LLAMA_FTYPE_MOSTLY_IQ2_XS =

INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS)

LLAMA_FTYPE_MOSTLY_Q2_K_S =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S)

LLAMA_FTYPE_MOSTLY_IQ3_XS =

INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XS)

LLAMA_FTYPE_MOSTLY_IQ3_XXS =

INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS)

LLAMA_FTYPE_MOSTLY_IQ1_S =

INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S)

LLAMA_FTYPE_MOSTLY_IQ4_NL =

INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL)

LLAMA_FTYPE_MOSTLY_IQ3_S =

INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S)

LLAMA_FTYPE_MOSTLY_IQ3_M =

INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M)

LLAMA_FTYPE_MOSTLY_IQ4_XS =

INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS)

LLAMA_FTYPE_MOSTLY_IQ1_M =

INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M)

LLAMA_FTYPE_MOSTLY_BF16 =

INT2NUM(LLAMA_FTYPE_MOSTLY_BF16)

LLAMA_FTYPE_GUESSED =

INT2NUM(LLAMA_FTYPE_GUESSED)

LLAMA_KV_OVERRIDE_TYPE_INT =

INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT)

LLAMA_KV_OVERRIDE_TYPE_FLOAT =

INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT)

LLAMA_KV_OVERRIDE_TYPE_BOOL =

INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL)

LLAMA_KV_OVERRIDE_TYPE_STR =

INT2NUM(LLAMA_KV_OVERRIDE_TYPE_STR)

LLAMA_GRETYPE_END =

INT2NUM(LLAMA_GRETYPE_END)

LLAMA_GRETYPE_ALT =

INT2NUM(LLAMA_GRETYPE_ALT)

LLAMA_GRETYPE_RULE_REF =

INT2NUM(LLAMA_GRETYPE_RULE_REF)

LLAMA_GRETYPE_CHAR =

INT2NUM(LLAMA_GRETYPE_CHAR)

LLAMA_GRETYPE_CHAR_NOT =

INT2NUM(LLAMA_GRETYPE_CHAR_NOT)

LLAMA_GRETYPE_CHAR_RNG_UPPER =

INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER)

LLAMA_GRETYPE_CHAR_ALT =

INT2NUM(LLAMA_GRETYPE_CHAR_ALT)

LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED =

INT2NUM(LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED)

LLAMA_ROPE_SCALING_TYPE_NONE =

INT2NUM(LLAMA_ROPE_SCALING_TYPE_NONE)

LLAMA_ROPE_SCALING_TYPE_LINEAR =

INT2NUM(LLAMA_ROPE_SCALING_TYPE_LINEAR)

LLAMA_ROPE_SCALING_TYPE_YARN =

INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN)

LLAMA_ROPE_SCALING_TYPE_MAX_VALUE =

INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE)

LLAMA_POOLING_TYPE_UNSPECIFIED =

INT2NUM(LLAMA_POOLING_TYPE_UNSPECIFIED)

LLAMA_POOLING_TYPE_NONE =

INT2NUM(LLAMA_POOLING_TYPE_NONE)

LLAMA_POOLING_TYPE_MEAN =

INT2NUM(LLAMA_POOLING_TYPE_MEAN)

LLAMA_POOLING_TYPE_CLS =

INT2NUM(LLAMA_POOLING_TYPE_CLS)

LLAMA_SPLIT_MODE_NONE =

INT2NUM(LLAMA_SPLIT_MODE_NONE)

LLAMA_SPLIT_MODE_LAYER =

INT2NUM(LLAMA_SPLIT_MODE_LAYER)

LLAMA_SPLIT_MODE_ROW =

INT2NUM(LLAMA_SPLIT_MODE_ROW)

LLAMA_FILE_MAGIC_GGLA =

rb_str_new2(ss_magic.str().c_str())

LLAMA_FILE_MAGIC_GGSN =

rb_str_new2(ss_magic.str().c_str())

LLAMA_FILE_MAGIC_GGSQ =

rb_str_new2(ss_magic.str().c_str())

LLAMA_SESSION_MAGIC =

rb_str_new2(ss_magic.str().c_str())

LLAMA_STATE_SEQ_MAGIC =

rb_str_new2(ss_magic.str().c_str())

LLAMA_DEFAULT_SEED =

rb_str_new2(ss_magic.str().c_str())

LLAMA_SESSION_VERSION =

rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())

LLAMA_STATE_SEQ_VERSION =

rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str())

Class Method Summary collapse

.backend_free ⇒ Object
.backend_init ⇒ Object

Initializes the llama.cpp backend (module function).
.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String

Generates text that continues the given prompt; intended as a simple operation check.
.max_devices ⇒ Object
.model_quantize(*args) ⇒ Object
.numa_init(strategy) ⇒ Object
.print_system_info ⇒ Object
.supports_gpu_offload? ⇒ Boolean
.supports_mlock? ⇒ Boolean
.supports_mmap? ⇒ Boolean
.time_us ⇒ Object

Class Method Details

.backend_free ⇒ `Object`

# File 'ext/llama_cpp/llama_cpp.cpp', line 3365

// Backs LLaMACpp.backend_free: calls llama_backend_free() and returns nil
// to the Ruby caller.
static VALUE rb_llama_llama_backend_free(VALUE self) {
  (void)self;  // the receiver is not used
  llama_backend_free();
  return Qnil;
}

.backend_init ⇒ `Object`

Initializes the llama.cpp backend (module function).

# File 'ext/llama_cpp/llama_cpp.cpp', line 3359

// Backs LLaMACpp.backend_init: calls llama_backend_init() and returns nil
// to the Ruby caller.
static VALUE rb_llama_llama_backend_init(VALUE self) {
  (void)self;  // the receiver is not used
  llama_backend_init();
  return Qnil;
}

.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ `String`

Generates text that continues the given prompt; intended as a simple operation check.

Parameters:

context (LLaMACpp::Context) —

The context to use.
prompt (String) —

The prompt to start generation with.
n_predict (Integer) (defaults to: 128) —

The number of tokens to predict.
n_keep (Integer) (defaults to: 10) —

The number of tokens to keep in the context.
n_batch (Integer) (defaults to: 512) —

The number of tokens to process in a batch.
repeat_last_n (Integer) (defaults to: 64) —

The number of tokens to consider for repetition penalty.
repeat_penalty (Float) (defaults to: 1.1) —

The repetition penalty.
frequency (Float) (defaults to: 0.0) —

The frequency penalty.
presence (Float) (defaults to: 0.0) —

The presence penalty.
top_k (Integer) (defaults to: 40) —

The number of tokens to consider for top-k sampling.
top_p (Float) (defaults to: 0.95) —

The probability threshold for nucleus sampling.
tfs_z (Float) (defaults to: 1.0) —

The z parameter for tail-free sampling.
typical_p (Float) (defaults to: 1.0) —

The probability for typical sampling.
temperature (Float) (defaults to: 0.8) —

The temperature for temperature sampling.

Returns:

(String)

Raises:

(ArgumentError)

# File 'lib/llama_cpp.rb', line 27

# Generates text that continues the given prompt; intended as a simple
# operation check.
#
# The prompt is tokenized (with a leading space and a BOS token), fed to the
# context in chunks of at most n_batch tokens, and then tokens are sampled one
# at a time until n_predict tokens have been produced or the model's EOS token
# is sampled.
#
# @param context [LLaMACpp::Context] The context to use.
# @param prompt [String] The prompt to start generation with.
# @param n_predict [Integer] The number of tokens to predict.
# @param n_keep [Integer] The number of tokens to keep in the context.
# @param n_batch [Integer] The number of tokens to process in a batch.
# @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
# @param repeat_penalty [Float] The repetition penalty.
# @param frequency [Float] The frequency penalty.
# @param presence [Float] The presence penalty.
# @param top_k [Integer] The number of tokens to consider for top-k sampling.
# @param top_p [Float] The probability threshold for nucleus sampling.
# @param tfs_z [Float] The z parameter for tail-free sampling.
# @param typical_p [Float] The probability for typical sampling.
# @param temperature [Float] The temperature for temperature sampling.
# @return [String] The decoded output with the prompt prefix removed.
# @raise [ArgumentError] If context is not a LLaMACpp::Context, prompt is not
#   a String, or the tokenized prompt is longer than n_ctx - 4 tokens.
def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
             n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
             repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
             top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  # Fixed-size history of the most recent tokens (oldest first), zero-filled
  # until real tokens are pushed.
  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_past = 0
  n_remain = n_predict
  n_vocab = context.model.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        # Context is full: rewind to n_keep and re-feed the second half of the
        # tokens that followed it, taken from the history buffer.
        n_left = n_past - n_keep
        n_past = n_keep
        # The slice must be splatted: passing the Array itself would insert it
        # as a single nested element, so the batch would contain a non-token
        # entry and n_past would be advanced by the wrong amount.
        embd.insert(0, *last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      # A window of zero tokens means there is nothing to penalize. It has to
      # be handled explicitly because -0 == 0, so `last_n_tokens[-0..]` would
      # select the entire history rather than an empty window.
      unless last_n_repeat.zero?
        context.sample_repetition_penalties(
          candidates, last_n_tokens[-last_n_repeat..],
          penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
        )
      end

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temp(candidates, temp: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      # Still consuming the prompt: queue up to n_batch prompt tokens.
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.model.token_to_piece(token) }

    break if !embd.empty? && embd[-1] == context.model.token_eos
  end

  output.join.scrub('?').strip.delete_prefix(prompt).strip
end

.max_devices ⇒ `Object`



3423
3424
3425

# File 'ext/llama_cpp/llama_cpp.cpp', line 3423

// Backs LLaMACpp.max_devices: returns the value of llama_max_devices()
// converted to a Ruby Integer with SIZET2NUM.
static VALUE rb_llama_max_devices(VALUE self) {
  (void)self;  // the receiver is not used
  const auto n_devices = llama_max_devices();
  return SIZET2NUM(n_devices);
}

.model_quantize(*args) ⇒ `Object`

# File 'ext/llama_cpp/llama_cpp.cpp', line 3382

// Backs LLaMACpp.model_quantize: forwards to llama_model_quantize().
//
// Keyword arguments (all three are required):
//   input_path:  String, passed as the first argument of llama_model_quantize
//   output_path: String, passed as the second argument of llama_model_quantize
//   params:      ModelQuantizeParams instance whose wrapped params are passed
//                by pointer
//
// Returns nil on success.
// Raises ArgumentError when an argument has the wrong type, and RuntimeError
// when llama_model_quantize() returns a non-zero value.
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
  VALUE kw_values[3] = { Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);

  // rb_raise() does not return, so no return statement follows the calls below.
  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
  }
  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
  }
  if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
  }

  const char* input_path = StringValueCStr(kw_values[0]);
  const char* output_path = StringValueCStr(kw_values[1]);
  LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);

  if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
  }

  return Qnil;
}

.numa_init(strategy) ⇒ `Object`

# File 'ext/llama_cpp/llama_cpp.cpp', line 3371

// Backs LLaMACpp.numa_init(strategy).
//
// strategy must be a Ruby Integer; it is converted with NUM2INT, cast to
// enum ggml_numa_strategy and passed to llama_numa_init().
//
// Returns nil. Raises ArgumentError when strategy is not an Integer.
static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
  if (!RB_INTEGER_TYPE_P(strategy)) {
    // rb_raise() does not return, so no return statement is needed here.
    rb_raise(rb_eArgError, "strategy must be an integer");
  }

  llama_numa_init(static_cast<enum ggml_numa_strategy>(NUM2INT(strategy)));

  return Qnil;
}

.print_system_info ⇒ `Object`

# File 'ext/llama_cpp/llama_cpp.cpp', line 3414

// Backs LLaMACpp.print_system_info: wraps the C string returned by
// llama_print_system_info() in a new UTF-8 Ruby String.
static VALUE rb_llama_print_system_info(VALUE self) {
  (void)self;  // the receiver is not used
  return rb_utf8_str_new_cstr(llama_print_system_info());
}

.supports_gpu_offload? ⇒ `Boolean`

Returns:

(Boolean)



3435
3436
3437

# File 'ext/llama_cpp/llama_cpp.cpp', line 3435

// Backs LLaMACpp.supports_gpu_offload?: maps the result of
// llama_supports_gpu_offload() onto Ruby true/false.
static VALUE rb_llama_supports_gpu_offload(VALUE self) {
  (void)self;  // the receiver is not used
  if (llama_supports_gpu_offload()) {
    return Qtrue;
  }
  return Qfalse;
}

.supports_mlock? ⇒ `Boolean`

Returns:

(Boolean)



3431
3432
3433

# File 'ext/llama_cpp/llama_cpp.cpp', line 3431

// Backs LLaMACpp.supports_mlock?: maps the result of llama_supports_mlock()
// onto Ruby true/false.
static VALUE rb_llama_supports_mlock(VALUE self) {
  (void)self;  // the receiver is not used
  if (llama_supports_mlock()) {
    return Qtrue;
  }
  return Qfalse;
}

.supports_mmap? ⇒ `Boolean`

Returns:

(Boolean)



3427
3428
3429

# File 'ext/llama_cpp/llama_cpp.cpp', line 3427

// Backs LLaMACpp.supports_mmap?: maps the result of llama_supports_mmap()
// onto Ruby true/false.
static VALUE rb_llama_supports_mmap(VALUE self) {
  (void)self;  // the receiver is not used
  if (llama_supports_mmap()) {
    return Qtrue;
  }
  return Qfalse;
}

.time_us ⇒ `Object`



3419
3420
3421

# File 'ext/llama_cpp/llama_cpp.cpp', line 3419

// Backs LLaMACpp.time_us: returns the value of llama_time_us() as a Ruby
// Integer.
//
// LL2NUM (long long) is used rather than LONG2NUM because llama_time_us()
// is declared in llama.h as returning int64_t; on platforms where `long` is
// 32 bits wide, LONG2NUM would truncate the value.
static VALUE rb_llama_time_us(VALUE self) {
  return LL2NUM(llama_time_us());
}

Module: LLaMACpp

Overview

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.backend_free ⇒ Object

.backend_init ⇒ Object

.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String

.max_devices ⇒ Object

.model_quantize(*args) ⇒ Object

.numa_init(strategy) ⇒ Object

.print_system_info ⇒ Object

.supports_gpu_offload? ⇒ Boolean

.supports_mlock? ⇒ Boolean

.supports_mmap? ⇒ Boolean

.time_us ⇒ Object

.backend_free ⇒ `Object`

.backend_init ⇒ `Object`

.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ `String`

.max_devices ⇒ `Object`

.model_quantize(*args) ⇒ `Object`

.numa_init(strategy) ⇒ `Object`

.print_system_info ⇒ `Object`

.supports_gpu_offload? ⇒ `Boolean`

.supports_mlock? ⇒ `Boolean`

.supports_mmap? ⇒ `Boolean`

.time_us ⇒ `Object`