Module: LLaMACpp

Defined in:
lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp

Overview

llama_cpp.rb provides Ruby bindings for llama.cpp.

Constant Summary collapse

VERSION =

The version of llama_cpp.rb that is installed.

'0.15.3'
LLAMA_CPP_VERSION =

The version of llama.cpp bundled with llama_cpp.rb.

'b2988'
LLAMA_VOCAB_TYPE_NONE =
INT2NUM(LLAMA_VOCAB_TYPE_NONE)
LLAMA_VOCAB_TYPE_SPM =
INT2NUM(LLAMA_VOCAB_TYPE_SPM)
LLAMA_VOCAB_TYPE_BPE =
INT2NUM(LLAMA_VOCAB_TYPE_BPE)
LLAMA_VOCAB_TYPE_WPM =
INT2NUM(LLAMA_VOCAB_TYPE_WPM)
LLAMA_VOCAB_PRE_TYPE_DEFAULT =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEFAULT)
LLAMA_VOCAB_PRE_TYPE_LLAMA3 =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_LLAMA3)
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM)
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER)
LLAMA_VOCAB_PRE_TYPE_FALCON =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_FALCON)
LLAMA_VOCAB_PRE_TYPE_MPT =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT)
LLAMA_VOCAB_PRE_TYPE_STARCODER =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER)
LLAMA_VOCAB_PRE_TYPE_GPT2 =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2)
LLAMA_VOCAB_PRE_TYPE_REFACT =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT)
LLAMA_VOCAB_PRE_TYPE_COMMAND_R =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R)
LLAMA_VOCAB_PRE_TYPE_STABLELM2 =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_STABLELM2)
LLAMA_VOCAB_PRE_TYPE_QWEN2 =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2)
LLAMA_VOCAB_PRE_TYPE_OLMO =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO)
LLAMA_VOCAB_PRE_TYPE_DBRX =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX)
LLAMA_TOKEN_TYPE_UNDEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED)
LLAMA_TOKEN_TYPE_NORMAL =
INT2NUM(LLAMA_TOKEN_TYPE_NORMAL)
LLAMA_TOKEN_TYPE_UNKNOWN =
INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN)
LLAMA_TOKEN_TYPE_CONTROL =
INT2NUM(LLAMA_TOKEN_TYPE_CONTROL)
LLAMA_TOKEN_TYPE_USER_DEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED)
LLAMA_TOKEN_TYPE_UNUSED =
INT2NUM(LLAMA_TOKEN_TYPE_UNUSED)
LLAMA_TOKEN_TYPE_BYTE =
INT2NUM(LLAMA_TOKEN_TYPE_BYTE)
LLAMA_FTYPE_ALL_F32 =
INT2NUM(LLAMA_FTYPE_ALL_F32)
LLAMA_FTYPE_MOSTLY_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
LLAMA_FTYPE_MOSTLY_Q4_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
LLAMA_FTYPE_MOSTLY_Q4_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
LLAMA_FTYPE_MOSTLY_Q8_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)
LLAMA_FTYPE_MOSTLY_Q5_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)
LLAMA_FTYPE_MOSTLY_Q5_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)
LLAMA_FTYPE_MOSTLY_Q2_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K)
LLAMA_FTYPE_MOSTLY_Q3_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S)
LLAMA_FTYPE_MOSTLY_Q3_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M)
LLAMA_FTYPE_MOSTLY_Q3_K_L =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L)
LLAMA_FTYPE_MOSTLY_Q4_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S)
LLAMA_FTYPE_MOSTLY_Q4_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M)
LLAMA_FTYPE_MOSTLY_Q5_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S)
LLAMA_FTYPE_MOSTLY_Q5_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M)
LLAMA_FTYPE_MOSTLY_Q6_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K)
LLAMA_FTYPE_MOSTLY_IQ2_XXS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS)
LLAMA_FTYPE_MOSTLY_IQ2_XS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS)
LLAMA_FTYPE_MOSTLY_Q2_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S)
LLAMA_FTYPE_MOSTLY_IQ3_XS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XS)
LLAMA_FTYPE_MOSTLY_IQ3_XXS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS)
LLAMA_FTYPE_MOSTLY_IQ1_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S)
LLAMA_FTYPE_MOSTLY_IQ4_NL =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL)
LLAMA_FTYPE_MOSTLY_IQ3_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S)
LLAMA_FTYPE_MOSTLY_IQ3_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M)
LLAMA_FTYPE_MOSTLY_IQ4_XS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS)
LLAMA_FTYPE_MOSTLY_IQ1_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M)
LLAMA_FTYPE_MOSTLY_BF16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_BF16)
LLAMA_FTYPE_GUESSED =
INT2NUM(LLAMA_FTYPE_GUESSED)
LLAMA_KV_OVERRIDE_TYPE_INT =
INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT)
LLAMA_KV_OVERRIDE_TYPE_FLOAT =
INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT)
LLAMA_KV_OVERRIDE_TYPE_BOOL =
INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL)
LLAMA_KV_OVERRIDE_TYPE_STR =
INT2NUM(LLAMA_KV_OVERRIDE_TYPE_STR)
LLAMA_GRETYPE_END =
INT2NUM(LLAMA_GRETYPE_END)
LLAMA_GRETYPE_ALT =
INT2NUM(LLAMA_GRETYPE_ALT)
LLAMA_GRETYPE_RULE_REF =
INT2NUM(LLAMA_GRETYPE_RULE_REF)
LLAMA_GRETYPE_CHAR =
INT2NUM(LLAMA_GRETYPE_CHAR)
LLAMA_GRETYPE_CHAR_NOT =
INT2NUM(LLAMA_GRETYPE_CHAR_NOT)
LLAMA_GRETYPE_CHAR_RNG_UPPER =
INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER)
LLAMA_GRETYPE_CHAR_ALT =
INT2NUM(LLAMA_GRETYPE_CHAR_ALT)
LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED)
LLAMA_ROPE_SCALING_TYPE_NONE =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_NONE)
LLAMA_ROPE_SCALING_TYPE_LINEAR =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_LINEAR)
LLAMA_ROPE_SCALING_TYPE_YARN =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN)
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE)
LLAMA_POOLING_TYPE_UNSPECIFIED =
INT2NUM(LLAMA_POOLING_TYPE_UNSPECIFIED)
LLAMA_POOLING_TYPE_NONE =
INT2NUM(LLAMA_POOLING_TYPE_NONE)
LLAMA_POOLING_TYPE_MEAN =
INT2NUM(LLAMA_POOLING_TYPE_MEAN)
LLAMA_POOLING_TYPE_CLS =
INT2NUM(LLAMA_POOLING_TYPE_CLS)
LLAMA_SPLIT_MODE_NONE =
INT2NUM(LLAMA_SPLIT_MODE_NONE)
LLAMA_SPLIT_MODE_LAYER =
INT2NUM(LLAMA_SPLIT_MODE_LAYER)
LLAMA_SPLIT_MODE_ROW =
INT2NUM(LLAMA_SPLIT_MODE_ROW)
LLAMA_FILE_MAGIC_GGLA =
rb_str_new2(ss_magic.str().c_str())
LLAMA_FILE_MAGIC_GGSN =
rb_str_new2(ss_magic.str().c_str())
LLAMA_FILE_MAGIC_GGSQ =
rb_str_new2(ss_magic.str().c_str())
LLAMA_SESSION_MAGIC =
rb_str_new2(ss_magic.str().c_str())
LLAMA_STATE_SEQ_MAGIC =
rb_str_new2(ss_magic.str().c_str())
LLAMA_DEFAULT_SEED =
rb_str_new2(ss_magic.str().c_str())
LLAMA_SESSION_VERSION =
rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())
LLAMA_STATE_SEQ_VERSION =
rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str())

Class Method Summary collapse

Class Method Details

.backend_free ⇒ Object



3365
3366
3367
3368
3369
# File 'ext/llama_cpp/llama_cpp.cpp', line 3365

// LLaMACpp.backend_free: calls llama_backend_free() and always returns nil.
static VALUE rb_llama_llama_backend_free(VALUE self) {
  (void)self;  // the receiver is not used
  llama_backend_free();
  return Qnil;
}

.backend_init ⇒ Object

module functions



3359
3360
3361
3362
3363
# File 'ext/llama_cpp/llama_cpp.cpp', line 3359

// LLaMACpp.backend_init: calls llama_backend_init() and always returns nil.
static VALUE rb_llama_llama_backend_init(VALUE self) {
  (void)self;  // the receiver is not used
  llama_backend_init();
  return Qnil;
}

.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String

Generates text that continues the given prompt. Intended as a simple operation check.

Parameters:

  • context (LLaMACpp::Context)

    The context to use.

  • prompt (String)

    The prompt to start generation with.

  • n_predict (Integer) (defaults to: 128)

    The number of tokens to predict.

  • n_keep (Integer) (defaults to: 10)

    The number of tokens to keep in the context.

  • n_batch (Integer) (defaults to: 512)

    The number of tokens to process in a batch.

  • repeat_last_n (Integer) (defaults to: 64)

    The number of tokens to consider for repetition penalty.

  • repeat_penalty (Float) (defaults to: 1.1)

    The repetition penalty.

  • frequency (Float) (defaults to: 0.0)

    The frequency penalty.

  • presence (Float) (defaults to: 0.0)

    The presence penalty.

  • top_k (Integer) (defaults to: 40)

    The number of tokens to consider for top-k sampling.

  • top_p (Float) (defaults to: 0.95)

    The probability threshold for nucleus sampling.

  • tfs_z (Float) (defaults to: 1.0)

    The z parameter for tail-free sampling.

  • typical_p (Float) (defaults to: 1.0)

    The probability for typical sampling.

  • temperature (Float) (defaults to: 0.8)

    The temperature for temperature sampling.

Returns:

  • (String)

Raises:

  • (ArgumentError)


27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/llama_cpp.rb', line 27

# Generates text that continues the given prompt. Intended as a simple operation check.
#
# The prompt is tokenized (with a leading space and a BOS token), fed to the context in
# chunks of at most n_batch tokens, and then tokens are sampled one at a time until
# n_predict tokens have been produced or the EOS token is sampled.
#
# @param context [LLaMACpp::Context] The context to use.
# @param prompt [String] The prompt to start generation with.
# @param n_predict [Integer] The number of tokens to predict.
# @param n_keep [Integer] The number of tokens to keep in the context.
# @param n_batch [Integer] The number of tokens to process in a batch.
# @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
# @param repeat_penalty [Float] The repetition penalty.
# @param frequency [Float] The frequency penalty.
# @param presence [Float] The presence penalty.
# @param top_k [Integer] The number of tokens to consider for top-k sampling.
# @param top_p [Float] The probability threshold for nucleus sampling.
# @param tfs_z [Float] The z parameter for tail-free sampling.
# @param typical_p [Float] The probability for typical sampling.
# @param temperature [Float] The temperature for temperature sampling.
# @return [String] The decoded pieces joined together, with invalid bytes replaced by '?',
#   surrounding whitespace stripped, and a leading copy of the prompt removed.
# @raise [ArgumentError] If context is not a LLaMACpp::Context, prompt is not a String,
#   or the tokenized prompt is longer than n_ctx - 4 tokens.
def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
             n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
             repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
             top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  # Fixed-size sliding window (n_ctx entries) of the most recently seen tokens.
  last_n_tokens = [0] * n_ctx

  embd = []        # tokens waiting to be decoded on the next iteration
  n_consumed = 0   # number of prompt tokens already queued
  n_past = 0       # number of positions already decoded into the context
  n_remain = n_predict
  n_vocab = context.model.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        # The pending tokens would overflow the context window: restart at position
        # n_keep and re-feed the recent tokens that immediately precede the pending
        # ones (n_left / 2 of them).
        n_left = n_past - n_keep
        n_past = n_keep
        # The slice must be splatted so that embd stays a flat list of token ids;
        # inserting the Array itself would nest it as a single element.
        embd.insert(0, *last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      # The whole prompt has been queued: sample the next token from the logits.
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalties(
        candidates, last_n_tokens[-last_n_repeat..],
        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temp(candidates, temp: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      # Still consuming the prompt: queue up to n_batch prompt tokens.
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.model.token_to_piece(token) }

    break if !embd.empty? && embd[-1] == context.model.token_eos
  end

  output.join.scrub('?').strip.delete_prefix(prompt).strip
end

.max_devices ⇒ Object



3423
3424
3425
# File 'ext/llama_cpp/llama_cpp.cpp', line 3423

// LLaMACpp.max_devices: returns the value of llama_max_devices() as a Ruby Integer.
static VALUE rb_llama_max_devices(VALUE self) {
  const size_t n_devices = llama_max_devices();
  return SIZET2NUM(n_devices);
}

.model_quantize(*args) ⇒ Object



3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
# File 'ext/llama_cpp/llama_cpp.cpp', line 3382

// LLaMACpp.model_quantize(input_path:, output_path:, params:)
//
// Validates the three required keyword arguments and forwards them to
// llama_model_quantize(). Always returns nil on success.
//
// Raises ArgumentError when input_path or output_path is not a String, or when
// params is not a ModelQuantizeParams; raises RuntimeError when
// llama_model_quantize() returns a non-zero status.
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE opts = Qnil;
  ID keys[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
  VALUE vals[3] = { Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &opts);
  // All three keywords are required (required = 3, optional = 0).
  rb_get_kwargs(opts, keys, 3, 0, vals);

  // rb_raise does not return, so no explicit return is needed after each check.
  if (!RB_TYPE_P(vals[0], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
  }
  if (!RB_TYPE_P(vals[1], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
  }
  if (!rb_obj_is_kind_of(vals[2], rb_cLLaMAModelQuantizeParams)) {
    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
  }

  const char* src_path = StringValueCStr(vals[0]);
  const char* dst_path = StringValueCStr(vals[1]);
  LLaMAModelQuantizeParamsWrapper* quantize_params =
    RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(vals[2]);

  if (llama_model_quantize(src_path, dst_path, &(quantize_params->params)) != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
  }

  return Qnil;
}

.numa_init(strategy) ⇒ Object



3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
# File 'ext/llama_cpp/llama_cpp.cpp', line 3371

// LLaMACpp.numa_init(strategy)
//
// Converts the Integer strategy to enum ggml_numa_strategy and passes it to
// llama_numa_init(). Returns nil. Raises ArgumentError when strategy is not an Integer.
static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
  if (RB_INTEGER_TYPE_P(strategy)) {
    const int raw_strategy = NUM2INT(strategy);
    llama_numa_init(static_cast<enum ggml_numa_strategy>(raw_strategy));
    return Qnil;
  }

  rb_raise(rb_eArgError, "strategy must be an integer");
  return Qnil;
}


3414
3415
3416
3417
# File 'ext/llama_cpp/llama_cpp.cpp', line 3414

// Returns the C string produced by llama_print_system_info() as a UTF-8 Ruby String.
static VALUE rb_llama_print_system_info(VALUE self) {
  return rb_utf8_str_new_cstr(llama_print_system_info());
}

.supports_gpu_offload? ⇒ Boolean

Returns:

  • (Boolean)


3435
3436
3437
# File 'ext/llama_cpp/llama_cpp.cpp', line 3435

// LLaMACpp.supports_gpu_offload?: true when llama_supports_gpu_offload() reports
// support, false otherwise.
static VALUE rb_llama_supports_gpu_offload(VALUE self) {
  if (llama_supports_gpu_offload()) {
    return Qtrue;
  }
  return Qfalse;
}

.supports_mlock? ⇒ Boolean

Returns:

  • (Boolean)


3431
3432
3433
# File 'ext/llama_cpp/llama_cpp.cpp', line 3431

// LLaMACpp.supports_mlock?: true when llama_supports_mlock() reports support,
// false otherwise.
static VALUE rb_llama_supports_mlock(VALUE self) {
  if (llama_supports_mlock()) {
    return Qtrue;
  }
  return Qfalse;
}

.supports_mmap? ⇒ Boolean

Returns:

  • (Boolean)


3427
3428
3429
# File 'ext/llama_cpp/llama_cpp.cpp', line 3427

// LLaMACpp.supports_mmap?: true when llama_supports_mmap() reports support,
// false otherwise.
static VALUE rb_llama_supports_mmap(VALUE self) {
  if (llama_supports_mmap()) {
    return Qtrue;
  }
  return Qfalse;
}

.time_us ⇒ Object



3419
3420
3421
# File 'ext/llama_cpp/llama_cpp.cpp', line 3419

// LLaMACpp.time_us: returns the value of llama_time_us() as a Ruby Integer.
//
// llama_time_us() yields a 64-bit value. LL2NUM is used instead of LONG2NUM because
// `long` is only 32 bits wide on LLP64 platforms (e.g. 64-bit Windows), where
// LONG2NUM would truncate the timestamp.
static VALUE rb_llama_time_us(VALUE self) {
  return LL2NUM(llama_time_us());
}