Module: LLaMACpp

Defined in:
lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp

Overview

llama_cpp.rb provides Ruby bindings for llama.cpp.
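
A minimal end-to-end sketch, following the gem's README-style usage (the model path is hypothetical; any local GGUF model works):

require 'llama_cpp'

LLaMACpp.backend_init

model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: './model.gguf', params: model_params)
context_params = LLaMACpp::ContextParams.new
context = LLaMACpp::Context.new(model: model, params: context_params)

puts LLaMACpp.generate(context, 'Hello, World.')

LLaMACpp.backend_free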

Constant Summary

VERSION =

The installed version of llama_cpp.rb.

'0.17.1'
LLAMA_CPP_VERSION =

The supported version of llama.cpp.

'b3291'
LLAMA_VOCAB_TYPE_NONE =
INT2NUM(LLAMA_VOCAB_TYPE_NONE)
LLAMA_VOCAB_TYPE_SPM =
INT2NUM(LLAMA_VOCAB_TYPE_SPM)
LLAMA_VOCAB_TYPE_BPE =
INT2NUM(LLAMA_VOCAB_TYPE_BPE)
LLAMA_VOCAB_TYPE_WPM =
INT2NUM(LLAMA_VOCAB_TYPE_WPM)
LLAMA_VOCAB_TYPE_UGM =
INT2NUM(LLAMA_VOCAB_TYPE_UGM)
LLAMA_VOCAB_PRE_TYPE_DEFAULT =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEFAULT)
LLAMA_VOCAB_PRE_TYPE_LLAMA3 =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_LLAMA3)
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM)
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER)
LLAMA_VOCAB_PRE_TYPE_FALCON =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_FALCON)
LLAMA_VOCAB_PRE_TYPE_MPT =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT)
LLAMA_VOCAB_PRE_TYPE_STARCODER =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER)
LLAMA_VOCAB_PRE_TYPE_GPT2 =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2)
LLAMA_VOCAB_PRE_TYPE_REFACT =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT)
LLAMA_VOCAB_PRE_TYPE_COMMAND_R =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R)
LLAMA_VOCAB_PRE_TYPE_STABLELM2 =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_STABLELM2)
LLAMA_VOCAB_PRE_TYPE_QWEN2 =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2)
LLAMA_VOCAB_PRE_TYPE_OLMO =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO)
LLAMA_VOCAB_PRE_TYPE_DBRX =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX)
LLAMA_VOCAB_PRE_TYPE_SMAUG =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_SMAUG)
LLAMA_VOCAB_PRE_TYPE_PORO =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_PORO)
LLAMA_VOCAB_PRE_TYPE_VIKING =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_VIKING)
LLAMA_VOCAB_PRE_TYPE_JAIS =
INT2NUM(LLAMA_VOCAB_PRE_TYPE_JAIS)
LLAMA_TOKEN_TYPE_UNDEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED)
LLAMA_TOKEN_TYPE_NORMAL =
INT2NUM(LLAMA_TOKEN_TYPE_NORMAL)
LLAMA_TOKEN_TYPE_UNKNOWN =
INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN)
LLAMA_TOKEN_TYPE_CONTROL =
INT2NUM(LLAMA_TOKEN_TYPE_CONTROL)
LLAMA_TOKEN_TYPE_USER_DEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED)
LLAMA_TOKEN_TYPE_UNUSED =
INT2NUM(LLAMA_TOKEN_TYPE_UNUSED)
LLAMA_TOKEN_TYPE_BYTE =
INT2NUM(LLAMA_TOKEN_TYPE_BYTE)
LLAMA_TOKEN_ATTR_UNDEFINED =
INT2NUM(LLAMA_TOKEN_ATTR_UNDEFINED)
LLAMA_TOKEN_ATTR_UNKNOWN =
INT2NUM(LLAMA_TOKEN_ATTR_UNKNOWN)
LLAMA_TOKEN_ATTR_UNUSED =
INT2NUM(LLAMA_TOKEN_ATTR_UNUSED)
LLAMA_TOKEN_ATTR_NORMAL =
INT2NUM(LLAMA_TOKEN_ATTR_NORMAL)
LLAMA_TOKEN_ATTR_CONTROL =
INT2NUM(LLAMA_TOKEN_ATTR_CONTROL)
LLAMA_TOKEN_ATTR_USER_DEFINED =
INT2NUM(LLAMA_TOKEN_ATTR_USER_DEFINED)
LLAMA_TOKEN_ATTR_BYTE =
INT2NUM(LLAMA_TOKEN_ATTR_BYTE)
LLAMA_TOKEN_ATTR_NORMALIZED =
INT2NUM(LLAMA_TOKEN_ATTR_NORMALIZED)
LLAMA_TOKEN_ATTR_LSTRIP =
INT2NUM(LLAMA_TOKEN_ATTR_LSTRIP)
LLAMA_TOKEN_ATTR_RSTRIP =
INT2NUM(LLAMA_TOKEN_ATTR_RSTRIP)
LLAMA_TOKEN_ATTR_SINGLE_WORD =
INT2NUM(LLAMA_TOKEN_ATTR_SINGLE_WORD)
LLAMA_FTYPE_ALL_F32 =
INT2NUM(LLAMA_FTYPE_ALL_F32)
LLAMA_FTYPE_MOSTLY_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
LLAMA_FTYPE_MOSTLY_Q4_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
LLAMA_FTYPE_MOSTLY_Q4_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
LLAMA_FTYPE_MOSTLY_Q8_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)
LLAMA_FTYPE_MOSTLY_Q5_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)
LLAMA_FTYPE_MOSTLY_Q5_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)
LLAMA_FTYPE_MOSTLY_Q2_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K)
LLAMA_FTYPE_MOSTLY_Q3_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S)
LLAMA_FTYPE_MOSTLY_Q3_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M)
LLAMA_FTYPE_MOSTLY_Q3_K_L =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L)
LLAMA_FTYPE_MOSTLY_Q4_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S)
LLAMA_FTYPE_MOSTLY_Q4_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M)
LLAMA_FTYPE_MOSTLY_Q5_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S)
LLAMA_FTYPE_MOSTLY_Q5_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M)
LLAMA_FTYPE_MOSTLY_Q6_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K)
LLAMA_FTYPE_MOSTLY_IQ2_XXS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS)
LLAMA_FTYPE_MOSTLY_IQ2_XS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS)
LLAMA_FTYPE_MOSTLY_Q2_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S)
LLAMA_FTYPE_MOSTLY_IQ3_XS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XS)
LLAMA_FTYPE_MOSTLY_IQ3_XXS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS)
LLAMA_FTYPE_MOSTLY_IQ1_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S)
LLAMA_FTYPE_MOSTLY_IQ4_NL =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL)
LLAMA_FTYPE_MOSTLY_IQ3_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S)
LLAMA_FTYPE_MOSTLY_IQ3_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M)
LLAMA_FTYPE_MOSTLY_IQ4_XS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS)
LLAMA_FTYPE_MOSTLY_IQ1_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M)
LLAMA_FTYPE_MOSTLY_BF16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_BF16)
LLAMA_FTYPE_GUESSED =
INT2NUM(LLAMA_FTYPE_GUESSED)
LLAMA_KV_OVERRIDE_TYPE_INT =
INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT)
LLAMA_KV_OVERRIDE_TYPE_FLOAT =
INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT)
LLAMA_KV_OVERRIDE_TYPE_BOOL =
INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL)
LLAMA_KV_OVERRIDE_TYPE_STR =
INT2NUM(LLAMA_KV_OVERRIDE_TYPE_STR)
LLAMA_GRETYPE_END =
INT2NUM(LLAMA_GRETYPE_END)
LLAMA_GRETYPE_ALT =
INT2NUM(LLAMA_GRETYPE_ALT)
LLAMA_GRETYPE_RULE_REF =
INT2NUM(LLAMA_GRETYPE_RULE_REF)
LLAMA_GRETYPE_CHAR =
INT2NUM(LLAMA_GRETYPE_CHAR)
LLAMA_GRETYPE_CHAR_NOT =
INT2NUM(LLAMA_GRETYPE_CHAR_NOT)
LLAMA_GRETYPE_CHAR_RNG_UPPER =
INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER)
LLAMA_GRETYPE_CHAR_ALT =
INT2NUM(LLAMA_GRETYPE_CHAR_ALT)
LLAMA_GRETYPE_CHAR_ANY =
INT2NUM(LLAMA_GRETYPE_CHAR_ANY)
LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED)
LLAMA_ROPE_SCALING_TYPE_NONE =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_NONE)
LLAMA_ROPE_SCALING_TYPE_LINEAR =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_LINEAR)
LLAMA_ROPE_SCALING_TYPE_YARN =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN)
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE)
LLAMA_POOLING_TYPE_UNSPECIFIED =
INT2NUM(LLAMA_POOLING_TYPE_UNSPECIFIED)
LLAMA_POOLING_TYPE_NONE =
INT2NUM(LLAMA_POOLING_TYPE_NONE)
LLAMA_POOLING_TYPE_MEAN =
INT2NUM(LLAMA_POOLING_TYPE_MEAN)
LLAMA_POOLING_TYPE_CLS =
INT2NUM(LLAMA_POOLING_TYPE_CLS)
LLAMA_POOLING_TYPE_LAST =
INT2NUM(LLAMA_POOLING_TYPE_LAST)
LLAMA_SPLIT_MODE_NONE =
INT2NUM(LLAMA_SPLIT_MODE_NONE)
LLAMA_SPLIT_MODE_LAYER =
INT2NUM(LLAMA_SPLIT_MODE_LAYER)
LLAMA_SPLIT_MODE_ROW =
INT2NUM(LLAMA_SPLIT_MODE_ROW)
LLAMA_FILE_MAGIC_GGLA =
rb_str_new2(ss_magic.str().c_str())
LLAMA_FILE_MAGIC_GGSN =
rb_str_new2(ss_magic.str().c_str())
LLAMA_FILE_MAGIC_GGSQ =
rb_str_new2(ss_magic.str().c_str())
LLAMA_SESSION_MAGIC =
rb_str_new2(ss_magic.str().c_str())
LLAMA_STATE_SEQ_MAGIC =
rb_str_new2(ss_magic.str().c_str())
LLAMA_DEFAULT_SEED =
rb_str_new2(ss_magic.str().c_str())
LLAMA_SESSION_VERSION =
rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())
LLAMA_STATE_SEQ_VERSION =
rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str())
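
Most of these constants are Integer objects mirroring the corresponding llama.cpp enum values (the file magics and version constants are Strings). As one sketch of how they are used, a rope-scaling constant could be assigned to a context's parameters; this assumes ContextParams exposes a rope_scaling_type= writer matching the rope_scaling_type field of llama_context_params:

context_params = LLaMACpp::ContextParams.new
context_params.rope_scaling_type = LLaMACpp::LLAMA_ROPE_SCALING_TYPE_YARN # assumed writer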

Class Method Summary

Class Method Details

.backend_free ⇒ Object



# File 'ext/llama_cpp/llama_cpp.cpp', line 3393

static VALUE rb_llama_llama_backend_free(VALUE self) {
  llama_backend_free();

  return Qnil;
}

.backend_init ⇒ Object

Initializes the llama.cpp backend. Call it once before using the other module functions, and pair it with .backend_free at shutdown.



# File 'ext/llama_cpp/llama_cpp.cpp', line 3387

static VALUE rb_llama_llama_backend_init(VALUE self) {
  llama_backend_init();

  return Qnil;
}
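
Together with .backend_free above, a typical program brackets all other work between the two calls (a sketch):

require 'llama_cpp'

LLaMACpp.backend_init
begin
  # load models, build contexts, generate ...
ensure
  LLaMACpp.backend_free
end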

.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String

Generates text that continues the given prompt. Intended as a simple operation check for the bindings.

Parameters:

  • context (LLaMACpp::Context)

    The context to use.

  • prompt (String)

    The prompt to start generation with.

  • n_predict (Integer) (defaults to: 128)

    The number of tokens to predict.

  • n_keep (Integer) (defaults to: 10)

    The number of tokens to keep in the context.

  • n_batch (Integer) (defaults to: 512)

    The number of tokens to process in a batch.

  • repeat_last_n (Integer) (defaults to: 64)

    The number of tokens to consider for repetition penalty.

  • repeat_penalty (Float) (defaults to: 1.1)

    The repetition penalty.

  • frequency (Float) (defaults to: 0.0)

    The frequency penalty.

  • presence (Float) (defaults to: 0.0)

    The presence penalty.

  • top_k (Integer) (defaults to: 40)

    The number of tokens to consider for top-k sampling.

  • top_p (Float) (defaults to: 0.95)

    The probability threshold for nucleus sampling.

  • tfs_z (Float) (defaults to: 1.0)

    The z parameter for tail-free sampling.

  • typical_p (Float) (defaults to: 1.0)

    The probability for typical sampling.

  • temperature (Float) (defaults to: 0.8)

    The temperature for temperature sampling.

Returns:

  • (String)

Raises:

  • (ArgumentError)


# File 'lib/llama_cpp.rb', line 27

def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
             n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
             repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
             top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_past = 0
  n_remain = n_predict
  n_vocab = context.model.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        n_left = n_past - n_keep
        n_past = n_keep
        embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalties(
        candidates, last_n_tokens[-last_n_repeat..],
        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temp(candidates, temp: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.model.token_to_piece(token) }

    break if !embd.empty? && embd[-1] == context.model.token_eos
  end

  output.join.scrub('?').strip.delete_prefix(prompt).strip
end
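
The sampling keywords can be tuned per call. For example, near-greedy and shorter output (a sketch, reusing the context built in the Overview example):

LLaMACpp.generate(context, 'Ruby is', n_predict: 32, top_k: 1, temperature: 0.1)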

.max_devices ⇒ Object



# File 'ext/llama_cpp/llama_cpp.cpp', line 3451

static VALUE rb_llama_max_devices(VALUE self) {
  return SIZET2NUM(llama_max_devices());
}

.model_quantize(*args) ⇒ Object



# File 'ext/llama_cpp/llama_cpp.cpp', line 3410

static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
  VALUE kw_values[3] = { Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);

  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
    return Qnil;
  }
  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
    return Qnil;
  }
  if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
    return Qnil;
  }

  const char* input_path = StringValueCStr(kw_values[0]);
  const char* output_path = StringValueCStr(kw_values[1]);
  LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);

  if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
    return Qnil;
  }

  return Qnil;
}
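
Example: quantizing a GGUF file (a sketch; the paths are hypothetical, and the ftype= writer on ModelQuantizeParams is assumed to mirror the ftype field of llama_model_quantize_params):

params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M # assumed writer
LLaMACpp.model_quantize(input_path: './model-f16.gguf',
                        output_path: './model-q4_k_m.gguf',
                        params: params)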

.numa_init(strategy) ⇒ Object



# File 'ext/llama_cpp/llama_cpp.cpp', line 3399

static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
  if (!RB_INTEGER_TYPE_P(strategy)) {
    rb_raise(rb_eArgError, "strategy must be an integer");
    return Qnil;
  }

  llama_numa_init(static_cast<enum ggml_numa_strategy>(NUM2INT(strategy)));

  return Qnil;
}
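
The strategy integer mirrors ggml's ggml_numa_strategy enum. A sketch, called once after .backend_init (in current ggml sources, 1 corresponds to the "distribute" strategy):

LLaMACpp.backend_init
LLaMACpp.numa_init(1) # 1 = GGML_NUMA_STRATEGY_DISTRIBUTE in ggml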


.print_system_info ⇒ Object

# File 'ext/llama_cpp/llama_cpp.cpp', line 3442

static VALUE rb_llama_print_system_info(VALUE self) {
  const char* result = llama_print_system_info();
  return rb_utf8_str_new_cstr(result);
}
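
Example: printing the feature flags the loaded llama.cpp build was compiled with (the exact output depends on the build):

puts LLaMACpp.print_system_info
# e.g. "AVX = 1 | AVX2 = 1 | FMA = 1 | ..." (varies by build)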

.supports_gpu_offload? ⇒ Boolean

Returns:

  • (Boolean)


# File 'ext/llama_cpp/llama_cpp.cpp', line 3463

static VALUE rb_llama_supports_gpu_offload(VALUE self) {
  return llama_supports_gpu_offload() ? Qtrue : Qfalse;
}

.supports_mlock? ⇒ Boolean

Returns:

  • (Boolean)


# File 'ext/llama_cpp/llama_cpp.cpp', line 3459

static VALUE rb_llama_supports_mlock(VALUE self) {
  return llama_supports_mlock() ? Qtrue : Qfalse;
}

.supports_mmap? ⇒ Boolean

Returns:

  • (Boolean)


# File 'ext/llama_cpp/llama_cpp.cpp', line 3455

static VALUE rb_llama_supports_mmap(VALUE self) {
  return llama_supports_mmap() ? Qtrue : Qfalse;
}
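
These predicates, together with .max_devices above, make a quick capability report (a sketch):

puts "devices:     #{LLaMACpp.max_devices}"
puts "mmap:        #{LLaMACpp.supports_mmap?}"
puts "mlock:       #{LLaMACpp.supports_mlock?}"
puts "GPU offload: #{LLaMACpp.supports_gpu_offload?}"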

.time_us ⇒ Object



# File 'ext/llama_cpp/llama_cpp.cpp', line 3447

static VALUE rb_llama_time_us(VALUE self) {
  return LONG2NUM(llama_time_us());
}
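
The value is a timestamp in microseconds, so elapsed time is the difference of two calls (a sketch, reusing a context from the earlier examples):

t0 = LLaMACpp.time_us
LLaMACpp.generate(context, 'Hello')
elapsed_ms = (LLaMACpp.time_us - t0) / 1000.0
puts "elapsed: #{elapsed_ms} ms"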