Module: LLaMACpp

Defined in:
lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp

Overview

llama_cpp.rb provides Ruby bindings for the llama.cpp.

Constant Summary collapse

VERSION =

The version of llama_cpp.rb you install.

'0.11.1'
LLAMA_CPP_VERSION =

The version of llama.cpp bundled with llama_cpp.rb.

'b1768'
LLAMA_MAX_DEVICES =
INT2NUM(LLAMA_MAX_DEVICES)
LLAMA_VOCAB_TYPE_SPM =
INT2NUM(LLAMA_VOCAB_TYPE_SPM)
LLAMA_VOCAB_TYPE_BPE =
INT2NUM(LLAMA_VOCAB_TYPE_BPE)
LLAMA_TOKEN_TYPE_UNDEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED)
LLAMA_TOKEN_TYPE_NORMAL =
INT2NUM(LLAMA_TOKEN_TYPE_NORMAL)
LLAMA_TOKEN_TYPE_UNKNOWN =
INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN)
LLAMA_TOKEN_TYPE_CONTROL =
INT2NUM(LLAMA_TOKEN_TYPE_CONTROL)
LLAMA_TOKEN_TYPE_USER_DEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED)
LLAMA_TOKEN_TYPE_UNUSED =
INT2NUM(LLAMA_TOKEN_TYPE_UNUSED)
LLAMA_TOKEN_TYPE_BYTE =
INT2NUM(LLAMA_TOKEN_TYPE_BYTE)
LLAMA_FTYPE_ALL_F32 =
INT2NUM(LLAMA_FTYPE_ALL_F32)
LLAMA_FTYPE_MOSTLY_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
LLAMA_FTYPE_MOSTLY_Q4_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
LLAMA_FTYPE_MOSTLY_Q4_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
LLAMA_FTYPE_MOSTLY_Q8_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)
LLAMA_FTYPE_MOSTLY_Q5_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)
LLAMA_FTYPE_MOSTLY_Q5_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)
LLAMA_FTYPE_MOSTLY_Q2_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K)
LLAMA_FTYPE_MOSTLY_Q3_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S)
LLAMA_FTYPE_MOSTLY_Q3_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M)
LLAMA_FTYPE_MOSTLY_Q3_K_L =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L)
LLAMA_FTYPE_MOSTLY_Q4_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S)
LLAMA_FTYPE_MOSTLY_Q4_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M)
LLAMA_FTYPE_MOSTLY_Q5_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S)
LLAMA_FTYPE_MOSTLY_Q5_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M)
LLAMA_FTYPE_MOSTLY_Q6_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K)
LLAMA_FTYPE_GUESSED =
INT2NUM(LLAMA_FTYPE_GUESSED)
LLAMA_KV_OVERRIDE_INT =
INT2NUM(LLAMA_KV_OVERRIDE_INT)
LLAMA_KV_OVERRIDE_FLOAT =
INT2NUM(LLAMA_KV_OVERRIDE_FLOAT)
LLAMA_KV_OVERRIDE_BOOL =
INT2NUM(LLAMA_KV_OVERRIDE_BOOL)
LLAMA_GRETYPE_END =
INT2NUM(LLAMA_GRETYPE_END)
LLAMA_GRETYPE_ALT =
INT2NUM(LLAMA_GRETYPE_ALT)
LLAMA_GRETYPE_RULE_REF =
INT2NUM(LLAMA_GRETYPE_RULE_REF)
LLAMA_GRETYPE_CHAR =
INT2NUM(LLAMA_GRETYPE_CHAR)
LLAMA_GRETYPE_CHAR_NOT =
INT2NUM(LLAMA_GRETYPE_CHAR_NOT)
LLAMA_GRETYPE_CHAR_RNG_UPPER =
INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER)
LLAMA_GRETYPE_CHAR_ALT =
INT2NUM(LLAMA_GRETYPE_CHAR_ALT)
LLAMA_ROPE_SCALING_UNSPECIFIED =
INT2NUM(LLAMA_ROPE_SCALING_UNSPECIFIED)
LLAMA_ROPE_SCALING_NONE =
INT2NUM(LLAMA_ROPE_SCALING_NONE)
LLAMA_ROPE_SCALING_LINEAR =
INT2NUM(LLAMA_ROPE_SCALING_LINEAR)
LLAMA_ROPE_SCALING_YARN =
INT2NUM(LLAMA_ROPE_SCALING_YARN)
LLAMA_ROPE_SCALING_MAX_VALUE =
INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE)
LLAMA_FILE_MAGIC_GGLA =
rb_str_new2(ss_magic.str().c_str())
LLAMA_FILE_MAGIC_GGSN =
rb_str_new2(ss_magic.str().c_str())
LLAMA_SESSION_MAGIC =
rb_str_new2(ss_magic.str().c_str())
LLAMA_DEFAULT_SEED =
rb_str_new2(ss_magic.str().c_str())
LLAMA_SESSION_VERSION =
rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())

Class Method Summary collapse

Class Method Details

.backend_freeObject



3049
3050
3051
3052
3053
# File 'ext/llama_cpp/llama_cpp.cpp', line 3049

static VALUE rb_llama_llama_backend_free(VALUE self) {
  llama_backend_free();

  return Qnil;
}

.backend_init(*args) ⇒ Object

module functions



3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
# File 'ext/llama_cpp/llama_cpp.cpp', line 3036

static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[1] = { rb_intern("numa") };
  VALUE kw_values[1] = { Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);

  const bool numa = kw_values[0] == Qundef ? false : (RTEST(kw_values[0]) ? true : false);
  llama_backend_init(numa);

  return Qnil;
}

.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String

Generates sentences following the given prompt for operation check.

Parameters:

  • context (LLaMACpp::Context)

    The context to use.

  • prompt (String)

    The prompt to start generation with.

  • n_predict (Integer) (defaults to: 128)

    The number of tokens to predict.

  • n_keep (Integer) (defaults to: 10)

    The number of tokens to keep in the context.

  • n_batch (Integer) (defaults to: 512)

    The number of tokens to process in a batch.

  • repeat_last_n (Integer) (defaults to: 64)

    The number of tokens to consider for repetition penalty.

  • repeat_penalty (Float) (defaults to: 1.1)

    The repetition penalty.

  • frequency (Float) (defaults to: 0.0)

    The frequency penalty.

  • presence (Float) (defaults to: 0.0)

    The presence penalty.

  • top_k (Integer) (defaults to: 40)

    The number of tokens to consider for top-k sampling.

  • top_p (Float) (defaults to: 0.95)

    The probability threshold for nucleus sampling.

  • tfs_z (Float) (defaults to: 1.0)

    The z parameter for tail-free sampling.

  • typical_p (Float) (defaults to: 1.0)

    The probability for typical sampling.

  • temperature (Float) (defaults to: 0.8)

    The temperature for temperature sampling.

Returns:

  • (String)

Raises:

  • (ArgumentError)


27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/llama_cpp.rb', line 27

def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
             n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
             repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
             top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_past = 0
  n_remain = n_predict
  n_vocab = context.model.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        n_left = n_past - n_keep
        n_past = n_keep
        embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.eval(tokens: embd, n_past: n_past)
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalties(
        candidates, last_n_tokens[-last_n_repeat..],
        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temperature(candidates, temperature: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.model.token_to_piece(token) }

    break if !embd.empty? && embd[-1] == context.model.token_eos
  end

  output.join.scrub('?').strip.delete_prefix(prompt).strip
end

.max_devicesObject



3104
3105
3106
# File 'ext/llama_cpp/llama_cpp.cpp', line 3104

static VALUE rb_llama_max_devices(VALUE self) {
  return INT2NUM(llama_max_devices());
}

.mlock_supported?Boolean

Returns:

  • (Boolean)


3100
3101
3102
# File 'ext/llama_cpp/llama_cpp.cpp', line 3100

static VALUE rb_llama_mlock_supported(VALUE self) {
  return llama_mlock_supported() ? Qtrue : Qfalse;
}

.mmap_supported?Boolean

Returns:

  • (Boolean)


3096
3097
3098
# File 'ext/llama_cpp/llama_cpp.cpp', line 3096

static VALUE rb_llama_mmap_supported(VALUE self) {
  return llama_mmap_supported() ? Qtrue : Qfalse;
}

.model_quantize(*args) ⇒ Object



3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
# File 'ext/llama_cpp/llama_cpp.cpp', line 3055

static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
  VALUE kw_values[3] = { Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);

  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
    return Qnil;
  }
  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
    return Qnil;
  }
  if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
    return Qnil;
  }

  const char* input_path = StringValueCStr(kw_values[0]);
  const char* output_path = StringValueCStr(kw_values[1]);
  LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);

  if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
    return Qnil;
  }

  return Qnil;
}


3087
3088
3089
3090
# File 'ext/llama_cpp/llama_cpp.cpp', line 3087

static VALUE rb_llama_print_system_info(VALUE self) {
  const char* result = llama_print_system_info();
  return rb_utf8_str_new_cstr(result);
}

.time_usObject



3092
3093
3094
# File 'ext/llama_cpp/llama_cpp.cpp', line 3092

static VALUE rb_llama_time_us(VALUE self) {
  return LONG2NUM(llama_time_us());
}