Module: LLaMACpp

Defined in:
lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp

Overview

llama_cpp.rb provides Ruby bindings for llama.cpp.
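
For example, a minimal usage sketch (the model path is a placeholder, and the parameter class names follow this gem's README):

require 'llama_cpp'

LLaMACpp.backend_init

model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

puts LLaMACpp.generate(context, 'Hello, World.')

LLaMACpp.backend_free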

Constant Summary

VERSION = '0.7.1'

  The version of llama_cpp.rb you have installed.

LLAMA_CPP_VERSION = 'b1380'

  The version of llama.cpp bundled with llama_cpp.rb.

LLAMA_MAX_DEVICES = INT2NUM(LLAMA_MAX_DEVICES)
LLAMA_VOCAB_TYPE_SPM = INT2NUM(LLAMA_VOCAB_TYPE_SPM)
LLAMA_VOCAB_TYPE_BPE = INT2NUM(LLAMA_VOCAB_TYPE_BPE)
LLAMA_TOKEN_TYPE_UNDEFINED = INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED)
LLAMA_TOKEN_TYPE_NORMAL = INT2NUM(LLAMA_TOKEN_TYPE_NORMAL)
LLAMA_TOKEN_TYPE_UNKNOWN = INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN)
LLAMA_TOKEN_TYPE_CONTROL = INT2NUM(LLAMA_TOKEN_TYPE_CONTROL)
LLAMA_TOKEN_TYPE_USER_DEFINED = INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED)
LLAMA_TOKEN_TYPE_UNUSED = INT2NUM(LLAMA_TOKEN_TYPE_UNUSED)
LLAMA_TOKEN_TYPE_BYTE = INT2NUM(LLAMA_TOKEN_TYPE_BYTE)
LLAMA_FTYPE_ALL_F32 = INT2NUM(LLAMA_FTYPE_ALL_F32)
LLAMA_FTYPE_MOSTLY_F16 = INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
LLAMA_FTYPE_MOSTLY_Q4_0 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
LLAMA_FTYPE_MOSTLY_Q4_1 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
LLAMA_FTYPE_MOSTLY_Q8_0 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)
LLAMA_FTYPE_MOSTLY_Q5_0 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)
LLAMA_FTYPE_MOSTLY_Q5_1 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)
LLAMA_FTYPE_MOSTLY_Q2_K = INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K)
LLAMA_FTYPE_MOSTLY_Q3_K_S = INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S)
LLAMA_FTYPE_MOSTLY_Q3_K_M = INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M)
LLAMA_FTYPE_MOSTLY_Q3_K_L = INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L)
LLAMA_FTYPE_MOSTLY_Q4_K_S = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S)
LLAMA_FTYPE_MOSTLY_Q4_K_M = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M)
LLAMA_FTYPE_MOSTLY_Q5_K_S = INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S)
LLAMA_FTYPE_MOSTLY_Q5_K_M = INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M)
LLAMA_FTYPE_MOSTLY_Q6_K = INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K)
LLAMA_FTYPE_GUESSED = INT2NUM(LLAMA_FTYPE_GUESSED)
LLAMA_GRETYPE_END = INT2NUM(LLAMA_GRETYPE_END)
LLAMA_GRETYPE_ALT = INT2NUM(LLAMA_GRETYPE_ALT)
LLAMA_GRETYPE_RULE_REF = INT2NUM(LLAMA_GRETYPE_RULE_REF)
LLAMA_GRETYPE_CHAR = INT2NUM(LLAMA_GRETYPE_CHAR)
LLAMA_GRETYPE_CHAR_NOT = INT2NUM(LLAMA_GRETYPE_CHAR_NOT)
LLAMA_GRETYPE_CHAR_RNG_UPPER = INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER)
LLAMA_GRETYPE_CHAR_ALT = INT2NUM(LLAMA_GRETYPE_CHAR_ALT)
LLAMA_FILE_MAGIC_GGSN = rb_str_new2(ss_magic.str().c_str())
LLAMA_SESSION_MAGIC = rb_str_new2(ss_magic.str().c_str())
LLAMA_DEFAULT_SEED = rb_str_new2(std::to_string(LLAMA_DEFAULT_SEED).c_str())
LLAMA_SESSION_VERSION = rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())

Class Method Summary

  .backend_free ⇒ Object
  .backend_init(*args) ⇒ Object
  .generate(context, prompt, ...) ⇒ String
  .max_devices ⇒ Object
  .mlock_supported? ⇒ Boolean
  .mmap_supported? ⇒ Boolean
  .model_quantize(*args) ⇒ Object
  .print_system_info ⇒ Object
  .time_us ⇒ Object

Class Method Details

.backend_free ⇒ Object



# File 'ext/llama_cpp/llama_cpp.cpp', line 2822

static VALUE rb_llama_llama_backend_free(VALUE self) {
  llama_backend_free();

  return Qnil;
}

.backend_init(*args) ⇒ Object

Initializes the llama.cpp backend. Accepts an optional numa: keyword argument (defaults to false) that enables NUMA optimizations.



# File 'ext/llama_cpp/llama_cpp.cpp', line 2809

static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[1] = { rb_intern("numa") };
  VALUE kw_values[1] = { Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);

  const bool numa = kw_values[0] == Qundef ? false : (RTEST(kw_values[0]) ? true : false);
  llama_backend_init(numa);

  return Qnil;
}
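
A short lifecycle sketch: initialize the backend once before loading any model, and pair it with .backend_free when the process is done:

LLaMACpp.backend_init(numa: false)
at_exit { LLaMACpp.backend_free }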

.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String

Generates text that continues the given prompt; intended as a simple operation check.

Parameters:

  • context (LLaMACpp::Context)

    The context to use.

  • prompt (String)

    The prompt to start generation with.

  • n_predict (Integer) (defaults to: 128)

    The number of tokens to predict.

  • n_keep (Integer) (defaults to: 10)

    The number of tokens to keep in the context.

  • n_batch (Integer) (defaults to: 512)

    The number of tokens to process in a batch.

  • repeat_last_n (Integer) (defaults to: 64)

    The number of tokens to consider for repetition penalty.

  • repeat_penalty (Float) (defaults to: 1.1)

    The repetition penalty.

  • frequency (Float) (defaults to: 0.0)

    The frequency penalty.

  • presence (Float) (defaults to: 0.0)

    The presence penalty.

  • top_k (Integer) (defaults to: 40)

    The number of tokens to consider for top-k sampling.

  • top_p (Float) (defaults to: 0.95)

    The probability threshold for nucleus sampling.

  • tfs_z (Float) (defaults to: 1.0)

    The z parameter for tail-free sampling.

  • typical_p (Float) (defaults to: 1.0)

    The probability for typical sampling.

  • temperature (Float) (defaults to: 0.8)

    The temperature for temperature sampling.

Returns:

  • (String)

Raises:

  • (ArgumentError)


# File 'lib/llama_cpp.rb', line 27

def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
             n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
             repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
             top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_past = 0
  n_remain = n_predict
  n_vocab = context.model.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        # Context swap: keep the first n_keep tokens and re-feed the second
        # half of the most recent tokens. The splat is needed so the tokens
        # are inserted individually rather than as a nested array.
        n_left = n_past - n_keep
        n_past = n_keep
        embd.insert(0, *last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.eval(tokens: embd, n_past: n_past)
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
      context.sample_frequency_and_presence_penalties(
        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temperature(candidates, temperature: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.model.token_to_piece(token) }

    break if !embd.empty? && embd[-1] == context.token_eos
  end

  output.join.scrub('?').strip.delete_prefix(prompt).strip
end
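
For instance, reusing a context built as in the Overview sketch:

# `context` is a LLaMACpp::Context prepared beforehand.
text = LLaMACpp.generate(context, 'Ruby is a language for',
                         n_predict: 64, temperature: 0.7, top_p: 0.9)
puts text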

.max_devices ⇒ Object



# File 'ext/llama_cpp/llama_cpp.cpp', line 2877

static VALUE rb_llama_max_devices(VALUE self) {
  return INT2NUM(llama_max_devices());
}

.mlock_supported? ⇒ Boolean

Returns:

  • (Boolean)


# File 'ext/llama_cpp/llama_cpp.cpp', line 2873

static VALUE rb_llama_mlock_supported(VALUE self) {
  return llama_mlock_supported() ? Qtrue : Qfalse;
}

.mmap_supported? ⇒ Boolean

Returns:

  • (Boolean)


# File 'ext/llama_cpp/llama_cpp.cpp', line 2869

static VALUE rb_llama_mmap_supported(VALUE self) {
  return llama_mmap_supported() ? Qtrue : Qfalse;
}

.model_quantize(*args) ⇒ Object



# File 'ext/llama_cpp/llama_cpp.cpp', line 2828

static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
  VALUE kw_values[3] = { Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);

  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
    return Qnil;
  }
  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
    return Qnil;
  }
  if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
    return Qnil;
  }

  const char* input_path = StringValueCStr(kw_values[0]);
  const char* output_path = StringValueCStr(kw_values[1]);
  LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);

  if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
    return Qnil;
  }

  return Qnil;
}
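
A usage sketch with hypothetical file names; it assumes ModelQuantizeParams exposes an ftype accessor for selecting the target quantization:

params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0

LLaMACpp.model_quantize(input_path: 'model-f16.gguf',
                        output_path: 'model-q4_0.gguf',
                        params: params)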


.print_system_info ⇒ Object

# File 'ext/llama_cpp/llama_cpp.cpp', line 2860

static VALUE rb_llama_print_system_info(VALUE self) {
  const char* result = llama_print_system_info();
  return rb_utf8_str_new_cstr(result);
}

.time_us ⇒ Object



# File 'ext/llama_cpp/llama_cpp.cpp', line 2865

static VALUE rb_llama_time_us(VALUE self) {
  return LONG2NUM(llama_time_us());
}
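
Taken together, these module functions give a quick capability report, for example:

puts LLaMACpp.print_system_info
puts "max devices: #{LLaMACpp.max_devices}"
puts "mmap supported:  #{LLaMACpp.mmap_supported?}"
puts "mlock supported: #{LLaMACpp.mlock_supported?}"

t0 = LLaMACpp.time_us
# ... some work ...
puts "elapsed: #{(LLaMACpp.time_us - t0) / 1000.0} ms"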