Module: LLaMACpp

Defined in:
lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp

Overview

llama_cpp.rb provides Ruby bindings for llama.cpp.
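For orientation, a minimal session using only the module-level functions documented on this page might look like the following sketch (model loading and text generation are covered under .generate below):

require 'llama_cpp'

LLaMACpp.backend_init(numa: false) # initialize the llama.cpp backend
puts LLaMACpp.print_system_info    # report the compiled-in system features
LLaMACpp.backend_free              # release backend resources when finished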

Constant Summary

VERSION =

The version of llama_cpp.rb you install.

'0.6.0'
LLAMA_CPP_VERSION =

The version of llama.cpp bundled with llama_cpp.rb.

'b1292'
LLAMA_MAX_DEVICES =
INT2NUM(LLAMA_MAX_DEVICES)
LLAMA_VOCAB_TYPE_SPM =
INT2NUM(LLAMA_VOCAB_TYPE_SPM)
LLAMA_VOCAB_TYPE_BPE =
INT2NUM(LLAMA_VOCAB_TYPE_BPE)
LLAMA_TOKEN_TYPE_UNDEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED)
LLAMA_TOKEN_TYPE_NORMAL =
INT2NUM(LLAMA_TOKEN_TYPE_NORMAL)
LLAMA_TOKEN_TYPE_UNKNOWN =
INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN)
LLAMA_TOKEN_TYPE_CONTROL =
INT2NUM(LLAMA_TOKEN_TYPE_CONTROL)
LLAMA_TOKEN_TYPE_USER_DEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED)
LLAMA_TOKEN_TYPE_UNUSED =
INT2NUM(LLAMA_TOKEN_TYPE_UNUSED)
LLAMA_TOKEN_TYPE_BYTE =
INT2NUM(LLAMA_TOKEN_TYPE_BYTE)
LLAMA_FTYPE_ALL_F32 =
INT2NUM(LLAMA_FTYPE_ALL_F32)
LLAMA_FTYPE_MOSTLY_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
LLAMA_FTYPE_MOSTLY_Q4_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
LLAMA_FTYPE_MOSTLY_Q4_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
LLAMA_FTYPE_MOSTLY_Q8_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)
LLAMA_FTYPE_MOSTLY_Q5_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)
LLAMA_FTYPE_MOSTLY_Q5_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)
LLAMA_FTYPE_MOSTLY_Q2_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K)
LLAMA_FTYPE_MOSTLY_Q3_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S)
LLAMA_FTYPE_MOSTLY_Q3_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M)
LLAMA_FTYPE_MOSTLY_Q3_K_L =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L)
LLAMA_FTYPE_MOSTLY_Q4_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S)
LLAMA_FTYPE_MOSTLY_Q4_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M)
LLAMA_FTYPE_MOSTLY_Q5_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S)
LLAMA_FTYPE_MOSTLY_Q5_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M)
LLAMA_FTYPE_MOSTLY_Q6_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K)
LLAMA_FTYPE_GUESSED =
INT2NUM(LLAMA_FTYPE_GUESSED)
LLAMA_GRETYPE_END =
INT2NUM(LLAMA_GRETYPE_END)
LLAMA_GRETYPE_ALT =
INT2NUM(LLAMA_GRETYPE_ALT)
LLAMA_GRETYPE_RULE_REF =
INT2NUM(LLAMA_GRETYPE_RULE_REF)
LLAMA_GRETYPE_CHAR =
INT2NUM(LLAMA_GRETYPE_CHAR)
LLAMA_GRETYPE_CHAR_NOT =
INT2NUM(LLAMA_GRETYPE_CHAR_NOT)
LLAMA_GRETYPE_CHAR_RNG_UPPER =
INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER)
LLAMA_GRETYPE_CHAR_ALT =
INT2NUM(LLAMA_GRETYPE_CHAR_ALT)
LLAMA_FILE_MAGIC_GGSN =
rb_str_new2(ss_magic.str().c_str())
LLAMA_SESSION_MAGIC =
rb_str_new2(ss_magic.str().c_str())
LLAMA_DEFAULT_SEED =
rb_str_new2(std::to_string(LLAMA_DEFAULT_SEED).c_str())
LLAMA_SESSION_VERSION =
rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())
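The version constants are ordinary Ruby values and can be inspected directly, for example:

puts LLaMACpp::VERSION           # => "0.6.0"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b1292"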

Class Method Summary

Class Method Details

.backend_free ⇒ Object



# File 'ext/llama_cpp/llama_cpp.cpp', line 2776

static VALUE rb_llama_llama_backend_free(VALUE self) {
  llama_backend_free();

  return Qnil;
}

.backend_init(*args) ⇒ Object

Initializes the llama.cpp backend. Accepts an optional numa: keyword that defaults to false.



# File 'ext/llama_cpp/llama_cpp.cpp', line 2763

static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[1] = { rb_intern("numa") };
  VALUE kw_values[1] = { Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);

  const bool numa = kw_values[0] == Qundef ? false : (RTEST(kw_values[0]) ? true : false);
  llama_backend_init(numa);

  return Qnil;
}
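As the binding above shows, backend_init takes a single optional numa: keyword. A typical pairing with .backend_free, as a sketch:

LLaMACpp.backend_init(numa: true) # enable NUMA optimizations
at_exit { LLaMACpp.backend_free } # free backend resources at program exit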

.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String

Generates text that continues the given prompt; intended as a simple operation check.

Parameters:

  • context (LLaMACpp::Context)

    The context to use.

  • prompt (String)

    The prompt to start generation with.

  • n_predict (Integer) (defaults to: 128)

    The number of tokens to predict.

  • n_keep (Integer) (defaults to: 10)

    The number of tokens to keep in the context.

  • n_batch (Integer) (defaults to: 512)

    The number of tokens to process in a batch.

  • repeat_last_n (Integer) (defaults to: 64)

    The number of tokens to consider for repetition penalty.

  • repeat_penalty (Float) (defaults to: 1.1)

    The repetition penalty.

  • frequency (Float) (defaults to: 0.0)

    The frequency penalty.

  • presence (Float) (defaults to: 0.0)

    The presence penalty.

  • top_k (Integer) (defaults to: 40)

    The number of tokens to consider for top-k sampling.

  • top_p (Float) (defaults to: 0.95)

    The probability threshold for nucleus sampling.

  • tfs_z (Float) (defaults to: 1.0)

    The z parameter for tail-free sampling.

  • typical_p (Float) (defaults to: 1.0)

    The probability for typical sampling.

  • temperature (Float) (defaults to: 0.8)

    The temperature for temperature sampling.

Returns:

  • (String)

Raises:

  • (ArgumentError)


# File 'lib/llama_cpp.rb', line 27

def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
             n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
             repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
             top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_past = 0
  n_remain = n_predict
  n_vocab = context.model.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        n_left = n_past - n_keep
        n_past = n_keep
        embd.insert(0, *last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.eval(tokens: embd, n_past: n_past)
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
      context.sample_frequency_and_presence_penalties(
        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temperature(candidates, temperature: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.model.token_to_piece(token) }

    break if !embd.empty? && embd[-1] == context.token_eos
  end

  output.join.force_encoding('UTF-8').delete_prefix(spaced_prompt).strip
end
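A usage sketch for .generate. The LLaMACpp::ContextParams, LLaMACpp::Model, and LLaMACpp::Context constructors below follow the gem's README for this version and are assumptions here; the model path is hypothetical:

require 'llama_cpp'

LLaMACpp.backend_init

params = LLaMACpp::ContextParams.new
params.seed = 42
model = LLaMACpp::Model.new(model_path: './models/model-q4_0.gguf', params: params) # hypothetical path
context = LLaMACpp::Context.new(model: model)

puts LLaMACpp.generate(context, 'Hello, World.', n_predict: 64, temperature: 0.7)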

.max_devices ⇒ Object



# File 'ext/llama_cpp/llama_cpp.cpp', line 2831

static VALUE rb_llama_max_devices(VALUE self) {
  return INT2NUM(llama_max_devices());
}

.mlock_supported? ⇒ Boolean

Returns:

  • (Boolean)


# File 'ext/llama_cpp/llama_cpp.cpp', line 2827

static VALUE rb_llama_mlock_supported(VALUE self) {
  return llama_mlock_supported() ? Qtrue : Qfalse;
}

.mmap_supported? ⇒ Boolean

Returns:

  • (Boolean)


# File 'ext/llama_cpp/llama_cpp.cpp', line 2823

static VALUE rb_llama_mmap_supported(VALUE self) {
  return llama_mmap_supported() ? Qtrue : Qfalse;
}
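The three capability queries above are cheap and can guide a loading strategy before constructing a model, for example:

puts "max devices:   #{LLaMACpp.max_devices}"
puts "mmap support:  #{LLaMACpp.mmap_supported?}"
puts "mlock support: #{LLaMACpp.mlock_supported?}"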

.model_quantize(*args) ⇒ Object



# File 'ext/llama_cpp/llama_cpp.cpp', line 2782

static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
  VALUE kw_values[3] = { Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);

  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
    return Qnil;
  }
  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
    return Qnil;
  }
  if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
    return Qnil;
  }

  const char* input_path = StringValueCStr(kw_values[0]);
  const char* output_path = StringValueCStr(kw_values[1]);
  LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);

  if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
    return Qnil;
  }

  return Qnil;
}
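A quantization sketch. The ftype= accessor on LLaMACpp::ModelQuantizeParams is assumed from the gem's API, and both paths are hypothetical:

params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0 # target 4-bit Q4_0 quantization

LLaMACpp.model_quantize(
  input_path: './models/model-f16.gguf',
  output_path: './models/model-q4_0.gguf',
  params: params
)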


.print_system_info ⇒ Object

# File 'ext/llama_cpp/llama_cpp.cpp', line 2814

static VALUE rb_llama_print_system_info(VALUE self) {
  const char* result = llama_print_system_info();
  return rb_utf8_str_new_cstr(result);
}

.time_us ⇒ Object



# File 'ext/llama_cpp/llama_cpp.cpp', line 2819

static VALUE rb_llama_time_us(VALUE self) {
  return LONG2NUM(llama_time_us());
}
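Because .time_us returns a timestamp in microseconds, it can bracket a call for coarse timing, for example:

t0 = LLaMACpp.time_us
# ... run generation or another expensive call here ...
puts format('elapsed: %.2f ms', (LLaMACpp.time_us - t0) / 1000.0)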