Module: LLaMACpp
- Defined in:
- lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp
Overview
llama_cpp.rb provides Ruby bindings for llama.cpp.
Constant Summary collapse
- VERSION =
The version of llama_cpp.rb you install.
'0.9.0'
- LLAMA_CPP_VERSION =
The version of llama.cpp bundled with llama_cpp.rb.
'b1429'
- LLAMA_MAX_DEVICES =
INT2NUM(LLAMA_MAX_DEVICES)
- LLAMA_VOCAB_TYPE_SPM =
INT2NUM(LLAMA_VOCAB_TYPE_SPM)
- LLAMA_VOCAB_TYPE_BPE =
INT2NUM(LLAMA_VOCAB_TYPE_BPE)
- LLAMA_TOKEN_TYPE_UNDEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED)
- LLAMA_TOKEN_TYPE_NORMAL =
INT2NUM(LLAMA_TOKEN_TYPE_NORMAL)
- LLAMA_TOKEN_TYPE_UNKNOWN =
INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN)
- LLAMA_TOKEN_TYPE_CONTROL =
INT2NUM(LLAMA_TOKEN_TYPE_CONTROL)
- LLAMA_TOKEN_TYPE_USER_DEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED)
- LLAMA_TOKEN_TYPE_UNUSED =
INT2NUM(LLAMA_TOKEN_TYPE_UNUSED)
- LLAMA_TOKEN_TYPE_BYTE =
INT2NUM(LLAMA_TOKEN_TYPE_BYTE)
- LLAMA_FTYPE_ALL_F32 =
INT2NUM(LLAMA_FTYPE_ALL_F32)
- LLAMA_FTYPE_MOSTLY_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
- LLAMA_FTYPE_MOSTLY_Q4_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
- LLAMA_FTYPE_MOSTLY_Q4_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
- LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
- LLAMA_FTYPE_MOSTLY_Q8_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)
- LLAMA_FTYPE_MOSTLY_Q5_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)
- LLAMA_FTYPE_MOSTLY_Q5_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)
- LLAMA_FTYPE_MOSTLY_Q2_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K)
- LLAMA_FTYPE_MOSTLY_Q3_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S)
- LLAMA_FTYPE_MOSTLY_Q3_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M)
- LLAMA_FTYPE_MOSTLY_Q3_K_L =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L)
- LLAMA_FTYPE_MOSTLY_Q4_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S)
- LLAMA_FTYPE_MOSTLY_Q4_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M)
- LLAMA_FTYPE_MOSTLY_Q5_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S)
- LLAMA_FTYPE_MOSTLY_Q5_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M)
- LLAMA_FTYPE_MOSTLY_Q6_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K)
- LLAMA_FTYPE_GUESSED =
INT2NUM(LLAMA_FTYPE_GUESSED)
- LLAMA_GRETYPE_END =
INT2NUM(LLAMA_GRETYPE_END)
- LLAMA_GRETYPE_ALT =
INT2NUM(LLAMA_GRETYPE_ALT)
- LLAMA_GRETYPE_RULE_REF =
INT2NUM(LLAMA_GRETYPE_RULE_REF)
- LLAMA_GRETYPE_CHAR =
INT2NUM(LLAMA_GRETYPE_CHAR)
- LLAMA_GRETYPE_CHAR_NOT =
INT2NUM(LLAMA_GRETYPE_CHAR_NOT)
- LLAMA_GRETYPE_CHAR_RNG_UPPER =
INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER)
- LLAMA_GRETYPE_CHAR_ALT =
INT2NUM(LLAMA_GRETYPE_CHAR_ALT)
- LLAMA_FILE_MAGIC_GGSN =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_SESSION_MAGIC =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_DEFAULT_SEED =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_SESSION_VERSION =
rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())
Class Method Summary collapse
- .backend_free ⇒ Object
-
.backend_init(*args) ⇒ Object
module functions.
-
.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String
Generates text that continues the given prompt; intended as a simple operation check.
- .max_devices ⇒ Object
- .mlock_supported? ⇒ Boolean
- .mmap_supported? ⇒ Boolean
- .model_quantize(*args) ⇒ Object
- .print_system_info ⇒ Object
- .time_us ⇒ Object
Class Method Details
.backend_free ⇒ Object
2761 2762 2763 2764 2765 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2761 static VALUE rb_llama_llama_backend_free(VALUE self) { llama_backend_free(); return Qnil; } |
.backend_init(*args) ⇒ Object
module functions
2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2748
// Backs the module function `LLaMACpp.backend_init`.
// Accepts a single optional keyword, `numa:`; when it is omitted (or falsy)
// llama_backend_init() is called with false. Always returns nil.
static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
  (void)self;  // receiver is not used

  VALUE opts = Qnil;
  ID keys[1] = { rb_intern("numa") };
  VALUE vals[1] = { Qundef };

  // ":" -> no positional arguments, only an optional keyword hash.
  rb_scan_args(argc, argv, ":", &opts);
  // 0 required keywords, 1 optional keyword; an absent keyword stays Qundef.
  rb_get_kwargs(opts, keys, 0, 1, vals);

  bool numa = false;
  if (vals[0] != Qundef && RTEST(vals[0])) {
    numa = true;
  }

  llama_backend_init(numa);
  return Qnil;
}
|
.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String
Generates text that continues the given prompt; intended as a simple operation check.
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/llama_cpp.rb', line 27
#
# Generates text that continues the given prompt; intended as a simple operation check.
#
# The prompt is tokenized (with a leading space and a BOS token), fed to the
# context in chunks of at most n_batch tokens, and then tokens are sampled one
# at a time until n_predict tokens have been sampled or the EOS token appears.
#
# @param context [LLaMACpp::Context] The context used for evaluation and sampling.
# @param prompt [String] The prompt to start generation with.
# @param n_predict [Integer] The number of tokens to sample.
# @param n_keep [Integer] The value n_past is reset to when the context window overflows.
# @param n_batch [Integer] The maximum number of prompt tokens queued per evaluation.
# @param repeat_last_n [Integer] The number of most recent tokens given to the repetition penalty.
# @param repeat_penalty [Float] Passed to sample_repetition_penalties as penalty_repeat.
# @param frequency [Float] Passed to sample_repetition_penalties as penalty_freq.
# @param presence [Float] Passed to sample_repetition_penalties as penalty_present.
# @param top_k [Integer] Passed to sample_top_k as k.
# @param top_p [Float] Passed to sample_top_p as prob.
# @param tfs_z [Float] Passed to sample_tail_free as z.
# @param typical_p [Float] Passed to sample_typical as prob.
# @param temperature [Float] Passed to sample_temperature as temperature.
# @return [String] The generated text with the leading prompt removed.
# @raise [ArgumentError] If context is not a LLaMACpp::Context, prompt is not a String,
#   or the tokenized prompt is longer than n_ctx - 4 tokens.
def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
             n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
             repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
             top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  # Sliding window of the most recent n_ctx tokens (prompt and sampled).
  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_past = 0
  n_remain = n_predict
  n_vocab = context.model.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        # Context window is full: restart from n_keep and re-feed the second
        # half of the tokens that came after it.
        n_left = n_past - n_keep
        n_past = n_keep
        # Splat the slice so the tokens are spliced into embd individually;
        # without the splat Array#insert adds the whole slice as one nested
        # Array element, which corrupts the token list passed to eval.
        embd.insert(0, *last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.eval(tokens: embd, n_past: n_past)
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      # The whole prompt has been consumed: sample the next token.
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalties(
        candidates, last_n_tokens[-last_n_repeat..],
        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temperature(candidates, temperature: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      # Still consuming the prompt: queue up to n_batch tokens for the next eval.
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.model.token_to_piece(token) }

    break if !embd.empty? && embd[-1] == context.model.token_eos
  end

  output.join.scrub('?').strip.delete_prefix(prompt).strip
end
.max_devices ⇒ Object
2816 2817 2818 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2816 static VALUE rb_llama_max_devices(VALUE self) { return INT2NUM(llama_max_devices()); } |
.mlock_supported? ⇒ Boolean
2812 2813 2814 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2812 static VALUE rb_llama_mlock_supported(VALUE self) { return llama_mlock_supported() ? Qtrue : Qfalse; } |
.mmap_supported? ⇒ Boolean
2808 2809 2810 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2808 static VALUE rb_llama_mmap_supported(VALUE self) { return llama_mmap_supported() ? Qtrue : Qfalse; } |
.model_quantize(*args) ⇒ Object
2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2767
// Backs the module function `LLaMACpp.model_quantize`.
// Takes three required keywords: `input_path:` (String), `output_path:`
// (String) and `params:` (ModelQuantizeParams), then runs
// llama_model_quantize() on them.
// Raises ArgumentError when a keyword has the wrong type and RuntimeError
// when llama_model_quantize() returns non-zero. Returns nil on success.
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  (void)self;  // receiver is not used

  enum { KW_INPUT_PATH = 0, KW_OUTPUT_PATH = 1, KW_PARAMS = 2, KW_COUNT = 3 };

  VALUE opts = Qnil;
  ID keys[KW_COUNT] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
  VALUE vals[KW_COUNT] = { Qundef, Qundef, Qundef };

  // ":" -> keyword arguments only; all three keywords are required.
  rb_scan_args(argc, argv, ":", &opts);
  rb_get_kwargs(opts, keys, KW_COUNT, 0, vals);

  // rb_raise does not return, so no explicit return is needed after it.
  if (!RB_TYPE_P(vals[KW_INPUT_PATH], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
  }
  if (!RB_TYPE_P(vals[KW_OUTPUT_PATH], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
  }
  if (!rb_obj_is_kind_of(vals[KW_PARAMS], rb_cLLaMAModelQuantizeParams)) {
    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
  }

  const char* src_path = StringValueCStr(vals[KW_INPUT_PATH]);
  const char* dst_path = StringValueCStr(vals[KW_OUTPUT_PATH]);
  LLaMAModelQuantizeParamsWrapper* quantize_params =
    RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(vals[KW_PARAMS]);

  const bool failed = llama_model_quantize(src_path, dst_path, &(quantize_params->params)) != 0;
  if (failed) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
  }
  return Qnil;
}
|
.print_system_info ⇒ Object
2799 2800 2801 2802 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2799 static VALUE rb_llama_print_system_info(VALUE self) { const char* result = llama_print_system_info(); return rb_utf8_str_new_cstr(result); } |
.time_us ⇒ Object
2804 2805 2806 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2804 static VALUE rb_llama_time_us(VALUE self) { return LONG2NUM(llama_time_us()); } |