Module: LLaMACpp
- Defined in:
- lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp
Overview
llama_cpp.rb provides Ruby bindings for llama.cpp.
Constant Summary collapse
- Params =
Class alias to match the interface of the whispercpp gem.
ContextParams
- VERSION =
The version of llama_cpp.rb you install.
'0.3.3'
- LLAMA_CPP_VERSION =
The version of llama.cpp bundled with llama_cpp.rb.
'master-32c5411'
- LLAMA_MAX_DEVICES =
INT2NUM(LLAMA_MAX_DEVICES)
- LLAMA_FTYPE_ALL_F32 =
INT2NUM(LLAMA_FTYPE_ALL_F32)
- LLAMA_FTYPE_MOSTLY_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
- LLAMA_FTYPE_MOSTLY_Q4_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
- LLAMA_FTYPE_MOSTLY_Q4_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
- LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
- LLAMA_FTYPE_MOSTLY_Q8_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)
- LLAMA_FTYPE_MOSTLY_Q5_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)
- LLAMA_FTYPE_MOSTLY_Q5_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)
- LLAMA_FTYPE_MOSTLY_Q2_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K)
- LLAMA_FTYPE_MOSTLY_Q3_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S)
- LLAMA_FTYPE_MOSTLY_Q3_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M)
- LLAMA_FTYPE_MOSTLY_Q3_K_L =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L)
- LLAMA_FTYPE_MOSTLY_Q4_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S)
- LLAMA_FTYPE_MOSTLY_Q4_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M)
- LLAMA_FTYPE_MOSTLY_Q5_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S)
- LLAMA_FTYPE_MOSTLY_Q5_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M)
- LLAMA_FTYPE_MOSTLY_Q6_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K)
- LLAMA_FILE_MAGIC_GGJT =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGLA =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGMF =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGML =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGSN =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_UNVERSIONED =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_SESSION_MAGIC =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_DEFAULT_SEED =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_VERSION =
rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str())
- LLAMA_SESSION_VERSION =
rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())
Class Method Summary collapse
- .backend_free ⇒ Object
-
.backend_init(*args) ⇒ Object
module functions.
-
.generate(context, prompt, n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String
Generates sentences that follow the given prompt, as a simple operation check.
- .mlock_supported? ⇒ Boolean
- .mmap_supported? ⇒ Boolean
- .model_quantize(*args) ⇒ Object
- .print_system_info ⇒ Object
- .token_bos ⇒ Object
- .token_eos ⇒ Object
- .token_nl ⇒ Object
Class Method Details
.backend_free ⇒ Object
2002 2003 2004 2005 2006 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2002 static VALUE rb_llama_llama_backend_free(VALUE self) { llama_backend_free(); return Qnil; } |
.backend_init(*args) ⇒ Object
module functions
1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 1989
// Ruby binding for LLaMACpp.backend_init.
//
// Accepts a single optional keyword argument, `numa`. When the keyword is
// omitted, llama_backend_init is called with false; otherwise it is called
// with the truthiness of the supplied value. Always returns nil.
static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[1] = { rb_intern("numa") };
  VALUE kw_values[1] = { Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  // 0 required keywords, 1 optional keyword; an omitted keyword stays Qundef.
  rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);

  // RTEST has to be applied to the keyword value itself. The bare name
  // `RTEST` is not a call, so the previous expression never looked at what
  // the caller passed for `numa`.
  const bool numa = kw_values[0] != Qundef && RTEST(kw_values[0]);
  llama_backend_init(numa);

  return Qnil;
}
|
.generate(context, prompt, n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String
Generates sentences that follow the given prompt, as a simple operation check.
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
# File 'lib/llama_cpp.rb', line 31
#
# Generates sentences that follow the given prompt, as a simple operation check.
#
# The prompt is tokenized (with a leading space and a BOS token), fed to the
# context in batches of at most n_batch tokens, and then tokens are sampled one
# at a time until n_predict tokens have been produced or the EOS token appears.
#
# @param context [LLaMACpp::Context] the context used for tokenizing, evaluating and sampling.
# @param prompt [String] the text to continue.
# @param n_predict [Integer] number of tokens to sample.
# @param n_threads [Integer] passed through to Context#eval.
# @param n_keep [Integer] value n_past is reset to when the context window overflows.
# @param n_batch [Integer] maximum number of prompt tokens queued per eval call.
# @param repeat_last_n [Integer] how many recent tokens the penalties look at.
# @param repeat_penalty [Float] passed to Context#sample_repetition_penalty.
# @param frequency [Float] passed to Context#sample_frequency_and_presence_penalties.
# @param presence [Float] passed to Context#sample_frequency_and_presence_penalties.
# @param top_k [Integer] passed to Context#sample_top_k.
# @param top_p [Float] passed to Context#sample_top_p.
# @param tfs_z [Float] passed to Context#sample_tail_free.
# @param typical_p [Float] passed to Context#sample_typical.
# @param temperature [Float] passed to Context#sample_temperature.
# @return [String] the decoded output with the prompt prefix removed and surrounding whitespace stripped.
# @raise [ArgumentError] if context is not a LLaMACpp::Context, prompt is not a String,
#   or the tokenized prompt is longer than n_ctx - 4 tokens.
def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
             n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64,
             repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
             top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  # Fixed-size sliding window of the most recent tokens (zero-filled at start).
  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_past = 0
  n_remain = n_predict
  n_vocab = context.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        # Context window overflow: restart from n_keep and re-feed the second
        # half of the tokens that were dropped.
        n_left = n_past - n_keep
        n_past = n_keep
        # Splat the slice so its tokens are spliced into embd individually;
        # without the splat the whole slice is inserted as one nested Array.
        embd.insert(0, *last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.eval(tokens: embd, n_past: n_past, n_threads: n_threads)
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      # The whole prompt has been consumed: sample the next token.
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
      context.sample_frequency_and_presence_penalties(
        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temperature(candidates, temperature: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      # Still consuming the prompt: queue up to n_batch tokens for the next eval.
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.token_to_str(token) }

    break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
  end

  output.join.delete_prefix(spaced_prompt).strip
end
.mlock_supported? ⇒ Boolean
2061 2062 2063 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2061 static VALUE rb_llama_mlock_supported(VALUE self) { return llama_mlock_supported() ? Qtrue : Qfalse; } |
.mmap_supported? ⇒ Boolean
2057 2058 2059 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2057 static VALUE rb_llama_mmap_supported(VALUE self) { return llama_mmap_supported() ? Qtrue : Qfalse; } |
.model_quantize(*args) ⇒ Object
2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2008
// Ruby binding for LLaMACpp.model_quantize.
//
// Takes three required keyword arguments: `input_path` (String),
// `output_path` (String) and `params` (ModelQuantizeParams). Raises
// ArgumentError when one of them has the wrong type and RuntimeError when
// llama_model_quantize returns a non-zero status. Returns nil on success.
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  ID keys[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
  VALUE vals[3] = { Qundef, Qundef, Qundef };
  VALUE opts = Qnil;
  rb_scan_args(argc, argv, ":", &opts);
  // All three keywords are required, none are optional.
  rb_get_kwargs(opts, keys, 3, 0, vals);

  VALUE& src = vals[0];
  VALUE& dst = vals[1];
  VALUE& quantize_params = vals[2];

  // rb_raise does not return, so each guard needs no explicit return.
  if (!RB_TYPE_P(src, T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
  }
  if (!RB_TYPE_P(dst, T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
  }
  if (!rb_obj_is_kind_of(quantize_params, rb_cLLaMAModelQuantizeParams)) {
    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
  }

  const char* input_path = StringValueCStr(src);
  const char* output_path = StringValueCStr(dst);
  LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(quantize_params);

  const int status = llama_model_quantize(input_path, output_path, &(wrapper->params));
  if (status != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
  }

  return Qnil;
}
|
.print_system_info ⇒ Object
2052 2053 2054 2055 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2052 static VALUE rb_llama_print_system_info(VALUE self) { const char* result = llama_print_system_info(); return rb_utf8_str_new_cstr(result); } |
.token_bos ⇒ Object
2040 2041 2042 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2040 static VALUE rb_llama_token_bos(VALUE self) { return INT2NUM(llama_token_bos()); } |
.token_eos ⇒ Object
2044 2045 2046 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2044 static VALUE rb_llama_token_eos(VALUE self) { return INT2NUM(llama_token_eos()); } |
.token_nl ⇒ Object
2048 2049 2050 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 2048 static VALUE rb_llama_token_nl(VALUE self) { return INT2NUM(llama_token_nl()); } |