Module: LLaMACpp
- Defined in:
- lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp
Overview
llama_cpp.rb provides Ruby bindings for llama.cpp.
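A minimal usage sketch (the model path is a placeholder and the seed value is arbitrary; Context and ContextParams are the classes defined by this gem):

  require 'llama_cpp'

  params = LLaMACpp::ContextParams.new
  params.seed = 123
  context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model.bin', params: params)
  puts LLaMACpp.generate(context, 'Hello, my name is', n_threads: 4)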
Constant Summary
- Params =
Class alias to match the interface of the whispercpp gem.
ContextParams
- VERSION =
The version of llama_cpp.rb you have installed.
'0.3.0'
- LLAMA_CPP_VERSION =
The version of llama.cpp bundled with llama_cpp.rb.
'master-9d23589'
- LLAMA_MAX_DEVICES =
INT2NUM(LLAMA_MAX_DEVICES)
- LLAMA_FTYPE_ALL_F32 =
INT2NUM(LLAMA_FTYPE_ALL_F32)
- LLAMA_FTYPE_MOSTLY_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
- LLAMA_FTYPE_MOSTLY_Q4_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
- LLAMA_FTYPE_MOSTLY_Q4_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
- LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
- LLAMA_FTYPE_MOSTLY_Q8_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)
- LLAMA_FTYPE_MOSTLY_Q5_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)
- LLAMA_FTYPE_MOSTLY_Q5_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)
- LLAMA_FTYPE_MOSTLY_Q2_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K)
- LLAMA_FTYPE_MOSTLY_Q3_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S)
- LLAMA_FTYPE_MOSTLY_Q3_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M)
- LLAMA_FTYPE_MOSTLY_Q3_K_L =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L)
- LLAMA_FTYPE_MOSTLY_Q4_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S)
- LLAMA_FTYPE_MOSTLY_Q4_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M)
- LLAMA_FTYPE_MOSTLY_Q5_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S)
- LLAMA_FTYPE_MOSTLY_Q5_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M)
- LLAMA_FTYPE_MOSTLY_Q6_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K)
- LLAMA_FILE_MAGIC_GGJT =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGLA =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGMF =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGML =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGSN =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_UNVERSIONED =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_SESSION_MAGIC =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_VERSION =
rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str())
- LLAMA_SESSION_VERSION =
rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())
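All of the above are plain Ruby values (Integers for the file-type constants, Strings for the versions and file magics), so they can be inspected directly. For example:

  require 'llama_cpp'

  puts LLaMACpp::VERSION                  # => '0.3.0'
  puts LLaMACpp::LLAMA_CPP_VERSION        # => 'master-9d23589'
  puts LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0  # Integer, usable as a quantization ftype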
Class Method Summary
- .generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ String
  Generates sentences following the given prompt, as a simple operation check.
- .init_backend(*args) ⇒ Object
  Initializes the llama.cpp backend.
- .mlock_supported? ⇒ Boolean
- .mmap_supported? ⇒ Boolean
- .model_quantize(*args) ⇒ Object
- .print_system_info ⇒ Object
- .token_bos ⇒ Object
- .token_eos ⇒ Object
- .token_nl ⇒ Object
Class Method Details
.generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ String
Generates sentences following the given prompt, as a simple operation check.
# File 'lib/llama_cpp.rb', line 20

def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_keep = 10
  n_past = 0
  n_remain = n_predict
  repeat_last_n = 64
  repeat_penalty = 1.1
  frequency = 0.0
  presence = 0.0
  top_k = 40
  top_p = 0.95
  tfs_z = 1.0
  typical_p = 1.0
  temperature = 0.8
  n_batch = 512
  n_vocab = context.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        n_left = n_past - n_keep
        n_past = n_keep
        embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.eval(tokens: embd, n_past: n_past, n_threads: n_threads)
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
      context.sample_frequency_and_presence_penalties(
        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temperature(candidates, temperature: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.token_to_str(token) }

    break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
  end

  output.join.delete_prefix(spaced_prompt).strip
end
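Example usage, reusing the context construction from the Overview (the prompt is a placeholder):

  context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model.bin', params: LLaMACpp::ContextParams.new)
  puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_predict: 64, n_threads: 4)

The method body above also serves as a reference for the full sampling pipeline (repetition and frequency/presence penalties, then top-k, tail-free, typical, and top-p filtering, then temperature sampling); a custom generation loop can copy it and adjust the hard-coded sampling parameters.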
.init_backend(*args) ⇒ Object
Initializes the llama.cpp backend. Call it once before creating contexts; pass numa: true to enable NUMA optimizations.
# File 'ext/llama_cpp/llama_cpp.cpp', line 1757
static VALUE rb_llama_llama_init_backend(int argc, VALUE* argv, VALUE self) {
VALUE kw_args = Qnil;
ID kw_table[1] = { rb_intern("numa") };
VALUE kw_values[1] = { Qundef };
rb_scan_args(argc, argv, ":", &kw_args);
rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);
const bool numa = kw_values[0] == Qundef ? false : (RTEST(kw_values[0]) ? true : false);
llama_init_backend(numa);
return Qnil;
}
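Example (numa is the only keyword and is optional; it defaults to false):

  require 'llama_cpp'

  LLaMACpp.init_backend              # NUMA optimizations disabled
  LLaMACpp.init_backend(numa: true)  # NUMA optimizations enabled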
.mlock_supported? ⇒ Boolean
# File 'ext/llama_cpp/llama_cpp.cpp', line 1823

static VALUE rb_llama_mlock_supported(VALUE self) {
  return llama_mlock_supported() ? Qtrue : Qfalse;
}
.mmap_supported? ⇒ Boolean
# File 'ext/llama_cpp/llama_cpp.cpp', line 1819

static VALUE rb_llama_mmap_supported(VALUE self) {
  return llama_mmap_supported() ? Qtrue : Qfalse;
}
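These predicates report whether the bundled llama.cpp was built with mmap and mlock support. A sketch of gating context options on them, assuming the use_mmap and use_mlock accessors on ContextParams (they mirror the corresponding llama_context_params fields):

  params = LLaMACpp::ContextParams.new
  params.use_mmap = LLaMACpp.mmap_supported?
  params.use_mlock = LLaMACpp.mlock_supported?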
.model_quantize(*args) ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1770
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
VALUE kw_args = Qnil;
ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
VALUE kw_values[3] = { Qundef, Qundef, Qundef };
rb_scan_args(argc, argv, ":", &kw_args);
rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
if (!RB_TYPE_P(kw_values[0], T_STRING)) {
rb_raise(rb_eArgError, "input_path must be a string");
return Qnil;
}
if (!RB_TYPE_P(kw_values[1], T_STRING)) {
rb_raise(rb_eArgError, "output_path must be a string");
return Qnil;
}
if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
return Qnil;
}
const char* input_path = StringValueCStr(kw_values[0]);
const char* output_path = StringValueCStr(kw_values[1]);
LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);
if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
rb_raise(rb_eRuntimeError, "Failed to quantize model");
return Qnil;
}
return Qnil;
}
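Example, quantizing an F16 model to Q4_0 (the paths are placeholders, and the ftype accessor on ModelQuantizeParams is assumed to mirror the llama_model_quantize_params struct):

  params = LLaMACpp::ModelQuantizeParams.new
  params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0
  LLaMACpp.model_quantize(input_path: 'ggml-model-f16.bin',
                          output_path: 'ggml-model-q4_0.bin',
                          params: params)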
.print_system_info ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1814

static VALUE rb_llama_print_system_info(VALUE self) {
  const char* result = llama_print_system_info();
  return rb_utf8_str_new_cstr(result);
}
.token_bos ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1802

static VALUE rb_llama_token_bos(VALUE self) {
  return INT2NUM(llama_token_bos());
}
.token_eos ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1806

static VALUE rb_llama_token_eos(VALUE self) {
  return INT2NUM(llama_token_eos());
}
.token_nl ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1810

static VALUE rb_llama_token_nl(VALUE self) {
  return INT2NUM(llama_token_nl());
}
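These three methods return the vocabulary ids of the special beginning-of-sequence, end-of-sequence, and newline tokens. For example, a generation loop can stop when the sampled token equals LLaMACpp.token_eos, exactly as .generate above does:

  puts LLaMACpp.token_bos  # beginning-of-sequence token id
  puts LLaMACpp.token_eos  # end-of-sequence token id
  puts LLaMACpp.token_nl   # newline token id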