Module: LLaMACpp
Defined in:
- lib/llama_cpp.rb
- lib/llama_cpp/client.rb
- lib/llama_cpp/version.rb
- ext/llama_cpp/llama_cpp.cpp
Overview
llama_cpp.rb provides Ruby bindings for llama.cpp.
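A minimal usage sketch, assuming a local GGML model file (the path below is hypothetical; Context, ContextParams, and generate are part of this gem's API):

require 'llama_cpp'

# Build context parameters and load a quantized model from disk.
params = LLaMACpp::ContextParams.new
context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)

# Generate a short continuation of the prompt.
puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)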
Defined Under Namespace
Classes: Client
Constant Summary
- Params = ContextParams
  Class alias to match the interface of the whispercpp gem.
- VERSION = '0.1.1'
  The version of llama_cpp.rb you have installed.
- LLAMA_CPP_VERSION = 'master-6986c78'
  The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_FTYPE_ALL_F32 = INT2NUM(LLAMA_FTYPE_ALL_F32)
- LLAMA_FTYPE_MOSTLY_F16 = INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
- LLAMA_FTYPE_MOSTLY_Q4_0 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
- LLAMA_FTYPE_MOSTLY_Q4_1 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
- LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
- LLAMA_FTYPE_MOSTLY_Q8_0 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)
- LLAMA_FTYPE_MOSTLY_Q5_0 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)
- LLAMA_FTYPE_MOSTLY_Q5_1 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)
- LLAMA_FILE_VERSION = rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str())
- LLAMA_FILE_MAGIC = rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_UNVERSIONED = rb_str_new2(ss_magic_unversioned.str().c_str())
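As a quick sketch, the version constants can be read directly, and the LLAMA_FTYPE_* constants are the integer values expected by the ftype keyword of .model_quantize:

require 'llama_cpp'

puts LLaMACpp::VERSION            # => "0.1.1"
puts LLaMACpp::LLAMA_CPP_VERSION  # => "master-6986c78"
puts LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0  # integer ftype value for mostly-Q4_0 quantization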
Class Method Summary
- .generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ String
  Generates sentences following the given prompt (useful as an operation check).
- .mlock_supported? ⇒ Boolean
- .mmap_supported? ⇒ Boolean
- .model_quantize(*args) ⇒ Object
  Quantizes a model file.
- .print_system_info ⇒ Object
- .token_bos ⇒ Object
- .token_eos ⇒ Object
- .token_nl ⇒ Object
Class Method Details
.generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ String
Generates sentences following the given prompt (useful as an operation check).
# File 'lib/llama_cpp.rb', line 21

def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'context must have loaded the model' if context.empty?
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_keep = 10
  n_past = 0
  n_remain = n_predict
  repeat_last_n = 64
  repeat_penalty = 1.1
  frequency = 0.0
  presence = 0.0
  top_k = 40
  top_p = 0.95
  tfs_z = 1.0
  typical_p = 1.0
  temperature = 0.8
  n_batch = 512
  n_vocab = context.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      # Shift the context window when it is full, keeping the first n_keep tokens.
      if n_past + embd.size > n_ctx
        n_left = n_past - n_keep
        n_past = n_keep
        embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.eval(tokens: embd, n_past: n_past, n_threads: n_threads)
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
      context.sample_frequency_and_presence_penalties(
        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temperature(candidates, temperature: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      # Feed the prompt tokens into the context in batches of n_batch.
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.token_to_str(token) }

    break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
  end

  output.join.delete_prefix(spaced_prompt).strip
end
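A usage sketch (the model path is hypothetical):

params = LLaMACpp::ContextParams.new
context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)

# Returns the generated continuation with the prompt stripped from the front.
text = LLaMACpp.generate(context, 'Hello, my name is', n_predict: 64, n_threads: 4)
puts text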
.mlock_supported? ⇒ Boolean
# File 'ext/llama_cpp/llama_cpp.cpp', line 1468

static VALUE rb_llama_mlock_supported(VALUE self) {
  return llama_mlock_supported() ? Qtrue : Qfalse;
}
.mmap_supported? ⇒ Boolean
# File 'ext/llama_cpp/llama_cpp.cpp', line 1464

static VALUE rb_llama_mmap_supported(VALUE self) {
  return llama_mmap_supported() ? Qtrue : Qfalse;
}
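Both predicates report whether the bundled llama.cpp build supports the corresponding feature, for example:

puts "mmap supported:  #{LLaMACpp.mmap_supported?}"   # => true or false
puts "mlock supported: #{LLaMACpp.mlock_supported?}"  # => true or false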
.model_quantize(*args) ⇒ Object
Quantizes the model file at input_path and writes the quantized model to output_path.
# File 'ext/llama_cpp/llama_cpp.cpp', line 1410

static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
  VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);

  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
    return Qnil;
  }
  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
    return Qnil;
  }
  if (!RB_INTEGER_TYPE_P(kw_values[2])) {
    rb_raise(rb_eArgError, "ftype must be an integer");
    return Qnil;
  }
  if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
    rb_raise(rb_eArgError, "n_threads must be an integer");
    return Qnil;
  }

  const char* input_path = StringValueCStr(kw_values[0]);
  const char* output_path = StringValueCStr(kw_values[1]);
  const int ftype = NUM2INT(kw_values[2]);
  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);

  if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
    return Qnil;
  }

  return Qnil;
}
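A usage sketch (both file paths are hypothetical; ftype takes one of the LLAMA_FTYPE_* constants listed above, and n_threads defaults to 1 when omitted):

LLaMACpp.model_quantize(input_path: '/path/to/ggml-model-f16.bin',
                        output_path: '/path/to/ggml-model-q4_0.bin',
                        ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0,
                        n_threads: 4)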
.print_system_info ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1459

static VALUE rb_llama_print_system_info(VALUE self) {
  const char* result = llama_print_system_info();
  return rb_utf8_str_new_cstr(result);
}
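For example:

puts LLaMACpp.print_system_info
# Prints a feature summary such as "AVX = 1 | AVX2 = 1 | ..." (exact output depends on the build).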
.token_bos ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1447

static VALUE rb_llama_token_bos(VALUE self) {
  return INT2NUM(llama_token_bos());
}
.token_eos ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1451

static VALUE rb_llama_token_eos(VALUE self) {
  return INT2NUM(llama_token_eos());
}
.token_nl ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1455

static VALUE rb_llama_token_nl(VALUE self) {
  return INT2NUM(llama_token_nl());
}
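These functions return the integer ids of llama.cpp's special tokens; .generate, for instance, compares sampled tokens against .token_eos to stop generation. A small sketch:

bos_id = LLaMACpp.token_bos  # beginning-of-sequence token id
eos_id = LLaMACpp.token_eos  # end-of-sequence token id
nl_id  = LLaMACpp.token_nl   # newline token id
puts [bos_id, eos_id, nl_id].inspect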