Module: LLaMACpp
- Defined in:
- lib/llama_cpp.rb,
lib/llama_cpp/client.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp
Overview
llama_cpp.rb provides Ruby bindings for llama.cpp.
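A minimal usage sketch, assuming a quantized GGML model file is available locally (the model path below is hypothetical):

require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 12
context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)

# Generate a continuation of the prompt (see .generate below).
puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)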
Defined Under Namespace
Classes: Client
Constant Summary
- Params = ContextParams
  Class alias to match the interface of the whispercpp gem.
- VERSION = '0.1.3'
  The installed version of llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-66874d4'
  The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_FTYPE_ALL_F32 = INT2NUM(LLAMA_FTYPE_ALL_F32)
- LLAMA_FTYPE_MOSTLY_F16 = INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
- LLAMA_FTYPE_MOSTLY_Q4_0 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
- LLAMA_FTYPE_MOSTLY_Q4_1 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
- LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
- LLAMA_FTYPE_MOSTLY_Q8_0 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)
- LLAMA_FTYPE_MOSTLY_Q5_0 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)
- LLAMA_FTYPE_MOSTLY_Q5_1 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)
- LLAMA_FILE_MAGIC_GGJT = rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGLA = rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGMF = rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGML = rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGSN = rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC = rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_UNVERSIONED = rb_str_new2(ss_magic.str().c_str())
- LLAMA_SESSION_MAGIC = rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_VERSION = rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str())
- LLAMA_SESSION_VERSION = rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())
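The version and file-format constants above are ordinary Ruby values; a brief sketch of inspecting them (the printed values are those documented above):

require 'llama_cpp'

puts LLaMACpp::VERSION            # => "0.1.3"
puts LLaMACpp::LLAMA_CPP_VERSION  # => "master-66874d4"

# The LLAMA_FTYPE_* constants are Integers selecting a quantization format,
# e.g. for LLaMACpp.model_quantize (see Class Method Details below).
ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0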
Class Method Summary
- .generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ String
  Generates sentences that follow the given prompt; intended as a simple operation check.
- .init_backend ⇒ Object
  Initializes the llama.cpp backend.
- .mlock_supported? ⇒ Boolean
- .mmap_supported? ⇒ Boolean
- .model_quantize(*args) ⇒ Object
- .print_system_info ⇒ Object
- .token_bos ⇒ Object
- .token_eos ⇒ Object
- .token_nl ⇒ Object
Class Method Details
.generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ String
Generates sentences that follow the given prompt; intended as a simple operation check.
# File 'lib/llama_cpp.rb', line 21

def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'context must have loaded the model' if context.empty?
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_keep = 10
  n_past = 0
  n_remain = n_predict
  repeat_last_n = 64
  repeat_penalty = 1.1
  frequency = 0.0
  presence = 0.0
  top_k = 40
  top_p = 0.95
  tfs_z = 1.0
  typical_p = 1.0
  temperature = 0.8
  n_batch = 512
  n_vocab = context.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        n_left = n_past - n_keep
        n_past = n_keep
        embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.eval(tokens: embd, n_past: n_past, n_threads: n_threads)
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
      context.sample_frequency_and_presence_penalties(
        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temperature(candidates, temperature: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.token_to_str(token) }

    break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
  end

  output.join.delete_prefix(spaced_prompt).strip
end
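The method drives a fixed sampling pipeline (repetition, frequency, and presence penalties, then top-k, tail-free, typical, top-p, and temperature sampling), which is why it is described as an operation check rather than a configurable generation API. A hedged example call, reusing the context built in the Overview sketch:

text = LLaMACpp.generate(context, 'Hello, my name is', n_predict: 64, n_threads: 4)
puts text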
.init_backend ⇒ Object
Initializes the llama.cpp backend.
# File 'ext/llama_cpp/llama_cpp.cpp', line 1424

static VALUE rb_llama_llama_init_backend(VALUE self) {
  llama_init_backend();
  return Qnil;
}
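A minimal sketch of the intended call order, assuming the backend is initialized once before any context is created (the model path is hypothetical):

require 'llama_cpp'

LLaMACpp.init_backend # one-time backend initialization
context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: LLaMACpp::ContextParams.new)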
.mlock_supported? ⇒ Boolean
# File 'ext/llama_cpp/llama_cpp.cpp', line 1487

static VALUE rb_llama_mlock_supported(VALUE self) {
  return llama_mlock_supported() ? Qtrue : Qfalse;
}
.mmap_supported? ⇒ Boolean
# File 'ext/llama_cpp/llama_cpp.cpp', line 1483

static VALUE rb_llama_mmap_supported(VALUE self) {
  return llama_mmap_supported() ? Qtrue : Qfalse;
}
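A small sketch combining both capability checks; it assumes ContextParams exposes use_mmap= and use_mlock= setters mirroring the underlying llama_context_params struct (an assumption, not confirmed on this page):

params = LLaMACpp::ContextParams.new
params.use_mmap  = LLaMACpp.mmap_supported?  # memory-map the model file when possible
params.use_mlock = LLaMACpp.mlock_supported? # lock the model in RAM when possible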
.model_quantize(*args) ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1429

static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
  VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);

  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
    return Qnil;
  }
  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
    return Qnil;
  }
  if (!RB_INTEGER_TYPE_P(kw_values[2])) {
    rb_raise(rb_eArgError, "ftype must be an integer");
    return Qnil;
  }
  if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
    rb_raise(rb_eArgError, "n_threads must be an integer");
    return Qnil;
  }

  const char* input_path = StringValueCStr(kw_values[0]);
  const char* output_path = StringValueCStr(kw_values[1]);
  const int ftype = NUM2INT(kw_values[2]);
  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);

  if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
    return Qnil;
  }

  return Qnil;
}
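A hedged usage sketch for .model_quantize; both file paths are hypothetical, and ftype takes one of the LLAMA_FTYPE_* constants listed above:

LLaMACpp.model_quantize(
  input_path: 'models/7B/ggml-model-f16.bin',
  output_path: 'models/7B/ggml-model-q4_0.bin',
  ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0,
  n_threads: 4
)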
.print_system_info ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1478

static VALUE rb_llama_print_system_info(VALUE self) {
  const char* result = llama_print_system_info();
  return rb_utf8_str_new_cstr(result);
}
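The method returns the feature string reported by llama.cpp; a one-line sketch (the exact contents depend on how the extension was built):

puts LLaMACpp.print_system_info # e.g. lists AVX/AVX2/FMA/NEON support flags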
.token_bos ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1466

static VALUE rb_llama_token_bos(VALUE self) {
  return INT2NUM(llama_token_bos());
}
.token_eos ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1470

static VALUE rb_llama_token_eos(VALUE self) {
  return INT2NUM(llama_token_eos());
}
.token_nl ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 1474

static VALUE rb_llama_token_nl(VALUE self) {
  return INT2NUM(llama_token_nl());
}
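The three token helpers return Integer token ids; a brief sketch of using them as sentinels in a hand-rolled sampling loop (sampled_id stands in for a token produced by Context#sample_token):

bos = LLaMACpp.token_bos # beginning-of-sequence token id
eos = LLaMACpp.token_eos # end-of-sequence token id
nl  = LLaMACpp.token_nl  # newline token id

# Hypothetical stopping condition inside a custom generation loop:
# break if sampled_id == eos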