Module: LLaMACpp
- Defined in:
- lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp
Overview
llama_cpp.rb provides Ruby bindings for llama.cpp.
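A minimal usage sketch, assuming a local GGUF model file (the path below is a placeholder) and the Model/ModelParams/Context/ContextParams classes provided by this gem:

require 'llama_cpp'

model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)

context_params = LLaMACpp::ContextParams.new
context = LLaMACpp::Context.new(model: model, params: context_params)

puts LLaMACpp.generate(context, 'Hello, World.')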
Constant Summary
- VERSION =
The version of llama_cpp.rb you install.
'0.14.6'
- LLAMA_CPP_VERSION =
The version of llama.cpp bundled with llama_cpp.rb.
'b2698'
- LLAMA_VOCAB_TYPE_NONE =
INT2NUM(LLAMA_VOCAB_TYPE_NONE)
- LLAMA_VOCAB_TYPE_SPM =
INT2NUM(LLAMA_VOCAB_TYPE_SPM)
- LLAMA_VOCAB_TYPE_BPE =
INT2NUM(LLAMA_VOCAB_TYPE_BPE)
- LLAMA_VOCAB_TYPE_WPM =
INT2NUM(LLAMA_VOCAB_TYPE_WPM)
- LLAMA_TOKEN_TYPE_UNDEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED)
- LLAMA_TOKEN_TYPE_NORMAL =
INT2NUM(LLAMA_TOKEN_TYPE_NORMAL)
- LLAMA_TOKEN_TYPE_UNKNOWN =
INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN)
- LLAMA_TOKEN_TYPE_CONTROL =
INT2NUM(LLAMA_TOKEN_TYPE_CONTROL)
- LLAMA_TOKEN_TYPE_USER_DEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED)
- LLAMA_TOKEN_TYPE_UNUSED =
INT2NUM(LLAMA_TOKEN_TYPE_UNUSED)
- LLAMA_TOKEN_TYPE_BYTE =
INT2NUM(LLAMA_TOKEN_TYPE_BYTE)
- LLAMA_FTYPE_ALL_F32 =
INT2NUM(LLAMA_FTYPE_ALL_F32)
- LLAMA_FTYPE_MOSTLY_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
- LLAMA_FTYPE_MOSTLY_Q4_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
- LLAMA_FTYPE_MOSTLY_Q4_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
- LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
- LLAMA_FTYPE_MOSTLY_Q8_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)
- LLAMA_FTYPE_MOSTLY_Q5_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)
- LLAMA_FTYPE_MOSTLY_Q5_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)
- LLAMA_FTYPE_MOSTLY_Q2_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K)
- LLAMA_FTYPE_MOSTLY_Q3_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S)
- LLAMA_FTYPE_MOSTLY_Q3_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M)
- LLAMA_FTYPE_MOSTLY_Q3_K_L =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L)
- LLAMA_FTYPE_MOSTLY_Q4_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S)
- LLAMA_FTYPE_MOSTLY_Q4_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M)
- LLAMA_FTYPE_MOSTLY_Q5_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S)
- LLAMA_FTYPE_MOSTLY_Q5_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M)
- LLAMA_FTYPE_MOSTLY_Q6_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K)
- LLAMA_FTYPE_MOSTLY_IQ2_XXS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS)
- LLAMA_FTYPE_MOSTLY_IQ2_XS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS)
- LLAMA_FTYPE_MOSTLY_Q2_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S)
- LLAMA_FTYPE_MOSTLY_IQ3_XS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XS)
- LLAMA_FTYPE_MOSTLY_IQ3_XXS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS)
- LLAMA_FTYPE_MOSTLY_IQ1_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S)
- LLAMA_FTYPE_MOSTLY_IQ4_NL =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL)
- LLAMA_FTYPE_MOSTLY_IQ3_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S)
- LLAMA_FTYPE_MOSTLY_IQ3_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M)
- LLAMA_FTYPE_MOSTLY_IQ4_XS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS)
- LLAMA_FTYPE_MOSTLY_IQ1_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M)
- LLAMA_FTYPE_GUESSED =
INT2NUM(LLAMA_FTYPE_GUESSED)
- LLAMA_KV_OVERRIDE_TYPE_INT =
INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT)
- LLAMA_KV_OVERRIDE_TYPE_FLOAT =
INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT)
- LLAMA_KV_OVERRIDE_TYPE_BOOL =
INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL)
- LLAMA_GRETYPE_END =
INT2NUM(LLAMA_GRETYPE_END)
- LLAMA_GRETYPE_ALT =
INT2NUM(LLAMA_GRETYPE_ALT)
- LLAMA_GRETYPE_RULE_REF =
INT2NUM(LLAMA_GRETYPE_RULE_REF)
- LLAMA_GRETYPE_CHAR =
INT2NUM(LLAMA_GRETYPE_CHAR)
- LLAMA_GRETYPE_CHAR_NOT =
INT2NUM(LLAMA_GRETYPE_CHAR_NOT)
- LLAMA_GRETYPE_CHAR_RNG_UPPER =
INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER)
- LLAMA_GRETYPE_CHAR_ALT =
INT2NUM(LLAMA_GRETYPE_CHAR_ALT)
- LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED)
- LLAMA_ROPE_SCALING_TYPE_NONE =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_NONE)
- LLAMA_ROPE_SCALING_TYPE_LINEAR =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_LINEAR)
- LLAMA_ROPE_SCALING_TYPE_YARN =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN)
- LLAMA_ROPE_SCALING_TYPE_MAX_VALUE =
INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE)
- LLAMA_POOLING_TYPE_UNSPECIFIED =
INT2NUM(LLAMA_POOLING_TYPE_UNSPECIFIED)
- LLAMA_POOLING_TYPE_NONE =
INT2NUM(LLAMA_POOLING_TYPE_NONE)
- LLAMA_POOLING_TYPE_MEAN =
INT2NUM(LLAMA_POOLING_TYPE_MEAN)
- LLAMA_POOLING_TYPE_CLS =
INT2NUM(LLAMA_POOLING_TYPE_CLS)
- LLAMA_SPLIT_MODE_NONE =
INT2NUM(LLAMA_SPLIT_MODE_NONE)
- LLAMA_SPLIT_MODE_LAYER =
INT2NUM(LLAMA_SPLIT_MODE_LAYER)
- LLAMA_SPLIT_MODE_ROW =
INT2NUM(LLAMA_SPLIT_MODE_ROW)
- LLAMA_FILE_MAGIC_GGLA =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGSN =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGSQ =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_SESSION_MAGIC =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_STATE_SEQ_MAGIC =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_DEFAULT_SEED =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_SESSION_VERSION =
rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())
- LLAMA_STATE_SEQ_VERSION =
rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str())
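The version constants can be read directly, for example:

puts LLaMACpp::VERSION            # => "0.14.6"
puts LLaMACpp::LLAMA_CPP_VERSION  # => "b2698"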
Class Method Summary
- .backend_free ⇒ Object
- .backend_init ⇒ Object
  module functions.
- .generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String
  Generates text that continues the given prompt, as a simple operation check.
- .max_devices ⇒ Object
- .model_quantize(*args) ⇒ Object
- .numa_init(strategy) ⇒ Object
- .print_system_info ⇒ Object
- .supports_gpu_offload? ⇒ Boolean
- .supports_mlock? ⇒ Boolean
- .supports_mmap? ⇒ Boolean
- .time_us ⇒ Object
Class Method Details
.backend_free ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 3248

static VALUE rb_llama_llama_backend_free(VALUE self) {
  llama_backend_free();
  return Qnil;
}
.backend_init ⇒ Object
module functions
# File 'ext/llama_cpp/llama_cpp.cpp', line 3242

static VALUE rb_llama_llama_backend_init(VALUE self) {
  llama_backend_init();
  return Qnil;
}
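A sketch of pairing .backend_init with .backend_free around an inference session; this mirrors common llama.cpp usage, though whether initialization is strictly required depends on the bundled llama.cpp build:

LLaMACpp.backend_init
begin
  # ... create models/contexts and run inference ...
ensure
  LLaMACpp.backend_free
end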
.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String
Generates text that continues the given prompt, as a simple operation check.
# File 'lib/llama_cpp.rb', line 27

def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
             n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
             repeat_penalty: 1.1, frequency: 0.0, presence: 0.0,
             top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_past = 0
  n_remain = n_predict
  n_vocab = context.model.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        n_left = n_past - n_keep
        n_past = n_keep
        embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalties(
        candidates, last_n_tokens[-last_n_repeat..],
        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temp(candidates, temp: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.model.token_to_piece(token) }

    break if !embd.empty? && embd[-1] == context.model.token_eos
  end

  output.join.scrub('?').strip.delete_prefix(prompt).strip
end
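An example call using the keyword arguments documented above; context is assumed to be a LLaMACpp::Context built as in the Overview, and the prompt and sampling values are only illustrative:

text = LLaMACpp.generate(
  context, 'Write a haiku about autumn.',
  n_predict: 64, temperature: 0.7, top_k: 40, top_p: 0.9, repeat_penalty: 1.1
)
puts text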
.max_devices ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 3306

static VALUE rb_llama_max_devices(VALUE self) {
  return SIZET2NUM(llama_max_devices());
}
.model_quantize(*args) ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 3265

static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
  VALUE kw_values[3] = { Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
    return Qnil;
  }
  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
    return Qnil;
  }
  if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
    return Qnil;
  }
  const char* input_path = StringValueCStr(kw_values[0]);
  const char* output_path = StringValueCStr(kw_values[1]);
  LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);
  if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
    return Qnil;
  }
  return Qnil;
}
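A usage sketch; the file names are placeholders and it is assumed that LLaMACpp::ModelQuantizeParams exposes an ftype writer for selecting one of the LLAMA_FTYPE_* constants listed above:

params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0

LLaMACpp.model_quantize(
  input_path: 'ggml-model-f16.gguf',
  output_path: 'ggml-model-q4_0.gguf',
  params: params
)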
.numa_init(strategy) ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 3254

static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
  if (!RB_INTEGER_TYPE_P(strategy)) {
    rb_raise(rb_eArgError, "strategy must be an integer");
    return Qnil;
  }
  llama_numa_init(static_cast<enum ggml_numa_strategy>(NUM2INT(strategy)));
  return Qnil;
}
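The strategy argument is cast straight to ggml_numa_strategy, so it must be one of that enum's integer values; the mapping below is an assumption about the bundled llama.cpp and may differ across versions:

LLaMACpp.numa_init(1)  # assumed: 0 = disabled, 1 = distribute across NUMA nodes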
.print_system_info ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 3297

static VALUE rb_llama_print_system_info(VALUE self) {
  const char* result = llama_print_system_info();
  return rb_utf8_str_new_cstr(result);
}
.supports_gpu_offload? ⇒ Boolean
# File 'ext/llama_cpp/llama_cpp.cpp', line 3318

static VALUE rb_llama_supports_gpu_offload(VALUE self) {
  return llama_supports_gpu_offload() ? Qtrue : Qfalse;
}
.supports_mlock? ⇒ Boolean
# File 'ext/llama_cpp/llama_cpp.cpp', line 3314

static VALUE rb_llama_supports_mlock(VALUE self) {
  return llama_supports_mlock() ? Qtrue : Qfalse;
}
.supports_mmap? ⇒ Boolean
# File 'ext/llama_cpp/llama_cpp.cpp', line 3310

static VALUE rb_llama_supports_mmap(VALUE self) {
  return llama_supports_mmap() ? Qtrue : Qfalse;
}
.time_us ⇒ Object
# File 'ext/llama_cpp/llama_cpp.cpp', line 3302

static VALUE rb_llama_time_us(VALUE self) {
  return LONG2NUM(llama_time_us());
}
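A small probe of the runtime capabilities exposed by the module functions above, with .time_us used as a microsecond timer:

puts LLaMACpp.print_system_info
puts "max devices: #{LLaMACpp.max_devices}"
puts "mmap:        #{LLaMACpp.supports_mmap?}"
puts "mlock:       #{LLaMACpp.supports_mlock?}"
puts "gpu offload: #{LLaMACpp.supports_gpu_offload?}"

t0 = LLaMACpp.time_us
# ... work to be timed ...
puts "elapsed: #{LLaMACpp.time_us - t0} us"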