Module: LLaMACpp
- Defined in:
- lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp
Overview
llama_cpp.rb provides Ruby bindings for the llama.cpp.
Constant Summary collapse
- VERSION =
The version of llama_cpp.rb you install.
'0.12.7'
- LLAMA_CPP_VERSION =
The version of llama.cpp bundled with llama_cpp.rb.
'b2249'
- LLAMA_VOCAB_TYPE_SPM =
INT2NUM(LLAMA_VOCAB_TYPE_SPM)
- LLAMA_VOCAB_TYPE_BPE =
INT2NUM(LLAMA_VOCAB_TYPE_BPE)
- LLAMA_VOCAB_TYPE_WPM =
INT2NUM(LLAMA_VOCAB_TYPE_WPM)
- LLAMA_TOKEN_TYPE_UNDEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED)
- LLAMA_TOKEN_TYPE_NORMAL =
INT2NUM(LLAMA_TOKEN_TYPE_NORMAL)
- LLAMA_TOKEN_TYPE_UNKNOWN =
INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN)
- LLAMA_TOKEN_TYPE_CONTROL =
INT2NUM(LLAMA_TOKEN_TYPE_CONTROL)
- LLAMA_TOKEN_TYPE_USER_DEFINED =
INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED)
- LLAMA_TOKEN_TYPE_UNUSED =
INT2NUM(LLAMA_TOKEN_TYPE_UNUSED)
- LLAMA_TOKEN_TYPE_BYTE =
INT2NUM(LLAMA_TOKEN_TYPE_BYTE)
- LLAMA_FTYPE_ALL_F32 =
INT2NUM(LLAMA_FTYPE_ALL_F32)
- LLAMA_FTYPE_MOSTLY_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
- LLAMA_FTYPE_MOSTLY_Q4_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
- LLAMA_FTYPE_MOSTLY_Q4_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
- LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
- LLAMA_FTYPE_MOSTLY_Q8_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)
- LLAMA_FTYPE_MOSTLY_Q5_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)
- LLAMA_FTYPE_MOSTLY_Q5_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)
- LLAMA_FTYPE_MOSTLY_Q2_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K)
- LLAMA_FTYPE_MOSTLY_Q3_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S)
- LLAMA_FTYPE_MOSTLY_Q3_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M)
- LLAMA_FTYPE_MOSTLY_Q3_K_L =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L)
- LLAMA_FTYPE_MOSTLY_Q4_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S)
- LLAMA_FTYPE_MOSTLY_Q4_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M)
- LLAMA_FTYPE_MOSTLY_Q5_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S)
- LLAMA_FTYPE_MOSTLY_Q5_K_M =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M)
- LLAMA_FTYPE_MOSTLY_Q6_K =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K)
- LLAMA_FTYPE_MOSTLY_IQ2_XXS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS)
- LLAMA_FTYPE_MOSTLY_IQ2_XS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS)
- LLAMA_FTYPE_MOSTLY_Q2_K_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S)
- LLAMA_FTYPE_MOSTLY_Q3_K_XS =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS)
- LLAMA_FTYPE_MOSTLY_IQ3_XXS =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS)
- LLAMA_FTYPE_MOSTLY_IQ1_S =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S)
- LLAMA_FTYPE_MOSTLY_IQ4_NL =
INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL)
- LLAMA_FTYPE_GUESSED =
INT2NUM(LLAMA_FTYPE_GUESSED)
- LLAMA_KV_OVERRIDE_INT =
INT2NUM(LLAMA_KV_OVERRIDE_INT)
- LLAMA_KV_OVERRIDE_FLOAT =
INT2NUM(LLAMA_KV_OVERRIDE_FLOAT)
- LLAMA_KV_OVERRIDE_BOOL =
INT2NUM(LLAMA_KV_OVERRIDE_BOOL)
- LLAMA_GRETYPE_END =
INT2NUM(LLAMA_GRETYPE_END)
- LLAMA_GRETYPE_ALT =
INT2NUM(LLAMA_GRETYPE_ALT)
- LLAMA_GRETYPE_RULE_REF =
INT2NUM(LLAMA_GRETYPE_RULE_REF)
- LLAMA_GRETYPE_CHAR =
INT2NUM(LLAMA_GRETYPE_CHAR)
- LLAMA_GRETYPE_CHAR_NOT =
INT2NUM(LLAMA_GRETYPE_CHAR_NOT)
- LLAMA_GRETYPE_CHAR_RNG_UPPER =
INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER)
- LLAMA_GRETYPE_CHAR_ALT =
INT2NUM(LLAMA_GRETYPE_CHAR_ALT)
- LLAMA_ROPE_SCALING_UNSPECIFIED =
INT2NUM(LLAMA_ROPE_SCALING_UNSPECIFIED)
- LLAMA_ROPE_SCALING_NONE =
INT2NUM(LLAMA_ROPE_SCALING_NONE)
- LLAMA_ROPE_SCALING_LINEAR =
INT2NUM(LLAMA_ROPE_SCALING_LINEAR)
- LLAMA_ROPE_SCALING_YARN =
INT2NUM(LLAMA_ROPE_SCALING_YARN)
- LLAMA_ROPE_SCALING_MAX_VALUE =
INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE)
- LLAMA_POOLING_NONE =
INT2NUM(LLAMA_POOLING_NONE)
- LLAMA_POOLING_MEAN =
INT2NUM(LLAMA_POOLING_MEAN)
- LLAMA_POOLING_CLS =
INT2NUM(LLAMA_POOLING_CLS)
- LLAMA_SPLIT_NONE =
INT2NUM(LLAMA_SPLIT_NONE)
- LLAMA_SPLIT_LAYER =
INT2NUM(LLAMA_SPLIT_LAYER)
- LLAMA_SPLIT_ROW =
INT2NUM(LLAMA_SPLIT_ROW)
- LLAMA_FILE_MAGIC_GGLA =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_FILE_MAGIC_GGSN =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_SESSION_MAGIC =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_DEFAULT_SEED =
rb_str_new2(ss_magic.str().c_str())
- LLAMA_SESSION_VERSION =
rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())
Class Method Summary collapse
- .backend_free ⇒ Object
-
.backend_init ⇒ Object
module functions.
-
.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String
Generates sentences following the given prompt for operation check.
- .max_devices ⇒ Object
- .mlock_supported? ⇒ Boolean
- .mmap_supported? ⇒ Boolean
- .model_quantize(*args) ⇒ Object
- .numa_init(strategy) ⇒ Object
- .print_system_info ⇒ Object
- .supports_gpu_offload? ⇒ Boolean
- .supports_mlock? ⇒ Boolean
- .supports_mmap? ⇒ Boolean
- .time_us ⇒ Object
Class Method Details
.backend_free ⇒ Object
3252 3253 3254 3255 3256 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 3252 static VALUE rb_llama_llama_backend_free(VALUE self) { llama_backend_free(); return Qnil; } |
.backend_init ⇒ Object
module functions
3246 3247 3248 3249 3250 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 3246 static VALUE rb_llama_llama_backend_init(VALUE self) { llama_backend_init(); return Qnil; } |
.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String
Generates sentences following the given prompt for operation check.
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/llama_cpp.rb', line 27 def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context) raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String) spaced_prompt = " #{prompt}" embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true) n_ctx = context.n_ctx raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4 last_n_tokens = [0] * n_ctx embd = [] n_consumed = 0 n_past = 0 n_remain = n_predict n_vocab = context.model.n_vocab output = [] while n_remain != 0 unless embd.empty? if n_past + embd.size > n_ctx n_left = n_past - n_keep n_past = n_keep embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size]) end context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0)) end n_past += embd.size embd.clear if embd_input.size <= n_consumed logits = context.logits base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) } candidates = LLaMACpp::TokenDataArray.new(base_candidates) # apply penalties last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min context.sample_repetition_penalties( candidates, last_n_tokens[-last_n_repeat..], penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence ) # temperature sampling context.sample_top_k(candidates, k: top_k) context.sample_tail_free(candidates, z: tfs_z) context.sample_typical(candidates, prob: typical_p) context.sample_top_p(candidates, prob: top_p) context.sample_temp(candidates, temp: temperature) id = context.sample_token(candidates) last_n_tokens.shift last_n_tokens.push(id) embd.push(id) n_remain -= 1 else while embd_input.size > n_consumed embd.push(embd_input[n_consumed]) last_n_tokens.shift last_n_tokens.push(embd_input[n_consumed]) n_consumed += 1 break if embd.size >= n_batch end end embd.each { |token| output << context.model.token_to_piece(token) } break if !embd.empty? && embd[-1] == context.model.token_eos end output.join.scrub('?').strip.delete_prefix(prompt).strip end |
.max_devices ⇒ Object
3320 3321 3322 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 3320 static VALUE rb_llama_max_devices(VALUE self) { return SIZET2NUM(llama_max_devices()); } |
.mlock_supported? ⇒ Boolean
3315 3316 3317 3318 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 3315 static VALUE rb_llama_mlock_supported(VALUE self) { rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead."); return llama_mlock_supported() ? Qtrue : Qfalse; } |
.mmap_supported? ⇒ Boolean
3310 3311 3312 3313 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 3310 static VALUE rb_llama_mmap_supported(VALUE self) { rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead."); return llama_mmap_supported() ? Qtrue : Qfalse; } |
.model_quantize(*args) ⇒ Object
3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 3269
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
VALUE kw_args = Qnil;
ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
VALUE kw_values[3] = { Qundef, Qundef, Qundef };
rb_scan_args(argc, argv, ":", &kw_args);
rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
if (!RB_TYPE_P(kw_values[0], T_STRING)) {
rb_raise(rb_eArgError, "input_path must be a string");
return Qnil;
}
if (!RB_TYPE_P(kw_values[1], T_STRING)) {
rb_raise(rb_eArgError, "output_path must be a string");
return Qnil;
}
if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
return Qnil;
}
const char* input_path = StringValueCStr(kw_values[0]);
const char* output_path = StringValueCStr(kw_values[1]);
LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);
if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
rb_raise(rb_eRuntimeError, "Failed to quantize model");
return Qnil;
}
return Qnil;
}
|
.numa_init(strategy) ⇒ Object
3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 3258
static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
if (!RB_INTEGER_TYPE_P(strategy)) {
rb_raise(rb_eArgError, "strategy must be an integer");
return Qnil;
}
llama_numa_init(static_cast<enum ggml_numa_strategy>(NUM2INT(strategy)));
return Qnil;
}
|
.print_system_info ⇒ Object
3301 3302 3303 3304 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 3301 static VALUE rb_llama_print_system_info(VALUE self) { const char* result = llama_print_system_info(); return rb_utf8_str_new_cstr(result); } |
.supports_gpu_offload? ⇒ Boolean
3332 3333 3334 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 3332 static VALUE rb_llama_supports_gpu_offload(VALUE self) { return llama_supports_gpu_offload() ? Qtrue : Qfalse; } |
.supports_mlock? ⇒ Boolean
3328 3329 3330 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 3328 static VALUE rb_llama_supports_mlock(VALUE self) { return llama_supports_mlock() ? Qtrue : Qfalse; } |
.supports_mmap? ⇒ Boolean
3324 3325 3326 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 3324 static VALUE rb_llama_supports_mmap(VALUE self) { return llama_supports_mmap() ? Qtrue : Qfalse; } |
.time_us ⇒ Object
3306 3307 3308 |
# File 'ext/llama_cpp/llama_cpp.cpp', line 3306 static VALUE rb_llama_time_us(VALUE self) { return LONG2NUM(llama_time_us()); } |