Module: LLaMACpp

Defined in:
lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp

Overview

llama_cpp.rb provides Ruby bindings for llama.cpp.
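
A minimal usage sketch, along the lines of the gem README; the model path is a placeholder for a quantized GGML model file:

require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 12

context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)

puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.')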

Constant Summary

Params = ContextParams

  Class alias to match the interface of the whispercpp gem.

VERSION = '0.0.6'

  The version of the llama_cpp.rb gem.

LLAMA_CPP_VERSION = 'master-12b5900'

  The version of llama.cpp bundled with llama_cpp.rb.

LLAMA_FTYPE_ALL_F32 = INT2NUM(LLAMA_FTYPE_ALL_F32)
LLAMA_FTYPE_MOSTLY_F16 = INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
LLAMA_FTYPE_MOSTLY_Q4_0 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
LLAMA_FTYPE_MOSTLY_Q4_1 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
LLAMA_FTYPE_MOSTLY_Q4_2 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_2)
LLAMA_FTYPE_MOSTLY_Q4_3 = INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_3)
LLAMA_FILE_VERSION = rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str())
LLAMA_FILE_MAGIC = rb_str_new2(ss_magic.str().c_str())
LLAMA_FILE_MAGIC_UNVERSIONED = rb_str_new2(ss_magic_unversioned.str().c_str())
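
The version constants can be checked at runtime, and the LLAMA_FTYPE_* constants are the integer values expected by the ftype argument of .model_quantize:

require 'llama_cpp'

puts LLaMACpp::VERSION           # => 0.0.6
puts LLaMACpp::LLAMA_CPP_VERSION # => master-12b5900
puts LLaMACpp::LLAMA_FILE_MAGIC  # file magic of the bundled llama.cpp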

Class Method Summary

  • .generate(context, prompt, n_threads: 1) ⇒ String
  • .mlock_supported? ⇒ Boolean
  • .mmap_supported? ⇒ Boolean
  • .model_quantize(*args) ⇒ Object
  • .print_system_info ⇒ String
  • .token_bos ⇒ Object
  • .token_eos ⇒ Object

Class Method Details

.generate(context, prompt, n_threads: 1) ⇒ String

Generates text following the given prompt; intended as a simple operation check.

Parameters:

  • context (LLaMACpp::Context)
  • prompt (String)
  • n_threads (Integer) (defaults to: 1)

Returns:

  • (String)


# File 'lib/llama_cpp.rb', line 19

def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
  spaced_prompt = " #{prompt}" # llama.cpp expects a leading space before the prompt

  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_keep = 10
  n_past = 0
  n_remain = 128
  repeat_last_n = 64
  output = []

  while n_remain != 0
    unless embd.empty?
      # Context swapping: when the window is full, keep the first n_keep
      # tokens and re-feed half of the most recent tokens as the new prefix.
      if n_past + embd.size > n_ctx
        n_left = n_past - n_keep
        n_past = n_keep
        embd.insert(0, *last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.eval(tokens: embd, n_past: n_past, n_threads: n_threads)
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      # Prompt fully consumed: sample the next token, applying the repetition
      # penalty over the most recent repeat_last_n tokens.
      start = n_ctx - repeat_last_n
      id = context.sample_top_p_top_k(
        last_n_tokens[start...(start + repeat_last_n)], top_k: 40, top_p: 0.95, temp: 0.80, penalty: 1.1
      )
      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      # Still consuming the prompt: feed up to 512 prompt tokens per batch.
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= 512
      end
    end

    embd.each { |token| output << context.token_to_str(token) }

    break if embd[-1] == LLaMACpp.token_eos # stop at the end-of-sequence token
  end

  output.join.delete_prefix(spaced_prompt).strip
end
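
For example (the model path is a placeholder; output varies with the model and the sampling parameters):

context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin',
                                params: LLaMACpp::ContextParams.new)
puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)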

.mlock_supported? ⇒ Boolean

Returns whether the bundled llama.cpp build supports memory locking (mlock).

Returns:

  • (Boolean)


# File 'ext/llama_cpp/llama_cpp.cpp', line 669

static VALUE rb_llama_mlock_supported(VALUE self) {
  return llama_mlock_supported() ? Qtrue : Qfalse;
}

.mmap_supported? ⇒ Boolean

Returns whether the bundled llama.cpp build supports memory mapping (mmap).

Returns:

  • (Boolean)


# File 'ext/llama_cpp/llama_cpp.cpp', line 665

static VALUE rb_llama_mmap_supported(VALUE self) {
  return llama_mmap_supported() ? Qtrue : Qfalse;
}
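
A sketch that enables these features only when the build supports them; the use_mlock and use_mmap accessors on ContextParams are assumed here (they mirror the fields of llama_context_params):

params = LLaMACpp::ContextParams.new
params.use_mlock = true if LLaMACpp.mlock_supported? # pin model memory (assumed accessor)
params.use_mmap = LLaMACpp.mmap_supported?           # memory-map the model file (assumed accessor)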

.model_quantize(*args) ⇒ Object

Quantizes the model file at input_path and writes the result to output_path.

Parameters:

  • input_path (String)
  • output_path (String)
  • ftype (Integer)
  • n_threads (Integer) (defaults to: 1)



# File 'ext/llama_cpp/llama_cpp.cpp', line 615

static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
  VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);

  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
    return Qnil;
  }
  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
    return Qnil;
  }
  if (!RB_INTEGER_TYPE_P(kw_values[2])) {
    rb_raise(rb_eArgError, "ftype must be an integer");
    return Qnil;
  }
  if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
    rb_raise(rb_eArgError, "n_threads must be an integer");
    return Qnil;
  }

  const char* input_path = StringValueCStr(kw_values[0]);
  const char* output_path = StringValueCStr(kw_values[1]);
  const int ftype = NUM2INT(kw_values[2]);
  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);

  if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
    return Qnil;
  }

  return Qnil;
}
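
For example, producing a 4-bit quantized copy of an f16 model (the paths are placeholders):

LLaMACpp.model_quantize(input_path: '/path/to/ggml-model-f16.bin',
                        output_path: '/path/to/ggml-model-q4_0.bin',
                        ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0,
                        n_threads: 4)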


.print_system_info ⇒ String

Returns a string describing the features the bundled llama.cpp was built with.

Returns:

  • (String)

# File 'ext/llama_cpp/llama_cpp.cpp', line 660

static VALUE rb_llama_print_system_info(VALUE self) {
  const char* result = llama_print_system_info();
  return rb_utf8_str_new_cstr(result);
}
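
Handy for checking which CPU features the bundled llama.cpp was compiled with:

puts LLaMACpp.print_system_info
# e.g. AVX = 1 | AVX2 = 1 | NEON = 0 | ... (depends on the build)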

.token_bos ⇒ Object

Returns the id of the beginning-of-sequence (BOS) token.


# File 'ext/llama_cpp/llama_cpp.cpp', line 652

static VALUE rb_llama_token_bos(VALUE self) {
  return INT2NUM(llama_token_bos());
}

.token_eos ⇒ Object

Returns the id of the end-of-sequence (EOS) token.


# File 'ext/llama_cpp/llama_cpp.cpp', line 656

static VALUE rb_llama_token_eos(VALUE self) {
  return INT2NUM(llama_token_eos());
}
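
.generate above compares each sampled token against .token_eos to decide when to stop; a small sketch of both special tokens (context is an initialized LLaMACpp::Context):

embd_input = context.tokenize(text: ' Hello', add_bos: true)
embd_input.first == LLaMACpp.token_bos # => true

# in a sampling loop, stop once the model emits EOS:
# break if id == LLaMACpp.token_eos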