Module: LLaMACpp

Defined in:
lib/llama_cpp.rb,
lib/llama_cpp/client.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp

Overview

llama_cpp.rb provides Ruby bindings for the llama.cpp.

Defined Under Namespace

Classes: Client

Constant Summary collapse

Params =

Class alias to match interface of whispercpp gem.

ContextParams
VERSION =

The version of llama_cpp.rb you install.

'0.1.1'
LLAMA_CPP_VERSION =

The version of llama.cpp bundled with llama_cpp.rb.

'master-6986c78'
LLAMA_FTYPE_ALL_F32 =
INT2NUM(LLAMA_FTYPE_ALL_F32)
LLAMA_FTYPE_MOSTLY_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_F16)
LLAMA_FTYPE_MOSTLY_Q4_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)
LLAMA_FTYPE_MOSTLY_Q4_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
LLAMA_FTYPE_MOSTLY_Q8_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)
LLAMA_FTYPE_MOSTLY_Q5_0 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)
LLAMA_FTYPE_MOSTLY_Q5_1 =
INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)
LLAMA_FILE_VERSION =
rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str())
LLAMA_FILE_MAGIC =
rb_str_new2(ss_magic.str().c_str())
LLAMA_FILE_MAGIC_UNVERSIONED =
rb_str_new2(ss_magic_unversioned.str().c_str())

Class Method Summary collapse

Class Method Details

.generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ String

Generates sentences following the given prompt for operation check.

Parameters:

  • context (LLaMACpp::Context)

    The context to use.

  • prompt (String)

    The prompt to start generation with.

  • n_predict (Integer) (defaults to: 128)

    The number of tokens to predict.

  • n_threads (Integer) (defaults to: 1)

    The number of threads.

Returns:

  • (String)

Raises:

  • (ArgumentError)


21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/llama_cpp.rb', line 21

def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'context must have loaded the model' if context.empty?
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_keep = 10
  n_past = 0
  n_remain = n_predict
  repeat_last_n = 64
  repeat_penalty = 1.1
  frequency = 0.0
  presence = 0.0
  top_k = 40
  top_p = 0.95
  tfs_z = 1.0
  typical_p = 1.0
  temperature = 0.8
  n_batch = 512
  n_vocab = context.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        n_left = n_past - n_keep
        n_past = n_keep
        embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.eval(tokens: embd, n_past: n_past, n_threads: n_threads)
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
      context.sample_frequency_and_presence_penalties(
        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temperature(candidates, temperature: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.token_to_str(token) }

    break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
  end

  output.join.delete_prefix(spaced_prompt).strip
end

.mlock_supported?Boolean

Returns:

  • (Boolean)


1468
1469
1470
# File 'ext/llama_cpp/llama_cpp.cpp', line 1468

static VALUE rb_llama_mlock_supported(VALUE self) {
  return llama_mlock_supported() ? Qtrue : Qfalse;
}

.mmap_supported?Boolean

Returns:

  • (Boolean)


1464
1465
1466
# File 'ext/llama_cpp/llama_cpp.cpp', line 1464

static VALUE rb_llama_mmap_supported(VALUE self) {
  return llama_mmap_supported() ? Qtrue : Qfalse;
}

.model_quantize(*args) ⇒ Object

module functions



1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
# File 'ext/llama_cpp/llama_cpp.cpp', line 1410

static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
  VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);

  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
    return Qnil;
  }
  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
    return Qnil;
  }
  if (!RB_INTEGER_TYPE_P(kw_values[2])) {
    rb_raise(rb_eArgError, "ftype must be an integer");
    return Qnil;
  }
  if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
    rb_raise(rb_eArgError, "n_threads must be an integer");
    return Qnil;
  }

  const char* input_path = StringValueCStr(kw_values[0]);
  const char* output_path = StringValueCStr(kw_values[1]);
  const int ftype = NUM2INT(kw_values[2]);
  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);

  if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
    return Qnil;
  }

  return Qnil;
}


1459
1460
1461
1462
# File 'ext/llama_cpp/llama_cpp.cpp', line 1459

static VALUE rb_llama_print_system_info(VALUE self) {
  const char* result = llama_print_system_info();
  return rb_utf8_str_new_cstr(result);
}

.token_bosObject



1447
1448
1449
# File 'ext/llama_cpp/llama_cpp.cpp', line 1447

static VALUE rb_llama_token_bos(VALUE self) {
  return INT2NUM(llama_token_bos());
}

.token_eosObject



1451
1452
1453
# File 'ext/llama_cpp/llama_cpp.cpp', line 1451

static VALUE rb_llama_token_eos(VALUE self) {
  return INT2NUM(llama_token_eos());
}

.token_nlObject



1455
1456
1457
# File 'ext/llama_cpp/llama_cpp.cpp', line 1455

static VALUE rb_llama_token_nl(VALUE self) {
  return INT2NUM(llama_token_nl());
}