Module: LLaMACpp

Defined in: lib/llama_cpp.rb,
lib/llama_cpp/client.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp

Overview

llama_cpp.rb provides Ruby bindings for llama.cpp.

Defined Under Namespace

Classes: Client

Constant Summary collapse

Params = Class alias to match interface of whispercpp gem.

ContextParams

VERSION = The version of llama_cpp.rb you install.

'0.1.3'

LLAMA_CPP_VERSION = The version of llama.cpp bundled with llama_cpp.rb.

'master-66874d4'

LLAMA_FTYPE_ALL_F32 =

INT2NUM(LLAMA_FTYPE_ALL_F32)

LLAMA_FTYPE_MOSTLY_F16 =

INT2NUM(LLAMA_FTYPE_MOSTLY_F16)

LLAMA_FTYPE_MOSTLY_Q4_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)

LLAMA_FTYPE_MOSTLY_Q4_1 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)

LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)

LLAMA_FTYPE_MOSTLY_Q8_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)

LLAMA_FTYPE_MOSTLY_Q5_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)

LLAMA_FTYPE_MOSTLY_Q5_1 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)

LLAMA_FILE_MAGIC_GGJT =

rb_str_new2(ss_magic.str().c_str())

LLAMA_FILE_MAGIC_GGLA =

rb_str_new2(ss_magic.str().c_str())

LLAMA_FILE_MAGIC_GGMF =

rb_str_new2(ss_magic.str().c_str())

LLAMA_FILE_MAGIC_GGML =

rb_str_new2(ss_magic.str().c_str())

LLAMA_FILE_MAGIC_GGSN =

rb_str_new2(ss_magic.str().c_str())

LLAMA_FILE_MAGIC =

rb_str_new2(ss_magic.str().c_str())

LLAMA_FILE_MAGIC_UNVERSIONED =

rb_str_new2(ss_magic.str().c_str())

LLAMA_SESSION_MAGIC =

rb_str_new2(ss_magic.str().c_str())

LLAMA_FILE_VERSION =

rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str())

LLAMA_SESSION_VERSION =

rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())

Class Method Summary collapse

.generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ String

Generates sentences following the given prompt for operation check.
.init_backend ⇒ Object

Initializes the llama.cpp backend by calling llama_init_backend; always returns nil.
.mlock_supported? ⇒ Boolean
.mmap_supported? ⇒ Boolean
.model_quantize(*args) ⇒ Object
.print_system_info ⇒ Object
.token_bos ⇒ Object
.token_eos ⇒ Object
.token_nl ⇒ Object

Class Method Details

.generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ `String`

Generates sentences following the given prompt for operation check.

Parameters:

context (LLaMACpp::Context) —

The context to use.
prompt (String) —

The prompt to start generation with.
n_predict (Integer) (defaults to: 128) —

The number of tokens to predict.
n_threads (Integer) (defaults to: 1) —

The number of threads.

Returns:

(String)

Raises:

(ArgumentError)

# File 'lib/llama_cpp.rb', line 21

# Generates sentences following the given prompt for operation check.
#
# The prompt is tokenized (with a leading space and a BOS token), fed to the
# context in batches, and then tokens are sampled one at a time until
# +n_predict+ tokens have been produced or the end-of-sequence token appears.
#
# @param context [LLaMACpp::Context] The context to use; it must have a model loaded.
# @param prompt [String] The prompt to start generation with.
# @param n_predict [Integer] The number of tokens to predict.
# @param n_threads [Integer] The number of threads passed to each evaluation.
# @return [String] The decoded tokens with the leading prompt removed and
#   surrounding whitespace stripped.
# @raise [ArgumentError] If +context+ is not a LLaMACpp::Context, has no model
#   loaded, +prompt+ is not a String, or the tokenized prompt is longer than
#   +n_ctx - 4+ tokens.
def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'context must have loaded the model' if context.empty?
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  # Sliding window of the most recent n_ctx tokens (prompt and generated).
  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_keep = 10
  n_past = 0
  n_remain = n_predict
  repeat_last_n = 64
  repeat_penalty = 1.1
  frequency = 0.0
  presence = 0.0
  top_k = 40
  top_p = 0.95
  tfs_z = 1.0
  typical_p = 1.0
  temperature = 0.8
  n_batch = 512
  n_vocab = context.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        # The context window is full: rewind to the first n_keep positions and
        # re-feed half of the tokens that followed them.
        n_left = n_past - n_keep
        n_past = n_keep
        recycled = last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size]
        # Splat the slice so every recycled token becomes its own element;
        # inserting the Array itself would nest it inside embd, handing eval a
        # non-flat token list and making the n_past update below undercount.
        embd.insert(0, *recycled)
      end

      context.eval(tokens: embd, n_past: n_past, n_threads: n_threads)
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      # The whole prompt has been consumed: sample the next token.
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
      context.sample_frequency_and_presence_penalties(
        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temperature(candidates, temperature: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      # Still consuming the prompt: queue up to n_batch prompt tokens.
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.token_to_str(token) }

    break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
  end

  # The prompt tokens were echoed into output, so drop them from the result.
  output.join.delete_prefix(spaced_prompt).strip
end

.init_backend ⇒ `Object`

Initializes the llama.cpp backend by calling llama_init_backend; always returns nil.

# File 'ext/llama_cpp/llama_cpp.cpp', line 1424

// Binding for LLaMACpp.init_backend: invokes llama_init_backend() and hands
// nil back to Ruby.
static VALUE rb_llama_llama_init_backend(VALUE self) {
  (void)self;  // the receiver is not needed

  llama_init_backend();

  return Qnil;
}

.mlock_supported? ⇒ `Boolean`

Returns:

(Boolean)



1487
1488
1489

# File 'ext/llama_cpp/llama_cpp.cpp', line 1487

// Binding for LLaMACpp.mlock_supported?: converts the result of
// llama_mlock_supported() into a Ruby true/false.
static VALUE rb_llama_mlock_supported(VALUE self) {
  if (llama_mlock_supported()) {
    return Qtrue;
  }
  return Qfalse;
}

.mmap_supported? ⇒ `Boolean`

Returns:

(Boolean)



1483
1484
1485

# File 'ext/llama_cpp/llama_cpp.cpp', line 1483

// Binding for LLaMACpp.mmap_supported?: converts the result of
// llama_mmap_supported() into a Ruby true/false.
static VALUE rb_llama_mmap_supported(VALUE self) {
  if (llama_mmap_supported()) {
    return Qtrue;
  }
  return Qfalse;
}

.model_quantize(*args) ⇒ `Object`

# File 'ext/llama_cpp/llama_cpp.cpp', line 1429

// Binding for LLaMACpp.model_quantize. Accepts only keyword arguments:
//   input_path:  String  - passed to llama_model_quantize as the input path
//   output_path: String  - passed to llama_model_quantize as the output path
//   ftype:       Integer - cast to llama_ftype
//   n_threads:   Integer - optional, 1 when omitted
// Raises ArgumentError when an argument has the wrong type and RuntimeError
// when llama_model_quantize returns a non-zero status. Returns nil.
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  // Slot of each keyword in the two lookup tables below.
  enum { IDX_INPUT_PATH = 0, IDX_OUTPUT_PATH, IDX_FTYPE, IDX_N_THREADS, N_KEYWORDS };

  VALUE options = Qnil;
  ID keyword_ids[N_KEYWORDS] = {
    rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads")
  };
  VALUE parsed[N_KEYWORDS] = { Qundef, Qundef, Qundef, Qundef };

  rb_scan_args(argc, argv, ":", &options);
  // Three required keywords followed by one optional keyword (n_threads).
  rb_get_kwargs(options, keyword_ids, 3, 1, parsed);

  // Type checks; rb_raise does not return, so each failed check ends the call.
  if (!RB_TYPE_P(parsed[IDX_INPUT_PATH], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
  }
  if (!RB_TYPE_P(parsed[IDX_OUTPUT_PATH], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
  }
  if (!RB_INTEGER_TYPE_P(parsed[IDX_FTYPE])) {
    rb_raise(rb_eArgError, "ftype must be an integer");
  }

  // An omitted optional keyword is left as Qundef.
  const bool n_threads_given = parsed[IDX_N_THREADS] != Qundef;
  if (n_threads_given && !RB_INTEGER_TYPE_P(parsed[IDX_N_THREADS])) {
    rb_raise(rb_eArgError, "n_threads must be an integer");
  }

  const char* input_path = StringValueCStr(parsed[IDX_INPUT_PATH]);
  const char* output_path = StringValueCStr(parsed[IDX_OUTPUT_PATH]);
  const int ftype = NUM2INT(parsed[IDX_FTYPE]);
  int n_threads = 1;
  if (n_threads_given) {
    n_threads = NUM2INT(parsed[IDX_N_THREADS]);
  }

  const int status = llama_model_quantize(input_path, output_path, static_cast<llama_ftype>(ftype), n_threads);
  if (status != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
  }

  return Qnil;
}

.print_system_info ⇒ `Object`

# File 'ext/llama_cpp/llama_cpp.cpp', line 1478

// Binding for LLaMACpp.print_system_info: wraps the C string returned by
// llama_print_system_info() in a UTF-8 Ruby String.
static VALUE rb_llama_print_system_info(VALUE self) {
  return rb_utf8_str_new_cstr(llama_print_system_info());
}

.token_bos ⇒ `Object`



1466
1467
1468

# File 'ext/llama_cpp/llama_cpp.cpp', line 1466

// Binding for LLaMACpp.token_bos: returns the value of llama_token_bos() as a
// Ruby Integer.
static VALUE rb_llama_token_bos(VALUE self) {
  const auto bos_id = llama_token_bos();
  return INT2NUM(bos_id);
}

.token_eos ⇒ `Object`



1470
1471
1472

# File 'ext/llama_cpp/llama_cpp.cpp', line 1470

// Binding for LLaMACpp.token_eos: returns the value of llama_token_eos() as a
// Ruby Integer.
static VALUE rb_llama_token_eos(VALUE self) {
  const auto eos_id = llama_token_eos();
  return INT2NUM(eos_id);
}

.token_nl ⇒ `Object`



1474
1475
1476

# File 'ext/llama_cpp/llama_cpp.cpp', line 1474

// Binding for LLaMACpp.token_nl: returns the value of llama_token_nl() as a
// Ruby Integer.
static VALUE rb_llama_token_nl(VALUE self) {
  const auto newline_id = llama_token_nl();
  return INT2NUM(newline_id);
}

Module: LLaMACpp

Overview

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ String

.init_backend ⇒ Object

.mlock_supported? ⇒ Boolean

.mmap_supported? ⇒ Boolean

.model_quantize(*args) ⇒ Object

.print_system_info ⇒ Object

.token_bos ⇒ Object

.token_eos ⇒ Object

.token_nl ⇒ Object

.generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ `String`

.init_backend ⇒ `Object`

.mlock_supported? ⇒ `Boolean`

.mmap_supported? ⇒ `Boolean`

.model_quantize(*args) ⇒ `Object`

.print_system_info ⇒ `Object`

.token_bos ⇒ `Object`

.token_eos ⇒ `Object`

.token_nl ⇒ `Object`