Module: LLaMACpp

Defined in: lib/llama_cpp.rb,
lib/llama_cpp/client.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp

Overview

llama_cpp.rb provides Ruby bindings for llama.cpp.

Defined Under Namespace

Classes: Client

Constant Summary collapse

Params = Class alias to match interface of whispercpp gem.

ContextParams

VERSION = The version of llama_cpp.rb you install.

'0.0.7'

LLAMA_CPP_VERSION = The version of llama.cpp bundled with llama_cpp.rb.

'master-11d9023'

LLAMA_FTYPE_ALL_F32 =

INT2NUM(LLAMA_FTYPE_ALL_F32)

LLAMA_FTYPE_MOSTLY_F16 =

INT2NUM(LLAMA_FTYPE_MOSTLY_F16)

LLAMA_FTYPE_MOSTLY_Q4_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)

LLAMA_FTYPE_MOSTLY_Q4_1 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)

LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)

LLAMA_FTYPE_MOSTLY_Q4_2 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_2)

LLAMA_FTYPE_MOSTLY_Q4_3 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_3)

LLAMA_FTYPE_MOSTLY_Q8_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)

LLAMA_FTYPE_MOSTLY_Q5_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)

LLAMA_FTYPE_MOSTLY_Q5_1 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)

LLAMA_FILE_VERSION =

rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str())

LLAMA_FILE_MAGIC =

rb_str_new2(ss_magic.str().c_str())

LLAMA_FILE_MAGIC_UNVERSIONED =

rb_str_new2(ss_magic_unversioned.str().c_str())

Class Method Summary collapse

.generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ String

Generates sentences following the given prompt for operation check.
.mlock_supported? ⇒ Boolean
.mmap_supported? ⇒ Boolean
.model_quantize(*args) ⇒ Object

module functions.
.print_system_info ⇒ Object
.token_bos ⇒ Object
.token_eos ⇒ Object

Class Method Details

.generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ `String`

Generates sentences following the given prompt for operation check.

Parameters:

context (LLaMACpp::Context) —

The context to use.
prompt (String) —

The prompt to start generation with.
n_predict (Integer) (defaults to: 128) —

The number of tokens to predict.
n_threads (Integer) (defaults to: 1) —

The number of threads.

Returns:

(String)

Raises:

(ArgumentError)

# File 'lib/llama_cpp.rb', line 21

# Generates sentences following the given prompt for operation check.
#
# The prompt is tokenized (with a leading space and a BOS token), fed to the
# context in batches, and then tokens are sampled one at a time until
# `n_predict` tokens have been produced or the EOS token is sampled.
#
# @param context [LLaMACpp::Context] The context to use; it must have a model loaded.
# @param prompt [String] The prompt to start generation with.
# @param n_predict [Integer] The number of tokens to predict.
# @param n_threads [Integer] The number of threads.
# @raise [ArgumentError] if `context` is not a loaded LLaMACpp::Context, if `prompt`
#   is not a String, or if the tokenized prompt is longer than `n_ctx - 4` tokens.
# @return [String] The generated text with the prompt prefix removed and
#   surrounding whitespace stripped.
def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'context must have loaded the model' if context.empty?
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  # Sliding window of the most recent n_ctx tokens (zero-filled until enough tokens were seen).
  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_keep = 10
  n_past = 0
  n_remain = n_predict
  repeat_last_n = 64
  n_batch = 512
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        # The context is full: rewind to the first n_keep positions and re-feed
        # half of the tokens that followed them, taken from the history window.
        n_left = n_past - n_keep
        n_past = n_keep
        # Splat the slice so its tokens are spliced into embd individually;
        # without the splat the whole slice becomes a single nested Array element.
        embd.insert(0, *last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.eval(tokens: embd, n_past: n_past, n_threads: n_threads)
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      # The prompt is fully consumed: sample the next token. `last` yields the
      # same window as slicing from n_ctx - repeat_last_n, but also works when
      # n_ctx is smaller than repeat_last_n (the slice would be nil there).
      id = context.sample_top_p_top_k(
        last_n_tokens.last(repeat_last_n), top_k: 40, top_p: 0.95, temp: 0.80, penalty: 1.1
      )
      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      # Still consuming the prompt: queue up to n_batch prompt tokens for evaluation.
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.token_to_str(token) }

    # `last` is nil for an empty batch, which never equals the EOS token.
    break if embd.last == LLaMACpp.token_eos
  end

  output.join.delete_prefix(spaced_prompt).strip
end

.mlock_supported? ⇒ `Boolean`

Returns:

(Boolean)



692
693
694

# File 'ext/llama_cpp/llama_cpp.cpp', line 692

// Implementation behind the `mlock_supported?` module function.
// Returns Qtrue when llama_mlock_supported() reports true, Qfalse otherwise.
static VALUE rb_llama_mlock_supported(VALUE self) {
  if (llama_mlock_supported()) {
    return Qtrue;
  }
  return Qfalse;
}

.mmap_supported? ⇒ `Boolean`

Returns:

(Boolean)



688
689
690

# File 'ext/llama_cpp/llama_cpp.cpp', line 688

// Implementation behind the `mmap_supported?` module function.
// Returns Qtrue when llama_mmap_supported() reports true, Qfalse otherwise.
static VALUE rb_llama_mmap_supported(VALUE self) {
  if (llama_mmap_supported()) {
    return Qtrue;
  }
  return Qfalse;
}

.model_quantize(*args) ⇒ `Object`

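Module function. Quantizes the model file at input_path and writes the result to output_path. Takes the keyword arguments input_path: (String), output_path: (String), ftype: (Integer) and, optionally, n_threads: (Integer, defaults to 1).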

# File 'ext/llama_cpp/llama_cpp.cpp', line 638

// Implementation behind the `model_quantize` module function.
//
// Accepts keyword arguments only:
//   input_path:  String  (required)
//   output_path: String  (required)
//   ftype:       Integer (required), passed to llama_model_quantize as a llama_ftype
//   n_threads:   Integer (optional), 1 when omitted
//
// Raises ArgumentError when an argument has the wrong type and RuntimeError when
// llama_model_quantize returns a non-zero status. Returns nil on success.
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  // Positions of each keyword in the tables below.
  enum { IDX_INPUT_PATH = 0, IDX_OUTPUT_PATH, IDX_FTYPE, IDX_N_THREADS, N_KEYWORDS };

  ID keywords[N_KEYWORDS] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
  VALUE given[N_KEYWORDS] = { Qundef, Qundef, Qundef, Qundef };
  VALUE options = Qnil;

  rb_scan_args(argc, argv, ":", &options);
  // The first three keywords are required, the fourth one is optional.
  rb_get_kwargs(options, keywords, 3, 1, given);

  // An omitted optional keyword is left as Qundef.
  const bool has_n_threads = given[IDX_N_THREADS] != Qundef;

  // rb_raise does not return, so no explicit return is needed after it.
  if (!RB_TYPE_P(given[IDX_INPUT_PATH], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
  }
  if (!RB_TYPE_P(given[IDX_OUTPUT_PATH], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
  }
  if (!RB_INTEGER_TYPE_P(given[IDX_FTYPE])) {
    rb_raise(rb_eArgError, "ftype must be an integer");
  }
  if (has_n_threads && !RB_INTEGER_TYPE_P(given[IDX_N_THREADS])) {
    rb_raise(rb_eArgError, "n_threads must be an integer");
  }

  const char* src_path = StringValueCStr(given[IDX_INPUT_PATH]);
  const char* dst_path = StringValueCStr(given[IDX_OUTPUT_PATH]);
  const int ftype = NUM2INT(given[IDX_FTYPE]);
  int n_threads = 1;
  if (has_n_threads) {
    n_threads = NUM2INT(given[IDX_N_THREADS]);
  }

  const int status = llama_model_quantize(src_path, dst_path, static_cast<llama_ftype>(ftype), n_threads);
  if (status != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
  }

  return Qnil;
}

.print_system_info ⇒ `Object`

# File 'ext/llama_cpp/llama_cpp.cpp', line 683

// Implementation behind the `print_system_info` module function.
// Wraps the C string returned by llama_print_system_info() in a UTF-8 Ruby String.
static VALUE rb_llama_print_system_info(VALUE self) {
  return rb_utf8_str_new_cstr(llama_print_system_info());
}

.token_bos ⇒ `Object`



675
676
677

# File 'ext/llama_cpp/llama_cpp.cpp', line 675

// Implementation behind the `token_bos` module function.
// Returns the value of llama_token_bos() converted to a Ruby Integer.
static VALUE rb_llama_token_bos(VALUE self) {
  const int bos_id = llama_token_bos();
  return INT2NUM(bos_id);
}

.token_eos ⇒ `Object`



679
680
681

# File 'ext/llama_cpp/llama_cpp.cpp', line 679

// Implementation behind the `token_eos` module function.
// Returns the value of llama_token_eos() converted to a Ruby Integer.
static VALUE rb_llama_token_eos(VALUE self) {
  const int eos_id = llama_token_eos();
  return INT2NUM(eos_id);
}

Module: LLaMACpp

Overview

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ String

.mlock_supported? ⇒ Boolean

.mmap_supported? ⇒ Boolean

.model_quantize(*args) ⇒ Object

.print_system_info ⇒ Object

.token_bos ⇒ Object

.token_eos ⇒ Object

.generate(context, prompt, n_predict: 128, n_threads: 1) ⇒ `String`

.mlock_supported? ⇒ `Boolean`

.mmap_supported? ⇒ `Boolean`

.model_quantize(*args) ⇒ `Object`

.print_system_info ⇒ `Object`

.token_bos ⇒ `Object`

.token_eos ⇒ `Object`