Module: LLaMACpp

Defined in: lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp

Overview

llama_cpp.rb provides Ruby bindings for llama.cpp.

Constant Summary collapse

Params = Class alias to match interface of whispercpp gem.

ContextParams

VERSION = The version of llama_cpp.rb you install.

'0.4.0'

LLAMA_CPP_VERSION = The version of llama.cpp bundled with llama_cpp.rb.

'b1060'

LLAMA_MAX_DEVICES =

INT2NUM(LLAMA_MAX_DEVICES)

LLAMA_LOG_LEVEL_ERROR =

INT2NUM(LLAMA_LOG_LEVEL_ERROR)

LLAMA_LOG_LEVEL_WARN =

INT2NUM(LLAMA_LOG_LEVEL_WARN)

LLAMA_LOG_LEVEL_INFO =

INT2NUM(LLAMA_LOG_LEVEL_INFO)

LLAMA_VOCAB_TYPE_SPM =

INT2NUM(LLAMA_VOCAB_TYPE_SPM)

LLAMA_VOCAB_TYPE_BPE =

INT2NUM(LLAMA_VOCAB_TYPE_BPE)

LLAMA_TOKEN_TYPE_UNDEFINED =

INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED)

LLAMA_TOKEN_TYPE_NORMAL =

INT2NUM(LLAMA_TOKEN_TYPE_NORMAL)

LLAMA_TOKEN_TYPE_UNKNOWN =

INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN)

LLAMA_TOKEN_TYPE_CONTROL =

INT2NUM(LLAMA_TOKEN_TYPE_CONTROL)

LLAMA_TOKEN_TYPE_USER_DEFINED =

INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED)

LLAMA_TOKEN_TYPE_UNUSED =

INT2NUM(LLAMA_TOKEN_TYPE_UNUSED)

LLAMA_TOKEN_TYPE_BYTE =

INT2NUM(LLAMA_TOKEN_TYPE_BYTE)

LLAMA_FTYPE_ALL_F32 =

INT2NUM(LLAMA_FTYPE_ALL_F32)

LLAMA_FTYPE_MOSTLY_F16 =

INT2NUM(LLAMA_FTYPE_MOSTLY_F16)

LLAMA_FTYPE_MOSTLY_Q4_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)

LLAMA_FTYPE_MOSTLY_Q4_1 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)

LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)

LLAMA_FTYPE_MOSTLY_Q8_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)

LLAMA_FTYPE_MOSTLY_Q5_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)

LLAMA_FTYPE_MOSTLY_Q5_1 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)

LLAMA_FTYPE_MOSTLY_Q2_K =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K)

LLAMA_FTYPE_MOSTLY_Q3_K_S =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S)

LLAMA_FTYPE_MOSTLY_Q3_K_M =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M)

LLAMA_FTYPE_MOSTLY_Q3_K_L =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L)

LLAMA_FTYPE_MOSTLY_Q4_K_S =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S)

LLAMA_FTYPE_MOSTLY_Q4_K_M =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M)

LLAMA_FTYPE_MOSTLY_Q5_K_S =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S)

LLAMA_FTYPE_MOSTLY_Q5_K_M =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M)

LLAMA_FTYPE_MOSTLY_Q6_K =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K)

LLAMA_FTYPE_GUESSED =

INT2NUM(LLAMA_FTYPE_GUESSED)

LLAMA_GRETYPE_END =

INT2NUM(LLAMA_GRETYPE_END)

LLAMA_GRETYPE_ALT =

INT2NUM(LLAMA_GRETYPE_ALT)

LLAMA_GRETYPE_RULE_REF =

INT2NUM(LLAMA_GRETYPE_RULE_REF)

LLAMA_GRETYPE_CHAR =

INT2NUM(LLAMA_GRETYPE_CHAR)

LLAMA_GRETYPE_CHAR_NOT =

INT2NUM(LLAMA_GRETYPE_CHAR_NOT)

LLAMA_GRETYPE_CHAR_RNG_UPPER =

INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER)

LLAMA_GRETYPE_CHAR_ALT =

INT2NUM(LLAMA_GRETYPE_CHAR_ALT)

LLAMA_FILE_MAGIC_GGSN =

rb_str_new2(ss_magic.str().c_str())

LLAMA_SESSION_MAGIC =

rb_str_new2(ss_magic.str().c_str())

LLAMA_DEFAULT_SEED =

rb_str_new2(ss_magic.str().c_str())

LLAMA_SESSION_VERSION =

rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())

Class Method Summary collapse

.backend_free ⇒ Object
.backend_init(*args) ⇒ Object

module functions.
.generate(context, prompt, n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String

Generates text that continues the given prompt; intended as a quick operation check.
.max_devices ⇒ Object
.mlock_supported? ⇒ Boolean
.mmap_supported? ⇒ Boolean
.model_quantize(*args) ⇒ Object
.print_system_info ⇒ Object
.time_us ⇒ Object

Class Method Details

.backend_free ⇒ `Object`

# File 'ext/llama_cpp/llama_cpp.cpp', line 2456

// Binding for LLaMACpp.backend_free: calls llama_backend_free() and always
// returns nil to the Ruby caller.
static VALUE rb_llama_llama_backend_free(VALUE self) {
  llama_backend_free();
  return Qnil;
}

.backend_init(*args) ⇒ `Object`

module functions

# File 'ext/llama_cpp/llama_cpp.cpp', line 2443

// Binding for LLaMACpp.backend_init(numa: ...): forwards the optional `numa`
// keyword (as a bool) to llama_backend_init() and returns nil.
static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
  // Accept keyword arguments only (":" = no positional args, optional kwargs hash).
  VALUE opts = Qnil;
  rb_scan_args(argc, argv, ":", &opts);

  // `numa` is the single recognised keyword: 0 required, 1 optional.
  ID keys[1] = { rb_intern("numa") };
  VALUE vals[1] = { Qundef };
  rb_get_kwargs(opts, keys, 0, 1, vals);

  // An omitted keyword (still Qundef) and a falsy value both disable NUMA.
  const bool numa = vals[0] != Qundef && RTEST(vals[0]);
  llama_backend_init(numa);

  return Qnil;
}

.generate(context, prompt, n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ `String`

Generates text that continues the given prompt; intended as a quick operation check.

Parameters:

context (LLaMACpp::Context) —

The context to use.
prompt (String) —

The prompt to start generation with.
n_predict (Integer) (defaults to: 128) —

The number of tokens to predict.
n_threads (Integer) (defaults to: 1) —

The number of threads.
n_keep (Integer) (defaults to: 10) —

The number of tokens to keep in the context.
n_batch (Integer) (defaults to: 512) —

The number of tokens to process in a batch.
repeat_last_n (Integer) (defaults to: 64) —

The number of tokens to consider for repetition penalty.
repeat_penalty (Float) (defaults to: 1.1) —

The repetition penalty.
frequency (Float) (defaults to: 0.0) —

The frequency penalty.
presence (Float) (defaults to: 0.0) —

The presence penalty.
top_k (Integer) (defaults to: 40) —

The number of tokens to consider for top-k sampling.
top_p (Float) (defaults to: 0.95) —

The probability threshold for nucleus sampling.
tfs_z (Float) (defaults to: 1.0) —

The z parameter for tail-free sampling.
typical_p (Float) (defaults to: 1.0) —

The probability for typical sampling.
temperature (Float) (defaults to: 0.8) —

The temperature for temperature sampling.

Returns:

(String)

Raises:

(ArgumentError)

# File 'lib/llama_cpp.rb', line 31

# Generates text that continues the given prompt; intended as a quick operation check.
#
# The prompt is tokenized (with a leading space and BOS token), fed to the context in
# batches of at most +n_batch+ tokens, and then +n_predict+ tokens are sampled one at a
# time. Generation stops early when the end-of-sequence token is produced.
#
# @param context [LLaMACpp::Context] The context to use.
# @param prompt [String] The prompt to start generation with.
# @param n_predict [Integer] The number of tokens to predict.
# @param n_threads [Integer] The number of threads.
# @param n_keep [Integer] The number of tokens to keep in the context when it overflows.
# @param n_batch [Integer] The number of prompt tokens to process in a batch.
# @param repeat_last_n [Integer] The number of most recent tokens considered by the
#   repetition/frequency/presence penalties. 0 passes no tokens; a negative value
#   passes the whole n_ctx-sized window.
# @param repeat_penalty [Float] The repetition penalty.
# @param frequency [Float] The frequency penalty.
# @param presence [Float] The presence penalty.
# @param top_k [Integer] The number of tokens to consider for top-k sampling.
# @param top_p [Float] The probability threshold for nucleus sampling.
# @param tfs_z [Float] The z parameter for tail-free sampling.
# @param typical_p [Float] The probability for typical sampling.
# @param temperature [Float] The temperature for temperature sampling.
# @return [String] The generated text with the prompt prefix removed and surrounding whitespace stripped.
# @raise [ArgumentError] If context is not a LLaMACpp::Context, prompt is not a String,
#   or the tokenized prompt is longer than n_ctx - 4 tokens.
def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
             n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64,
             repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
             top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  # Fixed-size sliding window (always n_ctx entries) of the most recent token ids,
  # left-padded with zeros until enough tokens have been seen.
  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_past = 0
  n_remain = n_predict
  n_vocab = context.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        # Context overflow: keep the first n_keep positions and re-evaluate the most
        # recent half of the remaining tokens ahead of the pending ones.
        n_left = n_past - n_keep
        n_past = n_keep
        # The slice must be splatted: inserting the Array itself would nest it inside
        # embd, so eval would receive a non-token element and n_past would drift.
        embd.insert(0, *last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.eval(tokens: embd, n_past: n_past, n_threads: n_threads)
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      # The whole prompt has been consumed: sample the next token.
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      # Array#last(0) yields [], whereas the slice [-0..] would yield the whole window.
      last_n_repeat = repeat_last_n.negative? ? n_ctx : [repeat_last_n, n_ctx].min
      recent_tokens = last_n_tokens.last(last_n_repeat)
      context.sample_repetition_penalty(candidates, recent_tokens, penalty: repeat_penalty)
      context.sample_frequency_and_presence_penalties(
        candidates, recent_tokens, frequency: frequency, presence: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temperature(candidates, temperature: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      # Still feeding the prompt: queue up to n_batch prompt tokens for the next eval.
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.token_to_str(token) }

    break if !embd.empty? && embd.last == context.token_eos
  end

  output.join.delete_prefix(spaced_prompt).strip
end

.max_devices ⇒ `Object`



2511
2512
2513

# File 'ext/llama_cpp/llama_cpp.cpp', line 2511

// Binding for LLaMACpp.max_devices: returns the value of llama_max_devices()
// as a Ruby Integer.
static VALUE rb_llama_max_devices(VALUE self) {
  const int n_devices = llama_max_devices();
  return INT2NUM(n_devices);
}

.mlock_supported? ⇒ `Boolean`

Returns:

(Boolean)



2507
2508
2509

# File 'ext/llama_cpp/llama_cpp.cpp', line 2507

// Binding for LLaMACpp.mlock_supported?: maps the result of
// llama_mlock_supported() onto Ruby true/false.
static VALUE rb_llama_mlock_supported(VALUE self) {
  if (llama_mlock_supported()) {
    return Qtrue;
  }
  return Qfalse;
}

.mmap_supported? ⇒ `Boolean`

Returns:

(Boolean)



2503
2504
2505

# File 'ext/llama_cpp/llama_cpp.cpp', line 2503

// Binding for LLaMACpp.mmap_supported?: maps the result of
// llama_mmap_supported() onto Ruby true/false.
static VALUE rb_llama_mmap_supported(VALUE self) {
  if (llama_mmap_supported()) {
    return Qtrue;
  }
  return Qfalse;
}

.model_quantize(*args) ⇒ `Object`

# File 'ext/llama_cpp/llama_cpp.cpp', line 2462

// Binding for LLaMACpp.model_quantize(input_path:, output_path:, params:).
//
// All three keywords are required (rb_get_kwargs is called with 3 required and
// 0 optional keys). Raises ArgumentError when input_path/output_path is not a
// String or params is not a ModelQuantizeParams, and RuntimeError when
// llama_model_quantize() returns a non-zero status. Returns nil on success.
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE opts = Qnil;
  rb_scan_args(argc, argv, ":", &opts);

  ID keys[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
  VALUE vals[3] = { Qundef, Qundef, Qundef };
  rb_get_kwargs(opts, keys, 3, 0, vals);

  const VALUE rb_input = vals[0];
  const VALUE rb_output = vals[1];
  const VALUE rb_params = vals[2];

  // rb_raise does not return, so no explicit return is needed after each call.
  if (!RB_TYPE_P(rb_input, T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
  }
  if (!RB_TYPE_P(rb_output, T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
  }
  if (!rb_obj_is_kind_of(rb_params, rb_cLLaMAModelQuantizeParams)) {
    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
  }

  // StringValueCStr takes an lvalue, so convert from the kwargs slots directly.
  const char* input_path = StringValueCStr(vals[0]);
  const char* output_path = StringValueCStr(vals[1]);
  LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(rb_params);

  const int status = llama_model_quantize(input_path, output_path, &(wrapper->params));
  if (status != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
  }

  return Qnil;
}

.print_system_info ⇒ `Object`

# File 'ext/llama_cpp/llama_cpp.cpp', line 2494

// Binding for LLaMACpp.print_system_info: wraps the C string returned by
// llama_print_system_info() in a new UTF-8 Ruby String.
static VALUE rb_llama_print_system_info(VALUE self) {
  return rb_utf8_str_new_cstr(llama_print_system_info());
}

.time_us ⇒ `Object`



2499
2500
2501

# File 'ext/llama_cpp/llama_cpp.cpp', line 2499

// Binding for LLaMACpp.time_us: returns the value of llama_time_us() as a
// Ruby Integer.
static VALUE rb_llama_time_us(VALUE self) {
  // LL2NUM converts through `long long`, so a 64-bit microsecond count is not
  // truncated on platforms where `long` is only 32 bits wide (e.g. 64-bit
  // Windows), which LONG2NUM would do.
  return LL2NUM(llama_time_us());
}

Module: LLaMACpp

Overview

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.backend_free ⇒ Object

.backend_init(*args) ⇒ Object

.generate(context, prompt, n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String

.max_devices ⇒ Object

.mlock_supported? ⇒ Boolean

.mmap_supported? ⇒ Boolean

.model_quantize(*args) ⇒ Object

.print_system_info ⇒ Object

.time_us ⇒ Object

.backend_free ⇒ `Object`

.backend_init(*args) ⇒ `Object`

.generate(context, prompt, n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ `String`

.max_devices ⇒ `Object`

.mlock_supported? ⇒ `Boolean`

.mmap_supported? ⇒ `Boolean`

.model_quantize(*args) ⇒ `Object`

.print_system_info ⇒ `Object`

.time_us ⇒ `Object`