Module: LLaMACpp

Defined in: lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/llama_cpp.cpp

Overview

llama_cpp.rb provides Ruby bindings for llama.cpp.

Constant Summary collapse

VERSION = The version of llama_cpp.rb you install.

'0.12.4'

LLAMA_CPP_VERSION = The version of llama.cpp bundled with llama_cpp.rb.

'b2047'

LLAMA_VOCAB_TYPE_SPM =

INT2NUM(LLAMA_VOCAB_TYPE_SPM)

LLAMA_VOCAB_TYPE_BPE =

INT2NUM(LLAMA_VOCAB_TYPE_BPE)

LLAMA_TOKEN_TYPE_UNDEFINED =

INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED)

LLAMA_TOKEN_TYPE_NORMAL =

INT2NUM(LLAMA_TOKEN_TYPE_NORMAL)

LLAMA_TOKEN_TYPE_UNKNOWN =

INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN)

LLAMA_TOKEN_TYPE_CONTROL =

INT2NUM(LLAMA_TOKEN_TYPE_CONTROL)

LLAMA_TOKEN_TYPE_USER_DEFINED =

INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED)

LLAMA_TOKEN_TYPE_UNUSED =

INT2NUM(LLAMA_TOKEN_TYPE_UNUSED)

LLAMA_TOKEN_TYPE_BYTE =

INT2NUM(LLAMA_TOKEN_TYPE_BYTE)

LLAMA_FTYPE_ALL_F32 =

INT2NUM(LLAMA_FTYPE_ALL_F32)

LLAMA_FTYPE_MOSTLY_F16 =

INT2NUM(LLAMA_FTYPE_MOSTLY_F16)

LLAMA_FTYPE_MOSTLY_Q4_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0)

LLAMA_FTYPE_MOSTLY_Q4_1 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1)

LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)

LLAMA_FTYPE_MOSTLY_Q8_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0)

LLAMA_FTYPE_MOSTLY_Q5_0 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0)

LLAMA_FTYPE_MOSTLY_Q5_1 =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1)

LLAMA_FTYPE_MOSTLY_Q2_K =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K)

LLAMA_FTYPE_MOSTLY_Q3_K_S =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S)

LLAMA_FTYPE_MOSTLY_Q3_K_M =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M)

LLAMA_FTYPE_MOSTLY_Q3_K_L =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L)

LLAMA_FTYPE_MOSTLY_Q4_K_S =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S)

LLAMA_FTYPE_MOSTLY_Q4_K_M =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M)

LLAMA_FTYPE_MOSTLY_Q5_K_S =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S)

LLAMA_FTYPE_MOSTLY_Q5_K_M =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M)

LLAMA_FTYPE_MOSTLY_Q6_K =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K)

LLAMA_FTYPE_MOSTLY_IQ2_XXS =

INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS)

LLAMA_FTYPE_MOSTLY_IQ2_XS =

INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS)

LLAMA_FTYPE_MOSTLY_Q2_K_S =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S)

LLAMA_FTYPE_MOSTLY_Q3_K_XS =

INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS)

LLAMA_FTYPE_MOSTLY_IQ3_XXS =

INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS)

LLAMA_FTYPE_GUESSED =

INT2NUM(LLAMA_FTYPE_GUESSED)

LLAMA_KV_OVERRIDE_INT =

INT2NUM(LLAMA_KV_OVERRIDE_INT)

LLAMA_KV_OVERRIDE_FLOAT =

INT2NUM(LLAMA_KV_OVERRIDE_FLOAT)

LLAMA_KV_OVERRIDE_BOOL =

INT2NUM(LLAMA_KV_OVERRIDE_BOOL)

LLAMA_GRETYPE_END =

INT2NUM(LLAMA_GRETYPE_END)

LLAMA_GRETYPE_ALT =

INT2NUM(LLAMA_GRETYPE_ALT)

LLAMA_GRETYPE_RULE_REF =

INT2NUM(LLAMA_GRETYPE_RULE_REF)

LLAMA_GRETYPE_CHAR =

INT2NUM(LLAMA_GRETYPE_CHAR)

LLAMA_GRETYPE_CHAR_NOT =

INT2NUM(LLAMA_GRETYPE_CHAR_NOT)

LLAMA_GRETYPE_CHAR_RNG_UPPER =

INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER)

LLAMA_GRETYPE_CHAR_ALT =

INT2NUM(LLAMA_GRETYPE_CHAR_ALT)

LLAMA_ROPE_SCALING_UNSPECIFIED =

INT2NUM(LLAMA_ROPE_SCALING_UNSPECIFIED)

LLAMA_ROPE_SCALING_NONE =

INT2NUM(LLAMA_ROPE_SCALING_NONE)

LLAMA_ROPE_SCALING_LINEAR =

INT2NUM(LLAMA_ROPE_SCALING_LINEAR)

LLAMA_ROPE_SCALING_YARN =

INT2NUM(LLAMA_ROPE_SCALING_YARN)

LLAMA_ROPE_SCALING_MAX_VALUE =

INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE)

LLAMA_SPLIT_NONE =

INT2NUM(LLAMA_SPLIT_NONE)

LLAMA_SPLIT_LAYER =

INT2NUM(LLAMA_SPLIT_LAYER)

LLAMA_SPLIT_ROW =

INT2NUM(LLAMA_SPLIT_ROW)

LLAMA_FILE_MAGIC_GGLA =

rb_str_new2(ss_magic.str().c_str())

LLAMA_FILE_MAGIC_GGSN =

rb_str_new2(ss_magic.str().c_str())

LLAMA_SESSION_MAGIC =

rb_str_new2(ss_magic.str().c_str())

LLAMA_DEFAULT_SEED =

rb_str_new2(ss_magic.str().c_str())

LLAMA_SESSION_VERSION =

rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str())

Class Method Summary collapse

.backend_free ⇒ Object
.backend_init(*args) ⇒ Object

module functions.
.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String

Generates text that continues the given prompt; intended as a simple operation check.
.max_devices ⇒ Object
.mlock_supported? ⇒ Boolean
.mmap_supported? ⇒ Boolean
.model_quantize(*args) ⇒ Object
.print_system_info ⇒ Object
.supports_gpu_offload? ⇒ Boolean
.supports_mlock? ⇒ Boolean
.supports_mmap? ⇒ Boolean
.time_us ⇒ Object

Class Method Details

.backend_free ⇒ `Object`

# File 'ext/llama_cpp/llama_cpp.cpp', line 3214

// Ruby binding that calls llama_backend_free() and returns nil.
static VALUE rb_llama_llama_backend_free(VALUE self) {
  (void)self;  // the receiver is not used
  llama_backend_free();
  return Qnil;
}

.backend_init(*args) ⇒ `Object`

module functions

# File 'ext/llama_cpp/llama_cpp.cpp', line 3201

// Ruby binding for llama_backend_init().
// Accepts a single optional keyword argument, `numa`. When the keyword is
// omitted or falsy, `false` is passed to llama_backend_init(); otherwise `true`.
// Always returns nil.
static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
  VALUE opts = Qnil;
  ID keys[1] = { rb_intern("numa") };
  VALUE vals[1] = { Qundef };
  rb_scan_args(argc, argv, ":", &opts);
  // 0 required keywords, 1 optional keyword.
  rb_get_kwargs(opts, keys, 0, 1, vals);

  // Qundef means the keyword was not supplied at all.
  bool numa = false;
  if (vals[0] != Qundef && RTEST(vals[0])) {
    numa = true;
  }
  llama_backend_init(numa);

  return Qnil;
}

.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ `String`

Generates text that continues the given prompt; intended as a simple operation check.

Parameters:

context (LLaMACpp::Context) —

The context to use.
prompt (String) —

The prompt to start generation with.
n_predict (Integer) (defaults to: 128) —

The number of tokens to predict.
n_keep (Integer) (defaults to: 10) —

The number of tokens to keep in the context.
n_batch (Integer) (defaults to: 512) —

The number of tokens to process in a batch.
repeat_last_n (Integer) (defaults to: 64) —

The number of tokens to consider for repetition penalty.
repeat_penalty (Float) (defaults to: 1.1) —

The repetition penalty.
frequency (Float) (defaults to: 0.0) —

The frequency penalty.
presence (Float) (defaults to: 0.0) —

The presence penalty.
top_k (Integer) (defaults to: 40) —

The number of tokens to consider for top-k sampling.
top_p (Float) (defaults to: 0.95) —

The probability threshold for nucleus sampling.
tfs_z (Float) (defaults to: 1.0) —

The z parameter for tail-free sampling.
typical_p (Float) (defaults to: 1.0) —

The probability for typical sampling.
temperature (Float) (defaults to: 0.8) —

The temperature for temperature sampling.

Returns:

(String)

Raises:

(ArgumentError)

# File 'lib/llama_cpp.rb', line 27

# Generates text that continues the given prompt; intended as a simple operation check.
#
# @param context [LLaMACpp::Context] The context to use.
# @param prompt [String] The prompt to start generation with.
# @param n_predict [Integer] The number of tokens to predict.
# @param n_keep [Integer] The number of tokens to keep in the context.
# @param n_batch [Integer] The number of tokens to process in a batch.
# @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
# @param repeat_penalty [Float] The repetition penalty.
# @param frequency [Float] The frequency penalty.
# @param presence [Float] The presence penalty.
# @param top_k [Integer] The number of tokens to consider for top-k sampling.
# @param top_p [Float] The probability threshold for nucleus sampling.
# @param tfs_z [Float] The z parameter for tail-free sampling.
# @param typical_p [Float] The probability for typical sampling.
# @param temperature [Float] The temperature for temperature sampling.
# @return [String] The generated text with the leading prompt removed.
# @raise [ArgumentError] If context is not a LLaMACpp::Context, prompt is not a String,
#   or the tokenized prompt is longer than n_ctx - 4 tokens.
def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
             n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
             repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
             top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  # The prompt is tokenized with a leading space and a BOS token.
  spaced_prompt = " #{prompt}"
  embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  # Fixed-size history (n_ctx entries) of the most recent tokens, oldest first.
  last_n_tokens = [0] * n_ctx

  embd = []        # tokens waiting to be decoded in the next step
  n_consumed = 0   # number of prompt tokens already queued
  n_past = 0       # number of tokens already decoded into the context
  n_remain = n_predict
  n_vocab = context.model.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        # Context overflow: restart from the first n_keep positions and
        # re-feed the second half of the tokens that followed them.
        n_left = n_past - n_keep
        n_past = n_keep
        # Splat the slice so the individual tokens are prepended; passing the
        # Array itself would insert it as a single nested element.
        embd.insert(0, *last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      # The whole prompt has been queued: sample the next token.
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalties(
        candidates, last_n_tokens[-last_n_repeat..],
        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temp(candidates, temp: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      # Still feeding the prompt: queue up to n_batch prompt tokens.
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.model.token_to_piece(token) }

    # Stop as soon as the end-of-sequence token has been produced.
    break if !embd.empty? && embd[-1] == context.model.token_eos
  end

  # The output includes the echoed prompt; strip it so only the continuation is returned.
  output.join.scrub('?').strip.delete_prefix(prompt).strip
end

.max_devices ⇒ `Object`



3271
3272
3273

# File 'ext/llama_cpp/llama_cpp.cpp', line 3271

// Returns the value of llama_max_devices() converted to a Ruby Integer.
static VALUE rb_llama_max_devices(VALUE self) {
  const auto n_devices = llama_max_devices();
  return SIZET2NUM(n_devices);
}

.mlock_supported? ⇒ `Boolean`

Returns:

(Boolean)

# File 'ext/llama_cpp/llama_cpp.cpp', line 3266

// Deprecated predicate: emits a Ruby warning that points to supports_mlock?,
// then returns true/false according to llama_mlock_supported().
static VALUE rb_llama_mlock_supported(VALUE self) {
  rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
  if (llama_mlock_supported()) {
    return Qtrue;
  }
  return Qfalse;
}

.mmap_supported? ⇒ `Boolean`

Returns:

(Boolean)

# File 'ext/llama_cpp/llama_cpp.cpp', line 3261

// Deprecated predicate: emits a Ruby warning that points to supports_mmap?,
// then returns true/false according to llama_mmap_supported().
static VALUE rb_llama_mmap_supported(VALUE self) {
  rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
  if (llama_mmap_supported()) {
    return Qtrue;
  }
  return Qfalse;
}

.model_quantize(*args) ⇒ `Object`

# File 'ext/llama_cpp/llama_cpp.cpp', line 3220

// Ruby binding for llama_model_quantize().
// Requires three keyword arguments:
//   input_path  - String
//   output_path - String
//   params      - ModelQuantizeParams
// Raises ArgumentError when an argument has the wrong type and RuntimeError
// when llama_model_quantize() returns a non-zero status. Returns nil on success.
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE opts = Qnil;
  ID keys[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
  VALUE vals[3] = { Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &opts);
  // 3 required keywords, 0 optional keywords.
  rb_get_kwargs(opts, keys, 3, 0, vals);

  // rb_raise() does not return, so no explicit return follows each check.
  if (!RB_TYPE_P(vals[0], T_STRING)) {
    rb_raise(rb_eArgError, "input_path must be a string");
  }
  if (!RB_TYPE_P(vals[1], T_STRING)) {
    rb_raise(rb_eArgError, "output_path must be a string");
  }
  if (!rb_obj_is_kind_of(vals[2], rb_cLLaMAModelQuantizeParams)) {
    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
  }

  const char* src_path = StringValueCStr(vals[0]);
  const char* dst_path = StringValueCStr(vals[1]);
  LLaMAModelQuantizeParamsWrapper* qparams = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(vals[2]);

  const auto status = llama_model_quantize(src_path, dst_path, &(qparams->params));
  if (status != 0) {
    rb_raise(rb_eRuntimeError, "Failed to quantize model");
  }

  return Qnil;
}

.print_system_info ⇒ `Object`

# File 'ext/llama_cpp/llama_cpp.cpp', line 3252

// Returns the C string produced by llama_print_system_info() as a UTF-8 Ruby String.
static VALUE rb_llama_print_system_info(VALUE self) {
  return rb_utf8_str_new_cstr(llama_print_system_info());
}

.supports_gpu_offload? ⇒ `Boolean`

Returns:

(Boolean)



3283
3284
3285

# File 'ext/llama_cpp/llama_cpp.cpp', line 3283

// Returns true when llama_supports_gpu_offload() is truthy, false otherwise.
static VALUE rb_llama_supports_gpu_offload(VALUE self) {
  if (llama_supports_gpu_offload()) {
    return Qtrue;
  }
  return Qfalse;
}

.supports_mlock? ⇒ `Boolean`

Returns:

(Boolean)



3279
3280
3281

# File 'ext/llama_cpp/llama_cpp.cpp', line 3279

// Returns true when llama_supports_mlock() is truthy, false otherwise.
static VALUE rb_llama_supports_mlock(VALUE self) {
  if (llama_supports_mlock()) {
    return Qtrue;
  }
  return Qfalse;
}

.supports_mmap? ⇒ `Boolean`

Returns:

(Boolean)



3275
3276
3277

# File 'ext/llama_cpp/llama_cpp.cpp', line 3275

// Returns true when llama_supports_mmap() is truthy, false otherwise.
static VALUE rb_llama_supports_mmap(VALUE self) {
  if (llama_supports_mmap()) {
    return Qtrue;
  }
  return Qfalse;
}

.time_us ⇒ `Object`



3257
3258
3259

# File 'ext/llama_cpp/llama_cpp.cpp', line 3257

// Returns the value of llama_time_us() as a Ruby Integer.
// NOTE(review): llama.h declares llama_time_us() as returning int64_t — confirm
// against the bundled header. LL2NUM converts through `long long`, so the value
// is not truncated on platforms where `long` is 32 bits (e.g. 64-bit Windows),
// which LONG2NUM would do.
static VALUE rb_llama_time_us(VALUE self) {
  return LL2NUM(llama_time_us());
}

Module: LLaMACpp

Overview

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.backend_free ⇒ Object

.backend_init(*args) ⇒ Object

.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String

.max_devices ⇒ Object

.mlock_supported? ⇒ Boolean

.mmap_supported? ⇒ Boolean

.model_quantize(*args) ⇒ Object

.print_system_info ⇒ Object

.supports_gpu_offload? ⇒ Boolean

.supports_mlock? ⇒ Boolean

.supports_mmap? ⇒ Boolean

.time_us ⇒ Object

.backend_free ⇒ `Object`

.backend_init(*args) ⇒ `Object`

.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ `String`

.max_devices ⇒ `Object`

.mlock_supported? ⇒ `Boolean`

.mmap_supported? ⇒ `Boolean`

.model_quantize(*args) ⇒ `Object`

.print_system_info ⇒ `Object`

.supports_gpu_offload? ⇒ `Boolean`

.supports_mlock? ⇒ `Boolean`

.supports_mmap? ⇒ `Boolean`

.time_us ⇒ `Object`