Class: Kiribi::Gemma4::E2B::VisionEncoder

Inherits:
Object
  • Object
show all
Defined in:
lib/kiribi/gemma4/e2b/vision_encoder.rb

Constant Summary collapse

# Side length, in pixels, of one square image patch cut out by #encode.
PATCH_SIZE =
16
# Multiplier applied to every 0..255 colour byte before it is handed to the model.
RESCALE_FACTOR =
1.0 / 255
# Used to derive MAX_PATCHES and the upper bound on a side in #input_size_of.
# NOTE(review): presumably the maximum number of image tokens the model emits — confirm.
MAX_SOFT_TOKENS =
280
# NOTE(review): presumably the model pools POOLING_KERNEL x POOLING_KERNEL patches
# into one token — confirm against the model definition.
POOLING_KERNEL =
3
# Number of patches #encode pads its model inputs up to.
MAX_PATCHES =
MAX_SOFT_TOKENS * POOLING_KERNEL**2
# #input_size_of rounds both image sides down to a multiple of this many pixels.
SIDE_MULT =
POOLING_KERNEL * PATCH_SIZE

Instance Method Summary collapse

Constructor Details

#initialize ⇒ VisionEncoder

Returns a new instance of VisionEncoder.



18
19
20
# File 'lib/kiribi/gemma4/e2b/vision_encoder.rb', line 18

# Builds the encoder's OnnxRuntime::Model from the file at
# VISION_ENCODER_FILEPATH (a constant defined outside this method) and keeps
# it in @model for later #encode calls.
def initialize
  @model = OnnxRuntime::Model.new(VISION_ENCODER_FILEPATH)
end

Instance Method Details

#encode(blob_rgb, width, height) ⇒ Object

blob_rgb: RGB24 raw bytes(既に width × height へリサイズ済み)。image_features 配列を返す。



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/kiribi/gemma4/e2b/vision_encoder.rb', line 46

# Runs the vision encoder model on an image that has already been resized.
#
# The image is cut into PATCH_SIZE x PATCH_SIZE patches. Every patch is
# flattened row by row into PATCH_SIZE**2 * 3 floats (R, G, B of each pixel,
# each multiplied by RESCALE_FACTOR) and paired with its [col, row] patch
# coordinate. Both lists are then padded up to MAX_PATCHES entries with
# all-zero patches at position [-1, -1] and passed to the model as a batch of
# one. Pixels beyond the last whole patch on the right or bottom are ignored.
#
# @param blob_rgb [String, Array<Integer>] packed RGB bytes, 3 per pixel, rows
#   of +width+ pixels; a String is unpacked into byte values first
# @param width [Integer] image width in pixels
# @param height [Integer] image height in pixels
# @return [Object] the "image_features" entry of the model's prediction
# @raise [ArgumentError] if blob_rgb holds fewer bytes than the patches read
def encode(blob_rgb, width, height)
  blob = blob_rgb.is_a?(String) ? blob_rgb.unpack("C*") : blob_rgb
  patches_w = width / PATCH_SIZE
  patches_h = height / PATCH_SIZE

  if patches_w.positive? && patches_h.positive? && blob.respond_to?(:length)
    # Index just past the last byte of the bottom-right pixel that is read.
    required = ((patches_h * PATCH_SIZE - 1) * width + patches_w * PATCH_SIZE) * 3
    if blob.length < required
      raise ArgumentError,
            "blob_rgb has #{blob.length} bytes but a #{width}x#{height} image needs at least #{required}"
    end
  end

  pixel_values = []
  pixel_position_ids = []

  # Patches are emitted column by column (all rows of column 0 first).
  patches_w.times do |col|
    patches_h.times do |row|
      pixel_values << rescaled_patch_at(blob, width, col, row)
      pixel_position_ids << [col, row]
    end
  end

  # A negative count (more patches than MAX_PATCHES) adds no padding.
  (MAX_PATCHES - pixel_values.length).times do
    pixel_values << Array.new(PATCH_SIZE**2 * 3, 0.0)
    pixel_position_ids << [-1, -1]
  end

  @model.predict({
    "pixel_values" => [pixel_values],
    "pixel_position_ids" => [pixel_position_ids],
  })["image_features"]
end

# Flattens the patch at patch coordinate (col, row) into rescaled R, G, B floats.
private def rescaled_patch_at(blob, width, col, row)
  left = col * PATCH_SIZE
  top = row * PATCH_SIZE
  patch = []
  PATCH_SIZE.times do |dy|
    PATCH_SIZE.times do |dx|
      idx = ((top + dy) * width + left + dx) * 3
      patch << blob[idx] * RESCALE_FACTOR
      patch << blob[idx + 1] * RESCALE_FACTOR
      patch << blob[idx + 2] * RESCALE_FACTOR
    end
  end
  patch
end

#input_size_of(original_width, original_height) ⇒ Object

元画像サイズ (original_width, original_height) を受け取り、encode 前にリサイズすべきサイズ [width, height] を返す。



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/kiribi/gemma4/e2b/vision_encoder.rb', line 24

# Computes the size an image has to be resized to before it is given to #encode.
#
# The image is scaled, keeping its aspect ratio, so that its area is about
# MAX_PATCHES * PATCH_SIZE**2 pixels; each side is then rounded down to a
# multiple of SIDE_MULT. A side that rounds down to zero is set to SIDE_MULT,
# and the other side becomes the integer aspect ratio times SIDE_MULT, capped
# at MAX_SOFT_TOKENS * SIDE_MULT.
#
# @param original_width [Integer] width of the source image in pixels
# @param original_height [Integer] height of the source image in pixels
# @return [Array] the target size as [width, height]
# @raise [RuntimeError] if both sides round down to zero
def input_size_of(original_width, original_height)
  pixel_budget = MAX_PATCHES * PATCH_SIZE**2
  scale = Math.sqrt(pixel_budget.to_f / (original_height * original_width))

  width, height = [original_width, original_height].map do |side|
    (scale * side / SIDE_MULT).floor * SIDE_MULT
  end

  raise "Image too small to resize" if width.zero? && height.zero?

  side_cap = MAX_SOFT_TOKENS * SIDE_MULT
  if height.zero?
    [[(original_width / original_height) * SIDE_MULT, side_cap].min, SIDE_MULT]
  elsif width.zero?
    [SIDE_MULT, [(original_height / original_width) * SIDE_MULT, side_cap].min]
  else
    [width, height]
  end
end