Class: Kiribi::Gemma4E2B::VisionEncoder
- Defined in:
- lib/kiribi/gemma4_e2b/vision_encoder.rb
Constant Summary collapse
- FILES =
%w[vision_encoder.onnx vision_encoder.onnx_data].freeze
- PATCH_SIZE =
16
- RESCALE_FACTOR =
1.0 / 255
- MAX_SOFT_TOKENS =
280
- POOLING_KERNEL =
3
- MAX_PATCHES =
MAX_SOFT_TOKENS * POOLING_KERNEL**2
- SIDE_MULT =
POOLING_KERNEL * PATCH_SIZE
Instance Method Summary collapse
- #encode(blob_rgb, width, height) ⇒ Object
- #initialize(dest_dir) ⇒ VisionEncoder (constructor)
A new instance of VisionEncoder.
- #input_size_of(original_width, original_height) ⇒ Object
Methods inherited from Base
Constructor Details
#initialize(dest_dir) ⇒ VisionEncoder
Returns a new instance of VisionEncoder.
17 18 19 20 21 22 23 |
# File 'lib/kiribi/gemma4_e2b/vision_encoder.rb', line 17

# Checks that every entry of FILES exists under +dest_dir+ and then opens
# "vision_encoder.onnx" from that directory as an OnnxRuntime model.
#
# @param dest_dir [String] directory expected to contain the files in FILES
# @raise [Kiribi::ModelNotDownloaded] naming the first entry of FILES that is
#   not present in +dest_dir+
def initialize(dest_dir)
  # Only the first absent file is reported, in FILES order.
  missing = FILES.find { |name| !File.exist?(File.join(dest_dir, name)) }
  if missing
    raise Kiribi::ModelNotDownloaded,
          %(gemma4-e2b/vision: #{missing} missing. Run: Kiribi.download("gemma4-e2b/vision"))
  end

  @model = OnnxRuntime::Model.new(File.join(dest_dir, "vision_encoder.onnx"))
end
Instance Method Details
#encode(blob_rgb, width, height) ⇒ Object
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
# File 'lib/kiribi/gemma4_e2b/vision_encoder.rb', line 45

# Cuts an interleaved RGB image into PATCH_SIZE x PATCH_SIZE patches, scales
# every channel value by RESCALE_FACTOR, pads the patch list up to
# MAX_PATCHES and runs the ONNX model on the result.
#
# Patches are emitted column by column (all rows of column 0, then column 1,
# ...). Each real patch is paired with its [col, row] position; padding
# patches are all 0.0 and carry the position [-1, -1]. Pixels beyond the last
# whole patch in either direction are ignored (integer division of the
# dimensions by PATCH_SIZE).
#
# @param blob_rgb [String, Array<Integer>] pixel data, 3 values per pixel in
#   row-major order; a String is unpacked as unsigned bytes ("C*"), anything
#   else is indexed as-is
# @param width [Integer] image width in pixels
# @param height [Integer] image height in pixels
# @return [Object] the "image_features" entry of the model's prediction
#   (NOTE(review): exact shape depends on the ONNX graph - confirm)
# @raise [ArgumentError] if the blob is too short for the patches that would
#   be read from it
def encode(blob_rgb, width, height)
  blob = blob_rgb.is_a?(String) ? blob_rgb.unpack("C*") : blob_rgb
  patches_w = width / PATCH_SIZE
  patches_h = height / PATCH_SIZE

  # Fail early with a clear message instead of hitting `nil * Float` deep in
  # the loops. `required` is exactly one past the last index read below.
  if patches_w.positive? && patches_h.positive? && blob.respond_to?(:length)
    required = ((patches_h * PATCH_SIZE - 1) * width + patches_w * PATCH_SIZE) * 3
    if blob.length < required
      raise ArgumentError,
            "blob_rgb holds #{blob.length} values but a #{width}x#{height} RGB image needs at least #{required}"
    end
  end

  row_values = PATCH_SIZE * 3
  pixel_values = []
  pixel_position_ids = []

  patches_w.times do |col|
    patches_h.times do |row|
      patch = []
      PATCH_SIZE.times do |dy|
        # One patch row is a contiguous run of PATCH_SIZE RGB triples.
        start = ((row * PATCH_SIZE + dy) * width + col * PATCH_SIZE) * 3
        row_values.times { |offset| patch << blob[start + offset] * RESCALE_FACTOR }
      end
      pixel_values << patch
      pixel_position_ids << [col, row]
    end
  end

  # Pad up to MAX_PATCHES; nothing is added (or removed) when the image
  # already yields MAX_PATCHES patches or more.
  padding = [MAX_PATCHES - pixel_values.length, 0].max
  padding.times do
    pixel_values << Array.new(PATCH_SIZE**2 * 3, 0.0)
    pixel_position_ids << [-1, -1]
  end

  @model.predict({
    "pixel_values" => [pixel_values],
    "pixel_position_ids" => [pixel_position_ids]
  })["image_features"]
end
#input_size_of(original_width, original_height) ⇒ Object
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/kiribi/gemma4_e2b/vision_encoder.rb', line 25

# Computes the [width, height] an image should be resized to before encoding.
#
# Both sides are scaled by the same factor so that the area equals
# MAX_PATCHES * PATCH_SIZE**2 pixels, then each side is rounded down to a
# multiple of SIDE_MULT. If one side rounds down to zero (very elongated
# image) it is set to SIDE_MULT and the other side becomes the aspect ratio
# times SIDE_MULT, capped at MAX_SOFT_TOKENS * SIDE_MULT.
#
# @param original_width [Numeric] source width in pixels, must be positive
# @param original_height [Numeric] source height in pixels, must be positive
# @return [Array] [width, height]; for Integer inputs both are multiples of
#   SIDE_MULT
# @raise [ArgumentError] if either dimension is zero or negative
# @raise [RuntimeError] if both scaled sides round down to zero
def input_size_of(original_width, original_height)
  # Without this guard zero dimensions surface as FloatDomainError and two
  # negative dimensions silently yield negative sizes.
  unless original_width.positive? && original_height.positive?
    raise ArgumentError,
          "image dimensions must be positive, got #{original_width}x#{original_height}"
  end

  target_px = MAX_PATCHES * PATCH_SIZE**2
  factor = Math.sqrt(target_px.to_f / (original_height * original_width))
  width = (factor * original_width / SIDE_MULT).floor * SIDE_MULT
  height = (factor * original_height / SIDE_MULT).floor * SIDE_MULT
  max_side = MAX_SOFT_TOKENS * SIDE_MULT

  raise "Image too small to resize" if width.zero? && height.zero?

  if height.zero?
    height = SIDE_MULT
    width = [(original_width / original_height) * SIDE_MULT, max_side].min
  elsif width.zero?
    width = SIDE_MULT
    height = [(original_height / original_width) * SIDE_MULT, max_side].min
  end

  [width, height]
end