Class: Kreuzberg::Result
- Inherits:
- Object
- Inheritance chain: Object → Kreuzberg::Result
- Defined in:
- lib/kreuzberg/result.rb,
lib/kreuzberg/djot_content.rb,
lib/kreuzberg/document_structure.rb
Overview
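Wraps a native extraction result hash and exposes its content, MIME type, metadata, tables, detected languages, chunks, images, pages, elements, OCR elements, Djot content, document structure, extracted keywords, quality score, and processing warnings.
(Note: the source comment rendered here is a linter directive, not a description: rubocop:disable Metrics/ClassLength)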
Defined Under Namespace
Classes: Chunk, DjotContent, DocumentAnnotation, DocumentBoundingBox, DocumentNode, DocumentStructure, ElementBoundingBox, ElementMetadataStruct, ElementStruct, HierarchicalBlock, Image, OcrBoundingGeometry, OcrConfidence, OcrElement, OcrRotation, PageContent, PageHierarchy, Table
Instance Attribute Summary collapse
-
#byte_end ⇒ Integer
readonly
Ending byte offset (UTF-8).
-
#byte_start ⇒ Integer
readonly
Starting byte offset (UTF-8).
-
#chunks ⇒ Object
readonly
Returns the value of attribute chunks.
-
#content ⇒ String
readonly
Text content for this page.
-
#detected_languages ⇒ Object
readonly
Returns the value of attribute detected_languages.
-
#djot_content ⇒ Object
readonly
Returns the value of attribute djot_content.
-
#document ⇒ Object
readonly
Returns the value of attribute document.
-
#elements ⇒ Object
readonly
Returns the value of attribute elements.
-
#extracted_keywords ⇒ Object
readonly
Returns the value of attribute extracted_keywords.
-
#first_page ⇒ Integer?
readonly
First page number (1-indexed).
-
#hierarchy ⇒ PageHierarchy?
readonly
Hierarchy information for the page.
-
#images ⇒ Array<Image>
readonly
Images on this page.
-
#last_page ⇒ Integer?
readonly
Last page number (1-indexed).
-
#metadata ⇒ Object
readonly
Returns the value of attribute metadata.
-
#metadata_json ⇒ Object
readonly
Returns the value of attribute metadata_json.
-
#mime_type ⇒ Object
readonly
Returns the value of attribute mime_type.
-
#ocr_elements ⇒ Object
readonly
Returns the value of attribute ocr_elements.
-
#page_number ⇒ Integer
readonly
Page number (1-indexed).
-
#pages ⇒ Object
readonly
Returns the value of attribute pages.
-
#processing_warnings ⇒ Object
readonly
Returns the value of attribute processing_warnings.
-
#quality_score ⇒ Object
readonly
Returns the value of attribute quality_score.
-
#tables ⇒ Array<Table>
readonly
Tables on this page.
-
#token_count ⇒ Integer?
readonly
Approximate token count (may be nil).
Instance Method Summary collapse
-
#chunk_count ⇒ Integer
Get the total number of text chunks.
-
#detected_language ⇒ String?
Get the primary detected language.
-
#initialize(hash) ⇒ Result
constructor
Initialize from native hash result.
-
#metadata_field(name) ⇒ Object?
Get a metadata field by name.
-
#page_count ⇒ Integer
Get the total number of pages in the document.
-
#to_h ⇒ Hash
Convert to hash.
-
#to_json ⇒ String
Convert to JSON.
Constructor Details
#initialize(hash) ⇒ Result
Initialize from native hash result
(Linter directive leaked from the source comment, not part of the description: rubocop:disable Metrics/AbcSize)
321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 |
# File 'lib/kreuzberg/result.rb', line 321 def initialize(hash) @content = get_value(hash, 'content', '') @mime_type = get_value(hash, 'mime_type', '') @metadata_json = get_value(hash, 'metadata_json', '{}') @metadata = parse_metadata(@metadata_json) @tables = parse_tables(get_value(hash, 'tables')) @detected_languages = parse_detected_languages(get_value(hash, 'detected_languages')) @chunks = parse_chunks(get_value(hash, 'chunks')) @images = parse_images(get_value(hash, 'images')) @pages = parse_pages(get_value(hash, 'pages')) @elements = parse_elements(get_value(hash, 'elements')) @ocr_elements = parse_ocr_elements(get_value(hash, 'ocr_elements')) @djot_content = parse_djot_content(get_value(hash, 'djot_content')) @document = parse_document_structure(get_value(hash, 'document')) @extracted_keywords = parse_extracted_keywords(get_value(hash, 'extracted_keywords')) @quality_score = get_value(hash, 'quality_score') @processing_warnings = parse_processing_warnings(get_value(hash, 'processing_warnings')) end |
Instance Attribute Details
#byte_end ⇒ Integer (readonly)
Returns Ending byte offset (UTF-8).
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/kreuzberg/result.rb', line 43 Chunk = Struct.new( :content, :byte_start, :byte_end, :token_count, :chunk_index, :total_chunks, :first_page, :last_page, :embedding, keyword_init: true ) do def to_h { content: content, byte_start: byte_start, byte_end: byte_end, token_count: token_count, chunk_index: chunk_index, total_chunks: total_chunks, first_page: first_page, last_page: last_page, embedding: } end end |
#byte_start ⇒ Integer (readonly)
Returns Starting byte offset (UTF-8).
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/kreuzberg/result.rb', line 43 Chunk = Struct.new( :content, :byte_start, :byte_end, :token_count, :chunk_index, :total_chunks, :first_page, :last_page, :embedding, keyword_init: true ) do def to_h { content: content, byte_start: byte_start, byte_end: byte_end, token_count: token_count, chunk_index: chunk_index, total_chunks: total_chunks, first_page: first_page, last_page: last_page, embedding: } end end |
#chunks ⇒ Object (readonly)
Returns the value of attribute chunks.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def chunks @chunks end |
#content ⇒ String (readonly)
Returns Text content for this page.
43 44 45 |
# File 'lib/kreuzberg/result.rb', line 43 def content @content end |
#detected_languages ⇒ Object (readonly)
Returns the value of attribute detected_languages.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def detected_languages @detected_languages end |
#djot_content ⇒ Object (readonly)
Returns the value of attribute djot_content.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def djot_content @djot_content end |
#document ⇒ Object (readonly)
Returns the value of attribute document.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def document @document end |
#elements ⇒ Object (readonly)
Returns the value of attribute elements.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def elements @elements end |
#extracted_keywords ⇒ Object (readonly)
Returns the value of attribute extracted_keywords.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def extracted_keywords @extracted_keywords end |
#first_page ⇒ Integer? (readonly)
Returns First page number (1-indexed).
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/kreuzberg/result.rb', line 43 Chunk = Struct.new( :content, :byte_start, :byte_end, :token_count, :chunk_index, :total_chunks, :first_page, :last_page, :embedding, keyword_init: true ) do def to_h { content: content, byte_start: byte_start, byte_end: byte_end, token_count: token_count, chunk_index: chunk_index, total_chunks: total_chunks, first_page: first_page, last_page: last_page, embedding: } end end |
#hierarchy ⇒ PageHierarchy? (readonly)
Returns Hierarchy information for the page.
143 144 145 146 147 148 149 150 151 152 153 154 |
# File 'lib/kreuzberg/result.rb', line 143 PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank, keyword_init: true) do def to_h { page_number: page_number, content: content, tables: tables.map(&:to_h), images: images.map(&:to_h), hierarchy: hierarchy&.to_h, is_blank: is_blank } end end |
#images ⇒ Array<Image> (readonly)
Returns Images on this page.
143 144 145 |
# File 'lib/kreuzberg/result.rb', line 143 def images @images end |
#last_page ⇒ Integer? (readonly)
Returns Last page number (1-indexed).
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/kreuzberg/result.rb', line 43 Chunk = Struct.new( :content, :byte_start, :byte_end, :token_count, :chunk_index, :total_chunks, :first_page, :last_page, :embedding, keyword_init: true ) do def to_h { content: content, byte_start: byte_start, byte_end: byte_end, token_count: token_count, chunk_index: chunk_index, total_chunks: total_chunks, first_page: first_page, last_page: last_page, embedding: } end end |
#metadata ⇒ Object (readonly)
Returns the value of attribute metadata.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def metadata @metadata end |
#metadata_json ⇒ Object (readonly)
Returns the value of attribute metadata_json.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def metadata_json @metadata_json end |
#mime_type ⇒ Object (readonly)
Returns the value of attribute mime_type.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def mime_type @mime_type end |
#ocr_elements ⇒ Object (readonly)
Returns the value of attribute ocr_elements.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def ocr_elements @ocr_elements end |
#page_number ⇒ Integer (readonly)
Returns Page number (1-indexed).
143 144 145 146 147 148 149 150 151 152 153 154 |
# File 'lib/kreuzberg/result.rb', line 143 PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank, keyword_init: true) do def to_h { page_number: page_number, content: content, tables: tables.map(&:to_h), images: images.map(&:to_h), hierarchy: hierarchy&.to_h, is_blank: is_blank } end end |
#pages ⇒ Object (readonly)
Returns the value of attribute pages.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def pages @pages end |
#processing_warnings ⇒ Object (readonly)
Returns the value of attribute processing_warnings.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def processing_warnings @processing_warnings end |
#quality_score ⇒ Object (readonly)
Returns the value of attribute quality_score.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def quality_score @quality_score end |
#tables ⇒ Array<Table> (readonly)
Returns Tables on this page.
143 144 145 |
# File 'lib/kreuzberg/result.rb', line 143 def tables @tables end |
#token_count ⇒ Integer? (readonly)
Returns Approximate token count (may be nil).
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/kreuzberg/result.rb', line 43 Chunk = Struct.new( :content, :byte_start, :byte_end, :token_count, :chunk_index, :total_chunks, :first_page, :last_page, :embedding, keyword_init: true ) do def to_h { content: content, byte_start: byte_start, byte_end: byte_end, token_count: token_count, chunk_index: chunk_index, total_chunks: total_chunks, first_page: first_page, last_page: last_page, embedding: } end end |
Instance Method Details
#chunk_count ⇒ Integer
Get the total number of text chunks
Returns 0 if chunking was not performed.
399 400 401 |
# File 'lib/kreuzberg/result.rb', line 399 def chunk_count @chunks&.length || 0 end |
#detected_language ⇒ String?
Get the primary detected language
412 413 414 415 416 417 |
# File 'lib/kreuzberg/result.rb', line 412 def detected_language return @metadata['language'] if @metadata.is_a?(Hash) && @metadata['language'] return @detected_languages&.first if @detected_languages&.any? nil end |
#metadata_field(name) ⇒ Object?
Get a metadata field by name
Supports dot notation for nested fields (e.g., “format.pages”).
434 435 436 437 438 439 440 441 442 443 444 445 446 447 |
# File 'lib/kreuzberg/result.rb', line 434 def metadata_field(name) return nil unless @metadata.is_a?(Hash) parts = name.to_s.split('.') value = @metadata parts.each do |part| return nil unless value.is_a?(Hash) value = value[part] end value end |
#page_count ⇒ Integer
Get the total number of pages in the document
381 382 383 384 385 386 387 |
# File 'lib/kreuzberg/result.rb', line 381 def page_count if @metadata.is_a?(Hash) && @metadata['pages'].is_a?(Hash) @metadata['pages']['total_count'] || 0 else 0 end end |
#to_h ⇒ Hash
Convert to hash
345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 |
# File 'lib/kreuzberg/result.rb', line 345 def to_h { content: @content, mime_type: @mime_type, metadata: @metadata, tables: serialize_tables, detected_languages: @detected_languages, chunks: serialize_chunks, images: serialize_images, pages: serialize_pages, elements: serialize_elements, ocr_elements: serialize_ocr_elements, djot_content: @djot_content&.to_h, document: @document&.to_h, extracted_keywords: @extracted_keywords&.map(&:to_h), quality_score: @quality_score, processing_warnings: @processing_warnings.map(&:to_h) } end |
#to_json ⇒ String
Convert to JSON
369 370 371 |
# File 'lib/kreuzberg/result.rb', line 369 def to_json(*) to_h.to_json(*) end |