Class: Kreuzberg::Result
- Inherits:
-
Object
- Object
- Kreuzberg::Result
- Defined in:
- lib/kreuzberg/result.rb,
lib/kreuzberg/djot_content.rb,
lib/kreuzberg/document_structure.rb
Overview
rubocop:disable Metrics/ClassLength
Defined Under Namespace
Classes: Chunk, DjotContent, DocumentAnnotation, DocumentBoundingBox, DocumentNode, DocumentStructure, ElementBoundingBox, ElementMetadataStruct, ElementStruct, HierarchicalBlock, Image, OcrBoundingGeometry, OcrConfidence, OcrElement, OcrRotation, PageContent, PageHierarchy, Table
Instance Attribute Summary collapse
-
#annotations ⇒ Object
readonly
Returns the value of attribute annotations.
-
#byte_end ⇒ Integer
readonly
Ending byte offset (UTF-8).
-
#byte_start ⇒ Integer
readonly
Starting byte offset (UTF-8).
-
#chunks ⇒ Object
readonly
Returns the value of attribute chunks.
-
#content ⇒ String
readonly
Text content for this page.
-
#detected_languages ⇒ Object
readonly
Returns the value of attribute detected_languages.
-
#djot_content ⇒ Object
readonly
Returns the value of attribute djot_content.
-
#document ⇒ Object
readonly
Returns the value of attribute document.
-
#elements ⇒ Object
readonly
Returns the value of attribute elements.
-
#extracted_keywords ⇒ Object
readonly
Returns the value of attribute extracted_keywords.
-
#first_page ⇒ Integer?
readonly
First page number (1-indexed).
-
#hierarchy ⇒ PageHierarchy?
readonly
Hierarchy information for the page.
-
#images ⇒ Array<Image>
readonly
Images on this page.
-
#last_page ⇒ Integer?
readonly
Last page number (1-indexed).
-
#metadata ⇒ Object
readonly
Returns the value of attribute metadata.
-
#metadata_json ⇒ Object
readonly
Returns the value of attribute metadata_json.
-
#mime_type ⇒ Object
readonly
Returns the value of attribute mime_type.
-
#ocr_elements ⇒ Object
readonly
Returns the value of attribute ocr_elements.
-
#page_number ⇒ Integer
readonly
Page number (1-indexed).
-
#pages ⇒ Object
readonly
Returns the value of attribute pages.
-
#processing_warnings ⇒ Object
readonly
Returns the value of attribute processing_warnings.
-
#quality_score ⇒ Object
readonly
Returns the value of attribute quality_score.
-
#tables ⇒ Array<Table>
readonly
Tables on this page.
-
#token_count ⇒ Integer?
readonly
Approximate token count (may be nil).
Instance Method Summary collapse
-
#chunk_count ⇒ Integer
Get the total number of text chunks.
-
#detected_language ⇒ String?
Get the primary detected language.
-
#initialize(hash) ⇒ Result
constructor
Initialize from native hash result.
-
#metadata_field(name) ⇒ Object?
Get a metadata field by name.
-
#page_count ⇒ Integer
Get the total number of pages in the document.
-
#to_h ⇒ Hash
Convert to hash.
-
#to_json ⇒ String
Convert to JSON.
Constructor Details
#initialize(hash) ⇒ Result
Initialize from native hash result
rubocop:disable Metrics/AbcSize
322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 |
# File 'lib/kreuzberg/result.rb', line 322 def initialize(hash) @content = get_value(hash, 'content', '') @mime_type = get_value(hash, 'mime_type', '') @metadata_json = get_value(hash, 'metadata_json', '{}') @metadata = parse_metadata(@metadata_json) @tables = parse_tables(get_value(hash, 'tables')) @detected_languages = parse_detected_languages(get_value(hash, 'detected_languages')) @chunks = parse_chunks(get_value(hash, 'chunks')) @images = parse_images(get_value(hash, 'images')) @pages = parse_pages(get_value(hash, 'pages')) @elements = parse_elements(get_value(hash, 'elements')) @ocr_elements = parse_ocr_elements(get_value(hash, 'ocr_elements')) @djot_content = parse_djot_content(get_value(hash, 'djot_content')) @document = parse_document_structure(get_value(hash, 'document')) @extracted_keywords = parse_extracted_keywords(get_value(hash, 'extracted_keywords')) @quality_score = get_value(hash, 'quality_score') @processing_warnings = parse_processing_warnings(get_value(hash, 'processing_warnings')) @annotations = parse_annotations(get_value(hash, 'annotations')) end |
Instance Attribute Details
#annotations ⇒ Object (readonly)
Returns the value of attribute annotations.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def annotations @annotations end |
#byte_end ⇒ Integer (readonly)
Returns the ending byte offset (UTF-8).
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/kreuzberg/result.rb', line 45 Chunk = Struct.new( :content, :byte_start, :byte_end, :token_count, :chunk_index, :total_chunks, :first_page, :last_page, :embedding ) do def to_h { content: content, byte_start: byte_start, byte_end: byte_end, token_count: token_count, chunk_index: chunk_index, total_chunks: total_chunks, first_page: first_page, last_page: last_page, embedding: } end end |
#byte_start ⇒ Integer (readonly)
Returns the starting byte offset (UTF-8).
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/kreuzberg/result.rb', line 45 Chunk = Struct.new( :content, :byte_start, :byte_end, :token_count, :chunk_index, :total_chunks, :first_page, :last_page, :embedding ) do def to_h { content: content, byte_start: byte_start, byte_end: byte_end, token_count: token_count, chunk_index: chunk_index, total_chunks: total_chunks, first_page: first_page, last_page: last_page, embedding: } end end |
#chunks ⇒ Object (readonly)
Returns the value of attribute chunks.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def chunks @chunks end |
#content ⇒ String (readonly)
Returns the text content for this page.
45 46 47 |
# File 'lib/kreuzberg/result.rb', line 45 def content @content end |
#detected_languages ⇒ Object (readonly)
Returns the value of attribute detected_languages.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def detected_languages @detected_languages end |
#djot_content ⇒ Object (readonly)
Returns the value of attribute djot_content.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def djot_content @djot_content end |
#document ⇒ Object (readonly)
Returns the value of attribute document.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def document @document end |
#elements ⇒ Object (readonly)
Returns the value of attribute elements.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def elements @elements end |
#extracted_keywords ⇒ Object (readonly)
Returns the value of attribute extracted_keywords.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def extracted_keywords @extracted_keywords end |
#first_page ⇒ Integer? (readonly)
Returns the first page number (1-indexed).
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/kreuzberg/result.rb', line 45 Chunk = Struct.new( :content, :byte_start, :byte_end, :token_count, :chunk_index, :total_chunks, :first_page, :last_page, :embedding ) do def to_h { content: content, byte_start: byte_start, byte_end: byte_end, token_count: token_count, chunk_index: chunk_index, total_chunks: total_chunks, first_page: first_page, last_page: last_page, embedding: } end end |
#hierarchy ⇒ PageHierarchy? (readonly)
Returns the hierarchy information for the page.
145 146 147 148 149 150 151 152 153 154 155 156 |
# File 'lib/kreuzberg/result.rb', line 145 PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank) do def to_h { page_number: page_number, content: content, tables: tables.map(&:to_h), images: images.map(&:to_h), hierarchy: hierarchy&.to_h, is_blank: is_blank } end end |
#images ⇒ Array<Image> (readonly)
Returns the images on this page.
145 146 147 |
# File 'lib/kreuzberg/result.rb', line 145 def images @images end |
#last_page ⇒ Integer? (readonly)
Returns the last page number (1-indexed).
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/kreuzberg/result.rb', line 45 Chunk = Struct.new( :content, :byte_start, :byte_end, :token_count, :chunk_index, :total_chunks, :first_page, :last_page, :embedding ) do def to_h { content: content, byte_start: byte_start, byte_end: byte_end, token_count: token_count, chunk_index: chunk_index, total_chunks: total_chunks, first_page: first_page, last_page: last_page, embedding: } end end |
#metadata ⇒ Object (readonly)
Returns the value of attribute metadata.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def metadata @metadata end |
#metadata_json ⇒ Object (readonly)
Returns the value of attribute metadata_json.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def metadata_json @metadata_json end |
#mime_type ⇒ Object (readonly)
Returns the value of attribute mime_type.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def mime_type @mime_type end |
#ocr_elements ⇒ Object (readonly)
Returns the value of attribute ocr_elements.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def ocr_elements @ocr_elements end |
#page_number ⇒ Integer (readonly)
Returns the page number (1-indexed).
145 146 147 148 149 150 151 152 153 154 155 156 |
# File 'lib/kreuzberg/result.rb', line 145 PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank) do def to_h { page_number: page_number, content: content, tables: tables.map(&:to_h), images: images.map(&:to_h), hierarchy: hierarchy&.to_h, is_blank: is_blank } end end |
#pages ⇒ Object (readonly)
Returns the value of attribute pages.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def pages @pages end |
#processing_warnings ⇒ Object (readonly)
Returns the value of attribute processing_warnings.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def processing_warnings @processing_warnings end |
#quality_score ⇒ Object (readonly)
Returns the value of attribute quality_score.
15 16 17 |
# File 'lib/kreuzberg/result.rb', line 15 def quality_score @quality_score end |
#tables ⇒ Array<Table> (readonly)
Returns the tables on this page.
145 146 147 |
# File 'lib/kreuzberg/result.rb', line 145 def tables @tables end |
#token_count ⇒ Integer? (readonly)
Returns the approximate token count (may be nil).
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/kreuzberg/result.rb', line 45 Chunk = Struct.new( :content, :byte_start, :byte_end, :token_count, :chunk_index, :total_chunks, :first_page, :last_page, :embedding ) do def to_h { content: content, byte_start: byte_start, byte_end: byte_end, token_count: token_count, chunk_index: chunk_index, total_chunks: total_chunks, first_page: first_page, last_page: last_page, embedding: } end end |
Instance Method Details
#chunk_count ⇒ Integer
Get the total number of text chunks
Returns 0 if chunking was not performed.
404 405 406 |
# File 'lib/kreuzberg/result.rb', line 404 def chunk_count @chunks&.length || 0 end |
#detected_language ⇒ String?
Get the primary detected language
417 418 419 420 421 422 |
# File 'lib/kreuzberg/result.rb', line 417 def detected_language return @metadata['language'] if @metadata.is_a?(Hash) && @metadata['language'] return @detected_languages&.first if @detected_languages&.any? nil end |
#metadata_field(name) ⇒ Object?
Get a metadata field by name
Supports dot notation for nested fields (e.g., “format.pages”).
439 440 441 442 443 444 445 446 447 448 449 450 451 452 |
# File 'lib/kreuzberg/result.rb', line 439 def metadata_field(name) return nil unless @metadata.is_a?(Hash) parts = name.to_s.split('.') value = @metadata parts.each do |part| return nil unless value.is_a?(Hash) value = value[part] end value end |
#page_count ⇒ Integer
Get the total number of pages in the document
386 387 388 389 390 391 392 |
# File 'lib/kreuzberg/result.rb', line 386 def page_count if @metadata.is_a?(Hash) && @metadata['pages'].is_a?(Hash) @metadata['pages']['total_count'] || 0 else 0 end end |
#to_h ⇒ Hash
Convert to hash
rubocop:disable Metrics/CyclomaticComplexity
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 |
# File 'lib/kreuzberg/result.rb', line 348 def to_h { content: @content, mime_type: @mime_type, metadata: @metadata, tables: serialize_tables, detected_languages: @detected_languages, chunks: serialize_chunks, images: serialize_images, pages: serialize_pages, elements: serialize_elements, ocr_elements: serialize_ocr_elements, djot_content: @djot_content&.to_h, document: @document&.to_h, extracted_keywords: @extracted_keywords&.map(&:to_h), quality_score: @quality_score, processing_warnings: @processing_warnings.map(&:to_h), annotations: @annotations&.map(&:to_h) } end |
#to_json ⇒ String
Convert to JSON
374 375 376 |
# File 'lib/kreuzberg/result.rb', line 374 def to_json(*) to_h.to_json(*) end |