Class: Kreuzberg::Result

Inherits:
Object
  • Object
show all
Defined in:
lib/kreuzberg/result.rb,
lib/kreuzberg/djot_content.rb,
lib/kreuzberg/document_structure.rb

Overview

rubocop:disable Metrics/ClassLength

Defined Under Namespace

Classes: Chunk, DjotContent, DocumentAnnotation, DocumentBoundingBox, DocumentNode, DocumentStructure, ElementBoundingBox, ElementMetadataStruct, ElementStruct, HierarchicalBlock, Image, LayoutRegion, OcrBoundingGeometry, OcrConfidence, OcrElement, OcrRotation, PageContent, PageHierarchy, Table

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(hash) ⇒ Result

Initialize from native hash result

rubocop:disable Metrics/AbcSize, Metrics/MethodLength

Parameters:

  • hash (Hash)

    Hash returned from native extension



345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
# File 'lib/kreuzberg/result.rb', line 345

# Build a Result from the hash returned by the native extension.
#
# Each field is read with +get_value+; most values are then passed through a
# matching +parse_*+ helper before being stored in an instance variable.
#
# @param hash [Hash] Hash returned from native extension
def initialize(hash)
  # The third argument to get_value is presumably the fallback used when the
  # key is absent — confirm against get_value.
  @content = get_value(hash, 'content', '')
  @mime_type = get_value(hash, 'mime_type', '')
  @metadata_json = get_value(hash, 'metadata_json', '{}')
  # The raw JSON string must be parsed: #detected_language, #metadata_field and
  # #page_count all require @metadata to be a Hash.
  # NOTE(review): the helper name was missing from this snippet and has been
  # reconstructed from the parse_* naming used below — confirm it exists.
  @metadata = parse_metadata(@metadata_json)
  @tables = parse_tables(get_value(hash, 'tables'))
  @detected_languages = parse_detected_languages(get_value(hash, 'detected_languages'))
  @chunks = parse_chunks(get_value(hash, 'chunks'))
  @images = parse_images(get_value(hash, 'images'))
  @pages = parse_pages(get_value(hash, 'pages'))
  @elements = parse_elements(get_value(hash, 'elements'))
  @ocr_elements = parse_ocr_elements(get_value(hash, 'ocr_elements'))
  @djot_content = parse_djot_content(get_value(hash, 'djot_content'))
  @document = parse_document_structure(get_value(hash, 'document'))
  @extracted_keywords = parse_extracted_keywords(get_value(hash, 'extracted_keywords'))
  # Stored as returned by get_value, without a parse step.
  @quality_score = get_value(hash, 'quality_score')
  @processing_warnings = parse_processing_warnings(get_value(hash, 'processing_warnings'))
  @annotations = parse_annotations(get_value(hash, 'annotations'))
  @uris = parse_uris(get_value(hash, 'uris'))
  @children = parse_children(get_value(hash, 'children'))
  # Stored as returned by get_value, without a parse step.
  @structured_output = get_value(hash, 'structured_output')
end

Instance Attribute Details

#annotations ⇒ Object (readonly)

Returns the value of attribute annotations.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @annotations, which #initialize fills from the 'annotations'
# entry of the native hash.
def annotations = @annotations

#byte_end ⇒ Integer (readonly)

Returns Ending byte offset (UTF-8).

Returns:

  • (Integer)

    Ending byte offset (UTF-8)



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/kreuzberg/result.rb', line 46

# Value object for a single text chunk.
#
# byte_start / byte_end are UTF-8 byte offsets, token_count is an approximate
# count that may be nil, and first_page / last_page are 1-indexed page numbers
# that may be nil.
Chunk = Struct.new(
  *%i[content byte_start byte_end token_count chunk_index total_chunks first_page last_page chunk_type embedding]
) do
  # Hash of every member keyed by its symbol name, in declaration order.
  #
  # @return [Hash]
  def to_h
    members.to_h { |field| [field, public_send(field)] }
  end
end

#byte_start ⇒ Integer (readonly)

Returns Starting byte offset (UTF-8).

Returns:

  • (Integer)

    Starting byte offset (UTF-8)



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/kreuzberg/result.rb', line 46

# Value object for a single text chunk.
#
# byte_start / byte_end are UTF-8 byte offsets, token_count is an approximate
# count that may be nil, and first_page / last_page are 1-indexed page numbers
# that may be nil.
Chunk = Struct.new(
  *%i[content byte_start byte_end token_count chunk_index total_chunks first_page last_page chunk_type embedding]
) do
  # Hash of every member keyed by its symbol name, in declaration order.
  #
  # @return [Hash]
  def to_h
    members.to_h { |field| [field, public_send(field)] }
  end
end

#children ⇒ Object (readonly)

Returns the value of attribute children.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @children, which #initialize fills from the 'children' entry of
# the native hash.
def children = @children

#chunks ⇒ Object (readonly)

Returns the value of attribute chunks.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @chunks, which #initialize fills from the 'chunks' entry of the
# native hash.
def chunks = @chunks

#content ⇒ String (readonly)

Returns Text content for this page.

Returns:

  • (String)

    Text content for this page



46
47
48
# File 'lib/kreuzberg/result.rb', line 46

# Reader for @content, which #initialize reads from the 'content' entry of the
# native hash.
def content = @content

#detected_languages ⇒ Object (readonly)

Returns the value of attribute detected_languages.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @detected_languages, which #initialize fills from the
# 'detected_languages' entry of the native hash.
def detected_languages = @detected_languages

#djot_content ⇒ Object (readonly)

Returns the value of attribute djot_content.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @djot_content, which #initialize fills from the 'djot_content'
# entry of the native hash.
def djot_content = @djot_content

#document ⇒ Object (readonly)

Returns the value of attribute document.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @document, which #initialize fills from the 'document' entry of
# the native hash.
def document = @document

#elements ⇒ Object (readonly)

Returns the value of attribute elements.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @elements, which #initialize fills from the 'elements' entry of
# the native hash.
def elements = @elements

#extracted_keywords ⇒ Object (readonly)

Returns the value of attribute extracted_keywords.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @extracted_keywords, which #initialize fills from the
# 'extracted_keywords' entry of the native hash.
def extracted_keywords = @extracted_keywords

#first_page ⇒ Integer? (readonly)

Returns First page number (1-indexed).

Returns:

  • (Integer, nil)

    First page number (1-indexed)



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/kreuzberg/result.rb', line 46

# Value object for a single text chunk.
#
# byte_start / byte_end are UTF-8 byte offsets, token_count is an approximate
# count that may be nil, and first_page / last_page are 1-indexed page numbers
# that may be nil.
Chunk = Struct.new(
  *%i[content byte_start byte_end token_count chunk_index total_chunks first_page last_page chunk_type embedding]
) do
  # Hash of every member keyed by its symbol name, in declaration order.
  #
  # @return [Hash]
  def to_h
    members.to_h { |field| [field, public_send(field)] }
  end
end

#hierarchy ⇒ PageHierarchy? (readonly)

Returns Hierarchy information for the page.

Returns:



148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/kreuzberg/result.rb', line 148

# Per-page extraction data: the 1-indexed page number, the page text, the
# tables and images on the page, optional hierarchy information, a blank-page
# flag and optional layout regions.
PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank, :layout_regions) do
  # Convert the page and its nested values to plain hashes.
  #
  # Struct members that were not supplied are nil, so every nested value is
  # converted with safe navigation: a nil member stays nil in the result
  # instead of raising NoMethodError.
  #
  # @return [Hash]
  def to_h
    {
      page_number: page_number,
      content: content,
      tables: tables&.map(&:to_h),
      images: images&.map(&:to_h),
      hierarchy: hierarchy&.to_h,
      is_blank: is_blank,
      layout_regions: layout_regions&.map(&:to_h)
    }
  end
end

#images ⇒ Array<Image> (readonly)

Returns Images on this page.

Returns:

  • (Array<Image>)

    Images on this page



148
149
150
# File 'lib/kreuzberg/result.rb', line 148

# Reader for @images, which #initialize fills from the 'images' entry of the
# native hash.
def images = @images

#last_page ⇒ Integer? (readonly)

Returns Last page number (1-indexed).

Returns:

  • (Integer, nil)

    Last page number (1-indexed)



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/kreuzberg/result.rb', line 46

# Value object for a single text chunk.
#
# byte_start / byte_end are UTF-8 byte offsets, token_count is an approximate
# count that may be nil, and first_page / last_page are 1-indexed page numbers
# that may be nil.
Chunk = Struct.new(
  *%i[content byte_start byte_end token_count chunk_index total_chunks first_page last_page chunk_type embedding]
) do
  # Hash of every member keyed by its symbol name, in declaration order.
  #
  # @return [Hash]
  def to_h
    members.to_h { |field| [field, public_send(field)] }
  end
end

#metadata ⇒ Object (readonly)

Returns the value of attribute metadata.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @metadata, the metadata value that #initialize derives from the
# 'metadata_json' entry of the native hash.
def metadata
  @metadata
end

#metadata_json ⇒ Object (readonly)

Returns the value of attribute metadata_json.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @metadata_json, which #initialize reads from the 'metadata_json'
# entry of the native hash.
def metadata_json
  @metadata_json
end

#mime_type ⇒ Object (readonly)

Returns the value of attribute mime_type.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @mime_type, which #initialize reads from the 'mime_type' entry of
# the native hash.
def mime_type = @mime_type

#ocr_elements ⇒ Object (readonly)

Returns the value of attribute ocr_elements.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @ocr_elements, which #initialize fills from the 'ocr_elements'
# entry of the native hash.
def ocr_elements = @ocr_elements

#page_number ⇒ Integer (readonly)

Returns Page number (1-indexed).

Returns:

  • (Integer)

    Page number (1-indexed)



148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/kreuzberg/result.rb', line 148

# Per-page extraction data: the 1-indexed page number, the page text, the
# tables and images on the page, optional hierarchy information, a blank-page
# flag and optional layout regions.
PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank, :layout_regions) do
  # Convert the page and its nested values to plain hashes.
  #
  # Struct members that were not supplied are nil, so every nested value is
  # converted with safe navigation: a nil member stays nil in the result
  # instead of raising NoMethodError.
  #
  # @return [Hash]
  def to_h
    {
      page_number: page_number,
      content: content,
      tables: tables&.map(&:to_h),
      images: images&.map(&:to_h),
      hierarchy: hierarchy&.to_h,
      is_blank: is_blank,
      layout_regions: layout_regions&.map(&:to_h)
    }
  end
end

#pages ⇒ Object (readonly)

Returns the value of attribute pages.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @pages, which #initialize fills from the 'pages' entry of the
# native hash.
def pages = @pages

#processing_warnings ⇒ Object (readonly)

Returns the value of attribute processing_warnings.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @processing_warnings, which #initialize fills from the
# 'processing_warnings' entry of the native hash.
def processing_warnings = @processing_warnings

#quality_score ⇒ Object (readonly)

Returns the value of attribute quality_score.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @quality_score, which #initialize reads from the 'quality_score'
# entry of the native hash without further parsing.
def quality_score = @quality_score

#structured_output ⇒ Object (readonly)

Returns the value of attribute structured_output.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @structured_output, which #initialize reads from the
# 'structured_output' entry of the native hash without further parsing.
def structured_output = @structured_output

#tables ⇒ Array<Table> (readonly)

Returns Tables on this page.

Returns:

  • (Array<Table>)

    Tables on this page



148
149
150
# File 'lib/kreuzberg/result.rb', line 148

# Reader for @tables, which #initialize fills from the 'tables' entry of the
# native hash.
def tables = @tables

#token_count ⇒ Integer? (readonly)

Returns Approximate token count (may be nil).

Returns:

  • (Integer, nil)

    Approximate token count (may be nil)



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/kreuzberg/result.rb', line 46

# Value object for a single text chunk.
#
# byte_start / byte_end are UTF-8 byte offsets, token_count is an approximate
# count that may be nil, and first_page / last_page are 1-indexed page numbers
# that may be nil.
Chunk = Struct.new(
  *%i[content byte_start byte_end token_count chunk_index total_chunks first_page last_page chunk_type embedding]
) do
  # Hash of every member keyed by its symbol name, in declaration order.
  #
  # @return [Hash]
  def to_h
    members.to_h { |field| [field, public_send(field)] }
  end
end

#uris ⇒ Object (readonly)

Returns the value of attribute uris.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for @uris, which #initialize fills from the 'uris' entry of the
# native hash.
def uris = @uris

Instance Method Details

#chunk_count ⇒ Integer

Get the total number of text chunks

Returns 0 if chunking was not performed.

Examples:

result = Kreuzberg.extract_file_sync("document.pdf")
puts "Document has #{result.chunk_count} chunks"

Returns:

  • (Integer)

    Total chunk count (>= 0); 0 when no chunks are available



433
434
435
# File 'lib/kreuzberg/result.rb', line 433

# Number of entries in @chunks.
#
# @return [Integer] chunk count, or 0 when @chunks is nil
def chunk_count
  return 0 if @chunks.nil?

  @chunks.length || 0
end

#detected_language ⇒ String?

Get the primary detected language

Examples:

result = Kreuzberg.extract_file_sync("document.pdf")
lang = result.detected_language
puts "Language: #{lang}" if lang

Returns:

  • (String, nil)

    ISO 639 language code (e.g., “en”, “de”), or nil if not detected



446
447
448
449
450
451
# File 'lib/kreuzberg/result.rb', line 446

# Primary detected language.
#
# Prefers the 'language' entry of @metadata when @metadata is a Hash and the
# entry is truthy; otherwise falls back to the first element of
# @detected_languages.
#
# @return [String, nil] language code, or nil when neither source provides one
def detected_language
  declared = @metadata.is_a?(Hash) ? @metadata['language'] : nil
  return declared if declared

  @detected_languages.first if @detected_languages&.any?
end

#metadata_field(name) ⇒ Object?

Get a metadata field by name

Supports dot notation for nested fields (e.g., “format.pages”).

Examples:

Get a top-level field

result = Kreuzberg.extract_file_sync("document.pdf")
title = result.metadata_field("title")
puts "Title: #{title}" if title

Get a nested field

format_info = result.metadata_field("format.pages")

Parameters:

  • name (String, Symbol)

    Field name

Returns:

  • (Object, nil)

    Field value, or nil if field doesn’t exist



468
469
470
471
472
473
474
475
476
477
478
479
480
481
# File 'lib/kreuzberg/result.rb', line 468

# Look up a metadata field by name.
#
# The name is converted to a String and split on '.', and each segment is used
# as a String key to descend one level into @metadata (e.g. "format.pages").
#
# @param name [String, Symbol] field name, optionally dot-separated
# @return [Object, nil] the field value; nil when @metadata is not a Hash,
#   when a segment is missing, or when the path descends into a non-Hash value
def metadata_field(name)
  return nil unless @metadata.is_a?(Hash)

  parts = name.to_s.split('.')
  value = @metadata

  parts.each do |part|
    # Stop as soon as the path tries to descend into something that is not a Hash.
    return nil unless value.is_a?(Hash)

    value = value[part]
  end

  value
end

#page_countInteger

Get the total number of pages in the document

Examples:

result = Kreuzberg.extract_file_sync("document.pdf")
puts "Document has #{result.page_count} pages"

Returns:

  • (Integer)

    Total page count (>= 0); 0 when the metadata carries no page information



415
416
417
418
419
420
421
# File 'lib/kreuzberg/result.rb', line 415

# Total page count taken from @metadata['pages']['total_count'].
#
# @return [Integer] the recorded count, or 0 when @metadata or its 'pages'
#   entry is not a Hash, or when 'total_count' is nil
def page_count
  return 0 unless @metadata.is_a?(Hash)

  page_info = @metadata['pages']
  return 0 unless page_info.is_a?(Hash)

  page_info['total_count'] || 0
end

#to_h ⇒ Hash

Convert to hash

rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength

Returns:

  • (Hash)

    Hash representation



374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
# File 'lib/kreuzberg/result.rb', line 374

# Hash representation of the result.
#
# Scalar fields are copied as-is, the serialize_* helpers supply the table,
# chunk, image, page, element and OCR-element entries, and the remaining
# collections are converted element by element with #to_h.
#
# @return [Hash]
def to_h
  # Converts each element with #to_h and passes nil through unchanged.
  hashes_or_nil = ->(items) { items&.map(&:to_h) }

  {
    content: @content,
    mime_type: @mime_type,
    metadata: @metadata,
    tables: serialize_tables,
    detected_languages: @detected_languages,
    chunks: serialize_chunks,
    images: serialize_images,
    pages: serialize_pages,
    elements: serialize_elements,
    ocr_elements: serialize_ocr_elements,
    djot_content: @djot_content&.to_h,
    document: @document&.to_h,
    extracted_keywords: hashes_or_nil.call(@extracted_keywords),
    quality_score: @quality_score,
    # Unlike its neighbours, this collection is mapped without a nil guard.
    processing_warnings: @processing_warnings.map(&:to_h),
    annotations: hashes_or_nil.call(@annotations),
    uris: hashes_or_nil.call(@uris),
    children: hashes_or_nil.call(@children),
    structured_output: @structured_output
  }
end

#to_json ⇒ String

Convert to JSON

Returns:

  • (String)

    JSON representation



403
404
405
# File 'lib/kreuzberg/result.rb', line 403

# JSON representation of the result: #to_h is serialized with Hash#to_json,
# and any positional arguments are forwarded to that call unchanged.
#
# @return [String]
def to_json(*args)
  to_h.to_json(*args)
end