Class: Kreuzberg::Result

Inherits:
Object
  • Object
show all
Defined in:
lib/kreuzberg/result.rb,
lib/kreuzberg/djot_content.rb,
lib/kreuzberg/document_structure.rb

Overview

rubocop:disable Metrics/ClassLength

Defined Under Namespace

Classes: Chunk, DjotContent, DocumentAnnotation, DocumentBoundingBox, DocumentNode, DocumentStructure, ElementBoundingBox, ElementMetadataStruct, ElementStruct, HierarchicalBlock, Image, OcrBoundingGeometry, OcrConfidence, OcrElement, OcrRotation, PageContent, PageHierarchy, Table

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(hash) ⇒ Result

Initialize from native hash result

rubocop:disable Metrics/AbcSize

Parameters:

  • hash (Hash)

    Hash returned from native extension



321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
# File 'lib/kreuzberg/result.rb', line 321

# Build a Result from the hash handed back by the native extension.
#
# Each field is read through get_value and, for structured fields, handed to
# the matching parse_* helper. Those helpers are defined outside this listing,
# so their exact behaviour is not visible here.
#
# @param hash [Hash] Hash returned from native extension
def initialize(hash)
  # Scalar fields. The third get_value argument is presumably the fallback used
  # when the key is missing -- confirm against get_value.
  @content = get_value(hash, 'content', '')
  @mime_type = get_value(hash, 'mime_type', '')
  @metadata_json = get_value(hash, 'metadata_json', '{}')
  # NOTE(review): as listed, this stores the raw JSON string in @metadata, yet
  # detected_language and page_count both test @metadata.is_a?(Hash). A parsing
  # call name appears to have been dropped from this listing -- confirm against
  # lib/kreuzberg/result.rb before relying on it.
  @metadata = (@metadata_json)
  # Structured fields: raw values are converted by their parse_* helper.
  @tables = parse_tables(get_value(hash, 'tables'))
  @detected_languages = parse_detected_languages(get_value(hash, 'detected_languages'))
  @chunks = parse_chunks(get_value(hash, 'chunks'))
  @images = parse_images(get_value(hash, 'images'))
  @pages = parse_pages(get_value(hash, 'pages'))
  @elements = parse_elements(get_value(hash, 'elements'))
  @ocr_elements = parse_ocr_elements(get_value(hash, 'ocr_elements'))
  @djot_content = parse_djot_content(get_value(hash, 'djot_content'))
  @document = parse_document_structure(get_value(hash, 'document'))
  @extracted_keywords = parse_extracted_keywords(get_value(hash, 'extracted_keywords'))
  # quality_score is stored exactly as returned by get_value (no parse step).
  @quality_score = get_value(hash, 'quality_score')
  @processing_warnings = parse_processing_warnings(get_value(hash, 'processing_warnings'))
end

Instance Attribute Details

#byte_end ⇒ Integer (readonly)

Returns Ending byte offset (UTF-8).

Returns:

  • (Integer)

    Ending byte offset (UTF-8)



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/kreuzberg/result.rb', line 43

# Keyword-initialised value object describing one text chunk.
#
# Members (in declaration order): content, byte_start, byte_end, token_count,
# chunk_index, total_chunks, first_page, last_page, embedding.
# Any member not supplied to .new is nil.
Chunk = Struct.new(
  *%i[content byte_start byte_end token_count chunk_index],
  *%i[total_chunks first_page last_page embedding],
  keyword_init: true
) do
  # Convert to hash
  #
  # @return [Hash] every member keyed by its symbol name, in declaration order
  def to_h
    each_pair.to_h
  end
end

#byte_start ⇒ Integer (readonly)

Returns Starting byte offset (UTF-8).

Returns:

  • (Integer)

    Starting byte offset (UTF-8)



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/kreuzberg/result.rb', line 43

# Keyword-initialised value object describing one text chunk.
#
# Members (in declaration order): content, byte_start, byte_end, token_count,
# chunk_index, total_chunks, first_page, last_page, embedding.
# Any member not supplied to .new is nil.
Chunk = Struct.new(
  *%i[content byte_start byte_end token_count chunk_index],
  *%i[total_chunks first_page last_page embedding],
  keyword_init: true
) do
  # Convert to hash
  #
  # @return [Hash] every member keyed by its symbol name, in declaration order
  def to_h
    each_pair.to_h
  end
end

#chunks ⇒ Object (readonly)

Returns the value of attribute chunks.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# @return [Object] current value of the chunks attribute
def chunks = @chunks

#content ⇒ String (readonly)

Returns Text content for this page.

Returns:

  • (String)

    Text content for this page



43
44
45
# File 'lib/kreuzberg/result.rb', line 43

# @return [String] current value of the content attribute
def content = @content

#detected_languages ⇒ Object (readonly)

Returns the value of attribute detected_languages.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# @return [Object] current value of the detected_languages attribute
def detected_languages = @detected_languages

#djot_content ⇒ Object (readonly)

Returns the value of attribute djot_content.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# @return [Object] current value of the djot_content attribute
def djot_content = @djot_content

#document ⇒ Object (readonly)

Returns the value of attribute document.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# @return [Object] current value of the document attribute
def document = @document

#elements ⇒ Object (readonly)

Returns the value of attribute elements.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# @return [Object] current value of the elements attribute
def elements = @elements

#extracted_keywords ⇒ Object (readonly)

Returns the value of attribute extracted_keywords.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# @return [Object] current value of the extracted_keywords attribute
def extracted_keywords = @extracted_keywords

#first_page ⇒ Integer? (readonly)

Returns First page number (1-indexed).

Returns:

  • (Integer, nil)

    First page number (1-indexed)



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/kreuzberg/result.rb', line 43

# Keyword-initialised value object describing one text chunk.
#
# Members (in declaration order): content, byte_start, byte_end, token_count,
# chunk_index, total_chunks, first_page, last_page, embedding.
# Any member not supplied to .new is nil.
Chunk = Struct.new(
  *%i[content byte_start byte_end token_count chunk_index],
  *%i[total_chunks first_page last_page embedding],
  keyword_init: true
) do
  # Convert to hash
  #
  # @return [Hash] every member keyed by its symbol name, in declaration order
  def to_h
    each_pair.to_h
  end
end

#hierarchy ⇒ PageHierarchy? (readonly)

Returns Hierarchy information for the page.

Returns:

  • (PageHierarchy, nil)

    Hierarchy information for the page



143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/kreuzberg/result.rb', line 143

# Keyword-initialised value object holding the extracted content of one page.
#
# Members: page_number, content, tables, images, hierarchy, is_blank.
# Any member not supplied to .new is nil.
PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank, keyword_init: true) do
  # Convert to hash
  #
  # Each entry of tables and images is converted with its own #to_h. A nil
  # tables/images member (the default when the keyword is omitted) serialises
  # as an empty array instead of raising NoMethodError, matching the nil-safe
  # handling already applied to hierarchy.
  #
  # @return [Hash] Hash representation
  def to_h
    {
      page_number: page_number,
      content: content,
      tables: (tables || []).map(&:to_h),
      images: (images || []).map(&:to_h),
      hierarchy: hierarchy&.to_h,
      is_blank: is_blank
    }
  end
end

#images ⇒ Array<Image> (readonly)

Returns Images on this page.

Returns:

  • (Array<Image>)

    Images on this page



143
144
145
# File 'lib/kreuzberg/result.rb', line 143

# @return [Array<Image>] current value of the images attribute
def images = @images

#last_page ⇒ Integer? (readonly)

Returns Last page number (1-indexed).

Returns:

  • (Integer, nil)

    Last page number (1-indexed)



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/kreuzberg/result.rb', line 43

# Keyword-initialised value object describing one text chunk.
#
# Members (in declaration order): content, byte_start, byte_end, token_count,
# chunk_index, total_chunks, first_page, last_page, embedding.
# Any member not supplied to .new is nil.
Chunk = Struct.new(
  *%i[content byte_start byte_end token_count chunk_index],
  *%i[total_chunks first_page last_page embedding],
  keyword_init: true
) do
  # Convert to hash
  #
  # @return [Hash] every member keyed by its symbol name, in declaration order
  def to_h
    each_pair.to_h
  end
end

#metadata ⇒ Object (readonly)

Returns the value of attribute metadata.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the metadata attribute (the method name was missing from the
# listing, leaving an invalid `def`).
#
# @return [Object] current value of the metadata attribute
def metadata
  @metadata
end

#metadata_json ⇒ Object (readonly)

Returns the value of attribute metadata_json.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the metadata_json attribute (the method name was missing from the
# listing, leaving an invalid `def`).
#
# @return [Object] current value of the metadata_json attribute
def metadata_json
  @metadata_json
end

#mime_type ⇒ Object (readonly)

Returns the value of attribute mime_type.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# @return [Object] current value of the mime_type attribute
def mime_type = @mime_type

#ocr_elements ⇒ Object (readonly)

Returns the value of attribute ocr_elements.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# @return [Object] current value of the ocr_elements attribute
def ocr_elements = @ocr_elements

#page_number ⇒ Integer (readonly)

Returns Page number (1-indexed).

Returns:

  • (Integer)

    Page number (1-indexed)



143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/kreuzberg/result.rb', line 143

# Keyword-initialised value object holding the extracted content of one page.
#
# Members: page_number, content, tables, images, hierarchy, is_blank.
# Any member not supplied to .new is nil.
PageContent = Struct.new(:page_number, :content, :tables, :images, :hierarchy, :is_blank, keyword_init: true) do
  # Convert to hash
  #
  # Each entry of tables and images is converted with its own #to_h. A nil
  # tables/images member (the default when the keyword is omitted) serialises
  # as an empty array instead of raising NoMethodError, matching the nil-safe
  # handling already applied to hierarchy.
  #
  # @return [Hash] Hash representation
  def to_h
    {
      page_number: page_number,
      content: content,
      tables: (tables || []).map(&:to_h),
      images: (images || []).map(&:to_h),
      hierarchy: hierarchy&.to_h,
      is_blank: is_blank
    }
  end
end

#pages ⇒ Object (readonly)

Returns the value of attribute pages.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# @return [Object] current value of the pages attribute
def pages = @pages

#processing_warnings ⇒ Object (readonly)

Returns the value of attribute processing_warnings.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# @return [Object] current value of the processing_warnings attribute
def processing_warnings = @processing_warnings

#quality_score ⇒ Object (readonly)

Returns the value of attribute quality_score.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# @return [Object] current value of the quality_score attribute
def quality_score = @quality_score

#tables ⇒ Array<Table> (readonly)

Returns Tables on this page.

Returns:

  • (Array<Table>)

    Tables on this page



143
144
145
# File 'lib/kreuzberg/result.rb', line 143

# @return [Array<Table>] current value of the tables attribute
def tables = @tables

#token_count ⇒ Integer? (readonly)

Returns Approximate token count (may be nil).

Returns:

  • (Integer, nil)

    Approximate token count (may be nil)



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/kreuzberg/result.rb', line 43

# Keyword-initialised value object describing one text chunk.
#
# Members (in declaration order): content, byte_start, byte_end, token_count,
# chunk_index, total_chunks, first_page, last_page, embedding.
# Any member not supplied to .new is nil.
Chunk = Struct.new(
  *%i[content byte_start byte_end token_count chunk_index],
  *%i[total_chunks first_page last_page embedding],
  keyword_init: true
) do
  # Convert to hash
  #
  # @return [Hash] every member keyed by its symbol name, in declaration order
  def to_h
    each_pair.to_h
  end
end

Instance Method Details

#chunk_count ⇒ Integer

Get the total number of text chunks

Returns 0 if chunking was not performed.

Examples:

result = Kreuzberg.extract_file_sync("document.pdf")
puts "Document has #{result.chunk_count} chunks"

Returns:

  • (Integer)

    Total chunk count (>= 0); 0 when no chunks are present



399
400
401
# File 'lib/kreuzberg/result.rb', line 399

# Get the total number of text chunks
#
# @return [Integer] length of the chunks collection, or 0 when it is nil
def chunk_count
  total = @chunks.length unless @chunks.nil?
  total || 0
end

#detected_language ⇒ String?

Get the primary detected language

Examples:

result = Kreuzberg.extract_file_sync("document.pdf")
lang = result.detected_language
puts "Language: #{lang}" if lang

Returns:

  • (String, nil)

    ISO 639 language code (e.g., “en”, “de”), or nil if not detected



412
413
414
415
416
417
# File 'lib/kreuzberg/result.rb', line 412

# Get the primary detected language
#
# Prefers the 'language' entry of the metadata hash; otherwise falls back to
# the first entry of the detected languages list.
#
# @return [String, nil] language code, or nil if not detected
def detected_language
  from_metadata = @metadata['language'] if @metadata.is_a?(Hash)
  return from_metadata if from_metadata

  candidates = @detected_languages
  candidates.first if !candidates.nil? && candidates.any?
end

#metadata_field(name) ⇒ Object?

Get a metadata field by name

Supports dot notation for nested fields (e.g., “format.pages”).

Examples:

Get a top-level field

result = Kreuzberg.extract_file_sync("document.pdf")
title = result.metadata_field("title")
puts "Title: #{title}" if title

Get a nested field

format_info = result.metadata_field("format.pages")

Parameters:

  • name (String, Symbol)

    Field name

Returns:

  • (Object, nil)

    Field value, or nil if field doesn’t exist



434
435
436
437
438
439
440
441
442
443
444
445
446
447
# File 'lib/kreuzberg/result.rb', line 434

# Get a metadata field by name
#
# Supports dot notation for nested fields (e.g. "format.pages"): the name is
# split on '.' and each segment is looked up, as a String key, in the hash
# reached so far. (The method name was missing from the listing, leaving an
# invalid `def (name)`.)
#
# @param name [String, Symbol] Field name
# @return [Object, nil] Field value, or nil if metadata is not a Hash or a
#   segment has to be looked up in something that is not a Hash
def metadata_field(name)
  return nil unless @metadata.is_a?(Hash)

  name.to_s.split('.').reduce(@metadata) do |value, part|
    # A non-Hash intermediate value means the path goes deeper than the data.
    return nil unless value.is_a?(Hash)

    value[part]
  end
end

#page_count ⇒ Integer

Get the total number of pages in the document

Examples:

result = Kreuzberg.extract_file_sync("document.pdf")
puts "Document has #{result.page_count} pages"

Returns:

  • (Integer)

    Total page count (>= 0); 0 when the metadata carries no page total



381
382
383
384
385
386
387
# File 'lib/kreuzberg/result.rb', line 381

# Get the total number of pages in the document
#
# Reads 'total_count' from the 'pages' hash inside the metadata hash.
#
# @return [Integer] the stored total, or 0 when metadata or its 'pages' entry
#   is not a Hash, or the total is absent
def page_count
  return 0 unless @metadata.is_a?(Hash)

  page_info = @metadata['pages']
  return 0 unless page_info.is_a?(Hash)

  page_info['total_count'] || 0
end

#to_h ⇒ Hash

Convert to hash

Returns:

  • (Hash)

    Hash representation



345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
# File 'lib/kreuzberg/result.rb', line 345

# Convert to hash
#
# The hash is assembled from three groups merged in order, so the key order is
# content, mime_type, metadata, tables, detected_languages, chunks, images,
# pages, elements, ocr_elements, djot_content, document, extracted_keywords,
# quality_score, processing_warnings. The serialize_* helpers are defined
# outside this listing.
#
# @return [Hash] Hash representation
def to_h
  header = { content: @content, mime_type: @mime_type, metadata: @metadata }
  collections = {
    tables: serialize_tables,
    detected_languages: @detected_languages,
    chunks: serialize_chunks,
    images: serialize_images,
    pages: serialize_pages,
    elements: serialize_elements,
    ocr_elements: serialize_ocr_elements
  }
  extras = {
    djot_content: @djot_content&.to_h,
    document: @document&.to_h,
    extracted_keywords: @extracted_keywords&.map(&:to_h),
    quality_score: @quality_score,
    processing_warnings: @processing_warnings.map(&:to_h)
  }
  header.merge(collections, extras)
end

#to_json ⇒ String

Convert to JSON

Returns:

  • (String)

    JSON representation



369
370
371
# File 'lib/kreuzberg/result.rb', line 369

# Convert to JSON
#
# Serialises the hash produced by #to_h, forwarding any positional arguments
# unchanged to that hash's #to_json.
#
# @return [String] JSON representation
def to_json(*)
  as_hash = to_h
  as_hash.to_json(*)
end