Class: Kreuzberg::Result

Inherits:
Object
  • Object
show all
Defined in:
lib/kreuzberg/result.rb,
lib/kreuzberg/djot_content.rb,
lib/kreuzberg/document_structure.rb

Overview

rubocop:disable Metrics/ClassLength

Defined Under Namespace

Classes: Chunk, DjotContent, DocumentAnnotation, DocumentBoundingBox, DocumentNode, DocumentStructure, ElementBoundingBox, ElementMetadataStruct, ElementStruct, HierarchicalBlock, Image, OcrBoundingGeometry, OcrConfidence, OcrElement, OcrRotation, PageContent, PageHierarchy, Table

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(hash) ⇒ Result

Initialize from native hash result

rubocop:disable Metrics/AbcSize

Parameters:

  • hash (Hash)

    Hash returned from native extension



325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
# File 'lib/kreuzberg/result.rb', line 325

# Build a Result from the hash produced by the native extension.
#
# Every key is read through get_value. Scalar entries (content, mime_type,
# metadata_json, quality_score) are stored as returned; the remaining entries
# are passed to the matching parse_* helper before being stored.
# The third argument given to get_value ('' / '{}') is presumably a fallback
# for missing keys — confirm against get_value.
#
# @param hash [Hash] hash returned from native extension
def initialize(hash)
  @content = get_value(hash, 'content', '')
  @mime_type = get_value(hash, 'mime_type', '')
  @metadata_json = get_value(hash, 'metadata_json', '{}')
  # NOTE(review): as listed, this stores the raw metadata_json value in
  # @metadata, yet page_count, detected_language and the dotted-field lookup
  # only use @metadata when it is a Hash. A parser call name was likely lost
  # from this listing — confirm against lib/kreuzberg/result.rb.
  @metadata = (@metadata_json)
  @tables = parse_tables(get_value(hash, 'tables'))
  @detected_languages = parse_detected_languages(get_value(hash, 'detected_languages'))
  @chunks = parse_chunks(get_value(hash, 'chunks'))
  @images = parse_images(get_value(hash, 'images'))
  @pages = parse_pages(get_value(hash, 'pages'))
  @elements = parse_elements(get_value(hash, 'elements'))
  @ocr_elements = parse_ocr_elements(get_value(hash, 'ocr_elements'))
  @djot_content = parse_djot_content(get_value(hash, 'djot_content'))
  @document = parse_document_structure(get_value(hash, 'document'))
  @extracted_keywords = parse_extracted_keywords(get_value(hash, 'extracted_keywords'))
  @quality_score = get_value(hash, 'quality_score')
  @processing_warnings = parse_processing_warnings(get_value(hash, 'processing_warnings'))
  @annotations = parse_annotations(get_value(hash, 'annotations'))
end

Instance Attribute Details

#annotationsObject (readonly)

Returns the value of attribute annotations.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader: returns the @annotations instance variable unchanged.
def annotations = @annotations

#byte_endInteger (readonly)

Returns Ending byte offset (UTF-8).

Returns:

  • (Integer)

    Ending byte offset (UTF-8)



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/kreuzberg/result.rb', line 45

# Value object for a single text chunk, built with keyword arguments.
#
# Documented members: byte_start / byte_end are UTF-8 byte offsets,
# token_count is an approximate token count (may be nil), and
# first_page / last_page are 1-indexed page numbers (may be nil).
Chunk = Struct.new(
  :content, :byte_start, :byte_end, :token_count, :chunk_index,
  :total_chunks, :first_page, :last_page, :embedding,
  keyword_init: true
) do
  # @return [Hash] every member keyed by its Symbol name, in declaration order
  def to_h
    members.to_h { |field| [field, self[field]] }
  end
end

#byte_startInteger (readonly)

Returns Starting byte offset (UTF-8).

Returns:

  • (Integer)

    Starting byte offset (UTF-8)



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/kreuzberg/result.rb', line 45

# Value object for a single text chunk, built with keyword arguments.
#
# Documented members: byte_start / byte_end are UTF-8 byte offsets,
# token_count is an approximate token count (may be nil), and
# first_page / last_page are 1-indexed page numbers (may be nil).
Chunk = Struct.new(
  :content, :byte_start, :byte_end, :token_count, :chunk_index,
  :total_chunks, :first_page, :last_page, :embedding,
  keyword_init: true
) do
  # @return [Hash] every member keyed by its Symbol name, in declaration order
  def to_h
    members.to_h { |field| [field, self[field]] }
  end
end

#chunksObject (readonly)

Returns the value of attribute chunks.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader: returns the @chunks instance variable unchanged.
def chunks = @chunks

#contentString (readonly)

Returns Text content for this page.

Returns:

  • (String)

    Text content for this page



45
46
47
# File 'lib/kreuzberg/result.rb', line 45

# Reader: returns the @content instance variable unchanged.
def content = @content

#detected_languagesObject (readonly)

Returns the value of attribute detected_languages.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader: returns the @detected_languages instance variable unchanged.
def detected_languages = @detected_languages

#djot_contentObject (readonly)

Returns the value of attribute djot_content.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader: returns the @djot_content instance variable unchanged.
def djot_content = @djot_content

#documentObject (readonly)

Returns the value of attribute document.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader: returns the @document instance variable unchanged.
def document = @document

#elementsObject (readonly)

Returns the value of attribute elements.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader: returns the @elements instance variable unchanged.
def elements = @elements

#extracted_keywordsObject (readonly)

Returns the value of attribute extracted_keywords.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader: returns the @extracted_keywords instance variable unchanged.
def extracted_keywords = @extracted_keywords

#first_pageInteger? (readonly)

Returns First page number (1-indexed).

Returns:

  • (Integer, nil)

    First page number (1-indexed)



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/kreuzberg/result.rb', line 45

# Value object for a single text chunk, built with keyword arguments.
#
# Documented members: byte_start / byte_end are UTF-8 byte offsets,
# token_count is an approximate token count (may be nil), and
# first_page / last_page are 1-indexed page numbers (may be nil).
Chunk = Struct.new(
  :content, :byte_start, :byte_end, :token_count, :chunk_index,
  :total_chunks, :first_page, :last_page, :embedding,
  keyword_init: true
) do
  # @return [Hash] every member keyed by its Symbol name, in declaration order
  def to_h
    members.to_h { |field| [field, self[field]] }
  end
end

#hierarchyPageHierarchy? (readonly)

Returns Hierarchy information for the page.

Returns:



147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/kreuzberg/result.rb', line 147

# Value object for one page, built with keyword arguments.
PageContent = Struct.new(
  :page_number, :content, :tables, :images, :hierarchy, :is_blank,
  keyword_init: true
) do
  # Hash form of the page. Each table and image is converted with its own
  # to_h; hierarchy is converted only when it is not nil.
  #
  # @return [Hash]
  def to_h
    table_hashes = tables.map(&:to_h)
    image_hashes = images.map(&:to_h)

    {
      page_number: page_number,
      content: content,
      tables: table_hashes,
      images: image_hashes,
      hierarchy: hierarchy&.to_h,
      is_blank: is_blank
    }
  end
end

#imagesArray<Image> (readonly)

Returns Images on this page.

Returns:

  • (Array<Image>)

    Images on this page



147
148
149
# File 'lib/kreuzberg/result.rb', line 147

# Reader: returns the @images instance variable unchanged.
def images = @images

#last_pageInteger? (readonly)

Returns Last page number (1-indexed).

Returns:

  • (Integer, nil)

    Last page number (1-indexed)



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/kreuzberg/result.rb', line 45

# Value object for a single text chunk, built with keyword arguments.
#
# Documented members: byte_start / byte_end are UTF-8 byte offsets,
# token_count is an approximate token count (may be nil), and
# first_page / last_page are 1-indexed page numbers (may be nil).
Chunk = Struct.new(
  :content, :byte_start, :byte_end, :token_count, :chunk_index,
  :total_chunks, :first_page, :last_page, :embedding,
  keyword_init: true
) do
  # @return [Hash] every member keyed by its Symbol name, in declaration order
  def to_h
    members.to_h { |field| [field, self[field]] }
  end
end

#metadataObject (readonly)

Returns the value of attribute metadata.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the metadata attribute (the method name was missing from this
# listing; restored from the attribute heading).
#
# @return [Object] the value stored in @metadata
def metadata
  @metadata
end

#metadata_jsonObject (readonly)

Returns the value of attribute metadata_json.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the metadata_json attribute (the method name was missing from
# this listing; restored from the attribute heading).
#
# @return [Object] the value stored in @metadata_json
def metadata_json
  @metadata_json
end

#mime_typeObject (readonly)

Returns the value of attribute mime_type.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader: returns the @mime_type instance variable unchanged.
def mime_type = @mime_type

#ocr_elementsObject (readonly)

Returns the value of attribute ocr_elements.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader: returns the @ocr_elements instance variable unchanged.
def ocr_elements = @ocr_elements

#page_numberInteger (readonly)

Returns Page number (1-indexed).

Returns:

  • (Integer)

    Page number (1-indexed)



147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/kreuzberg/result.rb', line 147

# Value object for one page, built with keyword arguments.
PageContent = Struct.new(
  :page_number, :content, :tables, :images, :hierarchy, :is_blank,
  keyword_init: true
) do
  # Hash form of the page. Each table and image is converted with its own
  # to_h; hierarchy is converted only when it is not nil.
  #
  # @return [Hash]
  def to_h
    table_hashes = tables.map(&:to_h)
    image_hashes = images.map(&:to_h)

    {
      page_number: page_number,
      content: content,
      tables: table_hashes,
      images: image_hashes,
      hierarchy: hierarchy&.to_h,
      is_blank: is_blank
    }
  end
end

#pagesObject (readonly)

Returns the value of attribute pages.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader: returns the @pages instance variable unchanged.
def pages = @pages

#processing_warningsObject (readonly)

Returns the value of attribute processing_warnings.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader: returns the @processing_warnings instance variable unchanged.
def processing_warnings = @processing_warnings

#quality_scoreObject (readonly)

Returns the value of attribute quality_score.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader: returns the @quality_score instance variable unchanged.
def quality_score = @quality_score

#tablesArray<Table> (readonly)

Returns Tables on this page.

Returns:

  • (Array<Table>)

    Tables on this page



147
148
149
# File 'lib/kreuzberg/result.rb', line 147

# Reader: returns the @tables instance variable unchanged.
def tables = @tables

#token_countInteger? (readonly)

Returns Approximate token count (may be nil).

Returns:

  • (Integer, nil)

    Approximate token count (may be nil)



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/kreuzberg/result.rb', line 45

# Value object for a single text chunk, built with keyword arguments.
#
# Documented members: byte_start / byte_end are UTF-8 byte offsets,
# token_count is an approximate token count (may be nil), and
# first_page / last_page are 1-indexed page numbers (may be nil).
Chunk = Struct.new(
  :content, :byte_start, :byte_end, :token_count, :chunk_index,
  :total_chunks, :first_page, :last_page, :embedding,
  keyword_init: true
) do
  # @return [Hash] every member keyed by its Symbol name, in declaration order
  def to_h
    members.to_h { |field| [field, self[field]] }
  end
end

Instance Method Details

#chunk_countInteger

Get the total number of text chunks

Returns 0 if chunking was not performed.

Examples:

result = Kreuzberg.extract_file_sync("document.pdf")
puts "Document has #{result.chunk_count} chunks"

Returns:

  • (Integer)

    Total chunk count (>= 0); 0 when chunking was not performed



407
408
409
# File 'lib/kreuzberg/result.rb', line 407

# Number of entries in @chunks.
#
# @return [Integer] length of @chunks, or 0 when @chunks is nil
def chunk_count
  return 0 if @chunks.nil?

  @chunks.length
end

#detected_languageString?

Get the primary detected language

Examples:

result = Kreuzberg.extract_file_sync("document.pdf")
lang = result.detected_language
puts "Language: #{lang}" if lang

Returns:

  • (String, nil)

    ISO 639 language code (e.g., “en”, “de”), or nil if not detected



420
421
422
423
424
425
# File 'lib/kreuzberg/result.rb', line 420

# Primary detected language.
#
# Prefers the 'language' entry of @metadata (only when @metadata is a Hash
# and the entry is truthy); otherwise falls back to the first element of
# @detected_languages when that collection reports any truthy element.
#
# @return [String, nil] language code, or nil if not detected
def detected_language
  from_metadata = @metadata['language'] if @metadata.is_a?(Hash)
  return from_metadata if from_metadata

  @detected_languages.first if @detected_languages&.any?
end

#metadata_field(name) ⇒ Object?

Get a metadata field by name

Supports dot notation for nested fields (e.g., “format.pages”).

Examples:

Get a top-level field

result = Kreuzberg.extract_file_sync("document.pdf")
title = result.metadata_field("title")
puts "Title: #{title}" if title

Get a nested field

format_info = result.metadata_field("format.pages")

Parameters:

  • name (String, Symbol)

    Field name

Returns:

  • (Object, nil)

    Field value, or nil if field doesn’t exist



442
443
444
445
446
447
448
449
450
451
452
453
454
455
# File 'lib/kreuzberg/result.rb', line 442

# Get a metadata field by name (the method name was missing from this
# listing; restored from the method heading).
#
# The name is converted to a String and split on '.'; each segment is then
# used as a String key to descend through nested Hashes in @metadata.
#
# @param name [String, Symbol] field name, e.g. "title" or "format.pages"
# @return [Object, nil] the value found; nil when @metadata is not a Hash,
#   when a segment is absent, or when an intermediate value is not a Hash
def metadata_field(name)
  return nil unless @metadata.is_a?(Hash)

  parts = name.to_s.split('.')
  value = @metadata

  parts.each do |part|
    # Stop descending as soon as the current value cannot be indexed by key.
    return nil unless value.is_a?(Hash)

    value = value[part]
  end

  value
end

#page_countInteger

Get the total number of pages in the document

Examples:

result = Kreuzberg.extract_file_sync("document.pdf")
puts "Document has #{result.page_count} pages"

Returns:

  • (Integer)

    Total page count (>= 0); 0 when the metadata carries no page total



389
390
391
392
393
394
395
# File 'lib/kreuzberg/result.rb', line 389

# Total page count read from @metadata['pages']['total_count'].
#
# @return [Integer] the stored total, or 0 when @metadata or its 'pages'
#   entry is not a Hash, or when 'total_count' is nil/false
def page_count
  return 0 unless @metadata.is_a?(Hash)

  page_info = @metadata['pages']
  return 0 unless page_info.is_a?(Hash)

  page_info['total_count'] || 0
end

#to_hHash

Convert to hash

rubocop:disable Metrics/CyclomaticComplexity

Returns:

  • (Hash)

    Hash representation



351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
# File 'lib/kreuzberg/result.rb', line 351

# Convert the result to a Hash.
#
# Tables, chunks, images, pages, elements and OCR elements come from the
# serialize_* helpers. djot_content, document, extracted_keywords and
# annotations are converted via to_h only when not nil; every entry of
# @processing_warnings is converted via to_h.
#
# @return [Hash] Hash representation
def to_h
  tables = serialize_tables
  chunks = serialize_chunks
  images = serialize_images
  pages = serialize_pages
  elements = serialize_elements
  ocr_elements = serialize_ocr_elements
  djot = @djot_content&.to_h
  structure = @document&.to_h
  keywords = @extracted_keywords&.map(&:to_h)
  warnings = @processing_warnings.map(&:to_h)
  notes = @annotations&.map(&:to_h)

  {
    content: @content, mime_type: @mime_type, metadata: @metadata,
    tables: tables, detected_languages: @detected_languages,
    chunks: chunks, images: images, pages: pages,
    elements: elements, ocr_elements: ocr_elements,
    djot_content: djot, document: structure,
    extracted_keywords: keywords, quality_score: @quality_score,
    processing_warnings: warnings, annotations: notes
  }
end

#to_jsonString

Convert to JSON

Returns:

  • (String)

    JSON representation



377
378
379
# File 'lib/kreuzberg/result.rb', line 377

# Convert to JSON by serializing the #to_h representation; any arguments are
# forwarded to the Hash's to_json.
#
# @return [String] JSON representation
def to_json(*generator_args)
  to_h.to_json(*generator_args)
end