Class: Kreuzberg::Result

Inherits:
Object
  • Object
show all
Defined in:
lib/kreuzberg/result.rb,
lib/kreuzberg/djot_content.rb,
lib/kreuzberg/document_structure.rb

Overview

Result of a document extraction, initialized from the hash returned by the native extension.

Defined Under Namespace

Classes: Chunk, DjotContent, DocumentAnnotation, DocumentBoundingBox, DocumentNode, DocumentStructure, ElementBoundingBox, ElementMetadataStruct, ElementStruct, HierarchicalBlock, Image, OcrBoundingGeometry, OcrConfidence, OcrElement, OcrRotation, PageContent, PageHierarchy, Table

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(hash) ⇒ Result

Initialize from the hash returned by the native extension.

Parameters:

  • hash (Hash)

    Hash returned from native extension



325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
# File 'lib/kreuzberg/result.rb', line 325

# Initialize from the hash returned by the native extension.
#
# Scalar fields are read with +get_value+ (with a fallback default where one
# is supplied); structured fields are passed through their +parse_*+ helper.
#
# NOTE(review): +parse_metadata+ is restored here by analogy with the
# +parse_*+ helpers used for every other structured field. Without a parsing
# step @metadata would hold the raw JSON string and the Hash checks in
# #page_count, #detected_language and #metadata_field could never succeed.
# Confirm the helper name against lib/kreuzberg/result.rb.
#
# @param hash [Hash] Hash returned from native extension
def initialize(hash)
  @content = get_value(hash, 'content', '')
  @mime_type = get_value(hash, 'mime_type', '')
  @metadata_json = get_value(hash, 'metadata_json', '{}')
  @metadata = parse_metadata(@metadata_json)
  @tables = parse_tables(get_value(hash, 'tables'))
  @detected_languages = parse_detected_languages(get_value(hash, 'detected_languages'))
  @chunks = parse_chunks(get_value(hash, 'chunks'))
  @images = parse_images(get_value(hash, 'images'))
  @pages = parse_pages(get_value(hash, 'pages'))
  @elements = parse_elements(get_value(hash, 'elements'))
  @ocr_elements = parse_ocr_elements(get_value(hash, 'ocr_elements'))
  @djot_content = parse_djot_content(get_value(hash, 'djot_content'))
  @document = parse_document_structure(get_value(hash, 'document'))
  @extracted_keywords = parse_extracted_keywords(get_value(hash, 'extracted_keywords'))
  @quality_score = get_value(hash, 'quality_score')
  @processing_warnings = parse_processing_warnings(get_value(hash, 'processing_warnings'))
end

Instance Attribute Details

#byte_end ⇒ Integer (readonly)

Returns Ending byte offset (UTF-8).

Returns:

  • (Integer)

    Ending byte offset (UTF-8)



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/kreuzberg/result.rb', line 45

# A chunk of extracted text.
#
# byte_start / byte_end are UTF-8 byte offsets, token_count is an approximate
# token count (may be nil), and first_page / last_page are 1-indexed page
# numbers (may be nil).
Chunk = Struct.new(
  :content, :byte_start, :byte_end,
  :token_count, :chunk_index, :total_chunks,
  :first_page, :last_page, :embedding,
  keyword_init: true
) do
  # Convert the chunk to a Hash keyed by member name, in declaration order.
  #
  # @return [Hash{Symbol => Object}]
  def to_h
    members.zip(values).to_h
  end
end

#byte_start ⇒ Integer (readonly)

Returns Starting byte offset (UTF-8).

Returns:

  • (Integer)

    Starting byte offset (UTF-8)



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/kreuzberg/result.rb', line 45

# A chunk of extracted text.
#
# byte_start / byte_end are UTF-8 byte offsets, token_count is an approximate
# token count (may be nil), and first_page / last_page are 1-indexed page
# numbers (may be nil).
Chunk = Struct.new(
  :content, :byte_start, :byte_end,
  :token_count, :chunk_index, :total_chunks,
  :first_page, :last_page, :embedding,
  keyword_init: true
) do
  # Convert the chunk to a Hash keyed by member name, in declaration order.
  #
  # @return [Hash{Symbol => Object}]
  def to_h
    members.zip(values).to_h
  end
end

#chunks ⇒ Object (readonly)

Returns the value of attribute chunks.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the chunks attribute.
#
# @return [Object] the value held in @chunks
def chunks = @chunks

#content ⇒ String (readonly)

Returns Text content for this page.

Returns:

  • (String)

    Text content for this page



45
46
47
# File 'lib/kreuzberg/result.rb', line 45

# Reader for the content attribute.
#
# @return [String] the value held in @content
def content = @content

#detected_languages ⇒ Object (readonly)

Returns the value of attribute detected_languages.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the detected_languages attribute.
#
# @return [Object] the value held in @detected_languages
def detected_languages = @detected_languages

#djot_content ⇒ Object (readonly)

Returns the value of attribute djot_content.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the djot_content attribute.
#
# @return [Object] the value held in @djot_content
def djot_content = @djot_content

#document ⇒ Object (readonly)

Returns the value of attribute document.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the document attribute.
#
# @return [Object] the value held in @document
def document = @document

#elements ⇒ Object (readonly)

Returns the value of attribute elements.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the elements attribute.
#
# @return [Object] the value held in @elements
def elements = @elements

#extracted_keywords ⇒ Object (readonly)

Returns the value of attribute extracted_keywords.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the extracted_keywords attribute.
#
# @return [Object] the value held in @extracted_keywords
def extracted_keywords = @extracted_keywords

#first_page ⇒ Integer? (readonly)

Returns First page number (1-indexed).

Returns:

  • (Integer, nil)

    First page number (1-indexed)



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/kreuzberg/result.rb', line 45

# A chunk of extracted text.
#
# byte_start / byte_end are UTF-8 byte offsets, token_count is an approximate
# token count (may be nil), and first_page / last_page are 1-indexed page
# numbers (may be nil).
Chunk = Struct.new(
  :content, :byte_start, :byte_end,
  :token_count, :chunk_index, :total_chunks,
  :first_page, :last_page, :embedding,
  keyword_init: true
) do
  # Convert the chunk to a Hash keyed by member name, in declaration order.
  #
  # @return [Hash{Symbol => Object}]
  def to_h
    members.zip(values).to_h
  end
end

#hierarchy ⇒ PageHierarchy? (readonly)

Returns Hierarchy information for the page.

Returns:

  • (PageHierarchy, nil)

    Hierarchy information for the page



147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/kreuzberg/result.rb', line 147

# Content extracted for a single page.
#
# Members: page_number (1-indexed), content, tables, images,
# hierarchy (may be nil) and is_blank.
PageContent = Struct.new(
  :page_number,
  :content,
  :tables,
  :images,
  :hierarchy,
  :is_blank,
  keyword_init: true
) do
  # Convert the page to a plain Hash. Tables, images and the optional
  # hierarchy are converted through their own +to_h+.
  #
  # @return [Hash{Symbol => Object}]
  def to_h
    table_hashes = tables.map { |table| table.to_h }
    image_hashes = images.map { |image| image.to_h }
    hierarchy_hash = hierarchy.nil? ? nil : hierarchy.to_h

    {
      page_number: page_number,
      content: content,
      tables: table_hashes,
      images: image_hashes,
      hierarchy: hierarchy_hash,
      is_blank: is_blank
    }
  end
end

#images ⇒ Array<Image> (readonly)

Returns Images on this page.

Returns:

  • (Array<Image>)

    Images on this page



147
148
149
# File 'lib/kreuzberg/result.rb', line 147

# Reader for the images attribute.
#
# @return [Array<Image>] the value held in @images
def images = @images

#last_page ⇒ Integer? (readonly)

Returns Last page number (1-indexed).

Returns:

  • (Integer, nil)

    Last page number (1-indexed)



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/kreuzberg/result.rb', line 45

# A chunk of extracted text.
#
# byte_start / byte_end are UTF-8 byte offsets, token_count is an approximate
# token count (may be nil), and first_page / last_page are 1-indexed page
# numbers (may be nil).
Chunk = Struct.new(
  :content, :byte_start, :byte_end,
  :token_count, :chunk_index, :total_chunks,
  :first_page, :last_page, :embedding,
  keyword_init: true
) do
  # Convert the chunk to a Hash keyed by member name, in declaration order.
  #
  # @return [Hash{Symbol => Object}]
  def to_h
    members.zip(values).to_h
  end
end

#metadata ⇒ Object (readonly)

Returns the value of attribute metadata.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the metadata attribute.
#
# @return [Object] the value held in @metadata
def metadata
  @metadata
end

#metadata_json ⇒ Object (readonly)

Returns the value of attribute metadata_json.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the metadata_json attribute.
#
# @return [Object] the value held in @metadata_json
def metadata_json
  @metadata_json
end

#mime_type ⇒ Object (readonly)

Returns the value of attribute mime_type.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the mime_type attribute.
#
# @return [Object] the value held in @mime_type
def mime_type = @mime_type

#ocr_elements ⇒ Object (readonly)

Returns the value of attribute ocr_elements.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the ocr_elements attribute.
#
# @return [Object] the value held in @ocr_elements
def ocr_elements = @ocr_elements

#page_number ⇒ Integer (readonly)

Returns Page number (1-indexed).

Returns:

  • (Integer)

    Page number (1-indexed)



147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/kreuzberg/result.rb', line 147

# Content extracted for a single page.
#
# Members: page_number (1-indexed), content, tables, images,
# hierarchy (may be nil) and is_blank.
PageContent = Struct.new(
  :page_number,
  :content,
  :tables,
  :images,
  :hierarchy,
  :is_blank,
  keyword_init: true
) do
  # Convert the page to a plain Hash. Tables, images and the optional
  # hierarchy are converted through their own +to_h+.
  #
  # @return [Hash{Symbol => Object}]
  def to_h
    table_hashes = tables.map { |table| table.to_h }
    image_hashes = images.map { |image| image.to_h }
    hierarchy_hash = hierarchy.nil? ? nil : hierarchy.to_h

    {
      page_number: page_number,
      content: content,
      tables: table_hashes,
      images: image_hashes,
      hierarchy: hierarchy_hash,
      is_blank: is_blank
    }
  end
end

#pages ⇒ Object (readonly)

Returns the value of attribute pages.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the pages attribute.
#
# @return [Object] the value held in @pages
def pages = @pages

#processing_warnings ⇒ Object (readonly)

Returns the value of attribute processing_warnings.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the processing_warnings attribute.
#
# @return [Object] the value held in @processing_warnings
def processing_warnings = @processing_warnings

#quality_score ⇒ Object (readonly)

Returns the value of attribute quality_score.



15
16
17
# File 'lib/kreuzberg/result.rb', line 15

# Reader for the quality_score attribute.
#
# @return [Object] the value held in @quality_score
def quality_score = @quality_score

#tables ⇒ Array<Table> (readonly)

Returns Tables on this page.

Returns:

  • (Array<Table>)

    Tables on this page



147
148
149
# File 'lib/kreuzberg/result.rb', line 147

# Reader for the tables attribute.
#
# @return [Array<Table>] the value held in @tables
def tables = @tables

#token_count ⇒ Integer? (readonly)

Returns Approximate token count (may be nil).

Returns:

  • (Integer, nil)

    Approximate token count (may be nil)



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/kreuzberg/result.rb', line 45

# A chunk of extracted text.
#
# byte_start / byte_end are UTF-8 byte offsets, token_count is an approximate
# token count (may be nil), and first_page / last_page are 1-indexed page
# numbers (may be nil).
Chunk = Struct.new(
  :content, :byte_start, :byte_end,
  :token_count, :chunk_index, :total_chunks,
  :first_page, :last_page, :embedding,
  keyword_init: true
) do
  # Convert the chunk to a Hash keyed by member name, in declaration order.
  #
  # @return [Hash{Symbol => Object}]
  def to_h
    members.zip(values).to_h
  end
end

Instance Method Details

#chunk_count ⇒ Integer

Get the total number of text chunks

Returns 0 if chunking was not performed.

Examples:

result = Kreuzberg.extract_file_sync("document.pdf")
puts "Document has #{result.chunk_count} chunks"

Returns:

  • (Integer)

    Total chunk count (>= 0); 0 if chunking was not performed



403
404
405
# File 'lib/kreuzberg/result.rb', line 403

# Get the total number of text chunks.
#
# @return [Integer] number of entries in @chunks, or 0 when @chunks is nil
def chunk_count
  return 0 if @chunks.nil?

  @chunks.length
end

#detected_language ⇒ String?

Get the primary detected language

Examples:

result = Kreuzberg.extract_file_sync("document.pdf")
lang = result.detected_language
puts "Language: #{lang}" if lang

Returns:

  • (String, nil)

    ISO 639 language code (e.g., “en”, “de”), or nil if not detected



416
417
418
419
420
421
# File 'lib/kreuzberg/result.rb', line 416

# Get the primary detected language.
#
# The 'language' entry of the metadata hash wins when present; otherwise the
# first entry of @detected_languages is used.
#
# @return [String, nil] language code, or nil if none was detected
def detected_language
  from_metadata = @metadata['language'] if @metadata.is_a?(Hash)
  return from_metadata if from_metadata

  @detected_languages&.first if @detected_languages&.any?
end

#metadata_field(name) ⇒ Object?

Get a metadata field by name

Supports dot notation for nested fields (e.g., “format.pages”).

Examples:

Get a top-level field

result = Kreuzberg.extract_file_sync("document.pdf")
title = result.metadata_field("title")
puts "Title: #{title}" if title

Get a nested field

format_info = result.metadata_field("format.pages")

Parameters:

  • name (String, Symbol)

    Field name

Returns:

  • (Object, nil)

    Field value, or nil if field doesn’t exist



438
439
440
441
442
443
444
445
446
447
448
449
450
451
# File 'lib/kreuzberg/result.rb', line 438

# Get a metadata field by name.
#
# Supports dot notation for nested fields (e.g. "format.pages"): the name is
# split on '.' and each segment is looked up as a String key in the current
# hash. The walk stops with nil as soon as a non-Hash value is reached while
# segments remain.
#
# @param name [String, Symbol] field name
# @return [Object, nil] field value, or nil if the field doesn't exist or the
#   metadata is not a Hash
def metadata_field(name)
  return nil unless @metadata.is_a?(Hash)

  name.to_s.split('.').reduce(@metadata) do |node, key|
    # An intermediate non-Hash value means the path cannot be followed.
    return nil unless node.is_a?(Hash)

    node[key]
  end
end

#page_count ⇒ Integer

Get the total number of pages in the document

Examples:

result = Kreuzberg.extract_file_sync("document.pdf")
puts "Document has #{result.page_count} pages"

Returns:

  • (Integer)

    Total page count (>= 0); 0 when the metadata carries no page information



385
386
387
388
389
390
391
# File 'lib/kreuzberg/result.rb', line 385

# Get the total number of pages in the document.
#
# Reads metadata['pages']['total_count']; any missing or non-Hash level along
# that path yields 0.
#
# @return [Integer] total page count, or 0 when unavailable
def page_count
  return 0 unless @metadata.is_a?(Hash)

  pages_info = @metadata['pages']
  return 0 unless pages_info.is_a?(Hash)

  pages_info['total_count'] || 0
end

#to_h ⇒ Hash

Convert to hash

Returns:

  • (Hash)

    Hash representation



349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
# File 'lib/kreuzberg/result.rb', line 349

# Convert the result to a plain Hash.
#
# Scalar fields are copied as-is; tables, chunks, images, pages, elements and
# OCR elements come from their serialize_* helpers; djot content, document
# structure, keywords and warnings are converted through +to_h+.
#
# @return [Hash] Hash representation
def to_h
  {
    content: @content,
    mime_type: @mime_type,
    metadata: @metadata,
    tables: serialize_tables,
    detected_languages: @detected_languages,
    chunks: serialize_chunks,
    images: serialize_images,
    pages: serialize_pages,
    elements: serialize_elements,
    ocr_elements: serialize_ocr_elements,
    # Safe navigation: these three tolerate a nil value.
    djot_content: @djot_content&.to_h,
    document: @document&.to_h,
    extracted_keywords: @extracted_keywords&.map { |keyword| keyword.to_h },
    quality_score: @quality_score,
    # NOTE(review): no nil guard here — presumably @processing_warnings is
    # always a collection after initialization; confirm.
    processing_warnings: @processing_warnings.map { |warning| warning.to_h }
  }
end

#to_json ⇒ String

Convert to JSON

Returns:

  • (String)

    JSON representation



373
374
375
# File 'lib/kreuzberg/result.rb', line 373

# Convert to JSON by serializing the Hash returned from #to_h.
#
# Positional arguments are forwarded unchanged to the hash's +to_json+.
#
# @return [String] JSON representation
def to_json(*args) = to_h.to_json(*args)