Class: Kreuzberg::Config::Extraction

Inherits:
Object
  • Object
show all
Defined in:
lib/kreuzberg/config.rb

Overview

Main extraction configuration

Examples:

Basic usage

config = Extraction.new(use_cache: true, force_ocr: true)

With OCR

ocr = Config::OCR.new(backend: "tesseract", language: "eng")
config = Extraction.new(ocr: ocr)

With image extraction

image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
config = Extraction.new(image_extraction: image)

With post-processing

postprocessor = Config::PostProcessor.new(
  enabled: true,
  enabled_processors: ["quality"]
)
config = Extraction.new(postprocessor: postprocessor)

With document structure

config = Extraction.new(include_document_structure: true)

With all options

config = Extraction.new(
  use_cache: true,
  enable_quality_processing: true,
  force_ocr: false,
  include_document_structure: true,
  ocr: Config::OCR.new(language: "deu"),
  chunking: Config::Chunking.new(max_chars: 500),
  language_detection: Config::LanguageDetection.new(enabled: true),
  pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
  image_extraction: Config::ImageExtraction.new(target_dpi: 600),
  postprocessor: Config::PostProcessor.new(enabled: true)
)

Constant Summary collapse

ALLOWED_KEYS =

Load configuration from a file.

Detects the file format from the extension (.toml, .yaml, .json) and loads the configuration accordingly.

Keys that are allowed in the Extraction config

Examples:

Load from TOML

config = Kreuzberg::Config::Extraction.from_file("config.toml")

Load from YAML

config = Kreuzberg::Config::Extraction.from_file("config.yaml")

Returns:

%i[
  use_cache enable_quality_processing force_ocr include_document_structure ocr chunking
  language_detection pdf_options image_extraction
  postprocessor token_reduction keywords html_options pages
  max_concurrent_extractions output_format result_format
  security_limits
].freeze
KEY_ALIASES =

Aliases for backward compatibility

{
  images: :image_extraction
}.freeze
VALID_OUTPUT_FORMATS =

Valid output format values (case-insensitive, normalized internally)

%w[plain markdown html djot].freeze
VALID_RESULT_FORMATS =

Valid result format values (case-insensitive, normalized internally)

%w[unified elements element_based].freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(hash = nil, use_cache: true, enable_quality_processing: true, force_ocr: false, include_document_structure: false, ocr: nil, chunking: nil, language_detection: nil, pdf_options: nil, image_extraction: nil, postprocessor: nil, token_reduction: nil, keywords: nil, html_options: nil, pages: nil, max_concurrent_extractions: nil, output_format: nil, result_format: nil, security_limits: nil) ⇒ Extraction

Returns a new instance of Extraction.



932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
# File 'lib/kreuzberg/config.rb', line 932

# Build an extraction configuration from keyword arguments and/or a hash.
#
# The keyword values are gathered into a hash keyed by option name and
# handed to extract_from_hash together with the optional positional +hash+;
# whatever that returns is passed to assign_attributes, which sets the
# instance variables.
#
# @param hash [Hash, nil] optional positional settings, combined with the
#   keyword values by extract_from_hash
# @return [Extraction]
def initialize(hash = nil,
               use_cache: true,
               enable_quality_processing: true,
               force_ocr: false,
               include_document_structure: false,
               ocr: nil,
               chunking: nil,
               language_detection: nil,
               pdf_options: nil,
               image_extraction: nil,
               postprocessor: nil,
               token_reduction: nil,
               keywords: nil,
               html_options: nil,
               pages: nil,
               max_concurrent_extractions: nil,
               output_format: nil,
               result_format: nil,
               security_limits: nil)
  keyword_values = {
    use_cache: use_cache,
    enable_quality_processing: enable_quality_processing,
    force_ocr: force_ocr,
    include_document_structure: include_document_structure,
    ocr: ocr,
    chunking: chunking,
    language_detection: language_detection,
    pdf_options: pdf_options,
    image_extraction: image_extraction,
    postprocessor: postprocessor,
    token_reduction: token_reduction,
    keywords: keywords,
    html_options: html_options,
    pages: pages,
    max_concurrent_extractions: max_concurrent_extractions,
    output_format: output_format,
    result_format: result_format,
    security_limits: security_limits
  }

  assign_attributes(extract_from_hash(hash, keyword_values))
end

Instance Attribute Details

#chunking ⇒ Object (readonly)

Returns the value of attribute chunking.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @chunking, which assign_attributes and #[]= populate with
# normalize_config(value, Chunking).
#
# @return [Object] the value currently held in @chunking
def chunking
  @chunking
end

#enable_quality_processing ⇒ Object (readonly)

Returns the value of attribute enable_quality_processing.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @enable_quality_processing. assign_attributes and #[]= coerce
# the supplied value to true or false before storing it.
#
# @return [Object] the value currently held in @enable_quality_processing
def enable_quality_processing
  @enable_quality_processing
end

#force_ocr ⇒ Object (readonly)

Returns the value of attribute force_ocr.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @force_ocr. assign_attributes and #[]= coerce the supplied
# value to true or false before storing it.
#
# @return [Object] the value currently held in @force_ocr
def force_ocr
  @force_ocr
end

#html_options ⇒ Object (readonly)

Returns the value of attribute html_options.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @html_options, which assign_attributes and #[]= populate with
# normalize_config(value, HtmlOptions).
#
# @return [Object] the value currently held in @html_options
def html_options
  @html_options
end

#images ⇒ Object (readonly) Also known as: image_extraction

Returns the value of attribute images.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @images. The value is supplied under the :image_extraction key;
# assign_attributes and #[]= store normalize_config(value, ImageExtraction).
#
# @return [Object] the value currently held in @images
def images
  @images
end

#include_document_structure ⇒ Object (readonly)

Returns the value of attribute include_document_structure.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @include_document_structure. assign_attributes and #[]= coerce
# the supplied value to true or false before storing it.
#
# @return [Object] the value currently held in @include_document_structure
def include_document_structure
  @include_document_structure
end

#keywords ⇒ Object (readonly)

Returns the value of attribute keywords.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @keywords, which assign_attributes and #[]= populate with
# normalize_config(value, Keywords).
#
# @return [Object] the value currently held in @keywords
def keywords
  @keywords
end

#language_detection ⇒ Object (readonly)

Returns the value of attribute language_detection.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @language_detection, which assign_attributes and #[]= populate
# with normalize_config(value, LanguageDetection).
#
# @return [Object] the value currently held in @language_detection
def language_detection
  @language_detection
end

#max_concurrent_extractions ⇒ Object (readonly)

Returns the value of attribute max_concurrent_extractions.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @max_concurrent_extractions. assign_attributes and #[]= store
# the result of calling to_i on the supplied value, or nil when it is nil.
#
# @return [Object] the value currently held in @max_concurrent_extractions
def max_concurrent_extractions
  @max_concurrent_extractions
end

#ocr ⇒ Object (readonly)

Returns the value of attribute ocr.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @ocr, which assign_attributes and #[]= populate with
# normalize_config(value, OCR).
#
# @return [Object] the value currently held in @ocr
def ocr
  @ocr
end

#output_format ⇒ Object

Returns the value of attribute output_format.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @output_format. assign_attributes and #[]= store the result of
# validate_output_format, i.e. a lower-cased member of VALID_OUTPUT_FORMATS
# or nil.
#
# @return [Object] the value currently held in @output_format
def output_format
  @output_format
end

#pages ⇒ Object (readonly)

Returns the value of attribute pages.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @pages, which assign_attributes and #[]= populate with
# normalize_config(value, PageConfig).
#
# @return [Object] the value currently held in @pages
def pages
  @pages
end

#pdf_options ⇒ Object (readonly)

Returns the value of attribute pdf_options.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @pdf_options, which assign_attributes and #[]= populate with
# normalize_config(value, PDF).
#
# @return [Object] the value currently held in @pdf_options
def pdf_options
  @pdf_options
end

#postprocessor ⇒ Object (readonly)

Returns the value of attribute postprocessor.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @postprocessor, which assign_attributes and #[]= populate with
# normalize_config(value, PostProcessor).
#
# @return [Object] the value currently held in @postprocessor
def postprocessor
  @postprocessor
end

#result_format ⇒ Object

Returns the value of attribute result_format.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @result_format. assign_attributes and #[]= store the result of
# validate_result_format, i.e. a lower-cased member of VALID_RESULT_FORMATS
# or nil.
#
# @return [Object] the value currently held in @result_format
def result_format
  @result_format
end

#security_limits ⇒ Object (readonly)

Returns the value of attribute security_limits.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @security_limits. assign_attributes stores the supplied
# :security_limits value as given, without normalization.
#
# @return [Object] the value currently held in @security_limits
def security_limits
  @security_limits
end

#token_reduction ⇒ Object (readonly)

Returns the value of attribute token_reduction.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @token_reduction, which assign_attributes and #[]= populate
# with normalize_config(value, TokenReduction).
#
# @return [Object] the value currently held in @token_reduction
def token_reduction
  @token_reduction
end

#use_cache ⇒ Object (readonly)

Returns the value of attribute use_cache.



844
845
846
# File 'lib/kreuzberg/config.rb', line 844

# Reader for @use_cache. assign_attributes and #[]= coerce the supplied
# value to true or false before storing it.
#
# @return [Object] the value currently held in @use_cache
def use_cache
  @use_cache
end

Class Method Details

.discover ⇒ Kreuzberg::Config::Extraction?

Discover configuration file in current or parent directories.

Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current directory and parent directories.

Examples:

config = Kreuzberg::Config::Extraction.discover
if config
  # Use discovered config
end

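Returns:

  • (Kreuzberg::Config::Extraction, nil)

    The discovered configuration, or nil when the native discovery call finds nothing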



925
926
927
928
929
930
# File 'lib/kreuzberg/config.rb', line 925

# Build a configuration from whatever Kreuzberg._config_discover_native
# returns.
#
# @return [Extraction, nil] nil when the native call returns nil; otherwise
#   a new instance built from the returned hash after normalize_hash_keys
def self.discover
  discovered = Kreuzberg._config_discover_native

  discovered.nil? ? nil : new(**normalize_hash_keys(discovered))
end

.from_file(path) ⇒ Object



889
890
891
892
# File 'lib/kreuzberg/config.rb', line 889

# Load configuration from a file.
#
# The file is read by Kreuzberg._config_from_file_native; per this page's
# description the format is detected from the extension (.toml, .yaml,
# .json). The resulting hash goes through normalize_hash_keys and is
# splatted into the constructor.
#
# @param path [String] path to the configuration file
# @return [Extraction]
def self.from_file(path)
  normalized = normalize_hash_keys(Kreuzberg._config_from_file_native(path))
  new(**normalized)
end

Instance Method Details

#[](key) ⇒ Object?

Get a configuration field using hash-like syntax

Examples:

config = Extraction.new(use_cache: true)
config[:use_cache]  # => true

Parameters:

  • key (Symbol, String)

    Field name to get

Returns:

  • (Object, nil)

    The field value



1196
1197
1198
1199
1200
# File 'lib/kreuzberg/config.rb', line 1196

# Get a configuration field using hash-like syntax.
#
# Only names listed in ALLOWED_KEYS or KEY_ALIASES are readable. The lookup
# uses public_send on that allow-list, so an arbitrary key can no longer
# invoke unrelated or private methods, and a NoMethodError raised inside a
# reader is no longer swallowed.
#
# @example
#   config = Extraction.new(use_cache: true)
#   config[:use_cache]  # => true
#
# @param key [Symbol, String] field name to get
# @return [Object, nil] the field value, or nil for an unknown key
def [](key)
  # Keys that cannot be symbolized (e.g. nil) were previously answered with nil.
  return nil unless key.respond_to?(:to_sym)

  key_sym = key.to_sym
  return nil unless ALLOWED_KEYS.include?(key_sym) || KEY_ALIASES.key?(key_sym)

  public_send(key_sym)
end

#[]=(key, value) ⇒ Object

Set a configuration field using hash-like syntax

rubocop:disable Metrics/MethodLength

Examples:

config = Extraction.new(use_cache: true)
config[:use_cache] = false
config[:force_ocr] = true

Parameters:

  • key (Symbol, String)

    Field name to set

  • value (Object)

    Value to set

Returns:

  • (Object)

    The value that was set



1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
# File 'lib/kreuzberg/config.rb', line 1144

# Set a configuration field using hash-like syntax.
#
# Each key is coerced the same way assign_attributes does it: boolean flags
# become true/false, sub-configurations go through normalize_config, the
# format fields are validated, and security_limits is stored as given.
# The legacy :images key is accepted as an alias for :image_extraction.
#
# @example
#   config = Extraction.new(use_cache: true)
#   config[:use_cache] = false
#   config[:force_ocr] = true
#
# @param key [Symbol, String] field name to set
# @param value [Object] value to set
# @return [Object] the value that was set
# @raise [ArgumentError] if the key is not a known configuration field, or
#   if validate_output_format / validate_result_format rejects the value
def []=(key, value)
  key_sym = key.to_sym
  case key_sym
  when :use_cache
    @use_cache = value ? true : false
  when :enable_quality_processing
    @enable_quality_processing = value ? true : false
  when :force_ocr
    @force_ocr = value ? true : false
  when :include_document_structure
    @include_document_structure = value ? true : false
  when :ocr
    @ocr = normalize_config(value, OCR)
  when :chunking
    @chunking = normalize_config(value, Chunking)
  when :language_detection
    @language_detection = normalize_config(value, LanguageDetection)
  when :pdf_options
    @pdf_options = normalize_config(value, PDF)
  when :image_extraction, :images
    # :images is the backward-compatible alias; both names map to @images.
    @images = normalize_config(value, ImageExtraction)
  when :postprocessor
    @postprocessor = normalize_config(value, PostProcessor)
  when :token_reduction
    @token_reduction = normalize_config(value, TokenReduction)
  when :keywords
    @keywords = normalize_config(value, Keywords)
  when :html_options
    @html_options = normalize_config(value, HtmlOptions)
  when :pages
    @pages = normalize_config(value, PageConfig)
  when :max_concurrent_extractions
    @max_concurrent_extractions = value&.to_i
  when :output_format
    @output_format = validate_output_format(value)
  when :result_format
    @result_format = validate_result_format(value)
  when :security_limits
    # Stored without normalization, exactly as assign_attributes does.
    @security_limits = value
  else
    raise ArgumentError, "Unknown configuration key: #{key}"
  end
end

#assign_attributes(params) ⇒ Object



974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
# File 'lib/kreuzberg/config.rb', line 974

# Populate every instance variable from an option hash.
#
# Flags are coerced to strict true/false, sub-configurations go through
# normalize_config with their target class, max_concurrent_extractions is
# converted with to_i when present, the two format fields are validated, and
# security_limits is stored untouched.
#
# @param params [Hash{Symbol => Object}] option values keyed by option name
# @return [Object] the stored security_limits value (last assignment)
# @raise [ArgumentError] when a format value fails validation
def assign_attributes(params)
  %i[use_cache enable_quality_processing force_ocr include_document_structure].each do |flag|
    instance_variable_set(:"@#{flag}", params[flag] ? true : false)
  end

  # [instance variable, params key, config class]; note :image_extraction
  # is kept in @images.
  [
    [:@ocr, :ocr, OCR],
    [:@chunking, :chunking, Chunking],
    [:@language_detection, :language_detection, LanguageDetection],
    [:@pdf_options, :pdf_options, PDF],
    [:@images, :image_extraction, ImageExtraction],
    [:@postprocessor, :postprocessor, PostProcessor],
    [:@token_reduction, :token_reduction, TokenReduction],
    [:@keywords, :keywords, Keywords],
    [:@html_options, :html_options, HtmlOptions],
    [:@pages, :pages, PageConfig]
  ].each do |ivar, option, config_class|
    instance_variable_set(ivar, normalize_config(params[option], config_class))
  end

  @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
  @output_format = validate_output_format(params[:output_format])
  @result_format = validate_result_format(params[:result_format])
  @security_limits = params[:security_limits]
end

#core_config_hash ⇒ Object



1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
# File 'lib/kreuzberg/config.rb', line 1019

# Collect the scalar (non sub-configuration) settings into a hash.
#
# @return [Hash{Symbol => Object}] the current values of the seven scalar
#   instance variables, keyed by option name; nil values are kept
def core_config_hash
  scalar_fields = %i[
    use_cache enable_quality_processing force_ocr include_document_structure
    max_concurrent_extractions output_format result_format
  ]

  scalar_fields.each_with_object({}) do |field, collected|
    collected[field] = instance_variable_get(:"@#{field}")
  end
end

#extract_from_hash(hash, defaults) ⇒ Object



967
968
969
970
971
972
# File 'lib/kreuzberg/config.rb', line 967

# Overlay a user-supplied hash on top of the keyword defaults.
#
# Keys are symbolized, legacy names from KEY_ALIASES (e.g. :images) are
# mapped to their canonical key, and anything not present in +defaults+ is
# dropped. Previously a hash that used an alias key lost that setting
# silently, because the alias is not one of the default keys.
#
# @param hash [Hash, Object] positional settings; any non-Hash is ignored
# @param defaults [Hash{Symbol => Object}] keyword values to fall back on
# @return [Hash{Symbol => Object}] +defaults+ itself when +hash+ is not a
#   Hash, otherwise a new hash with the supplied values taking precedence
def extract_from_hash(hash, defaults)
  return defaults unless hash.is_a?(Hash)

  supplied = hash.transform_keys(&:to_sym)

  # An explicitly given canonical key wins over its legacy alias.
  KEY_ALIASES.each do |legacy, canonical|
    next unless supplied.key?(legacy) && !supplied.key?(canonical)

    supplied[canonical] = supplied[legacy]
  end

  defaults.merge(supplied.slice(*defaults.keys))
end

#get_field(field_name) ⇒ Object?

Get a field from the configuration

Supports dot notation for nested fields (e.g., “ocr.backend”)

Examples:

Get a top-level field

config = Extraction.new(use_cache: true)
config.get_field("use_cache")  # => true

Get a nested field

config = Extraction.new(ocr: OCR.new(backend: "tesseract"))
config.get_field("ocr.backend")  # => "tesseract"

Parameters:

  • field_name (String, Symbol)

    Field name to retrieve

Returns:

  • (Object, nil)

    Parsed field value, or nil if field doesn’t exist



1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
# File 'lib/kreuzberg/config.rb', line 1071

# Look up a field in the hash form of this configuration.
#
# The name is split on "." and each segment is resolved against the result
# of to_h, one level at a time. At each level the symbol form of the
# segment is tried first, then the string form. Walking into anything that
# is not a Hash, or hitting a missing key, yields nil.
#
# @example Top-level field
#   config.get_field("use_cache")    # => true
# @example Nested field
#   config.get_field("ocr.backend")  # => "tesseract"
#
# @param field_name [String, Symbol] field name, optionally dot-separated
# @return [Object, nil] the value found, or nil if the path does not resolve
def get_field(field_name)
  root = to_h
  segments = field_name.to_s.split('.')

  segments.inject(root) do |node, segment|
    next nil unless node.is_a?(Hash)

    # Symbol key takes precedence; fall back to the string key, else nil.
    node.fetch(segment.to_sym) { node.fetch(segment.to_s, nil) }
  end
end

#merge(other) ⇒ Extraction

Merge another configuration into this one

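Returns a new configuration in which fields from the other config override fields from this config (shallow merge). Because the other config is converted with to_h, its boolean fields (use_cache, enable_quality_processing, force_ocr, include_document_structure) are always present and replace this config's values even when they were left at their defaults.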

Examples:

base = Extraction.new(use_cache: true, force_ocr: false)
override = Extraction.new(force_ocr: true)
merged = base.merge(override)
merged.use_cache   # => true
merged.force_ocr   # => true

Parameters:

  • other (Extraction, Hash)

    Configuration to merge

Returns:

  • (Extraction)

    A new configuration containing the merged fields



1104
1105
1106
1107
1108
1109
# File 'lib/kreuzberg/config.rb', line 1104

# Merge another configuration into this one, returning a new instance.
#
# Both sides are converted with to_h and shallow-merged, with +other+
# winning. The hash built by core_config_hash / sub_config_hash does not
# include security_limits, so that value is carried over explicitly;
# otherwise the merged configuration would silently lose it.
#
# Boolean fields are always present in to_h, so the other config's boolean
# values override this config's even when they were left at their defaults.
#
# @example
#   base = Extraction.new(use_cache: true, force_ocr: false)
#   override = Extraction.new(force_ocr: true)
#   merged = base.merge(override)
#   merged.force_ocr   # => true
#
# @param other [Extraction, Hash] configuration to merge; a Hash is splatted
#   into Extraction.new
# @return [Extraction] a new merged configuration
def merge(other)
  other_config = other.is_a?(Extraction) ? other : Extraction.new(**other)
  merged_hash = to_h.merge(other_config.to_h)

  # Prefer the other side's limits; fall back to ours when it has none.
  limits = other_config.security_limits
  limits = security_limits if limits.nil?
  merged_hash[:security_limits] = limits unless limits.nil?

  Extraction.new(**merged_hash)
end

#merge!(other) ⇒ self

Merge another configuration into this one (mutating)

Modifies this configuration in-place by merging fields from another config.

Examples:

base = Extraction.new(use_cache: true, force_ocr: false)
override = Extraction.new(force_ocr: true)
base.merge!(override)
base.use_cache   # => true
base.force_ocr   # => true

Parameters:

  • other (Extraction, Hash)

    Configuration to merge

Returns:

  • (self)


1125
1126
1127
1128
1129
1130
# File 'lib/kreuzberg/config.rb', line 1125

# Merge another configuration into this one in place.
#
# A Hash argument is first splatted into Extraction.new. The merged
# configuration is computed with #merge and then applied to this instance
# via update_from_merged.
#
# @param other [Extraction, Hash] configuration to merge
# @return [self]
def merge!(other)
  incoming = other.is_a?(Extraction) ? other : Extraction.new(**other)

  tap { update_from_merged(merge(incoming)) }
end

#sub_config_hash ⇒ Object



1031
1032
1033
1034
1035
1036
1037
1038
1039
# File 'lib/kreuzberg/config.rb', line 1031

# Collect the nested sub-configurations into a hash of plain hashes.
#
# Each stored sub-configuration is converted with to_h; entries that are nil
# stay nil. The @images value is exposed under :image_extraction.
#
# @return [Hash{Symbol => Object}] one entry per sub-configuration
def sub_config_hash
  sections = {
    ocr: @ocr,
    chunking: @chunking,
    language_detection: @language_detection,
    pdf_options: @pdf_options,
    image_extraction: @images,
    postprocessor: @postprocessor,
    token_reduction: @token_reduction,
    keywords: @keywords,
    html_options: @html_options,
    pages: @pages
  }

  sections.transform_values { |section| section&.to_h }
end

#to_h ⇒ Object



1015
1016
1017
# File 'lib/kreuzberg/config.rb', line 1015

# Convert the configuration to a plain hash.
#
# Combines core_config_hash with sub_config_hash (the latter wins on key
# clashes) and drops every entry whose value is nil.
#
# @return [Hash{Symbol => Object}] the non-nil settings
def to_h
  combined = core_config_hash.merge(sub_config_hash)
  combined.reject { |_key, value| value.nil? }
end

#to_json(*_args) ⇒ String

Serialize configuration to JSON string

Examples:

config = Extraction.new(use_cache: true)
json = config.to_json
puts json  # => "{\"use_cache\":true,...}"

Returns:

  • (String)

    JSON representation of the configuration



1050
1051
1052
1053
1054
# File 'lib/kreuzberg/config.rb', line 1050

# Serialize the configuration to a JSON string.
#
# The output is generated in Ruby from to_h; the original author's note says
# the native serializer "has issues", so it is deliberately not used here.
#
# @example
#   Extraction.new(use_cache: true).to_json  # => "{\"use_cache\":true,...}"
#
# @return [String] JSON representation of the configuration
def to_json(*_args)
  JSON.generate(to_h)
end

#validate_output_format(value) ⇒ Object

Raises:

  • (ArgumentError)


995
996
997
998
999
1000
1001
1002
1003
# File 'lib/kreuzberg/config.rb', line 995

# Normalize and validate an output format.
#
# @param value [#to_s, nil] requested format, compared case-insensitively
# @return [String, nil] the lower-cased format, or nil when +value+ is nil
# @raise [ArgumentError] if the format is not in VALID_OUTPUT_FORMATS
def validate_output_format(value)
  return if value.nil?

  normalized = value.to_s.downcase
  unless VALID_OUTPUT_FORMATS.include?(normalized)
    raise ArgumentError,
          "Invalid output_format: #{value}. Valid values: #{VALID_OUTPUT_FORMATS.join(', ')}"
  end

  normalized
end

#validate_result_format(value) ⇒ Object

Raises:

  • (ArgumentError)


1005
1006
1007
1008
1009
1010
1011
1012
1013
# File 'lib/kreuzberg/config.rb', line 1005

# Normalize and validate a result format.
#
# @param value [#to_s, nil] requested format, compared case-insensitively
# @return [String, nil] the lower-cased format, or nil when +value+ is nil
# @raise [ArgumentError] if the format is not in VALID_RESULT_FORMATS
def validate_result_format(value)
  return if value.nil?

  normalized = value.to_s.downcase
  unless VALID_RESULT_FORMATS.include?(normalized)
    raise ArgumentError,
          "Invalid result_format: #{value}. Valid values: #{VALID_RESULT_FORMATS.join(', ')}"
  end

  normalized
end