Class: Kreuzberg::Config::Extraction

Inherits:
Object
  • Object
show all
Defined in:
lib/kreuzberg/config.rb

Overview

Main extraction configuration

Examples:

Basic usage

config = Extraction.new(use_cache: true, force_ocr: true)

With OCR

ocr = Config::OCR.new(backend: "tesseract", language: "eng")
config = Extraction.new(ocr: ocr)

With image extraction

image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
config = Extraction.new(image_extraction: image)

With post-processing

postprocessor = Config::PostProcessor.new(
  enabled: true,
  enabled_processors: ["quality"]
)
config = Extraction.new(postprocessor: postprocessor)

With document structure

config = Extraction.new(include_document_structure: true)

With all options

config = Extraction.new(
  use_cache: true,
  enable_quality_processing: true,
  force_ocr: false,
  include_document_structure: true,
  ocr: Config::OCR.new(language: "deu"),
  chunking: Config::Chunking.new(max_chars: 500),
  language_detection: Config::LanguageDetection.new(enabled: true),
  pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
  image_extraction: Config::ImageExtraction.new(target_dpi: 600),
  postprocessor: Config::PostProcessor.new(enabled: true)
)

Constant Summary collapse

ALLOWED_KEYS =

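Load configuration from a file (this description and the examples below document .from_file; the constant itself is described by "Keys that are allowed in the Extraction config").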

Detects the file format from the extension (.toml, .yaml, .json) and loads the configuration accordingly.

Keys that are allowed in the Extraction config

Examples:

Load from TOML

config = Kreuzberg::Config::Extraction.from_file("config.toml")

Load from YAML

config = Kreuzberg::Config::Extraction.from_file("config.yaml")

Returns:

%i[
  use_cache enable_quality_processing force_ocr include_document_structure ocr chunking
  language_detection pdf_options image_extraction
  postprocessor token_reduction keywords html_options pages
  max_concurrent_extractions output_format result_format
  security_limits
].freeze
KEY_ALIASES =

Aliases for backward compatibility

{
  images: :image_extraction
}.freeze
VALID_OUTPUT_FORMATS =

Valid output format values (case-insensitive, normalized internally)

%w[plain markdown html djot].freeze
VALID_RESULT_FORMATS =

Valid result format values (case-insensitive, normalized internally)

%w[unified elements element_based].freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(hash = nil, use_cache: true, enable_quality_processing: true, force_ocr: false, include_document_structure: false, ocr: nil, chunking: nil, language_detection: nil, pdf_options: nil, image_extraction: nil, postprocessor: nil, token_reduction: nil, keywords: nil, html_options: nil, pages: nil, max_concurrent_extractions: nil, output_format: nil, result_format: nil, security_limits: nil) ⇒ Extraction

Returns a new instance of Extraction.



922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
# File 'lib/kreuzberg/config.rb', line 922

# Build an extraction configuration.
#
# Options can be supplied as keyword arguments and/or as a Hash passed as the
# single positional argument. Entries of that Hash whose (symbolized) keys
# match a keyword name override the keyword values; all other entries, and any
# non-Hash positional argument, are ignored (see #extract_from_hash).
#
# @param hash [Hash, nil] optional option hash with Symbol or String keys
# @return [Extraction] a new instance of Extraction
def initialize(hash = nil,
               use_cache: true,
               enable_quality_processing: true,
               force_ocr: false,
               include_document_structure: false,
               ocr: nil,
               chunking: nil,
               language_detection: nil,
               pdf_options: nil,
               image_extraction: nil,
               postprocessor: nil,
               token_reduction: nil,
               keywords: nil,
               html_options: nil,
               pages: nil,
               max_concurrent_extractions: nil,
               output_format: nil,
               result_format: nil,
               security_limits: nil)
  # Keyword values double as the defaults the positional hash is merged over.
  keyword_values = {
    use_cache: use_cache,
    enable_quality_processing: enable_quality_processing,
    force_ocr: force_ocr,
    include_document_structure: include_document_structure,
    ocr: ocr,
    chunking: chunking,
    language_detection: language_detection,
    pdf_options: pdf_options,
    image_extraction: image_extraction,
    postprocessor: postprocessor,
    token_reduction: token_reduction,
    keywords: keywords,
    html_options: html_options,
    pages: pages,
    max_concurrent_extractions: max_concurrent_extractions,
    output_format: output_format,
    result_format: result_format,
    security_limits: security_limits
  }

  assign_attributes(extract_from_hash(hash, keyword_values))
end

Instance Attribute Details

#chunking ⇒ Object (readonly)

Returns the value of attribute chunking.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @chunking, which #assign_attributes fills with
# normalize_config(params[:chunking], Chunking).
def chunking
  @chunking
end

#enable_quality_processing ⇒ Object (readonly)

Returns the value of attribute enable_quality_processing.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @enable_quality_processing; #assign_attributes and #[]= coerce the
# stored value to true or false.
def enable_quality_processing
  @enable_quality_processing
end

#force_ocr ⇒ Object (readonly)

Returns the value of attribute force_ocr.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @force_ocr; #assign_attributes and #[]= coerce the stored value to
# true or false.
def force_ocr
  @force_ocr
end

#html_options ⇒ Object (readonly)

Returns the value of attribute html_options.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @html_options, which #assign_attributes fills with
# normalize_config(params[:html_options], HtmlOptions).
def html_options
  @html_options
end

#images ⇒ Object (readonly) Also known as: image_extraction

Returns the value of attribute images.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @images. The value is supplied under the :image_extraction key and
# stored by #assign_attributes as
# normalize_config(params[:image_extraction], ImageExtraction).
def images
  @images
end

#include_document_structure ⇒ Object (readonly)

Returns the value of attribute include_document_structure.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @include_document_structure; #assign_attributes and #[]= coerce the
# stored value to true or false.
def include_document_structure
  @include_document_structure
end

#keywords ⇒ Object (readonly)

Returns the value of attribute keywords.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @keywords, which #assign_attributes fills with
# normalize_config(params[:keywords], Keywords).
def keywords
  @keywords
end

#language_detection ⇒ Object (readonly)

Returns the value of attribute language_detection.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @language_detection, which #assign_attributes fills with
# normalize_config(params[:language_detection], LanguageDetection).
def language_detection
  @language_detection
end

#max_concurrent_extractions ⇒ Object (readonly)

Returns the value of attribute max_concurrent_extractions.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @max_concurrent_extractions; #assign_attributes and #[]= store nil
# as-is and convert any other value with to_i.
def max_concurrent_extractions
  @max_concurrent_extractions
end

#ocr ⇒ Object (readonly)

Returns the value of attribute ocr.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @ocr, which #assign_attributes fills with
# normalize_config(params[:ocr], OCR).
def ocr
  @ocr
end

#output_format ⇒ Object

Returns the value of attribute output_format.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @output_format. When set through #assign_attributes or #[]= the
# value comes from #validate_output_format, i.e. nil or a lowercase member of
# VALID_OUTPUT_FORMATS.
def output_format
  @output_format
end

#pages ⇒ Object (readonly)

Returns the value of attribute pages.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @pages, which #assign_attributes fills with
# normalize_config(params[:pages], PageConfig).
def pages
  @pages
end

#pdf_options ⇒ Object (readonly)

Returns the value of attribute pdf_options.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @pdf_options, which #assign_attributes fills with
# normalize_config(params[:pdf_options], PDF).
def pdf_options
  @pdf_options
end

#postprocessor ⇒ Object (readonly)

Returns the value of attribute postprocessor.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @postprocessor, which #assign_attributes fills with
# normalize_config(params[:postprocessor], PostProcessor).
def postprocessor
  @postprocessor
end

#result_format ⇒ Object

Returns the value of attribute result_format.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @result_format. When set through #assign_attributes or #[]= the
# value comes from #validate_result_format, i.e. nil or a lowercase member of
# VALID_RESULT_FORMATS.
def result_format
  @result_format
end

#security_limits ⇒ Object (readonly)

Returns the value of attribute security_limits.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @security_limits, which #assign_attributes stores exactly as given
# in params[:security_limits] (no normalization or validation is applied there).
def security_limits
  @security_limits
end

#token_reduction ⇒ Object (readonly)

Returns the value of attribute token_reduction.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @token_reduction, which #assign_attributes fills with
# normalize_config(params[:token_reduction], TokenReduction).
def token_reduction
  @token_reduction
end

#use_cache ⇒ Object (readonly)

Returns the value of attribute use_cache.



834
835
836
# File 'lib/kreuzberg/config.rb', line 834

# Returns @use_cache; #assign_attributes and #[]= coerce the stored value to
# true or false.
def use_cache
  @use_cache
end

Class Method Details

.discover ⇒ Kreuzberg::Config::Extraction?

Discover configuration file in current or parent directories.

Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current directory and parent directories.

Examples:

config = Kreuzberg::Config::Extraction.discover
if config
  # Use discovered config
end

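Returns:

  • (Kreuzberg::Config::Extraction, nil)

    The discovered configuration, or nil if the native discovery returned nothing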



915
916
917
918
919
920
# File 'lib/kreuzberg/config.rb', line 915

# Build a configuration from whatever the native discovery routine finds.
#
# Calls Kreuzberg._config_discover_native; a nil result means nothing was
# discovered. Otherwise the returned hash is passed through
# normalize_hash_keys and splatted into the constructor as keyword arguments.
#
# @return [Kreuzberg::Config::Extraction, nil] the discovered configuration,
#   or nil when the native call returned nil
def self.discover
  discovered = Kreuzberg._config_discover_native
  new(**normalize_hash_keys(discovered)) unless discovered.nil?
end

.from_file(path) ⇒ Object



879
880
881
882
# File 'lib/kreuzberg/config.rb', line 879

# Load configuration from a file.
#
# The file is read by Kreuzberg._config_from_file_native; the resulting hash is
# passed through normalize_hash_keys and splatted into the constructor as
# keyword arguments. Per the project documentation the format is detected from
# the file extension (.toml, .yaml, .json) — that detection happens in the
# native loader, not in this method.
#
# @param path [String] path of the configuration file (passed to the native loader)
# @return [Kreuzberg::Config::Extraction] the loaded configuration
def self.from_file(path)
  raw_config = Kreuzberg._config_from_file_native(path)
  normalized = normalize_hash_keys(raw_config)
  new(**normalized)
end

Instance Method Details

#[](key) ⇒ Object?

Get a configuration field using hash-like syntax

Examples:

config = Extraction.new(use_cache: true)
config[:use_cache]  # => true

Parameters:

  • key (Symbol, String)

    Field name to get

Returns:

  • (Object, nil)

    The field value



1186
1187
1188
1189
1190
# File 'lib/kreuzberg/config.rb', line 1186

# Get a configuration field using hash-like syntax.
#
# Only configuration fields are readable: the names in ALLOWED_KEYS plus the
# alias names in KEY_ALIASES. The previous implementation forwarded any key to
# `send`, so a key such as :exit or :sleep executed the private Kernel method
# of that name; unknown keys now simply yield nil without calling anything.
#
# @param key [Symbol, String] field name to get
# @return [Object, nil] the field value, or nil when the key is not a
#   configuration field (or cannot be converted to a Symbol)
def [](key)
  return nil unless key.respond_to?(:to_sym)

  field = key.to_sym
  return nil unless ALLOWED_KEYS.include?(field) || KEY_ALIASES.key?(field)
  # respond_to? ignores private methods, so only public readers are reachable.
  return nil unless respond_to?(field)

  public_send(field)
end

#[]=(key, value) ⇒ Object

Set a configuration field using hash-like syntax

rubocop:disable Metrics/MethodLength

Examples:

config = Extraction.new(use_cache: true)
config[:use_cache] = false
config[:force_ocr] = true

Parameters:

  • key (Symbol, String)

    Field name to set

  • value (Object)

    Value to set

Returns:

  • (Object)

    The value that was set



1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
# File 'lib/kreuzberg/config.rb', line 1134

# Set a configuration field using hash-like syntax.
#
# Boolean flags are coerced to true/false, sub-configurations go through
# normalize_config with their config class, output/result formats are
# validated, and max_concurrent_extractions is converted with to_i unless nil.
# security_limits is stored exactly as given, mirroring #assign_attributes
# (it was previously rejected as an unknown key although it is a constructor
# option). :images is accepted as another name for :image_extraction, matching
# the reader alias.
#
# @param key [Symbol, String] field name to set
# @param value [Object] value to set
# @return [Object] the value stored for the field
# @raise [ArgumentError] if the key is not a known configuration field, or if
#   an output/result format is invalid
def []=(key, value)
  key_sym = key.to_sym
  case key_sym
  when :use_cache
    @use_cache = value ? true : false
  when :enable_quality_processing
    @enable_quality_processing = value ? true : false
  when :force_ocr
    @force_ocr = value ? true : false
  when :include_document_structure
    @include_document_structure = value ? true : false
  when :ocr
    @ocr = normalize_config(value, OCR)
  when :chunking
    @chunking = normalize_config(value, Chunking)
  when :language_detection
    @language_detection = normalize_config(value, LanguageDetection)
  when :pdf_options
    @pdf_options = normalize_config(value, PDF)
  when :image_extraction, :images
    # Both names write the same instance variable read by #images.
    @images = normalize_config(value, ImageExtraction)
  when :postprocessor
    @postprocessor = normalize_config(value, PostProcessor)
  when :token_reduction
    @token_reduction = normalize_config(value, TokenReduction)
  when :keywords
    @keywords = normalize_config(value, Keywords)
  when :html_options
    @html_options = normalize_config(value, HtmlOptions)
  when :pages
    @pages = normalize_config(value, PageConfig)
  when :max_concurrent_extractions
    @max_concurrent_extractions = value&.to_i
  when :output_format
    @output_format = validate_output_format(value)
  when :result_format
    @result_format = validate_result_format(value)
  when :security_limits
    @security_limits = value
  else
    raise ArgumentError, "Unknown configuration key: #{key}"
  end
end

#assign_attributes(params) ⇒ Object



964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
# File 'lib/kreuzberg/config.rb', line 964

# Populate every instance variable from an option hash.
#
# @param params [Hash] options keyed by Symbol (the constructor passes the
#   result of #extract_from_hash)
# @return [Object] the value stored in @security_limits (the last assignment)
# @raise [ArgumentError] if output_format or result_format is invalid
def assign_attributes(params)
  # Boolean flags: any truthy value becomes true, nil/false becomes false.
  %i[use_cache enable_quality_processing force_ocr include_document_structure].each do |flag|
    instance_variable_set(:"@#{flag}", params[flag] ? true : false)
  end

  # Sub-configurations: [params key, instance variable, config class].
  # :image_extraction is the only key whose instance variable has another name.
  [
    [:ocr, :@ocr, OCR],
    [:chunking, :@chunking, Chunking],
    [:language_detection, :@language_detection, LanguageDetection],
    [:pdf_options, :@pdf_options, PDF],
    [:image_extraction, :@images, ImageExtraction],
    [:postprocessor, :@postprocessor, PostProcessor],
    [:token_reduction, :@token_reduction, TokenReduction],
    [:keywords, :@keywords, Keywords],
    [:html_options, :@html_options, HtmlOptions],
    [:pages, :@pages, PageConfig]
  ].each do |key, ivar, config_class|
    instance_variable_set(ivar, normalize_config(params[key], config_class))
  end

  @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
  @output_format = validate_output_format(params[:output_format])
  @result_format = validate_result_format(params[:result_format])
  # Stored as given; no normalization is applied to security limits here.
  @security_limits = params[:security_limits]
end

#core_config_hash ⇒ Object



1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
# File 'lib/kreuzberg/config.rb', line 1009

# Collect the scalar (non sub-configuration) settings into a Hash.
#
# NOTE(review): @security_limits is not part of this hash (nor of
# #sub_config_hash), so it never appears in #to_h — confirm that is intended.
#
# @return [Hash{Symbol => Object}] the seven core settings, nil values included
def core_config_hash
  core_fields = %i[
    use_cache enable_quality_processing force_ocr include_document_structure
    max_concurrent_extractions output_format result_format
  ]

  core_fields.each_with_object({}) do |field, collected|
    collected[field] = instance_variable_get(:"@#{field}")
  end
end

#extract_from_hash(hash, defaults) ⇒ Object



957
958
959
960
961
962
# File 'lib/kreuzberg/config.rb', line 957

# Overlay the recognised entries of an option hash onto a set of defaults.
#
# @param hash [Hash, Object] candidate options; anything that is not a Hash is
#   ignored
# @param defaults [Hash{Symbol => Object}] default values; its keys define
#   which entries of +hash+ are recognised
# @return [Hash{Symbol => Object}] +defaults+ itself when +hash+ is not a Hash,
#   otherwise a new Hash with the matching entries of +hash+ (keys converted
#   with to_sym) replacing the defaults
def extract_from_hash(hash, defaults)
  if hash.is_a?(Hash)
    symbolized = hash.transform_keys(&:to_sym)
    # Keep only keys the defaults know about; unknown keys are dropped.
    overrides = symbolized.select { |key, _value| defaults.key?(key) }
    defaults.merge(overrides)
  else
    defaults
  end
end

#get_field(field_name) ⇒ Object?

Get a field from the configuration

Supports dot notation for nested fields (e.g., “ocr.backend”)

Examples:

Get a top-level field

config = Extraction.new(use_cache: true)
config.get_field("use_cache")  # => true

Get a nested field

config = Extraction.new(ocr: OCR.new(backend: "tesseract"))
config.get_field("ocr.backend")  # => "tesseract"

Parameters:

  • field_name (String, Symbol)

    Field name to retrieve

Returns:

  • (Object, nil)

    Parsed field value, or nil if field doesn’t exist



1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
# File 'lib/kreuzberg/config.rb', line 1061

# Look up a field in the #to_h representation of this configuration.
#
# The name is split on "." and each segment descends one level into nested
# hashes (e.g. "ocr.backend"). At every level the Symbol form of the segment
# is tried first, then the String form.
#
# @param field_name [String, Symbol] field name, optionally dot-separated
# @return [Object, nil] the value found, or nil when a segment is missing or
#   an intermediate value is not a Hash
def get_field(field_name)
  node = to_h

  field_name.to_s.split('.').each do |segment|
    # Cannot descend into a non-Hash value: the path does not exist.
    return nil unless node.is_a?(Hash)

    symbol_key = segment.to_sym
    node =
      if node.key?(symbol_key)
        node[symbol_key]
      elsif node.key?(segment)
        node[segment]
      end
  end

  node
end

#merge(other) ⇒ Extraction

Merge another configuration into this one

Returns a new configuration with fields from the other config overriding fields from this config (shallow merge).

Examples:

base = Extraction.new(use_cache: true, force_ocr: false)
override = Extraction.new(force_ocr: true)
merged = base.merge(override)
merged.use_cache   # => true
merged.force_ocr   # => true

Parameters:

  • other (Extraction, Hash)

    Configuration to merge

Returns:

  • (Extraction)

    A new configuration containing the merged fields



1094
1095
1096
1097
1098
1099
# File 'lib/kreuzberg/config.rb', line 1094

# Merge another configuration into this one, returning a new configuration.
#
# Fields present in the other configuration's #to_h override this
# configuration's fields (shallow merge). #to_h does not include
# security_limits, so that setting is carried over explicitly: the other
# configuration's value wins when it has one, otherwise this configuration's
# value is kept. Previously the merged configuration always lost it.
#
# @param other [Extraction, Hash] configuration to merge; a Hash is splatted
#   into Extraction.new as keyword arguments
# @return [Extraction] a new configuration; neither operand is modified
def merge(other)
  other_config = other.is_a?(Extraction) ? other : Extraction.new(**other)
  merged_hash = to_h.merge(other_config.to_h)

  unless merged_hash.key?(:security_limits)
    limits = other_config.security_limits
    limits = @security_limits if limits.nil?
    merged_hash[:security_limits] = limits unless limits.nil?
  end

  Extraction.new(**merged_hash)
end

#merge!(other) ⇒ self

Merge another configuration into this one (mutating)

Modifies this configuration in-place by merging fields from another config.

Examples:

base = Extraction.new(use_cache: true, force_ocr: false)
override = Extraction.new(force_ocr: true)
base.merge!(override)
base.use_cache   # => true
base.force_ocr   # => true

Parameters:

  • other (Extraction, Hash)

    Configuration to merge

Returns:

  • (self)


1115
1116
1117
1118
1119
1120
# File 'lib/kreuzberg/config.rb', line 1115

# Merge another configuration into this one in place.
#
# Builds the merged configuration with #merge and then copies it onto this
# instance through update_from_merged.
#
# @param other [Extraction, Hash] configuration to merge; a Hash is splatted
#   into Extraction.new as keyword arguments
# @return [self]
def merge!(other)
  incoming =
    if other.is_a?(Extraction)
      other
    else
      Extraction.new(**other)
    end

  update_from_merged(merge(incoming))
  self
end

#sub_config_hash ⇒ Object



1021
1022
1023
1024
1025
1026
1027
1028
1029
# File 'lib/kreuzberg/config.rb', line 1021

# Collect the nested sub-configurations, each converted with to_h.
#
# @return [Hash{Symbol => Object, nil}] one entry per sub-configuration; the
#   value is nil when that sub-configuration is not set. @images is exposed
#   under the :image_extraction key.
def sub_config_hash
  {
    ocr: @ocr,
    chunking: @chunking,
    language_detection: @language_detection,
    pdf_options: @pdf_options,
    image_extraction: @images,
    postprocessor: @postprocessor,
    token_reduction: @token_reduction,
    keywords: @keywords,
    html_options: @html_options,
    pages: @pages
  }.transform_values { |config| config&.to_h }
end

#to_h ⇒ Object



1005
1006
1007
# File 'lib/kreuzberg/config.rb', line 1005

# Convert the configuration to a Hash.
#
# @return [Hash{Symbol => Object}] the entries of #core_config_hash followed by
#   those of #sub_config_hash, with every nil-valued entry removed (false is kept)
def to_h
  combined = core_config_hash.merge(sub_config_hash)
  combined.reject { |_key, value| value.nil? }
end

#to_json(*_args) ⇒ String

Serialize configuration to JSON string

Examples:

config = Extraction.new(use_cache: true)
json = config.to_json
puts json  # => "{\"use_cache\":true,...}"

Returns:

  • (String)

    JSON representation of the configuration



1040
1041
1042
1043
1044
# File 'lib/kreuzberg/config.rb', line 1040

# Serialize the configuration to a JSON string.
#
# The #to_h representation is generated in Ruby with JSON.generate rather than
# through the native function (the original author notes the native one "has
# issues").
#
# @param _args [Array] accepted for compatibility with the usual to_json
#   signature and ignored
# @return [String] JSON representation of the configuration
def to_json(*_args)
  JSON.generate(to_h)
end

#validate_output_format(value) ⇒ Object

Raises:

  • (ArgumentError)


985
986
987
988
989
990
991
992
993
# File 'lib/kreuzberg/config.rb', line 985

# Normalize and validate an output format.
#
# @param value [String, Symbol, nil] requested format; compared
#   case-insensitively against VALID_OUTPUT_FORMATS
# @return [String, nil] the lowercase format name, or nil when +value+ is nil
# @raise [ArgumentError] if the value is not one of VALID_OUTPUT_FORMATS
def validate_output_format(value)
  return if value.nil?

  normalized = value.to_s.downcase
  unless VALID_OUTPUT_FORMATS.include?(normalized)
    raise ArgumentError,
          "Invalid output_format: #{value}. Valid values: #{VALID_OUTPUT_FORMATS.join(', ')}"
  end

  normalized
end

#validate_result_format(value) ⇒ Object

Raises:

  • (ArgumentError)


995
996
997
998
999
1000
1001
1002
1003
# File 'lib/kreuzberg/config.rb', line 995

# Normalize and validate a result format.
#
# @param value [String, Symbol, nil] requested format; compared
#   case-insensitively against VALID_RESULT_FORMATS
# @return [String, nil] the lowercase format name, or nil when +value+ is nil
# @raise [ArgumentError] if the value is not one of VALID_RESULT_FORMATS
def validate_result_format(value)
  return if value.nil?

  normalized = value.to_s.downcase
  unless VALID_RESULT_FORMATS.include?(normalized)
    raise ArgumentError,
          "Invalid result_format: #{value}. Valid values: #{VALID_RESULT_FORMATS.join(', ')}"
  end

  normalized
end