Class: Kreuzberg::Config::Extraction

Inherits:
Object
  • Object
show all
Defined in:
lib/kreuzberg/config.rb

Overview

Main extraction configuration

Examples:

Basic usage

config = Extraction.new(use_cache: true, force_ocr: true)

With OCR

ocr = Config::OCR.new(backend: "tesseract", language: "eng")
config = Extraction.new(ocr: ocr)

With image extraction

image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
config = Extraction.new(image_extraction: image)

With post-processing

postprocessor = Config::PostProcessor.new(
  enabled: true,
  enabled_processors: ["quality"]
)
config = Extraction.new(postprocessor: postprocessor)

With document structure

config = Extraction.new(include_document_structure: true)

With all options

config = Extraction.new(
  use_cache: true,
  enable_quality_processing: true,
  force_ocr: false,
  include_document_structure: true,
  ocr: Config::OCR.new(language: "deu"),
  chunking: Config::Chunking.new(max_chars: 500),
  language_detection: Config::LanguageDetection.new(enabled: true),
  pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
  image_extraction: Config::ImageExtraction.new(target_dpi: 600),
  postprocessor: Config::PostProcessor.new(enabled: true)
)

Constant Summary collapse

ALLOWED_KEYS =

Load configuration from a file.

Detects the file format from the extension (.toml, .yaml, .json) and loads the configuration accordingly.

Keys that are allowed in the Extraction config

Examples:

Load from TOML

config = Kreuzberg::Config::Extraction.from_file("config.toml")

Load from YAML

config = Kreuzberg::Config::Extraction.from_file("config.yaml")

Returns:

%i[
  use_cache enable_quality_processing force_ocr disable_ocr force_ocr_pages
  include_document_structure ocr chunking language_detection pdf_options image_extraction
  postprocessor token_reduction keywords html_options pages
  max_concurrent_extractions output_format result_format
  security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
  max_archive_depth acceleration email content_filter
].freeze
KEY_ALIASES =

Aliases for backward compatibility

{
  images: :image_extraction
}.freeze
VALID_OUTPUT_FORMATS =

Valid output format values (case-insensitive, normalized internally)

%w[plain markdown html djot].freeze
VALID_RESULT_FORMATS =

Valid result format values (case-insensitive, normalized internally)

%w[unified elements element_based].freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(hash = nil, use_cache: true, enable_quality_processing: true, force_ocr: false, disable_ocr: false, force_ocr_pages: nil, include_document_structure: false, ocr: nil, chunking: nil, language_detection: nil, pdf_options: nil, image_extraction: nil, postprocessor: nil, token_reduction: nil, keywords: nil, html_options: nil, pages: nil, max_concurrent_extractions: nil, output_format: nil, result_format: nil, security_limits: nil, layout: nil, concurrency: nil, cache_namespace: nil, cache_ttl_secs: nil, extraction_timeout_secs: nil, max_archive_depth: 3, acceleration: nil, email: nil, content_filter: nil) ⇒ Extraction

rubocop:disable Metrics/MethodLength



1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
# File 'lib/kreuzberg/config.rb', line 1071

# Builds an extraction configuration.
#
# Settings may be given as keyword arguments, as a Hash in the single
# positional argument, or both. The keyword values (including their
# defaults) are gathered into one hash, handed to extract_from_hash together
# with +hash+, and the result is stored through assign_attributes.
#
# @param hash [Hash, nil] optional settings hash; ignored unless it is a Hash
# @return [Extraction]
def initialize(hash = nil, # rubocop:disable Metrics/MethodLength
               use_cache: true,
               enable_quality_processing: true,
               force_ocr: false,
               disable_ocr: false,
               force_ocr_pages: nil,
               include_document_structure: false,
               ocr: nil,
               chunking: nil,
               language_detection: nil,
               pdf_options: nil,
               image_extraction: nil,
               postprocessor: nil,
               token_reduction: nil,
               keywords: nil,
               html_options: nil,
               pages: nil,
               max_concurrent_extractions: nil,
               output_format: nil,
               result_format: nil,
               security_limits: nil,
               layout: nil,
               concurrency: nil,
               cache_namespace: nil,
               cache_ttl_secs: nil,
               extraction_timeout_secs: nil,
               max_archive_depth: 3,
               acceleration: nil,
               email: nil,
               content_filter: nil)
  # One entry per keyword, in declaration order.
  keyword_values = {
    use_cache: use_cache,
    enable_quality_processing: enable_quality_processing,
    force_ocr: force_ocr,
    disable_ocr: disable_ocr,
    force_ocr_pages: force_ocr_pages,
    include_document_structure: include_document_structure,
    ocr: ocr,
    chunking: chunking,
    language_detection: language_detection,
    pdf_options: pdf_options,
    image_extraction: image_extraction,
    postprocessor: postprocessor,
    token_reduction: token_reduction,
    keywords: keywords,
    html_options: html_options,
    pages: pages,
    max_concurrent_extractions: max_concurrent_extractions,
    output_format: output_format,
    result_format: result_format,
    security_limits: security_limits,
    layout: layout,
    concurrency: concurrency,
    cache_namespace: cache_namespace,
    cache_ttl_secs: cache_ttl_secs,
    extraction_timeout_secs: extraction_timeout_secs,
    max_archive_depth: max_archive_depth,
    acceleration: acceleration,
    email: email,
    content_filter: content_filter
  }

  assign_attributes(extract_from_hash(hash, keyword_values))
end

Instance Attribute Details

#accelerationObject (readonly)

Returns the value of attribute acceleration.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @acceleration, the value assign_attributes obtains from
# normalize_config(params[:acceleration], Acceleration).
def acceleration
  @acceleration
end

#cache_namespaceObject (readonly)

Returns the value of attribute cache_namespace.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @cache_namespace, which assign_attributes stores exactly as supplied.
def cache_namespace
  @cache_namespace
end

#cache_ttl_secsObject (readonly)

Returns the value of attribute cache_ttl_secs.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @cache_ttl_secs: the supplied value coerced with to_i, or nil when
# no value was given.
def cache_ttl_secs
  @cache_ttl_secs
end

#chunkingObject (readonly)

Returns the value of attribute chunking.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @chunking, the value assign_attributes obtains from
# normalize_config(params[:chunking], Chunking).
def chunking
  @chunking
end

#concurrencyObject (readonly)

Returns the value of attribute concurrency.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @concurrency, the value assign_attributes obtains from
# normalize_config(params[:concurrency], Concurrency).
def concurrency
  @concurrency
end

#content_filterObject (readonly)

Returns the value of attribute content_filter.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @content_filter, the value assign_attributes obtains from
# normalize_config(params[:content_filter], ContentFilter).
def content_filter
  @content_filter
end

#disable_ocrObject (readonly)

Returns the value of attribute disable_ocr.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @disable_ocr; assignment collapses the supplied value to a strict
# true/false.
def disable_ocr
  @disable_ocr
end

#emailObject (readonly)

Returns the value of attribute email.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @email, the value assign_attributes obtains from
# normalize_config(params[:email], Email).
def email
  @email
end

#enable_quality_processingObject (readonly)

Returns the value of attribute enable_quality_processing.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @enable_quality_processing; assignment collapses the supplied value
# to a strict true/false.
def enable_quality_processing
  @enable_quality_processing
end

#extraction_timeout_secsObject (readonly)

Returns the value of attribute extraction_timeout_secs.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @extraction_timeout_secs: the supplied value coerced with to_i, or
# nil when no value was given.
def extraction_timeout_secs
  @extraction_timeout_secs
end

#force_ocrObject (readonly)

Returns the value of attribute force_ocr.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @force_ocr; assignment collapses the supplied value to a strict
# true/false.
def force_ocr
  @force_ocr
end

#force_ocr_pagesObject (readonly)

Returns the value of attribute force_ocr_pages.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @force_ocr_pages, which is stored exactly as supplied (no coercion).
def force_ocr_pages
  @force_ocr_pages
end

#html_optionsObject (readonly)

Returns the value of attribute html_options.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @html_options, the value assign_attributes obtains from
# normalize_config(params[:html_options], HtmlOptions).
def html_options
  @html_options
end

#imagesObject (readonly) Also known as: image_extraction

Returns the value of attribute images.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @images, the value assign_attributes obtains from
# normalize_config(params[:image_extraction], ImageExtraction). The same
# reader is also exposed under the name image_extraction.
def images
  @images
end

#include_document_structureObject (readonly)

Returns the value of attribute include_document_structure.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @include_document_structure; assignment collapses the supplied
# value to a strict true/false.
def include_document_structure
  @include_document_structure
end

#keywordsObject (readonly)

Returns the value of attribute keywords.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @keywords, the value assign_attributes obtains from
# normalize_config(params[:keywords], Keywords).
def keywords
  @keywords
end

#language_detectionObject (readonly)

Returns the value of attribute language_detection.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @language_detection, the value assign_attributes obtains from
# normalize_config(params[:language_detection], LanguageDetection).
def language_detection
  @language_detection
end

#layoutObject (readonly)

Returns the value of attribute layout.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @layout, the value assign_attributes obtains from
# normalize_config(params[:layout], LayoutDetection).
def layout
  @layout
end

#max_archive_depthObject (readonly)

Returns the value of attribute max_archive_depth.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @max_archive_depth: the supplied value coerced with to_i, falling
# back to 3 when nil was given.
def max_archive_depth
  @max_archive_depth
end

#max_concurrent_extractionsObject (readonly)

Returns the value of attribute max_concurrent_extractions.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @max_concurrent_extractions: the supplied value coerced with to_i,
# or nil when no value was given.
def max_concurrent_extractions
  @max_concurrent_extractions
end

#ocrObject (readonly)

Returns the value of attribute ocr.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @ocr, the value assign_attributes obtains from
# normalize_config(params[:ocr], OCR).
def ocr
  @ocr
end

#output_formatObject

Returns the value of attribute output_format.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @output_format: nil, or the lower-cased format string that passed
# validate_output_format.
def output_format
  @output_format
end

#pagesObject (readonly)

Returns the value of attribute pages.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @pages, the value assign_attributes obtains from
# normalize_config(params[:pages], PageConfig).
def pages
  @pages
end

#pdf_optionsObject (readonly)

Returns the value of attribute pdf_options.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @pdf_options, the value assign_attributes obtains from
# normalize_config(params[:pdf_options], PDF).
def pdf_options
  @pdf_options
end

#postprocessorObject (readonly)

Returns the value of attribute postprocessor.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @postprocessor, the value assign_attributes obtains from
# normalize_config(params[:postprocessor], PostProcessor).
def postprocessor
  @postprocessor
end

#result_formatObject

Returns the value of attribute result_format.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @result_format: nil, or the lower-cased format string that passed
# validate_result_format.
def result_format
  @result_format
end

#security_limitsObject (readonly)

Returns the value of attribute security_limits.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @security_limits, which assign_attributes stores exactly as
# supplied (no normalization is applied to it there).
def security_limits
  @security_limits
end

#token_reductionObject (readonly)

Returns the value of attribute token_reduction.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @token_reduction, the value assign_attributes obtains from
# normalize_config(params[:token_reduction], TokenReduction).
def token_reduction
  @token_reduction
end

#use_cacheObject (readonly)

Returns the value of attribute use_cache.



980
981
982
# File 'lib/kreuzberg/config.rb', line 980

# Returns @use_cache; assignment collapses the supplied value to a strict
# true/false.
def use_cache
  @use_cache
end

Class Method Details

.discover ⇒ Kreuzberg::Config::Extraction?

Discover configuration file in current or parent directories.

Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current directory and parent directories.

Examples:

config = Kreuzberg::Config::Extraction.discover
if config
  # Use discovered config
end

Returns:



1064
1065
1066
1067
1068
1069
# File 'lib/kreuzberg/config.rb', line 1064

# Builds a configuration from whatever the native discovery call reports.
#
# @return [Kreuzberg::Config::Extraction, nil] nil when the native call
#   returns nil; otherwise a new instance built from the key-normalized hash
def self.discover
  discovered = Kreuzberg._config_discover_native
  new(**normalize_hash_keys(discovered)) unless discovered.nil?
end

.from_file(path) ⇒ Object



1028
1029
1030
1031
# File 'lib/kreuzberg/config.rb', line 1028

# Loads a configuration from the file at +path+.
#
# The file is read by the native loader; the hash it returns has its keys
# normalized and is then splatted into the constructor.
#
# @param path [String] location of the configuration file
# @return [Kreuzberg::Config::Extraction]
def self.from_file(path)
  new(**normalize_hash_keys(Kreuzberg._config_from_file_native(path)))
end

Instance Method Details

#[](key) ⇒ Object?

Get a configuration field using hash-like syntax

Examples:

config = Extraction.new(use_cache: true)
config[:use_cache]  # => true

Parameters:

  • key (Symbol, String)

    Field name to get

Returns:

  • (Object, nil)

    The field value



1395
1396
1397
1398
1399
# File 'lib/kreuzberg/config.rb', line 1395

# Get a configuration field using hash-like syntax.
#
# Only *public* methods are dispatched to. The key is caller-supplied, so
# using a bare +send+ here would also reach private methods such as
# Kernel#exit or Kernel#system; +public_send+ guarded by +respond_to?+ keeps
# the lookup limited to the object's public interface. Errors raised by the
# reader itself are no longer swallowed.
#
# @param key [Symbol, String] field name to get
# @return [Object, nil] the field value, or nil when the key cannot be
#   converted to a Symbol or does not name a public method
def [](key)
  return nil unless key.respond_to?(:to_sym)

  name = key.to_sym
  respond_to?(name) ? public_send(name) : nil
end

#[]=(key, value) ⇒ Object

Set a configuration field using hash-like syntax

rubocop:disable Metrics/MethodLength

Examples:

config = Extraction.new(use_cache: true)
config[:use_cache] = false
config[:force_ocr] = true

Parameters:

  • key (Symbol, String)

    Field name to set

  • value (Object)

    Value to set

Returns:

  • (Object)

    The value that was set



1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
# File 'lib/kreuzberg/config.rb', line 1323

# Set a configuration field using hash-like syntax.
#
# Each key is coerced the same way assign_attributes coerces it: boolean
# flags collapse to true/false, sub-configs go through normalize_config,
# integer fields use to_i, and the two format fields are validated.
#
# Also handled: :content_filter and :security_limits, which assign_attributes
# sets as well, and :images, the reader's own name for :image_extraction.
#
# @param key [Symbol, String] field name to set
# @param value [Object] value to set
# @return [Object] the value that was stored
# @raise [ArgumentError] if the key is not a known configuration field, or a
#   format value is rejected by its validator
def []=(key, value) # rubocop:disable Metrics/MethodLength
  key_sym = key.to_sym
  case key_sym
  when :use_cache
    @use_cache = value ? true : false
  when :enable_quality_processing
    @enable_quality_processing = value ? true : false
  when :force_ocr
    @force_ocr = value ? true : false
  when :disable_ocr
    @disable_ocr = value ? true : false
  when :force_ocr_pages
    @force_ocr_pages = value
  when :include_document_structure
    @include_document_structure = value ? true : false
  when :ocr
    @ocr = normalize_config(value, OCR)
  when :chunking
    @chunking = normalize_config(value, Chunking)
  when :language_detection
    @language_detection = normalize_config(value, LanguageDetection)
  when :pdf_options
    @pdf_options = normalize_config(value, PDF)
  when :image_extraction, :images
    # Both names address the same attribute; the backing ivar is @images.
    @images = normalize_config(value, ImageExtraction)
  when :postprocessor
    @postprocessor = normalize_config(value, PostProcessor)
  when :token_reduction
    @token_reduction = normalize_config(value, TokenReduction)
  when :keywords
    @keywords = normalize_config(value, Keywords)
  when :html_options
    @html_options = normalize_config(value, HtmlOptions)
  when :pages
    @pages = normalize_config(value, PageConfig)
  when :layout
    @layout = normalize_config(value, LayoutDetection)
  when :concurrency
    @concurrency = normalize_config(value, Concurrency)
  when :acceleration
    @acceleration = normalize_config(value, Acceleration)
  when :email
    @email = normalize_config(value, Email)
  when :content_filter
    @content_filter = normalize_config(value, ContentFilter)
  when :max_archive_depth
    # nil falls back to the constructor default of 3.
    @max_archive_depth = value&.to_i || 3
  when :max_concurrent_extractions
    @max_concurrent_extractions = value&.to_i
  when :output_format
    @output_format = validate_output_format(value)
  when :result_format
    @result_format = validate_result_format(value)
  when :cache_namespace
    @cache_namespace = value
  when :cache_ttl_secs
    @cache_ttl_secs = value&.to_i
  when :extraction_timeout_secs
    @extraction_timeout_secs = value&.to_i
  when :security_limits
    # Stored as supplied, matching assign_attributes.
    @security_limits = value
  else
    raise ArgumentError, "Unknown configuration key: #{key}"
  end
end

#assign_attributes(params) ⇒ Object

rubocop:disable Metrics/MethodLength



1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
# File 'lib/kreuzberg/config.rb', line 1133

# Copies every setting from +params+ into its instance variable, coercing as
# it goes:
#
# * boolean flags are collapsed to a strict true/false
# * sub-configs are passed through normalize_config with their class
# * integer fields use to_i (nil is kept, except max_archive_depth, which
#   falls back to 3)
# * output_format / result_format go through their validators
# * force_ocr_pages, cache_namespace and security_limits are stored as given
#
# @param params [Hash] settings keyed by Symbol
def assign_attributes(params) # rubocop:disable Metrics/MethodLength
  flag = ->(name) { params[name] ? true : false }
  sub_config = ->(name, klass) { normalize_config(params[name], klass) }
  integer = ->(name) { params[name]&.to_i }

  @use_cache = flag.call(:use_cache)
  @enable_quality_processing = flag.call(:enable_quality_processing)
  @force_ocr = flag.call(:force_ocr)
  @disable_ocr = flag.call(:disable_ocr)
  @force_ocr_pages = params[:force_ocr_pages]
  @include_document_structure = flag.call(:include_document_structure)

  @ocr = sub_config.call(:ocr, OCR)
  @chunking = sub_config.call(:chunking, Chunking)
  @language_detection = sub_config.call(:language_detection, LanguageDetection)
  @pdf_options = sub_config.call(:pdf_options, PDF)
  # The :image_extraction setting is kept in @images.
  @images = sub_config.call(:image_extraction, ImageExtraction)
  @postprocessor = sub_config.call(:postprocessor, PostProcessor)
  @token_reduction = sub_config.call(:token_reduction, TokenReduction)
  @keywords = sub_config.call(:keywords, Keywords)
  @html_options = sub_config.call(:html_options, HtmlOptions)
  @pages = sub_config.call(:pages, PageConfig)
  @layout = sub_config.call(:layout, LayoutDetection)
  @concurrency = sub_config.call(:concurrency, Concurrency)
  @acceleration = sub_config.call(:acceleration, Acceleration)
  @email = sub_config.call(:email, Email)
  @content_filter = sub_config.call(:content_filter, ContentFilter)

  @max_concurrent_extractions = integer.call(:max_concurrent_extractions)
  @max_archive_depth = integer.call(:max_archive_depth) || 3
  @output_format = validate_output_format(params[:output_format])
  @result_format = validate_result_format(params[:result_format])
  @cache_namespace = params[:cache_namespace]
  @cache_ttl_secs = integer.call(:cache_ttl_secs)
  @extraction_timeout_secs = integer.call(:extraction_timeout_secs)
  @security_limits = params[:security_limits]
end

#core_config_hash ⇒ Object



1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
# File 'lib/kreuzberg/config.rb', line 1189

# Collects the scalar (non sub-config) settings into a Hash keyed by Symbol.
# Each key maps to the instance variable of the same name; unset values
# appear as nil.
#
# @return [Hash{Symbol => Object}]
def core_config_hash
  %i[
    use_cache enable_quality_processing force_ocr disable_ocr force_ocr_pages
    include_document_structure max_concurrent_extractions max_archive_depth
    output_format result_format cache_namespace cache_ttl_secs extraction_timeout_secs
  ].each_with_object({}) do |field, collected|
    collected[field] = instance_variable_get(:"@#{field}")
  end
end

#extract_from_hash(hash, defaults) ⇒ Object



1126
1127
1128
1129
1130
1131
# File 'lib/kreuzberg/config.rb', line 1126

# Overlays the recognised entries of +hash+ on top of +defaults+.
#
# Keys of +hash+ are converted with to_sym; only keys that already exist in
# +defaults+ are taken over, everything else is ignored. When +hash+ is not
# a Hash, +defaults+ itself is returned untouched.
#
# @param hash [Hash, Object] candidate overrides (String or Symbol keys)
# @param defaults [Hash{Symbol => Object}] baseline values
# @return [Hash{Symbol => Object}]
def extract_from_hash(hash, defaults)
  return defaults unless hash.is_a?(Hash)

  overrides = hash.each_with_object({}) do |(raw_key, value), picked|
    name = raw_key.to_sym
    picked[name] = value if defaults.key?(name)
  end
  defaults.merge(overrides)
end

#get_field(field_name) ⇒ Object?

Get a field from the configuration

Supports dot notation for nested fields (e.g., “ocr.backend”)

Examples:

Get a top-level field

config = Extraction.new(use_cache: true)
config.get_field("use_cache")  # => true

Get a nested field

config = Extraction.new(ocr: OCR.new(backend: "tesseract"))
config.get_field("ocr.backend")  # => "tesseract"

Parameters:

  • field_name (String, Symbol)

    Field name to retrieve

Returns:

  • (Object, nil)

    Parsed field value, or nil if field doesn’t exist



1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
# File 'lib/kreuzberg/config.rb', line 1250

# Looks up a field in the hash form of this configuration (see to_h).
#
# +field_name+ may use dot notation ("ocr.backend") to descend into nested
# hashes. At each level the Symbol form of the segment is tried first, then
# the String form.
#
# @param field_name [String, Symbol] field name or dotted path
# @return [Object, nil] the value found, or nil when a segment is missing or
#   the path descends into something that is not a Hash
def get_field(field_name)
  node = to_h

  field_name.to_s.split('.').each do |segment|
    # Cannot descend any further once we have left hash territory.
    return nil unless node.is_a?(Hash)

    symbol_key = segment.to_sym
    node =
      if node.key?(symbol_key)
        node[symbol_key]
      elsif node.key?(segment)
        node[segment]
      end
  end

  node
end

#merge(other) ⇒ Extraction

Merge another configuration into this one

Returns a new configuration with fields from the other config overriding fields from this config (shallow merge).

Examples:

base = Extraction.new(use_cache: true, force_ocr: false)
override = Extraction.new(force_ocr: true)
merged = base.merge(override)
merged.use_cache   # => true
merged.force_ocr   # => true

Parameters:

  • other (Extraction, Hash)

    Configuration to merge

Returns:



1283
1284
1285
1286
1287
1288
# File 'lib/kreuzberg/config.rb', line 1283

# Returns a new configuration in which fields of +other+ override fields of
# this one (shallow merge of the two to_h representations).
#
# Note: a Hash argument is first turned into a full Extraction, and to_h only
# drops nil entries. Fields that are never nil on a constructed config (the
# boolean flags and max_archive_depth, see assign_attributes) are therefore
# always taken from +other+, even when +other+ merely carries their defaults.
#
# @param other [Extraction, Hash] configuration to merge in
# @return [Extraction] a new instance; the receiver is not modified
def merge(other)
  incoming =
    if other.is_a?(Extraction)
      other
    else
      Extraction.new(**other)
    end

  own_fields = to_h
  Extraction.new(**own_fields.merge(incoming.to_h))
end

#merge!(other) ⇒ self

Merge another configuration into this one (mutating)

Modifies this configuration in-place by merging fields from another config.

Examples:

base = Extraction.new(use_cache: true, force_ocr: false)
override = Extraction.new(force_ocr: true)
base.merge!(override)
base.use_cache   # => true
base.force_ocr   # => true

Parameters:

  • other (Extraction, Hash)

    Configuration to merge

Returns:

  • (self)


1304
1305
1306
1307
1308
1309
# File 'lib/kreuzberg/config.rb', line 1304

# In-place variant of merge: computes the merged configuration and copies it
# back into the receiver through update_from_merged.
#
# +other+ is handed to merge as-is; merge itself converts a Hash argument
# into an Extraction.
#
# @param other [Extraction, Hash] configuration to merge in
# @return [self]
def merge!(other)
  update_from_merged(merge(other))
  self
end

#sub_config_hash ⇒ Object



1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
# File 'lib/kreuzberg/config.rb', line 1207

# Collects the nested sub-configurations into a Hash keyed by Symbol, each
# converted with to_h. Sub-configs that are nil stay nil. The @images
# attribute is published under the :image_extraction key.
#
# @return [Hash{Symbol => Hash, nil}]
def sub_config_hash
  {
    ocr: @ocr,
    chunking: @chunking,
    language_detection: @language_detection,
    pdf_options: @pdf_options,
    image_extraction: @images,
    postprocessor: @postprocessor,
    token_reduction: @token_reduction,
    keywords: @keywords,
    html_options: @html_options,
    pages: @pages,
    layout: @layout,
    concurrency: @concurrency,
    acceleration: @acceleration,
    email: @email,
    content_filter: @content_filter
  }.transform_values { |section| section&.to_h }
end

#to_h ⇒ Object



1185
1186
1187
# File 'lib/kreuzberg/config.rb', line 1185

# Hash form of the configuration: the scalar settings (core_config_hash)
# combined with the nested ones (sub_config_hash), with every nil-valued
# entry left out.
#
# @return [Hash{Symbol => Object}]
def to_h
  combined = core_config_hash.merge(sub_config_hash)
  combined.reject { |_name, setting| setting.nil? }
end

#to_json(*_args) ⇒ String

Serialize configuration to JSON string

Examples:

config = Extraction.new(use_cache: true)
json = config.to_json
puts json  # => "{\"use_cache\":true,...}"

Returns:

  • (String)

    JSON representation of the configuration



1229
1230
1231
1232
1233
# File 'lib/kreuzberg/config.rb', line 1229

# Serializes the configuration to a JSON string.
#
# The hash from to_h is encoded with JSON.generate on the Ruby side rather
# than through the native function, which is reported to have issues. Any
# arguments are accepted and ignored.
#
# @return [String] JSON representation of the configuration
def to_json(*_args)
  JSON.generate(to_h)
end

#validate_output_format(value) ⇒ Object

Raises:

  • (ArgumentError)


1165
1166
1167
1168
1169
1170
1171
1172
1173
# File 'lib/kreuzberg/config.rb', line 1165

# Normalizes and checks an output format.
#
# @param value [#to_s, nil] requested format; matched case-insensitively
# @return [String, nil] the lower-cased format, or nil when +value+ is nil
# @raise [ArgumentError] when the lower-cased value is not listed in
#   VALID_OUTPUT_FORMATS
def validate_output_format(value)
  return nil if value.nil?

  normalized = value.to_s.downcase
  unless VALID_OUTPUT_FORMATS.include?(normalized)
    raise ArgumentError,
          "Invalid output_format: #{value}. Valid values: #{VALID_OUTPUT_FORMATS.join(', ')}"
  end

  normalized
end

#validate_result_format(value) ⇒ Object

Raises:

  • (ArgumentError)


1175
1176
1177
1178
1179
1180
1181
1182
1183
# File 'lib/kreuzberg/config.rb', line 1175

# Normalizes and checks a result format.
#
# @param value [#to_s, nil] requested format; matched case-insensitively
# @return [String, nil] the lower-cased format, or nil when +value+ is nil
# @raise [ArgumentError] when the lower-cased value is not listed in
#   VALID_RESULT_FORMATS
def validate_result_format(value)
  return nil if value.nil?

  normalized = value.to_s.downcase
  unless VALID_RESULT_FORMATS.include?(normalized)
    raise ArgumentError,
          "Invalid result_format: #{value}. Valid values: #{VALID_RESULT_FORMATS.join(', ')}"
  end

  normalized
end