Class: DataDrain::FileIngestor

Inherits:
Object
  • Object
show all
Includes:
Observability
Defined in:
lib/data_drain/file_ingestor.rb

Overview

Class responsible for ingesting local files (CSV, JSON, Parquet) generated by other services (e.g. Netflow) and uploading them to the Data Lake, applying ZSTD compression and Hive partitioning.

Constant Summary

Constants included from Observability

Observability::SENSITIVE_KEY_PATTERN

Instance Method Summary collapse

Constructor Details

#initialize(options) ⇒ FileIngestor

Returns a new instance of FileIngestor.

Parameters:

  • options (Hash)

    Opciones de ingestión.

Options Hash (options):

  • :source_path (String)

    Ruta absoluta al archivo local.

  • :folder_name (String)

    Nombre de la carpeta destino en el Data Lake.

  • :partition_keys (Array<String, Symbol>) — default: Opcional

    Columnas para particionar.

  • :select_sql (String) — default: Opcional

    Sentencia SELECT para transformar datos al vuelo.

  • :delete_after_upload (Boolean) — default: Opcional

    Borra el archivo local al terminar. Por defecto true.



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/data_drain/file_ingestor.rb', line 18

# Builds a new ingestor from an options hash.
#
# @param options [Hash] ingestion options.
# @option options [String] :source_path absolute path to the local file (required).
# @option options [String] :folder_name destination folder in the Data Lake (required).
# @option options [Array<String, Symbol>] :partition_keys columns to partition by (optional).
# @option options [String] :select_sql SELECT expression applied on the fly (optional, defaults to "*").
# @option options [Boolean] :delete_after_upload remove the local file when done (defaults to true).
def initialize(options)
  # Required options: fetch without a default raises KeyError when absent.
  @source_path = options.fetch(:source_path)
  @folder_name = options.fetch(:folder_name)
  Validations.validate_identifier!(:folder_name, @folder_name)

  # Optional knobs and their defaults.
  @partition_keys = options.fetch(:partition_keys, [])
  @select_sql = options.fetch(:select_sql, "*")
  @delete_after_upload = options.fetch(:delete_after_upload, true)
  @bucket = options[:bucket]

  # Global configuration, logging and the storage backend.
  @config = DataDrain.configuration
  @config.validate!
  @logger = @config.logger
  @adapter = DataDrain::Storage.adapter

  # In-memory DuckDB session used for reading and exporting the file.
  db = DuckDB::Database.open(":memory:")
  @duckdb = db.connect
end

Instance Method Details

#call ⇒ Boolean

Runs the ingestion flow.

Returns:

  • (Boolean)

    true si el proceso fue exitoso.



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/data_drain/file_ingestor.rb', line 38

# Runs the ingestion flow: verifies the source file exists, counts its
# rows, exports them to the Data Lake as ZSTD-compressed Parquet
# (optionally Hive-partitioned), and deletes the local file afterwards.
#
# @return [Boolean] true when the process succeeded (including the
#   empty-file skip), false on a missing file or a DuckDB error.
def call
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  safe_log(:info, "file_ingestor.start", { source_path: @source_path })

  unless File.exist?(@source_path)
    safe_log(:error, "file_ingestor.file_not_found", { source_path: @source_path })
    return false
  end

  # Resource limits come from operator-controlled configuration, not user input.
  @duckdb.query("SET max_memory='#{@config.limit_ram}';") if @config.limit_ram.present?
  @duckdb.query("SET temp_directory='#{@config.tmp_directory}'") if @config.tmp_directory.present?

  @adapter.setup_duckdb(@duckdb)

  # Pick the DuckDB reader function according to the file extension.
  reader_function = determine_reader

  # 1. Safety count: avoid running the export at all for empty files.
  step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  source_count = @duckdb.query("SELECT COUNT(*) FROM #{reader_function}").first.first
  source_query_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
  safe_log(:info, "file_ingestor.count", {
             source_path: @source_path,
             count: source_count,
             source_query_duration_s: source_query_duration.round(2)
           })

  if source_count.zero?
    cleanup_local_file
    duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
    safe_log(:info, "file_ingestor.skip_empty", { source_path: @source_path, duration_s: duration.round(2) })
    return true
  end

  # 2. Export / upload.
  @adapter.prepare_export_path(@bucket, @folder_name)
  dest_path = if @config.storage_mode.to_sym == :s3
                "s3://#{@bucket}/#{@folder_name}/"
              else
                File.join(@bucket,
                          @folder_name, "")
              end

  partition_clause = @partition_keys.any? ? "PARTITION_BY (#{@partition_keys.join(", ")})," : ""

  # NOTE(review): @select_sql and @partition_keys are interpolated directly
  # into SQL; folder_name is validated above, but these are assumed to come
  # from trusted service configuration — confirm they are never user input.
  query = <<~SQL
    COPY (
      SELECT #{@select_sql}
      FROM #{reader_function}
    ) TO '#{dest_path}'
    (
      FORMAT PARQUET,
      #{partition_clause}
      COMPRESSION 'ZSTD',
      OVERWRITE_OR_IGNORE 1
    );
  SQL

  safe_log(:info, "file_ingestor.export_start", { dest_path: dest_path })
  step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  @duckdb.query(query)
  export_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start

  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
  safe_log(:info, "file_ingestor.complete", {
             source_path: @source_path,
             duration_s: duration.round(2),
             source_query_duration_s: source_query_duration.round(2),
             export_duration_s: export_duration.round(2),
             count: source_count
           })

  cleanup_local_file
  true
rescue DuckDB::Error => e
  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
  # Hash#merge requires an object responding to #to_hash; merging the
  # exception object itself (the previous `.merge((e))`) raised TypeError
  # inside the rescue and masked the original error. Log explicit fields.
  safe_log(:error, "file_ingestor.duckdb_error",
           { source_path: @source_path,
             error_class: e.class.name,
             error_message: e.message,
             duration_s: duration.round(2) })
  false
ensure
  # Always release the DuckDB connection, even on failure.
  @duckdb&.close
end