Class: DataDrain::GlueRunner
- Inherits:
-
Object
- Object
- DataDrain::GlueRunner
- Extended by:
- Observability
- Defined in:
- lib/data_drain/glue_runner.rb
Overview
Orquestador para AWS Glue. Permite disparar y monitorear Jobs en AWS para delegar el movimiento masivo de datos (ej. tablas de 1TB).
Class Attribute Summary collapse
-
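.client ⇒ Aws::Glue::Client
Devuelve el cliente de AWS Glue, creado una sola vez (memoizado) con la región de DataDrain.configuration.aws_region. (El texto original, «Dispara un Job de Glue y espera a que termine exitosamente», describe a .run_and_wait, no a este atributo.)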
Class Method Summary collapse
- .create_job(job_name, role_arn:, script_location:, command_name: "glueetl", default_arguments: {}, description: nil, worker_type: nil, number_of_workers: nil, timeout: 2880, max_retries: 0, allocated_capacity: nil, glue_version: nil) ⇒ Object
- .delete_job(job_name) ⇒ Object
- .ensure_job(job_name, role_arn:, script_location:, command_name: "glueetl", default_arguments: {}, description: nil, worker_type: nil, number_of_workers: nil, timeout: 2880, max_retries: 0, allocated_capacity: nil, glue_version: nil) ⇒ Object
- .get_job(job_name) ⇒ Object
- .job_exists?(job_name) ⇒ Boolean
- .run_and_wait(job_name, arguments = {}, polling_interval: 30, max_wait_seconds: nil) ⇒ Object
- .update_job(job_name, role_arn: nil, command_name: nil, script_location: nil, default_arguments: nil, description: nil, worker_type: nil, number_of_workers: nil, timeout: nil, max_retries: nil, allocated_capacity: nil, glue_version: nil) ⇒ Object
Class Attribute Details
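.client ⇒ Aws::Glue::Client
Devuelve el cliente de AWS Glue, creado una sola vez (memoizado) con la región de DataDrain.configuration.aws_region. (El texto original, «Dispara un Job de Glue y espera a que termine exitosamente», describe a .run_and_wait, no a este atributo.)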
22 23 24 |
# File 'lib/data_drain/glue_runner.rb', line 22

# Lazily builds the AWS Glue client and caches it in the class-level
# @client instance variable, so later calls reuse the same object.
# The region is read from DataDrain.configuration.aws_region at the moment
# the client is first built.
#
# @return [Aws::Glue::Client] the cached Glue client
def self.client
  return @client if @client

  @client = Aws::Glue::Client.new(region: DataDrain.configuration.aws_region)
end
Class Method Details
.create_job(job_name, role_arn:, script_location:, command_name: "glueetl", default_arguments: {}, description: nil, worker_type: nil, number_of_workers: nil, timeout: 2880, max_retries: 0, allocated_capacity: nil, glue_version: nil) ⇒ Object
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/data_drain/glue_runner.rb', line 43

# Creates a Glue job definition and returns the job as read back via get_job.
#
# Only settings with a truthy value are sent to Glue; default_arguments is
# additionally skipped when it is nil or empty.
#
# @param job_name [String] job name, checked with Validations.validate_glue_name!
# @param role_arn [String] sent as the job's :role
# @param script_location [String] sent as command[:script_location]
# @param command_name [String] sent as command[:name]
# @param default_arguments [Hash, nil] default job arguments; nil is treated
#   like an empty hash (previously nil raised NoMethodError on #empty?)
# @param description [String, nil]
# @param worker_type [String, nil]
# @param number_of_workers [Integer, nil]
# @param timeout [Integer, nil] presumably minutes, per the Glue API - confirm
# @param max_retries [Integer, nil]
# @param allocated_capacity [Integer, nil]
# @param glue_version [String, nil]
# @return [Object] whatever get_job(job_name) returns for the new job
def self.create_job(job_name, role_arn:, script_location:, command_name: "glueetl",
                    default_arguments: {}, description: nil, worker_type: nil,
                    number_of_workers: nil, timeout: 2880, max_retries: 0,
                    allocated_capacity: nil, glue_version: nil)
  DataDrain::Validations.validate_glue_name!(:job_name, job_name)

  opts = {
    name: job_name,
    role: role_arn,
    # python_version is always sent as "3", regardless of command_name.
    command: { name: command_name, python_version: "3", script_location: script_location }
  }

  # Guard against nil as well as {} so an explicit `default_arguments: nil`
  # no longer crashes before the request is made.
  opts[:default_arguments] = default_arguments unless default_arguments.nil? || default_arguments.empty?

  optional_settings = {
    description: description,
    timeout: timeout,
    max_retries: max_retries,
    allocated_capacity: allocated_capacity,
    worker_type: worker_type,
    number_of_workers: number_of_workers,
    glue_version: glue_version
  }
  # Truthiness check (not nil?) keeps 0 and "" but drops nil/false.
  optional_settings.each { |key, value| opts[key] = value if value }

  client.create_job(**opts)
  get_job(job_name)
end
.delete_job(job_name) ⇒ Object
93 94 95 96 97 |
# File 'lib/data_drain/glue_runner.rb', line 93

# Deletes the Glue job with the given name.
#
# The name is validated first; the client's response is discarded.
#
# @param job_name [String] job name, checked with Validations.validate_glue_name!
# @return [nil] always nil
def self.delete_job(job_name)
  DataDrain::Validations.validate_glue_name!(:job_name, job_name)

  request = { job_name: job_name }
  client.delete_job(**request)

  nil
end
.ensure_job(job_name, role_arn:, script_location:, command_name: "glueetl", default_arguments: {}, description: nil, worker_type: nil, number_of_workers: nil, timeout: 2880, max_retries: 0, allocated_capacity: nil, glue_version: nil) ⇒ Object
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/data_drain/glue_runner.rb', line 99

# Creates the job when it does not exist yet, otherwise updates it with the
# same settings. Every keyword is forwarded unchanged to create_job or
# update_job.
#
# An info event is logged before the create/update call is issued
# ("glue_runner.job_created" or "glue_runner.job_exists").
#
# @param job_name [String] name of the Glue job
# @return [Object] the return value of create_job or update_job
def self.ensure_job(job_name, role_arn:, script_location:, command_name: "glueetl",
                    default_arguments: {}, description: nil, worker_type: nil,
                    number_of_workers: nil, timeout: 2880, max_retries: 0,
                    allocated_capacity: nil, glue_version: nil)
  # Both branches take exactly the same keywords, so build them once.
  settings = {
    role_arn: role_arn,
    script_location: script_location,
    command_name: command_name,
    default_arguments: default_arguments,
    description: description,
    worker_type: worker_type,
    number_of_workers: number_of_workers,
    timeout: timeout,
    max_retries: max_retries,
    allocated_capacity: allocated_capacity,
    glue_version: glue_version
  }

  unless job_exists?(job_name)
    safe_log(:info, "glue_runner.job_created", { job: job_name })
    return create_job(job_name, **settings)
  end

  safe_log(:info, "glue_runner.job_exists", { job: job_name })
  update_job(job_name, **settings)
end
.get_job(job_name) ⇒ Object
38 39 40 41 |
# File 'lib/data_drain/glue_runner.rb', line 38

# Fetches a Glue job definition by name.
#
# Errors raised by the client are not rescued here; job_exists? relies on
# Aws::Glue::Errors::EntityNotFoundException propagating from this method.
#
# @param job_name [String] job name, checked with Validations.validate_glue_name!
# @return [Object] the `job` member of the client's get_job response
def self.get_job(job_name)
  DataDrain::Validations.validate_glue_name!(:job_name, job_name)

  response = client.get_job(job_name: job_name)
  response.job
end
.job_exists?(job_name) ⇒ Boolean
30 31 32 33 34 35 36 |
# File 'lib/data_drain/glue_runner.rb', line 30

# Tells whether a Glue job with the given name exists.
#
# Existence is probed by calling get_job; only
# Aws::Glue::Errors::EntityNotFoundException is translated into false,
# any other error propagates to the caller.
#
# @param job_name [String] job name, checked with Validations.validate_glue_name!
# @return [Boolean] true when get_job succeeds, false when the job is not found
def self.job_exists?(job_name)
  begin
    DataDrain::Validations.validate_glue_name!(:job_name, job_name)
    get_job(job_name)
  rescue Aws::Glue::Errors::EntityNotFoundException
    return false
  end

  true
end
.run_and_wait(job_name, arguments = {}, polling_interval: 30, max_wait_seconds: nil) ⇒ Object
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
# File 'lib/data_drain/glue_runner.rb', line 122

# Starts a Glue job run and blocks, polling its state, until the run
# reaches a terminal state or the optional wait budget is exhausted.
#
# @param job_name [String] name of the Glue job to start
# @param arguments [Hash] passed as `arguments` to start_job_run
# @param polling_interval [Numeric] seconds slept between state checks
# @param max_wait_seconds [Numeric, nil] wait budget measured from the start
#   of this call; nil means wait indefinitely
# @return [true] when the run ends in state SUCCEEDED
# @raise [DataDrain::Error] when max_wait_seconds is exceeded
# @raise [RuntimeError] when the run ends in FAILED, STOPPED, TIMEOUT,
#   ERROR or EXPIRED
def self.run_and_wait(job_name, arguments = {}, polling_interval: 30, max_wait_seconds: nil)
  config = DataDrain.configuration
  config.validate!
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  # Monotonic clock so the measured duration is immune to wall-clock changes.
  elapsed = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time }
  @logger = config.logger

  safe_log(:info, "glue_runner.start", { job: job_name })
  resp = client.start_job_run(job_name: job_name, arguments: arguments)
  run_id = resp.job_run_id

  loop do
    # The budget is checked before each poll, so the overshoot is at most
    # one polling_interval plus one API call.
    if max_wait_seconds && elapsed.call > max_wait_seconds
      safe_log(:error, "glue_runner.timeout",
               { job: job_name, run_id: run_id, max_wait_seconds: max_wait_seconds })
      raise DataDrain::Error,
            "Glue Job #{job_name} (Run ID: #{run_id}) excedió max_wait_seconds=#{max_wait_seconds}"
    end

    run_info = client.get_job_run(job_name: job_name, run_id: run_id).job_run
    status = run_info.job_run_state

    case status
    when "SUCCEEDED"
      safe_log(:info, "glue_runner.complete",
               { job: job_name, run_id: run_id, duration_s: elapsed.call.round(2) })
      return true
    when "FAILED", "STOPPED", "TIMEOUT", "ERROR", "EXPIRED"
      # ERROR and EXPIRED are terminal too; without them the loop would
      # keep polling a dead run forever when max_wait_seconds is nil.
      error_context = { job: job_name, run_id: run_id, status: status, duration_s: elapsed.call.round(2) }
      if run_info.error_message
        # Swap double quotes for single quotes and cap at 200 characters
        # before the message is handed to the logger.
        error_context[:error_message] = run_info.error_message.tr("\"", "'")[0, 200]
      end
      safe_log(:error, "glue_runner.failed", error_context)
      raise "Glue Job #{job_name} (Run ID: #{run_id}) falló con estado #{status}."
    else
      safe_log(:info, "glue_runner.polling",
               { job: job_name, run_id: run_id, status: status, next_check_in_s: polling_interval })
      sleep polling_interval
    end
  end
end
.update_job(job_name, role_arn: nil, command_name: nil, script_location: nil, default_arguments: nil, description: nil, worker_type: nil, number_of_workers: nil, timeout: nil, max_retries: nil, allocated_capacity: nil, glue_version: nil) ⇒ Object
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/data_drain/glue_runner.rb', line 69

# Updates a Glue job definition with the settings that were given and
# returns the job as read back via get_job.
#
# Only keywords with a truthy value are put into the job_update payload.
# When just one of command_name / script_location is given, the other half
# of the command is taken from the job's current definition (previously a
# lone command_name or script_location was silently ignored).
#
# NOTE(review): the AWS Glue UpdateJob documentation says the previous job
# definition is overwritten by the supplied one, so settings omitted here
# may be reset on the AWS side - confirm before relying on partial updates.
#
# @param job_name [String] job name, checked with Validations.validate_glue_name!
# @param role_arn [String, nil] sent as :role
# @param command_name [String, nil] sent as command[:name]
# @param script_location [String, nil] sent as command[:script_location]
# @param default_arguments [Hash, nil] sent as-is when not nil (an empty hash is sent)
# @param description [String, nil]
# @param worker_type [String, nil]
# @param number_of_workers [Integer, nil]
# @param timeout [Integer, nil]
# @param max_retries [Integer, nil]
# @param allocated_capacity [Integer, nil]
# @param glue_version [String, nil]
# @return [Object] whatever get_job(job_name) returns after the update
def self.update_job(job_name, role_arn: nil, command_name: nil, script_location: nil,
                    default_arguments: nil, description: nil, worker_type: nil,
                    number_of_workers: nil, timeout: nil, max_retries: nil,
                    allocated_capacity: nil, glue_version: nil)
  DataDrain::Validations.validate_glue_name!(:job_name, job_name)

  job_update = {}
  job_update[:role] = role_arn if role_arn

  if command_name || script_location
    # Look up the current command only when one half is missing, so the
    # fully-specified call makes no extra request.
    current_command = get_job(job_name).command unless command_name && script_location
    job_update[:command] = {
      name: command_name || current_command.name,
      # python_version is always sent as "3", matching create_job.
      python_version: "3",
      script_location: script_location || current_command.script_location
    }
  end

  optional_settings = {
    default_arguments: default_arguments,
    description: description,
    timeout: timeout,
    max_retries: max_retries,
    allocated_capacity: allocated_capacity,
    worker_type: worker_type,
    number_of_workers: number_of_workers,
    glue_version: glue_version
  }
  # Truthiness check (not nil?) keeps 0, "" and {} but drops nil/false.
  optional_settings.each { |key, value| job_update[key] = value if value }

  client.update_job(job_name: job_name, job_update: job_update)
  get_job(job_name)
end