Class: CanvasSync::Importers::BulkImporter

Inherits: Object
Defined in:
lib/canvas_sync/importers/bulk_importer.rb

Defined Under Namespace

Classes: UserChunker

Constant Summary

DEFAULT_BATCH_SIZE = 10_000

The batch import size can be customized by setting the `BULK_IMPORTER_BATCH_SIZE` environment variable.

Class Method Summary

.batch_size ⇒ Object
.condition_sql(klass, columns, report_start = nil) ⇒ Object
.import(report_file_path, *args, import_args: {}, &blk) ⇒ Object
.perform_import(klass, columns, rows, conflict_target, import_args = {}) ⇒ Object
.perform_in_batches(report_file_path, klass, mapping, import_args: {}, &block) ⇒ Object

Class Method Details

.batch_size ⇒ Object



# File 'lib/canvas_sync/importers/bulk_importer.rb', line 184

def self.batch_size
  batch_size = ENV["BULK_IMPORTER_BATCH_SIZE"].to_i
  batch_size > 0 ? batch_size : DEFAULT_BATCH_SIZE
end
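
For example, the override can be exercised directly (a quick sketch; batch_size simply reads the environment variable and falls back to DEFAULT_BATCH_SIZE when it is unset or not a positive integer):

ENV["BULK_IMPORTER_BATCH_SIZE"] = "50000"
CanvasSync::Importers::BulkImporter.batch_size # => 50000

ENV.delete("BULK_IMPORTER_BATCH_SIZE")
CanvasSync::Importers::BulkImporter.batch_size # => 10_000 (DEFAULT_BATCH_SIZE)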

.condition_sql(klass, columns, report_start = nil) ⇒ Object

This method generates SQL that looks like:

(users.sis_id, users.email) IS DISTINCT FROM (EXCLUDED.sis_id, EXCLUDED.email)

This prevents activerecord-import from setting the `updated_at` column for rows that haven't actually changed. This allows you to query for rows that have changed by doing something like:

started_at = Time.now
run_the_users_sync!
changed = User.where("updated_at >= ?", started_at)



# File 'lib/canvas_sync/importers/bulk_importer.rb', line 170

def self.condition_sql(klass, columns, report_start = nil)
  columns_str = columns.map { |c| "#{klass.quoted_table_name}.#{c}" }.join(", ")
  excluded_str = columns.map { |c| "EXCLUDED.#{c}" }.join(", ")
  condition_sql = "(#{columns_str}) IS DISTINCT FROM (#{excluded_str})"

  if klass.column_names.include?("canvas_synced_at") && report_start
    condition_sql += " AND #{klass.quoted_table_name}.canvas_synced_at < '#{report_start}'"
  elsif klass.column_names.include?("updated_at") && report_start
    condition_sql += " AND #{klass.quoted_table_name}.updated_at < '#{report_start}'"
  end

  condition_sql
end
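
As an illustration, given a hypothetical User model with sis_id and email columns, the call would produce something like the following (the exact quoting depends on your adapter's quoted_table_name):

CanvasSync::Importers::BulkImporter.condition_sql(User, %w[sis_id email])
# => ("users".sis_id, "users".email) IS DISTINCT FROM (EXCLUDED.sis_id, EXCLUDED.email)

# With report_start and an updated_at column, a staleness guard is appended:
CanvasSync::Importers::BulkImporter.condition_sql(User, %w[sis_id email], "2024-01-01 00:00:00")
# => ... AND "users".updated_at < '2024-01-01 00:00:00'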

.import(report_file_path, *args, import_args: {}, &blk) ⇒ Object

NEW:    (report_file_path, klass, mapping, import_args: {}, &blk)
LEGACY: (report_file_path, columns, klass, conflict_target, import_args: {}, &blk)



# File 'lib/canvas_sync/importers/bulk_importer.rb', line 23

def self.import(report_file_path, *args, import_args: {}, &blk) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity, Metrics/LineLength
  if args.length == 2
    klass = args[0]
    mapping = args[1]
  elsif args.length == 3
    klass = args[1]
    mapping = { columns: args[0], conflict_target: args[2] }
  end

  ClassCallbackExecutor.run_if_defined(klass, :sync_import) do
    perform_in_batches(report_file_path, klass, mapping, import_args: import_args, &blk)
  end
end
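
A usage sketch of the two call forms (the User model and column names are hypothetical; the mapping hash is the shape consumed by perform_in_batches below — note it must be passed as an explicit hash so it lands in *args rather than being parsed as keywords):

# New-style: model plus a mapping hash
CanvasSync::Importers::BulkImporter.import(
  report_file_path,
  User,
  {
    columns: {
      canvas_user_id: { report_column: :canvas_user_id },
      email: { report_column: :email },
    },
    conflict_target: :canvas_user_id,
  }
)

# Legacy: columns hash, then model, then conflict target
CanvasSync::Importers::BulkImporter.import(
  report_file_path,
  { canvas_user_id: { report_column: :canvas_user_id }, email: { report_column: :email } },
  User,
  :canvas_user_id
)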

.perform_import(klass, columns, rows, conflict_target, import_args = {}) ⇒ Object



# File 'lib/canvas_sync/importers/bulk_importer.rb', line 124

def self.perform_import(klass, columns, rows, conflict_target, import_args={})
  return if rows.length.zero?
  columns = columns.dup

  update_conditions = {
    condition: condition_sql(klass, columns, import_args[:sync_start_time]),
    columns: columns
  }
  update_conditions[:conflict_target] = conflict_target if conflict_target.present?

  options = { validate: false, on_duplicate_key_update: update_conditions }.merge(import_args)
  options.delete(:on_duplicate_key_update) if options.key?(:on_duplicate_key_ignore)

  result = nil
  callback_env = {
    batch: rows,
    import_result: nil,
    import_options: options,
  }
  ClassCallbackExecutor.run_if_defined(klass, :sync_batch_import, callback_env) do
    result = klass.import(columns, rows, options)
    callback_env[:import_result] = result

    global_updates = {
      canvas_synced_at: DateTime.now,
      canvas_sync_batch_id: JobBatches::Batch.current_context[:sync_batch_id],
    }
    global_updates.slice!(*klass.column_names.map(&:to_sym))
    if global_updates.present? && result.ids.present?
      klass.where(id: result.ids).update_all(global_updates)
    end
  end

  result
end
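
A sketch of calling it directly (User and the row values are hypothetical). Rows are arrays of values in the same order as columns; passing on_duplicate_key_ignore through import_args drops the conditional on_duplicate_key_update options entirely:

CanvasSync::Importers::BulkImporter.perform_import(
  User,
  %w[canvas_user_id email],
  [["123", "a@example.com"], ["456", "b@example.com"]],
  ["canvas_user_id"],
  { on_duplicate_key_ignore: true }
)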

.perform_in_batches(report_file_path, klass, mapping, import_args: {}, &block) ⇒ Object



# File 'lib/canvas_sync/importers/bulk_importer.rb', line 37

def self.perform_in_batches(report_file_path, klass, mapping, import_args: {}, &block)
  conflict_target = Array(mapping[:conflict_target]).map(&:to_s)

  col_mapping = {}.with_indifferent_access
  mapping[:columns].each do |db_col, opts|
    next if opts[:deprecated] && !klass.column_names.include?(db_col.to_s)

    col_mapping[db_col] = opts
  end

  csv_column_names = col_mapping.values.map { |value| value[:report_column].to_s }
  database_column_names = col_mapping.keys

  enumer = CSV.foreach(report_file_path, headers: true, header_converters: :symbol).lazy

  # Optionally filter rows by a passed block
  if block
    enumer = enumer.filter_map do |row|
      catch :skip do
        block.call(row)
      end
    end
  end

  # Convert rows to a hash that we can manipulate if desired
  enumer = enumer.map {|row| row.to_h.with_indifferent_access }

  # Optionally chunk by a computed value. Mainly so we can group duplicate rows and choose one
  chunker = nil
  chunker = UserChunker.new if defined?(User) && klass == User && csv_column_names.include?('user_id')
  if chunker
    enumer = enumer.chunk{|row| chunker.key(row) }.flat_map{|key, rows| chunker.choose(key, rows) }
  end

  # Pre-alter rows
  (mapping[:row_alterations] || []).each do |alterer|
    enumer = enumer.filter_map do |row|
      catch :skip do
        klass.instance_exec(row, &alterer)
        row
      end
    end
  end

  # Prepare the rows for import
  enumer = enumer.map do |row|
    col_mapping.map do |db_col, col_def|
      value = nil
      value = row[col_def[:report_column]] if col_def[:report_column]

      if col_def[:type]
        if col_def[:type].to_sym == :datetime
          # TODO: add some timezone config to the mapping.
          # In cases where the timestamp or date doesn't include a timezone, you should be able to specify one
          value = DateTime.parse(value).utc rescue nil # rubocop:disable Style/RescueModifier
        elsif col_def[:type].to_sym == :yaml
          # CSV parse from reading the report escapes the new lines and YAML throws an error with double backslashes
          value = YAML.load(value.gsub("\\n", "\n")) rescue nil # rubocop:disable Style/RescueModifier
        end
      end

      value = col_def[:transform].call(value, row) if col_def[:transform]

      value
    end
  end

  # Reject rows within a single batch that have the same ID (use indices so we can use the post-transformed values)
  conflict_target_indices = conflict_target.map{|ct| database_column_names.index(ct) }
  row_ids = Set.new
  if conflict_target.present?
    enumer = enumer.reject do |row|
      key = conflict_target_indices.map{|ct| row[ct] }
      next true if row_ids.include?(key)

      row_ids << key
      false
    end
  end

  # Start importing
  enumer.each_slice(batch_size) do |batch|
    perform_import(klass, database_column_names, batch, conflict_target, import_args)
    row_ids.clear
  end
end
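
Putting it together, a mapping might look like the following (hypothetical model and columns; :datetime and :yaml are the two types handled above, and the optional block can drop rows with throw :skip):

mapping = {
  conflict_target: :canvas_user_id,
  columns: {
    canvas_user_id: { report_column: :canvas_user_id },
    email:          { report_column: :email, transform: ->(value, _row) { value&.downcase } },
    last_login_at:  { report_column: :last_login, type: :datetime },
  },
}

CanvasSync::Importers::BulkImporter.perform_in_batches("users.csv", User, mapping) do |row|
  throw :skip if row[:email].nil? # skip rows without an email
  row
end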