Class: SmarterCSV::Reader

Inherits:

Object

Object
SmarterCSV::Reader

show all

Includes: Enumerable, AutoDetection, FileIO, HashTransformations, HeaderTransformations, HeaderValidations, Headers, Parser, Options

Defined in: lib/smarter_csv/reader.rb,
lib/smarter_csv/reader_options.rb

Defined Under Namespace

Modules: Options

Constant Summary collapse

DEFAULT_CHUNK_SIZE = Default chunk size used by each_chunk when chunk_size is not explicitly set. A warning is emitted to STDERR so users know to configure it explicitly.

Instance Attribute Summary collapse

#chunk_count ⇒ Object readonly

Returns the value of attribute chunk_count.
#csv_line_count ⇒ Object readonly

Returns the value of attribute csv_line_count.
#enforce_utf8 ⇒ Object readonly

Returns the value of attribute enforce_utf8.
#errors ⇒ Object readonly

Returns the value of attribute errors.
#file_line_count ⇒ Object readonly

Returns the value of attribute file_line_count.
#has_acceleration ⇒ Object readonly

Returns the value of attribute has_acceleration.
#has_rails ⇒ Object readonly

Returns the value of attribute has_rails.
#headers ⇒ Object readonly

Returns the value of attribute headers.
#input ⇒ Object readonly

Returns the value of attribute input.
#options ⇒ Object readonly

Returns the value of attribute options.
#raw_header ⇒ Object readonly

Returns the value of attribute raw_header.
#result ⇒ Object readonly

Returns the value of attribute result.
#warnings ⇒ Object readonly

Returns the value of attribute warnings.

Class Method Summary collapse

.default_options ⇒ Object

Instance Method Summary collapse

#count_quote_chars(line, quote_char, col_sep = ",", quote_escaping = :double_quotes) ⇒ Object
#count_quote_chars_auto(line, quote_char, col_sep = ",") ⇒ Object

Returns [escaped_count, rfc_count] for :auto mode dual counting.
#each ⇒ Object

Yields each successfully parsed row as a Hash.
#each_chunk ⇒ Object

Yields each chunk as Array<Hash> plus its 0-based chunk index.
#headerA ⇒ Object

Deprecated accessor for the parsed headers; emits a deprecation warning. Use #headers instead.
#initialize(input, given_options = {}) ⇒ Reader constructor

first parameter: filename or input object which responds to readline method.
#process(&block) ⇒ Object

Parses the input; yields rows or chunks to the block if one is given, otherwise returns the accumulated result.

Methods included from HashTransformations

#hash_transformations

Methods included from HeaderValidations

#check_duplicate_headers, #check_required_headers, #header_validations

Methods included from HeaderTransformations

#disambiguate_headers, #header_transformations, #remap_headers

Methods included from Headers

#process_headers

Methods included from Options

#process_options

Constructor Details

#initialize(input, given_options = {}) ⇒ `Reader`

First parameter: a filename, or an input object that responds to the `readline` method.

# File 'lib/smarter_csv/reader.rb', line 39

# Sets up the reader's state for one input and resolves the caller's options.
#
# @param input [String, #readline] a filename, or an object that responds to readline
# @param given_options [Hash] caller-supplied options, handed to process_options
def initialize(input, given_options = {})
  @input = input
  # Remember whether the Rails constant is defined at construction time.
  @has_rails = defined?(Rails) ? true : false

  # Counters, diagnostics and header state all start out empty.
  @csv_line_count = 0
  @chunk_count = 0
  @errors = {}
  @file_line_count = 0
  @headerA = []
  @headers = nil
  # The header as it appears in the file.
  @raw_header = nil
  @result = []
  @warnings = {}

  # Stays false here; only switched to true if needed, after options parsing.
  @enforce_utf8 = false
  @options = process_options(given_options)

  # True when the parser exposes the compiled (accelerated) line parser.
  @has_acceleration = SmarterCSV::Parser.respond_to?(:parse_csv_line_c) ? true : false
end

Instance Attribute Details

#chunk_count ⇒ `Object` (readonly)

Returns the value of attribute chunk_count.



23
24
25

# File 'lib/smarter_csv/reader.rb', line 23

# Number of chunks counted so far. `process` resets it to 0 when chunking is
# enabled and increments it after every chunk it hands out (or after every row
# yielded to a block when chunking is off).
def chunk_count
  @chunk_count
end

#csv_line_count ⇒ `Object` (readonly)

Returns the value of attribute csv_line_count.



23
24
25

# File 'lib/smarter_csv/reader.rb', line 23

# Current value of @csv_line_count (0 at construction). `process` uses it for
# :csv_line_number and reports it as the row total to the hooks.
# NOTE(review): presumably advanced by next_line_with_counts — not visible here.
def csv_line_count
  @csv_line_count
end

#enforce_utf8 ⇒ `Object` (readonly)

Returns the value of attribute enforce_utf8.



24
25
26

# File 'lib/smarter_csv/reader.rb', line 24

# False at construction. `process` sets it to
# options[:force_utf8] || (options[:file_encoding] does not match /utf-8/i).
def enforce_utf8
  @enforce_utf8
end

#errors ⇒ `Object` (readonly)

Returns the value of attribute errors.



25
26
27

# File 'lib/smarter_csv/reader.rb', line 25

# Hash of error information, empty at construction. `process` reads
# @errors[:bad_row_count] for the on_complete hook.
# NOTE(review): presumably filled in by handle_bad_row — confirm.
def errors
  @errors
end

#file_line_count ⇒ `Object` (readonly)

Returns the value of attribute file_line_count.



23
24
25

# File 'lib/smarter_csv/reader.rb', line 23

# Current value of @file_line_count (0 at construction). `process` increments it
# for every continuation line stitched into a multiline row.
# NOTE(review): presumably also advanced by next_line_with_counts — not visible here.
def file_line_count
  @file_line_count
end

#has_acceleration ⇒ `Object` (readonly)

Returns the value of attribute has_acceleration.



24
25
26

# File 'lib/smarter_csv/reader.rb', line 24

# true when SmarterCSV::Parser responded to :parse_csv_line_c at construction time.
def has_acceleration
  @has_acceleration
end

#has_rails ⇒ `Object` (readonly)

Returns the value of attribute has_rails.



24
25
26

# File 'lib/smarter_csv/reader.rb', line 24

# true when the Rails constant was defined at construction time.
def has_rails
  @has_rails
end

#headers ⇒ `Object` (readonly)

Returns the value of attribute headers.



25
26
27

# File 'lib/smarter_csv/reader.rb', line 25

# nil until `process` runs; afterwards the header list returned by process_headers.
# `process` appends generated header names to it when a row has more columns
# than headers (unless options[:missing_headers] == :raise).
def headers
  @headers
end

#input ⇒ `Object` (readonly)

Returns the value of attribute input.



22
23
24

# File 'lib/smarter_csv/reader.rb', line 22

# The input passed to the constructor: a filename, or an object responding to readline.
def input
  @input
end

#options ⇒ `Object` (readonly)

Returns the value of attribute options.



22
23
24

# File 'lib/smarter_csv/reader.rb', line 22

# The options Hash produced by process_options(given_options). `process` writes
# into it (resolved :row_sep / :col_sep and the internal :_keep_* keys).
def options
  @options
end

#raw_header ⇒ `Object` (readonly)

Returns the value of attribute raw_header.



25
26
27

# File 'lib/smarter_csv/reader.rb', line 25

# nil at construction; per the constructor comment, the header as it appears in the file.
# NOTE(review): assigned outside the code shown here — confirm where.
def raw_header
  @raw_header
end

#result ⇒ `Object` (readonly)

Returns the value of attribute result.



25
26
27

# File 'lib/smarter_csv/reader.rb', line 25

# Array that `process` fills when it is called without a block: one Hash per row,
# or one Array of Hashes per chunk when chunking is enabled.
def result
  @result
end

#warnings ⇒ `Object` (readonly)

Returns the value of attribute warnings.



25
26
27

# File 'lib/smarter_csv/reader.rb', line 25

# Hash, empty at construction.
# NOTE(review): populated outside the code shown here — confirm what keys it carries.
def warnings
  @warnings
end

Class Method Details

.default_options ⇒ `Object`



27
28
29

# File 'lib/smarter_csv/reader.rb', line 27

# Returns the Options::DEFAULT_OPTIONS constant itself (not a copy).
def self.default_options
  Options::DEFAULT_OPTIONS
end

Instance Method Details

#count_quote_chars(line, quote_char, col_sep = ",", quote_escaping = :double_quotes) ⇒ `Object`

# File 'lib/smarter_csv/reader.rb', line 447

# Counts the quote characters in +line+ that are significant for the given escaping mode.
#
# @param line [String, nil] the text to scan
# @param quote_char [String, nil] the quote character
# @param col_sep [String] only forwarded to the C implementation
# @param quote_escaping [Symbol] :backslash skips quotes that follow an escaping
#   backslash; any other value counts every quote character
# @return [Integer] 0 when line or quote_char is nil, or quote_char is empty
def count_quote_chars(line, quote_char, col_sep = ",", quote_escaping = :double_quotes)
  return 0 if line.nil? || quote_char.nil? || quote_char.empty?

  backslash_mode = quote_escaping == :backslash

  # Prefer the C extension when it is compiled in and exposes the counter.
  if @has_acceleration && SmarterCSV::Parser.respond_to?(:count_quote_chars_c)
    return SmarterCSV::Parser.count_quote_chars_c(line, quote_char, col_sep, backslash_mode)
  end

  # Without backslash escaping every quote counts: a single String#count call does it.
  return line.count(quote_char) unless backslash_mode

  # Backslash mode: walk the characters, remembering whether the previous
  # character was an (unescaped) backslash.
  total = 0
  pending_escape = false
  line.each_char do |c|
    if pending_escape
      # Whatever follows an escaping backslash is consumed without being counted.
      pending_escape = false
    elsif c == '\\'
      pending_escape = true
    elsif c == quote_char
      total += 1
    end
  end
  total
end

#count_quote_chars_auto(line, quote_char, col_sep = ",") ⇒ `Object`

Returns [escaped_count, rfc_count] for :auto mode dual counting. escaped_count: quote chars not preceded by an odd number of backslashes. rfc_count: all quote chars (backslash has no special meaning).

# File 'lib/smarter_csv/reader.rb', line 482

# Dual quote count used by :auto quote escaping.
#
# @param line [String, nil] the text to scan
# @param quote_char [String, nil] the quote character
# @param col_sep [String] only forwarded to the C implementation
# @return [Array(Integer, Integer)] [escaped_count, rfc_count], where escaped_count
#   skips quotes preceded by an odd run of backslashes and rfc_count counts every
#   quote; [0, 0] when line or quote_char is nil, or quote_char is empty
def count_quote_chars_auto(line, quote_char, col_sep = ",")
  return [0, 0] if line.nil? || quote_char.nil? || quote_char.empty?

  # Prefer the C extension when it is compiled in and exposes the counter.
  if @has_acceleration && SmarterCSV::Parser.respond_to?(:count_quote_chars_auto_c)
    return SmarterCSV::Parser.count_quote_chars_auto_c(line, quote_char, col_sep)
  end

  total_quotes = line.count(quote_char)

  # No backslash anywhere means nothing can be escaped: both counts agree.
  return [total_quotes, total_quotes] unless line.include?('\\')

  # Track whether the current position follows an odd number of backslashes.
  after_odd_backslashes = false
  unescaped_quotes = line.each_char.reduce(0) do |sum, c|
    if c == quote_char
      sum += 1 unless after_odd_backslashes
      after_odd_backslashes = false
    else
      # A backslash flips the state; any other character resets it.
      after_odd_backslashes = (c == '\\') && !after_odd_backslashes
    end
    sum
  end

  [unescaped_quotes, total_quotes]
end

#each ⇒ `Object`

Yields each successfully parsed row as a Hash. Ignores chunk_size — always row-by-row, enabling standard Enumerable usage. Returns an Enumerator when called without a block.

Examples:

reader.each { |hash| MyModel.upsert(hash) }
reader.each_with_index { |hash, i| puts "Row #{i}: #{hash}" }
reader.select { |h| h[:country] == "US" }
reader.lazy.map { |h| h[:name] }.first(10)

# File 'lib/smarter_csv/reader.rb', line 66

# Yields each parsed row as a Hash, one row at a time.
#
# chunk_size is forced to nil while iterating so that `process` runs row-by-row;
# the caller's chunk_size is put back afterwards, even if the block raises.
# Calling this without a block leaves the options untouched.
#
# @yieldparam row [Hash] the first element of the one-row Array yielded by `process`
# @return [Enumerator] when called without a block
# @return [Object] whatever `process` returns when called with a block
def each
  return enum_for(:each) unless block_given?

  # Force row-by-row mode regardless of the chunk_size setting.
  original_chunk_size = @options[:chunk_size]
  @options[:chunk_size] = nil
  begin
    process { |row_array, _| yield row_array.first }
  ensure
    # The ensure is scoped to this begin block on purpose: a method-level ensure
    # would also run on the early enum_for return above, where original_chunk_size
    # is still nil, and would wipe out the caller's chunk_size.
    @options[:chunk_size] = original_chunk_size
  end
end

#each_chunk ⇒ `Object`

Yields each chunk as Array<Hash> plus its 0-based chunk index. Uses chunk_size from options; raises ArgumentError if chunk_size < 1. Returns an Enumerator when called without a block.

Examples:

reader = SmarterCSV::Reader.new("big.csv", chunk_size: 500)
reader.each_chunk { |chunk, i| Sidekiq.push_bulk(chunk) }
reader.each_chunk.with_index { |chunk, i| puts "Chunk #{i}: #{chunk.size} rows" }

# File 'lib/smarter_csv/reader.rb', line 85

# Yields every chunk as an Array of Hashes together with its chunk index.
#
# Uses options[:chunk_size]; when that is nil, DEFAULT_CHUNK_SIZE is used and a
# warning is written (unless options[:verbose] == :quiet). The configured
# chunk_size is restored once iteration ends, even if the block raises.
#
# @yieldparam chunk [Array<Hash>] a copy of the chunk, safe for the caller to keep
# @yieldparam index [Integer] the chunk index passed along by `process`
# @return [Enumerator] when called without a block
# @raise [ArgumentError] if the effective chunk_size is not an Integer >= 1
def each_chunk
  return enum_for(:each_chunk) unless block_given?

  configured_size = @options[:chunk_size]
  effective_size = configured_size

  if effective_size.nil?
    unless @options[:verbose] == :quiet
      warn "SmarterCSV: chunk_size not set, defaulting to #{DEFAULT_CHUNK_SIZE}. Set chunk_size explicitly to suppress this warning."
    end
    effective_size = DEFAULT_CHUNK_SIZE
  end

  size_is_valid = effective_size.is_a?(Integer) && effective_size >= 1
  raise ArgumentError, "chunk_size must be an Integer >= 1 (got #{effective_size.inspect})" unless size_is_valid

  # Apply the effective size only for the duration of this iteration.
  @options[:chunk_size] = effective_size
  begin
    # `process` hands out the same Array each time and clears it afterwards,
    # so callers get a copy they can hold on to.
    process { |rows, position| yield rows.dup, position }
  ensure
    @options[:chunk_size] = configured_size
  end
end

#headerA ⇒ `Object`

Deprecated accessor for the parsed headers; emits a deprecation warning on every call. Use #headers instead.

# File 'lib/smarter_csv/reader.rb', line 32

# Deprecated accessor kept for backwards compatibility; use `headers` instead.
# Writes a deprecation warning on every call.
#
# @return [Array] the value of @headerA
def headerA
  warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
  @headerA
end

#process(&block) ⇒ `Object`

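Parses the input. With a block, yields rows or chunks (as an Array of Hashes plus the chunk index) and returns the chunk count; without a block, returns the accumulated result.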

# File 'lib/smarter_csv/reader.rb', line 109

# Reads the input and converts every data row into a Hash.
#
# Steps, in order: open the input (File.open unless it already responds to
# readline), resolve :auto row/column separators, skip leading lines, process and
# validate the headers, precompute per-run state, then loop over the remaining
# lines — parse, stitch multiline rows, handle extra columns, enforce
# field_size_limit, filter columns, clean up and transform values.
#
# With a block: yields (Array<Hash>, chunk_index) — a chunk when
# options[:chunk_size].to_i > 0, otherwise a one-element Array per row — and
# returns @chunk_count. A chunk Array that fills up inside the loop is cleared
# right after the yield, so callers must copy it if they keep it.
# Without a block: appends rows (or chunk copies) to @result and returns @result.
#
# SmarterCSV::Error and EOFError raised while handling a row are re-raised when
# options[:on_bad_row] == :raise; otherwise the row goes to handle_bad_row and is skipped.
#
# The input handle is closed in the ensure clause whenever it responds to close.
def process(&block) # rubocop:disable Lint/UnusedMethodArgument
  @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
  @verbose = options[:verbose]

  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
      warn 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".' unless options[:verbose] == :quiet
    end

    # auto-detect the row separator
    options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

    skip_lines(fh, options)

    # NOTE: we are no longer using header_size
    @headers, _header_size = process_headers(fh, options)
    @headerA = @headers # @headerA is deprecated, use @headers

    $stderr.puts "Effective headers:\n#{pp(@headers)}\n" if @verbose == :debug

    header_validations(@headers, options)

    # Precompute column filter sets for only_headers / except_headers (O(1) lookup per row)
    @only_headers_set   = options[:only_headers]   ? Set.new(options[:only_headers])   : nil
    @except_headers_set = options[:except_headers] ? Set.new(options[:except_headers]) : nil

    # Precompute column-filter bitmap for the C extension.
    #
    # The bitmap is a loop invariant — headers and filter settings never change between rows.
    # We store it as a packed binary String so C can copy it with a single memcpy instead of
    # N rb_ary_entry calls per row.  early_exit_after and keep_extra_cols are pre-stored so
    # C reads them with O(1) hash lookups rather than recomputing per row.
    if @only_headers_set || @except_headers_set
      keep_flags = @headers.map { |h| @only_headers_set ? @only_headers_set.include?(h) : !@except_headers_set.include?(h) }
      options[:_keep_bitmap]       = keep_flags.map { |f| f ? 1 : 0 }.pack('C*').freeze
      options[:_keep_extra_cols]   = @only_headers_set ? false : true
      options[:_early_exit_after]  = (@only_headers_set && !options[:strict]) ? (keep_flags.rindex(true) || -1) : -1
      options[:_keep_cols]         = nil   # nil signals C: "filter active, check _keep_bitmap"
    else
      options[:_keep_cols] = false  # sentinel: no filtering active — C skips all bitmap paths
      # Do NOT insert _keep_bitmap/_keep_extra_cols/_early_exit_after when unused.
      # Keeping the options hash as small as possible avoids hash table resize and
      # keeps all 10 per-row rb_hash_aref lookups hitting the same cache lines.
    end

    # Precompute all hot-path strategy ivars once — eliminates per-row option lookups
    # and method-dispatch overhead in the main loop.
    #
    # @quote_escaping_backslash / @quote_escaping_double may already exist if
    # parse_with_auto_fallback ran during header parsing (lazily created there).
    # Ensure they exist and carry the now-final _keep_cols (and bitmap keys only when active).
    @quote_escaping_backslash ||= options.merge(quote_escaping: :backslash)
    @quote_escaping_double    ||= options.merge(quote_escaping: :double_quotes)
    @quote_escaping_backslash[:_keep_cols] = options[:_keep_cols]
    @quote_escaping_double[:_keep_cols]    = options[:_keep_cols]
    if @only_headers_set || @except_headers_set
      %i[_keep_bitmap _keep_extra_cols _early_exit_after].each do |k|
        @quote_escaping_backslash[k] = options[k]
        @quote_escaping_double[k]    = options[k]
      end
    end

    @quote_escaping_auto = options[:quote_escaping] == :auto
    @use_acceleration    = options[:acceleration] && has_acceleration

    # The single options hash used on the hot path — for :auto we always try backslash
    # first (C downgrades to RFC internally via Opt #5 when no backslash is found).
    @hot_path_options = @quote_escaping_auto ? @quote_escaping_backslash : options

    # Build ParseContext objects once after headers are known.
    # Eliminates ~10 rb_hash_aref calls per row by pre-baking all loop-invariant
    # options into a C struct accessed via direct pointer dereference.
    if @use_acceleration
      hot_opts    = @hot_path_options
      double_opts = @quote_escaping_double
      @parse_ctx        = SmarterCSV::Parser.new_parse_context_c(@headers, hot_opts)
      @parse_ctx_double = SmarterCSV::Parser.new_parse_context_c(@headers, double_opts)
    end

    # Key-cleanup flags — computed once, checked per row via cheap ivar reads.
    # hash.delete(nil) / hash.delete('') only occur when key_mapping maps a header to nil/"".
    # hash.delete(:"") also catches empty headers produced by ,, in the CSV.
    @delete_nil_keys   = !!options[:key_mapping]
    @delete_empty_keys = !!options[:key_mapping] || @headers.include?(:"")

    # Cache quote_char as an ivar for the stitch-loop memchr guard (avoids hash lookup per continuation line).
    @quote_char = options[:quote_char]
    # Cache field_size_limit as an ivar (nil when unset → one nil-check per row, no method calls).
    @field_size_limit = options[:field_size_limit]

    # in case we use chunking.. we'll need to set it up..
    if options[:chunk_size].to_i > 0
      use_chunks = true
      chunk_size = options[:chunk_size].to_i
      @chunk_count = 0
      chunk = []
    else
      use_chunks = false
    end

    # --- INSTRUMENTATION HOOKS ---
    # on_start / on_chunk / on_complete are optional callables (nil by default).
    # Hooks only fire from `process` (library-controlled iteration). Enumerator
    # modes (each / each_chunk) do not fire hooks — the caller owns the lifecycle.
    # NOTE(review): nothing in this method tells those callers apart — the hooks
    # below are read straight from options. Confirm where the suppression for
    # each / each_chunk is meant to happen.
    _on_start    = options[:on_start]
    _on_chunk    = options[:on_chunk]
    _on_complete = options[:on_complete]
    _start_time  = Process.clock_gettime(Process::CLOCK_MONOTONIC) if _on_start || _on_complete

    if _on_start
      _input_meta = if @input.is_a?(String)
                      { input: @input, file_size: (File.size(@input) rescue nil) }
                    else
                      { input: @input.class.name, file_size: nil }
                    end
      _on_start.call(_input_meta.merge(col_sep: options[:col_sep], row_sep: options[:row_sep]))
    end

    # now on to processing all the rest of the lines in the CSV file:
    while (line = next_line_with_counts(fh, options))

      # replace invalid byte sequence in UTF-8 with question mark to avoid errors
      line = enforce_utf8_encoding(line, options) if @enforce_utf8

      $stderr.print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose == :debug

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # Snapshot line counters before multiline stitching so error records reflect
      # where the bad row started, not where it failed.
      bad_row_start_csv_line  = @csv_line_count
      bad_row_start_file_line = @file_line_count

      begin
        # --- PARSE (inlined — no method-wrapper overhead on the hot path) ---
        # Replaces: process_line_to_hash → parse_line_to_hash → parse_line_to_hash_auto
        # All routing decisions are pre-baked into ivars set up after header processing.
        if @use_acceleration
          hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx)
          # :auto only: if unclosed quote AND backslash present, RFC may close it differently
          if @quote_escaping_auto && data_size == -1 && line.include?('\\')
            hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx_double)
          end
        else
          has_quotes = line.include?(options[:quote_char])
          hash, data_size = parse_line_to_hash_ruby(line, @headers, @hot_path_options, has_quotes)
          if @quote_escaping_auto && data_size == -1 && line.include?('\\')
            hash, data_size = parse_line_to_hash_ruby(line, @headers, @quote_escaping_double, has_quotes)
          end
        end

        # --- MULTILINE STITCH ---
        # data_size == -1 means the parser saw an unclosed quoted field at end-of-line.
        # Fetch the next physical line, append, and re-parse until the field closes.
        while data_size == -1
          next_line = fh.gets(options[:row_sep])
          raise MalformedCSV, "Unclosed quoted field detected in multiline data" if next_line.nil?

          next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
          line += next_line
          @file_line_count += 1
          $stderr.print "\nline contains unclosed quoted field, including content through file line %d\n" % @file_line_count if @verbose == :debug

          # DoS guard: prevent runaway multiline accumulation (vectors: never-closing quote, huge embedded content)
          if @field_size_limit && line.bytesize > @field_size_limit
            raise SmarterCSV::FieldSizeLimitExceeded,
                  "Multiline field exceeds field_size_limit of #{@field_size_limit} bytes " \
                  "(accumulated #{line.bytesize} bytes)"
          end

          # Opt #8 (memchr guard): if the newly appended line contains no quote character,
          # it cannot close the currently open quoted field — skip the full re-parse and
          # keep accumulating physical lines.  String#include? uses memchr internally (C speed).
          next unless next_line.include?(@quote_char)

          if @use_acceleration
            # :nocov:
            hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx)
            if @quote_escaping_auto && data_size == -1 && line.include?('\\')
              hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx_double)
            end
            # :nocov:
          else
            # Optimization #18: use detect_multiline as a cheap gate before attempting a full
            # Ruby re-parse on the growing stitched line. detect_multiline_strict now uses
            # byteindex skip-ahead (Opt #17) and is faster than parse_line_to_hash_ruby on
            # the same content. Saves N-2 wasted full parses per multiline row.
            next if detect_multiline(line, options)

            has_quotes = true # we know the line has quotes — we've been stitching a quoted field
            hash, data_size = parse_line_to_hash_ruby(line, @headers, @hot_path_options, has_quotes)
            if @quote_escaping_auto && data_size == -1 && line.include?('\\')
              hash, data_size = parse_line_to_hash_ruby(line, @headers, @quote_escaping_double, has_quotes)
            end
          end
        end

        # --- EXTRA COLUMNS ---
        # A row wider than the header list either raises (missing_headers == :raise)
        # or grows @headers in place with generated names.
        if data_size > @headers.size
          raise SmarterCSV::HeaderSizeMismatch, "extra columns detected on line #{@file_line_count}" if options[:missing_headers] == :raise

          while @headers.size < data_size
            @headers << "#{options[:missing_header_prefix]}#{@headers.size + 1}".to_sym
          end
        end

        next if hash.nil?

        # --- FIELD SIZE LIMIT CHECK ---
        # Pre-filter: if the raw line fits within the limit, no individual field can exceed it
        # (a field is always a substring of its row). Only iterate over values for large rows.
        if @field_size_limit && line.bytesize > @field_size_limit
          hash.each_value do |v|
            if v.is_a?(String) && v.bytesize > @field_size_limit
              raise SmarterCSV::FieldSizeLimitExceeded,
                    "Field exceeds field_size_limit of #{@field_size_limit} bytes (got #{v.bytesize} bytes)"
            end
          end
        end

        # --- COLUMN SELECTION ---
        hash.select! { |k, _| @only_headers_set.include?(k) }   if @only_headers_set
        hash.reject! { |k, _| @except_headers_set.include?(k) } if @except_headers_set

        # --- HASH CLEANUP & TRANSFORMATIONS ---
        if @use_acceleration
          # C already applied: remove_empty_values, convert_values_to_numeric, remove_zero_values.
          # Remove nil/"" keys left by key_mapping or empty CSV headers.
          if @delete_nil_keys
            hash.delete(nil)
            hash.delete('')
          end
          hash.delete(:"") if @delete_empty_keys

          if (matcher = options[:nil_values_matching])
            if options[:remove_empty_values]
              hash.delete_if do |_k, v|
                str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
                str_val && matcher.match?(str_val)
              end
            else
              hash.each_key do |k|
                v = hash[k]
                str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
                hash[k] = nil if str_val && matcher.match?(str_val)
              end
            end
          end

          if options[:value_converters]
            options[:value_converters].each do |key, converter|
              hash[key] = converter.respond_to?(:convert) ? converter.convert(hash[key]) : converter.call(hash[key]) if hash.key?(key)
            end
          end
        else
          hash = hash_transformations(hash, options)
        end

        next if options[:remove_empty_hashes] && hash.empty?

        $stderr.puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == :debug
        # optional adding of csv_line_number to the hash to help debugging
        hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
      rescue SmarterCSV::Error, EOFError => e
        raise if options[:on_bad_row] == :raise

        handle_bad_row(e, line, bad_row_start_csv_line, bad_row_start_file_line, options)
        next
      end

      # process the chunks or the resulting hash
      if use_chunks
        chunk << hash # append temp result to chunk

        if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
          _on_chunk&.call({ chunk_number: @chunk_count + 1, rows_in_chunk: chunk.size, total_rows_so_far: @csv_line_count })
          # do something with the chunk
          if block_given?
            yield chunk, @chunk_count # do something with the hashes in the chunk in the block
          else
            @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
          end
          @chunk_count += 1
          chunk.clear # re-initialize for next chunk of data
        else
          # the last chunk may contain partial data, which is handled below
        end
        # while a chunk is being filled up we don't need to do anything else here

      else # no chunk handling
        if block_given?
          yield [hash], @chunk_count # do something with the hash in the block (better to use chunking here)
          @chunk_count += 1
        else
          @result << hash
        end
      end
    end

    # print new line to retain last processing line message
    $stderr.print "\n" if @verbose == :debug

    # handling of last chunk:
    if !chunk.nil? && chunk.size > 0
      _on_chunk&.call({ chunk_number: @chunk_count + 1, rows_in_chunk: chunk.size, total_rows_so_far: @csv_line_count })
      # do something with the chunk
      if block_given?
        yield chunk, @chunk_count # do something with the hashes in the chunk in the block
      else
        @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
      end
      @chunk_count += 1
      # chunk = [] # initialize for next chunk of data
    end

    if _on_complete
      _on_complete.call({
        total_rows:   @csv_line_count,
        total_chunks: @chunk_count,
        duration:     Process.clock_gettime(Process::CLOCK_MONOTONIC) - _start_time,
        bad_rows:     @errors[:bad_row_count] || 0,
      })
    end
  ensure
    # Runs for caller-supplied IO objects as well as for files opened above;
    # fh is nil here if File.open itself raised.
    fh.close if fh.respond_to?(:close)
  end

  if block_given?
    @chunk_count # when we do processing through a block we only care how many chunks we processed
  else
    @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
  end
end

Class: SmarterCSV::Reader

Defined Under Namespace

Constant Summary collapse

Constants included from Parser

Constants included from HashTransformations

Constants included from Options

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from HashTransformations

Methods included from HeaderValidations

Methods included from HeaderTransformations

Methods included from Headers

Methods included from Options

Constructor Details

#initialize(input, given_options = {}) ⇒ Reader

Instance Attribute Details

#chunk_count ⇒ Object (readonly)

#csv_line_count ⇒ Object (readonly)

#enforce_utf8 ⇒ Object (readonly)

#errors ⇒ Object (readonly)

#file_line_count ⇒ Object (readonly)

#has_acceleration ⇒ Object (readonly)

#has_rails ⇒ Object (readonly)

#headers ⇒ Object (readonly)

#input ⇒ Object (readonly)

#options ⇒ Object (readonly)

#raw_header ⇒ Object (readonly)

#result ⇒ Object (readonly)

#warnings ⇒ Object (readonly)