Class: SmarterCSV::Reader

Inherits:
Object
  • Object
show all
Includes:
Enumerable, AutoDetection, FileIO, HashTransformations, HeaderTransformations, HeaderValidations, Headers, Parser, Options
Defined in:
lib/smarter_csv/reader.rb,
lib/smarter_csv/reader_options.rb

Defined Under Namespace

Modules: Options

Constant Summary collapse

DEFAULT_CHUNK_SIZE =

Default chunk size used by each_chunk when chunk_size is not explicitly set. A warning is emitted to STDERR so users know to configure it explicitly.

100

Constants included from Parser

Parser::BYTEINDEX_AVAILABLE, Parser::EMPTY_STRING

Constants included from HashTransformations

HashTransformations::FLOAT_REGEX, HashTransformations::INTEGER_REGEX, HashTransformations::ZERO_REGEX

Constants included from Options

Options::DEFAULT_OPTIONS

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from HashTransformations

#hash_transformations

Methods included from HeaderValidations

#check_duplicate_headers, #check_required_headers, #header_validations

Methods included from HeaderTransformations

#disambiguate_headers, #header_transformations, #remap_headers

Methods included from Headers

#process_headers

Methods included from Options

#process_options

Constructor Details

#initialize(input, given_options = {}) ⇒ Reader

First parameter: a filename, or an input object which responds to the `readline` method.



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/smarter_csv/reader.rb', line 39

# Sets up a new Reader with all per-run counters and state zeroed out.
#
# @param input [String, #readline] filename, or an input object responding to #readline
# @param given_options [Hash] raw options; normalized via #process_options
def initialize(input, given_options = {})
  @input = input
  @has_rails = !!defined?(Rails)
  @csv_line_count = 0
  @chunk_count = 0
  @errors = {}
  @file_line_count = 0
  @headerA = []
  @headers = nil
  @raw_header = nil # header as it appears in the file
  @result = []
  @warnings = {}
  @enforce_utf8 = false # only set to true if needed (after options parsing)
  @options = process_options(given_options)
  # true if it is compiled with acceleration
  @has_acceleration = !!SmarterCSV::Parser.respond_to?(:parse_csv_line_c)
end

Instance Attribute Details

#chunk_countObject (readonly)

Returns the value of attribute chunk_count.



23
24
25
# File 'lib/smarter_csv/reader.rb', line 23

# Number of chunks processed/emitted so far (incremented inside #process).
def chunk_count
  @chunk_count
end

#csv_line_countObject (readonly)

Returns the value of attribute csv_line_count.



23
24
25
# File 'lib/smarter_csv/reader.rb', line 23

# Running count of logical CSV rows read so far.
# NOTE(review): presumably incremented by next_line_with_counts (not visible here) — confirm.
def csv_line_count
  @csv_line_count
end

#enforce_utf8Object (readonly)

Returns the value of attribute enforce_utf8.



24
25
26
# File 'lib/smarter_csv/reader.rb', line 24

# Whether input lines are scrubbed to UTF-8; set in #process from the
# :force_utf8 / :file_encoding options.
def enforce_utf8
  @enforce_utf8
end

#errorsObject (readonly)

Returns the value of attribute errors.



25
26
27
# File 'lib/smarter_csv/reader.rb', line 25

# Hash of errors collected during processing (e.g. :bad_row_count).
def errors
  @errors
end

#file_line_countObject (readonly)

Returns the value of attribute file_line_count.



23
24
25
# File 'lib/smarter_csv/reader.rb', line 23

# Count of physical file lines read; also advanced while stitching
# multiline (quoted) rows in #process.
def file_line_count
  @file_line_count
end

#has_accelerationObject (readonly)

Returns the value of attribute has_acceleration.



24
25
26
# File 'lib/smarter_csv/reader.rb', line 24

# True when the compiled C parser extension (parse_csv_line_c) is available.
def has_acceleration
  @has_acceleration
end

#has_railsObject (readonly)

Returns the value of attribute has_rails.



24
25
26
# File 'lib/smarter_csv/reader.rb', line 24

# True when Rails was defined at the time this Reader was constructed.
def has_rails
  @has_rails
end

#headersObject (readonly)

Returns the value of attribute headers.



25
26
27
# File 'lib/smarter_csv/reader.rb', line 25

# The processed header symbols (set from process_headers in #process).
def headers
  @headers
end

#inputObject (readonly)

Returns the value of attribute input.



22
23
24
# File 'lib/smarter_csv/reader.rb', line 22

# The input given to the constructor: a filename or an object responding to #readline.
def input
  @input
end

#optionsObject (readonly)

Returns the value of attribute options.



22
23
24
# File 'lib/smarter_csv/reader.rb', line 22

# The normalized options hash produced by #process_options in the constructor.
def options
  @options
end

#raw_headerObject (readonly)

Returns the value of attribute raw_header.



25
26
27
# File 'lib/smarter_csv/reader.rb', line 25

# The header as it appears in the file (per the initializer comment);
# NOTE(review): populated during header processing, which is not visible here.
def raw_header
  @raw_header
end

#resultObject (readonly)

Returns the value of attribute result.



25
26
27
# File 'lib/smarter_csv/reader.rb', line 25

# Accumulated results when #process runs without a block:
# an Array of Hashes, or an Array of chunks (Arrays of Hashes) in chunked mode.
def result
  @result
end

#warningsObject (readonly)

Returns the value of attribute warnings.



25
26
27
# File 'lib/smarter_csv/reader.rb', line 25

# Hash of warnings collected during processing.
def warnings
  @warnings
end

Class Method Details

.default_optionsObject



27
28
29
# File 'lib/smarter_csv/reader.rb', line 27

# Returns the gem-wide default options (see Options::DEFAULT_OPTIONS).
def self.default_options
  Options::DEFAULT_OPTIONS
end

Instance Method Details

#count_quote_chars(line, quote_char, col_sep = ",", quote_escaping = :double_quotes) ⇒ Object



447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
# File 'lib/smarter_csv/reader.rb', line 447

# Counts occurrences of quote_char in a line, honoring the escaping mode.
#
# @param line [String, nil] the raw CSV line
# @param quote_char [String] the configured quote character
# @param col_sep [String] column separator (only used by the C extension)
# @param quote_escaping [Symbol] :double_quotes (RFC) or :backslash
# @return [Integer] number of (unescaped, in :backslash mode) quote characters
def count_quote_chars(line, quote_char, col_sep = ",", quote_escaping = :double_quotes)
  return 0 if line.nil? || quote_char.nil? || quote_char.empty?

  # Use C extension for performance if available (avoids creating a String object per character)
  if @has_acceleration && SmarterCSV::Parser.respond_to?(:count_quote_chars_c)
    return SmarterCSV::Parser.count_quote_chars_c(line, quote_char, col_sep, quote_escaping == :backslash)
  end

  # RFC (double-quotes) mode: a single C-level String#count call suffices.
  return line.count(quote_char) unless quote_escaping == :backslash

  # Backslash mode: walk the characters, tracking whether the current
  # character is escaped by an immediately preceding (unescaped) backslash.
  total = 0
  pending_escape = false
  line.each_char do |ch|
    if pending_escape
      # escaped character — never counted, and it consumes the escape
      pending_escape = false
    elsif ch == '\\'
      pending_escape = true
    elsif ch == quote_char
      total += 1
    end
  end
  total
end

#count_quote_chars_auto(line, quote_char, col_sep = ",") ⇒ Object

Returns [escaped_count, rfc_count] for :auto mode dual counting. escaped_count counts quote chars not preceded by an odd number of backslashes; rfc_count counts all quote chars (backslash has no special meaning).



482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
# File 'lib/smarter_csv/reader.rb', line 482

# Dual quote count for :auto escaping detection.
#
# @param line [String, nil] the raw CSV line
# @param quote_char [String] the configured quote character
# @param col_sep [String] column separator (only used by the C extension)
# @return [Array(Integer, Integer)] [escaped_count, rfc_count] where
#   escaped_count excludes backslash-escaped quotes and rfc_count counts all.
def count_quote_chars_auto(line, quote_char, col_sep = ",")
  return [0, 0] if line.nil? || quote_char.nil? || quote_char.empty?

  if @has_acceleration && SmarterCSV::Parser.respond_to?(:count_quote_chars_auto_c)
    return SmarterCSV::Parser.count_quote_chars_auto_c(line, quote_char, col_sep)
  end

  # RFC count is a single C-level String#count call.
  total_quotes = line.count(quote_char)

  # Without any backslash, escaping cannot occur — both counts coincide,
  # so skip the character walk entirely.
  return [total_quotes, total_quotes] unless line.include?('\\')

  # Walk the line, tracking whether an odd run of backslashes precedes
  # the current character (only then is a quote considered escaped).
  unescaped_quotes = 0
  odd_backslashes = false
  line.each_char do |ch|
    if ch == quote_char
      unescaped_quotes += 1 unless odd_backslashes
      odd_backslashes = false
    else
      odd_backslashes = (ch == '\\') ? !odd_backslashes : false
    end
  end

  [unescaped_quotes, total_quotes]
end

#eachObject

Yields each successfully parsed row as a Hash. Ignores chunk_size — always row-by-row, enabling standard Enumerable usage. Returns an Enumerator when called without a block.

Examples:

reader.each { |hash| MyModel.upsert(hash) }
reader.each_with_index { |hash, i| puts "Row #{i}: #{hash}" }
reader.select { |h| h[:country] == "US" }
reader.lazy.map { |h| h[:name] }.first(10)


66
67
68
69
70
71
72
73
74
75
# File 'lib/smarter_csv/reader.rb', line 66

# Yields each successfully parsed row as a Hash. Ignores chunk_size —
# always row-by-row, enabling standard Enumerable usage.
# Returns an Enumerator when called without a block.
#
# BUG FIX: the previous method-level `ensure` also ran on the early
# `return enum_for(:each)` path, where the local `original_chunk_size`
# was still nil — so merely requesting an Enumerator clobbered a
# configured @options[:chunk_size] with nil. The save/restore is now
# scoped to run only after chunk_size has actually been saved.
#
# @return [Enumerator] when no block is given
# @return [Object] the result of #process otherwise
def each
  return enum_for(:each) unless block_given?

  # Force row-by-row mode regardless of chunk_size setting
  original_chunk_size = @options[:chunk_size]
  begin
    @options[:chunk_size] = nil
    process { |row_array, _| yield row_array.first }
  ensure
    @options[:chunk_size] = original_chunk_size
  end
end

#each_chunkObject

Yields each chunk as Array<Hash> plus its 0-based chunk index. Uses chunk_size from options; raises ArgumentError if chunk_size < 1. Returns an Enumerator when called without a block.

Examples:

reader = SmarterCSV::Reader.new("big.csv", chunk_size: 500)
reader.each_chunk { |chunk, i| Sidekiq.push_bulk(chunk) }
reader.each_chunk.with_index { |chunk, i| puts "Chunk #{i}: #{chunk.size} rows" }


85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/smarter_csv/reader.rb', line 85

# Yields each chunk as Array<Hash> plus its 0-based chunk index.
# Falls back to DEFAULT_CHUNK_SIZE (with a warning) when chunk_size is
# unset; raises ArgumentError for a non-Integer or < 1 chunk_size.
# Returns an Enumerator when called without a block.
def each_chunk
  return enum_for(:each_chunk) unless block_given?

  effective_size = @options[:chunk_size]
  if effective_size.nil?
    unless @options[:verbose] == :quiet
      warn "SmarterCSV: chunk_size not set, defaulting to #{DEFAULT_CHUNK_SIZE}. Set chunk_size explicitly to suppress this warning."
    end
    effective_size = DEFAULT_CHUNK_SIZE
  end
  unless effective_size.is_a?(Integer) && effective_size >= 1
    raise ArgumentError, "chunk_size must be an Integer >= 1 (got #{effective_size.inspect})"
  end

  # Temporarily apply the effective chunk_size (covers the nil-default case),
  # restoring the caller's original setting afterwards.
  saved_size = @options[:chunk_size]
  @options[:chunk_size] = effective_size
  begin
    # process reuses (and clears) the same chunk Array between yields,
    # so hand callers a dup they can safely keep.
    process { |rows, idx| yield rows.dup, idx }
  ensure
    @options[:chunk_size] = saved_size
  end
end

#headerAObject

rubocop:disable Naming/MethodName



32
33
34
35
# File 'lib/smarter_csv/reader.rb', line 32

# Deprecated accessor for the raw headers array.
#
# BUG FIX: the deprecation message contained two typos visible to users
# ("Deprecarion" and "headders"); corrected to proper spelling.
#
# @deprecated Use {#headers} instead.
# @return [Array] the deprecated @headerA ivar
def headerA
  warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
  @headerA
end

#process(&block) ⇒ Object

rubocop:disable Lint/UnusedMethodArgument



109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
# File 'lib/smarter_csv/reader.rb', line 109

# Core processing loop: reads the input line-by-line, parses each CSV row
# into a Hash, applies filters and transformations, and either yields rows
# (or chunks of rows) to the given block or accumulates them into @result.
#
# @param block [Proc] optional; receives (chunk_or_single_row_array, chunk_index)
# @return [Integer] number of chunks processed, when a block is given
# @return [Array<Hash>, Array<Array<Hash>>] rows, or chunks of rows, otherwise
# @raise [SmarterCSV::MalformedCSV] on an unclosed quoted field at EOF
# @raise [SmarterCSV::FieldSizeLimitExceeded] when field_size_limit is exceeded
#
# NOTE(review): this method is heavily order-dependent (pre-baked hot-path
# ivars, C-extension contexts, inlined parse loop) — code left byte-identical,
# comments only.
def process(&block) # rubocop:disable Lint/UnusedMethodArgument
  @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
  @verbose = options[:verbose]

  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
      warn 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".' unless options[:verbose] == :quiet
    end

    # auto-detect the row separator
    options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

    skip_lines(fh, options)

    # NOTE: we are no longer using header_size
    @headers, _header_size = process_headers(fh, options)
    @headerA = @headers # @headerA is deprecated, use @headers

    $stderr.puts "Effective headers:\n#{pp(@headers)}\n" if @verbose == :debug

    header_validations(@headers, options)

    # Precompute column filter sets for only_headers / except_headers (O(1) lookup per row)
    @only_headers_set   = options[:only_headers]   ? Set.new(options[:only_headers])   : nil
    @except_headers_set = options[:except_headers] ? Set.new(options[:except_headers]) : nil

    # Precompute column-filter bitmap for the C extension.
    #
    # The bitmap is a loop invariant — headers and filter settings never change between rows.
    # We store it as a packed binary String so C can copy it with a single memcpy instead of
    # N rb_ary_entry calls per row.  early_exit_after and keep_extra_cols are pre-stored so
    # C reads them with O(1) hash lookups rather than recomputing per row.
    if @only_headers_set || @except_headers_set
      keep_flags = @headers.map { |h| @only_headers_set ? @only_headers_set.include?(h) : !@except_headers_set.include?(h) }
      options[:_keep_bitmap]       = keep_flags.map { |f| f ? 1 : 0 }.pack('C*').freeze
      options[:_keep_extra_cols]   = @only_headers_set ? false : true
      options[:_early_exit_after]  = (@only_headers_set && !options[:strict]) ? (keep_flags.rindex(true) || -1) : -1
      options[:_keep_cols]         = nil   # nil signals C: "filter active, check _keep_bitmap"
    else
      options[:_keep_cols] = false  # sentinel: no filtering active — C skips all bitmap paths
      # Do NOT insert _keep_bitmap/_keep_extra_cols/_early_exit_after when unused.
      # Keeping the options hash as small as possible avoids hash table resize and
      # keeps all 10 per-row rb_hash_aref lookups hitting the same cache lines.
    end

    # Precompute all hot-path strategy ivars once — eliminates per-row option lookups
    # and method-dispatch overhead in the main loop.
    #
    # @quote_escaping_backslash / @quote_escaping_double may already exist if
    # parse_with_auto_fallback ran during header parsing (lazily created there).
    # Ensure they exist and carry the now-final _keep_cols (and bitmap keys only when active).
    @quote_escaping_backslash ||= options.merge(quote_escaping: :backslash)
    @quote_escaping_double    ||= options.merge(quote_escaping: :double_quotes)
    @quote_escaping_backslash[:_keep_cols] = options[:_keep_cols]
    @quote_escaping_double[:_keep_cols]    = options[:_keep_cols]
    if @only_headers_set || @except_headers_set
      %i[_keep_bitmap _keep_extra_cols _early_exit_after].each do |k|
        @quote_escaping_backslash[k] = options[k]
        @quote_escaping_double[k]    = options[k]
      end
    end

    @quote_escaping_auto = options[:quote_escaping] == :auto
    @use_acceleration    = options[:acceleration] && has_acceleration

    # The single options hash used on the hot path — for :auto we always try backslash
    # first (C downgrades to RFC internally via Opt #5 when no backslash is found).
    @hot_path_options = @quote_escaping_auto ? @quote_escaping_backslash : options

    # Build ParseContext objects once after headers are known.
    # Eliminates ~10 rb_hash_aref calls per row by pre-baking all loop-invariant
    # options into a C struct accessed via direct pointer dereference.
    if @use_acceleration
      hot_opts    = @hot_path_options
      double_opts = @quote_escaping_double
      @parse_ctx        = SmarterCSV::Parser.new_parse_context_c(@headers, hot_opts)
      @parse_ctx_double = SmarterCSV::Parser.new_parse_context_c(@headers, double_opts)
    end

    # Key-cleanup flags — computed once, checked per row via cheap ivar reads.
    # hash.delete(nil) / hash.delete('') only occur when key_mapping maps a header to nil/"".
    # hash.delete(:"") also catches empty headers produced by ,, in the CSV.
    @delete_nil_keys   = !!options[:key_mapping]
    @delete_empty_keys = !!options[:key_mapping] || @headers.include?(:"")

    # Cache quote_char as an ivar for the stitch-loop memchr guard (avoids hash lookup per continuation line).
    @quote_char = options[:quote_char]
    # Cache field_size_limit as an ivar (nil when unset → one nil-check per row, no method calls).
    @field_size_limit = options[:field_size_limit]

    # in case we use chunking.. we'll need to set it up..
    if options[:chunk_size].to_i > 0
      use_chunks = true
      chunk_size = options[:chunk_size].to_i
      @chunk_count = 0
      chunk = []
    else
      use_chunks = false
    end

    # --- INSTRUMENTATION HOOKS ---
    # on_start / on_chunk / on_complete are optional callables (nil by default).
    # Hooks only fire from `process` (library-controlled iteration). Enumerator
    # modes (each / each_chunk) do not fire hooks — the caller owns the lifecycle.
    _on_start    = options[:on_start]
    _on_chunk    = options[:on_chunk]
    _on_complete = options[:on_complete]
    _start_time  = Process.clock_gettime(Process::CLOCK_MONOTONIC) if _on_start || _on_complete

    if _on_start
      _input_meta = if @input.is_a?(String)
                      { input: @input, file_size: (File.size(@input) rescue nil) }
                    else
                      { input: @input.class.name, file_size: nil }
                    end
      _on_start.call(_input_meta.merge(col_sep: options[:col_sep], row_sep: options[:row_sep]))
    end

    # now on to processing all the rest of the lines in the CSV file:
    while (line = next_line_with_counts(fh, options))

      # replace invalid byte sequence in UTF-8 with question mark to avoid errors
      line = enforce_utf8_encoding(line, options) if @enforce_utf8

      $stderr.print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose == :debug

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # Snapshot line counters before multiline stitching so error records reflect
      # where the bad row started, not where it failed.
      bad_row_start_csv_line  = @csv_line_count
      bad_row_start_file_line = @file_line_count

      begin
        # --- PARSE (inlined — no method-wrapper overhead on the hot path) ---
        # Replaces: process_line_to_hash → parse_line_to_hash → parse_line_to_hash_auto
        # All routing decisions are pre-baked into ivars set up after header processing.
        if @use_acceleration
          hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx)
          # :auto only: if unclosed quote AND backslash present, RFC may close it differently
          if @quote_escaping_auto && data_size == -1 && line.include?('\\')
            hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx_double)
          end
        else
          has_quotes = line.include?(options[:quote_char])
          hash, data_size = parse_line_to_hash_ruby(line, @headers, @hot_path_options, has_quotes)
          if @quote_escaping_auto && data_size == -1 && line.include?('\\')
            hash, data_size = parse_line_to_hash_ruby(line, @headers, @quote_escaping_double, has_quotes)
          end
        end

        # --- MULTILINE STITCH ---
        # data_size == -1 means the parser saw an unclosed quoted field at end-of-line.
        # Fetch the next physical line, append, and re-parse until the field closes.
        while data_size == -1
          next_line = fh.gets(options[:row_sep])
          raise MalformedCSV, "Unclosed quoted field detected in multiline data" if next_line.nil?

          next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
          line += next_line
          @file_line_count += 1
          $stderr.print "\nline contains unclosed quoted field, including content through file line %d\n" % @file_line_count if @verbose == :debug

          # DoS guard: prevent runaway multiline accumulation (vectors: never-closing quote, huge embedded content)
          if @field_size_limit && line.bytesize > @field_size_limit
            raise SmarterCSV::FieldSizeLimitExceeded,
                  "Multiline field exceeds field_size_limit of #{@field_size_limit} bytes " \
                  "(accumulated #{line.bytesize} bytes)"
          end

          # Opt #8 (memchr guard): if the newly appended line contains no quote character,
          # it cannot close the currently open quoted field — skip the full re-parse and
          # keep accumulating physical lines.  String#include? uses memchr internally (C speed).
          next unless next_line.include?(@quote_char)

          if @use_acceleration
            # :nocov:
            hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx)
            if @quote_escaping_auto && data_size == -1 && line.include?('\\')
              hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx_double)
            end
            # :nocov:
          else
            # Optimization #18: use detect_multiline as a cheap gate before attempting a full
            # Ruby re-parse on the growing stitched line. detect_multiline_strict now uses
            # byteindex skip-ahead (Opt #17) and is faster than parse_line_to_hash_ruby on
            # the same content. Saves N-2 wasted full parses per multiline row.
            next if detect_multiline(line, options)

            has_quotes = true # we know the line has quotes — we've been stitching a quoted field
            hash, data_size = parse_line_to_hash_ruby(line, @headers, @hot_path_options, has_quotes)
            if @quote_escaping_auto && data_size == -1 && line.include?('\\')
              hash, data_size = parse_line_to_hash_ruby(line, @headers, @quote_escaping_double, has_quotes)
            end
          end
        end

        # --- EXTRA COLUMNS ---
        # Grow @headers with generated names when a row has more columns than headers
        # (unless missing_headers: :raise is configured).
        if data_size > @headers.size
          raise SmarterCSV::HeaderSizeMismatch, "extra columns detected on line #{@file_line_count}" if options[:missing_headers] == :raise

          while @headers.size < data_size
            @headers << "#{options[:missing_header_prefix]}#{@headers.size + 1}".to_sym
          end
        end

        next if hash.nil?

        # --- FIELD SIZE LIMIT CHECK ---
        # Pre-filter: if the raw line fits within the limit, no individual field can exceed it
        # (a field is always a substring of its row). Only iterate over values for large rows.
        if @field_size_limit && line.bytesize > @field_size_limit
          hash.each_value do |v|
            if v.is_a?(String) && v.bytesize > @field_size_limit
              raise SmarterCSV::FieldSizeLimitExceeded,
                    "Field exceeds field_size_limit of #{@field_size_limit} bytes (got #{v.bytesize} bytes)"
            end
          end
        end

        # --- COLUMN SELECTION ---
        hash.select! { |k, _| @only_headers_set.include?(k) }   if @only_headers_set
        hash.reject! { |k, _| @except_headers_set.include?(k) } if @except_headers_set

        # --- HASH CLEANUP & TRANSFORMATIONS ---
        if @use_acceleration
          # C already applied: remove_empty_values, convert_values_to_numeric, remove_zero_values.
          # Remove nil/"" keys left by key_mapping or empty CSV headers.
          if @delete_nil_keys
            hash.delete(nil)
            hash.delete('')
          end
          hash.delete(:"") if @delete_empty_keys

          if (matcher = options[:nil_values_matching])
            if options[:remove_empty_values]
              hash.delete_if do |_k, v|
                str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
                str_val && matcher.match?(str_val)
              end
            else
              hash.each_key do |k|
                v = hash[k]
                str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
                hash[k] = nil if str_val && matcher.match?(str_val)
              end
            end
          end

          if options[:value_converters]
            options[:value_converters].each do |key, converter|
              hash[key] = converter.respond_to?(:convert) ? converter.convert(hash[key]) : converter.call(hash[key]) if hash.key?(key)
            end
          end
        else
          hash = hash_transformations(hash, options)
        end

        next if options[:remove_empty_hashes] && hash.empty?

        $stderr.puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == :debug
        # optional adding of csv_line_number to the hash to help debugging
        hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
      rescue SmarterCSV::Error, EOFError => e
        raise if options[:on_bad_row] == :raise

        handle_bad_row(e, line, bad_row_start_csv_line, bad_row_start_file_line, options)
        next
      end

      # process the chunks or the resulting hash
      if use_chunks
        chunk << hash # append temp result to chunk

        if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
          _on_chunk&.call({ chunk_number: @chunk_count + 1, rows_in_chunk: chunk.size, total_rows_so_far: @csv_line_count })
          # do something with the chunk
          if block_given?
            yield chunk, @chunk_count # do something with the hashes in the chunk in the block
          else
            @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
          end
          @chunk_count += 1
          chunk.clear # re-initialize for next chunk of data
        else
          # the last chunk may contain partial data, which is handled below
        end
        # while a chunk is being filled up we don't need to do anything else here

      else # no chunk handling
        if block_given?
          yield [hash], @chunk_count # do something with the hash in the block (better to use chunking here)
          @chunk_count += 1
        else
          @result << hash
        end
      end
    end

    # print new line to retain last processing line message
    $stderr.print "\n" if @verbose == :debug

    # handling of last chunk:
    if !chunk.nil? && chunk.size > 0
      _on_chunk&.call({ chunk_number: @chunk_count + 1, rows_in_chunk: chunk.size, total_rows_so_far: @csv_line_count })
      # do something with the chunk
      if block_given?
        yield chunk, @chunk_count # do something with the hashes in the chunk in the block
      else
        @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
      end
      @chunk_count += 1
      # chunk = [] # initialize for next chunk of data
    end

    if _on_complete
      _on_complete.call({
        total_rows:   @csv_line_count,
        total_chunks: @chunk_count,
        duration:     Process.clock_gettime(Process::CLOCK_MONOTONIC) - _start_time,
        bad_rows:     @errors[:bad_row_count] || 0,
      })
    end
  ensure
    fh.close if fh.respond_to?(:close)
  end

  if block_given?
    @chunk_count # when we do processing through a block we only care how many chunks we processed
  else
    @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
  end
end