Module: SmarterCSV

Defined in:: lib/smarter_csv/parse.rb,
lib/smarter_csv/file_io.rb,
lib/smarter_csv/headers.rb,
lib/smarter_csv/version.rb,
lib/smarter_csv/variables.rb,
lib/smarter_csv/smarter_csv.rb,
lib/smarter_csv/auto_detection.rb,
lib/smarter_csv/options_processing.rb,
ext/smarter_csv/smarter_csv.c

Defined Under Namespace

Classes: DuplicateHeaders, HeaderSizeMismatch, IncorrectOption, KeyMappingError, MissingKeys, NoColSepDetected, SmarterCSVException, ValidationError

Constant Summary collapse

# Version string for this module (defined in lib/smarter_csv/version.rb).
VERSION =

"1.9.3"

# Default value for every supported option. The hash is frozen; process_options
# merges the user-supplied options over a copy of it.
# A value of :auto for :col_sep / :row_sep makes SmarterCSV.process try to detect
# the separator from the input.
DEFAULT_OPTIONS =

{
  acceleration: true,
  auto_row_sep_chars: 500,
  chunk_size: nil,
  col_sep: :auto, # was: ',',
  comment_regexp: nil, # was: /\A#/,
  convert_values_to_numeric: true,
  downcase_header: true,
  duplicate_header_suffix: nil,
  file_encoding: 'utf-8',
  force_simple_split: false,
  force_utf8: false,
  headers_in_file: true,
  invalid_byte_sequence: '',
  keep_original_headers: false,
  key_mapping: nil,
  quote_char: '"',
  remove_empty_hashes: true,
  remove_empty_values: true,
  remove_unmapped_keys: false,
  remove_values_matching: nil,
  remove_zero_values: false,
  required_headers: nil,
  required_keys: nil,
  row_sep: :auto, # was: $/,
  silence_missing_keys: false,
  skip_lines: nil,
  strings_as_keys: false,
  strip_chars_from_headers: nil,
  strip_whitespace: true,
  user_provided_headers: nil,
  value_converters: nil,
  verbose: false,
  with_line_numbers: false,
}.freeze

Class Attribute Summary collapse

.chunk_count ⇒ Object readonly

Returns the value of attribute chunk_count.
.csv_line_count ⇒ Object readonly

Returns the value of attribute csv_line_count.
.errors ⇒ Object readonly

Returns the value of attribute errors.
.file_line_count ⇒ Object readonly

Returns the value of attribute file_line_count.
.headers ⇒ Object readonly

Returns the value of attribute headers.
.raw_header ⇒ Object readonly

Returns the value of attribute raw_header.
.result ⇒ Object readonly

Returns the value of attribute result.
.warnings ⇒ Object readonly

Returns the value of attribute warnings.

Class Method Summary collapse

.count_quote_chars(line, quote_char) ⇒ Object
- the `scan` method iterates through the string and finds all occurrences of the pattern * The regular expression: - (?<!\\) : Negative lookbehind to ensure the quote character is not preceded by an unescaped backslash.
.default_options ⇒ Object

NOTE: this is not called when “parse” methods are tested by themselves.
.has_acceleration? ⇒ Boolean
.headerA ⇒ Object

:nocov:.
.initialize_variables ⇒ Object
.parse_csv_line_c(line, col_sep, quote_char, max_size) ⇒ Object

max_size: pass nil if no limit is specified.
.process(input, given_options = {}, &block) ⇒ Object

first parameter: filename or input object which responds to readline method.
.process_headers(filehandle, options) ⇒ Object
.process_options(given_options = {}) ⇒ Object

NOTE: this is not called when “parse” methods are tested by themselves.

Class Attribute Details

.chunk_count ⇒ `Object` (readonly)

Returns the value of attribute chunk_count.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Reader for @chunk_count: the number of chunks handled so far.
# It is reset to 0 by initialize_variables and incremented by SmarterCSV.process
# each time a chunk is yielded or stored.
def chunk_count
  @chunk_count
end

.csv_line_count ⇒ `Object` (readonly)

Returns the value of attribute csv_line_count.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Reader for @csv_line_count, which initialize_variables resets to 0.
# SmarterCSV.process reports it as :csv_line_number when :with_line_numbers is set.
# NOTE(review): the place where it is incremented is not visible here
# (presumably readline_with_counts) -- confirm.
def csv_line_count
  @csv_line_count
end

.errors ⇒ `Object` (readonly)

Returns the value of attribute errors.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Reader for @errors, which initialize_variables resets to an empty Hash.
def errors
  @errors
end

.file_line_count ⇒ `Object` (readonly)

Returns the value of attribute file_line_count.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Reader for @file_line_count, which initialize_variables resets to 0.
# SmarterCSV.process adds 1 for every extra physical line it appends to a
# multi-line (quoted) CSV row.
def file_line_count
  @file_line_count
end

.headers ⇒ `Object` (readonly)

Returns the value of attribute headers.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Reader for @headers: the processed headers, as returned by process_headers
# and stored by SmarterCSV.process. nil until headers have been processed.
def headers
  @headers
end

.raw_header ⇒ `Object` (readonly)

Returns the value of attribute raw_header.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Reader for @raw_header: the header line as it was read from the file
# (set by process_headers when :headers_in_file is true, otherwise nil).
def raw_header
  @raw_header
end

.result ⇒ `Object` (readonly)

Returns the value of attribute result.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Reader for @result: the Array that SmarterCSV.process fills when called
# without a block (row hashes, or chunks of row hashes in chunked mode).
def result
  @result
end

.warnings ⇒ `Object` (readonly)

Returns the value of attribute warnings.



5
6
7

# File 'lib/smarter_csv/variables.rb', line 5

# Reader for @warnings, which initialize_variables resets to an empty Hash.
def warnings
  @warnings
end

Class Method Details

.count_quote_chars(line, quote_char) ⇒ `Object`

The `scan` method iterates through the string and finds all occurrences of the pattern.
The regular expression:
- `(?<!\\)` : Negative lookbehind to ensure the quote character is not preceded by an unescaped backslash.
- `(?:\\\\)*` : Non-capturing group for an even number of backslashes (escaped backslashes).
  This allows for any number of escaped backslashes before the quote character.
- `#{Regexp.escape(quote_char)}` : Dynamically inserts the quote_char into the regex,
  ensuring it's properly escaped for use in the regex.



185
186
187

# File 'lib/smarter_csv/smarter_csv.rb', line 185

# Counts the unescaped occurrences of +quote_char+ in +line+.
#
# A quote character counts when it is preceded by an even number of backslashes
# (zero included); a quote preceded by an odd number of backslashes is escaped
# and is not counted.
#
# @param line [String, nil] the text to inspect
# @param quote_char [String, nil] the quote character; it is regex-escaped before use
# @return [Integer] number of unescaped quote characters; 0 when +line+ or
#   +quote_char+ is nil, or +quote_char+ is empty
def count_quote_chars(line, quote_char)
  # Without a usable quote character there is nothing to count. This also avoids
  # Regexp.escape(nil) raising, and an empty pattern matching at every position.
  return 0 if line.nil? || quote_char.nil? || quote_char.empty?

  line.scan(/(?<!\\)(?:\\\\)*#{Regexp.escape(quote_char)}/).count
end

.default_options ⇒ `Object`

NOTE: this is not called when “parse” methods are tested by themselves

ONLY FOR BACKWARDS-COMPATIBILITY



58
59
60

# File 'lib/smarter_csv/options_processing.rb', line 58

# Returns the frozen DEFAULT_OPTIONS hash itself (not a copy).
# Kept only for backwards compatibility.
def default_options
  DEFAULT_OPTIONS
end

.has_acceleration? ⇒ `Boolean`

Returns:

(Boolean)



189
190
191

# File 'lib/smarter_csv/smarter_csv.rb', line 189

# Tells whether the +parse_csv_line_c+ method is available.
#
# A positive answer is remembered in @has_acceleration; a negative answer is
# re-checked on every call.
#
# @return [Boolean]
def has_acceleration?
  return @has_acceleration if @has_acceleration

  @has_acceleration = defined?(parse_csv_line_c) ? true : false
end

.headerA ⇒ `Object`

:nocov:

# File 'lib/smarter_csv/variables.rb', line 20

# Deprecated reader for @headerA; emits a deprecation warning on every call.
# SmarterCSV.process keeps @headerA in sync with @headers, so use +headers+ instead.
#
# @return [Array, nil] the value of @headerA
def headerA
  warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
  @headerA
end

.initialize_variables ⇒ `Object`

# File 'lib/smarter_csv/variables.rb', line 7

# Resets all per-run state to its starting values: counters to 0, collections
# to fresh empty objects, header information to nil / empty.
def initialize_variables
  # counters
  @csv_line_count, @chunk_count, @file_line_count = 0, 0, 0

  # header state
  @headerA = []
  @headers = nil
  @raw_header = nil # header as it appears in the file

  # collected output
  @result = []
  @errors = {}
  @warnings = {}
end

.parse_csv_line_c(line, col_sep, quote_char, max_size) ⇒ `Object`

max_size: pass nil if no limit is specified

# File 'ext/smarter_csv/smarter_csv.c', line 15

/*
 * Splits one CSV line into an Array of field Strings.
 *
 * line       - String to split, or nil (nil yields an empty Array)
 * col_sep    - String column separator; may be longer than one byte
 * quote_char - String whose first byte is used as the quote character
 * max_size   - nil for "no limit", otherwise the maximum number of fields to return
 *
 * A separator only splits the line while the number of unescaped quote characters
 * seen so far is even. A quote character is "unescaped" when it is preceded by an
 * even number of consecutive backslashes. Fields are returned verbatim (quotes are
 * not removed) and carry the encoding of the input line.
 *
 * Raises TypeError when line is neither a String nor nil, or when col_sep or
 * quote_char is not a String.
 */
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size) {
  if (RB_TYPE_P(line, T_NIL) == 1) {
    return rb_ary_new();
  }

  if (RB_TYPE_P(line, T_STRING) != 1) {
    rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
  }

  /* RSTRING_PTR / RSTRING_LEN below are only valid on Strings: raise instead of reading garbage */
  Check_Type(col_sep, T_STRING);
  Check_Type(quote_char, T_STRING);

  rb_encoding *encoding = rb_enc_get(line); /* get the encoding from the input line */
  char *startP = RSTRING_PTR(line); /* may not be null terminated */
  long line_len = RSTRING_LEN(line);
  char *endP = startP + line_len ; /* points behind the string */
  char *p = startP;

  char *col_sepP = RSTRING_PTR(col_sep);
  long col_sep_len = RSTRING_LEN(col_sep);

  char *quoteP = RSTRING_PTR(quote_char);
  long quote_char_len = RSTRING_LEN(quote_char);
  long quote_count = 0;

  bool col_sep_found = true;

  VALUE elements = rb_ary_new();
  VALUE field;
  long i;

  long backslash_count = 0; /* to count consecutive backslash characters */

  while (p < endP) {
    /* does the remaining string start with the COMPLETE col_sep ?
     * - an empty col_sep never matches (it would never advance p)
     * - a col_sep that does not fit into the rest of the line never matches;
     *   matching only a prefix would move p past endP and produce a negative field length */
    col_sep_found = (col_sep_len > 0) && ((endP - p) >= col_sep_len);
    for (i = 0; col_sep_found && (i < col_sep_len); i++) {
      col_sep_found = (*(p+i) == *(col_sepP+i));
    }
    /* if col_sep was found and we have even quotes */
    if (col_sep_found && (quote_count % 2 == 0)) {
      /* if max_size != nil && elements.size >= max_size: stop, the rest of the line is dropped */
      if ((max_size != Qnil) && RARRAY_LEN(elements) >= NUM2INT(max_size)) {
        break;
      } else {
        /* push that field with original encoding onto the results */
        field = rb_enc_str_new(startP, p - startP, encoding);
        rb_ary_push(elements, field);

        p += col_sep_len;
        startP = p;
      }
    } else {
      if (*p == '\\') {
        backslash_count++;
      } else {
        /* only the first byte of quote_char is compared; an empty quote_char disables quoting */
        if ((quote_char_len > 0) && (*p == *quoteP) && (backslash_count % 2 == 0)) {
          quote_count++;
        }
        backslash_count = 0; /* no more consecutive backslash characters */
      }
      p++;
    }
  } /* while */

  /* check if the last part of the line needs to be processed */
  if ((max_size == Qnil) || RARRAY_LEN(elements) < NUM2INT(max_size)) {
    /* copy the remaining line as a field with original encoding onto the results */
    field = rb_enc_str_new(startP, endP - startP, encoding);
    rb_ary_push(elements, field);
  }

  return elements;
}

.process(input, given_options = {}, &block) ⇒ `Object`

first parameter: filename or input object which responds to readline method

# File 'lib/smarter_csv/smarter_csv.rb', line 14

# Reads CSV data from +input+ and turns every data row into a Hash keyed by the processed headers.
#
# @param input [String, #readline] a filename, or an object that responds to +readline+;
#   a filename is opened for reading with the encoding from options[:file_encoding]
# @param given_options [Hash] user options; they are passed through process_options
# @yield [Array<Hash>] with a block: each chunk of row hashes (when :chunk_size > 0),
#   otherwise a one-element Array holding a single row hash
# @return [Integer, Array] with a block: @chunk_count (only incremented in chunked mode);
#   without a block: @result (Array of Hashes, or Array of Arrays of Hashes in chunked mode)
#
# NOTE: the +ensure+ clause closes the input whenever it responds to +close+,
# which includes IO objects handed in by the caller.
def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
  options = process_options(given_options)

  initialize_variables

  # remembered once, so the per-value blank check below can use Rails' #blank? when Rails is loaded
  has_rails = !!defined?(Rails)
  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    # auto-detect the row separator
    options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

    if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
      puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
    end

    skip_lines(fh, options)

    # process_headers returns the header list together with its size
    @headers, header_size = process_headers(fh, options)
    @headerA = @headers # @headerA is deprecated, use @headers

    # in case we use chunking.. we'll need to set it up..
    if options[:chunk_size].to_i > 0
      use_chunks = true
      chunk_size = options[:chunk_size].to_i
      @chunk_count = 0
      chunk = []
    else
      use_chunks = false
    end

    # now on to processing all the rest of the lines in the CSV file:
    until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
      line = readline_with_counts(fh, options)

      # replace invalid byte sequence in UTF-8 with question mark to avoid errors
      line = line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i

      print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if options[:verbose]

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # cater for the quoted csv data containing the row separator carriage return character
      # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
      # by detecting the existence of an uneven number of quote characters

      multiline = count_quote_chars(line, options[:quote_char]).odd? # should handle quote_char nil
      # keep appending physical lines until the quotes are balanced again
      while count_quote_chars(line, options[:quote_char]).odd? # should handle quote_char nil
        next_line = fh.readline(options[:row_sep])
        next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
        line += next_line
        @file_line_count += 1
      end
      print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count if options[:verbose] && multiline

      line.chomp!(options[:row_sep])

      dataA, _data_size = parse(line, options, header_size)

      dataA.map!{|x| x.strip} if options[:strip_whitespace]

      # if all values are blank, then ignore this line
      next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

      # pair headers and values by position
      hash = @headers.zip(dataA).to_h

      # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
      hash.delete(nil)
      hash.delete('')
      hash.delete(:"")

      if options[:remove_empty_values] == true
        hash.delete_if{|_k, v| has_rails ? v.blank? : blank?(v)}
      end

      hash.delete_if{|_k, v| !v.nil? && v =~ /^(0+|0+\.0+)$/} if options[:remove_zero_values] # values are Strings
      hash.delete_if{|_k, v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]

      if options[:convert_values_to_numeric]
        hash.each do |k, v|
          # deal with the :only / :except options to :convert_values_to_numeric
          next if limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)

          # convert if it's a numeric value:
          case v
          when /^[+-]?\d+\.\d+$/
            hash[k] = v.to_f
          when /^[+-]?\d+$/
            hash[k] = v.to_i
          end
        end
      end

      if options[:value_converters]
        hash.each do |k, v|
          # a converter is looked up per key and must respond to #convert
          converter = options[:value_converters][k]
          next unless converter

          hash[k] = converter.convert(v)
        end
      end

      next if options[:remove_empty_hashes] && hash.empty?

      hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]

      if use_chunks
        chunk << hash # append temp result to chunk

        if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
          # do something with the chunk
          if block_given?
            yield chunk # do something with the hashes in the chunk in the block
          else
            @result << chunk # not sure yet, why anybody would want to do this without a block
          end
          @chunk_count += 1
          chunk = [] # initialize for next chunk of data
        else

          # the last chunk may contain partial data, which also needs to be returned (BUG / ISSUE-18)

        end

        # while a chunk is being filled up we don't need to do anything else here

      else # no chunk handling
        if block_given?
          yield [hash] # do something with the hash in the block (better to use chunking here)
        else
          @result << hash
        end
      end
    end

    # print new line to retain last processing line message
    print "\n" if options[:verbose]

    # last chunk:
    # (chunk is nil when chunking is off; it is non-empty here when the final rows were
    # skipped by one of the `next` statements above before the chunk could be flushed)
    if !chunk.nil? && chunk.size > 0
      # do something with the chunk
      if block_given?
        yield chunk # do something with the hashes in the chunk in the block
      else
        @result << chunk # not sure yet, why anybody would want to do this without a block
      end
      @chunk_count += 1
      # chunk = [] # initialize for next chunk of data
    end
  ensure
    fh.close if fh.respond_to?(:close)
  end

  if block_given?
    @chunk_count # when we do processing through a block we only care how many chunks we processed
  else
    @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
  end
end

.process_headers(filehandle, options) ⇒ `Object`

# File 'lib/smarter_csv/headers.rb', line 5

# Determines the headers to use for the rows of a CSV input.
#
# When options[:headers_in_file] is set, the header line is read from +filehandle+
# (and kept verbatim in @raw_header). When options[:user_provided_headers] is given,
# those headers are used instead; if a header line was also read, both must have
# the same number of columns.
#
# The caller's :user_provided_headers array is never modified: symbolizing builds a new array.
# NOTE(review): disambiguate_headers / remap_headers are defined elsewhere; whether they
# mutate their argument is not visible here.
#
# @param filehandle [#readline] input positioned at the header line
# @param options [Hash] the processed options
# @return [Array(Array, Integer)] the final headers and their count
# @raise [SmarterCSV::IncorrectOption] when no headers are available, or
#   :user_provided_headers is not a non-empty Array
# @raise [SmarterCSV::HeaderSizeMismatch] when user-provided and in-file header counts differ
def process_headers(filehandle, options)
  @raw_header = nil # header as it appears in the file
  @headers = nil # the processed headers
  file_header_array = nil
  file_header_size = nil

  # if headers_in_file, get the headers -> We get the number of columns, even when user provided headers
  if options[:headers_in_file] # extract the header line
    # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
    header_line = @raw_header = readline_with_counts(filehandle, options)
    header_line = preprocess_header_line(header_line, options)
    file_header_array, file_header_size = parse_and_modify_headers(header_line, options)
  elsif !options[:user_provided_headers]
    raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers"
  end

  if options[:user_provided_headers]
    unless options[:user_provided_headers].is_a?(Array) && !options[:user_provided_headers].empty?
      raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for user_provided_headers! Expecting array with headers.")
    end

    # use user-provided headers
    user_header_array = options[:user_provided_headers]
    # user_provided_headers: their count should match the headers_in_file if any
    if !file_header_size.nil? && user_header_array.size != file_header_size
      raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{user_header_array.size} headers !=  CSV-file has #{file_header_size} headers"
    end
    header_array = user_header_array
  else
    header_array = file_header_array
  end

  # detect duplicate headers and disambiguate
  header_array = disambiguate_headers(header_array, options) if options[:duplicate_header_suffix]

  # symbolize headers into a new array, so an array supplied by the caller stays untouched
  header_array = header_array.map(&:to_sym) unless options[:strings_as_keys] || options[:keep_original_headers]

  # wouldn't make sense to re-map user provided headers
  header_array = remap_headers(header_array, options) if options[:key_mapping] && !options[:user_provided_headers]

  validate_and_deprecate_headers(header_array, options)

  [header_array, header_array.size]
end

.process_options(given_options = {}) ⇒ `Object`

NOTE: this is not called when “parse” methods are tested by themselves

# File 'lib/smarter_csv/options_processing.rb', line 42

# Builds the effective options by merging +given_options+ over DEFAULT_OPTIONS,
# stores them in @options, validates them via validate_options! and returns them.
#
# +given_options+ itself is not modified, so frozen or shared hashes are safe to pass in.
# A missing or nil :invalid_byte_sequence is replaced by '' in the result.
#
# NOTE: this is not called when "parse" methods are tested by themselves.
# NOTE(review): in verbose mode +pp+ prints the hash itself and then returns it, so the
# hash appears both before the label and again inside the interpolated text.
#
# @param given_options [Hash] user-supplied options
# @return [Hash] the computed options (the same object as @options)
def process_options(given_options = {})
  puts "User provided options:\n#{pp(given_options)}\n" if given_options[:verbose]

  @options = DEFAULT_OPTIONS.merge(given_options)

  # fix invalid input -- on our own copy instead of writing into the caller's hash
  @options[:invalid_byte_sequence] = '' if given_options[:invalid_byte_sequence].nil?

  puts "Computed options:\n#{pp(@options)}\n" if given_options[:verbose]

  validate_options!(@options)
  @options
end

Module: SmarterCSV

Defined Under Namespace

Constant Summary collapse

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.chunk_count ⇒ Object (readonly)

.csv_line_count ⇒ Object (readonly)

.errors ⇒ Object (readonly)

.file_line_count ⇒ Object (readonly)

.headers ⇒ Object (readonly)

.raw_header ⇒ Object (readonly)

.result ⇒ Object (readonly)

.warnings ⇒ Object (readonly)

Class Method Details

.count_quote_chars(line, quote_char) ⇒ Object

.default_options ⇒ Object

.has_acceleration? ⇒ Boolean

.headerA ⇒ Object

.initialize_variables ⇒ Object

.parse_csv_line_c(line, col_sep, quote_char, max_size) ⇒ Object

.process(input, given_options = {}, &block) ⇒ Object

.process_headers(filehandle, options) ⇒ Object

.process_options(given_options = {}) ⇒ Object