Module: SmarterCSV

Defined in: lib/smarter_csv/version.rb,
lib/smarter_csv/smarter_csv.rb

Defined Under Namespace

Classes: DuplicateHeaders, HeaderSizeMismatch, IncorrectOption, KeyMappingError, MalformedCSVError, MissingHeaders, NoColSepDetected, SmarterCSVException

Constant Summary collapse

VERSION =

"1.6.1"

Class Method Summary collapse

.process(input, options = {}, &block) ⇒ Object

First parameter: a filename, or an input object that responds to the `readline` method.

Class Method Details

.process(input, options = {}, &block) ⇒ `Object`

First parameter: a filename, or an input object that responds to the `readline` method.

# File 'lib/smarter_csv/smarter_csv.rb', line 12

# Reads CSV data line by line and turns every data row into a Hash whose keys
# are the headers returned by process_headers.
#
# @param input [String, #readline] a filename, or an object that responds to
#   +readline+; such an object is used directly instead of being opened. It is
#   also asked for +eof?+, and is closed at the end if it responds to +close+.
# @param options [Hash] user options, merged over +default_options+. Keys read
#   in this method: :invalid_byte_sequence, :file_encoding, :force_utf8,
#   :row_sep, :col_sep, :skip_lines, :chunk_size, :comment_regexp, :quote_char,
#   :verbose, :strip_whitespace, :remove_empty_hashes, :remove_empty_values,
#   :remove_zero_values, :remove_values_matching, :convert_values_to_numeric,
#   :value_converters.
# @yield [Array<Hash>] with chunking (:chunk_size > 0) each chunk of row hashes;
#   without chunking a one-element Array holding the current row hash.
# @return [Array<Hash>, Array<Array<Hash>>] without a block: all row hashes, or
#   (in chunked mode) an Array of chunks.
# @return [Integer, nil] with a block: the number of chunks yielded; nil when
#   chunking is not enabled, because the counter is only set up for chunking.
def SmarterCSV.process(input, options={}, &block)
  options = default_options.merge(options)
  options[:invalid_byte_sequence] = '' if options[:invalid_byte_sequence].nil?

  result = []
  @file_line_count = 0
  @csv_line_count = 0
  has_rails = !! defined?(Rails)

  # Re-labels the text as UTF-8 and replaces invalid / undefined byte sequences
  # with options[:invalid_byte_sequence]. Applied when :force_utf8 is set or
  # when :file_encoding does not mention utf-8; otherwise the text is returned
  # untouched. The options are read on every call.
  scrub = lambda do |text|
    if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
      text.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace,
                                                   replace: options[:invalid_byte_sequence])
    else
      text
    end
  end

  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    # auto-detect the row separator
    options[:row_sep] = SmarterCSV.guess_line_ending(fh, options) if options[:row_sep].to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep].to_sym == :auto

    if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && ( fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8') )
      puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
    end

    # discard the requested number of leading lines (nothing happens for values <= 0)
    options[:skip_lines].to_i.times { readline_with_counts(fh, options) }

    headerA, header_size = process_headers(fh, options)

    # in case we use chunking.. we'll need to set it up..
    # (nil.to_i is 0, so a missing :chunk_size disables chunking)
    chunk_size = options[:chunk_size].to_i
    use_chunks = chunk_size > 0
    if use_chunks
      chunk_count = 0
      chunk = []
    end

    # now on to processing all the rest of the lines in the CSV file:
    # we can't use fh.readlines() here, because this would read the whole file into memory at once
    until fh.eof?
      line = scrub.call(readline_with_counts(fh, options))

      print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if options[:verbose]

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # Quoted CSV data may contain the row separator, in which case one row is
      # split across several file lines. An odd number of quote characters
      # signals that the row continues on the next line.
      # A nil :quote_char disables this detection (String#count(nil) would raise).
      quote_char = options[:quote_char]
      multiline = !quote_char.nil? && line.count(quote_char).odd?
      still_open = multiline
      # Stop at end of input: if the quotes never balance, the accumulated text
      # is handed to parse as-is instead of readline raising EOFError.
      while still_open && !fh.eof?
        line += scrub.call(fh.readline(options[:row_sep]))
        @file_line_count += 1
        still_open = line.count(quote_char).odd?
      end
      print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count if options[:verbose] && multiline

      line.chomp!(options[:row_sep])

      dataA, _data_size = parse(line, options, header_size)

      dataA.map!(&:strip) if options[:strip_whitespace]

      # if all values are blank, then ignore this line
      # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
      next if options[:remove_empty_hashes] && blank?(dataA)

      # Hash.zip is not core Ruby; original note: "from Facets of Ruby library"
      hash = Hash.zip(headerA,dataA)

      # drop columns whose header is nil, an empty String, or an empty Symbol
      # (the :"" literal is fine here: this method already relies on Ruby >= 1.9 syntax)
      [nil, '', :""].each { |blank_key| hash.delete(blank_key) }

      if options[:remove_empty_values] == true
        if has_rails
          hash.delete_if{|k,v| v.blank?}
        else
          hash.delete_if{|k,v| blank?(v)}
        end
      end

      hash.delete_if{|k,v| ! v.nil? && v =~ /^(\d+|\d+\.\d+)$/ && v.to_f == 0} if options[:remove_zero_values]   # values are typically Strings!
      hash.delete_if{|k,v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]

      if options[:convert_values_to_numeric]
        hash.each do |k,v|
          # deal with the :only / :except options to :convert_values_to_numeric
          next if SmarterCSV.only_or_except_limit_execution( options, :convert_values_to_numeric , k )

          # convert if it's a numeric value:
          case v
          when /^[+-]?\d+\.\d+$/
            hash[k] = v.to_f
          when /^[+-]?\d+$/
            hash[k] = v.to_i
          end
        end
      end

      if options[:value_converters]
        hash.each do |k,v|
          converter = options[:value_converters][k]
          next unless converter
          hash[k] = converter.convert(v)
        end
      end

      next if options[:remove_empty_hashes] && hash.empty?

      if use_chunks
        chunk << hash  # append temp result to chunk

        # while a chunk is being filled up we don't need to do anything else here
        if chunk.size >= chunk_size || fh.eof?   # if chunk is full, or EOF reached
          if block_given?
            yield chunk  # do something with the hashes in the chunk in the block
          else
            result << chunk  # not sure yet, why anybody would want to do this without a block
          end
          chunk_count += 1
          chunk = []  # initialize for next chunk of data
        end
      elsif block_given?
        yield [hash]  # do something with the hash in the block (better to use chunking here)
      else
        result << hash
      end
    end

    # print new line to retain last processing line message
    print "\n" if options[:verbose]

    # last chunk: a partially filled chunk is left over when the final input
    # lines were skipped via `next` before reaching the chunk handling above
    if chunk && !chunk.empty?
      if block_given?
        yield chunk  # do something with the hashes in the chunk in the block
      else
        result << chunk  # not sure yet, why anybody would want to do this without a block
      end
      chunk_count += 1
      chunk = []  # initialize for next chunk of data
    end
  ensure
    # fh is nil here if File.open raised; nil does not respond to close
    fh.close if fh.respond_to?(:close)
  end

  # with a block we only care how many chunks were processed (nil when not chunking);
  # otherwise return an Array of Hashes, or an Array of Arrays of Hashes (chunked mode)
  block_given? ? chunk_count : result
end

Module: SmarterCSV

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.process(input, options = {}, &block) ⇒ Object

.process(input, options = {}, &block) ⇒ `Object`