Module: Philiprehberger::CsvKit
- Defined in:
- lib/philiprehberger/csv_kit.rb,
lib/philiprehberger/csv_kit/row.rb,
lib/philiprehberger/csv_kit/writer.rb,
lib/philiprehberger/csv_kit/dialect.rb,
lib/philiprehberger/csv_kit/version.rb,
lib/philiprehberger/csv_kit/detector.rb,
lib/philiprehberger/csv_kit/callbacks.rb,
lib/philiprehberger/csv_kit/processor.rb,
lib/philiprehberger/csv_kit/error_handler.rb
Defined Under Namespace
Modules: Callbacks, ErrorHandler
Classes: Detector, Dialect, Error, Processor, Row, Writer
Constant Summary collapse
- VERSION = '0.10.0'
Class Method Summary collapse
-
.count(path_or_io, dialect: nil) ⇒ Integer
Count data rows without loading them all into memory.
-
.each_hash(path_or_io, dialect: nil) {|Hash{Symbol => String}| ... } ⇒ Enumerator?
Stream rows one at a time as symbolized hashes with constant memory.
-
.filter(path_or_io, dialect: nil) {|Hash{Symbol => String}| ... } ⇒ String
Filter rows and return matching rows as a CSV string.
-
.find(path_or_io, dialect: nil) {|Hash{Symbol => String}| ... } ⇒ Hash{Symbol => String}?
Find the first row matching a predicate, streaming (stops as soon as a match is found).
-
.headers(path_or_io, dialect: nil) ⇒ Array<Symbol>
Return the header row as an array of symbols.
-
.pluck(path_or_io, *keys, dialect: nil) ⇒ Array<Hash{Symbol => String}>
Extract specific columns from a CSV.
-
.process(path_or_io, dialect: nil) {|Processor| ... } ⇒ Array<Row>
Streaming DSL — yields a Processor for configuration, then executes.
-
.sample(path_or_io, n, dialect: nil) ⇒ Array<Hash{Symbol => String}>
Return n randomly sampled rows using reservoir sampling (Algorithm R).
-
.to_csv(rows, headers: nil, dialect: nil) ⇒ String
Serialize an array of hashes to a CSV string.
-
.to_hashes(path_or_io, dialect: nil) ⇒ Array<Hash{Symbol => String}>
Load an entire CSV into an array of symbolized hashes.
-
.transpose(path_or_io, dialect: nil) ⇒ Hash{Symbol => Array}
Read a CSV and return a hash mapping each header to the column of values.
Class Method Details
.count(path_or_io, dialect: nil) ⇒ Integer
Count data rows without loading them all into memory.
111 112 113 114 115 |
# File 'lib/philiprehberger/csv_kit.rb', line 111

# Count data rows without loading them all into memory.
#
# Each row yielded by foreach_row only bumps a counter; the row itself
# is discarded.
#
# @param path_or_io CSV source, handed to foreach_row unchanged
# @param dialect optional dialect setting forwarded to foreach_row
# @return [Integer] number of rows yielded by foreach_row
def self.count(path_or_io, dialect: nil)
  total = 0
  foreach_row(path_or_io, headers: true, dialect: dialect) do |_row|
    total += 1
  end
  total
end
.each_hash(path_or_io, dialect: nil) {|Hash{Symbol => String}| ... } ⇒ Enumerator?
Stream rows one at a time as symbolized hashes with constant memory. Returns an Enumerator if no block is given.
124 125 126 127 128 129 130 131 132 |
# File 'lib/philiprehberger/csv_kit.rb', line 124

# Stream rows one at a time as hashes with symbolized keys.
#
# @param path_or_io CSV source, handed to foreach_row unchanged
# @param dialect optional dialect setting forwarded to foreach_row
# @yield [Hash] one row per iteration, keys converted with to_sym
# @return [Enumerator] when no block is given; otherwise whatever
#   Enumerator#each returns for the supplied block
def self.each_hash(path_or_io, dialect: nil, &block)
  # The enumerator is lazy: foreach_row only runs once iteration starts.
  row_stream = Enumerator.new do |sink|
    foreach_row(path_or_io, headers: true, dialect: dialect) do |csv_row|
      symbolized = csv_row.to_h.transform_keys(&:to_sym)
      sink.yield(symbolized)
    end
  end
  return row_stream unless block

  row_stream.each(&block)
end
.filter(path_or_io, dialect: nil) {|Hash{Symbol => String}| ... } ⇒ String
Filter rows and return matching rows as a CSV string.
180 181 182 183 184 185 186 187 188 189 |
# File 'lib/philiprehberger/csv_kit.rb', line 180

# Filter rows and return the matching rows as a CSV string.
#
# Rows are pulled through each_hash and written out as soon as they match,
# so neither the full input nor the matching subset is collected into an
# array first. The header line is taken from the keys of the first matching
# row. When nothing matches, the result is an empty string (no header line).
#
# NOTE(review): the output is produced with CSV.generate's default options;
# +dialect+ is only forwarded to the reading side.
#
# @param path_or_io CSV source, forwarded to each_hash
# @param dialect optional dialect setting forwarded to each_hash
# @yield [Hash] a symbol-keyed row; a truthy result keeps the row
# @return [String] CSV text of the matching rows, or '' when none match
def self.filter(path_or_io, dialect: nil, &block)
  headers = nil
  CSV.generate do |csv|
    each_hash(path_or_io, dialect: dialect) do |row|
      next unless block.call(row)

      # Emit the header line lazily so a run with no matches stays empty.
      if headers.nil?
        headers = row.keys
        csv << headers
      end
      csv << headers.map { |key| row[key] }
    end
  end
end
.find(path_or_io, dialect: nil) {|Hash{Symbol => String}| ... } ⇒ Hash{Symbol => String}?
Find the first row matching a predicate, streaming (stops as soon as a match is found).
166 167 168 169 170 171 172 |
# File 'lib/philiprehberger/csv_kit.rb', line 166

# Find the first row matching a predicate.
#
# Iteration stops at the first match because the method returns from
# inside the foreach_row block.
#
# @param path_or_io CSV source, handed to foreach_row unchanged
# @param dialect optional dialect setting forwarded to foreach_row
# @yield [Hash] a symbol-keyed row; a truthy result selects it
# @return [Hash, nil] the first matching row, or nil when none matches
def self.find(path_or_io, dialect: nil, &block)
  foreach_row(path_or_io, headers: true, dialect: dialect) do |csv_row|
    candidate = csv_row.to_h.transform_keys(&:to_sym)
    next unless block.call(candidate)

    return candidate
  end
  nil
end
.headers(path_or_io, dialect: nil) ⇒ Array<Symbol>
Return the header row as an array of symbols.
94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/philiprehberger/csv_kit.rb', line 94

# Return the header row as an array of symbols.
#
# Only the first record is read (a single csv.shift).
#
# @param path_or_io CSV source, handed to with_csv unchanged
# @param dialect optional dialect; when given, Dialect#merge_into
#   supplies the options passed to with_csv
# @return [Array<Symbol>] header names, or [] when the source has no first row
def self.headers(path_or_io, dialect: nil)
  options = dialect ? Dialect.new(dialect).merge_into({}) : {}
  first_record = nil
  with_csv(path_or_io, options) { |csv| first_record = csv.shift }
  first_record ? first_record.map(&:to_sym) : []
end
.pluck(path_or_io, *keys, dialect: nil) ⇒ Array<Hash{Symbol => String}>
Extract specific columns from a CSV.
85 86 87 |
# File 'lib/philiprehberger/csv_kit.rb', line 85

# Extract specific columns from a CSV.
#
# Rows come back from to_hashes with symbol keys, so the requested column
# names are converted to symbols before slicing. This lets callers pass
# either :name or "name"; previously String names matched nothing and
# every row came back as an empty hash.
#
# @param path_or_io CSV source, forwarded to to_hashes
# @param keys [Array<Symbol, String>] column names to keep
# @param dialect optional dialect setting forwarded to to_hashes
# @return [Array<Hash>] one hash per row containing only the requested columns
def self.pluck(path_or_io, *keys, dialect: nil)
  # Keys that cannot be symbolized are passed through untouched.
  wanted = keys.map { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
  to_hashes(path_or_io, dialect: dialect).map { |row| row.slice(*wanted) }
end
.process(path_or_io, dialect: nil) {|Processor| ... } ⇒ Array<Row>
Streaming DSL — yields a Processor for configuration, then executes.
26 27 28 29 30 |
# File 'lib/philiprehberger/csv_kit.rb', line 26

# Streaming DSL entry point: builds a Processor, hands it to the block
# for configuration, then runs it.
#
# @param path_or_io CSV source, passed to Processor.new
# @param dialect optional dialect setting passed to Processor.new
# @yield [Processor] the freshly built processor, before it runs
# @return the result of Processor#run
def self.process(path_or_io, dialect: nil, &block)
  Processor
    .new(path_or_io, dialect: dialect)
    .tap { |configured| block.call(configured) }
    .run
end
.sample(path_or_io, n, dialect: nil) ⇒ Array<Hash{Symbol => String}>
Return n randomly sampled rows using reservoir sampling (Algorithm R). Memory usage is O(n) regardless of file size. If the file has fewer than n rows, all rows are returned.
142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
# File 'lib/philiprehberger/csv_kit.rb', line 142

# Return n randomly sampled rows using reservoir sampling (Algorithm R).
#
# At most n rows are held at any time. If the source yields fewer than
# n rows, all of them are returned.
#
# @param path_or_io CSV source, handed to foreach_row unchanged
# @param n [Integer] number of rows to keep
# @param dialect optional dialect setting forwarded to foreach_row
# @return [Array<Hash>] the sampled rows as symbol-keyed hashes
def self.sample(path_or_io, n, dialect: nil)
  seen = 0
  kept = []

  foreach_row(path_or_io, headers: true, dialect: dialect) do |csv_row|
    record = csv_row.to_h.transform_keys(&:to_sym)
    if seen >= n
      # Reservoir is full: draw a slot among all rows seen so far
      # (including this one) and replace only if it falls inside.
      slot = rand(seen + 1)
      kept[slot] = record if slot < n
    else
      kept << record
    end
    seen += 1
  end

  kept
end
.to_csv(rows, headers: nil, dialect: nil) ⇒ String
Serialize an array of hashes to a CSV string.
If headers is omitted, the keys of the first hash are used. Empty input returns an empty string. Dialect options are passed through to the writer.
68 69 70 71 72 73 74 75 76 77 |
# File 'lib/philiprehberger/csv_kit.rb', line 68

# Serialize an array of hashes to a CSV string.
#
# When +headers+ is omitted, the keys of the first row are used. Empty
# input with no explicit headers yields an empty string. Header names and
# Hash row keys are converted to symbols before reaching the writer;
# non-Hash rows are passed through as they are.
#
# @param rows [Array] rows to write
# @param headers [Array, nil] explicit column names
# @param dialect optional dialect setting passed to Writer.stream
# @return [String] the contents written to the in-memory buffer
def self.to_csv(rows, headers: nil, dialect: nil)
  nothing_to_write = rows.empty? && headers.nil?
  return '' if nothing_to_write

  column_names = (headers || rows.first.keys).map(&:to_sym)
  buffer = StringIO.new
  Writer.stream(buffer, headers: column_names, dialect: dialect) do |writer|
    rows.each do |row|
      if row.is_a?(Hash)
        writer << row.transform_keys(&:to_sym)
      else
        writer << row
      end
    end
  end
  buffer.string
end
.to_hashes(path_or_io, dialect: nil) ⇒ Array<Hash{Symbol => String}>
Load an entire CSV into an array of symbolized hashes.
37 38 39 40 41 42 43 |
# File 'lib/philiprehberger/csv_kit.rb', line 37

# Load an entire CSV into an array of hashes with symbolized keys.
#
# @param path_or_io CSV source, handed to foreach_row unchanged
# @param dialect optional dialect setting forwarded to foreach_row
# @return [Array<Hash>] every row yielded by foreach_row, in order
def self.to_hashes(path_or_io, dialect: nil)
  [].tap do |collected|
    foreach_row(path_or_io, headers: true, dialect: dialect) do |csv_row|
      collected << csv_row.to_h.transform_keys(&:to_sym)
    end
  end
end
.transpose(path_or_io, dialect: nil) ⇒ Hash{Symbol => Array}
Read a CSV and return a hash mapping each header to the column of values.
50 51 52 53 54 55 56 57 |
# File 'lib/philiprehberger/csv_kit.rb', line 50

# Read a CSV and return a hash mapping each header to its column of values.
#
# The column set is taken from the keys of the first row. A source with
# no rows yields an empty hash.
#
# @param path_or_io CSV source, forwarded to to_hashes
# @param dialect optional dialect setting forwarded to to_hashes
# @return [Hash{Symbol => Array}] header => values in row order
def self.transpose(path_or_io, dialect: nil)
  records = to_hashes(path_or_io, dialect: dialect)
  columns = {}
  return columns if records.empty?

  records.first.each_key do |header|
    columns[header] = records.map { |record| record[header] }
  end
  columns
end