Class: Chronicle::Email::MboxExtractor

Inherits:
Chronicle::ETL::Extractor
  • Object
show all
Defined in:
lib/chronicle/email/mbox_extractor.rb

Constant Summary collapse

NEW_EMAIL_REGEX =

The mbox format is a series of emails concatenated together, each one introduced by a separator line that starts with “From ”

Regexp.new('^From [^\s]+ .{24}')

Instance Method Summary collapse

Instance Method Details

#extractObject



31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/chronicle/email/mbox_extractor.rb', line 31

# Streams the mbox file at @filename and yields one extraction per email.
#
# Lines are buffered in a temp file until the next separator line
# (one matching NEW_EMAIL_REGEX) is seen; the buffered message is then
# parsed with Mail and yielded. The message still buffered when the input
# ends is emitted too, so the final email in the file is not dropped.
#
# @yield the value returned by build_extraction for each email; the data
#   hash passed to it has the keys :raw, :time, :subject, :from and :to
# @raise [SystemCallError] if @filename cannot be opened
def extract
  file = File.open(@filename)
  tmp = Tempfile.new('chronicle-mbox')

  # Parses whatever is currently buffered in tmp, empties the buffer and
  # returns the extraction built from that message.
  take_buffered = lambda do
    tmp.rewind
    email = Mail.new(tmp.read)
    tmp.truncate(0)
    tmp.rewind

    build_extraction(
      data: {
        raw: email,
        time: email.date&.to_time,
        subject: email.subject,
        from: email.from&.join(', '),
        to: email.to&.join(', ')
      }
    )
  end

  # Read the .mbox file line by line and look for a header that indicates
  # the start of a new email. As we read line by line, we save to a tmp
  # file and then read it back when we notice the next header.
  # Doing it this way is a lot faster than saving each line to a
  # variable, especially when we're reading emails with large binary
  # attachments.
  #
  # TODO: make this thread-safe (one tmp file per email?)
  file.each do |line|
    # Tempfile#size flushes pending writes before measuring; File.size(tmp)
    # does not, so short messages still sitting in the write buffer would
    # look empty. The size is only checked on separator lines.
    yield take_buffered.call if line.match?(NEW_EMAIL_REGEX) && tmp.size.positive?
    tmp.write(line)
  end

  # The last message has no separator after it, so emit it explicitly.
  yield take_buffered.call if tmp.size.positive?
ensure
  # Either handle may be nil if opening failed; don't mask the real error.
  tmp&.close
  tmp&.unlink
  file&.close
end

#prepareObject



27
28
29
# File 'lib/chronicle/email/mbox_extractor.rb', line 27

# Remembers the first configured input as the file that will be read.
#
# Only the first entry of @config.input is used; any further entries are
# ignored by this method.
#
# @return the value stored in @filename
def prepare
  inputs = @config.input
  @filename = inputs.first
end

#results_countObject



21
22
23
24
25
# File 'lib/chronicle/email/mbox_extractor.rb', line 21

# Counts the emails in the mbox file at @filename by tallying every match
# of NEW_EMAIL_REGEX, reading the file one line at a time.
#
# @return [Integer] total number of separator matches found
def results_count
  total = 0
  File.foreach(@filename) do |line|
    total += line.scan(NEW_EMAIL_REGEX).length
  end
  total
end