Module: Sisimai::RFC2045

Defined in:
lib/sisimai/rfc2045.rb

Overview

Sisimai::RFC2045 provides MIME utilities for Sisimai.

Class Method Summary collapse

Class Method Details

.boundary(argv0 = '', start = -1) ⇒ String

Get a boundary string

Parameters:

  • argv0 (String) (defaults to: '')

    The value of Content-Type header

  • start (Integer) (defaults to: -1)

    -1: the boundary string itself, 0: start of boundary ("--" prefix), 1: end of boundary ("--" prefix and suffix)

Returns:

  • (String)

    Boundary string



127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/sisimai/rfc2045.rb', line 127

# Get a boundary string from the value of a Content-Type: header
# @param    [String]  argv0   The value of Content-Type header
# @param    [Integer] start   -1: boundary string itself
#                              0: Start of boundary ("--" is prepended)
#                              1: End of boundary ("--" is prepended and appended)
# @return   [String]          Boundary string, or an empty string when none was found
def boundary(argv0 = '', start = -1)
  return "" if argv0.empty?

  # Content-Type: multipart/mixed; boundary=Apple-Mail-5--931376066
  # Content-Type: multipart/report; report-type=delivery-status;
  #    boundary="n6H9lKZh014511.1247824040/mx.example.jp"
  token = parameter(argv0, 'boundary')
  return '' if token.empty?

  if start > 0
    # Closing delimiter of the multipart/* block
    '--' + token + '--'
  elsif start > -1
    # Opening delimiter of each part
    '--' + token
  else
    token
  end
end

.decodeB(argv0 = nil) ⇒ String

Decode MIME BASE64 Encoded string

Parameters:

  • argv0 (String) (defaults to: nil)

    MIME Encoded text

Returns:

  • (String)

    MIME-Decoded text



88
89
90
91
92
93
94
# File 'lib/sisimai/rfc2045.rb', line 88

# Decode MIME BASE64 Encoded string
# @param    [String] argv0   MIME Encoded text
# @return   [String]         MIME-Decoded text, or an empty string when there is nothing to decode
def decodeB(argv0 = nil)
  return "" if argv0.nil? || argv0.empty?

  # Only the first run of characters from the BASE64 alphabet (plus CR/LF) is decoded
  found = argv0.match(%r|([+/\=0-9A-Za-z\r\n]+)|)
  return "" unless found

  decoded = Base64.decode64(found[1])
  decoded ? decoded.scrub('?') : ""
end

.decodeH(argvs = []) ⇒ String

Decode MIME-Encoded string in an email header

Parameters:

  • argvs (Array) (defaults to: [])

    An array including MIME-Encoded text

Returns:

  • (String)

    MIME-Decoded text



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/sisimai/rfc2045.rb', line 36

# Decode MIME-Encoded string in an email header
# @param    [Array] argvs   An array including MIME-Encoded text. The array is only read; it
#                           is NOT consumed or modified by this method.
# @return   [String]        MIME-Decoded text converted to UTF-8, or an empty string when no
#                           text block was collected
def decodeH(argvs = [])
  ctxcharset = nil  # Character set name taken from the first encoded-word
  qbencoding = nil  # "B" or "Q", taken from the first encoded-word
  textblocks = []

  argvs.each do |v|
    # Check and decode each element; iteration stops at the first nil/false element
    break unless v
    e = v.strip.delete('"')

    if self.is_encoded(e)
      # MIME Encoded string like "=?utf-8?B?55m954yr44Gr44KD44KT44GT?="
      cv = e.match(/\A(.*)=[?]([-_0-9A-Za-z]+)[?]([BbQq])[?](.+)[?]=?(.*)\z/)
      next unless cv

      ctxcharset ||= cv[2]
      qbencoding ||= cv[3]
      notdecoded   = cv[4]

      decodedone = if qbencoding.upcase == 'B'
                     Base64.decode64(notdecoded)
                   else
                     notdecoded.unpack('M').first
                   end
      decodedone.gsub!(/\r\n/, '')

      # Text before the encoded-word, the decoded text, and the text after the encoded-word
      textblocks << cv[1] << decodedone << cv[5]
    else
      # Not encoded: elements after the first one are joined with a single space
      textblocks << (textblocks.empty? ? e : " #{e}")
    end
  end
  return '' if textblocks.empty?

  jointtext = textblocks.join('')
  if ctxcharset && qbencoding
    # utf8 => UTF-8
    ctxcharset = 'UTF-8' if ctxcharset.casecmp('UTF8') == 0

    if ctxcharset.casecmp('UTF-8') != 0
      # Characterset is not UTF-8
      begin
        jointtext = jointtext.encode!('UTF-8', ctxcharset)
      rescue
        jointtext = 'FAILED TO CONVERT THE SUBJECT'
      end
    end
  end

  # Work on a copy so that force_encoding does not touch the joined text itself
  return jointtext.dup.force_encoding('UTF-8').scrub('?')
end

.decodeQ(argv0 = nil) ⇒ String

Decode MIME Quoted-Printable Encoded string

Parameters:

  • argv0 (String) (defaults to: nil)

    MIME Encoded text

Returns:

  • (String)

    MIME Decoded text



99
100
101
102
# File 'lib/sisimai/rfc2045.rb', line 99

# Decode MIME Quoted-Printable Encoded string
# @param    [String] argv0   MIME Encoded text
# @return   [String]         MIME Decoded text, or an empty string for nil/empty input
def decodeQ(argv0 = nil)
  if argv0.nil? || argv0.empty?
    ""
  else
    # The "M" directive of String#unpack decodes quoted-printable text
    plaintext = argv0.unpack('M')[0]
    plaintext.scrub('?')
  end
end

.haircut(block = '', heads = false) ⇒ Array

Cut header fields except Content-Type, Content-Transfer-Encoding from multipart/* block

Parameters:

  • block (String) (defaults to: '')

    multipart/* block text

  • heads (Boolean) (defaults to: false)

    true = Returns only Content-(Type|Transfer-Encoding) headers

Returns:

  • (Array)

    Two headers and body part of multipart/* block

Since:

  • v5.0.0



145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# File 'lib/sisimai/rfc2045.rb', line 145

# Cut header fields except Content-Type, Content-Transfer-Encoding from multipart/* block
# @param    [String]  block   multipart/* block text
# @param    [Boolean] heads   true = Returns only Content-(Type|Transfer-Encoding) headers
# @return   [Array, nil]      nil when block is empty; ['', ''] when the block has no blank
#                             line separating headers from a body or no Content-Type field;
#                             [content-type, content-transfer-encoding] when heads is true;
#                             otherwise [content-type, content-transfer-encoding, body part]
# @since v5.0.0
def haircut(block = '', heads = false)
  return nil if block.empty?

  (upperchunk, lowerchunk) = block.split("\n\n", 2)
  return ['', ''] if upperchunk.nil? || upperchunk.empty? || lowerchunk.nil?
  return ['', ''] if upperchunk.index('Content-Type').nil?

  fieldctype = 'Content-Type:'
  fieldctenc = 'Content-Transfer-Encoding:'
  headerpart = ['', ''] # ["text/plain; charset=iso-2022-jp; ...", "quoted-printable"]

  upperchunk.split("\n").each do |e|
    # Remove fields except Content-Type:, and Content-Transfer-Encoding: in each part of
    # multipart/* block such as the following:
    #   Date: Thu, 29 Apr 2018 22:22:22 +0900
    #   MIME-Version: 1.0
    #   Message-ID: ...
    #   Content-Transfer-Encoding: quoted-printable
    #   Content-Type: text/plain; charset=us-ascii
    if e.start_with?(fieldctype)
      # Content-Type: ***
      # Take everything after the colon so that a field written without a space after the
      # colon ("Content-Type:text/plain") is still parsed; the boundary value keeps its case.
      v = e[fieldctype.size..-1].lstrip
      headerpart[0] = v.include?('boundary=') ? v : v.downcase

    elsif e.start_with?(fieldctenc)
      # Content-Transfer-Encoding: ***
      headerpart[1] = e[fieldctenc.size..-1].lstrip.downcase

    elsif e.include?('boundary=') || e.include?('charset=')
      # Folded "Content-Type" field: a following line has boundary="..." or charset="utf-8"
      next if headerpart[0].empty?
      headerpart[0] = "#{headerpart[0]} #{e}".gsub(/\s\s+/, ' ')
    end
  end
  return headerpart if heads

  mediatypev = headerpart[0].downcase
  ctencoding = headerpart[1]
  bodyinside = sprintf("Content-Type: %s\n", headerpart[0])

  # Do not append Content-Transfer-Encoding: header when the part is the original message:
  # Content-Type is message/rfc822 or text/rfc822-headers, or message/delivery-status, or
  # message/feedback-report
  asitistype = ['/rfc822', '/delivery-status', '/feedback-report'].any? { |a| mediatypev.include?(a) }
  unless asitistype || ctencoding.empty?
    bodyinside += sprintf("Content-Transfer-Encoding: %s\n", ctencoding)
  end

  # Append LF before the lower chunk unless it is empty or already begins with LF
  bodyinside += "\n" unless lowerchunk.empty? || lowerchunk[0, 1] == "\n"
  bodyinside += lowerchunk

  # The body part becomes the 3rd element of the same array that holds the two headers
  return headerpart << bodyinside
end

.is_encoded(argv1) ⇒ Boolean

Check that the argument is MIME-Encoded string or not

Parameters:

  • argv1 (String)

    String to be checked

Returns:

  • (Boolean)

    false: Not MIME encoded string, true: MIME encoded string



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/sisimai/rfc2045.rb', line 11

# Check that the argument is MIME-Encoded string or not
# @param    [String] argv1   String to be checked
# @return   [Boolean]        false: Not MIME encoded string
#                            true:  MIME encoded string
def is_encoded(argv1)
  return false if argv1.nil? || argv1.empty?

  unquoted = argv1.delete('"')

  # A line may hold multiple MIME-Encoded strings separated by spaces; one match is enough
  tokens = unquoted.include?(' ') ? unquoted.split(' ') : [unquoted]
  tokens.any? { |token| token =~ /[ \t]*=[?][-_0-9A-Za-z]+[?][BbQq][?].+[?]=?[ \t]*/ }
end

.levelout(argv0 = '', argv1 = '') ⇒ Array

Split argv1: multipart/* blocks by a boundary string in argv0

Parameters:

  • argv0 (String) (defaults to: '')

    The value of Content-Type header

  • argv1 (String) (defaults to: '')

    A pointer to multipart/* message blocks

Returns:

  • (Array)

    List of each part of multipart/*

Since:

  • v5.0.0



218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# File 'lib/sisimai/rfc2045.rb', line 218

# Split argv1: multipart/* blocks by a boundary string in argv0
# @param    [String] argv0  The value of Content-Type header
# @param    [String] argv1  A pointer to multipart/* message blocks
# @return   [Array]         List of each part of multipart/*; each entry is
#                           [content-type, content-transfer-encoding, body]. An empty array
#                           is returned when there is no boundary or no usable part.
# @since v5.0.0
def levelout(argv0 = '', argv1 = '')
  return [] if argv0.empty?
  return [] if argv1.empty?

  boundary01 = boundary(argv0, 0); return [] if boundary01.empty?
  multiparts = argv1.split(Regexp.new(Regexp.escape(boundary01) + "\n"))
  partstable = []

  # String#split returns an empty array when argv1 consists only of boundary lines, so this
  # check has to come before multiparts[0] is dereferenced
  return [] if multiparts.empty?

  # Remove empty or useless preamble and epilogue of multipart/* block
  multiparts.shift if multiparts[0].size < 8
  return []        if multiparts.empty?
  multiparts.pop   if multiparts.size > 2 && multiparts[-1].size < 8

  multiparts.each do |e|
    # Check each part and breaks up internal multipart/* block
    f = haircut(e)
    next if f.nil? # haircut returns nil for an empty part (two consecutive boundary lines)

    if f[0].index('multipart/')
      # There is nested multipart/* block
      boundary02 = boundary(f[0], -1); next if boundary02.empty?
      bodyinside = f[-1].split("\n\n", 2)[-1]
      next if bodyinside.size < 9 || bodyinside.index(boundary02).nil?

      v = levelout(f[0], bodyinside)
      partstable += v if v.size > 0
    else
      # The part is not a multipart/* block
      b = f[-1].size > 0 ? f[-1] : e
      v = [f[0], f[1], f[0].size > 0 ? b.split("\n\n", 2)[-1] : b]
      partstable << v
    end
  end
  return [] if partstable.empty?

  # Remove $boundary01.'--' and strings from the boundary to the end of the body part.
  # The body of the last part is truncated in place so that the table entry is updated.
  boundary01.chomp!
  b = partstable[-1][2]
  p = b.index(boundary01 + '--')
  b[p, b.size] = "" if p

  return partstable
end

.makeflat(argv0 = '', argv1 = '') ⇒ String

Make flat multipart/* part blocks and decode

Parameters:

  • argv0 (String) (defaults to: '')

    The value of Content-Type header

  • argv1 (String) (defaults to: '')

    A pointer to multipart/* message blocks

Returns:

  • (String)

    Flattened and decoded text of the selected multipart/* parts



264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
# File 'lib/sisimai/rfc2045.rb', line 264

# Make flat multipart/* part blocks and decode
# @param    [String] argv0  The value of Content-Type header
# @param    [String] argv1  A pointer to multipart/* message blocks
# @return   [String]        Decoded text of the text/* and message/* parts joined into one
#                           string; an empty string when an argument is nil, when argv0 is
#                           not a multipart/* value with a boundary= parameter, or when no
#                           part was picked
def makeflat(argv0 = '', argv1 = '')
  return "" if argv0.nil? || argv1.nil?

  # String#index returns nil (never false) when the substring is missing, so the presence of
  # "multipart/" and "boundary=" is tested with String#include?
  lowerctype = argv0.downcase
  return "" unless lowerctype.include?('multipart/') && lowerctype.include?('boundary=')

  # Some bounce messages include lower-cased "content-type:" field such as the followings:
  #   - content-type: message/delivery-status        => Content-Type: message/delivery-status
  #   - content-transfer-encoding: quoted-printable  => Content-Transfer-Encoding: quoted-printable
  #   - CHARSET=, BOUNDARY=                          => charset=, boundary=
  #   - message/xdelivery-status                     => message/delivery-status
  iso2022set = %r/charset=["']?(iso-2022-[-a-z0-9]+)['"]?\b/
  multiparts = levelout(argv0, argv1)
  flattenout = ''
  delimiters = ["/delivery-status", "/rfc822", "/feedback-report", "/partial"]

  while e = multiparts.shift do
    # Pick only the following parts Sisimai::Lhost will use, and decode each part
    #   - text/plain, text/rfc822-headers
    #   - message/delivery-status, message/rfc822, message/partial, message/feedback-report
    istexthtml = false
    mediatypev = parameter(e[0]); mediatypev = "text/plain" if mediatypev.empty?
    next unless mediatypev.start_with?('text/', 'message/')

    if mediatypev == 'text/html'
      # Skip text/html part when the value of Content-Type: header in an internal part of
      # multipart/* includes multipart/alternative;
      next if argv0.index('multipart/alternative')
      istexthtml = true
    end

    ctencoding = e[1]
    bodyinside = e[2]
    bodystring = ''

    if ctencoding.size > 0
      # Check the value of Content-Transfer-Encoding: header
      if ctencoding == 'base64'
        # Content-Transfer-Encoding: base64
        bodystring = decodeB(bodyinside) || ''

        # Don't pick the decoded part as an error message when the part is
        # - BASE64 encoded.
        # - the value of the charset is not utf-8.
        # - NOT a plain text.
        unless Sisimai::String.aligned(e[0], ['charset', '=', 'utf-8'])
          # NOTE(review): the trailing "-" in the character class is a literal hyphen, so a
          # "-" within the first 10 characters also makes the part be skipped; confirm that
          # this is intended
          next if bodystring[0, 10] =~ /[\x00-\x08\x0E-\x1F\x7F-]/
        end

      elsif ctencoding == 'quoted-printable'
        # Content-Transfer-Encoding: quoted-printable
        bodystring = decodeQ(bodyinside) || ''

      elsif ctencoding == '7bit'
        # Content-Transfer-Encoding: 7bit
        if cv = e[0].downcase.match(iso2022set)
          # Content-Type: text/plain; charset=ISO-2022-JP
          bodystring = Sisimai::String.to_utf8(bodyinside, cv[1])
        else
          # No ISO-2022-* "charset" parameter in the value of Content-Type: header
          bodystring = bodyinside
        end
      else
        # Content-Transfer-Encoding: 8bit, binary, and so on
        bodystring = bodyinside
      end

      if istexthtml
        # Try to delete HTML tags inside of text/html part whenever possible
        bodystring = Sisimai::String.to_plain(bodystring)
      end
      next if bodystring.empty?

      # The body string will be encoded to UTF-8 forcely and call String#scrub method to avoid
      # the following errors:
      #   - incompatible character encodings: ASCII-8BIT and UTF-8
      #   - invalid byte sequence in UTF-8
      if bodystring.encoding.to_s != 'UTF-8'
        # ASCII-8BIT or other 8bit encodings
        ctxcharset = parameter(e[0], 'charset')
        if ctxcharset.empty?
          # The part which has no "charset" parameter causes an ArgumentError: invalid byte
          # sequence in UTF-8 so String#scrub should be called
          bodystring.scrub!('?')
        else
          # ISO-8859-1, GB2312, and so on
          bodystring = Sisimai::String.to_utf8(bodystring, ctxcharset)
        end
        bodystring += "\n\n"
      end

      bodystring.gsub!(/\r\n/, "\n") if bodystring.include?("\r\n") # Convert CRLF to LF

    else
      # There is no Content-Transfer-Encoding header in the part
      be = bodyinside.encoding.to_s
      bodyinside  = Sisimai::String.to_utf8(bodyinside, be) if be != 'UTF-8'
      bodystring += bodyinside
    end

    if delimiters.any? { |a| mediatypev.include?(a) }
      # Add Content-Type: header of each part (will be used as a delimiter at Sisimai::Lhost)
      # into the body inside when the value of Content-Type: field is message/delivery-status,
      # message/rfc822, or text/rfc822-headers
      bodystring = sprintf("Content-Type: %s\n%s", mediatypev, bodystring)
    end

    # Append "\n\n" when $bodystring does not already end with two LFs
    bodystring += "\n\n" if bodystring[-2, 2] != "\n\n"
    flattenout += bodystring
  end

  return flattenout
end

.parameter(argv0 = '', argv1 = '') ⇒ String

Find a value of specified field name from Content-Type: header

Parameters:

  • argv0 (String) (defaults to: '')

    The value of Content-Type: header

  • argv1 (String) (defaults to: '')

    Lower-cased attribute name of the parameter

Returns:

  • (String)

    The value of the parameter

Since:

  • v5.0.0



109
110
111
112
113
114
115
116
117
118
119
# File 'lib/sisimai/rfc2045.rb', line 109

# Find a value of specified field name from Content-Type: header
# @param    [String] argv0  The value of Content-Type: header
# @param    [String] argv1  Lower-cased attribute name of the parameter; when it is empty,
#                           the leading token of argv0 (up to the first ";") is returned
# @return   [String]        The value of the parameter with quotation marks removed; it is
#                           lower-cased except for the "boundary" parameter
# @since v5.0.0
def parameter(argv0 = '', argv1 = '')
  return "" if argv0.empty?

  startpoint = 0
  if argv1.size > 0
    # Find the position right after "name=" of the parameter specified in argv1
    nameequals = argv1 + '='
    foundindex = argv0.index(nameequals)
    return '' if foundindex.nil?
    startpoint = foundindex + nameequals.size
  end

  # The value ends at the next ";" or at the end of the string
  tokenvalue = argv0[startpoint, argv0.size].split(';', 2)[0] || ''
  tokenvalue = tokenvalue.downcase unless argv1 == 'boundary'
  return tokenvalue.delete('"').delete("'")
end