Module: MsgExtractor::Util

Defined in:
lib/msg_extractor/util.rb

Constant Summary collapse

# Named HTML entities recognised by decode_entities, keyed by the entity name
# without the surrounding "&" and ";". The hash is frozen so it cannot be
# mutated at runtime.
# NOTE(review): confirm whether "nbsp" is meant to map to a plain space or to
# U+00A0 (no-break space); the two are visually indistinguishable here.
ENTITIES =
{
  "amp" => "&", "lt" => "<", "gt" => ">", "quot" => '"',
  "apos" => "'", "nbsp" => " "
}.freeze

Class Method Summary collapse

Class Method Details

.decode_entities(text) ⇒ Object

Single-pass entity decoder. Handles named entities, decimal numeric references, and hex numeric references. Hostile codepoints (out-of-range or surrogate) are replaced with the Unicode replacement character instead of raising. Avoids double-decoding: &amp;#65; → “&#65;”, not “A”.



40
41
42
43
44
45
46
47
48
49
# File 'lib/msg_extractor/util.rb', line 40

# Decodes HTML character references in +text+ in a single pass.
#
# Recognised forms:
# * named entities listed in ENTITIES (e.g. "&amp;", "&lt;")
# * decimal numeric references ("&#65;")
# * hexadecimal numeric references with either marker case ("&#x41;", "&#X41;")
#
# Numeric references that cannot be represented safely -- NUL, values above
# U+10FFFF, and UTF-16 surrogates -- become U+FFFD (the replacement character)
# instead of raising or injecting a NUL into the output.
#
# Because every reference is substituted in one gsub pass, replacement text is
# never re-scanned: "&amp;#65;" yields "&#65;", not "A".
#
# @param text [String] text that may contain character references
# @return [String] a new string with the recognised references decoded
def decode_entities(text)
  text.gsub(/&(?:(amp|lt|gt|quot|apos|nbsp)|#(\d+)|#[xX](\h+));/) do
    match = Regexp.last_match
    next ENTITIES[match[1]] if match[1]

    # Exactly one of the numeric groups matched: group 2 is decimal, group 3 hex.
    cp = match[2] ? match[2].to_i : match[3].to_i(16)

    # Integer#chr raises RangeError for surrogates and out-of-range values, and
    # NUL is never legitimate in decoded text, so substitute U+FFFD for all three.
    if cp.zero? || cp > 0x10FFFF || (0xD800..0xDFFF).cover?(cp)
      "\u{FFFD}"
    else
      cp.chr(Encoding::UTF_8)
    end
  end
end

.dedupe_path(path) ⇒ Object

“f.txt” -> “f (1).txt” -> “f (2).txt” until the path is free.



12
13
14
15
16
17
18
19
# File 'lib/msg_extractor/util.rb', line 12

# Returns a path that does not currently exist on disk, derived from +path+.
#
# If +path+ is free it is returned unchanged. Otherwise " (N)" is inserted
# before the file extension, with N counting up from 1 until no file exists
# at the resulting path: "f.txt" -> "f (1).txt" -> "f (2).txt" ...
#
# @param path [String] desired file path
# @return [String] +path+ itself, or the first numbered variant that is free
def dedupe_path(path)
  return path unless ::File.exist?(path)

  ext = ::File.extname(path)
  stem = path.delete_suffix(ext)
  n = 1
  loop do
    candidate = "#{stem} (#{n})#{ext}"
    return candidate unless ::File.exist?(candidate)

    n += 1
  end
end

.html_to_text(html) ⇒ Object

Crude tag-stripping fallback used only when a message has an HTML body but no plain-text body.



23
24
25
26
27
28
29
# File 'lib/msg_extractor/util.rb', line 23

# Crude HTML-to-plain-text conversion.
#
# Steps, in order:
# 1. pass the input through strip_blocks (defined elsewhere in this module;
#    presumably removes whole non-content blocks -- confirm against its source)
# 2. turn <br> tags and the closing tags of p/div/tr/li/h1-h6 into newlines
# 3. drop every remaining tag
# 4. decode character references via decode_entities
# 5. remove spaces/tabs that precede a newline, collapse runs of three or more
#    newlines to two, and strip leading/trailing whitespace
#
# @param html [String] HTML markup
# @return [String] plain-text rendering of +html+
def html_to_text(html)
  stripped = strip_blocks(html)
  with_breaks = stripped.gsub(/<br\s*\/?>/i, "\n")
  with_block_ends = with_breaks.gsub(%r{</(p|div|tr|li|h[1-6])>}i, "\n")
  tagless = with_block_ends.gsub(/<[^>]+>/, "")

  decoded = decode_entities(tagless)
  tidy = decoded.gsub(/[ \t]+\n/, "\n")
  tidy.gsub(/\n{3,}/, "\n\n").strip
end

.sanitize_filename(name) ⇒ Object



5
6
7
8
9
# File 'lib/msg_extractor/util.rb', line 5

# Produces a filename-safe version of +name+.
#
# The input is converted with to_s, then every ASCII control character
# (0x00-0x1F) and each of \ / : * ? " < > | is replaced by an underscore, and
# surrounding whitespace is stripped. If nothing remains, or the result
# consists solely of dots (e.g. "." or ".."), "unnamed" is returned instead.
#
# @param name [#to_s] proposed filename (nil is accepted)
# @return [String] the sanitised filename, or "unnamed"
def sanitize_filename(name)
  replaced = name.to_s.gsub(%r{[\x00-\x1F\\/:*?"<>|]}, "_")
  safe = replaced.strip
  return "unnamed" if safe.empty? || safe.match?(/\A\.+\z/)

  safe
end