Class: Archaeo::EncodingDetector

Inherits:
Object
  • Object
show all
Defined in:
lib/archaeo/encoding_detector.rb

Overview

Detects and transcodes content from legacy encodings to UTF-8.

Tries a configurable list of encodings in priority order, returning the first that produces valid output. Used as a fallback when Content-Type charset and HTML meta charset are both absent.

Constant Summary collapse

# Encodings tried, in this order, when the constructor is not given an
# explicit +encodings:+ list.
# NOTE(review): ISO-8859-1 assigns a character to every byte value, so if the
# validity check accepts it for any input, the Windows_1252 entry listed after
# it would never be reached — confirm against valid_in_encoding?.
DEFAULT_ENCODINGS =
[
  Encoding::UTF_8,
  Encoding::Windows_1251,
  Encoding::GB18030,
  Encoding::Shift_JIS,
  Encoding::EUC_KR,
  Encoding::ISO_8859_1,
  Encoding::Windows_1252,
].freeze
# Fraction of non-text control bytes in the sampled prefix above which
# #binary? reports the content as binary.
BINARY_THRESHOLD =
0.1
# Bytes below 0x20 that #binary? does not count as non-printable:
# tab (0x09), line feed (0x0A) and carriage return (0x0D).
TEXT_CONTROL_BYTES =
[0x09, 0x0A, 0x0D].freeze

Instance Method Summary collapse

Constructor Details

#initialize(encodings: DEFAULT_ENCODINGS) ⇒ EncodingDetector

Returns a new instance of EncodingDetector.



24
25
26
# File 'lib/archaeo/encoding_detector.rb', line 24

# Builds a detector that tries +encodings+ in the given order.
#
# @param encodings [Array<Encoding, String>] candidate encodings in priority
#   order; entries may be Encoding objects or encoding names such as
#   "Shift_JIS", which are resolved with Encoding.find
# @raise [ArgumentError] if a name does not match a known encoding
def initialize(encodings: DEFAULT_ENCODINGS)
  # Resolve names up front and keep a private copy, so that #detect always
  # yields Encoding objects and later mutation of the caller's list cannot
  # change the detection order.
  @encodings = Array(encodings).map { |enc| Encoding.find(enc) }
end

Instance Method Details

#binary?(bytes) ⇒ Boolean

Returns:

  • (Boolean)


53
54
55
56
57
58
59
60
61
# File 'lib/archaeo/encoding_detector.rb', line 53

# Heuristically decides whether +bytes+ looks like binary data.
#
# Only the first 4096 bytes are inspected. A byte counts against the content
# when it is below 0x20 and is not listed in TEXT_CONTROL_BYTES; the content
# is reported as binary when the share of such bytes in the inspected prefix
# is strictly greater than BINARY_THRESHOLD.
#
# @param bytes [String, nil] raw content
# @return [Boolean] false for nil or empty input
def binary?(bytes)
  return false if bytes.nil? || bytes.empty?

  # byteslice stops at the end of the string, so short input is taken whole.
  head = bytes.byteslice(0, 4096)
  suspicious = head.each_byte.count do |byte|
    next false if byte >= 0x20

    !TEXT_CONTROL_BYTES.include?(byte)
  end

  suspicious.fdiv(head.bytesize) > BINARY_THRESHOLD
end

#detect(bytes) ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
# File 'lib/archaeo/encoding_detector.rb', line 28

# Picks the encoding to assume for +bytes+.
#
# Nil or empty input is reported as UTF-8 without further checks. Otherwise
# the configured encodings are tested in order and the first one accepted by
# valid_in_encoding? is returned. UTF-8 is also the answer when no configured
# encoding is accepted.
#
# @param bytes [String, nil] raw content (passed through bytes_to_string)
# @return [Encoding] an entry of the configured list, or Encoding::UTF_8
def detect(bytes)
  return Encoding::UTF_8 if bytes.nil? || bytes.empty?

  raw = bytes_to_string(bytes)
  winner = @encodings.find { |candidate| valid_in_encoding?(raw, candidate) }
  winner || Encoding::UTF_8
end

#transcode(bytes, fallback: Encoding::UTF_8) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/archaeo/encoding_detector.rb', line 40

# Converts raw content to a UTF-8 string.
#
# Input that is already a validly encoded UTF-8 string is returned as a copy.
# Otherwise the encoding is chosen with #detect and the bytes are converted
# via encode_to_utf8. When #detect answers UTF-8 the bytes are re-tagged as
# UTF-8 instead of being converted.
#
# @param bytes [String, nil] raw content
# @param fallback [Encoding] forwarded unchanged to encode_to_utf8
# @return [String] UTF-8 text; "" for nil or empty input
def transcode(bytes, fallback: Encoding::UTF_8)
  return "" if bytes.nil? || bytes.empty?

  string = bytes.is_a?(String) ? bytes.dup : bytes.to_s
  return string if string.encoding == Encoding::UTF_8 && string.valid_encoding?

  binary = bytes_to_string(bytes)
  detected = detect(bytes)
  return encode_to_utf8(binary, detected, fallback) unless detected == Encoding::UTF_8

  # #detect also answers UTF-8 when no configured encoding matched, so the
  # bytes are not guaranteed to be well-formed. Replace invalid sequences
  # rather than hand back a string that raises on the first regexp match.
  utf8 = binary.force_encoding(Encoding::UTF_8)
  utf8.valid_encoding? ? utf8 : utf8.scrub
end