Module: TidyBytes

Included in:
String
Defined in:
lib/wayback_machine_downloader/tidy_bytes.rb

Overview

Converts a string with a potentially broken or unknown encoding into a valid UTF-8 string. Todo: consider using charlock_holmes for encoding detection in the future.

Defined Under Namespace

Modules: InstanceMethods

Constant Summary collapse

# U+FFFD REPLACEMENT CHARACTER, substituted for bytes that cannot be converted
# to UTF-8. Written as an escape so the value survives editors and extraction
# tools that drop the raw character; an empty literal here would make every
# `include?(UNICODE_REPLACEMENT_CHARACTER)` check in tidy_bytes return true.
UNICODE_REPLACEMENT_CHARACTER = "\uFFFD".freeze
# Common encodings to try, in list order, for best multilingual compatibility.
# Only encodings whose name appears in Encoding.name_list are kept.
# Frozen so the shared list cannot be mutated at runtime.
COMMON_ENCODINGS = [
  Encoding::UTF_8,
  Encoding::Windows_1251, # Cyrillic/Russian legacy
  Encoding::GB18030,      # Simplified Chinese
  Encoding::Shift_JIS,    # Japanese
  Encoding::EUC_KR,       # Korean
  Encoding::ISO_8859_1,   # Western European
  Encoding::Windows_1252  # Western European/Latin1 superset
].select { |enc| Encoding.name_list.include?(enc.name) }.freeze

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.included(base) ⇒ Object



61
62
63
# File 'lib/wayback_machine_downloader/tidy_bytes.rb', line 61

# Hook invoked when this module is included somewhere: mixes
# InstanceMethods into the including class or module.
#
# @param base [Module] the class or module that included this module
def self.included(base)
  base.class_eval do
    include InstanceMethods
  end
end

Instance Method Details

#binary_data? ⇒ Boolean

returns true if the string appears to be binary (has null bytes)

Returns:

  • (Boolean)


21
22
23
# File 'lib/wayback_machine_downloader/tidy_bytes.rb', line 21

# Returns true if the string appears to be binary, i.e. contains a null byte.
#
# The search runs on a binary-encoded copy of the receiver (String#b), so it
# is a pure byte-level check and does not raise
# Encoding::CompatibilityError when the receiver is tagged with an
# ASCII-incompatible encoding such as UTF-16. Such strings are reported as
# binary whenever they contain a zero byte.
#
# @return [Boolean]
def binary_data?
  b.include?("\x00")
end

#tidy_bytes ⇒ Object

attempts to return a valid UTF-8 version of the string



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/wayback_machine_downloader/tidy_bytes.rb', line 26

# Attempts to return a valid UTF-8 version of the string.
#
# Resolution order:
# 1. A string already tagged UTF-8 with a valid encoding is returned as-is
#    (the receiver itself, not a copy).
# 2. A string for which binary_data? is true is returned as a
#    BINARY-encoded copy, untouched.
# 3. Otherwise the bytes are reinterpreted as each entry of COMMON_ENCODINGS
#    in turn and transcoded to UTF-8; the first result that needs no
#    replacement characters wins.
# 4. If every candidate needed replacements, the first candidate (from the
#    first encoding in the list) is returned, replacement characters included.
#
# Each conversion is performed once: replace-mode String#encode substitutes
# invalid/undefined bytes instead of raising, so the lossy result from the
# first pass can be reused rather than recomputed in a second loop.
#
# @return [String] a valid UTF-8 string, or a BINARY-encoded copy for binary data
def tidy_bytes
  return self if encoding == Encoding::UTF_8 && valid_encoding?
  return dup.force_encoding("BINARY") if binary_data?

  replace_opts = { invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER }
  raw = dup
  lossy = nil

  COMMON_ENCODINGS.each do |enc|
    # encode always returns a new string, so re-tagging `raw` on the next
    # iteration does not affect candidates produced earlier
    candidate = raw.force_encoding(enc).encode(Encoding::UTF_8, **replace_opts)
    next unless candidate.valid_encoding?
    return candidate unless candidate.include?(UNICODE_REPLACEMENT_CHARACTER)

    # remember the first lossy conversion as the fallback result
    lossy ||= candidate
  end

  # only reached without a candidate when COMMON_ENCODINGS is empty:
  # transcode from the string's own encoding, replacing what cannot convert
  lossy || dup.encode(Encoding::UTF_8, **replace_opts)
end

#tidy_bytes! ⇒ Object



57
58
59
# File 'lib/wayback_machine_downloader/tidy_bytes.rb', line 57

# In-place variant of tidy_bytes: overwrites the receiver's contents with
# the result of tidy_bytes and returns the receiver.
#
# @return [String] self
def tidy_bytes!
  cleaned = tidy_bytes
  replace(cleaned)
end