Module: UnicodeScriptDetector::Confusables

Defined in:
lib/unicode_script_detector/confusables.rb

Constant Summary collapse

# Curated mapping of characters that visually resemble Latin letters (or
# ASCII digits). Each key is a single look-alike character; each value is the
# ASCII character it resembles.
#
# AUTO-GENERATED from unicode.org/Public/security/latest/confusables.txt.
# Run `rake update_confusables` to regenerate.
#
# The Georgian and Ethiopic keys are written as "\uXXXX" escapes (taken from
# the code point noted beside each entry) so they cannot silently degrade to
# empty-string keys, which would collapse into one bogus '' entry.
# NOTE(review): confirm the generator emits these four keys correctly.
MAPPING = {

  # Cyrillic → Latin
  'Ѕ' => 'S',   # U+0405
  'І' => 'l',   # U+0406
  'Ј' => 'J',   # U+0408
  'А' => 'A',   # U+0410
  'В' => 'B',   # U+0412
  'Е' => 'E',   # U+0415
  'З' => '3',   # U+0417
  'К' => 'K',   # U+041A
  'М' => 'M',   # U+041C
  'Н' => 'H',   # U+041D
  'О' => 'O',   # U+041E
  'Р' => 'P',   # U+0420
  'С' => 'C',   # U+0421
  'Т' => 'T',   # U+0422
  'У' => 'Y',   # U+0423
  'Х' => 'X',   # U+0425
  'Ь' => 'b',   # U+042C
  'а' => 'a',   # U+0430
  'б' => '6',   # U+0431
  'г' => 'r',   # U+0433
  'е' => 'e',   # U+0435
  'о' => 'o',   # U+043E
  'р' => 'p',   # U+0440
  'с' => 'c',   # U+0441
  'у' => 'y',   # U+0443
  'х' => 'x',   # U+0445
  'ш' => 'w',   # U+0448
  'ѕ' => 's',   # U+0455
  'і' => 'i',   # U+0456
  'ј' => 'j',   # U+0458
  'ѡ' => 'w',   # U+0461
  'Ѵ' => 'V',   # U+0474
  'ѵ' => 'v',   # U+0475
  'Ү' => 'Y',   # U+04AE
  'ү' => 'y',   # U+04AF
  'һ' => 'h',   # U+04BB
  'ҽ' => 'e',   # U+04BD
  'Ӏ' => 'l',   # U+04C0
  'ӏ' => 'l',   # U+04CF
  'Ӡ' => '3',   # U+04E0
  'ԁ' => 'd',   # U+0501
  'Ԍ' => 'G',   # U+050C
  'ԛ' => 'q',   # U+051B
  'Ԝ' => 'W',   # U+051C
  'ԝ' => 'w',   # U+051D

  # Greek → Latin
  'ͺ' => 'i',   # U+037A
  'Ϳ' => 'J',   # U+037F
  'Α' => 'A',   # U+0391
  'Β' => 'B',   # U+0392
  'Ε' => 'E',   # U+0395
  'Ζ' => 'Z',   # U+0396
  'Η' => 'H',   # U+0397
  'Ι' => 'l',   # U+0399
  'Κ' => 'K',   # U+039A
  'Μ' => 'M',   # U+039C
  'Ν' => 'N',   # U+039D
  'Ο' => 'O',   # U+039F
  'Ρ' => 'P',   # U+03A1
  'Τ' => 'T',   # U+03A4
  'Υ' => 'Y',   # U+03A5
  'Χ' => 'X',   # U+03A7
  'α' => 'a',   # U+03B1
  'γ' => 'y',   # U+03B3
  'ι' => 'i',   # U+03B9
  'ν' => 'v',   # U+03BD
  'ο' => 'o',   # U+03BF
  'ρ' => 'p',   # U+03C1
  'σ' => 'o',   # U+03C3
  'υ' => 'u',   # U+03C5
  'ϒ' => 'Y',   # U+03D2
  'Ϝ' => 'F',   # U+03DC
  'Ϩ' => '2',   # U+03E8
  'Ϭ' => '6',   # U+03EC
  'ϭ' => 'o',   # U+03ED
  'ϱ' => 'p',   # U+03F1
  'ϲ' => 'c',   # U+03F2
  'ϳ' => 'j',   # U+03F3
  'ϸ' => 'p',   # U+03F8
  'Ϲ' => 'C',   # U+03F9
  'Ϻ' => 'M',   # U+03FA

  # Armenian → Latin
  'Ս' => 'U',   # U+054D
  'Տ' => 'S',   # U+054F
  'Օ' => 'O',   # U+0555
  'ա' => 'w',   # U+0561
  'գ' => 'q',   # U+0563
  'զ' => 'q',   # U+0566
  'հ' => 'h',   # U+0570
  'ո' => 'n',   # U+0578
  'ռ' => 'n',   # U+057C
  'ս' => 'u',   # U+057D
  'ց' => 'g',   # U+0581
  'ւ' => 'i',   # U+0582
  'ք' => 'f',   # U+0584
  'օ' => 'o',   # U+0585

  # Georgian → Latin
  "\u10E7" => 'y',   # U+10E7
  "\u10FF" => 'o',   # U+10FF

  # Hebrew → Latin
  '׀' => 'l',   # U+05C0
  'ו' => 'l',   # U+05D5
  'ט' => 'v',   # U+05D8
  'ן' => 'l',   # U+05DF
  'ס' => 'o',   # U+05E1

  # Ethiopic → Latin
  "\u1200" => 'U',   # U+1200
  "\u12D0" => 'O',   # U+12D0
}.freeze
# Characters treated as invisible by `invisible?`. Membership and order are
# significant only in that this is the exact list consulted.
INVISIBLE_CHARACTERS = [
  # Zero-width spacing and joining controls
  "\u200B", # U+200B zero-width space
  "\u200C", # U+200C zero-width non-joiner
  "\u200D", # U+200D zero-width joiner

  # Implicit directional marks
  "\u200E", # U+200E left-to-right mark
  "\u200F", # U+200F right-to-left mark

  # Other format characters
  "\uFEFF", # U+FEFF zero-width no-break space (BOM)
  "\u2060", # U+2060 word joiner
  "\u00AD", # U+00AD soft hyphen
  "\u180E", # U+180E Mongolian vowel separator

  # Invisible mathematical operators
  "\u2061", # U+2061 function application
  "\u2062", # U+2062 invisible times
  "\u2063", # U+2063 invisible separator
  "\u2064"  # U+2064 invisible plus
].freeze
# Explicit bidirectional formatting characters checked by
# `directional_override?`.
#
# Besides the legacy embedding/override controls, the list includes the
# directional isolates (U+2066..U+2069). They are explicit directional
# formatting characters as well and can reorder displayed text in the same
# way, so leaving them out would let such text pass unflagged.
DIRECTIONAL_OVERRIDES =
[
  "\u202A", # Left-to-right embedding
  "\u202B", # Right-to-left embedding
  "\u202C", # Pop directional formatting
  "\u202D", # Left-to-right override
  "\u202E", # Right-to-left override
  "\u2066", # Left-to-right isolate
  "\u2067", # Right-to-left isolate
  "\u2068", # First strong isolate
  "\u2069" # Pop directional isolate
].freeze
# Frozen list of script-name sets, each stored as a Set of symbols.
# NOTE(review): presumably these are combinations that may appear together
# without being flagged as suspicious mixing — the consumer is not visible
# here, so confirm against the caller.
SAFE_SCRIPT_COMBINATIONS = [
  # Latin mixed with CJK scripts
  %i[Latin Han Hiragana Katakana],
  %i[Latin Han Bopomofo],
  %i[Latin Han Hangul],

  # Japanese scripts without Latin
  %i[Hiragana Katakana Han],

  # Latin mixed with a single auxiliary category
  %i[Latin Inherited],
  %i[Latin Common],
  %i[Latin Punctuation],
  %i[Latin Digit],
  %i[Latin Whitespace]
].map { |script_names| Set.new(script_names) }.freeze

Class Method Summary collapse

Class Method Details

.confusable?(char) ⇒ Boolean

Returns:

  • (Boolean)


162
163
164
# File 'lib/unicode_script_detector/confusables.rb', line 162

# Tells whether +char+ has an entry in MAPPING.
#
# @param char [String] the character to look up (compared as a whole string)
# @return [Boolean] true when +char+ is a key of MAPPING, false otherwise
def self.confusable?(char)
  MAPPING.include?(char)
end

.directional_override?(char) ⇒ Boolean

Returns:

  • (Boolean)


174
175
176
# File 'lib/unicode_script_detector/confusables.rb', line 174

# Tells whether +char+ is one of the entries in DIRECTIONAL_OVERRIDES.
#
# @param char [String] the character to test
# @return [Boolean] true when an entry of DIRECTIONAL_OVERRIDES equals +char+
def self.directional_override?(char)
  DIRECTIONAL_OVERRIDES.any? { |control| control == char }
end

.invisible?(char) ⇒ Boolean

Returns:

  • (Boolean)


170
171
172
# File 'lib/unicode_script_detector/confusables.rb', line 170

# Tells whether +char+ is one of the entries in INVISIBLE_CHARACTERS.
#
# @param char [String] the character to test
# @return [Boolean] true when INVISIBLE_CHARACTERS contains +char+
def self.invisible?(char)
  INVISIBLE_CHARACTERS.member?(char)
end

.looks_like(char) ⇒ Object



166
167
168
# File 'lib/unicode_script_detector/confusables.rb', line 166

# Looks up the character that +char+ visually resembles, according to MAPPING.
#
# @param char [String] the character to look up
# @return [String, nil] the value stored in MAPPING for +char+, or nil when
#   MAPPING has no such key
def self.looks_like(char)
  MAPPING.fetch(char, nil)
end