Class: WhatLanguage

Inherits:
Object
  • Object
show all
Defined in:
lib/whatlanguage/languages.rb,
lib/whatlanguage.rb,
lib/whatlanguage/version.rb

Overview

AUTO-GENERATED from the ISO 639-3 registry + whatlang dataset codes. Maps ISO 639-3 code => [language name symbol, ISO 639-1 (or 639-3 fallback) symbol]. Original gem languages keep their historical symbols and 2-letter codes.

Defined Under Namespace

Classes: Result

Constant Summary collapse

VERSION =
'2.0.0'

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(*selection, only: nil, min_chars: DEFAULT_MIN_CHARS) ⇒ WhatLanguage

Returns a new instance of WhatLanguage.



123
124
125
126
127
# File 'lib/whatlanguage.rb', line 123

# Builds a detector restricted to a selection of languages.
#
# The selection comes from +only+ when it is truthy, otherwise from the
# positional arguments, otherwise it defaults to [:all]. It is coerced with
# Array() and checked by validate_selection! before +min_chars+ is stored.
#
# @param selection positional language selectors (may be empty)
# @param only keyword alternative to the positional selection
# @param min_chars stored in @min_chars; score_hash compares the text's
#   significant character count against it
def initialize(*selection, only: nil, min_chars: DEFAULT_MIN_CHARS)
  requested = only
  requested ||= selection.empty? ? [:all] : selection
  @selection = Array(requested)
  validate_selection!
  @min_chars = min_chars
end

Class Method Details

.detect(text) ⇒ Object



83
84
85
# File 'lib/whatlanguage.rb', line 83

# Class-level convenience wrapper: forwards +text+ to
# +default_detector.detect+ and returns whatever that call returns.
# NOTE(review): default_detector is defined outside this excerpt; presumably a
# shared WhatLanguage instance — confirm.
#
# @param text the text to analyse (passed through unchanged)
def detect(text)
  default_detector.detect(text)
end

.language(text) ⇒ Object



87
88
89
# File 'lib/whatlanguage.rb', line 87

# Class-level convenience wrapper: forwards +text+ to
# +default_detector.language+ and returns whatever that call returns.
# NOTE(review): default_detector is defined outside this excerpt; presumably a
# shared WhatLanguage instance — confirm.
#
# @param text the text to analyse (passed through unchanged)
def language(text)
  default_detector.language(text)
end

.language_iso(text) ⇒ Object



91
92
93
# File 'lib/whatlanguage.rb', line 91

# Class-level convenience wrapper: forwards +text+ to
# +default_detector.language_iso+ and returns whatever that call returns.
# NOTE(review): default_detector is defined outside this excerpt; presumably a
# shared WhatLanguage instance — confirm.
#
# @param text the text to analyse (passed through unchanged)
def language_iso(text)
  default_detector.language_iso(text)
end

.languages ⇒ Object



106
107
108
# File 'lib/whatlanguage.rb', line 106

# Returns the keys of NAME_TO_CODE. The instance-level #languages uses this
# list as the full supported set when the selection includes :all.
# NOTE(review): NAME_TO_CODE is defined outside this excerpt; its keys are
# presumably language-name symbols — confirm.
def languages
  NAME_TO_CODE.keys
end

.process_text ⇒ Object



104
105
106
# File 'lib/whatlanguage.rb', line 104

# Forwards +text+ to +default_detector.score_hash+ and returns its result.
# NOTE(review): the generated docs list this source under .process_text, so
# process_text is presumably an alias of score_hash — confirm in
# lib/whatlanguage.rb.
#
# @param text the text to score (passed through unchanged)
def score_hash(text)
  default_detector.score_hash(text)
end

.profiles ⇒ Object

script name => [[code, [trigram, …]], …], loaded once and memoized.



111
112
113
114
# File 'lib/whatlanguage.rb', line 111

# Trigram profiles grouped by script, loaded from the bundled
# whatlanguage/trigrams.json (next to this file) on first use and memoized in
# @profiles.
#
# The file is read explicitly as UTF-8, the encoding JSON documents use,
# instead of Encoding.default_external. That keeps loading independent of the
# process locale (for example LANG=C, where the default external encoding is
# US-ASCII and the non-ASCII trigrams would be mis-tagged).
#
# @return [Hash] script name => [[code, [trigram, ...]], ...]
def profiles
  @profiles ||= begin
    path = File.join(__dir__, 'whatlanguage', 'trigrams.json')
    raw = JSON.parse(File.read(path, encoding: Encoding::UTF_8))
    # Each language's trigrams are stored as one '|'-delimited string.
    raw.transform_values { |langs| langs.map { |code, str| [code, str.split('|')] } }
  end
end

.ranked(text) ⇒ Object



95
96
97
# File 'lib/whatlanguage.rb', line 95

# Class-level convenience wrapper: forwards +text+ to
# +default_detector.ranked+ and returns whatever that call returns.
# NOTE(review): default_detector is defined outside this excerpt; presumably a
# shared WhatLanguage instance — confirm.
#
# @param text the text to analyse (passed through unchanged)
def ranked(text)
  default_detector.ranked(text)
end

.score_hash(text) ⇒ Object



99
100
101
# File 'lib/whatlanguage.rb', line 99

# Class-level convenience wrapper: forwards +text+ to
# +default_detector.score_hash+ and returns whatever that call returns.
# NOTE(review): default_detector is defined outside this excerpt; presumably a
# shared WhatLanguage instance — confirm.
#
# @param text the text to score (passed through unchanged)
def score_hash(text)
  default_detector.score_hash(text)
end

.scores ⇒ Object



103
104
105
# File 'lib/whatlanguage.rb', line 103

# Forwards +text+ to +default_detector.score_hash+ and returns its result.
# NOTE(review): the generated docs list this source under .scores, so scores
# is presumably an alias of score_hash — confirm in lib/whatlanguage.rb.
#
# @param text the text to score (passed through unchanged)
def score_hash(text)
  default_detector.score_hash(text)
end

Instance Method Details

#detect(text) ⇒ Object

Detection result with the winning language, ISO code, winning score, and full ranked scores. Returns nil when the text is too short or unrecognized.



181
182
183
184
185
186
187
# File 'lib/whatlanguage.rb', line 181

# Runs detection on +text+ and packages the outcome.
#
# Takes the ordered pairs from #ranked; when there are none, returns nil.
# Otherwise the first pair is the winner and a Result is built from its name,
# the ISO_CODES entry for that name, its score, and the full ranked list.
#
# @param text the text to analyse
# @return [Result, nil]
def detect(text)
  standings = ranked(text)
  return if standings.empty?

  winner, winning_score = standings.first
  Result.new(
    language: winner,
    iso: ISO_CODES[winner],
    score: winning_score,
    ranked: standings
  )
end

#language(text) ⇒ Object

Most likely language as a name symbol, or nil when no language is detected.



190
191
192
# File 'lib/whatlanguage.rb', line 190

# Name of the winning language for +text+.
#
# @param text the text to analyse
# @return the +language+ of the #detect result, or nil when #detect returns nil
def language(text)
  verdict = detect(text)
  verdict&.language
end

#language_iso(text) ⇒ Object

Most likely language as an ISO 639-1 symbol (639-3 fallback), or nil.



195
196
197
# File 'lib/whatlanguage.rb', line 195

# ISO code of the winning language for +text+.
#
# @param text the text to analyse
# @return the +iso+ of the #detect result, or nil when #detect returns nil
def language_iso(text)
  verdict = detect(text)
  verdict&.iso
end

#languages ⇒ Object

Language-name symbols this instance scores against: every supported language for :all, otherwise the requested selection intersected with the supported set (legacy aliases such as :pinyin resolved to their modern names).



132
133
134
135
136
137
138
139
140
# File 'lib/whatlanguage.rb', line 132

# Language names this instance scores against, computed once and memoized in
# @languages.
#
# With :all in the selection this is the class-level supported list as is.
# Otherwise each selected name is mapped through NAME_ALIASES (names without
# an alias are kept) and intersected with the supported list, so the result
# follows the supported list's order and drops unknown names.
def languages
  @languages ||= begin
    supported = self.class.languages
    if @selection.include?(:all)
      supported
    else
      supported & @selection.map { |requested| NAME_ALIASES.fetch(requested, requested) }
    end
  end
end

#ranked(text) ⇒ Object

Per-language scores as an array sorted from most likely to least likely.



175
176
177
# File 'lib/whatlanguage.rb', line 175

# Scores from #score_hash as an array of [name, score] pairs, ordered by
# descending score.
#
# @param text the text to analyse
# @return [Array<Array>] [[name, score], ...], highest score first
def ranked(text)
  tallies = score_hash(text)
  tallies.sort_by { |entry| -entry.last }
end

#score_hash(text) ⇒ Object Also known as: scores, process_text

Per-language scores for the text (higher = more likely). Languages outside the current selection, or not under the detected script, are absent; the hash defaults to 0. Only the relative ranking is meaningful.



145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/whatlanguage.rb', line 145

# Scores +text+ against the candidate languages of its detected script.
#
# Returns a Hash (default value 0) of language name => score. The hash is
# empty when no script is detected, when the script has no profiles, or when
# the text has fewer significant characters than @min_chars. A script listed
# in DETERMINISTIC maps to a single code, which receives MAX_TOTAL_DISTANCE
# without any trigram comparison (and without the @min_chars check). Otherwise
# every allowed candidate is scored as MAX_TOTAL_DISTANCE minus its trigram
# distance.
#
# @param text the text to score
# @return [Hash] language name => score
def score_hash(text)
  scores = Hash.new(0)
  normalized = normalize_text(text)
  script = detect_script(normalized)
  return scores unless script

  sole_code = DETERMINISTIC[script]
  if sole_code
    sole_name = CODE_INFO[sole_code].first
    scores[sole_name] = MAX_TOTAL_DISTANCE if allowed?(sole_name)
    return scores
  end

  # Profiles are looked up before the length check, as in the original order.
  script_profiles = self.class.profiles[script]
  return scores unless script_profiles
  return scores if significant_char_count(normalized) < @min_chars

  observed = trigram_positions(normalized)
  script_profiles.each_with_object(scores) do |(code, trigrams), tally|
    name = CODE_INFO[code].first
    tally[name] = MAX_TOTAL_DISTANCE - distance(trigrams, observed) if allowed?(name)
  end
end