Class: Kotoshu::Language::LanguageIdentifier

Inherits:
Object
  • Object
show all
Defined in:
lib/kotoshu/language/identifier.rb

Overview

Language identification using FastText LID model.

Identifies the language of text using FastText’s pretrained language identification model (lid.176.ftz).

Examples:

Detect language

lid = LanguageIdentifier.new
result = lid.detect_primary("Hello world")
result.language  # => "en"
result.confidence  # => 0.95

Detect from file

results = lid.detect_from_file("document.txt", top_k: 3)
results.map(&:language)  # => ["en", "de", "fr"]

Defined Under Namespace

Classes: DetectionResult

Constant Summary collapse

MODEL_URL =

FastText LID model URL

'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'
LANGUAGE_MAPPING =

Language code mapping (FastText LID → ISO 639-1)

{
  # FastText emits labels such as "__label__en" for English. The keys below are
  # the bare codes without that prefix.
  # NOTE(review): presumably the "__label__" prefix is stripped before this
  # table is consulted — confirm against the lookup code.
  # Every entry listed here maps a code to itself.
  'en' => 'en',
  'de' => 'de',
  'es' => 'es',
  'fr' => 'fr',
  'pt' => 'pt',
  'ru' => 'ru',
  'it' => 'it',
  'nl' => 'nl',
  'pl' => 'pl',
  'sv' => 'sv',
  'da' => 'da',
  'no' => 'no',
  'fi' => 'fi',
  'cs' => 'cs',
  'el' => 'el',
  'hu' => 'hu',
  'ro' => 'ro',
  'bg' => 'bg',
  'sk' => 'sk',
  'sl' => 'sl',
  'hr' => 'hr',
  'sr' => 'sr',
  'et' => 'et',
  'lv' => 'lv',
  'lt' => 'lt',
  'mt' => 'mt',
  'ga' => 'ga',
  'cy' => 'cy',
  'tr' => 'tr',
  'ar' => 'ar',
  'he' => 'he',
  'fa' => 'fa',
  'ur' => 'ur',
  'hi' => 'hi',
  'bn' => 'bn',
  'ta' => 'ta',
  'te' => 'te',
  'ml' => 'ml',
  'kn' => 'kn',
  'th' => 'th',
  'vi' => 'vi',
  'id' => 'id',
  'ms' => 'ms',
  'sw' => 'sw',
  'zh' => 'zh',
  'ja' => 'ja',
  'ko' => 'ko'
}.freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(model_path: nil, auto_download: true) ⇒ LanguageIdentifier

Create a new language identifier.

Parameters:

  • model_path (String) (defaults to: nil)

    Path to lid.176.ftz model

  • auto_download (Boolean) (defaults to: true)

    Download model if not found



92
93
94
95
96
# File 'lib/kotoshu/language/identifier.rb', line 92

# Record the identifier's configuration; the instance starts out not loaded.
#
# @param model_path [String, nil] path to the lid.176.ftz model; when nil
#   (or false) the value returned by default_model_path is used instead
# @param auto_download [Boolean] stored as given (documented meaning:
#   download the model if it is not found)
def initialize(model_path: nil, auto_download: true)
  resolved_path = model_path || default_model_path

  @model_path    = resolved_path
  @auto_download = auto_download
  @loaded        = false
end

Instance Attribute Details

#loaded ⇒ Object (readonly)

Returns the value of attribute loaded.



86
87
88
# File 'lib/kotoshu/language/identifier.rb', line 86

# Reader for the @loaded flag, which the constructor initializes to false.
# NOTE(review): presumably flipped to true once the model has been loaded —
# the code that does so is not shown here; confirm.
#
# @return [Object] the current value of @loaded
def loaded
  @loaded
end

#model_path ⇒ Object (readonly)

Returns the value of attribute model_path.



86
87
88
# File 'lib/kotoshu/language/identifier.rb', line 86

# Reader for @model_path, the model file location chosen in the constructor
# (the caller-supplied path, or default_model_path when none was given).
#
# @return [Object] the current value of @model_path
def model_path
  @model_path
end

Class Method Details

.supported_languages ⇒ Array<String>

Get supported languages.

Returns:

  • (Array<String>)

    List of supported ISO 639-1 codes



184
185
186
# File 'lib/kotoshu/language/identifier.rb', line 184

# List the languages this identifier can report.
#
# LANGUAGE_MAPPING maps FastText codes (keys) to ISO 639-1 codes (values), so
# the ISO codes are taken from the values. Duplicates are removed in case
# several FastText codes map to the same ISO code.
#
# @return [Array<String>] unique ISO 639-1 codes, in mapping order
def self.supported_languages
  LANGUAGE_MAPPING.values.uniq
end

Instance Method Details

#detect(text, top_k: 1) ⇒ Array<DetectionResult>

Detect language of text.

Parameters:

  • text (String)

    Text to analyze

  • top_k (Integer) (defaults to: 1)

    Number of top results to return

Returns:

  • (Array<DetectionResult>)



103
104
105
106
107
108
109
110
111
112
113
# File 'lib/kotoshu/language/identifier.rb', line 103

# Detect the language of +text+.
#
# Makes sure the model is loaded, preprocesses the text, then hands the
# cleaned text and +top_k+ to run_detection and returns its result unchanged.
#
# @param text [String] text to analyze
# @param top_k [Integer] number of top results to return
# @return [Array<DetectionResult>] whatever run_detection returns
def detect(text, top_k: 1)
  ensure_model_loaded
  run_detection(preprocess_text(text), top_k)
end

#detect_from_file(filepath, top_k: 1) ⇒ Array<DetectionResult>

Detect language from file.

Parameters:

  • filepath (String)

    Path to file

  • top_k (Integer) (defaults to: 1)

    Number of top results

Returns:

  • (Array<DetectionResult>)



120
121
122
123
# File 'lib/kotoshu/language/identifier.rb', line 120

# Detect the language of a file's contents.
#
# Reads the whole file as UTF-8 and passes the text to #detect.
#
# @param filepath [String] path to the file
# @param top_k [Integer] number of top results
# @return [Array<DetectionResult>] the result of #detect for the file's text
def detect_from_file(filepath, top_k: 1)
  detect(File.read(filepath, encoding: 'UTF-8'), top_k: top_k)
end

#detect_primary(text) ⇒ DetectionResult?

Get the most likely language.

Parameters:

  • text (String)

    Text to analyze

Returns:

  • (DetectionResult, nil)



129
130
131
# File 'lib/kotoshu/language/identifier.rb', line 129

# Return only the top-ranked detection for +text+.
#
# @param text [String] text to analyze
# @return [DetectionResult, nil] first element of #detect's result, or nil
#   when that result is empty
def detect_primary(text)
  candidates = detect(text, top_k: 1)
  candidates.first
end

#download_model ⇒ String

Download the FastText LID model.

Returns:

  • (String)

    Path to downloaded model



143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# File 'lib/kotoshu/language/identifier.rb', line 143

# Download the FastText LID model from MODEL_URL to @model_path.
#
# The response body is streamed into a temporary "<model_path>.part" file and
# moved into place only after the transfer has finished, so an interrupted or
# failed download never leaves a truncated file at @model_path.
#
# @return [String] path to the downloaded model
# @raise [RuntimeError] when the server answers with neither a success nor a
#   redirect status
def download_model
  # Lazy-loaded so the networking libraries are only required when a download
  # is actually requested.
  require 'net/http'
  require 'uri'
  require 'fileutils'

  FileUtils.mkdir_p(File.dirname(@model_path))

  puts "Downloading language identification model..."
  puts "  From: #{MODEL_URL}"
  puts "  To: #{@model_path}"

  uri = URI.parse(MODEL_URL)
  http = Net::HTTP.new(uri.host, uri.port)
  # Follow the URL's scheme instead of assuming TLS unconditionally.
  http.use_ssl = uri.scheme == 'https'

  request = Net::HTTP::Get.new(uri.request_uri)
  partial_path = "#{@model_path}.part"

  begin
    http.request(request) do |response|
      case response
      when Net::HTTPSuccess
        File.open(partial_path, 'wb') do |file|
          response.read_body { |chunk| file.write(chunk) }
        end
        # Only a fully received body is promoted to the real model path.
        FileUtils.mv(partial_path, @model_path)
        puts "  ✓ Download complete"
      when Net::HTTPRedirection
        # Follow redirect
        follow_redirect(response['location'])
      else
        raise "Failed to download model: #{response.code} #{response.message}"
      end
    end
  ensure
    # Remove any leftover partial file; a no-op after a successful move.
    FileUtils.rm_f(partial_path)
  end

  @model_path
end

#model_downloaded? ⇒ Boolean

Check if model is downloaded.

Returns:

  • (Boolean)

    True if model file exists



136
137
138
# File 'lib/kotoshu/language/identifier.rb', line 136

# Check whether a usable model file is present at @model_path.
#
# A directory at that path, or an empty file left behind by an aborted
# download, does not count as a downloaded model.
#
# @return [Boolean] true if @model_path is a regular, non-empty file
def model_downloaded?
  File.file?(@model_path) && !File.zero?(@model_path)
end