Class: Kotoshu::Language::Detector

Inherits:
Object
  • Object
show all
Defined in:
lib/kotoshu/language/detector.rb

Overview

Language detection based on character sets and patterns.

Uses Unicode character ranges and per-language patterns to identify the most probable language, and provides confidence scores when several languages match.

Examples:

Detect language

Language::Detector.detect("Hello world")  # => "en"

Constant Summary collapse

CHARACTER_SETS =

Character set ranges for language detection

{
  # Script name => regexp matching a single character of that script.

  # Any code point carrying the Unicode Cyrillic script property.
  cyrillic: /\p{Cyrillic}/,
  # U+3040..U+309F (Hiragana block).
  hiragana: /[\u3040-\u309F]/,
  # U+30A0..U+30FF (Katakana block).
  katakana: /[\u30A0-\u30FF]/,
  # U+4E00..U+9FFF (CJK Unified Ideographs block).
  cjk: /[\u4E00-\u9FFF]/,
  # U+AC00..U+D7AF (Hangul Syllables block).
  hangul: /[\uAC00-\uD7AF]/,
  # ASCII letters plus U+00E0..U+00FF.
  # NOTE(review): the à-ÿ range also matches the division sign ÷ (U+00F7)
  # and does not cover uppercase accented letters (U+00C0..U+00DE) —
  # confirm whether that is intended.
  latin: /[a-zA-Zà-ÿ]/
}.freeze
LANGUAGE_PATTERNS =

Language-specific patterns

{
  # Language name => detection settings. Keys used below:
  #   pattern:          regexp for characters typical of the language
  #   min_ratio:        minimum ratio threshold (presumably matches relative
  #                     to text size — confirm against analyze_languages)
  #   scripts:          CHARACTER_SETS keys the language is written in
  #   must_have:        optional; scripts that presumably must be present
  #                     for the language to qualify — TODO confirm
  #   priority:         optional; tie-breaker used by .detect when two
  #                     languages have equal scores (missing means 0)
  #   max_accent_ratio: optional; presumably an upper bound on the share of
  #                     accented characters — TODO confirm

  # Russian: Cyrillic
  # NOTE(review): this pattern matches a two-character sequence (any
  # Cyrillic character followed by а-я/А-Я/ё/Ё), so an isolated single
  # Cyrillic letter never matches — confirm this is intended.
  russian: {
    pattern: /\p{Cyrillic}[а-яА-ЯёЁ]/,
    min_ratio: 0.3,
    scripts: [:cyrillic]
  },

  # Japanese: Mixed script (Hiragana + Katakana + Kanji)
  japanese: {
    pattern: /[\u3040-\u309F]|[\u30A0-\u30FF]|[\u4E00-\u9FFF]/,
    min_ratio: 0.2,
    scripts: [:hiragana, :katakana, :cjk],
    must_have: [:hiragana]  # Hiragana alone is required; katakana and kanji are optional
  },

  # Portuguese: Latin with specific accents
  portuguese: {
    # Each letter is listed once; repeating a character inside a class
    # triggers Ruby's "character class has duplicated range" warning.
    pattern: /[ãõáàâéêíóôúç]/i,
    min_ratio: 0.05,
    scripts: [:latin]
  },

  # French: Latin with specific accents (NOT German umlauts)
  french: {
    pattern: /[éèêëàâùûüîïôç]/i,  # ä and ö are deliberately excluded (not French)
    min_ratio: 0.02,  # Low threshold
    scripts: [:latin],
    priority: 1  # Wins score ties against languages without a priority (e.g. English)
  },

  # Spanish: Latin with accents, ñ and inverted punctuation
  spanish: {
    pattern: /[áéíóúüñ¿¡]/i,
    min_ratio: 0.02,  # Low threshold
    scripts: [:latin],
    priority: 1
  },

  # German: Latin with umlauts and eszett
  german: {
    pattern: /[äöüßÄÖÜ]/,  # Uppercase umlauts listed explicitly (no /i flag)
    min_ratio: 0.02,  # Low threshold
    scripts: [:latin],
    priority: 1
  },

  # English: Latin with minimal accents
  english: {
    pattern: /[a-zA-Z]/,
    min_ratio: 0.3,
    scripts: [:latin],
    max_accent_ratio: 0.02
  }
}.freeze
CODE_MAPPING =

Language code mapping

{
  # Language name (same keys as LANGUAGE_PATTERNS) => two-letter language
  # code returned to callers.
  russian: "ru",
  japanese: "ja",
  portuguese: "pt",
  french: "fr",
  spanish: "es",
  german: "de",
  english: "en"
}.freeze

Class Method Summary collapse

Class Method Details

.detect(text) ⇒ String?

Detect language from text.

Returns the most probable language code based on character analysis.

Parameters:

  • text (String)

    Text to analyze

Returns:

  • (String, nil)

    Detected language code, or nil if the text is blank or no language produces a score



98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/kotoshu/language/detector.rb', line 98

# Pick the single most probable language for +text+.
#
# Languages are ranked by score; when two scores are equal, the language
# whose LANGUAGE_PATTERNS entry has the higher :priority wins (a missing
# priority counts as 0).
#
# @param text [String, nil] text to analyze
# @return [String, nil] language code, or nil when the text is nil/blank
#   or analyze_languages returns no scores
def detect(text)
  return if text.nil? || text.strip.empty?

  candidates = analyze_languages(text)
  return if candidates.empty?

  # Map a language code back to its configured priority (0 when the code
  # has no LANGUAGE_PATTERNS entry or the entry sets no priority).
  priority_of = lambda do |code|
    _lang, settings = LANGUAGE_PATTERNS.find { |lang, _| CODE_MAPPING[lang] == code }
    settings&.dig(:priority) || 0
  end

  # Arrays compare element-wise, so priority only matters on equal scores.
  winner = candidates.max_by { |code, score| [score, priority_of.call(code)] }
  winner&.first
end

.detect_candidates(text, limit: 3) ⇒ Array<Array<String, Float>>

Get multiple language candidates.

Parameters:

  • text (String)

    Text to analyze

  • limit (Integer) (defaults to: 3)

    Maximum candidates to return

Returns:

  • (Array<Array<String, Float>>)

    Array of [code, confidence] pairs, highest score first; empty if the text is blank or no language produces a score



135
136
137
138
139
140
141
142
143
144
145
146
# File 'lib/kotoshu/language/detector.rb', line 135

# List the best-scoring languages for +text+ with a relative confidence.
#
# Each confidence is the language's score divided by the sum of all
# scores returned by analyze_languages, so the values are shares of the
# total rather than absolute probabilities.
#
# @param text [String, nil] text to analyze
# @param limit [Integer] maximum number of candidates to return
# @return [Array<Array(String, Float)>] [code, confidence] pairs ordered
#   by descending score; empty when the text is nil/blank or nothing scored
def detect_candidates(text, limit: 3)
  return [] if text.nil? || text.strip.empty?

  scores = analyze_languages(text)
  return [] if scores.empty?

  total_score = scores.values.sum.to_f
  scores
    .sort_by { |_, score| -score }
    .first(limit)
    .map do |code, score|
      # When every score is zero the division would produce NaN; report a
      # confidence of 0.0 instead so callers always get a usable Float.
      [code, total_score.zero? ? 0.0 : score / total_score]
    end
end

.detect_with_confidence(text) ⇒ Array<String, Float>

Detect with confidence score.

Parameters:

  • text (String)

    Text to analyze

Returns:

  • (Array<String, Float>)

    Language code and confidence (0-1), or [nil, 0.0] if the text is blank or no language produces a score



118
119
120
121
122
123
124
125
126
127
128
# File 'lib/kotoshu/language/detector.rb', line 118

# Detect the top-scoring language together with a confidence value.
#
# The confidence is whatever normalize_confidence returns for the top
# score and the full list of scores.
#
# NOTE(review): the winner is chosen by score alone; unlike .detect, the
# :priority setting is not consulted on ties — confirm that is intended.
#
# @param text [String, nil] text to analyze
# @return [Array(String, Float)] language code and confidence, or
#   [nil, 0.0] when the text is nil/blank or nothing scored
def detect_with_confidence(text)
  blank = text.nil? || text.strip.empty?
  # Skip the analysis entirely for blank input; an empty result falls
  # through to the same "no match" answer below.
  language_scores = blank ? {} : analyze_languages(text)
  return [nil, 0.0] if language_scores.empty?

  best_code, best_score = language_scores.max_by(&:last)
  [best_code, normalize_confidence(best_score, language_scores.values)]
end