Class: Kotoshu::Language::Tokenizer::RussianTokenizer

Inherits:

Base

Object
Base
Kotoshu::Language::Tokenizer::RussianTokenizer

show all

Defined in: lib/kotoshu/language/tokenizer/russian_tokenizer.rb

Overview

Tokenizer for Russian text.

Ported from LanguageTool’s RussianWordTokenizer.

Handles:

Apostrophe as word character
Dot as word character (for abbreviations)
Special abbreviations: б/у (second-hand), б/н (without number)
Spaced dots: " .. " and " . "

Direct Known Subclasses

Kotoshu::Languages::Russian::Tokenizer

Constant Summary collapse

WORD_SEPARATORS = Russian-specific word separators (exclude apostrophe and dot)

/[\s"()\[\]{}<>,;:!?\\\/|`~@#$%^&*+\-·]/.freeze

ABBREVIATION_PLACEHOLDERS = Special abbreviations that should not be split. Uses non-printing characters as placeholders.

{
  "б/у" => "\u0001\u0001SOCR_BU\u0001\u0001",
  "б/н" => "\u0001\u0001SOCR_BN\u0001\u0001"
}.freeze

PLACEHOLDER_RESTORE = Reverse placeholders for restoration

{
  "\u0001\u0001SOCR_BU\u0001\u0001" => "б/у",
  "\u0001\u0001SOCR_BN\u0001\u0001" => "б/н",
  "\u0001\u0001SP_DDOT_SP\u0001\u0001" => " .. ",
  "\u0001\u0001SP_DOT_SP\u0001\u0001" => " . ",
  "\u0001\u0001SP_DOT\u0001\u0001" => "."
}.freeze

Instance Method Summary collapse

#tokenize(text) ⇒ Object

Methods inherited from Base

#normalize, #skip_token?, #tokenize_with_positions, #word_boundary_regex, #word_char?

Instance Method Details

#tokenize(text) ⇒ `Object`

# File 'lib/kotoshu/language/tokenizer/russian_tokenizer.rb', line 35

# Splits text into tokens.
#
# Abbreviations are swapped for placeholders before splitting so that
# separator characters inside them (e.g. the slash in "б/у") do not break
# them apart; the placeholders are swapped back on each piece afterwards.
#
# @param text [String, nil] the text to tokenize
# @return [Array] the normalized tokens that were not skipped; an empty
#   array when +text+ is nil or contains only whitespace
def tokenize(text)
  blank = text.nil? || text.strip.empty?
  return [] if blank

  # Shield abbreviations from the separator regex.
  shielded = replace_abbreviations(text)

  # Cut the shielded text at every separator character.
  pieces = shielded.split(WORD_SEPARATORS)

  # Undo the shielding, normalize, then drop tokens flagged by skip_token?.
  restored = pieces.map { |piece| restore_abbreviations(piece) }
  normalized = restored.map { |piece| normalize(piece) }
  normalized.reject { |piece| skip_token?(piece) }
end