Class: Kotoshu::Language::Tokenizer::PortugueseTokenizer

Inherits:

Base

Object
Base
Kotoshu::Language::Tokenizer::PortugueseTokenizer

show all

Defined in: lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb

Overview

Tokenizer for Portuguese text.

Ported from LanguageTool’s PortugueseWordTokenizer.

Handles:

Decimal comma between digits (3,14)
Dotted numbers (1.000.000)
Dates (01.01.2024, 2024-01-01)
Colons in time (12:25)
Hyphens with do-not-split list
Spaced decimals (2 000 000)

Direct Known Subclasses

Kotoshu::Languages::Portuguese::Tokenizer

Constant Summary collapse

WORD_SEPARATORS = Portuguese word separators: whitespace and most punctuation. Note: special patterns (numbers, dates, times) are protected with placeholders before splitting.

/[\s"()\[\]{}<>@€£\\$%‰‱ºªᵃᵒˢ|`~#^·]/.freeze

DECIMAL_COMMA_SUBST = Placeholder characters (Unicode Private Use Area code points, which should not occur in ordinary text)

"\uE001"

NON_BREAKING_SPACE_SUBST =

"\uE002"

NON_BREAKING_DOT_SUBST =

"\uE003"

NON_BREAKING_COLON_SUBST =

"\uE004"

DECIMAL_COMMA_PATTERN = Decimal comma between digits: 3,14

/(\d),(\d)/

DOTTED_NUMBERS_PATTERN = Dotted numbers: 1.000.000

/(\d)\.(\d)/

COLON_NUMBERS_PATTERN = Colon in numbers (time): 12:25

/(\d):(\d)/

DATE_PATTERN = Date patterns: 01.01.2024, 2024-01-01

/(\d{2})\.(\d{2})\.(\d{4})|(\d{4})\.(\d{2})\.(\d{2})|(\d{4})-(\d{2})-(\d{2})/

SPACED_DECIMAL_PATTERN = Spaced decimals: 2 000 000

/(?<=^|[\s(])\d{1,3}( \d{3})+(?:[,#{DECIMAL_COMMA_SUBST}#{NON_BREAKING_DOT_SUBST}]\d+)?(?=\D|$)/

DO_NOT_SPLIT = Do-not-split list (from LanguageTool)

%w[
  mers-cov mcgraw-hill sars-cov-2 sars-cov
  ph-metre ph-metres anti-ivg anti-uv anti-vih al-qaïda
].freeze

Instance Method Summary collapse

#tokenize(text) ⇒ Object

Methods inherited from Base

#normalize, #skip_token?, #tokenize_with_positions, #word_boundary_regex, #word_char?

Instance Method Details

#tokenize(text) ⇒ `Object`

# File 'lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb', line 49

# Splits Portuguese text into word tokens.
#
# Punctuation that must stay inside a token (decimal commas, dots in numbers
# and dates, colons in times) is swapped for placeholder characters before
# the text is split on WORD_SEPARATORS, and swapped back per token through
# restore_placeholders. Each restored token is then passed to words_to_add
# (hyphen handling), normalized, and filtered with skip_token?.
#
# @param text [String, nil] the text to tokenize
# @return [Array] the normalized tokens that skip_token? did not reject;
#   empty when +text+ is nil or contains only whitespace
def tokenize(text)
  return [] if text.nil? || text.strip.empty?

  # Protect decimal commas: 3,14
  text = text.gsub(DECIMAL_COMMA_PATTERN, "\\1#{DECIMAL_COMMA_SUBST}\\2") if text.include?(",")

  # Protect dots in dates and numbers
  if text.include?(".")
    # Dates go first so a whole date is protected as one unit.
    # String#gsub yields the matched text (a String, not a MatchData) to its
    # block, so indexing it returns single characters rather than capture
    # groups. Translating the dots of the matched date avoids capture groups
    # altogether; hyphenated dates (2024-01-01) contain no dots and are
    # therefore returned unchanged.
    text = text.gsub(DATE_PATTERN) { |date| date.tr(".", NON_BREAKING_DOT_SUBST) }
    text = text.gsub(DOTTED_NUMBERS_PATTERN, "\\1#{NON_BREAKING_DOT_SUBST}\\2")
  end

  # Protect spaced decimals: 2 000 000
  text = handle_spaced_decimals(text)

  # Protect colons in time: 12:25
  text = text.gsub(COLON_NUMBERS_PATTERN, "\\1#{NON_BREAKING_COLON_SUBST}\\2") if text.include?(":")

  # Split, drop empty fragments, restore the protected characters and let
  # words_to_add decide how hyphenated words are broken up.
  tokens = text.split(WORD_SEPARATORS).reject(&:empty?).flat_map do |raw_token|
    words_to_add(restore_placeholders(raw_token))
  end

  # Filter and normalize
  tokens
    .map { |token| normalize(token) }
    .reject { |token| skip_token?(token) }
end