Class: Kotoshu::Analyzers::SemanticAnalyzer

Inherits: Object

Inheritance chain: Object → Kotoshu::Analyzers::SemanticAnalyzer

Defined in: lib/kotoshu/analyzers/semantic_analyzer.rb

Overview

Unified semantic error analyzer.

Uses word embeddings for context-aware error detection and suggestions. Provides unified semantic analysis without an artificial spelling/grammar split.

Examples:

Analyzing a document

model = FastTextModel.from_github('en')
analyzer = SemanticAnalyzer.new(model)
errors = analyzer.analyze(document)

Checking a single word

# `context` is optional; when given it should be a Models::Context for the surrounding text
suggestions = analyzer.suggest_corrections('helo', context: context)

Constant Summary collapse

HIGH_CONFIDENCE_THRESHOLD = 0.85

Similarity threshold for high-confidence suggestions

MEDIUM_CONFIDENCE_THRESHOLD = 0.70

Similarity threshold for medium-confidence suggestions

MIN_SIMILARITY = 0.50

Minimum similarity for suggestions

DEFAULT_MAX_SUGGESTIONS

Default number of suggestions to generate (value not shown on this page)

Instance Attribute Summary collapse

#max_suggestions ⇒ Object readonly

Returns the value of attribute max_suggestions.
#model ⇒ Object readonly

Returns the value of attribute model.

Instance Method Summary collapse

#analyze(document) ⇒ Array<Models::SemanticError>

Analyze a document for semantic errors.
#calculate_confidence(suggestions) ⇒ Float

Calculate confidence score for suggestions.
#detect_error(word:, location:, context: nil) ⇒ Models::SemanticError?

Detect semantic error for a single word.
#initialize(model, max_suggestions: DEFAULT_MAX_SUGGESTIONS, min_similarity: MIN_SIMILARITY) ⇒ SemanticAnalyzer constructor

Create a new semantic analyzer.
#suggest_corrections(word, context: nil) ⇒ Array<Models::Suggestion>

Suggest corrections for a word.
#valid_word?(word) ⇒ Boolean

Check if a word is valid (exists in vocabulary).

Constructor Details

#initialize(model, max_suggestions: DEFAULT_MAX_SUGGESTIONS, min_similarity: MIN_SIMILARITY) ⇒ `SemanticAnalyzer`

Create a new semantic analyzer.

Parameters:

model (EmbeddingModel) —

The embedding model to use
max_suggestions (Integer) (defaults to: DEFAULT_MAX_SUGGESTIONS) —

Maximum suggestions per error
min_similarity (Float) (defaults to: MIN_SIMILARITY) —

Minimum similarity threshold

Raises:

(ArgumentError) — if model is not a Models::EmbeddingModel

# File 'lib/kotoshu/analyzers/semantic_analyzer.rb', line 42

# Build an analyzer around an embedding model.
#
# @param model [Models::EmbeddingModel] embedding model used for lookups
# @param max_suggestions [Integer] upper bound on suggestions per error
# @param min_similarity [Float] similarity floor applied to neighbors
# @raise [ArgumentError] when model is not a Models::EmbeddingModel
def initialize(model, max_suggestions: DEFAULT_MAX_SUGGESTIONS, min_similarity: MIN_SIMILARITY)
  # Reject anything that is not an embedding model before storing state.
  unless model.is_a?(Models::EmbeddingModel)
    raise ArgumentError, "Model must be an EmbeddingModel"
  end

  @max_suggestions = max_suggestions
  @model = model
  @min_similarity = min_similarity
end

Instance Attribute Details

#max_suggestions ⇒ `Object` (readonly)

Returns the value of attribute max_suggestions.



# File 'lib/kotoshu/analyzers/semantic_analyzer.rb', lines 35-37

# Read-only accessor for the suggestion limit.
#
# @return [Object] the current value of @max_suggestions
def max_suggestions
  @max_suggestions
end

#model ⇒ `Object` (readonly)

Returns the value of attribute model.



# File 'lib/kotoshu/analyzers/semantic_analyzer.rb', lines 35-37

# Read-only accessor for the embedding model.
#
# @return [Object] the current value of @model
def model
  @model
end

Instance Method Details

#analyze(document) ⇒ `Array<Models::SemanticError>`

Analyze a document for semantic errors.

Parameters:

document (Document) —

The document to analyze

Returns:

(Array<Models::SemanticError>) —

List of errors found

# File 'lib/kotoshu/analyzers/semantic_analyzer.rb', line 54

# Scan every text node of a document and collect an error for each token
# that fails the vocabulary check.
#
# @param document [Document] must respond to #text_nodes and #context_for
# @return [Array<Models::SemanticError>] collected errors, ordered by Array#sort
def analyze(document)
  collected = document.text_nodes.each_with_object([]) do |node, found|
    tokenize_words(node.text).each do |token|
      # Tokens that pass the vocabulary check never reach the detector.
      next if valid_word?(token)

      issue = detect_error(
        word: token,
        location: node.location,
        context: document.context_for(node.location)
      )

      # detect_error may decline to report; keep only real results.
      found << issue if issue
    end
  end

  # Ordering is delegated to the error objects' own comparison.
  collected.sort
end

#calculate_confidence(suggestions) ⇒ `Float`

Calculate confidence score for suggestions.

Parameters:

suggestions (Array<Models::Suggestion>) —

List of suggestions

Returns:

(Float) —

Confidence score (0.0 to 1.0)

# File 'lib/kotoshu/analyzers/semantic_analyzer.rb', line 164

# Map the quality of the first suggestion onto a coarse confidence bucket.
#
# Only the first element is inspected; the comparisons are strict, so a
# score exactly equal to a threshold falls into the lower bucket.
#
# @param suggestions [Array<Models::Suggestion>, nil] candidate corrections
# @return [Float] 0.0 when there are no suggestions, otherwise 1.0, 0.7 or 0.5
def calculate_confidence(suggestions)
  return 0.0 if suggestions.nil? || suggestions.none?

  best_score = suggestions.first.confidence

  if best_score > HIGH_CONFIDENCE_THRESHOLD
    # Strictly above the high threshold.
    1.0
  elsif best_score > MEDIUM_CONFIDENCE_THRESHOLD
    # Strictly above the medium threshold, at most the high one.
    0.7
  else
    # At or below the medium threshold.
    0.5
  end
end

#detect_error(word:, location:, context: nil) ⇒ `Models::SemanticError?`

Detect semantic error for a single word.

Parameters:

word (String) —

The word to check
location (Location) —

Error location
context (Models::Context, nil) (defaults to: nil) —

Context around the word

Returns:

(Models::SemanticError, nil) —

Error object or nil if valid

# File 'lib/kotoshu/analyzers/semantic_analyzer.rb', line 86

# Build a semantic error for one word, or nothing when the word is valid.
#
# @param word [String] the word under inspection
# @param location [Location] where the word was found
# @param context [Models::Context, nil] surrounding context, passed through
#   to suggestion lookup, classification and the resulting error
# @return [Models::SemanticError, nil] nil when valid_word?(word) is truthy
def detect_error(word:, location:, context: nil)
  return if valid_word?(word)

  candidates = suggest_corrections(word, context: context)
  kind = classify_error(word, candidates, context)
  score = calculate_confidence(candidates)

  # Gather the attributes first, then hand them over as keywords.
  attributes = {
    id: generate_error_id(word, location),
    location: location,
    original: word,
    suggestions: candidates,
    error_type: kind,
    confidence: score,
    context: context
  }

  Models::SemanticError.new(**attributes)
end

#suggest_corrections(word, context: nil) ⇒ `Array<Models::Suggestion>`

Suggest corrections for a word.

Parameters:

word (String) —

The misspelled word
context (Models::Context, nil) (defaults to: nil) —

Context for context-aware suggestions

Returns:

(Array<Models::Suggestion>) —

Suggested corrections

# File 'lib/kotoshu/analyzers/semantic_analyzer.rb', line 115

# Look up replacement candidates for a word in the embedding model.
#
# Requests three times @max_suggestions neighbors, drops those below
# @min_similarity, optionally re-ranks them by context, and wraps the
# first @max_suggestions entries as suggestions.
#
# @param word [String, nil] the word to find corrections for
# @param context [Models::Context, nil] used for re-ranking only when it
#   responds to #surrounding_words
# @return [Array<Models::Suggestion>] empty for nil or empty input
def suggest_corrections(word, context: nil)
  return [] if word.nil? || word.empty?

  # Over-fetch so that filtering still leaves enough candidates.
  candidates = @model
    .nearest_neighbors(word, k: @max_suggestions * 3)
    .select { |candidate| candidate.similarity >= @min_similarity }

  candidates = rank_by_context(candidates, context) if context && context.respond_to?(:surrounding_words)

  candidates.first(@max_suggestions).map do |candidate|
    Models::Suggestion.new(
      word: candidate.word,
      confidence: candidate.similarity,
      source: :semantic,
      metadata: { distance: candidate.distance, similarity: candidate.similarity }
    )
  end
end

#valid_word?(word) ⇒ `Boolean`

Check if a word is valid (exists in vocabulary).

Parameters:

word (String) —

The word to check

Returns:

(Boolean) —

True if word is valid

# File 'lib/kotoshu/analyzers/semantic_analyzer.rb', line 147

# Decide whether a word should be treated as valid (i.e. not flagged).
#
# Digit-only words and single-character words are accepted without
# consulting the model; every other word is looked up in the model
# vocabulary.
#
# @param word [String, nil] the word to check
# @return [Boolean] false for nil or empty input, true for digit-only or
#   one-character words, otherwise the result of @model.has_word?(word)
def valid_word?(word)
  return false if word.nil? || word.empty?

  # Skip numbers. \A and \z anchor the whole string; ^ and $ anchor each
  # line, so a multi-line value with one all-digit line would wrongly pass.
  return true if word.match?(/\A\d+\z/)

  # Skip single characters (likely abbreviations).
  return true if word.length == 1

  # Everything else must exist in the model vocabulary.
  @model.has_word?(word)
end

Class: Kotoshu::Analyzers::SemanticAnalyzer

Overview

Examples:

Analyzing a document

Checking a single word

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(model, max_suggestions: DEFAULT_MAX_SUGGESTIONS, min_similarity: MIN_SIMILARITY) ⇒ SemanticAnalyzer

Instance Attribute Details

#max_suggestions ⇒ Object (readonly)

#model ⇒ Object (readonly)

Instance Method Details

#analyze(document) ⇒ Array<Models::SemanticError>

#calculate_confidence(suggestions) ⇒ Float

#detect_error(word:, location:, context: nil) ⇒ Models::SemanticError?

#suggest_corrections(word, context: nil) ⇒ Array<Models::Suggestion>

#valid_word?(word) ⇒ Boolean

#initialize(model, max_suggestions: DEFAULT_MAX_SUGGESTIONS, min_similarity: MIN_SIMILARITY) ⇒ `SemanticAnalyzer`

#max_suggestions ⇒ `Object` (readonly)

#model ⇒ `Object` (readonly)

#analyze(document) ⇒ `Array<Models::SemanticError>`

#calculate_confidence(suggestions) ⇒ `Float`

#detect_error(word:, location:, context: nil) ⇒ `Models::SemanticError?`

#suggest_corrections(word, context: nil) ⇒ `Array<Models::Suggestion>`

#valid_word?(word) ⇒ `Boolean`