Module: Plain::Merge

Defined in: lib/plain/merge.rb,
lib/plain/merge/version.rb

Defined Under Namespace

Modules: Version

Constant Summary collapse

PACKAGE_NAME =

"plain-merge"

DEFAULT_TEXT_REFINEMENT_THRESHOLD =

0.7

DEFAULT_TEXT_REFINEMENT_WEIGHTS =

{
  content: 0.7,
  length: 0.15,
  position: 0.15
}.freeze

VERSION =

Version::VERSION

Class Method Summary collapse

.analyze_text(source) ⇒ Object
.is_similar(left_source, right_source, threshold) ⇒ Object
.match_text_blocks(template_source, destination_source) ⇒ Object
.merge_text(template_source, destination_source) ⇒ Object
.normalize_text(source) ⇒ Object
.refined_text_similarity(template_block, destination_block, template_total, destination_total, weights = DEFAULT_TEXT_REFINEMENT_WEIGHTS) ⇒ Object
.similarity_score(left_source, right_source) ⇒ Object
.text_feature_profile ⇒ Object
.text_parse_request(source) ⇒ Object

Class Method Details

.analyze_text(source) ⇒ `Object`

# File 'lib/plain/merge.rb', line 39

# Breaks +source+ into normalized paragraph blocks and records where each
# block sits inside the normalized text.
#
# @param source [String] raw text; it is run through normalize_text first
# @return [Hash] with :kind (always "text"), :normalized_source, and :blocks.
#   Each block is a Hash with :index, :normalized and :span, where :span holds
#   :start and :end character offsets (end exclusive) computed by walking the
#   blocks and skipping the two-character "\n\n" separator between them.
def analyze_text(source)
  text = normalize_text(source)
  separator = "\n\n"
  paragraphs = text.empty? ? [] : text.split(separator)

  offset = 0
  blocks = []
  paragraphs.each_with_index do |paragraph, position|
    finish = offset + paragraph.length
    blocks << {
      index: position,
      normalized: paragraph,
      span: { start: offset, end: finish }
    }
    # Step over the separator so the next block starts at its own first character.
    offset = finish + separator.length
  end

  { kind: "text", normalized_source: text, blocks: blocks }
end

.is_similar(left_source, right_source, threshold) ⇒ `Object`

# File 'lib/plain/merge.rb', line 84

# Scores two sources against each other and reports whether the score reaches
# the given threshold.
#
# @param left_source [String] first text, forwarded to similarity_score
# @param right_source [String] second text, forwarded to similarity_score
# @param threshold [Numeric] minimum score (inclusive) that counts as a match
# @return [Hash] with :score, the echoed :threshold, and :matched
#   (true when score >= threshold)
def is_similar(left_source, right_source, threshold)
  similarity = similarity_score(left_source, right_source)
  meets_threshold = similarity >= threshold

  { score: similarity, threshold: threshold, matched: meets_threshold }
end

.match_text_blocks(template_source, destination_source) ⇒ `Object`

# File 'lib/plain/merge.rb', line 93

# Pairs blocks of the destination text with blocks of the template text.
#
# Matching runs in two passes over the destination blocks, in order:
# 1. "exact"   - the first still-unpaired template block whose normalized
#                text is identical (score 1.0).
# 2. "refined" - for destination blocks left over, the unpaired template
#                block with the highest refined_text_similarity score, as
#                long as that score reaches DEFAULT_TEXT_REFINEMENT_THRESHOLD.
#                On equal scores the earlier template block wins.
# A template block is paired at most once.
#
# @param template_source [String] template text
# @param destination_source [String] destination text
# @return [Hash] with :matched (Array of Hashes holding :template_index,
#   :destination_index, :phase and :score, in the order the pairs were made),
#   :unmatched_template and :unmatched_destination (Arrays of block indexes)
def match_text_blocks(template_source, destination_source)
  template_blocks = analyze_text(template_source)[:blocks]
  destination_blocks = analyze_text(destination_source)[:blocks]
  template_count = template_blocks.length
  destination_count = destination_blocks.length

  used_template = {}
  used_destination = {}
  pairs = []

  # Marks both sides as taken and appends the pairing record.
  claim = lambda do |template_index, destination_index, phase, score|
    used_template[template_index] = true
    used_destination[destination_index] = true
    pairs << {
      template_index: template_index,
      destination_index: destination_index,
      phase: phase,
      score: score
    }
  end

  # Pass 1: identical normalized text.
  destination_blocks.each_with_index do |destination_block, destination_index|
    wanted = destination_block[:normalized]
    hit = template_blocks.each_index.find do |candidate|
      !used_template[candidate] && template_blocks[candidate][:normalized] == wanted
    end
    next if hit.nil?

    claim.call(hit, destination_index, "exact", 1.0)
  end

  # Pass 2: best weighted similarity among whatever is still unpaired.
  destination_blocks.each_with_index do |destination_block, destination_index|
    next if used_destination[destination_index]

    winner = nil
    winning_score = 0.0
    template_blocks.each_with_index do |template_block, template_index|
      next if used_template[template_index]

      candidate_score = refined_text_similarity(
        template_block,
        destination_block,
        template_count,
        destination_count
      )
      # Strict ">" keeps the earliest candidate when scores tie.
      if candidate_score >= DEFAULT_TEXT_REFINEMENT_THRESHOLD && candidate_score > winning_score
        winning_score = candidate_score
        winner = template_index
      end
    end

    claim.call(winner, destination_index, "refined", winning_score) if winner
  end

  {
    matched: pairs,
    unmatched_template: (0...template_count).reject { |position| used_template[position] },
    unmatched_destination: (0...destination_count).reject { |position| used_destination[position] }
  }
end

.merge_text(template_source, destination_source) ⇒ `Object`

# File 'lib/plain/merge.rb', line 155

# Merges two texts block by block: every destination block is kept, in order,
# and template blocks that match_text_blocks did not pair with any destination
# block are appended after them, in template order.
#
# @param template_source [String] template text
# @param destination_source [String] destination text
# @return [Hash] with :ok (always true), :diagnostics (always an empty Array)
#   and :output (the merged normalized blocks joined by a blank line)
def merge_text(template_source, destination_source)
  template_blocks = analyze_text(template_source)[:blocks]
  destination_blocks = analyze_text(destination_source)[:blocks]

  # Template indexes that already have a counterpart in the destination.
  claimed = {}
  match_text_blocks(template_source, destination_source)[:matched].each do |pair|
    claimed[pair[:template_index]] = true
  end

  carried_over = destination_blocks.map { |block| block[:normalized] }
  template_only = template_blocks.each_with_index.reject { |_block, index| claimed[index] }
  additions = template_only.map { |block, _index| block[:normalized] }

  {
    ok: true,
    diagnostics: [],
    output: (carried_over + additions).join("\n\n")
  }
end

.normalize_text(source) ⇒ `Object`

# File 'lib/plain/merge.rb', line 29

# Canonicalizes free-form text into paragraphs separated by one blank line.
#
# Line endings ("\r\n" and lone "\r") become "\n", the text is trimmed, and
# it is split wherever a newline is followed by optional whitespace and at
# least one more newline. Inside each paragraph every whitespace run collapses
# to a single space; paragraphs that end up empty are dropped.
#
# @param source [String] raw text
# @return [String] normalized paragraphs joined with "\n\n" ("" when none remain)
def normalize_text(source)
  unix_text = source.gsub(/\r\n?/, "\n").strip

  paragraphs = []
  unix_text.split(/\n\s*\n+/).each do |raw_paragraph|
    squeezed = raw_paragraph.strip.gsub(/\s+/, " ")
    paragraphs << squeezed unless squeezed.empty?
  end

  paragraphs.join("\n\n")
end

.refined_text_similarity(template_block, destination_block, template_total, destination_total, weights = DEFAULT_TEXT_REFINEMENT_WEIGHTS) ⇒ `Object`

# File 'lib/plain/merge.rb', line 175

# Combines three per-block similarity measures into one weighted score.
#
# The measures come from string_similarity and length_similarity (both given
# the blocks' :normalized text) and position_similarity (given the blocks'
# :index values and the two block counts). Each is multiplied by its entry in
# +weights+ and the three products are added.
#
# @param template_block [Hash] block with :normalized and :index
# @param destination_block [Hash] block with :normalized and :index
# @param template_total [Integer] number of blocks in the template
# @param destination_total [Integer] number of blocks in the destination
# @param weights [Hash] multipliers under :content, :length and :position
# @return [Numeric] weighted sum of the three measures
def refined_text_similarity(template_block, destination_block, template_total, destination_total, weights = DEFAULT_TEXT_REFINEMENT_WEIGHTS)
  template_text = template_block[:normalized]
  destination_text = destination_block[:normalized]

  content_score = string_similarity(template_text, destination_text)
  length_score = length_similarity(template_text, destination_text)
  position_score = position_similarity(
    template_block[:index],
    destination_block[:index],
    template_total,
    destination_total
  )

  weighted_content = weights[:content] * content_score
  weighted_length = weights[:length] * length_score
  weighted_position = weights[:position] * position_score

  weighted_content + weighted_length + weighted_position
end

.similarity_score(left_source, right_source) ⇒ `Object`

# File 'lib/plain/merge.rb', line 66

# Averages the jaccard score of blocks that share the same position in the
# two texts.
#
# The divisor is the larger of the two block counts, so a position present on
# only one side adds nothing to the sum but still lowers the average. Two
# texts with no blocks at all score 1.0.
#
# @param left_source [String] first text
# @param right_source [String] second text
# @return [Float] summed per-position jaccard scores divided by the larger block count
def similarity_score(left_source, right_source)
  left_blocks = analyze_text(left_source)[:blocks]
  right_blocks = analyze_text(right_source)[:blocks]

  slots = [left_blocks.length, right_blocks.length].max
  return 1.0 if slots.zero?

  # Only positions that exist on both sides can be compared.
  shared = [left_blocks.length, right_blocks.length].min
  accumulated = (0...shared).reduce(0.0) do |running, position|
    running + jaccard(left_blocks[position][:normalized], right_blocks[position][:normalized])
  end

  accumulated / slots
end

.text_feature_profile ⇒ `Object`

# File 'lib/plain/merge.rb', line 17

# Describes the feature profile of the plain-text family.
#
# @return [Hash] a new Hash on every call with :family set to "text" and
#   empty Arrays under :supported_dialects and :supported_policies
def text_feature_profile
  profile = { family: "text" }
  profile[:supported_dialects] = []
  profile[:supported_policies] = []
  profile
end

.text_parse_request(source) ⇒ `Object`



25
26
27

# File 'lib/plain/merge.rb', line 25

# Wraps +source+ in a TreeHaver::ParserRequest whose language is "text".
#
# @param source [Object] handed to the request unchanged as its :source
# @return [TreeHaver::ParserRequest]
def text_parse_request(source)
  request_options = { source: source, language: "text" }
  TreeHaver::ParserRequest.new(**request_options)
end

Module: Plain::Merge

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.analyze_text(source) ⇒ Object

.is_similar(left_source, right_source, threshold) ⇒ Object

.match_text_blocks(template_source, destination_source) ⇒ Object

.merge_text(template_source, destination_source) ⇒ Object

.normalize_text(source) ⇒ Object

.refined_text_similarity(template_block, destination_block, template_total, destination_total, weights = DEFAULT_TEXT_REFINEMENT_WEIGHTS) ⇒ Object

.similarity_score(left_source, right_source) ⇒ Object

.text_feature_profile ⇒ Object

.text_parse_request(source) ⇒ Object

.analyze_text(source) ⇒ `Object`

.is_similar(left_source, right_source, threshold) ⇒ `Object`

.match_text_blocks(template_source, destination_source) ⇒ `Object`

.merge_text(template_source, destination_source) ⇒ `Object`

.normalize_text(source) ⇒ `Object`

.refined_text_similarity(template_block, destination_block, template_total, destination_total, weights = DEFAULT_TEXT_REFINEMENT_WEIGHTS) ⇒ `Object`

.similarity_score(left_source, right_source) ⇒ `Object`

.text_feature_profile ⇒ `Object`

.text_parse_request(source) ⇒ `Object`