Class: SemanticTextChunker::ChunkBuilder

Inherits:
Object
  • Object
show all
Defined in:
lib/semantic_text_chunker/chunk_builder.rb

Instance Method Summary

Constructor Details

#initialize(sentences:, boundaries:, overlap_sentences:, hard_boundaries: []) ⇒ ChunkBuilder

Returns a new instance of ChunkBuilder.



5
6
7
8
9
10
# File 'lib/semantic_text_chunker/chunk_builder.rb', line 5

# Stores the inputs the builder works from.
#
# @param sentences [Array] ordered sentence list; presumably strings, since
#   they are later joined with spaces (TODO confirm against callers)
# @param boundaries [Array<Integer>] sentence indexes used as chunk end points
# @param overlap_sentences [Integer] how many sentences a chunk repeats from
#   the one before it
# @param hard_boundaries [Enumerable<Integer>] boundary indexes that overlap
#   must not be carried across; kept as a Set for membership checks
# @return [ChunkBuilder] a new instance of ChunkBuilder
def initialize(sentences:, boundaries:, overlap_sentences:, hard_boundaries: [])
  @sentences, @boundaries, @overlap_sentences = sentences, boundaries, overlap_sentences
  @hard_boundaries = hard_boundaries.to_set
end

Instance Method Details

#build ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/semantic_text_chunker/chunk_builder.rb', line 12

# Assembles the text chunks described by the boundary indexes.
#
# Each boundary is the index of the last sentence of a chunk; the final
# sentence index is always appended as a closing split point. A chunk starts
# +@overlap_sentences+ sentences before the previous chunk's end so that
# neighbouring chunks share context, except that overlap never reaches back
# past a split point listed in +@hard_boundaries+.
#
# Sentences are joined with a single space and the result is stripped;
# chunks that end up blank are omitted, so empty input yields +[]+.
#
# @return [Array<String>] the chunk texts in document order
def build
  last_index = @sentences.size - 1
  # A negative overlap would push the start past unseen sentences and drop them.
  overlap = [@overlap_sentences, 0].max

  chunks        = []
  prev_end      = -1
  overlap_floor = 0 # earliest index overlap may reach (just after the last hard boundary)

  (@boundaries + [last_index]).each do |boundary|
    # Clamp so an out-of-range boundary cannot push a later start past the array end.
    boundary = [boundary, last_index].min
    # A split point that does not advance (duplicate, or the final index already
    # being a boundary) would only re-emit overlap sentences as a bogus chunk.
    next if boundary <= prev_end

    start = [prev_end - overlap + 1, overlap_floor].max
    chunk = @sentences[start..boundary].join(" ").strip
    chunks << chunk unless chunk.empty?

    prev_end = boundary
    # Don't carry overlap across a structural boundary, now or for later chunks.
    overlap_floor = boundary + 1 if @hard_boundaries.include?(boundary)
  end

  chunks
end