Class: SemanticTextChunker::ChunkBuilder
- Inherits: Object
- Object
- SemanticTextChunker::ChunkBuilder
- Defined in:
- lib/semantic_text_chunker/chunk_builder.rb
Instance Method Summary
- #build ⇒ Object
-
#initialize(sentences:, boundaries:, overlap_sentences:, hard_boundaries: []) ⇒ ChunkBuilder
constructor
A new instance of ChunkBuilder.
Constructor Details
#initialize(sentences:, boundaries:, overlap_sentences:, hard_boundaries: []) ⇒ ChunkBuilder
Returns a new instance of ChunkBuilder.
5 6 7 8 9 10 |
# File 'lib/semantic_text_chunker/chunk_builder.rb', line 5

# Stores the inputs that #build later turns into chunks.
#
# @param sentences [Array<String>] sentences in document order
# @param boundaries [Array<Integer>] sentence indices at which a chunk ends
# @param overlap_sentences [Integer] number of trailing sentences of the
#   previous chunk repeated at the start of the next one
# @param hard_boundaries [Enumerable<Integer>, nil] boundary indices across
#   which no overlap is carried; nil is treated as "none"
def initialize(sentences:, boundaries:, overlap_sentences:, hard_boundaries: [])
  @sentences = sentences
  @boundaries = boundaries
  @overlap_sentences = overlap_sentences
  # Set.new tolerates nil (giving an empty set) and always copies, so a Set
  # passed by the caller is never shared with this instance.
  @hard_boundaries = Set.new(hard_boundaries)
end
Instance Method Details
#build ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/semantic_text_chunker/chunk_builder.rb', line 12

# Assembles chunk strings from the sentences and the boundary indices.
#
# Each entry of @boundaries is the index of the last sentence of a chunk; the
# index of the final sentence is appended as a closing split point. The first
# chunk starts at sentence 0. Every later chunk starts @overlap_sentences
# sentences before the end of the previous chunk, unless the previous split
# index is in @hard_boundaries, in which case it starts directly after it.
#
# Split points that do not advance past the previous one (a boundary already
# equal to the last sentence index, a duplicate, or an out-of-order entry)
# are skipped, so no chunk consisting solely of repeated overlap is emitted.
#
# @return [Array<String>] chunks, each made of sentences joined by a single
#   space and stripped, with blank chunks omitted. When @boundaries is empty
#   the result is a one-element array holding all sentences joined by spaces.
def build
  return [@sentences.join(" ")] if @boundaries.empty?

  last_index = @sentences.size - 1
  chunks = []
  prev_end = -1

  (@boundaries + [last_index]).each do |boundary|
    # Nothing new lies at or before the previous split point.
    next if boundary <= prev_end

    start = if prev_end.negative?
      0
    elsif @hard_boundaries.include?(prev_end)
      # Don't carry overlap across a structural boundary.
      prev_end + 1
    else
      # Overlap: go back N sentences from the previous boundary.
      [prev_end - @overlap_sentences + 1, 0].max
    end

    # Array#[] returns nil when start lies beyond the end of the array
    # (possible with out-of-range boundaries); treat that as no sentences.
    chunk = Array(@sentences[start..boundary]).join(" ").strip
    chunks << chunk unless chunk.empty?
    prev_end = boundary
  end

  chunks
end