Class: SemanticTextChunker::Chunker

Inherits:
Object
  • Object
show all
Defined in:
lib/semantic_text_chunker/chunker.rb

Instance Method Summary collapse

Constructor Details

#initialize(embedder: Embedders::Null.new, threshold: 0.75, max_tokens: 512, overlap_sentences: 2) ⇒ Chunker

Returns a new instance of Chunker.



10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/semantic_text_chunker/chunker.rb', line 10

# Builds a chunker, storing every argument unchanged on the instance.
# No validation is performed here.
#
# @param embedder [#embed] receives the sentence list in {#chunk} and is
#   also passed on to BoundaryDetector; defaults to a fresh Embedders::Null
# @param threshold [Numeric] passed to BoundaryDetector by {#chunk}
#   (its exact meaning is defined there, not in this class)
# @param max_tokens [Integer] passed to BoundaryDetector by {#chunk}
# @param overlap_sentences [Integer] passed to ChunkBuilder by {#chunk}
def initialize(embedder: Embedders::Null.new, threshold: 0.75, max_tokens: 512, overlap_sentences: 2)
  # Collaborator that is always built internally, never injected.
  @splitter = Splitters::SentenceSplitter.new

  # Injected embedder.
  @embedder = embedder

  # Tuning values, consumed later by #chunk.
  @threshold = threshold
  @max_tokens = max_tokens
  @overlap_sentences = overlap_sentences
end

Instance Method Details

#chunk(text) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/semantic_text_chunker/chunker.rb', line 23

# Splits +text+ into sentences, embeds them, asks BoundaryDetector where to
# cut, and lets ChunkBuilder assemble the result.
#
# @param text [String, nil] the text to chunk
# @return [Array] an empty array when +text+ is nil or only whitespace;
#   otherwise whatever ChunkBuilder#build returns
def chunk(text)
  return [] if text.nil?
  return [] if text.strip.empty?

  sentences = @splitter.split(text)

  detector = BoundaryDetector.new(
    sentences: sentences,
    embeddings: @embedder.embed(sentences),
    threshold: @threshold,
    max_tokens: @max_tokens,
    embedder: @embedder
  )

  builder = ChunkBuilder.new(
    sentences: sentences,
    boundaries: detector.boundaries,
    overlap_sentences: @overlap_sentences
  )
  builder.build
end

#chunk_with_metadata(text, **metadata) ⇒ Object



44
45
46
47
# File 'lib/semantic_text_chunker/chunker.rb', line 44

# Chunks +text+ and prepends the same metadata prefix to every chunk.
#
# The prefix is built once by Metadata.prefix from the given keyword
# arguments, then joined to each chunk with +.
#
# @param text [String, nil] text handed to {#chunk}
# @param metadata [Hash] arbitrary keyword arguments forwarded unchanged to
#   Metadata.prefix
# @return [Array] one prefixed entry per chunk; empty when {#chunk} returns
#   no chunks
def chunk_with_metadata(text, **metadata)
  prefix = Metadata.prefix(**metadata)
  chunk(text).map { |c| prefix + c }
end