Class: LangExtract::Core::SentenceAwareChunker

Inherits:
Object
Defined in:
lib/langextract/core/chunking.rb

Constant Summary

DEFAULT_MAX_CHAR_BUFFER =
2_000

Instance Method Summary

Constructor Details

#initialize(max_char_buffer: DEFAULT_MAX_CHAR_BUFFER, tokenizer: UnicodeTokenizer.new) ⇒ SentenceAwareChunker

Returns a new instance of SentenceAwareChunker.

Raises:

  • (ArgumentError) — if max_char_buffer is not positive


22
23
24
25
26
27
# File 'lib/langextract/core/chunking.rb', line 22

# Builds a chunker from a character-buffer limit and a tokenizer, storing
# both on the instance.
#
# @param max_char_buffer [Numeric] character-buffer limit; must be strictly
#   positive (defaults to DEFAULT_MAX_CHAR_BUFFER)
# @param tokenizer [Object] tokenizer to keep on the instance (defaults to
#   a fresh UnicodeTokenizer)
# @raise [ArgumentError] if max_char_buffer is not positive, or is a value
#   (such as nil or a String) that cannot report whether it is positive
def initialize(max_char_buffer: DEFAULT_MAX_CHAR_BUFFER, tokenizer: UnicodeTokenizer.new)
  # Check respond_to? first so nil/non-numeric input fails with the same
  # ArgumentError as zero or negative input, not a NoMethodError.
  valid_buffer = max_char_buffer.respond_to?(:positive?) && max_char_buffer.positive?
  raise ArgumentError, "max_char_buffer must be positive" unless valid_buffer

  @max_char_buffer = max_char_buffer
  @tokenizer = tokenizer
end

Instance Method Details

#chunks(document) ⇒ Object



29
30
31
32
33
34
35
# File 'lib/langextract/core/chunking.rb', line 29

# Produces chunks for a document by delegating to the sentence, token and
# chunk-building helpers.
#
# The text is taken from +document.text+ when the object responds to
# +text+; otherwise +document.to_s+ is used. The document id is
# +document.id+ when the object responds to +id+, and nil otherwise.
#
# @param document [#text, #id, #to_s] a document-like object or plain value
# @return [Object] whatever build_chunks returns for the extracted text
def chunks(document)
  source_text =
    if document.respond_to?(:text)
      document.text
    else
      document.to_s
    end
  # Stays nil when the document exposes no id.
  doc_id = document.id if document.respond_to?(:id)

  intervals = sentence_intervals(source_text)
  lookup = token_lookup(source_text)
  build_chunks(source_text, intervals, lookup, doc_id)
end