Class: Ragnar::Chunker
- Inherits:
-
Object
- Object
- Ragnar::Chunker
- Defined in:
- lib/ragnar/chunker.rb
Instance Attribute Summary collapse
-
#chunk_overlap ⇒ Object
readonly
Returns the value of attribute chunk_overlap.
-
#chunk_size ⇒ Object
readonly
Returns the value of attribute chunk_size.
Class Method Summary collapse
- .semantic_chunker(model: nil) ⇒ Object
Instance Method Summary collapse
- #chunk_documents(documents) ⇒ Object
- #chunk_file(file_path) ⇒ Object
- #chunk_text(text, metadata = {}) ⇒ Object
-
#initialize(chunk_size: Ragnar::DEFAULT_CHUNK_SIZE, chunk_overlap: Ragnar::DEFAULT_CHUNK_OVERLAP) ⇒ Chunker
constructor
A new instance of Chunker.
Constructor Details
#initialize(chunk_size: Ragnar::DEFAULT_CHUNK_SIZE, chunk_overlap: Ragnar::DEFAULT_CHUNK_OVERLAP) ⇒ Chunker
Returns a new instance of Chunker.
5 6 7 8 9 10 11 12 13 14 |
# File 'lib/ragnar/chunker.rb', line 5

# Creates a chunker and the Baran splitter it delegates to.
#
# Both values are stored on the instance and forwarded unchanged to
# Baran::RecursiveCharacterTextSplitter.
#
# @param chunk_size [Object] forwarded as the splitter's `chunk_size:`
#   (presumably a maximum chunk length; confirm units against Baran's docs)
# @param chunk_overlap [Object] forwarded as the splitter's `chunk_overlap:`
#   (presumably the overlap between consecutive chunks; confirm against Baran's docs)
def initialize(chunk_size: Ragnar::DEFAULT_CHUNK_SIZE, chunk_overlap: Ragnar::DEFAULT_CHUNK_OVERLAP)
  @chunk_size = chunk_size
  @chunk_overlap = chunk_overlap

  # Use RecursiveCharacterTextSplitter for better chunking.
  # Separators are listed from coarsest (blank line) to finest (empty string).
  @splitter = Baran::RecursiveCharacterTextSplitter.new(
    chunk_size: chunk_size,
    chunk_overlap: chunk_overlap,
    separators: ["\n\n", "\n", ". ", " ", ""]
  )
end
Instance Attribute Details
#chunk_overlap ⇒ Object (readonly)
Returns the value of attribute chunk_overlap.
3 4 5 |
# File 'lib/ragnar/chunker.rb', line 3

# Returns the value of attribute chunk_overlap.
#
# @return [Object] the current value of @chunk_overlap
def chunk_overlap
  @chunk_overlap
end
#chunk_size ⇒ Object (readonly)
Returns the value of attribute chunk_size.
3 4 5 |
# File 'lib/ragnar/chunker.rb', line 3

# Returns the value of attribute chunk_size.
#
# @return [Object] the current value of @chunk_size
def chunk_size
  @chunk_size
end
Class Method Details
.semantic_chunker(model: nil) ⇒ Object
87 88 89 90 91 92 93 94 95 |
# File 'lib/ragnar/chunker.rb', line 87

# Placeholder for a semantic chunker.
#
# Currently returns a plain Baran::RecursiveCharacterTextSplitter configured
# with the Ragnar default size and overlap; no semantic analysis is performed.
#
# @param model [Object, nil] accepted but not used by the current implementation
# @return [Baran::RecursiveCharacterTextSplitter]
def self.semantic_chunker(model: nil)
  # Future enhancement: Use more sophisticated chunking with semantic boundaries
  # Could use sentence embeddings to find natural break points
  Baran::RecursiveCharacterTextSplitter.new(
    chunk_size: Ragnar::DEFAULT_CHUNK_SIZE,
    chunk_overlap: Ragnar::DEFAULT_CHUNK_OVERLAP,
    separators: ["\n\n", "\n", ". ", " ", ""]
  )
end
Instance Method Details
#chunk_documents(documents) ⇒ Object
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
# File 'lib/ragnar/chunker.rb', line 64

# Chunks a mixed list of documents and returns every chunk in one flat array.
#
# Each entry is handled by type:
# * String naming an existing file -> chunked via #chunk_file
# * any other String               -> chunked as raw text via #chunk_text
# * Hash                           -> its :text / "text" value is chunked with
#                                     its :metadata / "metadata" value (default {})
# * anything else                  -> skipped
#
# @param documents [Enumerable] the entries to chunk
# @return [Array] chunks from all entries, in input order
def chunk_documents(documents)
  documents.flat_map do |doc|
    case doc
    when String
      # A string that names an existing file is a path; otherwise it is raw text.
      File.exist?(doc) ? chunk_file(doc) : chunk_text(doc)
    when Hash
      # Document hash with text and metadata (symbol or string keys).
      text = doc[:text] || doc["text"]
      metadata = doc[:metadata] || doc["metadata"] || {}
      chunk_text(text, metadata)
    else
      []
    end
  end
end
#chunk_file(file_path) ⇒ Object
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/ragnar/chunker.rb', line 47

# Reads a file as UTF-8 text and chunks it, tagging each chunk with file metadata.
#
# The metadata handed to #chunk_text contains :file_path (absolute path),
# :file_name (basename), :file_size (from File.size) and :file_modified
# (mtime converted to a String).
#
# @param file_path [String] path of the file to read
# @return [Object] whatever #chunk_text returns for the file's contents
# @raise [RuntimeError] if no file exists at +file_path+
def chunk_file(file_path)
  raise "File not found: #{file_path}" unless File.exist?(file_path)

  # The invalid:/undef: options only take effect when Ruby transcodes between
  # two encodings. Reading with a single external encoding does not transcode,
  # so malformed bytes are replaced explicitly with #scrub.
  text = File.read(file_path, encoding: 'utf-8', invalid: :replace, undef: :replace).scrub

  metadata = {
    file_path: File.absolute_path(file_path),
    file_name: File.basename(file_path),
    file_size: File.size(file_path),
    file_modified: File.mtime(file_path).to_s
  }

  chunk_text(text, metadata)
end
#chunk_text(text, metadata = {}) ⇒ Object
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/ragnar/chunker.rb', line 16

# Splits text with the configured splitter and wraps each piece in a chunk hash.
#
# Every returned hash has the keys :text, :index and :metadata, where
# :metadata is the given metadata merged with :chunk_index, :total_chunks
# and :chunk_size (the size of that chunk's text).
#
# Any StandardError raised while chunking is reported on stderr and an empty
# array is returned instead (best-effort behavior).
#
# @param text [String, nil] the text to split
# @param metadata [Hash] extra entries copied into every chunk's :metadata
# @return [Array<Hash>] the chunks; empty when text is nil/blank or on error
def chunk_text(text, metadata = {})
  return [] if text.nil? || text.strip.empty?

  # Use Baran to split the text into chunks
  chunks = @splitter.chunks(text)
  total = chunks.size

  chunks.map.with_index do |chunk_data, index|
    # Baran returns chunks as hashes with :text and :cursor keys; string keys
    # and non-hash values are tolerated as well.
    piece =
      if chunk_data.is_a?(Hash)
        chunk_data[:text] || chunk_data["text"]
      else
        chunk_data.to_s
      end

    {
      text: piece,
      index: index,
      metadata: metadata.merge(
        chunk_index: index,
        total_chunks: total,
        chunk_size: piece.size
      )
    }
  end
rescue StandardError => e
  # Diagnostics go to stderr so they do not mix with program output.
  warn "Error chunking text: #{e.message}"
  []
end