Module: Legion::Extensions::Rfp::Ingest::Runners::Corpus

Extended by:
Helpers::Client
Includes:
Helpers::Lex
Included in:
Client
Defined in:
lib/legion/extensions/rfp/ingest/runners/corpus.rb

Instance Method Summary

Methods included from Helpers::Client

client

Instance Method Details

#ingest_directory(directory:, tags: [], recursive: true) ⇒ Object



31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/legion/extensions/rfp/ingest/runners/corpus.rb', line 31

# Ingests every supported file found in +directory+.
#
# Files are discovered by globbing relative to +directory+ (Dir.glob with
# +base:+) rather than by splicing the directory into the pattern, so glob
# metacharacters in the directory name itself (for example "rfp[2024]") are
# treated literally instead of as pattern syntax.
#
# @param directory [String, #to_s] directory to scan
# @param tags [Array] passed unchanged to #ingest_document for each file
# @param recursive [Boolean] when truthy, descend into subdirectories;
#   otherwise only the directory's direct children are considered
# @return [Hash] +:result+ holds the per-file #ingest_document return values,
#   +:files_processed+ their number and +:total_chunks+ the sum of their
#   +:count+ entries; a blank +directory+ additionally yields an +:error+ key
def ingest_directory(directory:, tags: [], recursive: true, **)
  root = directory.to_s
  # Dir.glob falls back to the current directory for an empty base, which
  # would silently scan the wrong place; report the problem instead.
  return { result: [], files_processed: 0, total_chunks: 0, error: 'No directory given' } if root.empty?

  pattern = recursive ? ::File.join('**', '*') : '*'
  files = Dir.glob(pattern, base: root)
             .map { |relative| ::File.join(root, relative) }
             .select { |path| ::File.file?(path) }

  results = files.filter_map do |file_path|
    next unless supported?(file_path: file_path)[:result]

    ingest_document(file_path: file_path, tags: tags)
  end

  { result: results, files_processed: results.length, total_chunks: results.sum { |r| r[:count] } }
end

#ingest_document(file_path:, tags: [], metadata: {}) ⇒ Object



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/legion/extensions/rfp/ingest/runners/corpus.rb', line 11

# Turns a single file into a list of chunk records ready for ingestion.
#
# The file is checked with #supported?, its text obtained from #extract_text
# and split by #chunk_text; each chunk becomes one hash in the result.
#
# @param file_path [String] path of the document to ingest
# @param tags [Array] attached unchanged to every chunk record
# @param metadata [Hash] caller-supplied metadata; merged (without mutation)
#   with the per-chunk +:format+ and +:offset+ values
# @return [Hash] on success +:result+ (array of chunk hashes with +:content+,
#   +:source+, +:chunk_id+, +:tags+, +:metadata+), +:count+ and +:source+;
#   when the format is unsupported, +{ result: nil, error: "..." }+
def ingest_document(file_path:, tags: [], metadata: {}, **)
  supported = supported?(file_path: file_path)
  return { result: nil, error: "Unsupported format: #{file_path}" } unless supported[:result]

  extracted = extract_text(file_path: file_path)
  chunked = chunk_text(text: extracted[:result])

  ingested = chunked[:result].map.with_index do |chunk, idx|
    {
      content:  chunk[:text],
      source:   file_path,
      chunk_id: idx,
      tags:     tags,
      # Per-chunk keys are merged last, so they take precedence over any
      # :format / :offset the caller supplied.
      metadata: metadata.merge(format: extracted[:format], offset: chunk[:offset])
    }
  end

  { result: ingested, count: ingested.length, source: file_path }
end

#ingest_to_apollo(chunks:, scope: :global) ⇒ Object



44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/legion/extensions/rfp/ingest/runners/corpus.rb', line 44

# Sends each chunk to Legion::Apollo.ingest and collects the return values.
#
# @param chunks [Enumerable<Hash>] chunk hashes; +:content+ is forwarded as
#   is, while a missing +:tags+ / +:metadata+ falls back to +[]+ / +{}+
# @param scope [Symbol] forwarded to every Legion::Apollo.ingest call
# @return [Hash] +:result+ (one ingest return value per chunk) and +:count+,
#   or +{ result: nil, error: 'Apollo not available' }+ when the
#   Legion::Apollo constant is not defined
def ingest_to_apollo(chunks:, scope: :global, **)
  unless defined?(Legion::Apollo)
    return { result: nil, error: 'Apollo not available' }
  end

  receipts = chunks.map do |entry|
    payload = {
      content:  entry[:content],
      tags:     entry[:tags] || [],
      metadata: entry[:metadata] || {},
      scope:    scope
    }
    Legion::Apollo.ingest(**payload)
  end

  { result: receipts, count: receipts.size }
end