Module: Legion::Extensions::Rfp::Ingest::Runners::Documents

Extended by:
Helpers::Client
Includes:
Helpers::Lex
Included in:
Client
Defined in:
lib/legion/extensions/rfp/ingest/runners/documents.rb

Constant Summary collapse

# File extensions (lowercase, no leading dot) that #supported? checks a path's extension against.
SUPPORTED_FORMATS =
%w[pdf docx md markdown xlsx html htm].freeze

Instance Method Summary collapse

Methods included from Helpers::Client

client

Instance Method Details

#chunk_text(text:, chunk_size: 1000, overlap: 200) ⇒ Object



31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/legion/extensions/rfp/ingest/runners/documents.rb', line 31

# Splits +text+ into fixed-size windows that overlap their neighbours.
#
# Consecutive chunks start (chunk_size - overlap) characters apart, so each
# chunk begins with the last +overlap+ characters of the previous one. A
# negative +overlap+ is accepted and leaves a gap between chunks instead.
#
# @param text [String, nil] text to split
# @param chunk_size [Integer] maximum number of characters per chunk; must be positive
# @param overlap [Integer] characters shared by neighbouring chunks; must be
#   smaller than +chunk_size+
# @return [Hash] +{ result: Array<Hash>, count: Integer }+ where each entry of
#   +result+ is +{ text:, offset:, length: }+; +result+ is empty when +text+
#   is nil or empty
# @raise [ArgumentError] if +chunk_size+ is not positive or +overlap+ is not
#   smaller than +chunk_size+
def chunk_text(text:, chunk_size: 1000, overlap: 200, **)
  return { result: [], count: 0 } if text.nil? || text.empty?

  raise ArgumentError, "chunk_size must be positive (got #{chunk_size})" unless chunk_size.positive?

  # A stride of zero or less would never move the window forward, which
  # previously resulted in an endless loop.
  stride = chunk_size - overlap
  unless stride.positive?
    raise ArgumentError, "overlap (#{overlap}) must be smaller than chunk_size (#{chunk_size})"
  end

  chunks = (0...text.length).step(stride).map do |offset|
    piece = text[offset, chunk_size]
    { text: piece, offset: offset, length: piece.length }
  end
  { result: chunks, count: chunks.length }
end

#extract_text(file_path:, format: nil) ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/legion/extensions/rfp/ingest/runners/documents.rb', line 18

# Reads the document at +file_path+ and returns its text content.
#
# The format is taken from +format+ when given, otherwise from the file
# extension. Either source is normalised the same way (stringified, dots
# removed, lowercased), so +:pdf+, +'PDF'+ and +'.pdf'+ all select the PDF
# branch.
#
# Markdown files are read verbatim with File.read; the other formats are
# delegated to extract_pdf / extract_docx / extract_xlsx / extract_html,
# which are defined outside this method (presumably each returns a String —
# confirm, since +size+ calls +length+ on the result).
#
# @param file_path [String, #to_s] path of the document to read
# @param format [String, Symbol, nil] explicit format overriding the extension
# @return [Hash] +{ result: content, format: normalised_format, size: content.length }+
# @raise [ArgumentError] if the resolved format is not one of the handled ones
def extract_text(file_path:, format: nil, **)
  requested = format || ::File.extname(file_path.to_s)
  fmt = requested.to_s.delete('.').downcase

  content = case fmt
            when 'pdf' then extract_pdf(file_path)
            when 'docx' then extract_docx(file_path)
            when 'md', 'markdown' then ::File.read(file_path)
            when 'xlsx' then extract_xlsx(file_path)
            when 'html', 'htm' then extract_html(file_path)
            else raise ArgumentError, "Unsupported format: #{fmt}"
            end
  { result: content, format: fmt, size: content.length }
end

#supported?(file_path:) ⇒ Boolean

Returns:

  • (Boolean)


13
14
15
16
# File 'lib/legion/extensions/rfp/ingest/runners/documents.rb', line 13

# Reports whether the extension of +file_path+ is listed in SUPPORTED_FORMATS.
#
# The extension is compared lowercased and without its dot.
#
# @param file_path [String, #to_s] path whose extension is inspected
# @return [Hash] +{ result: Boolean, format: String }+ where +format+ is the
#   normalised extension (an empty string when the path has none)
def supported?(file_path:, **)
  extension = ::File.extname(file_path.to_s).downcase.delete('.')
  recognised = SUPPORTED_FORMATS.include?(extension)
  { result: recognised, format: extension }
end