Module: Legion::Extensions::Rfp::Ingest::Runners::Documents
- Extended by:
- Helpers::Client
- Includes:
- Helpers::Lex
- Included in:
- Client
- Defined in:
- lib/legion/extensions/rfp/ingest/runners/documents.rb
Constant Summary collapse
- SUPPORTED_FORMATS =
%w[pdf docx md markdown xlsx html htm].freeze
Instance Method Summary collapse
- #chunk_text(text:, chunk_size: 1000, overlap: 200) ⇒ Object
- #extract_text(file_path:, format: nil) ⇒ Object
- #supported?(file_path:) ⇒ Boolean
Methods included from Helpers::Client
Instance Method Details
#chunk_text(text:, chunk_size: 1000, overlap: 200) ⇒ Object
31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/legion/extensions/rfp/ingest/runners/documents.rb', line 31

# Splits +text+ into fixed-size character windows, each starting
# (chunk_size - overlap) characters after the previous one.
#
# The final window(s) may be shorter than +chunk_size+. A negative +overlap+
# is not rejected; it makes the stride larger than +chunk_size+, so the
# characters between windows are skipped.
#
# @param text [String, nil] text to split; nil or empty yields no chunks
# @param chunk_size [Integer] maximum number of characters per chunk
# @param overlap [Integer] characters shared by consecutive chunks
# @return [Hash] +{ result: [{ text:, offset:, length: }, ...], count: Integer }+
# @raise [ArgumentError] if +chunk_size+ is not positive or +overlap+ is not
#   smaller than +chunk_size+ (the window would never advance)
def chunk_text(text:, chunk_size: 1000, overlap: 200, **)
  return { result: [], count: 0 } if text.nil? || text.empty?

  stride = chunk_size - overlap
  # A non-positive stride would never move past the current offset, which
  # previously resulted in an endless loop instead of an error.
  unless chunk_size.positive? && stride.positive?
    raise ArgumentError,
          "chunk_size must be positive and greater than overlap (chunk_size: #{chunk_size}, overlap: #{overlap})"
  end

  chunks = (0...text.length).step(stride).map do |offset|
    piece = text[offset, chunk_size]
    { text: piece, offset: offset, length: piece.length }
  end

  { result: chunks, count: chunks.length }
end
#extract_text(file_path:, format: nil) ⇒ Object
18 19 20 21 22 23 24 25 26 27 28 29 |
# File 'lib/legion/extensions/rfp/ingest/runners/documents.rb', line 18

# Reads the document at +file_path+ and returns its text content.
#
# The format is taken from +format+ when given, otherwise from the file
# extension. Either way it is normalized the same way (dots removed,
# lowercased), so 'PDF', '.pdf' and :pdf all select the PDF extractor.
# A blank +format+ falls back to the file extension.
#
# @param file_path [String, #to_s] path of the document to read
# @param format [String, Symbol, nil] explicit format overriding the extension
# @return [Hash] +{ result: <content>, format: String, size: Integer }+ where
#   +size+ is +content.length+
# @raise [ArgumentError] if the resolved format has no extractor
def extract_text(file_path:, format: nil, **)
  requested = format.to_s.strip
  source = requested.empty? ? ::File.extname(file_path.to_s) : requested
  fmt = source.delete('.').downcase

  content = case fmt
            when 'pdf' then extract_pdf(file_path)
            when 'docx' then extract_docx(file_path)
            when 'md', 'markdown' then ::File.read(file_path)
            when 'xlsx' then extract_xlsx(file_path)
            when 'html', 'htm' then extract_html(file_path)
            else raise ArgumentError, "Unsupported format: #{fmt}"
            end

  # NOTE(review): assumes every extract_* helper returns an object that
  # responds to #length (presumably a String) — confirm against the helpers.
  { result: content, format: fmt, size: content.length }
end
#supported?(file_path:) ⇒ Boolean
13 14 15 16 |
# File 'lib/legion/extensions/rfp/ingest/runners/documents.rb', line 13

# Reports whether the extension of +file_path+ appears in SUPPORTED_FORMATS.
#
# The extension is compared without its leading dot and in lowercase.
#
# @param file_path [String, #to_s] path whose extension is inspected
# @return [Hash] +{ result: Boolean, format: String }+ where +format+ is the
#   normalized extension (empty when the path has none)
def supported?(file_path:, **)
  raw_extension = ::File.extname(file_path.to_s)
  extension = raw_extension.delete('.').downcase
  recognised = SUPPORTED_FORMATS.include?(extension)

  { result: recognised, format: extension }
end