Module: Legion::Extensions::Knowledge::Helpers::Parser
- Defined in:
- lib/legion/extensions/knowledge/helpers/parser.rb
Class Method Summary collapse
- .extract_via_data(file_path:) ⇒ Object
- .parse(file_path:) ⇒ Object
- .parse_markdown(file_path:) ⇒ Object
- .parse_text(file_path:) ⇒ Object
Class Method Details
.extract_via_data(file_path:) ⇒ Object
51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/legion/extensions/knowledge/helpers/parser.rb', line 51 def extract_via_data(file_path:) return [{ error: 'unsupported format', source_file: file_path }] unless defined?(::Legion::Data::Extract) result = ::Legion::Data::Extract.extract(file_path, type: :auto) return [{ error: 'extraction_failed', source_file: file_path, detail: result }] unless result.is_a?(Hash) && result[:text] heading = ::File.basename(file_path, '.*') [{ heading: heading, section_path: [], content: result[:text].strip, source_file: file_path }] rescue StandardError => e [{ error: 'extraction_failed', source_file: file_path, detail: e. }] end |
.parse(file_path:) ⇒ Object
10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
# File 'lib/legion/extensions/knowledge/helpers/parser.rb', line 10
# Routes a file to the matching parser based on its lower-cased extension.
#
# @param file_path [String] path of the file to parse
# @return [Object] whatever the selected parser returns, or a one-element
#   array with an 'unsupported format' error hash for any other extension
def parse(file_path:)
  extension = ::File.extname(file_path).downcase

  return parse_markdown(file_path: file_path) if extension == '.md'
  return parse_text(file_path: file_path) if extension == '.txt'
  return extract_via_data(file_path: file_path) if %w[.pdf .docx].include?(extension)

  [{ error: 'unsupported format', source_file: file_path }]
end
.parse_markdown(file_path:) ⇒ Object
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
# File 'lib/legion/extensions/knowledge/helpers/parser.rb', line 25
# Reads a Markdown file and walks it line by line, handing each run of lines
# between headings to flush_section together with its heading and section path.
#
# @param file_path [String] path of the Markdown file
# @return [Array<Hash>] the accumulated sections; when none were produced, a
#   single section covering the whole stripped file, headed by the file's
#   base name
def parse_markdown(file_path:)
  text = ::File.read(file_path, encoding: 'utf-8')
  sections = []
  heading = ::File.basename(file_path, '.*') # used until the first heading line
  buffer = []
  stack = {} # heading level => title of the heading currently open at that level

  text.each_line do |line|
    depth = heading_level(line)
    unless depth
      buffer << line
      next
    end

    # Close the previous section using the path as it stood before this heading.
    # NOTE(review): flush_section is presumably what appends to `sections` - confirm.
    flush_section(sections, heading, build_section_path(stack), buffer, file_path)

    heading = line.sub(/^#+\s*/, '').chomp
    # Drop headings at the same or deeper level; they are no longer ancestors.
    stack.reject! { |open_depth, _| open_depth >= depth }
    stack[depth] = heading
    buffer = []
  end
  flush_section(sections, heading, build_section_path(stack), buffer, file_path)

  return sections unless sections.empty?

  [{ heading: ::File.basename(file_path, '.*'), section_path: [], content: text.strip, source_file: file_path }]
end
.parse_text(file_path:) ⇒ Object
63 64 65 66 67 68 |
# File 'lib/legion/extensions/knowledge/helpers/parser.rb', line 63
# Reads a plain-text file as UTF-8 and returns it as a single section.
#
# @param file_path [String] path of the text file
# @return [Array<Hash>] one section hash whose :heading is the file's base
#   name without extension and whose :content is the stripped file text
def parse_text(file_path:)
  body = ::File.read(file_path, encoding: 'utf-8').strip
  [{ heading: ::File.basename(file_path, '.*'), section_path: [], content: body, source_file: file_path }]
end