Class: LlmDocsBuilder::HtmlDetector
- Inherits:
-
Object
- Object
- LlmDocsBuilder::HtmlDetector
- Defined in:
- lib/llm_docs_builder/html_detector.rb
Overview
Detects whether input should be treated as HTML and provides related snippet checks.
Instance Method Summary
-
#allow_inline_body_text?(content, text) ⇒ Boolean
Determines if inline body text should be allowed in HTML context.
-
#detection_snippet(content) ⇒ String?
Prepare a snippet of content for HTML detection by removing leading whitespace and build metadata comments.
-
#full_html_document?(content) ⇒ Boolean
Check if the full document should be treated as HTML by parsing it and ensuring we do not observe unwrapped markdown constructs like plain text or lists.
-
#html_candidate_snippet?(snippet) ⇒ Boolean
Determine whether a snippet appears to start with HTML markup.
-
#html_content?(content, snippet = detection_snippet(content)) ⇒ Boolean
Detect if loaded content is HTML instead of markdown.
-
#html_content_snippet?(snippet) ⇒ Boolean
Determine whether a snippet should be treated as HTML.
-
#html_with_body_wrapper?(content) ⇒ Boolean
Checks if content has HTML document structure wrapper tags.
-
#markdown_heading_snippet?(snippet) ⇒ Boolean
Detect common markdown heading syntax within the snippet.
-
#markdown_like_text?(text) ⇒ Boolean
Checks if text looks like markdown syntax.
-
#meaningful_text?(text) ⇒ Boolean
Checks if text contains meaningful non-whitespace content.
-
#table_fragment?(snippet) ⇒ Boolean
Detect whether the snippet represents a table fragment we should preserve.
Instance Method Details
#allow_inline_body_text?(content, text) ⇒ Boolean
Determines if inline body text should be allowed in HTML context
117 118 119 120 121 |
# File 'lib/llm_docs_builder/html_detector.rb', line 117
#
# Decides whether a piece of inline text may remain in a document that is
# being treated as HTML.
#
# @param content [String] the full raw document
# @param text [String, nil] the inline text under consideration
# @return [Boolean] false when +text+ looks like markdown; otherwise whether
#   +content+ contains a doctype, <html> or <body> tag
def allow_inline_body_text?(content, text)
  !markdown_like_text?(text) && html_with_body_wrapper?(content)
end
#detection_snippet(content) ⇒ String?
Prepare a snippet of content for HTML detection by removing leading whitespace and build metadata comments.
22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/llm_docs_builder/html_detector.rb', line 22
#
# Prepares the leading part of +content+ for HTML detection: leading
# whitespace and every consecutive leading HTML comment are dropped, and the
# remainder is capped at 500 characters.
#
# @param content [String, nil] raw document text
# @return [String, nil] the trimmed snippet (possibly empty), or nil when
#   +content+ is nil
def detection_snippet(content)
  return unless content

  # Remote docs often include build metadata comments; skip them before tag detection.
  # One anchored pattern consumes the whole run of leading comments together
  # with the whitespace that follows each of them, so no loop is needed. An
  # unterminated comment does not match and is therefore left in place.
  leading_comments = /\A(?:<!--.*?-->\s*)+/m

  # lstrip and sub both return new strings, so the caller's content is never
  # mutated (and frozen input is accepted).
  content.lstrip.sub(leading_comments, '').lstrip[0, 500]
end
#full_html_document?(content) ⇒ Boolean
Check if the full document should be treated as HTML by parsing it and ensuring we do not observe unwrapped markdown constructs like plain text or lists.
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/llm_docs_builder/html_detector.rb', line 59
#
# Parses +content+ with Nokogiri and decides whether the whole document can
# be treated as HTML, i.e. it has a <body> and carries no unwrapped text that
# would indicate markdown mixed into the markup.
#
# @param content [String] the full raw document
# @return [Boolean] true when the parsed document qualifies as HTML; false
#   otherwise, including when Nokogiri raises a syntax error
def full_html_document?(content)
  parsed = Nokogiri::HTML::Document.parse(content)
  body_node = parsed.at('body')
  return false unless body_node

  # Meaningful text at the document root disqualifies the input.
  stray_root_text = parsed.xpath('/text()').any? { |node| meaningful_text?(node.text) }
  return false if stray_root_text

  # Every meaningful text node sitting directly under <body> has to be
  # acceptable inline text; whitespace-only nodes are ignored.
  body_node.xpath('./text()').all? do |node|
    inline = node.text
    !meaningful_text?(inline) || allow_inline_body_text?(content, inline)
  end
rescue Nokogiri::XML::SyntaxError
  false
end
#html_candidate_snippet?(snippet) ⇒ Boolean
Determine whether a snippet appears to start with HTML markup.
50 51 52 |
# File 'lib/llm_docs_builder/html_detector.rb', line 50
#
# Determines whether a snippet starts with HTML markup: a "<" (optionally
# followed by whitespace) and then either a doctype declaration or one of a
# fixed set of tag names. Matching is case-insensitive.
#
# @param snippet [String, nil] text to inspect
# @return [Boolean] true when the snippet opens with a recognised tag; false
#   otherwise, including for nil
def html_candidate_snippet?(snippet)
  # nil is answered with false rather than NoMethodError, in line with the
  # other snippet predicates of this class.
  return false unless snippet

  # Same alternatives as the former one-line pattern; the shared trailing \b
  # is factored out of the tag-name branch.
  opening_tag = /
    \A<\s*
    (?:
      !DOCTYPE\s+html
      |
      (?:html|body|head|article|section|main|p|div|table|thead|tbody|tr|td|th|meta|link|h[1-6]|ul|ol|li|blockquote)\b
    )
  /ix

  snippet.match?(opening_tag)
end
#html_content?(content, snippet = detection_snippet(content)) ⇒ Boolean
Detect if loaded content is HTML instead of markdown
11 12 13 14 15 |
# File 'lib/llm_docs_builder/html_detector.rb', line 11
#
# Detects whether loaded content is HTML rather than markdown. The cheap
# snippet check runs first; the full document is only parsed when the
# snippet already looks like HTML.
#
# @param content [String, nil] the full raw document
# @param snippet [String, nil] leading excerpt used for the quick check;
#   defaults to detection_snippet(content)
# @return [Boolean] true when both the snippet check and the full-document
#   check pass
def html_content?(content, snippet = detection_snippet(content))
  html_content_snippet?(snippet) && full_html_document?(content)
end
#html_content_snippet?(snippet) ⇒ Boolean
Determine whether a snippet should be treated as HTML.
39 40 41 42 43 44 |
# File 'lib/llm_docs_builder/html_detector.rb', line 39
#
# Determines whether a snippet should be treated as HTML: it must be
# present and non-empty, contain no markdown heading line, and start with
# recognised HTML markup.
#
# @param snippet [String, nil] leading excerpt of the document
# @return [Boolean]
def html_content_snippet?(snippet)
  return false if !snippet || snippet.empty?

  # A markdown heading anywhere in the snippet vetoes HTML treatment.
  !markdown_heading_snippet?(snippet) && html_candidate_snippet?(snippet)
end
#html_with_body_wrapper?(content) ⇒ Boolean
Checks if content has HTML document structure wrapper tags
127 128 129 130 131 |
# File 'lib/llm_docs_builder/html_detector.rb', line 127
#
# Checks whether content contains HTML document structure tags: a doctype
# declaration, an <html> tag or a <body> tag, anywhere in the text and in
# any letter case.
#
# @param content [String, nil] the full raw document
# @return [Boolean] true when one of the wrapper tags is present; false
#   otherwise, including for nil
def html_with_body_wrapper?(content)
  # nil is answered with false rather than NoMethodError, in line with the
  # other predicates of this class.
  return false unless content

  # A single alternation scans the document once instead of up to three times.
  content.match?(/<\s*(?:!DOCTYPE\s+html|html\b|body\b)/i)
end
#markdown_heading_snippet?(snippet) ⇒ Boolean
Detect common markdown heading syntax within the snippet.
147 148 149 150 151 152 153 154 155 156 157 |
# File 'lib/llm_docs_builder/html_detector.rb', line 147
#
# Detects markdown heading syntax within the snippet: any line that, after
# leading whitespace is removed, starts with one or more "#" characters
# followed by whitespace.
#
# @param snippet [String, nil] text to scan line by line
# @return [Boolean] true when at least one such line exists; false
#   otherwise, including for nil
def markdown_heading_snippet?(snippet)
  # nil is answered with false rather than NoMethodError, in line with the
  # other snippet predicates of this class.
  return false unless snippet

  # Blank lines and lines opening with "<" can never match the anchored
  # pattern, so they need no separate skip.
  snippet.each_line.any? { |line| line.lstrip.match?(/\A#+\s+/) }
end
#markdown_like_text?(text) ⇒ Boolean
Checks if text looks like markdown syntax
93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
# File 'lib/llm_docs_builder/html_detector.rb', line 93
#
# Checks whether text looks like markdown syntax. A heading anywhere in the
# text qualifies immediately; otherwise each line is tested against a set of
# line-start patterns.
#
# @param text [String, nil] text to scan line by line
# @return [Boolean] true when any line looks like markdown; false for nil
def markdown_like_text?(text)
  return false if text.nil?
  return true if markdown_heading_snippet?(text)

  text.each_line.any? do |raw_line|
    candidate = raw_line.lstrip
    # Blank lines and lines that open with a tag never count as markdown.
    next false if candidate.empty? || candidate.start_with?('<')

    candidate.match?(/\A[*+-]\s+\S/) ||       # bullet marker followed by content
      candidate.match?(/\A\d+\.\s+\S/) ||     # "1." style marker followed by content
      candidate.match?(/\A>\s+\S/) ||         # ">" followed by content
      candidate.start_with?('```', '~~~') ||  # code fence
      candidate.strip.match?(/\A(?:-{3,}|_{3,}|={3,})\z/) # line made solely of 3+ "-", "_" or "="
  end
end
#meaningful_text?(text) ⇒ Boolean
Checks if text contains meaningful non-whitespace content
82 83 84 85 86 87 |
# File 'lib/llm_docs_builder/html_detector.rb', line 82
#
# Checks whether text contains meaningful non-whitespace content.
#
# @param text [String, nil] text to inspect
# @return [Boolean] true when anything remains after stripping surrounding
#   whitespace; false for nil, empty or whitespace-only text
def meaningful_text?(text)
  return false if text.nil?

  # A stripped string holds a non-whitespace character exactly when it is
  # non-empty, so no regex pass is needed after the strip.
  !text.strip.empty?
end
#table_fragment?(snippet) ⇒ Boolean
Detect whether the snippet represents a table fragment we should preserve.
137 138 139 140 141 |
# File 'lib/llm_docs_builder/html_detector.rb', line 137
#
# Detects whether the snippet represents a table fragment that should be
# preserved, i.e. it begins with a table, thead, tbody, tr, td or th tag
# (case-insensitive, whitespace allowed after "<").
#
# @param snippet [String, nil] text to inspect
# @return [Boolean] false for nil or empty input
def table_fragment?(snippet)
  return false if !snippet || snippet.empty?

  table_opening = /\A<\s*(?:table|thead|tbody|tr|td|th)\b/i
  snippet.match?(table_opening)
end