Module: Legion::CLI::Chat::WebFetch

Defined in:
lib/legion/cli/chat/web_fetch.rb

Defined Under Namespace

Classes: FetchError

Constant Summary collapse

MAX_BODY = 1_048_576 # 1 MB
MAX_REDIRECTS = 5
TIMEOUT = 15
CONTEXT_LIMIT = 12_000 # chars injected into conversation

Class Method Summary collapse

Class Method Details

.clean_whitespace(text) ⇒ Object



138
139
140
141
142
143
144
145
146
# File 'lib/legion/cli/chat/web_fetch.rb', line 138

# Decodes a small set of HTML entities and normalizes whitespace.
#
# Entities are decoded in a single pass so that an escaped ampersand is
# only unescaped once (e.g. "&amp;lt;" becomes the literal text "&lt;",
# not "<").
#
# @param text [String] text that may contain HTML entities
# @return [String] a new string with entities decoded, runs of three or
#   more newlines collapsed to two, runs of spaces collapsed to one, and
#   leading/trailing whitespace removed
def clean_whitespace(text)
  entities = {
    '&nbsp;' => ' ',
    '&amp;' => '&',
    '&lt;' => '<',
    '&gt;' => '>',
    '&quot;' => '"',
    '&#39;' => "'"
  }

  # A hash replacement looks up each match as a key; every string the
  # pattern can match is present in the table above.
  decoded = text.gsub(/&(?:nbsp|amp|lt|gt|quot|#39);/, entities)
  decoded.gsub(/\n{3,}/, "\n\n").gsub(/ +/, ' ').strip
end

.convert_blocks!(text) ⇒ Object



125
126
127
128
129
130
131
132
# File 'lib/legion/cli/chat/web_fetch.rb', line 125

# Rewrites block-level HTML elements in +text+ (in place) as Markdown.
#
# Handles <pre>, <blockquote>, <p>, <br> and <hr>. All tag names are
# matched case-insensitively and with a word boundary after the name, so
# that e.g. <path> or <param> is not mistaken for <p>. <br> and <hr> may
# carry attributes and an optional self-closing slash.
#
# @param text [String] HTML fragment, mutated in place
# @return [String, nil] result of the final gsub! (nil when no <hr> was replaced)
def convert_blocks!(text)
  text.gsub!(%r{<pre\b[^>]*>(.*?)</pre>}mi, "\n```\n\\1\n```\n")
  text.gsub!(%r{<blockquote\b[^>]*>(.*?)</blockquote>}mi, "\n> \\1\n")
  text.gsub!(/<p\b[^>]*>/mi, "\n\n")
  text.gsub!(%r{</p>}mi, "\n")
  text.gsub!(/<br\b[^>]*>/i, "\n")
  text.gsub!(/<hr\b[^>]*>/i, "\n---\n")
end

.convert_formatting!(text) ⇒ Object



119
120
121
122
123
# File 'lib/legion/cli/chat/web_fetch.rb', line 119

# Rewrites inline formatting elements in +text+ (in place) as Markdown:
# <b>/<strong> become **bold**, <i>/<em> become *italic*, <code> becomes `code`.
#
# A word boundary is required after each tag name so that longer tag names
# sharing a prefix (<br>, <body>, <img>, <input>, <embed>, ...) are not
# treated as opening formatting tags.
#
# @param text [String] HTML fragment, mutated in place
# @return [String, nil] result of the final gsub! (nil when no <code> was replaced)
def convert_formatting!(text)
  text.gsub!(%r{<(b|strong)\b[^>]*>(.*?)</\1>}mi, '**\\2**')
  text.gsub!(%r{<(i|em)\b[^>]*>(.*?)</\1>}mi, '*\\2*')
  text.gsub!(%r{<code\b[^>]*>(.*?)</code>}mi, '`\\1`')
end

.convert_headings!(text) ⇒ Object



103
104
105
106
107
108
# File 'lib/legion/cli/chat/web_fetch.rb', line 103

# Rewrites <h1>..<h6> elements in +text+ (in place) as Markdown headings,
# one level at a time, using as many '#' characters as the heading level.
#
# @param text [String] HTML fragment, mutated in place
# @return [Range] the range of heading levels that was iterated
def convert_headings!(text)
  (1..6).each do |level|
    tag = "h#{level}"
    pattern = %r{<#{tag}[^>]*>(.*?)</#{tag}>}mi
    text.gsub!(pattern, "\n#{'#' * level} \\1\n")
  end
end

.convert_links!(text) ⇒ Object



110
111
112
# File 'lib/legion/cli/chat/web_fetch.rb', line 110

# Rewrites <a href="...">label</a> elements in +text+ (in place) as
# Markdown links of the form [label](url).
#
# The tag name must be exactly "a" (so <area>, <abbr>, ... are ignored),
# the attribute must be named exactly "href" (so data-href is ignored),
# and the URL is read up to the matching closing quote, which lets a
# double-quoted URL contain an apostrophe and vice versa.
#
# @param text [String] HTML fragment, mutated in place
# @return [String, nil] result of gsub! (nil when no link was replaced)
def convert_links!(text)
  # Group 1 holds a double-quoted URL, group 2 a single-quoted one; only one
  # of them participates in a match and the other substitutes as empty.
  link = %r{<a\b[^>]*?\shref\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>(.*?)</a>}mi
  text.gsub!(link, '[\\3](\\1\\2)')
end

.convert_lists!(text) ⇒ Object



114
115
116
117
# File 'lib/legion/cli/chat/web_fetch.rb', line 114

# Rewrites list markup in +text+ (in place) as Markdown bullets: each
# <li> becomes a "- " line and <ul>/<ol> open/close tags become newlines.
#
# A word boundary is required after the tag name so that <link ...> is
# not mistaken for an opening <li>.
#
# @param text [String] HTML fragment, mutated in place
# @return [String, nil] result of the final gsub! (nil when no list tag was replaced)
def convert_lists!(text)
  text.gsub!(%r{<li\b[^>]*>(.*?)</li>}mi, "\n- \\1")
  text.gsub!(%r{</?[ou]l\b[^>]*>}mi, "\n")
end

.fetch(url) ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/legion/cli/chat/web_fetch.rb', line 20

# Fetches +url+ and returns its content as text.
#
# The URL is normalized with parse_uri and retrieved with follow_redirects.
# A body whose content type is HTML is converted with html_to_markdown;
# any other body is used as-is. The result is stripped and cut to
# CONTEXT_LIMIT characters by truncate.
#
# @param url [String] the address to fetch
# @return [String] the (possibly truncated) page text
def fetch(url)
  content, content_type = follow_redirects(parse_uri(url))
  content = html_to_markdown(content) if html?(content_type)

  truncate(content.strip, CONTEXT_LIMIT)
end

.follow_redirects(uri, limit = MAX_REDIRECTS) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/legion/cli/chat/web_fetch.rb', line 43

# Issues a GET request for +uri+ and follows HTTP redirects.
#
# @param uri [URI::HTTP] the address to request
# @param limit [Integer] number of requests still allowed; each redirect
#   hop recurses with limit - 1
# @return [Array] two elements: the response body as a UTF-8 tagged String
#   ('' when the response has no body) and the Content-Type header value
#   (nil when absent)
# @raise [FetchError] when the redirect limit is exhausted, a redirect has a
#   missing, malformed or non-HTTP(S) Location, the body exceeds MAX_BODY
#   bytes, the status is neither success nor redirect, or the connection
#   fails, times out or has an SSL error
def follow_redirects(uri, limit = MAX_REDIRECTS)
  raise FetchError, 'Too many redirects' if limit.zero?

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == 'https')
  http.open_timeout = TIMEOUT
  http.read_timeout = TIMEOUT

  request = Net::HTTP::Get.new(uri.request_uri)
  request['User-Agent'] = 'LegionIO/1.0 (CLI web fetch)'
  request['Accept']     = 'text/html, text/plain, application/json'

  response = http.request(request)

  case response
  when Net::HTTPRedirection
    # Not every 3xx response carries a Location header (e.g. 304).
    location = response['location'].to_s.strip
    raise FetchError, "HTTP #{response.code}: redirect without a Location header" if location.empty?

    # URI.join resolves absolute, scheme-relative and path-relative
    # locations against the URL that was just requested.
    new_uri = URI.join(uri, location)
    # Only keep following http(s) targets; anything else cannot be fetched here.
    raise FetchError, "Unsupported redirect target: #{location}" unless new_uri.is_a?(URI::HTTP) && new_uri.host

    follow_redirects(new_uri, limit - 1)
  when Net::HTTPSuccess
    body = response.body&.dup&.force_encoding('UTF-8') || ''
    raise FetchError, "Response too large (#{body.bytesize} bytes)" if body.bytesize > MAX_BODY

    [body, response['content-type']]
  else
    raise FetchError, "HTTP #{response.code}: #{response.message}"
  end
rescue URI::Error => e
  raise FetchError, "Invalid redirect URL: #{e.message}"
rescue SocketError, SystemCallError => e
  # SystemCallError covers Errno::ECONNREFUSED, Errno::EHOSTUNREACH and
  # similar socket-level failures that are not SocketError subclasses.
  raise FetchError, "Connection failed: #{e.message}"
rescue Net::OpenTimeout, Net::ReadTimeout
  raise FetchError, "Request timed out (#{TIMEOUT}s)"
rescue OpenSSL::SSL::SSLError => e
  raise FetchError, "SSL error: #{e.message}"
end

.html?(content_type) ⇒ Boolean

Returns:

  • (Boolean)


79
80
81
# File 'lib/legion/cli/chat/web_fetch.rb', line 79

# Tells whether a Content-Type header value denotes an HTML document.
#
# The comparison ignores letter case, since media type names are
# case-insensitive (e.g. "Text/HTML; charset=UTF-8").
#
# @param content_type [String, nil] raw Content-Type header value
# @return [Boolean] true when the value contains "text/html"; false
#   otherwise, including when it is nil
def html?(content_type)
  content_type.to_s.downcase.include?('text/html')
end

.html_to_markdown(html) ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
# File 'lib/legion/cli/chat/web_fetch.rb', line 83

# Converts an HTML document to Markdown-like text.
#
# Works on a copy of +html+, so the argument is left untouched. The
# in-place conversion steps run in a fixed order, then the result is
# passed through clean_whitespace.
#
# @param html [String] raw HTML
# @return [String] the value returned by clean_whitespace
def html_to_markdown(html)
  markdown = html.dup

  %i[
    strip_invisible!
    convert_headings!
    convert_links!
    convert_lists!
    convert_formatting!
    convert_blocks!
    strip_remaining_tags!
  ].each { |step| send(step, markdown) }

  clean_whitespace(markdown)
end

.parse_uri(url) ⇒ Object



33
34
35
36
37
38
39
40
41
# File 'lib/legion/cli/chat/web_fetch.rb', line 33

# Normalizes a user-supplied address into an HTTP(S) URI.
#
# Surrounding whitespace is ignored. When the address does not already
# start with "http://" or "https://" (in any letter case), "https://" is
# prepended.
#
# @param url [String] address as typed by the user, with or without scheme
# @return [URI::HTTP] the parsed URI (URI::HTTPS for https addresses)
# @raise [FetchError] when the address cannot be parsed, is not an HTTP(S)
#   URI, or has no host
def parse_uri(url)
  candidate = url.to_s.strip
  candidate = "https://#{candidate}" unless candidate.match?(%r{\Ahttps?://}i)
  uri = URI.parse(candidate)
  # A URI without a host cannot be connected to, so reject it here.
  raise FetchError, "Invalid URL: #{candidate}" unless uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?

  uri
rescue URI::InvalidURIError
  raise FetchError, "Invalid URL: #{candidate}"
end

.strip_invisible!(text) ⇒ Object



95
96
97
98
99
100
101
# File 'lib/legion/cli/chat/web_fetch.rb', line 95

# Removes, in place, elements whose content should not reach the output:
# <script>, <style>, <nav> and <footer> elements (including everything
# between their tags), followed by HTML comments.
#
# @param text [String] HTML document, mutated in place
# @return [String, nil] result of the final gsub! (nil when no comment was removed)
def strip_invisible!(text)
  %w[script style nav footer].each do |tag|
    text.gsub!(%r{<#{tag}[^>]*>.*?</#{tag}>}mi, '')
  end

  text.gsub!(/<!--.*?-->/m, '')
end

.strip_remaining_tags!(text) ⇒ Object



134
135
136
# File 'lib/legion/cli/chat/web_fetch.rb', line 134

# Deletes, in place, every remaining "<...>" sequence from +text+,
# keeping the content between tags.
#
# @param text [String] HTML fragment, mutated in place
# @return [String, nil] result of gsub! (nil when nothing was removed)
def strip_remaining_tags!(text)
  any_tag = /<[^>]+>/
  text.gsub!(any_tag, '')
end

.truncate(text, limit) ⇒ Object



148
149
150
151
152
# File 'lib/legion/cli/chat/web_fetch.rb', line 148

# Limits +text+ to +limit+ characters.
#
# @param text [String] text to shorten
# @param limit [Integer] maximum number of characters to keep
# @return [String] +text+ itself when it already fits; otherwise its first
#   +limit+ characters followed by a note stating where it was truncated
def truncate(text, limit)
  if text.length > limit
    notice = "\n\n[... truncated at #{limit} characters]"
    text.slice(0, limit) + notice
  else
    text
  end
end