Module: Coelacanth::Robots
- Defined in:
- lib/coelacanth/robots.rb
Defined Under Namespace
Classes: RULE_STRUCT
Constant Summary collapse
- DEFAULT_USER_AGENT =
"CoelacanthBot"
Class Method Summary collapse
- .allowed?(uri, user_agent: user_agent()) ⇒ Boolean
- .build_rule(type:, value:) ⇒ Object
- .cache_key(uri) ⇒ Object
- .clear_cache! ⇒ Object
- .default_port_for(scheme) ⇒ Object
- .evaluate(rules, path) ⇒ Object
- .fetch_rules(uri) ⇒ Object
- .normalize_agent(agent) ⇒ Object
- .normalize_path(uri) ⇒ Object
- .parse_robots(body) ⇒ Object
- .robots_cache ⇒ Object
- .robots_uri_for(uri) ⇒ Object
- .rules_for(uri) ⇒ Object
- .sanitize_line(line) ⇒ Object
- .user_agent ⇒ Object
Class Method Details
.allowed?(uri, user_agent: user_agent()) ⇒ Boolean
12 13 14 15 16 17 18 19 20 21 22 23 |
# File 'lib/coelacanth/robots.rb', line 12
#
# Reports whether the robots rules cached/fetched for +uri+ permit access.
#
# The rule group stored under the normalized +user_agent+ is consulted first;
# when that group is missing or empty the wildcard group ("*") is tried.
# If neither group holds any rules, access is permitted.
#
# @param uri [URI] the URI whose path (and query) is being checked
# @param user_agent [String] agent name; defaults to the value of +user_agent()+
# @return [Boolean] true when access is permitted
def allowed?(uri, user_agent: user_agent())
  rules = rules_for(uri)
  return true if rules.empty?

  # Try the agent-specific group first, then the wildcard group.
  [normalize_agent(user_agent), "*"].each do |group_key|
    group = rules[group_key]
    next if group.nil? || group.empty?

    return evaluate(group, normalize_path(uri))
  end

  true
end
.build_rule(type:, value:) ⇒ Object
104 105 106 107 108 109 110 111 |
# File 'lib/coelacanth/robots.rb', line 104
#
# Builds a RULE_STRUCT for a single Allow/Disallow value.
#
# The value is prefixed with "/" when it does not already start with one.
# It is then compiled into a regex anchored at the start of the path:
# * "*" matches any run of characters (consecutive "*" collapse into a single
#   wildcard, which matches the same strings with less backtracking);
# * a "$" in the final position anchors the rule to the end of the path;
# * a "$" anywhere else is matched literally. Previously every "$" became an
#   end anchor, so a rule with "$" in the middle could never match.
#
# @param type [Symbol] rule type as supplied by the caller (e.g. :allow, :disallow)
# @param value [String] the raw path pattern from the robots file
# @return [RULE_STRUCT] rule with type, pattern, regex and pattern length
def build_rule(type:, value:)
  pattern = value.start_with?("/") ? value : "/#{value}"

  # Only a trailing "$" carries the end-of-path meaning.
  anchored = pattern.end_with?("$")
  matchable = anchored ? pattern[0...-1] : pattern

  # Regexp.escape turns "*" into "\*"; swap each run of those for ".*".
  source = Regexp.escape(matchable).gsub(/(?:\\\*)+/, ".*")
  tail = anchored ? "\\z" : ""
  regex = Regexp.new("\\A#{source}#{tail}")

  RULE_STRUCT.new(type: type, pattern: pattern, regex: regex, length: pattern.length)
end
.cache_key(uri) ⇒ Object
139 140 141 142 143 144 |
# File 'lib/coelacanth/robots.rb', line 139
#
# Builds the cache key for +uri+: "scheme://host", followed by ":port" only
# when the port is present and differs from the scheme's default port.
#
# @param uri [URI] URI providing scheme, host and port
# @return [String] the origin string used as the cache key
def cache_key(uri)
  explicit_port = uri.port
  implied_port = default_port_for(uri.scheme)
  origin = "#{uri.scheme}://#{uri.host}"

  if explicit_port && explicit_port != implied_port
    "#{origin}:#{explicit_port}"
  else
    origin
  end
end
.clear_cache! ⇒ Object
33 34 35 |
# File 'lib/coelacanth/robots.rb', line 33
#
# Empties the robots rules cache.
#
# @return [Object] whatever +clear+ returns on the cache object
def clear_cache!
  cache = robots_cache
  cache.clear
end
.default_port_for(scheme) ⇒ Object
146 147 148 |
# File 'lib/coelacanth/robots.rb', line 146
#
# Returns the default port for +scheme+: the HTTPS default when the scheme is
# exactly "https", otherwise the HTTP default.
#
# @param scheme [String, nil] URI scheme
# @return [Integer] default port number
def default_port_for(scheme)
  uri_class = scheme == "https" ? URI::HTTPS : URI::HTTP
  uri_class.default_port
end
.evaluate(rules, path) ⇒ Object
113 114 115 116 117 118 119 120 121 122 123 124 |
# File 'lib/coelacanth/robots.rb', line 113
#
# Applies a rule group to +path+.
#
# Only rules whose regex matches the path take part. The path is allowed when
# no disallow rule matches, or when the longest matching allow rule is at
# least as long as the longest matching disallow rule (ties favour allow).
#
# @param rules [Array] rule objects responding to +regex+, +type+ and +length+
# @param path [String] the path (optionally with query) to test
# @return [Boolean] true when the path is allowed
def evaluate(rules, path)
  applicable = rules.select { |rule| rule.regex.match?(path) }

  # Longest matching length per rule type; nil when no rule of that type matched.
  allow_length, disallow_length = %i[allow disallow].map do |kind|
    applicable.select { |rule| rule.type == kind }.map(&:length).max
  end

  return true if disallow_length.nil?

  !allow_length.nil? && allow_length >= disallow_length
end
.fetch_rules(uri) ⇒ Object
41 42 43 44 45 46 47 48 |
# File 'lib/coelacanth/robots.rb', line 41
#
# Downloads and parses the robots file for the origin of +uri+.
#
# An empty hash is returned when the response is not a Net::HTTPSuccess or
# when any rescued error (Coelacanth::TimeoutError or any StandardError) is
# raised along the way.
#
# @param uri [URI] any URI on the target origin
# @return [Hash] parsed rules, or an empty hash on failure
def fetch_rules(uri)
  response = Coelacanth::HTTP.raw_get_response(robots_uri_for(uri))

  if response.is_a?(Net::HTTPSuccess)
    parse_robots(response.body.to_s)
  else
    {}
  end
rescue Coelacanth::TimeoutError, StandardError
  {}
end
.normalize_agent(agent) ⇒ Object
135 136 137 |
# File 'lib/coelacanth/robots.rb', line 135
#
# Canonicalizes an agent name: converts it to a string, strips surrounding
# whitespace and lowercases it.
#
# @param agent [#to_s] raw agent name (nil becomes "")
# @return [String] normalized agent name
def normalize_agent(agent)
  text = agent.to_s
  trimmed = text.strip
  trimmed.downcase
end
.normalize_path(uri) ⇒ Object
126 127 128 129 130 131 132 133 |
# File 'lib/coelacanth/robots.rb', line 126
#
# Produces the string that rules are matched against: the URI path ("/" when
# the path is nil or empty), followed by "?query" when a non-empty query
# exists.
#
# @param uri [URI] object responding to +path+ and +query+
# @return [String] path with optional query string
def normalize_path(uri)
  path = uri.path.to_s
  path = "/" if path.empty?

  query = uri.query.to_s
  query.empty? ? path : "#{path}?#{query}"
end
.parse_robots(body) ⇒ Object
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
# File 'lib/coelacanth/robots.rb', line 58
#
# Parses the text of a robots file into a hash of rule lists keyed by
# normalized agent name.
#
# Parsing rules, as implemented here:
# * comments are removed and lines trimmed via +sanitize_line+;
# * a blank line ends the current agent group;
# * consecutive "User-agent" lines accumulate into one group; a "User-agent"
#   line that follows any other directive starts a new group;
# * "Allow"/"Disallow" lines with a non-empty value add a rule to every agent
#   in the current group, or to "*" when no group is open;
# * lines without a ":" are skipped; unknown fields only break a run of
#   "User-agent" lines.
#
# Unknown field names are recorded with a fixed :other marker rather than by
# converting the field text to a symbol. The old conversion let an unknown
# field literally named "user_agent" pass for a real User-agent directive, so
# the following group was merged into the previous one. It also created
# symbols from untrusted input.
#
# @param body [String] raw robots file contents
# @return [Hash{String => Array}] rules per agent; the hash auto-creates an
#   empty array for any key that is read
def parse_robots(body)
  rules = Hash.new { |hash, key| hash[key] = [] }
  current_agents = []
  last_directive = nil

  body.each_line do |line|
    sanitized = sanitize_line(line)

    if sanitized.empty?
      # Blank (or comment-only) line: close the current group.
      current_agents = []
      last_directive = nil
      next
    end

    field, value = sanitized.split(":", 2)
    next if value.nil?

    field = field.strip.downcase
    value = value.strip

    case field
    when "user-agent"
      # A User-agent line after a non-User-agent directive opens a new group.
      current_agents = [] unless last_directive == :user_agent
      agent = normalize_agent(value)
      current_agents << agent unless current_agents.include?(agent)
      last_directive = :user_agent
    when "allow", "disallow"
      # Safe to symbolize: field is one of the two literals above.
      last_directive = field.to_sym
      next if value.empty?

      current_agents = ["*"] if current_agents.empty?
      rule = build_rule(type: last_directive, value: value)
      current_agents.each { |agent| rules[agent] << rule }
    else
      # Any other field only needs to be "not a User-agent line".
      last_directive = :other
    end
  end

  rules
end
.robots_cache ⇒ Object
37 38 39 |
# File 'lib/coelacanth/robots.rb', line 37
#
# Returns the memoized cache hash, creating an empty one on first use.
#
# @return [Hash] the cache stored in @robots_cache
def robots_cache
  return @robots_cache if @robots_cache

  @robots_cache = {}
end
.robots_uri_for(uri) ⇒ Object
50 51 52 53 54 55 56 |
# File 'lib/coelacanth/robots.rb', line 50
#
# Builds the "/robots.txt" URI for the origin of +uri+.
#
# URI::HTTPS is used when the scheme is "https", URI::HTTP otherwise. The
# port is passed as nil when it equals the scheme's default port.
#
# @param uri [URI] any URI on the target origin
# @return [URI::HTTP, URI::HTTPS] the robots file URI
def robots_uri_for(uri)
  builder = uri.scheme == "https" ? URI::HTTPS : URI::HTTP
  explicit_port = uri.port == default_port_for(uri.scheme) ? nil : uri.port

  builder.build(host: uri.host, path: "/robots.txt", port: explicit_port)
end
.rules_for(uri) ⇒ Object
29 30 31 |
# File 'lib/coelacanth/robots.rb', line 29
#
# Returns the rules for the origin of +uri+, fetching and caching them when
# the cache holds no truthy entry under the origin's cache key.
#
# @param uri [URI] any URI on the target origin
# @return [Hash] cached or freshly fetched rules
def rules_for(uri)
  cache = robots_cache
  key = cache_key(uri)

  cached = cache[key]
  return cached if cached

  cache[key] = fetch_rules(uri)
end
.sanitize_line(line) ⇒ Object
100 101 102 |
# File 'lib/coelacanth/robots.rb', line 100
#
# Drops everything from the first "#" onward and trims surrounding
# whitespace from what remains.
#
# @param line [String] one raw line of a robots file
# @return [String] the trimmed content, or "" for blank/comment-only lines
def sanitize_line(line)
  content, _comment = line.split("#", 2)
  content.to_s.strip
end
.user_agent ⇒ Object
25 26 27 |
# File 'lib/coelacanth/robots.rb', line 25
#
# Returns the agent name used for rule lookups: the value of the
# COELACANTH_HTTP_USER_AGENT environment variable when it is set, otherwise
# DEFAULT_USER_AGENT.
#
# @return [String] the effective agent name
def user_agent
  fallback = DEFAULT_USER_AGENT
  configured = ENV["COELACANTH_HTTP_USER_AGENT"]

  configured.nil? ? fallback : configured
end