Module: Coelacanth::Robots

Defined in:
lib/coelacanth/robots.rb

Defined Under Namespace

Classes: RULE_STRUCT

Constant Summary collapse

DEFAULT_USER_AGENT =
"CoelacanthBot"

Class Method Summary collapse

Class Method Details

.allowed?(uri, user_agent: user_agent()) ⇒ Boolean

Returns:

  • (Boolean)


12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/coelacanth/robots.rb', line 12

# Decides whether +uri+ may be fetched according to the rules returned by
# +rules_for+.
#
# The group stored under the normalized user agent is used when it holds at
# least one rule; otherwise the wildcard ("*") group is consulted. When no
# rules apply at all the URI is allowed.
#
# Lookups use +Hash#fetch+ rather than +Hash#[]+: the table built by
# +parse_robots+ has a default block that inserts an empty array on every
# miss, so plain indexing would add an entry to the cached table for each
# unlisted agent that is queried.
#
# @param uri [Object] the target; handed to +rules_for+ and +normalize_path+
# @param user_agent [String] agent name to match; defaults to +user_agent()+
# @return [Boolean] true when fetching is permitted
def allowed?(uri, user_agent: user_agent())
  rules = rules_for(uri)
  return true if rules.empty?

  group = rules.fetch(normalize_agent(user_agent), nil)
  group = rules.fetch("*", nil) if group.nil? || group.empty?
  return true if group.nil? || group.empty?

  evaluate(group, normalize_path(uri))
end

.build_rule(type:, value:) ⇒ Object



104
105
106
107
108
109
110
111
# File 'lib/coelacanth/robots.rb', line 104

# Compiles one Allow/Disallow value into a rule object.
#
# The value is prefixed with "/" when it does not already start with one.
# In the resulting regex, "*" matches any run of characters and a "$" in the
# final position anchors the match at the end of the path. A "$" anywhere
# else is matched literally; translating it to an end anchor would produce a
# regex that can never match. Consecutive "*" characters are collapsed so the
# regex does not contain redundant ".*.*" sequences.
#
# @param type [Symbol] rule kind, stored unchanged on the rule
# @param value [String] the raw path pattern from the directive
# @return [RULE_STRUCT] rule carrying type, pattern, regex, and the pattern
#   length (used by +evaluate+ to rank competing matches)
def build_rule(type:, value:)
  pattern = value.start_with?("/") ? value : "/#{value}"

  anchored = pattern.end_with?("$")
  matchable = anchored ? pattern.chomp("$") : pattern

  # Escape each literal segment separately; the "*" separators become ".*".
  source = matchable.squeeze("*").split("*", -1).map { |literal| Regexp.escape(literal) }.join(".*")
  source += "\\z" if anchored

  regex = Regexp.new("\\A#{source}")
  RULE_STRUCT.new(type: type, pattern: pattern, regex: regex, length: pattern.length)
end

.cache_key(uri) ⇒ Object



139
140
141
142
143
144
# File 'lib/coelacanth/robots.rb', line 139

# Builds the key under which a host's robots rules are cached.
#
# The key has the form "scheme://host" with ":port" appended only when the
# port differs from the scheme's default. The host is lower-cased because
# host names are case-insensitive; without that, "EXAMPLE.com" and
# "example.com" would occupy separate cache slots.
#
# @param uri [URI::Generic] any object responding to scheme, host and port
# @return [String] the cache key
def cache_key(uri)
  port = uri.port
  port_part = port && port != default_port_for(uri.scheme) ? ":#{port}" : ""
  "#{uri.scheme}://#{uri.host.to_s.downcase}#{port_part}"
end

.clear_cache! ⇒ Object



33
34
35
# File 'lib/coelacanth/robots.rb', line 33

# Empties the robots rules cache in place.
#
# @return [Hash] the (now empty) cache object returned by +robots_cache+
def clear_cache!
  cache = robots_cache
  cache.clear
end

.default_port_for(scheme) ⇒ Object



146
147
148
# File 'lib/coelacanth/robots.rb', line 146

# Returns the default port for a scheme: the HTTPS default for "https" and
# the HTTP default for every other value.
#
# @param scheme [String, nil] URI scheme
# @return [Integer] default port number
def default_port_for(scheme)
  return URI::HTTPS.default_port if scheme == "https"

  URI::HTTP.default_port
end

.evaluate(rules, path) ⇒ Object



113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/coelacanth/robots.rb', line 113

# Applies longest-match precedence to the rules whose regex matches +path+.
#
# The path is allowed when no disallow rule matches, or when the longest
# matching allow rule is at least as long as the longest matching disallow
# rule (so a tie goes to allow).
#
# @param rules [Array] objects responding to +type+, +regex+ and +length+
# @param path [String] path (optionally with query) to test
# @return [Boolean] true when the path is allowed
def evaluate(rules, path)
  hits = rules.select { |candidate| candidate.regex.match?(path) }

  allow_length = hits.select { |hit| hit.type == :allow }.map(&:length).max
  disallow_length = hits.select { |hit| hit.type == :disallow }.map(&:length).max

  # Nothing forbids the path (this also covers the "no match at all" case).
  return true if disallow_length.nil?

  !allow_length.nil? && allow_length >= disallow_length
end

.fetch_rules(uri) ⇒ Object



41
42
43
44
45
46
47
48
# File 'lib/coelacanth/robots.rb', line 41

# Downloads and parses the robots.txt that governs +uri+.
#
# Any response that is not a Net::HTTPSuccess, and any rescued error
# (Coelacanth::TimeoutError or StandardError), yields an empty rule table.
#
# @param uri [Object] target URI; passed to +robots_uri_for+
# @return [Hash] result of +parse_robots+, or {} on failure
def fetch_rules(uri)
  begin
    reply = Coelacanth::HTTP.raw_get_response(robots_uri_for(uri))
    reply.is_a?(Net::HTTPSuccess) ? parse_robots(reply.body.to_s) : {}
  rescue Coelacanth::TimeoutError, StandardError
    {}
  end
end

.normalize_agent(agent) ⇒ Object



135
136
137
# File 'lib/coelacanth/robots.rb', line 135

# Canonicalizes a user-agent name for use as a rule-table key: converted to
# a string, surrounding whitespace removed, lower-cased.
#
# @param agent [#to_s] raw agent name
# @return [String] normalized agent name
def normalize_agent(agent)
  name = agent.to_s
  trimmed = name.strip
  trimmed.downcase
end

.normalize_path(uri) ⇒ Object



126
127
128
129
130
131
132
133
# File 'lib/coelacanth/robots.rb', line 126

# Produces the string that rules are matched against: the URI path ("/" when
# the path is nil or empty) followed by "?query" when a non-empty query is
# present.
#
# @param uri [URI::Generic] any object responding to +path+ and +query+
# @return [String] path with optional query string
def normalize_path(uri)
  raw_path = uri.path
  base = raw_path.nil? || raw_path.empty? ? "/" : raw_path

  query_string = uri.query
  if query_string.nil? || query_string.empty?
    base
  else
    [base, query_string].join("?")
  end
end

.parse_robots(body) ⇒ Object



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/coelacanth/robots.rb', line 58

# Parses a robots.txt body into a table of rules keyed by normalized agent.
#
# Parsing rules, as implemented here:
# * Comments are stripped by +sanitize_line+; a line that is empty afterwards
#   ends the current group.
# * Lines without a ":" are ignored.
# * Consecutive "User-agent" lines accumulate into one group; a "User-agent"
#   line following any other directive starts a new group.
# * "Allow"/"Disallow" values become rules (via +build_rule+) for every agent
#   of the current group, or for "*" when no agent has been named.
# * An empty "Disallow:" means "nothing is disallowed". It is recorded as an
#   allow-everything rule of length 0, so the group is registered for its
#   agents (and therefore not replaced by the "*" group during lookup) while
#   every real Disallow rule in the same group still outranks it.
# * An empty "Allow:" is ignored.
# * Other fields only affect group boundaries.
#
# @param body [String] raw robots.txt content
# @return [Hash{String => Array}] rules per agent; the hash creates an empty
#   array on access to a missing key
def parse_robots(body)
  rules = Hash.new { |hash, key| hash[key] = [] }
  current_agents = []
  last_directive = nil
  unrestricted = nil

  body.each_line do |line|
    sanitized = sanitize_line(line)
    if sanitized.empty?
      current_agents = []
      last_directive = nil
      next
    end

    field, value = sanitized.split(":", 2)
    next if value.nil?

    field = field.strip.downcase
    value = value.strip

    case field
    when "user-agent"
      current_agents = [] unless last_directive == :user_agent
      agent = normalize_agent(value)
      current_agents << agent unless current_agents.include?(agent)
      last_directive = :user_agent
    when "allow", "disallow"
      type = field == "allow" ? :allow : :disallow
      last_directive = type
      next if value.empty? && type == :allow

      current_agents = ["*"] if current_agents.empty?
      rule =
        if value.empty?
          # Zero-length allow-all marker for an empty Disallow.
          # NOTE(review): assumes RULE_STRUCT accepts the same keywords that
          # build_rule passes to it.
          unrestricted ||= RULE_STRUCT.new(type: :allow, pattern: "", regex: /\A/, length: 0)
        else
          build_rule(type: type, value: value)
        end
      current_agents.each { |name| rules[name] << rule }
    else
      # Only group boundaries care about unknown fields, so avoid turning
      # arbitrary remote text into symbols.
      last_directive = :other
    end
  end

  rules
end

.robots_cache ⇒ Object



37
38
39
# File 'lib/coelacanth/robots.rb', line 37

# Lazily created store for fetched robots rules, held in @robots_cache.
#
# @return [Hash] the cache; the same object on every call
def robots_cache
  @robots_cache = {} unless @robots_cache
  @robots_cache
end

.robots_uri_for(uri) ⇒ Object



50
51
52
53
54
55
56
# File 'lib/coelacanth/robots.rb', line 50

# Builds the robots.txt URI for the host of +uri+.
#
# URI::HTTPS is used for the "https" scheme and URI::HTTP otherwise. The
# port is passed along only when it differs from the scheme's default.
#
# @param uri [URI::Generic] any object responding to scheme, host and port
# @return [URI::HTTP, URI::HTTPS] URI whose path is "/robots.txt"
def robots_uri_for(uri)
  builder = uri.scheme == "https" ? URI::HTTPS : URI::HTTP
  explicit_port = uri.port == default_port_for(uri.scheme) ? nil : uri.port

  builder.build(host: uri.host, path: "/robots.txt", port: explicit_port)
end

.rules_for(uri) ⇒ Object



29
30
31
# File 'lib/coelacanth/robots.rb', line 29

# Returns the rule table for the host of +uri+, fetching it on first use and
# serving it from +robots_cache+ afterwards.
#
# @param uri [Object] target URI; passed to +cache_key+ and +fetch_rules+
# @return [Hash] cached or freshly fetched rules
def rules_for(uri)
  cache = robots_cache
  key = cache_key(uri)

  cached = cache[key]
  return cached if cached

  cache[key] = fetch_rules(uri)
end

.sanitize_line(line) ⇒ Object



100
101
102
# File 'lib/coelacanth/robots.rb', line 100

# Strips a trailing "#" comment and surrounding whitespace from one line.
#
# @param line [String] raw line from a robots.txt body
# @return [String] the remaining content; "" when nothing is left
def sanitize_line(line)
  content, _comment = line.split("#", 2)
  content.to_s.strip
end

.user_agent ⇒ Object



25
26
27
# File 'lib/coelacanth/robots.rb', line 25

# User-agent name used for robots checks: the value of the
# COELACANTH_HTTP_USER_AGENT environment variable when it is set, otherwise
# DEFAULT_USER_AGENT.
#
# @return [String] configured or default agent name
def user_agent
  ENV.fetch("COELACANTH_HTTP_USER_AGENT") { DEFAULT_USER_AGENT }
end