Class: Parklife::Crawler
- Inherits:
-
Object
- Object
- Parklife::Crawler
- Defined in:
- lib/parklife/crawler.rb
Constant Summary collapse
- RESPONDERS =
{ 200 => Responder::Ok, 301 => Responder::Redirect, 302 => Responder::Redirect, 304 => Responder::NotModified, 404 => Responder::NotFound, }
Instance Attribute Summary collapse
-
#browser ⇒ Object
readonly
Returns the value of attribute browser.
-
#build ⇒ Object
readonly
Returns the value of attribute build.
-
#cache ⇒ Object
readonly
Returns the value of attribute cache.
-
#config ⇒ Object
readonly
Returns the value of attribute config.
-
#routes ⇒ Object
readonly
Returns the value of attribute routes.
-
#visited ⇒ Object
readonly
Returns the value of attribute visited.
Instance Method Summary collapse
- #crawl(html) ⇒ Object
- #get(path) ⇒ Object
-
#initialize(config, routes, cache) ⇒ Crawler
constructor
A new instance of Crawler.
- #responder_for_status(status) ⇒ Object
- #start ⇒ Object
- #visited?(route) ⇒ Boolean
Constructor Details
#initialize(config, routes, cache) ⇒ Crawler
Returns a new instance of Crawler.
25 26 27 28 29 30 31 32 33 |
# File 'lib/parklife/crawler.rb', line 25
#
# Sets up the crawler's collaborators and its bookkeeping state.
#
# @param config [Object] must respond to +app+, +base+, +build_dir+ and
#   +nested_index+ (all are read here).
# @param routes [#to_a] initial routes; copied into an Array via +to_a+.
# @param cache [Object, nil] stored as given; +get+ calls it with safe
#   navigation, so nil appears to be an accepted value.
def initialize(config, routes, cache)
  # Inputs, stored (routes converted to an Array).
  @config = config
  @routes = routes.to_a
  @cache = cache

  # Collaborators built from the configuration.
  @browser = Browser.new(config.app, config.base)
  @build = Build.new(
    config.build_dir,
    nested_index: config.nested_index
  )

  # Bookkeeping: routes already fetched, and responder instances memoized by
  # status code (filled in by responder_for_status).
  @visited = Set.new
  @responder_for_status = {}
end
Instance Attribute Details
#browser ⇒ Object (readonly)
Returns the value of attribute browser.
23 24 25 |
# File 'lib/parklife/crawler.rb', line 23
#
# Reader for the browser created in the constructor from +config.app+ and
# +config.base+.
#
# @return [Object] the value of @browser
def browser
  @browser
end
#build ⇒ Object (readonly)
Returns the value of attribute build.
23 24 25 |
# File 'lib/parklife/crawler.rb', line 23
#
# Reader for the build object created in the constructor from
# +config.build_dir+ and +config.nested_index+.
#
# @return [Object] the value of @build
def build
  @build
end
#cache ⇒ Object (readonly)
Returns the value of attribute cache.
23 24 25 |
# File 'lib/parklife/crawler.rb', line 23
#
# Reader for the cache handed to the constructor. +get+ consults it (when
# present) for a stored ETag.
#
# @return [Object, nil] the value of @cache
def cache
  @cache
end
#config ⇒ Object (readonly)
Returns the value of attribute config.
23 24 25 |
# File 'lib/parklife/crawler.rb', line 23
#
# Reader for the configuration object handed to the constructor.
#
# @return [Object] the value of @config
def config
  @config
end
#routes ⇒ Object (readonly)
Returns the value of attribute routes.
23 24 25 |
# File 'lib/parklife/crawler.rb', line 23
#
# Reader for the pending-route Array. +start+ shifts routes off the front and
# +crawl+ appends newly discovered ones.
#
# @return [Array] the value of @routes
def routes
  @routes
end
#visited ⇒ Object (readonly)
Returns the value of attribute visited.
23 24 25 |
# File 'lib/parklife/crawler.rb', line 23
#
# Reader for the Set of routes that have already been fetched; +start+ adds
# each route to it after requesting it.
#
# @return [Set] the value of @visited
def visited
  @visited
end
Instance Method Details
#crawl(html) ⇒ Object
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/parklife/crawler.rb', line 35
#
# Scans +html+ for links and queues a crawl-enabled route for every link that
# has not already been visited.
#
# @param html [Object] passed straight to +Utils.scan_for_links+
#   (presumably an HTML string - confirm against Utils).
# @return [Object] whatever +Utils.scan_for_links+ returns.
def crawl(html)
  Utils.scan_for_links(html) do |link_path|
    # If the app is mounted at a subdirectory then it responds to paths that
    # *exclude* the subdirectory and generates links that *include* the
    # subdirectory (so if the app is mounted at "/foo" and serving "/bar"
    # then the full path would be "/foo/bar" and a generated link would
    # include the mount path like "/foo/link").
    #
    # Anyway, this mount path prefix must be trimmed from link paths so that
    # correct app routes are created.
    app_path = link_path.delete_prefix(config.base.path)
    candidate = Route.new(app_path, crawl: true)

    # Only queue routes that still need a (crawling) visit.
    routes << candidate unless visited?(candidate)
  end
end
#get(path) ⇒ Object
54 55 56 57 58 59 60 61 62 |
# File 'lib/parklife/crawler.rb', line 54
#
# Requests +path+ through the browser, making the request conditional when
# the cache holds an ETag for that path.
#
# @param path [Object] the path to request; also the cache lookup key.
# @return [Object] whatever +browser.get+ returns.
def get(path)
  # The cache may be nil, hence the safe navigation.
  etag = cache&.etag(path)

  # 'HTTP_IF_NONE_MATCH' is presumably the Rack env spelling of the
  # If-None-Match request header; headers stay nil when there is no ETag.
  headers = etag ? { 'HTTP_IF_NONE_MATCH' => etag } : nil

  browser.get(path, headers: headers)
end
#responder_for_status(status) ⇒ Object
64 65 66 67 68 |
# File 'lib/parklife/crawler.rb', line 64
#
# Returns the responder instance for an HTTP status, creating it on first use
# and reusing it afterwards.
#
# @param status [Object] the response status used as the RESPONDERS key.
# @return [Object] an instance of the class mapped in RESPONDERS, or of
#   +Responder::Unknown+ when the status has no mapping; built with this
#   crawler as its only argument.
def responder_for_status(status)
  @responder_for_status[status] ||= begin
    responder_class = RESPONDERS.fetch(status, Responder::Unknown)
    responder_class.new(self)
  end
end
#start ⇒ Object
70 71 72 73 74 75 76 77 78 79 80 81 82 |
# File 'lib/parklife/crawler.rb', line 70 def start while (route = routes.shift) next if visited?(route) response = get(route.path) @visited << route config.reporter.visit(route, response) responder_for_status(response.status).call(route, response) end config.reporter.finish ensure build. unless config. end |
#visited?(route) ⇒ Boolean
84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/parklife/crawler.rb', line 84
#
# Tells whether +route+ still needs to be fetched.
#
# @param route [Object] must respond to +crawl+ and +with_crawl+ and be
#   usable as a Set member.
# @return [Boolean] true when the route counts as already visited.
def visited?(route)
  # A crawl=true route is only counted as visited when it has already been
  # crawled, if it's been visited by a non-crawl route then it must be
  # visited again so it can be crawled.
  return @visited.include?(route) if route.crawl

  # A crawl=false route is counted as visited whether it was previously
  # visited with either a crawl or non-crawl route.
  @visited.include?(route) || @visited.include?(route.with_crawl)
end