Class: Parklife::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/parklife/crawler.rb

Constant Summary collapse

# Maps an HTTP response status code to the responder class that handles it.
# Statuses missing from this table are handled by Responder::Unknown (see
# #responder_for_status). The table is frozen so this shared lookup cannot be
# mutated at runtime.
RESPONDERS =
{
  200 => Responder::Ok,
  301 => Responder::Redirect,
  302 => Responder::Redirect,
  304 => Responder::NotModified,
  404 => Responder::NotFound,
}.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(config, routes, cache) ⇒ Crawler

Returns a new instance of Crawler.



25
26
27
28
29
30
31
32
33
# File 'lib/parklife/crawler.rb', line 25

# Sets up the state needed for a crawl.
#
# @param config [Object] read here for app, base, build_dir and nested_index
# @param routes [#to_a] initial routes, converted with to_a into the work queue
# @param cache [Object, nil] stored as given; #get reads ETags from it and
#   uses safe navigation, so nil appears to be allowed
def initialize(config, routes, cache)
  # Collaborators handed in by the caller.
  @config = config
  @cache = cache

  # Crawl bookkeeping: the pending routes, the routes already requested, and
  # the per-status responder instances memoized by #responder_for_status.
  @routes = routes.to_a
  @visited = Set.new
  @responder_for_status = {}

  # Objects built from the configuration.
  @browser = Browser.new(config.app, config.base)
  @build = Build.new(config.build_dir, nested_index: config.nested_index)
end

Instance Attribute Details

#browser ⇒ Object (readonly)

Returns the value of attribute browser.



23
24
25
# File 'lib/parklife/crawler.rb', line 23

# Reader for the Browser created in #initialize from config.app and
# config.base; #get issues its requests through it.
def browser
  @browser
end

#build ⇒ Object (readonly)

Returns the value of attribute build.



23
24
25
# File 'lib/parklife/crawler.rb', line 23

# Reader for the Build created in #initialize from config.build_dir; #start
# calls write_meta on it when the crawl ends.
def build
  @build
end

#cache ⇒ Object (readonly)

Returns the value of attribute cache.



23
24
25
# File 'lib/parklife/crawler.rb', line 23

# Reader for the cache object passed to #initialize; #get asks it for a
# path's ETag using safe navigation, so it may be nil.
def cache
  @cache
end

#config ⇒ Object (readonly)

Returns the value of attribute config.



23
24
25
# File 'lib/parklife/crawler.rb', line 23

# Reader for the configuration object passed to #initialize.
def config
  @config
end

#routes ⇒ Object (readonly)

Returns the value of attribute routes.



23
24
25
# File 'lib/parklife/crawler.rb', line 23

# Reader for the pending routes (routes.to_a from #initialize). #start shifts
# routes off the front and #crawl appends newly discovered ones.
def routes
  @routes
end

#visited ⇒ Object (readonly)

Returns the value of attribute visited.



23
24
25
# File 'lib/parklife/crawler.rb', line 23

# Reader for the Set of routes that #start has already requested.
def visited
  @visited
end

Instance Method Details

#crawl(html) ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/parklife/crawler.rb', line 35

# Queues a crawl route for every link found in the given HTML, skipping
# routes that already count as visited.
#
# @param html [String] markup handed to Utils.scan_for_links
# @return [Object] whatever Utils.scan_for_links returns
def crawl(html)
  Utils.scan_for_links(html) do |link|
    # An app mounted at a subdirectory (say "/foo") answers requests for paths
    # without that subdirectory ("/bar") yet emits links that carry it
    # ("/foo/bar"). The mount prefix is therefore stripped from each link so
    # the resulting route matches what the app itself responds to.
    candidate = Route.new(link.delete_prefix(config.base.path), crawl: true)

    routes << candidate unless visited?(candidate)
  end
end

#get(path) ⇒ Object



54
55
56
57
58
59
60
61
62
# File 'lib/parklife/crawler.rb', line 54

# Requests the path through the browser, making the request conditional when
# the cache holds an ETag for it.
#
# @param path [String] path to request
# @return [Object] whatever browser.get returns
def get(path)
  etag = cache&.etag(path)

  # headers stays nil when there is no cache or no ETag for this path.
  headers = { 'HTTP_IF_NONE_MATCH' => etag } if etag

  browser.get(path, headers: headers)
end

#responder_for_status(status) ⇒ Object



64
65
66
67
68
# File 'lib/parklife/crawler.rb', line 64

# Returns the responder instance for a status, creating it on first use and
# reusing it afterwards.
#
# @param status [Integer] response status used as the RESPONDERS key
# @return [Object] an instance of the matching responder class, or of
#   Responder::Unknown when the status is not in RESPONDERS
def responder_for_status(status)
  memoized = @responder_for_status[status]
  return memoized if memoized

  responder_class = RESPONDERS.fetch(status, Responder::Unknown)
  @responder_for_status[status] = responder_class.new(self)
end

#start ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/parklife/crawler.rb', line 70

# Works through the route queue until it is empty: each unvisited route is
# requested, recorded as visited, reported, and handed to the responder for
# its response status. Responders may append further routes while the loop
# runs (see #crawl).
#
# @return [Object] the result of config.reporter.finish
def start
  while (route = routes.shift)
    unless visited?(route)
      response = get(route.path)
      # Recorded only after the request has returned.
      @visited.add(route)
      config.reporter.visit(route, response)

      responder = responder_for_status(response.status)
      responder.call(route, response)
    end
  end

  config.reporter.finish
ensure
  # Runs even when the crawl raised part-way through.
  build.write_meta unless config.skip_build_meta
end

#visited?(route) ⇒ Boolean

Returns:

  • (Boolean)


84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/parklife/crawler.rb', line 84

# Tells whether the route no longer needs to be requested.
#
# @param route [Object] responds to crawl and with_crawl
# @return [Boolean]
def visited?(route)
  seen_as_is = @visited.include?(route)

  # A crawl=true route only counts once it has itself been crawled; an earlier
  # visit by its non-crawl twin is not enough, because the page still has to
  # be fetched again in order to be crawled.
  return seen_as_is if route.crawl

  # A crawl=false route counts when it was visited in either form.
  seen_as_is || @visited.include?(route.with_crawl)
end