Class: Parklife::Crawler

Inherits:

Object

Object → Parklife::Crawler

show all

Defined in: lib/parklife/crawler.rb

Constant Summary collapse

# Maps an HTTP status code to the responder class that handles it. Statuses
# that are not listed here fall back to Responder::Unknown in
# #responder_for_status.
#
# Frozen so the shared lookup table cannot be mutated at runtime.
RESPONDERS = {
  200 => Responder::Ok,
  301 => Responder::Redirect,
  302 => Responder::Redirect,
  304 => Responder::NotModified,
  404 => Responder::NotFound,
}.freeze

Instance Attribute Summary collapse

#browser ⇒ Object readonly

Returns the value of attribute browser.
#build ⇒ Object readonly

Returns the value of attribute build.
#cache ⇒ Object readonly

Returns the value of attribute cache.
#config ⇒ Object readonly

Returns the value of attribute config.
#routes ⇒ Object readonly

Returns the value of attribute routes.
#visited ⇒ Object readonly

Returns the value of attribute visited.

Instance Method Summary collapse

#crawl(html) ⇒ Object
#get(path) ⇒ Object
#initialize(config, routes, cache) ⇒ Crawler constructor

A new instance of Crawler.
#responder_for_status(status) ⇒ Object
#start ⇒ Object
#visited?(route) ⇒ Boolean

Constructor Details

#initialize(config, routes, cache) ⇒ `Crawler`

Returns a new instance of Crawler.

# File 'lib/parklife/crawler.rb', line 25

# Sets up a crawler and the collaborators it drives.
#
# @param config [Object] settings object; this constructor reads #app, #base,
#   #build_dir and #nested_index from it
# @param routes [#to_a] initial routes, stored in array form
# @param cache [Object, nil] stored as-is (#get tolerates nil via safe
#   navigation)
def initialize(config, routes, cache)
  # Plain inputs.
  @config = config
  @cache = cache
  @routes = routes.to_a

  # Collaborators derived from the configuration.
  @browser = Browser.new(config.app, config.base)
  @build = Build.new(config.build_dir, nested_index: config.nested_index)

  # Crawl bookkeeping: routes already requested, and one memoised responder
  # instance per status code (see #responder_for_status).
  @visited = Set.new
  @responder_for_status = {}
end

Instance Attribute Details

#browser ⇒ `Object` (readonly)

Returns the value of attribute browser.



23
24
25

# File 'lib/parklife/crawler.rb', line 23

# Reader for @browser, which #initialize assigns a Browser built from
# config.app and config.base. #get issues its requests through it.
def browser
  @browser
end

#build ⇒ `Object` (readonly)

Returns the value of attribute build.



23
24
25

# File 'lib/parklife/crawler.rb', line 23

# Reader for @build, which #initialize assigns a Build created from
# config.build_dir and config.nested_index. #start calls #write_meta on it
# unless config.skip_build_meta is set.
def build
  @build
end

#cache ⇒ `Object` (readonly)

Returns the value of attribute cache.



23
24
25

# File 'lib/parklife/crawler.rb', line 23

# Reader for @cache, the cache argument given to #initialize. #get asks it
# for a path's etag using safe navigation, so nil is an accepted value.
def cache
  @cache
end

#config ⇒ `Object` (readonly)

Returns the value of attribute config.



23
24
25

# File 'lib/parklife/crawler.rb', line 23

# Reader for @config, the config argument given to #initialize.
def config
  @config
end

#routes ⇒ `Object` (readonly)

Returns the value of attribute routes.



23
24
25

# File 'lib/parklife/crawler.rb', line 23

# Reader for @routes, the array form (routes.to_a) of the routes given to
# #initialize. It is used as a work queue: #start shifts routes off the front
# and #crawl appends newly discovered ones.
def routes
  @routes
end

#visited ⇒ `Object` (readonly)

Returns the value of attribute visited.



23
24
25

# File 'lib/parklife/crawler.rb', line 23

# Reader for @visited, the Set created in #initialize. #start adds each route
# to it once the route has been requested.
def visited
  @visited
end

Instance Method Details

#crawl(html) ⇒ `Object`

# File 'lib/parklife/crawler.rb', line 35

# Scans html for links and queues a crawl route for every link that has not
# been visited yet.
#
# @param html [Object] markup handed to Utils.scan_for_links
# @return [Object] whatever Utils.scan_for_links returns
def crawl(html)
  Utils.scan_for_links(html) do |link_path|
    # Links generated by an app mounted at a subdirectory *include* the mount
    # path (mounted at "/foo", a link looks like "/foo/link"), while the app
    # itself answers to paths that *exclude* it ("/link"). Drop the mount
    # prefix so the queued route is one the app responds to.
    discovered = Route.new(link_path.delete_prefix(config.base.path), crawl: true)

    routes << discovered unless visited?(discovered)
  end
end

#get(path) ⇒ `Object`

# File 'lib/parklife/crawler.rb', line 54

# Requests path through the browser. When a cache is present and holds an
# etag for the path, the etag is sent as the 'HTTP_IF_NONE_MATCH' header;
# otherwise headers is nil.
#
# @param path [Object] the path to request
# @return [Object] whatever browser.get returns
def get(path)
  etag = cache&.etag(path)
  headers = etag ? { 'HTTP_IF_NONE_MATCH' => etag } : nil

  browser.get(path, headers: headers)
end

#responder_for_status(status) ⇒ `Object`

# File 'lib/parklife/crawler.rb', line 64

# Returns the responder instance for an HTTP status, creating it on first use
# and reusing it afterwards. Statuses without an entry in RESPONDERS get a
# Responder::Unknown.
#
# @param status [Object] the status used as the RESPONDERS lookup key
# @return [Object] a responder constructed with this crawler
def responder_for_status(status)
  @responder_for_status[status] ||= begin
    responder_class = RESPONDERS.fetch(status, Responder::Unknown)
    responder_class.new(self)
  end
end

#start ⇒ `Object`

# File 'lib/parklife/crawler.rb', line 70

# Works through the route queue until it is empty: each route that does not
# count as visited is requested, recorded as visited, reported, and handed to
# the responder for its response status. The reporter's #finish is called
# once the queue is drained.
#
# The build meta is written in the ensure clause, so it also happens when the
# loop raises, unless config.skip_build_meta is set.
def start
  while (pending = routes.shift)
    next if visited?(pending)

    response = get(pending.path)
    @visited << pending
    config.reporter.visit(pending, response)

    responder = responder_for_status(response.status)
    responder.call(pending, response)
  end

  config.reporter.finish
ensure
  build.write_meta unless config.skip_build_meta
end

#visited?(route) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/parklife/crawler.rb', line 84

# Reports whether a route can be skipped because it has effectively been
# visited already.
#
# @param route [Object] a route responding to #crawl and #with_crawl
# @return [Boolean]
def visited?(route)
  # An exact match in the visited set always counts.
  return true if @visited.include?(route)

  # A crawl=true route only counts once it has itself been crawled; an earlier
  # non-crawl visit is not enough because the page still has to be crawled.
  return false if route.crawl

  # A crawl=false route also counts when its crawl variant was visited.
  @visited.include?(route.with_crawl)
end

Class: Parklife::Crawler

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(config, routes, cache) ⇒ Crawler

Instance Attribute Details

#browser ⇒ Object (readonly)

#build ⇒ Object (readonly)

#cache ⇒ Object (readonly)

#config ⇒ Object (readonly)

#routes ⇒ Object (readonly)

#visited ⇒ Object (readonly)