Class: Digger::Page

Inherits:

Object

Object
Digger::Page

show all

Defined in: lib/digger/page.rb

Instance Attribute Summary collapse

#aliases ⇒ Object

Returns the value of attribute aliases.
#body ⇒ Object readonly

The raw HTTP response body of the page.
#code ⇒ Object

Integer response code of the page.
#depth ⇒ Object

Depth of this page from the root of the crawl.
#domain_aliases ⇒ Object

Returns the value of attribute domain_aliases.
#error ⇒ Object readonly

Exception object, if one was raised during HTTP#fetch_page.
#fetched_at ⇒ Object

Returns the value of attribute fetched_at.
#headers ⇒ Object readonly

Headers of the HTTP response.
#redirect_to ⇒ Object readonly

URL of the page this one redirected to, if any.
#referer ⇒ Object

URL of the page that brought us to this page.
#response_time ⇒ Object

Response time of the request for this page in milliseconds.
#storable ⇒ Object

Whether the current page should be stored. Default: true.
#url ⇒ Object readonly

Returns the value of attribute url.
#user_data ⇒ Object

OpenStruct that holds user-defined data.

Instance Method Summary collapse

#base ⇒ Object

Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE.
#content_type ⇒ Object

The content-type returned by the HTTP request for this page.
#cookies ⇒ Object
#discard_doc! ⇒ Object

Delete the Nokogiri document and response body to conserve memory.
#discard_links! ⇒ Object

Discard links, a next call of page.links will return an empty array.
#doc ⇒ Object

Nokogiri document for the HTML body.
#expired?(ttl) ⇒ Boolean
#fetched? ⇒ Boolean

Was the page successfully fetched? true if the page was fetched with no error, false otherwise.
#html? ⇒ Boolean

Returns true if the page is a HTML document, returns false otherwise.
#in_domain?(uri) ⇒ Boolean

Returns true if uri is in the same domain as the page, returns false otherwise.
#initialize(url, params = {}) ⇒ Page constructor

Create a new page.
#json ⇒ Object
#jsonp ⇒ Object
#links ⇒ Object

Array of distinct A tag HREFs from the page.
#not_found? ⇒ Boolean

Returns true if the page was not found (returned 404 code), returns false otherwise.
#redirect? ⇒ Boolean

Returns true if the page is a HTTP redirect, returns false otherwise.
#storable? ⇒ Boolean

Returns true if the page is marked as storable, false otherwise. Default is true.
#success? ⇒ Boolean

Returns true if the page is a HTTP success, returns false otherwise.
#title ⇒ Object
#to_absolute(link) ⇒ Object

Converts relative URL link into an absolute URL based on the location of the page.
#to_hash ⇒ Object
#to_json ⇒ Object

Constructor Details

#initialize(url, params = {}) ⇒ `Page`

Create a new page

# File 'lib/digger/page.rb', line 41

# Create a new page.
#
# @param url [String, URI] location of the page; converted with Kernel#URI
# @param params [Hash] optional response details, read by symbol key:
#   :code, :headers, :aka, :referer, :depth, :redirect_to, :response_time,
#   :body, :error, :domain_aliases, :fetched_at
def initialize(url, params = {})
  @url = URI(url)
  @code = params[:code]
  @headers = params[:headers] || {}
  # NOTE: when a headers hash is supplied, the default content-type is
  # written into that same hash object.
  @headers['content-type'] ||= ['']
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0
  @redirect_to = to_absolute(params[:redirect_to])
  @response_time = params[:response_time]
  @body = params[:body]
  @error = params[:error]
  # A page counts as fetched as soon as a response code is present.
  @fetched = !params[:code].nil?
  @user_data = OpenStruct.new
  # Plain `||` (not `||=`) so the caller's params hash is left untouched.
  @domain_aliases = params[:domain_aliases] || []
  @storable = true
  @fetched_at = params[:fetched_at]
end

Instance Attribute Details

#aliases ⇒ `Object`

Returns the value of attribute aliases.



32
33
34

# File 'lib/digger/page.rb', line 32

# Reader for @aliases, which the constructor builds from params[:aka]
# (wrapped in an Array with nil entries removed).
def aliases
  @aliases
end

#body ⇒ `Object` (readonly)

The raw HTTP response body of the page



14
15
16

# File 'lib/digger/page.rb', line 14

# Reader for @body, set from params[:body] in the constructor and reset to
# nil by discard_doc!.
def body
  @body
end

#code ⇒ `Object`

Integer response code of the page



22
23
24

# File 'lib/digger/page.rb', line 22

# Reader for @code, set from params[:code] in the constructor.
def code
  @code
end

#depth ⇒ `Object`

Depth of this page from the root of the crawl.



24
25
26

# File 'lib/digger/page.rb', line 24

# Reader for @depth, set from params[:depth] in the constructor (0 when absent).
def depth
  @depth
end

#domain_aliases ⇒ `Object`

Returns the value of attribute domain_aliases.



32
33
34

# File 'lib/digger/page.rb', line 32

# Reader for @domain_aliases, the list of extra host names that in_domain?
# accepts in addition to the page's own host.
def domain_aliases
  @domain_aliases
end

#error ⇒ `Object` (readonly)

Exception object, if one was raised during HTTP#fetch_page



20
21
22

# File 'lib/digger/page.rb', line 20

# Reader for @error, set from params[:error] in the constructor.
def error
  @error
end

#fetched_at ⇒ `Object`

Returns the value of attribute fetched_at.



32
33
34

# File 'lib/digger/page.rb', line 32

# Reader for @fetched_at, set from params[:fetched_at] in the constructor.
# expired? compares it against Time.now.to_i, so it is presumably an epoch
# timestamp in seconds -- confirm with the code that supplies it.
def fetched_at
  @fetched_at
end

#headers ⇒ `Object` (readonly)

Headers of the HTTP response



16
17
18

# File 'lib/digger/page.rb', line 16

# Reader for @headers. The constructor takes params[:headers] (or an empty
# hash) and guarantees a 'content-type' entry, defaulting to [''].
def headers
  @headers
end

#redirect_to ⇒ `Object` (readonly)

URL of the page this one redirected to, if any



18
19
20

# File 'lib/digger/page.rb', line 18

# Reader for @redirect_to, which the constructor computes by passing
# params[:redirect_to] through to_absolute (nil when no redirect was given).
def redirect_to
  @redirect_to
end

#referer ⇒ `Object`

URL of the page that brought us to this page



26
27
28

# File 'lib/digger/page.rb', line 26

# Reader for @referer, set from params[:referer] in the constructor.
def referer
  @referer
end

#response_time ⇒ `Object`

Response time of the request for this page in milliseconds



28
29
30

# File 'lib/digger/page.rb', line 28

# Reader for @response_time, set from params[:response_time] in the constructor.
def response_time
  @response_time
end

#storable ⇒ `Object`

Whether the current page should be stored. Default: true.



36
37
38

# File 'lib/digger/page.rb', line 36

# Reader for @storable, which the constructor initialises to true.
def storable
  @storable
end

#url ⇒ `Object` (readonly)

Returns the value of attribute url.



12
13
14

# File 'lib/digger/page.rb', line 12

# Reader for @url, the URI built from the constructor's url argument.
def url
  @url
end

#user_data ⇒ `Object`

OpenStruct that holds user-defined data



30
31
32

# File 'lib/digger/page.rb', line 30

# Reader for @user_data, an OpenStruct created empty by the constructor.
def user_data
  @user_data
end

Class Method Details

.from_hash(hash) ⇒ `Object`

# File 'lib/digger/page.rb', line 255

# Rebuilds a page from a string-keyed hash such as the one produced by to_hash.
#
# The page is created from hash['url'] alone; every other field is then
# written straight into the matching instance variable.
#
# NOTE(security): hash['headers'] is passed to Marshal.load, which must only
# ever see trusted data -- do not feed this method hashes from untrusted sources.
#
# @param hash [Hash] string-keyed page attributes
# @return [Page] the reconstructed page
def self.from_hash(hash)
  page = new(URI(hash['url']))

  raw_headers = hash['headers']
  headers = raw_headers && !raw_headers.empty? ? Marshal.load(raw_headers) : { 'content-type' => [''] }

  raw_links = hash['links']
  links = raw_links ? raw_links.map { |link| URI(link) } : []

  target = hash['redirect_to']
  redirect = target && !target.empty? ? URI(target) : nil

  stored_data = hash['user_data']
  user_data = stored_data ? OpenStruct.new(stored_data) : nil

  page.instance_variable_set('@headers', headers)
  page.instance_variable_set('@body', hash['body'])
  page.instance_variable_set('@links', links)
  page.instance_variable_set('@code', hash['code'].to_i)
  page.instance_variable_set('@depth', hash['depth'].to_i)
  page.instance_variable_set('@referer', hash['referer'])
  page.instance_variable_set('@redirect_to', redirect)
  page.instance_variable_set('@response_time', hash['response_time'].to_i)
  page.instance_variable_set('@fetched', hash['fetched'])
  page.instance_variable_set('@user_data', user_data)
  page.instance_variable_set('@fetched_at', hash['fetched_at'])
  page.instance_variable_set('@error', hash['error'])

  page
end

.from_json(json) ⇒ `Object`

# File 'lib/digger/page.rb', line 276

# Parses a JSON document and hands the resulting hash to from_hash.
#
# @param json [String] JSON text
# @return [Object] whatever from_hash returns for the parsed hash
def self.from_json(json)
  from_hash(JSON.parse(json))
end

Instance Method Details

#base ⇒ `Object`

Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE

# File 'lib/digger/page.rb', line 175

# Base URI taken from the href of the <base> element in the document head.
#
# The lookup result is cached in @base once it is truthy. Returns nil when
# there is no document, when the href cannot be turned into a URI, or when
# the resulting URI is empty.
#
# @return [URI, nil]
def base
  @base ||= begin
    if doc
      href = doc.search('//head/base/@href')
      begin
        URI(href.to_s) unless href.nil?
      rescue StandardError
        nil
      end
    end
  end

  @base.to_s.empty? ? nil : @base
end

#content_type ⇒ `Object`

The content-type returned by the HTTP request for this page



135
136
137

# File 'lib/digger/page.rb', line 135

# First value of the 'content-type' response header.
def content_type
  values = headers['content-type']
  values.first
end

#cookies ⇒ `Object`



105
106
107

# File 'lib/digger/page.rb', line 105

# Cookies parsed from the 'set-cookie' response header values.
#
# Each header value is handed to ::HTTP::Cookie.parse together with the page
# URL; the per-value results are concatenated and cached in @cookies.
def cookies
  return @cookies if @cookies

  raw_values = headers['set-cookie'] || []
  @cookies = raw_values.flat_map { |line| ::HTTP::Cookie.parse(line, url) }
end

#discard_doc! ⇒ `Object`

Delete the Nokogiri document and response body to conserve memory

# File 'lib/digger/page.rb', line 119

# Drops the parsed document and the raw body to conserve memory.
#
# links is called first so the page's links are computed while the document
# is still available.
def discard_doc!
  links
  @body = nil
  @doc = nil
end

#discard_links! ⇒ `Object`

Discard links, a next call of page.links will return an empty array



112
113
114

# File 'lib/digger/page.rb', line 112

# Replaces the cached link collection (@links) with an empty array.
def discard_links!
  @links = []
end

#doc ⇒ `Object`

Nokogiri document for the HTML body

# File 'lib/digger/page.rb', line 86

# Nokogiri document for the HTML body.
#
# Parsing only happens when a body is present and html? is truthy; any
# StandardError raised along the way yields nil. A truthy result is cached
# in @doc.
def doc
  return @doc if @doc

  @doc = begin
    body.nil? || !html? ? nil : Nokogiri::HTML(body)
  rescue StandardError
    nil
  end
end

#expired?(ttl) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/digger/page.rb', line 249

# Whether the page was fetched more than ttl before the current time.
#
# A page without a fetched_at value never counts as expired.
#
# @param ttl [Integer] amount subtracted from Time.now.to_i before comparing
# @return [Boolean]
def expired?(ttl)
  stamp = fetched_at
  return false if stamp.nil?

  cutoff = Time.now.to_i - ttl
  cutoff > stamp
end

#fetched? ⇒ `Boolean`

Was the page successfully fetched? true if the page was fetched with no error, false otherwise.

Returns:

(Boolean)



128
129
130

# File 'lib/digger/page.rb', line 128

# Returns @fetched, which the constructor sets to true exactly when a
# response code (params[:code]) was supplied.
def fetched?
  @fetched
end

#html? ⇒ `Boolean`

Returns true if the page is a HTML document, returns false otherwise.

Returns:

(Boolean)



143
144
145

# File 'lib/digger/page.rb', line 143

# Returns true if the page is a HTML document, returns false otherwise.
#
# A page is HTML when its content type starts with text/html or
# application/xhtml+xml. The "+" is escaped so it matches literally instead
# of acting as a quantifier on the preceding "l".
#
# @return [Boolean]
def html?
  !(content_type =~ %r{^(text/html|application/xhtml\+xml)\b}).nil?
end

#in_domain?(uri) ⇒ `Boolean`

Returns true if uri is in the same domain as the page, returns false otherwise

Returns:

(Boolean)

# File 'lib/digger/page.rb', line 210

# Whether uri points at the page's own host or at one of its domain aliases.
#
# @param uri [#host] object whose host is compared
# @return [Boolean]
def in_domain?(uri)
  @domain_aliases ||= []
  host = uri.host
  return true if host == @url.host

  @domain_aliases.include?(host)
end

#json ⇒ `Object`



97
98
99

# File 'lib/digger/page.rb', line 97

# The response body parsed as JSON; a truthy result is cached in @json.
def json
  return @json if @json

  @json = JSON.parse(body)
end

#jsonp ⇒ `Object`



101
102
103

# File 'lib/digger/page.rb', line 101

# The JSON payload of a JSONP response body.
#
# The text between the first "(" and the last ")" on the line is extracted
# and parsed as JSON; a truthy result is cached in @jsonp.
def jsonp
  return @jsonp if @jsonp

  wrapped = body.match(/^[^(]+?\((.+)\)[^)]*$/)
  @jsonp = JSON.parse(wrapped[1])
end

#links ⇒ `Object`

Array of distinct A tag HREFs from the page

# File 'lib/digger/page.rb', line 67

# Array of distinct A tag HREFs from the page.
#
# The links are extracted on the first call and cached in @links; later calls
# return the cached collection. Only links that to_absolute can resolve and
# that in_domain? accepts are kept. Returns an empty array when there is no
# document.
#
# @return [Array] absolute link targets
def links
  # Only parse when nothing has been cached yet.
  return @links.to_a unless @links.nil?

  @links = Set.new
  return [] unless doc

  doc.search('//a[@href]').each do |anchor|
    href = anchor['href']
    next if href.nil? || href.empty?

    # A link that cannot be made absolute is skipped rather than aborting
    # the whole extraction.
    absolute = begin
      to_absolute(href)
    rescue StandardError
      next
    end
    @links << absolute if absolute && in_domain?(absolute)
  end
  @links.to_a
end

#not_found? ⇒ `Boolean`

Returns true if the page was not found (returned 404 code), returns false otherwise.

Returns:

(Boolean)



167
168
169

# File 'lib/digger/page.rb', line 167

# True when the response code (@code) equals 404.
def not_found?
  @code == 404
end

#redirect? ⇒ `Boolean`

Returns true if the page is a HTTP redirect, returns false otherwise.

Returns:

(Boolean)



151
152
153

# File 'lib/digger/page.rb', line 151

# True when the response code lies in the 3xx range (300 up to, but not
# including, 400).
def redirect?
  (300...400).cover?(@code)
end

#storable? ⇒ `Boolean`

Returns true if the page is marked as storable, false otherwise. Default is true.

Returns:

(Boolean)



245
246
247

# File 'lib/digger/page.rb', line 245

# Returns @storable, which the constructor initialises to true.
def storable?
  @storable
end

#success? ⇒ `Boolean`

Returns true if the page is a HTTP success, returns false otherwise.

Returns:

(Boolean)



159
160
161

# File 'lib/digger/page.rb', line 159

# True when the response code lies between 200 and 206 inclusive.
def success?
  (200..206).cover?(@code)
end

#title ⇒ `Object`



60
61
62

# File 'lib/digger/page.rb', line 60

# Title of the parsed document, or nil when there is no document.
def title
  document = doc
  document.nil? ? nil : document.title
end

#to_absolute(link) ⇒ `Object`

Converts relative URL link into an absolute URL based on the location of the page

# File 'lib/digger/page.rb', line 190

# Converts relative URL link into an absolute URL based on the location of
# the page (or on the document's base URI when one is present).
#
# @param link [#to_s, nil] the link to resolve
# @return [URI, nil] the absolute URI; nil when link is nil or is not a
#   parseable URI
def to_absolute(link)
  return nil if link.nil?

  # Re-encode as UTF-8 (dropping bytes that cannot be converted) and strip a
  # trailing "#fragment" made of word characters.
  link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '')
  relative = begin
               URI(link)
             rescue URI::Error
               return nil
             end
  absolute = base ? base.merge(relative) : @url.merge(relative)

  # Opaque URIs (e.g. mailto:) carry no path component, so only hierarchical
  # URIs get an empty path normalised to "/".
  path = absolute.path
  absolute.path = '/' if absolute.opaque.nil? && path && path.empty?

  absolute
end

#to_hash ⇒ `Object`

# File 'lib/digger/page.rb', line 215

# String-keyed hash snapshot of the page.
#
# URL, links, referer, redirect target and error are stored as strings; the
# headers are stored as a Marshal dump; user data is stored as the
# OpenStruct's marshal_dump (an empty hash when there is no user data).
#
# @return [Hash]
def to_hash
  snapshot = {}
  snapshot['url'] = @url.to_s
  snapshot['headers'] = Marshal.dump(@headers)
  snapshot['body'] = @body
  snapshot['links'] = links.map { |link| link.to_s }
  snapshot['code'] = @code
  snapshot['depth'] = @depth
  snapshot['referer'] = @referer.to_s
  snapshot['redirect_to'] = @redirect_to.to_s
  snapshot['response_time'] = @response_time
  snapshot['fetched'] = @fetched
  snapshot['user_data'] = if @user_data.nil?
                            {}
                          else
                            @user_data.marshal_dump
                          end
  snapshot['fetched_at'] = @fetched_at
  snapshot['error'] = @error.to_s
  snapshot
end

#to_json ⇒ `Object`

# File 'lib/digger/page.rb', line 233

# JSON representation of the page, built from to_hash.
#
# Entries whose value is nil or empty are left out, and the headers entry is
# dropped when the content type is empty.
#
# @param args generator arguments, forwarded unchanged to Hash#to_json.
#   Accepting them lets a page be serialised when it is nested inside an
#   Array or Hash, where the JSON generator calls to_json with a state object.
# @return [String] JSON text
def to_json(*args)
  # reject builds a new hash instead of deleting keys while iterating.
  compact = to_hash.reject do |_key, value|
    value.nil? || (value.respond_to?(:empty?) && value.empty?)
  end
  compact.delete('headers') if content_type.empty?
  compact.to_json(*args)
end

Class: Digger::Page

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, params = {}) ⇒ Page

Instance Attribute Details

#aliases ⇒ Object

#body ⇒ Object (readonly)

#code ⇒ Object

#depth ⇒ Object

#domain_aliases ⇒ Object

#error ⇒ Object (readonly)

#fetched_at ⇒ Object

#headers ⇒ Object (readonly)

#redirect_to ⇒ Object (readonly)

#referer ⇒ Object

#response_time ⇒ Object

#storable ⇒ Object

#url ⇒ Object (readonly)

#user_data ⇒ Object

Class Method Details

.from_hash(hash) ⇒ Object

.from_json(json) ⇒ Object

Instance Method Details

#base ⇒ Object

#content_type ⇒ Object

#cookies ⇒ Object

#discard_doc! ⇒ Object

#discard_links! ⇒ Object

#doc ⇒ Object

#expired?(ttl) ⇒ Boolean

#fetched? ⇒ Boolean

#html? ⇒ Boolean

#in_domain?(uri) ⇒ Boolean

#json ⇒ Object

#jsonp ⇒ Object

#links ⇒ Object

#not_found? ⇒ Boolean

#redirect? ⇒ Boolean

#storable? ⇒ Boolean

#success? ⇒ Boolean

#title ⇒ Object

#to_absolute(link) ⇒ Object

#to_hash ⇒ Object

#to_json ⇒ Object

#initialize(url, params = {}) ⇒ `Page`

#aliases ⇒ `Object`

#body ⇒ `Object` (readonly)

#code ⇒ `Object`

#depth ⇒ `Object`

#domain_aliases ⇒ `Object`

#error ⇒ `Object` (readonly)

#fetched_at ⇒ `Object`

#headers ⇒ `Object` (readonly)

#redirect_to ⇒ `Object` (readonly)

#referer ⇒ `Object`

#response_time ⇒ `Object`

#storable ⇒ `Object`

#url ⇒ `Object` (readonly)

#user_data ⇒ `Object`

.from_hash(hash) ⇒ `Object`

.from_json(json) ⇒ `Object`

#base ⇒ `Object`

#content_type ⇒ `Object`

#cookies ⇒ `Object`

#discard_doc! ⇒ `Object`

#discard_links! ⇒ `Object`

#doc ⇒ `Object`

#expired?(ttl) ⇒ `Boolean`

#fetched? ⇒ `Boolean`

#html? ⇒ `Boolean`

#in_domain?(uri) ⇒ `Boolean`

#json ⇒ `Object`

#jsonp ⇒ `Object`

#links ⇒ `Object`

#not_found? ⇒ `Boolean`

#redirect? ⇒ `Boolean`

#storable? ⇒ `Boolean`

#success? ⇒ `Boolean`

#title ⇒ `Object`

#to_absolute(link) ⇒ `Object`

#to_hash ⇒ `Object`

#to_json ⇒ `Object`