Class: Digger::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/digger/page.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, params = {}) ⇒ Page

Create a new page



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/digger/page.rb', line 41

# Create a new page.
#
# @param url [String, URI] address of the page; converted with Kernel#URI
# @param params [Hash] optional attributes read by symbol key:
#   :code, :headers, :aka, :referer, :depth, :redirect_to, :response_time,
#   :body, :error, :domain_aliases, :fetched_at
def initialize(url, params = {})
  @url = URI(url)
  @code = params[:code]
  @headers = params[:headers] || {}
  # Guarantee a content-type entry so #content_type can call .first on it.
  @headers['content-type'] ||= ['']
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0
  @redirect_to = to_absolute(params[:redirect_to])
  @response_time = params[:response_time]
  @body = params[:body]
  @error = params[:error]
  # A page counts as fetched as soon as a response code was supplied.
  @fetched = !params[:code].nil?
  @user_data = OpenStruct.new
  # Plain || (not ||=) so the caller's params hash is left untouched.
  @domain_aliases = params[:domain_aliases] || []
  @storable = true
  @fetched_at = params[:fetched_at]
end

Instance Attribute Details

#aliasesObject

Returns the value of attribute aliases.



32
33
34
# File 'lib/digger/page.rb', line 32

# Reader for @aliases, which the constructor builds from params[:aka]
# via Array(...).compact.
def aliases
  @aliases
end

#bodyObject (readonly)

The raw HTTP response body of the page



14
15
16
# File 'lib/digger/page.rb', line 14

# Reader for the raw response body (params[:body] in the constructor).
# Becomes nil after #discard_doc!.
def body
  @body
end

#codeObject

Integer response code of the page



22
23
24
# File 'lib/digger/page.rb', line 22

# Reader for the response code (params[:code] in the constructor;
# nil when none was supplied).
def code
  @code
end

#depthObject

Depth of this page from the root of the crawl.



24
25
26
# File 'lib/digger/page.rb', line 24

# Reader for the crawl depth (params[:depth] in the constructor, default 0).
def depth
  @depth
end

#domain_aliasesObject

Returns the value of attribute domain_aliases.



32
33
34
# File 'lib/digger/page.rb', line 32

# Reader for the extra host names that #in_domain? accepts in addition to
# the page's own host (params[:domain_aliases] in the constructor).
def domain_aliases
  @domain_aliases
end

#errorObject (readonly)

Exception object, if one was raised during HTTP#fetch_page



20
21
22
# File 'lib/digger/page.rb', line 20

# Reader for the error recorded for this page (params[:error] in the
# constructor; nil when none was supplied).
def error
  @error
end

#fetched_atObject

Returns the value of attribute fetched_at.



32
33
34
# File 'lib/digger/page.rb', line 32

# Reader for the fetch timestamp (params[:fetched_at] in the constructor).
# NOTE(review): #expired? compares it against Time.now.to_i, so it is
# presumably epoch seconds — confirm with the code that sets it.
def fetched_at
  @fetched_at
end

#headersObject (readonly)

Headers of the HTTP response



16
17
18
# File 'lib/digger/page.rb', line 16

# Reader for the response headers hash. The constructor guarantees a
# 'content-type' key (defaulting to ['']).
def headers
  @headers
end

#redirect_toObject (readonly)

URL of the page this one redirected to, if any



18
19
20
# File 'lib/digger/page.rb', line 18

# Reader for the redirect target; the constructor stores the result of
# to_absolute(params[:redirect_to]), which is nil when no target was given.
def redirect_to
  @redirect_to
end

#refererObject

URL of the page that brought us to this page



26
27
28
# File 'lib/digger/page.rb', line 26

# Reader for the referring URL (params[:referer] in the constructor).
def referer
  @referer
end

#response_timeObject

Response time of the request for this page in milliseconds



28
29
30
# File 'lib/digger/page.rb', line 28

# Reader for the request's response time (params[:response_time] in the
# constructor).
def response_time
  @response_time
end

#storableObject

Whether the current page should be stored. Default: true.



36
37
38
# File 'lib/digger/page.rb', line 36

# Reader for the storable flag; the constructor initialises it to true.
def storable
  @storable
end

#urlObject (readonly)

Returns the value of attribute url.



12
13
14
# File 'lib/digger/page.rb', line 12

# Reader for the page address, stored by the constructor as URI(url).
def url
  @url
end

#user_dataObject

OpenStruct that holds user-defined data



30
31
32
# File 'lib/digger/page.rb', line 30

# Reader for the user-defined data container; the constructor creates an
# empty OpenStruct.
def user_data
  @user_data
end

Class Method Details

.from_hash(hash) ⇒ Object



255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# File 'lib/digger/page.rb', line 255

# Rebuild a page from a string-keyed hash shaped like the one #to_hash emits.
#
# SECURITY: 'headers' is fed to Marshal.load, which can instantiate arbitrary
# objects. Only pass hashes that come from a trusted store.
#
# @param hash [Hash] serialized attributes; 'url' is required
# @return [Page] a page whose instance variables are restored from +hash+
def self.from_hash(hash)
  page = new(URI(hash['url']))
  raw_headers = hash['headers']
  raw_redirect = hash['redirect_to']
  {
    '@headers'       => (raw_headers && !raw_headers.empty?) ? Marshal.load(raw_headers) : { 'content-type' => [''] },
    '@body'          => hash['body'],
    '@links'         => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
    '@code'          => hash['code'].to_i,
    '@depth'         => hash['depth'].to_i,
    '@referer'       => hash['referer'],
    '@redirect_to'   => (raw_redirect && !raw_redirect.empty?) ? URI(raw_redirect) : nil,
    '@response_time' => hash['response_time'].to_i,
    '@fetched'       => hash['fetched'],
    # Always an OpenStruct, as in the constructor: #to_json drops an empty
    # 'user_data', so the key may be missing after a round trip.
    '@user_data'     => OpenStruct.new(hash['user_data'] || {}),
    '@fetched_at'    => hash['fetched_at'],
    '@error'         => hash['error']
  }.each do |var, value|
    page.instance_variable_set(var, value)
  end
  page
end

.from_json(json) ⇒ Object



276
277
278
279
# File 'lib/digger/page.rb', line 276

# Rebuild a page from a JSON document by parsing it and delegating to
# from_hash.
#
# @param json [String] JSON text describing the page
# @return [Object] whatever from_hash returns for the parsed hash
def self.from_json(json)
  from_hash(JSON.parse(json))
end

Instance Method Details

#baseObject

Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE



175
176
177
178
179
180
181
182
183
184
# File 'lib/digger/page.rb', line 175

# Base URI declared by the document's <head><base href="..."> element.
#
# The value is looked up only while @base is unset. Any StandardError raised
# while turning the href into a URI is swallowed and treated as "no base".
#
# @return [URI, nil] the base URI, or nil when absent, empty or unparsable
def base
  unless @base
    page_doc = doc
    @base =
      if page_doc
        node = page_doc.search('//head/base/@href')
        begin
          URI(node.to_s) unless node.nil?
        rescue StandardError
          nil
        end
      end
  end

  # An empty href yields a URI whose string form is empty; report it as nil.
  @base.to_s.empty? ? nil : @base
end

#content_typeObject

The content-type returned by the HTTP request for this page



135
136
137
# File 'lib/digger/page.rb', line 135

# The content-type reported for this page: the first entry stored under the
# 'content-type' header.
def content_type
  type_values = headers['content-type']
  type_values.first
end

#cookiesObject



105
106
107
# File 'lib/digger/page.rb', line 105

# Cookies parsed from the 'set-cookie' response headers, memoized in @cookies.
#
# @return [Array] flattened results of ::HTTP::Cookie.parse for each header
#   value; empty when the header is absent
def cookies
  return @cookies if @cookies

  raw_cookies = headers['set-cookie'] || []
  @cookies = raw_cookies.flat_map { |raw| ::HTTP::Cookie.parse(raw, url) }
end

#discard_doc!Object

Delete the Nokogiri document and response body to conserve memory



119
120
121
122
# File 'lib/digger/page.rb', line 119

# Drop the parsed document and the raw body to conserve memory.
#
# @return [nil]
def discard_doc!
  # Ask for the links first, while the document is still available.
  links
  @body = nil
  @doc = nil
end

#discard_links!Object

Discard the links; the next call to page.links will return an empty array



112
113
114
# File 'lib/digger/page.rb', line 112

# Replace the cached link collection with an empty array and return it.
# NOTE(review): whether a later #links call stays empty depends on how
# #links treats a non-nil @links — verify against that method.
def discard_links!
  @links = []
end

#docObject

Nokogiri document for the HTML body



86
87
88
89
90
91
92
93
94
95
# File 'lib/digger/page.rb', line 86

# Nokogiri document for the HTML body, memoized in @doc.
#
# Parsing is attempted only when a body is present and the page is HTML.
# Any StandardError raised along the way is swallowed, leaving the result nil
# (a nil result is not cached, so the next call tries again).
#
# @return [Object, nil] the parsed document, or nil
def doc
  return @doc if @doc

  @doc = begin
    Nokogiri::HTML(body) unless body.nil? || !html?
  rescue StandardError
    nil
  end
end

#expired?(ttl) ⇒ Boolean

Returns:

  • (Boolean)


249
250
251
252
253
# File 'lib/digger/page.rb', line 249

# Tell whether the page was fetched longer ago than +ttl+ allows.
#
# @param ttl [Integer] allowed age, subtracted from Time.now.to_i
# @return [Boolean] false when fetched_at is nil; otherwise whether
#   (now - ttl) is greater than fetched_at
def expired?(ttl)
  stamp = fetched_at
  return false if stamp.nil?

  cutoff = Time.now.to_i - ttl
  cutoff > stamp
end

#fetched?Boolean

Was the page successfully fetched? true if the page was fetched with no error, false otherwise.

Returns:

  • (Boolean)


128
129
130
# File 'lib/digger/page.rb', line 128

# Reader for @fetched. The constructor sets it to true exactly when a
# non-nil params[:code] was supplied; from_hash restores the stored value.
def fetched?
  @fetched
end

#html?Boolean

Returns true if the page is an HTML document, returns false otherwise.

Returns:

  • (Boolean)


143
144
145
# File 'lib/digger/page.rb', line 143

# Tell whether the page is an HTML document, judged by its content type.
#
# The "+" in "application/xhtml+xml" is escaped: left bare it is a regexp
# quantifier, so the literal XHTML media type would never match.
#
# @return [Boolean] true for text/html or application/xhtml+xml (optionally
#   followed by parameters such as "; charset=utf-8"), false otherwise
def html?
  %r{\A(text/html|application/xhtml\+xml)\b}.match?(content_type)
end

#in_domain?(uri) ⇒ Boolean

Returns true if uri is in the same domain as the page, returns false otherwise

Returns:

  • (Boolean)


210
211
212
213
# File 'lib/digger/page.rb', line 210

# Tell whether +uri+ belongs to this page's domain.
#
# @param uri [#host] the URI to check
# @return [Boolean] true when the host equals the page's host or is listed
#   in @domain_aliases
def in_domain?(uri)
  @domain_aliases ||= []
  host = uri.host
  return true if host == @url.host

  @domain_aliases.include?(host)
end

#jsonObject



97
98
99
# File 'lib/digger/page.rb', line 97

# The response body parsed as JSON, memoized in @json.
#
# @return [Object] result of JSON.parse on the body
def json
  return @json if @json

  @json = JSON.parse(body)
end

#jsonpObject



101
102
103
# File 'lib/digger/page.rb', line 101

# The JSONP response body with its callback wrapper removed, parsed as JSON
# and memoized in @jsonp.
#
# @return [Object] result of JSON.parse on the text between the outermost
#   parentheses
def jsonp
  return @jsonp if @jsonp

  wrapped = body.match(/^[^(]+?\((.+)\)[^)]*$/)
  @jsonp = JSON.parse(wrapped[1])
end

#links ⇒ Object

Array of distinct A tag HREFs from the page



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/digger/page.rb', line 67

# Array of distinct A tag HREFs from the page that are in the page's domain.
#
# The result is computed once and cached in @links. A non-nil @links
# (already parsed, restored from a hash, or emptied by discard_links!) is
# returned as-is, without touching the document again.
#
# @return [Array] absolute URIs, without duplicates
def links
  return @links.to_a unless @links.nil?

  @links = Set.new
  return [] unless doc

  doc.search('//a[@href]').each do |a|
    u = a['href']
    next if u.nil? || u.empty?

    abs = begin
      to_absolute(u)
    rescue StandardError
      # Best effort: an href that cannot be made absolute is skipped.
      next
    end
    @links << abs if abs && in_domain?(abs)
  end
  @links.to_a
end

#not_found?Boolean

Returns true if the page was not found (returned 404 code), returns false otherwise.

Returns:

  • (Boolean)


167
168
169
# File 'lib/digger/page.rb', line 167

# True when the stored response code equals 404, false otherwise.
def not_found?
  @code == 404
end

#redirect?Boolean

Returns true if the page is an HTTP redirect, returns false otherwise.

Returns:

  • (Boolean)


151
152
153
# File 'lib/digger/page.rb', line 151

# Tell whether the page is an HTTP redirect.
#
# @return [Boolean] true when the response code lies in 300...400
def redirect?
  redirect_codes = (300...400)
  redirect_codes.cover?(@code)
end

#storable?Boolean

Returns true if the page is marked as storable, false otherwise. Default is true.

Returns:

  • (Boolean)


245
246
247
# File 'lib/digger/page.rb', line 245

# Reader for the storable flag; the constructor initialises it to true.
def storable?
  @storable
end

#success?Boolean

Returns true if the page is an HTTP success, returns false otherwise.

Returns:

  • (Boolean)


159
160
161
# File 'lib/digger/page.rb', line 159

# Tell whether the page is an HTTP success.
#
# @return [Boolean] true when the response code lies in 200..206
def success?
  success_codes = (200..206)
  success_codes.cover?(@code)
end

#titleObject



60
61
62
# File 'lib/digger/page.rb', line 60

# Title of the parsed document.
#
# @return [Object, nil] the document's title, or nil when there is no document
def title
  document = doc
  document.title unless document.nil?
end

#to_absolute(link) ⇒ Object

Converts relative URL link into an absolute URL based on the location of the page



190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/digger/page.rb', line 190

# Convert a (possibly relative) link into an absolute URI, resolved against
# the document's <base> when there is one and against the page URL otherwise.
#
# @param link [#to_s, nil] the link to resolve
# @return [URI, nil] the absolute URI; nil when +link+ is nil or cannot be
#   parsed as a URI
def to_absolute(link)
  return nil if link.nil?

  # Re-encode from binary to UTF-8, dropping bytes that have no mapping, then
  # strip a trailing "#fragment" made of word characters.
  cleaned = link.to_s
                .encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '')
                .gsub(/#[\w]*$/, '')
  relative = begin
    URI(cleaned)
  rescue URI::Error
    return nil
  end
  absolute = (base || @url).merge(relative)

  # Opaque URIs (mailto:, tel:, javascript:) have a nil path; only normalise
  # an empty hierarchical path instead of raising NoMethodError on nil.
  absolute.path = '/' if absolute.path&.empty?

  absolute
end

#to_hashObject



215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# File 'lib/digger/page.rb', line 215

# Serialize the page into a string-keyed hash.
#
# Headers are stored as a Marshal dump, links as strings, and user data as
# the OpenStruct's marshal_dump (an empty hash when user data is nil).
#
# @return [Hash] the serialized attributes, in a fixed key order
def to_hash
  dumped = {}
  dumped['url'] = @url.to_s
  dumped['headers'] = Marshal.dump(@headers)
  dumped['body'] = @body
  dumped['links'] = links.map(&:to_s)
  dumped['code'] = @code
  dumped['depth'] = @depth
  dumped['referer'] = @referer.to_s
  dumped['redirect_to'] = @redirect_to.to_s
  dumped['response_time'] = @response_time
  dumped['fetched'] = @fetched
  dumped['user_data'] = @user_data.nil? ? {} : @user_data.marshal_dump
  dumped['fetched_at'] = @fetched_at
  dumped['error'] = @error.to_s
  dumped
end

#to_jsonObject



233
234
235
236
237
238
# File 'lib/digger/page.rb', line 233

# Serialize the page to JSON, leaving out nil and empty values.
#
# Accepts and forwards the optional generator arguments so the page can also
# be serialized by the JSON library itself (which calls to_json with a state
# argument), e.g. when it is nested inside an array or hash.
#
# @param args [Array] optional arguments passed through to Hash#to_json
# @return [String] the JSON document
def to_json(*args)
  # reject builds a new hash, so nothing is mutated while being iterated.
  compact = to_hash.reject { |_key, value| value.nil? || (value.respond_to?(:empty?) && value.empty?) }
  # An empty content type means the headers carry only the constructor default.
  compact.delete('headers') if content_type.empty?
  compact.to_json(*args)
end