Class: Digger::Page
- Inherits:
-
Object
- Object
- Digger::Page
- Defined in:
- lib/digger/page.rb
Instance Attribute Summary collapse
-
#aliases ⇒ Object
Returns the value of attribute aliases.
-
#body ⇒ Object
readonly
The raw HTTP response body of the page.
-
#code ⇒ Object
Integer response code of the page.
-
#depth ⇒ Object
Depth of this page from the root of the crawl.
-
#domain_aliases ⇒ Object
Returns the value of attribute domain_aliases.
-
#error ⇒ Object
readonly
Exception object, if one was raised during HTTP#fetch_page.
-
#fetched_at ⇒ Object
Returns the value of attribute fetched_at.
-
#headers ⇒ Object
readonly
Headers of the HTTP response.
-
#redirect_to ⇒ Object
readonly
URL of the page this one redirected to, if any.
-
#referer ⇒ Object
URL of the page that brought us to this page.
-
#response_time ⇒ Object
Response time of the request for this page in milliseconds.
-
#storable ⇒ Object
Whether the current page should be stored. Default: true.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
-
#user_data ⇒ Object
OpenStruct that holds user-defined data.
Class Method Summary collapse
Instance Method Summary collapse
-
#base ⇒ Object
Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE.
-
#content_type ⇒ Object
The content-type returned by the HTTP request for this page.
- #cookies ⇒ Object
-
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory.
-
#discard_links! ⇒ Object
Discard links; the next call to page.links will return an empty array.
-
#doc ⇒ Object
Nokogiri document for the HTML body.
- #expired?(ttl) ⇒ Boolean
-
#fetched? ⇒ Boolean
Was the page successfully fetched?
true if the page was fetched with no error, false otherwise.
-
#html? ⇒ Boolean
Returns true if the page is an HTML document, returns false otherwise.
-
#in_domain?(uri) ⇒ Boolean
Returns true if uri is in the same domain as the page, returns false otherwise.
-
#initialize(url, params = {}) ⇒ Page
constructor
Create a new page.
- #json ⇒ Object
- #jsonp ⇒ Object
-
#links ⇒ Object
Array of distinct A tag HREFs from the page.
-
#not_found? ⇒ Boolean
Returns true if the page was not found (returned 404 code), returns false otherwise.
-
#redirect? ⇒ Boolean
Returns true if the page is an HTTP redirect, returns false otherwise.
-
#storable? ⇒ Boolean
Returns true if the page is marked as storable, false otherwise. Default is true.
-
#success? ⇒ Boolean
Returns true if the page is an HTTP success, returns false otherwise.
- #title ⇒ Object
-
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page.
- #to_hash ⇒ Object
- #to_json ⇒ Object
Constructor Details
#initialize(url, params = {}) ⇒ Page
Create a new page
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/digger/page.rb', line 41 def initialize(url, params = {}) @url = URI(url) @code = params[:code] @headers = params[:headers] || {} @headers['content-type'] ||= [''] @aliases = Array(params[:aka]).compact @referer = params[:referer] @depth = params[:depth] || 0 @redirect_to = to_absolute(params[:redirect_to]) @response_time = params[:response_time] @body = params[:body] @error = params[:error] @fetched = !params[:code].nil? @user_data = OpenStruct.new @domain_aliases = params[:domain_aliases] ||= [] @storable = true @fetched_at = params[:fetched_at] end |
Instance Attribute Details
#aliases ⇒ Object
Returns the value of attribute aliases.
32 33 34 |
# File 'lib/digger/page.rb', line 32 def aliases @aliases end |
#body ⇒ Object (readonly)
The raw HTTP response body of the page
14 15 16 |
# File 'lib/digger/page.rb', line 14 def body @body end |
#code ⇒ Object
Integer response code of the page
22 23 24 |
# File 'lib/digger/page.rb', line 22 def code @code end |
#depth ⇒ Object
Depth of this page from the root of the crawl.
24 25 26 |
# File 'lib/digger/page.rb', line 24 def depth @depth end |
#domain_aliases ⇒ Object
Returns the value of attribute domain_aliases.
32 33 34 |
# File 'lib/digger/page.rb', line 32 def domain_aliases @domain_aliases end |
#error ⇒ Object (readonly)
Exception object, if one was raised during HTTP#fetch_page
20 21 22 |
# File 'lib/digger/page.rb', line 20 def error @error end |
#fetched_at ⇒ Object
Returns the value of attribute fetched_at.
32 33 34 |
# File 'lib/digger/page.rb', line 32 def fetched_at @fetched_at end |
#headers ⇒ Object (readonly)
Headers of the HTTP response
16 17 18 |
# File 'lib/digger/page.rb', line 16 def headers @headers end |
#redirect_to ⇒ Object (readonly)
URL of the page this one redirected to, if any
18 19 20 |
# File 'lib/digger/page.rb', line 18 def redirect_to @redirect_to end |
#referer ⇒ Object
URL of the page that brought us to this page
26 27 28 |
# File 'lib/digger/page.rb', line 26 def referer @referer end |
#response_time ⇒ Object
Response time of the request for this page in milliseconds
28 29 30 |
# File 'lib/digger/page.rb', line 28 def response_time @response_time end |
#storable ⇒ Object
Whether the current page should be stored. Default: true.
36 37 38 |
# File 'lib/digger/page.rb', line 36 def storable @storable end |
#url ⇒ Object (readonly)
Returns the value of attribute url.
12 13 14 |
# File 'lib/digger/page.rb', line 12 def url @url end |
#user_data ⇒ Object
OpenStruct that holds user-defined data.
30 31 32 |
# File 'lib/digger/page.rb', line 30 def user_data @user_data end |
Class Method Details
.from_hash(hash) ⇒ Object
255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 |
# File 'lib/digger/page.rb', line 255
# Rebuild a page from a Hash with string keys, as produced by #to_hash
# (or parsed back from #to_json output).
#
# A page is first created from hash['url']; the remaining state is then
# written straight into its instance variables.
#
# @param hash [Hash] string-keyed page data; every key except 'url' may be
#   missing
# @return [Page] the restored page
def self.from_hash(hash)
  page = new(URI(hash['url']))

  raw_headers = hash['headers']
  # NOTE(review): Marshal.load can instantiate arbitrary objects. Only pass
  # hashes that come from a trusted store to this method.
  headers = raw_headers && !raw_headers.empty? ? Marshal.load(raw_headers) : { 'content-type' => [''] }

  raw_redirect = hash['redirect_to']
  redirect_to = raw_redirect && !raw_redirect.empty? ? URI(raw_redirect) : nil

  {
    '@headers' => headers,
    '@body' => hash['body'],
    '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
    '@code' => hash['code'].to_i,
    '@depth' => hash['depth'].to_i,
    '@referer' => hash['referer'],
    '@redirect_to' => redirect_to,
    '@response_time' => hash['response_time'].to_i,
    '@fetched' => hash['fetched'],
    # #to_json drops an empty 'user_data' entry, so the key can be absent.
    # Always restore an OpenStruct so user_data behaves as on a new page.
    '@user_data' => OpenStruct.new(hash['user_data'] || {}),
    '@fetched_at' => hash['fetched_at'],
    '@error' => hash['error']
  }.each do |var, value|
    page.instance_variable_set(var, value)
  end

  page
end
.from_json(json) ⇒ Object
276 277 278 279 |
# File 'lib/digger/page.rb', line 276 def self.from_json(json) hash = JSON.parse json from_hash hash end |
Instance Method Details
#base ⇒ Object
Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE
175 176 177 178 179 180 181 182 183 184 |
# File 'lib/digger/page.rb', line 175 def base @base = if doc href = doc.search('//head/base/@href') URI(href.to_s) unless href.nil? rescue nil end unless @base return nil if @base && @base.to_s.empty? @base end |
#content_type ⇒ Object
The content-type returned by the HTTP request for this page
135 136 137 |
# File 'lib/digger/page.rb', line 135 def content_type headers['content-type'].first end |
#cookies ⇒ Object
105 106 107 |
# File 'lib/digger/page.rb', line 105
# Cookies announced by the response, built by passing every `set-cookie`
# header value to ::HTTP::Cookie.parse together with this page's url.
# The result is memoized in @cookies.
#
# @return [Array] the flattened parse results; empty when the response
#   carried no `set-cookie` header
def cookies
  @cookies ||= (headers['set-cookie'] || []).flat_map { |c| ::HTTP::Cookie.parse(c, url) }
end
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory
119 120 121 122 |
# File 'lib/digger/page.rb', line 119 def discard_doc! links # force parsing of page links before we trash the document @doc = @body = nil end |
#discard_links! ⇒ Object
Discard links; the next call to page.links will return an empty array
112 113 114 |
# File 'lib/digger/page.rb', line 112 def discard_links! @links = [] end |
#doc ⇒ Object
Nokogiri document for the HTML body
86 87 88 89 90 91 92 93 94 95 |
# File 'lib/digger/page.rb', line 86 def doc # return @doc if @doc # @body ||= '' # @body = @body.encode('utf-8', 'binary', :invalid => :replace, # :undef => :replace, :replace => '') # @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html? @doc ||= begin Nokogiri::HTML(body) if !body.nil? && html? rescue nil end end |
#expired?(ttl) ⇒ Boolean
249 250 251 252 253 |
# File 'lib/digger/page.rb', line 249 def expired?(ttl) return false if fetched_at.nil? (Time.now.to_i - ttl) > fetched_at end |
#fetched? ⇒ Boolean
Was the page successfully fetched? true if the page was fetched with no error, false otherwise.
128 129 130 |
# File 'lib/digger/page.rb', line 128 def fetched? @fetched end |
#html? ⇒ Boolean
Returns true if the page is a HTML document, returns false otherwise.
143 144 145 |
# File 'lib/digger/page.rb', line 143
# Whether the response was served as an HTML or XHTML document, judged by
# the start of #content_type.
#
# @return [Boolean] true for `text/html` and `application/xhtml+xml`
#   content types (parameters such as a charset may follow), false otherwise
def html?
  # The `+` has to be escaped: unescaped it is a quantifier, and the literal
  # "application/xhtml+xml" never matches. `to_s` keeps a nil content type
  # answering false instead of raising.
  content_type.to_s.match?(%r{\A(text/html|application/xhtml\+xml)\b})
end
#in_domain?(uri) ⇒ Boolean
Returns true if uri is in the same domain as the page, returns false otherwise
210 211 212 213 |
# File 'lib/digger/page.rb', line 210 def in_domain?(uri) @domain_aliases ||= [] uri.host == @url.host || @domain_aliases.include?(uri.host) end |
#json ⇒ Object
97 98 99 |
# File 'lib/digger/page.rb', line 97 def json @json ||= JSON.parse body end |
#jsonp ⇒ Object
101 102 103 |
# File 'lib/digger/page.rb', line 101 def jsonp @jsonp ||= JSON.parse body.match(/^[^(]+?\((.+)\)[^)]*$/)[1] end |
#links ⇒ Object
Array of distinct A tag HREFs from the page
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/digger/page.rb', line 67
# Distinct in-domain link targets taken from the anchor elements of the
# page that carry an href attribute.
#
# The links are extracted from the document on first use and cached in
# @links; later calls return the cached collection without parsing again.
#
# @return [Array] absolute URLs as produced by #to_absolute that satisfy
#   #in_domain?; empty when the page has no parsable document
def links
  # Parse only when nothing is cached yet; a cached (possibly empty)
  # collection must be returned as is.
  if @links.nil?
    @links = Set.new
    return [] unless doc

    doc.search('//a[@href]').each do |a|
      u = a['href']
      next if u.nil? || u.empty?

      abs = begin
        to_absolute(u)
      rescue StandardError
        next # best effort: skip hrefs that cannot be resolved
      end
      @links << abs if abs && in_domain?(abs)
    end
  end
  @links.to_a
end
#not_found? ⇒ Boolean
Returns true if the page was not found (returned 404 code), returns false otherwise.
167 168 169 |
# File 'lib/digger/page.rb', line 167 def not_found? @code == 404 end |
#redirect? ⇒ Boolean
Returns true if the page is a HTTP redirect, returns false otherwise.
151 152 153 |
# File 'lib/digger/page.rb', line 151 def redirect? (300...400).include?(@code) end |
#storable? ⇒ Boolean
Returns true if the page is marked as storable, false otherwise. Default is true.
245 246 247 |
# File 'lib/digger/page.rb', line 245 def storable? @storable end |
#success? ⇒ Boolean
Returns true if the page is a HTTP success, returns false otherwise.
159 160 161 |
# File 'lib/digger/page.rb', line 159 def success? (200..206).include?(@code) end |
#title ⇒ Object
60 61 62 |
# File 'lib/digger/page.rb', line 60 def title doc&.title end |
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
# File 'lib/digger/page.rb', line 190 def to_absolute(link) return nil if link.nil? link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '') relative = begin URI(link) rescue URI::Error return nil end absolute = base ? base.merge(relative) : @url.merge(relative) absolute.path = '/' if absolute.path.empty? absolute end |
#to_hash ⇒ Object
215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 |
# File 'lib/digger/page.rb', line 215 def to_hash { 'url' => @url.to_s, 'headers' => Marshal.dump(@headers), 'body' => @body, 'links' => links.map(&:to_s), 'code' => @code, 'depth' => @depth, 'referer' => @referer.to_s, 'redirect_to' => @redirect_to.to_s, 'response_time' => @response_time, 'fetched' => @fetched, 'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump, 'fetched_at' => @fetched_at, 'error' => @error.to_s } end |
#to_json ⇒ Object
233 234 235 236 237 238 |
# File 'lib/digger/page.rb', line 233
# Serialize the page to a JSON string built from #to_hash.
#
# Entries whose value is nil or empty are left out, and the 'headers'
# entry is left out as well when #content_type is empty.
#
# @param args optional generator arguments, forwarded unchanged to
#   Hash#to_json; accepting them lets a page be nested inside an Array or
#   Hash that is itself being converted to JSON
# @return [String] the JSON document
def to_json(*args)
  # Build a filtered copy instead of deleting from the hash while iterating it.
  compact = to_hash.reject do |_key, value|
    value.nil? || (value.respond_to?(:empty?) && value.empty?)
  end
  compact.delete('headers') if content_type.empty?
  compact.to_json(*args)
end