Class: Archaeo::Page
- Inherits:
-
Object
- Object
- Archaeo::Page
- Defined in:
- lib/archaeo/page.rb
Overview
Model representing a fetched archived page from the Wayback Machine.
Contains the page content, metadata, and provenance information for a single archived resource. Content is automatically transcoded to UTF-8 from the detected source encoding.
Instance Attribute Summary collapse
-
#archive_url ⇒ Object
readonly
Returns the value of attribute archive_url.
-
#content_type ⇒ Object
readonly
Returns the value of attribute content_type.
-
#original_url ⇒ Object
readonly
Returns the value of attribute original_url.
-
#status_code ⇒ Object
readonly
Returns the value of attribute status_code.
-
#timestamp ⇒ Object
readonly
Returns the value of attribute timestamp.
Instance Method Summary collapse
- #as_json ⇒ Object
- #binary? ⇒ Boolean
- #content ⇒ Object
- #css? ⇒ Boolean
- #encoding ⇒ Object
- #forms ⇒ Object
- #headings ⇒ Object
- #html? ⇒ Boolean
- #image? ⇒ Boolean
- #images ⇒ Object
-
#initialize(content:, content_type:, status_code:, archive_url:, original_url:, timestamp:) ⇒ Page
constructor
A new instance of Page.
- #inspect ⇒ Object
- #json? ⇒ Boolean
- #links ⇒ Object
- #meta_tags ⇒ Object
- #microposts ⇒ Object
- #scripts ⇒ Object
- #size ⇒ Object
- #text? ⇒ Boolean
- #title ⇒ Object
- #to_h ⇒ Object
Constructor Details
#initialize(content:, content_type:, status_code:, archive_url:, original_url:, timestamp:) ⇒ Page
Returns a new instance of Page.
15 16 17 18 19 20 21 22 23 |
# File 'lib/archaeo/page.rb', line 15 def initialize(content:, content_type:, status_code:, archive_url:, original_url:, timestamp:) @raw_content = content @content_type = content_type @status_code = status_code @archive_url = archive_url @original_url = original_url @timestamp = Timestamp.coerce(timestamp) end |
Instance Attribute Details
#archive_url ⇒ Object (readonly)
Returns the value of attribute archive_url.
12 13 14 |
# File 'lib/archaeo/page.rb', line 12 def archive_url @archive_url end |
#content_type ⇒ Object (readonly)
Returns the value of attribute content_type.
12 13 14 |
# File 'lib/archaeo/page.rb', line 12 def content_type @content_type end |
#original_url ⇒ Object (readonly)
Returns the value of attribute original_url.
12 13 14 |
# File 'lib/archaeo/page.rb', line 12 def original_url @original_url end |
#status_code ⇒ Object (readonly)
Returns the value of attribute status_code.
12 13 14 |
# File 'lib/archaeo/page.rb', line 12 def status_code @status_code end |
#timestamp ⇒ Object (readonly)
Returns the value of attribute timestamp.
12 13 14 |
# File 'lib/archaeo/page.rb', line 12 def timestamp @timestamp end |
Instance Method Details
#as_json ⇒ Object
144 145 146 |
# File 'lib/archaeo/page.rb', line 144 def as_json(*) to_h.transform_values { |v| v.is_a?(Timestamp) ? v.to_s : v } end |
#binary? ⇒ Boolean
57 58 59 |
# File 'lib/archaeo/page.rb', line 57 def binary? !(text? || json? || html?) end |
#content ⇒ Object
25 26 27 |
# File 'lib/archaeo/page.rb', line 25 def content @content ||= transcode(@raw_content) end |
#css? ⇒ Boolean
53 54 55 |
# File 'lib/archaeo/page.rb', line 53 def css? @content_type&.include?("text/css") end |
#encoding ⇒ Object
33 34 35 |
# File 'lib/archaeo/page.rb', line 33 def encoding @encoding ||= detect_encoding end |
#forms ⇒ Object
106 107 108 109 110 111 112 113 |
# File 'lib/archaeo/page.rb', line 106 def forms return [] unless html? @forms ||= html_doc.css("form").map do |form| { action: form["action"].to_s, method: (form["method"] || "GET").upcase, fields: extract_form_fields(form) } end end |
#headings ⇒ Object
89 90 91 92 93 94 95 |
# File 'lib/archaeo/page.rb', line 89 def headings return [] unless html? @headings ||= html_doc.css("h1, h2, h3, h4, h5, h6").map do |el| { level: el.name[1].to_i, text: el.text.strip } end end |
#html? ⇒ Boolean
37 38 39 |
# File 'lib/archaeo/page.rb', line 37 def html? @content_type&.include?("text/html") end |
#image? ⇒ Boolean
45 46 47 |
# File 'lib/archaeo/page.rb', line 45 def image? @content_type&.start_with?("image/") end |
#images ⇒ Object
97 98 99 100 101 102 103 104 |
# File 'lib/archaeo/page.rb', line 97 def images return [] unless html? @images ||= html_doc.css("img[src]").map do |el| { src: el["src"], alt: el["alt"].to_s, width: el["width"]&.to_i, height: el["height"]&.to_i } end end |
#inspect ⇒ Object
148 149 150 |
# File 'lib/archaeo/page.rb', line 148 def inspect "#<#{self.class.name} #{@content_type} #{size} bytes>" end |
#json? ⇒ Boolean
41 42 43 |
# File 'lib/archaeo/page.rb', line 41 def json? @content_type&.include?("application/json") end |
#links ⇒ Object
65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/archaeo/page.rb', line 65 def links return [] unless html? @links ||= begin base = @archive_url || @original_url html_doc.css("a[href]").map do |anchor| href = resolve_page_url(anchor["href"], base) { href: href, text: anchor.text.strip, external: href && !href.include?(original_domain) } end end end |
#meta_tags ⇒ Object
78 79 80 81 82 83 84 85 86 87 |
# File 'lib/archaeo/page.rb', line 78 def meta_tags return {} unless html? @meta_tags ||= begin result = extract_meta_tags(html_doc) canonical = html_doc.at_css('link[rel="canonical"]') result["canonical"] = canonical["href"].to_s if canonical result end end |
#microposts ⇒ Object
123 124 125 126 127 128 129 130 |
# File 'lib/archaeo/page.rb', line 123 def microposts return [] unless html? @microposts ||= begin containers = find_article_containers(html_doc) containers.filter_map { |el| extract_micropost(el) } end end |
#scripts ⇒ Object
115 116 117 118 119 120 121 |
# File 'lib/archaeo/page.rb', line 115 def scripts return [] unless html? @scripts ||= html_doc.css("script").map do |el| { src: el["src"].to_s, type: el["type"].to_s } end end |
#size ⇒ Object
29 30 31 |
# File 'lib/archaeo/page.rb', line 29 def size content.length end |
#text? ⇒ Boolean
49 50 51 |
# File 'lib/archaeo/page.rb', line 49 def text? @content_type&.start_with?("text/") end |
#title ⇒ Object
61 62 63 |
# File 'lib/archaeo/page.rb', line 61 def title @title ||= html_doc.at_css("title")&.text&.strip end |
#to_h ⇒ Object
132 133 134 135 136 137 138 139 140 141 142 |
# File 'lib/archaeo/page.rb', line 132 def to_h { content_type: @content_type, status_code: @status_code, archive_url: @archive_url, original_url: @original_url, timestamp: @timestamp, size: size, encoding: encoding.to_s, } end |