Class: Archaeo::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/archaeo/page.rb

Overview

Model representing a fetched archived page from the Wayback Machine.

Contains the page content, metadata, and provenance information for a single archived resource. Content is automatically transcoded to UTF-8 from the detected source encoding.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(content:, content_type:, status_code:, archive_url:, original_url:, timestamp:) ⇒ Page

Returns a new instance of Page.



15
16
17
18
19
20
21
22
23
# File 'lib/archaeo/page.rb', line 15

# Builds a page from the parts of a fetched archive response.
#
# Every argument is stored as given except +timestamp+, which is passed
# through Timestamp.coerce first.
#
# @param content the fetched body; kept as the raw content and only
#   transcoded when #content is first read
# @param content_type the content type string checked by the type predicates
# @param status_code the status code recorded for this page
# @param archive_url the archive URL recorded for this page
# @param original_url the original URL recorded for this page
# @param timestamp any value Timestamp.coerce accepts
def initialize(content:, content_type:, status_code:,
               archive_url:, original_url:, timestamp:)
  # Provenance of the capture.
  @original_url = original_url
  @archive_url = archive_url

  # Response payload and metadata.
  @status_code = status_code
  @content_type = content_type
  @raw_content = content

  # Coerced last so the plain assignments above cannot be skipped by a
  # failure inside Timestamp.coerce.
  @timestamp = Timestamp.coerce(timestamp)
end

Instance Attribute Details

#archive_url ⇒ Object (readonly)

Returns the value of attribute archive_url.



12
13
14
# File 'lib/archaeo/page.rb', line 12

# Archive URL given to the constructor.
#
# @return [Object] the stored value, unchanged
def archive_url = @archive_url

#content_type ⇒ Object (readonly)

Returns the value of attribute content_type.



12
13
14
# File 'lib/archaeo/page.rb', line 12

# Content type given to the constructor.
#
# @return [Object] the stored value, unchanged
def content_type = @content_type

#original_url ⇒ Object (readonly)

Returns the value of attribute original_url.



12
13
14
# File 'lib/archaeo/page.rb', line 12

# Original URL given to the constructor.
#
# @return [Object] the stored value, unchanged
def original_url = @original_url

#status_code ⇒ Object (readonly)

Returns the value of attribute status_code.



12
13
14
# File 'lib/archaeo/page.rb', line 12

# Status code given to the constructor.
#
# @return [Object] the stored value, unchanged
def status_code = @status_code

#timestamp ⇒ Object (readonly)

Returns the value of attribute timestamp.



12
13
14
# File 'lib/archaeo/page.rb', line 12

# Timestamp of this page, as produced by Timestamp.coerce in the constructor.
#
# @return [Object] the stored value
def timestamp = @timestamp

Instance Method Details

#as_json ⇒ Object



144
145
146
# File 'lib/archaeo/page.rb', line 144

# JSON-friendly variant of #to_h.
#
# Any Timestamp value is replaced by its string form; every other value is
# passed through untouched. Arguments are accepted and ignored.
#
# @return [Hash] same keys as #to_h
def as_json(*)
  to_h.transform_values do |value|
    next value unless value.is_a?(Timestamp)

    value.to_s
  end
end

#binary? ⇒ Boolean

Returns:

  • (Boolean)


57
58
59
# File 'lib/archaeo/page.rb', line 57

# Whether the page is treated as binary, i.e. it is neither text, JSON
# nor HTML according to the content-type predicates.
#
# @return [Boolean]
def binary?
  !text? && !json? && !html?
end

#content ⇒ Object



25
26
27
# File 'lib/archaeo/page.rb', line 25

# Page body after transcoding.
#
# The raw content is run through +transcode+ on first access and the result
# is cached; a nil or false result is not cached and is recomputed next time.
#
# @return [Object] whatever +transcode+ returns for the raw content
def content
  return @content if @content

  @content = transcode(@raw_content)
end

#css? ⇒ Boolean

Returns:

  • (Boolean)


53
54
55
# File 'lib/archaeo/page.rb', line 53

# Whether the content type identifies a CSS stylesheet.
#
# The comparison ignores case because media type names are
# case-insensitive. A missing content type yields +false+, so the result is
# always a real Boolean.
#
# @return [Boolean]
def css?
  @content_type.to_s.downcase.include?("text/css")
end

#encoding ⇒ Object



33
34
35
# File 'lib/archaeo/page.rb', line 33

# Encoding reported by +detect_encoding+.
#
# Detection runs on first access and the result is cached; a nil or false
# result is not cached and detection is retried on the next call.
#
# @return [Object] whatever +detect_encoding+ returns
def encoding
  return @encoding if @encoding

  @encoding = detect_encoding
end

#forms ⇒ Object



106
107
108
109
110
111
112
113
# File 'lib/archaeo/page.rb', line 106

# Forms found in the page.
#
# Returns an empty array for non-HTML pages. Otherwise each <form> element
# becomes a hash with:
#   :action - the action attribute as a string ("" when absent)
#   :method - the upper-cased method attribute, "GET" when absent or blank
#   :fields - the result of +extract_form_fields+ for that form
# The list is computed once and cached.
#
# @return [Array<Hash>]
def forms
  return [] unless html?

  @forms ||= html_doc.css("form").map do |form|
    # HTML treats a missing or empty method attribute as GET, so a blank
    # value must not leak through as "".
    verb = form["method"].to_s.strip.upcase
    { action: form["action"].to_s, method: verb.empty? ? "GET" : verb,
      fields: extract_form_fields(form) }
  end
end

#headings ⇒ Object



89
90
91
92
93
94
95
# File 'lib/archaeo/page.rb', line 89

# Headings (h1 through h6) found in the page.
#
# Returns an empty array for non-HTML pages. Otherwise each heading element
# becomes a hash with its numeric :level (taken from the second character
# of the tag name) and its stripped :text. The list is computed once and
# cached.
#
# @return [Array<Hash>]
def headings
  return [] unless html?
  return @headings if @headings

  heading_nodes = html_doc.css("h1, h2, h3, h4, h5, h6")
  @headings = heading_nodes.map do |node|
    depth = node.name[1].to_i
    { level: depth, text: node.text.strip }
  end
end

#html? ⇒ Boolean

Returns:

  • (Boolean)


37
38
39
# File 'lib/archaeo/page.rb', line 37

# Whether the content type identifies an HTML document.
#
# The comparison ignores case because media type names are
# case-insensitive. A missing content type yields +false+, so the result is
# always a real Boolean.
#
# @return [Boolean]
def html?
  @content_type.to_s.downcase.include?("text/html")
end

#image? ⇒ Boolean

Returns:

  • (Boolean)


45
46
47
# File 'lib/archaeo/page.rb', line 45

# Whether the content type is in the image/* family.
#
# The comparison ignores case because media type names are
# case-insensitive. A missing content type yields +false+, so the result is
# always a real Boolean.
#
# @return [Boolean]
def image?
  @content_type.to_s.downcase.start_with?("image/")
end

#images ⇒ Object



97
98
99
100
101
102
103
104
# File 'lib/archaeo/page.rb', line 97

# Images found in the page.
#
# Returns an empty array for non-HTML pages. Otherwise each <img> element
# carrying a src attribute becomes a hash with :src, :alt ("" when absent)
# and :width / :height converted with to_i (nil when the attribute is
# absent). The list is computed once and cached.
#
# @return [Array<Hash>]
def images
  return [] unless html?
  return @images if @images

  @images = html_doc.css("img[src]").map do |img|
    source = img["src"]
    alt_text = img["alt"].to_s
    width = img["width"]&.to_i
    height = img["height"]&.to_i
    { src: source, alt: alt_text, width: width, height: height }
  end
end

#inspect ⇒ Object



148
149
150
# File 'lib/archaeo/page.rb', line 148

# Short debugging representation: class name, content type and body size.
#
# The size is reported in bytes, so it is taken from String#bytesize rather
# than the character count, which would under-report multibyte content.
#
# @return [String]
def inspect
  "#<#{self.class.name} #{@content_type} #{content.bytesize} bytes>"
end

#json? ⇒ Boolean

Returns:

  • (Boolean)


41
42
43
# File 'lib/archaeo/page.rb', line 41

# Whether the content type identifies a JSON document.
#
# The comparison ignores case because media type names are
# case-insensitive. A missing content type yields +false+, so the result is
# always a real Boolean.
#
# @return [Boolean]
def json?
  @content_type.to_s.downcase.include?("application/json")
end


65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/archaeo/page.rb', line 65

# Hyperlinks found in the page.
#
# Returns an empty array for non-HTML pages. Otherwise each <a> element
# carrying an href becomes a hash with:
#   :href     - the href resolved by +resolve_page_url+ against the archive
#               URL, or the original URL when there is no archive URL
#   :text     - the stripped anchor text
#   :external - truthy when the resolved href does not contain
#               +original_domain+; nil/false when resolution produced nothing
# The list is computed once and cached.
#
# @return [Array<Hash>]
def links
  return [] unless html?
  return @links if @links

  base_url = @archive_url || @original_url
  @links = html_doc.css("a[href]").map do |a|
    target = resolve_page_url(a["href"], base_url)
    {
      href: target,
      text: a.text.strip,
      external: target && !target.include?(original_domain),
    }
  end
end

#meta_tags ⇒ Object



78
79
80
81
82
83
84
85
86
87
# File 'lib/archaeo/page.rb', line 78

# Meta information found in the page.
#
# Returns an empty hash for non-HTML pages. Otherwise starts from the
# entries produced by +extract_meta_entries+ and, when the document has a
# <link rel="canonical"> element, adds its href under the "canonical" key.
# The hash is computed once and cached.
#
# @return [Hash]
def meta_tags
  return {} unless html?
  return @meta_tags if @meta_tags

  entries = extract_meta_entries(html_doc)
  canonical_link = html_doc.at_css('link[rel="canonical"]')
  entries["canonical"] = canonical_link["href"].to_s if canonical_link
  @meta_tags = entries
end

#microposts ⇒ Object



123
124
125
126
127
128
129
130
# File 'lib/archaeo/page.rb', line 123

# Microposts extracted from the page.
#
# Returns an empty array for non-HTML pages. Otherwise every container
# returned by +find_article_containers+ is passed to +extract_micropost+ and
# nil/false results are dropped. The list is computed once and cached.
#
# @return [Array]
def microposts
  return [] unless html?
  return @microposts if @microposts

  article_nodes = find_article_containers(html_doc)
  @microposts = article_nodes.filter_map { |node| extract_micropost(node) }
end

#scripts ⇒ Object



115
116
117
118
119
120
121
# File 'lib/archaeo/page.rb', line 115

# Script elements found in the page.
#
# Returns an empty array for non-HTML pages. Otherwise each <script>
# element becomes a hash with its :src and :type attributes as strings
# ("" when the attribute is absent). The list is computed once and cached.
#
# @return [Array<Hash>]
def scripts
  return [] unless html?
  return @scripts if @scripts

  @scripts = html_doc.css("script").map do |tag|
    source = tag["src"].to_s
    kind = tag["type"].to_s
    { src: source, type: kind }
  end
end

#size ⇒ Object



29
30
31
# File 'lib/archaeo/page.rb', line 29

# Length of the transcoded content, as reported by its +length+ method.
#
# @return [Integer]
def size
  body = content
  body.length
end

#text? ⇒ Boolean

Returns:

  • (Boolean)


49
50
51
# File 'lib/archaeo/page.rb', line 49

# Whether the content type is in the text/* family.
#
# The comparison ignores case because media type names are
# case-insensitive. A missing content type yields +false+, so the result is
# always a real Boolean.
#
# @return [Boolean]
def text?
  @content_type.to_s.downcase.start_with?("text/")
end

#title ⇒ Object



61
62
63
# File 'lib/archaeo/page.rb', line 61

# Text of the page's <title> element, stripped of surrounding whitespace.
#
# Returns nil for non-HTML pages, like the other DOM-derived accessors, so
# that non-HTML bodies are never handed to the HTML parser. Also returns nil
# when the document has no <title>; a nil result is not cached.
#
# @return [String, nil]
def title
  return unless html?

  @title ||= html_doc.at_css("title")&.text&.strip
end

#to_h ⇒ Object



132
133
134
135
136
137
138
139
140
141
142
# File 'lib/archaeo/page.rb', line 132

# Metadata summary of the page as a hash.
#
# Contains the stored content type, status code, archive and original URLs
# and timestamp, plus the computed #size and the string form of #encoding.
# The page content itself is not included.
#
# @return [Hash{Symbol => Object}]
def to_h
  summary = {
    content_type: @content_type,
    status_code: @status_code,
    archive_url: @archive_url,
    original_url: @original_url,
    timestamp: @timestamp,
  }
  summary[:size] = size
  summary[:encoding] = encoding.to_s
  summary
end