Class: Archaeo::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/archaeo/page.rb

Overview

Model representing a fetched archived page from the Wayback Machine.

Contains the page content, metadata, and provenance information for a single archived resource. Content is automatically transcoded to UTF-8 from the detected source encoding.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(content:, content_type:, status_code:, archive_url:, original_url:, timestamp:) ⇒ Page

Returns a new instance of Page.



15
16
17
18
19
20
21
22
23
# File 'lib/archaeo/page.rb', line 15

# Builds a page from a fetched body plus its archive metadata.
#
# The body is kept untouched in @raw_content. The timestamp is passed
# through Timestamp.coerce; every other argument is stored exactly as given.
#
# @param content [Object] raw body of the archived resource
# @param content_type [Object] media type reported for the resource
#   (presumably a String or nil -- confirm against callers)
# @param status_code [Object] status code recorded for the resource
# @param archive_url [Object] URL of the archived snapshot
# @param original_url [Object] URL of the resource that was archived
# @param timestamp [Object] capture time, in any form Timestamp.coerce accepts
def initialize(
  content:,
  content_type:,
  status_code:,
  archive_url:,
  original_url:,
  timestamp:
)
  # Payload and response metadata.
  @raw_content = content
  @content_type = content_type
  @status_code = status_code

  # Provenance.
  @archive_url = archive_url
  @original_url = original_url
  @timestamp = Timestamp.coerce(timestamp)
end

Instance Attribute Details

#archive_url ⇒ Object (readonly)

Returns the value of attribute archive_url.



12
13
14
# File 'lib/archaeo/page.rb', line 12

# Reader for the archive URL supplied to the constructor.
#
# @return [Object] the stored @archive_url value, unchanged
def archive_url
  @archive_url
end

#content_type ⇒ Object (readonly)

Returns the value of attribute content_type.



12
13
14
# File 'lib/archaeo/page.rb', line 12

# Reader for the content type supplied to the constructor.
#
# @return [Object] the stored @content_type value, unchanged
def content_type
  @content_type
end

#original_url ⇒ Object (readonly)

Returns the value of attribute original_url.



12
13
14
# File 'lib/archaeo/page.rb', line 12

# Reader for the original URL supplied to the constructor.
#
# @return [Object] the stored @original_url value, unchanged
def original_url
  @original_url
end

#status_code ⇒ Object (readonly)

Returns the value of attribute status_code.



12
13
14
# File 'lib/archaeo/page.rb', line 12

# Reader for the status code supplied to the constructor.
#
# @return [Object] the stored @status_code value, unchanged
def status_code
  @status_code
end

#timestamp ⇒ Object (readonly)

Returns the value of attribute timestamp.



12
13
14
# File 'lib/archaeo/page.rb', line 12

# Reader for the capture timestamp.
#
# @return [Object] the stored @timestamp value, i.e. whatever
#   Timestamp.coerce returned for the constructor's timestamp argument
def timestamp
  @timestamp
end

Instance Method Details

#as_json ⇒ Object



108
109
110
111
112
113
114
115
116
117
118
# File 'lib/archaeo/page.rb', line 108

# Builds a Hash of the page's metadata for JSON output.
#
# Same keys as #to_h, except that the timestamp is rendered with #to_s.
# The page body itself is not included. Any positional arguments are
# accepted and ignored.
#
# @return [Hash{Symbol => Object}]
def as_json(*)
  timestamp_text = @timestamp.to_s
  page_size = size
  encoding_name = encoding.to_s

  {
    content_type: @content_type,
    status_code: @status_code,
    archive_url: @archive_url,
    original_url: @original_url,
    timestamp: timestamp_text,
    size: page_size,
    encoding: encoding_name,
  }
end

#binary? ⇒ Boolean

Returns:

  • (Boolean)


57
58
59
# File 'lib/archaeo/page.rb', line 57

# Reports whether the page is neither text, JSON nor HTML according to
# the #text?, #json? and #html? predicates.
#
# @return [Boolean]
def binary?
  return false if text?
  return false if json?

  !html?
end

#content ⇒ Object



25
26
27
# File 'lib/archaeo/page.rb', line 25

# Returns the page body after it has been run through #transcode.
#
# The result is cached in @content, so the raw body is only transcoded
# again if the cached value is nil or false.
#
# @return [Object] whatever #transcode returns for the raw content
def content
  return @content if @content

  @content = transcode(@raw_content)
end

#css? ⇒ Boolean

Returns:

  • (Boolean)


53
54
55
# File 'lib/archaeo/page.rb', line 53

# Reports whether the stored content type contains "text/css".
#
# Always returns a real boolean: a missing content type yields false
# rather than nil, matching the documented Boolean return.
#
# @return [Boolean]
def css?
  return false unless @content_type

  @content_type.include?("text/css")
end

#encoding ⇒ Object



33
34
35
# File 'lib/archaeo/page.rb', line 33

# Returns the encoding reported by #detect_encoding.
#
# The result is cached in @encoding, so detection only runs again if the
# cached value is nil or false.
#
# @return [Object] whatever #detect_encoding returns
def encoding
  @encoding = detect_encoding unless @encoding
  @encoding
end

#html? ⇒ Boolean

Returns:

  • (Boolean)


37
38
39
# File 'lib/archaeo/page.rb', line 37

# Reports whether the stored content type contains "text/html".
#
# Always returns a real boolean: a missing content type yields false
# rather than nil, matching the documented Boolean return.
#
# @return [Boolean]
def html?
  return false unless @content_type

  @content_type.include?("text/html")
end

#image? ⇒ Boolean

Returns:

  • (Boolean)


45
46
47
# File 'lib/archaeo/page.rb', line 45

# Reports whether the stored content type starts with "image/".
#
# Always returns a real boolean: a missing content type yields false
# rather than nil, matching the documented Boolean return.
#
# @return [Boolean]
def image?
  return false unless @content_type

  @content_type.start_with?("image/")
end

#inspect ⇒ Object



120
121
122
# File 'lib/archaeo/page.rb', line 120

# Short debugging representation: class name, content type and #size.
#
# @return [String] e.g. "#<Archaeo::Page text/html 1234 bytes>"
def inspect
  class_name = self.class.name
  reported_size = size

  "#<#{class_name} #{@content_type} #{reported_size} bytes>"
end

#json? ⇒ Boolean

Returns:

  • (Boolean)


41
42
43
# File 'lib/archaeo/page.rb', line 41

# Reports whether the stored content type contains "application/json".
#
# Always returns a real boolean: a missing content type yields false
# rather than nil, matching the documented Boolean return.
#
# @return [Boolean]
def json?
  return false unless @content_type

  @content_type.include?("application/json")
end


70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/archaeo/page.rb', line 70

# Extracts every <a href> link from an HTML page.
#
# Non-HTML pages yield a fresh empty Array. For HTML pages the raw content
# is parsed once and the resulting list is cached in @links.
#
# Each entry is a Hash with:
#   :href     - the anchor's href resolved by #resolve_page_url against the
#               archive URL (or the original URL when no archive URL is set)
#   :text     - the anchor text with surrounding whitespace stripped
#   :external - falsy when :href is nil/false; otherwise true when the
#               resolved href does not contain #original_domain
#
# @return [Array<Hash{Symbol => Object}>]
def links
  return [] unless html?

  @links ||= begin
    base_url = @archive_url || @original_url
    anchors = Nokogiri::HTML(@raw_content).css("a[href]")

    anchors.map do |node|
      target = resolve_page_url(node["href"], base_url)
      label = node.text.strip
      outside = target && !target.include?(original_domain)

      { href: target, text: label, external: outside }
    end
  end
end

#meta_tags ⇒ Object



84
85
86
87
88
89
90
91
92
93
94
# File 'lib/archaeo/page.rb', line 84

# Collects the page's meta entries plus its canonical link.
#
# Non-HTML pages yield a fresh empty Hash. For HTML pages the raw content
# is parsed once, #extract_meta_entries supplies the base entries, and the
# href of a link[rel="canonical"] element (if present) is added under the
# "canonical" key. The result is cached in @meta_tags.
#
# @return [Hash] the entries returned by #extract_meta_entries, optionally
#   extended with "canonical"
def meta_tags
  return {} unless html?

  @meta_tags ||= begin
    parsed = Nokogiri::HTML(@raw_content)

    extract_meta_entries(parsed).tap do |tags|
      canonical_link = parsed.at_css('link[rel="canonical"]')
      tags["canonical"] = canonical_link["href"].to_s if canonical_link
    end
  end
end

#size ⇒ Object



29
30
31
# File 'lib/archaeo/page.rb', line 29

# Size of the page content in bytes.
#
# Measured on #content (the transcoded body), not on the raw body.
# Uses bytesize rather than length: #inspect labels this value "bytes",
# and length would count characters, under-reporting any content that
# contains multibyte characters.
#
# @return [Integer] number of bytes in #content
def size
  content.bytesize
end

#text? ⇒ Boolean

Returns:

  • (Boolean)


49
50
51
# File 'lib/archaeo/page.rb', line 49

# Reports whether the stored content type starts with "text/".
#
# Always returns a real boolean: a missing content type yields false
# rather than nil, matching the documented Boolean return.
#
# @return [Boolean]
def text?
  return false unless @content_type

  @content_type.start_with?("text/")
end

#title ⇒ Object



61
62
63
64
65
66
67
68
# File 'lib/archaeo/page.rb', line 61

# Returns the text of the page's <title> element, stripped of surrounding
# whitespace, or nil when there is no title or the content cannot be parsed.
#
# The result is computed once. Memoization is keyed on whether @title has
# been assigned, not on its truthiness, so a page without a title is not
# re-parsed on every call.
#
# @return [String, nil]
def title
  return @title if defined?(@title)

  @title = begin
    Nokogiri::HTML(@raw_content).at_css("title")&.text&.strip
  rescue StandardError
    # Best-effort lookup: any parsing failure is reported as "no title".
    nil
  end
end

#to_h ⇒ Object



96
97
98
99
100
101
102
103
104
105
106
# File 'lib/archaeo/page.rb', line 96

# Builds a Hash of the page's metadata.
#
# The timestamp is included as the stored object (not stringified) and the
# encoding as a String. The page body itself is not included.
#
# @return [Hash{Symbol => Object}]
def to_h
  stored_fields = {
    content_type: @content_type,
    status_code: @status_code,
    archive_url: @archive_url,
    original_url: @original_url,
    timestamp: @timestamp,
  }

  stored_fields.merge(size: size, encoding: encoding.to_s)
end