Class: Relaton::Omg::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton/omg/scraper.rb

Constant Summary collapse

URL_PATTERN =
"https://www.omg.org/spec/"

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(acronym, version = nil, spec = nil) ⇒ Scraper

Returns a new instance of Scraper.



10
11
12
13
14
# File 'lib/relaton/omg/scraper.rb', line 10

# Stores the three parts of a reference on the new instance.
#
# @param acronym value kept in @acronym
# @param version value kept in @version (nil when omitted)
# @param spec value kept in @spec (nil when omitted)
def initialize(acronym, version = nil, spec = nil)
  @acronym, @version, @spec = acronym, version, spec
end

Class Method Details

.scrape_page(ref) ⇒ Object



16
17
18
19
20
21
22
23
24
25
# File 'lib/relaton/omg/scraper.rb', line 16

# Parses an OMG reference, loads the matching page and wraps the scraped
# fields in an Omg::ItemData.
#
# The reference has to be "OMG <acronym>", optionally followed by a version
# (digits and dots, optionally " beta" and a single digit) and a spec name,
# each introduced by a whitespace character or "/".
#
# @param ref [String] the reference to resolve
# @return [Omg::ItemData, nil] nil when the reference does not match, when
#   get_doc returns nil, or when no link could be extracted from the page
def self.scrape_page(ref)
  # \A and \z anchor the whole string. ^ and $ match per line, so they would
  # accept a reference that is only one line of a longer multi-line string.
  %r{\AOMG (?<acronym>[^\s]+)(?:[\s/](?<version>[\d.]+(?:\sbeta(?:\s\d)?)?))?(?:[\s/](?<spec>\w+))?\z} =~ ref
  return unless acronym

  scraper = new(acronym, version, spec)
  doc = scraper.get_doc
  return if doc.nil? || scraper.fetch_link.empty?

  Omg::ItemData.new(**scraper.item)
end

Instance Method Details

#doc_version ⇒ Object



79
80
81
# File 'lib/relaton/omg/scraper.rb', line 79

# Text of the node found under the "Version:" entry of the loaded page.
#
# The value is cached in @doc_version once it is truthy. When the page has no
# such node the result is nil instead of a NoMethodError, which is what the
# `if doc_version` guard in fetch_docid expects.
#
# @return [String, nil] the version text, or nil when the node is missing
def doc_version
  @doc_version ||= @doc.at('//dt[.="Version:"]/following-sibling::dd/p/span')&.text
end

#fetch_abstract ⇒ Object



70
71
72
73
# File 'lib/relaton/omg/scraper.rb', line 70

# Wraps the text of the paragraph found in the "document-metadata" section
# in a single English/Latin-script abstract.
#
# @return [Array<Bib::Abstract>] one-element array
def fetch_abstract
  paragraph = @doc.at('//section[@id="document-metadata"]/div/div/p')
  abstract = Bib::Abstract.new(content: paragraph.text, language: "en", script: "Latn")
  [abstract]
end

#fetch_date ⇒ Object



83
84
85
# File 'lib/relaton/omg/scraper.rb', line 83

# Builds the "published" date entry from pub_date, rendered with #to_s.
#
# @return [Array<Bib::Date>] one-element array
def fetch_date
  published_on = pub_date.to_s
  [Bib::Date.new(type: "published", at: published_on)]
end

#fetch_docid ⇒ Object



63
64
65
66
67
68
# File 'lib/relaton/omg/scraper.rb', line 63

# Builds the primary "OMG" document identifier: "OMG", the acronym, then the
# document version and the spec name when each is present, joined by spaces.
#
# @return [Array<Bib::Docidentifier>] one-element array
def fetch_docid
  # Only the trailing parts are optional; "OMG" and the acronym always stay.
  optional_parts = [doc_version, @spec].select { |part| part }
  content = ["OMG", @acronym, *optional_parts].join(" ")
  identifier = Bib::Docidentifier.new(content: content, type: "OMG", primary: true)
  [identifier]
end

#fetch_keyword ⇒ Object



128
129
130
131
132
# File 'lib/relaton/omg/scraper.rb', line 128

# Turns every node listed under the "Categories:" entry into a keyword whose
# vocab is a localized string holding the node text.
#
# @return [Array<Bib::Keyword>] one keyword per matched node
def fetch_keyword
  nodes = @doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em')
  nodes.map do |node|
    label = Bib::LocalizedString.new(content: node.text)
    Bib::Keyword.new(vocab: label)
  end
end

#fetch_license ⇒ Object



134
135
136
137
138
# File 'lib/relaton/omg/scraper.rb', line 134

# Collects the license names shown next to the "IPR Mode" entry.
#
# For each matched node only the first run of word characters, whitespace and
# hyphens is kept, with surrounding whitespace removed; a node without such a
# run contributes an empty string.
#
# @return [Array<String>]
def fetch_license
  xpath = '//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span'
  @doc.xpath(xpath).map do |node|
    node.text[/[\w\s-]+/].to_s.strip
  end
end


97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/relaton/omg/scraper.rb', line 97

# Collects the source links of the loaded page, cached in @links.
#
# With a spec name, only the anchor whose href is "<@url>/<@spec>/PDF" is
# used (type "src"). Without one, the anchor under "This Document:" becomes a
# "src" link and the anchor with class "download-document" a "pdf" link.
# Anchors that are not found are skipped.
#
# @return [Array<Bib::Uri>] possibly empty
def fetch_link
  return @links if @links

  @links = []
  # Appends a link of the given type when the anchor was found.
  add_link = lambda do |type, anchor|
    @links << Bib::Uri.new(type: type, content: anchor[:href]) if anchor
  end

  if @spec
    add_link.call("src", @doc.at("//a[@href='#{@url}/#{@spec}/PDF']"))
  else
    add_link.call("src", @doc.at('//dt[.="This Document:"]/following-sibling::dd/a'))
    add_link.call("pdf", @doc.at('//a[@class="download-document"]'))
  end
  @links
end

#fetch_relation ⇒ Object



113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/relaton/omg/scraper.rb', line 113

# Builds an "obsoletes" relation for every row of the "History" table whose
# version (first cell) differs from doc_version.
#
# The related item's identifier is "OMG <acronym> <version>", where the
# acronym is the fifth "/"-separated segment of the link in the third cell.
#
# @return [Array<Bib::Relation>] possibly empty
def fetch_relation
  rows = @doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
  rows.each_with_object([]) do |row, relations|
    row_version = row.at("td").text
    # The row describing the current document is not a relation.
    next if row_version == doc_version

    row_acronym = row.at("td[3]/a")[:href].split("/")[4]
    label = ["OMG", row_acronym, row_version].join(" ")
    identifier = Bib::Docidentifier.new(content: label, type: "OMG")
    formattedref = Bib::Formattedref.new(content: label)
    related = Bib::ItemBase.new(formattedref: formattedref, docidentifier: [identifier])
    relations << Bib::Relation.new(type: "obsoletes", bibitem: related)
  end
end

#fetch_status ⇒ Object



91
92
93
94
95
# File 'lib/relaton/omg/scraper.rb', line 91

# Builds the status from the "Document Status:" entry; the stage is the first
# run of word characters in that entry's text (empty when there is none).
#
# @return [Bib::Status]
def fetch_status
  node = @doc.at('//dt[.="Document Status:"]/following-sibling::dd')
  word = node.text.strip[/\w+/].to_s
  stage = Bib::Status::Stage.new(content: word)
  Bib::Status.new(stage: stage)
end

#fetch_title ⇒ Object



57
58
59
60
61
# File 'lib/relaton/omg/scraper.rb', line 57

# Builds the main title from the "Title:" entry, followed by ": <spec>" when
# a spec name is set.
#
# @return [Array<Bib::Title>] one-element array
def fetch_title
  node = @doc.at('//dt[.="Title:"]/following-sibling::dd')
  content = @spec ? "#{node.text}: #{@spec}" : node.text
  title = Bib::Title.new(type: "main", content: content, language: "en", script: "Latn")
  [title]
end

#fetch_version ⇒ Object



75
76
77
# File 'lib/relaton/omg/scraper.rb', line 75

# Builds the version entry: pub_date as revision date, doc_version as draft.
#
# @return [Array<Bib::Version>] one-element array
def fetch_version
  revision_date = pub_date
  draft = doc_version
  [Bib::Version.new(revision_date: revision_date, draft: draft)]
end

#get_doc ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/relaton/omg/scraper.rb', line 27

# Builds the page URL from the acronym and version, fetches it with Mechanize
# and keeps the result in @doc (the URL itself is kept in @url).
#
# @return the value returned by the Mechanize agent, or nil when the server
#   answers with response code "404"
# @raise [Relaton::RequestError] on any other response-code error and when
#   opening the connection times out
def get_doc
  @url = "#{URL_PATTERN}#{@acronym}/"
  # Spaces inside the version become path separators: "1.4 beta 1" => "1.4/beta/1".
  @url += @version.tr(" ", "/") if @version
  agent = Mechanize.new
  agent.open_timeout = 10
  @doc = agent.get(@url)
rescue Mechanize::ResponseCodeError => e
  # A missing page is an expected outcome, not an error.
  return if e.response_code == "404"

  raise Relaton::RequestError, "Unable to access #{@url} (#{e.response_code})"
rescue Net::OpenTimeout
  raise Relaton::RequestError, "Unable to access #{@url} (timeout)"
end

#item ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/relaton/omg/scraper.rb', line 41

# Assembles the scraped fields into one hash: today's date under :fetched,
# followed by the result of each fetch_* method under its field name.
#
# @return [Hash{Symbol => Object}]
def item
  # Field name => method producing its value; order is the order of the hash.
  fetchers = {
    docidentifier: :fetch_docid,
    title: :fetch_title,
    abstract: :fetch_abstract,
    version: :fetch_version,
    date: :fetch_date,
    status: :fetch_status,
    source: :fetch_link,
    relation: :fetch_relation,
    keyword: :fetch_keyword,
    license: :fetch_license,
  }
  fetchers.each_with_object({ fetched: ::Date.today.to_s }) do |(field, fetcher), data|
    data[field] = send(fetcher)
  end
end

#pub_date ⇒ Object



87
88
89
# File 'lib/relaton/omg/scraper.rb', line 87

# Parses the text of the "Publication Date:" entry, with surrounding
# whitespace removed, into a date.
#
# @return [Date]
def pub_date
  node = @doc.at('//dt[.="Publication Date:"]/following-sibling::dd')
  ::Date.parse(node.text.strip)
end