Class: Relaton::Omg::Scraper
- Inherits: Object
  - Object
    - Relaton::Omg::Scraper
- Defined in:
- lib/relaton/omg/scraper.rb
Constant Summary collapse
- URL_PATTERN = "https://www.omg.org/spec/"
Class Method Summary collapse
- .scrape_page(ref) ⇒ Object
Instance Method Summary collapse
- #doc_version ⇒ Object
- #fetch_abstract ⇒ Object
- #fetch_date ⇒ Object
- #fetch_docid ⇒ Object
- #fetch_keyword ⇒ Object
- #fetch_license ⇒ Object
- #fetch_link ⇒ Object
- #fetch_relation ⇒ Object
- #fetch_status ⇒ Object
- #fetch_title ⇒ Object
- #fetch_version ⇒ Object
- #get_doc ⇒ Object
- #initialize(acronym, version = nil, spec = nil) ⇒ Scraper (constructor): a new instance of Scraper.
- #item ⇒ Object
- #pub_date ⇒ Object
Constructor Details
#initialize(acronym, version = nil, spec = nil) ⇒ Scraper
Returns a new instance of Scraper.
10 11 12 13 14 |
# File 'lib/relaton/omg/scraper.rb', line 10

# Stores the reference components; the other instance methods read them
# through @acronym, @version and @spec.
#
# @param acronym [String] specification acronym
# @param version [String, nil] optional version
# @param spec [String, nil] optional spec name
def initialize(acronym, version = nil, spec = nil)
  @acronym, @version, @spec = acronym, version, spec
end
Class Method Details
.scrape_page(ref) ⇒ Object
16 17 18 19 20 21 22 23 24 25 |
# File 'lib/relaton/omg/scraper.rb', line 16

# Parses an OMG reference, fetches the matching page and builds an item.
#
# @param ref [String] reference of the form "OMG <acronym>", optionally
#   followed by a version and a spec name (separated by whitespace or "/")
# @return [Omg::ItemData, nil] nil when ref does not match the pattern,
#   when get_doc returns nil, or when no link could be extracted
def self.scrape_page(ref)
  # \A and \Z anchor the whole string (a single trailing newline is still
  # tolerated). The per-line anchors ^ and $ would also accept a reference
  # that merely starts on a later line of a multi-line string.
  %r{\AOMG (?<acronym>[^\s]+)(?:[\s/](?<version>[\d.]+(?:\sbeta(?:\s\d)?)?))?(?:[\s/](?<spec>\w+))?\Z} =~ ref
  return unless acronym

  scraper = new(acronym, version, spec)
  # get_doc must run first: it loads the page that fetch_link searches.
  return if scraper.get_doc.nil? || scraper.fetch_link.empty?

  Omg::ItemData.new(**scraper.item)
end
Instance Method Details
#doc_version ⇒ Object
79 80 81 |
# File 'lib/relaton/omg/scraper.rb', line 79

# Reads the text of the "Version:" entry from the loaded page.
#
# The value is memoized in @doc_version once it is truthy.
#
# @return [String, nil] version text, or nil when the page has no such
#   entry (fetch_docid already checks for a missing version)
def doc_version
  # Safe navigation: `at` yields nil when the XPath matches nothing.
  @doc_version ||= @doc.at('//dt[.="Version:"]/following-sibling::dd/p/span')&.text
end
#fetch_abstract ⇒ Object
70 71 72 73 |
# File 'lib/relaton/omg/scraper.rb', line 70

# Builds the abstract from the first paragraph of the document-metadata
# section of the loaded page.
#
# @return [Array<Bib::Abstract>] one English abstract, or an empty array
#   when the page has no such paragraph
def fetch_abstract
  paragraph = @doc.at('//section[@id="document-metadata"]/div/div/p')
  # `at` yields nil when nothing matches; report "no abstract" instead of
  # failing with NoMethodError.
  return [] unless paragraph

  [Bib::Abstract.new(content: paragraph.text, language: "en", script: "Latn")]
end
#fetch_date ⇒ Object
83 84 85 |
# File 'lib/relaton/omg/scraper.rb', line 83

# Wraps the publication date in a single "published" date entry.
#
# @return [Array<Bib::Date>] one-element array
def fetch_date
  published = Bib::Date.new(type: "published", at: pub_date.to_s)
  [published]
end
#fetch_docid ⇒ Object
63 64 65 66 67 68 |
# File 'lib/relaton/omg/scraper.rb', line 63

# Builds the primary OMG document identifier from "OMG", the acronym and,
# when present, the page version and the spec name.
#
# @return [Array<Bib::Docidentifier>] one-element array
def fetch_docid
  # Version and spec are optional; only truthy values are appended.
  optional = [doc_version, @spec].select(&:itself)
  content = ["OMG", @acronym].concat(optional).join(" ")
  [Bib::Docidentifier.new(content: content, type: "OMG", primary: true)]
end
#fetch_keyword ⇒ Object
128 129 130 131 132 |
# File 'lib/relaton/omg/scraper.rb', line 128

# Turns every entry listed under "Categories:" into a keyword.
#
# @return [Array<Bib::Keyword>] one keyword per category node
def fetch_keyword
  categories = @doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em')
  categories.map do |node|
    label = Bib::LocalizedString.new(content: node.text)
    Bib::Keyword.new(vocab: label)
  end
end
#fetch_license ⇒ Object
134 135 136 137 138 |
# File 'lib/relaton/omg/scraper.rb', line 134

# Extracts the license names shown next to the "IPR Mode" entry.
#
# @return [Array<String>] for each matching node, the first run of word
#   characters, whitespace and hyphens in its text, stripped ("" when the
#   text contains no such run)
def fetch_license
  ipr_nodes = @doc.xpath('//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span')
  ipr_nodes.map { |node| node.text[/[\w\s-]+/].to_s.strip }
end
#fetch_link ⇒ Object
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
# File 'lib/relaton/omg/scraper.rb', line 97

# Collects the source links of the document; the result is memoized in
# @links.
#
# With a spec name only the anchor pointing at "<url>/<spec>/PDF" is used.
# Otherwise the "This Document:" link (type "src") and the
# "download-document" link (type "pdf") are collected.
#
# @return [Array<Bib::Uri>] possibly empty list of links
def fetch_link
  return @links if @links

  @links = []
  if @spec
    # get_doc leaves a trailing "/" on @url when no version was given;
    # drop it so the href searched for does not contain "//".
    base = @url.to_s.chomp("/")
    a = @doc.at("//a[@href='#{base}/#{@spec}/PDF']")
    @links << Bib::Uri.new(type: "src", content: a[:href]) if a
  else
    a = @doc.at('//dt[.="This Document:"]/following-sibling::dd/a')
    @links << Bib::Uri.new(type: "src", content: a[:href]) if a
    pdf = @doc.at('//a[@class="download-document"]')
    @links << Bib::Uri.new(type: "pdf", content: pdf[:href]) if pdf
  end
  @links
end
#fetch_relation ⇒ Object
113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# File 'lib/relaton/omg/scraper.rb', line 113

# Builds an "obsoletes" relation for every row of the "History" table whose
# version differs from the version of the loaded page.
#
# Rows without a version cell or without a link in the third cell are
# skipped instead of raising NoMethodError.
#
# @return [Array<Bib::Relation>] possibly empty list of relations
def fetch_relation
  rows = @doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
  rows.each_with_object([]) do |row, relations|
    ver = row.at("td")&.text
    next if ver.nil? || ver == doc_version

    link = row.at("td[3]/a")
    href = link && link[:href]
    next unless href

    # The acronym is the fifth "/"-separated segment of the href,
    # e.g. "https://www.omg.org/spec/<acronym>/...".
    acronym = href.split("/")[4]
    id = ["OMG", acronym, ver].join(" ")
    docid = Bib::Docidentifier.new(content: id, type: "OMG")
    bibitem = Bib::ItemBase.new(formattedref: Bib::Formattedref.new(content: id), docidentifier: [docid])
    relations << Bib::Relation.new(type: "obsoletes", bibitem: bibitem)
  end
end
#fetch_status ⇒ Object
91 92 93 94 95 |
# File 'lib/relaton/omg/scraper.rb', line 91

# Builds the status from the first word of the "Document Status:" entry.
#
# @return [Bib::Status] status whose stage content is that first word
#   ("" when the entry text contains no word characters)
def fetch_status
  entry = @doc.at('//dt[.="Document Status:"]/following-sibling::dd')
  first_word = entry.text.strip[/\w+/].to_s
  stage = Bib::Status::Stage.new(content: first_word)
  Bib::Status.new(stage: stage)
end
#fetch_title ⇒ Object
57 58 59 60 61 |
# File 'lib/relaton/omg/scraper.rb', line 57

# Builds the main title from the "Title:" entry, suffixed with
# ": <spec>" when a spec name was given.
#
# @return [Array<Bib::Title>] one-element array with an English title
def fetch_title
  page_title = @doc.at('//dt[.="Title:"]/following-sibling::dd').text
  content = @spec ? "#{page_title}: #{@spec}" : page_title
  [Bib::Title.new(type: "main", content: content, language: "en", script: "Latn")]
end
#fetch_version ⇒ Object
75 76 77 |
# File 'lib/relaton/omg/scraper.rb', line 75

# Combines the publication date and the page version into a version entry.
#
# @return [Array<Bib::Version>] one-element array
def fetch_version
  version = Bib::Version.new(revision_date: pub_date, draft: doc_version)
  [version]
end
#get_doc ⇒ Object
27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/relaton/omg/scraper.rb', line 27

# Builds the page URL from URL_PATTERN, the acronym and the optional
# version (spaces in the version become "/"), then loads it with Mechanize.
# The URL is kept in @url and the loaded page in @doc.
#
# @return [Object, nil] the page returned by Mechanize, or nil when the
#   server answers 404
# @raise [Relaton::RequestError] on any other response code error or when
#   opening the connection times out
def get_doc
  version_path = @version ? @version.tr(" ", "/") : ""
  @url = "#{URL_PATTERN}#{@acronym}/#{version_path}"
  browser = Mechanize.new
  browser.open_timeout = 10
  @doc = browser.get(@url)
rescue Mechanize::ResponseCodeError => e
  # A missing page is an expected outcome, reported as nil.
  return if e.response_code == "404"

  raise Relaton::RequestError, "Unable acces #{@url} (#{e.response_code})"
rescue Net::OpenTimeout
  raise Relaton::RequestError, "Unable acces #{@url} (timeout)"
end
#item ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# File 'lib/relaton/omg/scraper.rb', line 41

# Assembles the attribute hash for the scraped document: today's date
# under :fetched, followed by the result of each fetch_* method.
#
# @return [Hash{Symbol => Object}] attributes, in the order listed below
def item
  # Attribute key => method producing its value; called in this order.
  fetchers = {
    docidentifier: :fetch_docid,
    title: :fetch_title,
    abstract: :fetch_abstract,
    version: :fetch_version,
    date: :fetch_date,
    status: :fetch_status,
    source: :fetch_link,
    relation: :fetch_relation,
    keyword: :fetch_keyword,
    license: :fetch_license,
  }
  fetchers.each_with_object({ fetched: ::Date.today.to_s }) do |(key, fetcher), data|
    data[key] = send(fetcher)
  end
end
#pub_date ⇒ Object
87 88 89 |
# File 'lib/relaton/omg/scraper.rb', line 87

# Parses the "Publication Date:" entry of the loaded page.
#
# The result is memoized in @pub_date, as doc_version does for the
# version, because both fetch_version and fetch_date call this method.
#
# @return [Date] the parsed publication date
def pub_date
  @pub_date ||= ::Date.parse(@doc.at('//dt[.="Publication Date:"]/following-sibling::dd').text.strip)
end