Class: Relaton::Omg::Scraper

Inherits:

Object

Object
Relaton::Omg::Scraper

show all

Defined in: lib/relaton/omg/scraper.rb

Constant Summary collapse

URL_PATTERN =

"https://www.omg.org/spec/"

Class Method Summary collapse

.scrape_page(ref) ⇒ Object

Instance Method Summary collapse

#doc_version ⇒ Object
#fetch_abstract ⇒ Object
#fetch_date ⇒ Object
#fetch_docid ⇒ Object
#fetch_keyword ⇒ Object
#fetch_license ⇒ Object
#fetch_link ⇒ Object
#fetch_relation ⇒ Object
#fetch_status ⇒ Object
#fetch_title ⇒ Object
#fetch_version ⇒ Object
#get_doc ⇒ Object
#initialize(acronym, version = nil, spec = nil) ⇒ Scraper constructor

A new instance of Scraper.
#item ⇒ Object
#pub_date ⇒ Object

Constructor Details

#initialize(acronym, version = nil, spec = nil) ⇒ `Scraper`

Returns a new instance of Scraper.

# File 'lib/relaton/omg/scraper.rb', line 10

# Stores the parts of an OMG reference for later use by the scraper.
#
# @param acronym [String] specification acronym, used to build the page URL
# @param version [String, nil] optional version part of the reference
# @param spec [String, nil] optional trailing spec part of the reference
def initialize(acronym, version = nil, spec = nil)
  @acronym, @version, @spec = acronym, version, spec
end

Class Method Details

.scrape_page(ref) ⇒ `Object`

# File 'lib/relaton/omg/scraper.rb', line 16

# Parses an OMG reference, scrapes the matching page and builds the item.
#
# The reference must have the form "OMG <acronym>[ <version>][ <spec>]",
# where the separators before version and spec may be whitespace or "/".
# The pattern is anchored to the whole string (`\A` ... `\Z`), so a
# reference embedded in one line of a multi-line string no longer matches;
# a single trailing newline is still tolerated.
#
# @param ref [String] reference such as "OMG UML 2.5"
# @return [Object, nil] an Omg::ItemData built from the scraped data, or nil
#   when the reference does not match, the page could not be loaded, or no
#   source link was found on it
def self.scrape_page(ref)
  parsed = %r{\AOMG (?<acronym>[^\s]+)(?:[\s/](?<version>[\d.]+(?:\sbeta(?:\s\d)?)?))?(?:[\s/](?<spec>\w+))?\Z}.match(ref)
  return unless parsed

  scraper = new(parsed[:acronym], parsed[:version], parsed[:spec])
  doc = scraper.get_doc
  # A page without any source link is treated as "not found".
  return if doc.nil? || scraper.fetch_link.empty?

  Omg::ItemData.new(**scraper.item)
end

Instance Method Details

#doc_version ⇒ `Object`



79
80
81

# File 'lib/relaton/omg/scraper.rb', line 79

# Returns the text of the span under the "Version:" entry of the page.
#
# The value is memoized in @doc_version once a non-nil text was found.
# When the page has no such element the method returns nil instead of
# raising NoMethodError, so callers that test the result for presence
# keep working.
#
# @return [String, nil] version text, or nil when the element is missing
def doc_version
  @doc_version ||= @doc.at('//dt[.="Version:"]/following-sibling::dd/p/span')&.text
end

#fetch_abstract ⇒ `Object`

# File 'lib/relaton/omg/scraper.rb', line 70

# Builds the abstract from the first paragraph found in the
# "document-metadata" section of the page.
#
# @return [Array] a one-element array with a Bib::Abstract (language "en",
#   script "Latn"), or an empty array when the page has no such paragraph
def fetch_abstract
  paragraph = @doc.at('//section[@id="document-metadata"]/div/div/p')
  # A page without the paragraph yields no abstract rather than a NoMethodError.
  return [] unless paragraph

  [Bib::Abstract.new(content: paragraph.text, language: "en", script: "Latn")]
end

#fetch_date ⇒ `Object`



83
84
85

# File 'lib/relaton/omg/scraper.rb', line 83

# Wraps the publication date in a single "published" date entry.
#
# @return [Array] one Bib::Date whose `at` is the string form of #pub_date
def fetch_date
  published = Bib::Date.new(type: "published", at: pub_date.to_s)
  [published]
end

#fetch_docid ⇒ `Object`

# File 'lib/relaton/omg/scraper.rb', line 63

# Builds the primary document identifier of type "OMG".
#
# The identifier text is "OMG", the acronym, then the document version and
# the spec when they are present, joined by single spaces.
#
# @return [Array] one Bib::Docidentifier marked as primary
def fetch_docid
  parts = ["OMG", @acronym]
  version = doc_version
  parts.push(version) if version
  parts.push(@spec) if @spec
  identifier = Bib::Docidentifier.new(content: parts.join(" "), type: "OMG", primary: true)
  [identifier]
end

#fetch_keyword ⇒ `Object`

# File 'lib/relaton/omg/scraper.rb', line 128

# Collects one keyword per emphasized link listed under "Categories:".
#
# @return [Array] Bib::Keyword objects whose vocab is a
#   Bib::LocalizedString holding the node text
def fetch_keyword
  category_nodes = @doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em')
  category_nodes.map do |node|
    label = Bib::LocalizedString.new(content: node.text)
    Bib::Keyword.new(vocab: label)
  end
end

#fetch_license ⇒ `Object`

# File 'lib/relaton/omg/scraper.rb', line 134

# Extracts the license text from the spans that follow the "IPR Mode" entry.
#
# For each span only the first run of word characters, whitespace and
# hyphens is kept, with surrounding whitespace removed; a span without such
# a run contributes an empty string.
#
# @return [Array<String>] one string per matching span
def fetch_license
  ipr_spans = @doc.xpath('//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span')
  ipr_spans.map do |span|
    span.text[/[\w\s-]+/].to_s.strip
  end
end

#fetch_link ⇒ `Object`

# File 'lib/relaton/omg/scraper.rb', line 97

# Collects the source links of the document, memoized in @links.
#
# With a spec, only the anchor whose href is "<@url>/<@spec>/PDF" is looked
# up and recorded as "src". Without a spec, the anchor under
# "This Document:" is recorded as "src" and the "download-document" anchor
# as "pdf". Anchors that are not found are skipped.
#
# NOTE(review): when @url ends with "/" the spec lookup contains "//" -
# confirm this matches the hrefs on the page.
#
# @return [Array] Bib::Uri objects, possibly empty
def fetch_link
  return @links if @links

  @links = []
  lookups =
    if @spec
      { "src" => "//a[@href='#{@url}/#{@spec}/PDF']" }
    else
      {
        "src" => '//dt[.="This Document:"]/following-sibling::dd/a',
        "pdf" => '//a[@class="download-document"]',
      }
    end
  lookups.each do |type, xpath|
    anchor = @doc.at(xpath)
    @links << Bib::Uri.new(type: type, content: anchor[:href]) if anchor
  end
  @links
end

#fetch_relation ⇒ `Object`

# File 'lib/relaton/omg/scraper.rb', line 113

# Builds "obsoletes" relations from the rows of the "History" table.
#
# Each row whose first cell differs from the current document version
# yields one relation. The acronym of the related document is the fifth
# "/"-separated segment of the href in the row's third cell.
#
# @return [Array] Bib::Relation objects, one per other version
def fetch_relation
  history_rows = @doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
  history_rows.each_with_object([]) do |row, relations|
    version = row.at("td").text
    # The row describing this very document is not a relation.
    next if version == doc_version

    acronym = row.at("td[3]/a")[:href].split("/")[4]
    identifier = "OMG #{acronym} #{version}"
    docid = Bib::Docidentifier.new(content: identifier, type: "OMG")
    related = Bib::ItemBase.new(
      formattedref: Bib::Formattedref.new(content: identifier),
      docidentifier: [docid],
    )
    relations << Bib::Relation.new(type: "obsoletes", bibitem: related)
  end
end

#fetch_status ⇒ `Object`

# File 'lib/relaton/omg/scraper.rb', line 91

# Reads the document status from the "Document Status:" entry.
#
# Only the first run of word characters of the stripped entry text is used
# as the stage; text without word characters gives an empty stage.
#
# @return [Object] a Bib::Status wrapping a Bib::Status::Stage
def fetch_status
  status_node = @doc.at('//dt[.="Document Status:"]/following-sibling::dd')
  first_word = status_node.text.strip[/\w+/].to_s
  stage = Bib::Status::Stage.new(content: first_word)
  Bib::Status.new(stage: stage)
end

#fetch_title ⇒ `Object`

# File 'lib/relaton/omg/scraper.rb', line 57

# Builds the main title from the "Title:" entry of the page.
#
# When a spec was given, ": <spec>" is appended to the title text.
#
# @return [Array] one Bib::Title of type "main" (language "en", script "Latn")
def fetch_title
  title_text = @doc.at('//dt[.="Title:"]/following-sibling::dd').text
  title_text = "#{title_text}: #{@spec}" if @spec
  main_title = Bib::Title.new(type: "main", content: title_text, language: "en", script: "Latn")
  [main_title]
end

#fetch_version ⇒ `Object`



75
76
77

# File 'lib/relaton/omg/scraper.rb', line 75

# Describes the document version.
#
# @return [Array] one Bib::Version with #pub_date as revision date and
#   #doc_version as draft
def fetch_version
  revision = Bib::Version.new(revision_date: pub_date, draft: doc_version)
  [revision]
end

#get_doc ⇒ `Object`

# File 'lib/relaton/omg/scraper.rb', line 27

# Downloads the specification page and stores it in @doc.
#
# The URL is URL_PATTERN followed by the acronym and "/", then the version
# with every space replaced by "/" when a version is set. The URL is kept
# in @url. The connection open timeout is 10 seconds.
#
# @return [Object, nil] the page returned by Mechanize, or nil when the
#   server answers with response code "404"
# @raise [Relaton::RequestError] on any other response code error and on an
#   open timeout
def get_doc
  @url = "#{URL_PATTERN}#{@acronym}/"
  @url += @version.tr(" ", "/") if @version
  agent = Mechanize.new
  agent.open_timeout = 10
  @doc = agent.get(@url)
rescue Mechanize::ResponseCodeError => e
  # A missing page is an expected outcome, not an error.
  return if e.response_code == "404"

  raise Relaton::RequestError, "Unable to access #{@url} (#{e.response_code})"
rescue Net::OpenTimeout
  raise Relaton::RequestError, "Unable to access #{@url} (timeout)"
end

#item ⇒ `Object`

# File 'lib/relaton/omg/scraper.rb', line 41

# Assembles all scraped attributes of the document into one hash.
#
# The hash starts with today's date under :fetched, followed by the results
# of the individual fetch_* methods, called in the order listed below.
#
# @return [Hash{Symbol => Object}] attributes of the bibliographic item
def item
  data = { fetched: ::Date.today.to_s }
  data[:docidentifier] = fetch_docid
  data[:title] = fetch_title
  data[:abstract] = fetch_abstract
  data[:version] = fetch_version
  data[:date] = fetch_date
  data[:status] = fetch_status
  data[:source] = fetch_link
  data[:relation] = fetch_relation
  data[:keyword] = fetch_keyword
  data[:license] = fetch_license
  data
end

#pub_date ⇒ `Object`



87
88
89

# File 'lib/relaton/omg/scraper.rb', line 87

# Parses the date given in the "Publication Date:" entry of the page.
#
# @return [Date] the parsed publication date
def pub_date
  date_node = @doc.at('//dt[.="Publication Date:"]/following-sibling::dd')
  raw_date = date_node.text.strip
  ::Date.parse(raw_date)
end

Class: Relaton::Omg::Scraper

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(acronym, version = nil, spec = nil) ⇒ Scraper

Class Method Details

.scrape_page(ref) ⇒ Object

Instance Method Details

#doc_version ⇒ Object

#fetch_abstract ⇒ Object

#fetch_date ⇒ Object

#fetch_docid ⇒ Object

#fetch_keyword ⇒ Object

#fetch_license ⇒ Object

#fetch_link ⇒ Object

#fetch_relation ⇒ Object

#fetch_status ⇒ Object

#fetch_title ⇒ Object

#fetch_version ⇒ Object

#get_doc ⇒ Object

#item ⇒ Object

#pub_date ⇒ Object

#initialize(acronym, version = nil, spec = nil) ⇒ `Scraper`

.scrape_page(ref) ⇒ `Object`

#doc_version ⇒ `Object`

#fetch_abstract ⇒ `Object`

#fetch_date ⇒ `Object`

#fetch_docid ⇒ `Object`

#fetch_keyword ⇒ `Object`

#fetch_license ⇒ `Object`

#fetch_link ⇒ `Object`

#fetch_relation ⇒ `Object`

#fetch_status ⇒ `Object`

#fetch_title ⇒ `Object`

#fetch_version ⇒ `Object`

#get_doc ⇒ `Object`

#item ⇒ `Object`

#pub_date ⇒ `Object`