Class: Relaton::Iso::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton/iso/scraper.rb

Overview

Scraper.

Constant Summary collapse

DOMAIN =

rubocop:disable Metrics/ModuleLength

"https://www.iso.org"
# Document type names keyed by type abbreviation.
TYPES = {
  "TS" => "technical-specification",
  "DTS" => "technical-specification",
  "TR" => "technical-report",
  "DTR" => "technical-report",
  "PAS" => "publicly-available-specification",
  # Abbreviations below are intentionally left unmapped:
  # "AWI" => "approvedWorkItem",
  # "CD" => "committeeDraft",
  # "FDIS" => "finalDraftInternationalStandard",
  # "NP" => "newProposal",
  # "DIS" => "draftInternationalStandard",
  # "WD" => "workingDraft",
  # "R" => "recommendation",
  "Guide" => "guide",
  "ISO" => "international-standard",
  "IEC" => "international-standard",
  "IWA" => "international-workshop-agreement",
}.freeze
# Abbreviations keyed by a two-digit code. The "60" entry is itself a hash
# keyed by a second two-digit code.
# NOTE(review): presumably stage / substage codes - confirm against callers.
STGABBR = {
  "00" => "NWIP",
  "10" => "AWI",
  "20" => "WD",
  "30" => "CD",
  "40" => "DIS",
  "50" => "FDIS",
  "60" => { "00" => "PRF", "60" => "FINAL" },
}.freeze
# Publishing organizations keyed by abbreviation. Each entry carries the
# organization's full name and its URI (host only, no scheme).
PUBLISHERS = {
  "IEC" => { name: "International Electrotechnical Commission", uri: "www.iec.ch" },
  "ISO" => { name: "International Organization for Standardization", uri: "www.iso.org" },
  "IEEE" => { name: "Institute of Electrical and Electronics Engineers", uri: "www.ieee.org" },
  "SAE" => { name: "SAE International", uri: "www.sae.org" },
  # The name previously started with a stray space, which would leak into output.
  "CIE" => { name: "International Commission on Illumination", uri: "cie.co.at" },
  "ASME" => { name: "American Society of Mechanical Engineers", uri: "www.asme.org" },
}.freeze

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lang, errors) ⇒ Scraper

extend self



49
50
51
52
# File 'lib/relaton/iso/scraper.rb', line 49

# Build a scraper bound to one language and one error collection.
#
# @param lang [String, nil] language
# @param errors [Hash] collection of parsing errors
def initialize(lang, errors)
  @errors = errors
  @lang = lang
end

Class Method Details

.parse_page(path, lang: nil, errors: {}) ⇒ RelatonIsoBib::IsoBibliographicItem

Parse page.

Parameters:

  • path (String)

    page path

  • lang (String, nil) (defaults to: nil)

    language

  • errors (Hash) (defaults to: {})

    collection of parsing errors

Returns:

  • (RelatonIsoBib::IsoBibliographicItem)


59
60
61
# File 'lib/relaton/iso/scraper.rb', line 59

# Parse a page with a freshly created scraper instance.
#
# @param path [String] page path
# @param lang [String, nil] language
# @param errors [Hash] collection of parsing errors
# @return [RelatonIsoBib::IsoBibliographicItem]
def self.parse_page(path, lang: nil, errors: {})
  scraper = new(lang, errors)
  scraper.parse(path)
end

Instance Method Details

#edition ⇒ Object



119
120
121
122
123
124
125
# File 'lib/relaton/iso/scraper.rb', line 119

# Edition taken from the last text node of the page's "Edition" block.
#
# The result is memoized, including a nil result when the block is absent.
#
# @return [Bib::Edition, nil] edition built from the trailing digits of the
#   text, or nil when the page has no edition node
def edition
  return @edition if instance_variable_defined?(:@edition)

  node = @doc.at("//div[div[.='Edition']]/text()[last()]")
  # `&&=` only rewrites a flag that is already truthy; a missing or falsy
  # flag is left untouched.
  @errors[:edition] &&= node.nil?
  @edition = node && Bib::Edition.new(content: node.text[/\d+$/].to_s)
end

#fetch_relaton_docids ⇒ Array<RelatonBib::DocumentIdentifier>

Create document ids.

Returns:

  • (Array<RelatonBib::DocumentIdentifier>)


132
133
134
135
136
137
138
# File 'lib/relaton/iso/scraper.rb', line 132

# Create document ids: the primary "ISO" identifier, the "iso-reference"
# identifier and the "URN" identifier, in that order.
#
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_relaton_docids
  primary = Docidentifier.new(content: pubid, type: "ISO", primary: true)
  reference = Docidentifier.new(content: isoref, type: "iso-reference")
  urn_id = Docidentifier.new(content: urn, type: "URN")
  [primary, reference, urn_id]
end

#id ⇒ Object



95
96
97
98
99
100
101
# File 'lib/relaton/iso/scraper.rb', line 95

# Document reference read from the first span of the page heading.
#
# Only the part before the first " | " separator is kept, stripped of
# surrounding whitespace. The result is memoized, including nil.
#
# @return [String, nil] the reference, or nil when the heading span is absent
def id
  return @id if instance_variable_defined?(:@id)

  heading = @doc.at("//h1/span[1]")
  # `&&=` only rewrites a flag that is already truthy.
  @errors[:id] &&= heading.nil?
  @id = heading && heading.text.split(" | ")[0].strip
end

#isoref ⇒ String

Create ISO reference identifier with English language.

Returns:

  • (String)

    English reference identifier



145
146
147
148
# File 'lib/relaton/iso/scraper.rb', line 145

# Create ISO reference identifier with English language.
#
# The identifier is rebuilt from the pubid attributes, minus :typed_stage,
# and rendered in the short reference-number format.
#
# @return [String] English reference identifier
def isoref
  attributes = pubid.to_h.except(:typed_stage)
  english = ::Pubid::Iso::Identifier.create(language: "en", **attributes)
  english.to_s(format: :ref_num_short)
end

#parse(path) ⇒ Object

rubocop:disable Metrics/AbcSize,Metrics/MethodLength



63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/relaton/iso/scraper.rb', line 63

# Fetch the page at +path+ and assemble a bibliographic item from it.
#
# The fetched document and URL are stored in @doc and @url, which the
# other fetch helpers rely on.
#
# @param path [String] page path
# @return [Object] the ItemData instance built from the page
def parse(path) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  @doc, @url = get_page path
  titles, abstract, langs = fetch_titles_abstract

  contributors = fetch_contributors
  editorial_group = fetch_editorialgroup_contributor
  contributors.push(editorial_group) if editorial_group

  # Entries are evaluated top to bottom, so the helpers run in this order.
  attributes = {
    id: id.gsub(/\W/, ""),
    # fetched: Date.today.to_s,
    type: "standard",
    docidentifier: fetch_relaton_docids,
    docnumber: fetch_docnumber,
    edition: edition,
    language: langs.map { |entry| entry[:lang] },
    script: langs.map { |entry| script(entry[:lang]) }.uniq,
    title: titles,
    status: fetch_status,
    ics: fetch_ics,
    date: fetch_dates,
    contributor: contributors,
    abstract: abstract,
    copyright: fetch_copyright,
    source: fetch_source(@url),
    relation: fetch_relations,
    place: [Bib::Place.new(city: "Geneva")],
    structuredidentifier: fetch_structuredidentifier,
    ext: parse_ext,
  }
  ItemData.new(**attributes)
end

#pubid ⇒ Object

rubocop:disable Metrics/AbcSize



103
104
105
106
107
108
109
110
111
# File 'lib/relaton/iso/scraper.rb', line 103

# Parsed publication identifier for the page, memoized once built.
#
# When the parsed identifier has a base, the root identifier's edition is
# filled in from the page edition unless it is already set. A page without
# an edition leaves the identifier's edition untouched instead of failing.
#
# Any StandardError raised while building the identifier is reported via
# Util.error, whose result is returned; nothing is memoized in that case,
# so a later call tries again.
#
# @return [Object] the parsed identifier, or the result of Util.error on failure
def pubid # rubocop:disable Metrics/AbcSize
  return @pubid if @pubid

  parsed = ::Pubid::Iso::Identifier.parse(id)
  # `edition` is nil when the page has no edition block, hence `&.`.
  parsed.root.edition ||= edition&.content if parsed.base
  # Memoize only after the identifier is fully built so a failure above
  # cannot leave a half-initialized value behind.
  @pubid = parsed
rescue StandardError => e
  Util.error "Failed to parse pubid from #{id}: #{e.message}"
end

#urn ⇒ Object



113
114
115
116
117
# File 'lib/relaton/iso/scraper.rb', line 113

# Copy of the publication identifier whose stage, when not already set, is
# filled in from the page's stage code. The memoized pubid is not modified.
#
# @return [Object] the duplicated identifier
def urn
  pubid.dup.tap do |copy|
    copy.stage ||= ::Pubid::Iso::Identifier.parse_stage(stage_code)
  end
end