Module: Relaton::Gb::Scraper

Included in:
GbScraper, SecScraper, TScraper
Defined in:
lib/relaton/gb/scraper.rb

Overview

Common scraping methods.

Constant Summary collapse

# Maps the status label found on a standard's page (or supplied by a hit)
# to the stage name stored in the bibliographic status.
STAGES = {
  "即将实施" => "published", # "about to come into force"
  "现行" => "activated",     # "currently in force"
  "废止" => "obsoleted",     # "abolished"
  "被代替" => "replaced",    # "superseded"
}.freeze

Instance Method Summary collapse

Instance Method Details

#create_org_name(lang, name, gbtype) ⇒ Relaton::Bib::TypedLocalizedString?

Parameters:

  • lang (String)
  • name (String)
  • gbtype (Hash)

Returns:

  • (Relaton::Bib::TypedLocalizedString, nil)


63
64
65
66
67
68
69
# File 'lib/relaton/gb/scraper.rb', line 63

# Looks up the issuing agency's name in the given language and wraps it
# in a localized string.
#
# @param lang [String] language code handed to GbAgencies and stored on the result
# @param name [String] agency prefix passed to `standard_agency1`
# @param gbtype [#scope, #mandate] object whose `scope` and `mandate` are
#   forwarded to `standard_agency1`
# @return [Relaton::Bib::TypedLocalizedString, nil] nil when the agency
#   lookup yields nothing
def create_org_name(lang, name, gbtype)
  agencies = GbAgencies::Agencies.new(lang, {}, "")
  agency_name = agencies.standard_agency1(gbtype.scope, name, gbtype.mandate)

  # Falls through to nil when no agency name was resolved.
  Bib::TypedLocalizedString.new(language: lang, content: agency_name) if agency_name
end

#get_contributors(doc, docref) ⇒ Array<Relaton::Bib::Contributor>

Parameters:

  • doc (Nokogiri::HTML::Document)
  • docref (String)

Returns:

  • (Array<Relaton::Bib::Contributor>)


47
48
49
50
51
52
53
54
55
56
57
# File 'lib/relaton/gb/scraper.rb', line 47

# Builds the publisher contributor for a standard from its reference.
#
# @param doc [Nokogiri::HTML::Document] scraped page, forwarded to `get_gbtype`
# @param docref [String] document reference; its first whitespace-free token
#   is used as the agency prefix
# @return [Array<Relaton::Bib::Contributor>] one publisher contributor, or an
#   empty array when no organization name could be created
def get_contributors(doc, docref)
  # Leading whitespace-free token of the reference; "" when there is none.
  name = docref.match(/^[^\s]+/).to_s
  # Strip a trailing "/T" or "/Z" from prefixes that do not start with "GB".
  # Uses non-destructive `sub`: when the match above fails, `nil.to_s` is a
  # frozen string on Ruby 2.7+, and `sub!` on it raises FrozenError.
  name = name.sub(%r{/[TZ]$}, "") unless name =~ /^GB/
  gbtype = get_gbtype(doc, docref)
  org_names = %w[en zh].map { |l| create_org_name(l, name, gbtype) }.compact
  return [] unless org_names.any?

  org = Bib::Organization.new name: org_names
  role = Bib::Contributor::Role.new type: "publisher"
  [Bib::Contributor.new(organization: org, role: [role])]
end

#get_docid(docref) ⇒ Array<Relaton::Bib::Docidentifier>

Parameters:

  • docref (String)

Returns:

  • (Array<Relaton::Bib::Docidentifier>)


40
41
42
# File 'lib/relaton/gb/scraper.rb', line 40

# Wraps the document reference in a single primary identifier.
#
# @param docref [String] document reference used as the identifier content
# @return [Array<Relaton::Bib::Docidentifier>] one-element array holding the
#   primary "Chinese Standard" identifier
def get_docid(docref)
  primary_id = Docidentifier.new(
    content: docref,
    type: "Chinese Standard",
    primary: true,
  )
  [primary_id]
end

#get_status(doc, status = nil) ⇒ Relaton::Bib::Status?

Parameters:

  • doc (Nokogiri::HTML::Document)
  • status (String, NilClass) (defaults to: nil)

Returns:

  • (Relaton::Bib::Status, nil)


85
86
87
88
89
90
91
# File 'lib/relaton/gb/scraper.rb', line 85

# Resolves the document status to a bibliographic status object.
#
# @param doc [Nokogiri::HTML::Document] page consulted only when `status`
#   is not supplied
# @param status [String, nil] status label; when absent it is read from the
#   span inside the table cell containing "标准状态"
# @return [Relaton::Bib::Status, nil] nil when the label is not a key of STAGES
def get_status(doc, status = nil)
  label = status || doc.at("//td[contains(., '标准状态')]/span")&.text&.strip
  stage_name = STAGES[label]
  return if stage_name.nil?

  Bib::Status.new(stage: Bib::Status::Stage.new(content: stage_name))
end

#get_titles(doc) ⇒ Array<Relaton::Bib::Title>

Parameters:

  • doc (Nokogiri::HTML::Document)

Returns:

  • (Array<Relaton::Bib::Title>)


73
74
75
76
77
78
79
80
# File 'lib/relaton/gb/scraper.rb', line 73

# Extracts the Chinese title and, when present, the English title.
#
# @param doc [Nokogiri::HTML::Document] scraped standard page
# @return [Array<Relaton::Bib::Title>] Chinese titles, followed by English
#   titles when an English title was found
def get_titles(doc)
  # NOTE(review): `.text` is called directly on the lookup result, so a page
  # without these cells presumably raises NoMethodError — confirm intended.
  zh_text = doc.at("//td[contains(text(), '中文标准名称')]/b").text
  zh_titles = Relaton::Bib::Title.from_string(zh_text, "zh", "Hans")

  # The English title is the first run of ASCII word/space characters in the
  # cell. NOTE(review): punctuation ends the run, so titles containing it are
  # presumably truncated — verify against real pages.
  en_cell = doc.at("//td[contains(text(), '英文标准名称')]").text
  en_text = en_cell[/[\w\s]+/].to_s

  if en_text.empty?
    zh_titles
  else
    zh_titles + Relaton::Bib::Title.from_string(en_text, "en", "Latn")
  end
end

#scrapped_data(doc, src, hit) ⇒ Hash

Parameters:

  • doc (Nokogiri::HTML::Document)
  • src (String)
  • hit (RelatonGb::Hit)

Returns:

  • (Hash)


22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/relaton/gb/scraper.rb', line 22

# Assembles the attribute hash for a scraped standard.
#
# @param doc [Nokogiri::HTML::Document] scraped standard page
# @param src [String] source value handed to `get_source`
# @param hit [RelatonGb::Hit] search hit providing `docref` and `status`
# @return [Hash] attributes keyed by symbol, in the order assigned below
def scrapped_data(doc, src, hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  data = { fetched: Date.today.to_s, type: "standard" }
  data[:docidentifier] = get_docid(hit.docref)
  data[:title] = get_titles(doc)
  data[:contributor] = get_contributors(doc, hit.docref)
  data[:status] = get_status(doc, hit.status)
  data[:source] = get_source(src)
  data[:date] = get_dates(doc)
  data[:language] = ["zh"]
  data[:script] = ["Hans"]
  data[:ext] = get_ext(doc, hit.docref)
  data
end