Module: Relaton::Gb::Scraper
- Included in:
- GbScraper, SecScraper, TScraper
- Defined in:
- lib/relaton/gb/scraper.rb
Overview
Common scraping methods.
Constant Summary collapse
- STAGES =
{ "即将实施" => "published", "现行" => "activated", "废止" => "obsoleted", "被代替" => "replaced" }.freeze
Instance Method Summary collapse
- #create_org_name(lang, name, gbtype) ⇒ Relaton::Bib::TypedLocalizedString?
- #get_contributors(doc, docref) ⇒ Array<Relaton::Bib::Contributor>
- #get_docid(docref) ⇒ Array<Relaton::Bib::Docidentifier>
- #get_status(doc, status = nil) ⇒ Relaton::Bib::Status?
- #get_titles(doc) ⇒ Array<Relaton::Bib::Title>
- #scrapped_data(doc, src, hit) ⇒ Hash
Instance Method Details
#create_org_name(lang, name, gbtype) ⇒ Relaton::Bib::TypedLocalizedString?
63 64 65 66 67 68 69 |
# File 'lib/relaton/gb/scraper.rb', line 63
#
# Build a localized organization name for one language.
#
# @param lang [String] language code; handed to GbAgencies::Agencies.new and
#   used as the language of the resulting string
# @param name [String] agency name/prefix passed to `standard_agency1`
# @param gbtype [#scope, #mandate] object supplying the scope and mandate
#   arguments for the agency lookup
# @return [Relaton::Bib::TypedLocalizedString, nil] nil when the agency
#   lookup yields no content
def create_org_name(lang, name, gbtype)
  agencies = GbAgencies::Agencies.new(lang, {}, "")
  agency_name = agencies.standard_agency1(gbtype.scope, name, gbtype.mandate)

  # An `if` without `else` evaluates to nil when the lookup found nothing.
  if agency_name
    Bib::TypedLocalizedString.new(language: lang, content: agency_name)
  end
end
#get_contributors(doc, docref) ⇒ Array<Relaton::Bib::Contributor>
47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/relaton/gb/scraper.rb', line 47
#
# Build the publisher contributor list for a document reference.
#
# The leading non-whitespace token of +docref+ is used as the agency name.
# Unless that token starts with "GB", a trailing "/T" or "/Z" is removed
# from it before the agency lookup.
#
# @param doc [Object] scraped document; only forwarded to `get_gbtype`
# @param docref [String] document reference, e.g. "GB/T 20223-2006"
# @return [Array<Relaton::Bib::Contributor>] a single publisher contributor,
#   or an empty array when no organization name could be created
def get_contributors(doc, docref)
  name = docref[/^[^\s]+/].to_s
  # Non-mutating `sub` on purpose: when nothing matched, `name` comes from
  # `nil.to_s`, which is a frozen string on Ruby 2.7+, so `sub!` would raise
  # FrozenError.
  name = name.sub(%r{/[TZ]$}, "") unless name.start_with?("GB")
  gbtype = get_gbtype(doc, docref)
  org_names = %w[en zh].map { |l| create_org_name(l, name, gbtype) }.compact
  return [] if org_names.empty?

  org = Bib::Organization.new name: org_names
  role = Bib::Contributor::Role.new type: "publisher"
  [Bib::Contributor.new(organization: org, role: [role])]
end
#get_docid(docref) ⇒ Array<Relaton::Bib::Docidentifier>
40 41 42 |
# File 'lib/relaton/gb/scraper.rb', line 40
#
# Wrap a document reference in a one-element list of identifiers.
#
# @param docref [String] document reference used as the identifier content
# @return [Array<Docidentifier>] a single primary identifier of type
#   "Chinese Standard"
def get_docid(docref)
  identifier = Docidentifier.new(
    content: docref,
    type: "Chinese Standard",
    primary: true,
  )
  [identifier]
end
#get_status(doc, status = nil) ⇒ Relaton::Bib::Status?
85 86 87 88 89 90 91 |
# File 'lib/relaton/gb/scraper.rb', line 85
#
# Translate a status label into a bibliographic status.
#
# @param doc [#at] scraped document; queried with an XPath for the
#   '标准状态' table cell only when +status+ is not given
# @param status [String, nil] status label; looked up in STAGES
# @return [Relaton::Bib::Status, nil] nil when the label has no STAGES entry
def get_status(doc, status = nil)
  label = status || doc.at("//td[contains(., '标准状态')]/span")&.text&.strip
  stage_name = STAGES[label]

  # Evaluates to nil when the label is unknown (no STAGES entry).
  Bib::Status.new(stage: Bib::Status::Stage.new(content: stage_name)) if stage_name
end
#get_titles(doc) ⇒ Array<Relaton::Bib::Title>
73 74 75 76 77 78 79 80 |
# File 'lib/relaton/gb/scraper.rb', line 73
#
# Extract the Chinese and English titles from a scraped document.
#
# The Chinese title is read from the <b> element inside the table cell
# labelled '中文标准名称'. The English title is the first run of ASCII word
# characters and whitespace in the cell labelled '英文标准名称'. A missing
# cell simply contributes no title instead of raising NoMethodError on nil.
#
# @param doc [#at] scraped document that answers XPath queries via `at`
# @return [Array<Relaton::Bib::Title>] Chinese titles followed by English
#   titles; empty when neither cell is present
def get_titles(doc)
  zh_node = doc.at("//td[contains(text(), '中文标准名称')]/b")
  titles = zh_node ? Relaton::Bib::Title.from_string(zh_node.text, "zh", "Hans") : []

  en_node = doc.at("//td[contains(text(), '英文标准名称')]")
  en_text = en_node ? en_node.text : ""
  ten = en_text.match(/[\w\s]+/).to_s
  # The regex also accepts pure whitespace (e.g. leading indentation of the
  # cell); such a match is not a usable title.
  return titles if ten.strip.empty?

  titles + Relaton::Bib::Title.from_string(ten, "en", "Latn")
end
#scrapped_data(doc, src, hit) ⇒ Hash
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/relaton/gb/scraper.rb', line 22
#
# Assemble the attribute hash describing one scraped standard.
#
# @param doc [Object] scraped document, forwarded to the title, contributor,
#   status, date and ext helpers
# @param src [Object] value forwarded to `get_source`
# @param hit [#docref, #status] search hit providing the document reference
#   and status
# @return [Hash] attributes keyed by :fetched, :type, :docidentifier, :title,
#   :contributor, :status, :source, :date, :language, :script and :ext
def scrapped_data(doc, src, hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  # Keys are added one by one in the original order so that helper calls run
  # in the same sequence and the hash keeps the same key order.
  data = { fetched: Date.today.to_s, type: "standard" }
  data[:docidentifier] = get_docid(hit.docref)
  data[:title] = get_titles(doc)
  data[:contributor] = get_contributors(doc, hit.docref)
  data[:status] = get_status(doc, hit.status)
  data[:source] = get_source(src)
  data[:date] = get_dates(doc)
  data[:language] = ["zh"]
  data[:script] = ["Hans"]
  data[:ext] = get_ext(doc, hit.docref)
  data
end