Module: Relaton::Gb::TScraper

Extended by:
Scraper
Defined in:
lib/relaton/gb/t_scraper.rb

Overview

Social standard scraper.

Constant Summary

Constants included from Scraper

Scraper::STAGES

Class Method Summary collapse

Methods included from Scraper

create_org_name, get_contributors, get_docid, get_status, get_titles, scrapped_data

Class Method Details

.agentObject



51
52
53
# File 'lib/relaton/gb/t_scraper.rb', line 51

# Lazily builds the Mechanize agent and keeps it in @agent, so later calls
# get the same object back.
#
# @return [Mechanize]
def agent
  return @agent if @agent

  @agent = Mechanize.new
end

.scrape_doc(hit) ⇒ RelatonGb::GbBibliographicItem

Parameters:

  • hit (RelatonGb::Hit)

    standard’s page path

Returns:

  • (RelatonGb::GbBibliographicItem)


43
44
45
46
47
48
49
# File 'lib/relaton/gb/t_scraper.rb', line 43

# Fetches the page of a single standard and wraps the scraped attributes
# in an ItemData.
#
# @param hit [Hit] search hit; its +pid+ is appended to the site root to
#   form the page URL
# @return [ItemData] built from the keyword arguments produced by
#   +scrapped_data+
# @raise [Relaton::RequestError] when Mechanize fails to fetch the page
def scrape_doc(hit)
  page_url = "http://www.ttbz.org.cn#{hit.pid}"
  page = agent.get(page_url)
  attributes = scrapped_data(page, page_url, hit)
  ItemData.new(**attributes)
rescue Mechanize::Error => error
  raise Relaton::RequestError, "Cannot access #{page_url}: #{error.message}"
end

.scrape_page(text) ⇒ Relaton::Gb::HitCollection

rubocop:disable Metrics/MethodLength, Metrics/AbcSize

Parameters:

  • text (String)

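Returns:

  • (Relaton::Gb::HitCollection, nil)

    the hits found on the search page, or nil when the server responds with 404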

19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/relaton/gb/t_scraper.rb', line 19

# Searches www.ttbz.org.cn for +text+ and turns every link found in the
# result table into a Hit.
#
# @param text [String] reference to search for; each "-" is replaced with
#   an em dash (U+2014) before the query is sent
# @return [HitCollection, nil] the collected hits, or nil when the server
#   answers the search request with 404
# @raise [Relaton::RequestError] on any other Mechanize failure
def scrape_page(text)
  url = "http://www.ttbz.org.cn/Home/Standard?searchType=2&key=" \
        "#{CGI.escape(text.tr('-', [8212].pack('U')))}"
  doc = agent.get(url)
  xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
  t_xpath = "../preceding-sibling::td[4]"
  # The query above swaps "-" for U+2014, so the reference read back from
  # the page has to be mapped the other way. Accept the em dash itself as
  # well as its UTF-8 bytes (E2 80 94) mis-decoded as single characters,
  # including the U+0102 U+00A2 form the previous pattern matched.
  em_dash = /(?:\u0102\u00A2|\u00E2)\u0080\u0094|\u2014/
  hits = doc.xpath(xpath).map do |h|
    docref = h.at(t_xpath).text.gsub(em_dash, "-")
    status = h.at("../preceding-sibling::td[1]").text.delete "\r\n"
    pid = h[:href].sub(%r{/$}, "")
    Hit.new pid: pid, docref: docref, status: status, scraper: self
  end
  HitCollection.new hits
rescue Mechanize::Error => e
  # A 404 is reported to the caller as "no result" rather than as an error.
  return nil if e.is_a?(Mechanize::ResponseCodeError) && e.response_code == "404"

  raise Relaton::RequestError, "Cannot access #{url}: #{e.message}"
end