Module: Relaton::Gb::SecScraper
- Extended by:
- Core::ArrayWrapper, Scraper
- Defined in:
- lib/relaton/gb/sec_scraper.rb
Overview
Sector standard scraper
Constant Summary
Constants included from Scraper
Class Method Summary collapse
Methods included from Scraper
create_org_name, get_contributors, get_docid, get_status, get_titles, scrapped_data
Class Method Details
.scrape_doc(hit) ⇒ Relaton::Gb::ItemData
43 44 45 46 47 48 49 50 51 52 |
# File 'lib/relaton/gb/sec_scraper.rb', line 43

# Fetch the detail page for a single sector-standard search hit and
# build a bibliographic item from it.
#
# @param hit [Relaton::Gb::Hit] search hit carrying the document pid
# @return [Relaton::Gb::ItemData] item assembled from the scraped page
# @raise [Relaton::RequestError] when the detail page cannot be reached
def scrape_doc(hit)
  src = "https://hbba.sacinfo.org.cn/stdDetail/#{hit.pid}"
  page = Nokogiri::HTML Net::HTTP.get(URI(src))
  ItemData.new(**scrapped_data(page, src, hit))
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
       EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
       Net::ProtocolError, OpenSSL::SSL::SSLError, Errno::ETIMEDOUT,
       Net::OpenTimeout
  raise Relaton::RequestError, "Cannot access #{src}"
end
.scrape_page(text) ⇒ Relaton::Gb::HitCollection
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/relaton/gb/sec_scraper.rb', line 22

# Query the sector-standard search service and wrap the matching
# records as a hit collection.
#
# @param text [String] search query (standard reference or keywords)
# @return [Relaton::Gb::HitCollection] hits built from the JSON records
# @raise [Relaton::RequestError] when the search endpoint cannot be reached
def scrape_page(text)
  uri = URI "https://hbba.sacinfo.org.cn/stdQueryList"
  # The service expects a form-encoded POST with the query under "key".
  response = Net::HTTP.post uri, URI.encode_www_form({ key: text })
  records = JSON.parse(response.body)["records"]
  hits = records.map do |rec|
    Hit.new pid: rec["pk"], docref: rec["code"], status: rec["status"],
            scraper: self
  end
  HitCollection.new hits
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
       EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
       Net::ProtocolError, OpenSSL::SSL::SSLError, Errno::ETIMEDOUT,
       Net::OpenTimeout
  raise Relaton::RequestError, "Cannot access #{uri}"
end