Class: RelatonJis::Scraper
- Inherits:
-
Object
- Object
- RelatonJis::Scraper
- Defined in:
- lib/relaton_jis/scraper.rb
Constant Summary collapse
- ATTRS =
%i[ fetched title link abstract docid docnumber date type language script docstatus doctype ics contributor editorialgroup structuredidentifier ].freeze
- LANGS =
{ "和文" => { lang: "ja", script: "Jpan" }, "英訳" => { lang: "en", script: "Latn" } }.freeze
- DATETYPES =
{ "発行年月日" => "issued", "確認年月日" => "confirmed" }.freeze
- STATUSES =
{ "有効" => "valid", "廃止" => "withdrawn" }.freeze
Instance Method Summary collapse
- #fetch ⇒ Object
- #fetch_abstract ⇒ Object
- #fetch_contributor ⇒ Object
- #fetch_date ⇒ Object
- #fetch_docid ⇒ Object
- #fetch_docnumber ⇒ Object
- #fetch_docstatus ⇒ Object
- #fetch_doctype ⇒ Object
- #fetch_editorialgroup ⇒ Object
- #fetch_fetched ⇒ Object
- #fetch_ics ⇒ Object
- #fetch_language ⇒ Object
- #fetch_link ⇒ Object
- #fetch_script ⇒ Object
- #fetch_structuredidentifier ⇒ Object
- #fetch_title ⇒ Object
- #fetch_type ⇒ Object
-
#initialize(url) ⇒ Scraper
constructor
A new instance of Scraper.
- #langs_scripts ⇒ Object
Constructor Details
#initialize(url) ⇒ Scraper
Returns a new instance of Scraper.
14 15 16 17 |
# File 'lib/relaton_jis/scraper.rb', line 14
#
# Creates a scraper bound to a single page address.
#
# @param url [String] address of the page that {#fetch} downloads
#   (presumably a JIS standard detail page -- confirm against callers)
def initialize(url)
  @agent = Mechanize.new
  @url = url
end
Instance Method Details
#fetch ⇒ Object
19 20 21 22 23 24 25 |
# File 'lib/relaton_jis/scraper.rb', line 19
#
# Downloads the page at @url, stores its main <section> node in @doc and
# builds a bibliographic item from it.
#
# Every symbol listed in ATTRS is resolved by calling the method named
# `fetch_<attr>`; the results are passed to BibliographicItem.new as
# keyword arguments.
#
# @return [BibliographicItem] item populated from the scraped page
def fetch
  @doc = @agent.get(@url).at "//div[@id='main']/section"
  pairs = ATTRS.map { |name| [name, send("fetch_#{name}")] }
  BibliographicItem.new(**pairs.to_h)
end
#fetch_abstract ⇒ Object
48 49 50 51 52 |
# File 'lib/relaton_jis/scraper.rb', line 48
#
# Collects the cells of the table row whose header is '規格概要'
# ("standard overview") and wraps each one as a Japanese formatted string.
#
# @return [Array<RelatonBib::FormattedString>] empty when the row is absent
def fetch_abstract
  cells = @doc.xpath("./table/tr[th[.='規格概要']]/td")
  cells.map do |cell|
    RelatonBib::FormattedString.new(content: cell.text.strip, language: "ja", script: "Jpan")
  end
end
#fetch_contributor ⇒ Object
112 113 114 115 116 117 118 |
# File 'lib/relaton_jis/scraper.rb', line 112
#
# Turns each cell of the '原案作成団体' ("drafting organization") table row
# into a contribution with the role "author".
#
# @return [Array<RelatonBib::ContributionInfo>] empty when the row is absent
def fetch_contributor
  @doc.xpath("./table/tr[th[.='原案作成団体']]/td").map do |cell|
    org_name = RelatonBib::LocalizedString.new(cell.text.strip, "ja", "Jpan")
    organization = RelatonBib::Organization.new(name: org_name)
    RelatonBib::ContributionInfo.new(entity: organization, role: [{ type: "author" }])
  end
end
#fetch_date ⇒ Object
64 65 66 67 68 69 70 71 72 |
# File 'lib/relaton_jis/scraper.rb', line 64
#
# Builds one date per DATETYPES entry whose Japanese label is found in the
# page's paragraph text. Labels that are not present are skipped.
#
# The date value is the first YYYY-MM-DD substring of the matching text
# node; when the node contains no such substring an empty string is used.
#
# @return [Array<RelatonBib::BibliographicDate>]
def fetch_date
  dates = DATETYPES.map do |label, type|
    text_node = @doc.at("./div/div/div/p/text()[contains(.,'#{label}')]")
    next unless text_node

    day = text_node.text[/\d{4}-\d{2}-\d{2}/].to_s
    RelatonBib::BibliographicDate.new(type: type, on: day)
  end
  dates.compact
end
#fetch_docid ⇒ Object
54 55 56 57 |
# File 'lib/relaton_jis/scraper.rb', line 54
#
# Reads the first text node of the <h2> heading and exposes it as the
# primary identifier of type "JIS".
#
# @return [Array<RelatonBib::DocumentIdentifier>] single-element array
def fetch_docid
  heading = @doc.at("./h2/text()[1]")
  identifier = RelatonBib::DocumentIdentifier.new(id: heading.text.strip, type: "JIS", primary: true)
  [identifier]
end
#fetch_docnumber ⇒ Object
59 60 61 62 |
# File 'lib/relaton_jis/scraper.rb', line 59
#
# Derives the document number from the first text node of the <h2> heading.
#
# The heading is expected to look like "<prefix> <letter> <digits>..." (the
# space between letter and digits is optional); the result is the letter
# immediately followed by the digits, e.g. "JIS X 0208:1997" => "X0208".
#
# @return [String, nil] the document number, or nil when the heading is
#   missing or does not follow the expected pattern
def fetch_docnumber
  heading = @doc.at("./h2/text()[1]")
  return unless heading

  match = heading.text.strip.match(/^\w+\s(\w)\s?(\d+)/)
  # An unexpected heading format must not abort the whole scrape with a
  # NoMethodError on nil.
  return unless match

  "#{match[1]}#{match[2]}"
end
#fetch_docstatus ⇒ Object
95 96 97 98 99 100 |
# File 'lib/relaton_jis/scraper.rb', line 95
#
# Looks for the <span> that follows the paragraph text containing '状態'
# ("status") and maps its content through STATUSES.
#
# @return [RelatonBib::DocumentStatus, nil] nil when no status span is found
def fetch_docstatus
  status_span = @doc.at("./div/div/div/p/text()[contains(.,'状態')]/following-sibling::span")
  RelatonBib::DocumentStatus.new(stage: STATUSES[status_span.text.strip]) if status_span
end
#fetch_doctype ⇒ Object
102 103 104 |
# File 'lib/relaton_jis/scraper.rb', line 102
#
# Document type of every scraped item; not read from the page.
#
# @return [String] always "standard"
def fetch_doctype
  "standard"
end
#fetch_editorialgroup ⇒ Object
120 121 122 123 124 125 126 |
# File 'lib/relaton_jis/scraper.rb', line 120
#
# Uses the first cell of the '原案作成団体' ("drafting organization") table
# row as the name of the single technical committee of the editorial group.
#
# @return [RelatonIsoBib::EditorialGroup, nil] nil when the row is absent
def fetch_editorialgroup
  cell = @doc.at("./table/tr[th[.='原案作成団体']]/td")
  if cell
    committee = RelatonBib::WorkGroup.new(name: cell.text.strip)
    RelatonIsoBib::EditorialGroup.new(technical_committee: [committee])
  end
end
#fetch_fetched ⇒ Object
27 28 29 |
# File 'lib/relaton_jis/scraper.rb', line 27
#
# Date on which the scrape is performed.
#
# @return [String] today's date formatted as YYYY-MM-DD
def fetch_fetched
  Date.today.iso8601
end
#fetch_ics ⇒ Object
106 107 108 109 110 |
# File 'lib/relaton_jis/scraper.rb', line 106
#
# Wraps the stripped text of each cell in the 'ICS' table row as an ICS
# entry.
#
# @return [Array<RelatonIsoBib::Ics>] empty when the row is absent
def fetch_ics
  codes = @doc.xpath("./table/tr[th[.='ICS']]/td").map { |cell| cell.text.strip }
  codes.map { |code| RelatonIsoBib::Ics.new(code) }
end
#fetch_language ⇒ Object
78 79 80 |
# File 'lib/relaton_jis/scraper.rb', line 78
#
# Language codes of the editions reported by {#langs_scripts}.
#
# @return [Array<String>] the :lang value of each entry, in the same order
def fetch_language
  langs_scripts.map do |entry|
    entry[:lang]
  end
end
#fetch_link ⇒ Object
38 39 40 41 42 43 44 45 46 |
# File 'lib/relaton_jis/scraper.rb', line 38
#
# Builds the list of links for the document: the scraped page itself
# (type "src") followed by one "pdf" link per anchor found in the
# 'プレビュー' ("preview") table row.
#
# Anchor hrefs are appended to the scheme and host of @url, so they are
# assumed to be host-relative paths -- TODO confirm against the live page.
#
# @return [Array<RelatonBib::TypedUri>] always starts with the "src" link
def fetch_link
  source = RelatonBib::TypedUri.new(content: @url, type: "src")
  location = URI(@url)
  origin = "#{location.scheme}://#{location.host}"
  previews = @doc.xpath("./table/tr[th[.='プレビュー']]/td/a").map do |anchor|
    RelatonBib::TypedUri.new(content: "#{origin}#{anchor[:href]}", type: "pdf")
  end
  [source, *previews]
end
#fetch_script ⇒ Object
82 83 84 |
# File 'lib/relaton_jis/scraper.rb', line 82
#
# Script codes of the editions reported by {#langs_scripts}.
#
# @return [Array<String>] the :script value of each entry, in the same order
def fetch_script
  langs_scripts.map do |entry|
    entry[:script]
  end
end
#fetch_structuredidentifier ⇒ Object
128 129 130 |
# File 'lib/relaton_jis/scraper.rb', line 128
#
# Builds the structured identifier of type "JIS" whose project number is
# the value returned by {#fetch_docnumber}.
#
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier
  number = fetch_docnumber
  RelatonIsoBib::StructuredIdentifier.new(project_number: number, type: "JIS")
end
#fetch_title ⇒ Object
31 32 33 34 35 36 |
# File 'lib/relaton_jis/scraper.rb', line 31
#
# Reads the document titles from the <h2> heading. The second text node of
# the heading is taken as the Japanese title and the third as the English
# one (the first text node is the identifier, see the docid/docnumber
# readers).
#
# A title whose text node is absent is skipped instead of raising, so a
# page without an English title still yields the Japanese one.
#
# @return [Array<RelatonBib::TypedTitleString>] titles in ja, en order
def fetch_title
  # "Latn" is the script code for Latin, matching the value used in LANGS.
  titles = { "ja" => "Jpan", "en" => "Latn" }.map.with_index do |(lang, script), i|
    node = @doc.at("./h2/text()[#{i + 2}]")
    next unless node

    RelatonBib::TypedTitleString.new content: node.text.strip, language: lang, script: script
  end
  titles.compact
end
#fetch_type ⇒ Object
74 75 76 |
# File 'lib/relaton_jis/scraper.rb', line 74
#
# Bibliographic item type of every scraped item; not read from the page.
#
# @return [String] always "standard"
def fetch_type
  "standard"
end
#langs_scripts ⇒ Object
86 87 88 89 90 91 92 93 |
# File 'lib/relaton_jis/scraper.rb', line 86
#
# Determines which LANGS editions the page lists. For every LANGS label
# the <span> following the span that contains the label is inspected; the
# edition counts as available when that span exists and its text is not
# "-". The result is memoized in @langs_scripts.
#
# @return [Array<Hash>] the LANGS values ({ lang:, script: }) of the
#   available editions, in LANGS order
def langs_scripts
  @langs_scripts ||= begin
    available = LANGS.select do |label, _|
      span = @doc.at("./div/div/div[@class='blockContentFile']/div/div/p[1]/span[contains(.,'#{label}')]/following-sibling::span")
      !span.nil? && span.text.strip != "-"
    end
    available.values
  end
end