Class: RelatonJis::Scraper
- Inherits:
-
Object
- Object
- RelatonJis::Scraper
- Defined in:
- lib/relaton_jis/scraper.rb
Constant Summary collapse
- ATTRS =
%i[ fetched title link abstract docid docnumber date type language script docstatus doctype ics contributor editorialgroup structuredidentifier ].freeze
- LANGS =
{ "和文" => { lang: "ja", script: "Jpan" }, "英訳" => { lang: "en", script: "Latn" } }.freeze
- DATETYPES =
{ "発行年月日" => "issued", "確認年月日" => "confirmed" }.freeze
- STATUSES =
{ "有効" => "valid", "廃止" => "withdrawn" }.freeze
Instance Method Summary collapse
- #document_id ⇒ Object
- #fetch ⇒ Object
- #fetch_abstract ⇒ Object
- #fetch_contributor ⇒ Object
- #fetch_date ⇒ Object
- #fetch_docid ⇒ Object
- #fetch_docnumber ⇒ Object
- #fetch_docstatus ⇒ Object
- #fetch_doctype ⇒ Object
- #fetch_editorialgroup ⇒ Object
- #fetch_fetched ⇒ Object
- #fetch_ics ⇒ Object
- #fetch_language ⇒ Object
- #fetch_link ⇒ Object
- #fetch_script ⇒ Object
- #fetch_structuredidentifier ⇒ Object
- #fetch_title ⇒ Object
- #fetch_type ⇒ Object
-
#initialize(url) ⇒ Scraper
constructor
A new instance of Scraper.
- #langs_scripts ⇒ Object
Constructor Details
#initialize(url) ⇒ Scraper
Returns a new instance of Scraper.
14 15 16 17 |
# File 'lib/relaton_jis/scraper.rb', line 14
#
# Creates a scraper bound to one page address and gives it its own
# Mechanize agent. Nothing is requested until #fetch is called.
#
# @param url [String] address of the page to scrape (presumably a String;
#   it is later handed to Mechanize#get and Kernel#URI)
def initialize(url)
  @agent = Mechanize.new
  @url = url
end
Instance Method Details
#document_id ⇒ Object
63 64 65 |
# File 'lib/relaton_jis/scraper.rb', line 63
#
# Returns the stripped text of the first text node under the section's
# <h2>. The value is computed once and cached in @document_id.
#
# @return [String] the identifier text
def document_id
  return @document_id if @document_id

  heading_text = @doc.at("./h2/text()[1]")
  @document_id = heading_text.text.strip
end
#fetch ⇒ Object
19 20 21 22 23 24 25 |
# File 'lib/relaton_jis/scraper.rb', line 19
#
# Downloads the page at @url, keeps the main <section> in @doc, then calls
# the matching fetch_<attr> method for every entry of ATTRS and builds a
# BibliographicItem from the collected values.
#
# @return [BibliographicItem]
def fetch
  page = @agent.get(@url)
  @doc = page.at("//div[@id='main']/section")
  # ATTRS order is preserved, so the fetch_* methods run in the same sequence.
  pairs = ATTRS.map { |name| [name, send("fetch_#{name}")] }
  BibliographicItem.new(**pairs.to_h)
end
#fetch_abstract ⇒ Object
48 49 50 51 52 |
# File 'lib/relaton_jis/scraper.rb', line 48
#
# Wraps every <td> of the table rows whose header cell is exactly
# '規格概要' in a formatted string tagged as Japanese ("ja"/"Jpan").
#
# @return [Array<RelatonBib::FormattedString>] empty when no such row exists
def fetch_abstract
  cells = @doc.xpath("./table/tr[th[.='規格概要']]/td")
  cells.map do |cell|
    RelatonBib::FormattedString.new(content: cell.text.strip, language: "ja", script: "Jpan")
  end
end
#fetch_contributor ⇒ Object
120 121 122 123 124 125 126 |
# File 'lib/relaton_jis/scraper.rb', line 120
#
# Turns every <td> of the table rows headed '原案作成団体' into a
# contribution entry: the cell text becomes the organization name
# (tagged "ja"/"Jpan") and the role is "author".
#
# @return [Array<RelatonBib::ContributionInfo>] empty when no such row exists
def fetch_contributor
  cells = @doc.xpath("./table/tr[th[.='原案作成団体']]/td")
  cells.map do |cell|
    org_name = RelatonBib::LocalizedString.new(cell.text.strip, "ja", "Jpan")
    organization = RelatonBib::Organization.new(name: org_name)
    RelatonBib::ContributionInfo.new(entity: organization, role: [{ type: "author" }])
  end
end
#fetch_date ⇒ Object
67 68 69 70 71 72 73 74 75 |
# File 'lib/relaton_jis/scraper.rb', line 67
#
# Collects one date per DATETYPES entry. For each label the first text node
# containing that label is located and the first YYYY-MM-DD substring in it
# is used as the date value.
#
# A label is skipped when its text node is missing, and also when the node
# is present but holds no YYYY-MM-DD value, so no date with an empty `on`
# is ever produced.
#
# @return [Array<RelatonBib::BibliographicDate>] possibly empty
def fetch_date
  DATETYPES.each_with_object([]) do |(key, type), dates|
    node = @doc.at("./div/div/div/p/text()[contains(.,'#{key}')]")
    next unless node

    # String#[] with a regexp yields nil (not "") when nothing matches.
    on = node.text[/\d{4}-\d{2}-\d{2}/]
    next unless on

    dates << RelatonBib::BibliographicDate.new(type: type, on: on)
  end
end
#fetch_docid ⇒ Object
54 55 56 |
# File 'lib/relaton_jis/scraper.rb', line 54
#
# Wraps #document_id in a single primary identifier of type "JIS".
#
# @return [Array<RelatonBib::DocumentIdentifier>] one-element array
def fetch_docid
  identifier = RelatonBib::DocumentIdentifier.new(id: document_id, type: "JIS", primary: true)
  [identifier]
end
#fetch_docnumber ⇒ Object
58 59 60 61 |
# File 'lib/relaton_jis/scraper.rb', line 58
#
# Derives the document number from #document_id: a leading word, one
# whitespace, a single word character, an optional whitespace and a run of
# digits. The single character and the digits are concatenated, so
# "JIS X 0208:1997" gives "X0208".
#
# @return [String, nil] the number, or nil when the identifier does not
#   have that shape (instead of raising NoMethodError on a nil match)
def fetch_docnumber
  match = document_id.match(/^\w+\s(\w)\s?(\d+)/)
  return unless match

  "#{match[1]}#{match[2]}"
end
#fetch_docstatus ⇒ Object
98 99 100 101 102 103 |
# File 'lib/relaton_jis/scraper.rb', line 98
#
# Looks for the <span> that follows the text node containing '状態' and
# maps its stripped text through STATUSES to obtain the stage.
#
# @return [RelatonBib::DocumentStatus, nil] nil when the span is not found
def fetch_docstatus
  status_span = @doc.at("./div/div/div/p/text()[contains(.,'状態')]/following-sibling::span")
  if status_span
    stage = STATUSES[status_span.text.strip]
    RelatonBib::DocumentStatus.new(stage: stage)
  end
end
#fetch_doctype ⇒ Object
105 106 107 108 109 110 111 112 |
# File 'lib/relaton_jis/scraper.rb', line 105
#
# Classifies the document by the shape of #document_id. Patterns are tried
# in order and the first hit wins, so the amendment pattern must stay ahead
# of the more general JIS pattern.
#
# @return [String, nil] the doctype, or nil when no pattern matches
def fetch_doctype
  id = document_id
  rules = [
    [%r{JIS\s[A-Z]\s[\w-]+:\d{4}/AMENDMENT}, "amendment"],
    [/JIS\s[A-Z]\s[\w-]+/, "japanese-industrial-standard"],
    [%r{TR[\s/][\w-]+}, "technical-report"],
    [%r{TS[\s/][\w-]+}, "technical-specification"],
  ]
  _pattern, doctype = rules.find { |pattern, _name| pattern.match?(id) }
  doctype
end
#fetch_editorialgroup ⇒ Object
128 129 130 131 132 133 134 |
# File 'lib/relaton_jis/scraper.rb', line 128
#
# Builds an editorial group whose only technical committee is named after
# the first <td> of the table row headed '原案作成団体'.
#
# @return [RelatonIsoBib::EditorialGroup, nil] nil when the row is absent
def fetch_editorialgroup
  cell = @doc.at("./table/tr[th[.='原案作成団体']]/td")
  if cell
    committee = RelatonBib::WorkGroup.new(name: cell.text.strip)
    RelatonIsoBib::EditorialGroup.new(technical_committee: [committee])
  end
end
#fetch_fetched ⇒ Object
27 28 29 |
# File 'lib/relaton_jis/scraper.rb', line 27
#
# Reports the current date as the moment the document was fetched.
#
# @return [String] today's date as produced by Date#to_s
def fetch_fetched
  today = Date.today
  today.to_s
end
#fetch_ics ⇒ Object
114 115 116 117 118 |
# File 'lib/relaton_jis/scraper.rb', line 114
#
# Creates one Ics object from the stripped text of every <td> in the table
# rows whose header cell is exactly 'ICS'.
#
# @return [Array<RelatonIsoBib::Ics>] empty when no such row exists
def fetch_ics
  cells = @doc.xpath("./table/tr[th[.='ICS']]/td")
  cells.map { |cell| RelatonIsoBib::Ics.new(cell.text.strip) }
end
#fetch_language ⇒ Object
81 82 83 |
# File 'lib/relaton_jis/scraper.rb', line 81
#
# Lists the :lang value of every entry returned by #langs_scripts.
#
# @return [Array<String>] language codes, in #langs_scripts order
def fetch_language
  langs_scripts.map do |entry|
    entry[:lang]
  end
end
#fetch_link ⇒ Object
38 39 40 41 42 43 44 45 46 |
# File 'lib/relaton_jis/scraper.rb', line 38
#
# Returns the page address itself as a "src" link, followed by one "pdf"
# link per anchor found in the table rows headed 'プレビュー'. Each anchor's
# href is appended to the scheme and host of @url.
#
# @return [Array<RelatonBib::TypedUri>] always starts with the "src" entry
def fetch_link
  page_uri = URI(@url)
  origin = "#{page_uri.scheme}://#{page_uri.host}"
  source_link = RelatonBib::TypedUri.new(content: @url, type: "src")

  anchors = @doc.xpath("./table/tr[th[.='プレビュー']]/td/a")
  pdf_links = anchors.map do |anchor|
    RelatonBib::TypedUri.new(content: "#{origin}#{anchor[:href]}", type: "pdf")
  end
  [source_link] + pdf_links
end
#fetch_script ⇒ Object
85 86 87 |
# File 'lib/relaton_jis/scraper.rb', line 85
#
# Lists the :script value of every entry returned by #langs_scripts.
#
# @return [Array<String>] script codes, in #langs_scripts order
def fetch_script
  langs_scripts.map do |entry|
    entry[:script]
  end
end
#fetch_structuredidentifier ⇒ Object
136 137 138 |
# File 'lib/relaton_jis/scraper.rb', line 136
#
# Builds a structured identifier of type "JIS" whose project number is the
# value returned by #fetch_docnumber.
#
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier
  number = fetch_docnumber
  RelatonIsoBib::StructuredIdentifier.new(project_number: number, type: "JIS")
end
#fetch_title ⇒ Object
31 32 33 34 35 36 |
# File 'lib/relaton_jis/scraper.rb', line 31
#
# Reads the titles from the section's <h2>: the second text node is taken
# as the Japanese title ("ja"/"Jpan") and the third as the English title
# ("en"/"Latn"). The first text node is used by #document_id.
#
# The English script code is "Latn", matching the LANGS constant; it was
# previously misspelled "Lant". A title whose text node is absent is
# skipped rather than raising NoMethodError on nil.
#
# @return [Array<RelatonBib::TypedTitleString>] zero to two titles
def fetch_title
  titles = { "ja" => "Jpan", "en" => "Latn" }.each_with_index.map do |(lang, script), i|
    # XPath positions are 1-based and position 1 holds the identifier.
    node = @doc.at("./h2/text()[#{i + 2}]")
    next unless node

    RelatonBib::TypedTitleString.new content: node.text.strip, language: lang, script: script
  end
  titles.compact
end
#fetch_type ⇒ Object
77 78 79 |
# File 'lib/relaton_jis/scraper.rb', line 77
#
# Every scraped document gets the same fixed type.
#
# @return [String] always "standard"
def fetch_type
  "standard"
end
#langs_scripts ⇒ Object
89 90 91 92 93 94 95 96 |
# File 'lib/relaton_jis/scraper.rb', line 89
#
# Works out which LANGS entries apply to the page. For each label, the
# <span> following the span that contains the label is looked up; the entry
# is kept unless that span is missing or its stripped text is "-".
# The result is cached in @langs_scripts.
#
# @return [Array<Hash>] the matching LANGS values, in LANGS order
def langs_scripts
  return @langs_scripts if @langs_scripts

  available = LANGS.select do |label, _settings|
    span = @doc.at("./div/div/div[@class='blockContentFile']/div/div/p[1]/span[contains(.,'#{label}')]/following-sibling::span")
    span && span.text.strip != "-"
  end
  @langs_scripts = available.values
end