Class: RelatonJis::Scraper
- Inherits:
-
Object
- Object
- RelatonJis::Scraper
- Defined in:
- lib/relaton_jis/scraper.rb
Constant Summary collapse
- ATTRS =
%i[ title link abstract docid docnumber date type language script docstatus doctype ics contributor editorialgroup structuredidentifier ].freeze
- LANGS =
{ "和文" => { lang: "ja", script: "Jpan" }, "英訳" => { lang: "en", script: "Latn" } }.freeze
- DATETYPES =
{ "発行年月日" => "issued", "確認年月日" => "confirmed" }.freeze
- STATUSES =
{ "有効" => "valid", "廃止" => "withdrawn" }.freeze
Instance Method Summary collapse
- #create_contrib(node, role) ⇒ Object
- #create_orgname(node) ⇒ Object
- #document_id ⇒ Object
- #fetch ⇒ Object
- #fetch_abstract ⇒ Object
- #fetch_contributor ⇒ Object
- #fetch_date ⇒ Object
- #fetch_docid ⇒ Object
- #fetch_docnumber ⇒ Object
- #fetch_docstatus ⇒ Object
- #fetch_doctype ⇒ Object
- #fetch_editorialgroup ⇒ Object
- #fetch_ics ⇒ Object
- #fetch_language ⇒ Object
- #fetch_link ⇒ Object
- #fetch_script ⇒ Object
- #fetch_structuredidentifier ⇒ Object
-
#fetch_title ⇒ Object
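Builds the Japanese and English titles from the page heading. (The upstream comment above this method is a commented-out `fetch_fetched` helper returning `Date.today.to_s`, not a description.)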
- #fetch_type ⇒ Object
-
#initialize(url) ⇒ Scraper
constructor
A new instance of Scraper.
- #langs_scripts ⇒ Object
Constructor Details
#initialize(url) ⇒ Scraper
Returns a new instance of Scraper.
16 17 18 19 |
# File 'lib/relaton_jis/scraper.rb', line 16
#
# Prepares a scraper for a single page.
#
# @param url [String] address of the page to scrape; stored unchanged and
#   later handed to the agent's +get+ and to +URI+
def initialize(url)
  @agent = Mechanize.new
  @url = url
end
Instance Method Details
#create_contrib(node, role) ⇒ Object
131 132 133 134 |
# File 'lib/relaton_jis/scraper.rb', line 131
#
# Wraps the organization named by +node+ in a contribution with one role.
#
# @param node [#text] element whose text holds the organization name
# @param role [String] role type ("author" and "publisher" are the values
#   passed by #fetch_contributor)
# @return [RelatonBib::ContributionInfo]
def create_contrib(node, role)
  organization = RelatonBib::Organization.new(name: create_orgname(node))
  RelatonBib::ContributionInfo.new(entity: organization, role: [{ type: role }])
end
#create_orgname(node) ⇒ Object
136 137 138 139 140 141 142 |
# File 'lib/relaton_jis/scraper.rb', line 136
#
# Builds the localized name list for an organization.
#
# The stripped node text is always included as the Japanese name; an English
# name is appended only when the text mentions "日本規格協会".
#
# @param node [#text] element whose text holds the organization name
# @return [Array<RelatonBib::LocalizedString>]
def create_orgname(node)
  text = node.text
  names = [RelatonBib::LocalizedString.new(text.strip, "ja", "Jpan")]
  return names unless text.include?("日本規格協会")

  # NOTE(review): 日本規格協会 is usually rendered in English as
  # "Japanese Standards Association"; the string below is kept as-is because
  # consumers may match on it -- confirm before changing.
  names << RelatonBib::LocalizedString.new("Japanese Industrial Standards", "en", "Latn")
end
#document_id ⇒ Object
65 66 67 |
# File 'lib/relaton_jis/scraper.rb', line 65
#
# Returns the document identifier: the stripped first text node of the
# section's <h2>. The value is computed once and cached in @document_id.
#
# @return [String]
def document_id
  return @document_id if @document_id

  heading_text = @doc.at("./h2/text()[1]")
  @document_id = heading_text.text.strip
end
#fetch ⇒ Object
21 22 23 24 25 26 27 |
# File 'lib/relaton_jis/scraper.rb', line 21
#
# Downloads the page, keeps its main <section> in @doc, and builds a
# bibliographic item from it.
#
# Each symbol in ATTRS is resolved by calling the matching +fetch_<attr>+
# method; the results become the keyword arguments of the item.
#
# @return [BibliographicItem]
def fetch
  page = @agent.get(@url)
  @doc = page.at("//div[@id='main']/section")
  attrs = ATTRS.map { |attr| [attr, send("fetch_#{attr}")] }.to_h
  BibliographicItem.new(**attrs)
end
#fetch_abstract ⇒ Object
50 51 52 53 54 |
# File 'lib/relaton_jis/scraper.rb', line 50
#
# Collects the abstract cells (table rows headed "規格概要") as Japanese
# formatted strings.
#
# @return [Array<RelatonBib::FormattedString>] empty when no such row exists
def fetch_abstract
  cells = @doc.xpath("./table/tr[th[.='規格概要']]/td")
  cells.map do |cell|
    RelatonBib::FormattedString.new(content: cell.text.strip, language: "ja", script: "Jpan")
  end
end
#fetch_contributor ⇒ Object
124 125 126 127 128 129 |
# File 'lib/relaton_jis/scraper.rb', line 124
#
# Builds contributors from the "原案作成団体" table cells. Every cell yields
# two entries for the same organization: first an author, then a publisher.
#
# @return [Array<RelatonBib::ContributionInfo>] empty when no such row exists
def fetch_contributor
  @doc.xpath("./table/tr[th[.='原案作成団体']]/td").flat_map do |cell|
    ["author", "publisher"].map { |role| create_contrib(cell, role) }
  end
end
#fetch_date ⇒ Object
69 70 71 72 73 74 75 76 77 |
# File 'lib/relaton_jis/scraper.rb', line 69
#
# Extracts the dates listed on the page.
#
# For every label in DATETYPES the first text node containing that label is
# searched for a YYYY-MM-DD date. Labels that are absent, or present without
# such a date, contribute nothing.
#
# @return [Array<RelatonBib::BibliographicDate>]
def fetch_date
  DATETYPES.each_with_object([]) do |(label, type), dates|
    node = @doc.at("./div/div/div/p/text()[contains(.,'#{label}')]")
    next unless node

    on = node.text[/\d{4}-\d{2}-\d{2}/]
    # Label present but no date: skip rather than emit a date with on: "".
    next unless on

    dates << RelatonBib::BibliographicDate.new(type: type, on: on)
  end
end
#fetch_docid ⇒ Object
56 57 58 |
# File 'lib/relaton_jis/scraper.rb', line 56
#
# Wraps #document_id as the single, primary identifier of type "JIS".
#
# @return [Array<RelatonBib::DocumentIdentifier>] one-element array
def fetch_docid
  identifier = RelatonBib::DocumentIdentifier.new(id: document_id, type: "JIS", primary: true)
  [identifier]
end
#fetch_docnumber ⇒ Object
60 61 62 63 |
# File 'lib/relaton_jis/scraper.rb', line 60
#
# Derives the document number from #document_id by joining the single
# letter/word character after the prefix with the digits that follow,
# e.g. "JIS X 0208:1997" gives "X0208".
#
# @return [String, nil] the number, or nil when the identifier does not have
#   the "<prefix> <char> <digits>" shape (previously this raised NoMethodError)
def fetch_docnumber
  match = document_id.match(/^\w+\s(\w)\s?(\d+)/)
  return unless match

  "#{match[1]}#{match[2]}"
end
#fetch_docstatus ⇒ Object
100 101 102 103 104 105 |
# File 'lib/relaton_jis/scraper.rb', line 100
#
# Reads the status shown in the <span> that follows the "状態" label and maps
# it through STATUSES.
#
# @return [RelatonBib::DocumentStatus, nil] nil when the span is missing; the
#   stage is nil when the page text is not a key of STATUSES
def fetch_docstatus
  status_span = @doc.at("./div/div/div/p/text()[contains(.,'状態')]/following-sibling::span")
  return unless status_span

  stage = STATUSES[status_span.text.strip]
  RelatonBib::DocumentStatus.new(stage: stage)
end
#fetch_doctype ⇒ Object
107 108 109 110 111 112 113 114 115 |
# File 'lib/relaton_jis/scraper.rb', line 107
#
# Classifies the document by the shape of #document_id.
#
# Patterns are tried in order and the first match wins, so the amendment
# pattern must stay ahead of the plain JIS pattern it is a special case of.
#
# @return [DocumentType] its type is nil when no pattern matches
def fetch_doctype
  rules = [
    [/JIS\s[A-Z]\s[\w-]+:\d{4}\/AMENDMENT/, "amendment"],
    [/JIS\s[A-Z]\s[\w-]+/, "japanese-industrial-standard"],
    [/TR[\s\/][\w-]+/, "technical-report"],
    [/TS[\s\/][\w-]+/, "technical-specification"],
  ]
  id = document_id
  matched = rules.find { |pattern, _name| pattern.match?(id) }
  DocumentType.new(type: matched&.last)
end
#fetch_editorialgroup ⇒ Object
144 145 146 147 148 149 150 |
# File 'lib/relaton_jis/scraper.rb', line 144
#
# Builds the editorial group from the first "原案作成団体" table cell, using
# its stripped text as the name of the only technical committee.
#
# @return [RelatonIsoBib::EditorialGroup, nil] nil when the cell is missing
def fetch_editorialgroup
  cell = @doc.at("./table/tr[th[.='原案作成団体']]/td")
  return unless cell

  committee = RelatonBib::WorkGroup.new(name: cell.text.strip)
  RelatonIsoBib::EditorialGroup.new(technical_committee: [committee])
end
#fetch_ics ⇒ Object
117 118 119 120 121 122 |
# File 'lib/relaton_jis/scraper.rb', line 117
#
# Reads the whitespace-separated ICS codes from the "ICS" table cell.
#
# @return [Array<RelatonIsoBib::Ics>] empty when the cell is missing
def fetch_ics
  cell = @doc.at("./table/tr[th[.='ICS']]/td")
  return [] unless cell

  codes = cell.text.strip.split
  codes.map { |code| RelatonIsoBib::Ics.new(code) }
end
#fetch_language ⇒ Object
83 84 85 |
# File 'lib/relaton_jis/scraper.rb', line 83
#
# Lists the language codes of the entries returned by #langs_scripts.
#
# @return [Array<String>]
def fetch_language
  langs_scripts.map { |entry| entry[:lang] }
end
#fetch_link ⇒ Object
40 41 42 43 44 45 46 47 48 |
# File 'lib/relaton_jis/scraper.rb', line 40
#
# Builds the link list: the scraped page itself (type "src") followed by one
# "pdf" link per anchor in the "プレビュー" table row.
#
# Anchor hrefs are appended verbatim to "<scheme>://<host>" of the page URL.
#
# @return [Array<RelatonBib::TypedUri>] always starts with the "src" link
def fetch_link
  source = RelatonBib::TypedUri.new(content: @url, type: "src")
  uri = URI(@url)
  origin = "#{uri.scheme}://#{uri.host}"
  previews = @doc.xpath("./table/tr[th[.='プレビュー']]/td/a").map do |anchor|
    RelatonBib::TypedUri.new(content: "#{origin}#{anchor[:href]}", type: "pdf")
  end
  [source, *previews]
end
#fetch_script ⇒ Object
87 88 89 |
# File 'lib/relaton_jis/scraper.rb', line 87
#
# Lists the script codes of the entries returned by #langs_scripts.
#
# @return [Array<String>]
def fetch_script
  langs_scripts.map { |entry| entry[:script] }
end
#fetch_structuredidentifier ⇒ Object
152 153 154 |
# File 'lib/relaton_jis/scraper.rb', line 152
#
# Builds the structured identifier of type "JIS" whose project number is the
# value of #fetch_docnumber.
#
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier
  RelatonIsoBib::StructuredIdentifier.new(type: "JIS", project_number: fetch_docnumber)
end
#fetch_title ⇒ Object
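Builds the Japanese and English titles from the second and third text nodes of the page heading.
(The upstream comment above this method is a commented-out `fetch_fetched` helper returning `Date.today.to_s`, not a description.)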
33 34 35 36 37 38 |
# File 'lib/relaton_jis/scraper.rb', line 33
#
# Builds the titles from the section's <h2>: its second text node is taken as
# the Japanese title and its third as the English title.
#
# The English script code is "Latn" (ISO 15924), matching LANGS and
# #create_orgname; it was previously misspelled "Lant".
#
# @return [Array<RelatonBib::TypedTitleString>] Japanese title, then English
def fetch_title
  { "ja" => "Jpan", "en" => "Latn" }.map.with_index do |(lang, script), i|
    # text()[1] is the document id (see #document_id), so titles start at 2.
    content = @doc.at("./h2/text()[#{i + 2}]").text.strip
    RelatonBib::TypedTitleString.new content: content, language: lang, script: script
  end
end
#fetch_type ⇒ Object
79 80 81 |
# File 'lib/relaton_jis/scraper.rb', line 79
#
# Returns the bibliographic item type, which is the same for every page.
#
# @return [String] always "standard"
def fetch_type
  "standard"
end
#langs_scripts ⇒ Object
91 92 93 94 95 96 97 98 |
# File 'lib/relaton_jis/scraper.rb', line 91
#
# Determines which LANGS entries the page offers.
#
# A label counts as offered when the <span> following it exists and its
# stripped text is not "-". The result is cached in @langs_scripts.
#
# @return [Array<Hash>] the matching LANGS values ({ lang:, script: }), in
#   LANGS order
def langs_scripts
  return @langs_scripts if @langs_scripts

  offered = LANGS.reject do |label, _settings|
    cell = @doc.at("./div/div/div[@class='blockContentFile']/div/div/p[1]/span[contains(.,'#{label}')]/following-sibling::span")
    cell.nil? || cell.text.strip == "-"
  end
  @langs_scripts = offered.values
end