Class: Relaton::Jis::Scraper
- Inherits: Object
- Inheritance chain: Object → Relaton::Jis::Scraper
- Defined in:
- lib/relaton/jis/scraper.rb
Constant Summary collapse
- ATTRS =
%i[ title source abstract docidentifier docnumber date type language script status contributor structuredidentifier ext ].freeze
- LANGS =
{ "和文" => { lang: "ja", script: "Jpan" }, "英訳" => { lang: "en", script: "Latn" } }.freeze
- DATETYPES =
{ "発行年月日" => "issued", "確認年月日" => "confirmed" }.freeze
- STATUSES =
{ "有効" => "valid", "廃止" => "withdrawn" }.freeze
Instance Method Summary collapse
- #create_contrib(name, role) ⇒ Object
- #create_orgname(name) ⇒ Object
- #document_id ⇒ Object
-
#fetch ⇒ Object
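Loads the page at the scraper's URL and builds a Bib::ItemData from the scraped attributes (the source carries a `rubocop:disable Metrics/MethodLength` directive).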
- #fetch_abstract ⇒ Object
- #fetch_contributor ⇒ Object
- #fetch_date ⇒ Object
- #fetch_docidentifier ⇒ Object
- #fetch_docnumber ⇒ Object
- #fetch_doctype ⇒ Object
-
#fetch_editorialgroup_contributor ⇒ Object
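Builds an author contributor for the committee named in the 原案作成団体 table row, or returns nil when the row is absent (the source carries a `rubocop:disable Metrics/MethodLength` directive).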
- #fetch_ext ⇒ Object
- #fetch_ics ⇒ Object
- #fetch_language ⇒ Object
- #fetch_script ⇒ Object
-
#fetch_source ⇒ Object
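Returns the source links: the page URL (type "src") followed by any preview links (type "pdf") (the source carries a `rubocop:disable Metrics/MethodLength` directive).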
- #fetch_status ⇒ Object
- #fetch_structuredidentifier ⇒ Object
- #fetch_title ⇒ Object
- #fetch_type ⇒ Object
-
#initialize(url) ⇒ Scraper
constructor
A new instance of Scraper.
- #langs_scripts ⇒ Object
Constructor Details
#initialize(url) ⇒ Scraper
Returns a new instance of Scraper.
18 19 20 21 |
# File 'lib/relaton/jis/scraper.rb', line 18
#
# Prepares a scraper for one page. Nothing is requested here; the page is
# only loaded when #fetch is called.
#
# @param url [Object] address of the page; kept in @url and later handed to
#   the Mechanize agent and to URI() (presumably a String - confirm against callers)
def initialize(url)
  # HTTP agent used by #fetch to download the page.
  @agent = Mechanize.new
  @url = url
end
Instance Method Details
#create_contrib(name, role) ⇒ Object
150 151 152 153 154 |
# File 'lib/relaton/jis/scraper.rb', line 150
#
# Wraps an organization name and a role type into a Bib::Contributor.
#
# @param name [String] organization name, expanded by #create_orgname
# @param role [String] role type passed to Bib::Contributor::Role
# @return [Bib::Contributor] contributor with one organization and one role
def create_contrib(name, role)
  Bib::Contributor.new(
    organization: Bib::Organization.new(name: create_orgname(name)),
    role: [Bib::Contributor::Role.new(type: role)],
  )
end
#create_orgname(name) ⇒ Object
156 157 158 159 160 161 162 163 164 165 166 |
# File 'lib/relaton/jis/scraper.rb', line 156
#
# Builds the localized name list for an organization.
#
# The given name is always emitted as a Japanese ("ja"/"Jpan") entry. When it
# contains "日本規格協会", a fixed English ("en"/"Latn") entry is appended.
#
# @param name [String] organization name as it appears on the page
# @return [Array<Bib::TypedLocalizedString>] one or two localized names
def create_orgname(name)
  names = [
    Bib::TypedLocalizedString.new(content: name, language: "ja", script: "Jpan"),
  ]
  return names unless name.include?("日本規格協会")

  names << Bib::TypedLocalizedString.new(
    content: "Japanese Industrial Standards", language: "en", script: "Latn"
  )
end
#document_id ⇒ Object
74 75 76 |
# File 'lib/relaton/jis/scraper.rb', line 74
#
# Returns the document identifier, read from the first text node of the
# section's <h2> and stripped of surrounding whitespace. The value is
# memoized in @document_id.
#
# @return [String] stripped identifier text
def document_id
  return @document_id if @document_id

  heading_text = @doc.at("./h2/text()[1]")
  @document_id = heading_text.text.strip
end
#fetch ⇒ Object
rubocop:disable Metrics/MethodLength
23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/relaton/jis/scraper.rb', line 23
#
# Downloads the page at @url, keeps its main section in @doc and builds a
# bibliographic item from it.
#
# Every attribute listed in ATTRS is filled by calling the matching
# `fetch_<attr>` method. Contributors are the exception: they are collected
# once up front and extended with the editorial-group contributor when one
# exists.
#
# @return [Bib::ItemData] item built from the scraped attributes
def fetch # rubocop:disable Metrics/MethodLength
  @doc = @agent.get(@url).at "//div[@id='main']/section"

  contributors = fetch_contributor
  eg_contributor = fetch_editorialgroup_contributor
  contributors << eg_contributor if eg_contributor

  attrs = ATTRS.each_with_object({}) do |attr, hash|
    # Already collected above; calling fetch_contributor again would only
    # repeat the page queries for a value that gets overwritten below.
    next if attr == :contributor

    hash[attr] = send "fetch_#{attr}"
  end
  attrs[:contributor] = contributors

  Bib::ItemData.new(**attrs)
end
#fetch_abstract ⇒ Object
53 54 55 56 57 58 59 60 |
# File 'lib/relaton/jis/scraper.rb', line 53
#
# Collects the abstract from every <div id="honbun"> found by the query.
# Each match becomes one Japanese ("ja"/"Jpan") localized string holding the
# element's stripped text.
#
# @return [Array<Bib::LocalizedMarkedUpString>] empty when nothing matches
def fetch_abstract
  @doc.xpath("//div[@id='honbun']").each_with_object([]) do |block, abstracts|
    abstracts << Bib::LocalizedMarkedUpString.new(
      content: block.text.strip, language: "ja", script: "Jpan"
    )
  end
end
#fetch_contributor ⇒ Object
139 140 141 142 143 144 145 146 147 148 |
# File 'lib/relaton/jis/scraper.rb', line 139
#
# Builds the list of contributing organizations.
#
# The list always starts with "一般財団法人 日本規格協会" in the "authorizer"
# role. Each cell of the "原案作成団体" table row then adds the named
# organization twice: once as "author" and once as "publisher".
#
# @return [Array] contributors built by #create_contrib; never empty, so
#   callers such as #fetch can append to it
def fetch_contributor
  authorizer = create_contrib("一般財団法人 日本規格協会", "authorizer")
  xpath = "./table/tr[th[.='原案作成団体']]/td"
  @doc.xpath(xpath).each_with_object([authorizer]) do |node, contributors|
    name = node.text.strip
    contributors << create_contrib(name, "author")
    contributors << create_contrib(name, "publisher")
  end
end
#fetch_date ⇒ Object
78 79 80 81 82 83 84 85 86 |
# File 'lib/relaton/jis/scraper.rb', line 78
#
# Collects the dates shown on the page.
#
# For every label in DATETYPES the text node containing that label is looked
# up; labels that are not present are skipped. The first YYYY-MM-DD match in
# the node's text is used as the date value (an empty string when the text
# holds no such pattern).
#
# @return [Array<Bib::Date>] one entry per label found on the page
def fetch_date
  dates = []
  DATETYPES.each do |label, type|
    text_node = @doc.at("./div/div/div/p/text()[contains(.,'#{label}')]")
    next unless text_node

    found = text_node.text.match(/\d{4}-\d{2}-\d{2}/)
    dates << Bib::Date.new(type: type, at: found.to_s)
  end
  dates
end
#fetch_docidentifier ⇒ Object
62 63 64 65 66 67 |
# File 'lib/relaton/jis/scraper.rb', line 62
#
# Wraps the document identifier in a single primary identifier of type "JIS".
#
# @return [Array<Docidentifier>] one-element list
def fetch_docidentifier
  identifier = Docidentifier.new(content: document_id, type: "JIS", primary: true)
  [identifier]
end
#fetch_docnumber ⇒ Object
69 70 71 72 |
# File 'lib/relaton/jis/scraper.rb', line 69
#
# Derives the document number from the document identifier.
#
# The identifier is expected to start with a word, whitespace, one word
# character and a run of digits (optionally separated by whitespace); the
# single character and the digits are joined, e.g. "JIS X 0208:1997" gives
# "X0208".
#
# @return [String, nil] the document number, or nil when the identifier does
#   not follow that shape
def fetch_docnumber
  match = document_id.match(/^\w+\s(\w)\s?(\d+)/)
  # An identifier in an unexpected format yields no number instead of raising.
  return unless match

  "#{match[1]}#{match[2]}"
end
#fetch_doctype ⇒ Object
122 123 124 125 126 127 128 129 130 |
# File 'lib/relaton/jis/scraper.rb', line 122
#
# Classifies the document by the shape of its identifier.
#
# Patterns are tried in order and the first match wins, so the amendment
# pattern is checked before the general JIS pattern. When nothing matches the
# doctype content is nil.
#
# @return [Doctype] doctype wrapping the matched type name (or nil)
def fetch_doctype
  id = document_id
  type =
    if /JIS\s[A-Z]\s[\w-]+:\d{4}\/AMENDMENT/.match?(id) then "amendment"
    elsif /JIS\s[A-Z]\s[\w-]+/.match?(id) then "japanese-industrial-standard"
    elsif /TR[\s\/][\w-]+/.match?(id) then "technical-report"
    elsif /TS[\s\/][\w-]+/.match?(id) then "technical-specification"
    end
  Doctype.new content: type
end
#fetch_editorialgroup_contributor ⇒ Object
rubocop:disable Metrics/MethodLength
168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
# File 'lib/relaton/jis/scraper.rb', line 168
#
# Builds the contributor that represents the committee named in the
# "原案作成団体" table row.
#
# The committee becomes a "technical-committee" subdivision of an organization
# with an empty name list, credited with an "author" role described as
# "committee".
#
# @return [Bib::Contributor, nil] nil when the table row is not on the page
def fetch_editorialgroup_contributor # rubocop:disable Metrics/MethodLength
  cell = @doc.at("./table/tr[th[.='原案作成団体']]/td")
  return unless cell

  committee_name = Bib::TypedLocalizedString.new(content: cell.text.strip)
  committee = Bib::Subdivision.new(
    type: "technical-committee", name: [committee_name],
  )
  author_role = Bib::Contributor::Role.new(
    type: "author",
    description: [Bib::LocalizedMarkedUpString.new(content: "committee")],
  )
  organization = Bib::Organization.new(name: [], subdivision: [committee])
  Bib::Contributor.new(role: [author_role], organization: organization)
end
#fetch_ext ⇒ Object
193 194 195 196 197 198 199 200 |
# File 'lib/relaton/jis/scraper.rb', line 193
#
# Assembles the extension data: doctype, ICS codes and structured identifier,
# tagged with the "jis" flavor.
#
# @return [Ext]
def fetch_ext
  doctype = fetch_doctype
  ics = fetch_ics
  structured_id = fetch_structuredidentifier
  Ext.new(
    doctype: doctype, flavor: "jis", ics: ics, structuredidentifier: structured_id,
  )
end
#fetch_ics ⇒ Object
132 133 134 135 136 137 |
# File 'lib/relaton/jis/scraper.rb', line 132
#
# Reads the ICS codes from the "ICS" table row. The cell text is split on
# whitespace and every token becomes one code.
#
# @return [Array<Bib::ICS>] empty when the page has no ICS row
def fetch_ics
  cell = @doc.at("./table/tr[th[.='ICS']]/td")
  codes = cell ? cell.text.strip.split : []
  codes.map { |code| Bib::ICS.new code: code }
end
#fetch_language ⇒ Object
92 93 94 |
# File 'lib/relaton/jis/scraper.rb', line 92
#
# Lists the language codes of the editions reported by #langs_scripts.
#
# @return [Array] the :lang value of each entry, in the same order
def fetch_language
  langs_scripts.each_with_object([]) do |entry, codes|
    codes << entry[:lang]
  end
end
#fetch_script ⇒ Object
96 97 98 |
# File 'lib/relaton/jis/scraper.rb', line 96
#
# Lists the script codes of the editions reported by #langs_scripts.
#
# @return [Array] the :script value of each entry, in the same order
def fetch_script
  langs_scripts.each_with_object([]) do |entry, codes|
    codes << entry[:script]
  end
end
#fetch_source ⇒ Object
rubocop:disable Metrics/MethodLength
42 43 44 45 46 47 48 49 50 51 |
# File 'lib/relaton/jis/scraper.rb', line 42
#
# Builds the list of source links for the document.
#
# The first entry is the page URL itself (type "src"). Every link found under
# the "プレビュー" definition-list entry is added as a "pdf" link; its href is
# appended to the scheme and host of the page URL.
#
# @return [Array<Bib::Uri>] the page link followed by any preview links
def fetch_source # rubocop:disable Metrics/MethodLength
  sources = [Bib::Uri.new(content: @url, type: "src")]
  page_uri = URI @url
  base = "#{page_uri.scheme}://#{page_uri.host}"
  @doc.xpath("./dl/dt[.='プレビュー']/following-sibling::dd[1]/a").each do |link|
    sources << Bib::Uri.new(content: "#{base}#{link[:href]}", type: "pdf")
  end
  sources
end
#fetch_status ⇒ Object
112 113 114 115 116 117 118 119 120 |
# File 'lib/relaton/jis/scraper.rb', line 112
#
# Reads the document status from the <span> that follows the text node
# containing "状態". The span text is translated through STATUSES; a label
# missing from that table gives a stage with nil content.
#
# @return [Bib::Status, nil] nil when the status span is not on the page
def fetch_status
  status_span = @doc.at(
    "./div/div/div/p/text()[contains(.,'状態')]/following-sibling::span",
  )
  return unless status_span

  Bib::Status.new(
    stage: Bib::Status::Stage.new(content: STATUSES[status_span.text.strip]),
  )
end
#fetch_structuredidentifier ⇒ Object
186 187 188 189 190 191 |
# File 'lib/relaton/jis/scraper.rb', line 186
#
# Builds the structured identifier of type "JIS" whose project number is the
# value returned by #fetch_docnumber.
#
# @return [Iso::StructuredIdentifier]
def fetch_structuredidentifier
  project_number = Iso::ProjectNumber.new(content: fetch_docnumber)
  Iso::StructuredIdentifier.new(project_number: project_number, type: "JIS")
end
#fetch_title ⇒ Object
35 36 37 38 39 40 |
# File 'lib/relaton/jis/scraper.rb', line 35
#
# Reads the titles from the section's <h2>.
#
# The second text node of the heading is taken as the Japanese ("ja"/"Jpan")
# title and the third as the English ("en"/"Latn") title. A text node that is
# not present is skipped, in line with how the other lookups in this class
# treat missing elements.
#
# @return [Array<Bib::Title>] one title per text node found
def fetch_title
  titles = []
  { "ja" => "Jpan", "en" => "Latn" }.each_with_index do |(lang, script), i|
    # Text node 1 is the document identifier, so titles start at index 2.
    node = @doc.at("./h2/text()[#{i + 2}]")
    next unless node

    titles << Bib::Title.new(content: node.text.strip, language: lang, script: script)
  end
  titles
end
#fetch_type ⇒ Object
88 89 90 |
# File 'lib/relaton/jis/scraper.rb', line 88
#
# Reports the bibliographic item type. The value is fixed and does not depend
# on the scraped page.
#
# @return [String] always "standard"
def fetch_type
  "standard"
end
#langs_scripts ⇒ Object
100 101 102 103 104 105 106 107 108 109 110 |
# File 'lib/relaton/jis/scraper.rb', line 100
#
# Determines which language editions the page offers.
#
# For every label in LANGS the <span> following the label's span is looked
# up. The edition counts as available when that span exists and its text is
# not "-". The result is memoized in @langs_scripts.
#
# @return [Array<Hash>] the LANGS values (:lang / :script) of the available
#   editions, in LANGS order
def langs_scripts
  return @langs_scripts if @langs_scripts

  paragraph = "./div/div/div[@class='blockContentFile']/div/div/p[1]"
  available = LANGS.select do |label, _lang|
    value = @doc.at("#{paragraph}/span[contains(.,'#{label}')]/following-sibling::span")
    !value.nil? && value.text.strip != "-"
  end
  @langs_scripts = available.values
end