Class: Relaton::Jis::Scraper
- Inherits:
-
Object
- Object
- Relaton::Jis::Scraper
- Defined in:
- lib/relaton/jis/scraper.rb
Constant Summary collapse
- ATTRS =
%i[ title source abstract docidentifier docnumber date type language script status contributor structuredidentifier ext ].freeze
- LANGS =
{ "和文" => { lang: "ja", script: "Jpan" }, "英訳" => { lang: "en", script: "Latn" } }.freeze
- DATETYPES =
{ "発行年月日" => "issued", "確認年月日" => "confirmed" }.freeze
- STATUSES =
{ "有効" => "valid", "廃止" => "withdrawn" }.freeze
Instance Method Summary collapse
- #create_contrib(name, role) ⇒ Object
- #create_orgname(name) ⇒ Object
- #document_id ⇒ Object
-
#fetch ⇒ Object
Loads the page at the scraper's URL and builds a Bib::ItemData from the scraped attributes.
- #fetch_abstract ⇒ Object
- #fetch_contributor ⇒ Object
- #fetch_date ⇒ Object
- #fetch_docidentifier ⇒ Object
- #fetch_docnumber ⇒ Object
-
#fetch_doctype ⇒ Object
Derives the document type from the document identifier.
-
#fetch_editorialgroup_contributor ⇒ Object
Builds the committee "author" contributor from the 原案作成団体 table row, if present.
- #fetch_ext ⇒ Object
- #fetch_ics ⇒ Object
- #fetch_language ⇒ Object
- #fetch_script ⇒ Object
-
#fetch_source ⇒ Object
Collects the page URI plus any preview (プレビュー) PDF links.
- #fetch_status ⇒ Object
- #fetch_structuredidentifier ⇒ Object
- #fetch_title ⇒ Object
- #fetch_type ⇒ Object
-
#initialize(url, errors = {}) ⇒ Scraper
constructor
A new instance of Scraper.
-
#langs_scripts ⇒ Object
Lists (and memoizes) the language/script pairs whose entry on the page is not "-".
Constructor Details
#initialize(url, errors = {}) ⇒ Scraper
Returns a new instance of Scraper.
18 19 20 21 22 |
# File 'lib/relaton/jis/scraper.rb', line 18
#
# Sets up a scraper for one page.
#
# @param url [String] address handed to Mechanize#get and URI() by the
#   fetch methods
# @param errors [Hash] flag registry; the fetch_* methods update its
#   entries with `&&=`
def initialize(url, errors = {})
  @errors = errors
  @agent = Mechanize.new
  @url = url
end
Instance Method Details
#create_contrib(name, role) ⇒ Object
178 179 180 181 182 |
# File 'lib/relaton/jis/scraper.rb', line 178
#
# Wraps an organization name and a role type into a contributor.
#
# @param name [String] organization name, localized by #create_orgname
# @param role [String] value used as the role's type
# @return [Bib::Contributor]
def create_contrib(name, role)
  Bib::Contributor.new(
    organization: Bib::Organization.new(name: create_orgname(name)),
    role: [Bib::Contributor::Role.new(type: role)],
  )
end
#create_orgname(name) ⇒ Object
184 185 186 187 188 189 190 191 192 193 194 |
# File 'lib/relaton/jis/scraper.rb', line 184
#
# Builds the localized name list for an organization.
#
# @param name [String] organization name as scraped (tagged ja/Jpan)
# @return [Array<Bib::TypedLocalizedString>] the Japanese name, followed by
#   a fixed English entry when the name contains "日本規格協会"
def create_orgname(name)
  japanese = Bib::TypedLocalizedString.new(
    content: name, language: "ja", script: "Jpan",
  )
  return [japanese] unless name.include?("日本規格協会")

  english = Bib::TypedLocalizedString.new(
    content: "Japanese Industrial Standards", language: "en", script: "Latn",
  )
  [japanese, english]
end
#document_id ⇒ Object
86 87 88 |
# File 'lib/relaton/jis/scraper.rb', line 86
#
# Returns the stripped text of the first text node of the <h2> element,
# memoized once a truthy value has been found.
#
# @return [String, nil] nil when the node is absent
def document_id
  return @document_id if @document_id

  @document_id = @doc.at("./h2/text()[1]")&.text&.strip
end
#fetch ⇒ Object
Loads the page at the scraper's URL and builds a Bib::ItemData from the scraped attributes.
24 25 26 27 28 29 30 31 32 |
# File 'lib/relaton/jis/scraper.rb', line 24
#
# Downloads the page, scrapes every attribute listed in ATTRS and builds the
# bibliographic item.
#
# @return [Bib::ItemData]
def fetch # rubocop:disable Metrics/MethodLength
  @doc = @agent.get(@url).at "//div[@id='main']/section"
  contributors = fetch_contributor
  eg_contributor = fetch_editorialgroup_contributor
  contributors << eg_contributor if eg_contributor
  # :contributor is assembled above; leaving it in the loop would scrape the
  # contributors a second time only to have the result overwritten.
  scraped = ATTRS - %i[contributor]
  attrs = scraped.to_h { |attr| [attr, send("fetch_#{attr}")] }
  attrs[:contributor] = contributors
  Bib::ItemData.new(**attrs)
end
#fetch_abstract ⇒ Object
56 57 58 59 60 61 62 63 64 65 |
# File 'lib/relaton/jis/scraper.rb', line 56
#
# Turns every <div id="honbun"> element into a Japanese abstract.
#
# @return [Array<Bib::Abstract>] empty when no such element exists
def fetch_abstract
  abstracts = @doc.xpath("//div[@id='honbun']").map do |div|
    Bib::Abstract.new(content: div.text.strip, language: "ja", script: "Jpan")
  end
  # An already-set flag stays truthy only while nothing is found.
  @errors[:abstract] = abstracts.empty? if @errors[:abstract]
  abstracts
end
#fetch_contributor ⇒ Object
165 166 167 168 169 170 171 172 173 174 175 176 |
# File 'lib/relaton/jis/scraper.rb', line 165
#
# Builds the contributor list: the fixed "authorizer" organization followed
# by an "author" and a "publisher" entry for every 原案作成団体 table cell.
#
# NOTE(review): the previous text built the authorizer but had lost the name
# it was bound to, so it was never referenced; it is now returned first —
# confirm this matches the intended output.
#
# @return [Array<Bib::Contributor>] a fresh array (callers append to it)
def fetch_contributor
  authorizer = create_contrib("一般財団法人 日本規格協会", "authorizer")
  xpath = "./table/tr[th[.='原案作成団体']]/td"
  drafting_bodies = @doc.xpath(xpath).flat_map do |node|
    name = node.text.strip
    [create_contrib(name, "author"), create_contrib(name, "publisher")]
  end
  # The flag reflects only what was scraped; the fixed authorizer must not
  # mask a missing table row.
  @errors[:contributor] &&= drafting_bodies.empty?
  [authorizer, *drafting_bodies]
end
#fetch_date ⇒ Object
90 91 92 93 94 95 96 97 98 99 100 101 102 |
# File 'lib/relaton/jis/scraper.rb', line 90
#
# Scans the page for each label in DATETYPES and extracts the first
# YYYY-MM-DD date from the matching text node.
#
# @return [Array<Bib::Date>] one entry per label that carries a date
def fetch_date
  dates = []
  DATETYPES.each do |key, type|
    text_node = @doc.at("./div/div/div/p/text()[contains(.,'#{key}')]")
    next unless text_node

    found = text_node.text.match(/\d{4}-\d{2}-\d{2}/)
    next unless found

    dates << Bib::Date.new(type: type, at: found.to_s)
  end
  @errors[:date] &&= dates.empty?
  dates
end
#fetch_docidentifier ⇒ Object
67 68 69 70 71 72 73 74 75 |
# File 'lib/relaton/jis/scraper.rb', line 67
#
# Wraps the document identifier in a primary "JIS" Docidentifier.
#
# @return [Array<Docidentifier>] empty when the identifier is nil or blank
def fetch_docidentifier
  docid = document_id
  missing = docid.nil? || docid.empty?
  @errors[:docidentifier] &&= missing
  return [] if missing

  [Docidentifier.new(content: docid, type: "JIS", primary: true)]
end
#fetch_docnumber ⇒ Object
77 78 79 80 81 82 83 84 |
# File 'lib/relaton/jis/scraper.rb', line 77
#
# Extracts the document number from the document identifier: the single
# word character after the first word, joined with the digits that follow.
#
# @return [String, nil] nil when the identifier is absent or does not match
def fetch_docnumber
  parts = document_id&.match(/^\w+\s(\w)\s?(\d+)/)
  @errors[:docnumber] &&= parts.nil?
  return if parts.nil?

  letter, digits = parts.captures
  "#{letter}#{digits}"
end
#fetch_doctype ⇒ Object
Derives the document type from the document identifier.
144 145 146 147 148 149 150 151 152 153 154 155 |
# File 'lib/relaton/jis/scraper.rb', line 144
#
# Classifies the document by matching its identifier against a list of
# patterns; the first matching pattern wins.
#
# @return [Doctype, nil] nil when no pattern matches
def fetch_doctype # rubocop:disable Metrics/CyclomaticComplexity
  id = document_id
  # Order matters: the amendment pattern must be tried before the general
  # JIS pattern, which would also match amendment identifiers.
  patterns = [
    [/JIS\s[A-Z]\s[\w-]+:\d{4}\/AMENDMENT/, "amendment"],
    [/JIS\s[A-Z]\s[\w-]+/, "japanese-industrial-standard"],
    [/TR[\s\/][\w-]+/, "technical-report"],
    [/TS[\s\/][\w-]+/, "technical-specification"],
  ]
  _, type = patterns.find { |pattern, _name| pattern.match?(id) }
  @errors[:doctype] &&= type.nil?
  return if type.nil?

  Doctype.new(content: type)
end
#fetch_editorialgroup_contributor ⇒ Object
Builds the committee "author" contributor from the 原案作成団体 table row, if present.
196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 |
# File 'lib/relaton/jis/scraper.rb', line 196
#
# Builds an "author" contributor (described as "committee") whose
# organization carries the 原案作成団体 cell text as a technical-committee
# subdivision.
#
# @return [Bib::Contributor, nil] nil when the table row is missing
def fetch_editorialgroup_contributor # rubocop:disable Metrics/MethodLength
  cell = @doc.at("./table/tr[th[.='原案作成団体']]/td")
  @errors[:editorialgroup] &&= cell.nil?
  return if cell.nil?

  committee = Bib::Subdivision.new(
    type: "technical-committee",
    name: [Bib::TypedLocalizedString.new(content: cell.text.strip)],
  )
  author_role = Bib::Contributor::Role.new(
    type: "author",
    description: [Bib::LocalizedMarkedUpString.new(content: "committee")],
  )
  Bib::Contributor.new(
    role: [author_role],
    organization: Bib::Organization.new(name: [], subdivision: [committee]),
  )
end
#fetch_ext ⇒ Object
222 223 224 225 226 227 228 229 |
# File 'lib/relaton/jis/scraper.rb', line 222
#
# Bundles doctype, ICS codes and structured identifier into the "jis"
# flavor extension object.
#
# @return [Ext]
def fetch_ext
  doctype = fetch_doctype
  ics = fetch_ics
  structured_id = fetch_structuredidentifier
  Ext.new(
    doctype: doctype, flavor: "jis", ics: ics,
    structuredidentifier: structured_id,
  )
end
#fetch_ics ⇒ Object
157 158 159 160 161 162 163 |
# File 'lib/relaton/jis/scraper.rb', line 157
#
# Reads the whitespace-separated codes from the "ICS" table row.
#
# @return [Array<Bib::ICS>] empty when the row is missing
def fetch_ics
  cell = @doc.at("./table/tr[th[.='ICS']]/td")
  @errors[:ics] = cell.nil? if @errors[:ics]
  return [] if cell.nil?

  codes = cell.text.strip.split
  codes.map { |code| Bib::ICS.new(code: code) }
end
#fetch_language ⇒ Object
108 109 110 |
# File 'lib/relaton/jis/scraper.rb', line 108
#
# Returns the :lang value of every entry reported by #langs_scripts.
#
# @return [Array<String>]
def fetch_language
  langs_scripts.map do |entry|
    entry[:lang]
  end
end
#fetch_script ⇒ Object
112 113 114 |
# File 'lib/relaton/jis/scraper.rb', line 112
#
# Returns the :script value of every entry reported by #langs_scripts.
#
# @return [Array<String>]
def fetch_script
  langs_scripts.map do |entry|
    entry[:script]
  end
end
#fetch_source ⇒ Object
Collects the page URI plus any preview (プレビュー) PDF links.
43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/relaton/jis/scraper.rb', line 43
#
# Lists the page itself ("src") followed by one "pdf" URI per preview
# (プレビュー) link; link hrefs are appended to the page's scheme and host.
#
# @return [Array<Bib::Uri>]
def fetch_source # rubocop:disable Metrics/MethodLength
  sources = [Bib::Uri.new(content: @url, type: "src")]
  page_uri = URI(@url)
  origin = "#{page_uri.scheme}://#{page_uri.host}"
  links = @doc.xpath("./dl/dt[.='プレビュー']/following-sibling::dd[1]/a")
  links.each do |link|
    sources << Bib::Uri.new(content: "#{origin}#{link[:href]}", type: "pdf")
  end
  # The list always holds the page URI, so this can only clear the flag.
  @errors[:source] &&= sources.empty?
  sources
end
#fetch_status ⇒ Object
132 133 134 135 136 137 138 139 140 141 142 |
# File 'lib/relaton/jis/scraper.rb', line 132
#
# Maps the <span> following the "状態" text node to a status via STATUSES.
#
# @return [Bib::Status, nil] nil when the span is missing or its text is not
#   a STATUSES key
def fetch_status
  span = @doc.at(
    "./div/div/div/p/text()[contains(.,'状態')]/following-sibling::span",
  )
  stage_name = STATUSES[span&.text&.strip]
  @errors[:status] &&= stage_name.nil?
  return if stage_name.nil?

  Bib::Status.new(stage: Bib::Status::Stage.new(content: stage_name))
end
#fetch_structuredidentifier ⇒ Object
215 216 217 218 219 220 |
# File 'lib/relaton/jis/scraper.rb', line 215
#
# Wraps the document number in a "JIS" structured identifier.
#
# @return [Iso::StructuredIdentifier]
def fetch_structuredidentifier
  project_number = Iso::ProjectNumber.new(content: fetch_docnumber)
  Iso::StructuredIdentifier.new(project_number: project_number, type: "JIS")
end
#fetch_title ⇒ Object
34 35 36 37 38 39 40 41 |
# File 'lib/relaton/jis/scraper.rb', line 34
#
# Reads the titles from the second ("ja"/"Jpan") and third ("en"/"Latn")
# text nodes of the <h2> element.
#
# A missing or blank text node is skipped instead of raising NoMethodError,
# so the :title error flag can actually report that no title was found.
#
# @return [Array<Bib::Title>] zero to two titles
def fetch_title
  titles = []
  { "ja" => "Jpan", "en" => "Latn" }.each_with_index do |(lang, script), i|
    # text()[1] holds the document identifier; titles start at text()[2].
    content = @doc.at("./h2/text()[#{i + 2}]")&.text&.strip
    next if content.nil? || content.empty?

    titles << Bib::Title.new(content: content, language: lang, script: script)
  end
  @errors[:title] &&= titles.empty?
  titles
end
#fetch_type ⇒ Object
104 105 106 |
# File 'lib/relaton/jis/scraper.rb', line 104
#
# Every scraped item gets the same fixed type.
#
# @return [String] always "standard"
def fetch_type
  "standard"
end
#langs_scripts ⇒ Object
Lists (and memoizes) the language/script pairs whose entry on the page is not "-".
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
# File 'lib/relaton/jis/scraper.rb', line 116
#
# Determines which LANGS entries the page offers: for each label, the <span>
# following the label's span must exist and must not read "-". The result is
# memoized.
#
# @return [Array<Hash>] the matching LANGS values, each with :lang and
#   :script
def langs_scripts # rubocop:disable Metrics/MethodLength
  return @langs_scripts if @langs_scripts

  offered = LANGS.select do |key, _lang|
    span = @doc.at(
      "./div/div/div[@class='blockContentFile']/div/div/p[1]" \
      "/span[contains(.,'#{key}')]/following-sibling::span",
    )
    span && span.text.strip != "-"
  end
  found = offered.values
  @errors[:language] &&= found.empty?
  @langs_scripts = found
end