Class: Relaton::Jis::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton/jis/scraper.rb

Constant Summary collapse

# Attribute names collected for every document; #fetch calls the matching
# `fetch_<name>` method for each entry.
ATTRS = %i[
  title
  source
  abstract
  docidentifier
  docnumber
  date
  type
  language
  script
  status
  contributor
  structuredidentifier
  ext
].freeze

# Page labels of the language editions, mapped to the language and script
# codes used for each of them.
LANGS = {
  "和文" => { lang: "ja", script: "Jpan" },
  "英訳" => { lang: "en", script: "Latn" },
}.freeze

# Page labels of the dates, mapped to the date type assigned to each.
DATETYPES = {
  "発行年月日" => "issued",
  "確認年月日" => "confirmed",
}.freeze

# Page labels of the document state, mapped to the status stage value.
STATUSES = {
  "有効" => "valid",
  "廃止" => "withdrawn",
}.freeze

Instance Method Summary collapse

Constructor Details

#initialize(url, errors = {}) ⇒ Scraper

Returns a new instance of Scraper.



18
19
20
21
22
# File 'lib/relaton/jis/scraper.rb', line 18

# Sets up a scraper for a single document page.
#
# @param url [Object] address of the page; #fetch hands it to the Mechanize
#   agent and #fetch_source parses it with URI()
# @param errors [Hash] flags keyed by attribute name; the fetch_* methods
#   update existing truthy entries with `&&=`
def initialize(url, errors = {})
  @errors = errors
  @agent = Mechanize.new
  @url = url
end

Instance Method Details

#create_contrib(name, role) ⇒ Object



178
179
180
181
182
# File 'lib/relaton/jis/scraper.rb', line 178

# Wraps an organization name and a role type into a contributor.
#
# @param name [String] organization name, expanded by #create_orgname
# @param role [String] role type given to the single role of the contributor
# @return [Bib::Contributor]
def create_contrib(name, role)
  Bib::Contributor.new(
    organization: Bib::Organization.new(name: create_orgname(name)),
    role: [Bib::Contributor::Role.new(type: role)],
  )
end

#create_orgname(name) ⇒ Object



184
185
186
187
188
189
190
191
192
193
194
# File 'lib/relaton/jis/scraper.rb', line 184

# Builds the localized name list of an organization.
#
# The given name is always returned as a "ja"/"Jpan" string. When it
# contains "日本規格協会" an English ("en"/"Latn") string is appended.
# NOTE(review): the English text reads "Japanese Industrial Standards",
# which looks like a standards series rather than an organization name —
# confirm this is intended.
#
# @param name [String] organization name as written on the page
# @return [Array<Bib::TypedLocalizedString>] one or two localized names
def create_orgname(name)
  japanese = Bib::TypedLocalizedString.new(
    content: name, language: "ja", script: "Jpan",
  )
  return [japanese] unless name.include?("日本規格協会")

  english = Bib::TypedLocalizedString.new(
    content: "Japanese Industrial Standards", language: "en", script: "Latn",
  )
  [japanese, english]
end

#document_id ⇒ Object



86
87
88
# File 'lib/relaton/jis/scraper.rb', line 86

# Reads the document identifier from the first text node of the `h2`
# heading and memoizes it once a truthy value has been found.
#
# @return [String, nil] stripped identifier, or nil when the node is absent
def document_id
  return @document_id if @document_id

  heading = @doc.at("./h2/text()[1]")
  @document_id = heading&.text&.strip
end

#fetch ⇒ Object

rubocop:disable Metrics/MethodLength



24
25
26
27
28
29
30
31
32
# File 'lib/relaton/jis/scraper.rb', line 24

# Downloads the page at @url and scrapes it into a bibliographic item.
#
# The `section` under `div#main` is stored in @doc and every attribute
# listed in ATTRS is collected through its `fetch_<attr>` method. The
# contributor list is built once (regular contributors plus the optional
# editorial-group contributor) and reused for the :contributor entry, so
# #fetch_contributor is not run a second time through ATTRS.
#
# @return [Bib::ItemData] item built from the collected attributes
def fetch # rubocop:disable Metrics/MethodLength
  @doc = @agent.get(@url).at "//div[@id='main']/section"
  contributors = fetch_contributor
  eg_contributor = fetch_editorialgroup_contributor
  contributors << eg_contributor if eg_contributor
  attrs = ATTRS.to_h do |attr|
    # :contributor is already available; every other attribute is scraped.
    [attr, attr == :contributor ? contributors : send("fetch_#{attr}")]
  end
  # Kept so the key is present even if ATTRS does not list :contributor.
  attrs[:contributor] = contributors
  Bib::ItemData.new(**attrs)
end

#fetch_abstract ⇒ Object



56
57
58
59
60
61
62
63
64
65
# File 'lib/relaton/jis/scraper.rb', line 56

# Turns every `div#honbun` element into a Japanese ("ja"/"Jpan") abstract.
#
# An existing truthy `@errors[:abstract]` flag is replaced by whether no
# abstract was found; a falsy or missing flag is left untouched.
#
# @return [Array<Bib::Abstract>] abstracts found, possibly empty
def fetch_abstract
  blocks = @doc.xpath("//div[@id='honbun']")
  abstracts = blocks.map do |block|
    Bib::Abstract.new(content: block.text.strip, language: "ja", script: "Jpan")
  end
  @errors[:abstract] &&= abstracts.empty?
  abstracts
end

#fetch_contributor ⇒ Object



165
166
167
168
169
170
171
172
173
174
175
176
# File 'lib/relaton/jis/scraper.rb', line 165

# Collects the contributors of the document.
#
# The list always starts with "一般財団法人 日本規格協会" as authorizer. Each
# cell of the "原案作成団体" table row then adds the same organization twice:
# once as author and once as publisher.
# NOTE(review): because the list is seeded with the authorizer it is never
# empty, so a truthy `@errors[:contributor]` is always replaced by false.
#
# @return [Array] contributors built by #create_contrib
def fetch_contributor
  contribs = [create_contrib("一般財団法人 日本規格協会", "authorizer")]
  @doc.xpath("./table/tr[th[.='原案作成団体']]/td").each do |cell|
    %w[author publisher].each do |role|
      contribs << create_contrib(cell.text.strip, role)
    end
  end
  @errors[:contributor] &&= contribs.empty?
  contribs
end

#fetch_date ⇒ Object



90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/relaton/jis/scraper.rb', line 90

# Extracts the dates listed in DATETYPES from the page.
#
# For every label the first text node containing it is searched for a
# `YYYY-MM-DD` pattern; labels without a node or without such a pattern
# are skipped. An existing truthy `@errors[:date]` flag is replaced by
# whether no date was found.
#
# @return [Array<Bib::Date>] dates found, possibly empty
def fetch_date
  dates = []
  DATETYPES.each do |label, type|
    text_node = @doc.at("./div/div/div/p/text()[contains(.,'#{label}')]")
    on = text_node && text_node.text[/\d{4}-\d{2}-\d{2}/]
    dates << Bib::Date.new(type: type, at: on) if on
  end
  @errors[:date] &&= dates.empty?
  dates
end

#fetch_docidentifier ⇒ Object



67
68
69
70
71
72
73
74
75
# File 'lib/relaton/jis/scraper.rb', line 67

# Wraps the scraped document id into a primary identifier of type "JIS".
#
# An existing truthy `@errors[:docidentifier]` flag is replaced by whether
# the id is nil or empty.
#
# @return [Array<Docidentifier>] one identifier, or an empty array when
#   #document_id returns nil or an empty string
def fetch_docidentifier
  docid = document_id
  missing = docid.nil? || docid.empty?
  @errors[:docidentifier] &&= missing
  return [] if missing

  [Docidentifier.new(content: docid, type: "JIS", primary: true)]
end

#fetch_docnumber ⇒ Object



77
78
79
80
81
82
83
84
# File 'lib/relaton/jis/scraper.rb', line 77

# Derives the document number from the document id.
#
# The id must start with a word, whitespace, one word character, an
# optional whitespace and digits; the single character and the digits are
# joined (e.g. "JIS X 0208:1997" gives "X0208"). An existing truthy
# `@errors[:docnumber]` flag is replaced by whether the id did not match.
#
# @return [String, nil] document number, or nil when the id is missing or
#   does not match
def fetch_docnumber
  parts = document_id&.match(/^\w+\s(\w)\s?(\d+)/)
  @errors[:docnumber] &&= parts.nil?
  parts && parts.captures.join
end

#fetch_doctype ⇒ Object

rubocop:disable Metrics/CyclomaticComplexity



144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/relaton/jis/scraper.rb', line 144

# Classifies the document by the shape of its id.
#
# Patterns are tried in order and the first match wins, so an amendment id
# is recognised before the general JIS pattern. An existing truthy
# `@errors[:doctype]` flag is replaced by whether no pattern matched.
#
# @return [Doctype, nil] document type, or nil when the id is missing or
#   matches none of the patterns
def fetch_doctype # rubocop:disable Metrics/CyclomaticComplexity
  id = document_id
  patterns = [
    [/JIS\s[A-Z]\s[\w-]+:\d{4}\/AMENDMENT/, "amendment"],
    [/JIS\s[A-Z]\s[\w-]+/, "japanese-industrial-standard"],
    [/TR[\s\/][\w-]+/, "technical-report"],
    [/TS[\s\/][\w-]+/, "technical-specification"],
  ]
  _pattern, type = patterns.find { |pattern, _| id&.match?(pattern) }
  @errors[:doctype] &&= type.nil?
  type && Doctype.new(content: type)
end

#fetch_editorialgroup_contributor ⇒ Object

rubocop:disable Metrics/MethodLength



196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# File 'lib/relaton/jis/scraper.rb', line 196

# Builds the committee contributor from the first "原案作成団体" table cell.
#
# The cell text becomes the name of a "technical-committee" subdivision of
# an organization with an empty name list; the contributor has one
# "author" role described as "committee". An existing truthy
# `@errors[:editorialgroup]` flag is replaced by whether the cell is absent.
#
# @return [Bib::Contributor, nil] contributor, or nil when no cell exists
def fetch_editorialgroup_contributor # rubocop:disable Metrics/MethodLength
  cell = @doc.at("./table/tr[th[.='原案作成団体']]/td")
  @errors[:editorialgroup] &&= cell.nil?
  return if cell.nil?

  committee_name = Bib::TypedLocalizedString.new(content: cell.text.strip)
  committee = Bib::Subdivision.new(
    type: "technical-committee", name: [committee_name],
  )
  description = Bib::LocalizedMarkedUpString.new(content: "committee")
  Bib::Contributor.new(
    role: [Bib::Contributor::Role.new(type: "author", description: [description])],
    organization: Bib::Organization.new(name: [], subdivision: [committee]),
  )
end

#fetch_ext ⇒ Object



222
223
224
225
226
227
228
229
# File 'lib/relaton/jis/scraper.rb', line 222

# Assembles the extension part of the item: document type, the fixed
# "jis" flavor, ICS codes and the structured identifier.
#
# @return [Ext]
def fetch_ext
  doctype = fetch_doctype
  ics = fetch_ics
  structuredidentifier = fetch_structuredidentifier
  Ext.new(
    doctype: doctype,
    flavor: "jis",
    ics: ics,
    structuredidentifier: structuredidentifier,
  )
end

#fetch_ics ⇒ Object



157
158
159
160
161
162
163
# File 'lib/relaton/jis/scraper.rb', line 157

# Reads the ICS codes from the "ICS" table row; the cell text is split on
# whitespace and every piece becomes one code.
#
# An existing truthy `@errors[:ics]` flag is replaced by whether the cell
# is absent.
#
# @return [Array<Bib::ICS>] codes found, empty when the row is missing
def fetch_ics
  cell = @doc.at("./table/tr[th[.='ICS']]/td")
  @errors[:ics] &&= cell.nil?
  codes = cell ? cell.text.strip.split : []
  codes.map { |value| Bib::ICS.new(code: value) }
end

#fetch_language ⇒ Object



108
109
110
# File 'lib/relaton/jis/scraper.rb', line 108

# Lists the `:lang` value of every entry returned by #langs_scripts.
#
# @return [Array] language codes in the order of #langs_scripts
def fetch_language
  editions = langs_scripts
  editions.map { |edition| edition[:lang] }
end

#fetch_script ⇒ Object



112
113
114
# File 'lib/relaton/jis/scraper.rb', line 112

# Lists the `:script` value of every entry returned by #langs_scripts.
#
# @return [Array] script codes in the order of #langs_scripts
def fetch_script
  editions = langs_scripts
  editions.map { |edition| edition[:script] }
end

#fetch_source ⇒ Object

rubocop:disable Metrics/MethodLength



43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/relaton/jis/scraper.rb', line 43

# Collects the source links of the document.
#
# The page URL itself is always returned as a "src" link. Every anchor in
# the first `dd` after the "プレビュー" `dt` adds a "pdf" link made of the
# scheme and host of the page URL followed by the anchor's href.
# NOTE(review): because the list is seeded with the "src" link it is never
# empty, so a truthy `@errors[:source]` is always replaced by false.
#
# @return [Array<Bib::Uri>] "src" link followed by any "pdf" links
def fetch_source # rubocop:disable Metrics/MethodLength
  links = [Bib::Uri.new(content: @url, type: "src")]
  page = URI(@url)
  origin = "#{page.scheme}://#{page.host}"
  previews = @doc.xpath("./dl/dt[.='プレビュー']/following-sibling::dd[1]/a")
  previews.each do |anchor|
    links << Bib::Uri.new(content: "#{origin}#{anchor[:href]}", type: "pdf")
  end
  @errors[:source] &&= links.empty?
  links
end

#fetch_status ⇒ Object



132
133
134
135
136
137
138
139
140
141
142
# File 'lib/relaton/jis/scraper.rb', line 132

# Reads the document state and maps it through STATUSES.
#
# The state label is the stripped text of the `span` following the text
# node that contains "状態". An existing truthy `@errors[:status]` flag is
# replaced by whether the label is absent or not listed in STATUSES.
#
# @return [Bib::Status, nil] status, or nil when no known label was found
def fetch_status
  span = @doc.at(
    "./div/div/div/p/text()[contains(.,'状態')]" \
    "/following-sibling::span",
  )
  label = span&.text&.strip
  stage_name = STATUSES[label]
  @errors[:status] &&= stage_name.nil?
  return if stage_name.nil?

  Bib::Status.new(stage: Bib::Status::Stage.new(content: stage_name))
end

#fetch_structuredidentifier ⇒ Object



215
216
217
218
219
220
# File 'lib/relaton/jis/scraper.rb', line 215

# Builds the structured identifier of type "JIS" whose project number
# carries the result of #fetch_docnumber (which may be nil).
#
# @return [Iso::StructuredIdentifier]
def fetch_structuredidentifier
  number = Iso::ProjectNumber.new(content: fetch_docnumber)
  Iso::StructuredIdentifier.new(project_number: number, type: "JIS")
end

#fetch_title ⇒ Object



34
35
36
37
38
39
40
41
# File 'lib/relaton/jis/scraper.rb', line 34

# Scrapes the Japanese and English titles from the `h2` heading.
#
# The second and third text nodes of the heading are read as the "ja" and
# "en" titles respectively. A language whose text node is absent is
# skipped instead of raising NoMethodError on nil, which also lets
# `@errors[:title]` report a page without any title: an existing truthy
# flag is replaced by whether no title was found.
#
# @return [Array<Bib::Title>] titles found, possibly empty
def fetch_title
  scripts = { "ja" => "Jpan", "en" => "Latn" }
  titles = scripts.each_with_index.map do |(lang, script), i|
    # text()[1] holds the document id, so titles start at text()[2].
    node = @doc.at("./h2/text()[#{i + 2}]")
    next unless node

    Bib::Title.new content: node.text.strip, language: lang, script: script
  end
  result = titles.compact
  @errors[:title] &&= result.empty?
  result
end

#fetch_type ⇒ Object



104
105
106
# File 'lib/relaton/jis/scraper.rb', line 104

# Returns the item type, which is the same fixed value for every page.
#
# @return [String] always "standard"
def fetch_type
  "standard"
end

#langs_scripts ⇒ Object

rubocop:disable Metrics/MethodLength



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/relaton/jis/scraper.rb', line 116

# Determines which language editions from LANGS the page offers and
# memoizes the answer.
#
# An edition counts as available when the `span` following its label
# exists and its stripped text is not "-". An existing truthy
# `@errors[:language]` flag is replaced by whether no edition was found.
#
# @return [Array<Hash>] the LANGS values (`:lang`/`:script` pairs) of the
#   available editions, in LANGS order
def langs_scripts # rubocop:disable Metrics/MethodLength
  return @langs_scripts if @langs_scripts

  available = LANGS.select do |label, _codes|
    marker = @doc.at(
      "./div/div/div[@class='blockContentFile']/div/div/p[1]" \
      "/span[contains(.,'#{label}')]/following-sibling::span",
    )
    marker && marker.text.strip != "-"
  end
  found = available.values
  @errors[:language] &&= found.empty?
  @langs_scripts = found
end