Class: Relaton::Jis::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton/jis/scraper.rb

Constant Summary collapse

# Bibliographic attributes gathered by #fetch. For every symbol +attr+ the
# scraper calls the method named +fetch_<attr>+.
ATTRS = %i[
  title
  source
  abstract
  docidentifier
  docnumber
  date
  type
  language
  script
  status
  contributor
  structuredidentifier
  ext
].freeze

# Page labels of the text editions mapped to language and script codes
# ("和文" = Japanese text, "英訳" = English translation).
LANGS = {
  "和文" => { lang: "ja", script: "Jpan" },
  "英訳" => { lang: "en", script: "Latn" },
}.freeze

# Page labels of the dates mapped to date types
# ("発行年月日" = date of issue, "確認年月日" = date of confirmation).
DATETYPES = {
  "発行年月日" => "issued",
  "確認年月日" => "confirmed",
}.freeze

# Page labels of the document state mapped to stage names
# ("有効" = in force, "廃止" = abolished).
STATUSES = {
  "有効" => "valid",
  "廃止" => "withdrawn",
}.freeze

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Scraper

Returns a new instance of Scraper.



18
19
20
21
# File 'lib/relaton/jis/scraper.rb', line 18

# Prepares a scraper for a single document page.
#
# @param url [String] address of the page to scrape (assumed to be a
#   String; it is later handed to the Mechanize agent and to +URI()+)
def initialize(url)
  @agent = Mechanize.new
  @url = url
end

Instance Method Details

#create_contrib(name, role) ⇒ Object



150
151
152
153
154
# File 'lib/relaton/jis/scraper.rb', line 150

# Builds a contributor made of one organization and one role.
#
# @param name [String] organization name, expanded by #create_orgname
# @param role [String] role type given to the contributor
# @return [Bib::Contributor]
def create_contrib(name, role)
  Bib::Contributor.new(
    organization: Bib::Organization.new(name: create_orgname(name)),
    role: [Bib::Contributor::Role.new(type: role)],
  )
end

#create_orgname(name) ⇒ Object



156
157
158
159
160
161
162
163
164
165
166
# File 'lib/relaton/jis/scraper.rb', line 156

# Wraps an organization name in localized strings.
#
# The given name is always returned tagged as Japanese. When it contains
# "日本規格協会", a fixed English name is appended.
#
# @param name [String] organization name as shown on the page
# @return [Array<Bib::TypedLocalizedString>]
def create_orgname(name)
  localized = Bib::TypedLocalizedString
  names = [localized.new(content: name, language: "ja", script: "Jpan")]
  return names unless name.include?("日本規格協会")

  names << localized.new(
    content: "Japanese Industrial Standards",
    language: "en", script: "Latn"
  )
end

#document_id ⇒ Object



74
75
76
# File 'lib/relaton/jis/scraper.rb', line 74

# Document identifier: the stripped first text node of the page heading.
# The value is computed once and cached.
#
# @return [String]
def document_id
  @document_id ||= begin
    heading = @doc.at("./h2/text()[1]")
    heading.text.strip
  end
end

#fetch ⇒ Object

Loads the page at the scraper's URL and builds a Bib::ItemData from the scraped attributes.



23
24
25
26
27
28
29
30
31
32
33
# File 'lib/relaton/jis/scraper.rb', line 23

# Loads the page and assembles the bibliographic item.
#
# The main section of the page is stored in +@doc+, then every attribute in
# ATTRS is filled by calling the matching +fetch_<attr>+ method.
# Contributors are the exception: they are scraped once up front, extended
# with the editorial-group contributor when there is one, and assigned
# directly, so #fetch_contributor is not run a second time by the loop.
#
# @return [Bib::ItemData]
def fetch # rubocop:disable Metrics/MethodLength
  @doc = @agent.get(@url).at "//div[@id='main']/section"
  contributors = fetch_contributor
  eg_contributor = fetch_editorialgroup_contributor
  contributors << eg_contributor if eg_contributor
  # :contributor is handled above; skipping it here avoids scraping the
  # same table row twice and discarding the result.
  attrs = (ATTRS - %i[contributor]).each_with_object({}) do |attr, hash|
    hash[attr] = send("fetch_#{attr}")
  end
  attrs[:contributor] = contributors
  Bib::ItemData.new(**attrs)
end

#fetch_abstract ⇒ Object



53
54
55
56
57
58
59
60
# File 'lib/relaton/jis/scraper.rb', line 53

# Collects the abstract: one Japanese-tagged string for each
# +div#honbun+ element matched by the XPath query.
#
# @return [Array<Bib::LocalizedMarkedUpString>]
def fetch_abstract
  sections = @doc.xpath("//div[@id='honbun']")
  sections.map do |section|
    body = section.text.strip
    Bib::LocalizedMarkedUpString.new(
      content: body, language: "ja", script: "Jpan",
    )
  end
end

#fetch_contributor ⇒ Object



139
140
141
142
143
144
145
146
147
148
# File 'lib/relaton/jis/scraper.rb', line 139

# Lists the contributors named on the page.
#
# The list always starts with the fixed "authorizer" organization. Every
# cell of the "原案作成団体" table row then adds the named organization
# twice: first as "author", then as "publisher".
#
# @return [Array<Bib::Contributor>]
def fetch_contributor
  contributors = [create_contrib("一般財団法人 日本規格協会", "authorizer")]
  @doc.xpath("./table/tr[th[.='原案作成団体']]/td").each do |cell|
    org_name = cell.text.strip
    %w[author publisher].each do |role|
      contributors << create_contrib(org_name, role)
    end
  end
  contributors
end

#fetch_date ⇒ Object



78
79
80
81
82
83
84
85
86
# File 'lib/relaton/jis/scraper.rb', line 78

# Extracts the dates shown on the page.
#
# For every label in DATETYPES the first text node containing that label is
# looked up and the first +YYYY-MM-DD+ sequence in it becomes the date
# value. A label that is missing from the page, or that is present without
# such a date, produces no entry, so no date with an empty +at+ is built.
#
# @return [Array<Bib::Date>]
def fetch_date
  DATETYPES.each_with_object([]) do |(key, type), dates|
    node = @doc.at("./div/div/div/p/text()[contains(.,'#{key}')]")
    # nil when the label is absent or carries no date
    at = node&.text&.slice(/\d{4}-\d{2}-\d{2}/)
    next unless at

    dates << Bib::Date.new(type: type, at: at)
  end
end

#fetch_docidentifier ⇒ Object



62
63
64
65
66
67
# File 'lib/relaton/jis/scraper.rb', line 62

# Wraps the document identifier in a single primary identifier of type
# "JIS".
#
# @return [Array<Docidentifier>] one-element array
def fetch_docidentifier
  identifier = Docidentifier.new(
    content: document_id, type: "JIS", primary: true,
  )
  [identifier]
end

#fetch_docnumber ⇒ Object



69
70
71
72
# File 'lib/relaton/jis/scraper.rb', line 69

# Derives the document number from the document identifier.
#
# The identifier must start with a word, whitespace, one word character,
# optional whitespace and a run of digits; the single character and the
# digits are joined, e.g. "JIS X 0208:1997" gives "X0208".
#
# @return [String, nil] the number, or nil when the identifier does not
#   have that shape (previously this raised NoMethodError on nil)
def fetch_docnumber
  match = document_id.match(/^\w+\s(\w)\s?(\d+)/)
  return unless match

  "#{match[1]}#{match[2]}"
end

#fetch_doctype ⇒ Object



122
123
124
125
126
127
128
129
130
# File 'lib/relaton/jis/scraper.rb', line 122

# Classifies the document by the shape of its identifier.
#
# The patterns are tried in order and the first match wins, so the
# amendment pattern is listed before the plain JIS pattern that would
# also match it. When nothing matches the doctype content is nil.
#
# @return [Doctype]
def fetch_doctype
  patterns = {
    /JIS\s[A-Z]\s[\w-]+:\d{4}\/AMENDMENT/ => "amendment",
    /JIS\s[A-Z]\s[\w-]+/ => "japanese-industrial-standard",
    /TR[\s\/][\w-]+/ => "technical-report",
    /TS[\s\/][\w-]+/ => "technical-specification",
  }
  id = document_id
  _pattern, type = patterns.find { |pattern, _name| pattern.match?(id) }
  Doctype.new(content: type)
end

#fetch_editorialgroup_contributor ⇒ Object

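Builds an author contributor (described as "committee") for the body named in the 原案作成団体 table row; returns nil when that row is absent.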



168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/relaton/jis/scraper.rb', line 168

# Builds the editorial-group contributor from the "原案作成団体" table row.
#
# The cell text becomes the name of a "technical-committee" subdivision
# of an otherwise unnamed organization; the contributor holds the
# "author" role with the description "committee".
#
# @return [Bib::Contributor, nil] nil when the page has no such row
def fetch_editorialgroup_contributor # rubocop:disable Metrics/MethodLength
  cell = @doc.at("./table/tr[th[.='原案作成団体']]/td")
  return unless cell

  committee_name = Bib::TypedLocalizedString.new(content: cell.text.strip)
  committee = Bib::Subdivision.new(
    type: "technical-committee", name: [committee_name],
  )
  committee_role = Bib::Contributor::Role.new(
    type: "author",
    description: [Bib::LocalizedMarkedUpString.new(content: "committee")],
  )
  Bib::Contributor.new(
    role: [committee_role],
    organization: Bib::Organization.new(name: [], subdivision: [committee]),
  )
end

#fetch_ext ⇒ Object



193
194
195
196
197
198
199
200
# File 'lib/relaton/jis/scraper.rb', line 193

# Assembles the flavor-specific extension: doctype, ICS codes and
# structured identifier, with the flavor fixed to "jis".
#
# @return [Ext]
def fetch_ext
  doctype = fetch_doctype
  ics = fetch_ics
  structured_id = fetch_structuredidentifier
  Ext.new(
    doctype: doctype, flavor: "jis", ics: ics,
    structuredidentifier: structured_id
  )
end

#fetch_ics ⇒ Object



132
133
134
135
136
137
# File 'lib/relaton/jis/scraper.rb', line 132

# Reads the ICS codes from the "ICS" table row; the cell text is split on
# whitespace and each piece becomes one code.
#
# @return [Array<Bib::ICS>] empty when the page has no ICS row
def fetch_ics
  cell = @doc.at("./table/tr[th[.='ICS']]/td")
  codes = cell ? cell.text.strip.split : []
  codes.map { |code| Bib::ICS.new(code: code) }
end

#fetch_language ⇒ Object



92
93
94
# File 'lib/relaton/jis/scraper.rb', line 92

# Language codes of the editions reported by #langs_scripts.
#
# @return [Array<String>]
def fetch_language
  editions = langs_scripts
  editions.map { |edition| edition[:lang] }
end

#fetch_script ⇒ Object



96
97
98
# File 'lib/relaton/jis/scraper.rb', line 96

# Script codes of the editions reported by #langs_scripts.
#
# @return [Array<String>]
def fetch_script
  editions = langs_scripts
  editions.map { |edition| edition[:script] }
end

#fetch_source ⇒ Object

Returns the source URIs: the page URL itself (type "src") followed by one "pdf" URI for each preview link.



42
43
44
45
46
47
48
49
50
51
# File 'lib/relaton/jis/scraper.rb', line 42

# Lists the source links of the document.
#
# The first entry is the page URL itself (type "src"). Each link found
# under the "プレビュー" term adds a "pdf" entry whose address is the page's
# scheme and host with the link's +href+ appended verbatim.
#
# @return [Array<Bib::Uri>]
def fetch_source # rubocop:disable Metrics/MethodLength
  page = URI(@url)
  origin = "#{page.scheme}://#{page.host}"
  previews = @doc.xpath("./dl/dt[.='プレビュー']/following-sibling::dd[1]/a")
  pdfs = previews.map do |link|
    Bib::Uri.new(content: "#{origin}#{link[:href]}", type: "pdf")
  end
  [Bib::Uri.new(content: @url, type: "src"), *pdfs]
end

#fetch_status ⇒ Object



112
113
114
115
116
117
118
119
120
# File 'lib/relaton/jis/scraper.rb', line 112

# Reads the document status from the span that follows the "状態" label.
#
# The span text is translated through STATUSES; a text that is not listed
# there gives a stage whose content is nil.
#
# @return [Bib::Status, nil] nil when the page shows no status span
def fetch_status
  label = @doc.at(
    "./div/div/div/p/text()[contains(.,'状態')]/following-sibling::span",
  )
  return unless label

  Bib::Status.new(
    stage: Bib::Status::Stage.new(content: STATUSES[label.text.strip]),
  )
end

#fetch_structuredidentifier ⇒ Object



186
187
188
189
190
191
# File 'lib/relaton/jis/scraper.rb', line 186

# Builds the structured identifier: type "JIS" with the document number
# as its project number.
#
# @return [Iso::StructuredIdentifier]
def fetch_structuredidentifier
  number = Iso::ProjectNumber.new(content: fetch_docnumber)
  Iso::StructuredIdentifier.new(project_number: number, type: "JIS")
end

#fetch_title ⇒ Object



35
36
37
38
39
40
# File 'lib/relaton/jis/scraper.rb', line 35

# Reads the titles from the page heading.
#
# The second text node of the heading is taken as the Japanese title and
# the third as the English one (the first text node is what #document_id
# reads). A title whose text node is missing or blank is left out;
# previously a missing node raised NoMethodError on nil.
#
# @return [Array<Bib::Title>] zero to two titles, Japanese first
def fetch_title
  scripts = { "ja" => "Jpan", "en" => "Latn" }
  titles = scripts.map.with_index do |(lang, script), i|
    content = @doc.at("./h2/text()[#{i + 2}]")&.text&.strip
    next if content.nil? || content.empty?

    Bib::Title.new content: content, language: lang, script: script
  end
  titles.compact
end

#fetch_type ⇒ Object



88
89
90
# File 'lib/relaton/jis/scraper.rb', line 88

# Bibliographic item type; the same value is returned for every page.
#
# @return [String] always "standard"
def fetch_type
  "standard"
end

#langs_scripts ⇒ Object



100
101
102
103
104
105
106
107
108
109
110
# File 'lib/relaton/jis/scraper.rb', line 100

# Determines which editions listed in LANGS the page offers.
#
# For each label the span following the label's span in the
# "blockContentFile" block is inspected; the edition counts as available
# unless that span is missing or its text is "-". The result is cached.
#
# @return [Array<Hash>] the matching LANGS values, each with +:lang+ and
#   +:script+ keys
def langs_scripts
  @langs_scripts ||= begin
    available = LANGS.select do |label, _codes|
      cell = @doc.at(
        "./div/div/div[@class='blockContentFile']/div/div/p[1]" \
        "/span[contains(.,'#{label}')]/following-sibling::span",
      )
      cell && cell.text.strip != "-"
    end
    available.values
  end
end