Class: RelatonJis::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_jis/scraper.rb

Constant Summary collapse

# Attribute names collected by #fetch; each one is resolved by calling the
# method named `fetch_<attribute>`.
ATTRS = %i[
  title link abstract docid docnumber date type language script docstatus
  doctype ics contributor editorialgroup structuredidentifier
].freeze

# Page labels looked up by #langs_scripts, mapped to the language and script
# codes reported when the corresponding edition is present.
LANGS = {
  "和文" => { lang: "ja", script: "Jpan" },
  "英訳" => { lang: "en", script: "Latn" },
}.freeze

# Page labels looked up by #fetch_date, mapped to bibliographic date types.
DATETYPES = {
  "発行年月日" => "issued",
  "確認年月日" => "confirmed",
}.freeze

# Status texts read by #fetch_docstatus, mapped to document stage names.
STATUSES = {
  "有効" => "valid",
  "廃止" => "withdrawn",
}.freeze

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Scraper

Returns a new instance of Scraper.



16
17
18
19
# File 'lib/relaton_jis/scraper.rb', line 16

# Prepares a scraper for one page.
#
# @param url [String] address of the page; stored as-is and requested later by #fetch
def initialize(url)
  # A fresh Mechanize agent is created per scraper instance.
  @agent = Mechanize.new
  @url = url
end

Instance Method Details

#create_contrib(name, role) ⇒ Object



132
133
134
135
# File 'lib/relaton_jis/scraper.rb', line 132

# Builds a contribution record for an organization with a single role.
#
# @param name [String] organization name, expanded through #create_orgname
# @param role [String] role type stored as `{ type: role }`
# @return [RelatonBib::ContributionInfo]
def create_contrib(name, role)
  RelatonBib::ContributionInfo.new(
    entity: RelatonBib::Organization.new(name: create_orgname(name)),
    role: [{ type: role }],
  )
end

#create_orgname(name) ⇒ Object



137
138
139
140
141
142
143
# File 'lib/relaton_jis/scraper.rb', line 137

# Builds the localized name list for an organization.
#
# The given name is always recorded as Japanese ("ja"/"Jpan"). When it
# contains "日本規格協会", an English ("en"/"Latn") entry is appended as well.
#
# NOTE(review): 日本規格協会 is commonly rendered "Japanese Standards
# Association"; confirm the English string below is the intended one.
#
# @param name [String] organization name as it appears on the page
# @return [Array<RelatonBib::LocalizedString>]
def create_orgname(name)
  names = [RelatonBib::LocalizedString.new(name, "ja", "Jpan")]
  return names unless name.include?("日本規格協会")

  names << RelatonBib::LocalizedString.new("Japanese Industrial Standards", "en", "Latn")
end

#document_id ⇒ Object



65
66
67
# File 'lib/relaton_jis/scraper.rb', line 65

# Document identifier taken from the first text node of the section's <h2>.
# The value is computed once and cached in @document_id.
#
# @return [String] identifier with surrounding whitespace removed
def document_id
  return @document_id if @document_id

  id_node = @doc.at("./h2/text()[1]")
  @document_id = id_node.text.strip
end

#fetch ⇒ Object



21
22
23
24
25
26
27
# File 'lib/relaton_jis/scraper.rb', line 21

# Downloads the page at @url and builds a bibliographic item from it.
#
# The `//div[@id='main']/section` element is stored in @doc, then every
# name in ATTRS is resolved by calling `fetch_<name>` and the results are
# passed to BibliographicItem.new as keyword arguments.
#
# @return [BibliographicItem]
def fetch
  page = @agent.get(@url)
  @doc = page.at("//div[@id='main']/section")
  values = ATTRS.map { |name| [name, send("fetch_#{name}")] }.to_h
  BibliographicItem.new(**values)
end

#fetch_abstract ⇒ Object



50
51
52
53
54
# File 'lib/relaton_jis/scraper.rb', line 50

# Collects the text of every `div#honbun` element as a Japanese abstract.
# The XPath starts with `//`, so it is not limited to children of @doc.
#
# @return [Array<RelatonBib::FormattedString>] one entry per matching element
def fetch_abstract
  texts = @doc.xpath("//div[@id='honbun']").map { |div| div.text.strip }
  texts.map do |content|
    RelatonBib::FormattedString.new(content: content, language: "ja", script: "Jpan")
  end
end

#fetch_contributor ⇒ Object



124
125
126
127
128
129
130
# File 'lib/relaton_jis/scraper.rb', line 124

# Lists the contributors of the document.
#
# The list always starts with "一般財団法人 日本規格協会" in the "authorizer"
# role. Each table cell under the "原案作成団体" header then adds the same
# organization twice: first as "author", then as "publisher".
#
# @return [Array] contributions produced by #create_contrib
def fetch_contributor
  authorizer = create_contrib("一般財団法人 日本規格協会", "authorizer")
  drafters = @doc.xpath("./table/tr[th[.='原案作成団体']]/td").flat_map do |cell|
    %w[author publisher].map { |role| create_contrib(cell.text.strip, role) }
  end
  [authorizer] + drafters
end

#fetch_date ⇒ Object



69
70
71
72
73
74
75
76
77
# File 'lib/relaton_jis/scraper.rb', line 69

# Extracts the dates listed on the page.
#
# For every label in DATETYPES the first text node containing that label is
# looked up; labels without a matching node are skipped. The date is the
# first `YYYY-MM-DD` substring of the node text, or "" when there is none.
#
# @return [Array<RelatonBib::BibliographicDate>] in DATETYPES order
def fetch_date
  dates = DATETYPES.map do |label, type|
    text_node = @doc.at("./div/div/div/p/text()[contains(.,'#{label}')]")
    next unless text_node

    RelatonBib::BibliographicDate.new(type: type, on: text_node.text[/\d{4}-\d{2}-\d{2}/].to_s)
  end
  dates.compact
end

#fetch_docid ⇒ Object



56
57
58
# File 'lib/relaton_jis/scraper.rb', line 56

# Wraps #document_id in a single primary identifier of type "JIS".
#
# @return [Array<RelatonBib::DocumentIdentifier>] one-element list
def fetch_docid
  identifier = RelatonBib::DocumentIdentifier.new(id: document_id, type: "JIS", primary: true)
  [identifier]
end

#fetch_docnumber ⇒ Object



60
61
62
63
# File 'lib/relaton_jis/scraper.rb', line 60

# Derives the document number from #document_id.
#
# The identifier is expected to look like "<prefix> <letter> <digits>…"
# (for example "JIS B 0001:2019"); the single word character after the
# prefix and the digit run that follows are concatenated ("B0001").
#
# @return [String, nil] the number, or nil when the identifier does not
#   match the expected shape (previously this raised NoMethodError on nil)
def fetch_docnumber
  match = document_id.match(/^\w+\s(\w)\s?(\d+)/)
  return unless match

  "#{match[1]}#{match[2]}"
end

#fetch_docstatus ⇒ Object



100
101
102
103
104
105
# File 'lib/relaton_jis/scraper.rb', line 100

# Reads the document status from the <span> that follows the text node
# containing "状態" and translates it through STATUSES.
#
# @return [RelatonBib::DocumentStatus, nil] nil when the span is absent;
#   the stage is nil when the status text is not a key of STATUSES
def fetch_docstatus
  status_span = @doc.at("./div/div/div/p/text()[contains(.,'状態')]/following-sibling::span")
  RelatonBib::DocumentStatus.new(stage: STATUSES[status_span.text.strip]) if status_span
end

#fetch_doctype ⇒ Object



107
108
109
110
111
112
113
114
115
# File 'lib/relaton_jis/scraper.rb', line 107

# Classifies the document by the shape of #document_id.
#
# Patterns are tried in order and the first match wins, so the amendment
# pattern must stay ahead of the general JIS pattern it is a special case of.
#
# @return [DocumentType] its type is nil when no pattern matches
def fetch_doctype
  id = document_id
  patterns = [
    [/JIS\s[A-Z]\s[\w-]+:\d{4}\/AMENDMENT/, "amendment"],
    [/JIS\s[A-Z]\s[\w-]+/, "japanese-industrial-standard"],
    [/TR[\s\/][\w-]+/, "technical-report"],
    [/TS[\s\/][\w-]+/, "technical-specification"],
  ]
  _, type = patterns.find { |pattern, _| pattern =~ id }
  DocumentType.new(type: type)
end

#fetch_editorialgroup ⇒ Object



145
146
147
148
149
150
151
# File 'lib/relaton_jis/scraper.rb', line 145

# Builds the editorial group from the first table cell under the
# "原案作成団体" header, used as the name of a single technical committee.
#
# @return [RelatonIsoBib::EditorialGroup, nil] nil when the cell is absent
def fetch_editorialgroup
  cell = @doc.at("./table/tr[th[.='原案作成団体']]/td")
  if cell
    committee = RelatonBib::WorkGroup.new(name: cell.text.strip)
    RelatonIsoBib::EditorialGroup.new(technical_committee: [committee])
  end
end

#fetch_ics ⇒ Object



117
118
119
120
121
122
# File 'lib/relaton_jis/scraper.rb', line 117

# Reads the ICS codes from the table cell under the "ICS" header.
# The cell text is split on whitespace, one code per token.
#
# @return [Array<RelatonIsoBib::Ics>] empty when the cell is absent
def fetch_ics
  cell = @doc.at("./table/tr[th[.='ICS']]/td")
  codes = cell ? cell.text.strip.split : []
  codes.map { |code| RelatonIsoBib::Ics.new(code) }
end

#fetch_language ⇒ Object



83
84
85
# File 'lib/relaton_jis/scraper.rb', line 83

# Language codes of the editions reported by #langs_scripts.
#
# @return [Array] the `:lang` value of each entry, in the same order
def fetch_language
  langs_scripts.each_with_object([]) { |entry, codes| codes << entry[:lang] }
end


40
41
42
43
44
45
46
47
48
# File 'lib/relaton_jis/scraper.rb', line 40

# Collects the links of the document.
#
# The first entry is the scraped page itself (type "src"). Every anchor in
# the <dd> that follows the "プレビュー" <dt> adds a "pdf" link.
#
# Anchor hrefs are resolved against the page URL with URI.join, so a
# non-default port is kept and absolute or document-relative hrefs resolve
# correctly. Earlier the href was appended to "scheme://host", which dropped
# the port and mangled anything that was not a root-relative path. That old
# concatenation is still used as a fallback when the href cannot be parsed
# as a URI.
#
# @return [Array<RelatonBib::TypedUri>]
# @raise [URI::InvalidURIError] when @url itself cannot be parsed
def fetch_link
  src = RelatonBib::TypedUri.new content: @url, type: "src"
  base = URI @url
  @doc.xpath("./dl/dt[.='プレビュー']/following-sibling::dd[1]/a").reduce([src]) do |mem, node|
    href = node[:href].to_s
    target = begin
      URI.join(@url, href).to_s
    rescue URI::Error
      # Best effort for hrefs that are not valid URI references.
      "#{base.scheme}://#{base.host}#{href}"
    end
    mem << RelatonBib::TypedUri.new(content: target, type: "pdf")
  end
end

#fetch_script ⇒ Object



87
88
89
# File 'lib/relaton_jis/scraper.rb', line 87

# Script codes of the editions reported by #langs_scripts.
#
# @return [Array] the `:script` value of each entry, in the same order
def fetch_script
  langs_scripts.each_with_object([]) { |entry, codes| codes << entry[:script] }
end

#fetch_structuredidentifier ⇒ Object



153
154
155
# File 'lib/relaton_jis/scraper.rb', line 153

# Builds the structured identifier: the project number comes from
# #fetch_docnumber and the type is always "JIS".
#
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier
  number = fetch_docnumber
  RelatonIsoBib::StructuredIdentifier.new(project_number: number, type: "JIS")
end

#fetch_title ⇒ Object

# Current local date in `YYYY-MM-DD` form.
#
# @return [String]
def fetch_fetched
  Date.today.iso8601
end



33
34
35
36
37
38
# File 'lib/relaton_jis/scraper.rb', line 33

# Reads the titles from the section's <h2>.
#
# The second text node of the <h2> is taken as the Japanese title and the
# third as the English one (the first holds the identifier, see
# #document_id).
#
# The English script code is the ISO 15924 code "Latn"; it was previously
# misspelled "Lant".
#
# @return [Array<RelatonBib::TypedTitleString>] Japanese title, then English
def fetch_title
  { "ja" => "Jpan", "en" => "Latn" }.map.with_index do |(lang, script), i|
    # Text nodes are 1-based in XPath and node 1 is the identifier, hence +2.
    content = @doc.at("./h2/text()[#{i + 2}]").text.strip
    RelatonBib::TypedTitleString.new content: content, language: lang, script: script
  end
end

#fetch_type ⇒ Object



79
80
81
# File 'lib/relaton_jis/scraper.rb', line 79

# Bibliographic item type; the same value is returned for every page.
#
# @return [String] always "standard"
def fetch_type
  'standard'
end

#langs_scripts ⇒ Object



91
92
93
94
95
96
97
98
# File 'lib/relaton_jis/scraper.rb', line 91

# Determines which editions listed in LANGS are present on the page.
#
# For each LANGS label, the <span> following the span that contains the
# label is inspected; the edition counts as present when that span exists
# and its stripped text is not "-". The result is cached in @langs_scripts.
#
# @return [Array<Hash>] the matching LANGS values, in LANGS order
def langs_scripts
  return @langs_scripts if @langs_scripts

  present = LANGS.map do |label, lang_script|
    span = @doc.at("./div/div/div[@class='blockContentFile']/div/div/p[1]/span[contains(.,'#{label}')]/following-sibling::span")
    lang_script if span && span.text.strip != "-"
  end
  @langs_scripts = present.compact
end