Class: RelatonJis::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_jis/scraper.rb

Constant Summary collapse

# Names of the bibliographic attributes gathered by #fetch; for every symbol
# listed here #fetch calls the method named `fetch_<symbol>`.
ATTRS =
%i[
  fetched title link abstract docid docnumber date type language script
  docstatus doctype ics contributor editorialgroup structuredidentifier
].freeze
# Labels looked up on the page by #langs_scripts, mapped to the language and
# script codes reported when the labelled value is present.
LANGS =
{ "和文" => { lang: "ja", script: "Jpan" },
"英訳" => { lang: "en", script: "Latn" } }.freeze
# Date labels searched for by #fetch_date, mapped to the date type passed to
# RelatonBib::BibliographicDate.
DATETYPES =
{ "発行年月日" => "issued", "確認年月日" => "confirmed" }.freeze
# Status text read by #fetch_docstatus, mapped to the stage passed to
# RelatonBib::DocumentStatus.
STATUSES =
{ "有効" => "valid", "廃止" => "withdrawn" }.freeze

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Scraper

Returns a new instance of Scraper.



14
15
16
17
# File 'lib/relaton_jis/scraper.rb', line 14

# Prepares a scraper for one document page.
#
# @param url address of the page; it is later handed to the agent's `get`
#   (in #fetch) and to Kernel#URI (in #fetch_link)
def initialize(url)
  @agent = Mechanize.new
  @url = url
end

Instance Method Details

#fetch ⇒ Object



19
20
21
22
23
24
25
# File 'lib/relaton_jis/scraper.rb', line 19

# Downloads the page at @url and builds a bibliographic item from it.
#
# The section node found under div#main is kept in @doc, which the
# fetch_<attr> methods read. Every attribute named in ATTRS is then filled in
# by calling the method of the matching name.
#
# @return [BibliographicItem] item built from the collected attributes
def fetch
  page = @agent.get(@url)
  @doc = page.at("//div[@id='main']/section")
  collected = ATTRS.to_h { |attr| [attr, send("fetch_#{attr}")] }
  BibliographicItem.new(**collected)
end

#fetch_abstract ⇒ Object



48
49
50
51
52
# File 'lib/relaton_jis/scraper.rb', line 48

# Builds the abstract from the table row whose header cell reads 規格概要.
#
# @return [Array<RelatonBib::FormattedString>] one string per matching cell,
#   whitespace-stripped and tagged as Japanese ("ja" / "Jpan")
def fetch_abstract
  cells = @doc.xpath("./table/tr[th[.='規格概要']]/td")
  cells.map do |cell|
    text = cell.text.strip
    RelatonBib::FormattedString.new(content: text, language: "ja", script: "Jpan")
  end
end

#fetch_contributor ⇒ Object



112
113
114
115
116
117
118
# File 'lib/relaton_jis/scraper.rb', line 112

# Turns each cell of the table row headed 原案作成団体 into an authoring
# organization.
#
# @return [Array<RelatonBib::ContributionInfo>] one contribution per cell,
#   each with the role type "author"
def fetch_contributor
  cells = @doc.xpath("./table/tr[th[.='原案作成団体']]/td")
  cells.map do |cell|
    org_name = RelatonBib::LocalizedString.new(cell.text.strip, "ja", "Jpan")
    organization = RelatonBib::Organization.new(name: org_name)
    RelatonBib::ContributionInfo.new(entity: organization, role: [{ type: "author" }])
  end
end

#fetch_date ⇒ Object



64
65
66
67
68
69
70
71
72
# File 'lib/relaton_jis/scraper.rb', line 64

# Collects the dates whose labels are listed in DATETYPES.
#
# For each label, the first text node containing it is searched for a
# YYYY-MM-DD pattern. Labels not present on the page are skipped. When the
# node holds no such pattern, the date is created with an empty string.
#
# @return [Array<RelatonBib::BibliographicDate>] dates in DATETYPES order
def fetch_date
  dates = DATETYPES.map do |label, date_type|
    text_node = @doc.at("./div/div/div/p/text()[contains(.,'#{label}')]")
    next unless text_node

    day = text_node.text[/\d{4}-\d{2}-\d{2}/].to_s
    RelatonBib::BibliographicDate.new(type: date_type, on: day)
  end
  dates.compact
end

#fetch_docid ⇒ Object



54
55
56
57
# File 'lib/relaton_jis/scraper.rb', line 54

# Reads the document identifier from the first text node of the h2 heading.
#
# @return [Array<RelatonBib::DocumentIdentifier>] a single primary
#   identifier of type "JIS"
def fetch_docid
  heading = @doc.at("./h2/text()[1]")
  identifier = RelatonBib::DocumentIdentifier.new(
    id: heading.text.strip, type: "JIS", primary: true,
  )
  [identifier]
end

#fetch_docnumber ⇒ Object



59
60
61
62
# File 'lib/relaton_jis/scraper.rb', line 59

# Derives the document number from the first text node of the h2 heading.
#
# The heading is expected to look like "<word> <letter> <digits>..." (the
# space between letter and digits is optional). The letter and the digit run
# are joined, e.g. "JIS X 0208:1997" gives "X0208".
#
# @return [String] letter followed by the first run of digits
# @raise [NoMethodError] when the heading does not match that pattern
def fetch_docnumber
  heading = @doc.at("./h2/text()[1]").text.strip
  parts = heading.match(/^\w+\s(\w)\s?(\d+)/)
  parts[1] + parts[2]
end

#fetch_docstatus ⇒ Object



95
96
97
98
99
100
# File 'lib/relaton_jis/scraper.rb', line 95

# Reads the document status from the span that follows the 状態 label.
#
# The span text is translated through STATUSES; text that is not a key of
# STATUSES yields a nil stage.
#
# @return [RelatonBib::DocumentStatus, nil] nil when the page has no such span
def fetch_docstatus
  status_span = @doc.at("./div/div/div/p/text()[contains(.,'状態')]/following-sibling::span")
  return unless status_span

  stage = STATUSES[status_span.text.strip]
  RelatonBib::DocumentStatus.new(stage: stage)
end

#fetch_doctype ⇒ Object



102
103
104
# File 'lib/relaton_jis/scraper.rb', line 102

# Document type reported for every scraped page.
#
# @return [String] always "standard"
def fetch_doctype
  "standard"
end

#fetch_editorialgroup ⇒ Object



120
121
122
123
124
125
126
# File 'lib/relaton_jis/scraper.rb', line 120

# Builds the editorial group from the first cell of the table row headed
# 原案作成団体 (the same row #fetch_contributor reads).
#
# @return [RelatonIsoBib::EditorialGroup, nil] group with a single technical
#   committee named after the cell text, or nil when the row is absent
def fetch_editorialgroup
  cell = @doc.at("./table/tr[th[.='原案作成団体']]/td")
  return unless cell

  committee = RelatonBib::WorkGroup.new(name: cell.text.strip)
  RelatonIsoBib::EditorialGroup.new(technical_committee: [committee])
end

#fetch_fetched ⇒ Object



27
28
29
# File 'lib/relaton_jis/scraper.rb', line 27

# Date on which the page is being scraped.
#
# @return [String] today's date formatted as YYYY-MM-DD
def fetch_fetched
  Date.today.iso8601
end

#fetch_ics ⇒ Object



106
107
108
109
110
# File 'lib/relaton_jis/scraper.rb', line 106

# Collects ICS entries from the cells of the table row headed "ICS".
#
# @return [Array<RelatonIsoBib::Ics>] one entry per cell, built from the
#   whitespace-stripped cell text
def fetch_ics
  cells = @doc.xpath("./table/tr[th[.='ICS']]/td")
  cells.map { |cell| RelatonIsoBib::Ics.new(cell.text.strip) }
end

#fetch_language ⇒ Object



78
79
80
# File 'lib/relaton_jis/scraper.rb', line 78

# Language codes of the editions reported by #langs_scripts.
#
# @return [Array] the :lang value of every entry, in the same order
def fetch_language
  langs_scripts.map { |entry| entry[:lang] }
end


38
39
40
41
42
43
44
45
46
# File 'lib/relaton_jis/scraper.rb', line 38

# Builds the list of links for the document.
#
# The first link is always the scraped page itself (type "src"). Every anchor
# in the table row headed プレビュー adds a "pdf" link whose address is the
# page's scheme and host followed by the anchor's href.
#
# @return [Array<RelatonBib::TypedUri>] source link followed by any pdf links
def fetch_link
  source = RelatonBib::TypedUri.new(content: @url, type: "src")
  parsed = URI(@url)
  base = "#{parsed.scheme}://#{parsed.host}"
  previews = @doc.xpath("./table/tr[th[.='プレビュー']]/td/a").map do |anchor|
    RelatonBib::TypedUri.new(content: "#{base}#{anchor[:href]}", type: "pdf")
  end
  [source, *previews]
end

#fetch_script ⇒ Object



82
83
84
# File 'lib/relaton_jis/scraper.rb', line 82

# Script codes of the editions reported by #langs_scripts.
#
# @return [Array] the :script value of every entry, in the same order
def fetch_script
  langs_scripts.map { |entry| entry[:script] }
end

#fetch_structuredidentifier ⇒ Object



128
129
130
# File 'lib/relaton_jis/scraper.rb', line 128

# Wraps the document number from #fetch_docnumber in a structured identifier.
#
# @return [RelatonIsoBib::StructuredIdentifier] identifier of type "JIS"
def fetch_structuredidentifier
  number = fetch_docnumber
  RelatonIsoBib::StructuredIdentifier.new(project_number: number, type: "JIS")
end

#fetch_title ⇒ Object



31
32
33
34
35
36
# File 'lib/relaton_jis/scraper.rb', line 31

# Reads the Japanese and English titles from the text nodes of the h2 heading.
#
# The heading's first text node is the identifier (see #fetch_docid), so the
# titles are taken from text node 2 (Japanese) and text node 3 (English).
# A title whose text node is absent from the page is skipped instead of
# raising NoMethodError on nil.
#
# @return [Array<RelatonBib::TypedTitleString>] titles found, Japanese first
def fetch_title
  # "Latn" is the script code LANGS uses for English; the earlier "Lant"
  # spelling was a typo.
  titles = { "ja" => "Jpan", "en" => "Latn" }.map.with_index do |(lang, script), i|
    node = @doc.at("./h2/text()[#{i + 2}]")
    next unless node

    RelatonBib::TypedTitleString.new content: node.text.strip, language: lang, script: script
  end
  titles.compact
end

#fetch_type ⇒ Object



74
75
76
# File 'lib/relaton_jis/scraper.rb', line 74

# Bibliographic item type reported for every scraped page.
#
# @return [String] always "standard"
def fetch_type
  "standard"
end

#langs_scripts ⇒ Object



86
87
88
89
90
91
92
93
# File 'lib/relaton_jis/scraper.rb', line 86

# Works out which of the editions listed in LANGS the page offers.
#
# For every LANGS label, the span following the labelled span is looked up.
# The entry is kept unless that span is missing or its text is "-".
# The result is memoized in @langs_scripts.
#
# @return [Array<Hash>] the LANGS values (:lang / :script pairs) of the
#   editions found, in LANGS order
def langs_scripts
  @langs_scripts ||= LANGS.map do |label, lang_script|
    value_span = @doc.at(
      "./div/div/div[@class='blockContentFile']/div/div/p[1]" \
      "/span[contains(.,'#{label}')]/following-sibling::span",
    )
    lang_script if value_span && value_span.text.strip != "-"
  end.compact
end