Class: Relaton::Cie::DataFetcher

Inherits:
Relaton::Core::DataFetcher
  • Object
show all
Defined in:
lib/relaton/cie/data_fetcher.rb

Constant Summary collapse

URL =
"https://www.techstreet.com/cie/searches/31156444?page=1&per_page=100"

Instance Method Summary collapse

Instance Method Details

#agent ⇒ Object



16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/relaton/cie/data_fetcher.rb', line 16

# Lazily builds and memoizes the Mechanize client used for HTTP requests.
#
# The client is configured with browser-like request headers and the
# "Linux Firefox" user-agent alias.
#
# @return [Mechanize] the shared, already configured agent
def agent
  unless @agent
    browser_headers = {
      "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
      "Accept-Language" => "en-US,en;q=0.5",
      "Connection" => "keep-alive",
      "sec-ch-ua" => '"Chromium";v="91", "Google Chrome";v="91", ";Not A Brand";v="99"',
      "Sec-Fetch-Dest" => "document"
    }
    @agent = Mechanize.new
    @agent.request_headers = browser_headers
    @agent.user_agent_alias = "Linux Firefox"
  end
  @agent
end

#fetch(_source = nil) ⇒ Object



263
264
265
266
# File 'lib/relaton/cie/data_fetcher.rb', line 263

# Runs the whole crawl and then reports the collected errors.
#
# @param _source [Object, nil] ignored by this implementation
# @return [Object] whatever `report_errors` returns
def fetch(_source = nil)
  # Crawl first; the error report is only meaningful once the crawl is done.
  fetch_doc.then { report_errors }
end

#fetch_abstract(doc) ⇒ Array<Relaton::Bib::LocalizedMarkedUpString>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<Relaton::Bib::LocalizedMarkedUpString>)


170
171
172
173
174
175
176
177
178
179
180
# File 'lib/relaton/cie/data_fetcher.rb', line 170

# Extracts the abstract from the first element whose class contains
# "description".
#
# `@errors[:abstract]`, when currently truthy, is updated to reflect whether
# the abstract is missing from this page.
#
# @param doc [Mechanize::Page]
# @return [Array<Relaton::Bib::LocalizedMarkedUpString>] one abstract, or
#   an empty array when the page has no description text
def fetch_abstract(doc)
  text = doc.at('//div[contains(@class,"description")]')&.text&.strip
  missing = text.nil? || text.empty?
  @errors[:abstract] &&= missing
  return [] if missing

  [Bib::Abstract.new(content: text, language: "en", script: "Latn")]
end

#fetch_contributor(doc) ⇒ Array<Relaton::Bib::Contributor>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<Relaton::Bib::Contributor>)


184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# File 'lib/relaton/cie/data_fetcher.rb', line 184

# Builds the contributor list: one "author" contributor per personal name
# parsed from the page header, followed by a fixed "publisher" contributor
# for the CIE organization.
#
# The author text is consumed from the front: each iteration matches one
# name at the start of a line, removes the matched text and repeats until
# the string is empty.
#
# @param doc [Mechanize::Page]
# @return [Array<Relaton::Bib::Contributor>] authors (possibly none) plus
#   the publisher as the last element
# @raise [StandardError] when the remaining author text does not match the
#   name pattern
def fetch_contributor(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity
  # All header paragraphs except the publication date, with double quotes removed.
  authors = doc.xpath('//hgroup/p[not(@class="pub_date")]').text.gsub "\"", ""
  contribs = []
  until authors.empty?
    # Named groups: sname1/sname2 = surname parts, fname = spelled-out
    # forename, init = run of initials. The last group swallows the
    # separator (comma, semicolon, whitespace, optional "and") before the
    # next name. A regex literal on the left of =~ assigns the named groups
    # to local variables.
    /^(?<sname1>\S+(?:\sder?\s)?[^\s,]+)
    (?:,?\s(?<sname2>[\w-]{2,})(?=,\s+\w\.))?
    (?:,?\s(?<fname>W-T\.[\w-]{2,})(?!,\s+\w\.))?
    (?:(?:\s?,\s?|\s)(?<init>(?:\w(?:\s?\.|\s|,|$)[\s-]?)+))?
    (?:(?:[,;]\s*|\s+|\.|(?<=\s))(?:and\s)?)?/x =~ authors
    # NOTE(review): $LAST_MATCH_INFO is the `English` library alias of $~;
    # presumably `require "English"` happens elsewhere in the file - confirm.
    raise StandardError, "Author name not found in \"#{authors}\"" unless $LAST_MATCH_INFO

    # Drop the first occurrence of the matched text so the loop advances.
    authors.sub! $LAST_MATCH_INFO.to_s, ""
    sname = [sname1, sname2].compact.join " "
    surname = Bib::LocalizedString.new content: sname, language: "en", script: "Latn"
    forename = []
    forename << Bib::FullNameType::Forename.new(content: fname, language: "en", script: "Latn") if fname
    # Each initial becomes its own Forename with empty content.
    (init&.strip || "").split(/(?:,|\.)(?:-|\s)?/).each do |int|
      forename << Bib::FullNameType::Forename.new(content: "", initial: int.strip, language: "en", script: "Latn")
    end
    fullname = Bib::FullName.new surname: surname, forename: forename
    person = Bib::Person.new name: fullname
    role = Bib::Contributor::Role.new type: "author"
    contribs << Bib::Contributor.new(person: person, role: [role])
    # contribs is non-empty here, so a truthy flag is reset to false as soon
    # as one author has been parsed.
    @errors[:contributor_author] &&= contribs.empty?
  end
  org_name = Bib::TypedLocalizedString.new(content: "Commission Internationale de L'Eclairage")
  abbrev = Bib::LocalizedString.new content: "CIE"
  org_uri = Bib::Uri.new content: "cie.co.at"
  org = Bib::Organization.new(name: [org_name], abbreviation: abbrev, uri: [org_uri])
  org_role = Bib::Contributor::Role.new type: "publisher"
  # Array#<< returns the array itself, which is the method's return value.
  contribs << Bib::Contributor.new(organization: org, role: [org_role])
end

#fetch_date(doc) ⇒ Array<Relaton::Bib::Date>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<Relaton::Bib::Date>)


115
116
117
118
119
120
121
122
123
# File 'lib/relaton/cie/data_fetcher.rb', line 115

def fetch_date(doc)
  result = doc.xpath("//h3[.='Published:']/following-sibling::span").map do |d|
    pd = d.text.strip
    on = pd.match?(/^\d{4}(?:[^-]|$)/) ? pd : Date.strptime(pd, "%m/%d/%Y").strftime("%Y-%m-%d")
    Bib::Date.new(type: "published", at: on)
  end
  @errors[:date] &&= result.empty?
  result
end

#fetch_doc(url = URL) ⇒ Object



268
269
270
271
272
273
274
275
276
277
# File 'lib/relaton/cie/data_fetcher.rb', line 268

# Walks the paginated search listing starting at `url`.
#
# Every `li[data-product]` hit on a page is handed to `parse_page`; the
# `next_page` link, when present, is followed. After the last page the index
# is saved.
#
# @param url [String] listing page to start from
# @return [Object] the result of `index.save`
def fetch_doc(url = URL)
  page_url = url
  while page_url
    listing = time_req { agent.get page_url }
    listing.xpath("//li[@data-product]").each { |hit| parse_page hit }
    next_link = listing.at '//a[@class="next_page"]'
    # nil ends the loop once there is no further page.
    page_url = next_link && "https://www.techstreet.com#{next_link[:href]}"
  end
  index.save
end

#fetch_docid(hit, doc) ⇒ Array<Relaton::Bib::Docidentifier>

Parameters:

  • hit (Nokogiri::XML::Element)
  • doc (Mechanize::Page)

Returns:

  • (Array<Relaton::Bib::Docidentifier>)


42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/relaton/cie/data_fetcher.rb', line 42

# Builds the document identifiers of a hit: the primary CIE code, an
# optional secondary (ISO/IEC) code and an optional ISBN.
#
# For each of the keys `:docid_1`, `:docid_2` and `:docid_isbn`, a truthy
# `@errors` flag is set to whether that identifier is missing.
#
# The ISBN is stored stripped, like the secondary code; previously it was
# only stripped for the emptiness check and kept its surrounding whitespace.
#
# @param hit [Object] search-result node, passed through to `parse_code`
# @param doc [Mechanize::Page]
# @return [Array<Relaton::Bib::Docidentifier>]
def fetch_docid(hit, doc)
  code, code2 = parse_code hit, doc
  code2 = code2&.strip
  isbn = doc.at('//h3[contains(.,"ISBN")]/following-sibling::span')&.text&.strip
  docid = []

  has_primary = !code.to_s.strip.empty?
  docid << Bib::Docidentifier.new(type: "CIE", content: code, primary: true) if has_primary
  @errors[:docid_1] &&= !has_primary

  has_secondary = !code2.to_s.empty?
  if has_secondary
    # The identifier type is the leading word of the code, e.g. "ISO".
    type2 = code2.match(/\w+/).to_s
    docid << Bib::Docidentifier.new(type: type2, content: code2)
  end
  @errors[:docid_2] &&= !has_secondary

  has_isbn = !isbn.to_s.empty?
  docid << Bib::Docidentifier.new(type: "ISBN", content: isbn) if has_isbn
  @errors[:docid_isbn] &&= !has_isbn

  docid
end

#fetch_docnumber(hit) ⇒ Object



95
96
97
# File 'lib/relaton/cie/data_fetcher.rb', line 95

# Derives the document number from the hit's primary code by dropping a
# leading "CIE " (and a following "ISO ", if any).
#
# @param hit [Object] search-result node, passed through to `parse_code`
# @return [String]
def fetch_docnumber(hit)
  primary, = parse_code(hit)
  primary.sub(/^CIE\s(?:ISO\s)?/, "")
end

#fetch_doctype ⇒ Object



221
222
223
# File 'lib/relaton/cie/data_fetcher.rb', line 221

# Returns the document type used for every fetched item.
#
# @return [Relaton::Bib::Doctype] doctype whose content is "document"
def fetch_doctype
  kind = "document"
  Bib::Doctype.new(content: kind)
end

#fetch_edition(doc) ⇒ String

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (String)


127
128
129
130
131
132
133
134
135
136
137
# File 'lib/relaton/cie/data_fetcher.rb', line 127

# Reads the edition number from the "Edition:" field of a document page.
#
# Only the digits in front of an ordinal suffix (st/nd/rd/th) are used.
# `@errors[:edition]` is set to false as soon as an edition is found.
#
# @param doc [Mechanize::Page]
# @return [Relaton::Bib::Edition, nil] nil when the field is absent or does
#   not start with an ordinal number
def fetch_edition(doc)
  node = doc.at("//h3[.='Edition:']/following-sibling::span")
  @errors[:edition] &&= true
  number = node&.text&.slice(/^\d+(?=(st|nd|rd|th))/)
  return unless number

  @errors[:edition] = false
  Bib::Edition.new(content: number)
end

#fetch_ext ⇒ Object



217
218
219
# File 'lib/relaton/cie/data_fetcher.rb', line 217

# Builds the flavor-specific extension block of an item.
#
# @return [Ext] extension with the doctype from `fetch_doctype` and the
#   flavor "cie"
def fetch_ext
  doctype = fetch_doctype
  Ext.new(doctype: doctype, flavor: "cie")
end

#fetch_relation(doc) ⇒ Array<Relaton::Cie::Relation>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<Relaton::Cie::Relation>)


141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/relaton/cie/data_fetcher.rb', line 141

# Builds relations to the other versions listed in the page's "history"
# section (every list item except the currently selected product).
#
# A related item marked as historical is one this document "updates"; any
# other item is one this document is "updatedBy".
#
# The historical marker is looked up relative to the list item. The former
# query started with `//`, which is evaluated against the whole document,
# so one marker anywhere on the page typed every relation as "updates".
# An item without a `<time>` element now yields an empty date list instead
# of raising on `nil[:datetime]`.
#
# @param doc [Mechanize::Page]
# @return [Array<Relaton::Bib::Relation>]
def fetch_relation(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  rels = doc.xpath('//section[@class="history"]/ol/li[not(contains(@class,"selected-product"))]').map do |rel|
    ref = rel.at("a")
    url = "https://www.techstreet.com#{ref[:href]}"
    title = Bib::Title.from_string ref.at('p/span[@class="title"]').text
    did = ref.at("h3").text
    docid = [Bib::Docidentifier.new(type: "CIE", content: did, primary: true)]
    on = ref.at("p/time")
    date = on ? [Bib::Date.new(type: "published", at: on[:datetime])] : []
    source = [Bib::Uri.new(type: "src", content: url)]
    bibitem = ItemData.new docidentifier: docid, title: title, source: source, date: date
    # "./" keeps the lookup inside this list item.
    historical = rel.at('./i[contains(@class,"historical")]')
    type = historical ? "updates" : "updatedBy"
    Bib::Relation.new(type: type, bibitem: bibitem)
  end
  @errors[:relation] &&= rels.empty?
  rels
end

#fetch_source(url) ⇒ Array<Relaton::Bib::Uri>

Parameters:

  • url (String)

Returns:

  • (Array<Relaton::Bib::Uri>)


161
162
163
164
165
166
# File 'lib/relaton/cie/data_fetcher.rb', line 161

# Wraps the document URL into a source link.
#
# A truthy `@errors[:source]` flag is set to whether the URL is missing.
#
# @param url [String, nil]
# @return [Array<Relaton::Bib::Uri>] a single "src" URI, or an empty array
#   when `url` is nil or empty
def fetch_source(url)
  missing = url.nil? || url.empty?
  @errors[:source] &&= missing
  missing ? [] : [Bib::Uri.new(type: "src", content: url)]
end

#fetch_title(doc) ⇒ Array<Relaton::Bib::Title>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<Relaton::Bib::Title>)


101
102
103
104
105
106
107
108
109
110
111
# File 'lib/relaton/cie/data_fetcher.rb', line 101

# Extracts the title from the page header (`h2` or `h1` inside `hgroup`).
#
# A truthy `@errors[:title]` flag is set to whether no title was produced.
#
# @param doc [Mechanize::Page]
# @return [Array<Relaton::Bib::Title>] result of `Bib::Title.from_string`,
#   or an empty array when the header text is missing or blank
def fetch_title(doc)
  heading = doc.at("//hgroup/h2/text()", "//hgroup/h1/text()")
  text = heading&.text&.strip
  if text.nil? || text.empty?
    @errors[:title] &&= true
    return []
  end

  titles = Bib::Title.from_string text
  @errors[:title] &&= titles.empty?
  titles
end

#index ⇒ Object



31
32
33
# File 'lib/relaton/cie/data_fetcher.rb', line 31

# Returns the memoized CIE index, finding or creating it on first use.
#
# @return [Object] result of `Index.find_or_create` for `:cie`, backed by
#   the file "index-v1.yaml"
def index
  return @index if @index

  @index = Index.find_or_create(:cie, file: "index-v1.yaml")
end

#log_error(msg) ⇒ Object



35
36
37
# File 'lib/relaton/cie/data_fetcher.rb', line 35

# Forwards an error message to `Util.error`.
#
# @param msg [String]
# @return [Object] whatever `Util.error` returns
def log_error(msg)
  Util.error(msg)
end

#parse_cie_code(code1, code2, doc = nil) ⇒ Object

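Builds the primary CIE identifier. When code1 is longer than 25 characters and code2 is present, code2 (prefixed with "CIE") is used instead of code1. When the page heading ends in "Addendum N", the suffix "Add N" is appended.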



87
88
89
90
91
92
93
# File 'lib/relaton/cie/data_fetcher.rb', line 87

# Chooses the primary CIE code and appends an addendum suffix if needed.
#
# When `code1` is longer than 25 characters and `code2` is given, `code2`
# (minus its first comma and an optional " Pages") prefixed with "CIE" is
# used; otherwise `code1` is used. If the page's `hgroup/h2` text ends in
# "Addendum N", " Add N" is appended.
#
# @param code1 [String]
# @param code2 [String, nil]
# @param doc [Mechanize::Page, nil]
# @return [String]
def parse_cie_code(code1, code2, doc = nil) # rubocop:disable Metrics/CyclomaticComplexity
  code = if code1.size > 25 && code2
           "CIE #{code2.sub(/,(\sPages)?/, '')}"
         else
           code1
         end
  heading = doc&.at("//hgroup/h2")&.text
  addendum = heading&.match(/(Add)endum\s(\d+)$/)
  addendum ? "#{code} #{addendum[1]} #{addendum[2]}" : code
end

#parse_code(hit, doc = nil) ⇒ Object



68
69
70
71
72
73
# File 'lib/relaton/cie/data_fetcher.rb', line 68

# Splits the hit's heading into a primary code and an optional secondary
# ISO/IEC code.
#
# The heading text is normalized (trimmed, repeated spaces squeezed, the
# first U+25B9 character removed, " / " collapsed to "/"). A secondary code
# is whatever follows "(" or "/" and starts with "ISO " or "IEC "; the text
# before it is passed to `primary_code`.
#
# @param hit [Object] node responding to `at("h3/a")`
# @param doc [Mechanize::Page, nil] passed through to `primary_code`
# @return [Array(String, String)] primary code and secondary code (nil when
#   the heading has none)
def parse_code(hit, doc = nil)
  heading = hit.at("h3/a").text
  code = heading.strip.squeeze(" ").sub(/\u25b9/, "").gsub(" / ", "/")
  secondary = code.match(%r{(?:\(|/)(?<c2>(?:ISO|IEC)\s[^()]+)})
  # Keep only the part in front of the secondary code.
  code = code[0...secondary.begin(0)].strip if secondary
  [primary_code(code, doc), secondary && secondary[:c2]]
end

#parse_page(hit) ⇒ Object

Parameters:

  • hit (Nokogiri::XML::Element)


245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# File 'lib/relaton/cie/data_fetcher.rb', line 245

# Fetches the document page behind a search hit, builds an item from it and
# writes the item to disk.
#
# Any StandardError raised along the way is reported through `Util.error`
# (with the document URL, message and backtrace) instead of propagating, so
# one broken page does not abort the crawl.
#
# @param hit [Object] search-result node whose `h3/a` link points at the
#   document page
# @return [Object] result of `write_file`, or of `Util.error` on failure
def parse_page(hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  url = hit.at('h3/a')[:href]
  doc = time_req { agent.get url }
  attrs = {
    type: "standard",
    source: fetch_source(url),
    docnumber: fetch_docnumber(hit),
    docidentifier: fetch_docid(hit, doc),
    title: fetch_title(doc),
    abstract: fetch_abstract(doc),
    date: fetch_date(doc),
    edition: fetch_edition(doc),
    contributor: fetch_contributor(doc),
    relation: fetch_relation(doc),
    language: "en",
    script: "Latn",
    ext: fetch_ext
  }
  write_file ItemData.new(**attrs)
rescue StandardError => e
  Util.error { "Document: #{url}\n#{e.message}\n#{e.backtrace}" }
end

#primary_code(code, doc = nil) ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
# File 'lib/relaton/cie/data_fetcher.rb', line 75

# Determines the primary CIE identifier for a heading.
#
# Three sources are tried in order:
# 1. a heading that starts with "CIE" is delegated to `parse_cie_code`;
# 2. otherwise the first comma-separated entry of the page's
#    "Product Code(s):" field is used;
# 3. otherwise the parenthesized product code inside the heading is used,
#    with commas turned into single spaces.
#
# @param code [String] normalized heading text
# @param doc [Mechanize::Page, nil]
# @return [String]
def primary_code(code, doc = nil)
  parts = /^(?<code1>[^(]+)(?:\((?<code2>[a-zA-Z]+\d+,(?:\sPages)?[^)]+))?/.match(code)
  head = parts && parts[:code1]
  return parse_cie_code(head, parts[:code2], doc) if head&.match?(/^CIE/)

  product = doc&.at('//h3[.="Product Code(s):"]/following-sibling::span')
  return "CIE #{product.text.strip.match(/[^,]+/)}" if product

  num = code.match(/(?<=\()\w{2}\d+,.+(?=\))/).to_s.gsub(/,(?=\s)/, "").gsub(/,(?=\S)/, " ")
  "CIE #{num}"
end

#time_req ⇒ Object



279
280
281
282
283
284
285
286
287
288
289
290
291
# File 'lib/relaton/cie/data_fetcher.rb', line 279

# Runs the given block as a throttled request with retries.
#
# Before each attempt it sleeps so that about four seconds (whole seconds,
# truncated) have passed since the previously recorded request time. A
# SocketError is retried until four attempts have been made; the fourth
# failure is re-raised. The request time is recorded whether the block
# succeeds or fails.
#
# @yield the request to perform
# @return [Object] the block's result
# @raise [SocketError] when the fourth attempt also fails
def time_req
  attempt = 0
  begin
    attempt += 1
    if @last_request_time
      elapsed = (Time.now - @last_request_time).to_i
      sleep [4 - elapsed, 0].max
    end
    yield
  rescue SocketError => e
    retry if attempt < 4
    raise e
  ensure
    @last_request_time = Time.now
  end
end

#to_bibxml(bib) ⇒ Object



242
# File 'lib/relaton/cie/data_fetcher.rb', line 242

# Serializes an item by calling its `to_rfcxml` method.
#
# @param bib [Object] item responding to `to_rfcxml`
# @return [Object] the result of `bib.to_rfcxml`
def to_bibxml(bib)
  bib.to_rfcxml
end

#to_xml(bib) ⇒ Object



240
# File 'lib/relaton/cie/data_fetcher.rb', line 240

# Serializes an item to XML, requesting the `bibdata` form.
#
# @param bib [Object] item responding to `to_xml(bibdata:)`
# @return [Object] the result of `bib.to_xml(bibdata: true)`
def to_xml(bib)
  bib.to_xml(bibdata: true)
end

#to_yaml(bib) ⇒ Object



241
# File 'lib/relaton/cie/data_fetcher.rb', line 241

# Serializes an item by calling its `to_yaml` method.
#
# @param bib [Object] item responding to `to_yaml`
# @return [Object] the result of `bib.to_yaml`
def to_yaml(bib)
  bib.to_yaml
end

#write_file(bib) ⇒ Object

Parameters:

  • bib (Relaton::Cie::ItemData)


226
227
228
229
230
231
232
233
234
235
236
237
238
# File 'lib/relaton/cie/data_fetcher.rb', line 226

# Writes a serialized item to its output file and records it in the index.
#
# When the target file was already written during this run, a warning with
# the document id and its "src" link is emitted and the file is
# overwritten; otherwise the file name is remembered in `@files`.
#
# The warning no longer raises when the item has no "src" source: the link
# is then reported as empty.
#
# @param bib [Object] item exposing `docidentifier` and `source`
# @return [Integer] number of bytes written (result of `File.write`)
def write_file(bib) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  id = bib.docidentifier[0].content
  file = output_file id
  if @files.include? file
    Util.warn do
      link = bib.source.find { |l| l.type == "src" }&.content
      "File #{file} exists. Docid: #{id}\n" \
      "Link: #{link}"
    end
  else
    @files << file
  end
  index.add_or_update id, file
  File.write file, serialize(bib), encoding: "UTF-8"
end