Class: Relaton::Cie::DataFetcher

Inherits:
Relaton::Core::DataFetcher
  • Object
show all
Defined in:
lib/relaton/cie/data_fetcher.rb

Constant Summary collapse

# Techstreet search page for CIE publications; default starting point of fetch_doc.
URL = "https://www.techstreet.com/cie/searches/31156444?page=1&per_page=100"

# Exceptions after which time_req repeats the request instead of failing.
RETRIABLE_ERRORS = [
  SocketError, Ferrum::TimeoutError, Ferrum::PendingConnectionsError, Ferrum::StatusError
].freeze

Instance Method Summary collapse

Instance Method Details

#agent ⇒ Object



77
78
79
# File 'lib/relaton/cie/data_fetcher.rb', line 77

# Returns the browser agent used for page requests, creating it on first use.
#
# @return [BrowserAgent] the memoized agent
def agent
  return @agent if @agent

  @agent = BrowserAgent.new
end

#fetch(_source = nil) ⇒ Object



313
314
315
316
317
318
# File 'lib/relaton/cie/data_fetcher.rb', line 313

# Crawls all documents, then reports the collected errors. The browser agent,
# when one has been created, is shut down even if the crawl raises.
#
# @param _source [Object, nil] unused
# @return [Object] the result of report_errors
def fetch(_source = nil)
  begin
    fetch_doc
    report_errors
  ensure
    browser = @agent
    browser.quit unless browser.nil?
  end
end

#fetch_abstract(doc) ⇒ Array<Relaton::Bib::LocalizedMarkedUpString>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<Relaton::Bib::LocalizedMarkedUpString>)


220
221
222
223
224
225
226
227
228
229
230
# File 'lib/relaton/cie/data_fetcher.rb', line 220

# Builds the abstract from the text of the element whose class contains
# "description". Keeps the :abstract error flag set when no text is found.
#
# @param doc [#at] parsed document page
# @return [Array<Bib::Abstract>] one abstract, or an empty array
def fetch_abstract(doc)
  description = doc.at('//div[contains(@class,"description")]')
  text = description&.text&.strip
  if text.to_s.empty?
    @errors[:abstract] &&= true
    return []
  end

  abstracts = [Bib::Abstract.new(content: text, language: "en", script: "Latn")]
  @errors[:abstract] &&= abstracts.empty?
  abstracts
end

#fetch_contributor(doc) ⇒ Array<Relaton::Bib::Contributor>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<Relaton::Bib::Contributor>)


234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
# File 'lib/relaton/cie/data_fetcher.rb', line 234

# Builds the contributor list: one "author" person per name parsed from the
# page header, followed by the CIE organization as "publisher".
#
# The author string is consumed from the front: each pass matches one name
# with the regex below, removes the matched text, and repeats until the
# string is empty.
#
# @param doc [#xpath] parsed document page
# @return [Array<Bib::Contributor>] authors (if any) plus the publisher
# @raise [StandardError] when the remaining author text does not match
def fetch_contributor(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity
  # Text of the header paragraphs other than the publication date, without double quotes.
  authors = doc.xpath('//hgroup/p[not(@class="pub_date")]').text.gsub "\"", ""
  contribs = []
  until authors.empty?
    # Named groups become locals: sname1/sname2 (surname parts), fname (forename),
    # init (initials). The last group swallows the separator before the next name.
    /^(?<sname1>\S+(?:\sder?\s)?[^\s,]+)
    (?:,?\s(?<sname2>[\w-]{2,})(?=,\s+\w\.))?
    (?:,?\s(?<fname>W-T\.[\w-]{2,})(?!,\s+\w\.))?
    (?:(?:\s?,\s?|\s)(?<init>(?:\w(?:\s?\.|\s|,|$)[\s-]?)+))?
    (?:(?:[,;]\s*|\s+|\.|(?<=\s))(?:and\s)?)?/x =~ authors
    # NOTE(review): $LAST_MATCH_INFO is the `English` library alias for $~;
    # presumably required elsewhere in the file - confirm.
    raise StandardError, "Author name not found in \"#{authors}\"" unless $LAST_MATCH_INFO

    # Drop the text just matched so the next pass starts at the following name.
    authors.sub! $LAST_MATCH_INFO.to_s, ""
    sname = [sname1, sname2].compact.join " "
    surname = Bib::LocalizedString.new content: sname, language: "en", script: "Latn"
    forename = []
    forename << Bib::FullNameType::Forename.new(content: fname, language: "en", script: "Latn") if fname
    # Split the initials on "," or "." (optionally followed by "-" or whitespace).
    (init&.strip || "").split(/(?:,|\.)(?:-|\s)?/).each do |int|
      forename << Bib::FullNameType::Forename.new(content: "", initial: int.strip, language: "en", script: "Latn")
    end
    fullname = Bib::FullName.new surname: surname, forename: forename
    person = Bib::Person.new name: fullname
    role = Bib::Contributor::Role.new type: "author"
    contribs << Bib::Contributor.new(person: person, role: [role])
    # contribs is non-empty here, so a truthy flag is always replaced with false.
    @errors[:contributor_author] &&= contribs.empty?
  end
  org_name = Bib::TypedLocalizedString.new(content: "Commission Internationale de L'Eclairage")
  abbrev = Bib::LocalizedString.new content: "CIE"
  org_uri = Bib::Uri.new content: "cie.co.at"
  org = Bib::Organization.new(name: [org_name], abbreviation: abbrev, uri: [org_uri])
  org_role = Bib::Contributor::Role.new type: "publisher"
  # Array#<< returns the array, which is the method's return value.
  contribs << Bib::Contributor.new(organization: org, role: [org_role])
end

#fetch_date(doc) ⇒ Array<Relaton::Bib::Date>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<Relaton::Bib::Date>)


165
166
167
168
169
170
171
172
173
# File 'lib/relaton/cie/data_fetcher.rb', line 165

# Reads the "Published:" values of the page. A value starting with a
# four-digit year not followed by "-" is kept verbatim; anything else is
# parsed as month/day/year and reformatted as YYYY-MM-DD.
#
# @param doc [#xpath] parsed document page
# @return [Array<Bib::Date>] published dates, possibly empty
def fetch_date(doc)
  spans = doc.xpath("//h3[.='Published:']/following-sibling::span")
  dates = spans.map do |span|
    raw = span.text.strip
    value =
      if raw.match?(/^\d{4}(?:[^-]|$)/)
        raw
      else
        Date.strptime(raw, "%m/%d/%Y").strftime("%Y-%m-%d")
      end
    Bib::Date.new(type: "published", at: value)
  end
  @errors[:date] &&= dates.empty?
  dates
end

#fetch_doc(url = URL) ⇒ Object



320
321
322
323
324
325
326
327
328
329
330
331
# File 'lib/relaton/cie/data_fetcher.rb', line 320

# Loads one search result page, processes every listed product, then follows
# the "next_page" link. The index is saved once the last page is reached.
#
# @param url [String] address of the result page to load
# @return [Object] the result of index.save
def fetch_doc(url = URL)
  page = time_req { agent.get url }
  page.xpath("//li[@data-product]").each { |hit| parse_page hit }
  next_link = page.at '//a[@class="next_page"]'
  return index.save unless next_link

  # Relative links are resolved against the Techstreet host.
  target = next_link[:href]
  target = "https://www.techstreet.com#{target}" unless target.start_with?("http")
  fetch_doc target
end

#fetch_docid(hit, doc) ⇒ Array<Relaton::Bib::Docidentifier>

Parameters:

  • hit (Nokogiri::HTML::Document)
  • doc (Mechanize::Page)

Returns:

  • (Array<Relaton::Bib::Docidentifier>)


92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/relaton/cie/data_fetcher.rb', line 92

# Collects the document identifiers: the primary CIE code, an optional
# secondary code (typed by its first word, e.g. "ISO"), and an optional ISBN.
# Each of the :docid_1, :docid_2 and :docid_isbn error flags stays set only
# while its identifier is missing.
#
# The ISBN is stored stripped, like the secondary code, so whitespace around
# the value in the page markup does not end up in the identifier.
#
# @param hit [#at] search result entry
# @param doc [#at] parsed document page
# @return [Array<Bib::Docidentifier>]
def fetch_docid(hit, doc)
  code, code2 = parse_code hit, doc
  docid = []

  has_code = !code.to_s.strip.empty?
  docid << Bib::Docidentifier.new(type: "CIE", content: code, primary: true) if has_code
  @errors[:docid_1] &&= !has_code

  secondary = code2.to_s.strip
  unless secondary.empty?
    # The identifier type is the leading word of the secondary code.
    docid << Relaton::Bib::Docidentifier.new(type: secondary[/\w+/].to_s, content: secondary)
  end
  @errors[:docid_2] &&= secondary.empty?

  isbn_node = doc.at('//h3[contains(.,"ISBN")]/following-sibling::span')
  isbn = isbn_node ? isbn_node.text.to_s.strip : ""
  docid << Bib::Docidentifier.new(type: "ISBN", content: isbn) unless isbn.empty?
  @errors[:docid_isbn] &&= isbn.empty?

  docid
end

#fetch_docnumber(hit) ⇒ Object



145
146
147
# File 'lib/relaton/cie/data_fetcher.rb', line 145

# Derives the document number: the primary code from parse_code without its
# leading "CIE " (or "CIE ISO ") prefix.
#
# @param hit [#at] search result entry
# @return [String]
def fetch_docnumber(hit)
  primary = parse_code(hit).first
  primary.sub(/^CIE\s(?:ISO\s)?/, "")
end

#fetch_doctype ⇒ Object



271
272
273
# File 'lib/relaton/cie/data_fetcher.rb', line 271

# Builds the document type, which is always "document".
#
# @return [Bib::Doctype]
def fetch_doctype
  doctype_attrs = { content: "document" }
  Bib::Doctype.new(**doctype_attrs)
end

#fetch_edition(doc) ⇒ String

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (String)


177
178
179
180
181
182
183
184
185
186
187
# File 'lib/relaton/cie/data_fetcher.rb', line 177

# Extracts the edition number from the "Edition:" value, i.e. the leading
# digits of an ordinal such as "2nd". The :edition error flag is cleared only
# when a number is found.
#
# @param doc [#at] parsed document page
# @return [Bib::Edition, nil] nil when no edition number is present
def fetch_edition(doc)
  node = doc.at("//h3[.='Edition:']/following-sibling::span")
  @errors[:edition] &&= true
  number = node && node.text[/^\d+(?=(st|nd|rd|th))/]
  return unless number

  @errors[:edition] = false
  Bib::Edition.new(content: number)
end

#fetch_ext ⇒ Object



267
268
269
# File 'lib/relaton/cie/data_fetcher.rb', line 267

# Builds the extension block carrying the doctype and the "cie" flavor.
#
# @return [Ext]
def fetch_ext
  doctype = fetch_doctype
  Ext.new(doctype: doctype, flavor: "cie")
end

#fetch_relation(doc) ⇒ Array<Relaton::Cie::Relation>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<Relaton::Cie::Relation>)


191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/relaton/cie/data_fetcher.rb', line 191

# Builds relations from the entries of the "history" section, skipping the
# entry marked "selected-product".
#
# An entry flagged with a "historical" icon becomes an "updates" relation;
# any other entry becomes "updatedBy". The icon is looked up relative to the
# entry itself: an XPath starting with "//" is evaluated against the whole
# document, which would give every relation the same type.
#
# @param doc [#xpath] parsed document page
# @return [Array<Bib::Relation>]
def fetch_relation(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  entries = doc.xpath('//section[@class="history"]/ol/li[not(contains(@class,"selected-product"))]')
  rels = entries.map do |rel|
    ref = rel.at("a")
    url = "https://www.techstreet.com#{ref[:href]}"
    title = Bib::Title.from_string ref.at('p/span[@class="title"]').text
    did = ref.at("h3").text
    docid = [Bib::Docidentifier.new(type: "CIE", content: did, primary: true)]
    on = ref.at("p/time")
    # An entry without a <time> element simply gets no date.
    date = on ? [Bib::Date.new(type: "published", at: on[:datetime])] : []
    source = [Bib::Uri.new(type: "src", content: url)]
    bibitem = ItemData.new docidentifier: docid, title: title, source: source, date: date
    historical = rel.at('./i[contains(@class,"historical")]')
    Bib::Relation.new(type: historical ? "updates" : "updatedBy", bibitem: bibitem)
  end
  @errors[:relation] &&= rels.empty?
  rels
end

#fetch_source(url) ⇒ Array<Relaton::Bib::Uri>

Parameters:

  • url (String)

Returns:

  • (Array<Relaton::Bib::Uri>)


211
212
213
214
215
216
# File 'lib/relaton/cie/data_fetcher.rb', line 211

# Wraps the document URL in a "src" link. The :source error flag stays set
# only while the URL is nil or empty.
#
# @param url [String, nil]
# @return [Array<Bib::Uri>] one link, or an empty array
def fetch_source(url)
  missing = url.nil? || url.empty?
  @errors[:source] &&= missing
  missing ? [] : [Bib::Uri.new(type: "src", content: url)]
end

#fetch_title(doc) ⇒ Array<Relaton::Bib::Title>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<Relaton::Bib::Title>)


151
152
153
154
155
156
157
158
159
160
161
# File 'lib/relaton/cie/data_fetcher.rb', line 151

# Reads the title from the text of the header's h2 (or h1) element and
# converts it with Bib::Title.from_string. The :title error flag stays set
# while no title text is found.
#
# @param doc [#at] parsed document page
# @return [Array<Bib::Title>] empty when the page has no title text
def fetch_title(doc)
  node = doc.at("//hgroup/h2/text()", "//hgroup/h1/text()")
  text = node ? node.text.strip : ""
  if text.empty?
    @errors[:title] &&= true
    return []
  end

  titles = Bib::Title.from_string text
  @errors[:title] &&= titles.empty?
  titles
end

#index ⇒ Object



81
82
83
# File 'lib/relaton/cie/data_fetcher.rb', line 81

# Returns the :cie index backed by "index-v1.yaml", looked up once and then reused.
#
# @return [Object] the memoized result of Index.find_or_create
def index
  return @index if @index

  @index = Index.find_or_create :cie, file: "index-v1.yaml"
end

#log_error(msg) ⇒ Object



85
86
87
# File 'lib/relaton/cie/data_fetcher.rb', line 85

# Forwards an error message to Util.error.
#
# @param msg [String] text to log
# @return [Object] whatever Util.error returns
def log_error(msg)
  Util.error(msg)
end

#parse_cie_code(code1, code2, doc = nil) ⇒ Object

rubocop:disable Metrics/CyclomaticComplexity



137
138
139
140
141
142
143
# File 'lib/relaton/cie/data_fetcher.rb', line 137

# Chooses the CIE identifier. When code1 is longer than 25 characters and
# code2 is given, the identifier is built from code2 instead (its first ","
# and an optional following " Pages" removed). If the page heading ends in
# "Addendum N", " Add N" is appended.
#
# @param code1 [String] text in front of the parenthesis
# @param code2 [String, nil] parenthesized product code
# @param doc [#at, nil] parsed document page
# @return [String]
def parse_cie_code(code1, code2, doc = nil) # rubocop:disable Metrics/CyclomaticComplexity
  base =
    if code1.size > 25 && code2
      "CIE #{code2.sub(/,(\sPages)?/, '')}"
    else
      code1
    end
  heading = doc&.at("//hgroup/h2")&.text
  addendum = heading&.match(/(Add)endum\s(\d+)$/)
  addendum ? "#{base} #{addendum[1]} #{addendum[2]}" : base
end

#parse_code(hit, doc = nil) ⇒ Object



118
119
120
121
122
123
# File 'lib/relaton/cie/data_fetcher.rb', line 118

# Splits the label of a search hit into the primary code and an optional
# secondary ISO/IEC code. The secondary code is the part introduced by "("
# or "/" and starting with "ISO " or "IEC "; everything before it is handed
# to primary_code.
#
# @param hit [#at] search result entry
# @param doc [#at, nil] parsed document page, passed on to primary_code
# @return [Array(String, String)] primary code and secondary code (or nil)
def parse_code(hit, doc = nil)
  label = hit.at("h3/a").text.strip.squeeze(" ").sub(/\u25b9/, "").gsub(" / ", "/")
  secondary = %r{(?:\(|/)(?<c2>(?:ISO|IEC)\s[^()]+)}.match(label)
  return [primary_code(label, doc), nil] unless secondary

  head = label[0...secondary.begin(0)].strip
  [primary_code(head, doc), secondary[:c2]]
end

#parse_page(hit) ⇒ Object

Parameters:

  • hit (Nokogiri::HTML::Element)


295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
# File 'lib/relaton/cie/data_fetcher.rb', line 295

# Loads the document page behind a search hit, assembles the bibliographic
# item from it and writes it out. Any StandardError is logged together with
# the document URL instead of being raised.
#
# @param hit [#at] search result entry
# @return [Object] the result of write_file, or of Util.error on failure
def parse_page(hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  url = hit.at('h3/a')[:href]
  doc = time_req { agent.get url }
  attrs = {
    type: "standard",
    source: fetch_source(url),
    docnumber: fetch_docnumber(hit),
    docidentifier: fetch_docid(hit, doc),
    title: fetch_title(doc),
    abstract: fetch_abstract(doc),
    date: fetch_date(doc),
    edition: fetch_edition(doc),
    contributor: fetch_contributor(doc),
    relation: fetch_relation(doc),
    language: "en",
    script: "Latn",
    ext: fetch_ext
  }
  write_file ItemData.new(**attrs)
rescue StandardError => e
  Util.error { "Document: #{url}\n#{e.message}\n#{e.backtrace}" }
end

#primary_code(code, doc = nil) ⇒ Object



125
126
127
128
129
130
131
132
133
134
135
# File 'lib/relaton/cie/data_fetcher.rb', line 125

# Works out the primary "CIE ..." identifier for a hit label.
#
# * A label starting with "CIE" is delegated to parse_cie_code.
# * Otherwise the first entry of the page's "Product Code(s):" value is used.
# * Without a page, the parenthesized product code of the label is used,
#   with its commas removed or turned into spaces.
#
# @param code [String] cleaned-up hit label
# @param doc [#at, nil] parsed document page
# @return [String]
def primary_code(code, doc = nil)
  parts = /^(?<code1>[^(]+)(?:\((?<code2>[a-zA-Z]+\d+,(?:\sPages)?[^)]+))?/.match(code)
  code1 = parts && parts[:code1]
  code2 = parts && parts[:code2]
  return parse_cie_code(code1, code2, doc) if code1&.match?(/^CIE/)

  pcode = doc&.at('//h3[.="Product Code(s):"]/following-sibling::span')
  return "CIE #{pcode.text.strip.match(/[^,]+/)}" if pcode

  num = code.match(/(?<=\()\w{2}\d+,.+(?=\))/).to_s.gsub(/,(?=\s)/, "").gsub(/,(?=\S)/, " ")
  "CIE #{num}"
end

#time_req ⇒ Object



340
341
342
343
344
345
346
347
348
349
350
351
352
# File 'lib/relaton/cie/data_fetcher.rb', line 340

# Runs the given block as a throttled request. When a previous request time
# is recorded, it first sleeps so that about 4 seconds separate the two. A
# block failing with one of RETRIABLE_ERRORS is run again, up to 4 attempts
# in total; the last error is then re-raised.
#
# @yield the request to perform
# @return [Object] the block's result
def time_req
  attempts = 0
  begin
    attempts += 1
    if @last_request_time
      elapsed = (Time.now - @last_request_time).to_i
      sleep [4 - elapsed, 0].max
    end
    yield
  rescue *RETRIABLE_ERRORS => e
    raise e unless attempts < 4

    retry
  ensure
    @last_request_time = Time.now
  end
end

#to_bibxml(bib) ⇒ Object



292
# File 'lib/relaton/cie/data_fetcher.rb', line 292

# Serializes the item by calling its to_rfcxml method.
#
# @param bib [#to_rfcxml]
# @return [Object] the result of bib.to_rfcxml
def to_bibxml(bib)
  bib.to_rfcxml
end

#to_xml(bib) ⇒ Object



290
# File 'lib/relaton/cie/data_fetcher.rb', line 290

# Serializes the item to XML with the bibdata option enabled.
#
# @param bib [#to_xml]
# @return [Object] the result of bib.to_xml(bibdata: true)
def to_xml(bib)
  bib.to_xml(bibdata: true)
end

#to_yaml(bib) ⇒ Object



291
# File 'lib/relaton/cie/data_fetcher.rb', line 291

# Serializes the item by calling its to_yaml method.
#
# @param bib [#to_yaml]
# @return [Object] the result of bib.to_yaml
def to_yaml(bib)
  bib.to_yaml
end

#write_file(bib) ⇒ Object

Parameters:

  • bib (ItemData)


276
277
278
279
280
281
282
283
284
285
286
287
288
# File 'lib/relaton/cie/data_fetcher.rb', line 276

# Writes the serialized item to its output file and records it in the index.
# If the same file was already written during this run, a warning naming the
# identifier and the "src" link is logged and the file is overwritten.
#
# @param bib [#docidentifier, #source] item to store
# @return [Integer] number of bytes written
def write_file(bib) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  id = bib.docidentifier[0].content
  file = output_file id
  if @files.include? file
    Util.warn do
      # The item may carry no "src" link; the warning must not raise in that case.
      link = bib.source.find { |l| l.type == "src" }&.content
      "File #{file} exists. Docid: #{id}\nLink: #{link}"
    end
  else
    @files << file
  end
  index.add_or_update id, file
  File.write file, serialize(bib), encoding: "UTF-8"
end