Class: RelatonIec::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_iec/data_fetcher.rb

Constant Summary collapse

ENTRYPOINT =
"https://api.iec.ch/harmonized/publications?size=100&sortBy=urn&page=".freeze
CREDENTIAL =
"https://api.iec.ch/oauth/client_credential/accesstoken?grant_type=client_credentials".freeze

Instance Method Summary collapse

Constructor Details

#initialize(source = "iec-harmonised-latest", output: "data", format: "yaml") ⇒ DataFetcher

Initialize new instance.

Parameters:

  • source (String) (defaults to: "iec-harmonised-latest")

    source name (iec-harmonised-all, iec-harmonised-latest)

  • output (String) (defaults to: "data")

    output directory

  • format (String) (defaults to: "yaml")

    format of output files (xml, bibxml, yaml)



13
14
15
16
17
18
19
20
# File 'lib/relaton_iec/data_fetcher.rb', line 13

# Initialize new instance.
#
# @param source [String] source name; "iec-harmonised-all" switches the
#   fetcher into "all" mode, any other value leaves that mode off
# @param output [String] output directory
# @param format [String] format of output files (xml, bibxml, yaml)
def initialize(source = "iec-harmonised-latest", output: "data", format: "yaml")
  @all = (source == "iec-harmonised-all")
  @output = output
  @format = format
  # A leading "bib" is dropped, so "bibxml" files get the "xml" extension.
  @ext = format.sub(/^bib/, "")
  @index = Index.new("index.yaml")
  @files = []
end

Instance Method Details

#access_token ⇒ String

Get access token.

Returns:

  • (String)

    access token



121
122
123
124
125
126
127
128
129
130
131
# File 'lib/relaton_iec/data_fetcher.rb', line 121

# Get access token.
#
# The token is requested from the CREDENTIAL endpoint on first use and stored
# in @access_token; while that value is set, later calls return it without a
# new request. Credentials are read from the IEC_HAPI_PROJ_PUBS_KEY and
# IEC_HAPI_PROJ_PUBS_SECRET environment variables (ENV.fetch raises KeyError
# when one is missing).
#
# @return [String] access token
def access_token # rubocop:disable Metrics/AbcSize
  return @access_token if @access_token

  endpoint = URI(CREDENTIAL)
  request = Net::HTTP::Get.new(endpoint)
  key = ENV.fetch("IEC_HAPI_PROJ_PUBS_KEY")
  secret = ENV.fetch("IEC_HAPI_PROJ_PUBS_SECRET")
  request.basic_auth(key, secret)
  response = Net::HTTP.start(endpoint.hostname, endpoint.port, use_ssl: true) do |http|
    http.request(request)
  end
  @access_token = JSON.parse(response.body)["access_token"]
end

#add_static_files_to_index ⇒ void

This method returns an undefined value.

Add static files to index.



51
52
53
54
55
56
57
# File 'lib/relaton_iec/data_fetcher.rb', line 51

# Add static files to index.
#
# Reads every YAML file under static/, looks up the docid entry flagged as
# primary and registers its "id" together with the file path in @index.
# A file without a primary docid is reported with a warning and skipped, so
# that one malformed static file does not raise out of this method.
#
# @return [void]
def add_static_files_to_index
  Dir["static/*.yaml"].each do |file|
    pub = RelatonBib.parse_yaml File.read(file, encoding: "UTF-8")
    primary = RelatonBib.array(pub["docid"]).detect { |id| id["primary"] }
    unless primary
      # detect returns nil when nothing matches; indexing nil would raise.
      warn "[relaton-iec] No primary docid in #{file}, skipping."
      next
    end
    @index.add primary["id"], file
  end
end

#fetch ⇒ Object

Fetch data from IEC.



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/relaton_iec/data_fetcher.rb', line 25

# Fetch data from IEC.
#
# Creates the output directory and, in "all" mode, first deletes the files in
# it that carry the current extension and clears the index. It then fetches
# the documents, adds the static files to the index, saves the index and
# prints start/stop times plus the elapsed seconds. A StandardError raised
# along the way is not propagated: its message and backtrace are written
# with warn instead.
def fetch # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  started = Time.now
  puts "Started at: #{started}"

  FileUtils.mkdir_p(@output)
  if @all
    stale = Dir[File.join(@output, "*.#{@ext}")]
    FileUtils.rm(stale)
    @index.clear
  end
  fetch_all
  add_static_files_to_index
  @index.save

  stopped = Time.now
  puts "Stopped at: #{stopped}"
  puts "Done in: #{(stopped - started).round} sec."
rescue StandardError => error
  warn error.message
  warn error.backtrace.join("\n")
end

#fetch_all ⇒ void

This method returns an undefined value.

Fetch documents from IEC API.



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/relaton_iec/data_fetcher.rb', line 64

# Fetch documents from IEC API.
#
# Requests pages starting at 0. Every entry of a page's "publication" list is
# handed to fetch_pub. Paging continues only while the response's "link"
# header contains rel="last"; a response whose code is not "200" is reported
# with warn and ends the loop.
#
# @return [void]
def fetch_all # rubocop:disable Metrics/MethodLength
  # Integer#step without a limit counts upwards until a break.
  0.step do |page_number|
    response = fetch_page_token(page_number)
    if response.code != "200"
      warn "[relaton-iec] #{response.body}"
      break
    end

    publications = JSON.parse(response.body)["publication"]
    publications.each { |publication| fetch_pub(publication) }
    break unless response["link"]&.include?("rel=\"last\"")
  end
end

#fetch_page(page) ⇒ Net::HTTP::Response

Fetch page from IEC API.

Parameters:

  • page (Integer)

    page number

Returns:

  • (Net::HTTP::Response)

    response



103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/relaton_iec/data_fetcher.rb', line 103

# Fetch page from IEC API.
#
# The URL is ENTRYPOINT followed by the page number. Outside "all" mode, and
# only when the index reports a last change, a lastChangeTimestampFrom
# parameter with that value is appended. The request carries the access
# token as a Bearer Authorization header and is sent over SSL.
#
# @param page [Integer] page number
# @return [Net::HTTP::Response] response
def fetch_page(page)
  incremental = !@all && @index.last_change
  since = incremental ? "&lastChangeTimestampFrom=#{@index.last_change}" : ""
  uri = URI("#{ENTRYPOINT}#{page}#{since}")
  request = Net::HTTP::Get.new(uri)
  request["Authorization"] = "Bearer #{access_token}"
  Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) { |http| http.request(request) }
end

#fetch_page_token(page) ⇒ Net::HTTP::Response

Fetch page. If the response code is 401, get a new access token and try again.

Parameters:

  • page (Integer)

    page number

Returns:

  • (Net::HTTP::Response)

    response



87
88
89
90
91
92
93
94
# File 'lib/relaton_iec/data_fetcher.rb', line 87

# Fetch page, retrying once with a fresh token on a 401.
#
# When the first response has code "401" the stored access token is cleared,
# so that the next request obtains a new one, and the page is requested a
# second time. The second response is returned whatever its code is.
#
# @param page [Integer] page number
# @return [Net::HTTP::Response] response
def fetch_page_token(page)
  response = fetch_page(page)
  return response unless response.code == "401"

  @access_token = nil
  fetch_page(page)
end

#fetch_pub(pub) ⇒ Object

Fetch publication and save it to file.

Parameters:

  • pub (Hash)

    publication



138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/relaton_iec/data_fetcher.rb', line 138

# Fetch publication and save it to file.
#
# The file name is derived from the primary document identifier: lower-cased,
# with every colon, whitespace character and slash replaced by an underscore,
# plus the configured extension. The first time a file name is seen it is
# remembered and added to the index; on a repeat only a warning is printed.
# In both cases the serialized publication is then written to the file.
#
# @param pub [Hash] publication
def fetch_pub(pub) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  bib = DataParser.new(pub).parse
  primary_id = bib.docidentifier.detect(&:primary)
  basename = primary_id.id.downcase.gsub(/[:\s\/]/, '_')
  file = File.join(@output, "#{basename}.#{@ext}")

  if @files.include?(file)
    warn "File #{file} exists."
  else
    @files << file
    @index.add(index_id(pub), file, pub["lastChangeTimestamp"])
  end

  # No branch matches an unrecognised format, which leaves content nil.
  content =
    case @format
    when "xml"
      bib.to_xml(bibdata: true)
    when "yaml", "yml"
      bib.to_hash.to_yaml
    when "bibxml"
      bib.to_bibxml
    end
  File.write(file, content, encoding: "UTF-8")
end

#index_id(pub) ⇒ Object



155
156
157
158
159
160
161
162
163
164
# File 'lib/relaton_iec/data_fetcher.rb', line 155

# Build the index ID(s) for a publication.
#
# The part is the first "-<digits>" group of the reference. When the first
# title's value lists more than one "- Part <n>:" number that begins with the
# part's first digit and has at least two digits, one reference per listed
# number is returned, each with the part replaced by that number. In every
# other case the reference is returned as is.
#
# @param pub [Hash] publication; reads "reference" and "title"[0]["value"]
# @return [String, Array<String>, nil] reference, or the list of expanded
#   references; nil when the publication has no "reference"
def index_id(pub)
  reference = pub["reference"]
  found = /-(?<part>\d+)/.match(reference)
  part = found && found[:part]
  title = pub.dig("title", 0, "value")
  return reference unless part && title

  listed_parts = /(?<=-\sPart\s)#{part[0]}\d+(?=:)/
  expanded = title.scan(listed_parts).map do |number|
    reference.sub(/-#{part}/, "-#{number}")
  end
  expanded.size > 1 ? expanded : reference
end