Class: Relaton::Plateau::DataFetcher
- Inherits:
-
Core::DataFetcher
- Object
- Core::DataFetcher
- Relaton::Plateau::DataFetcher
- Defined in:
- lib/relaton/plateau/data_fetcher.rb
Overview
Fetcher class to fetch data from the Plateau website
Constant Summary collapse
- HANDBOOKS_URL =
"https://www.mlit.go.jp/plateau/_next/data/1.3.0/libraries/handbooks.json".freeze
- TECHNICAL_REPORTS_URL =
"https://www.mlit.go.jp/plateau/_next/data/1.3.0/libraries/technical-reports.json".freeze
Instance Method Summary collapse
-
#create_request(uri) ⇒ Object
Create a GET request with custom headers to mimic a browser.
-
#extract_handbooks_data ⇒ Object
Extract data for handbooks.
-
#extract_technical_reports_data ⇒ Object
Extract data for technical reports.
- #fetch(source) ⇒ Object
-
#fetch_json_data(url) ⇒ Hash
Fetch JSON data from a URL with custom headers.
- #file_name(id) ⇒ Object
-
#hadle_response(response) ⇒ Object
Handle different content encodings.
- #index ⇒ Object
- #log_error(msg) ⇒ Object
- #save_document(item) ⇒ Object
- #to_bibxml(bib) ⇒ Object
- #to_xml(bib) ⇒ Object
- #to_yaml(bib) ⇒ Object
Instance Method Details
#create_request(uri) ⇒ Object
Create a GET request with custom headers to mimic a browser
32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/relaton/plateau/data_fetcher.rb', line 32
#
# Create a GET request with custom headers to mimic a browser.
#
# @param uri [URI] address the request is built for
# @return [Net::HTTP::Get] request carrying the browser-like headers
def create_request(uri)
  request = Net::HTTP::Get.new(uri)
  {
    "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0",
    "Accept" => "*/*",
    "Accept-Language" => "en-US,en;q=0.5",
    # Only advertise the encodings that hadle_response is able to decode
    # (gzip and deflate); offering "br" or "zstd" would invite a body that
    # reaches JSON.parse still compressed.
    "Accept-Encoding" => "gzip, deflate",
    "Referer" => "https://www.mlit.go.jp/plateau/libraries/",
    "purpose" => "prefetch",
    "x-nextjs-data" => "1",
    "Connection" => "keep-alive",
  }.each { |name, value| request[name] = value }
  request
end
#extract_handbooks_data ⇒ Object
Extract data for handbooks
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
# File 'lib/relaton/plateau/data_fetcher.rb', line 89
#
# Extract data for handbooks.
#
# Fetches HANDBOOKS_URL, parses every version of every handbook node with
# HandbookParser, saves each parsed item, then saves the index and reports
# the collected errors.
#
# @return [Object] whatever report_errors returns
def extract_handbooks_data
  data = fetch_json_data(HANDBOOKS_URL)
  Util.info "Extracting handbooks data..."
  # fetch_json_data returns {} on failure, so tolerate missing keys instead
  # of raising NoMethodError on nil.
  nodes = data.dig("pageProps", "handbooks", "nodes") || []
  nodes.each do |entry|
    # A slug containing "-" is classified as an annex, anything else as a handbook.
    doctype = entry["slug"].include?("-") ? "annex" : "handbook"
    entry["handbook"]["versions"].each do |version|
      item = HandbookParser.new(version: version, entry: entry, doctype: doctype, errors: @errors).parse
      save_document(item)
    end
  end
  index.save
  report_errors
end
#extract_technical_reports_data ⇒ Object
Extract data for technical reports
108 109 110 111 112 113 114 115 116 |
# File 'lib/relaton/plateau/data_fetcher.rb', line 108
#
# Extract data for technical reports.
#
# Fetches TECHNICAL_REPORTS_URL, parses every node with
# TechnicalReportParser, saves each parsed item, then saves the index and
# reports the collected errors.
#
# @return [Object] whatever report_errors returns
def extract_technical_reports_data
  data = fetch_json_data(TECHNICAL_REPORTS_URL)
  Util.info "Extracting technical reports data..."
  # fetch_json_data returns {} on failure, so tolerate missing keys instead
  # of raising NoMethodError on nil.
  nodes = data.dig("pageProps", "nodes") || []
  # Iterated purely for the save_document side effect; nothing is collected.
  nodes.each do |entry|
    save_document(TechnicalReportParser.new(entry, @errors).parse)
  end
  index.save
  report_errors
end
#fetch(source) ⇒ Object
23 24 25 26 27 28 29 |
# File 'lib/relaton/plateau/data_fetcher.rb', line 23
#
# Dispatch to the extractor matching the requested source.
#
# @param source [String] "plateau-handbooks" or "plateau-technical-reports"
# @return [Object, nil] the extractor's result; nil (after printing a
#   message to standard output) when the source is not recognised
def fetch(source)
  case source
  when "plateau-handbooks"
    extract_handbooks_data
  when "plateau-technical-reports"
    extract_technical_reports_data
  else
    puts "Invalid source: #{source}"
  end
end
#fetch_json_data(url) ⇒ Hash
Fetch JSON data from a URL with custom headers
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/relaton/plateau/data_fetcher.rb', line 60
#
# Fetch JSON data from a URL with custom headers.
#
# @param url [String] address of the JSON document
# @return [Hash] the parsed JSON, or an empty Hash when the response status
#   is not 200 or any StandardError is raised while fetching or parsing
def fetch_json_data(url)
  uri = URI(url)
  request = create_request(uri)

  # Send the request; TLS is enabled according to the URL scheme.
  response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == "https") do |http|
    http.request(request)
  end

  # Anything other than 200 is logged and treated as "no data".
  unless response.code.to_i == 200
    Util.warn "Failed to fetch data: #{response.code} #{response.message}"
    return {}
  end

  # Decode the body according to its content encoding, then parse it.
  JSON.parse(hadle_response(response))
rescue StandardError => e
  # Best effort: log the failure and hand back an empty result.
  Util.error "Error fetching JSON data from #{url}: #{e.message}"
  {}
end
#file_name(id) ⇒ Object
130 131 132 133 134 135 136 137 138 |
# File 'lib/relaton/plateau/data_fetcher.rb', line 130
#
# Build the output file path for a document identifier.
#
# Whitespace runs become "-", every character that is neither a word
# character nor "-" is dropped, and the result is lower-cased. The path is
# formed from @output and the @ext extension.
#
# @param id [String] document identifier
# @return [String] path of the file the document is written to
def file_name(id)
  name = id.gsub(/\s+/, "-").gsub(/[^\w-]+/, "").downcase
  # These markers do not survive the sanitising above, so identifiers that
  # contain them get an explicit suffix appended to the name.
  if id.include?("民間活用編")
    name += "-private"
  elsif id.include?("公共活用編")
    name += "-public"
  end
  File.join(@output, "#{name}.#{@ext}")
end
#hadle_response(response) ⇒ Object
Handle different content encodings
46 47 48 49 50 51 52 53 54 |
# File 'lib/relaton/plateau/data_fetcher.rb', line 46
#
# Handle different content encodings.
#
# @param response [#[], #body] HTTP response exposing its headers through
#   [] and its raw payload through body
# @return [String] the decoded body; returned untouched when the
#   Content-Encoding header is absent or not gzip/deflate
def hadle_response(response)
  # Header values are matched case-insensitively and ignoring surrounding
  # whitespace, so "GZIP" or " gzip" is decoded rather than passed through raw.
  encoding = response["Content-Encoding"].to_s.strip.downcase
  case encoding
  when "gzip", "x-gzip"
    # wrap closes the reader once the block has returned.
    Zlib::GzipReader.wrap(StringIO.new(response.body), &:read)
  when "deflate"
    Zlib::Inflate.inflate(response.body)
  else
    response.body
  end
end
#index ⇒ Object
15 16 17 |
# File 'lib/relaton/plateau/data_fetcher.rb', line 15
#
# Memoised accessor for the :plateau index.
#
# @return [Object] the object returned by Relaton::Index.find_or_create for
#   the "#{INDEXFILE}.yaml" file; created on first call and reused afterwards
def index
  @index ||= Relaton::Index.find_or_create :plateau, file: "#{INDEXFILE}.yaml"
end
#log_error(msg) ⇒ Object
19 20 21 |
# File 'lib/relaton/plateau/data_fetcher.rb', line 19
#
# Forward an error message to Util.error.
#
# @param msg [String] message to log
# @return [Object] whatever Util.error returns
def log_error(msg)
  Util.error msg
end
#save_document(item) ⇒ Object
118 119 120 121 122 123 124 125 126 127 128 |
# File 'lib/relaton/plateau/data_fetcher.rb', line 118
#
# Serialize a document to its output file and register it in the index.
#
# The file path is derived from the content of the item's first document
# identifier. A path that has already been written during this run is
# skipped with a warning instead of being overwritten.
#
# @param item [#docidentifier] parsed bibliographic item
# @return [Object] result of the warning or of the index update
def save_document(item)
  id = item.docidentifier.first.content
  file = file_name(id)
  if @files.include?(file)
    Util.warn "File #{file} already exists, skipping.", key: id
  else
    File.write(file, serialize(item))
    # Remember the path so a later item with the same identifier is skipped.
    @files << file
    index.add_or_update id, file
  end
end
#to_bibxml(bib) ⇒ Object
148 149 150 |
# File 'lib/relaton/plateau/data_fetcher.rb', line 148
#
# Serialize a bibliographic item by calling its to_rfcxml method.
#
# @param bib [#to_rfcxml] bibliographic item
# @return [Object] whatever bib.to_rfcxml returns
def to_bibxml(bib)
  bib.to_rfcxml
end
#to_xml(bib) ⇒ Object
144 145 146 |
# File 'lib/relaton/plateau/data_fetcher.rb', line 144
#
# Serialize a bibliographic item through Item.to_xml with bibdata: true.
#
# @param bib [Object] bibliographic item
# @return [Object] whatever Item.to_xml returns
def to_xml(bib)
  Item.to_xml(bib, bibdata: true)
end
#to_yaml(bib) ⇒ Object
140 141 142 |
# File 'lib/relaton/plateau/data_fetcher.rb', line 140
#
# Serialize a bibliographic item through Item.to_yaml.
#
# @param bib [Object] bibliographic item
# @return [Object] whatever Item.to_yaml returns
def to_yaml(bib)
  Item.to_yaml(bib)
end