Class: Relaton::Plateau::Fetcher
- Inherits:
-
Object
- Object
- Relaton::Plateau::Fetcher
- Defined in:
- lib/relaton/plateau/fetcher.rb
Overview
Fetcher class to fetch data from the Plateau website
Constant Summary collapse
- HANDBOOKS_URL =
"https://www.mlit.go.jp/plateau/_next/data/1.3.0/libraries/handbooks.json".freeze
- TECHNICAL_REPORTS_URL =
"https://www.mlit.go.jp/plateau/_next/data/1.3.0/libraries/technical-reports.json".freeze
Class Method Summary collapse
Instance Method Summary collapse
-
#create_request(uri) ⇒ Object
Create a GET request with custom headers to mimic a browser.
-
#extract_handbooks_data ⇒ Object
Extract data for handbooks.
-
#extract_technical_reports_data ⇒ Object
Extract data for technical reports.
-
#fetch_json_data(url) ⇒ Hash
Fetch JSON data from a URL with custom headers.
- #file_name(id) ⇒ Object
-
#hadle_response(response) ⇒ Object
Handle different content encodings.
- #index ⇒ Object
-
#initialize(output, format) ⇒ Fetcher
constructor
A new instance of Fetcher.
-
#save_document(item) ⇒ Object
def self.save_to_yaml(data, filename) File.open(filename, "w") do |file| file.write(data.to_yaml) end puts "Data saved to #{filename}." end.
- #serialize(item) ⇒ Object
Constructor Details
#initialize(output, format) ⇒ Fetcher
Returns a new instance of Fetcher.
13 14 15 16 17 18 |
# File 'lib/relaton/plateau/fetcher.rb', line 13
#
# Build a fetcher that writes documents into +output+ using +format+.
#
# @param output [String] directory the serialized documents are written to
# @param format [String] output format name, e.g. "yaml", "xml" or "bibxml"
def initialize(output, format)
  @output = output
  @format = format
  # The file extension is the format name without a leading "bib"
  # ("bibxml" -> "xml"). \A anchors at the very start of the string, whereas
  # ^ would also match right after any embedded newline.
  @ext = format.sub(/\Abib/, "")
  # Paths already written by this instance; save_document consults this list
  # to skip duplicates.
  @files = []
end
Class Method Details
.fetch(source, output: "data", format: "yaml") ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/relaton/plateau/fetcher.rb', line 24
#
# Entry point: fetch one Plateau source and write its documents to +output+.
# Progress (start time, stop time, elapsed seconds) is printed to stdout.
#
# @param source [String] "plateau-handbooks" or "plateau-technical-reports";
#   any other value prints "Invalid source: ..." and fetches nothing
# @param output [String] directory for the generated files (created if missing)
# @param format [String] output format passed on to the fetcher instance
# @return [nil]
def self.fetch(source, output: "data", format: "yaml")
  started_at = Time.now
  puts "Started at: #{started_at}"

  extractor =
    case source
    when "plateau-handbooks" then :extract_handbooks_data
    when "plateau-technical-reports" then :extract_technical_reports_data
    end

  if extractor
    # Create the output directory only once the source is known to be valid,
    # so a mistyped source name does not leave an empty directory behind.
    FileUtils.mkdir_p output
    new(output, format).public_send(extractor)
  else
    puts "Invalid source: #{source}"
  end

  stopped_at = Time.now
  puts "Stopped at: #{stopped_at}"
  puts "Done in: #{(stopped_at - started_at).round} sec."
end
Instance Method Details
#create_request(uri) ⇒ Object
Create a GET request with custom headers to mimic a browser
55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/relaton/plateau/fetcher.rb', line 55
#
# Create a GET request with custom headers to mimic a browser.
#
# @param uri [URI] address of the JSON resource to request
# @return [Net::HTTP::Get] request with the browser-like headers set
def create_request(uri)
  request = Net::HTTP::Get.new(uri)
  {
    "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0",
    "Accept" => "*/*",
    "Accept-Language" => "en-US,en;q=0.5",
    # Setting Accept-Encoding by hand turns off Net::HTTP's transparent
    # decompression, so the body has to be decoded by hadle_response. That
    # method only understands gzip and deflate, therefore only those two are
    # advertised (a "br" or "zstd" response could not be parsed).
    "Accept-Encoding" => "gzip, deflate",
    "Referer" => "https://www.mlit.go.jp/plateau/libraries/",
    "purpose" => "prefetch",
    "x-nextjs-data" => "1",
    "Connection" => "keep-alive",
  }.each { |name, value| request[name] = value }
  request
end
#extract_handbooks_data ⇒ Object
Extract data for handbooks
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
# File 'lib/relaton/plateau/fetcher.rb', line 112
#
# Extract data for handbooks: download the handbooks listing, turn every
# version of every handbook into a document via HandbookParser, store each
# document with save_document and finally persist the index.
#
# @return [Object] whatever index.save returns
def extract_handbooks_data
  data = fetch_json_data(HANDBOOKS_URL)
  Util.info "Extracting handbooks data..."

  # fetch_json_data returns {} when the download or parsing fails, so the
  # nested keys may be absent; dig avoids a NoMethodError on nil in that case.
  nodes = data.dig("pageProps", "handbooks", "nodes") || []
  nodes.each do |entry|
    handbook = entry["handbook"]

    # The description holds "<title><br /><abstract>"; either part may be missing.
    description_parts = handbook["description"]&.split("<br />") || ["", ""]
    title_en = description_parts[0]&.strip
    abstract = description_parts[1]&.strip

    # A slug containing a hyphen is classified as an annex.
    doctype = entry["slug"].include?("-") ? "annex" : "handbook"

    handbook["versions"].each do |version|
      item = HandbookParser.new(
        version: version,
        entry: entry,
        title_en: title_en,
        abstract: abstract,
        doctype: doctype
      ).parse
      save_document(item)
    end
  end

  index.save
end
#extract_technical_reports_data ⇒ Object
Extract data for technical reports
154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
# File 'lib/relaton/plateau/fetcher.rb', line 154
#
# Extract data for technical reports: download the technical reports listing,
# parse every entry with TechnicalReportParser, store each document with
# save_document and finally persist the index.
#
# @return [Object] whatever index.save returns
def extract_technical_reports_data
  data = fetch_json_data(TECHNICAL_REPORTS_URL)
  Util.info "Extracting technical reports data..."

  # fetch_json_data returns {} when the download or parsing fails, so the
  # nested keys may be absent; dig avoids a NoMethodError on nil in that case.
  entries = data.dig("pageProps", "nodes") || []
  entries.each do |entry|
    save_document(TechnicalReportParser.new(entry).parse)
  end

  index.save
end
#fetch_json_data(url) ⇒ Hash
Fetch JSON data from a URL with custom headers
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/relaton/plateau/fetcher.rb', line 83
#
# Fetch JSON data from a URL with custom headers.
#
# Never raises StandardError: a non-200 response is logged through Util.warn
# and any error raised while requesting, decoding or parsing is logged through
# Util.error; in both cases an empty Hash is returned.
#
# @param url [String] address of the JSON resource
# @return [Hash] parsed JSON, or {} on failure
def fetch_json_data(url)
  uri = URI(url)
  request = create_request(uri)

  # Enable TLS according to the URL scheme instead of unconditionally.
  response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == "https") do |http|
    http.request(request)
  end

  unless response.code.to_i == 200
    Util.warn "Failed to fetch data: #{response.code} #{response.message}"
    return {}
  end

  # The body may be compressed; hadle_response decodes it before parsing.
  JSON.parse(hadle_response(response))
rescue StandardError => e
  Util.error "Error fetching JSON data from #{url}: #{e.message}"
  {}
end
#file_name(id) ⇒ Object
200 201 202 203 204 205 206 207 208 |
# File 'lib/relaton/plateau/fetcher.rb', line 200
#
# Build the output path for a document identifier.
#
# Whitespace runs become underscores, every remaining non-word character is
# removed and the result is lower-cased. Identifiers containing 民間活用編 or
# 公共活用編 get a "_private" or "_public" suffix respectively.
#
# @param id [String] document identifier
# @return [String] path inside @output with the @ext extension
def file_name(id)
  slug = id.gsub(/\s+/, "_").gsub(/\W+/, "").downcase
  suffix =
    if id.match?(/民間活用編/) then "_private"
    elsif id.match?(/公共活用編/) then "_public"
    else ""
    end
  File.join(@output, "#{slug}#{suffix}.#{@ext}")
end
#hadle_response(response) ⇒ Object
Handle different content encodings
69 70 71 72 73 74 75 76 77 |
# File 'lib/relaton/plateau/fetcher.rb', line 69
#
# Handle different content encodings: return the response body, decompressed
# when the Content-Encoding header says it is gzip or deflate.
#
# NOTE: the method name is a misspelling of "handle_response"; it is kept
# because fetch_json_data calls it by this name.
#
# @param response [#[], #body] HTTP response
# @return [String] decoded body (the body as-is for any other encoding)
# @raise [Zlib::Error] when the body is not valid for the declared encoding
def hadle_response(response)
  body = response.body
  # Content-coding names are case-insensitive, so normalise before matching.
  case response["Content-Encoding"].to_s.strip.downcase
  when "gzip", "x-gzip"
    # wrap closes the reader once the block has returned.
    Zlib::GzipReader.wrap(StringIO.new(body), &:read)
  when "deflate"
    begin
      Zlib::Inflate.inflate(body)
    rescue Zlib::DataError
      # Some servers send a raw deflate stream without the zlib header;
      # negative window bits tell zlib not to expect one.
      raw = Zlib::Inflate.new(-Zlib::MAX_WBITS)
      begin
        raw.inflate(body)
      ensure
        raw.close
      end
    end
  else
    body
  end
end
#index ⇒ Object
20 21 22 |
# File 'lib/relaton/plateau/fetcher.rb', line 20
#
# Memoized accessor for the :plateau index backed by "index-v1.yaml".
#
# @return [Object] the value returned by Relaton::Index.find_or_create
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:plateau, file: "index-v1.yaml")
end
#save_document(item) ⇒ Object
def self.save_to_yaml(data, filename)
File.open(filename, "w") do |file|
file.write(data.to_yaml)
end
puts "Data saved to #{filename}."
end
188 189 190 191 192 193 194 195 196 197 198 |
# File 'lib/relaton/plateau/fetcher.rb', line 188
#
# Write one document to disk and register it in the index.
#
# The target path is derived from the id of the item's first docidentifier.
# If that path was already written by this instance, a warning is logged and
# the item is skipped.
#
# @param item [#docidentifier] bibliographic item to store
# @return [Object] result of Util.warn when skipped, otherwise of index.add_or_update
def save_document(item)
  docid = item.docidentifier.first.id
  path = file_name(docid)
  return Util.warn("File #{path} already exists, skipping.", key: docid) if @files.include?(path)

  File.write(path, serialize(item))
  @files << path
  index.add_or_update(docid, path)
end
#serialize(item) ⇒ Object
210 211 212 213 214 215 216 |
# File 'lib/relaton/plateau/fetcher.rb', line 210
#
# Serialize a bibliographic item according to @format.
#
# "yaml" dumps item.to_hash as YAML, "xml" calls to_xml with bibdata: true,
# and any other format name is dispatched to the item's public "to_<format>"
# method.
#
# @param item [Object] bibliographic item to serialize
# @return [String] serialized document
# @raise [NoMethodError] when the item has no public to_<format> method
def serialize(item)
  case @format
  when "yaml" then item.to_hash.to_yaml
  when "xml" then item.to_xml bibdata: true
  else
    # public_send keeps an arbitrary format name from reaching private methods.
    item.public_send("to_#{@format}")
  end
end