Class: Relaton::Plateau::DataFetcher

Inherits:
Core::DataFetcher
  • Object
show all
Defined in:
lib/relaton/plateau/data_fetcher.rb

Overview

Fetcher class to fetch data from the Plateau website

Constant Summary collapse

# JSON endpoint passed to #fetch_json_data by #extract_handbooks_data.
# NOTE(review): the "1.3.0" path segment looks like a site build/version
# identifier -- confirm; the URL would stop resolving if the site changes it.
HANDBOOKS_URL =
"https://www.mlit.go.jp/plateau/_next/data/1.3.0/libraries/handbooks.json".freeze
# JSON endpoint passed to #fetch_json_data by #extract_technical_reports_data.
# NOTE(review): carries the same hard-coded "1.3.0" segment as HANDBOOKS_URL.
TECHNICAL_REPORTS_URL =
"https://www.mlit.go.jp/plateau/_next/data/1.3.0/libraries/technical-reports.json".freeze

Instance Method Summary collapse

Instance Method Details

#create_request(uri) ⇒ Object

Create a GET request with custom headers to mimic a browser



32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/relaton/plateau/data_fetcher.rb', line 32

# Build a GET request carrying browser-like headers for the Plateau site.
#
# Brotli ("br") and Zstandard ("zstd") are deliberately not advertised in
# Accept-Encoding: #hadle_response only decodes gzip and deflate, so a body
# sent in any other coding would reach JSON.parse still compressed.
#
# @param uri [URI::Generic] the resource to request
# @return [Net::HTTP::Get] the prepared request
def create_request(uri)
  request = Net::HTTP::Get.new(uri)
  {
    "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0",
    "Accept" => "*/*",
    "Accept-Language" => "en-US,en;q=0.5",
    "Accept-Encoding" => "gzip, deflate",
    "Referer" => "https://www.mlit.go.jp/plateau/libraries/",
    "purpose" => "prefetch",
    "x-nextjs-data" => "1",
    "Connection" => "keep-alive",
  }.each { |name, value| request[name] = value }
  request
end

#extract_handbooks_data ⇒ Object

Extract data for handbooks



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/relaton/plateau/data_fetcher.rb', line 89

# Extract data for handbooks.
#
# Fetches the handbooks listing, parses every version of every listed
# handbook with HandbookParser, saves each parsed document, then saves the
# index and reports the collected errors.
#
# #fetch_json_data returns an empty Hash when the request or JSON parsing
# fails, so the listing is read with Hash#dig and a missing branch is treated
# as "no entries" instead of raising NoMethodError on nil.
#
# @return [Object] whatever #report_errors returns
def extract_handbooks_data
  data = fetch_json_data(HANDBOOKS_URL)
  Util.info "Extracting handbooks data..."
  Array(data.dig("pageProps", "handbooks", "nodes")).each do |entry|
    # An entry whose slug contains a hyphen is classified as an annex.
    doctype = entry["slug"].to_s.include?("-") ? "annex" : "handbook"

    Array(entry.dig("handbook", "versions")).each do |version|
      item = HandbookParser.new(version: version, entry: entry, doctype: doctype, errors: @errors).parse
      save_document(item)
    end
  end
  index.save
  report_errors
end

#extract_technical_reports_data ⇒ Object

Extract data for technical reports



108
109
110
111
112
113
114
115
116
# File 'lib/relaton/plateau/data_fetcher.rb', line 108

# Extract data for technical reports.
#
# Fetches the technical reports listing, parses each listed entry with
# TechnicalReportParser, saves each parsed document, then saves the index and
# reports the collected errors.
#
# #fetch_json_data returns an empty Hash when the request or JSON parsing
# fails, so the listing is read with Hash#dig and a missing branch is treated
# as "no entries" instead of raising NoMethodError on nil.
#
# @return [Object] whatever #report_errors returns
def extract_technical_reports_data
  data = fetch_json_data(TECHNICAL_REPORTS_URL)
  Util.info "Extracting technical reports data..."
  # The loop is run for its side effects only, hence each rather than map.
  Array(data.dig("pageProps", "nodes")).each do |entry|
    save_document(TechnicalReportParser.new(entry, @errors).parse)
  end
  index.save
  report_errors
end

#fetch(source) ⇒ Object



23
24
25
26
27
28
29
# File 'lib/relaton/plateau/data_fetcher.rb', line 23

# Run the extraction that corresponds to the given source name.
#
# An unrecognised name is not an error: a message is printed to standard
# output and nil is returned.
#
# @param source [String] "plateau-handbooks" or "plateau-technical-reports"
# @return [Object, nil] the result of the selected extraction
def fetch(source)
  case source
  when "plateau-handbooks"
    extract_handbooks_data
  when "plateau-technical-reports"
    extract_technical_reports_data
  else
    puts "Invalid source: #{source}"
  end
end

#fetch_json_data(url) ⇒ Hash

Fetch JSON data from a URL with custom headers

Parameters:

  • url (String)

    The URL to fetch JSON data from

Returns:

  • (Hash)

    The parsed JSON data



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/relaton/plateau/data_fetcher.rb', line 60

# Fetch JSON data from a URL with custom headers.
#
# Any non-200 status is logged as a warning, and any StandardError raised
# along the way is logged as an error; both cases return an empty Hash.
#
# @param url [String] the URL to fetch JSON data from
# @return [Hash] the parsed JSON data, or {} on failure
def fetch_json_data(url)
  target = URI(url)
  get = create_request(target)

  # The connection is always opened with TLS enabled.
  reply = Net::HTTP.start(target.hostname, target.port, use_ssl: true) { |conn| conn.request(get) }

  if reply.code.to_i != 200
    Util.warn "Failed to fetch data: #{reply.code} #{reply.message}"
    return {}
  end

  # Decompress the body if needed, then parse it.
  JSON.parse(hadle_response(reply))
rescue StandardError => e
  Util.error "Error fetching JSON data from #{url}: #{e.message}"
  {}
end

#file_name(id) ⇒ Object



130
131
132
133
134
135
136
137
138
# File 'lib/relaton/plateau/data_fetcher.rb', line 130

# Build the output file path for a document identifier.
#
# Whitespace runs become hyphens, every character outside [\w-] is dropped
# and the result is lower-cased. Identifiers containing 民間活用編 or 公共活用編
# get a "-private" or "-public" suffix respectively.
#
# @param id [String] the document identifier
# @return [String] path under @output with the @ext extension
def file_name(id)
  slug = id.gsub(/\s+/, "-").gsub(/[^\w-]+/, "").downcase
  suffix =
    if id.match?(/民間活用編/)
      "-private"
    elsif id.match?(/公共活用編/)
      "-public"
    else
      ""
    end
  File.join(@output, "#{slug}#{suffix}.#{@ext}")
end

#hadle_response(response) ⇒ Object

Handle different content encodings



46
47
48
49
50
51
52
53
54
# File 'lib/relaton/plateau/data_fetcher.rb', line 46

# Decode a response body according to its Content-Encoding header.
#
# Content codings are case-insensitive, so the header value is stripped and
# lower-cased before matching; "x-gzip" is accepted as an alias of "gzip".
# Any other (or absent) coding returns the body untouched.
#
# @param response [#[], #body] an HTTP response, e.g. Net::HTTPResponse
# @return [String] the decoded body
def hadle_response(response)
  case response["Content-Encoding"].to_s.strip.downcase
  when "gzip", "x-gzip"
    Zlib::GzipReader.new(StringIO.new(response.body)).read
  when "deflate"
    Zlib::Inflate.inflate(response.body)
  else
    response.body
  end
end

#index ⇒ Object



15
16
17
# File 'lib/relaton/plateau/data_fetcher.rb', line 15

# Memoised accessor for the :plateau index.
#
# The index is looked up (or created) on first use via
# Relaton::Index.find_or_create, with the file name derived from INDEXFILE.
#
# @return [Object] the object returned by Relaton::Index.find_or_create
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:plateau, file: "#{INDEXFILE}.yaml")
end

#log_error(msg) ⇒ Object



19
20
21
# File 'lib/relaton/plateau/data_fetcher.rb', line 19

# Forward an error message to Util.error.
#
# @param msg [String] the message to log
# @return [Object] whatever Util.error returns
def log_error(msg)
  Util.error(msg)
end

#save_document(item) ⇒ Object



118
119
120
121
122
123
124
125
126
127
128
# File 'lib/relaton/plateau/data_fetcher.rb', line 118

# Serialize a document to its output file and register it in the index.
#
# The target path is derived from the first document identifier. If that
# path was already written during this run (it is in @files), a warning is
# logged and nothing is written.
#
# @param item [#docidentifier] the bibliographic item to save
# @return [Object] the result of Util.warn or of index.add_or_update
def save_document(item)
  doc_id = item.docidentifier.first.content
  path = file_name(doc_id)
  return Util.warn("File #{path} already exists, skipping.", key: doc_id) if @files.include?(path)

  File.write(path, serialize(item))
  @files << path
  index.add_or_update(doc_id, path)
end

#to_bibxml(bib) ⇒ Object



148
149
150
# File 'lib/relaton/plateau/data_fetcher.rb', line 148

# Serialize an item by calling its #to_rfcxml method.
#
# @param bib [#to_rfcxml] the bibliographic item
# @return [Object] whatever bib.to_rfcxml returns (presumably an XML String -- confirm)
def to_bibxml(bib)
  bib.to_rfcxml
end

#to_xml(bib) ⇒ Object



144
145
146
# File 'lib/relaton/plateau/data_fetcher.rb', line 144

# Serialize an item through Item.to_xml with the bibdata option enabled.
#
# @param bib [Object] the bibliographic item
# @return [Object] whatever Item.to_xml returns (presumably an XML String -- confirm)
def to_xml(bib)
  Item.to_xml(bib, bibdata: true)
end

#to_yaml(bib) ⇒ Object



140
141
142
# File 'lib/relaton/plateau/data_fetcher.rb', line 140

# Serialize an item through Item.to_yaml.
#
# @param bib [Object] the bibliographic item
# @return [Object] whatever Item.to_yaml returns (presumably a YAML String -- confirm)
def to_yaml(bib)
  Item.to_yaml(bib)
end