Module: Iev::SubjectAreas
- Defined in:
- lib/iev/subject_areas.rb
Defined Under Namespace
Classes: FetchError
Constant Summary collapse
- DATA_FILE =
File.expand_path("../../data/subject_areas.yaml", __dir__)
- AREAS_URL =
"https://electropedia.org/iev/iev.nsf/" \
  "6d6bdd8667c378f7c12581fa003d80e7?OpenForm"
- SECTIONS_URL_TEMPLATE =
"https://electropedia.org/iev/iev.nsf/" \
  "index?openform&part=%<part>s"
- MIN_PAGE_SIZE =
15_000
- FETCH_DELAY =
5
- RETRY_DELAY =
30
- MAX_RETRIES =
2
Class Method Summary collapse
-
.all ⇒ Array<Hash>
Return all subject areas with their sections.
-
.area_for_section(section_code) ⇒ Hash?
Return the parent area for a given section code.
-
.area_uri(code) ⇒ String
URI for a subject area concept.
-
.fetch ⇒ Object
— Fetching (network, writes to bundled data file) —.
- .fetch_areas ⇒ Object
- .fetch_sections(part) ⇒ Object
-
.find_area(code) ⇒ Hash?
Find a single subject area by its numeric code.
-
.find_section(section_code) ⇒ Hash?
Find a single section by its section code.
-
.section_uri(code) ⇒ String
URI for a section concept.
-
.sections_for(code) ⇒ Array<Hash>
Return all sections for a given area code.
Class Method Details
.all ⇒ Array<Hash>
Return all subject areas with their sections.
46 47 48 |
# File 'lib/iev/subject_areas.rb', line 46

# Return all subject areas with their sections.
#
# @return [Array<Hash>] the value stored under the "areas" key of +data+
def all
  data["areas"]
end
.area_for_section(section_code) ⇒ Hash?
Return the parent area for a given section code.
80 81 82 83 84 85 |
# File 'lib/iev/subject_areas.rb', line 80

# Return the parent area for a given section code.
#
# @param section_code [#to_s] section code; compared as a String
# @return [Hash, nil] the first area whose "sections" list contains an entry
#   with that code, or nil when no area matches
def area_for_section(section_code)
  sc = section_code.to_s
  all.find do |area|
    # Areas without a "sections" list are skipped rather than raising.
    area["sections"]&.any? { |s| s["code"] == sc }
  end
end
.area_uri(code) ⇒ String
URI for a subject area concept.
31 32 33 |
# File 'lib/iev/subject_areas.rb', line 31

# URI for a subject area concept.
#
# @param code [#to_s] area code, interpolated as-is
# @return [String] the code prefixed with "area-"
def area_uri(code)
  "area-#{code}"
end
.fetch ⇒ Object
— Fetching (network, writes to bundled data file) —
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
# File 'lib/iev/subject_areas.rb', line 89

# Fetch subject areas and their sections, reusing cached results where possible.
#
# Returns the cached data unchanged when +complete?+ accepts it. Otherwise the
# area list is fetched again, sections are fetched for every area that is not
# yet marked "fetched", and the result is written to the "subject_areas.yaml"
# cache (also every 10 areas along the way).
#
# @return [Hash] a hash of the form { "areas" => [...] }
def fetch
  cached = read_cache("subject_areas.yaml")
  return cached if cached && complete?(cached)

  # A cache that lacks an "areas" list is treated like an empty cache
  # instead of crashing on nil below.
  areas = (cached && cached["areas"]) || []
  fresh_areas = fetch_areas
  puts "Found #{fresh_areas.length} areas (#{areas.length} cached)" if $stdout.tty?

  # Merge: keep existing sections, add new areas
  # NOTE(review): the merged list follows fresh_areas, so cached areas that
  # are absent from the fresh list (including when it is empty) are dropped
  # and the cache is rewritten without them — confirm this is intended.
  existing = areas.each_with_object({}) { |a, h| h[a["code"]] = a }
  fresh_areas.each do |fa|
    existing[fa["code"]] ||= fa
  end
  areas = fresh_areas.map { |fa| existing[fa["code"]] || fa }

  last_index = areas.length - 1
  areas.each_with_index do |area, i|
    # Already-fetched areas skip the request, the periodic save and the delay.
    next if area["fetched"]

    begin
      area["sections"] = fetch_sections(area["code"])
      area["fetched"] = true
    rescue FetchError
      # Leave the area unmarked so a later run retries it.
      area["sections"] ||= []
      warn "IEV: Skipping area #{area["code"]} due to WAF"
    end
    puts "[#{i + 1}/#{areas.length}] #{area["code"]}: #{area["title"]} — #{area["sections"].length} sections" if $stdout.tty?

    # Save progress every 10 areas so partial results survive WAF failures
    write_cache("subject_areas.yaml", { "areas" => areas }) if ((i + 1) % 10).zero?

    sleep FETCH_DELAY unless i == last_index
  end

  result = { "areas" => areas }
  write_cache("subject_areas.yaml", result)
  result
end
.fetch_areas ⇒ Object
130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
# File 'lib/iev/subject_areas.rb', line 130

# Fetch the page at AREAS_URL and extract one entry per subject area link.
#
# A link counts as an area when its href contains "part=<digits>" and its
# text is non-blank. Only the first entry for each code is kept.
#
# @return [Array<Hash>] hashes with "code", "title" and an empty "sections" list
# NOTE(review): errors from fetch_page_with_retry are not rescued here and
# presumably propagate to the caller — confirm.
def fetch_areas
  html = fetch_page_with_retry(AREAS_URL)
  doc = Nokogiri::HTML(html)

  areas = doc.css("a").each_with_object([]) do |link, found|
    # The capture is nil when the href has no numeric "part=" parameter.
    code = link["href"].to_s[/part=(\d+)/, 1]
    next unless code

    title = link.text.strip
    next if title.empty?

    found << { "code" => code, "title" => title, "sections" => [] }
  end

  areas.uniq { |a| a["code"] }
end
.fetch_sections(part) ⇒ Object
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
# File 'lib/iev/subject_areas.rb', line 151

# Fetch the sections page for one area and extract its sections.
#
# A table cell counts as a section when its stripped text has the form
# "Section <code>: <title>", where the code consists of digits and hyphens.
# Only the first entry for each code is kept.
#
# @param part [#to_s] area code substituted into SECTIONS_URL_TEMPLATE
# @return [Array<Hash>] hashes with "code" and "title"
def fetch_sections(part)
  url = format(SECTIONS_URL_TEMPLATE, part: part)
  html = fetch_page_with_retry(url)
  doc = Nokogiri::HTML(html)

  sections = doc.css("td").each_with_object([]) do |td, found|
    # "." does not match newlines, so cells with multi-line text never match.
    m = td.text.strip.match(/\ASection\s+([\d-]+):\s*(.+)\z/)
    found << { "code" => m[1], "title" => m[2].strip } if m
  end

  sections.uniq { |s| s["code"] }
end
.find_area(code) ⇒ Hash?
Find a single subject area by its numeric code.
53 54 55 |
# File 'lib/iev/subject_areas.rb', line 53

# Find a single subject area by its numeric code.
#
# @param code [#to_s] area code; compared as a String
# @return [Hash, nil] the first area whose "code" matches, or nil
def find_area(code)
  # Convert once rather than on every comparison.
  wanted = code.to_s
  all.find { |a| a["code"] == wanted }
end
.find_section(section_code) ⇒ Hash?
Find a single section by its section code.
68 69 70 71 72 73 74 75 |
# File 'lib/iev/subject_areas.rb', line 68

# Find a single section by its section code.
#
# @param section_code [#to_s] section code; compared as a String
# @return [Hash, nil] the first matching section across all areas (in area
#   order), or nil when none matches
def find_section(section_code)
  sc = section_code.to_s
  # Lazy so the scan stops at the first match; areas without a "sections"
  # list contribute nothing.
  all.lazy
     .flat_map { |area| area["sections"] || [] }
     .find { |s| s["code"] == sc }
end
.section_uri(code) ⇒ String
URI for a section concept.
38 39 40 |
# File 'lib/iev/subject_areas.rb', line 38

# URI for a section concept.
#
# @param code [#to_s] section code, interpolated as-is
# @return [String] the code prefixed with "section-"
def section_uri(code)
  "section-#{code}"
end
.sections_for(code) ⇒ Array<Hash>
Return all sections for a given area code.
60 61 62 63 |
# File 'lib/iev/subject_areas.rb', line 60

# Return all sections for a given area code.
#
# @param code [#to_s] area code, passed to +find_area+
# @return [Array<Hash>] the area's "sections" list; an empty array when the
#   area is unknown or has no "sections" list
def sections_for(code)
  area = find_area(code)
  # Fall back to [] for a missing "sections" key too, so callers always get
  # an Array as documented.
  (area && area["sections"]) || []
end