Module: Iev::SubjectAreas
- Defined in:
- lib/iev/subject_areas.rb
Defined Under Namespace
Classes: FetchError
Constant Summary collapse
- DATA_FILE =
File.expand_path("../../data/subject_areas.yaml", __dir__)
- AREAS_URL =
"https://electropedia.org/iev/iev.nsf/" \
"6d6bdd8667c378f7c12581fa003d80e7?OpenForm"
- SECTIONS_URL_TEMPLATE =
"https://electropedia.org/iev/iev.nsf/" \
"index?openform&part=%<part>s"
- MIN_PAGE_SIZE =
15_000
- FETCH_DELAY =
5
- RETRY_DELAY =
30
- MAX_RETRIES =
2
Class Method Summary collapse
-
.all ⇒ Array<SubjectArea>
Return all subject areas with their sections.
-
.area_for(ievref) ⇒ SubjectArea?
Find the subject area for any IEV reference.
-
.area_for_section(section_code) ⇒ SubjectArea?
Return the parent area for a given section code.
-
.area_uri(code) ⇒ String
URI for a subject area concept.
-
.fetch ⇒ Object
Fetch subject areas and their sections over the network and write them to the bundled data file.
- .fetch_areas ⇒ Object
- .fetch_sections(part) ⇒ Object
-
.find_area(code) ⇒ SubjectArea?
Find a single subject area by its numeric code.
-
.find_section(section_code) ⇒ Section?
Find a single section by its section code.
-
.reload! ⇒ Object
Clear cached typed objects (useful after fetch updates raw data).
-
.section_for(ievref) ⇒ Section?
Find the section for any IEV reference.
-
.section_uri(code) ⇒ String
URI for a section concept.
-
.sections_for(code) ⇒ Array<Section>
Return all sections for a given area code.
Class Method Details
.all ⇒ Array<SubjectArea>
Return all subject areas with their sections.
46 47 48 |
# File 'lib/iev/subject_areas.rb', line 46

# Every subject area, each carrying its sections.
#
# The typed list is built from the raw "areas" entries on first use and
# memoized in @typed_areas for later calls.
#
# @return [Array<SubjectArea>]
def all
  return @typed_areas if @typed_areas

  @typed_areas = raw_data["areas"].map { |entry| build_area(entry) }
end
.area_for(ievref) ⇒ SubjectArea?
Find the subject area for any IEV reference.
84 85 86 87 |
# File 'lib/iev/subject_areas.rb', line 84

# Look up the subject area that an IEV reference belongs to.
#
# @param ievref [Object] IEV reference, handed to IevCode.new for parsing
# @return [SubjectArea, nil] the area found for the reference's area code
def area_for(ievref)
  find_area(IevCode.new(ievref).area_code)
end
.area_for_section(section_code) ⇒ SubjectArea?
Return the parent area for a given section code.
74 75 76 77 |
# File 'lib/iev/subject_areas.rb', line 74

# Resolve a section code to the area that contains it.
#
# @param section_code [Object] code of the section to look up
# @return [SubjectArea, nil] nil when the section itself is unknown
def area_for_section(section_code)
  section = find_section(section_code)
  return nil unless section

  find_area(section.area_code)
end
.area_uri(code) ⇒ String
URI for a subject area concept.
31 32 33 |
# File 'lib/iev/subject_areas.rb', line 31

# Build the URI of a subject area concept.
#
# @param code [Object] area code; converted to its string form
# @return [String] the code prefixed with "area-"
def area_uri(code)
  format("area-%s", code)
end
.fetch ⇒ Object
Fetch subject areas and their sections over the network and write them to the bundled data file.
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# File 'lib/iev/subject_areas.rb', line 99

# Fetch every subject area together with its sections, resuming from any
# progress already stored in the "subject_areas.yaml" cache.
#
# When the cache exists and `complete?` accepts it, the cached data is
# returned untouched and no request is made. Otherwise the area list is
# fetched again, merged with the cached entries (a cached entry wins over a
# freshly fetched one with the same code, so its sections are kept), and
# every area not yet flagged "fetched" has its sections downloaded.
#
# A FetchError for one area is not fatal: the area keeps whatever sections
# it had (or an empty list), stays unflagged, and a warning is emitted.
#
# @return [Hash] `{ "areas" => [...] }`, the same structure written to the cache
def fetch
  cached = read_cache("subject_areas.yaml")
  return cached if cached && complete?(cached)

  # A cache without a usable "areas" list is treated as holding no areas
  cached_areas = (cached && cached["areas"]) || []
  fresh_areas = fetch_areas
  puts "Found #{fresh_areas.length} areas (#{cached_areas.length} cached)" if $stdout.tty?

  # Merge: the fresh list decides which areas exist and in what order,
  # cached entries supply sections that were already fetched
  cached_by_code = cached_areas.each_with_object({}) { |area, index| index[area["code"]] = area }
  areas = fresh_areas.map { |fresh| cached_by_code[fresh["code"]] || fresh }

  # Index of the last area that still needs a request; nothing follows it,
  # so there is no reason to wait after it
  last_pending = areas.rindex { |area| !area["fetched"] }

  areas.each_with_index do |area, i|
    next if area["fetched"]

    begin
      area["sections"] = fetch_sections(area["code"])
      area["fetched"] = true
    rescue FetchError
      area["sections"] ||= []
      warn "IEV: Skipping area #{area["code"]} due to WAF"
    end

    puts "[#{i + 1}/#{areas.length}] #{area["code"]}: #{area["title"]} — #{area["sections"].length} sections" if $stdout.tty?

    # Save progress every 10 areas so partial results survive WAF failures
    write_cache("subject_areas.yaml", { "areas" => areas }) if ((i + 1) % 10).zero?

    # Pause between requests only while more requests are still to come
    sleep FETCH_DELAY unless i == last_pending
  end

  result = { "areas" => areas }
  write_cache("subject_areas.yaml", result)
  result
end
.fetch_areas ⇒ Object
140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
# File 'lib/iev/subject_areas.rb', line 140

# Download the page at AREAS_URL and extract the subject areas it links to.
#
# Every anchor whose href carries a "part=<digits>" parameter and whose text
# is non-empty yields one area hash with an empty "sections" list. Only the
# first entry per code is kept.
#
# @return [Array<Hash>] hashes with "code", "title" and "sections" keys
def fetch_areas
  page = Nokogiri::HTML(fetch_page_with_retry(AREAS_URL))

  found = page.css("a").each_with_object([]) do |anchor, collected|
    # The digits after "part=" are the area code
    code = anchor["href"].to_s[/part=(\d+)/, 1]
    next if code.nil?

    title = anchor.text.strip
    next if title.empty?

    collected << { "code" => code, "title" => title, "sections" => [] }
  end

  found.uniq { |area| area["code"] }
end
.fetch_sections(part) ⇒ Object
161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
# File 'lib/iev/subject_areas.rb', line 161

# Download the index page of one area and extract its sections.
#
# A table cell counts as a section when its stripped text has the form
# "Section <code>: <title>", where the code consists of digits and hyphens.
# Only the first entry per section code is kept.
#
# @param part [Object] area code substituted into SECTIONS_URL_TEMPLATE
# @return [Array<Hash>] hashes with "code" and "title" keys
def fetch_sections(part)
  page_url = format(SECTIONS_URL_TEMPLATE, part: part)
  page = Nokogiri::HTML(fetch_page_with_retry(page_url))

  matches = page.css("td").map do |cell|
    cell.text.strip.match(/\ASection\s+([\d-]+):\s*(.+)\z/)
  end

  matches.compact
         .map { |found| { "code" => found[1], "title" => found[2].strip } }
         .uniq { |section| section["code"] }
end
.find_area(code) ⇒ SubjectArea?
Find a single subject area by its numeric code. O(1) indexed.
53 54 55 |
# File 'lib/iev/subject_areas.rb', line 53

# Look up one subject area by its code through the area index.
#
# @param code [Object] area code; converted to a string before the lookup
# @return [SubjectArea, nil] whatever the index holds for that key
def find_area(code)
  key = code.to_s
  area_index[key]
end
.find_section(section_code) ⇒ Section?
Find a single section by its section code. O(1) indexed.
67 68 69 |
# File 'lib/iev/subject_areas.rb', line 67

# Look up one section by its section code through the section index.
#
# @param section_code [Object] section code; converted to a string before the lookup
# @return [Section, nil] whatever the index holds for that key
def find_section(section_code)
  key = section_code.to_s
  section_index[key]
end
.reload! ⇒ Object
Clear cached typed objects (useful after fetch updates raw data).
178 179 180 181 182 183 |
# File 'lib/iev/subject_areas.rb', line 178

# Drop all memoized state: the typed areas, both lookup indexes and the raw
# data are reset to nil.
#
# @return [nil]
def reload!
  @typed_areas = @area_index = @section_index = @raw_data = nil
end
.section_for(ievref) ⇒ Section?
Find the section for any IEV reference.
92 93 94 95 |
# File 'lib/iev/subject_areas.rb', line 92

# Look up the section that an IEV reference belongs to.
#
# @param ievref [Object] IEV reference, handed to IevCode.new for parsing
# @return [Section, nil] nil when the parsed reference has no section code
def section_for(ievref)
  section_code = IevCode.new(ievref).section_code
  return nil unless section_code

  find_section(section_code)
end
.section_uri(code) ⇒ String
URI for a section concept.
38 39 40 |
# File 'lib/iev/subject_areas.rb', line 38

# Build the URI of a section concept.
#
# @param code [Object] section code; converted to its string form
# @return [String] the code prefixed with "section-"
def section_uri(code)
  format("section-%s", code)
end
.sections_for(code) ⇒ Array<Section>
Return all sections for a given area code.
60 61 62 |
# File 'lib/iev/subject_areas.rb', line 60

# List the sections of one subject area.
#
# @param code [Object] area code, resolved through find_area
# @return [Array<Section>] empty when the area is unknown or has no sections
def sections_for(code)
  area = find_area(code)
  return [] if area.nil?

  area.sections || []
end