Class: Relaton::W3c::DataFetcher
- Inherits:
-
Core::DataFetcher
- Object
- Core::DataFetcher
- Relaton::W3c::DataFetcher
- Includes:
- SafeRealize
- Defined in:
- lib/relaton/w3c/data_fetcher.rb
Constant Summary collapse
- DEFAULT_CONCURRENCY =
8
Class Method Summary collapse
-
.concurrency ⇒ Object
Number of fetch_spec worker threads.
Instance Method Summary collapse
- #client ⇒ Object
-
#fetch(_source = nil) ⇒ Object
Parse documents in parallel.
- #fetch_spec(unrealized_spec) ⇒ Object
-
#file_name(id) ⇒ String
Generate file name.
- #index ⇒ Object
-
#initialize(*args) ⇒ DataFetcher
constructor
A new instance of DataFetcher.
- #log_error(msg) ⇒ Object
-
#save_doc(bib, warn_duplicate: true) ⇒ Object
Save document to file.
- #to_bibxml(bib) ⇒ Object
- #to_xml(bib) ⇒ Object
- #to_yaml(bib) ⇒ Object
Methods included from SafeRealize
Constructor Details
#initialize(*args) ⇒ DataFetcher
Returns a new instance of DataFetcher.
22 23 24 25 |
# File 'lib/relaton/w3c/data_fetcher.rb', line 22
#
# Build a new fetcher. All arguments (and any block) are handed to the
# parent constructor unchanged by the bare +super+.
#
# @param args [Array] arguments forwarded to the superclass constructor
def initialize(*args)
  super
  # Lock used by #save_doc and #fetch_spec when worker threads touch
  # shared state.
  @mutex = Mutex.new
end
Class Method Details
.concurrency ⇒ Object
Number of fetch_spec worker threads. Tunable via env var so CI or local runs can dial it down (e.g. for debugging or to lighten load on api.w3.org).
18 19 20 |
# File 'lib/relaton/w3c/data_fetcher.rb', line 18
#
# Number of fetch_spec worker threads. Read from the
# RELATON_W3C_FETCH_CONCURRENCY environment variable when set, otherwise
# DEFAULT_CONCURRENCY.
#
# A value that does not parse to a positive integer ("0", "", "-2",
# "abc", ...) falls back to DEFAULT_CONCURRENCY: #fetch sizes a SizedQueue
# and a worker array from this number, and both reject non-positive sizes
# with ArgumentError.
#
# @return [Integer] a worker count that is always >= 1
def self.concurrency
  requested = ENV.fetch("RELATON_W3C_FETCH_CONCURRENCY", DEFAULT_CONCURRENCY).to_i
  requested.positive? ? requested : DEFAULT_CONCURRENCY
end
Instance Method Details
#client ⇒ Object
35 36 37 |
# File 'lib/relaton/w3c/data_fetcher.rb', line 35
#
# Memoized W3C API client; created on first use and reused afterwards.
#
# @return [W3cApi::Client]
def client
  @client ||= W3cApi::Client.new
end
#fetch(_source = nil) ⇒ Object
Parse documents in parallel. The crawler is heavily I/O-bound on api.w3.org round-trips (~30-50k requests per run), so a small thread pool gives a near-linear speedup. Pagination still happens serially because each page depends on the previous response’s `next` link.
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/relaton/w3c/data_fetcher.rb', line 45
#
# Crawl every page of specifications and hand each spec to a pool of
# worker threads through a bounded queue, then save the index and report
# errors once all workers have finished.
#
# @param _source [Object, nil] ignored
def fetch(_source = nil)
  pool_size = self.class.concurrency
  # Bounded queue: the paginating thread blocks once it is four items per
  # worker ahead of the pool.
  jobs = SizedQueue.new(pool_size * 4)
  threads = Array.new(pool_size) { spawn_worker(jobs) }

  page = client.specifications
  loop do
    page.links.specifications.each { |spec| jobs << spec }
    break unless page.next?

    # Route pagination through realize so transient 403/5xx on the
    # next-page link retry with backoff instead of crashing the crawl.
    page = realize(page.links.next)
    break unless page
  end

  # One nil "poison pill" per worker.
  pool_size.times { jobs << nil }
  threads.each(&:join)
  index.save
  report_errors
end
#fetch_spec(unrealized_spec) ⇒ Object
70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/relaton/w3c/data_fetcher.rb', line 70
#
# Realize one specification, parse and save it, then parse and save every
# version reachable through its version-history, predecessor and successor
# links. Error flags collected while parsing the spec are folded into
# @errors under the mutex.
#
# @param unrealized_spec [Object] spec link to be resolved with #realize
def fetch_spec(unrealized_spec)
  spec = realize unrealized_spec
  return unless spec

  local_errors = Hash.new(true)
  save_doc DataParser.parse(spec, local_errors)

  # [link on the spec, collection exposed by the realized link]
  related_links = [
    %i[version_history spec_versions],
    %i[predecessor_versions predecessor_versions],
    %i[successor_versions successor_versions],
  ]
  related_links.each do |link, collection|
    # Not every spec exposes every link; skip the ones that are missing or nil.
    next unless spec.links.respond_to?(link) && spec.links.public_send(link)

    realized = realize spec.links.public_send(link)
    realized&.links&.public_send(collection)&.each { |version| parse_and_save version }
  end

  @mutex.synchronize { local_errors.each { |key, flag| @errors[key] &&= flag } }
end
#file_name(id) ⇒ String
Generate file name
135 136 137 138 |
# File 'lib/relaton/w3c/data_fetcher.rb', line 135
#
# Generate file name.
#
# Strips a leading "W3C " prefix, replaces whitespace and the characters
# , : / + with underscores, collapses runs of underscores, and downcases
# the result. The path is built from @output and the @ext extension.
#
# @param id [String] document identifier (#save_doc passes bib.docnumber)
# @return [String] path of the output file
def file_name(id)
  # \A anchors to the start of the whole string; ^ would also match at the
  # start of any later line inside the id.
  name = id.sub(/\AW3C\s/, "").gsub(%r{[\s,:/+]}, "_").squeeze("_").downcase
  File.join @output, "#{name}.#{@ext}"
end
#index ⇒ Object
27 28 29 |
# File 'lib/relaton/w3c/data_fetcher.rb', line 27
#
# Memoized W3C index, looked up or created on first use and backed by the
# "<INDEXFILE>.yaml" file.
def index
  @index ||= Relaton::Index.find_or_create(:W3C, file: "#{INDEXFILE}.yaml")
end
#log_error(msg) ⇒ Object
31 32 33 |
# File 'lib/relaton/w3c/data_fetcher.rb', line 31
#
# Report an error message through Util.error.
#
# @param msg [String] message to log
def log_error(msg)
  Util.error(msg)
end
#save_doc(bib, warn_duplicate: true) ⇒ Object
Save document to file
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/relaton/w3c/data_fetcher.rb', line 100
#
# Save document to file.
#
# The first time a file name is seen the document is added to the index
# and the name is remembered in @files; on later sightings a warning is
# logged instead (unless suppressed). The file is written in both cases.
# Everything after the name computation runs under the mutex.
#
# @param bib [Object, nil] bibliographic item; nothing happens when it is nil or false
# @param warn_duplicate [Boolean] log a warning when the file name was already used
def save_doc(bib, warn_duplicate: true)
  return unless bib

  path = file_name(bib.docnumber)
  @mutex.synchronize do
    if !@files.include?(path)
      index.add_or_update PubId.parse(bib.docnumber).to_hash, path
      @files << path
    elsif warn_duplicate
      Util.warn "File #{path} already exists. Document: #{bib.docnumber}"
    end
    File.write path, serialize(bib), encoding: "UTF-8"
  end
end
#to_bibxml(bib) ⇒ Object
124 125 126 |
# File 'lib/relaton/w3c/data_fetcher.rb', line 124
#
# Serialize a document by calling its #to_xml with no options.
#
# @param bib [Object] bibliographic item responding to #to_xml
def to_bibxml(bib)
  bib.to_xml
end
#to_xml(bib) ⇒ Object
116 117 118 |
# File 'lib/relaton/w3c/data_fetcher.rb', line 116
#
# Serialize a document by calling its #to_xml with bibdata: true.
#
# @param bib [Object] bibliographic item responding to #to_xml
def to_xml(bib)
  bib.to_xml(bibdata: true)
end
#to_yaml(bib) ⇒ Object
120 121 122 |
# File 'lib/relaton/w3c/data_fetcher.rb', line 120
#
# Serialize a document by calling its #to_yaml.
#
# @param bib [Object] bibliographic item responding to #to_yaml
def to_yaml(bib)
  bib.to_yaml
end