Class: Relaton::W3c::DataFetcher
- Inherits:
-
Core::DataFetcher
- Object
- Core::DataFetcher
- Relaton::W3c::DataFetcher
show all
- Includes:
- RateLimitHandler
- Defined in:
- lib/relaton/w3c/data_fetcher.rb
Constant Summary
collapse
- DEFAULT_CONCURRENCY =
8
RateLimitHandler::MAX_RETRIES, RateLimitHandler::RETRYABLE_ERRORS
Class Method Summary
collapse
Instance Method Summary
collapse
fetched_objects, #realize
Constructor Details
Returns a new instance of DataFetcher.
22
23
24
25
|
# File 'lib/relaton/w3c/data_fetcher.rb', line 22
# Builds the fetcher and prepares the lock used by the worker threads.
#
# All positional arguments are handed to the superclass constructor
# unchanged.
#
# @param args [Array] arguments accepted by the parent constructor
def initialize(*args)
  super(*args)
  # Guards the shared state that #save_doc and #fetch_spec touch from
  # several threads.
  @mutex = Mutex.new
end
|
Class Method Details
.concurrency ⇒ Object
Number of fetch_spec worker threads. Tunable via env var so CI or local runs can dial it down (e.g. for debugging or to lighten load on api.w3.org).
18
19
20
|
# File 'lib/relaton/w3c/data_fetcher.rb', line 18
# Number of fetch_spec worker threads.
#
# Reads RELATON_W3C_FETCH_CONCURRENCY and converts it with `to_i`. When the
# variable is unset, or when it converts to zero or a negative number
# (empty string, non-numeric text, "0", "-1"), DEFAULT_CONCURRENCY is used.
# A non-positive count would make `SizedQueue.new(n_workers * 4)` in #fetch
# raise ArgumentError.
#
# @return [Integer] a strictly positive worker count
def self.concurrency
  requested = (ENV["RELATON_W3C_FETCH_CONCURRENCY"] || DEFAULT_CONCURRENCY).to_i
  requested.positive? ? requested : DEFAULT_CONCURRENCY
end
|
Instance Method Details
#client ⇒ Object
35
36
37
|
# File 'lib/relaton/w3c/data_fetcher.rb', line 35
# Lazily built W3C API client, created on first use and then reused.
#
# @return [W3cApi::Client]
def client
  return @client if @client

  @client = W3cApi::Client.new
end
|
#fetch(_source = nil) ⇒ Object
Parse documents in parallel. The crawler is heavily I/O-bound on api.w3.org round-trips (~30-50k requests per run), so a small thread pool gives a near-linear speedup. Pagination still happens serially because each page depends on the previous response's `next` link.
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
# File 'lib/relaton/w3c/data_fetcher.rb', line 45
# Crawls every specification page and hands each entry to a worker thread.
#
# Pages are walked one after another on the calling thread, following
# `next` until `next?` is false. Every entry of a page is pushed onto a
# bounded queue consumed by the threads returned from #spawn_worker.
# Once all workers have been joined, the index is saved and
# #report_errors is called.
#
# @param _source [Object, nil] ignored
# @return [Object] whatever #report_errors returns
def fetch(_source = nil)
  n_workers = self.class.concurrency
  # Bounded queue: the producer blocks once 4 items per worker are pending.
  queue = SizedQueue.new(n_workers * 4)
  workers = Array.new(n_workers) { spawn_worker(queue) }

  specs = client.specifications
  loop do
    specs.links.specifications.each { |spec| queue << spec }
    break unless specs.next?

    specs = specs.next
  end

  # One nil per worker — presumably the stop signal #spawn_worker waits for
  # (verify against its definition). These two statements must stay
  # separate: written on one line they are a syntax error.
  n_workers.times { queue << nil }
  workers.each(&:join)

  index.save
  report_errors
end
|
#fetch_spec(unrealized_spec) ⇒ Object
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
|
# File 'lib/relaton/w3c/data_fetcher.rb', line 65
# Realizes one specification, saves it, then saves every version reachable
# through its version-history, predecessor and successor links.
#
# Parse errors are collected in a per-call hash and merged into @errors
# under the mutex once all related versions have been handled.
#
# @param unrealized_spec [Object] link object accepted by #realize
# @return [Hash, nil] nil when the spec could not be realized
def fetch_spec(unrealized_spec)
  spec = realize(unrealized_spec)
  return unless spec

  local_errors = Hash.new(true)
  save_doc DataParser.parse(spec, local_errors)

  # Each pair is [link on the spec, collection on the realized resource].
  related = [
    %i[version_history spec_versions],
    %i[predecessor_versions predecessor_versions],
    %i[successor_versions successor_versions],
  ]
  related.each do |link_name, collection_name|
    next unless spec.links.respond_to?(link_name) && spec.links.public_send(link_name)

    resource = realize spec.links.public_send(link_name)
    resource&.links&.public_send(collection_name)&.each { |version| parse_and_save version }
  end

  @mutex.synchronize do
    # A flag in @errors stays truthy only while every merged value is truthy.
    local_errors.each { |key, flag| @errors[key] &&= flag }
  end
end
|
#file_name(id) ⇒ String
130
131
132
133
|
# File 'lib/relaton/w3c/data_fetcher.rb', line 130
# Builds the output path for a document identifier.
#
# A leading "W3C " is dropped, whitespace and the characters , : / + are
# replaced by underscores, runs of underscores are collapsed and the result
# is lowercased. The file lives under @output with @ext as its extension.
#
# @param id [String] document identifier
# @return [String] path of the file for that document
def file_name(id)
  stem = id.sub(/^W3C\s/, "")
  stem = stem.gsub(/[\s,:\/+]/, "_")
  stem = stem.squeeze("_").downcase
  File.join(@output, "#{stem}.#{@ext}")
end
|
#index ⇒ Object
27
28
29
|
# File 'lib/relaton/w3c/data_fetcher.rb', line 27
# Lazily resolved W3C index backed by "<INDEXFILE>.yaml"; looked up once and
# then reused.
#
# @return [Object] the value returned by Relaton::Index.find_or_create
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:W3C, file: "#{INDEXFILE}.yaml")
end
|
#log_error(msg) ⇒ Object
31
32
33
|
# File 'lib/relaton/w3c/data_fetcher.rb', line 31
# Forwards an error message to Util.error.
#
# @param msg [String] message to log
# @return [Object] whatever Util.error returns
def log_error(msg)
  Util.error(msg)
end
|
#save_doc(bib, warn_duplicate: true) ⇒ Object
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
|
# File 'lib/relaton/w3c/data_fetcher.rb', line 95
# Writes a document to its file and registers it in the index.
#
# The first time a path is seen it is added to the index and to @files.
# On later calls for the same path a warning is emitted (unless
# warn_duplicate is false). The file is written on every call, duplicate
# or not. Everything after the path computation runs under the mutex.
#
# @param bib [Object, nil] parsed document; nothing happens when falsy
# @param warn_duplicate [Boolean] whether to warn when the path was already written
# @return [Object, nil] result of File.write, or nil when bib is falsy
def save_doc(bib, warn_duplicate: true)
  return unless bib

  path = file_name(bib.docnumber)
  @mutex.synchronize do
    already_written = @files.include?(path)
    if already_written && warn_duplicate
      Util.warn "File #{path} already exists. Document: #{bib.docnumber}"
    elsif !already_written
      index.add_or_update PubId.parse(bib.docnumber).to_hash, path
      @files << path
    end
    File.write path, serialize(bib), encoding: "UTF-8"
  end
end
|
#to_bibxml(bib) ⇒ Object
119
120
121
|
# File 'lib/relaton/w3c/data_fetcher.rb', line 119
# Serializes a document for the "bibxml" output by calling #to_xml on it
# with no options.
#
# NOTE(review): this issues a plain `to_xml` call, not a BibXML-specific
# one — confirm that is the intended output.
#
# @param bib [Object] document responding to #to_xml
# @return [Object] whatever bib.to_xml returns
def to_bibxml(bib)
  bib.to_xml()
end
|
#to_xml(bib) ⇒ Object
111
112
113
|
# File 'lib/relaton/w3c/data_fetcher.rb', line 111
# Serializes a document by calling #to_xml on it with `bibdata: true`.
#
# @param bib [Object] document responding to #to_xml
# @return [Object] whatever bib.to_xml returns
def to_xml(bib)
  xml_options = { bibdata: true }
  bib.to_xml(**xml_options)
end
|
#to_yaml(bib) ⇒ Object
115
116
117
|
# File 'lib/relaton/w3c/data_fetcher.rb', line 115
# Serializes a document by calling #to_yaml on it.
#
# @param bib [Object] document responding to #to_yaml
# @return [Object] whatever bib.to_yaml returns
def to_yaml(bib)
  bib.to_yaml()
end
|