Class: Relaton::W3c::DataFetcher

Inherits:
Core::DataFetcher
  • Object
show all
Includes:
RateLimitHandler
Defined in:
lib/relaton/w3c/data_fetcher.rb

Constant Summary collapse

DEFAULT_CONCURRENCY =
8

Constants included from RateLimitHandler

RateLimitHandler::MAX_RETRIES, RateLimitHandler::RETRYABLE_ERRORS

Class Method Summary collapse

Instance Method Summary collapse

Methods included from RateLimitHandler

#fetched_objects, #realize

Constructor Details

#initialize(*args) ⇒ DataFetcher

Returns a new instance of DataFetcher.



22
23
24
25
# File 'lib/relaton/w3c/data_fetcher.rb', line 22

# Builds the fetcher. All arguments are handed unchanged to the superclass
# constructor; afterwards a mutex is created for this instance.
#
# @param args [Array] arguments forwarded to the superclass constructor
def initialize(*args)
  super
  # Guards the state that worker threads share (see the `@mutex.synchronize`
  # sections in #fetch_spec and #save_doc).
  @mutex = Mutex.new
end

Class Method Details

.concurrency ⇒ Object

Number of fetch_spec worker threads. Tunable via env var so CI or local runs can dial it down (e.g. for debugging or to lighten load on api.w3.org).



18
19
20
# File 'lib/relaton/w3c/data_fetcher.rb', line 18

# Number of worker threads to use when fetching specs.
#
# Reads the RELATON_W3C_FETCH_CONCURRENCY environment variable. When the
# variable is unset, empty, non-numeric, zero or negative, DEFAULT_CONCURRENCY
# is returned instead: a non-positive worker count cannot be used to size the
# work queue or the thread pool.
#
# @return [Integer] a positive worker count
def self.concurrency
  # nil.to_i and "garbage".to_i are both 0, so every unusable value funnels
  # into the same fallback branch.
  requested = ENV["RELATON_W3C_FETCH_CONCURRENCY"].to_i
  requested.positive? ? requested : DEFAULT_CONCURRENCY
end

Instance Method Details

#client ⇒ Object



35
36
37
# File 'lib/relaton/w3c/data_fetcher.rb', line 35

# Lazily built W3C API client, memoized in @client so every call on this
# instance reuses the same object.
#
# @return [W3cApi::Client]
def client
  return @client if @client

  @client = W3cApi::Client.new
end

#fetch(_source = nil) ⇒ Object

Parse documents in parallel. The crawler is heavily I/O-bound on api.w3.org round-trips (~30-50k requests per run), so a small thread pool gives a near-linear speedup. Pagination still happens serially because each page depends on the previous response’s `next` link.



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/relaton/w3c/data_fetcher.rb', line 45

# Walks every page of the specification listing and hands each spec to a pool
# of worker threads through a bounded queue, then saves the index and reports
# errors.
#
# Pages are walked serially (each page is obtained from the previous one via
# `next`); the per-spec work is done by the threads created by `spawn_worker`.
#
# If listing or paging raises, the workers are still told to stop and are
# joined before the exception propagates, so no thread is left blocked on the
# queue. In that case the index is not saved and errors are not reported.
#
# @param _source [Object, nil] unused
# @return [Object] whatever `report_errors` returns
def fetch(_source = nil)
  n_workers = self.class.concurrency
  # Bounded queue: the producer blocks once it is a few items ahead of the
  # workers instead of buffering the whole listing.
  queue = SizedQueue.new(n_workers * 4)
  workers = Array.new(n_workers) { spawn_worker(queue) }

  begin
    specs = client.specifications
    loop do
      specs.links.specifications.each { |spec| queue << spec }
      break unless specs.next?

      specs = specs.next
    end
  ensure
    # Runs on success and on failure alike: without the pills the workers
    # would wait on the queue forever after a paging error.
    n_workers.times { queue << nil } # poison pills
    workers.each(&:join)
  end

  index.save
  report_errors
end

#fetch_spec(unrealized_spec) ⇒ Object



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/relaton/w3c/data_fetcher.rb', line 65

# Processes one specification: realizes it, parses and saves it, then parses
# and saves every version reachable through its version-history, predecessor
# and successor links. Parse errors are collected in a per-call hash and
# merged into @errors under the mutex at the end.
#
# @param unrealized_spec [Object] value passed to `realize`
# @return [Object, nil] nil when the spec could not be realized
def fetch_spec(unrealized_spec)
  spec = realize(unrealized_spec)
  return unless spec

  local_errors = Hash.new(true)
  save_doc(DataParser.parse(spec, local_errors))

  # link on the spec => collection on the realized link target
  {
    version_history: :spec_versions,
    predecessor_versions: :predecessor_versions,
    successor_versions: :successor_versions,
  }.each do |relation, collection|
    next unless spec.links.respond_to?(relation) && spec.links.public_send(relation)

    related = realize(spec.links.public_send(relation))
    related&.links&.public_send(collection)&.each { |version| parse_and_save(version) }
  end

  @mutex.synchronize do
    local_errors.each { |key, flag| @errors[key] &&= flag }
  end
end

#file_name(id) ⇒ String

Generate file name

Parameters:

  • id (String)

    document id

Returns:

  • (String)

    file name



130
131
132
133
# File 'lib/relaton/w3c/data_fetcher.rb', line 130

# Builds the output path for a document id.
#
# A leading "W3C " prefix is dropped, whitespace and the characters , : / +
# become underscores, runs of underscores collapse to one, and the result is
# lower-cased. The path is that name plus the @ext extension inside @output.
#
# @param id [String] document id
# @return [String] file name
def file_name(id)
  stem = id.sub(/^W3C\s/, "")
  stem = stem.gsub(/[\s,:\/+]/, "_")
  stem = stem.squeeze("_").downcase
  File.join(@output, "#{stem}.#{@ext}")
end

#index ⇒ Object



27
28
29
# File 'lib/relaton/w3c/data_fetcher.rb', line 27

# Lazily obtained W3C index, memoized in @index. On first use it is looked up
# (or created) via Relaton::Index with the file name "#{INDEXFILE}.yaml".
#
# @return [Object] the value returned by Relaton::Index.find_or_create
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:W3C, file: "#{INDEXFILE}.yaml")
end

#log_error(msg) ⇒ Object



31
32
33
# File 'lib/relaton/w3c/data_fetcher.rb', line 31

# Reports an error message by passing it to Util.error.
#
# @param msg [Object] message handed to Util.error unchanged
# @return [Object] whatever Util.error returns
def log_error(msg)
  Util.error(msg)
end

#save_doc(bib, warn_duplicate: true) ⇒ Object

Save document to file

Parameters:



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/relaton/w3c/data_fetcher.rb', line 95

# Writes a document to its output file and, the first time a given file name
# is seen, registers it in the index.
#
# The registration check, the index update and the file write all happen
# inside one mutex section. A file name that was already recorded in @files is
# written again (overwriting the earlier content); a warning is logged for it
# unless warn_duplicate is false.
#
# @param bib [Object, nil] document responding to #docnumber; nil/false is ignored
# @param warn_duplicate [Boolean] whether to warn when the file name was already seen
# @return [Object, nil] nil when bib is nil/false
def save_doc(bib, warn_duplicate: true)
  return unless bib

  path = file_name(bib.docnumber)
  @mutex.synchronize do
    if !@files.include?(path)
      index.add_or_update(PubId.parse(bib.docnumber).to_hash, path)
      @files << path
    elsif warn_duplicate
      Util.warn "File #{path} already exists. Document: #{bib.docnumber}"
    end
    File.write(path, serialize(bib), encoding: "UTF-8")
  end
end

#to_bibxml(bib) ⇒ Object



119
120
121
# File 'lib/relaton/w3c/data_fetcher.rb', line 119

# Serializes a document for BibXML output by calling #to_xml on it with no
# arguments.
#
# NOTE(review): this is the same call as #to_xml minus the `bibdata: true`
# option — confirm that is the intended BibXML rendering.
#
# @param bib [Object] document responding to #to_xml
# @return [Object] result of bib.to_xml
def to_bibxml(bib)
  bib.to_xml
end

#to_xml(bib) ⇒ Object



111
112
113
# File 'lib/relaton/w3c/data_fetcher.rb', line 111

# Serializes a document to XML, passing `bibdata: true` to its #to_xml.
#
# @param bib [Object] document responding to #to_xml
# @return [Object] result of bib.to_xml(bibdata: true)
def to_xml(bib)
  bib.to_xml(bibdata: true)
end

#to_yaml(bib) ⇒ Object



115
116
117
# File 'lib/relaton/w3c/data_fetcher.rb', line 115

# Serializes a document to YAML by delegating to its #to_yaml.
#
# @param bib [Object] document responding to #to_yaml
# @return [Object] result of bib.to_yaml
def to_yaml(bib)
  bib.to_yaml
end