Class: Relaton::W3c::DataFetcher

Inherits:
Core::DataFetcher
  • Object
show all
Includes:
SafeRealize
Defined in:
lib/relaton/w3c/data_fetcher.rb

Constant Summary collapse

DEFAULT_CONCURRENCY =
8

Class Method Summary collapse

Instance Method Summary collapse

Methods included from SafeRealize

#realize, skipped

Constructor Details

#initialize(*args) ⇒ DataFetcher

Returns a new instance of DataFetcher.



34
35
36
37
38
# File 'lib/relaton/w3c/data_fetcher.rb', line 34

# Build the fetcher, forwarding every argument untouched to the superclass
# constructor, then set up the state used by the threaded crawl.
def initialize(*args)
  super
  # Serializes access to state shared between threads (the error map merged
  # in #fetch_spec and the file list / index / writes in #save_doc).
  @mutex = Mutex.new
  # Read by #enqueue_specs and #fetch to stop early and to report a partial
  # run. NOTE(review): the code that flips this to true is not in this
  # section — presumably the interrupt handler; confirm.
  @interrupted = false
end

Class Method Details

.concurrency ⇒ Object

Number of fetch_spec worker threads. Tunable via env var so CI or local runs can dial it down (e.g. for debugging or to lighten load on api.w3.org).



18
19
20
# File 'lib/relaton/w3c/data_fetcher.rb', line 18

# Number of fetch_spec worker threads.
#
# Read from the RELATON_W3C_FETCH_CONCURRENCY environment variable (parsed
# leniently with String#to_i). Falls back to DEFAULT_CONCURRENCY when the
# variable is unset, empty, non-numeric, zero or negative: a non-positive
# count would start no workers and make the sized work queue in #fetch
# impossible to construct.
#
# @return [Integer] a strictly positive worker count
def self.concurrency
  requested = ENV["RELATON_W3C_FETCH_CONCURRENCY"].to_i # nil.to_i and "".to_i are 0
  requested.positive? ? requested : DEFAULT_CONCURRENCY
end

.fetch_versions? ⇒ Boolean

Whether to crawl each specification’s version history (version_history, predecessor_versions, successor_versions). Enabled by default for a complete dataset. Set RELATON_W3C_FETCH_VERSIONS=false for a faster, shallower crawl that emits only the top-level specifications and skips the per-spec version fan-out (the bulk of the API requests).

Returns:

  • (Boolean)


27
28
29
30
31
32
# File 'lib/relaton/w3c/data_fetcher.rb', line 27

# Whether the per-specification version crawl is enabled.
#
# Controlled by the RELATON_W3C_FETCH_VERSIONS environment variable. The
# crawl is on unless the value, ignoring surrounding whitespace and letter
# case, is one of "0", "false", "no" or "off". An unset or blank variable
# leaves it on.
#
# @return [Boolean]
def self.fetch_versions?
  case ENV["RELATON_W3C_FETCH_VERSIONS"].to_s.strip.downcase
  when "0", "false", "no", "off" then false
  else true
  end
end

Instance Method Details

#client ⇒ Object



48
49
50
# File 'lib/relaton/w3c/data_fetcher.rb', line 48

# Memoized W3cApi::Client: created on first call and reused afterwards.
#
# @return [W3cApi::Client]
def client
  @client ||= W3cApi::Client.new
end

#enqueue_specs(queue) ⇒ Object

Page through the specifications index, feeding each spec (paired with its embedded page) to the worker queue. Returns early when interrupted.

embed: true inlines each specification’s full payload into the index page’s `_embedded` block, so a spec link realizes from that page in memory instead of making its own HTTP request — one request per page rather than one per specification. The page is queued alongside each link so the worker can hand it back to realize as the parent_resource.



88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/relaton/w3c/data_fetcher.rb', line 88

# Walk the paginated specifications index and push one `[spec_link, page]`
# pair per specification onto +queue+. Stops as soon as @interrupted is set,
# when the current page reports no next page, or when the next page cannot
# be fetched.
#
# @param queue [#<<] destination for the `[spec_link, page]` pairs
# @return [nil]
def enqueue_specs(queue)
  current = client.specifications(embed: true)
  loop do
    current.links.specifications.each do |link|
      break if @interrupted

      # The owning page travels with the link so the consumer can pass it
      # back as the parent resource when realizing the link.
      queue << [link, current]
    end
    break if @interrupted
    break unless current.next?

    # Later pages are requested by page number through the fetch helper
    # instead of realizing the `next` link: only the fetch path populates the
    # page's embedded data, and without it every specification on later pages
    # would need its own HTTP request.
    current = fetch_specifications_page(current.page + 1)
    break unless current
  end
end

#fetch(_source = nil) ⇒ Object

Parse documents in parallel. The crawler is heavily I/O-bound on api.w3.org round-trips (~30-50k requests per run), so a small thread pool gives a near-linear speedup. Pagination still happens serially: each page’s `next?` flag gates whether the next page is requested.

A SIGINT (Ctrl-C) is handled gracefully: the producer stops queuing and the workers stop processing after their in-flight spec, then the index of everything fetched so far is saved rather than the run being lost.



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/relaton/w3c/data_fetcher.rb', line 62

# Run the crawl: start the worker threads, feed them every specification
# from the index, wait for them to finish, save the index and report errors.
#
# @param _source [Object, nil] unused
# @return [Object] whatever #report_errors returns
def fetch(_source = nil)
  pool_size = self.class.concurrency
  # Bounded queue (4 slots per worker), so the producer blocks instead of
  # running arbitrarily far ahead of the workers.
  queue = SizedQueue.new(pool_size * 4)
  workers = (1..pool_size).map { spawn_worker(queue) }

  with_interrupt_handler do
    enqueue_specs(queue)
    # One nil sentinel per worker marks the end of the work.
    workers.each { queue.push(nil) }
    workers.each { |thread| thread.join }
    if @interrupted
      Util.warn "Crawl interrupted — saving progress collected so far."
    end
    index.save
  end

  report_errors
end

#fetch_spec(unrealized_spec, page = nil) ⇒ Object



111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/relaton/w3c/data_fetcher.rb', line 111

# Realize one specification link, parse and save the resulting document,
# optionally crawl its versions, then merge this spec's error flags into
# the shared @errors map.
#
# @param unrealized_spec [Object] specification link to realize
# @param page [Object, nil] index page the link came from; passed to
#   realize as the parent resource so a page fetched with embed: true can
#   serve the spec from its embedded data without an HTTP request
# @return [Object, nil] nil when the link could not be realized
def fetch_spec(unrealized_spec, page = nil)
  realized = realize(unrealized_spec, parent_resource: page)
  return unless realized

  # Flags are collected per spec and merged under the mutex at the end, so
  # parsing itself never touches the shared map.
  spec_errors = Hash.new(true)
  parsed = DataParser.parse(realized, spec_errors)
  save_doc(parsed)

  fetch_versions(realized) if self.class.fetch_versions?

  @mutex.synchronize do
    spec_errors.each { |key, flag| @errors[key] &&= flag }
  end
end

#fetch_versions(spec) ⇒ Object

Crawl a specification’s version history: its dated editions plus the predecessor/successor version chains. Each entry is a separate HTTP request, so this is the bulk of a run and can be skipped via RELATON_W3C_FETCH_VERSIONS=false (see .fetch_versions?).



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# File 'lib/relaton/w3c/data_fetcher.rb', line 131

# Crawl the version-related links of +spec+ and parse/save every version
# found. Three links are followed, in this order, each only when
# `spec.links` responds to it and it is set:
#
# * version_history      -> entries under links.spec_versions
# * predecessor_versions -> entries under links.predecessor_versions
# * successor_versions   -> entries under links.successor_versions
#
# @param spec [Object] realized specification exposing #links
# @return [Object, nil] result of iterating the successor versions, or nil
#   when that link is absent or could not be realized
def fetch_versions(spec)
  crawl = lambda do |link_name, collection_name|
    next unless spec.links.respond_to?(link_name) && spec.links.public_send(link_name)

    listing = realize spec.links.public_send(link_name)
    listing&.links&.public_send(collection_name)&.each { |version| parse_and_save version }
  end

  crawl.call(:version_history, :spec_versions)
  crawl.call(:predecessor_versions, :predecessor_versions)
  crawl.call(:successor_versions, :successor_versions)
end

#file_name(id) ⇒ String

Generate file name

Parameters:

  • id (String)

    document id

Returns:

  • (String)

    file name



188
189
190
191
# File 'lib/relaton/w3c/data_fetcher.rb', line 188

# Build the output path for a document id.
#
# A leading "W3C " prefix is dropped; whitespace, commas, colons, slashes
# and plus signs become underscores; runs of underscores collapse to one;
# the result is lower-cased and given the @ext extension under @output.
#
# @param id [String] document id
# @return [String] file path
def file_name(id)
  stem = id.sub(/^W3C\s/, "")
  stem = stem.gsub(/[\s,:\/+]/, "_")
  stem = stem.squeeze("_").downcase
  File.join(@output, "#{stem}.#{@ext}")
end

#index ⇒ Object



40
41
42
# File 'lib/relaton/w3c/data_fetcher.rb', line 40

# Memoized W3C index, obtained once from Relaton::Index.find_or_create and
# backed by the file "#{INDEXFILE}.yaml".
#
# @return [Object] the index returned by Relaton::Index.find_or_create
def index
  @index ||= Relaton::Index.find_or_create(:W3C, file: "#{INDEXFILE}.yaml")
end

#log_error(msg) ⇒ Object



44
45
46
# File 'lib/relaton/w3c/data_fetcher.rb', line 44

# Report an error message by forwarding it to Util.error.
#
# @param msg [Object] message to report (presumably a String)
def log_error(msg)
  Util.error msg
end

#save_doc(bib, warn_duplicate: true) ⇒ Object

Save document to file

Parameters:



153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/relaton/w3c/data_fetcher.rb', line 153

# Write a document to its output file and register it in the index.
#
# The first time a file path is seen, the document's parsed id is added to
# the index and the path is remembered. On a repeat path a warning is
# emitted (unless +warn_duplicate+ is false). In both cases the file is
# (re)written. Everything after the path computation runs under @mutex.
#
# @param bib [Object, nil] document to save; nothing happens when falsy
# @param warn_duplicate [Boolean] warn when the target path was already
#   written during this run
# @return [Object, nil]
def save_doc(bib, warn_duplicate: true)
  return unless bib

  path = file_name(bib.docnumber)
  @mutex.synchronize do
    if !@files.include?(path)
      parsed_id = PubId.parse(bib.docnumber)
      index.add_or_update(parsed_id.to_hash, path)
      @files << path
    elsif warn_duplicate
      Util.warn "File #{path} already exists. Document: #{bib.docnumber}"
    end
    # Written even for a duplicate path: the later document overwrites.
    File.write(path, serialize(bib), encoding: "UTF-8")
  end
end

#to_bibxml(bib) ⇒ Object



177
178
179
# File 'lib/relaton/w3c/data_fetcher.rb', line 177

# Serialize +bib+ by calling its #to_xml with no options.
# NOTE(review): this is the plain #to_xml output, with no BibXML-specific
# option passed — confirm that is the intended rendering for this format.
#
# @param bib [#to_xml] document to serialize
# @return [Object] whatever bib.to_xml returns (presumably an XML String)
def to_bibxml(bib)
  bib.to_xml
end

#to_xml(bib) ⇒ Object



169
170
171
# File 'lib/relaton/w3c/data_fetcher.rb', line 169

# Serialize +bib+ by calling its #to_xml with bibdata: true.
#
# @param bib [#to_xml] document to serialize
# @return [Object] whatever bib.to_xml returns (presumably an XML String)
def to_xml(bib)
  bib.to_xml(bibdata: true)
end

#to_yaml(bib) ⇒ Object



173
174
175
# File 'lib/relaton/w3c/data_fetcher.rb', line 173

# Serialize +bib+ by calling its #to_yaml.
#
# @param bib [#to_yaml] document to serialize
# @return [Object] whatever bib.to_yaml returns (presumably a YAML String)
def to_yaml(bib)
  bib.to_yaml
end