Module: SubdomainProcessor

Included in:
WaybackMachineDownloader
Defined in:
lib/wayback_machine_downloader/subdom_processor.rb

Instance Method Summary

Instance Method Details

#process_subdomains ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/wayback_machine_downloader/subdom_processor.rb', line 4

# Finds subdomains referenced by the already-downloaded files and hands
# them off for download.
#
# Does nothing unless @recursive_subdomains is set. Otherwise it:
#   1. resets @processed_domains (seeded with the base domain derived from
#      @base_url) and @subdomain_queue,
#   2. collects every .html/.htm/.css/.js file under backup_path,
#   3. passes those files to scan_files_for_subdomains,
#   4. enqueues "https://<subdomain>.<base_domain>/" for each result,
#   5. calls download_subdomains, then rewrite_subdomain_links when
#      @rewrite is set.
#
# The return value is incidental (nil on both early exits); the method is
# used for its side effects.
def process_subdomains
  return unless @recursive_subdomains

  puts "Starting subdomain processing..."

  # extract base domain from the URL for comparison
  base_domain = extract_base_domain(@base_url)
  @processed_domains = Set.new([base_domain])
  @subdomain_queue = Queue.new

  # scan downloaded files for subdomain links; the directory goes through
  # `base:` instead of being spliced into the pattern so that glob
  # metacharacters in backup_path ("[", "{", "*", "?") are taken literally
  root = backup_path
  initial_files = Dir.glob("**/*.{html,htm,css,js}", base: root)
                     .map { |relative_path| File.join(root, relative_path) }
  puts "Scanning #{initial_files.size} downloaded files for subdomain links..."

  subdomains_found = scan_files_for_subdomains(initial_files, base_domain)

  if subdomains_found.empty?
    puts "No subdomains found in downloaded content."
    return
  end

  puts "Found #{subdomains_found.size} subdomains to process: #{subdomains_found.join(', ')}"

  # add found subdomains to the queue
  subdomains_found.each do |subdomain|
    @subdomain_queue << "https://#{subdomain}.#{base_domain}/"
  end

  # process the subdomain queue
  download_subdomains(base_domain)

  # after all downloads, rewrite all URLs to make local references
  rewrite_subdomain_links(base_domain) if @rewrite
end