Class: WaybackMachineDownloader
- Inherits: Object
  (ancestry: Object → WaybackMachineDownloader)
- Includes:
- ArchiveAPI, SubdomainProcessor, URLRewrite
- Defined in:
- lib/wayback_machine_downloader.rb
Constant Summary collapse
- VERSION = "2.4.7"
- DEFAULT_TIMEOUT = 30
- MAX_RETRIES = 3
- RETRY_DELAY = 2
- RATE_LIMIT = 0.25
  Delay between requests in seconds
- CONNECTION_POOL_SIZE = 10
- MEMORY_BUFFER_SIZE = 16384
  16KB chunks
- STATE_CDX_FILENAME = ".cdx.json"
- STATE_DB_FILENAME = ".downloaded.txt"
Constants included from URLRewrite
Instance Attribute Summary collapse
-
#all ⇒ Object
Returns the value of attribute all.
-
#all_timestamps ⇒ Object
Returns the value of attribute all_timestamps.
-
#base_url ⇒ Object
Returns the value of attribute base_url.
-
#directory ⇒ Object
Returns the value of attribute directory.
-
#exact_url ⇒ Object
Returns the value of attribute exact_url.
-
#exclude_filter ⇒ Object
Returns the value of attribute exclude_filter.
-
#from_timestamp ⇒ Object
Returns the value of attribute from_timestamp.
-
#keep ⇒ Object
Returns the value of attribute keep.
-
#keep_duplicates ⇒ Object
Returns the value of attribute keep_duplicates.
-
#logger ⇒ Object
Returns the value of attribute logger.
-
#maximum_pages ⇒ Object
Returns the value of attribute maximum_pages.
-
#only_filter ⇒ Object
Returns the value of attribute only_filter.
-
#page_requisites ⇒ Object
Returns the value of attribute page_requisites.
-
#reset ⇒ Object
Returns the value of attribute reset.
-
#rewrite ⇒ Object
Returns the value of attribute rewrite.
-
#snapshot_at ⇒ Object
Returns the value of attribute snapshot_at.
-
#threads_count ⇒ Object
Returns the value of attribute threads_count.
-
#to_timestamp ⇒ Object
Returns the value of attribute to_timestamp.
Instance Method Summary collapse
- #append_to_db(file_id) ⇒ Object
- #backup_name ⇒ Object
- #backup_path ⇒ Object
- #cdx_path ⇒ Object
- #color(text, color_code) ⇒ Object
- #db_path ⇒ Object
- #download_file(file_remote_info, http) ⇒ Object
- #download_files ⇒ Object
- #file_list_by_timestamp ⇒ Object
- #file_queue ⇒ Object
- #get_all_snapshots_to_consider ⇒ Object
-
#get_composite_snapshot_file_list(target_timestamp) ⇒ Object
Get a composite snapshot file list for a specific timestamp.
- #get_file_list_all_timestamps ⇒ Object
- #get_file_list_by_timestamp ⇒ Object
-
#get_file_list_composite_snapshot(target_timestamp) ⇒ Object
Returns a list of files for the composite snapshot.
- #get_file_list_curated ⇒ Object
- #handle_reset ⇒ Object
-
#initialize(params) ⇒ WaybackMachineDownloader
constructor
A new instance of WaybackMachineDownloader.
- #list_files ⇒ Object
- #load_downloaded_ids ⇒ Object
-
#local_path_for_file_id(file_id) ⇒ Object
derive the local filesystem path for a sanitized `file_id` stored in the DB.
- #match_exclude_filter(file_url) ⇒ Object
- #match_only_filter(file_url) ⇒ Object
- #process_page_requisites(file_path, parent_remote_info) ⇒ Object
- #process_single_file(file_remote_info) ⇒ Object
- #processing_files(pool, files_to_process) ⇒ Object
- #rewrite_local_files ⇒ Object
- #rewrite_urls_to_relative(file_path) ⇒ Object
- #structure_dir_path(dir_path) ⇒ Object
-
#submit_download_job(file_remote_info) ⇒ Object
helper to submit jobs and increment the counter.
Methods included from URLRewrite
#rewrite_css_urls, #rewrite_html_attr_urls, #rewrite_js_urls
Methods included from SubdomainProcessor
Methods included from ArchiveAPI
#get_raw_list_from_api, #parameters_for_api
Constructor Details
#initialize(params) ⇒ WaybackMachineDownloader
Returns a new instance of WaybackMachineDownloader.
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
# File 'lib/wayback_machine_downloader.rb', line 149 def initialize params validate_params(params) @base_url = params[:base_url]&.tidy_bytes @exact_url = params[:exact_url] if params[:directory] sanitized_dir = params[:directory].tidy_bytes @directory = File.expand_path(sanitized_dir) else @directory = nil end @all_timestamps = params[:all_timestamps] @from_timestamp = params[:from_timestamp].to_i @to_timestamp = params[:to_timestamp].to_i @only_filter = params[:only_filter] @exclude_filter = params[:exclude_filter] @all = params[:all] @keep_duplicates = params[:keep_duplicates] || false @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 @threads_count = [params[:threads_count].to_i, 1].max @rewritten = params[:rewritten] @reset = params[:reset] @keep = params[:keep] @timeout = params[:timeout] || DEFAULT_TIMEOUT @logger = setup_logger @failed_downloads = Concurrent::Array.new @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE) @db_mutex = Mutex.new @rewrite = params[:rewrite] || false @recursive_subdomains = params[:recursive_subdomains] || false @subdomain_depth = params[:subdomain_depth] || 1 @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil @max_retries = params[:max_retries] ?
params[:max_retries].to_i : MAX_RETRIES @page_requisites = params[:page_requisites] || false @pending_jobs = Concurrent::AtomicFixnum.new(0) # URL for rejecting invalid/unencoded wayback urls @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/ handle_reset end |
Instance Attribute Details
#all ⇒ Object
Returns the value of attribute all.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def all @all end |
#all_timestamps ⇒ Object
Returns the value of attribute all_timestamps.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def all_timestamps @all_timestamps end |
#base_url ⇒ Object
Returns the value of attribute base_url.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def base_url @base_url end |
#directory ⇒ Object
Returns the value of attribute directory.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def directory @directory end |
#exact_url ⇒ Object
Returns the value of attribute exact_url.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def exact_url @exact_url end |
#exclude_filter ⇒ Object
Returns the value of attribute exclude_filter.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def exclude_filter @exclude_filter end |
#from_timestamp ⇒ Object
Returns the value of attribute from_timestamp.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def from_timestamp @from_timestamp end |
#keep ⇒ Object
Returns the value of attribute keep.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def keep @keep end |
#keep_duplicates ⇒ Object
Returns the value of attribute keep_duplicates.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def keep_duplicates @keep_duplicates end |
#logger ⇒ Object
Returns the value of attribute logger.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def logger @logger end |
#maximum_pages ⇒ Object
Returns the value of attribute maximum_pages.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def maximum_pages @maximum_pages end |
#only_filter ⇒ Object
Returns the value of attribute only_filter.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def only_filter @only_filter end |
#page_requisites ⇒ Object
Returns the value of attribute page_requisites.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def page_requisites @page_requisites end |
#reset ⇒ Object
Returns the value of attribute reset.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def reset @reset end |
#rewrite ⇒ Object
Returns the value of attribute rewrite.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def rewrite @rewrite end |
#snapshot_at ⇒ Object
Returns the value of attribute snapshot_at.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def snapshot_at @snapshot_at end |
#threads_count ⇒ Object
Returns the value of attribute threads_count.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def threads_count @threads_count end |
#to_timestamp ⇒ Object
Returns the value of attribute to_timestamp.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def to_timestamp @to_timestamp end |
Instance Method Details
#append_to_db(file_id) ⇒ Object
547 548 549 550 551 552 553 554 555 556 |
# File 'lib/wayback_machine_downloader.rb', line 547 def append_to_db(file_id) @db_mutex.synchronize do begin FileUtils.mkdir_p(File.dirname(db_path)) File.open(db_path, 'a') { |f| f.puts(file_id) } rescue => e @logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}") end end end |
#backup_name ⇒ Object
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
# File 'lib/wayback_machine_downloader.rb', line 190 def backup_name url_to_process = @base_url url_to_process = url_to_process.chomp('/*') if url_to_process&.end_with?('/*') raw = if url_to_process.include?('//') url_to_process.split('/')[2] else url_to_process end # if it looks like a wildcard pattern, normalize to a safe host-ish name if raw&.start_with?('*.') raw = raw.sub(/\A\*\./, 'all-') end # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port) if Gem.win_platform? raw = raw.gsub(/[:*?"<>|]/, '_') raw = raw.gsub(/[ .]+\z/, '') else # still good practice to strip path separators (and maybe '*' for POSIX too) raw = raw.gsub(/[\/:*?"<>|]/, '_') end raw = 'site' if raw.nil? || raw.empty? raw end |
#backup_path ⇒ Object
218 219 220 221 222 223 224 225 226 227 |
# File 'lib/wayback_machine_downloader.rb', line 218 def backup_path if @directory # because @directory is already an absolute path, we just ensure it exists @directory else # ensure the default path is absolute and normalized cwd = Dir.pwd File.(File.join(cwd, 'websites', backup_name)) end end |
#cdx_path ⇒ Object
229 230 231 |
# File 'lib/wayback_machine_downloader.rb', line 229 def cdx_path File.join(backup_path, STATE_CDX_FILENAME) end |
#color(text, color_code) ⇒ Object
1038 1039 1040 1041 1042 |
# File 'lib/wayback_machine_downloader.rb', line 1038 def color(text, color_code) return text if Gem.win_platform? && !ENV['ENABLE_ANSI'] codes = { red: 31, green: 32, yellow: 33, blue: 34, magenta: 35, cyan: 36, white: 37 } "\e[#{codes[color_code]}m#{text}\e[0m" end |
#db_path ⇒ Object
233 234 235 |
# File 'lib/wayback_machine_downloader.rb', line 233 def db_path File.join(backup_path, STATE_DB_FILENAME) end |
#download_file(file_remote_info, http) ⇒ Object
948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 |
# File 'lib/wayback_machine_downloader.rb', line 948 def download_file (file_remote_info, http) current_encoding = "".encoding file_url = file_remote_info[:file_url].encode(current_encoding) file_id = file_remote_info[:file_id] = file_remote_info[:timestamp] # sanitize file_id to ensure it is a valid path component raw_path_elements = file_id.split('/') sanitized_path_elements = raw_path_elements.map do |element| if Gem.win_platform? # for Windows, we need to sanitize path components to avoid invalid characters # this prevents issues with file names that contain characters not allowed in # Windows file systems. See # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions element.gsub(/[:\*?"<>\|\&\=\/\\]/) { |match| '%' + match.ord.to_s(16).upcase } else element end end current_backup_path = backup_path if file_id == "" dir_path = current_backup_path file_path = File.join(dir_path, 'index.html') elsif file_url[-1] == '/' || (sanitized_path_elements.last && !sanitized_path_elements.last.include?('.')) # if file_id is a directory, we treat it as such dir_path = File.join(current_backup_path, *sanitized_path_elements) file_path = File.join(dir_path, 'index.html') else # if file_id is a file, we treat it as such filename = sanitized_path_elements.pop dir_path = File.join(current_backup_path, *sanitized_path_elements) file_path = File.join(dir_path, filename) end # check existence *before* download attempt # this handles cases where a file was created manually or by a previous partial run without a .db entry if File.exist? 
file_path return ["#{color("[EXISTS]", :cyan)} #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", file_path] end begin structure_dir_path dir_path status = download_with_retry(file_path, file_url, , http) case status when :saved if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i rewrite_urls_to_relative(file_path) end return ["#{color("[SAVED]", :green)} #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", file_path] when :skipped_not_found return ["#{color("[NOT FOUND]", :yellow)} #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", nil] else # ideally, this case should not be reached if download_with_retry behaves as expected. # ideally, this case should not be reached if download_with_retry behaves as expected. return ["#{color("[UNKNOWN]", :magenta)} #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", nil] end rescue StandardError => e msg = "#{color("[FAILED]", :red)} #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})" if File.exist?(file_path) and File.size(file_path) == 0 File.delete(file_path) msg += "\n#{file_path} was empty and was removed." end return [msg, nil] end end |
#download_files ⇒ Object
591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 |
# File 'lib/wayback_machine_downloader.rb', line 591 def download_files start_time = Time.now puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives." FileUtils.mkdir_p(backup_path) # Load the list of files to potentially download files_to_download = if files_to_download.empty? puts "No files found matching criteria." cleanup return end total_files = files_to_download.count puts "#{total_files} files found matching criteria." # Load IDs of already downloaded files downloaded_ids = load_downloaded_ids # We use a thread-safe Set to track what we have queued/downloaded in this session # to avoid infinite loops with page requisites @session_downloaded_ids = Concurrent::Set.new downloaded_ids.each { |id| @session_downloaded_ids.add(id) } files_to_process = files_to_download.reject do |file_info| downloaded_ids.include?(file_info[:file_id]) end remaining_count = files_to_process.count skipped_count = total_files - remaining_count if skipped_count > 0 puts "Found #{skipped_count} previously downloaded files, skipping them." end if remaining_count == 0 && !@page_requisites puts "All matching files have already been downloaded." cleanup return end puts "#{remaining_count} files to download." @processed_file_count = 0 @total_to_download = remaining_count @download_mutex = Mutex.new thread_count = [@threads_count, CONNECTION_POOL_SIZE].min @worker_pool = Concurrent::FixedThreadPool.new(thread_count) # initial batch files_to_process.each do |file_remote_info| @session_downloaded_ids.add(file_remote_info[:file_id]) submit_download_job(file_remote_info) end # print a header for the download phase puts "\n#{color("Processing downloads:", :white)}" $stdout.flush # wait for all jobs to finish loop do sleep 0.5 break if @pending_jobs.value == 0 end @worker_pool.shutdown @worker_pool.wait_for_termination end_time = Time.now puts "\nDownload finished in #{(end_time - start_time).round(2)}s." 
# process subdomains if enabled if @recursive_subdomains subdomain_start_time = Time.now process_subdomains subdomain_end_time = Time.now subdomain_time = (subdomain_end_time - subdomain_start_time).round(2) puts "Subdomain processing finished in #{subdomain_time}s." end puts "Results saved in #{backup_path}" cleanup end |
#file_list_by_timestamp ⇒ Object
1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 |
# File 'lib/wayback_machine_downloader.rb', line 1048 def file_list_by_timestamp if @snapshot_at @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at) elsif @all_timestamps file_list_curated = get_file_list_all_timestamps file_list_curated.map do |file_remote_info| file_remote_info[1][:file_id] = file_remote_info[0] file_remote_info[1] end else file_list_curated = get_file_list_curated file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse file_list_curated.map do |file_remote_info| file_remote_info[1][:file_id] = file_remote_info[0] file_remote_info[1] end end end |
#file_queue ⇒ Object
1044 1045 1046 |
# File 'lib/wayback_machine_downloader.rb', line 1044 def file_queue @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info } end |
#get_all_snapshots_to_consider ⇒ Object
272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 |
# File 'lib/wayback_machine_downloader.rb', line 272 def get_all_snapshots_to_consider if File.exist?(cdx_path) && !@reset puts "Loading snapshot list from #{cdx_path}" begin snapshot_list_to_consider = JSON.parse(File.read(cdx_path)) puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache." puts return Concurrent::Array.new(snapshot_list_to_consider) rescue JSON::ParserError => e puts "Error reading snapshot cache file #{cdx_path}: #{e.}. Refetching..." FileUtils.rm_f(cdx_path) rescue => e puts "Error loading snapshot cache #{cdx_path}: #{e.}. Refetching..." FileUtils.rm_f(cdx_path) end end snapshot_list_to_consider = Concurrent::Array.new mutex = Mutex.new # if snapshot_at is set, limit CDX queries to snapshots at or before that timestamp original_to = @to_timestamp if @snapshot_at @to_timestamp = @snapshot_at end puts "Getting snapshot pages from Wayback Machine API..." # Fetch the initial set of snapshots, sequentially @connection_pool.with_connection do |connection| initial_list = get_raw_list_from_api(@base_url, 0, connection) initial_list ||= [] mutex.synchronize do snapshot_list_to_consider.concat(initial_list) print "." $stdout.flush end end # Fetch additional pages if the exact URL flag is not set and the first page wasn't empty unless @exact_url || snapshot_list_to_consider.empty? 
page_index = 1 batch_size = [@threads_count, 5].min continue_fetching = true fetch_pool = Concurrent::FixedThreadPool.new([@threads_count, 1].max) begin while continue_fetching && page_index < @maximum_pages # Determine the range of pages to fetch in this batch end_index = [page_index + batch_size, @maximum_pages].min current_batch = (page_index...end_index).to_a # Create futures for concurrent API calls futures = current_batch.map do |page| Concurrent::Future.execute(executor: fetch_pool) do result = nil @connection_pool.with_connection do |connection| result = get_raw_list_from_api(@base_url, page, connection) end result ||= [] [page, result] end end results = [] futures.each do |future| begin val = future.value # only append if valid if val && val.is_a?(Array) && val.first.is_a?(Integer) results << val end rescue => e puts "\nError fetching page #{future}: #{e.}" end end # Sort results by page number to maintain order results.sort_by! { |page, _| page } # Process results and check for empty pages results.each do |page, result| if result.nil? || result.empty? continue_fetching = false break else mutex.synchronize do snapshot_list_to_consider.concat(result) print "." $stdout.flush end end end page_index = end_index sleep(RATE_LIMIT) if continue_fetching end ensure fetch_pool.shutdown fetch_pool.wait_for_termination end end puts " found #{snapshot_list_to_consider.length} snapshots." # save the fetched list to the cache file begin FileUtils.mkdir_p(File.dirname(cdx_path)) File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON puts "Saved snapshot list to #{cdx_path}" rescue => e puts "Error saving snapshot cache to #{cdx_path}: #{e.}" ensure # restore any previously set to-timestamp @to_timestamp = original_to end puts snapshot_list_to_consider end |
#get_composite_snapshot_file_list(target_timestamp) ⇒ Object
Get a composite snapshot file list for a specific timestamp
395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 |
# File 'lib/wayback_machine_downloader.rb', line 395 def get_composite_snapshot_file_list(target_timestamp) file_versions = {} get_all_snapshots_to_consider.each do |timestamp, file_url| next unless file_url.include?('/') next if timestamp.to_i > target_timestamp # allow empty path by treating missing tail as empty string raw_tail = file_url.split('/')[3..-1]&.join('/') || '' file_id = sanitize_and_prepare_id(raw_tail, file_url) next if file_id.nil? next if match_exclude_filter(file_url) next unless match_only_filter(file_url) if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < timestamp.to_i file_versions[file_id] = { file_url: file_url, timestamp: timestamp, file_id: file_id } end end file_versions.values end |
#get_file_list_all_timestamps ⇒ Object
453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 |
# File 'lib/wayback_machine_downloader.rb', line 453 def get_file_list_all_timestamps file_list_curated = Hash.new get_all_snapshots_to_consider.each do |timestamp, file_url| next unless file_url.include?('/') raw_tail = file_url.split('/')[3..-1]&.join('/') || '' file_id = sanitize_and_prepare_id(raw_tail, file_url) if file_id.nil? puts "Malformed file url, ignoring: #{file_url}" next end file_id_and_timestamp = [timestamp, file_id].join('/') file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp, file_url) if file_id_and_timestamp.nil? puts "Malformed file id/timestamp combo, ignoring: #{file_url}" next end if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>') puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}" else if match_exclude_filter(file_url) puts "File url matches exclude filter, ignoring: #{file_url}" elsif !match_only_filter(file_url) puts "File url doesn't match only filter, ignoring: #{file_url}" elsif file_list_curated[file_id_and_timestamp] # duplicate combo, ignore silently (verbose flag not shown here) else file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: timestamp } end end end puts "file_list_curated: " + file_list_curated.count.to_s file_list_curated end |
#get_file_list_by_timestamp ⇒ Object
491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 |
# File 'lib/wayback_machine_downloader.rb', line 491 def get_file_list_by_timestamp if @snapshot_at @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at) elsif @all_timestamps file_list_curated = get_file_list_all_timestamps file_list_curated.map do |file_remote_info| file_remote_info[1][:file_id] = file_remote_info[0] file_remote_info[1] end else file_list_curated = get_file_list_curated file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse file_list_curated.map do |file_remote_info| file_remote_info[1][:file_id] = file_remote_info[0] file_remote_info[1] end end end |
#get_file_list_composite_snapshot(target_timestamp) ⇒ Object
Returns a list of files for the composite snapshot
416 417 418 419 420 |
# File 'lib/wayback_machine_downloader.rb', line 416 def get_file_list_composite_snapshot(target_timestamp) file_list = get_composite_snapshot_file_list(target_timestamp) # return a list sorted newest->oldest by timestamp file_list.sort_by { |v| v[:timestamp].to_s }.reverse end |
#get_file_list_curated ⇒ Object
422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 |
# File 'lib/wayback_machine_downloader.rb', line 422 def get_file_list_curated file_list_curated = Hash.new get_all_snapshots_to_consider.each do |timestamp, file_url| next unless file_url.include?('/') raw_tail = file_url.split('/')[3..-1]&.join('/') || '' file_id = sanitize_and_prepare_id(raw_tail, file_url) if file_id.nil? puts "Malformed file url, ignoring: #{file_url}" next end if file_id.include?('<') || file_id.include?('>') puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}" else if match_exclude_filter(file_url) puts "File url matches exclude filter, ignoring: #{file_url}" elsif !match_only_filter(file_url) puts "File url doesn't match only filter, ignoring: #{file_url}" elsif file_list_curated[file_id] unless file_list_curated[file_id][:timestamp] > timestamp file_list_curated[file_id] = { file_url: file_url, timestamp: timestamp } end else file_list_curated[file_id] = { file_url: file_url, timestamp: timestamp } end end end file_list_curated end |
#handle_reset ⇒ Object
237 238 239 240 241 242 243 244 |
# File 'lib/wayback_machine_downloader.rb', line 237 def handle_reset if @reset puts "Resetting download state..." FileUtils.rm_f(cdx_path) FileUtils.rm_f(db_path) puts "Removed state files: #{cdx_path}, #{db_path}" end end |
#list_files ⇒ Object
510 511 512 513 514 515 516 517 518 519 520 521 522 |
# File 'lib/wayback_machine_downloader.rb', line 510 def list_files # retrieval produces its own output @orig_stdout = $stdout $stdout = $stderr files = get_file_list_by_timestamp $stdout = @orig_stdout puts "[" files[0...-1].each do |file| puts file.to_json + "," end puts files[-1].to_json puts "]" end |
#load_downloaded_ids ⇒ Object
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 |
# File 'lib/wayback_machine_downloader.rb', line 524 def load_downloaded_ids downloaded_ids = Set.new if File.exist?(db_path) && !@reset puts "Loading list of already downloaded files from #{db_path}" begin File.foreach(db_path) do |line| id = line.strip # only trust DB entries that actually exist on disk; this helps when resuming path = local_path_for_file_id(id) if path && File.exist?(path) downloaded_ids.add(id) else puts "Found DB entry but file missing, will requeue: #{id}" if @logger && @logger.level == Logger::DEBUG end end rescue => e puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded." downloaded_ids.clear end end downloaded_ids end |
#local_path_for_file_id(file_id) ⇒ Object
derive the local filesystem path for a sanitized ‘file_id` stored in the DB
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 |
# File 'lib/wayback_machine_downloader.rb', line 1018 def local_path_for_file_id(file_id) return nil if file_id.nil? current_backup_path = backup_path # file_id coming from DB is expected to already be sanitized raw_path_elements = file_id.split('/') if file_id == "" dir_path = current_backup_path return File.join(dir_path, 'index.html') elsif file_id[-1] == '/' || (raw_path_elements.last && !raw_path_elements.last.include?('.')) dir_path = File.join(current_backup_path, *raw_path_elements) return File.join(dir_path, 'index.html') else filename = raw_path_elements.pop dir_path = File.join(current_backup_path, *raw_path_elements) return File.join(dir_path, filename) end end |
#match_exclude_filter(file_url) ⇒ Object
259 260 261 262 263 264 265 266 267 268 269 270 |
# File 'lib/wayback_machine_downloader.rb', line 259 def match_exclude_filter file_url if @exclude_filter exclude_filter_regex = @exclude_filter.to_regex(detect: true) if exclude_filter_regex exclude_filter_regex =~ file_url else file_url.downcase.include? @exclude_filter.downcase end else false end end |
#match_only_filter(file_url) ⇒ Object
246 247 248 249 250 251 252 253 254 255 256 257 |
# File 'lib/wayback_machine_downloader.rb', line 246 def match_only_filter file_url if @only_filter only_filter_regex = @only_filter.to_regex(detect: true) if only_filter_regex only_filter_regex =~ file_url else file_url.downcase.include? @only_filter.downcase end else true end end |
#process_page_requisites(file_path, parent_remote_info) ⇒ Object
736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 |
# File 'lib/wayback_machine_downloader.rb', line 736 def process_page_requisites(file_path, parent_remote_info) return unless File.exist?(file_path) content = File.read(file_path) content = content.force_encoding('UTF-8').scrub assets = PageRequisites.extract(content) # prepare base URI for resolving relative paths parent_raw = parent_remote_info[:file_url] parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//) begin base_uri = URI(parent_raw) # calculate the "root" host of the site we are downloading to compare later current_project_host = URI("http://" + @base_url.gsub(%r{^https?://}, '')).host rescue URI::InvalidURIError return end = parent_remote_info[:timestamp] assets.each do |asset_rel_url| begin # resolve full URL (handles relative paths like "../img/logo.png") resolved_uri = base_uri + asset_rel_url # detect if the asset URL is already a Wayback "web/<timestamp>/.../https://..." embed = if resolved_uri.path =~ %r{\A/web/([0-9]{4,})[^/]*/(https?://.+)\z} = $1 begin orig_uri = URI($2) resolved_uri = orig_uri = .to_i rescue URI::InvalidURIError # fall back to original resolved_uri and parent timestamp end end # filter out navigation links (pages) vs assets # skip if extension is empty or looks like an HTML page path = resolved_uri.path ext = File.extname(path).downcase if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext) next end # construct the original URL to query the Wayback API asset_wbm_url = if resolved_uri.scheme "#{resolved_uri.scheme}://#{resolved_uri.host}#{resolved_uri.path}" else "#{resolved_uri.host}#{resolved_uri.path}" end asset_wbm_url += "?#{resolved_uri.query}" if resolved_uri.query # attempt to find the best snapshot timestamp for this asset to increase hit rate begin @connection_pool.with_connection do |connection| snapshots = get_raw_list_from_api(asset_wbm_url, 0, connection) if snapshots && snapshots.any? 
# choose the most recent snapshot at or before the # parent page timestamp, else the latest available chosen = snapshots.select { |ts, _| ts.to_i <= .to_i }.max_by { |s| s[0].to_i } || snapshots.max_by { |s| s[0].to_i } = chosen[0].to_i if chosen end end rescue => e @logger.warn("Failed to query CDX for #{asset_wbm_url}: #{e.}") if @logger end # construct the local file ID # if the asset is on the SAME domain, strip the domain from the folder path # if it's on a DIFFERENT domain (e.g. cdn.jquery.com), keep the domain folder if resolved_uri.host == current_project_host # e.g. /static/css/style.css asset_file_id = resolved_uri.path asset_file_id = asset_file_id[1..-1] if asset_file_id.start_with?('/') else # e.g. cdn.google.com/jquery.js asset_file_id = asset_wbm_url end rescue URI::InvalidURIError, StandardError next end # sanitize and queue asset_id = sanitize_and_prepare_id(asset_file_id, asset_wbm_url) unless @session_downloaded_ids.include?(asset_id) @session_downloaded_ids.add(asset_id) new_file_info = { file_url: asset_wbm_url, timestamp: , file_id: asset_id } @download_mutex.synchronize do @total_to_download += 1 puts "Queued requisite: #{asset_file_id} (#{asset_wbm_url} @ #{})" end submit_download_job(new_file_info) end end end |
#process_single_file(file_remote_info) ⇒ Object
690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 |
# File 'lib/wayback_machine_downloader.rb', line 690 def process_single_file(file_remote_info) download_success = false downloaded_path = nil # fast-path for resumed runs: if file already exists locally, avoid HTTP work entirely existing_path = local_path_for_file_id(file_remote_info[:file_id]) if existing_path && File.exist?(existing_path) = "#{color("[EXISTS]", :cyan)} #{file_remote_info[:file_url]} (#{@processed_file_count + 1}/#{@total_to_download})" @download_mutex.synchronize do @processed_file_count += 1 if @processed_file_count < @total_to_download puts end append_to_db(file_remote_info[:file_id]) if @page_requisites && File.extname(existing_path) =~ /\.(html?|php|asp|aspx|jsp)$/i process_page_requisites(existing_path, file_remote_info) end return end @connection_pool.with_connection do |connection| , downloaded_path = download_file(file_remote_info, connection) if downloaded_path && File.exist?(downloaded_path) download_success = true end @download_mutex.synchronize do @processed_file_count += 1 if @processed_file_count < @total_to_download # only print if it's a "User" file or a requisite we found puts if end end if download_success append_to_db(file_remote_info[:file_id]) if @page_requisites && downloaded_path && File.extname(downloaded_path) =~ /\.(html?|php|asp|aspx|jsp)$/i process_page_requisites(downloaded_path, file_remote_info) end end rescue => e @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.}") end |
#processing_files(pool, files_to_process) ⇒ Object
558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 |
# File 'lib/wayback_machine_downloader.rb', line 558 def processing_files(pool, files_to_process) files_to_process.each do |file_remote_info| pool.post do download_success = false begin @connection_pool.with_connection do |connection| , downloaded_path = download_file(file_remote_info, connection) # consider the download successful if we have a downloaded path present if downloaded_path && File.exist?(downloaded_path) download_success = true end @download_mutex.synchronize do @processed_file_count += 1 # adjust progress message to reflect remaining files = .sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if puts if end end # sppend to DB only after successful download outside the connection block if download_success append_to_db(file_remote_info[:file_id]) end rescue => e @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.}") @download_mutex.synchronize do @processed_file_count += 1 end end sleep(RATE_LIMIT) end end end |
#rewrite_local_files ⇒ Object
881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 |
# File 'lib/wayback_machine_downloader.rb', line 881 def rewrite_local_files puts "Scanning #{backup_path} for files to rewrite..." files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js,php,asp,aspx,jsp}")) puts "Found #{files.size} files. Rewriting links for local browsing..." pool = Concurrent::FixedThreadPool.new(@threads_count) progress = Concurrent::AtomicFixnum.new(0) files.each do |file_path| pool.post do rewrite_urls_to_relative(file_path) current = progress.increment print "\rProgress: #{current}/#{files.size}" if current % 100 == 0 end end pool.shutdown pool.wait_for_termination puts "\nFinished rewriting all files." end |
#rewrite_urls_to_relative(file_path) ⇒ Object
903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 |
# File 'lib/wayback_machine_downloader.rb', line 903 def rewrite_urls_to_relative(file_path) return unless File.exist?(file_path) file_ext = File.extname(file_path).downcase begin content = File.binread(file_path) # detect encoding for HTML files if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp' encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8' content.force_encoding(encoding) rescue content.force_encoding('UTF-8') else content.force_encoding('UTF-8') end # URLs in HTML attributes content = rewrite_html_attr_urls(content) # URLs in CSS content = rewrite_css_urls(content) # URLs in JavaScript content = rewrite_js_urls(content) # for URLs that start with a single slash, make them relative content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do prefix, path, suffix = $1, $2, $3 "#{prefix}./#{path}#{suffix}" end # for URLs in CSS that start with a single slash, make them relative content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do path = $1 "url(\"./#{path}\")" end # save the modified content back to the file File.binwrite(file_path, content) puts "Rewrote URLs in #{file_path} to be relative." rescue Errno::ENOENT => e @logger.warn("Error reading file #{file_path}: #{e.}") end end |
#structure_dir_path(dir_path) ⇒ Object
845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 |
# File 'lib/wayback_machine_downloader.rb', line 845 def structure_dir_path dir_path begin # check if it's already a directory; if not, try to create it FileUtils::mkdir_p dir_path unless File.directory? dir_path rescue Errno::EEXIST, Errno::ENOTDIR => e file_already_existing = nil check_path = dir_path # walk up the path to find the specific file that is blocking directory creation while check_path != "." && check_path != "/" if File.exist?(check_path) && !File.directory?(check_path) file_already_existing = check_path break end parent = File.dirname(check_path) break if parent == check_path check_path = parent end if file_already_existing file_already_existing_temporary = file_already_existing + '.temp' file_already_existing_permanent = file_already_existing + '/index.html' FileUtils::mv file_already_existing, file_already_existing_temporary FileUtils::mkdir_p file_already_existing FileUtils::mv file_already_existing_temporary, file_already_existing_permanent puts "#{file_already_existing} -> #{file_already_existing_permanent}" # retry the directory creation now that the path is clear structure_dir_path dir_path else raise "Unhandled directory restructure error: #{e.}" end end end |
#submit_download_job(file_remote_info) ⇒ Object
helper to submit jobs and increment the counter
679 680 681 682 683 684 685 686 687 688 |
# File 'lib/wayback_machine_downloader.rb', line 679 def submit_download_job(file_remote_info) @pending_jobs.increment @worker_pool.post do begin process_single_file(file_remote_info) ensure @pending_jobs.decrement end end end |