Class: Archaeo::SubdomainDiscovery

Inherits:
Object
  • Object
show all
Defined in:
lib/archaeo/subdomain_discovery.rb

Overview

Discovers subdomains from downloaded content.

Scans HTML, CSS, and JavaScript files for links to subdomains of a base domain, enabling recursive archival.

Constant Summary collapse

# Two-label suffixes that are treated as a single TLD. #base_domain keeps one
# extra label in front of these (e.g. "example.co.uk" rather than "co.uk").
MULTI_PART_TLDS =
%w[
  co.uk com.au co.jp co.nz co.za com.br com.mx
  com.sg co.in co.kr com.tw com.hk org.uk ac.uk
  co.il com.ar co.id co.th com.my com.tr co.ke
].freeze
# HTML attribute names: href, src and action.
# NOTE(review): presumably the attributes inspected for URLs when scanning
# HTML; the consumer is not visible in this excerpt.
HTML_URL_ATTRS =
%w[href src action].freeze
# Case-insensitive match for "http://" or "https://" followed by a
# hostname-like run of letters, digits, dots and hyphens. Capture group 1 is
# that hostname-like part (without the scheme); it must begin and end with a
# letter or digit.
HTML_URL_RE =
/https?:\/\/([a-z0-9][-a-z0-9.]*[a-z0-9])/i
# Case-insensitive match for a CSS `url(...)` token whose argument is an
# absolute http(s) URL, optionally wrapped in single or double quotes.
# Capture group 1 is the URL itself, without quotes or surrounding whitespace.
CSS_URL_RE =
/url\(\s*['"]?(https?:\/\/[^'")\s]+)['"]?\s*\)/i
# Case-insensitive match for a single- or double-quoted string that starts with
# an absolute http(s) URL. Capture group 1 is the quoted text (scheme, host and
# any trailing non-space, non-quote characters).
JS_STRING_RE =
/['"](https?:\/\/[a-z0-9][-a-z0-9.]*[a-z0-9][^\s'"]*)['"]/i

Instance Method Summary collapse

Constructor Details

#initialize(base_domain, max_depth: 1) ⇒ SubdomainDiscovery

Returns a new instance of SubdomainDiscovery.



22
23
24
25
26
# File 'lib/archaeo/subdomain_discovery.rb', line 22

# Sets up a discovery instance for one base domain with an empty visited set.
#
# @param base_domain [#to_s] domain of interest; stored in its String form
# @param max_depth [Integer] depth limit checked by #discover_recursive
#   (defaults to 1)
def initialize(base_domain, max_depth: 1)
  @max_depth = max_depth
  @base_domain = base_domain.to_s
  # Subdomains already handed out by #discover_recursive.
  @visited = Set.new
end

Instance Method Details

#base_domain(host) ⇒ Object



58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/archaeo/subdomain_discovery.rb', line 58

# Reduces a hostname to its base domain: the last two labels, or the last
# three when the host ends in one of the MULTI_PART_TLDS suffixes.
#
# The result is always lower-cased, including for hosts that are already a
# bare domain (two labels or fewer), so callers can compare results directly.
#
# @param host [#to_s] hostname such as "www.example.co.uk"
# @return [String] lower-cased base domain, e.g. "example.co.uk"
def base_domain(host)
  normalized = host.to_s.downcase
  labels = normalized.split(".")
  # Already a bare domain (or empty): return the normalized form so this
  # branch agrees in casing with the branches below.
  return normalized if labels.length <= 2

  # First listed suffix whose labels match the tail of the host, if any.
  suffix = MULTI_PART_TLDS.find do |tld|
    tld_labels = tld.split(".")
    labels.last(tld_labels.length) == tld_labels
  end

  # Keep one label in front of the suffix; a plain TLD counts as one label.
  keep = suffix ? suffix.split(".").length + 1 : 2
  labels.last(keep).join(".")
end

#discover_recursive(directory, depth: 0) ⇒ Object



49
50
51
52
53
54
55
56
# File 'lib/archaeo/subdomain_discovery.rb', line 49

# Scans +directory+ and returns only the subdomains this instance has not
# returned before, recording them as visited.
#
# @param directory [String] root directory passed to #scan_files
# @param depth [Integer] current depth; nothing is scanned once it reaches
#   the configured maximum depth
# @return [Array] previously unseen subdomains, or an empty array when the
#   depth limit has been reached
def discover_recursive(directory, depth: 0)
  if depth >= @max_depth
    []
  else
    unseen = scan_files(directory).reject { |name| @visited.include?(name) }
    unseen.each { |name| @visited.add(name) }
    unseen
  end
end

#scan_content(content, content_type:) ⇒ Object



28
29
30
31
# File 'lib/archaeo/subdomain_discovery.rb', line 28

# Runs the two-step pipeline for one document: +extract_urls+ pulls URLs out
# of +content+ for the given type, then +filter_subdomains+ narrows them down.
# Both helpers are defined elsewhere in the class.
#
# @param content [String] text of the document to scan
# @param content_type [Object] type tag forwarded to +extract_urls+
# @return [Object] whatever +filter_subdomains+ returns for the extracted URLs
def scan_content(content, content_type:)
  filter_subdomains(extract_urls(content, content_type))
end

#scan_files(directory) ⇒ Object



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/archaeo/subdomain_discovery.rb', line 33

# Walks +directory+ recursively and collects the results of #scan_content for
# every regular file whose extension maps to a content type.
#
# Files with no content type (per +content_type_for_ext+) are skipped before
# they are read, so unrelated files are never loaded into memory.
#
# @param directory [String] root directory to walk
# @return [Array] unique results gathered across all scanned files
def scan_files(directory)
  found = Set.new
  Dir.glob(File.join(directory, "**", "*")).each do |path|
    next unless File.file?(path)

    content_type = content_type_for_ext(File.extname(path).downcase)
    next unless content_type

    # `invalid:`/`undef:` only apply while transcoding, and reading UTF-8 as
    # UTF-8 does not transcode. Scrub explicitly so malformed bytes are
    # replaced and #scan_content always receives a validly encoded string.
    content = File.read(path, encoding: "UTF-8",
                              invalid: :replace, undef: :replace).scrub
    found.merge(scan_content(content, content_type: content_type))
  end
  found.to_a
end