Class: Crawlscope::Configuration

Inherits:
Object
  • Object
show all
Defined in:
lib/crawlscope/configuration.rb

Constant Summary collapse

# Fallback list returned by #allowed_statuses when nothing is configured.
DEFAULT_ALLOWED_STATUSES = [200, 301, 302].freeze
# Upper bound applied by #browser_concurrency.
DEFAULT_BROWSER_CONCURRENCY = 4
# Fallback passed to positive_integer by #network_idle_timeout_seconds.
DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS = 5
# Fallback returned by #scroll_page? when nothing is configured.
DEFAULT_BROWSER_SCROLL_PAGE = true
# Fallback passed to positive_integer by #concurrency.
DEFAULT_CONCURRENCY = 10
# Fallback used by #fetch_executor when the renderer is not :browser.
DEFAULT_FETCH_EXECUTOR = :async
# Renderer names accepted by #renderer.
RENDERERS = %i[http browser].freeze
# Fallback passed to positive_integer by #timeout_seconds.
DEFAULT_TIMEOUT_SECONDS = 20

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#allowed_statuses ⇒ Object



16
17
18
19
# File 'lib/crawlscope/configuration.rb', line 16

# Status list used by the crawl, each entry coerced with +to_i+.
#
# Falls back to DEFAULT_ALLOWED_STATUSES only when the resolved setting is
# nil; a single non-array value is wrapped into a one-element array.
#
# @return [Array<Integer>]
def allowed_statuses
  configured = resolve(@allowed_statuses)
  statuses = configured.nil? ? DEFAULT_ALLOWED_STATUSES : configured

  Array(statuses).map { |status| status.to_i }
end

#base_url ⇒ Object



21
22
23
# File 'lib/crawlscope/configuration.rb', line 21

# Returns the stored base_url setting after passing it through +resolve+.
#
# NOTE(review): +resolve+ is defined outside this listing; presumably it
# unwraps lazily evaluated settings -- confirm.
#
# @return [Object] whatever +resolve+ yields for +@base_url+
def base_url
  resolve(@base_url)
end

#browser_factory ⇒ Object



25
26
27
# File 'lib/crawlscope/configuration.rb', line 25

# Returns the stored browser_factory setting after passing it through
# +resolve+. No default is applied here.
#
# @return [Object] whatever +resolve+ yields for +@browser_factory+
def browser_factory
  resolve(@browser_factory)
end

#concurrency ⇒ Object



29
30
31
32
# File 'lib/crawlscope/configuration.rb', line 29

# Resolved concurrency setting, validated by +positive_integer+ with
# DEFAULT_CONCURRENCY as the fallback.
#
# @return [Object] the result of +positive_integer+
def concurrency
  positive_integer(
    resolve(@concurrency),
    default: DEFAULT_CONCURRENCY,
    name: "concurrency"
  )
end

#fetch_executor ⇒ Object



34
35
36
37
38
39
# File 'lib/crawlscope/configuration.rb', line 34

# Fetch executor for the crawl, normalised by FetchExecutor.normalize.
#
# When nothing is configured the fallback depends on the renderer:
# :threaded for the :browser renderer, DEFAULT_FETCH_EXECUTOR otherwise.
# The renderer is consulted even when an executor is configured, so an
# invalid renderer setting still raises from here.
#
# @return [Object] the result of FetchExecutor.normalize
def fetch_executor
  configured = resolve(@fetch_executor)
  fallback =
    if renderer == :browser
      :threaded
    else
      DEFAULT_FETCH_EXECUTOR
    end
  chosen = configured.nil? ? fallback : configured

  FetchExecutor.normalize(chosen)
end

#network_idle_timeout_seconds ⇒ Object



52
53
54
55
# File 'lib/crawlscope/configuration.rb', line 52

# Resolved network-idle timeout, validated by +positive_integer+ with
# DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS as the fallback.
#
# @return [Object] the result of +positive_integer+
def network_idle_timeout_seconds
  positive_integer(
    resolve(@network_idle_timeout_seconds),
    default: DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS,
    name: "network_idle_timeout_seconds"
  )
end

#output ⇒ Object



57
58
59
60
# File 'lib/crawlscope/configuration.rb', line 57

# Output destination: the resolved setting, or $stdout when it is nil.
#
# Only nil triggers the fallback; any other resolved value (including
# false) is returned unchanged.
#
# @return [Object]
def output
  configured = resolve(@output)
  return $stdout if configured.nil?

  configured
end

#renderer ⇒ Object

Raises:

  • (ConfigurationError)



62
63
64
65
66
67
68
69
70
71
# File 'lib/crawlscope/configuration.rb', line 62

# Renderer name as a symbol taken from RENDERERS.
#
# The resolved setting is stringified and stripped; a blank result means
# "http". Matching is exact, so differently cased names are rejected.
#
# @return [Symbol] one of RENDERERS
# @raise [ConfigurationError] when the name is not in RENDERERS
def renderer
  name = resolve(@renderer).to_s.strip
  name = "http" if name.empty?
  candidate = name.to_sym

  unless RENDERERS.include?(candidate)
    raise ConfigurationError, "Crawlscope renderer must be http or browser"
  end

  candidate
end

#rule_registry ⇒ Object



73
74
75
76
77
78
# File 'lib/crawlscope/configuration.rb', line 73

# Rule registry for the crawl: the resolved setting when it is not nil,
# otherwise RuleRegistry.default built with the current site_name.
#
# @return [Object]
def rule_registry
  configured = resolve(@rule_registry)

  configured.nil? ? RuleRegistry.default(site_name: site_name) : configured
end

#schema_registry ⇒ Object



105
106
107
108
109
110
# File 'lib/crawlscope/configuration.rb', line 105

# Schema registry for the crawl: the resolved setting when it is not nil,
# otherwise SchemaRegistry.default.
#
# @return [Object]
def schema_registry
  configured = resolve(@schema_registry)

  configured.nil? ? SchemaRegistry.default : configured
end

#scroll_page=(value) ⇒ Object (writeonly)

Sets the attribute scroll_page

Parameters:

  • value

    the value to set the attribute scroll_page to.



14
15
16
# File 'lib/crawlscope/configuration.rb', line 14

# Stores the scroll_page setting; it is read back through #scroll_page?.
#
# @param value [Object] the value to set the attribute scroll_page to
def scroll_page=(value)
  @scroll_page = value
end

#site_name ⇒ Object



112
113
114
# File 'lib/crawlscope/configuration.rb', line 112

# Returns the stored site_name setting after passing it through +resolve+.
# #rule_registry hands this value to RuleRegistry.default.
#
# @return [Object] whatever +resolve+ yields for +@site_name+
def site_name
  resolve(@site_name)
end

#sitemap_path ⇒ Object



121
122
123
# File 'lib/crawlscope/configuration.rb', line 121

# Returns the stored sitemap_path setting after passing it through
# +resolve+. #audit uses it as the default for its sitemap_path argument.
#
# @return [Object] whatever +resolve+ yields for +@sitemap_path+
def sitemap_path
  resolve(@sitemap_path)
end

#timeout_seconds ⇒ Object



125
126
127
128
# File 'lib/crawlscope/configuration.rb', line 125

# Resolved timeout setting, validated by +positive_integer+ with
# DEFAULT_TIMEOUT_SECONDS as the fallback.
#
# @return [Object] the result of +positive_integer+
def timeout_seconds
  positive_integer(
    resolve(@timeout_seconds),
    default: DEFAULT_TIMEOUT_SECONDS,
    name: "timeout_seconds"
  )
end

Instance Method Details

#audit(base_url: self.base_url, sitemap_path: self.sitemap_path, rule_names: nil) ⇒ Object



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/crawlscope/configuration.rb', line 80

# Builds a Crawl from this configuration.
#
# Both base_url and sitemap_path are checked (stringified and stripped)
# before any other setting is read, so a blank value fails fast.
#
# @param base_url [Object] defaults to #base_url
# @param sitemap_path [Object] defaults to #sitemap_path
# @param rule_names [Object, nil] forwarded to rule_registry.rules_for
# @return [Object] the object returned by Crawl.new
# @raise [ConfigurationError] when base_url or sitemap_path is blank
def audit(base_url: self.base_url, sitemap_path: self.sitemap_path, rule_names: nil)
  raise ConfigurationError, "Crawlscope base_url is not configured" if base_url.to_s.strip.empty?
  raise ConfigurationError, "Crawlscope sitemap_path is not configured" if sitemap_path.to_s.strip.empty?

  # Entries are listed in the order the settings must be evaluated.
  crawl_options = {
    base_url: base_url,
    sitemap_path: sitemap_path,
    browser_factory: browser_factory,
    concurrency: concurrency,
    fetch_executor: fetch_executor,
    network_idle_timeout_seconds: network_idle_timeout_seconds,
    renderer: renderer,
    timeout_seconds: timeout_seconds,
    allowed_statuses: allowed_statuses,
    rules: rule_registry.rules_for(rule_names),
    schema_registry: schema_registry,
    scroll_page: scroll_page?
  }

  Crawl.new(**crawl_options)
end

#browser_concurrency ⇒ Object



41
42
43
44
45
46
47
48
49
50
# File 'lib/crawlscope/configuration.rb', line 41

# Concurrency capped at DEFAULT_BROWSER_CONCURRENCY: the smaller of
# #concurrency and the cap.
#
# @return [Object] #concurrency, or DEFAULT_BROWSER_CONCURRENCY when that is smaller
def browser_concurrency
  [concurrency, DEFAULT_BROWSER_CONCURRENCY].min
end

#scroll_page? ⇒ Boolean

Returns:

  • (Boolean)


116
117
118
119
# File 'lib/crawlscope/configuration.rb', line 116

# Resolved scroll_page setting, or DEFAULT_BROWSER_SCROLL_PAGE when it
# is nil.
#
# Only nil triggers the fallback, so an explicit false stays false.
#
# @return [Boolean]
def scroll_page?
  configured = resolve(@scroll_page)
  return DEFAULT_BROWSER_SCROLL_PAGE if configured.nil?

  configured
end