Class: ScraperCentral
- Inherits:
-
Object
- Object
- ScraperCentral
- Defined in:
- lib/scraper_central.rb,
lib/scraper_central/version.rb
Constant Summary collapse
- VERSION =
'2.2.0'
Instance Attribute Summary collapse
-
#auth_config ⇒ Object
Returns the value of attribute auth_config.
-
#cache_duration ⇒ Object
Returns the value of attribute cache_duration.
-
#cookies ⇒ Object
Returns the value of attribute cookies.
-
#enable_get_dom ⇒ Object
Returns the value of attribute enable_get_dom.
-
#enable_image_cache ⇒ Object
Returns the value of attribute enable_image_cache.
-
#enable_js ⇒ Object
Returns the value of attribute enable_js.
-
#headers ⇒ Object
Returns the value of attribute headers.
-
#proxy_name ⇒ Object
Returns the value of attribute proxy_name.
-
#query_params ⇒ Object
Returns the value of attribute query_params.
-
#retry_attr ⇒ Object
Returns the value of attribute retry_attr.
-
#s3_key ⇒ Object
Returns the value of attribute s3_key.
-
#timeout ⇒ Object
Returns the value of attribute timeout.
-
#tls_verify ⇒ Object
Returns the value of attribute tls_verify.
Instance Method Summary collapse
- #cache_server ⇒ Object
- #fetch(url) ⇒ Object
-
#initialize ⇒ ScraperCentral
constructor
A new instance of ScraperCentral.
- #print_proxy_values ⇒ Object
Constructor Details
#initialize ⇒ ScraperCentral
Returns a new instance of ScraperCentral.
14 15 16 17 |
# File 'lib/scraper_central.rb', line 14 def initialize @lock = Mutex.new @logger = Logger.new($stdout) end |
Instance Attribute Details
#auth_config ⇒ Object
Returns the value of attribute auth_config.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def auth_config @auth_config end |
#cache_duration ⇒ Object
Returns the value of attribute cache_duration.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def cache_duration @cache_duration end |
#cookies ⇒ Object
Returns the value of attribute cookies.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def cookies @cookies end |
#enable_get_dom ⇒ Object
Returns the value of attribute enable_get_dom.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def enable_get_dom @enable_get_dom end |
#enable_image_cache ⇒ Object
Returns the value of attribute enable_image_cache.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def enable_image_cache @enable_image_cache end |
#enable_js ⇒ Object
Returns the value of attribute enable_js.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def enable_js @enable_js end |
#headers ⇒ Object
Returns the value of attribute headers.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def headers @headers end |
#proxy_name ⇒ Object
Returns the value of attribute proxy_name.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def proxy_name @proxy_name end |
#query_params ⇒ Object
Returns the value of attribute query_params.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def query_params @query_params end |
#retry_attr ⇒ Object
Returns the value of attribute retry_attr.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def retry_attr @retry_attr end |
#s3_key ⇒ Object
Returns the value of attribute s3_key.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def s3_key @s3_key end |
#timeout ⇒ Object
Returns the value of attribute timeout.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def timeout @timeout end |
#tls_verify ⇒ Object
Returns the value of attribute tls_verify.
11 12 13 |
# File 'lib/scraper_central.rb', line 11 def tls_verify @tls_verify end |
Instance Method Details
#cache_server ⇒ Object
97 98 99 100 101 102 103 104 105 106 107 108 |
# File 'lib/scraper_central.rb', line 97 def cache_server args = { proxy_name: proxy_name, enable_js: enable_js, cache_duration: cache_duration, s3_key: s3_key, enable_image_cache: enable_image_cache, auth_config: auth_config, use_get_dom: enable_get_dom } CacheServer.new(args) end |
#fetch(url) ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/scraper_central.rb', line 19 def fetch(url) @lock.synchronize do @url = url page_from_server, headers_from_server, proxy_from_server, status_code = cache_server.get_cache(@url) if proxy_from_server.nil? print_proxy_values response_code = enable_get_dom ? status_code : 200 return Response.new(code: response_code, body: page_from_server, headers: headers_from_server) else proxy_response = nil params = { country: s3_key[:country], headers: headers, query_params: query_params, cookies: cookies, timeout: timeout, tls_verify: tls_verify, retry_attr: retry_attr, enable_js: enable_js, enable_image_cache: enable_image_cache } case proxy_from_server['proxyName'] when 'BrightData' proxy_response = Proxy::BrightData.new(params).fetch(@url, proxy_from_server) when 'CrawlBase' proxy_response = Proxy::CrawlBase.new(params).fetch(@url, proxy_from_server) when 'ScraperApi' proxy_response = Proxy::ScraperApi.new(params).fetch(@url, proxy_from_server) end if proxy_response.nil? || proxy_response&.code != 200 status_code = proxy_response&.code || 500 @logger.error("Error fetching content from proxy: #{proxy_from_server['proxyName']}, error code: #{status_code}, params: #{s3_key}") return Response.new(code: status_code) end unless enable_get_dom Thread.new do cache_server.put_cache(proxy_from_server['cacheKey'], proxy_response.body, proxy_response.headers, proxy_response.cookies, enable_image_cache) @logger.info("Cache successfully sent to server key: #{proxy_from_server['cacheKey']}") rescue StandardError => e @logger.error("Error uploading cache to server key: #{proxy_from_server['cacheKey']}, error: #{e.message}") end end print_proxy_values proxy_response end end end |
#print_proxy_values ⇒ Object
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/scraper_central.rb', line 74 def print_proxy_values @logger.info("url: #{@url}") unless s3_key.empty? @logger.info("marketplace: #{s3_key[:marketplace]}") @logger.info("country: #{s3_key[:country]}") @logger.info("identifier: #{s3_key[:identifier]}") @logger.info("page_type: #{s3_key[:page_type]}") @logger.info("page_number: #{s3_key[:page_number]}") end @logger.info("cache_duration: #{cache_duration}") @logger.info("proxy_name: #{proxy_name}") @logger.info("enable_js: #{enable_js}") @logger.info("tls_verify: #{tls_verify}") if tls_verify @logger.info("headers: #{headers}") if headers @logger.info("query_params: #{query_params}") if query_params @logger.info("cookies: #{cookies}") if cookies @logger.info("timeout: #{timeout}") if timeout @logger.info("retry_attr: #{retry_attr}") if retry_attr @logger.info("enable_image_cache: #{enable_image_cache}") if enable_image_cache end |