Module: Coelacanth

Defined in:
lib/coelacanth.rb,
lib/coelacanth/dom.rb,
lib/coelacanth/http.rb,
lib/coelacanth/robots.rb,
lib/coelacanth/version.rb,
lib/coelacanth/redirect.rb,
lib/coelacanth/configure.rb,
lib/coelacanth/extractor.rb,
lib/coelacanth/validator.rb,
lib/coelacanth/extractor/utilities.rb,
lib/coelacanth/extractor/normalizer.rb,
lib/coelacanth/extractor/preprocessor.rb,
lib/coelacanth/extractor/weak_ml_probe.rb,
lib/coelacanth/extractor/fallback_probe.rb,
lib/coelacanth/extractor/metadata_probe.rb,
lib/coelacanth/extractor/heuristic_probe.rb,
lib/coelacanth/extractor/image_collector.rb,
lib/coelacanth/extractor/markdown_renderer.rb,
lib/coelacanth/extractor/morphological_analyzer.rb,
lib/coelacanth/extractor/eyecatch_image_extractor.rb,
lib/coelacanth/extractor/markdown_listing_collector.rb

Overview

Coelacanth

Defined Under Namespace

Modules: Client, HTTP, Robots

Classes: Configure, DeepRedirectError, Dom, Error, Extractor, Redirect, RedirectError, RobotsDisallowedError, TimeoutError, Validator

Constant Summary collapse

VERSION =
"0.6.1"

Class Method Summary collapse

Class Method Details

.analyze(url) ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/coelacanth.rb', line 24

# Fetches +url+ and gathers the DOM, a screenshot, the extractor output and
# basic HTTP response metadata into a single hash.
#
# A Coelacanth::TimeoutError raised while fetching is tolerated: the response
# is then treated as absent, so the status code is nil, the headers are empty,
# the final URL falls back to the redirect-resolved URL and the HTML is "".
#
# @param url [String] address of the page to analyze
# @return [Hash] with the keys :dom, :screenshot, :extraction and :response
def self.analyze(url)
  client_class = client_class_for(config.read("client"))
  @client = client_class.new(url)
  regular_url = Redirect.new.resolve_redirect(url)
  response = begin
    Coelacanth::HTTP.get_response(URI.parse(regular_url))
  rescue Coelacanth::TimeoutError
    nil
  end
  # The same hash is handed to the extractor and returned to the caller.
  response_metadata = {
    status_code: response&.status_code,
    headers: response&.headers || {},
    final_url: response&.final_url || regular_url
  }
  # Work on a copy so the response body itself is never re-tagged, then label
  # the bytes as UTF-8 and ask encode to substitute invalid/undefined sequences.
  html = response&.body.to_s
  html = html.dup
  html = html.force_encoding(Encoding::UTF_8)
  html = html.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
  extractor_result = Extractor.new.call(html: html, url: regular_url, response_metadata: response_metadata)
  {
    dom: Dom.new.oga(regular_url, html: html),
    screenshot: @client.get_screenshot,
    extraction: extractor_result,
    response: response_metadata
  }
end

.client_class_for(client_name) ⇒ Object



51
52
53
54
55
56
57
58
59
60
# File 'lib/coelacanth.rb', line 51

# Maps a configured client name to the class implementing that client.
#
# "screenshot_one" and "gotenberg" select their dedicated classes; any other
# value (including nil) selects Client::Ferrum.
#
# @param client_name [Object] value compared against the known client names
# @return [Class] one of the Client::* classes referenced below
def self.client_class_for(client_name)
  case client_name
  when "screenshot_one" then Client::ScreenshotOne
  when "gotenberg"      then Client::Gotenberg
  else                       Client::Ferrum
  end
end

.config ⇒ Object



62
63
64
# File 'lib/coelacanth.rb', line 62

# Returns the memoized Configure instance, building it on first use.
#
# @return [Configure] the same object on every call once it has been created
def self.config
  @config = Configure.new unless @config
  @config
end

.morphological_analysis(text, title: nil) ⇒ Object



66
67
68
69
70
# File 'lib/coelacanth.rb', line 66

# Runs Extractor::MorphologicalAnalyzer over +text+ using the module config.
#
# @param text [Object] passed through as the first argument of #call_text
# @param title [Object, nil] forwarded unchanged as the title: keyword
# @return [Object] whatever the analyzer's #call_text returns
def self.morphological_analysis(text, title: nil)
  analyzer = Extractor::MorphologicalAnalyzer.new(config: config)
  analyzer.call_text(text, title: title)
end