Module: Coelacanth
- Defined in:
- lib/coelacanth.rb,
lib/coelacanth/dom.rb,
lib/coelacanth/http.rb,
lib/coelacanth/robots.rb,
lib/coelacanth/version.rb,
lib/coelacanth/redirect.rb,
lib/coelacanth/configure.rb,
lib/coelacanth/extractor.rb,
lib/coelacanth/validator.rb,
lib/coelacanth/extractor/utilities.rb,
lib/coelacanth/extractor/normalizer.rb,
lib/coelacanth/extractor/preprocessor.rb,
lib/coelacanth/extractor/weak_ml_probe.rb,
lib/coelacanth/extractor/fallback_probe.rb,
lib/coelacanth/extractor/metadata_probe.rb,
lib/coelacanth/extractor/heuristic_probe.rb,
lib/coelacanth/extractor/image_collector.rb,
lib/coelacanth/extractor/markdown_renderer.rb,
lib/coelacanth/extractor/morphological_analyzer.rb,
lib/coelacanth/extractor/eyecatch_image_extractor.rb,
lib/coelacanth/extractor/markdown_listing_collector.rb
Overview
Defined Under Namespace
Modules: Client, HTTP, Robots
Classes: Configure, DeepRedirectError, Dom, Error, Extractor, Redirect, RedirectError, RobotsDisallowedError, TimeoutError, Validator
Constant Summary
collapse
- VERSION =
"0.6.1"
Class Method Summary
collapse
Class Method Details
.analyze(url) ⇒ Object
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
|
# File 'lib/coelacanth.rb', line 24
# Fetches +url+ and assembles the DOM, a screenshot, the extraction result and
# the HTTP response metadata into a single hash.
#
# A Coelacanth::TimeoutError raised while fetching is tolerated: the analysis
# then continues with an empty document and blank response metadata.
#
# @param url [String] address of the page to analyze
# @return [Hash] with the keys +:dom+, +:screenshot+, +:extraction+ and +:response+
def self.analyze(url)
  client_class = client_class_for(config.read("client"))
  @client = client_class.new(url)
  regular_url = Redirect.new.resolve_redirect(url)

  response = begin
    Coelacanth::HTTP.get_response(URI.parse(regular_url))
  rescue Coelacanth::TimeoutError
    nil
  end

  # NOTE(review): the accessor is assumed to be #headers, matching the hash
  # key — confirm against the response object returned by Coelacanth::HTTP.
  response_metadata = {
    status_code: response&.status_code,
    headers: response&.headers || {},
    final_url: response&.final_url || regular_url
  }

  # `&.` does not short-circuit the rest of the chain, so a missing response
  # yields nil.to_s, i.e. an empty document.
  html = response&.body.to_s
  # Work on a copy so re-tagging the encoding never touches the response body.
  html = html.dup
  html = html.force_encoding(Encoding::UTF_8)
  html = html.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)

  extraction = Extractor.new.call(html: html, url: regular_url, response_metadata: response_metadata)

  {
    dom: Dom.new.oga(regular_url, html: html),
    screenshot: @client.get_screenshot,
    extraction: extraction,
    response: response_metadata
  }
end
|
.client_class_for(client_name) ⇒ Object
.config ⇒ Object
62
63
64
|
# File 'lib/coelacanth.rb', line 62
# Returns the shared Configure instance, building it on first access and
# handing back the same object on every later call.
#
# @return [Configure] the memoized configuration object
def self.config
  return @config if @config

  @config = Configure.new
end
|
.morphological_analysis(text, title: nil) ⇒ Object
66
67
68
69
70
|
# File 'lib/coelacanth.rb', line 66
# Runs +text+ through an Extractor::MorphologicalAnalyzer built with the
# current configuration, forwarding the optional +title+ to +call_text+.
#
# @param text [Object] passed unchanged to the analyzer's +call_text+
# @param title [Object, nil] optional title forwarded as the +title:+ keyword
# @return [Object] whatever the analyzer's +call_text+ returns
def self.morphological_analysis(text, title: nil)
  analyzer = Extractor::MorphologicalAnalyzer.new(config: config)
  analyzer.call_text(text, title: title)
end
|