Class: Readability::Document

Inherits:

Object

Inheritance chain: Object → Readability::Document

show all

Includes: Cleaner, Metadata, Scoring, Utils

Defined in: lib/readability/document.rb

Instance Method Summary collapse

#initialize(doc, url: nil, **options) ⇒ Document (constructor)

A new instance of Document.
#parse ⇒ Object

Port of parse() — JS lines 2747-2805.

Constructor Details

#initialize(doc, url: nil, **options) ⇒ `Document`

Returns a new instance of Document.

# File 'lib/readability/document.rb', line 12

# Sets up parser state around a copy of +doc+ and resolves the tunable options.
#
# Recognised +options+ keys (each falls back to its default when the value is
# nil or false): :debug, :max_elems_to_parse, :nb_top_candidates,
# :char_threshold, :classes_to_preserve, :keep_classes, :serializer,
# :disable_json_ld, :allowed_video_regex, :link_density_modifier.
#
# @param doc [Object] the parsed document; only its +dup+ is kept
# @param url [Object, nil] stored unchanged in @url
# @param options [Hash] optional overrides, see the key list above
def initialize(doc, url: nil, **options)
  # Keep a private copy of the input (the original author notes this as a deep clone).
  @doc = doc.dup
  @url = url

  # Article fields start out unset.
  @article_title = @article_byline = @article_dir = nil
  @article_site_name = @article_lang = nil

  # Fresh working collections.
  @attempts = []
  @metadata = {}
  @candidates = {}
  @data_tables = Set.new

  # Options — from JS constructor lines 49-66.
  # Boolean switches are coerced to strict true/false.
  @debug = options[:debug] ? true : false
  @keep_classes = options[:keep_classes] ? true : false
  @disable_json_ld = options[:disable_json_ld] ? true : false

  # Numeric limits and thresholds.
  @max_elems_to_parse = options[:max_elems_to_parse] || DEFAULT_MAX_ELEMS_TO_PARSE
  @nb_top_candidates = options[:nb_top_candidates] || DEFAULT_N_TOP_CANDIDATES
  @char_threshold = options[:char_threshold] || DEFAULT_CHAR_THRESHOLD
  @link_density_modifier = options[:link_density_modifier] || 0

  # Caller-supplied class names are appended to the built-in list.
  extra_classes = options[:classes_to_preserve] || []
  @classes_to_preserve = CLASSES_TO_PRESERVE + extra_classes

  # By default the article node is serialized through its inner_html.
  @serializer = options[:serializer] || ->(node) { node.inner_html }
  @allowed_video_regex = options[:allowed_video_regex] || VIDEOS

  # Flags — all active initially.
  @flags = FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY
end

Instance Method Details

#parse ⇒ `Object`

Port of parse() — JS lines 2747-2805

# File 'lib/readability/document.rb', line 42

# Runs the extraction pipeline on @doc and packages the outcome.
#
# Port of parse() — JS lines 2747-2805.
#
# @return [Result, nil] the assembled result, or nil when grab_article
#   returns nothing
# @raise [RuntimeError] when @max_elems_to_parse is positive and the document
#   contains more elements than that limit
def parse
  # Avoid parsing too large documents. The walk aborts at the first element
  # past the limit, so the reported count is always limit + 1.
  if @max_elems_to_parse > 0
    count = 0
    @doc.traverse do |n|
      next unless n.element?

      count += 1
      raise "Aborting parsing document; #{count} elements found" if count > @max_elems_to_parse
    end
  end

  # Unwrap image from noscript
  unwrap_noscript_images(@doc)

  # Extract JSON-LD metadata before removing scripts
  json_ld = @disable_json_ld ? {} : get_json_ld(@doc)

  # Remove script tags from the document
  remove_scripts(@doc)

  prep_document

  # Cache the prepped body HTML for retry re-parsing (avoids innerHTML= cost)
  @prepped_body_html = @doc.at_css("body")&.inner_html

  metadata = get_article_metadata(json_ld)
  @metadata = metadata
  @article_title = metadata["title"]

  article_content = grab_article
  return nil unless article_content

  log("Grabbed: #{article_content.inner_html}")

  post_process_content(article_content)

  # The JS original relies on `||` treating "" as missing. Ruby treats "" as
  # truthy, so empty values are mapped to nil explicitly before any fallback.
  presence = ->(value) { value unless value.respond_to?(:empty?) && value.empty? }

  # If we haven't found an excerpt in the article's metadata, use the article's
  # first paragraph as the excerpt.
  unless presence.call(metadata["excerpt"])
    first_paragraph = article_content.css("p").first
    metadata["excerpt"] = first_paragraph.text.strip if first_paragraph
  end

  text_content = article_content.text

  Result.new(
    title: @article_title,
    byline: presence.call(metadata["byline"]) || @article_byline,
    dir: @article_dir,
    lang: @article_lang,
    content: @serializer.call(article_content),
    text_content: text_content,
    length: text_content.length,
    excerpt: metadata["excerpt"],
    # Same empty-string handling as byline, so a blank siteName falls back too.
    site_name: presence.call(metadata["siteName"]) || @article_site_name,
    published_time: metadata["publishedTime"]
  )
end

Class: Readability::Document

Instance Method Summary collapse

Constructor Details

#initialize(doc, url: nil, **options) ⇒ Document

Instance Method Details

#parse ⇒ Object

#initialize(doc, url: nil, **options) ⇒ `Document`

#parse ⇒ `Object`