Class: Itonoko::Parser::HtmlParser

Inherits:
Object
  • Object
show all
Defined in:
lib/itonoko/parser/html_parser.rb

Defined Under Namespace

Classes: Token

Constant Summary collapse

# Tag names of HTML void elements, kept as a frozen Set for membership checks.
VOID_ELEMENTS = Set.new(
  %w[area base br col embed hr img input link meta param source track wbr]
).freeze
# Tag names of raw-text elements (script and style), kept as a frozen Set.
RAW_TEXT_ELEMENTS = Set.new(%w[script style]).freeze
AUTO_CLOSE =

Elements whose start tag auto-closes a previously opened element of the same or a related type

{
  "p"  => %w[p],
  "li" => %w[li],
  "dt" => %w[dt dd],
  "dd" => %w[dd dt],
  "td" => %w[td th],
  "th" => %w[td th],
  "tr" => %w[tr],
  "colgroup" => %w[colgroup],
  "caption"  => %w[caption],
  "option"   => %w[option],
  "optgroup" => %w[optgroup option],
  "rb"  => %w[rb rt rtc rp],
  "rt"  => %w[rb rt rp],
  "rp"  => %w[rb rt rtc rp],
  "rtc" => %w[rb rtc rp],
}.freeze
IMPLIED_END_CLOSED_BY =

Elements whose end tag may be implied, mapped to the container elements that implicitly close them

{
  "li" => %w[ul ol],
  "dt" => %w[dl],
  "dd" => %w[dl],
  "tr" => %w[table tbody thead tfoot],
  "td" => %w[tr table tbody thead tfoot],
  "th" => %w[tr table tbody thead tfoot],
  "colgroup" => %w[table],
  "caption"  => %w[table],
}.freeze

Instance Method Summary collapse

Instance Method Details

#parse(html) ⇒ Object



53
54
55
56
57
58
59
# File 'lib/itonoko/parser/html_parser.rb', line 53

# Parses +html+ and returns the resulting document.
#
# A fresh HTML::Document is stored in @doc and its error list is reset to an
# empty array before the input is tokenized and handed to build_tree.
#
# @param html [String] the markup to parse (passed straight to tokenize)
# @return [HTML::Document] the document stored in @doc
def parse(html)
  @doc = HTML::Document.new
  @doc.errors = []
  # NOTE(review): build_tree presumably populates @doc from the token
  # stream; its return value is ignored here.
  build_tree(tokenize(html))
  @doc
end