Class: Makiri::Document
- Defined in:
- lib/makiri/document.rb,
ext/makiri/makiri.c
Overview
Root container for a parsed HTML document.
Class Method Summary collapse
-
._parse(source) ⇒ Document
Native entry point.
-
.parse(source) ⇒ Makiri::Document
Parse source as HTML5 and return a Document.
Instance Method Summary collapse
-
#body ⇒ Makiri::Element?
The document’s <body> element, or nil.
- #create_comment(rb_text) ⇒ Object
-
#create_element(rb_name) ⇒ Object
Create a new element named rb_name, owned by this document.
- #create_text_node(rb_text) ⇒ Object
-
#encoding ⇒ String
Makiri parses and stores everything as UTF-8 (callers decode bytes before parsing), so the in-memory encoding is always UTF-8.
-
#errors ⇒ Object
Parse warnings.
-
#fragment(*args) ⇒ Object
document.fragment(html, context: …) -> DocumentFragment bound to this document.
-
#head ⇒ Makiri::Element?
The document’s <head> element, or nil.
-
#internal_subset ⇒ Object
The document’s DocumentType node (`<!DOCTYPE …>`), or nil if absent.
-
#meta_encoding ⇒ String?
The charset declared in the document’s markup, or nil.
-
#meta_encoding=(value) ⇒ String
Set (or insert) a <meta charset> declaration.
-
#quirks_mode ⇒ Object
The document’s quirks mode as an Integer matching Lexbor’s lxb_dom_document_cmode_t (and Gumbo/Nokogiri): 0 = no-quirks, 1 = quirks, 2 = limited-quirks.
-
#root ⇒ Object
Get the root element (<html>) of the document, or nil.
-
#title ⇒ Object
Get the document <title>, or "" if absent.
-
#title=(text) ⇒ String
Set the document title, creating <title> (in <head>) if absent.
Methods inherited from Node
#<<, #==, #[], #[]=, #add_child, #add_class, #add_next_sibling, #add_previous_sibling, #after, #ancestors, #append_class, #at, #at_css, #at_xpath, #attribute, #attribute?, #attribute_nodes, #attributes, #before, #blank?, #child, #children, #classes, #comment?, #content, #content=, #css, #delete, #document, #document?, #document_fragment?, #element?, #element_children, #elements, #eql?, #first_element_child, #hash, #inner_html, #inner_html=, #inner_text, #inspect, #key?, #keys, #last_element_child, #line, #matches?, #name, #name=, #next, #next_element, #next_sibling, #node_type, #outer_html, #outer_html=, #parent, #parse, #path, #previous, #previous_element, #previous_sibling, #processing_instruction?, #remove, #remove_class, #replace, #search, #set_attribute, #text, #text?, #to_h, #to_html, #to_s, #traverse, #unlink, #value, #values, #xpath
Class Method Details
._parse(source) ⇒ Document
Native entry point. Ruby-level Document.parse coerces source to a String (and reads IO) before calling this. Source locations for Node#line are always tracked.
384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 |
# File 'ext/makiri/glue/ruby_doc.c', line 384
/*
 * Document._parse(source) -> Document
 *
 * Native parse entry point. Coerces rb_source to a String, parses a private
 * C copy of its bytes with the GVL released, and returns a freshly allocated
 * wrapper of `klass` holding the parse result.
 *
 * Raises mkr_eError when the source bytes cannot be copied or when the
 * parser hands back no result.
 */
static VALUE
mkr_doc_s_parse(VALUE klass, VALUE rb_source)
{
    StringValue(rb_source);

    /* The wrapper is created before any parsing and starts out with
     * parsed == NULL, so an exception raised further down still leaves a
     * well-formed object for the GC to reclaim. */
    mkr_doc_data_t *data;
    VALUE doc = TypedData_Make_Struct(klass, mkr_doc_data_t, &mkr_doc_type, data);
    data->parsed = NULL;
    data->errors = rb_ary_new();

    /* Parse from a C-side copy rather than the Ruby String's own storage:
     * the parse runs without the GVL, where GC/compaction could otherwise
     * race on that backing store. Nothing keeps the copy after the parse
     * (Lexbor copies what it needs into the arena and the line table is
     * built up front), so it is released as soon as the call returns. */
    mkr_owned_bytes_t bytes = {0};
    if (mkr_ruby_copy_bytes(rb_source, &bytes) != 0) {
        rb_raise(mkr_eError, "out of memory copying source");
    }
    RB_GC_GUARD(rb_source);

    mkr_parse_nogvl_t job = { (const lxb_char_t *)bytes.ptr, bytes.len, NULL };
    rb_thread_call_without_gvl(mkr_parse_nogvl, &job, NULL, NULL);
    mkr_owned_bytes_clear(&bytes);

    /* Store the result first so the wrapper owns it before any raise. */
    data->parsed = job.result;
    if (data->parsed == NULL) {
        rb_raise(mkr_eError, "failed to parse HTML document");
    }
    return doc;
}
|
.parse(source) ⇒ Makiri::Document
Parse source as HTML5 and return a Document.
source may be a String or any object responding to #read (e.g. an IO). The native parser (#_parse) expects UTF-8 bytes. Source locations for Node#line are always tracked (the cost is negligible).
14 15 16 17 |
# File 'lib/makiri/document.rb', line 14
# Parse +source+ as HTML5 and return a Document.
#
# +source+ may be a String or any object responding to #read; a readable
# object is read first, and the result is coerced with String() before being
# handed to the native _parse.
def self.parse(source)
  html = source.respond_to?(:read) ? source.read : source
  _parse(String(html))
end
Instance Method Details
#body ⇒ Makiri::Element?
The document’s <body> element, or nil.
21 22 23 |
# File 'lib/makiri/document.rb', line 21
# The document's <body> element, or nil.
# Delegates to at_css with the selector "body".
def body
  at_css("body")
end
#create_comment(rb_text) ⇒ Object
397 398 399 400 401 402 403 404 405 406 407 408 409 |
# File 'ext/makiri/glue/ruby_mutate.c', line 397
/*
 * document.create_comment(text) -> node
 *
 * Asks Lexbor to create a comment node in this document from the bytes of
 * rb_text and returns it wrapped against `self`.
 *
 * rb_text is first passed through mkr_ruby_verified_text (labelled
 * "comment content"). Raises mkr_eError if Lexbor returns NULL.
 */
static VALUE
mkr_doc_create_comment(VALUE self, VALUE rb_text)
{
    lxb_dom_document_t *owner = mkr_doc_unwrap(self);
    mkr_ruby_borrowed_text_t text = mkr_ruby_verified_text(rb_text, "comment content");
    const lxb_char_t *bytes = (const lxb_char_t *)text.ptr;

    lxb_dom_comment_t *comment = lxb_dom_document_create_comment(owner, bytes, text.len);

    /* Keep the Ruby value behind text.ptr alive until Lexbor has consumed
     * the bytes. */
    RB_GC_GUARD(text.value);

    if (comment == NULL) {
        rb_raise(mkr_eError, "failed to create comment");
    }
    return mkr_wrap_node(lxb_dom_interface_node(comment), self);
}
|
#create_element(rb_name) ⇒ Object
369 370 371 372 373 374 375 376 377 378 379 380 381 |
# File 'ext/makiri/glue/ruby_mutate.c', line 369
/*
 * document.create_element(name) -> node
 *
 * Asks Lexbor to create an element in this document whose name is the bytes
 * of rb_name and returns it wrapped against `self`.
 *
 * rb_name is first passed through mkr_ruby_verified_text (labelled
 * "element name"). Raises mkr_eError if Lexbor returns NULL.
 */
static VALUE
mkr_doc_create_element(VALUE self, VALUE rb_name)
{
    lxb_dom_document_t *owner = mkr_doc_unwrap(self);
    mkr_ruby_borrowed_text_t name = mkr_ruby_verified_text(rb_name, "element name");
    const lxb_char_t *bytes = (const lxb_char_t *)name.ptr;

    /* The trailing NULL is Lexbor's optional fourth argument, left unset. */
    lxb_dom_element_t *element =
        lxb_dom_document_create_element(owner, bytes, name.len, NULL);

    /* Keep the Ruby value behind name.ptr alive until Lexbor has consumed
     * the bytes. */
    RB_GC_GUARD(name.value);

    if (element == NULL) {
        rb_raise(mkr_eError, "failed to create element");
    }
    return mkr_wrap_node(lxb_dom_interface_node(element), self);
}
|
#create_text_node(rb_text) ⇒ Object
383 384 385 386 387 388 389 390 391 392 393 394 395 |
# File 'ext/makiri/glue/ruby_mutate.c', line 383
/*
 * document.create_text_node(text) -> node
 *
 * Asks Lexbor to create a text node in this document from the bytes of
 * rb_text and returns it wrapped against `self`.
 *
 * rb_text is first passed through mkr_ruby_verified_text (labelled
 * "text content"). Raises mkr_eError if Lexbor returns NULL.
 */
static VALUE
mkr_doc_create_text_node(VALUE self, VALUE rb_text)
{
    lxb_dom_document_t *owner = mkr_doc_unwrap(self);
    mkr_ruby_borrowed_text_t text = mkr_ruby_verified_text(rb_text, "text content");
    const lxb_char_t *bytes = (const lxb_char_t *)text.ptr;

    lxb_dom_text_t *text_node = lxb_dom_document_create_text_node(owner, bytes, text.len);

    /* Keep the Ruby value behind text.ptr alive until Lexbor has consumed
     * the bytes. */
    RB_GC_GUARD(text.value);

    if (text_node == NULL) {
        rb_raise(mkr_eError, "failed to create text node");
    }
    return mkr_wrap_node(lxb_dom_interface_node(text_node), self);
}
|
#encoding ⇒ String
Makiri parses and stores everything as UTF-8 (callers decode bytes before parsing), so the in-memory encoding is always UTF-8.
47 48 49 |
# File 'lib/makiri/document.rb', line 47
# The in-memory encoding name. Always the literal "UTF-8"; nothing on the
# document is consulted.
def encoding
  "UTF-8"
end
#errors ⇒ Object
Parse warnings. Reserved; currently always empty.
467 468 469 470 471 472 473 |
# File 'ext/makiri/glue/ruby_doc.c', line 467
/*
 * document.errors -> Object
 *
 * Returns the `errors` value stored in this document's wrapper data
 * (mkr_doc_data_t). No copy is made: the stored VALUE itself is returned.
 */
static VALUE
mkr_doc_errors(VALUE self)
{
    mkr_doc_data_t *data;
    TypedData_Get_Struct(self, mkr_doc_data_t, &mkr_doc_type, data);

    VALUE errors = data->errors;
    return errors;
}
|
#fragment(*args) ⇒ Object
document.fragment(html, context: …) -> DocumentFragment bound to this document. context defaults to <body>; see mkr_resolve_fragment_context.
302 303 304 305 306 307 308 309 310 311 312 313 |
# File 'ext/makiri/glue/ruby_doc.c', line 302
/*
 * document.fragment(html, context: ...) -> fragment
 *
 * Takes one required positional argument (the markup) plus optional keyword
 * arguments. The :context keyword, or nil when no keywords were given, is
 * resolved to a tag/namespace pair by mkr_resolve_fragment_context, and the
 * fragment is then built by mkr_build_fragment_ctx against this document.
 */
static VALUE
mkr_doc_fragment(int argc, VALUE *argv, VALUE self)
{
    VALUE markup, kwargs;
    rb_scan_args(argc, argv, "1:", &markup, &kwargs);

    /* kwargs is nil when the caller passed no keyword arguments at all. */
    VALUE ctx = Qnil;
    if (!NIL_P(kwargs)) {
        ctx = rb_hash_aref(kwargs, ID2SYM(rb_intern("context")));
    }

    /* Both out-parameters are filled in by the resolver. */
    lxb_tag_id_t ctx_tag;
    lxb_ns_id_t ctx_ns;
    mkr_resolve_fragment_context(mkr_doc_unwrap(self), ctx, &ctx_tag, &ctx_ns);

    return mkr_build_fragment_ctx(self, markup, ctx_tag, ctx_ns);
}
|
#head ⇒ Makiri::Element?
The document’s <head> element, or nil.
27 28 29 |
# File 'lib/makiri/document.rb', line 27
# The document's <head> element, or nil.
# Delegates to at_css with the selector "head".
def head
  at_css("head")
end
#internal_subset ⇒ Object
The document’s DocumentType node (`<!DOCTYPE …>`), or nil if absent. Mirrors Nokogiri’s Document#internal_subset. The doctype is a child of the document node (typically first), so a short scan of the children finds it.
445 446 447 448 449 450 451 452 453 454 455 |
# File 'ext/makiri/glue/ruby_doc.c', line 445
/*
 * document.internal_subset -> node or nil
 *
 * Walks the document node's direct children in order and returns the first
 * one whose type is LXB_DOM_NODE_TYPE_DOCUMENT_TYPE, wrapped against `self`.
 * Returns nil when no child has that type.
 */
static VALUE
mkr_doc_internal_subset(VALUE self)
{
    lxb_dom_node_t *child = ((lxb_dom_node_t *)mkr_doc_unwrap(self))->first_child;

    while (child != NULL) {
        if (child->type == LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
            return mkr_wrap_node(child, self);
        }
        child = child->next;
    }
    return Qnil;
}
|
#meta_encoding ⇒ String?
The charset declared in the document’s markup, or nil. Reads <meta charset> first, then <meta http-equiv="Content-Type">.
54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/makiri/document.rb', line 54
# The charset declared in the document's markup, or nil.
#
# Lookup order:
#   1. the "charset" attribute of the first element matching meta[charset];
#   2. otherwise, each <meta> whose http-equiv attribute equals
#      "content-type" (case-insensitively): the first charset=... token
#      found in its content attribute.
# Returns nil when neither form yields a charset.
def meta_encoding
  if (m = at_css("meta[charset]"))
    return m["charset"]
  end

  css("meta").each do |meta|
    http_equiv = meta["http-equiv"]
    next unless http_equiv&.downcase == "content-type"

    # to_s guards against a missing content attribute before matching.
    content = meta["content"].to_s
    return Regexp.last_match(1) if content =~ /charset\s*=\s*"?([^\s;"]+)/i
  end
  nil
end
#meta_encoding=(value) ⇒ String
Set (or insert) a <meta charset> declaration.
72 73 74 75 76 77 78 79 80 |
# File 'lib/makiri/document.rb', line 72
# Set (or insert) a <meta charset> declaration.
#
# Reuses the first element matching meta[charset] when one exists. Otherwise
# a new <meta> element is created for this document and appended to head,
# or to root when head is nil. The element's "charset" attribute is then
# set to +value+, and +value+ is returned.
def meta_encoding=(value)
  meta = at_css("meta[charset]")
  unless meta
    meta = Element.new("meta", self)
    (head || root).add_child(meta)
  end
  meta["charset"] = value
  value
end
#quirks_mode ⇒ Object
The document’s quirks mode as an Integer matching Lexbor’s lxb_dom_document_cmode_t (and Gumbo/Nokogiri): 0 = no-quirks, 1 = quirks, 2 = limited-quirks. Set by the parser from the doctype.
460 461 462 463 464 |
# File 'ext/makiri/glue/ruby_doc.c', line 460
/*
 * document.quirks_mode -> Integer
 *
 * Reads the unwrapped document's compat_mode field, converts it to int and
 * returns it as a Ruby Integer.
 */
static VALUE
mkr_doc_quirks_mode(VALUE self)
{
    const int mode = (int)mkr_doc_unwrap(self)->compat_mode;
    return INT2NUM(mode);
}
|
#root ⇒ Object
Get the root element (<html>) of the document, or nil.
424 425 426 427 428 429 |
# File 'ext/makiri/glue/ruby_doc.c', line 424
/*
 * document.root -> node
 *
 * Wraps whatever lxb_dom_document_root reports for this document, with
 * `self` as the owning document.
 * NOTE(review): the nil-when-absent behaviour presumably comes from
 * mkr_wrap_node's handling of a NULL node; confirm there.
 */
static VALUE
mkr_doc_root(VALUE self)
{
    lxb_dom_document_t *document = mkr_doc_unwrap(self);
    VALUE wrapped = mkr_wrap_node(lxb_dom_document_root(document), self);
    return wrapped;
}
|
#title ⇒ Object
Get the document <title>, or "" if absent.
432 433 434 435 436 437 438 439 440 |
# File 'ext/makiri/glue/ruby_doc.c', line 432
/*
 * document.title -> String
 *
 * Returns the title reported by lxb_html_document_title as a new UTF-8 Ruby
 * String, or an empty UTF-8 String when Lexbor reports none (NULL).
 */
static VALUE
mkr_doc_title(VALUE self)
{
    lxb_html_document_t *html_doc = (lxb_html_document_t *)mkr_doc_unwrap(self);
    size_t title_len = 0;
    const lxb_char_t *title = lxb_html_document_title(html_doc, &title_len);

    if (title == NULL) {
        return rb_utf8_str_new("", 0);
    }
    return rb_utf8_str_new((const char *)title, title_len);
}
|