Class: Makiri::HTML::Document
- Defined in:
- lib/makiri/html/document.rb,
ext/makiri/makiri.c
Overview
Root container for a parsed HTML document. Construction, serialization and the HTML-only conveniences (body/head/title/encoding) live here, not on the abstract Makiri::Document.
Class Method Summary collapse
-
._parse(source) ⇒ Document
Native entry point.
-
.parse(source) ⇒ Makiri::HTML::Document
Parse source as HTML5 and return a Makiri::HTML::Document.
Instance Method Summary collapse
-
#body ⇒ Makiri::Element?
The document’s <body> element, or nil.
-
#clone(freeze: nil) ⇒ Object
Like #dup: an independent copy of the document, honouring Ruby’s
freeze: keyword (a frozen document’s nodes raise FrozenError on mutation).
- #create_comment(rb_text) ⇒ Object
-
#create_document_fragment ⇒ Object
Document#create_document_fragment - DOM createDocumentFragment: an empty DocumentFragment owned by this document (unlike #fragment / DocumentFragment.parse, which parse HTML; this makes an empty one to build up programmatically).
-
#create_element(rb_name) ⇒ Object
Document#create_element(name) - DOM createElement: a new element with the given name, created in this document.
-
#create_processing_instruction(rb_target, rb_data) ⇒ Object
Document#create_processing_instruction(target, data) - DOM createProcessingInstruction: a detached ProcessingInstruction owned by this document.
- #create_text_node(rb_text) ⇒ Object
-
#dup ⇒ Object
An independent copy of the whole document (like Nokogiri’s Document#dup).
-
#encoding ⇒ String
Makiri parses and stores everything as UTF-8 (callers decode bytes before parsing), so the in-memory encoding is always UTF-8.
-
#errors ⇒ Object
Parse warnings.
-
#fragment(*args) ⇒ Object
document.fragment(html, context: …) -> DocumentFragment bound to this document.
-
#head ⇒ Makiri::Element?
The document’s <head> element, or nil.
-
#import_node(*args) ⇒ Object
Document#import_node(node, deep = false): a shallow (or deep, with deep truthy) copy of
node owned by THIS document - the DOM importNode, whose `deep` defaults to false (a missing/nil/false argument => shallow).
-
#internal_subset ⇒ Object
The document’s DocumentType node (`<!DOCTYPE …>`), or nil if absent.
-
#meta_encoding ⇒ String?
The charset declared in the document’s markup, or nil.
-
#meta_encoding=(value) ⇒ String
Set (or insert) a <meta charset> declaration.
-
#quirks_mode ⇒ Object
The document’s quirks mode as an Integer matching Lexbor’s lxb_dom_document_cmode_t (and Gumbo/Nokogiri): 0 = no-quirks, 1 = quirks, 2 = limited-quirks.
-
#root ⇒ Object
Get the root element (<html>) of the document, or nil.
-
#title ⇒ Object
Get the document <title>, or “” if absent.
-
#title=(text) ⇒ String
Set the document title, creating <title> (in <head>) if absent.
Methods inherited from Document
Methods inherited from Node
#add_class, #append_class, #at, #attribute, #attribute?, #attributes, #blank?, #cdata?, #classes, #comment?, #document?, #document_fragment?, #each, #element?, #inspect, #path, #processing_instruction?, #remove_class, #search, #set_attribute, #text?, #to_h, #traverse
Class Method Details
._parse(source) ⇒ Document
Native entry point. Ruby-level Document.parse coerces source to a String (and reads IO) before calling this. Source locations for Node#line are always tracked.
507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 |
# File 'ext/makiri/glue/ruby_doc.c', line 507
/*
 * Makiri::HTML::Document._parse(source) - native HTML parse entry point.
 *
 * klass:     the class to instantiate (receiver of the singleton method).
 * rb_source: the source; coerced with StringValue, then passed through
 *            mkr_ruby_to_utf8 before its bytes are copied.
 *
 * Returns a new object of type mkr_html_doc_type whose `parsed` field holds
 * the parse result and whose `errors` field is a fresh empty Array.
 * Raises mkr_eError if the source bytes cannot be copied or if the parse
 * produces a NULL result.
 *
 * NOTE(review): `source` is released only by the mkr_owned_bytes_clear call
 * that follows the parse. If TypedData_Make_Struct or rb_ary_new were to raise
 * (e.g. NoMemoryError), that buffer would presumably not be freed - confirm
 * whether this path needs an rb_ensure/rb_protect.
 */
static VALUE
mkr_doc_s_parse(VALUE klass, VALUE rb_source)
{
StringValue(rb_source);
/* Honour the input's encoding: UTF-8/US-ASCII/binary pass through (no
* degradation), anything else is transcoded to UTF-8 so its content is
* preserved rather than read as raw UTF-8 bytes. */
rb_source = mkr_ruby_to_utf8(rb_source);
/* Copy the source into a C buffer up front - BEFORE allocating the wrapper
* (a Ruby allocation, and thus a GC point) - so no GC can run between
* obtaining rb_source (possibly a fresh transcoded String) and copying its
* bytes, and the parse can then run with the GVL released without racing
* GC/compaction on the Ruby String's backing store. The source is not
* retained past the parse (Lexbor copies what it needs into the arena and
* the line table is built up front), so the buffer is freed immediately
* after. The coderange is read first (no scan): a source Ruby already knows
* is valid UTF-8 lets the parse skip its sanitisation scan. */
bool assume_valid = mkr_ruby_str_known_valid_utf8(rb_source);
mkr_owned_bytes_t source = {0};
if (mkr_ruby_copy_bytes(rb_source, &source) != 0) {
rb_raise(mkr_eError, "out of memory copying source");
}
/* Allocate the wrapper (with parsed == NULL) so that if parsing fails the
* GC-managed object frees cleanly. This is the HTML parse entry (defined on
* Makiri::HTML::Document), so the result is always HTML. */
mkr_doc_data_t *d;
VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t, &mkr_html_doc_type, d);
d->parsed = NULL;
d->errors = rb_ary_new();
/* The parse itself runs without the GVL and reads only the C-side copy. */
mkr_parse_nogvl_t args = { (const lxb_char_t *)source.ptr, source.len,
assume_valid, NULL };
rb_thread_call_without_gvl(mkr_parse_nogvl, &args, NULL, NULL);
/* Release the copied bytes before the failure check so the raise below
* cannot skip the cleanup. */
mkr_owned_bytes_clear(&source);
d->parsed = args.result;
if (d->parsed == NULL) {
rb_raise(mkr_eError, "failed to parse HTML document");
}
return obj;
}
|
.parse(source) ⇒ Makiri::HTML::Document
Parse source as HTML5 and return a Makiri::HTML::Document.
source may be a String or any object responding to #read (e.g. an IO). The native parser (#_parse) expects UTF-8 bytes. Source locations for Node#line are always tracked (the cost is negligible).
17 18 19 20 |
# File 'lib/makiri/html/document.rb', line 17
# Parse +source+ as HTML5. An object responding to #read is read first;
# the result is coerced with String() and handed to the native _parse.
def self.parse(source)
  text = source.respond_to?(:read) ? source.read : source
  _parse(String(text))
end
Instance Method Details
#body ⇒ Makiri::Element?
The document’s <body> element, or nil.
44 45 46 |
# File 'lib/makiri/html/document.rb', line 44
# The first element matching the CSS selector "body", as returned by at_css.
def body
  selector = "body"
  at_css(selector)
end
#clone(freeze: nil) ⇒ Object
Like #dup: an independent copy of the document, honouring Ruby’s freeze: keyword (a frozen document’s nodes raise FrozenError on mutation).
36 37 38 39 40 |
# File 'lib/makiri/html/document.rb', line 36
# Copy the document by serialising it and re-parsing the HTML.
# freeze: true  -> the copy is frozen
# freeze: false -> the copy is left unfrozen
# freeze: nil   -> the copy is frozen only if the receiver is frozen
def clone(freeze: nil)
  duplicate = Makiri.parse(to_html)
  should_freeze = freeze.nil? ? frozen? : freeze
  duplicate.freeze if should_freeze
  duplicate
end
#create_comment(rb_text) ⇒ Object
597 598 599 600 601 602 603 604 605 606 607 608 609 |
# File 'ext/makiri/glue/ruby_html_mutate.c', line 597
/*
 * Document#create_comment(text): create a comment node in this document from
 * the given text and return it wrapped against `self`.
 * Raises mkr_eError when Lexbor returns NULL.
 */
static VALUE
mkr_doc_create_comment(VALUE self, VALUE rb_text)
{
    lxb_dom_document_t *owner = mkr_html_doc_unwrap(self);
    mkr_ruby_borrowed_text_t content = mkr_ruby_verified_text(rb_text, "comment content");
    const lxb_char_t *bytes = (const lxb_char_t *)content.ptr;

    lxb_dom_comment_t *comment = lxb_dom_document_create_comment(owner, bytes, content.len);
    /* Keep the VALUE carried with the borrowed text visible to the GC until
     * the call that reads its bytes has returned. */
    RB_GC_GUARD(content.value);

    if (comment == NULL) {
        rb_raise(mkr_eError, "failed to create comment");
    }

    lxb_dom_node_t *node = lxb_dom_interface_node(comment);
    return mkr_wrap_html_node(node, self);
}
|
#create_document_fragment ⇒ Object
Document#create_document_fragment - DOM createDocumentFragment: an empty DocumentFragment owned by this document (unlike #fragment / DocumentFragment.parse, which parse HTML; this makes an empty one to build up programmatically).
633 634 635 636 637 638 639 640 641 642 |
# File 'ext/makiri/glue/ruby_html_mutate.c', line 633
/*
 * Document#create_document_fragment: ask Lexbor for a new document fragment
 * in this document and return it wrapped against `self`.
 * Raises mkr_eError when Lexbor returns NULL.
 */
static VALUE
mkr_doc_create_document_fragment(VALUE self)
{
    lxb_dom_document_fragment_t *fragment =
        lxb_dom_document_create_document_fragment(mkr_html_doc_unwrap(self));

    if (fragment == NULL) {
        rb_raise(mkr_eError, "failed to create document fragment");
    }

    lxb_dom_node_t *node = lxb_dom_interface_node(fragment);
    return mkr_wrap_html_node(node, self);
}
|
#create_element(rb_name) ⇒ Object
569 570 571 572 573 574 575 576 577 578 579 580 581 |
# File 'ext/makiri/glue/ruby_html_mutate.c', line 569
/*
 * Document#create_element(name): create an element with the given name in
 * this document and return it wrapped against `self`.
 * Raises mkr_eError when Lexbor returns NULL.
 */
static VALUE
mkr_doc_create_element(VALUE self, VALUE rb_name)
{
    lxb_dom_document_t *owner = mkr_html_doc_unwrap(self);
    mkr_ruby_borrowed_text_t name = mkr_ruby_verified_text(rb_name, "element name");
    const lxb_char_t *name_bytes = (const lxb_char_t *)name.ptr;

    /* The final argument is passed as NULL, exactly as in the original call. */
    lxb_dom_element_t *element =
        lxb_dom_document_create_element(owner, name_bytes, name.len, NULL);
    /* Keep the VALUE carried with the borrowed text visible to the GC until
     * the call that reads its bytes has returned. */
    RB_GC_GUARD(name.value);

    if (element == NULL) {
        rb_raise(mkr_eError, "failed to create element");
    }

    lxb_dom_node_t *node = lxb_dom_interface_node(element);
    return mkr_wrap_html_node(node, self);
}
|
#create_processing_instruction(rb_target, rb_data) ⇒ Object
Document#create_processing_instruction(target, data) - DOM createProcessingInstruction: a detached ProcessingInstruction owned by this document. Lexbor validates the target, so an invalid one fails closed.
614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 |
# File 'ext/makiri/glue/ruby_html_mutate.c', line 614
/*
 * Document#create_processing_instruction(target, data): create a processing
 * instruction in this document from the given target and data and return it
 * wrapped against `self`.
 * Raises mkr_eError when Lexbor returns NULL.
 *
 * NOTE(review): the target bytes are borrowed before rb_data is verified. If
 * mkr_ruby_verified_text can run arbitrary Ruby code (e.g. a to_str
 * coercion), that code could presumably mutate rb_target and invalidate the
 * first borrowed pointer - confirm against the helper's contract.
 */
static VALUE
mkr_doc_create_processing_instruction(VALUE self, VALUE rb_target, VALUE rb_data)
{
    lxb_dom_document_t *owner = mkr_html_doc_unwrap(self);
    mkr_ruby_borrowed_text_t target =
        mkr_ruby_verified_text(rb_target, "processing instruction target");
    mkr_ruby_borrowed_text_t data =
        mkr_ruby_verified_text(rb_data, "processing instruction data");

    const lxb_char_t *target_bytes = (const lxb_char_t *)target.ptr;
    const lxb_char_t *data_bytes = (const lxb_char_t *)data.ptr;

    lxb_dom_processing_instruction_t *instruction =
        lxb_dom_document_create_processing_instruction(owner,
                                                       target_bytes, target.len,
                                                       data_bytes, data.len);
    /* Keep both VALUEs carried with the borrowed texts visible to the GC
     * until the call that reads their bytes has returned. */
    RB_GC_GUARD(target.value);
    RB_GC_GUARD(data.value);

    if (instruction == NULL) {
        rb_raise(mkr_eError, "failed to create processing instruction");
    }

    lxb_dom_node_t *node = lxb_dom_interface_node(instruction);
    return mkr_wrap_html_node(node, self);
}
|
#create_text_node(rb_text) ⇒ Object
583 584 585 586 587 588 589 590 591 592 593 594 595 |
# File 'ext/makiri/glue/ruby_html_mutate.c', line 583
/*
 * Document#create_text_node(text): create a text node in this document from
 * the given text and return it wrapped against `self`.
 * Raises mkr_eError when Lexbor returns NULL.
 */
static VALUE
mkr_doc_create_text_node(VALUE self, VALUE rb_text)
{
    lxb_dom_document_t *owner = mkr_html_doc_unwrap(self);
    mkr_ruby_borrowed_text_t content = mkr_ruby_verified_text(rb_text, "text content");
    const lxb_char_t *bytes = (const lxb_char_t *)content.ptr;

    lxb_dom_text_t *text = lxb_dom_document_create_text_node(owner, bytes, content.len);
    /* Keep the VALUE carried with the borrowed text visible to the GC until
     * the call that reads its bytes has returned. */
    RB_GC_GUARD(content.value);

    if (text == NULL) {
        rb_raise(mkr_eError, "failed to create text node");
    }

    lxb_dom_node_t *node = lxb_dom_interface_node(text);
    return mkr_wrap_html_node(node, self);
}
|
#dup ⇒ Object
An independent copy of the whole document (like Nokogiri’s Document#dup). Built by serialising and re-parsing, so the copy shares no nodes with the original - Node#dup’s clone_node delegation is wrong for a document node, hence this override. (A DOM mutated into a shape the HTML parser would not itself produce, e.g. a foster-parented table cell, may be re-normalised on re-parse; a freshly parsed document round-trips unchanged.) Any level / freeze argument is ignored.
29 30 31 |
# File 'lib/makiri/html/document.rb', line 29
# Copy the document by serialising it to HTML and parsing that HTML again.
# Any positional arguments are accepted and ignored.
def dup(*)
  serialized = to_html
  Makiri.parse(serialized)
end
#encoding ⇒ String
Makiri parses and stores everything as UTF-8 (callers decode bytes before parsing), so the in-memory encoding is always UTF-8.
65 66 67 |
# File 'lib/makiri/html/document.rb', line 65
# The in-memory encoding name; this method always answers "UTF-8".
def encoding
  return "UTF-8"
end
#errors ⇒ Object
Parse warnings. Reserved; currently always empty.
600 601 602 603 604 605 606 |
# File 'ext/makiri/glue/ruby_doc.c', line 600
/*
 * Document#errors: return the `errors` VALUE stored in the receiver's
 * mkr_doc_data_t. The receiver is type-checked against mkr_doc_type.
 */
static VALUE
mkr_doc_errors(VALUE self)
{
    mkr_doc_data_t *data;

    TypedData_Get_Struct(self, mkr_doc_data_t, &mkr_doc_type, data);

    VALUE errors = data->errors;
    return errors;
}
|
#fragment(*args) ⇒ Object
document.fragment(html, context: …) -> DocumentFragment bound to this document. context defaults to <body>; see mkr_resolve_fragment_context.
424 425 426 427 428 429 430 431 432 433 434 435 |
# File 'ext/makiri/glue/ruby_doc.c', line 424
/*
 * document.fragment(html, context: ...): build a fragment for this document.
 *
 * Takes one required positional argument (the HTML) plus optional keywords.
 * Only the :context keyword is read; when no keywords are given the context
 * is nil. mkr_resolve_fragment_context turns that context into a tag and
 * namespace id, and mkr_build_fragment_ctx produces the result.
 */
static VALUE
mkr_doc_fragment(int argc, VALUE *argv, VALUE self)
{
    VALUE markup;
    VALUE kwargs;
    VALUE ctx = Qnil;
    lxb_tag_id_t ctx_tag;
    lxb_ns_id_t ctx_ns;

    rb_scan_args(argc, argv, "1:", &markup, &kwargs);
    if (!NIL_P(kwargs)) {
        ctx = rb_hash_aref(kwargs, ID2SYM(rb_intern("context")));
    }

    mkr_resolve_fragment_context(mkr_html_doc_unwrap(self), ctx, &ctx_tag, &ctx_ns);
    return mkr_build_fragment_ctx(self, markup, ctx_tag, ctx_ns);
}
|
#head ⇒ Makiri::Element?
The document’s <head> element, or nil.
50 51 52 |
# File 'lib/makiri/html/document.rb', line 50
# The first element matching the CSS selector "head", as returned by at_css.
def head
  selector = "head"
  at_css(selector)
end
#import_node(*args) ⇒ Object
Document#import_node(node, deep = false): a shallow (or deep, with deep truthy) copy of node owned by THIS document - the DOM importNode, whose `deep` defaults to false (a missing/nil/false argument => shallow). Unlike Node#clone_node, the copy is owned by the receiver rather than the node’s own document, so it is the way to bring a node across documents (Makiri never moves a node between arenas). The source is left untouched; the copy is detached. Same import + <template>-content fixup as clone_node; fails closed on a NULL import.
346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 |
# File 'ext/makiri/glue/ruby_doc.c', line 346
/*
 * Document#import_node(node, deep = false): return a copy of `node` wrapped
 * against this document.
 *
 * Takes one required and one optional argument; a missing, nil or false
 * `deep` means a shallow copy. An XML-kind node goes through
 * mkr_cross_xml_to_html (its status is checked by mkr_xml_mut_check); any
 * other node is unwrapped as an HTML node and imported with Lexbor.
 * On the HTML path a NULL import raises mkr_eError, and a deep copy also
 * gets mkr_fixup_template_content applied.
 */
static VALUE
mkr_doc_import_node(int argc, VALUE *argv, VALUE self)
{
    VALUE rb_node;
    VALUE rb_deep;
    rb_scan_args(argc, argv, "11", &rb_node, &rb_deep);

    const bool want_deep = RTEST(rb_deep);
    lxb_dom_document_t *target = mkr_html_doc_unwrap(self);
    lxb_dom_node_t *copy = NULL;

    if (mkr_node_kind(rb_node) != MKR_NODE_KIND_XML) {
        /* HTML node (the unwrap raises on a non-node). */
        lxb_dom_node_t *origin = mkr_html_node_unwrap(rb_node);

        copy = lxb_dom_document_import_node(target, origin, want_deep);
        if (copy == NULL) {
            rb_raise(mkr_eError, "failed to import node");
        }
        if (want_deep) {
            mkr_fixup_template_content(target, origin, copy);
        }
    } else {
        /* An XML node is TRANSLATED across representations (mkr -> lxb) by
         * ruby_cross_import.c into a detached lxb subtree owned by this
         * document. */
        mkr_xml_mut_check(
            mkr_cross_xml_to_html(target, mkr_xml_node_unwrap(rb_node), want_deep, &copy));
    }

    return mkr_wrap_html_node(copy, self);
}
|
#internal_subset ⇒ Object
The document’s DocumentType node (`<!DOCTYPE …>`), or nil if absent. Mirrors Nokogiri’s Document#internal_subset. The doctype is a child of the document node (typically first), so a short scan of the children finds it.
578 579 580 581 582 583 584 585 586 587 588 |
# File 'ext/makiri/glue/ruby_doc.c', line 578
/*
 * Document#internal_subset: walk the document node's direct children in
 * order and return the first one whose type is
 * LXB_DOM_NODE_TYPE_DOCUMENT_TYPE, wrapped against `self`; Qnil if none.
 */
static VALUE
mkr_doc_internal_subset(VALUE self)
{
    lxb_dom_node_t *doc_node = (lxb_dom_node_t *)mkr_html_doc_unwrap(self);
    lxb_dom_node_t *child = doc_node->first_child;

    /* Advance until a doctype child is found or the sibling list ends. */
    while (child != NULL && child->type != LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
        child = child->next;
    }

    if (child == NULL) {
        return Qnil;
    }
    return mkr_wrap_html_node(child, self);
}
|
#meta_encoding ⇒ String?
The charset declared in the document’s markup, or nil. Reads <meta charset> first, then <meta http-equiv=“Content-Type”>.
72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
# File 'lib/makiri/html/document.rb', line 72
# The charset declared in the document's markup, or nil.
#
# Looks for a <meta charset> element first and returns its charset attribute.
# Otherwise scans every <meta> element whose http-equiv attribute equals
# "content-type" (case-insensitively) and returns the first charset=... value
# found in its content attribute. Returns nil when neither form is present.
def meta_encoding
  if (m = at_css("meta[charset]"))
    return m["charset"]
  end

  css("meta").each do |meta|
    http_equiv = meta["http-equiv"]
    next unless http_equiv&.downcase == "content-type"

    content = meta["content"].to_s
    return Regexp.last_match(1) if content =~ /charset\s*=\s*"?([^\s;"]+)/i
  end
  nil
end
#meta_encoding=(value) ⇒ String
Set (or insert) a <meta charset> declaration.
90 91 92 93 |
# File 'lib/makiri/html/document.rb', line 90
# Set the charset attribute on the node returned by
# ensure_in_head("meta[charset]", "meta"), then return +value+.
def meta_encoding=(value)
  ensure_in_head("meta[charset]", "meta")["charset"] = value
  value
end
#quirks_mode ⇒ Object
The document’s quirks mode as an Integer matching Lexbor’s lxb_dom_document_cmode_t (and Gumbo/Nokogiri): 0 = no-quirks, 1 = quirks, 2 = limited-quirks. Set by the parser from the doctype.
593 594 595 596 597 |
# File 'ext/makiri/glue/ruby_doc.c', line 593
/*
 * Document#quirks_mode: return the document's compat_mode field as a Ruby
 * Integer (the field is cast to int before conversion).
 */
static VALUE
mkr_doc_quirks_mode(VALUE self)
{
    lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
    int mode = (int)doc->compat_mode;

    return INT2NUM(mode);
}
|
#root ⇒ Object
Get the root element (<html>) of the document, or nil.
557 558 559 560 561 562 |
# File 'ext/makiri/glue/ruby_doc.c', line 557
/*
 * Document#root: wrap whatever lxb_dom_document_root reports for this
 * document and return it. (The method's documentation says nil when there is
 * no root; that relies on how mkr_wrap_html_node treats a NULL node.)
 */
static VALUE
mkr_doc_root(VALUE self)
{
    lxb_dom_node_t *root = lxb_dom_document_root(mkr_html_doc_unwrap(self));

    return mkr_wrap_html_node(root, self);
}
|
#title ⇒ Object
Get the document <title>, or “” if absent.
565 566 567 568 569 570 571 572 573 |
# File 'ext/makiri/glue/ruby_doc.c', line 565
/*
 * Document#title: return the title reported by lxb_html_document_title as a
 * new UTF-8 Ruby String, or a new empty UTF-8 String when Lexbor returns
 * NULL.
 */
static VALUE
mkr_doc_title(VALUE self)
{
    lxb_html_document_t *html_doc = (lxb_html_document_t *)mkr_html_doc_unwrap(self);
    size_t title_len = 0;
    const lxb_char_t *title = lxb_html_document_title(html_doc, &title_len);

    if (title == NULL) {
        return rb_utf8_str_new("", 0);
    }
    return rb_utf8_str_new((const char *)title, title_len);
}
|
#title=(text) ⇒ String
Set the document title, creating <title> (in <head>) if absent.
57 58 59 60 |
# File 'lib/makiri/html/document.rb', line 57
# Assign +text+ as the content of the node returned by
# ensure_in_head("title", "title"), then return +text+.
def title=(text)
  title_node = ensure_in_head("title", "title")
  title_node.content = text
  text
end