Class: Makiri::HTML::Document

Inherits:
Document show all
Defined in:
lib/makiri/html/document.rb,
ext/makiri/makiri.c

Overview

Root container for a parsed HTML document. Construction, serialization and the HTML-only conveniences (body/head/title/encoding) live here, not on the abstract Makiri::Document.

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Document

coerce!

Methods inherited from Node

#add_class, #append_class, #at, #attribute, #attribute?, #attributes, #blank?, #cdata?, #classes, #comment?, #document?, #document_fragment?, #each, #element?, #inspect, #path, #processing_instruction?, #remove_class, #search, #set_attribute, #text?, #to_h, #traverse

Class Method Details

._parse(source) ⇒ Document

Native entry point. Ruby-level Document.parse coerces source to a String (and reads IO) before calling this. Source locations for Node#line are always tracked.

Returns:



507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
# File 'ext/makiri/glue/ruby_doc.c', line 507

/*
 * Makiri::HTML::Document._parse(source) - native HTML parse entry point.
 *
 * klass     - class the returned wrapper object is instantiated from.
 * rb_source - source markup; coerced with StringValue, then passed through
 *             mkr_ruby_to_utf8.
 *
 * Returns the new wrapper object, with d->parsed set to the parse result and
 * d->errors set to a fresh empty Array.
 * Raises mkr_eError if the source bytes cannot be copied or if the parse
 * yields a NULL result.
 *
 * NOTE(review): `source` is released only by the mkr_owned_bytes_clear call
 * after the parse. If TypedData_Make_Struct or rb_ary_new raised, that call
 * would be skipped - presumably leaking the copy; confirm whether it matters.
 */
static VALUE
mkr_doc_s_parse(VALUE klass, VALUE rb_source)
{
    StringValue(rb_source);
    /* Honour the input's encoding: UTF-8/US-ASCII/binary pass through (no
     * degradation), anything else is transcoded to UTF-8 so its content is
     * preserved rather than read as raw UTF-8 bytes. */
    rb_source = mkr_ruby_to_utf8(rb_source);

    /* Copy the source into a C buffer up front - BEFORE allocating the wrapper
     * (a Ruby allocation, and thus a GC point) - so no GC can run between
     * obtaining rb_source (possibly a fresh transcoded String) and copying its
     * bytes, and the parse can then run with the GVL released without racing
     * GC/compaction on the Ruby String's backing store. The source is not
     * retained past the parse (Lexbor copies what it needs into the arena and
     * the line table is built up front), so the buffer is freed immediately
     * after. The coderange is read first (no scan): a source Ruby already knows
     * is valid UTF-8 lets the parse skip its sanitisation scan. */
    bool assume_valid = mkr_ruby_str_known_valid_utf8(rb_source);
    mkr_owned_bytes_t source = {0};
    if (mkr_ruby_copy_bytes(rb_source, &source) != 0) {
        rb_raise(mkr_eError, "out of memory copying source");
    }

    /* Allocate the wrapper (with parsed == NULL) so that if parsing fails the
     * GC-managed object frees cleanly. This is the HTML parse entry (defined on
     * Makiri::HTML::Document), so the result is always HTML. */
    mkr_doc_data_t *d;
    VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t, &mkr_html_doc_type, d);
    d->parsed = NULL;
    d->errors = rb_ary_new();

    /* The last initializer (NULL) is the slot read back below as args.result. */
    mkr_parse_nogvl_t args = { (const lxb_char_t *)source.ptr, source.len,
                               assume_valid, NULL };
    /* No unblock function is supplied (trailing NULL, NULL). */
    rb_thread_call_without_gvl(mkr_parse_nogvl, &args, NULL, NULL);
    mkr_owned_bytes_clear(&source);

    /* Publish the result on the wrapper before checking it, so the wrapper's
     * state always reflects what the parse produced. */
    d->parsed = args.result;
    if (d->parsed == NULL) {
        rb_raise(mkr_eError, "failed to parse HTML document");
    }

    return obj;
}

.parse(source) ⇒ Makiri::HTML::Document

Parse source as HTML5 and return a Makiri::HTML::Document.

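source may be a String or any object responding to #read (e.g. an IO). The native parser (._parse) works on UTF-8: UTF-8/US-ASCII/binary Strings pass through unchanged, and a String in any other encoding is transcoded to UTF-8 first. Source locations for Node#line are always tracked (the cost is negligible).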

Parameters:

  • source (String, #read)

Returns:



17
18
19
20
# File 'lib/makiri/html/document.rb', line 17

# Parses +source+ and hands the resulting String to the native +_parse+.
#
# @param source [String, #read] markup, or a reader whose #read result is the markup
# @return [Makiri::HTML::Document] whatever +_parse+ returns
def self.parse(source)
  # Readers (anything answering #read) are drained first; everything else is
  # used as given. Either way the value is coerced with Kernel#String.
  markup = source.respond_to?(:read) ? source.read : source
  _parse(String(markup))
end

Instance Method Details

#body ⇒ Makiri::Element?

The document’s <body> element, or nil.

Returns:



44
45
46
# File 'lib/makiri/html/document.rb', line 44

# Looks up the document's <body> element with the "body" CSS selector.
#
# @return [Makiri::Element, nil] the result of at_css("body")
def body
  at_css("body")
end

#clone(freeze: nil) ⇒ Object

Like #dup: an independent copy of the document, honouring Ruby’s freeze: keyword (a frozen document’s nodes raise FrozenError on mutation).



36
37
38
39
40
# File 'lib/makiri/html/document.rb', line 36

# Builds an independent copy by serialising with +to_html+ and re-parsing.
#
# @param freeze [Boolean, nil] truthy freezes the copy, false leaves it
#   unfrozen, nil (the default) carries over the receiver's frozen state
# @return [Object] the re-parsed copy
def clone(freeze: nil)
  copy = Makiri.parse(to_html)
  # Inside this method +freeze+ is the keyword argument, not Object#freeze.
  wants_frozen = freeze.nil? ? frozen? : freeze
  copy.freeze if wants_frozen
  copy
end

#create_comment(rb_text) ⇒ Object



597
598
599
600
601
602
603
604
605
606
607
608
609
# File 'ext/makiri/glue/ruby_html_mutate.c', line 597

/*
 * Document#create_comment(text): asks Lexbor for a Comment node in this
 * document built from `text`, and returns it wrapped against `self`.
 *
 * Raises mkr_eError when Lexbor returns NULL.
 * NOTE(review): mkr_ruby_verified_text presumably validates/coerces the
 * argument (the "comment content" label looks like it feeds its error
 * message) - confirm against its definition.
 */
static VALUE
mkr_doc_create_comment(VALUE self, VALUE rb_text)
{
    lxb_dom_document_t *owner = mkr_html_doc_unwrap(self);
    mkr_ruby_borrowed_text_t content = mkr_ruby_verified_text(rb_text, "comment content");
    const lxb_char_t *bytes = (const lxb_char_t *)content.ptr;

    lxb_dom_comment_t *comment = lxb_dom_document_create_comment(owner, bytes, content.len);
    /* Keep content.value alive until after the call that reads content.ptr. */
    RB_GC_GUARD(content.value);

    if (comment == NULL) {
        rb_raise(mkr_eError, "failed to create comment");
    }
    return mkr_wrap_html_node(lxb_dom_interface_node(comment), self);
}

#create_document_fragment ⇒ Object

Document#create_document_fragment - DOM createDocumentFragment: an empty DocumentFragment owned by this document (unlike #fragment / DocumentFragment.parse, which parse HTML; this makes an empty one to build up programmatically).



633
634
635
636
637
638
639
640
641
642
# File 'ext/makiri/glue/ruby_html_mutate.c', line 633

/*
 * Document#create_document_fragment: asks Lexbor for a new DocumentFragment
 * in this document and returns it wrapped against `self`.
 *
 * Raises mkr_eError when Lexbor returns NULL.
 */
static VALUE
mkr_doc_create_document_fragment(VALUE self)
{
    lxb_dom_document_fragment_t *fragment =
        lxb_dom_document_create_document_fragment(mkr_html_doc_unwrap(self));

    if (fragment == NULL) {
        rb_raise(mkr_eError, "failed to create document fragment");
    }
    return mkr_wrap_html_node(lxb_dom_interface_node(fragment), self);
}

#create_element(rb_name) ⇒ Object




569
570
571
572
573
574
575
576
577
578
579
580
581
# File 'ext/makiri/glue/ruby_html_mutate.c', line 569

/*
 * Document#create_element(name): asks Lexbor for an Element named `name` in
 * this document and returns it wrapped against `self`.
 *
 * Raises mkr_eError when Lexbor returns NULL.
 * NOTE(review): mkr_ruby_verified_text presumably validates/coerces the
 * argument (the "element name" label looks like it feeds its error message)
 * - confirm against its definition.
 */
static VALUE
mkr_doc_create_element(VALUE self, VALUE rb_name)
{
    lxb_dom_document_t *owner = mkr_html_doc_unwrap(self);
    mkr_ruby_borrowed_text_t name = mkr_ruby_verified_text(rb_name, "element name");
    const lxb_char_t *bytes = (const lxb_char_t *)name.ptr;

    /* The trailing NULL is Lexbor's final create_element argument, left unset. */
    lxb_dom_element_t *element =
        lxb_dom_document_create_element(owner, bytes, name.len, NULL);
    /* Keep name.value alive until after the call that reads name.ptr. */
    RB_GC_GUARD(name.value);

    if (element == NULL) {
        rb_raise(mkr_eError, "failed to create element");
    }
    return mkr_wrap_html_node(lxb_dom_interface_node(element), self);
}

#create_processing_instruction(rb_target, rb_data) ⇒ Object

Document#create_processing_instruction(target, data) - DOM createProcessingInstruction: a detached ProcessingInstruction owned by this document. Lexbor validates the target, so an invalid one fails closed.



614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
# File 'ext/makiri/glue/ruby_html_mutate.c', line 614

/*
 * Document#create_processing_instruction(target, data): asks Lexbor for a
 * ProcessingInstruction in this document and returns it wrapped against
 * `self`.
 *
 * Raises mkr_eError when Lexbor returns NULL.
 * NOTE(review): mkr_ruby_verified_text presumably validates/coerces each
 * argument (the string labels look like they feed its error message) -
 * confirm against its definition.
 */
static VALUE
mkr_doc_create_processing_instruction(VALUE self, VALUE rb_target, VALUE rb_data)
{
    lxb_dom_document_t *owner = mkr_html_doc_unwrap(self);
    mkr_ruby_borrowed_text_t target = mkr_ruby_verified_text(rb_target, "processing instruction target");
    mkr_ruby_borrowed_text_t data = mkr_ruby_verified_text(rb_data, "processing instruction data");
    const lxb_char_t *target_bytes = (const lxb_char_t *)target.ptr;
    const lxb_char_t *data_bytes = (const lxb_char_t *)data.ptr;

    lxb_dom_processing_instruction_t *node =
        lxb_dom_document_create_processing_instruction(owner, target_bytes, target.len,
                                                       data_bytes, data.len);
    /* Keep both Ruby values alive until after the call that reads their pointers. */
    RB_GC_GUARD(target.value);
    RB_GC_GUARD(data.value);

    if (node == NULL) {
        rb_raise(mkr_eError, "failed to create processing instruction");
    }
    return mkr_wrap_html_node(lxb_dom_interface_node(node), self);
}

#create_text_node(rb_text) ⇒ Object



583
584
585
586
587
588
589
590
591
592
593
594
595
# File 'ext/makiri/glue/ruby_html_mutate.c', line 583

/*
 * Document#create_text_node(text): asks Lexbor for a Text node in this
 * document built from `text`, and returns it wrapped against `self`.
 *
 * Raises mkr_eError when Lexbor returns NULL.
 * NOTE(review): mkr_ruby_verified_text presumably validates/coerces the
 * argument (the "text content" label looks like it feeds its error message)
 * - confirm against its definition.
 */
static VALUE
mkr_doc_create_text_node(VALUE self, VALUE rb_text)
{
    lxb_dom_document_t *owner = mkr_html_doc_unwrap(self);
    mkr_ruby_borrowed_text_t content = mkr_ruby_verified_text(rb_text, "text content");
    const lxb_char_t *bytes = (const lxb_char_t *)content.ptr;

    lxb_dom_text_t *text_node = lxb_dom_document_create_text_node(owner, bytes, content.len);
    /* Keep content.value alive until after the call that reads content.ptr. */
    RB_GC_GUARD(content.value);

    if (text_node == NULL) {
        rb_raise(mkr_eError, "failed to create text node");
    }
    return mkr_wrap_html_node(lxb_dom_interface_node(text_node), self);
}

#dup ⇒ Object

An independent copy of the whole document (like Nokogiri’s Document#dup). Built by serialising and re-parsing, so the copy shares no nodes with the original - Node#dup’s clone_node delegation is wrong for a document node, hence this override. (A DOM mutated into a shape the HTML parser would not itself produce, e.g. a foster-parented table cell, may be re-normalised on re-parse; a freshly parsed document round-trips unchanged.) Any level / freeze argument is ignored.



29
30
31
# File 'lib/makiri/html/document.rb', line 29

# Builds an independent copy by serialising with +to_html+ and re-parsing the
# result through +Makiri.parse+. Any positional arguments are accepted and
# discarded.
#
# @return [Object] the re-parsed copy
def dup(*)
  serialized = to_html
  Makiri.parse(serialized)
end

#encoding ⇒ String

Makiri parses and stores everything as UTF-8 (callers decode bytes before parsing), so the in-memory encoding is always UTF-8.

Returns:

  • (String)


65
66
67
# File 'lib/makiri/html/document.rb', line 65

# Reports the in-memory encoding name. The value is a constant; nothing is
# read from the document's markup.
#
# @return [String] always "UTF-8"
def encoding
  "UTF-8"
end

#errors ⇒ Object

Parse warnings. Reserved; currently always empty.



600
601
602
603
604
605
606
# File 'ext/makiri/glue/ruby_doc.c', line 600

/*
 * Document#errors: returns the `errors` VALUE stored in the wrapper's
 * mkr_doc_data_t. The receiver is type-checked against mkr_doc_type first.
 */
static VALUE
mkr_doc_errors(VALUE self)
{
    /* Same check-and-unwrap the TypedData_Get_Struct macro performs. */
    mkr_doc_data_t *data = (mkr_doc_data_t *)rb_check_typeddata(self, &mkr_doc_type);
    return data->errors;
}

#fragment(*args) ⇒ Object

document.fragment(html, context: …) -> DocumentFragment bound to this document. context defaults to <body>; see mkr_resolve_fragment_context.



424
425
426
427
428
429
430
431
432
433
434
435
# File 'ext/makiri/glue/ruby_doc.c', line 424

/*
 * Document#fragment(html, context: ...): resolves the fragment context to a
 * tag/namespace pair and builds a fragment from `html` bound to `self`.
 *
 * Takes one required positional argument plus optional keywords; only the
 * :context keyword is read here. When no keywords are given the context is
 * nil and mkr_resolve_fragment_context decides the tag/namespace.
 */
static VALUE
mkr_doc_fragment(int argc, VALUE *argv, VALUE self)
{
    VALUE html, opts;
    rb_scan_args(argc, argv, "1:", &html, &opts);

    VALUE context = Qnil;
    if (!NIL_P(opts)) {
        context = rb_hash_aref(opts, ID2SYM(rb_intern("context")));
    }

    lxb_tag_id_t ctx_tag;
    lxb_ns_id_t  ctx_ns;
    lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
    mkr_resolve_fragment_context(doc, context, &ctx_tag, &ctx_ns);

    return mkr_build_fragment_ctx(self, html, ctx_tag, ctx_ns);
}

#head ⇒ Makiri::Element?

The document’s <head> element, or nil.

Returns:



50
51
52
# File 'lib/makiri/html/document.rb', line 50

# Looks up the document's <head> element with the "head" CSS selector.
#
# @return [Makiri::Element, nil] the result of at_css("head")
def head
  at_css("head")
end

#import_node(*args) ⇒ Object

Document#import_node(node, deep = false): a shallow (or deep, with deep truthy) copy of node owned by THIS document - the DOM importNode, whose `deep` defaults to false (a missing/nil/false argument => shallow). Unlike Node#clone_node, the copy is owned by the receiver rather than the node’s own document, so it is the way to bring a node across documents (Makiri never moves a node between arenas). The source is left untouched; the copy is detached. Same import + <template>-content fixup as clone_node; fails closed on a NULL import.



346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
# File 'ext/makiri/glue/ruby_doc.c', line 346

/*
 * Document#import_node(node, deep = false): returns a copy of `node`, wrapped
 * against `self`. `deep` is taken by Ruby truthiness, so a missing, nil or
 * false second argument means shallow.
 *
 * HTML nodes go through lxb_dom_document_import_node; a NULL result raises
 * mkr_eError, and a deep copy is followed by mkr_fixup_template_content.
 * XML nodes are converted by mkr_cross_xml_to_html, whose status is passed
 * to mkr_xml_mut_check.
 * NOTE(review): mkr_xml_mut_check presumably raises on a failure status -
 * confirm, since the XML branch has no separate NULL check on its result.
 */
static VALUE
mkr_doc_import_node(int argc, VALUE *argv, VALUE self)
{
    VALUE rb_node, rb_deep;
    rb_scan_args(argc, argv, "11", &rb_node, &rb_deep);
    const bool deep = RTEST(rb_deep);
    lxb_dom_document_t *target = mkr_html_doc_unwrap(self);

    if (mkr_node_kind(rb_node) != MKR_NODE_KIND_XML) {
        /* HTML node: the unwrap raises on a non-node. */
        lxb_dom_node_t *original = mkr_html_node_unwrap(rb_node);
        lxb_dom_node_t *copy = lxb_dom_document_import_node(target, original, deep);
        if (copy == NULL) {
            rb_raise(mkr_eError, "failed to import node");
        }
        if (deep) {
            mkr_fixup_template_content(target, original, copy);
        }
        return mkr_wrap_html_node(copy, self);
    }

    /* An XML node is TRANSLATED across representations (mkr -> lxb) by
     * ruby_cross_import.c into a detached lxb subtree owned by this document. */
    lxb_dom_node_t *translated = NULL;
    mkr_xml_mut_check(
        mkr_cross_xml_to_html(target, mkr_xml_node_unwrap(rb_node), deep, &translated));
    return mkr_wrap_html_node(translated, self);
}

#internal_subset ⇒ Object

The document’s DocumentType node (`<!DOCTYPE …>`), or nil if absent. Mirrors Nokogiri’s Document#internal_subset. The doctype is a child of the document node (typically first), so a short scan of the children finds it.



578
579
580
581
582
583
584
585
586
587
588
# File 'ext/makiri/glue/ruby_doc.c', line 578

/*
 * Document#internal_subset: returns the first direct child of the document
 * node whose type is LXB_DOM_NODE_TYPE_DOCUMENT_TYPE, wrapped against `self`,
 * or nil when no child has that type.
 */
static VALUE
mkr_doc_internal_subset(VALUE self)
{
    lxb_dom_node_t *doc_node = (lxb_dom_node_t *)mkr_html_doc_unwrap(self);
    lxb_dom_node_t *child = doc_node->first_child;

    /* Walk the sibling chain until a doctype is found or the children run out. */
    while (child != NULL && child->type != LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
        child = child->next;
    }

    if (child == NULL) {
        return Qnil;
    }
    return mkr_wrap_html_node(child, self);
}

#meta_encoding ⇒ String?

The charset declared in the document’s markup, or nil. Reads <meta charset> first, then <meta http-equiv="Content-Type">.

Returns:

  • (String, nil)


72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/makiri/html/document.rb', line 72

# Returns the charset declared in the document's markup, or nil.
#
# A <meta charset> element wins; otherwise every <meta> whose http-equiv is
# "content-type" (compared case-insensitively) is checked for a
# "charset=..." parameter in its content attribute, and the first match is
# returned.
#
# @return [String, nil] the declared charset, without surrounding quotes
def meta_encoding
  if (m = at_css("meta[charset]"))
    return m["charset"]
  end

  css("meta").each do |meta|
    http_equiv = meta["http-equiv"]
    next unless http_equiv&.downcase == "content-type"

    content = meta["content"].to_s
    # The value may be bare, double-quoted or single-quoted
    # (charset=utf-8, charset="utf-8", charset='utf-8'); strip either quote
    # style so the quotes never end up in the returned name.
    return Regexp.last_match(1) if content =~ /charset\s*=\s*["']?([^\s;"']+)/i
  end
  nil
end

#meta_encoding=(value) ⇒ String

Set (or insert) a <meta charset> declaration.

Parameters:

  • value (String)

Returns:

  • (String)


90
91
92
93
# File 'lib/makiri/html/document.rb', line 90

# Writes +value+ into the charset attribute of the element returned by
# ensure_in_head("meta[charset]", "meta").
#
# @param value [String] the charset name to declare
# @return [String] +value+, unchanged
def meta_encoding=(value)
  charset_meta = ensure_in_head("meta[charset]", "meta")
  charset_meta["charset"] = value
  value
end

#quirks_mode ⇒ Object

The document’s quirks mode as an Integer matching Lexbor’s lxb_dom_document_cmode_t (and Gumbo/Nokogiri): 0 = no-quirks, 1 = quirks, 2 = limited-quirks. Set by the parser from the doctype.



593
594
595
596
597
# File 'ext/makiri/glue/ruby_doc.c', line 593

/*
 * Document#quirks_mode: returns the document's compat_mode field, cast to
 * int, as a Ruby Integer.
 */
static VALUE
mkr_doc_quirks_mode(VALUE self)
{
    lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
    int mode = (int)doc->compat_mode;
    return INT2NUM(mode);
}

#root ⇒ Object

Get the root element (<html>) of the document, or nil.



557
558
559
560
561
562
# File 'ext/makiri/glue/ruby_doc.c', line 557

/*
 * Document#root: wraps whatever lxb_dom_document_root reports for this
 * document and returns it against `self`.
 *
 * NOTE(review): there is no NULL check here, so the nil-when-absent behaviour
 * presumably comes from mkr_wrap_html_node mapping NULL to nil - confirm.
 */
static VALUE
mkr_doc_root(VALUE self)
{
    lxb_dom_document_t *doc = mkr_html_doc_unwrap(self);
    return mkr_wrap_html_node(lxb_dom_document_root(doc), self);
}

#title ⇒ Object

Get the document <title>, or "" if absent.



565
566
567
568
569
570
571
572
573
# File 'ext/makiri/glue/ruby_doc.c', line 565

/*
 * Document#title: returns the bytes reported by lxb_html_document_title as a
 * new UTF-8 Ruby String, or a new empty UTF-8 String when Lexbor returns
 * NULL.
 */
static VALUE
mkr_doc_title(VALUE self)
{
    size_t title_len = 0;
    lxb_html_document_t *html_doc = (lxb_html_document_t *)mkr_html_doc_unwrap(self);
    const lxb_char_t *title = lxb_html_document_title(html_doc, &title_len);

    if (title == NULL) {
        return rb_utf8_str_new("", 0);
    }
    return rb_utf8_str_new((const char *)title, title_len);
}

#title=(text) ⇒ String

Set the document title, creating <title> (in <head>) if absent.

Parameters:

  • text (String)

Returns:

  • (String)


57
58
59
60
# File 'lib/makiri/html/document.rb', line 57

# Assigns +text+ as the content of the element returned by
# ensure_in_head("title", "title").
#
# @param text [String] the new title text
# @return [String] +text+, unchanged
def title=(text)
  title_node = ensure_in_head("title", "title")
  title_node.content = text
  text
end