Class: Makiri::Document

Inherits:
Node
  • Object
show all
Defined in:
lib/makiri/document.rb,
ext/makiri/makiri.c

Overview

Root container for a parsed HTML document.

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Node

#<<, #==, #[], #[]=, #add_child, #add_class, #add_next_sibling, #add_previous_sibling, #after, #ancestors, #append_class, #at, #at_css, #at_xpath, #attribute, #attribute?, #attribute_nodes, #attributes, #before, #blank?, #child, #children, #classes, #comment?, #content, #content=, #css, #delete, #document, #document?, #document_fragment?, #element?, #element_children, #elements, #eql?, #first_element_child, #hash, #inner_html, #inner_html=, #inner_text, #inspect, #key?, #keys, #last_element_child, #line, #matches?, #name, #name=, #next, #next_element, #next_sibling, #node_type, #outer_html, #outer_html=, #parent, #parse, #path, #previous, #previous_element, #previous_sibling, #processing_instruction?, #remove, #remove_class, #replace, #search, #set_attribute, #text, #text?, #to_h, #to_html, #to_s, #traverse, #unlink, #value, #values, #xpath

Class Method Details

._parse(source) ⇒ Document

Native entry point. Ruby-level Document.parse coerces source to a String (and reads IO) before calling this. Source locations for Node#line are always tracked.

Returns:



384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
# File 'ext/makiri/glue/ruby_doc.c', line 384

/*
 * Native implementation behind Document._parse.
 *
 * Coerces rb_source with StringValue, allocates the Document wrapper, copies
 * the source bytes into a C-owned buffer, runs mkr_parse_nogvl on that copy
 * with the GVL released, frees the copy, and stores the parse result in the
 * wrapper.
 *
 * Raises mkr_eError when the source cannot be copied or when the parse
 * produces no result (args.result == NULL).
 *
 * Returns the new wrapper object of class `klass`.
 */
static VALUE
mkr_doc_s_parse(VALUE klass, VALUE rb_source)
{
    /* Raises TypeError unless rb_source is (or converts to) a String. */
    StringValue(rb_source);

    /* Allocate the wrapper first (with parsed == NULL) so that if parsing
     * fails the GC-managed object frees cleanly. */
    mkr_doc_data_t *d;
    VALUE obj = TypedData_Make_Struct(klass, mkr_doc_data_t, &mkr_doc_type, d);
    d->parsed = NULL;
    d->errors = rb_ary_new();

    /* Copy the source into a C buffer so the parse can run with the GVL
     * released without racing GC/compaction on the Ruby String's backing
     * store. The source is not retained past the parse (Lexbor copies what it
     * needs into the arena and the line table is built up front), so the
     * buffer is freed immediately after. */
    mkr_owned_bytes_t source = {0};
    if (mkr_ruby_copy_bytes(rb_source, &source) != 0) {
        /* NOTE(review): assumes a failed copy leaves nothing in `source`
         * that needs freeing -- confirm against mkr_ruby_copy_bytes. */
        rb_raise(mkr_eError, "out of memory copying source");
    }
    /* Keep rb_source referenced until the copy above has completed. */
    RB_GC_GUARD(rb_source);

    /* The third initializer is presumably the result slot read back below
     * as args.result. No unblock function is passed, so the parse is not
     * interruptible while it runs without the GVL. */
    mkr_parse_nogvl_t args = { (const lxb_char_t *)source.ptr, source.len, NULL };
    rb_thread_call_without_gvl(mkr_parse_nogvl, &args, NULL, NULL);
    /* Release the C copy before any raise below so it cannot leak. */
    mkr_owned_bytes_clear(&source);

    d->parsed = args.result;
    if (d->parsed == NULL) {
        rb_raise(mkr_eError, "failed to parse HTML document");
    }

    return obj;
}

.parse(source) ⇒ Makiri::Document

Parse source as HTML5 and return a Document.

source may be a String or any object responding to #read (e.g. an IO). The native parser (._parse) expects UTF-8 bytes. Source locations for Node#line are always tracked (the cost is negligible).

Parameters:

  • source (String, #read)

Returns:



14
15
16
17
# File 'lib/makiri/document.rb', line 14

# Parse +source+ as HTML and return the resulting Document.
#
# Objects that respond to +#read+ (IO-like) are read first; whatever comes
# out is coerced with Kernel#String before being handed to the native
# +_parse+ entry point.
#
# @param source [String, #read] markup, or a readable object yielding markup
# @return [Makiri::Document]
def self.parse(source)
  markup = source.respond_to?(:read) ? source.read : source
  _parse(String(markup))
end

Instance Method Details

#body ⇒ Makiri::Element?

The document’s <body> element, or nil.

Returns:



21
22
23
# File 'lib/makiri/document.rb', line 21

# Look up the document's <body> element.
#
# @return [Makiri::Element, nil] the first node matching the +body+
#   selector, or whatever +at_css+ yields when nothing matches
def body
  at_css('body')
end

#create_comment(rb_text) ⇒ Object



397
398
399
400
401
402
403
404
405
406
407
408
409
# File 'ext/makiri/glue/ruby_mutate.c', line 397

/*
 * Document#create_comment(text): build a comment node owned by this document.
 *
 * rb_text goes through mkr_ruby_verified_text (presumably validating and
 * coercing it, labelled "comment content" for diagnostics -- confirm against
 * the helper). The returned view's bytes are passed to Lexbor; the Ruby
 * object backing the view is GC-guarded until that call has returned.
 *
 * Raises mkr_eError when Lexbor returns NULL. Returns the wrapped node.
 */
static VALUE
mkr_doc_create_comment(VALUE self, VALUE rb_text)
{
    lxb_dom_document_t *document = mkr_doc_unwrap(self);
    mkr_ruby_borrowed_text_t content =
        mkr_ruby_verified_text(rb_text, "comment content");
    const lxb_char_t *bytes = (const lxb_char_t *)content.ptr;

    lxb_dom_comment_t *comment =
        lxb_dom_document_create_comment(document, bytes, content.len);
    /* The pointer must stay valid across the call above. */
    RB_GC_GUARD(content.value);

    if (comment == NULL) {
        rb_raise(mkr_eError, "failed to create comment");
    }
    return mkr_wrap_node(lxb_dom_interface_node(comment), self);
}

#create_element(rb_name) ⇒ Object




369
370
371
372
373
374
375
376
377
378
379
380
381
# File 'ext/makiri/glue/ruby_mutate.c', line 369

/*
 * Document#create_element(name): build an element node owned by this
 * document.
 *
 * rb_name goes through mkr_ruby_verified_text (presumably validating and
 * coercing it, labelled "element name" for diagnostics -- confirm against
 * the helper). The returned view's bytes are passed to Lexbor with a NULL
 * final argument; the Ruby object backing the view is GC-guarded until that
 * call has returned.
 *
 * Raises mkr_eError when Lexbor returns NULL. Returns the wrapped node.
 */
static VALUE
mkr_doc_create_element(VALUE self, VALUE rb_name)
{
    lxb_dom_document_t *document = mkr_doc_unwrap(self);
    mkr_ruby_borrowed_text_t tag_name =
        mkr_ruby_verified_text(rb_name, "element name");
    const lxb_char_t *bytes = (const lxb_char_t *)tag_name.ptr;

    lxb_dom_element_t *element =
        lxb_dom_document_create_element(document, bytes, tag_name.len, NULL);
    /* The pointer must stay valid across the call above. */
    RB_GC_GUARD(tag_name.value);

    if (element == NULL) {
        rb_raise(mkr_eError, "failed to create element");
    }
    return mkr_wrap_node(lxb_dom_interface_node(element), self);
}

#create_text_node(rb_text) ⇒ Object



383
384
385
386
387
388
389
390
391
392
393
394
395
# File 'ext/makiri/glue/ruby_mutate.c', line 383

/*
 * Document#create_text_node(text): build a text node owned by this document.
 *
 * rb_text goes through mkr_ruby_verified_text (presumably validating and
 * coercing it, labelled "text content" for diagnostics -- confirm against
 * the helper). The returned view's bytes are passed to Lexbor; the Ruby
 * object backing the view is GC-guarded until that call has returned.
 *
 * Raises mkr_eError when Lexbor returns NULL. Returns the wrapped node.
 */
static VALUE
mkr_doc_create_text_node(VALUE self, VALUE rb_text)
{
    lxb_dom_document_t *document = mkr_doc_unwrap(self);
    mkr_ruby_borrowed_text_t content =
        mkr_ruby_verified_text(rb_text, "text content");
    const lxb_char_t *bytes = (const lxb_char_t *)content.ptr;

    lxb_dom_text_t *text_node =
        lxb_dom_document_create_text_node(document, bytes, content.len);
    /* The pointer must stay valid across the call above. */
    RB_GC_GUARD(content.value);

    if (text_node == NULL) {
        rb_raise(mkr_eError, "failed to create text node");
    }
    return mkr_wrap_node(lxb_dom_interface_node(text_node), self);
}

#encoding ⇒ String

Makiri parses and stores everything as UTF-8 (callers decode bytes before parsing), so the in-memory encoding is always UTF-8.

Returns:

  • (String)


47
48
49
# File 'lib/makiri/document.rb', line 47

# Name of the document's in-memory encoding.
#
# This is a constant: the method always answers "UTF-8" regardless of the
# markup that was parsed.
#
# @return [String]
def encoding
  'UTF-8'
end

#errors ⇒ Object

Parse warnings. Reserved; currently always empty.



467
468
469
470
471
472
473
# File 'ext/makiri/glue/ruby_doc.c', line 467

/*
 * Document#errors: return the value held in the `errors` field of this
 * document's typed-data struct.
 */
static VALUE
mkr_doc_errors(VALUE self)
{
    mkr_doc_data_t *data;

    TypedData_Get_Struct(self, mkr_doc_data_t, &mkr_doc_type, data);
    return data->errors;
}

#fragment(*args) ⇒ Object

document.fragment(html, context: …) -> DocumentFragment bound to this document. context defaults to <body>; see mkr_resolve_fragment_context.



302
303
304
305
306
307
308
309
310
311
312
313
# File 'ext/makiri/glue/ruby_doc.c', line 302

/*
 * Document#fragment(html, context: ...): build a fragment bound to this
 * document.
 *
 * Takes one required positional argument plus optional keywords. The
 * `context` keyword (nil when no keywords are given or the key is absent) is
 * resolved to a tag/namespace pair by mkr_resolve_fragment_context, and the
 * fragment itself is produced by mkr_build_fragment_ctx.
 */
static VALUE
mkr_doc_fragment(int argc, VALUE *argv, VALUE self)
{
    VALUE markup, kwargs;
    VALUE context = Qnil;
    lxb_tag_id_t ctx_tag;
    lxb_ns_id_t  ctx_ns;

    rb_scan_args(argc, argv, "1:", &markup, &kwargs);
    if (!NIL_P(kwargs)) {
        context = rb_hash_aref(kwargs, ID2SYM(rb_intern("context")));
    }

    mkr_resolve_fragment_context(mkr_doc_unwrap(self), context, &ctx_tag, &ctx_ns);
    return mkr_build_fragment_ctx(self, markup, ctx_tag, ctx_ns);
}

#head ⇒ Makiri::Element?

The document’s <head> element, or nil.

Returns:



27
28
29
# File 'lib/makiri/document.rb', line 27

# Look up the document's <head> element.
#
# @return [Makiri::Element, nil] the first node matching the +head+
#   selector, or whatever +at_css+ yields when nothing matches
def head
  at_css('head')
end

#internal_subset ⇒ Object

The document’s DocumentType node (`<!DOCTYPE …>`), or nil if absent. Mirrors Nokogiri’s Document#internal_subset. The doctype is a child of the document node (typically first), so a short scan of the children finds it.



445
446
447
448
449
450
451
452
453
454
455
# File 'ext/makiri/glue/ruby_doc.c', line 445

/*
 * Document#internal_subset: return the document's DocumentType node.
 *
 * Walks the direct children of the document node in order and wraps the
 * first one whose type is LXB_DOM_NODE_TYPE_DOCUMENT_TYPE. Returns nil when
 * no child has that type.
 */
static VALUE
mkr_doc_internal_subset(VALUE self)
{
    lxb_dom_node_t *child =
        ((lxb_dom_node_t *)mkr_doc_unwrap(self))->first_child;

    while (child != NULL) {
        if (child->type == LXB_DOM_NODE_TYPE_DOCUMENT_TYPE) {
            return mkr_wrap_node(child, self);
        }
        child = child->next;
    }
    return Qnil;
}

#meta_encoding ⇒ String?

The charset declared in the document’s markup, or nil. Reads <meta charset> first, then <meta http-equiv=“Content-Type”>.

Returns:

  • (String, nil)


54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/makiri/document.rb', line 54

# The charset declared in the document's markup.
#
# Looks for a <meta charset="..."> element first and returns its +charset+
# attribute. Otherwise scans every <meta> whose +http-equiv+ attribute is
# "Content-Type" (case-insensitive) and extracts the +charset=+ parameter
# from its +content+ attribute. The parameter value may be bare,
# double-quoted or single-quoted; surrounding quotes are never part of the
# returned name.
#
# @return [String, nil] the declared charset, or nil when none is declared
def meta_encoding
  if (m = at_css("meta[charset]"))
    return m["charset"]
  end

  css("meta").each do |meta|
    http_equiv = meta["http-equiv"]
    next unless http_equiv&.downcase == "content-type"

    # Accept either quote character around the value, so that
    # charset='utf-8' yields "utf-8" rather than "'utf-8'".
    charset = meta["content"].to_s[/charset\s*=\s*["']?([^\s;"']+)/i, 1]
    return charset if charset
  end
  nil
end

#meta_encoding=(value) ⇒ String

Set (or insert) a <meta charset> declaration.

Parameters:

  • value (String)

Returns:

  • (String)


72
73
74
75
76
77
78
79
80
# File 'lib/makiri/document.rb', line 72

# Declare the document's charset via a <meta charset> element.
#
# Reuses the first existing <meta charset> element; when there is none, a
# new <meta> element is created and appended to <head> (or to the root
# element when the document has no <head>).
#
# @param value [String] the charset name to store in the +charset+ attribute
# @return [String] +value+, unchanged
def meta_encoding=(value)
  declaration = at_css("meta[charset]")
  declaration ||= Element.new("meta", self).tap do |created|
    (head || root).add_child(created)
  end
  declaration["charset"] = value
  value
end

#quirks_mode ⇒ Object

The document’s quirks mode as an Integer matching Lexbor’s lxb_dom_document_cmode_t (and Gumbo/Nokogiri): 0 = no-quirks, 1 = quirks, 2 = limited-quirks. Set by the parser from the doctype.



460
461
462
463
464
# File 'ext/makiri/glue/ruby_doc.c', line 460

/*
 * Document#quirks_mode: return the document's `compat_mode` field as a Ruby
 * Integer.
 */
static VALUE
mkr_doc_quirks_mode(VALUE self)
{
    lxb_dom_document_t *document = mkr_doc_unwrap(self);
    int mode = (int)document->compat_mode;

    return INT2NUM(mode);
}

#root ⇒ Object

Get the root element (<html>) of the document, or nil.



424
425
426
427
428
429
# File 'ext/makiri/glue/ruby_doc.c', line 424

/*
 * Document#root: wrap whatever node lxb_dom_document_root reports for this
 * document.
 *
 * NOTE(review): the nil result for a document without a root presumably
 * comes from mkr_wrap_node mapping NULL to nil -- confirm against that
 * helper.
 */
static VALUE
mkr_doc_root(VALUE self)
{
    return mkr_wrap_node(lxb_dom_document_root(mkr_doc_unwrap(self)), self);
}

#title ⇒ Object

Get the document <title>, or “” if absent.



432
433
434
435
436
437
438
439
440
# File 'ext/makiri/glue/ruby_doc.c', line 432

/*
 * Document#title: return the document title as a UTF-8 Ruby String.
 *
 * Asks lxb_html_document_title for the title bytes and their length. A NULL
 * result yields an empty String; otherwise the bytes are copied into a new
 * String.
 */
static VALUE
mkr_doc_title(VALUE self)
{
    size_t title_len = 0;
    lxb_html_document_t *html_doc = (lxb_html_document_t *)mkr_doc_unwrap(self);
    const lxb_char_t *title = lxb_html_document_title(html_doc, &title_len);

    if (title == NULL) {
        return rb_utf8_str_new("", 0);
    }
    return rb_utf8_str_new((const char *)title, title_len);
}

#title=(text) ⇒ String

Set the document title, creating <title> (in <head>) if absent.

Parameters:

  • text (String)

Returns:

  • (String)


34
35
36
37
38
39
40
41
42
# File 'lib/makiri/document.rb', line 34

# Replace the document title.
#
# Reuses the first existing <title> element; when there is none, a new
# <title> element is created and appended to <head> (or to the root element
# when the document has no <head>). The element's content is then set to
# +text+.
#
# @param text [String] the new title
# @return [String] +text+, unchanged
def title=(text)
  title_node = at_css("title")
  title_node ||= Element.new("title", self).tap do |created|
    (head || root).add_child(created)
  end
  title_node.content = text
  text
end