Class: Makiri::XML::Document

Inherits:
Document show all
Includes:
NodeMethods
Defined in:
lib/makiri/xml/document.rb,
ext/makiri/glue/ruby_xml.c

Overview

XML-specific document conveniences. The XML node leaves and the document itself are defined in C (ext/makiri/glue/ruby_xml*.c); construction sugar that is pure composition over the public surface lives here, not on the abstract Makiri::Document (which carries no construction).

Class Method Summary collapse

Instance Method Summary collapse

Methods included from NodeMethods

#<<, #==, #[], #[]=, #add_child, #add_next_sibling, #add_previous_sibling, #after, #ancestors, #at_css, #attribute_nodes, #before, #canonicalize, #child, #children, #clone_node, #collect_namespaces, #content, #content=, #css, #delete, #document, #element_children, #eql?, #hash, #inner_html, #inner_text, #key?, #last_element_child, #local_name, #matches?, #name, #name=, #namespace, #namespace_definitions, #namespace_uri, #namespaces, #next, #next_sibling, #node_type, #outer_html, #parent, #pointer_id, #prefix, #previous, #previous_sibling, #remove, #remove_attribute, #remove_attribute_ns, #replace, #set_attribute_ns, #text, #to_html, #to_s, #to_xml, #unlink, #value

Methods inherited from Document

coerce!

Methods inherited from Node

#add_class, #append_class, #at, #attribute, #attribute?, #attributes, #blank?, #cdata?, #classes, #clone, #comment?, #document?, #document_fragment?, #dup, #each, #element?, #inspect, #path, #processing_instruction?, #remove_class, #search, #set_attribute, #text?, #to_h, #traverse

Class Method Details

.Makiri::XML::Document.new ⇒ Document

A new, empty XML document (no root element) to build up programmatically with #create_element etc. and #add_child / #root=, like Nokogiri. Any arguments (Nokogiri accepts a version / encoding) are accepted and ignored.

Returns:



558
559
560
561
562
563
# File 'ext/makiri/glue/ruby_xml.c', line 558

/*
 * Makiri::XML::Document.new(*ignored)
 *
 * Hands back whatever mkr_xml_new_empty_document() produces. The argument
 * list and the receiving class are accepted but never consulted.
 */
static VALUE
mkr_xml_document_s_new(int argc, VALUE *argv, VALUE klass)
{
    /* None of the parameters are used; the casts only silence
     * unused-parameter warnings. */
    (void)klass;
    (void)argv;
    (void)argc;

    VALUE empty_doc = mkr_xml_new_empty_document();
    return empty_doc;
}

.Makiri::XML::Document.parse(source, max_bytes: nil) ⇒ Makiri::XML::Document .Makiri::XML(source, max_bytes: nil) ⇒ Makiri::XML::Document

source is a String or any object responding to #read (an IO / File / StringIO); max_bytes overrides the default arena memory ceiling for this parse. Read a non-UTF-8 file in binary mode (File.binread / “rb”) so the encoding is autodetected from its BOM / declaration.

Overloads:



103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'ext/makiri/glue/ruby_xml.c', line 103

/*
 * Makiri::XML::Document.parse(source, **opts) -> document object
 *
 * Steps, in the order they must happen:
 *   1. Scan one required positional argument plus an optional keyword hash
 *      and turn the keywords into parse limits (this may raise).
 *   2. If the source responds to #read, replace it with the result of #read.
 *   3. Decode the source (coerced with rb_String) against the byte budget.
 *   4. Copy the decoded bytes into a C-owned buffer.
 *   5. Allocate and wrap an empty XML handle.
 *   6. Run the parse with the GVL released, then free the copied buffer.
 *   7. On failure raise an error chosen by the parse status; on success
 *      attach the parsed document to the handle and return the wrapper.
 *
 * +self+ is unused.
 */
static VALUE
mkr_xml_s_parse(int argc, VALUE *argv, VALUE self)
{
    (void)self;
    VALUE rb_source, rb_opts;
    rb_scan_args(argc, argv, "1:", &rb_source, &rb_opts);
    mkr_xml_limits_t limits = mkr_xml_parse_limits(rb_opts);  /* validates; may raise */
    /* A zero max_bytes means "not overridden": fall back to the default. */
    size_t budget = limits.max_bytes ? limits.max_bytes : (size_t)MKR_XML_MAX_BYTES;

    /* Read an IO/File-like source (an object responding to #read), like the HTML
     * entry; a String passes straight through. */
    if (rb_respond_to(rb_source, rb_intern("read"))) {
        rb_source = rb_funcall(rb_source, rb_intern("read"), 0);
    }

    /* Strict decode under the GVL: invalid UTF-8 / undecodable byte / NUL all
     * raise Makiri::XML::SyntaxError here (no U+FFFD repair). Passing the budget
     * lets decode reject an over-budget input (LimitExceeded) before its
     * validation copy and the GVL-release copy below - so a hostile oversized
     * document is not materialised twice for a doomed parse. */
    VALUE decoded = mkr_xml_decode_input(rb_String(rb_source), budget);

    /* Copy the decoded bytes into a private C buffer up front - BEFORE allocating
     * any Ruby object (the wrap below) - so there is NO GC point between obtaining
     * +decoded+ and copying it, and the parse can then run with the GVL released
     * without racing GC/compaction on the String's backing store. */
    mkr_owned_bytes_t source = {0};
    if (mkr_ruby_copy_bytes(decoded, &source) != 0) {
        rb_raise(mkr_eError, "out of memory copying XML source");
    }

    /* Build an empty XML handle and wrap it (doc == NULL) so a failure mid-parse
     * frees cleanly via GC (mkr_parsed_destroy -> the XML branch ->
     * mkr_xml_doc_destroy(NULL), a no-op). The source is already copied, so this
     * Ruby allocation cannot disturb it. */
    mkr_parsed_t *parsed = mkr_parsed_new_xml(NULL);
    if (parsed == NULL) {
        mkr_owned_bytes_clear(&source);
        rb_raise(mkr_eError, "out of memory allocating XML document");
    }
    /* NOTE(review): if mkr_wrap_document can itself raise (e.g. on allocation
     * failure), +source+ would not be cleared on that path - confirm whether
     * the wrap is guaranteed not to raise. */
    VALUE obj = mkr_wrap_document(parsed); /* GC owns +parsed+ from here */

    /* Argument/result record for the no-GVL worker: the input bytes and limits
     * go in; +result+ and +status+ are read back after the call. */
    mkr_xml_parse_nogvl_t args = { source.ptr, source.len, limits, NULL, MKR_XML_OK };
    rb_thread_call_without_gvl(mkr_xml_parse_nogvl, &args, NULL, NULL);
    /* The private copy is released here, before any result handling. */
    mkr_owned_bytes_clear(&source);

    if (args.result == NULL) {
        /* rb_raise does not return; the break statements are never reached. */
        switch (args.status) {
        case MKR_XML_ERR_SYNTAX:  rb_raise(mkr_eXmlSyntaxError,   "malformed XML"); break;
        case MKR_XML_ERR_LIMIT:   rb_raise(mkr_eXmlLimitExceeded, "XML document budget exceeded"); break;
        case MKR_XML_ERR_VERSION: rb_raise(mkr_eXmlSyntaxError,
                                           "unsupported XML version (only XML 1.0 is supported)"); break;
        default:                  rb_raise(mkr_eError,            "failed to parse XML document"); break;
        }
    }
    /* Success: attach the parsed document to the already-wrapped handle. */
    mkr_parsed_set_xml_doc(parsed, args.result);
    return obj;
}

Instance Method Details

#at_xpath(*args) ⇒ Object



298
299
300
301
302
303
304
# File 'ext/makiri/glue/ruby_xml.c', line 298

/*
 * Document#at_xpath(expr [, ns])
 *
 * Takes one required argument and one optional argument and forwards them
 * to mkr_xml_doc_xpath_run() with a trailing flag of 1 (the #xpath entry
 * passes 0 for the same flag).
 */
static VALUE
mkr_xml_doc_at_xpath(int argc, VALUE *argv, VALUE self)
{
    VALUE xpath_expr;
    VALUE ns_arg;

    /* "11": exactly one mandatory argument followed by one optional one. */
    rb_scan_args(argc, argv, "11", &xpath_expr, &ns_arg);

    VALUE outcome = mkr_xml_doc_xpath_run(self, xpath_expr, ns_arg, 1);
    return outcome;
}

#create_cdata(t) ⇒ Object



1309
# File 'ext/makiri/glue/ruby_xml_node.c', line 1309

/*
 * Document#create_cdata(t)
 *
 * Delegates to the shared character-data constructor, asking for a
 * CDATA-section node; "CDATA content" is the label handed to that helper.
 */
static VALUE
mkr_xml_doc_create_cdata(VALUE self, VALUE t)
{
    return mkr_xml_doc_create_chardata(
        self,
        t,
        MKR_XML_NODE_TYPE_CDATA_SECTION,
        "CDATA content");
}

#create_cdata_node(t) ⇒ Object



1309
# File 'ext/makiri/glue/ruby_xml_node.c', line 1309

/*
 * Document#create_cdata_node(t) is served by this same C function.
 *
 * Delegates to the shared character-data constructor, asking for a
 * CDATA-section node; "CDATA content" is the label handed to that helper.
 */
static VALUE
mkr_xml_doc_create_cdata(VALUE self, VALUE t)
{
    return mkr_xml_doc_create_chardata(
        self,
        t,
        MKR_XML_NODE_TYPE_CDATA_SECTION,
        "CDATA content");
}

#create_comment(t) ⇒ Object



1308
# File 'ext/makiri/glue/ruby_xml_node.c', line 1308

/*
 * Document#create_comment(t)
 *
 * Delegates to the shared character-data constructor, asking for a comment
 * node; "comment content" is the label handed to that helper.
 */
static VALUE
mkr_xml_doc_create_comment(VALUE self, VALUE t)
{
    return mkr_xml_doc_create_chardata(
        self,
        t,
        MKR_XML_NODE_TYPE_COMMENT,
        "comment content");
}

#create_element(*args) ⇒ Object

create_element(name, content = nil, attributes = {}) -> Element. Nokogiri-style trailing arguments: a Hash sets attributes, any other (non-nil) argument is the element’s text content.



1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
# File 'ext/makiri/glue/ruby_xml_node.c', line 1260

/*
 * Document#create_element(name, *rest)
 *
 * +name+ is required. Each trailing argument is classified as follows:
 *   - a Hash becomes the attribute set,
 *   - nil is skipped,
 *   - anything else becomes the element content.
 * When several arguments of the same kind are given, the last one wins.
 *
 * The element is created first, then its content is set (if any), then it
 * is wrapped, and finally each attribute pair is applied through
 * mkr_xml_create_attr_i with the wrapped element as the callback argument.
 * Each mutation status is passed to mkr_xml_mut_check (presumably raising on
 * a non-OK status - confirm against its definition).
 */
static VALUE
mkr_xml_doc_create_element(int argc, VALUE *argv, VALUE self)
{
    VALUE name_arg;
    VALUE trailing;
    rb_scan_args(argc, argv, "1*", &name_arg, &trailing);

    /* Sort the trailing arguments into "attributes" and "content". */
    VALUE content_arg = Qnil;
    VALUE attrs_arg = Qnil;
    const long trailing_len = RARRAY_LEN(trailing);
    for (long idx = 0; idx < trailing_len; idx++) {
        VALUE candidate = RARRAY_AREF(trailing, idx);
        if (NIL_P(candidate)) {
            continue;
        }
        if (RB_TYPE_P(candidate, T_HASH)) {
            attrs_arg = candidate;
        } else {
            content_arg = candidate;
        }
    }

    mkr_xml_doc_t *doc = mkr_xml_node_xdoc(self);

    /* Create the element itself. The guard keeps the Ruby string backing
     * the borrowed name pointer alive until the native call has returned. */
    mkr_ruby_borrowed_text_t name_text = mkr_ruby_verified_text(name_arg, "element name");
    mkr_xml_node_t *element = NULL;
    mkr_xml_mut_status_t status =
        mkr_xml_new_element(doc, name_text.ptr, mkr_xml_u32_len(name_text.len), &element);
    RB_GC_GUARD(name_text.value);
    mkr_xml_mut_check(status);

    /* Optional content, using the same borrow-then-guard sequence. */
    if (!NIL_P(content_arg)) {
        mkr_ruby_borrowed_text_t content_text =
            mkr_ruby_verified_text(content_arg, "element content");
        status = mkr_xml_set_content(
            doc, element, content_text.ptr, mkr_xml_u32_len(content_text.len));
        RB_GC_GUARD(content_text.value);
        mkr_xml_mut_check(status);
    }

    /* Wrap before applying attributes: the iterator receives the wrapper. */
    VALUE wrapped = mkr_wrap_xml_node(element, self);
    if (!NIL_P(attrs_arg)) {
        rb_hash_foreach(attrs_arg, mkr_xml_create_attr_i, wrapped);
    }
    return wrapped;
}

#create_processing_instruction(rb_target, rb_data) ⇒ Object



1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
# File 'ext/makiri/glue/ruby_xml_node.c', line 1311

/*
 * Document#create_processing_instruction(target, data)
 *
 * Verifies both arguments as text, asks mkr_xml_new_pi() to build a
 * processing-instruction node from them, passes the resulting status to
 * mkr_xml_mut_check(), and returns the node wrapped against this document.
 */
static VALUE
mkr_xml_doc_create_pi(VALUE self, VALUE rb_target, VALUE rb_data)
{
    mkr_xml_doc_t *doc = mkr_xml_node_xdoc(self);

    mkr_ruby_borrowed_text_t target_text = mkr_ruby_verified_text(rb_target, "PI target");
    mkr_ruby_borrowed_text_t data_text = mkr_ruby_verified_text(rb_data, "PI data");

    mkr_xml_node_t *pi_node = NULL;
    mkr_xml_mut_status_t status = mkr_xml_new_pi(doc,
                                                 target_text.ptr,
                                                 mkr_xml_u32_len(target_text.len),
                                                 data_text.ptr,
                                                 mkr_xml_u32_len(data_text.len),
                                                 &pi_node);

    /* Keep both Ruby strings alive until the native call above has finished
     * reading through the borrowed pointers. */
    RB_GC_GUARD(target_text.value);
    RB_GC_GUARD(data_text.value);

    mkr_xml_mut_check(status);
    return mkr_wrap_xml_node(pi_node, self);
}

#create_text_node(t) ⇒ Object



1307
# File 'ext/makiri/glue/ruby_xml_node.c', line 1307

/*
 * Document#create_text_node(t)
 *
 * Delegates to the shared character-data constructor, asking for a text
 * node; "text content" is the label handed to that helper.
 */
static VALUE
mkr_xml_doc_create_text_node(VALUE self, VALUE t)
{
    return mkr_xml_doc_create_chardata(
        self,
        t,
        MKR_XML_NODE_TYPE_TEXT,
        "text content");
}

#fragment(source) ⇒ DocumentFragment

Parse source into a fragment bound to this document, resolving names against the document’s in-scope (root) namespaces, so the fragment’s nodes can be spliced in with Node#add_child and friends.

Returns:



585
586
587
588
589
590
591
592
593
594
595
# File 'ext/makiri/glue/ruby_xml.c', line 585

/*
 * Document#fragment(source)
 *
 * Looks up the native XML document behind +self+; raises Makiri's base
 * error if there is none. Otherwise hands +rb_source+ to
 * mkr_xml_fragment_into() (with a trailing flag of 1) and returns the
 * resulting node wrapped against this document.
 */
static VALUE
mkr_xml_doc_fragment(VALUE self, VALUE rb_source)
{
    mkr_xml_doc_t *doc = mkr_parsed_xml_doc(mkr_doc_parsed(self));
    if (!doc) {
        rb_raise(mkr_eError, "the document has no arena");
    }

    mkr_xml_node_t *fragment_node = mkr_xml_fragment_into(doc, rb_source, 1);
    return mkr_wrap_xml_node(fragment_node, self);
}

#import_node(*args) ⇒ Object

Makiri::XML::Document#import_node(node, deep = false) - the DOM importNode for an XML document. A same-representation (XML) node is deep/shallow-copied into this document’s arena (namespaces re-resolved when it is later linked); an HTML node is TRANSLATED across representations (lxb -> mkr) by ruby_cross_import.c. The result is detached and owned by this document; the source is untouched. Fails closed (no partial node returned).



1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
# File 'ext/makiri/glue/ruby_xml_node.c', line 1332

/*
 * Document#import_node(node, deep = false)
 *
 * Copies +node+ into this document and returns the wrapped copy.
 *   - An XML node goes through mkr_xml_copy_node().
 *   - An HTML node goes through mkr_cross_html_to_xml().
 *   - Any other node kind raises TypeError.
 * +deep+ is taken by Ruby truthiness (an omitted argument is nil, i.e.
 * shallow). The status of either copy routine is passed to
 * mkr_xml_mut_check().
 *
 * Raises Makiri's base error when this document has no native XML document
 * attached, matching the guard in #fragment, rather than handing a NULL
 * document to the copy routines.
 */
static VALUE
mkr_xml_doc_import_node(int argc, VALUE *argv, VALUE self)
{
    VALUE node_v, deep_v;
    rb_scan_args(argc, argv, "11", &node_v, &deep_v);
    int deep = RTEST(deep_v);

    mkr_xml_doc_t *xdoc = mkr_parsed_xml_doc(mkr_doc_parsed(self));
    if (xdoc == NULL) {
        /* Same condition and message as mkr_xml_doc_fragment. */
        rb_raise(mkr_eError, "the document has no arena");
    }
    mkr_xml_node_t *copy = NULL;

    switch (mkr_node_kind(node_v)) {
    case MKR_NODE_KIND_XML:
        mkr_xml_mut_check(mkr_xml_copy_node(xdoc, mkr_xml_node_unwrap(node_v), deep, &copy));
        break;
    case MKR_NODE_KIND_HTML:
        mkr_xml_mut_check(mkr_cross_html_to_xml(xdoc, mkr_html_node_unwrap(node_v), deep, &copy));
        break;
    default:
        /* rb_raise does not return, so +copy+ is never wrapped while NULL
         * on this path. */
        rb_raise(rb_eTypeError, "import_node expects a Makiri node");
    }
    return mkr_wrap_xml_node(copy, self);
}

#internal_subset ⇒ Object

The document’s DOCTYPE as a Makiri::XML::DocumentType (aliased Makiri::XML::DTD), or nil if the document had no `<!DOCTYPE …>`. Mirrors Nokogiri’s Document#internal_subset. The DTD’s name and external/system identifiers are read; the DTD body is NOT parsed (no entity/element declarations are loaded - &name; stays an undefined-entity error and no external subset is fetched). The doctype node is kept off the tree, so XPath never sees it (XPath 1.0 has no doctype node type).



485
486
487
488
489
490
491
492
# File 'ext/makiri/glue/ruby_xml.c', line 485

/*
 * Document#internal_subset
 *
 * Returns the document's doctype node wrapped against this document, or
 * nil when there is no native document or it carries no doctype.
 */
static VALUE
mkr_xml_doc_internal_subset(VALUE self)
{
    mkr_xml_doc_t *doc = mkr_parsed_xml_doc(mkr_doc_parsed(self));

    if (doc == NULL) {
        return Qnil;
    }
    if (doc->doctype == NULL) {
        return Qnil;
    }
    return mkr_wrap_xml_node(doc->doctype, self);
}

#root ⇒ Object

The document’s root element.



471
472
473
474
475
476
# File 'ext/makiri/glue/ruby_xml.c', line 471

/*
 * Document#root
 *
 * Returns nil when no native document is attached; otherwise returns the
 * document's root pointer run through mkr_wrap_xml_node().
 */
static VALUE
mkr_xml_doc_root(VALUE self)
{
    mkr_xml_doc_t *doc = mkr_parsed_xml_doc(mkr_doc_parsed(self));
    if (doc == NULL) {
        return Qnil;
    }
    return mkr_wrap_xml_node(doc->root, self);
}

#root=(node) ⇒ Makiri::XML::Element

Set (or replace) the document’s root element: with an existing root it replaces that root, otherwise it appends one (subject to the single-root rule). Pure composition over Node#replace / Node#add_child; Nokogiri-compatible. XML only - an HTML5 document has a fixed html/head/body structure, so a free-form root is not meaningful there.

Parameters:

Returns:



18
19
20
21
# File 'lib/makiri/xml/document.rb', line 18

# Make +node+ the document's root element: replace the current root when
# there is one, otherwise add +node+ as a child of the document.
def root=(node)
  current_root = root
  if current_root
    current_root.replace(node)
  else
    add_child(node)
  end
end

#xpath(*args) ⇒ Object

xpath / at_xpath work on the document and on any XML node (rooted at that node), so they live on the shared XML node behavior module + the document.



290
291
292
293
294
295
296
# File 'ext/makiri/glue/ruby_xml.c', line 290

/*
 * Document#xpath(expr [, ns])
 *
 * Takes one required argument and one optional argument and forwards them
 * to mkr_xml_doc_xpath_run() with a trailing flag of 0 (the #at_xpath entry
 * passes 1 for the same flag).
 */
static VALUE
mkr_xml_doc_xpath(int argc, VALUE *argv, VALUE self)
{
    VALUE xpath_expr;
    VALUE ns_arg;

    /* "11": exactly one mandatory argument followed by one optional one. */
    rb_scan_args(argc, argv, "11", &xpath_expr, &ns_arg);

    VALUE outcome = mkr_xml_doc_xpath_run(self, xpath_expr, ns_arg, 0);
    return outcome;
}