8.8.5. PUGIXML-05 — LINQ over XML

This tutorial covers from_xml_node — a typed, lazy iterator that walks the child elements of an XML node and materializes each into a struct, so ordinary comprehensions and daslib/linq_boost queries run straight over an XML document. It lives in pugixml/PUGIXML_boost.

8.8.5.1. Typed rows from attributes

from_xml_node(root, type<Row>) walks every child element of root and fills a Row by reading same-named attributes. Each yielded value is fully materialized — no XML node escapes into the loop body. A field whose attribute is absent keeps the struct’s declared default:

require pugixml/PUGIXML_boost

// Row type for from_xml_node: each field is filled from the same-named
// attribute of a child element.
struct Car {
    id : int
    make : string
    price : float
    year : int = 2000        // declared default, kept when the attribute is absent
    in_stock : bool
}

// Sample document used by the examples: four <car> elements under <cars>.
// Car #3 omits the year attribute, so its year field keeps the struct default.
let CATALOG = ("<cars>" +
    "<car id=\"1\" make=\"Audi\"   price=\"45000.0\" year=\"2021\" in_stock=\"true\"/>" +
    "<car id=\"2\" make=\"Toyota\" price=\"28000.0\" year=\"2020\" in_stock=\"true\"/>" +
    "<car id=\"3\" make=\"Ford\"   price=\"31000.0\" in_stock=\"false\"/>" +
    "<car id=\"4\" make=\"Kia\"    price=\"22000.0\" year=\"2022\" in_stock=\"true\"/>" +
    "</cars>")

// Walk every child of the document element as a typed Car row and print it.
parse_xml(CATALOG) $(doc, ok) {
    for (car in from_xml_node(doc.document_element, type<Car>)) {
        print("#{car.id} {car.make} ({car.year})\n")
    }
}
// Output:
//   #1 Audi (2021)
//   #2 Toyota (2020)
//   #3 Ford (2000)   <- no year attribute, so the struct default is kept
//   #4 Kia (2022)

Supported field types are the XML scalar attribute types: int, uint, float, double, bool, string. Fields of any other type keep their default (mapping from child elements or text content is a planned extension).

8.8.5.2. LINQ comprehensions

from_xml_node returns an ordinary iterable, so comprehensions can filter and project its rows directly:

// Collect the makes of in-stock cars priced under 40000 with a comprehension.
parse_xml(CATALOG) $(doc, ok) {
    let cars_root = doc.document_element
    let affordable <- [for (c in from_xml_node(cars_root, type<Car>));
        c.make;
        where c.price < 40000.0 && c.in_stock]
    print("{affordable}\n")
    // prints: [ Toyota, Kia]
}

8.8.5.3. Reverse iteration

each_child_reverse walks children in reverse document order via last_child / previous_sibling (both O(1) in pugixml). The fused _fold lane uses the same backward walk for reverse |> take(N) and for last() without a predicate: it visits only the elements it keeps — the last N, or just the final one for last() — instead of scanning the whole child list forward:

// Reverse iteration: first over raw nodes, then over typed rows through _fold.
parse_xml(CATALOG) $(doc, ok) {
    let root = doc.document_element
    // raw reverse walk — last car first
    for (ch in each_child_reverse(root, "car")) {
        print(" {ch["id"] as int}")
    }
    print("\n")
    // prints: 4 3 2 1 (each id is preceded by a space)
    // from_xml_node is consumed outside a for loop below, which requires unsafe
    unsafe {
        // the last two cars, reversed, as typed rows — the macro walks
        // backward and stops after 2, never touching cars #1 / #2
        let last_two <- _fold(from_xml_node(doc.document_element, type<Car>).reverse().take(2).to_array())
        print("{[for (c in last_two); c.make]}\n")
        // prints: [ Kia, Ford]
    }
}

A predicated reverse |> where |> last deliberately stays on the forward walk: walking the DOM backward is cache-hostile (roughly 2× the per-node cost), so when the match lies far from the end a backward walk would cost more than a forward scan.

8.8.5.4. Tag-filtered walk

from_xml_node(root, "tag", type<Row>) walks only children with that tag — useful when a parent interleaves several element kinds:

// A parent that interleaves <car> elements with an unrelated <note>.
// The cars carry no year or in_stock attributes, so those fields keep
// their defaults.
let mixed = ("<fleet>" +
    "<car id=\"10\" make=\"Mazda\" price=\"26000.0\"/>" +
    "<note>service due</note>" +
    "<car id=\"11\" make=\"Honda\" price=\"24000.0\"/>" +
    "</fleet>")
parse_xml(mixed) $(doc, ok) {
    let root = doc.document_element
    // the "car" tag argument restricts the walk to <car> children
    let ids <- [for (car in from_xml_node(root, "car", type<Car>)); car.id]
    print("{ids}\n")
    // [ 10, 11]   <- <note> skipped
}

8.8.5.5. Results outlive the document

Rows are owned values — string fields are cloned out of the document — so collecting them with to_array and using them after the RAII block closes is safe. Because from_xml_node is an [unsafe_outside_of_for] iterator, consuming it outside a for loop needs an unsafe block:

// Declared outside the parse_xml block so the rows can be used after it ends.
var inventory : array<Car>
parse_xml(CATALOG) $(doc, ok) {
    let root = doc.document_element
    // to_array consumes the iterator outside a for loop, hence the unsafe block
    unsafe {
        inventory <- from_xml_node(root, type<Car>) |> to_array()
    }
}
// doc is freed here — inventory and its strings remain valid
print("{length(inventory)} cars; first = {inventory[0].make}\n")
// prints: 4 cars; first = Audi