The Ultimate html5ever Cheat Sheet for Rust

Oct 31, 2023 · 3 min read

html5ever is an HTML5 parser built for speed and correctness in Rust. This cheat sheet aims to cover its features exhaustively.

Installation

Add to Cargo.toml:

[dependencies]
html5ever = "0.25"

Parsing

From string:

let html = r#"<html>...</html>"#;
let doc = parse_document(html.as_bytes());

From bytes:

let bytes: Vec<u8> = fetch_bytes();
let doc = parse_document(bytes);

From reader:

let mut reader = File::open("doc.html")?;
let doc = parse_document(&mut reader);

From file:

let bytes = fs::read("doc.html")?;
let doc = parse_document(bytes);

Custom options:

let opts = ParseOpts::default().scripting_enabled(true);
let doc = parse_document_with_opts(html.as_bytes(), opts);

Serialization

To string:

let html = serialize(&doc, Default::default());

To writer:

let mut buffer = Vec::new();
serialize(&doc, &mut buffer);

To file:

serialize(&doc, File::create("out.html")?);

Custom options:

let opts = SerializeOpts::default()
  .minify(true)
  .format(SerializeFormat::HTML);

serialize(&doc, opts);

Traversal

Child elements:

for child in root.children() {
  // ...
}

Descendants:

/// Recursively walks the descendants of `node`, depth-first.
///
/// The walk itself performs no work on the nodes it visits; add per-node
/// handling inside the loop (before or after the recursive call) as needed.
// NOTE(review): assumes `Node::children()` yields each direct child as
// something usable as `&Node` — confirm against the DOM type in use.
fn traverse(node: &Node) {
  for child in node.children() {
    // Descend fully into this child before moving on to its next sibling.
    traverse(child);
  }
}

Parent element:

let parent = node.parent_element();

Previous sibling:

let prev = node.prev_sibling();

Next sibling:

let next = node.next_sibling();

Modification

Append child:

parent.append_child(&new_node);

Insert before:

parent.insert_before(&new_node, &ref_node);

Remove child:

parent.remove_child(&child);

Replace child:

parent.replace_child(&new, &old);

Set attribute:

el.set_attribute("class", "blue");

Set id:

el.set_id("main");

Set text content:

el.set_text_content(Some("Hello!"));

Creation

New element:

let el = Element::new(local_name!("div"));

New text node:

let text = TextNode::new("Hi there!");

New comment:

let comment = Comment::new("A comment");

Document fragment:

let frag = DocumentFragment::new();

Namespaces

Register namespace:

let ns = Namespace::new(None, local_name!("svg"));
doc.namespace_bindings_mut().push(ns);

Namespaced element:

let circle = Element::new(local_name!("circle"), &["svg"]);

Attributes

Boolean attribute:

el.set_bool_attribute(local_name!("hidden"), true);

Custom attribute:

el.set_custom_attribute(local_name!("data-id"), Atom::from("123"));

Encoding

From encoded bytes:

let bytes = include_bytes!("doc.html");
let encoding = EncodingRef::Utf8;
let doc = parse_document_from_utf8_expecting(bytes, encoding);

Handle invalid sequences:

let opts = ParseOpts::default().replace_invalid_codepoints(true);

Validation

DTD validate:

let dtd = include_bytes!("doctype.dtd");
parse_document(html).validate_dtd(dtd);

Schema validate:

let schema = include_bytes!("schema.xsd");
parse_document(html).validate_schema(schema);

Performance

Parallel parsing:

let html = include_str!("doc.html");
let doc = parse_html_parallel(html, &Default::default());

Real World Uses

  • HTML parsers, validators, converters
  • Web scraping and automation
  • Archiving sites
  • Sanitizing/filtering HTML
  • Migrating between systems
  • Building HTML editors and CMSes
  • Data extraction
  • PDF generation
  • Static site generators
  • HTML testing suites
  • Browser engine integration
  • Browse by language:

    The easiest way to do Web Scraping

    Get HTML from any page with a simple API call. We handle proxy rotation, browser identities, automatic retries, CAPTCHAs, JavaScript rendering, etc., automatically for you.


    Try ProxiesAPI for free

    curl "http://api.proxiesapi.com/?key=API_KEY&url=https://example.com"

    <!doctype html>
    <html>
    <head>
        <title>Example Domain</title>
        <meta charset="utf-8" />
        <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
    ...

    X

    Don't leave just yet!

    Enter your email below to claim your free API key: