Building a Simple XML Parser in C: A Step-by-Step Guide

Introduction to XML Parsing

XML (Extensible Markup Language) is a widely used format for structured data representation. Unlike JSON, XML requires explicit opening/closing tags and supports attributes, namespaces, and complex hierarchies. Building an XML parser in C provides deep insights into text processing, memory management, and tree data structures.

This guide will create a non-validating parser that:

Reads XML text
Builds a Document Object Model (DOM) tree
Handles basic syntax errors
Supports elements, attributes, and text content

Key Components of an XML Parser

1. Lexical Analysis (Tokenizer)

Identifies XML tokens:

< (tag start)
> (tag end)
</ (closing tag)
= (attribute assignment)
" (attribute values)

2. DOM Structure

typedef struct XMLAttribute {
    char* name;
    char* value;
    struct XMLAttribute* next;
} XMLAttribute;
 
typedef struct XMLNode {
    char* name;
    char* content;
    XMLAttribute* attributes;
    struct XMLNode* parent;
    struct XMLNode* children;
    struct XMLNode* next;
} XMLNode;

3. Parsing Logic

Tag nesting validation
Attribute parsing
Text content extraction
Error recovery

Implementation Details

Step 1: Basic File Reading

char* read_xml_file(const char* filename) {
    FILE* file = fopen(filename, "rb");
    if (!file) return NULL;
 
    fseek(file, 0, SEEK_END);
    long length = ftell(file);
    fseek(file, 0, SEEK_SET);
 
    char* buffer = malloc(length + 1);
    fread(buffer, 1, length, file);
    buffer[length] = '\0';
    fclose(file);
    return buffer;
}

Step 2: Tokenization State Machine

typedef enum {
    STATE_START,
    STATE_IN_TAG,
    STATE_IN_ATTRIBUTE,
    STATE_IN_CONTENT,
    STATE_IN_COMMENT,
    STATE_ERROR
} ParserState;

Step 3: DOM Tree Construction

XMLNode* create_node(const char* name) {
    XMLNode* node = malloc(sizeof(XMLNode));
    node->name = strdup(name);
    node->content = NULL;
    node->attributes = NULL;
    node->parent = node->children = node->next = NULL;
    return node;
}
 
void add_attribute(XMLNode* node, const char* name, const char* value) {
    XMLAttribute* attr = malloc(sizeof(XMLAttribute));
    attr->name = strdup(name);
    attr->value = strdup(value);
    attr->next = node->attributes;
    node->attributes = attr;
}

Step 4: Core Parsing Algorithm

XMLNode* parse_xml(const char* xml) {
    XMLNode* root = NULL;
    XMLNode* current = NULL;
    ParserState state = STATE_START;
    char* cursor = (char*)xml;
 
    while (*cursor) {
        switch (state) {
            case STATE_START:
                if (*cursor == '<') {
                    if (*(cursor+1) == '?') {
                        // Skip XML declaration
                        cursor = strchr(cursor, '>') + 1;
                    } else {
                        state = STATE_IN_TAG;
                        current = create_node("");
                    }
                }
                cursor++;
                break;
 
            case STATE_IN_TAG: {
                char* tag_end = strpbrk(cursor, " >");
                if (!tag_end) { /* Error handling */ }
 
                *tag_end = '\0';
                current->name = strdup(cursor);
                cursor = tag_end + 1;
 
                // Attribute parsing
                while (*cursor != '>') {
                    char* eq = strchr(cursor, '=');
                    char* val_start = strchr(cursor, '"') + 1;
                    char* val_end = strchr(val_start, '"');
 
                    *eq = *val_start = *val_end = '\0';
                    add_attribute(current, cursor, val_start+1);
                    cursor = val_end + 1;
                }
 
                state = STATE_IN_CONTENT;
                break;
            }
 
            // Additional states handled similarly
        }
    }
    return root;
}

Error Handling Strategies

Mismatched Tags

void validate_tag_hierarchy(XMLNode* node) {
    if (!node) return;
    XMLNode* child = node->children;
    while (child) {
        if (strcmp(child->name, node->name) == 0) {
            fprintf(stderr, "Error: Nested <%s> tags\n", node->name);
        }
        validate_tag_hierarchy(child);
        child = child->next;
    }
}

Unclosed Tags Track open tags using a stack:

typedef struct TagStack {
    char* name;
    struct TagStack* next;
} TagStack;
 
void push_tag(TagStack** stack, const char* name) {
    TagStack* new = malloc(sizeof(TagStack));
    new->name = strdup(name);
    new->next = *stack;
    *stack = new;
}
 
void pop_tag(TagStack** stack) {
    TagStack* top = *stack;
    *stack = top->next;
    free(top->name);
    free(top);
}

Memory Management

void free_xml(XMLNode* node) {
    if (!node) return;
    XMLAttribute* attr = node->attributes;
    while (attr) {
        XMLAttribute* next = attr->next;
        free(attr->name);
        free(attr->value);
        free(attr);
        attr = next;
    }
 
    XMLNode* child = node->children;
    while (child) {
        XMLNode* next = child->next;
        free_xml(child);
        child = next;
    }
 
    free(node->name);
    free(node->content);
    free(node);
}

Example Usage

int main() {
    const char* xml =
        "<bookstore>"
        "  <book category=\"fiction\">"
        "    <title>The Hobbit</title>"
        "    <author>J.R.R. Tolkien</author>"
        "  </book>"
        "</bookstore>";
 
    XMLNode* root = parse_xml(xml);
    print_xml(root, 0); // Implement pretty-printing
    free_xml(root);
    return 0;
}

Optimization Techniques

Buffer Pooling Reuse memory blocks for frequently allocated structures
String Interning Store common strings (like "id", "class") once
Predictive Parsing Precompute tag probabilities for faster lookups
SIMD Processing Use AVX instructions for fast tag detection:

__m256i tag_start = _mm256_set1_epi8('<');
while (remaining >= 32) {
    __m256i chunk = _mm256_loadu_si256((__m256i*)cursor);
    __m256i cmp = _mm256_cmpeq_epi8(chunk, tag_start);
    unsigned mask = _mm256_movemask_epi8(cmp);
    // Process matches
}

Security Considerations

Entity Expansion Limits Prevent billion laughs attacks:

#define MAX_ENTITY_EXPANSION 64
size_t expansion_depth = 0;
void parse_entity(...) {
    if (++expansion_depth > MAX_ENTITY_EXPANSION) {
        fatal_error("Entity expansion limit exceeded");
    }
}

Buffer Overflow Protection Always use bounded string operations:

strncpy(dest, src, dest_size - 1);
dest[dest_size - 1] = '\0';

Malformed UTF-8 Handling Validate byte sequences:

int is_valid_utf8(const char* s) {
    while (*s) {
        if ((*s & 0x80) == 0) s++;
        else if ((*s & 0xE0) == 0xC0) { /* 2-byte */ }
        // Additional checks
    }
    return 1;
}

Testing Methodology

Unit Tests

void test_self_closing_tag() {
    XMLNode* node = parse_xml("<br>");
    assert(node != NULL);
    assert(node->children == NULL);
    free_xml(node);
}

Fuzz Testing Use American Fuzzy Lop (AFL) with corpus:

afl-gcc -o xml_parser parser.c
afl-fuzz -i testcases -o findings ./xml_parser @@

Conformance Testing Validate against W3C XML Test Suite

Advanced Features (Optional)

Namespace Support

typedef struct XMLNamespace {
    char* prefix;
    char* uri;
    struct XMLNamespace* next;
} XMLNamespace;

XPath Query Support Implement subset of XPath 1.0:

XMLNode* xpath_query(XMLNode* root, const char* query) {
    if (strncmp(query, "//", 2) == 0) {
        // Depth-first search
    }
}

Schema Validation Check against DTD or XSD

Conclusion

Building an XML parser in C teaches fundamental CS concepts while creating a practical tool. Key takeaways:

State machines are essential for text parsing
Careful memory management prevents leaks
Defense against malformed input is critical
Modular design enables future extensions

For production use, consider established libraries like libxml2, but understanding the underlying mechanics remains invaluable.

XML Parsing in C