04-05-2025

XML Parsing in C

Building a Simple XML Parser in C: A Step-by-Step Guide

Introduction to XML Parsing

XML (Extensible Markup Language) is a widely used format for structured data representation. Unlike JSON, XML requires explicit opening/closing tags and supports attributes, namespaces, and complex hierarchies. Building an XML parser in C provides deep insights into text processing, memory management, and tree data structures.

This guide will create a non-validating parser that:

  1. Reads XML text
  2. Builds a Document Object Model (DOM) tree
  3. Handles basic syntax errors
  4. Supports elements, attributes, and text content

Key Components of an XML Parser

1. Lexical Analysis (Tokenizer)

Identifies XML tokens:

  • < (tag start)
  • > (tag end)
  • </ (closing tag)
  • = (attribute assignment)
  • " (attribute values)

2. DOM Structure

typedef struct XMLAttribute {
    char* name;
    char* value;
    struct XMLAttribute* next;
} XMLAttribute;
 
typedef struct XMLNode {
    char* name;
    char* content;
    XMLAttribute* attributes;
    struct XMLNode* parent;
    struct XMLNode* children;
    struct XMLNode* next;
} XMLNode;

3. Parsing Logic

  • Tag nesting validation
  • Attribute parsing
  • Text content extraction
  • Error recovery

Implementation Details

Step 1: Basic File Reading

char* read_xml_file(const char* filename) {
    FILE* file = fopen(filename, "rb");
    if (!file) return NULL;
 
    fseek(file, 0, SEEK_END);
    long length = ftell(file);
    fseek(file, 0, SEEK_SET);
 
    char* buffer = malloc(length + 1);
    fread(buffer, 1, length, file);
    buffer[length] = '\0';
    fclose(file);
    return buffer;
}

Step 2: Tokenization State Machine

typedef enum {
    STATE_START,
    STATE_IN_TAG,
    STATE_IN_ATTRIBUTE,
    STATE_IN_CONTENT,
    STATE_IN_COMMENT,
    STATE_ERROR
} ParserState;

Step 3: DOM Tree Construction

XMLNode* create_node(const char* name) {
    XMLNode* node = malloc(sizeof(XMLNode));
    node->name = strdup(name);
    node->content = NULL;
    node->attributes = NULL;
    node->parent = node->children = node->next = NULL;
    return node;
}
 
void add_attribute(XMLNode* node, const char* name, const char* value) {
    XMLAttribute* attr = malloc(sizeof(XMLAttribute));
    attr->name = strdup(name);
    attr->value = strdup(value);
    attr->next = node->attributes;
    node->attributes = attr;
}

Step 4: Core Parsing Algorithm

XMLNode* parse_xml(const char* xml) {
    XMLNode* root = NULL;
    XMLNode* current = NULL;
    ParserState state = STATE_START;
    char* cursor = (char*)xml;
 
    while (*cursor) {
        switch (state) {
            case STATE_START:
                if (*cursor == '<') {
                    if (*(cursor+1) == '?') {
                        // Skip XML declaration
                        cursor = strchr(cursor, '>') + 1;
                    } else {
                        state = STATE_IN_TAG;
                        current = create_node("");
                    }
                }
                cursor++;
                break;
 
            case STATE_IN_TAG: {
                char* tag_end = strpbrk(cursor, " >");
                if (!tag_end) { /* Error handling */ }
 
                *tag_end = '\0';
                current->name = strdup(cursor);
                cursor = tag_end + 1;
 
                // Attribute parsing
                while (*cursor != '>') {
                    char* eq = strchr(cursor, '=');
                    char* val_start = strchr(cursor, '"') + 1;
                    char* val_end = strchr(val_start, '"');
 
                    *eq = *val_start = *val_end = '\0';
                    add_attribute(current, cursor, val_start+1);
                    cursor = val_end + 1;
                }
 
                state = STATE_IN_CONTENT;
                break;
            }
 
            // Additional states handled similarly
        }
    }
    return root;
}

Error Handling Strategies

  1. Mismatched Tags
void validate_tag_hierarchy(XMLNode* node) {
    if (!node) return;
    XMLNode* child = node->children;
    while (child) {
        if (strcmp(child->name, node->name) == 0) {
            fprintf(stderr, "Error: Nested <%s> tags\n", node->name);
        }
        validate_tag_hierarchy(child);
        child = child->next;
    }
}
  1. Unclosed Tags Track open tags using a stack:
typedef struct TagStack {
    char* name;
    struct TagStack* next;
} TagStack;
 
void push_tag(TagStack** stack, const char* name) {
    TagStack* new = malloc(sizeof(TagStack));
    new->name = strdup(name);
    new->next = *stack;
    *stack = new;
}
 
void pop_tag(TagStack** stack) {
    TagStack* top = *stack;
    *stack = top->next;
    free(top->name);
    free(top);
}

Memory Management

void free_xml(XMLNode* node) {
    if (!node) return;
    XMLAttribute* attr = node->attributes;
    while (attr) {
        XMLAttribute* next = attr->next;
        free(attr->name);
        free(attr->value);
        free(attr);
        attr = next;
    }
 
    XMLNode* child = node->children;
    while (child) {
        XMLNode* next = child->next;
        free_xml(child);
        child = next;
    }
 
    free(node->name);
    free(node->content);
    free(node);
}

Example Usage

int main() {
    const char* xml =
        "<bookstore>"
        "  <book category=\"fiction\">"
        "    <title>The Hobbit</title>"
        "    <author>J.R.R. Tolkien</author>"
        "  </book>"
        "</bookstore>";
 
    XMLNode* root = parse_xml(xml);
    print_xml(root, 0); // Implement pretty-printing
    free_xml(root);
    return 0;
}

Optimization Techniques

  1. Buffer Pooling Reuse memory blocks for frequently allocated structures
  2. String Interning Store common strings (like "id", "class") once
  3. Predictive Parsing Precompute tag probabilities for faster lookups
  4. SIMD Processing Use AVX instructions for fast tag detection:
__m256i tag_start = _mm256_set1_epi8('<');
while (remaining >= 32) {
    __m256i chunk = _mm256_loadu_si256((__m256i*)cursor);
    __m256i cmp = _mm256_cmpeq_epi8(chunk, tag_start);
    unsigned mask = _mm256_movemask_epi8(cmp);
    // Process matches
}

Security Considerations

  1. Entity Expansion Limits Prevent billion laughs attacks:
#define MAX_ENTITY_EXPANSION 64
size_t expansion_depth = 0;
void parse_entity(...) {
    if (++expansion_depth > MAX_ENTITY_EXPANSION) {
        fatal_error("Entity expansion limit exceeded");
    }
}
  1. Buffer Overflow Protection Always use bounded string operations:
strncpy(dest, src, dest_size - 1);
dest[dest_size - 1] = '\0';
  1. Malformed UTF-8 Handling Validate byte sequences:
int is_valid_utf8(const char* s) {
    while (*s) {
        if ((*s & 0x80) == 0) s++;
        else if ((*s & 0xE0) == 0xC0) { /* 2-byte */ }
        // Additional checks
    }
    return 1;
}

Testing Methodology

  1. Unit Tests
void test_self_closing_tag() {
    XMLNode* node = parse_xml("<br>");
    assert(node != NULL);
    assert(node->children == NULL);
    free_xml(node);
}
  1. Fuzz Testing Use American Fuzzy Lop (AFL) with corpus:
afl-gcc -o xml_parser parser.c
afl-fuzz -i testcases -o findings ./xml_parser @@
  1. Conformance Testing Validate against W3C XML Test Suite

Advanced Features (Optional)

  1. Namespace Support
typedef struct XMLNamespace {
    char* prefix;
    char* uri;
    struct XMLNamespace* next;
} XMLNamespace;
  1. XPath Query Support Implement subset of XPath 1.0:
XMLNode* xpath_query(XMLNode* root, const char* query) {
    if (strncmp(query, "//", 2) == 0) {
        // Depth-first search
    }
}
  1. Schema Validation Check against DTD or XSD

Conclusion

Building an XML parser in C teaches fundamental CS concepts while creating a practical tool. Key takeaways:

  1. State machines are essential for text parsing
  2. Careful memory management prevents leaks
  3. Defense against malformed input is critical
  4. Modular design enables future extensions

For production use, consider established libraries like libxml2, but understanding the underlying mechanics remains invaluable.