Building a Simple XML Parser in C: A Step-by-Step Guide
Introduction to XML Parsing
XML (Extensible Markup Language) is a widely used format for structured data representation. Unlike JSON, XML requires explicit opening/closing tags and supports attributes, namespaces, and complex hierarchies. Building an XML parser in C provides deep insights into text processing, memory management, and tree data structures.
This guide will create a non-validating parser that:
- Reads XML text
- Builds a Document Object Model (DOM) tree
- Handles basic syntax errors
- Supports elements, attributes, and text content
Key Components of an XML Parser
1. Lexical Analysis (Tokenizer)
Identifies XML tokens:
<(tag start)>(tag end)</(closing tag)=(attribute assignment)"(attribute values)
2. DOM Structure
typedef struct XMLAttribute {
char* name;
char* value;
struct XMLAttribute* next;
} XMLAttribute;
typedef struct XMLNode {
char* name;
char* content;
XMLAttribute* attributes;
struct XMLNode* parent;
struct XMLNode* children;
struct XMLNode* next;
} XMLNode;3. Parsing Logic
- Tag nesting validation
- Attribute parsing
- Text content extraction
- Error recovery
Implementation Details
Step 1: Basic File Reading
char* read_xml_file(const char* filename) {
FILE* file = fopen(filename, "rb");
if (!file) return NULL;
fseek(file, 0, SEEK_END);
long length = ftell(file);
fseek(file, 0, SEEK_SET);
char* buffer = malloc(length + 1);
fread(buffer, 1, length, file);
buffer[length] = '\0';
fclose(file);
return buffer;
}Step 2: Tokenization State Machine
typedef enum {
STATE_START,
STATE_IN_TAG,
STATE_IN_ATTRIBUTE,
STATE_IN_CONTENT,
STATE_IN_COMMENT,
STATE_ERROR
} ParserState;Step 3: DOM Tree Construction
XMLNode* create_node(const char* name) {
XMLNode* node = malloc(sizeof(XMLNode));
node->name = strdup(name);
node->content = NULL;
node->attributes = NULL;
node->parent = node->children = node->next = NULL;
return node;
}
void add_attribute(XMLNode* node, const char* name, const char* value) {
XMLAttribute* attr = malloc(sizeof(XMLAttribute));
attr->name = strdup(name);
attr->value = strdup(value);
attr->next = node->attributes;
node->attributes = attr;
}Step 4: Core Parsing Algorithm
XMLNode* parse_xml(const char* xml) {
XMLNode* root = NULL;
XMLNode* current = NULL;
ParserState state = STATE_START;
char* cursor = (char*)xml;
while (*cursor) {
switch (state) {
case STATE_START:
if (*cursor == '<') {
if (*(cursor+1) == '?') {
// Skip XML declaration
cursor = strchr(cursor, '>') + 1;
} else {
state = STATE_IN_TAG;
current = create_node("");
}
}
cursor++;
break;
case STATE_IN_TAG: {
char* tag_end = strpbrk(cursor, " >");
if (!tag_end) { /* Error handling */ }
*tag_end = '\0';
current->name = strdup(cursor);
cursor = tag_end + 1;
// Attribute parsing
while (*cursor != '>') {
char* eq = strchr(cursor, '=');
char* val_start = strchr(cursor, '"') + 1;
char* val_end = strchr(val_start, '"');
*eq = *val_start = *val_end = '\0';
add_attribute(current, cursor, val_start+1);
cursor = val_end + 1;
}
state = STATE_IN_CONTENT;
break;
}
// Additional states handled similarly
}
}
return root;
}Error Handling Strategies
- Mismatched Tags
void validate_tag_hierarchy(XMLNode* node) {
if (!node) return;
XMLNode* child = node->children;
while (child) {
if (strcmp(child->name, node->name) == 0) {
fprintf(stderr, "Error: Nested <%s> tags\n", node->name);
}
validate_tag_hierarchy(child);
child = child->next;
}
}- Unclosed Tags Track open tags using a stack:
typedef struct TagStack {
char* name;
struct TagStack* next;
} TagStack;
void push_tag(TagStack** stack, const char* name) {
TagStack* new = malloc(sizeof(TagStack));
new->name = strdup(name);
new->next = *stack;
*stack = new;
}
void pop_tag(TagStack** stack) {
TagStack* top = *stack;
*stack = top->next;
free(top->name);
free(top);
}Memory Management
void free_xml(XMLNode* node) {
if (!node) return;
XMLAttribute* attr = node->attributes;
while (attr) {
XMLAttribute* next = attr->next;
free(attr->name);
free(attr->value);
free(attr);
attr = next;
}
XMLNode* child = node->children;
while (child) {
XMLNode* next = child->next;
free_xml(child);
child = next;
}
free(node->name);
free(node->content);
free(node);
}Example Usage
int main() {
const char* xml =
"<bookstore>"
" <book category=\"fiction\">"
" <title>The Hobbit</title>"
" <author>J.R.R. Tolkien</author>"
" </book>"
"</bookstore>";
XMLNode* root = parse_xml(xml);
print_xml(root, 0); // Implement pretty-printing
free_xml(root);
return 0;
}Optimization Techniques
- Buffer Pooling Reuse memory blocks for frequently allocated structures
- String Interning Store common strings (like "id", "class") once
- Predictive Parsing Precompute tag probabilities for faster lookups
- SIMD Processing Use AVX instructions for fast tag detection:
__m256i tag_start = _mm256_set1_epi8('<');
while (remaining >= 32) {
__m256i chunk = _mm256_loadu_si256((__m256i*)cursor);
__m256i cmp = _mm256_cmpeq_epi8(chunk, tag_start);
unsigned mask = _mm256_movemask_epi8(cmp);
// Process matches
}Security Considerations
- Entity Expansion Limits Prevent billion laughs attacks:
#define MAX_ENTITY_EXPANSION 64
size_t expansion_depth = 0;
void parse_entity(...) {
if (++expansion_depth > MAX_ENTITY_EXPANSION) {
fatal_error("Entity expansion limit exceeded");
}
}- Buffer Overflow Protection Always use bounded string operations:
strncpy(dest, src, dest_size - 1);
dest[dest_size - 1] = '\0';- Malformed UTF-8 Handling Validate byte sequences:
int is_valid_utf8(const char* s) {
while (*s) {
if ((*s & 0x80) == 0) s++;
else if ((*s & 0xE0) == 0xC0) { /* 2-byte */ }
// Additional checks
}
return 1;
}Testing Methodology
- Unit Tests
void test_self_closing_tag() {
XMLNode* node = parse_xml("<br>");
assert(node != NULL);
assert(node->children == NULL);
free_xml(node);
}- Fuzz Testing Use American Fuzzy Lop (AFL) with corpus:
afl-gcc -o xml_parser parser.c
afl-fuzz -i testcases -o findings ./xml_parser @@- Conformance Testing Validate against W3C XML Test Suite
Advanced Features (Optional)
- Namespace Support
typedef struct XMLNamespace {
char* prefix;
char* uri;
struct XMLNamespace* next;
} XMLNamespace;- XPath Query Support Implement subset of XPath 1.0:
XMLNode* xpath_query(XMLNode* root, const char* query) {
if (strncmp(query, "//", 2) == 0) {
// Depth-first search
}
}- Schema Validation Check against DTD or XSD
Conclusion
Building an XML parser in C teaches fundamental CS concepts while creating a practical tool. Key takeaways:
- State machines are essential for text parsing
- Careful memory management prevents leaks
- Defense against malformed input is critical
- Modular design enables future extensions
For production use, consider established libraries like libxml2, but understanding the underlying mechanics remains invaluable.