Download full source

This is a backtracking recursive-descent C parser that I wrote for a project I never finished. It supports a good subset of the C language and produces a concrete syntax tree. This means that if you walk the parser's output in depth-first order and print the leaves, you get back the original input (including whitespace).

For example:

int main(int argc, char *argv[])
{
   int a = 6;
   int b = a + 5;
   printf("hello world\n");
   return 0;
} 

Becomes:

program
 external_declaration
   function_definition
     declaration_specifiers
       type_specifier
         "int"
     whitespace
       " "
     declarator
       direct_declarator
         identifier
           "main"
         "("
         parameter_type_list
           parameter_list
             parameter_declaration
               declaration_specifiers
                 type_specifier
                   "int"
               whitespace
                 " "
               declarator
                 direct_declarator
                   identifier
                     "argc"
             ","
             whitespace
               " "
             parameter_declaration
               declaration_specifiers
                 type_specifier
                   "char"
               whitespace
                 " "
               declarator
                 pointer
                   "*"
                 direct_declarator
                   identifier
                     "argv"
                   "["
                   "]"
         ")"
     whitespace
       "
"
     compound_statement
       "{"
       whitespace
         "
       "
       declaration
         declaration_specifiers
           type_specifier
             "int"
         whitespace
           " "
         init_declarator_list
           declarator
             direct_declarator
               identifier
                 "a"
               whitespace
                 " "
             "="
             whitespace
               " "
             initializer
               constant
                 "6"
         ";"
       whitespace
         "
       "
       declaration
         declaration_specifiers
           type_specifier
             "int"
         whitespace
           " "
         init_declarator_list
           declarator
             direct_declarator
               identifier
                 "b"
               whitespace
                 " "
             "="
             whitespace
               " "
             initializer
               additive_expression
                 identifier
                   "a"
                 whitespace
                   " "
                 "+"
                 whitespace
                   " "
                 constant
                   "5"
         ";"
       whitespace
         "
       "
       statement
         expression_statement
           expression
             postfix_expression
               identifier
                 "printf"
               "("
               expression
                 string_literal
                   ""hello world\n""
               ")"
           ";"
       whitespace
         "
       "
       statement
         jump_statement
           "return"
           whitespace
             " "
           expression
             constant
               "0"
           ";"
       whitespace
         "
"
       "}"
 whitespace
   "
" 

The parser is written in C++ and is extremely simple to integrate. The tree is represented by a minimalist structure:

// One node of the concrete syntax tree produced by the parser.
class ST {
public:
	// Text of this node. Presumably the rule name (e.g. "program") for an
	// interior node and the matched source text for a leaf -- confirm
	// against the full source.
	std::string s;
	// Child nodes, in order. NOTE(review): the element type was missing
	// from this listing ("std::vector ch;" does not compile); ST * is
	// assumed here to match `parent` -- confirm against the full source.
	std::vector<ST *> ch;
	// Enclosing node. Not initialized by this class; whoever builds the
	// tree must set it.
	ST *parent;
};

I hope you find it useful.


QuantumG
<< back to my home page