/* Parse ABNF to locate dangling references and undefined symbols
 *  does limited syntax checking
 *
 *  This is a quick&dirty hack donated to public domain by Chris Newman
 */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

#define ERROR       0
#define COMMENT     1
#define RULEDEF     2
#define AMENDRULE   3
#define HEXMODE     4
#define BINMODE     5
#define DECMODE     6
#define ALTERNATE   7
#define REPEAT      8
#define BEGINGROUP  9
#define ENDGROUP    10
#define BEGINOPTION 11
#define ENDOPTION   12
#define RANGE       13
#define LITLIST     14
#define NUMBER      15
#define HEXNUMBER   16
#define QUOTED      17
#define PROSE       18
#define RULENAME    19
#define NEWLINE     20

static const char *token_name[] = {
    "error", "comment", "ruledef", "amendrule", "hexmode", "binmode",
    "decmode", "alternate", "repeat", "begingroup", "endgroup",
    "beginoption", "endoption", "range", "litlist", "number",
    "hexnumber", "quoted", "prose", "rulename", "newline"
};

/* simple rulename datastructure
 */
struct rulename {
    struct rulename *next;
    int defined, referenced;
    int len;
    char data[1];
};

#define HTSIZE 1023
static struct rulename *htable[HTSIZE];
static unsigned char hex_table[256];
const static unsigned char hexchars[] = "0123456789ABCDEF";

static const char *baserules[] = {
    "ALPHA", "BIT", "CHAR", "CR", "CRLF", "CTL", "DIGIT",
    "DQUOTE", "HEXDIG", "HTAB", "LF", "LWSP", "OCTET", "SP",
    "VCHAR", "WSP", NULL
};

const unsigned char cvt_to_lowercase[256] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    0x40,  'a',  'b',  'c',  'd',  'e',  'f',  'g',
     'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
     'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
     'x',  'y',  'z', 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    0x60,  'a',  'b',  'c',  'd',  'e',  'f',  'g',
     'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
     'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
     'x',  'y',  'z', 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
#define TOLOWER(c) (cvt_to_lowercase[(unsigned char)(c)])

/* returns pointer to end of token, NULL if no tokens left in line
 */
char *gettoken(char *ptr, int *tokentype, int hexmode)
{
    *tokentype = ERROR;

    switch (*ptr++) {
      case '\0':
	return (NULL);

      case '\r':
	if (*ptr == '\n') ++ptr;
      case '\n':
	*tokentype = NEWLINE;
	break;
	
      case ';':
	*tokentype = COMMENT;
	while (*ptr != '\0' && *ptr != '\n' && *ptr != '\r')
	    ++ptr;
	break;

      case '/':
	*tokentype = ALTERNATE;
	break;

      case '*':
	*tokentype = REPEAT;
	break;

      case '(':
	*tokentype = BEGINGROUP;
	break;

      case ')':
	*tokentype = ENDGROUP;
	break;

      case '[':
	*tokentype = BEGINOPTION;
	break;

      case ']':
	*tokentype = ENDOPTION;
	break;

      case '-':
	*tokentype = RANGE;
	break;

      case '.':
	*tokentype = LITLIST;
	break;
	
      case '=':
	*tokentype = RULEDEF;
	if (*ptr == '/') {
	    *tokentype = AMENDRULE;
	    ++ptr;
	}
	break;

      case '%':
	switch (*ptr++) {
	  case 'x':
	  case 'X':
	    *tokentype = HEXMODE;
	    break;
	  case 'b':
	  case 'B':
	    *tokentype = BINMODE;
	    break;
	  case 'd':
	  case 'D':
	    *tokentype = DECMODE;
	    break;
	  default:
	    return ("Invalid literal");
	}
	break;

      case '"':
	while (*ptr && *ptr != '"') ++ptr;
	if (*ptr != '"') {
	    return ("Unbalanced double quotes");
	}
	++ptr;
	*tokentype = QUOTED;
	break;

      case '<':
	while (*ptr && *ptr != '>') ++ptr;
	if (*ptr != '>') {
	    return ("Unbalanced angle brakets");
	}
	++ptr;
	*tokentype = PROSE;
	break;

      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
	if (hexmode) goto GETHEX;
	while (isdigit(*ptr)) ++ptr;
	*tokentype = NUMBER;
	break;

      default:
	/* deal with hex digits */
	if (hexmode && isxdigit(ptr[-1])) {
	  GETHEX:
	    while (isxdigit(*ptr)) ++ptr;
	    *tokentype = HEXNUMBER;
	    break;
	}
	if (!isalpha(ptr[-1])) {
	    return ("Invalid character");
	}

	/* deal with rule names */
	while (isalnum(*ptr) || *ptr == '-') ++ptr;
	*tokentype = RULENAME;
	break;
    }

    return (ptr);
}

/* find a rule in a hash table
 */
struct rulename *hashfind(const char *ptr, int len)
{
    int i, hash;
    struct rulename **pptr;

    /* lookup in hash table */
    hash = len;
    for (i = 0; i < len; ++i) {
	hash = (hash * 13 + TOLOWER(ptr[i])) % HTSIZE;
    }
    pptr = &htable[hash];
    while (*pptr && ((*pptr)->len != len
		     || strncasecmp(ptr, (*pptr)->data, len) != 0)) {
	pptr = &(*pptr)->next;
    }

    /* create new entry, if needed */
    if (!*pptr) {
	*pptr = calloc(sizeof (struct rulename) + len, 1);
	if (!*pptr) {
	    fprintf(stderr, "Out of memory\n");
	    exit(1);
	}
	(*pptr)->len = len;
	memcpy((*pptr)->data, ptr, len);
    }

    return (*pptr);
}

main(int argc, char **argv)
{
    int web_cgi = 0, verbose = 0;
    const unsigned char *uscan;
    char *buf, *name, *src, *dst, *linestart;
    char *ptr, *nptr;
    const char **ruleptr;
    int hexmode, tokentype, line;
    int plevel = 0, blevel = 0;
    int bused, bsize, amount;
    struct rulename *lastrule;

    /* initialize hex table */
    memset(hex_table, 16, sizeof (hex_table));
    for (uscan = hexchars; *uscan; ++uscan) {
	hex_table[(unsigned) *uscan] = uscan - hexchars;
    }

    /* initialize default rules */
    for (ruleptr = baserules; *ruleptr != NULL; ++ruleptr) {
	lastrule = hashfind(*ruleptr, strlen(*ruleptr));
	lastrule->defined = lastrule->referenced = 1;
    }
    lastrule = NULL;

    /* get program name */
    name = strrchr(*argv, '/');
    name = name == NULL ? *argv : name + 1;
    if (strncmp("web", name, 3) == 0) web_cgi = 1;

    /* parse args */
    while (*++argv != NULL && **argv == '-') switch ((*argv)[1]) {
      case 'w':
	web_cgi = 1;
	break;

      case 'v':
	verbose = 1;
	break;

      case 'h':
	printf("%s [-h] [-w]\n", name);
	printf("  -h  display this help\n");
	printf("  -w  input is web CGI encoded\n");
	exit(0);

      default:
	fprintf(stderr, "Invalid arguments, type `%s -h' for usage\n", name);
	exit(1);
    }

    if (web_cgi) printf("Content-Type: text/plain\n\n");
    
    /* read the data into memory */
    bsize = 4096;
    buf = malloc(bsize);
    if (buf != NULL) {
	for (bused = 0;
	     (amount = fread(buf + bused, 1, bsize - bused, stdin)) != 0;
	     bused += amount) {
	    if (bused + amount == bsize) {
		if (bsize > 10*1024*1024) {
		    fprintf(stderr, "Size limit exceeded\n");
		    exit(1);
		}
		buf = realloc(buf, bsize *= 2);
		if (buf == NULL) break;
	    }
	}
    }
    if (buf == NULL) {
	fprintf(stderr, "Out of memory\n");
	exit(1);
    }
    if (!feof(stdin)) {
	perror("input");
	exit(1);
    }
    buf[bused] = '\0';
	
    /* remove web CGI grunge */
    if (web_cgi) {
	for (src = buf; *src != '\0' && !isspace(*src) &&
	     *src != ':' && *src != '='; ++src)
	    ;
	if (*src == '=') {
	    ++src;
	    for (dst = buf; *src != '&' && *src != '\0'; ++src, ++dst) {
		if (*src == '%') {
		    *dst = (hex_table[(unsigned char)src[1]] << 4) +
			hex_table[(unsigned char)src[2]];
		    src += 2;
		} else if (*src == '+') {
		    *dst = ' ';
		} else {
		    *dst = *src;
		}
	    }
	    *dst = '\0';
	    bused = dst - buf;
	}
    }

    /* remove xml RFC grunge */
    for (src = buf; *src == ' ' || *src == '\t'; ++src)
	;
    if (*src == '<' && src[1] == '?') {
	for (dst = buf; *src != '\0'; ++src) {
	    if (*src == '\r' || *src == '\n') {
		*dst++ = *src;
	    } else if (src[0] == '<' && src[1] == 'a' &&
		       (strncmp(src, "<artwork type=\"abnf\">", 21) == 0
			|| strncmp(src, "<artwork type='abnf'>", 21) == 0)) {
		src += 21;
		while (*src != '\0') {
		    if (src[0] == '<' && src[1] == '!'
			&& strncmp(src, "<![CDATA[", 9) == 0) {
			src += 9;
			while (*src != '\0'
			       && (src[0] != ']' || src[1] != ']' ||
				   src[2] != '>')) {
			    *dst++ = *src++;
			}
			src += 3;
			continue;
		    }
		    if (src[0] == '<' && src[1] == '/'
			&& strncmp(src, "</artwork>", 10) == 0) {
			src += 10;
			break;
		    }
		    if (src[0] == '&') {
			if (strncmp(src, "&gt;", 4) == 0) {
			    *dst++ = '>';
			    src += 4;
			    continue;
			}
			if (strncmp(src, "&lt;", 4) == 0) {
			    *dst++ = '<';
			    src += 4;
			    continue;
			}
			if (strncmp(src, "&amp;", 5) == 0) {
			    *dst++ = '&';
			    src += 5;
			    continue;
			}
			if (strncmp(src, "&apos;", 6) == 0) {
			    *dst++ = '\'';
			    src += 6;
			    continue;
			}
			if (strncmp(src, "&quot;", 6) == 0) {
			    *dst++ = '"';
			    src += 6;
			    continue;
			}
		    }
		    *dst++ = *src++;
		}
		continue;
	    }
	}
	*dst = '\0';
    }

    /* process lines */
    ptr = linestart = buf;
    hexmode = 0;
    line = 1;

    /* get tokens */
    for (;;) {
	/* skip whitespace */
	while (*ptr == ' ' || *ptr == '\t') {
	    ++ptr;
	    hexmode = 0;
	}

	/* get token, done if at end of line */
	if ((nptr = gettoken(ptr, &tokentype, hexmode)) == NULL)
	    break;

	/* handle errors */
	if (!tokentype) {
	    printf("Error (line %d, col %d): %s\n",
		    line, ptr - linestart, nptr);
	    exit(1);
	}

	/* reset hexmode */
	if (hexmode && tokentype != HEXNUMBER && tokentype != LITLIST
	    && tokentype != RANGE) {
	    hexmode = 0;
	}

	/* handle last rulename */
	if (lastrule != NULL) {
	    if (tokentype == RULEDEF) {
		lastrule->defined = 1;
	    } else if (tokentype == AMENDRULE) {
		lastrule->defined = lastrule->referenced = 1;
	    } else {
		lastrule->referenced = 1;
	    }
	    lastrule = NULL;
	}

	if (verbose && tokentype != NEWLINE) {
	    printf("line %d, col %d, %s",
		   line, ptr - linestart,
		   token_name[tokentype],
		   nptr - ptr, ptr);
	    if (nptr != NULL) {
		printf(": %.*s", nptr - ptr, ptr);
	    }
	    putchar('\n');
	}

	switch (tokentype) {
	  case NEWLINE:
	    hexmode = 0;
	    ++line;
	    ptr = linestart = nptr;
	    continue;
	    
	  case RULENAME:
	    lastrule = hashfind(ptr, nptr - ptr);
	    break;

	  case HEXMODE:
	    hexmode = 1;
	    break;

	  case RULEDEF:
	  case AMENDRULE:
	    if (plevel || blevel) {
	      UNBALANCED:
		printf("Error (line %d): Unbalanced %s\n",
		       line, plevel ? "parenthesis" : "square brackets");
		exit(1);
	    }
	    break;

	  case BEGINGROUP:
	    ++plevel;
	    break;

	  case ENDGROUP:
	    if (--plevel < 0) goto UNBALANCED;
	    break;
	    
	  case BEGINOPTION:
	    ++blevel;
	    break;

	  case ENDOPTION:
	    if (--blevel < 0) goto UNBALANCED;
	    break;
	}

	/* advance */
	ptr = nptr;
    }

    /* final warnings */
    if (plevel) {
	printf("Error at end of text: Unbalanced parenthesis\n");
    }
    if (blevel) {
	printf("Error at end of text: Unbalanced square brackets\n");
    }

    /* deal with last rule on last line */
    if (lastrule) {
	lastrule->referenced = 1;
    }

    /* list undefined and unreferenced rules */
    for (line = 0; line < HTSIZE; ++line) {
	lastrule = htable[line];
	while (lastrule) {
	    if (!lastrule->defined) {
		printf("undefined rule: %s\n", lastrule->data);
	    }
	    if (!lastrule->referenced) {
		printf("unreferenced rule: %s\n", lastrule->data);
	    }

	    lastrule = lastrule->next;
	}
    }

    printf("ABNF validation (version 1.0) completed\n");

    exit(0);
}
