#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <tcl.h>

/* Kind of tag currently being read; stored in parse()'s tagtype while the
 * element name (GI) is being accumulated. */
#define ETAG -1
#define STAG 1
/* States used by parse() while skipping the document prologue. */
#define DOCTYPE 2
#define BRACK 3

/* States of the attribute scanner inside parse(). */
#define START 0
#define NAME 1
#define EQUALS 2
#define QUOTE 3
#define VALUE 4

/* Number of distinct tag names registered so far.  countTags() increments it
 * and analyseCounts() iterates from 1 up to this value. */
static int ntags = 0;

/* Size of the line buffer handed to fgets() in parse(). */
#define MAXLINECHAR 5000
/* Forward declarations: both functions are defined further down in this
 * file but are called from main(). */
int parse(Tcl_Interp *interp, int pass, char *filename);
int analyseCounts(Tcl_Interp* interp);

/*
 * main -- command-line entry point.
 *
 * Expects exactly one argument, the name of the file to process.  The file
 * is parsed twice: pass 1 counts start and end tags, analyseCounts() then
 * inspects those counts, and pass 2 prints the output.
 *
 * Returns 0 on success and 1 when the arguments are wrong or the file
 * cannot be opened.
 */
int main (int argc, char* argv[]) {

    FILE *fp;
    Tcl_Interp *interp;

    if (argc != 2) {
        printf("Usage:\n");
        printf("    dtdless <filename>\n");
        return 1;
    }
    /* Open once up front purely to check that the file is readable;
     * parse() opens it again for each pass. */
    if ((fp = fopen(argv[1],"r")) == NULL) {
        fprintf(stderr,"Cannot open file: %s\n",argv[1]);
        return 1;
    }
    fclose(fp);
    
    /* The interpreter is used only as storage for the tag-count arrays. */
    interp = Tcl_CreateInterp();

    parse(interp, 1, argv[1]);
    analyseCounts(interp);
    parse(interp, 2, argv[1]);
    return 0;
}

/* Forward declaration: countTags() is defined further down in this file. */
int countTags(Tcl_Interp* interp, char esistag, char* tag);

/* Append c to buf (current length *len, capacity cap) only if room remains
 * for it plus a terminating NUL; otherwise the character is dropped. */
static void appendBounded(char *buf, int *len, int cap, char c) {
    if (*len < cap - 1) {
        buf[(*len)++] = c;
    }
}

/*
 * parse -- scan a marked-up file one character at a time.
 *
 *   interp    Tcl interpreter holding the tag-count arrays
 *   pass      1: call countTags() for every tag, print nothing
 *             2: print the document as lines on stdout:
 *                  <content>            text collected between tags
 *                  A<NAME> CDATA <val>  one per attribute
 *                  (<TAG>  /  )<TAG>    start tag / end tag
 *                  C                    once, at end of input
 *                and follow a start tag with a synthetic ")<TAG>" when
 *                the Tcl array EMPTY has an entry for that tag
 *   filename  file to read
 *
 * Tag and attribute names are upper-cased.  Text longer than an internal
 * buffer is truncated rather than overflowing the buffer.
 *
 * Returns 0 on success, -1 if the file cannot be opened.
 *
 * NOTE(review): a '>' always ends the tag, even inside a quoted attribute
 * value; this matches the original scanner and is left unchanged.
 */
int parse(Tcl_Interp *interp, int pass, char *filename) {

    char line[MAXLINECHAR];
    char attname[10000];
    char attval[10000];
    /* static: about 1 MB, kept off the stack.  parse() is not re-entrant. */
    static char content[1000000];
    char tag[100];
    char *pl, *ptag, *pd;
    const char *ptem;
    int intag, tagtype, pn, pv, att, esistag, pc, firstnl, temp;
    char c, delim;
    FILE *fp;

    fp = fopen(filename, "r");
    if (fp == NULL) {
        fprintf(stderr,"Cannot open file: %s\n",filename);
        return -1;
    }
    intag = 0;
    pc = 0;
    content[0] = 0;
    firstnl = 1;
    tagtype = 0;
    temp = 0;
    /* Give every scanner variable a defined value before the first tag. */
    pn = 0;
    pv = 0;
    att = START;
    esistag = '(';
    delim = ' ';
    tag[0] = 0;
    ptag = tag;
    attname[0] = 0;
    attval[0] = 0;

/* skip DOCTYPE and whitespace */
    while (fgets(line,MAXLINECHAR,fp) != NULL) {
        if ((pd = strstr(line, "<!")) != NULL) {
            if (strncasecmp(pd, "<!DOCTYPE", 9) == 0) {
                temp = DOCTYPE;
            }
        }
/* are there 'included' ENTITIES, etc.?  IGNORE!!! */
        if (temp == DOCTYPE && strstr(line,"[") != NULL) {
            temp = BRACK;
        }
        if (temp == DOCTYPE && strstr(line,  ">") != NULL) break;
        if (temp == BRACK && strstr(line,  "]>") != NULL) break;
        if (temp == 0) {
/* if there is no DOCTYPE, rewind the file and break to parser.
 * NOTE(review): only the first character is tested, so a first line that
 * starts with whitespace is skipped even if it carries markup. */
            if (!isspace((unsigned char) line[0])) {
                rewind(fp);
                break;
            }
        }
    }

    while (fgets(line,MAXLINECHAR,fp) != NULL) {
/* Add newlines (as "\n"), except that SGML ignores first newline of content 
 * and also last newline, if followed by '<' at start of line
 */
        if (!intag && !firstnl && line[0] != '<') {
            /* Append both characters or neither. */
            if (pc + 2 < (int) sizeof content) {
                content[pc++] = '\\';
                content[pc++] = 'n';
            }
        }
        pl = line;
        while ((c = *pl++) != 0) {
/* ignore embedded CR and NL - menaces! */
            if (c == '\r' || c == '\n') { continue; }
            if (!intag) {
                if (c == '<') {
/* output any content */
                    if (!firstnl && pass == 2) {
                        content[pc] = 0;
                        printf("%s\n",content);
                    }
                    firstnl = 1;
                    intag = 1;
                    ptag = tag;
                    if (*pl == '/') {
                        tagtype = ETAG;
                        esistag = ')';
                        pl++;
                    } else {
                        tagtype = STAG;
                        esistag = '(';
                    }
                    /* Restart the attribute scanner for every tag so no
                     * state leaks in from the previous one. */
                    att = START;
                    pn = 0;
                    pv = 0;
                } else {
                    appendBounded(content, &pc, (int) sizeof content, c);
                    firstnl = 0;
                }
            } else if (tagtype == ETAG || tagtype == STAG) {
                if (c == '>' || isspace((unsigned char) c)) {
                    *ptag = 0;
                    tagtype = 0;        /* name complete */
                    if (c == '>') {
                        /* Re-read '>' so the branch below closes the tag. */
                        pl--;
                    }
/* accumulate GI */
                } else if (c != '<') {
                    if (ptag < tag + sizeof tag - 1) {
                        *ptag++ = (char) toupper((unsigned char) c);
                    }
                }
            } else if (c == '>') {
                /* Flush an attribute whose value was still open. */
                if (pn && pv) {
                    attval[pv] = 0;
                    if (pass == 2) {
                        printf("A%s CDATA %s\n",attname,attval);
                    }
                    pv = 0;
                }
/* first pass we count the tags, second we insert any closing EMPTY tags */
                if (pass == 1) {
                    countTags(interp,esistag,tag);
                } else if (pass == 2) {
                    printf("%c%s\n",esistag,tag);
                    if (esistag == '(') {
                        ptem = Tcl_GetVar2(interp, "EMPTY", tag, 0);
                        if (ptem != NULL) {
                            printf("%c%s\n", ')', tag);
                        }
                    }
                }
                intag = 0;
                /* Content after a tag is collected behind a leading '-'. */
                content[0] = '-';
                pc = 1;
            } else if (att == START) {
                if (!isspace((unsigned char) c)) {
                    att = NAME;
                    pn = 0;
                    attname[pn++] = (char) toupper((unsigned char) c);
                    delim = ' ';
                }
            } else if (att == NAME) {
                if (isspace((unsigned char) c)) {
                    attname[pn] = 0;
                    att = EQUALS;
                } else if (c == '=') {
                    attname[pn] = 0;
                    att = QUOTE;
                } else {
                    if (pn < (int) sizeof attname - 1) {
                        attname[pn++] = (char) toupper((unsigned char) c);
                    }
                }
            } else if (att == EQUALS) {
                if (isspace((unsigned char) c)) {
                    /* skip whitespace before '=' */
                } else if (c == '=') {
                    att = QUOTE;
                } else {
                    fprintf(stderr,"Bad equals: %s",line);
                }
            } else if (att == QUOTE) {
                if (isspace((unsigned char) c)) {
                    /* skip whitespace after '=' */
                } else if (c == '"' || c == '\'') {
                    delim = c;
                    att = VALUE;
                    pv = 0;
                } else {
                    /* Unquoted value: this is its first character. */
                    attval[0] = c;
                    pv = 1;
                    att = VALUE;
                }
            } else if (att == VALUE) {
                if ((delim == ' ' && isspace((unsigned char) c)) ||
                    c == delim) {
                    att = START;
                    pn = 0;
                    attval[pv] = 0;
                    if (pass == 2) {
                        printf("A%s CDATA %s\n", attname, attval);
                    }
                    pv = 0;
                } else {
                    if (pv < (int) sizeof attval - 1) {
                        attval[pv++] = c;
                    }
                }
            }
        }
    }
    if (pass == 2) printf("C\n");
    fclose(fp);
    return 0;
}


/*
 * countTags -- record one occurrence of a tag in the interpreter.
 *
 *   interp   Tcl interpreter holding the count arrays
 *   esistag  '(' for a start tag (counted in array START),
 *            ')' for an end tag (counted in array END)
 *   tag      tag name, used as the array index
 *
 * The first time a tag name is seen in either form it also gets the next
 * serial number: TAG(<tag>) is set to "1" and TAGS(<serial>) is set to
 * the tag name, with the file-level counter ntags supplying the serial.
 *
 * Returns 0 on success, -1 if esistag is neither '(' nor ')'.
 */
int countTags(Tcl_Interp* interp, char esistag, char* tag) {
    
    char result[100];
    const char *temp;
    const char *array;
    int num;

    if (esistag == '(') {
        array = "START";
    } else if (esistag == ')') {
        array = "END";
    } else {
        /* No array to count in; do not pass an unset pointer to Tcl. */
        return -1;
    }
    temp = Tcl_GetVar2(interp, array, tag, 0);
    if (temp == NULL) {
        Tcl_SetVar2(interp, array, tag, "1", 0);
    } else {
        /* An unparsable stored value restarts the count from zero. */
        if (Tcl_GetInt(interp, temp, &num) != TCL_OK) {
            num = 0;
        }
        /* Format into our own buffer: the string returned by Tcl_GetVar2
         * is owned by Tcl and must not be written to. */
        snprintf(result, sizeof result, "%d", num + 1);
        Tcl_SetVar2(interp, array, tag, result, 0);
    }
/* give this tag (in any form) a serial number */
    temp = Tcl_GetVar2(interp, "TAG", tag, 0);
    if (temp == NULL) {
        Tcl_SetVar2(interp, "TAG", tag, "1", 0);
        snprintf(result, sizeof result, "%d", ++ntags);
        Tcl_SetVar2(interp, "TAGS", result, tag, 0);
    }
    return 0;
}

/*
 * analyseCounts -- compare start-tag and end-tag counts for every tag.
 *
 * Walks the serial numbers 1..ntags, looks up each tag name in the Tcl
 * array TAGS, and reads its counts from the arrays START and END (a
 * missing entry counts as 0).
 *
 *   - A tag with no end tags at all is marked by setting EMPTY(<tag>).
 *   - A tag whose end count differs from its start count is reported on
 *     stderr as UNBALANCED.
 *
 * Returns 1 if any tag was unbalanced, otherwise 0.
 */
int analyseCounts(Tcl_Interp* interp) {
    char chari[100];
    const char *tag, *temp;
    int i, nstart, nend, fail;

    fail = 0;
    for (i = 1; i <= ntags; i++) {
        snprintf(chari, sizeof chari, "%d", i);
        tag = Tcl_GetVar2(interp, "TAGS", chari, 0);
        if (tag == NULL) {
            /* No name stored under this serial number; nothing to check. */
            continue;
        }

        nstart = 0;
        temp = Tcl_GetVar2(interp, "START", tag, 0);
        if (temp != NULL && Tcl_GetInt(interp, temp, &nstart) != TCL_OK) {
            nstart = 0;
        }

        nend = 0;
        temp = Tcl_GetVar2(interp, "END", tag, 0);
        if (temp != NULL && Tcl_GetInt(interp, temp, &nend) != TCL_OK) {
            nend = 0;
        }

        if (nend == 0) {
            Tcl_SetVar2(interp, "EMPTY", tag, "1", 0);
        } else if (nend != nstart) {
            fprintf(stderr, "UNBALANCED: %s   %d  %d\n", tag, nstart, nend);
            fail = 1;
        }
    }
    return fail;
}
