/* Extracts tables from HTML pages into tab-separated ASCII files. */

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include "entities.c"

/* Maximum table nesting depth: gettables() keeps one output file open per
   nesting level and skips tables nested deeper than this. */
#define MAXOUT 8

/* outname: name of the output file currently being written; main() builds
   its prefix from the input file name.
   fnum:    points into outname at the position where gettables() prints
   the table number (or "--" for a skipped table). */
char outname[256], *fnum;


/*
 * gettables - write every table found in an HTML stream to its own file.
 *
 * infile: open, readable stream containing the HTML text.  It is read up to
 *         EOF (or until an output file cannot be created) and is NOT closed
 *         here.
 *
 * The globals outname/fnum must be set up by the caller: the running table
 * number is printed at fnum ("%02d") to complete the output file name in
 * outname before each file is opened.
 *
 * Output format:
 *   - <tr> starts a new line, <th>/<td> start a new tab separated field;
 *   - inside a field all tags are dropped, runs of whitespace collapse to a
 *     single blank, leading and trailing whitespace is dropped, and
 *     entities found in the entity[] table are replaced by one character;
 *   - a nested table gets a file of its own; when it ends, "<name>" (the
 *     nested file's name) is written into the enclosing table's file.  Text
 *     following the nested table is not copied until the next <th>/<td>;
 *   - at most MAXOUT tables may be open at once; deeper tables are skipped
 *     and referenced with "--" in place of the number.
 *
 * Errors (file cannot be created, nesting too deep, unmatched or missing
 * table end tags) are reported on stderr.
 */
void gettables (FILE *infile)
 {
  int c, depth = -1, exc = 0, index = 0, copy = 0, i, n, act;
  int wasspace = 0, within = 0, dashcnt;
  char kept[8];                 /* entity text read so far, for replay */
  struct
   {
    FILE *filep;
    int   index;                /* running number of this table */
    short firstrow;             /* no record written yet */
    short firstcol;             /* no field written yet in this record */
   }
  outfile[MAXOUT];

  /* Invariant: exc > 0 (inside a table nested too deeply) implies
     depth == -1 while outfile[0 .. MAXOUT-1] are all still open. */

  c = fgetc (infile);
  while (c != EOF)
   {
    i = 0;                      /* also flag to say that char is already fetched */
    if (c == '<')               /* tag starting? */
     {
      c = fgetc (infile);
      if (tolower (c) == 't')   /* tag starting with 't' */
       {
        c = fgetc (infile);
        switch (tolower (c))
         {
         case 'a':              /* table starts a new file */
          /* the name must end after "table": the character that follows
             may not be alphanumeric */
          if ((c = fgetc (infile), tolower (c) == 'b') &&
              (c = fgetc (infile), tolower (c) == 'l') &&
              (c = fgetc (infile), tolower (c) == 'e') &&
              (c = fgetc (infile), !isalnum (c)))
           {
            copy = 0;
            if (exc == 0)
             {
              if (++depth < MAXOUT)
               {
                sprintf (fnum, "%02d", index);
                if ((outfile[depth].filep = fopen (outname, "w")) == NULL)
                 {
                  fprintf (stderr, "error: cannot open %s - aborting input file.\n", outname);
                  depth--;
                  goto quitfile;
                 }
                else
                  printf ("writing HTML table to file: %s\n", outname);
                outfile [depth].index = index++;
                outfile [depth].firstrow = 1;
                /* a field may show up before any <tr>; never leave this
                   flag uninitialized */
                outfile [depth].firstcol = 1;
               }
              else
               {
                fprintf (stderr, "error: nesting depth exceeds %d - ignored.\n", MAXOUT);
                exc = 1;
                depth = -1;     /* suppress all output until this table ends */
               }
             }
            else
              exc++;
           }
          break;

         case 'r':              /* tr starts a new record */
          /* only "tr" itself, not e.g. "track" */
          if ((c = fgetc (infile), !isalnum (c)) && depth >= 0)
           {
            if (outfile [depth].firstrow)
              outfile [depth].firstrow = 0;
            else
              fputc ('\n', outfile[depth].filep);
            outfile [depth].firstcol = 1;
            copy = 0;
           }
          break;

         case 'h':              /* th, td start a new field */
         case 'd':
          /* only "th"/"td" themselves, not e.g. "thead" */
          if ((c = fgetc (infile), !isalnum (c)) && depth >= 0)
           {
            if (outfile [depth].firstcol)
              outfile [depth].firstcol = 0;
            else
              fputc ('\t', outfile[depth].filep);
            /* the record has begun even if its <tr> was omitted, so the
               next <tr> has to start a new line */
            outfile [depth].firstrow = 0;
            copy = 1;
            wasspace = within = 0;
           }
          break;

         default:
          break;
         }
       }
      else if (c == '/')        /* end tag */
       {
        c = fgetc (infile);
        if (tolower (c) == 't')
         {
          c = fgetc (infile);
          switch (tolower (c))
           {
           case 'a':
            /* /table ends current file and switches to previous */
            if ((c = fgetc (infile), tolower (c) == 'b') &&
                (c = fgetc (infile), tolower (c) == 'l') &&
                (c = fgetc (infile), tolower (c) == 'e') &&
                (c = fgetc (infile), !isalnum (c)))
             {
              copy = 0;
              if (depth >= 0)
               {
                fputc ('\n', outfile[depth].filep);
                fclose (outfile[depth].filep);
               }
              else if (exc > 0)
               {
                if (--exc == 0)
                 {
                  /* leaving the outermost skipped table: pretend it sat at
                     level MAXOUT so that the code below steps back to the
                     innermost open file and references it as "--" */
                  depth = MAXOUT;
                  sprintf (fnum, "--");
                 }
               }
              else
                fprintf (stderr, "error: unmatched table end tag (previous: %s)\n", outname);
              if (depth >= 0 && --depth >= 0)
               {
                /* leave a reference to the nested table in its parent, then
                   restore the parent's number in outname */
                fprintf (outfile[depth].filep, "<%s>", outname);
                sprintf (fnum, "%02d", outfile[depth].index);
               }
             }
            break;

           case 'r':            /* /tr, /th, /td all end the current field */
           case 'h':
           case 'd':
            /* not e.g. "/thead" */
            if (c = fgetc (infile), !isalnum (c))
              copy = 0;
            break;

           default:
            break;
           }
         }
       }
      else if (c == '!' &&      /* html comment */
              ((c = fgetc (infile)) == '-') &&
              ((c = fgetc (infile)) == '-'))
       {
        dashcnt = 0;            /* skip to comment end */
        while ((c = fgetc (infile)) != EOF)
         {
          if (c == '-')
            dashcnt++;
          else if (c == '>' && dashcnt >= 2)
            break;
          else
            dashcnt = 0;
         }
       }
      while (c != '>')          /* skip to end of tag */
       {
        if (c == EOF)
         {
          i = 1;                /* do not read beyond EOF below */
          break;
         }
        c = fgetc (infile);
       }
     }
    else                        /* text outside tags (c != '<') */
     {
      if (copy)
       {
        if (isspace (c))
          wasspace = within;    /* remember blanks only after field text */
        else
         {
          if (wasspace)
           {
            fputc (' ', outfile[depth].filep);
            wasspace = 0;
           }
          if (c == '&')         /* html entity starting? */
           {
            /* entity[] comes from entities.c.  As used here, [0] is a
               character to match (a negative value ends a list of
               alternatives) and [1] is the replacement character for a
               ';' entry, otherwise the index of the list for the next
               character. */
            n = act = 0;
            for (;;)            /* entity parser */
             {
              if (n >= (int) sizeof kept)
               {
                /* name longer than kept[] can hold: give up and copy what
                   was read literally; c is looked at again as plain text */
                while (i < n)
                  fputc (kept[i++], outfile[depth].filep);
                break;
               }
              kept[n++] = c;
              c = fgetc (infile);
              while (entity[act][0] != c && entity[act][0] >= 0)
                act++;
              if (entity[act][0] < 0)
               {
                /* no such entity: copy the kept text literally; i becomes
                   nonzero, so c is looked at again by the main loop */
                while (i < n)
                  fputc (kept[i++], outfile[depth].filep);
                break;
               }
              else if (c == ';')
               {
                fputc (entity[act][1], outfile[depth].filep);
                break;
               }
              else
                act = entity[act][1];
             }
           }
          else                  /* simple char */
           {
            fputc (c, outfile[depth].filep);
           }
          within = 1;
         }
       }
     }
    if (!i)
      c = fgetc (infile);
   }

quitfile:

  /* Input ended inside a table that was nested too deeply: all MAXOUT
     files are still open although depth says otherwise. */
  if (exc > 0)
    depth = MAXOUT - 1;

  while (depth >= 0)            /* close tables that were never ended */
   {
    sprintf (fnum, "%02d", outfile[depth].index);
    fprintf (stderr, "error: table end tag missing in %s\n", outname);
    fputc ('\n', outfile[depth].filep);
    fclose (outfile[depth--].filep);
   }
 }


/*
 * Program entry: extracts the tables of every HTML file named on the
 * command line.
 *
 * For each input file the output name prefix is built in outname: the
 * extension of the input name is replaced by (or, if there is none, the
 * name is extended with) ".t", and fnum is left pointing right behind it,
 * where gettables() prints the table number.  Input names that leave no
 * room for that number are reported and skipped.
 *
 * Returns -1 if no file was given, 0 otherwise (files that cannot be
 * opened are reported on stderr but do not change the result).
 */
int main (int argc, char *argv[])
 {
  /* room needed behind the prefix: 't', up to 11 characters of a printed
     int, and the terminating NUL */
  const size_t tail = 13;
  int curr;
  size_t len, prefix;
  char *base, *sep, *dot;
  FILE *infile;

  printf ("TableExtract v1.0 (c) 2000 Heinz Repp\n");

  if (argc < 2)
   {
    fprintf (stderr, "usage: %s <HTML-file> [...]\n", argv[0]);
    return -1;                  /* same effect as exit (-1), needs no <stdlib.h> */
   }

  for (curr = 1; curr < argc; curr++)
   {
    len = strlen (argv[curr]);
    if (len >= sizeof outname)
     {
      fprintf (stderr, "error: file name too long: %s\n", argv[curr]);
      continue;
     }
    strcpy (outname, argv[curr]);

    /* look for the extension in the last path component only, accepting
       both '\\' and '/' as separators */
    base = outname;
    if ((sep = strrchr (base, '\\')) != NULL)
      base = sep + 1;
    if ((sep = strrchr (base, '/')) != NULL)
      base = sep + 1;
    dot = strrchr (base, '.');

    /* length of everything up to and including the dot */
    prefix = dot ? (size_t) (dot + 1 - outname) : len + 1;
    if (prefix + tail > sizeof outname)
     {
      fprintf (stderr, "error: file name too long: %s\n", argv[curr]);
      continue;
     }

    if (dot)
      fnum = dot + 1;
    else
     {
      fnum = outname + len;     /* no extension: append one */
      *fnum++ = '.';
     }
    *fnum++ = 't';
    *fnum = '\0';

    infile = fopen (argv[curr], "r");
    if (infile != NULL)
     {
      gettables (infile);
      fclose (infile);
     }
    else
      fprintf (stderr, "error: cannot open %s\n", argv[curr]);
   }

  return 0;
 }
