/*
 * biblewords.c 
 * by John Holder
 *
 * This code is free under the terms of the BSD-Unix license.
 *
 * Note: this code computes word frequencies from data on stdin
 * A binary tree is used to store the data.  
 * The input data must be in the ISO-Latin-1 (ISO8859-1) encoding.
 * The code tries to filter out invalid data.
 * Results, in no particular order, are dumped to stdout.  
 * Use the UNIX tools grep and sort to query the results.
 */
#include <stdio.h>
#include <string.h>

typedef struct tag_wordnode    WORDNODE, *PWORDNODE;

struct tag_wordnode 
{
   char *word;
   long n;
   PWORDNODE l;
   PWORDNODE r;
};


int myisalpha( unsigned char c )
{
  if( 
      ( c >= 0x41 && c <= 0x5A ) ||
      ( c >= 0x61 && c <= 0x7A ) ||
      ( c >= 0xC0 && c <= 0xD6 ) ||
      ( c >= 0xD8 && c <= 0xF6 ) ||
      ( c >= 0xF8 && c <= 0xFF ) 
    )
  {
    return 1;
  }
  return 0;
}

void lower_case( unsigned char *c )
{
   int i;
   for(i=0; i<strlen(c); i++)
   {
     if( c[i] >= 'A'  && c[i] <='Z'   ) c[i]+= 'a'-'A';
     if( c[i] >= 0xc0 && c[i] <= 0xde ) c[i]+= 0x20;
   }
}

int good_word( char *word )
{
  int n, len;

  if( !myisalpha( word[0] )  )
    return 0;

  if( strstr( word, "ftp" ) != NULL )
    return 0;

  if( strstr( word, "http" ) != NULL )
    return 0;

  len = strlen( word );
  for( n=1; n<len; n++)
  {
    if( !myisalpha( word[n] ) )
    {
      return 0;
    }
  }
  return 1;
}

void insert_word( char *word, PWORDNODE *wordtree )
{
   if( *wordtree == NULL ) 
   {
      *wordtree = (PWORDNODE) malloc( sizeof(WORDNODE) );
      memset( *wordtree, 0, sizeof( WORDNODE ) );
      (*wordtree)->word = strdup(word);
      return;
   }
   else if( strcmp( (*wordtree)->word, word ) > 0 )
   {
      if( (*wordtree)->l == NULL ) 
      {
         (*wordtree)->l = (PWORDNODE) malloc( sizeof(WORDNODE) );
         memset( (*wordtree)->l, 0, sizeof( WORDNODE ) );
         (*wordtree)->l->word = strdup(word);
      }
      else
      {
         insert_word( word, &((*wordtree)->l) );
      }
   }
   else if( strcmp( (*wordtree)->word, word ) < 0 )
   {
      if( (*wordtree)->r == NULL ) 
      {
         (*wordtree)->r = (PWORDNODE) malloc( sizeof(WORDNODE) );
         memset( (*wordtree)->r, 0, sizeof( WORDNODE ) );
         (*wordtree)->r->word = strdup(word);
      }
      else
      {
         insert_word( word, &((*wordtree)->r) );
      }
   }
   else
   {
      (*wordtree)->n++;
   }
}

void add_word( char *word, PWORDNODE *wordtree )
{
   if( good_word( word ) )
   {
      lower_case( word );
      insert_word( word, wordtree );
   }
}

void get_words( PWORDNODE *wordtree )
{
   char buf[512];
   char *p;

   while( fgets(buf, 512, stdin) != NULL )
   {
      if(buf[0]!='B' && buf[0]!='T' && buf[0]!='E') {
         for( p = strtok( buf , " \t\n\r\f\v\"'`-;:.,!?()[]*{}" );
              p;
              p = strtok( NULL, " \t\n\r\f\v\"'`-;:.,!?()[]*{}") )
         {
            add_word( p, wordtree );
         }
      }
             
   }
}

void write_dict( PWORDNODE wordtree )
{
   if( wordtree == NULL ) return;
   if( wordtree->l ) write_dict( wordtree->l );
   fprintf( stdout, "%08ld  %s\n", wordtree->n+1, wordtree->word );
   if( wordtree->r ) write_dict( wordtree->r );
}

int main(void)
{
  PWORDNODE wordtree = NULL;

  get_words( &wordtree );

  write_dict( wordtree );
  
}
