#include "cdict5.conf"
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <ctype.h>
#include <assert.h>

#define DICT_PATH       "/home/r59182/Install/Chinese/dict/etdictfromcdict"

#define KEYWORD_COLOR		""
#define WORD_CLASS_COLOR	""
#define IDIOM_COLOR		    ""
#define EXAMPLE_COLOR		""
#define INFLEXION_COLOR		""
#define OTHER_INFO_COLOR	""
#define MARKED_COLOR		""
#define ANSI_RESET		    ""
//#define COLUMN		75 
#define COLUMN		16000 

#define isBig5CodeHi(c)	((c) >= 0x81 && (c) <=0xFE)

typedef unsigned char BYTE;
typedef unsigned short WORD;

FILE      *FD;
const char sTargetFile[] = "cdict-gb.dic";

/*char *word_class[]={"", "", "", "ݴ", "", 
"", "", "ﶯ", "ﶯ", "", "Ӵ", 
"̾", "", "", "β", ""};

char *freq[]={"", "[γ]", "[]", "[]", ""};
*/

char *word_class[]={"Dλy", "W", "NW", "ήe", "Ƶ", 
"Uʵ", "ʵ", "Ϊʵ", "Ϊʵ", "t", "s", 
"Pĵ", "W", "W", "r", "r"};

char *freq[]={"", "[`Φr]", "[`Φr]", "[򥻦rJ]", ""};

        
/* Parallel tables indexed by an indentation level 0..5 (see printLine()):
 * indent[k] is the literal run of spaces written in front of a line, and
 * indentnum[k] is the length of that run, which is handed to justify() so
 * it can indent continuation lines by the same amount. */
char *indent[]={"", "  ", "   ", "     ", "  ", "   "};
int  indentnum[]={0, 2, 3, 5, 2, 3};

/*
 * One output entry. All three members are NUL-terminated strings that are
 * built up by appending; savetoitem() zeroes the whole struct before it
 * starts filling CurrentWordItem.
 *
 * CurrentWordItem is the entry most recently decoded by savetoitem().
 * LastWordItem is the pending entry kept by main() so that consecutive
 * entries whose English fields compare equal (ignoring case) can be merged
 * before being stored.
 */
struct WordItem {
    char English[100];      // headword text collected from type-1 records
    char Chinese[16000];    // formatted body text, one record per line
    char Mark[200];         // bytes collected from type-2 records, later run through convertmark()
} CurrentWordItem, LastWordItem;
    


/*
 * Byte-swap helper for values read from the dictionary files.
 *
 * When LITTLE_ENDIAN is defined at compile time the argument is returned
 * unchanged. Otherwise the argument is narrowed to int, its four bytes are
 * reversed, and the resulting int is widened back to long.
 */
long convertEndian(long i) {
#ifdef LITTLE_ENDIAN
  return i;
#else
  int narrowed = i;
  unsigned char raw[4];
  memcpy(raw, &narrowed, 4);

  /* Mirror the four bytes around the middle. */
  for (int lo = 0, hi = 3; lo < hi; ++lo, --hi) {
    const unsigned char keep = raw[lo];
    raw[lo] = raw[hi];
    raw[hi] = keep;
  }

  int swapped = 0;
  memcpy(&swapped, raw, 4);
  return swapped;
#endif
}

/*
 * Reader for an on-disk table of two-byte codes, addressed by a pair of
 * input bytes (see vMap()).
 *
 * The object owns the FILE handle opened by Init() and closes it in the
 * destructor. Copying would leave two objects closing the same handle, so
 * the copy operations are declared private and intentionally left without
 * a definition.
 */
class MapTable
{
private:
        FILE * file;                                // table file, NULL until Init() succeeds
        MapTable(const MapTable &);                 // not copyable: not implemented
        MapTable & operator=(const MapTable &);     // not copyable: not implemented
public:
        MapTable();
        ~MapTable();
        // Opens the table file; returns false if the name is NULL or the file cannot be opened.
        bool Init(const char * TabFileName);
        // Looks up the pair (ch1, ch2); on failure returns false and output holds ch1*256+ch2.
        bool vMap(BYTE ch1,BYTE ch2,WORD &output);
};

// A freshly constructed table has no file attached; Init() opens one later.
MapTable::MapTable()
    : file(NULL)
{
}

/*
 * Open the conversion table stored in TabFileName for binary reading.
 *
 * Returns true when the file was opened. Returns false when TabFileName is
 * NULL (the object is left untouched) or when the file cannot be opened
 * (a message is printed to stdout and the object holds no file).
 */
bool MapTable::Init(const char * TabFileName)
{
    if (TabFileName == NULL)
        return false;

    // A repeated Init() must not leak the handle opened by an earlier call.
    if (file != NULL)
    {
        fclose(file);
        file = NULL;
    }

    file = fopen(TabFileName, "rb");
    if (file == NULL)
    {
        printf("open table file error!\n");
        return false;
    }
    return true;
}

// Close the table file if one is open and forget the handle.
MapTable::~MapTable()
{
    if (file != NULL)
    {
        fclose(file);
        file = NULL;
    }
}

/*
 * Look up the table entry for the byte pair (ch1, ch2).
 *
 * output is first set to ch1*256 + ch2, so it still carries the input pair
 * if the lookup fails before anything is read. The entry position is
 * derived from ch1 (rows of 157 entries, counted from 161) and ch2, which
 * must lie in 0x40..0x7e or 0xa1..0xfe; any other ch2 yields false.
 * Returns false as well when seeking or reading the file fails.
 * Init() must have succeeded beforehand (asserted).
 */
bool MapTable::vMap(BYTE ch1,BYTE ch2,WORD &output)
{
    assert(file);
    output = ch1*256 + ch2;

    const int rowStart = (ch1-161)*157;
    long offset;
    if (ch2 >= 0xa1 && ch2 <= 0xfe)
        offset = (rowStart + (ch2-161) + 63)*sizeof(WORD);    // upper range follows the 63 lower-range slots
    else if (ch2 >= 0x40 && ch2 <= 0x7e)
        offset = (rowStart + (ch2-64))*sizeof(WORD);
    else
        return false;

    // Fetch the mapped code from the table file.
    if (fseek(file, offset, SEEK_SET) != 0)
        return false;
    return fread(&output, sizeof(WORD), 1, file) != 0;
}

class MapTable Big2GBTable;

void Big2GB(unsigned char * source)
{
    unsigned char * s = source;
    unsigned char * end = source+strlen((char*)source);
    while ( s < end )
    {
        if ( *s > 160 )
        {
            assert( s<=end-1 );
            //search table
            WORD gb;
            Big2GBTable.vMap(*s,*(s+1),gb);
            *s = (unsigned char)(gb/256);
            *(s+1) = (unsigned char)gb;
            s++;s++;
        }
        else
            s++; 
    }
    return;
}


/*
 * Strip everything in front of the first alphabetic character of source,
 * in place. A string that already starts with a letter is left untouched;
 * a string without any letter becomes empty.
 *
 * The tail is shifted with memmove(), so there is no intermediate buffer
 * and no limit on the string length. Characters are passed to isalpha() as
 * unsigned char because a negative char value is not a valid argument.
 */
void deleteprechar(char * source)
{
    char * firstAlpha = source;
    while ( *firstAlpha && !isalpha((unsigned char)*firstAlpha) )
        firstAlpha++;

    if ( firstAlpha != source )
        memmove(source, firstAlpha, strlen(firstAlpha)+1);   // +1 carries the terminator along
}
/*
 * Replace the control codes 0x01..0x1f in source with printable ASCII
 * characters, in place. Bytes from 0x20 upwards are left unchanged.
 *
 * The translation is one byte to one byte, so it is done directly in the
 * caller's buffer: no scratch buffer is needed and strings of any length
 * are accepted.
 *
 * Codes without an assigned replacement (0x07-0x0a, 0x0c, 0x0d, 0x12,
 * 0x14, 0x1a, 0x1b) are not expected in the data and trip an assert; when
 * asserts are compiled out such a byte is kept as it is.
 */
void convertmark(char * source)
{
    // Replacement for each code 0x00..0x1f; 0 marks a code with no replacement.
    static const char kGlyph[0x20] = {
        0,   'I', 'W', 'K', 'A', ':', 'C', 0,
        0,   0,   0,   'Q', 0,   0,   'U', 'V',
        'E', 'F', 0,   'Z', 0,   'm', 'N', 'L',
        '9', 'G', 0,   0,   'J', 'S', '3', '!'
    };

    for (char * p = source; *p; ++p)
    {
        const unsigned char code = (unsigned char)*p;
        if ( code >= 0x20 )
            continue;

        const char glyph = kGlyph[code];
        assert( glyph != 0 );
        if ( glyph != 0 )
            *p = glyph;
    }
}
/*
 * Wrap the NUL-terminated text in str, in place, so that no line is longer
 * than COLUMN-indent characters, and return str.
 *
 * str       text to wrap; the buffer must have room for the inserted
 *           newline and indent bytes (one '\n' plus indent spaces per
 *           break), which this function cannot check
 * indent    number of spaces put in front of every continuation line
 * stringlen length of the text as known by the caller; it may be larger
 *           than strlen(str)
 *
 * A line may be broken at a space (the space is replaced by the newline)
 * or in front of a Big5 double-byte character (the newline is inserted).
 * Wrapping stops and the rest is left on one line when no usable break
 * point exists, instead of breaking at an indeterminate position.
 */
unsigned char *justify(unsigned char *str, int indent, int stringlen) {
  const int len = COLUMN-indent;     /* usable width of one line */
  unsigned char *begin = str;        /* first character of the current line */
  unsigned char *ptr = str;          /* scan position */

  while (stringlen > len) {
    unsigned char *bp = NULL;        /* last possible break point on this line */

    while (*ptr) {
      if (ptr-begin > len) break;
      if (*ptr == ' ')
        bp = ptr;
      else if (isBig5CodeHi(*ptr)) {
        /* Breaking in front of the very first character of a line would
           make no progress, so such a position does not count. */
        if (ptr != begin) bp = ptr;
        /* Skip the second byte, unless the string ends in a lone lead byte. */
        if (ptr[1]) ptr++;
      }
      ptr++;
    }

    /* The scan reached the end within the width: stringlen overstated the
       remaining text and nothing is left to wrap. */
    if (ptr-begin <= len) break;
    /* No place to break this line. */
    if (bp == NULL) break;

    begin = bp+1+indent;
    /* Open a gap for the indent; memmove copes with the overlap and copies
       exactly up to and including the terminator. */
    if (*bp == ' ') memmove(begin, bp+1, strlen((char *)bp+1)+1);
    else memmove(begin, bp, strlen((char *)bp)+1);

    memset(bp+1, ' ', indent);
    *bp = '\n';
    stringlen = strlen((char *)begin);
    ptr = begin;
  }

  return str;
}

/* Append one byte to the NUL-terminated string in dst (capacity cap).
   Asserts that there is room; without asserts the byte is dropped. */
static void appendByte(char *dst, size_t cap, char c) {
  const size_t used = strlen(dst);
  assert(used + 1 < cap);
  if (used + 1 < cap) {
    dst[used] = c;
    dst[used+1] = '\0';
  }
}

/* Append text to the NUL-terminated string in dst (capacity cap).
   Asserts that there is room; without asserts the text is dropped. */
static void appendText(char *dst, size_t cap, const char *text) {
  const size_t used = strlen(dst);
  const size_t extra = strlen(text);
  assert(used + extra < cap);
  if (used + extra < cap)
    memcpy(dst + used, text, extra + 1);
}

/* Map an indentation level onto a valid index of indent[]/indentnum[];
   anything outside the tables becomes level 0 (no indentation). */
static int clampIndentLevel(int level) {
  const int levels = (int)(sizeof(indentnum)/sizeof(indentnum[0]));
  return (level < 0 || level >= levels) ? 0 : level;
}

/* Append a decoded headword letter: the letter itself goes to English,
   and a '.' followed by the letter goes to Chinese. */
static void appendHeadLetter(char letter) {
  appendByte(CurrentWordItem.English, sizeof(CurrentWordItem.English), letter);
  appendByte(CurrentWordItem.Chinese, sizeof(CurrentWordItem.Chinese), '.');
  appendByte(CurrentWordItem.Chinese, sizeof(CurrentWordItem.Chinese), letter);
}

/* Append the same text to both the English and the Chinese field. */
static void appendHeadText(const char *text) {
  appendText(CurrentWordItem.English, sizeof(CurrentWordItem.English), text);
  appendText(CurrentWordItem.Chinese, sizeof(CurrentWordItem.Chinese), text);
}

/*
 * Decode the record starting at begin and append its text to
 * CurrentWordItem. Returns the number of bytes the record occupies, so the
 * caller can step to the next record.
 *
 * Record layout as read here: begin[0] is a tag whose high nibble selects
 * the record type, begin[1] is the payload length; a length byte of 255 is
 * extended by begin[2] and moves the payload start from offset 2 to 3.
 * Every payload byte is XORed with 0xa5 before use. The function does not
 * know where the caller's buffer ends.
 *
 *   type 2      payload is appended to Mark
 *   type 1      payload is the headword: appended to English and, wrapped
 *               in "< " ... " >", to Chinese, followed by a freq[] label
 *   otherwise   payload (cut at the first '#') is one body line for
 *               Chinese; the tag selects its indentation and colour macro
 */
unsigned int printLine(unsigned char *begin) {
  unsigned char buf[1024], rawchar;
  unsigned int  i, len, offst=2, type;
  char * const chinese = CurrentWordItem.Chinese;
  const size_t chineseCap = sizeof(CurrentWordItem.Chinese);

  len = begin[1];
  if (len==255) {
    len+=begin[2];
    offst=3;
  }

  type = *begin/0x10;

  if (type==2) {
    for (i=offst; i<len+offst; i++) {
      rawchar = begin[i]^(unsigned char)0xa5;
      /* Diagnostic before the assert in appendByte() fires: one more byte
         plus the terminator no longer fits. */
      if ( strlen(CurrentWordItem.Mark)+1 >= sizeof(CurrentWordItem.Mark) )
        printf("%s:%s\n",CurrentWordItem.English,CurrentWordItem.Mark);
      appendByte(CurrentWordItem.Mark, sizeof(CurrentWordItem.Mark), (char)rawchar);
    }
    return len+offst;
  }

  if (type==1) {
    appendText(chinese, chineseCap, "< ");
    for (i=offst; i<len+offst; i++) {
      rawchar = begin[i]^(unsigned char)0xa5;

      if (rawchar >= 225)
        appendHeadLetter((char)(rawchar-225+'a'));
      else if (rawchar >= 193)
        appendHeadLetter((char)(rawchar-193+'A'));
      else if (rawchar == 172)
        appendHeadText("., ");
      else if (rawchar == 160)
        appendHeadText(". ");
      else if (rawchar == 168)
        appendHeadText("(");
      else if (rawchar >= 129)
        appendHeadLetter((char)(rawchar-129+'a'));
      else {
        appendByte(CurrentWordItem.English, sizeof(CurrentWordItem.English), (char)rawchar);
        appendByte(chinese, chineseCap, (char)rawchar);
      }
    }
    appendText(chinese, chineseCap, " >\n");

    /* The low nibble of the tag picks the label; freq[] is shorter than
       the 16 possible values, so unknown values get no label. */
    const unsigned int freqIndex = *begin-0x10;
    const char *label = (freqIndex < sizeof(freq)/sizeof(freq[0])) ? freq[freqIndex] : "";
    appendText(chinese, chineseCap, label);
    if ( strlen(label) )
      appendText(chinese, chineseCap, "\n");

    return len+offst;
  }

  /* Body line: decode the payload, stopping at the first '#'. */
  for (i=0; i<len; i++) {
    buf[i] = begin[i+offst]^(char)0xa5;
    if (buf[i]=='#') {
      buf[i]=0;
      break;
    }
  }
  buf[len]=0;

  /* Work out indentation level and colour from the tag. A NULL colour
     means a plain line: no colour prefix and no ANSI_RESET suffix. */
  const char *color = NULL;
  int level = 0;

  if (*begin == 0xC4 || *begin == 0xD8)
    color = INFLEXION_COLOR;
  else if (*begin == 0x84 || type == 0x5)
    color = IDIOM_COLOR;
  else if (type == 0xD || type == 0xA || type == 0xE || type == 0x9) {
    switch (type) {
      case 0xD: level = *begin-0xDC; break;
      case 0xA: level = *begin-0xA5; break;
      case 0x9: level = *begin-0x9D; break;
      case 0xE: level = *begin-0xE4; break;
    }
    color = EXAMPLE_COLOR;
  }
  else if (type == 0xC) {
    if (*begin >= 0xC8) {
      level = *begin - 0xC8;
      color = OTHER_INFO_COLOR;
    }
    else
      level = *begin - 0xC0;
  }
  else if (type == 0x8)
    level = *begin - 0x81;

  level = clampIndentLevel(level);

  appendText(chinese, chineseCap, indent[level]);
  if (color != NULL)
    appendText(chinese, chineseCap, color);
  appendText(chinese, chineseCap, (const char *)justify(buf, indentnum[level], len));
  if (color != NULL)
    appendText(chinese, chineseCap, ANSI_RESET);
  appendText(chinese, chineseCap, "\n");

  return len+offst;
}

/*
 * Decode one dictionary entry from buf (buflen bytes) into CurrentWordItem.
 *
 * CurrentWordItem is cleared first. Records are then consumed one after
 * another until the end of the buffer or until the next record whose tag
 * has high nibble 1, which is where the following entry starts; the first
 * record is always consumed.
 *   tag 0x6n  one byte: appends "<<" word_class[n] ">>" as a line
 *   tag 0x50  one byte: appends a fixed "<<...>>" line
 *   other     handed to printLine(), which reports the record size
 * Afterwards a single trailing newline is removed from Chinese, Chinese
 * and Mark are passed through Big2GB(), leading non-letters are removed
 * from English, and the control codes in Mark are replaced by
 * convertmark().
 */
void savetoitem(unsigned char * buf,int buflen)
{
  int index = 0;
  memset(&CurrentWordItem,0,sizeof(CurrentWordItem));

  do {
    const unsigned char tag = buf[index];
    if (tag/0x10 == 6) {
      strcat(CurrentWordItem.Chinese, WORD_CLASS_COLOR "<<");
      strcat(CurrentWordItem.Chinese, word_class[tag-0x60]);
      strcat(CurrentWordItem.Chinese, ">>" ANSI_RESET "\n" );
      index++;
    }
    else if (tag == 0x50) {
      strcat(CurrentWordItem.Chinese, WORD_CLASS_COLOR "<<PN>>" ANSI_RESET "\n");
      index++;
    }
    else index += (printLine(buf+index));
    /* Check the position before looking at the byte, so nothing beyond
       the buflen valid bytes is inspected. */
  } while (index <= buflen-1 && buf[index]/0x10 != 1);

  const size_t textLen = strlen(CurrentWordItem.Chinese);
  if ( textLen && CurrentWordItem.Chinese[textLen-1]=='\n' )
      CurrentWordItem.Chinese[textLen-1]='\0';

  Big2GB((BYTE*)CurrentWordItem.Chinese);
  Big2GB((BYTE*)CurrentWordItem.Mark);
  deleteprechar(CurrentWordItem.English);
  convertmark(CurrentWordItem.Mark);
}

// Upper-case an ASCII letter a..z; every other byte value is returned unchanged.
inline unsigned char mytoupper(unsigned char c)
{
    const bool isLower = (c >= 'a') && (c <= 'z');
    return isLower ? c - ('a' - 'A') : c;
}

/*
 * Compare two NUL-terminated byte strings by unsigned byte value.
 * Returns 1 if s1 sorts after s2, -1 if it sorts before, 0 if both are
 * identical; a string sorts after any proper prefix of itself.
 */
int mystrcmp(unsigned char * s1, unsigned char * s2)
{
    const unsigned char * a = s1;
    const unsigned char * b = s2;

    while ( *a && *b )
    {
        if ( *a != *b )
            return (*a > *b) ? 1 : -1;
        ++a;
        ++b;
    }

    // At least one string has ended; the longer one is the greater.
    if ( *a )
        return 1;
    if ( *b )
        return -1;
    return 0;
}

/*
 * qsort() comparison for WordItem elements, ordering by the English field.
 *
 * The fields are compared byte by byte after ASCII upper-casing. If one is
 * a prefix of the other the shorter sorts first. If they are equal apart
 * from letter case, mystrcmp() on the unmodified strings decides.
 */
int mycompare(const void * s1, const void * s2)
{
    assert(s1!=NULL && s2!=NULL);
    unsigned char * left  = (unsigned char *)(((WordItem*)s1)->English);
    unsigned char * right = (unsigned char *)(((WordItem*)s2)->English);

    unsigned char * l = left;
    unsigned char * r = right;
    for ( ; *l && *r; ++l, ++r )
    {
        const unsigned char ul = mytoupper(*l);
        const unsigned char ur = mytoupper(*r);
        if ( ul != ur )
            return (ul > ur) ? 1 : -1;
    }

    if ( *l )
        return 1;       // right ended first, so left is the longer string
    if ( *r )
        return -1;      // left ended first
    // Same length and equal ignoring case: fall back to the exact comparison.
    return mystrcmp(left, right);
}

int main() {
  int i, index_len, data_len, word_count,wordlen,total_word=0;
  int indexFD, dictFD;
  unsigned char o;
  unsigned char buf[16384], index_path[128], dict_path[128];
  long *indice;
  memset(&LastWordItem,0,sizeof(LastWordItem));
  
  FD = fopen(sTargetFile, "w+");  
  if (FD == NULL) {
      fprintf(stderr, "cannot open file %s for writing\n", sTargetFile);
      exit(-1);
  }
  Big2GBTable.Init("../table/big2gb.table");
  for (o='a'; o<='z'; o++)
  {
    word_count = 0;
    sprintf((char *)index_path, DICT_PATH "/%c.i50", o);
    sprintf((char*)dict_path, DICT_PATH "/%c.d50", o);

    indexFD = open((char *)index_path, O_RDONLY);
    dictFD  = open((char *)dict_path, O_RDONLY);
  
    if (dictFD < 0 || indexFD < 0) {
      fprintf(stderr, "file %s or %s not open\n", index_path, dict_path);
      exit(-1);
    }
    else 
      fprintf(stderr, "Processing %c\n", o);

    index_len = lseek(indexFD, 0, SEEK_END);
    lseek(indexFD, 0, 0);
    data_len = lseek(dictFD, 0, SEEK_END);
    lseek(dictFD, 0, 0);
    
    indice = (long*)malloc(index_len);
    index_len = read(indexFD, (unsigned char*)indice, index_len);
    if (index_len <= 0) {
      fprintf(stderr, "%s corrupted\n", index_path);
      exit(-1);
    }
    close(indexFD);

    struct WordItem * WordItemLib = new WordItem[index_len/4];
    assert ( WordItemLib );

    for (i = 0; i < index_len/4; i++) {
      lseek(dictFD, convertEndian(indice[i]) & 0x1FFFFFL, 0);
      if ( i<index_len/4 - 1 )
          wordlen = (convertEndian(indice[i+1])&0x1FFFFFL) - (convertEndian(indice[i])&0x1FFFFFL);
      else
          wordlen = data_len - (convertEndian(indice[i])&0x1FFFFFL);
      
      if ( wordlen >= sizeof(buf) )
      {
          fprintf(stderr,"wordlen is: %d \n",wordlen);
          wordlen = sizeof(buf);
          exit(-1);
      }

      if ( (wordlen=read(dictFD, buf, wordlen)) <= 0 ) {
        fprintf(stderr, "%s corrupted\n", dict_path);
        exit(-1);
      }
      if (*buf/0x10 != 1) {
        fprintf(stderr, "error: %dth word\n", i);
        exit(-1);
      }
   
      savetoitem(buf,wordlen);
      
      if ( !strcasecmp(CurrentWordItem.English,LastWordItem.English) )  //equal
      {
          strcat(LastWordItem.Chinese,"\n\n");
          assert( strlen(LastWordItem.Chinese)+strlen(CurrentWordItem.Chinese) <= sizeof(CurrentWordItem.Chinese) );
          strcat(LastWordItem.Chinese,CurrentWordItem.Chinese);
          strcat(LastWordItem.Mark," | ");
          assert( strlen(LastWordItem.Mark)+strlen(CurrentWordItem.Mark) <= sizeof(CurrentWordItem.Mark) );
          strcat(LastWordItem.Mark,CurrentWordItem.Mark);
      }
      else
      { 
          if ( strlen(LastWordItem.English) )
          {
              WordItemLib[word_count++] = LastWordItem;
          }
          LastWordItem = CurrentWordItem;
          //memcpy(&LastWordItem,&CurrentWordItem,sizeof(LastWordItem));
      }
      if (word_count%0xFF == 0)
          fprintf(stderr, "%d%%\r", word_count*400/index_len);    //progress
    }   // each word
    WordItemLib[word_count++] = LastWordItem;
    memset(&LastWordItem,0,sizeof(LastWordItem));
    
    printf("sort...\n");
    qsort(WordItemLib,word_count,sizeof(WordItem),mycompare);
    
    printf("saving...\n");
    for(int currentitem=0;currentitem<word_count;currentitem++)
    {
        fwrite(WordItemLib[currentitem].English,sizeof(char),strlen(WordItemLib[currentitem].English)+1,FD);
        fwrite(WordItemLib[currentitem].Chinese,sizeof(char),strlen(WordItemLib[currentitem].Chinese)+1,FD);
        fwrite(WordItemLib[currentitem].Mark,sizeof(char),strlen(WordItemLib[currentitem].Mark)+1,FD);
    }

    close(dictFD);
    free(indice);
    fprintf(stderr, "%c part completed.\n", o);
    total_word += word_count;
  } // end of for a-z

  fprintf(stderr,"Total Words:%d\n",total_word);
  fwrite(&total_word,sizeof(total_word),1,FD);
  int style = (0<<24) + (0<<16) + (1<<8) + 4 ;
  fwrite(&style,sizeof(int),1,FD);
  fclose(FD);
  exit(0);
  return 0;
}
