#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <string.h>
#include <unistd.h>

// Raw byte; used where characters must be compared as unsigned values.
typedef unsigned char BYTE;
// 16-bit unsigned value (not used by the code visible in this file).
typedef unsigned short int WORD;

// Number of WORDSLIST entries that main() allocates for the word list.
#define MAXWORDS1 300000

// One (Chinese key, English word) pair of the reversed dictionary.
typedef struct {
    // NUL-terminated key; chomp() rejects keys longer than 10 bytes,
    // so 11 bytes leaves room for the terminator.
    char chinese[11];
    // English word; savetolist() stores the pointer it is given without
    // copying, so the pointed-to text must outlive the list.
    char * english;
} WORDSLIST;

// Output stream; main() opens it on "../dic/xdict_ce.dic".
FILE *output;
// Fixed-size scratch buffers for temporary string copies.
char buf[100];
char pre[16384];
// Number of entries of wordslist filled so far.
int wordscount;
// Array of collected pairs, allocated in main().
WORDSLIST * wordslist;

// Compares two NUL-terminated byte strings by unsigned byte value.
// Returns 1 if s1 sorts after s2, -1 if it sorts before, 0 if they are equal.
// A string that is a strict prefix of the other sorts first.
int mystrcmp(unsigned char * s1, unsigned char * s2)
{
    // Walk both strings in lock-step until one of them ends.
    while ( *s1 && *s2 )
    {
        if ( *s1 != *s2 )
            return ( *s1 > *s2 ) ? 1 : -1;
        s1++;
        s2++;
    }

    // Whichever string still has bytes left is the longer, hence greater, one.
    if ( *s1 )
        return 1;
    if ( *s2 )
        return -1;
    return 0;
}

int mycompare(const void * s1, const void * s2)
{
    if (s1==NULL || s2==NULL)
    {
        fprintf(stderr,"call err\n");
        return 0;
    }
    unsigned char * c1 = (unsigned char *)((*(WORDSLIST*)s1).chinese);
    unsigned char * c2 = (unsigned char *)((*(WORDSLIST*)s2).chinese);
    unsigned char * e1 = (unsigned char *)((*(WORDSLIST*)s1).english);
    unsigned char * e2 = (unsigned char *)((*(WORDSLIST*)s2).english);
    
    int value = mystrcmp(c1,c2);
    if ( value == 0 )
        value = strcasecmp((char*)e1,(char*)e2);
    if ( value == 0 )
        value = mystrcmp(e1,e2);
    return value;
}


// Deletes every "open ... close" span (delimiters included) from buffer,
// editing the string in place.
// Returns false when an opener has no closer after it, or when a closer is
// left over once all pairs have been removed; buffer may be partially edited
// in that case.
static bool striprange(char * buffer, char open, char close)
{
    char * scan = buffer;
    char * h;
    while ( (h=strchr(scan,open)) )
    {
        // Only a closer located after the opener can match it.
        char * t = strchr(h+1,close);
        if ( !t )
            return false;
        // The regions overlap, so memmove (not strcpy) closes the gap.
        memmove(h,t+1,strlen(t+1)+1);
        // Nothing before h contains an opener; resume from there.
        scan = h;
    }
    // A closer with no opener means the text was unbalanced.
    return strchr(buffer,close)==NULL;
}

// Removes all "(...)" spans and then all "[...]" spans from buffer in place.
//
// buffer:  NUL-terminated text to clean; modified in place.
// english: unused; kept so existing callers keep compiling.
//
// Returns true when every bracket was paired and removed, false when the
// brackets are unbalanced (the caller is expected to skip the entry).
// Brackets are processed in place, so the result no longer depends on the
// size of any scratch buffer. A closer that appears before its opener is
// treated as unbalanced instead of silently discarding the text that follows
// the opener.
bool predelete(char * buffer,char * english)
{
    (void)english;

    // Informational only: long inputs are still processed.
    if (strlen(buffer)>16000)
        printf("%s too long\n",buffer);

    return striprange(buffer,'(',')') && striprange(buffer,'[',']');
}

// Normalises one candidate Chinese key in place and decides whether it is
// usable.
//
// chinese: NUL-terminated, writable string; rewritten in place.
//
// Steps, in order:
//   - reject input of 100 bytes or more (a message is printed);
//   - delete every space;
//   - drop one trailing '.';
//   - if the text before the first '.' has no byte above 160, drop that
//     prefix and the '.' (presumably a part-of-speech tag such as "n.");
//   - reject a leading '.', a leading "..." or a leading ellipsis character;
//     cut a trailing "..." or trailing ellipsis character;
//   - drop one leading 'U' or 'C';
//   - cut the text at the first '!';
//   - reject if any remaining byte other than '.' is 160 or below;
//   - reject results shorter than 2 bytes or longer than 10 bytes.
//
// Returns true when the cleaned key should be kept, false when it must be
// skipped (the string may have been partially modified).
bool chomp(char * chinese)
{
    if ( strlen(chinese)>=100 )
    {
        printf("%s too long,skiped\n",chinese);
        return false;
    }

    // Delete all spaces by compacting the string in place; the write
    // position never overtakes the read position.
    char * in = chinese;
    char * out = chinese;
    for ( ; *in; in++ )
    {
        if ( *in!=' ' )
            *out++ = *in;
    }
    *out = '\0';

    // Drop a single '.' at the very end.
    size_t len = strlen(chinese);
    if ( len>0 && chinese[len-1]=='.' )
        chinese[len-1] = '\0';

    // Strip a prefix such as "n." when it holds no byte above 160.
    char * point = strchr(chinese,'.');
    if ( point )
    {
        if ( point == chinese )
            return false;
        bool preok = true;
        for ( unsigned char * t = (unsigned char *)chinese; (char *)t<point; t++ )
        {
            if ( *t > 160 )
                preok = false;
        }
        // Source and destination overlap, so strcpy would be undefined here.
        if ( preok )
            memmove(chinese,point+1,strlen(point+1)+1);
    }

    // The key must not start with '.' after the prefix was removed.
    if ( strchr(chinese,'.') == chinese )
        return false;

    // ASCII "...": reject at the head, cut at the tail.
    char * tt = strstr(chinese,"...");
    if ( tt )
    {
        if ( tt==chinese )
            return false;
        if ( tt==chinese+strlen(chinese)-3 )
            *tt = '\0';
    }

    // Two-byte ellipsis character: reject at the head, cut at the tail.
    // NOTE(review): the needle was an empty string in the file as found, which
    // matches at offset 0 and made this function reject every word. The
    // "chinese ..." intent and the 2-byte length suggest the GB2312 ellipsis
    // 0xA1 0xAD - confirm against the original source encoding.
    tt = strstr(chinese,"\xA1\xAD");
    if ( tt )
    {
        if ( tt==chinese )
            return false;
        if ( tt==chinese+strlen(chinese)-2 )
            *tt = '\0';
    }

    // Drop one leading 'U' or 'C' (presumably a grammar marker - verify).
    // strlen(chinese) bytes cover the tail plus its terminator.
    if ( chinese[0]=='U' || chinese[0]=='C' )
        memmove(chinese,chinese+1,strlen(chinese));

    // Anything from the first '!' on is discarded.
    tt = strchr(chinese,'!');
    if ( tt )
        *tt = '\0';

    // Only bytes above 160 (plus '.') may remain.
    for ( unsigned char * asc = (unsigned char *)chinese; *asc; asc++ )
    {
        if ( *asc<=160 && *asc!='.' )
            return false;
    }

    // Keep keys of 2..10 bytes only.
    len = strlen(chinese);
    if ( len<=1 || len>10 )
        return false;

    return true;
}

void savetolist(char * chinese, char * english)
{
    if ( chomp(chinese) )
    {
        strcpy(wordslist[wordscount].chinese,chinese);
        wordslist[wordscount].english=english;
//        fprintf(output,"%s--->%s\n",chinese,english);
        wordscount++;
    }
}

int main(int argc,char **argv)
{
    wordslist = (WORDSLIST *)new WORDSLIST[MAXWORDS1];
    if (wordslist==NULL)
    {
        fprintf(stderr,"Not enough memory\n");
        return -2;
    }
    int fd=open("../dic/xdict.dic",O_RDONLY);
    if(fd==-1)
    {
        printf("can't open source file!\n");
        return(-1);
    }
//    if (!(output = fopen("../dic/xdict_ce.dic", "wb")))
    if (!(output = fopen("../dic/xdict_ce.dic", "wb")))
    {
        fprintf(stderr,"Unable to open outfile\r\n");
        close(fd);
        return -2;
    }
    
    // get length of dicfile.
    struct stat stStat;
    if(fstat(fd,&stStat)!=0)
    {
        printf("can't get dictionary stat !\n");
        close(fd);
        fclose(output);
        return(-1);
    }
    long iFileSize=stStat.st_size;
    unsigned int iLength = 0;
    
    // get item count
    lseek(fd,0-sizeof(int)*2,SEEK_END);
    unsigned int iCapacity,iStyle;
    read(fd,&iCapacity,sizeof(int));
    read(fd,&iStyle,sizeof(int));
//    BYTE cIndex=(unsigned char)(iStyle>>24);
//    BYTE cWord=(unsigned char)(iStyle>>16);
//    BYTE cMeaning=(unsigned char)(iStyle>>8);
    BYTE cMark=(unsigned char)iStyle;

    // mmap the file to memory
    caddr_t pFileMem=(caddr_t)mmap( (caddr_t)0,iFileSize-sizeof(int)*2,
                            PROT_READ,MAP_SHARED|MAP_NORESERVE,fd,0 );
    if(pFileMem==MAP_FAILED)
    {
        printf("mmap error !\n");
        close(fd);
        fclose(output);
        return(-1);
    }

    // begin to read items.
    caddr_t p=pFileMem;
    caddr_t pMeaning, pMark;
    char * buffer = new char[64000];
    if (!buffer) return -1;
    while(p<pFileMem+iFileSize-sizeof(int)*2 && iLength<iCapacity)
    {
        pMeaning=p+strlen(p)+1;
        if ( !cMark )   // no Mark field, eg py2gb
            pMark = NULL;
        else
            pMark = pMeaning+strlen(pMeaning)+1;
        
        char *pSplit,*pHeader;
        strcpy(buffer,pMeaning);
//        fprintf(output,"ORIGIN:%s---->%s\n",buffer,p);
        if ( predelete(buffer,p) )
        {
            pSplit=pHeader=buffer;
            while ( *pSplit )
            {
                if ( *pSplit==';' || *pSplit==',' || *pSplit=='\n' || *pSplit=='\r')
                {
                    *pSplit = '\0';
                    savetolist(pHeader,p);
                    pHeader = pSplit+1;
                }
                pSplit++;
            }
//            if ( pHeader == buffer )
                savetolist(pHeader,p);
        }
        
        iLength++;
        if ( !cMark )
            p = pMeaning+strlen(pMeaning)+1;
        else
            p = pMark+strlen(pMark)+1;
    }
    
    fprintf(stderr,"%d words\n",wordscount);    
    fprintf(stderr,"sorting...\n");
    qsort(wordslist,wordscount,sizeof(WORDSLIST),mycompare);

    fprintf(stderr,"writing...\n");
    int newwordscount = 0;
    int meaningcount = 0;
    int maxmc = 0;int maxwords;
    char fakenull,fakeline;
    fakenull = '\0';
    fakeline = '\n';
    for (int i=0;i<wordscount;i++)
    {
        if ( i==0 || mystrcmp((BYTE*)(wordslist[i].chinese),(BYTE*)(wordslist[i-1].chinese)) )
        {
            if (i!=0)
                fwrite(&fakenull,sizeof(char),1,output);
            fwrite(wordslist[i].chinese,sizeof(char),strlen(wordslist[i].chinese)+1,output);
            fwrite(wordslist[i].english,sizeof(char),strlen(wordslist[i].english),output);
            newwordscount++;
            if ( meaningcount > maxmc )
            {
                maxmc = meaningcount;
                maxwords = i-1;
            }
            meaningcount=1;
        }
        else if ( strcasecmp(wordslist[i].english,wordslist[i-1].english) )
        {
            fwrite(&fakeline,sizeof(char),1,output);
            fwrite(wordslist[i].english,sizeof(char),strlen(wordslist[i].english),output);
            meaningcount++;
        }
    }
    fwrite(&fakenull,sizeof(char),1,output);
    
    fwrite(&newwordscount,sizeof(int),1,output);
    //          INDEX      GB      ENGLISH    MARK
    int style= (0<<24) + (1<<16) + (0<<8) +    0;
    fwrite(&style,sizeof(int),1,output);
    
    fprintf(stderr,"%d merged words\n",newwordscount);
    fprintf(stderr,"MAXMEANING:%s has %d meaning\n",wordslist[maxwords].chinese,maxmc);

    close(fd);
    fclose(output);
    delete [] buffer;
    delete [] wordslist;
    

    return(0);
}
