diff options
Diffstat (limited to 'kiten/xjdxgen.c')
-rw-r--r-- | kiten/xjdxgen.c | 425 |
1 files changed, 425 insertions, 0 deletions
diff --git a/kiten/xjdxgen.c b/kiten/xjdxgen.c new file mode 100644 index 00000000..8dcfdd64 --- /dev/null +++ b/kiten/xjdxgen.c @@ -0,0 +1,425 @@ +/************************************************************************** +* X J D X G E N +* Author: Jim Breen +* Index (.xjdx) generator program fron XJDIC +* +* V2.3 - indexes JIS X 0212 (3-byte EUC) kanji +***************************************************************************/ +/* This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 1, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +/* Changed: ignore all rc stuff. use args 1 and 2 for input/output file. + -- jason */ + +#include <config.h> +#include <inttypes.h> +#include <sys/stat.h> +#include <unistd.h> +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <string.h> + +#ifdef HAVE_STDINT_H +#include <stdint.h> +#endif + +#include "xjdic.h" + +#define TRUE 1 +#define FALSE 0 +#define SPTAG '@' +#define EXLIM 100 +#define TOKENLIM 40 + +unsigned char *db; +unsigned char ENVname[50]; +unsigned char *dicenv; +struct stat buf; +uint32_t dbyte; +uint32_t *jindex; +uint32_t indptr,llone; +const char *ctl_file = ".xjdicrc"; +const char *Dname; +const char *JDXname; +unsigned char exlist[EXLIM][11]; /* list of words to be excluded */ +int excount,exlens[EXLIM]; +int jiver = 14; /*The last time the index structure changed was Version1.4*/ + +/*====== prototypes=================================================*/ +int stringcomp(unsigned char *s1, unsigned char *s2); +void jqsort(int32_t i, int32_t j); +int Kstrcmp(uint32_t lhs, uint32_t rhs); +void xjdicrc(); +int alphaoreuc(unsigned char x); + +int stringcomp(unsigned char *s1, unsigned char *s2) +{ + uint i; + unsigned char c1,c2; + + for(i = 0; i < strlen(s1);i++) + { + c1 = s1[i]; + if (c1 < 0x60) c1 = (c1|0x20); + c2 = s2[i]; + if (c2 < 0x60) c2 = (c2|0x20); + if (c1 != c2) return(1); + } + return (0); +} + +extern char *getenv(const char *name); + + +/*====function to Load Dictionary and load/create index table=======*/ +int main(argc,argv) +int argc; +unsigned char **argv; +{ + FILE *fp,*fopen(); + uint32_t possav,schi,diclen,indlen; + int i,inwd,cstrp,saving,isc,nodread; + unsigned char c; + unsigned char currstr[TOKENLIM]; + + printf("\nNOTE: running this program by itself is never necessary. Kiten will run it automatically.\n"); + printf("\nXJDXGEN V2.3 Index Table Generator for XJDIC. \n Copyright J.W. Breen, 1998\n"); + + if (argc < 3) + { + printf("\nUSAGE: kitengen input output.xjdx\n"); + exit(2); + } + + Dname = argv[1]; + JDXname = argv[2]; + printf("Commandline request to use files %s and %s \n", Dname, JDXname); + + inwd = FALSE; + indptr = 1; + llone = 1; + if(stat(Dname, &buf) != 0) + { + perror(NULL); + printf("Cannot stat: %s \n",Dname); + exit(1); + } + diclen = buf.st_size; + printf("\nWARNING!! This program may take a long time to run .....\n"); + + puts ("\nLoading Dictionary file. Please wait.....\n"); + fp=fopen(Dname,"rb"); + if (fp==NULL ) + { + printf("\nCannot open dictionary file\n"); + exit(1); + } + db = (unsigned char *)malloc((diclen+100) * sizeof(unsigned char)); + if(db == NULL) + { + fprintf(stderr,"malloc() for dictionary failed.\n"); + fclose(fp); + exit(1); + } + nodread = diclen/1024; + dbyte = fread((unsigned char *)db+1, 1024, nodread, fp); + nodread = diclen % 1024; + dbyte = fread((unsigned char *)(db+(diclen/1024)*1024)+1, nodread,1, fp); + fclose(fp); + diclen++; + dbyte = diclen; + db[diclen] = 10; + db[0] = 10; + printf("Dictionary size: %d bytes.\n",dbyte); + indlen = (diclen * 3)/4; + jindex = (uint32_t *)malloc(indlen); + if(jindex == NULL) + { + fprintf(stderr,"malloc() for index table failed.\n"); + exit(1); + } + printf("Parsing.... \n"); + /*this is the dictionary parser. It places an entry in jindex for every + kana/kanji string and every alphabetic string it finds which is >=3 + characters and is not on the "exclude" list */ + indptr = 1; + saving = FALSE; + cstrp = 0; + for (schi =0; schi < dbyte; schi++) /* scan whole dictionary */ + { + c = db[schi]; + if (inwd) + { + if ((alphaoreuc(c))||(c == '-')||(c == '.')||((c >= '0') && (c <= '9'))) + { + currstr[cstrp] = c; + if(cstrp < TOKENLIM-1) cstrp++; + } + else + { + currstr[cstrp] = '\0'; + inwd = FALSE; + if ((strlen(currstr) <= 2) && (currstr[0] < 127))saving = FALSE; + if ((strlen(currstr) == 2) && (currstr[1] <= '9'))saving = TRUE; + if (saving && (currstr[0] > 127)) + { + possav = jindex[indptr]; + indptr++; + if (indptr > indlen/sizeof(int32_t)) + { + printf("Index table overflow. Dictionary too etarge?\n"); + exit(1); + } +/* generate index for *every* kanji in key */ + i = 2; + if (currstr[0] == 0x8f) i++; + for ( ; i < strlen(currstr); i+=2) + { + if((currstr[i] >= 0xb0) || (currstr[i] == 0x8f)) + { + jindex[indptr] = possav+i; + indptr++; + if (indptr > indlen/sizeof(int32_t)) + { + printf("Index table overflow. Dictionary too large?\n"); + exit(1); + } + } + if (currstr[i] == 0x8f) i++; + } + } + if (saving && (currstr[0] < 127)) + { + indptr++; + if (indptr > indlen/sizeof(int32_t)) + { + printf("Index table overflow. Dictionary too large?\n"); + exit(1); + } +/* If this is non-Japanese, and has a 'SPTAGn' tag, generate two indices */ + if ( currstr[0] == SPTAG) + { + jindex[indptr] = jindex[indptr-1]+1; + strcpy(currstr,currstr+1); + indptr++; + if (indptr > indlen/sizeof(int32_t)) + { + printf("Index table overflow. Dictionary too large?\n"); + exit(1); + } + } + if (currstr[0] < 128) + { + for (isc = 0; isc <= excount; isc++) + { + if (( (uint) exlens[isc] == strlen(currstr)) && + (stringcomp(currstr,exlist[isc]) == 0) ) + { + indptr--; + break; + } + } + } + } + } + } + else + { + if (alphaoreuc(c) || c == SPTAG) + { + inwd = TRUE; + jindex[indptr] = schi; + cstrp = 1; + currstr[0] = c; + currstr[1] = '\0'; + saving = TRUE; + } + } + } + indptr--; + printf("Index entries: %d \nSorting (this is slow)......\n",indptr); + jqsort(llone,indptr); + printf("Sorted\nWriting index file ....\n"); + fp = fopen(JDXname,"wb"); + if (fp==NULL ) + { + printf("\nCannot open %s output file\n",JDXname); + exit(1); + } + jindex[0] = diclen+jiver; + fwrite(jindex,sizeof(int32_t),indptr+1,fp); + fclose(fp); + return 0; +} +/*======function to sort jindex table====================*/ +void jqsort(int32_t lhs, int32_t rhs) +{ + int32_t i,last,midp; + uint32_t temp; + if (lhs >= rhs) return; + /* Swap ( lhs , (lhs+rhs)/2);*/ + midp = (lhs+rhs)/2; + temp = jindex[lhs]; + jindex[lhs] = jindex[midp]; + jindex[midp] = temp; + last = lhs; + for (i = lhs+1;i <= rhs; i++) + { + if (Kstrcmp(jindex[i],jindex[lhs]) < 0) + { + /* Swap(++last,i);*/ + last++; + temp = jindex[i]; + jindex[i] = jindex[last]; + jindex[last] = temp; + } + } +/* Swap (lhs,last);*/ + temp = jindex[lhs]; + jindex[lhs] = jindex[last]; + jindex[last] = temp; + jqsort(lhs,last-1); + jqsort(last+1,rhs); +} +/*=====string comparison used by jqsort==========================*/ +int Kstrcmp(uint32_t lhs, uint32_t rhs) +{ + int i,c1 = 0, c2 = 0; +/* effectively does a strnicmp on two "strings" within the dictionary, + except it will make katakana and hirgana match (EUC A4 & A5) */ + + for (i = 0; i<20 ; i++) + { + c1 = db[lhs+i]; + c2 = db[rhs+i]; + if ((i % 2) == 0) + { + if (c1 == 0xA5) + { + c1 = 0xA4; + } + if (c2 == 0xA5) + { + c2 = 0xA4; + } + } + if ((c1 >= 'A') && (c1 <= 'Z')) c1 |= 0x20; + if ((c2 >= 'A') && (c2 <= 'Z')) c2 |= 0x20; + if (c1 != c2 ) break; + } + return(c1-c2); +} + +/*=====xjdicrc - access and analyze "xjdicrc" file (if any)==============*/ +/* +void xjdicrc() +{ + unsigned char xjdicdir[PATH_MAX],rcstr[80],*rcwd; + int iex; + FILE *fm,*fopen(); + + iex = 0; + xjdicdir[0] = '\0'; + dicenv = (unsigned char *)getenv("XJDIC"); + if (!dicenv) dicenv = (unsigned char *)DEFAULT_DICDIR; + if (strlen(dicenv) <= 2) + { + dicenv = (unsigned char *)getcwd(ENVname,sizeof(ENVname)); + if (dicenv == NULL) + { + printf("Cannot extract working directory!\n"); + exit(1); + } + } + else + { + strncpy(ENVname,dicenv,sizeof(ENVname)); + } + ENVname[sizeof(ENVname)-1] = '\0'; + + xjdicdir[sizeof(xjdicdir)-1] = '\0'; + if (strlen(ENVname) > 2) + { + strncpy(xjdicdir,ENVname, sizeof(xjdicdir)-1); + strncat(xjdicdir,"/", sizeof(xjdicdir)-1-strlen(xjdicdir)); + } + else + { + strncpy(xjdicdir,(unsigned char *)getenv("HOME"), sizeof(xjdicdir)-1); + strncat(xjdicdir,"/", sizeof(xjdicdir)-1-strlen(xjdicdir)); + } + + strncat(xjdicdir, ctl_file, sizeof(xjdicdir)-1-strlen(xjdicdir)); + fm = fopen(xjdicdir,"r"); + if (fm == NULL) + { + // Weird code --waba + strncat(xjdicdir, ctl_file, sizeof(xjdicdir)-1-strlen(xjdicdir)); + fm = fopen(xjdicdir,"r"); + } + if (fm != NULL) + { + while(fgets(rcstr,79,fm) != NULL) + { + rcwd = (unsigned char *)strtok(rcstr," \t"); + if( stringcomp((unsigned char *)"exlist",rcwd) == 0) + { + while (TRUE) + { + rcwd = (unsigned char *)strtok(NULL," \t\f\r\n"); + if (rcwd == NULL) break; + strncpy(exlist[iex],rcwd, sizeof(exlist[iex])); + exlist[iex][sizeof(exlist[iex])-1] = '\0'; + exlens[iex] = strlen(rcwd); + if (iex < EXLIM) iex++; + } + excount = iex-1; + continue; + } + } + } + if (fm == NULL) + { + printf("No control file detected!\n"); + return; + } + else + { + fclose(fm); + return; + } +} +*/ +/*=======function to test a character for alpha or kana/kanji====*/ +int alphaoreuc(unsigned char x) +{ + int c; + + c = x & 0xff; + if(((c >= 65) && (c <= 90)) || ((c >= 97) && (c <= 122))) + { + return (TRUE); + } + if ((c >= '0') && (c <= '9')) + { + return(TRUE); + } + if ((c & 0x80) > 0) + { + return(TRUE); + } + return (FALSE); +} + |