dictionary-builder.c

Go to the documentation of this file.
00001 /*
00002      This file is part of libextractor.
00003      (C) 2002, 2003, 2004, 2005, 2006 Vidyut Samanta and Christian Grothoff
00004 
00005      libextractor is free software; you can redistribute it and/or modify
00006      it under the terms of the GNU General Public License as published
00007      by the Free Software Foundation; either version 2, or (at your
00008      option) any later version.
00009 
00010      libextractor is distributed in the hope that it will be useful, but
00011      WITHOUT ANY WARRANTY; without even the implied warranty of
00012      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013      General Public License for more details.
00014 
00015      You should have received a copy of the GNU General Public License
00016      along with libextractor; see the file COPYING.  If not, write to the
00017      Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00018      Boston, MA 02111-1307, USA.
00019  */
00020 /**
00021  * Tool to build a bloomfilter from a dictionary.
00022  */
00023 
00024 #include "platform.h"
00025 #include <string.h>
00026 #include "bloomfilter.h"
00027 
/**
 * Turn on a single bit in a bit array.  Bits are numbered from the
 * least significant bit of the first byte upwards, eight per byte.
 * Setting a bit that is already set leaves the array unchanged.
 *
 * @param bitArray memory area to set the bit in
 * @param bitIdx which bit to set
 */
static void
setBit (unsigned char *bitArray, unsigned int bitIdx)
{
  unsigned char *slot = &bitArray[bitIdx >> 3];

  *slot |= (unsigned char) (1u << (bitIdx & 7u));
}
00045 
00046 /**
00047  * Callback: increment bit
00048  *
00049  * @param bf the filter to manipulate
00050  * @param bit the bit to increment
00051  * @param arg not used
00052  */
00053 static void
00054 setBitCallback (Bloomfilter * bf, unsigned int bit, void *arg)
00055 {
00056   setBit (bf->bitArray, bit);
00057 }
00058 
00059 /**
00060  * Add an element to the filter
00061  *
00062  * @param bf the filter
00063  * @param e the element
00064  */
00065 static void
00066 addToBloomfilter (Bloomfilter * bf, const HashCode160 * e)
00067 {
00068 
00069   if (NULL == bf)
00070     return;
00071   iterateBits (bf, &setBitCallback, NULL, e);
00072 }
00073 
00074 
00075 #define ADDR_PER_ELEMENT 46
00076 
00077 
00078 int
00079 main (int argc, char **argv)
00080 {
00081   Bloomfilter bf;
00082   HashCode160 hc;
00083   int i;
00084   int j;
00085   int cnt;
00086   char *fn;
00087   char **words;
00088   char line[2048];              /* buffer overflow, here we go */
00089   FILE *dictin;
00090   char *bn;
00091   char *charset = NULL;
00092 #define ALLOCSIZE 1024*1024
00093 
00094   if (argc < 3)
00095     {
00096       fprintf (stderr,
00097                _("Please provide the name of the language you are building\n"
00098                  "a dictionary for.  For example:\n"));
00099       fprintf (stderr, "$ ./dictionary-builder ./en en > en.c\n");
00100       exit (-1);
00101     }
00102 
00103   fn = malloc (strlen (argv[1]) + 6);
00104   strcpy (fn, argv[1]);
00105   strcat (fn, ".txt");
00106   dictin = fopen (fn, "r");
00107   free (fn);
00108   if (dictin == NULL)
00109     {
00110       fprintf (stderr,
00111                _("Error opening file `%s': %s\n"), argv[1], strerror (errno));
00112       exit (-1);
00113     }
00114 
00115   words = malloc (sizeof (char *) * ALLOCSIZE); /* don't we LOVE constant size buffers? */
00116   if (words == NULL)
00117     {
00118       fprintf (stderr, _("Error allocating: %s\n."), strerror (errno));
00119       exit (-1);
00120     }
00121   cnt = 0;
00122   memset (&line[0], 0, 2048);
00123   fscanf (dictin, "%s", (char *) &line);
00124   charset = strdup (line);      /* not used (yet) */
00125   while (1 == fscanf (dictin, "%s", (char *) &line))
00126     {
00127       words[cnt] = strdup (line);
00128       cnt++;
00129       memset (&line[0], 0, 2048);
00130       if (cnt >= ALLOCSIZE)
00131         {
00132           fprintf (stderr, _("Increase ALLOCSIZE (in %s).\n"), __FILE__);
00133           exit (-1);
00134         }
00135     }
00136 
00137   bf.addressesPerElement = ADDR_PER_ELEMENT;
00138   bf.bitArraySize = (1 + (cnt / SUBTABLES)) * sizeof (int) * SUBTABLES;
00139   bf.bitArray = malloc (bf.bitArraySize);
00140   memset (bf.bitArray, 0, bf.bitArraySize);
00141 
00142   for (i = 0; i < cnt; i++)
00143     {
00144       hash (words[i], strlen (words[i]), &hc);
00145       addToBloomfilter (&bf, &hc);
00146     }
00147 
00148   fprintf (stdout, "#include \"bloomfilter-def.h\"\n");
00149 
00150   /* use int[] instead of char[] since it cuts the memory use of
00151      gcc down to a quarter; don't use long long since various
00152      gcc versions then output tons of warnings about "decimal constant
00153      is so large that it is unsigned" (even for unsigned long long[]
00154      that warning is generated and dramatically increases compile times). */
00155   for (j = 0; j < SUBTABLES; j++)
00156     {
00157       char fn[64];
00158       FILE *btfile;
00159 
00160       snprintf (fn, 64, "%s_%d.c", argv[1], j);
00161       btfile = fopen (fn, "w+");
00162       if (btfile == NULL)
00163         {
00164           fprintf (stderr,
00165                    _("Error opening file `%s': %s\n"), fn, strerror (errno));
00166           exit (-1);
00167         }
00168       fprintf (btfile, "int %s_bits_%d[] = { ", argv[2], j);
00169       for (i = j * (bf.bitArraySize / sizeof (int) / SUBTABLES);
00170            i < (j + 1) * (bf.bitArraySize / sizeof (int) / SUBTABLES); i++)
00171         fprintf (btfile, "%dL,", (((int *) bf.bitArray)[i]));
00172       fprintf (btfile, "};\n");
00173       fclose (btfile);
00174       fprintf (stdout, "extern int %s_bits_%d[];\n", argv[2], j);
00175     }
00176 
00177   fprintf (stdout, "static int * bits[] = { ");
00178   for (i = 0; i < SUBTABLES; i++)
00179     fprintf (stdout, "%s_bits_%d,", argv[2], i);
00180   fprintf (stdout, "};\n");
00181   bn = &argv[1][strlen (argv[1])];
00182   while ((bn != argv[1]) && (bn[0] != '/'))
00183     bn--;
00184   if (bn[0] == '/')
00185     bn++;
00186   fprintf (stdout, "Bloomfilter libextractor_printable_%s_filter = {\n" "  %u,\n" "  NULL,\n"   /* bitarray */
00187            "  (unsigned char **)bits,\n"        /* sbitArray */
00188            "  %u };\n", bn, ADDR_PER_ELEMENT, bf.bitArraySize);
00189   free (charset);
00190   return 0;
00191 }

Generated on Fri Jan 9 12:44:26 2009 for libextractor by  doxygen 1.5.1