00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "platform.h"
00025 #include <string.h>
00026 #include "bloomfilter.h"
00027
00028
00029
00030
00031
00032
00033
00034
/**
 * Turn on a single bit in a byte-addressed bit array.
 *
 * @param bitArray byte buffer holding the bits; must cover byte bitIdx / 8
 * @param bitIdx   zero-based index of the bit to set; bit 0 is the least
 *                 significant bit of bitArray[0]
 */
static void
setBit (unsigned char *bitArray, unsigned int bitIdx)
{
  /* the upper bits of the index pick the byte, the low three bits pick
     the position inside that byte */
  const unsigned int byteOffset = bitIdx >> 3;
  const unsigned char mask = (unsigned char) (1U << (bitIdx & 7U));

  bitArray[byteOffset] |= mask;
}
00045
00046
00047
00048
00049
00050
00051
00052
00053 static void
00054 setBitCallback (Bloomfilter * bf, unsigned int bit, void *arg)
00055 {
00056 setBit (bf->bitArray, bit);
00057 }
00058
00059
00060
00061
00062
00063
00064
00065 static void
00066 addToBloomfilter (Bloomfilter * bf, const HashCode160 * e)
00067 {
00068
00069 if (NULL == bf)
00070 return;
00071 iterateBits (bf, &setBitCallback, NULL, e);
00072 }
00073
00074
/**
 * Value stored in the filter's addressesPerElement field and written as
 * the first member of the generated filter definition.
 * NOTE(review): presumably the number of bit addresses set per added
 * element -- confirm against the bloom filter implementation.
 */
#define ADDR_PER_ELEMENT 46
00076
00077
00078 int
00079 main (int argc, char **argv)
00080 {
00081 Bloomfilter bf;
00082 HashCode160 hc;
00083 int i;
00084 int j;
00085 int cnt;
00086 char *fn;
00087 char **words;
00088 char line[2048];
00089 FILE *dictin;
00090 char *bn;
00091 char *charset = NULL;
00092 #define ALLOCSIZE 1024*1024
00093
00094 if (argc < 3)
00095 {
00096 fprintf (stderr,
00097 _("Please provide the name of the language you are building\n"
00098 "a dictionary for. For example:\n"));
00099 fprintf (stderr, "$ ./dictionary-builder ./en en > en.c\n");
00100 exit (-1);
00101 }
00102
00103 fn = malloc (strlen (argv[1]) + 6);
00104 strcpy (fn, argv[1]);
00105 strcat (fn, ".txt");
00106 dictin = fopen (fn, "r");
00107 free (fn);
00108 if (dictin == NULL)
00109 {
00110 fprintf (stderr,
00111 _("Error opening file `%s': %s\n"), argv[1], strerror (errno));
00112 exit (-1);
00113 }
00114
00115 words = malloc (sizeof (char *) * ALLOCSIZE);
00116 if (words == NULL)
00117 {
00118 fprintf (stderr, _("Error allocating: %s\n."), strerror (errno));
00119 exit (-1);
00120 }
00121 cnt = 0;
00122 memset (&line[0], 0, 2048);
00123 fscanf (dictin, "%s", (char *) &line);
00124 charset = strdup (line);
00125 while (1 == fscanf (dictin, "%s", (char *) &line))
00126 {
00127 words[cnt] = strdup (line);
00128 cnt++;
00129 memset (&line[0], 0, 2048);
00130 if (cnt >= ALLOCSIZE)
00131 {
00132 fprintf (stderr, _("Increase ALLOCSIZE (in %s).\n"), __FILE__);
00133 exit (-1);
00134 }
00135 }
00136
00137 bf.addressesPerElement = ADDR_PER_ELEMENT;
00138 bf.bitArraySize = (1 + (cnt / SUBTABLES)) * sizeof (int) * SUBTABLES;
00139 bf.bitArray = malloc (bf.bitArraySize);
00140 memset (bf.bitArray, 0, bf.bitArraySize);
00141
00142 for (i = 0; i < cnt; i++)
00143 {
00144 hash (words[i], strlen (words[i]), &hc);
00145 addToBloomfilter (&bf, &hc);
00146 }
00147
00148 fprintf (stdout, "#include \"bloomfilter-def.h\"\n");
00149
00150
00151
00152
00153
00154
00155 for (j = 0; j < SUBTABLES; j++)
00156 {
00157 char fn[64];
00158 FILE *btfile;
00159
00160 snprintf (fn, 64, "%s_%d.c", argv[1], j);
00161 btfile = fopen (fn, "w+");
00162 if (btfile == NULL)
00163 {
00164 fprintf (stderr,
00165 _("Error opening file `%s': %s\n"), fn, strerror (errno));
00166 exit (-1);
00167 }
00168 fprintf (btfile, "int %s_bits_%d[] = { ", argv[2], j);
00169 for (i = j * (bf.bitArraySize / sizeof (int) / SUBTABLES);
00170 i < (j + 1) * (bf.bitArraySize / sizeof (int) / SUBTABLES); i++)
00171 fprintf (btfile, "%dL,", (((int *) bf.bitArray)[i]));
00172 fprintf (btfile, "};\n");
00173 fclose (btfile);
00174 fprintf (stdout, "extern int %s_bits_%d[];\n", argv[2], j);
00175 }
00176
00177 fprintf (stdout, "static int * bits[] = { ");
00178 for (i = 0; i < SUBTABLES; i++)
00179 fprintf (stdout, "%s_bits_%d,", argv[2], i);
00180 fprintf (stdout, "};\n");
00181 bn = &argv[1][strlen (argv[1])];
00182 while ((bn != argv[1]) && (bn[0] != '/'))
00183 bn--;
00184 if (bn[0] == '/')
00185 bn++;
00186 fprintf (stdout, "Bloomfilter libextractor_printable_%s_filter = {\n" " %u,\n" " NULL,\n"
00187 " (unsigned char **)bits,\n"
00188 " %u };\n", bn, ADDR_PER_ELEMENT, bf.bitArraySize);
00189 free (charset);
00190 return 0;
00191 }