extractor.c

Go to the documentation of this file.
00001 /*
00002      This file is part of libextractor.
00003      (C) 2002, 2003, 2004, 2005, 2006 Vidyut Samanta and Christian Grothoff
00004 
00005      libextractor is free software; you can redistribute it and/or modify
00006      it under the terms of the GNU General Public License as published
00007      by the Free Software Foundation; either version 2, or (at your
00008      option) any later version.
00009 
00010      libextractor is distributed in the hope that it will be useful, but
00011      WITHOUT ANY WARRANTY; without even the implied warranty of
00012      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013      General Public License for more details.
00014 
00015      You should have received a copy of the GNU General Public License
00016      along with libextractor; see the file COPYING.  If not, write to the
00017      Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00018      Boston, MA 02111-1307, USA.
00019  */
00020 
00021 #include "platform.h"
00022 #include "extractor.h"
00023 #include <pthread.h>
00024 
00025 #if HAVE_LTDL_H
00026 #include <ltdl.h>
00027 #else
00028 #include <../../libltdl/ltdl.h>
00029 #endif
00030 
00031 #if HAVE_LIBBZ2
00032 #include <bzlib.h>
00033 #endif
00034 
00035 #if HAVE_ZLIB
00036 #include <zlib.h>
00037 #endif
00038 
00039 #define DEBUG 0
00040 
00041 /**
00042  * The sources of keywords as strings.
00043  */
00044 static const char *keywordTypes[] = {
00045   gettext_noop("unknown"), /* 0 */
00046   gettext_noop("filename"),
00047   gettext_noop("mimetype"),
00048   gettext_noop("title"),
00049   gettext_noop("author"),
00050   gettext_noop("artist"), /* 5 */
00051   gettext_noop("description"),
00052   gettext_noop("comment"),
00053   gettext_noop("date"),
00054   gettext_noop("publisher"),
00055   gettext_noop("language"), /* 10 */
00056   gettext_noop("album"),
00057   gettext_noop("genre"),
00058   gettext_noop("location"),
00059   gettext_noop("version"),
00060   gettext_noop("organization"), /* 15 */
00061   gettext_noop("copyright"),
00062   gettext_noop("subject"),
00063   gettext_noop("keywords"),
00064   gettext_noop("contributor"),
00065   gettext_noop("resource-type"), /* 20 */
00066   gettext_noop("format"),
00067   gettext_noop("resource-identifier"),
00068   gettext_noop("source"),
00069   gettext_noop("relation"),
00070   gettext_noop("coverage"), /* 25 */
00071   gettext_noop("software"),
00072   gettext_noop("disclaimer"),
00073   gettext_noop("warning"),
00074   gettext_noop("translated"),
00075   gettext_noop("creation date"), /* 30 */
00076   gettext_noop("modification date"),
00077   gettext_noop("creator"),
00078   gettext_noop("producer"),
00079   gettext_noop("page count"),
00080   gettext_noop("page orientation"), /* 35 */
00081   gettext_noop("paper size"),
00082   gettext_noop("used fonts"),
00083   gettext_noop("page order"),
00084   gettext_noop("created for"),
00085   gettext_noop("magnification"), /* 40 */
00086   gettext_noop("release"),
00087   gettext_noop("group"),
00088   gettext_noop("size"),
00089   gettext_noop("summary"),
00090   gettext_noop("packager"), /* 45 */
00091   gettext_noop("vendor"),
00092   gettext_noop("license"),
00093   gettext_noop("distribution"),
00094   gettext_noop("build-host"),
00095   gettext_noop("operating system"), /* 50 */
00096   gettext_noop("dependency"),
00097   gettext_noop("MD4"),
00098   gettext_noop("MD5"),
00099   gettext_noop("SHA-0"),
00100   gettext_noop("SHA-1"), /* 55 */
00101   gettext_noop("RipeMD160"),
00102   gettext_noop("resolution"),
00103   gettext_noop("category"),
00104   gettext_noop("book title"),
00105   gettext_noop("priority"), /* 60 */
00106   gettext_noop("conflicts"),
00107   gettext_noop("replaces"),
00108   gettext_noop("provides"),
00109   gettext_noop("conductor"),
00110   gettext_noop("interpreter"), /* 65 */
00111   gettext_noop("owner"),
00112   gettext_noop("lyrics"),
00113   gettext_noop("media type"),
00114   gettext_noop("contact"),
00115   gettext_noop("binary thumbnail data"), /* 70 */
00116   gettext_noop("publication date"),
00117   gettext_noop("camera make"),
00118   gettext_noop("camera model"),
00119   gettext_noop("exposure"),
00120   gettext_noop("aperture"), /* 75 */
00121   gettext_noop("exposure bias"),
00122   gettext_noop("flash"),
00123   gettext_noop("flash bias"),
00124   gettext_noop("focal length"),
00125   gettext_noop("focal length (35mm equivalent)"), /* 80 */
00126   gettext_noop("iso speed"),
00127   gettext_noop("exposure mode"),
00128   gettext_noop("metering mode"),
00129   gettext_noop("macro mode"),
00130   gettext_noop("image quality"), /* 85 */
00131   gettext_noop("white balance"),
00132   gettext_noop("orientation"),
00133   gettext_noop("template"),
00134   gettext_noop("split"),
00135   gettext_noop("product version"), /* 90 */
00136   gettext_noop("last saved by"),
00137   gettext_noop("last printed"),
00138   gettext_noop("word count"),
00139   gettext_noop("character count"),
00140   gettext_noop("total editing time"), /* 95 */
00141   gettext_noop("thumbnails"),
00142   gettext_noop("security"),
00143   gettext_noop("created by software"),
00144   gettext_noop("modified by software"),
00145   gettext_noop("revision history"), /* 100 */
00146   gettext_noop("lower case conversion"),
00147   gettext_noop("company"),
00148   gettext_noop("generator"),
00149   gettext_noop("character set"),
00150   gettext_noop("line count"), /* 105 */
00151   gettext_noop("paragraph count"),
00152   gettext_noop("editing cycles"),
00153   gettext_noop("scale"),
00154   gettext_noop("manager"),
00155   gettext_noop(/* movie director */"director"), /* 110 */
00156   gettext_noop("duration"),
00157   gettext_noop("information"),
00158   gettext_noop("full name"),
00159   gettext_noop("chapter"),
00160   gettext_noop("year"), /* 115 */
00161   gettext_noop("link"),
00162   gettext_noop("music CD identifier"),
00163   gettext_noop("play counter"),
00164   gettext_noop("popularity meter"),
00165   gettext_noop("content type"), /* 120 */
00166   gettext_noop("encoded by"),
00167   gettext_noop("time"),
00168   gettext_noop("musician credits list"),
00169   gettext_noop("mood"),
00170   gettext_noop("format version"), /* 125 */
00171   gettext_noop("television system"),
00172   gettext_noop("song count"),
00173   gettext_noop("starting song"),
00174   gettext_noop("hardware dependency"),
00175   gettext_noop("ripper"), /* 130 */
00176   gettext_noop("filesize"),
00177   gettext_noop("track number"),
00178   gettext_noop("international standard recording code"),
00179   gettext_noop("disc number"), /* 134 */
00180   gettext_noop("preferred display style (GNUnet)"),
00181   gettext_noop("GNUnet URI of ECBC data"),
00182   NULL,
00183 };
00184 
00185 /* the number of keyword types (for bounds-checking) */
00186 #define HIGHEST_TYPE_NUMBER 137
00187 
/* Ogg support is available either through libogg together with
   libvorbis, or through libvorbisfile. */
#if (defined(HAVE_LIBOGG) && HAVE_VORBIS) || HAVE_VORBISFILE
#define WITH_OGG 1
#endif

/* Each optional plugin contributes either "<name>:" or nothing to the
   default list, depending on what the build was configured with.
   The definitions are listed in the order used by DEFAULT_LIBRARIES. */
#if HAVE_MPEG2
#define MPEGSO "libextractor_mpeg:"
#else
#define MPEGSO ""
#endif

#if HAVE_EXIV2
#define EXSO "libextractor_exiv2:"
#else
#define EXSO ""
#endif

#if HAVE_GSF
#define OLESO "libextractor_ole2:"
#else
#define OLESO ""
#endif

#if WITH_OGG
#define OGGSO "libextractor_ogg:"
#else
#define OGGSO ""
#endif

#if HAVE_FLAC
#define FLACSO "libextractor_flac:"
#else
#define FLACSO ""
#endif

#if HAVE_ZLIB
#define QTSO "libextractor_qt:"
#else
#define QTSO ""
#endif

/* ATTN: order matters (for performance!) since
   mime-types can be used to avoid parsing once
   the type has been established! */
#define DEFSO                 \
  "libextractor_html:"        \
  "libextractor_man:"         \
  "libextractor_ps:"          \
  "libextractor_pdf:"         \
  "libextractor_mp3:"         \
  "libextractor_id3v2:"       \
  "libextractor_id3v23:"      \
  "libextractor_id3v24:"      \
  "libextractor_mime:"        \
  "libextractor_tar:"         \
  "libextractor_dvi:"         \
  "libextractor_deb:"         \
  "libextractor_png:"         \
  "libextractor_gif:"         \
  "libextractor_wav:"         \
  "libextractor_flv:"         \
  "libextractor_real:"        \
  "libextractor_jpeg:"        \
  "libextractor_tiff:"        \
  "libextractor_zip:"         \
  "libextractor_rpm:"         \
  "libextractor_riff:"        \
  "libextractor_applefile:"   \
  "libextractor_elf:"         \
  "libextractor_oo:"          \
  "libextractor_asf:"         \
  "libextractor_sid:"         \
  "libextractor_nsfe:"        \
  "libextractor_nsf:"         \
  "libextractor_it:"          \
  "libextractor_xm:"          \
  "libextractor_s3m"

/* Optional plugins first, then the always-available ones. */
#define DEFAULT_LIBRARIES MPEGSO EXSO OLESO OGGSO FLACSO QTSO DEFSO

/**
 * Get the colon-separated list of plugins that are loaded by default.
 *
 * @return a constant string; must not be freed or modified
 */
const char *
EXTRACTOR_getDefaultLibraries()
{
  return DEFAULT_LIBRARIES;
}
00276 
00277 /* determine installation path */
00278 
/**
 * Shared worker for cut_bin() and cut_lib().
 *
 * Modifies `in' in place: an optional trailing '/' or '\\' is removed
 * and then, if the last path component is exactly `comp', that
 * component is removed as well (the separator in front of it is kept).
 * Strings too short to hold "<sep><comp>" plus one more character are
 * left untouched.
 *
 * The component must be preceded by a path separator; a name that
 * merely ends in the same letters (".../cabin") is NOT cut.
 *
 * @param in path to shorten, may be NULL
 * @param comp directory name to strip, e.g. "bin"
 * @return in (NULL if in was NULL)
 */
static char * cut_component(char * in, const char * comp) {
  size_t p;
  size_t n;

  if (in == NULL)
    return NULL;
  p = strlen(in);
  n = strlen(comp);
  if (p <= n + 1)
    return in;
  if ( (in[p-1] == '/') ||
       (in[p-1] == '\\') )
    in[--p] = '\0';
  /* p >= n + 1 here, so in[p-n-1] is a valid index */
  if ( (0 == strcmp(&in[p-n], comp)) &&
       ( (in[p-n-1] == '/') ||
         (in[p-n-1] == '\\') ) )
    in[p-n] = '\0';
  return in;
}

/**
 * Strip a trailing path separator and a final "bin" component from
 * the given path (in place), e.g. "/usr/bin/" becomes "/usr/".
 *
 * @param in path to shorten, may be NULL
 * @return in
 */
static char * cut_bin(char * in) {
  return cut_component(in, "bin");
}

/**
 * Strip a trailing path separator and a final "lib" component from
 * the given path (in place), e.g. "/usr/lib" becomes "/usr/".
 *
 * @param in path to shorten, may be NULL
 * @return in
 */
static char * cut_lib(char * in) {
  return cut_component(in, "lib");
}
00316 
00317 
#if LINUX
/**
 * Try to determine path by reading /proc/PID/maps or
 * /proc/PID/exe.
 *
 * First the memory map is searched for a mapped file whose path
 * contains "libextractor"; everything before that substring is
 * returned.  If that fails, the location of the running binary is
 * used: if it sits in a three-letter directory (such as "bin"), a
 * trailing "bin" is cut off and "lib/" is appended as a guess.
 *
 * Note that this may fail if LE is installed in one directory
 * and the binary linking against it sits elsewhere.
 *
 * @return newly allocated path (to be freed by the caller),
 *         NULL if the path could not be determined
 */
static char *
get_path_from_proc_exe() {
  char fn[64];
  char line[1024];
  char dir[1024];
  char * hit;
  char * lnk;
  char * shrunk;
  ssize_t len;
  size_t size;
  FILE * f;

  snprintf(fn,
           sizeof(fn),
           "/proc/%u/maps",
           (unsigned int) getpid());
  f = fopen(fn, "r");
  if (f != NULL) {
    while (NULL != fgets(line, sizeof(line), f)) {
      /* fields: start-end perms offset dev(major:minor, hex) inode path;
         the device numbers are hexadecimal, so they must be matched
         with %x (a %u would reject e.g. "fd:01") */
      if (1 != sscanf(line,
                      "%*x-%*x %*c%*c%*c%*c %*x %*x:%*x %*u%*[ ]%1023s",
                      dir))
        continue;
      hit = strstr(dir, "libextractor");
      if (hit == NULL)
        continue;
      *hit = '\0';
      fclose(f);
      return strdup(dir);
    }
    fclose(f);
  }
  snprintf(fn,
           sizeof(fn),
           "/proc/%u/exe",
           (unsigned int) getpid());
  lnk = malloc(1029); /* 1024 + 5 for "lib/" catenation */
  if (lnk == NULL)
    return NULL;
  len = readlink(fn, lnk, 1023);
  if (len <= 0) {
    /* error or empty link target */
    free(lnk);
    return NULL;
  }
  size = (size_t) len;
  lnk[size] = '\0'; /* readlink does not terminate the string */
  while ( (size > 0) &&
          (lnk[size] != '/') )
    size--;
  if ( (size < 4) ||
       (lnk[size-4] != '/') ) {
    /* not installed in "/bin/" -- binary path probably useless */
    free(lnk);
    return NULL;
  }
  lnk[size] = '\0'; /* drop the name of the binary */
  lnk = cut_bin(lnk);
  strcat(lnk, "lib/"); /* guess "lib/" as the library dir; fits, see malloc */
  /* give back the unused part of the buffer; keep the larger
     buffer if shrinking fails */
  shrunk = realloc(lnk, strlen(lnk) + 1);
  if (shrunk != NULL)
    lnk = shrunk;
  return lnk;
}
#endif
00381 
#if WINDOWS
/**
 * Try to determine path with win32-specific function: take the
 * directory of the running executable, cut off a trailing "bin"
 * and append "/lib/" as a guess for the library directory.
 *
 * @return newly allocated path (to be freed by the caller),
 *         NULL on error
 */
static char * get_path_from_module_filename() {
  char * path;
  char * idx;
  char * shrunk;

  path = malloc(4103); /* 4096+nil+6 for "/lib/" catenation */
  if (path == NULL)
    return NULL;
  if (0 == GetModuleFileName(NULL, path, 4096)) {
    /* call failed, buffer contents are undefined */
    free(path);
    return NULL;
  }
  /* a truncated result is not guaranteed to be terminated */
  path[4096] = '\0';
  idx = path + strlen(path);
  while ( (idx > path) &&
          (*idx != '\\') &&
          (*idx != '/') )
    idx--;
  *idx = '\0'; /* drop the name of the executable */
  path = cut_bin(path);
  strcat(path, "/lib/"); /* guess "lib/" as the library dir; fits, see malloc */
  /* give back the unused part of the buffer; keep the larger
     buffer if shrinking fails */
  shrunk = realloc(path, strlen(path) + 1);
  if (shrunk != NULL)
    path = shrunk;
  return path;
}
#endif
00404 
#if DARWIN
/**
 * Try to determine the path from the dyld image list: find the
 * image whose header is this library's own header and return the
 * directory part of its file name (including the trailing '/').
 *
 * @return newly allocated path (to be freed by the caller),
 *         NULL if the image was not found, has no name, or
 *         memory could not be allocated
 */
static char * get_path_from_dyld_image() {
  const char * path;
  char * p, * s;
  int i;
  int c;

  p = NULL;
  c = _dyld_image_count();
  for (i = 0; i < c; i++) {
    if (_dyld_get_image_header(i) != &_mh_dylib_header)
      continue;
    path = _dyld_get_image_name(i);
    if ( (path != NULL) && (strlen(path) > 0) ) {
      p = strdup(path);
      if (p == NULL)
        break; /* out of memory */
      /* cut right after the last '/' */
      s = p + strlen(p);
      while ( (s > p) && (*s != '/') )
        s--;
      s++;
      *s = '\0';
    }
    break;
  }
  return p;
}
#endif
00431 
/**
 * Try to determine the installation path by searching the
 * directories listed in the PATH environment variable for a file
 * named "extract".  For the first directory that has one, a
 * trailing "bin" is cut off and "lib/" is appended.
 *
 * This may also fail -- for example, if extract
 * is not also installed.
 *
 * @return newly allocated path (to be freed by the caller),
 *         NULL if nothing was found or on error
 */
static char *
get_path_from_PATH() {
  struct stat sbuf;
  const char * p;
  char * path;
  char * buf;
  char * pos;
  char * end;
  char * ret;

  p = getenv("PATH");
  if (p == NULL)
    return NULL;
  path = strdup(p); /* because we write on it */
  if (path == NULL)
    return NULL;
  /* every entry is shorter than the whole of PATH, so this is
     enough for "<entry>/extract" */
  buf = malloc(strlen(path) + 20);
  if (buf == NULL) {
    free(path);
    return NULL;
  }
  ret = NULL;
  pos = path;
  while (pos != NULL) {
    end = strchr(pos, ':');
    if (end != NULL)
      *end = '\0'; /* terminate the current entry */
    sprintf(buf, "%s/%s", pos, "extract");
    if (0 == stat(buf, &sbuf)) {
      /* cut_bin only ever shortens, so +5 leaves room for "lib/" */
      ret = malloc(strlen(pos) + 5);
      if (ret != NULL) {
        strcpy(ret, pos);
        cut_bin(ret);
        strcat(ret, "lib/");
      }
      break;
    }
    pos = (end != NULL) ? end + 1 : NULL;
  }
  free(buf);
  free(path);
  return ret;
}
00482 
/**
 * Determine the library directory from the environment variable
 * LIBEXTRACTOR_PREFIX: a trailing "lib" and then a trailing "bin"
 * are cut off the prefix and "lib/" is appended (with a '/' in
 * front if the prefix does not already end in one).
 *
 * @return newly allocated path (to be freed by the caller),
 *         NULL if the variable is not set or on error
 */
static char *
get_path_from_ENV_PREFIX() {
  const char * p;
  char * s;
  size_t len;

  p = getenv("LIBEXTRACTOR_PREFIX");
  if (p == NULL)
    return NULL;
  s = malloc(strlen(p) + 6); /* +6 for "/lib/" and the terminator */
  if (s == NULL)
    return NULL;
  strcpy(s, p);
  s = cut_bin(cut_lib(s));
  len = strlen(s);
  /* the cuts only shorten the string, so the buffer allocated
     above still has room for the suffix */
  if ( (len > 0) && (s[len-1] != '/') )
    strcat(s, "/lib/");
  else
    strcat(s, "lib/");
  return s;
}
00505 
00506 /*
00507  * @brief get the path to the plugin directory
00508  * @return a pointer to the dir path (to be freed by the caller)
00509  */
00510 static char * os_get_installation_path() {
00511   size_t n;
00512   char * tmp;
00513   char * lpref;
00514   char * pexe;
00515   char * modu;
00516   char * dima;
00517   char * path;
00518 
00519   lpref = get_path_from_ENV_PREFIX();
00520 #if LINUX
00521   pexe = get_path_from_proc_exe();
00522 #else
00523   pexe = NULL;
00524 #endif
00525 #if WINDOWS
00526   modu = get_path_from_module_filename();
00527 #else
00528   modu = NULL;
00529 #endif
00530 #if DARWIN
00531   dima = get_path_from_dyld_image();
00532   path = NULL;
00533 #else
00534   dima = NULL;
00535   path = get_path_from_PATH();
00536 #endif
00537   n = 1;
00538   if (lpref != NULL)
00539     n += strlen(lpref) + strlen(PLUGINDIR "/:");
00540   if (pexe != NULL)
00541     n += strlen(pexe) + strlen(PLUGINDIR "/:");
00542   if (modu != NULL)
00543     n += strlen(modu) + strlen(PLUGINDIR "/:");
00544   if (dima != NULL)
00545     n += strlen(dima) + strlen(PLUGINDIR "/:");
00546   if (path != NULL)
00547     n += strlen(path) + strlen(PLUGINDIR "/:");
00548   tmp = malloc(n);
00549   tmp[0] = '\0';
00550   if (lpref != NULL) {
00551     strcat(tmp, lpref);
00552     strcat(tmp, PLUGINDIR "/:");
00553     free(lpref);
00554   }
00555   if (pexe != NULL) {
00556     strcat(tmp, pexe);
00557     strcat(tmp, PLUGINDIR "/:");
00558     free(pexe);
00559   }
00560   if (modu != NULL) {
00561     strcat(tmp, modu);
00562     strcat(tmp, PLUGINDIR "/:");
00563     free(modu);
00564   }
00565   if (dima != NULL) {
00566     strcat(tmp, dima);
00567     strcat(tmp, PLUGINDIR "/:");
00568     free(dima);
00569   }
00570   if (path != NULL) {
00571     strcat(tmp, path);
00572     strcat(tmp, PLUGINDIR "/:");
00573     free(path);
00574   }
00575   if (strlen(tmp) > 0)
00576     tmp[strlen(tmp)-1] = '\0';
00577   if (strlen(tmp) == 0) {
00578     free(tmp);
00579     return NULL;
00580   }
00581   return tmp;
00582 }
00583 
00584 
00585 /* ************library initialization ***************** */
00586 
/* copy of the ltdl search path as it was before le_ltdl_init
   changed it (NULL if there was none); restored by le_ltdl_fini */
static char * old_dlsearchpath = NULL;

/**
 * Library constructor: using libtool, needs init!
 *
 * Binds the text domains (if NLS is enabled), initializes ltdl,
 * remembers the current ltdl search path and appends the guessed
 * plugin directories to it.
 */
void __attribute__ ((constructor)) le_ltdl_init() {
  int err;
  const char * opath;
  char * path;
  char * cpath;

#if ENABLE_NLS
  BINDTEXTDOMAIN(PACKAGE, LOCALEDIR);
  BINDTEXTDOMAIN("iso-639", ISOLOCALEDIR); /* used by wordextractor */
#endif
  err = lt_dlinit ();
  if (err > 0) {
#if DEBUG
    fprintf(stderr,
            _("Initialization of plugin mechanism failed: %s!\n"),
            lt_dlerror());
#endif
    return;
  }
  opath = lt_dlgetsearchpath();
  if (opath != NULL)
    old_dlsearchpath = strdup(opath);
  path = os_get_installation_path();
  if (path != NULL) {
    if (opath != NULL) {
      cpath = malloc(strlen(path) + strlen(opath) + 4);
      /* if we are out of memory the search path is left as it is */
      if (cpath != NULL) {
        strcpy(cpath, opath);
        strcat(cpath, ":");
        strcat(cpath, path);
        lt_dlsetsearchpath(cpath);
        free(cpath);
      }
    } else {
      lt_dlsetsearchpath(path);
    }
    free(path);
  }
#ifdef MINGW
  InitWinEnv();
#endif
}
00631 
00632 void __attribute__ ((destructor)) le_ltdl_fini() {
00633   lt_dlsetsearchpath(old_dlsearchpath);
00634   if (old_dlsearchpath != NULL) {
00635     free(old_dlsearchpath);
00636     old_dlsearchpath = NULL;
00637   }
00638 #ifdef MINGW
00639   ShutdownWinEnv();
00640 #endif
00641   lt_dlexit ();
00642 }
00643 
/**
 * Open a file.  Wrapper around open() that converts the file name
 * to a native path and forces binary mode on MINGW.
 *
 * @param filename name of the file to open
 * @param oflag flags as for open()
 * @param ... the permissions (int) for a new file; only read
 *        if O_CREAT is set in oflag
 * @return the file descriptor, -1 on error
 */
static int fileopen(const char *filename, int oflag, ...)
{
  int mode;
  const char *fn;

#ifdef MINGW
  char szFile[_MAX_PATH + 1];
  long lRet;

  if ((lRet = plibc_conv_to_win_path(filename, szFile)) != ERROR_SUCCESS)
  {
    errno = ENOENT;
    SetLastError(lRet);

    return -1;
  }
  fn = szFile;
#else
  fn = filename;
#endif

  if (oflag & O_CREAT)
  {
    va_list arg;
    va_start(arg, oflag);
    mode = va_arg(arg, int);
    va_end(arg);
  }
  else
  {
    mode = 0;
  }

#ifdef MINGW
  /* Set binary mode; O_BINARY is an open flag, not a permission bit */
  oflag |= O_BINARY;
#endif

  return open(fn, oflag, mode);
}
00687 
00688 
00689 
00690 /**
00691  * Load the default set of libraries. The default set of
00692  * libraries consists of the libraries that are part of
00693  * the libextractor distribution (except split and filename
00694  * extractor) plus the extractors that are specified
00695  * in the environment variable "LIBEXTRACTOR_LIBRARIES".
00696  *
00697  * @return the default set of libraries.
00698  */
00699 EXTRACTOR_ExtractorList *
00700 EXTRACTOR_loadDefaultLibraries ()
00701 {
00702   char *env;
00703   char *tmp;
00704   EXTRACTOR_ExtractorList *res;
00705 
00706 
00707   env = getenv ("LIBEXTRACTOR_LIBRARIES");
00708   if (env == NULL)
00709     {
00710       return EXTRACTOR_loadConfigLibraries (NULL, DEFAULT_LIBRARIES);
00711     }
00712   tmp = malloc (strlen (env) + strlen (DEFAULT_LIBRARIES) + 2);
00713   strcpy (tmp, env);
00714   strcat (tmp, ":");
00715   strcat (tmp, DEFAULT_LIBRARIES);
00716   res = EXTRACTOR_loadConfigLibraries (NULL, tmp);
00717   free (tmp);
00718   return res;
00719 }
00720 
00721 /**
00722  * Get the textual name of the keyword.
00723  * @return NULL if the type is not known
00724  */
00725 const char *
00726 EXTRACTOR_getKeywordTypeAsString(const EXTRACTOR_KeywordType type)
00727 {
00728   if ((type >= 0) && (type < HIGHEST_TYPE_NUMBER))
00729     return keywordTypes[type];
00730   else
00731     return NULL;
00732 }
00733 
/* serializes all calls into libltdl */
static pthread_mutex_t ltdl_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Acquire / release ltdl_lock; the process is aborted if the
 * mutex operation fails.
 *
 * Both macros are complete statements and are used WITHOUT a
 * trailing semicolon.  The do/while wrapper keeps the inner `if'
 * from capturing an `else' that follows the macro.
 */
#define LTDL_MUTEX_LOCK                         \
  do {                                          \
    if (pthread_mutex_lock (&ltdl_lock) != 0)   \
      abort();                                  \
  } while (0);
#define LTDL_MUTEX_UNLOCK                       \
  do {                                          \
    if (pthread_mutex_unlock (&ltdl_lock) != 0) \
      abort();                                  \
  } while (0);
00742 
00743 static void *getSymbolWithPrefix(void *lib_handle,
00744                                  const char *lib_name,
00745                                  const char *sym_name)
00746 {
00747   size_t name_size
00748     = strlen(lib_name)
00749     + strlen(sym_name)
00750     + 1 /* for the zero delim. */
00751     + 1 /* for the optional '_' prefix */;
00752   char *name=malloc(name_size),*first_error;
00753   void *symbol=NULL;
00754 
00755   snprintf(name,
00756            name_size,
00757            "_%s%s",
00758            lib_name,
00759            sym_name);
00760 
00761   LTDL_MUTEX_LOCK
00762   symbol=lt_dlsym(lib_handle,name+1 /* skip the '_' */);
00763   if (symbol==NULL) {
00764     first_error=strdup(lt_dlerror());
00765     symbol=lt_dlsym(lib_handle,name /* now try with the '_' */);
00766 #if DEBUG
00767     fprintf(stderr,
00768             _("Resolving symbol `%s' in library `%s' failed, "
00769               "so I tried `%s', but that failed also.  Errors are: "
00770               "`%s' and `%s'.\n"),
00771              name+1,
00772              lib_name,
00773              name,
00774              first_error,
00775              lt_dlerror());
00776 #endif
00777     free(first_error);
00778   }
00779   LTDL_MUTEX_UNLOCK
00780   free(name);
00781   return symbol;
00782 }
00783 
00784 /**
00785  * Load a dynamic library.
00786  * @return 1 on success, -1 on error
00787  */
00788 static int
00789 loadLibrary (const char *name,
00790              void **libHandle,
00791              ExtractMethod * method)
00792 {
00793 #if 0
00794   lt_dladvise advise;
00795 #endif
00796 
00797   LTDL_MUTEX_LOCK
00798 #if 0
00799   lt_dladvise_init(&advise);
00800   lt_dladvise_ext(&advise);
00801   lt_dladvise_local(&advise);
00802   *libHandle = lt_dlopenadvise (name, &advise);
00803   lt_dladvise_destroy(&advise);
00804 #else
00805   *libHandle = lt_dlopenext (name);
00806 #endif
00807   if (*libHandle == NULL)
00808     {
00809 #if DEBUG
00810       fprintf (stderr,
00811                _("Loading `%s' plugin failed: %s\n"),
00812                name,
00813                lt_dlerror ());
00814 #endif
00815       LTDL_MUTEX_UNLOCK
00816       return -1;
00817     }
00818   LTDL_MUTEX_UNLOCK
00819 
00820   *method = (ExtractMethod) getSymbolWithPrefix (*libHandle, name, "_extract");
00821   if (*method == NULL) {
00822     LTDL_MUTEX_LOCK
00823     lt_dlclose (*libHandle);
00824     LTDL_MUTEX_UNLOCK
00825     return -1;
00826   }
00827   return 1;
00828 }
00829 
00830 /* Internal function that accepts options. */
00831 static EXTRACTOR_ExtractorList *
00832 EXTRACTOR_addLibrary2 (EXTRACTOR_ExtractorList * prev,
00833                        const char *library, const char *options)
00834 {
00835   EXTRACTOR_ExtractorList *result;
00836   void *handle;
00837   ExtractMethod method;
00838 
00839   if (-1 == loadLibrary (library, &handle, &method))
00840     return prev;
00841   result = malloc (sizeof (EXTRACTOR_ExtractorList));
00842   result->next = prev;
00843   result->libraryHandle = handle;
00844   result->extractMethod = method;
00845   result->libname = strdup (library);
00846   if( options )
00847     result->options = strdup (options);
00848   else
00849     result->options = NULL;
00850   return result;
00851 }
00852 
00853 /**
00854  * Add a library for keyword extraction.
00855  * @param prev the previous list of libraries, may be NULL
00856  * @param library the name of the library
00857  * @return the new list of libraries, equal to prev iff an error occured
00858  */
00859 EXTRACTOR_ExtractorList *
00860 EXTRACTOR_addLibrary (EXTRACTOR_ExtractorList * prev,
00861                       const char *library)
00862 {
00863   return EXTRACTOR_addLibrary2(prev, library, NULL);
00864 }
00865 
00866 /* Internal function which takes options. */
00867 static EXTRACTOR_ExtractorList *
00868 EXTRACTOR_addLibraryLast2 (EXTRACTOR_ExtractorList * prev,
00869                            const char *library, const char *options)
00870 {
00871   EXTRACTOR_ExtractorList *result;
00872   EXTRACTOR_ExtractorList *pos;
00873   void *handle;
00874   ExtractMethod method;
00875 
00876   if (-1 == loadLibrary (library, &handle, &method))
00877     return prev;
00878   result = malloc (sizeof (EXTRACTOR_ExtractorList));
00879   result->next = NULL;
00880   result->libraryHandle = handle;
00881   result->extractMethod = method;
00882   result->libname = strdup (library);
00883   if( options )
00884     result->options = strdup (options);
00885   else
00886     result->options = NULL;
00887   if (prev == NULL)
00888     return result;
00889   pos = prev;
00890   while (pos->next != NULL)
00891     pos = pos->next;
00892   pos->next = result;
00893   return prev;
00894 }
00895 
00896 /**
00897  * Add a library for keyword extraction at the END of the list.
00898  * @param prev the previous list of libraries, may be NULL
00899  * @param library the name of the library
00900  * @return the new list of libraries, always equal to prev
00901  *         except if prev was NULL and no error occurs
00902  */
00903 EXTRACTOR_ExtractorList *
00904 EXTRACTOR_addLibraryLast (EXTRACTOR_ExtractorList * prev,
00905                           const char *library)
00906 {
00907   return EXTRACTOR_addLibraryLast2(prev, library, NULL);
00908 }
00909 
00910 /**
00911  * Load multiple libraries as specified by the user.
00912  * @param config a string given by the user that defines which
00913  *        libraries should be loaded. Has the format
00914  *        "[[-]LIBRARYNAME[:[-]LIBRARYNAME]*]". For example,
00915  *        libextractor_mp3.so:libextractor_ogg.so loads the
00916  *        mp3 and the ogg library. The '-' before the LIBRARYNAME
00917  *        indicates that the library should be added to the end
00918  *        of the library list (addLibraryLast).
00919  * @param prev the  previous list of libraries, may be NULL
00920  * @return the new list of libraries, equal to prev iff an error occured
00921  *         or if config was empty (or NULL).
00922  */
00923 EXTRACTOR_ExtractorList *
00924 EXTRACTOR_loadConfigLibraries (EXTRACTOR_ExtractorList * prev,
00925                                const char *config)
00926 {
00927   char *cpy;
00928   int pos;
00929   int last;
00930   int lastconf;
00931   int len;
00932 
00933   if (config == NULL)
00934     return prev;
00935   len = strlen(config);
00936   cpy = strdup(config);
00937   pos = 0;
00938   last = 0;
00939   lastconf = 0;
00940   while (pos < len)
00941     {
00942       while ((cpy[pos] != ':') && (cpy[pos] != '\0') &&
00943              (cpy[pos] != '('))
00944         pos++;
00945       if( cpy[pos] == '(' ) {
00946         cpy[pos++] = '\0';      /* replace '(' by termination */
00947         lastconf = pos;         /* start config from here, after (. */
00948         while ((cpy[pos] != '\0') && (cpy[pos] != ')'))
00949           pos++; /* config until ) or EOS. */
00950         if( cpy[pos] == ')' ) {
00951           cpy[pos++] = '\0'; /* write end of config here. */
00952           while ((cpy[pos] != ':') && (cpy[pos] != '\0'))
00953             pos++; /* forward until real end of string found. */
00954           cpy[pos++] = '\0';
00955         } else {
00956           cpy[pos++] = '\0'; /* end of string. */
00957         }
00958       } else {
00959         lastconf = -1;         /* NULL config when no (). */
00960         cpy[pos++] = '\0';      /* replace ':' by termination */
00961       }
00962       if (cpy[last] == '-')
00963         {
00964           last++;
00965           if( lastconf != -1 )
00966             prev = EXTRACTOR_addLibraryLast2 (prev, &cpy[last],
00967                                               &cpy[lastconf]);
00968           else
00969             prev = EXTRACTOR_addLibraryLast2 (prev, &cpy[last], NULL);
00970         }
00971       else
00972         if( lastconf != -1 )
00973           prev = EXTRACTOR_addLibrary2 (prev, &cpy[last], &cpy[lastconf]);
00974         else
00975           prev = EXTRACTOR_addLibrary2 (prev, &cpy[last], NULL);
00976 
00977       last = pos;
00978     }
00979   free (cpy);
00980   return prev;
00981 }
00982 
00983 /**
00984  * Remove a library for keyword extraction.
00985  * @param prev the current list of libraries
00986  * @param library the name of the library to remove
00987  * @return the reduced list, unchanged if the library was not loaded
00988  */
00989 EXTRACTOR_ExtractorList *
00990 EXTRACTOR_removeLibrary(EXTRACTOR_ExtractorList * prev,
00991                         const char *library)
00992 {
00993   EXTRACTOR_ExtractorList *pos;
00994   EXTRACTOR_ExtractorList *first;
00995   pos = prev;
00996   first = prev;
00997   while ((pos != NULL) && (0 != strcmp (pos->libname, library)))
00998     {
00999       prev = pos;
01000       pos = pos->next;
01001     }
01002   if (pos != NULL)
01003     {
01004       /* found, close library */
01005       if (first == pos)
01006         first = pos->next;
01007       else
01008         prev->next = pos->next;
01009       /* found */
01010       free (pos->libname);
01011       if( pos->options )
01012         free (pos->options);
01013       if( pos->libraryHandle ) {
01014         LTDL_MUTEX_LOCK
01015         lt_dlclose (pos->libraryHandle);
01016         LTDL_MUTEX_UNLOCK
01017       }
01018       free (pos);
01019     }
01020 #if DEBUG
01021   else
01022     fprintf(stderr,
01023             _("Unloading plugin `%s' failed!\n"),
01024             library);
01025 #endif
01026   return first;
01027 }
01028 
01029 /**
01030  * Remove all extractors.
01031  * @param libraries the list of extractors
01032  */
01033 void
01034 EXTRACTOR_removeAll (EXTRACTOR_ExtractorList * libraries)
01035 {
01036   while (libraries != NULL)
01037     libraries = EXTRACTOR_removeLibrary (libraries, libraries->libname);
01038 }
01039 
01040 
01041 
/**
 * How many bytes do we actually try to scan? (from the beginning
 * of the file).  Limit to 1 GB.
 *
 * (Parenthesized so that the macro is a single value in any
 * expression it is used in.)
 */
#define MAX_READ (1024 * 1024 * 1024)

/**
 * How many bytes do we actually try to decompress? (from the beginning
 * of the file).  Limit to 16 MB.
 */
#define MAX_DECOMPRESS (16 * 1024 * 1024)
01053 
01054 
01055 static EXTRACTOR_KeywordList *
01056 getKeywords (EXTRACTOR_ExtractorList * extractor,
01057              const char * filename,
01058              const unsigned char * data,
01059              size_t size) {
01060   EXTRACTOR_KeywordList *result;
01061   unsigned char * buf;
01062   size_t dsize;
01063 #if HAVE_ZLIB
01064   z_stream strm;
01065   int ret;
01066   size_t pos;
01067 #endif
01068 #if HAVE_LIBBZ2
01069   bz_stream bstrm;
01070   int bret;
01071   size_t bpos;
01072 #endif
01073 
01074   result = NULL;
01075   buf = NULL;
01076   dsize = 0;
01077 #if HAVE_ZLIB
01078   /* try gzip decompression first */
01079   if ( (size >= 12) &&
01080        (data[0] == 0x1f) &&
01081        (data[1] == 0x8b) &&
01082        (data[2] == 0x08) ) {
01083 
01084     /*
01085      * Skip gzip header - we might want to retrieve parts of it as keywords
01086      */
01087     unsigned gzip_header_length = 10;
01088 
01089     if (data[3] & 0x4) /* FEXTRA  set */
01090       gzip_header_length += 2 + (unsigned) (data[10] & 0xff)
01091                               + (((unsigned) (data[11] & 0xff)) * 256);
01092 
01093     if(data[3] & 0x8) /* FNAME set */
01094     {
01095       const unsigned char * cptr = data + gzip_header_length;
01096 
01097       /*
01098        * stored file name is here
01099        * extremely long file names might break the following code.
01100        */
01101 
01102       while(cptr < data + size)
01103       {
01104         if('\0' == *cptr)
01105           break;
01106 
01107         cptr++;
01108       }
01109       gzip_header_length = (cptr - data) + 1;
01110     }
01111 
01112     if(data[3] & 0x16) /* FCOMMENT set */
01113     {
01114       const unsigned char * cptr = data + gzip_header_length;
01115 
01116       /*
01117        * stored comment is here
01118        */
01119 
01120       while(cptr < data + size)
01121       {
01122         if('\0' == *cptr)
01123           break;
01124 
01125         cptr ++;
01126       }
01127 
01128       gzip_header_length = (cptr - data) + 1;
01129     }
01130 
01131     if(data[3] & 0x2) /* FCHRC set */
01132       gzip_header_length += 2;
01133 
01134     memset(&strm,
01135            0,
01136            sizeof(z_stream));
01137 #ifdef ZLIB_VERNUM
01138     gzip_header_length = 0;
01139 #endif
01140     if (size > gzip_header_length) {
01141       strm.next_in = (Bytef*) data + gzip_header_length;
01142       strm.avail_in = size - gzip_header_length;
01143     } else {
01144       strm.next_in = (Bytef*) data;
01145       strm.avail_in = 0;
01146     }
01147     strm.total_in = 0;
01148     strm.zalloc = NULL;
01149     strm.zfree = NULL;
01150     strm.opaque = NULL;
01151 
01152     /*
01153      * note: maybe plain inflateInit(&strm) is adequate,
01154      * it looks more backward-compatible also ;
01155      *
01156      * ZLIB_VERNUM isn't defined by zlib version 1.1.4 ;
01157      * there might be a better check.
01158      */
01159 #ifdef ZLIB_VERNUM
01160     if (Z_OK == inflateInit2(&strm,
01161                              15 + 32)) {
01162 #else
01163     if (Z_OK == inflateInit2(&strm,
01164                              -MAX_WBITS)) {
01165 #endif
01166       dsize = 2 * size;
01167       if (dsize > MAX_DECOMPRESS)
01168         dsize = MAX_DECOMPRESS;
01169       buf = malloc(dsize);
01170       pos = 0;
01171       if (buf == NULL) {
01172         inflateEnd(&strm);
01173       } else {
01174         strm.next_out = (Bytef*) buf;
01175         strm.avail_out = dsize;
01176         do {
01177           ret = inflate(&strm,
01178                         Z_SYNC_FLUSH);
01179           if (ret == Z_OK) {
01180             if (dsize == MAX_DECOMPRESS)
01181               break;
01182             pos += strm.total_out;
01183             strm.total_out = 0;
01184             dsize *= 2;
01185             if (dsize > MAX_DECOMPRESS)
01186               dsize = MAX_DECOMPRESS;
01187             buf = realloc(buf, dsize);
01188             strm.next_out = (Bytef*) &buf[pos];
01189             strm.avail_out = dsize - pos;
01190           } else if (ret != Z_STREAM_END) {
01191             /* error */
01192             free(buf);
01193             buf = NULL;
01194           }
01195         } while ( (buf != NULL) &&              
01196                   (ret != Z_STREAM_END) );
01197         dsize = pos + strm.total_out;
01198         inflateEnd(&strm);
01199         if (dsize == 0) {
01200           free(buf);
01201           buf = NULL;
01202         }
01203       }
01204     }
01205   }
01206 #endif
01207 
01208 #if HAVE_LIBBZ2
01209   if ( (size >= 4) &&
01210        (data[0] == 'B') &&
01211        (data[1] == 'Z') &&
01212        (data[2] == 'h') ) {
01213     /* now try bz2 decompression */
01214     memset(&bstrm,
01215            0,
01216            sizeof(bz_stream));
01217     bstrm.next_in = (char*) data;
01218     bstrm.avail_in = size;
01219     bstrm.total_in_lo32 = 0;
01220     bstrm.total_in_hi32 = 0;
01221     bstrm.bzalloc = NULL;
01222     bstrm.bzfree =