extractor.h

Go to the documentation of this file.
00001 /*
00002      This file is part of libextractor.
00003      (C) 2002, 2003, 2004, 2005, 2006 Vidyut Samanta and Christian Grothoff
00004 
00005      libextractor is free software; you can redistribute it and/or modify
00006      it under the terms of the GNU General Public License as published
00007      by the Free Software Foundation; either version 2, or (at your
00008      option) any later version.
00009 
00010      libextractor is distributed in the hope that it will be useful, but
00011      WITHOUT ANY WARRANTY; without even the implied warranty of
00012      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013      General Public License for more details.
00014 
00015      You should have received a copy of the GNU General Public License
00016      along with libextractor; see the file COPYING.  If not, write to the
00017      Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00018      Boston, MA 02111-1307, USA.
00019  */
00020 
00021 #ifndef EXTRACTOR_H
00022 #define EXTRACTOR_H
00023 
00024 #ifdef __cplusplus
00025 extern "C" {
00026 #if 0 /* keep Emacsens' auto-indent happy */
00027 }
00028 #endif
00029 #endif
00030 
00031 /**
00032  * Library version, packed one byte per component: 0.2.6-1 => 0x00020601,
00033  * 4.5.2-0 => 0x04050200
00034  */
00035 #define EXTRACTOR_VERSION 0x00052003
00036 
00037 #include <stdio.h>
00038 
00039 /* Option flag (see the options argument of EXTRACTOR_removeDuplicateKeywords): ignore the 'type' of the keyword when eliminating duplicates */
00040 #define EXTRACTOR_DUPLICATES_TYPELESS 1
00041 /* Option flag: remove type 'UNKNOWN' if there is a duplicate keyword of
00042    known type, even if usually different types should be
00043    preserved */
00044 #define EXTRACTOR_DUPLICATES_REMOVE_UNKNOWN 2
00045 /* Shorthand that expands to a call to EXTRACTOR_getDefaultLibraries(). */
00046 #define EXTRACTOR_DEFAULT_LIBRARIES EXTRACTOR_getDefaultLibraries()
00047 /* NOTE(review): undocumented; presumably returns the default library configuration string in the format accepted by EXTRACTOR_loadConfigLibraries -- confirm. */
00048 const char * EXTRACTOR_getDefaultLibraries(void);
00049 
00050 /**
00051  * Enumeration of the keyword types (classifications) that can be
00052  * attached to a keyword. Values are numbered explicitly and contiguously from 0 to 136. See also
00053  * http://dublincore.org/documents/1998/09/dces/
00054  */
00055 typedef enum {
00056   EXTRACTOR_UNKNOWN = 0,
00057   EXTRACTOR_FILENAME = 1,
00058   EXTRACTOR_MIMETYPE = 2,
00059   EXTRACTOR_TITLE = 3,
00060   EXTRACTOR_AUTHOR = 4,
00061   EXTRACTOR_ARTIST = 5,
00062   EXTRACTOR_DESCRIPTION = 6,
00063   EXTRACTOR_COMMENT = 7,
00064   EXTRACTOR_DATE = 8,
00065   EXTRACTOR_PUBLISHER = 9,
00066   EXTRACTOR_LANGUAGE = 10,
00067   EXTRACTOR_ALBUM = 11,
00068   EXTRACTOR_GENRE = 12,
00069   EXTRACTOR_LOCATION = 13,
00070   EXTRACTOR_VERSIONNUMBER = 14,
00071   EXTRACTOR_ORGANIZATION = 15,
00072   EXTRACTOR_COPYRIGHT = 16,
00073   EXTRACTOR_SUBJECT = 17,
00074   EXTRACTOR_KEYWORDS = 18,
00075   EXTRACTOR_CONTRIBUTOR = 19,
00076   EXTRACTOR_RESOURCE_TYPE = 20,
00077   EXTRACTOR_FORMAT = 21,
00078   EXTRACTOR_RESOURCE_IDENTIFIER = 22,
00079   EXTRACTOR_SOURCE = 23,
00080   EXTRACTOR_RELATION = 24,
00081   EXTRACTOR_COVERAGE = 25,
00082   EXTRACTOR_SOFTWARE = 26,
00083   EXTRACTOR_DISCLAIMER = 27,
00084   EXTRACTOR_WARNING = 28,
00085   EXTRACTOR_TRANSLATED = 29,
00086   EXTRACTOR_CREATION_DATE = 30,
00087   EXTRACTOR_MODIFICATION_DATE = 31,
00088   EXTRACTOR_CREATOR = 32,
00089   EXTRACTOR_PRODUCER = 33,
00090   EXTRACTOR_PAGE_COUNT = 34,
00091   EXTRACTOR_PAGE_ORIENTATION = 35,
00092   EXTRACTOR_PAPER_SIZE = 36,
00093   EXTRACTOR_USED_FONTS = 37,
00094   EXTRACTOR_PAGE_ORDER = 38,
00095   EXTRACTOR_CREATED_FOR = 39,
00096   EXTRACTOR_MAGNIFICATION = 40,
00097   EXTRACTOR_RELEASE = 41,
00098   EXTRACTOR_GROUP = 42,
00099   EXTRACTOR_SIZE = 43,
00100   EXTRACTOR_SUMMARY = 44,
00101   EXTRACTOR_PACKAGER = 45,
00102   EXTRACTOR_VENDOR = 46,
00103   EXTRACTOR_LICENSE = 47,
00104   EXTRACTOR_DISTRIBUTION = 48,
00105   EXTRACTOR_BUILDHOST = 49,
00106   EXTRACTOR_OS = 50,
00107   EXTRACTOR_DEPENDENCY = 51,
00108   EXTRACTOR_HASH_MD4 = 52,
00109   EXTRACTOR_HASH_MD5 = 53,
00110   EXTRACTOR_HASH_SHA0 = 54,
00111   EXTRACTOR_HASH_SHA1 = 55,
00112   EXTRACTOR_HASH_RMD160 = 56,
00113   EXTRACTOR_RESOLUTION = 57,
00114   EXTRACTOR_CATEGORY = 58,
00115   EXTRACTOR_BOOKTITLE = 59,
00116   EXTRACTOR_PRIORITY = 60,
00117   EXTRACTOR_CONFLICTS = 61,
00118   EXTRACTOR_REPLACES = 62,
00119   EXTRACTOR_PROVIDES = 63,
00120   EXTRACTOR_CONDUCTOR = 64,
00121   EXTRACTOR_INTERPRET = 65,
00122   EXTRACTOR_OWNER = 66,
00123   EXTRACTOR_LYRICS = 67,
00124   EXTRACTOR_MEDIA_TYPE = 68,
00125   EXTRACTOR_CONTACT = 69,
00126   EXTRACTOR_THUMBNAIL_DATA = 70,
00127   EXTRACTOR_PUBLICATION_DATE = 71,
00128   EXTRACTOR_CAMERA_MAKE = 72,
00129   EXTRACTOR_CAMERA_MODEL = 73,
00130   EXTRACTOR_EXPOSURE = 74,
00131   EXTRACTOR_APERTURE = 75,
00132   EXTRACTOR_EXPOSURE_BIAS = 76,
00133   EXTRACTOR_FLASH = 77,
00134   EXTRACTOR_FLASH_BIAS = 78,
00135   EXTRACTOR_FOCAL_LENGTH = 79,
00136   EXTRACTOR_FOCAL_LENGTH_35MM = 80,
00137   EXTRACTOR_ISO_SPEED = 81,
00138   EXTRACTOR_EXPOSURE_MODE = 82,
00139   EXTRACTOR_METERING_MODE = 83,
00140   EXTRACTOR_MACRO_MODE = 84,
00141   EXTRACTOR_IMAGE_QUALITY = 85,
00142   EXTRACTOR_WHITE_BALANCE = 86,
00143   EXTRACTOR_ORIENTATION = 87,
00144   EXTRACTOR_TEMPLATE = 88,
00145   EXTRACTOR_SPLIT = 89,
00146   EXTRACTOR_PRODUCTVERSION = 90,
00147   EXTRACTOR_LAST_SAVED_BY = 91,
00148   EXTRACTOR_LAST_PRINTED = 92,
00149   EXTRACTOR_WORD_COUNT = 93,
00150   EXTRACTOR_CHARACTER_COUNT = 94,
00151   EXTRACTOR_TOTAL_EDITING_TIME = 95,
00152   EXTRACTOR_THUMBNAILS = 96,
00153   EXTRACTOR_SECURITY = 97,
00154   EXTRACTOR_CREATED_BY_SOFTWARE = 98,
00155   EXTRACTOR_MODIFIED_BY_SOFTWARE = 99,
00156   EXTRACTOR_REVISION_HISTORY = 100,
00157   EXTRACTOR_LOWERCASE = 101,
00158   EXTRACTOR_COMPANY = 102,
00159   EXTRACTOR_GENERATOR = 103,
00160   EXTRACTOR_CHARACTER_SET = 104,
00161   EXTRACTOR_LINE_COUNT = 105,
00162   EXTRACTOR_PARAGRAPH_COUNT = 106,
00163   EXTRACTOR_EDITING_CYCLES = 107,
00164   EXTRACTOR_SCALE = 108,
00165   EXTRACTOR_MANAGER = 109,
00166   EXTRACTOR_MOVIE_DIRECTOR = 110,
00167   EXTRACTOR_DURATION = 111,
00168   EXTRACTOR_INFORMATION = 112,
00169   EXTRACTOR_FULL_NAME = 113,
00170   EXTRACTOR_CHAPTER = 114,
00171   EXTRACTOR_YEAR = 115,
00172   EXTRACTOR_LINK = 116,
00173   EXTRACTOR_MUSIC_CD_IDENTIFIER = 117,
00174   EXTRACTOR_PLAY_COUNTER = 118,
00175   EXTRACTOR_POPULARITY_METER = 119,
00176   EXTRACTOR_CONTENT_TYPE = 120,
00177   EXTRACTOR_ENCODED_BY = 121,
00178   EXTRACTOR_TIME = 122,
00179   EXTRACTOR_MUSICIAN_CREDITS_LIST = 123,
00180   EXTRACTOR_MOOD = 124, 
00181   EXTRACTOR_FORMAT_VERSION = 125,
00182   EXTRACTOR_TELEVISION_SYSTEM = 126,
00183   EXTRACTOR_SONG_COUNT = 127,
00184   EXTRACTOR_STARTING_SONG = 128,
00185   EXTRACTOR_HARDWARE_DEPENDENCY = 129,
00186   EXTRACTOR_RIPPER = 130,
00187   EXTRACTOR_FILE_SIZE = 131,
00188   EXTRACTOR_TRACK_NUMBER = 132,
00189   EXTRACTOR_ISRC = 133,
00190   EXTRACTOR_DISC_NUMBER = 134,
00191   EXTRACTOR_GNUNET_DISPLAY_TYPE = 135,
00192   EXTRACTOR_GNUNET_ECBC_URI = 136,
00193 } EXTRACTOR_KeywordType;
00194 
00195 /**
00196  * Test if a given LE type contains binary data (currently only EXTRACTOR_THUMBNAIL_DATA); the argument is parenthesized so that compound expressions expand correctly.
00197  */
00198 #define EXTRACTOR_isBinaryType(type) ((type) == EXTRACTOR_THUMBNAIL_DATA)
00199 
00200 /**
00201  * A linked list of keywords. This structure is passed around
00202  * in libExtractor and is typically the result of any keyword
00203  * extraction operation.
00204  * <p>
00205  * Each entry in the keyword list consists of a string (the
00206  * keyword) and the keyword type (of type EXTRACTOR_KeywordType)
00207  * describing how/from where the keyword was obtained.
00208  */
00209 typedef struct EXTRACTOR_Keywords {
00210   /* the keyword that was found; the string is released together with the list by EXTRACTOR_freeKeywords */
00211   char * keyword;
00212   /* the type of the keyword (classification) */
00213   EXTRACTOR_KeywordType keywordType;
00214   /* the next entry in the list (presumably NULL at the end of the list -- confirm) */
00215   struct EXTRACTOR_Keywords * next;
00216 } EXTRACTOR_KeywordList;
00217 
00218 /**
00219  * Signature of the extract method that each plugin must provide.
00220  * NOTE(review): filesize, next and options are undocumented; presumably the size of data in bytes, the keyword list found so far, and the plugin's option string -- confirm.
00221  *
00222  * @param filename name of the file being processed, may be NULL (!)
00223  * @param data must not be modified (!), even though the pointer is not const-qualified
00224  */
00225 typedef EXTRACTOR_KeywordList *
00226 (*ExtractMethod)(const char * filename,
00227                  char * data,
00228                  size_t filesize,
00229                  EXTRACTOR_KeywordList * next,
00230                  const char * options);
00231 
00232 /**
00233  * Linked list of extractor helper-libraries. An application
00234  * builds this list by telling libextractor to load various
00235  * keyword-extraction libraries. Libraries can also be unloaded
00236  * (removed from this list, see EXTRACTOR_removeLibrary).
00237  * <p>
00238  * Client code should never be concerned with the internals of
00239  * this struct.
00240  */
00241 typedef struct EXTRACTOR_Extractor {
00242   void * libraryHandle; /* opaque handle of the loaded library (presumably from the dynamic loader -- confirm) */
00243   char * libname; /* name of the library */
00244   ExtractMethod extractMethod; /* the plugin's extract function */
00245   struct EXTRACTOR_Extractor * next; /* next library in the list */
00246   char * options; /* presumably the "(options)" string given for this library, see EXTRACTOR_loadConfigLibraries -- confirm */
00247 } EXTRACTOR_ExtractorList;
00248 
00249 /**
00250  * Load the default set of libraries.
00251  * @return the list of loaded default libraries (presumably released with EXTRACTOR_removeAll -- confirm)
00252  */
00253 EXTRACTOR_ExtractorList * EXTRACTOR_loadDefaultLibraries(void);
00254 
00255 /**
00256  * Get the textual name of the given keyword type.
00257  * @return the name of the type, NULL if the type is not known
00258  */
00259 const char *
00260 EXTRACTOR_getKeywordTypeAsString(EXTRACTOR_KeywordType type);
00261 
00262 /**
00263  * Return the exclusive upper bound of the keyword type numbers, i.e. types lie in [0,highest).
00264  */
00265 EXTRACTOR_KeywordType
00266 EXTRACTOR_getHighestKeywordTypeNumber(void);
00267 
00268 /**
00269  * Load multiple libraries as specified by the user.
00270  * @param config a string given by the user that defines which
00271  *        libraries should be loaded. Has the format
00272  *        "[[-]LIBRARYNAME[(options)][:[-]LIBRARYNAME[(options)]]]*".
00273  *        For example,
00274  *        libextractor_mp3.so:libextractor_ogg.so loads the
00275  *        mp3 and the ogg library. The '-' before the LIBRARYNAME
00276  *        indicates that the library should be added to the end
00277  *        of the library list (see EXTRACTOR_addLibraryLast).
00278  * @param prev the previous list of libraries, may be NULL
00279  * @return the new list of libraries, equal to prev iff an error occurred
00280  *         or if config was empty (or NULL).
00281  */
00282 EXTRACTOR_ExtractorList *
00283 EXTRACTOR_loadConfigLibraries(EXTRACTOR_ExtractorList * prev,
00284                               const char * config);
00285 
00286 /**
00287  * Add a library for keyword extraction.
00288  * @param prev the previous list of libraries, may be NULL
00289  * @param library the name of the library
00290  * @return the new list of libraries, equal to prev iff an error occurred
00291  */
00292 EXTRACTOR_ExtractorList *
00293 EXTRACTOR_addLibrary(EXTRACTOR_ExtractorList * prev,
00294                      const char * library);
00295 
00296 /**
00297  * Add a library for keyword extraction at the END of the list.
00298  * @param prev the previous list of libraries, may be NULL
00299  * @param library the name of the library
00300  * @return the new list of libraries; always equal to prev,
00301  *         except when prev was NULL and no error occurred
00302  */
00303 EXTRACTOR_ExtractorList *
00304 EXTRACTOR_addLibraryLast(EXTRACTOR_ExtractorList * prev,
00305                          const char * library);
00306                 
00307 /**
00308  * Remove a library for keyword extraction.
00309  * @param prev the current list of libraries
00310  * @param library the name of the library to remove
00311  * @return the reduced list; the list is unchanged if the library was not loaded
00312  */
00313 EXTRACTOR_ExtractorList *
00314 EXTRACTOR_removeLibrary(EXTRACTOR_ExtractorList * prev,
00315                         const char * library);
00316 
00317 /**
00318  * Remove all extractors from the given list.
00319  * @param libraries the list of extractors (presumably must not be used afterwards -- confirm)
00320  */
00321 void EXTRACTOR_removeAll(EXTRACTOR_ExtractorList * libraries);
00322 
00323 /**
00324  * Extract keywords from a file using the available extractors.
00325  * @param extractor the list of extractor libraries
00326  * @param filename the name of the file
00327  * @return the list of keywords found in the file, NULL if none
00328  *         were found (or on other errors)
00329  */
00330 EXTRACTOR_KeywordList *
00331 EXTRACTOR_getKeywords(EXTRACTOR_ExtractorList * extractor,
00332                       const char * filename);
00333 
00334 
00335 /**
00336  * Extract keywords from a buffer in memory
00337  * using the available extractors.
00338  *
00339  * @param extractor the list of extractor libraries
00340  * @param data the data to extract keywords from (the contents of a file)
00341  * @param size the number of bytes in data
00342  * @return the list of keywords found in the buffer, NULL if none
00343  *         were found (or on other errors)
00344  */
00345 EXTRACTOR_KeywordList *
00346 EXTRACTOR_getKeywords2(EXTRACTOR_ExtractorList * extractor,
00347                        const void * data,
00348                        size_t size);
00349 
00350 
00351 /**
00352  * Remove duplicate keywords from the list.
00353  * @param list the original keyword list (destroyed in the process!)
00354  * @param options a set of options (EXTRACTOR_DUPLICATES_TYPELESS, EXTRACTOR_DUPLICATES_REMOVE_UNKNOWN)
00355  * @return a list of keywords without duplicates
00356  */
00357 EXTRACTOR_KeywordList *
00358 EXTRACTOR_removeDuplicateKeywords(EXTRACTOR_KeywordList * list,
00359                                   unsigned int options);
00360 
00361 
00362 /**
00363  * Remove empty (all-whitespace) keywords from the list.
00364  * @param list the original keyword list (destroyed in the process!)
00365  * @return a list of keywords without empty (all-whitespace) entries
00366  */
00367 EXTRACTOR_KeywordList *
00368 EXTRACTOR_removeEmptyKeywords (EXTRACTOR_KeywordList * list);
00369 
00370 /**
00371  * Remove all keywords of a particular type from the list.
00372  * @param list the original keyword list (altered in the process!)
00373  * @param type the keyword type to remove
00374  * @return a list of keywords without entries of the given type
00375  */
00376 EXTRACTOR_KeywordList *
00377 EXTRACTOR_removeKeywordsOfType(EXTRACTOR_KeywordList * list,
00378                                EXTRACTOR_KeywordType type);
00379 
00380 /**
00381  * Print a keyword list to a file.
00382  * Intended for debugging.
00383  * @param handle the file to write to (e.g. stdout, stderr), must NOT be NULL
00384  * @param keywords the list of keywords to print, may be NULL
00385  */
00386 void EXTRACTOR_printKeywords(FILE * handle,
00387                              EXTRACTOR_KeywordList * keywords);
00388 
00389 /**
00390  * Free the memory occupied by the keyword list (and the
00391  * keyword strings in it!)
00392  * @param keywords the list to free; strings obtained from it (e.g. via EXTRACTOR_extractLast) become invalid
00393  */
00394 void EXTRACTOR_freeKeywords(EXTRACTOR_KeywordList * keywords);
00395 
00396 /**
00397  * Extract the last keyword of the given type from the keyword list.
00398  * @param type the type of the keyword
00399  * @param keywords the keyword list
00400  * @return the last matching keyword, or NULL if none matches;
00401  *  the string returned is aliased in the keywords list and must
00402  *  not be freed or manipulated by the client.  It will become
00403  *  invalid once the keyword list is freed.
00404  */
00405 const char * EXTRACTOR_extractLast(EXTRACTOR_KeywordType type,
00406                                    EXTRACTOR_KeywordList * keywords);
00407 
00408 /**
00409  * Extract the last keyword whose type is described by the given string from the keyword list.
00410  * @param type the string describing the type of the keyword
00411  * @param keywords the keyword list
00412  * @return the last matching keyword, or NULL if none matches;
00413  *  the string returned is aliased in the keywords list and must
00414  *  not be freed or manipulated by the client.  It will become
00415  *  invalid once the keyword list is freed.
00416  */
00417 const char * EXTRACTOR_extractLastByString(const char * type,
00418                                            EXTRACTOR_KeywordList * keywords);
00419 
00420 /**
00421  * Count the number of keywords (entries) in the keyword list.
00422  * @param keywords the keyword list
00423  * @return the number of keywords in the list
00424  */
00425 unsigned int EXTRACTOR_countKeywords(EXTRACTOR_KeywordList * keywords);
00426 
00427 
00428 /**
00429  * This function can be used to decode the binary data encoded in the libextractor metadata (i.e. for the thumbnails).
00430  *
00431  * @param in 0-terminated string from the meta-data
00432  * @param out presumably set to the decoded bytes; ownership of the buffer is not documented here -- TODO confirm
00433  * @param outSize presumably set to the number of decoded bytes -- TODO confirm
00434  * @return 1 on error, 0 on success
00435  */
00436 int EXTRACTOR_binaryDecode(const char * in,
00437                            unsigned char ** out,
00438                            size_t * outSize);
00439 
00440 
00441 /**
00442  * Encode the given binary data object as a 0-terminated C-string according to the LE binary data encoding standard.
00443  *
00444  * @param data the binary data to encode
00445  * @param size presumably the number of bytes in data -- TODO confirm
00446  * @return NULL on error, the 0-terminated encoding otherwise
00447  *  (NOTE(review): ownership of the returned string is not documented here -- confirm who frees it)
00448  */
00449 char * EXTRACTOR_binaryEncode(const unsigned char * data,
00450                               size_t size);
00451 
00452 
00453 #if 0 /* keep Emacsens' auto-indent happy */
00454 {
00455 #endif
00456 #ifdef __cplusplus
00457 }
00458 #endif
00459 
00460 #endif

Generated on Wed Jul 23 17:44:18 2008 for libextractor by  doxygen 1.5.1