extract.c

Go to the documentation of this file.
00001 /*
00002      This file is part of libextractor.
00003      (C) 2002, 2003, 2004, 2005, 2006 Vidyut Samanta and Christian Grothoff
00004 
00005      libextractor is free software; you can redistribute it and/or modify
00006      it under the terms of the GNU General Public License as published
00007      by the Free Software Foundation; either version 2, or (at your
00008      option) any later version.
00009 
00010      libextractor is distributed in the hope that it will be useful, but
00011      WITHOUT ANY WARRANTY; without even the implied warranty of
00012      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013      General Public License for more details.
00014 
00015      You should have received a copy of the GNU General Public License
00016      along with libextractor; see the file COPYING.  If not, write to the
00017      Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00018      Boston, MA 02111-1307, USA.
00019 */
00020 
00021 #include "platform.h"
00022 #include "extractor.h"
00023 #include "getopt.h"
00024 
/* Boolean values used for the option flags and the per-type print table. */
#define YES 1
#define NO 0


/**
 * One row of the command-line help table.
 *
 * The string members are const-qualified: the table is filled with
 * string literals and is only ever read.
 */
typedef struct {
  /** short option character, 0 if the option has no short form */
  char shortArg;
  /** long option name, printed after "--" */
  const char * longArg;
  /** placeholder name of the option's argument, NULL if it takes none */
  const char * mandatoryArg;
  /** help text; a NULL description terminates the table */
  const char * description;
} Help;
00035 
00036 #define BORDER 29
00037 
00038 static void formatHelp(const char * general,
00039                        const char * description,
00040                        const Help * opt) {
00041   int slen;
00042   int i;
00043   int j;
00044   int ml;
00045   int p;
00046   char * scp;
00047   const char * trans;
00048         
00049   printf(_("Usage: %s\n%s\n\n"),
00050          gettext(general),
00051          gettext(description));
00052   printf(_("Arguments mandatory for long options are also mandatory for short options.\n"));
00053   slen = 0;
00054   i = 0;
00055   while (opt[i].description != NULL) {
00056     if (opt[i].shortArg == 0)
00057       printf("      ");
00058     else
00059       printf("  -%c, ",
00060              opt[i].shortArg);
00061     printf("--%s",
00062            opt[i].longArg);
00063     slen = 8 + strlen(opt[i].longArg);
00064     if (opt[i].mandatoryArg != NULL) {
00065       printf("=%s",
00066              opt[i].mandatoryArg);
00067       slen += 1+strlen(opt[i].mandatoryArg);
00068     }
00069     if (slen > BORDER) {
00070       printf("\n%*s", BORDER, "");
00071       slen = BORDER;
00072     }
00073     if (slen < BORDER) {
00074       printf("%*s", BORDER-slen, "");
00075       slen = BORDER;
00076     }
00077     trans = gettext(opt[i].description);
00078     ml = strlen(trans);
00079     p = 0;
00080   OUTER:
00081     while (ml - p > 78 - slen) {
00082       for (j=p+78-slen;j>p;j--) {
00083         if (isspace(trans[j])) {
00084           scp = malloc(j-p+1);
00085           memcpy(scp,
00086                  &trans[p],
00087                  j-p);
00088           scp[j-p] = '\0';
00089           printf("%s\n%*s",
00090                  scp,
00091                  BORDER+2,
00092                  "");
00093           free(scp);
00094           p = j+1;
00095           slen = BORDER+2;
00096           goto OUTER;
00097         }
00098       }
00099       /* could not find space to break line */
00100       scp = malloc(78 - slen + 1);
00101       memcpy(scp,
00102              &trans[p],
00103              78 - slen);
00104       scp[78 - slen] = '\0';
00105       printf("%s\n%*s",
00106              scp,
00107              BORDER+2,
00108              "");       
00109       free(scp);
00110       slen = BORDER+2;
00111       p = p + 78 - slen;
00112     }
00113     /* print rest */
00114     if (p < ml)
00115       printf("%s\n",
00116              &trans[p]);
00117     i++;
00118   }
00119 }
00120 
00121 static void
00122 printHelp ()
00123 {
00124   static Help help[] = {
00125     { 'a', "all", NULL,
00126       gettext_noop("do not remove any duplicates") },
00127     { 'b', "bibtex", NULL,
00128       gettext_noop("print output in bibtex format") },
00129     { 'B', "binary", "LANG",
00130       gettext_noop("use the generic plaintext extractor for the language with the 2-letter language code LANG") },
00131     { 'd', "duplicates", NULL,
00132       gettext_noop("remove duplicates only if types match") },
00133     { 'f', "filename", NULL,
00134       gettext_noop("use the filename as a keyword (loads filename-extractor plugin)") },
00135     { 'g', "grep-friendly", NULL,
00136       gettext_noop("produce grep-friendly output (all results on one line per file)") },
00137     { 'h', "help", NULL,
00138       gettext_noop("print this help") },
00139     { 'H', "hash", "ALGORITHM",
00140       gettext_noop("compute hash using the given ALGORITHM (currently sha1 or md5)") },
00141     { 'l', "library", "LIBRARY",
00142       gettext_noop("load an extractor plugin named LIBRARY") },
00143     { 'L', "list", NULL,
00144       gettext_noop("list all keyword types") },
00145     { 'n', "nodefault", NULL,
00146       gettext_noop("do not use the default set of extractor plugins") },
00147     { 'p', "print", "TYPE",
00148       gettext_noop("print only keywords of the given TYPE (use -L to get a list)") },
00149     { 'r', "remove-duplicates", NULL,
00150       gettext_noop("remove duplicates even if keyword types do not match") },
00151     { 's', "split", NULL,
00152       gettext_noop("use keyword splitting (loads split-extractor plugin)") },
00153     { 'v', "version", NULL,
00154       gettext_noop("print the version number") },
00155     { 'V', "verbose", NULL,
00156       gettext_noop("be verbose") },
00157     { 'x', "exclude", "TYPE",
00158       gettext_noop("do not print keywords of the given TYPE") },
00159     { 0, NULL, NULL, NULL },
00160   };
00161   formatHelp(_("extract [OPTIONS] [FILENAME]*"),
00162              _("Extract metadata from files."),
00163              help);
00164 
00165 }
00166 
00167 #include "iconv.c"
00168 
00169 
00170 /**
00171  * Print a keyword list to a file.
00172  *
00173  * @param handle the file to write to (stdout, stderr), may NOT be NULL
00174  * @param keywords the list of keywords to print, may be NULL
00175  * @param print array indicating which types to print
00176  */
00177 static void
00178 printSelectedKeywords(FILE * handle,
00179                       EXTRACTOR_KeywordList * keywords,
00180                       const int * print,
00181                       const int verbose)
00182 {
00183   char * keyword;
00184   iconv_t cd;
00185 
00186   cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
00187   while (keywords != NULL) {
00188     if (EXTRACTOR_isBinaryType(keywords->keywordType)) {
00189       fprintf (handle,
00190                _("%s - (binary)\n"),
00191                _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)));
00192     } else {
00193       if (cd != (iconv_t) -1)
00194         keyword = iconvHelper(cd,
00195                               keywords->keyword);
00196       else
00197         keyword = strdup(keywords->keyword);
00198       if (NULL == EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)) {
00199         if (verbose == YES) {
00200           fprintf(handle,
00201                   _("INVALID TYPE - %s\n"),
00202                   keyword);
00203         }
00204       } else if (print[keywords->keywordType] == YES)
00205         fprintf (handle,
00206                  "%s - %s\n",
00207                  _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)),
00208                  keyword);
00209       free(keyword);
00210     }
00211     keywords = keywords->next;
00212   }
00213   if (cd != (iconv_t) -1)
00214     iconv_close(cd);
00215 }
00216 
00217 /**
00218  * Print a keyword list to a file in a grep-friendly manner.
00219  *
00220  * @param handle the file to write to (stdout, stderr), may NOT be NULL
00221  * @param keywords the list of keywords to print, may be NULL
00222  * @param print array indicating which types to print
00223  */
00224 static void
00225 printSelectedKeywordsGrepFriendly(FILE * handle,
00226                                   EXTRACTOR_KeywordList * keywords,
00227                                   const int * print,
00228                                   const int verbose)
00229 {
00230   char * keyword;
00231   iconv_t cd;
00232   size_t pos;
00233 
00234   cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
00235   while (keywords != NULL) {
00236     if ( (EXTRACTOR_isBinaryType(EXTRACTOR_THUMBNAIL_DATA)) &&
00237          (print[keywords->keywordType] == YES) ) {
00238       if (verbose > 1) 
00239         fprintf(handle,
00240                 "%s: ",
00241                 _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)));
00242       if (cd != (iconv_t) -1)
00243         keyword = iconvHelper(cd,
00244                               keywords->keyword);
00245       else
00246         keyword = strdup(keywords->keyword);
00247       pos = 0;
00248       while (keyword[pos] != '\0') {
00249         if (iscntrl(keyword[pos]))      
00250           keyword[pos] = ' ';
00251         pos++;
00252       }
00253       fprintf (handle,
00254                (keywords->next == NULL) ? "%s" : "%s ",
00255                keyword);
00256       free(keyword);
00257     }
00258     keywords = keywords->next;
00259   }
00260   fprintf(handle, "\n");
00261   if (cd != (iconv_t) -1)
00262     iconv_close(cd);
00263 }
00264 
/**
 * Build a short citation key from author, year and title.
 *
 * The key is made of up to five characters of each of auth, year and
 * title (in that order); letters are lower-cased and every
 * non-alphanumeric character is replaced by '_'.
 *
 * @param title title of the work, NULL is treated as empty
 * @param auth author of the work, NULL is treated as empty
 * @param year year of publication, NULL is treated as empty
 * @return newly allocated key which the caller must free,
 *         NULL if the allocation failed
 */
static char *
str_splice(const char * title,
           const char * auth,
           const char * year) {
  enum { KEY_SIZE = 3 * 5 + 1 }; /* three 5-character parts plus terminator */
  char * temp;
  size_t i;

  temp = malloc(KEY_SIZE);
  if (temp == NULL)
    return NULL;
  snprintf(temp,
           KEY_SIZE,
           "%.5s%.5s%.5s",
           (auth != NULL) ? auth : "",
           (year != NULL) ? year : "",
           (title != NULL) ? title : "");
  for (i = 0; temp[i] != '\0'; i++) {
    /* ctype functions are only defined for unsigned char values */
    unsigned char c = (unsigned char) temp[i];

    if (! isalnum(c))
      temp[i] = '_';
    else
      temp[i] = (char) tolower(c);
  }
  return temp;
}
00289 
00290 /**
00291  * Print a keyword list in bibtex format to a file.
00292  * FIXME: We should generate the three letter abbrev of the month
00293  * @param handle the file to write to (stdout, stderr), may NOT be NULL
00294  * @param keywords the list of keywords to print, may be NULL
00295  * @param print array indicating which types to print
00296  */
00297 static void
00298 printSelectedKeywordsBibtex (FILE * handle,
00299                              EXTRACTOR_KeywordList * keywords,
00300                              const int * print,
00301                              const char * filename)
00302 {
00303   const char * last = NULL;
00304   if (keywords == NULL)
00305     return;
00306   if (print[keywords->keywordType] == YES)
00307     {
00308       const char * title = NULL;
00309       const char * author = NULL;
00310       const char * note = NULL;
00311       const char * date = NULL;
00312       const char * publisher = NULL;
00313       const char * organization = NULL;
00314       const char * key = NULL;
00315       const char * pages = NULL;
00316       char * year = NULL;
00317       char * month = NULL;
00318       char * tmp;
00319 
00320       title = EXTRACTOR_extractLastByString(_("title"), keywords);
00321       if ( !title )
00322         title = EXTRACTOR_extractLastByString(_("filename"), keywords);
00323       if ( !title )
00324         title = (char*)filename;
00325       last = title;
00326 
00327       author = EXTRACTOR_extractLastByString(_("author"), keywords);
00328       if ( author )
00329         last = author;
00330 
00331       note = EXTRACTOR_extractLastByString(_("description"), keywords);
00332       if ( !note )
00333         note = EXTRACTOR_extractLastByString(_("keywords"), keywords);
00334       if ( !note )
00335         note = EXTRACTOR_extractLastByString(_("comment"), keywords);
00336       if ( note )
00337         last = note;
00338 
00339       date = EXTRACTOR_extractLastByString(_("date"), keywords);
00340       if ( !date )
00341         date = EXTRACTOR_extractLastByString(_("creation date"), keywords);
00342       if ( date ) {
00343         if ( strlen(keywords->keyword) >= 7 ) {
00344           year = (char*)malloc(sizeof(char)*5);
00345           memset(year, 0, sizeof(char)*5);
00346           month = (char*)malloc(sizeof(char)*3);
00347           memset(month, 0, sizeof(char)*3);
00348           year[0] = keywords->keyword[0];
00349           year[1] = keywords->keyword[1];
00350           year[2] = keywords->keyword[2];
00351           year[3] = keywords->keyword[3];
00352           month[0] = keywords->keyword[4];
00353           month[1] = keywords->keyword[5];
00354         } else if ( strlen(keywords->keyword) >= 4 ) {
00355           year = (char*)malloc(sizeof(char)*5);
00356           memset(year, 0, sizeof(char)*5);
00357           year[0] = keywords->keyword[0];
00358           year[1] = keywords->keyword[1];
00359           year[2] = keywords->keyword[2];
00360           year[3] = keywords->keyword[3];
00361         }
00362       }
00363       if ( year )
00364         last = year;
00365 
00366       if ( month )
00367         last = month;
00368 
00369       publisher = EXTRACTOR_extractLastByString(_("publisher"), keywords);
00370       if ( publisher )
00371         last = publisher;
00372 
00373       organization = EXTRACTOR_extractLastByString(_("organization"), keywords);
00374       if ( organization )
00375         last = organization;
00376 
00377       key = EXTRACTOR_extractLastByString(_("subject"), keywords);
00378       if ( key )
00379         last = key;
00380 
00381       pages = EXTRACTOR_extractLastByString(_("page count"), keywords);
00382       if ( pages )
00383         last = pages;
00384 
00385       tmp = str_splice(title, author, year);
00386       fprintf(handle, 
00387               "@misc{ %s,\n", 
00388               tmp);
00389       free(tmp);      
00390       if ( title )
00391         fprintf(handle, "    title = \"%s\"%s\n", title,
00392             (last == title)?"":",");
00393       if ( author )
00394         fprintf(handle, "    author = \"%s\"%s\n", author,
00395             (last == author)?"":",");
00396       if ( note )
00397         fprintf(handle, "    note = \"%s\"%s\n", note,
00398             (last == note)?"":",");
00399       if ( year )
00400         fprintf(handle, "    year = \"%s\"%s\n", year,
00401             (last == year)?"":",");
00402       if ( month )
00403         fprintf(handle, "    month = \"%s\"%s\n", month,
00404             (last == month)?"":",");
00405       if ( publisher )
00406         fprintf(handle, "    publisher = \"%s\"%s\n", publisher,
00407             (last == publisher)?"":",");
00408       if ( organization )
00409         fprintf(handle, "    organization = \"%s\"%s\n", organization,
00410             (last == organization)?"":",");
00411       if ( key )
00412         fprintf(handle, "    key = \"%s\"%s\n", key,
00413             (last == key)?"":",");
00414       if ( pages )
00415         fprintf(handle, "    pages = \"%s\"%s\n", pages,
00416             (last == pages)?"":",");
00417       if (month != NULL)
00418         free(month);
00419       if (year != NULL)
00420         free(year);
00421       fprintf(handle, "}\n\n");
00422     }
00423 }
00424 
00425 /**
00426  * Demo for libExtractor.
00427  * <p>
00428  * Invoke with a list of filenames to extract keywords
00429  * from (demo will use all the extractor libraries that
00430  * are available by default).
00431  */
00432 int
00433 main (int argc, char *argv[])
00434 {
00435   int i;
00436   EXTRACTOR_ExtractorList *extractors;
00437   EXTRACTOR_KeywordList *keywords;
00438   int option_index;
00439   int c;
00440   char * libraries = NULL;
00441   char * hash = NULL;
00442   int splitKeywords = NO;
00443   int verbose = 0;
00444   int useFilename = NO;
00445   int nodefault = NO;
00446   int *print;
00447   int defaultAll = YES;
00448   int duplicates = EXTRACTOR_DUPLICATES_REMOVE_UNKNOWN;
00449   int bibtex = NO;
00450   int grepfriendly = NO;
00451   char * binary = NULL;
00452   int ret = 0;
00453 
00454 #ifdef MINGW
00455   InitWinEnv();
00456 #endif
00457 #if ENABLE_NLS
00458   setlocale(LC_ALL, "");
00459   textdomain("libextractor");
00460   BINDTEXTDOMAIN("libextractor", LOCALEDIR);
00461 #endif
00462   print = malloc (sizeof (int) * EXTRACTOR_getHighestKeywordTypeNumber ());
00463   for (i = 0; i < EXTRACTOR_getHighestKeywordTypeNumber (); i++)
00464     print[i] = YES;             /* default: print everything */
00465 
00466   while (1)
00467     {
00468       static struct option long_options[] = {
00469         {"all", 0, 0, 'a'},
00470         {"binary", 1, 0, 'B'},
00471         {"bibtex", 0, 0, 'b'},
00472         {"duplicates", 0, 0, 'd'},
00473         {"filename", 0, 0, 'f'},
00474         {"grep-friendly", 0, 0, 'g'},
00475         {"help", 0, 0, 'h'},
00476         {"hash", 1, 0, 'H'},
00477         {"list", 0, 0, 'L'},
00478         {"library", 1, 0, 'l'},
00479         {"nodefault", 0, 0, 'n'},
00480         {"print", 1, 0, 'p'},
00481         {"remove-duplicates", 0, 0, 'r'},
00482         {"split", 0, 0, 's'},
00483         {"verbose", 0, 0, 'V'},
00484         {"version", 0, 0, 'v'},
00485         {"exclude", 1, 0, 'x'},
00486         {0, 0, 0, 0}
00487       };
00488       option_index = 0;
00489       c = getopt_long (argc,
00490                        argv, "vhbgl:nsH:fp:x:LVdraB:",
00491                        long_options,
00492                        &option_index);
00493 
00494       if (c == -1)
00495         break;                  /* No more flags to process */
00496       switch (c)
00497         {
00498         case 'a':
00499           duplicates = -1;
00500           break;
00501         case 'b':
00502           bibtex = YES;
00503           break;
00504         case 'B':
00505           binary = optarg;
00506           break;
00507         case 'd':
00508           duplicates = 0;
00509           break;
00510         case 'f':
00511           useFilename = YES;
00512           break;
00513         case 'g':
00514           grepfriendly = YES;
00515           break;
00516         case 'h':
00517           printHelp();
00518           return 0;
00519         case 'H':
00520           hash = optarg;
00521           break;
00522         case 'l':
00523           libraries = optarg;
00524           break;
00525         case 'L':
00526           i = 0;
00527           while (NULL != EXTRACTOR_getKeywordTypeAsString (i))
00528             printf ("%s\n",
00529                     _(EXTRACTOR_getKeywordTypeAsString (i++)));
00530           return 0;
00531         case 'n':
00532           nodefault = YES;
00533           break;
00534         case 'p':
00535           if (optarg == NULL) {
00536             fprintf(stderr,
00537                     _("You must specify an argument for the `%s' option (option ignored).\n"),
00538                     "-p");
00539             break;
00540           }
00541           if (defaultAll == YES)
00542             {
00543               defaultAll = NO;
00544               i = 0;
00545               while (NULL != EXTRACTOR_getKeywordTypeAsString (i))
00546                 print[i++] = NO;
00547             }
00548           i = 0;
00549           while (NULL != EXTRACTOR_getKeywordTypeAsString (i))
00550             {
00551               if ( (0 == strcmp (optarg, EXTRACTOR_getKeywordTypeAsString (i))) ||
00552                    (0 == strcmp (optarg, _(EXTRACTOR_getKeywordTypeAsString (i)))) )
00553                 
00554                 {
00555                   print[i] = YES;
00556                   break;
00557                 }
00558               i++;
00559             }
00560           if (NULL == EXTRACTOR_getKeywordTypeAsString (i))
00561             {
00562               fprintf(stderr,
00563                       "Unknown keyword type `%s', use option `%s' to get a list.\n",
00564                       optarg,
00565                        "-L");
00566               return -1;
00567             }
00568           break;
00569         case 'r':
00570           duplicates = EXTRACTOR_DUPLICATES_TYPELESS;
00571           break;
00572         case 's':
00573           splitKeywords = YES;
00574           break;
00575         case 'v':
00576           printf ("extract v%s\n", PACKAGE_VERSION);
00577           return 0;
00578         case 'V':
00579           verbose++;
00580           break;
00581         case 'x':
00582           i = 0;
00583           while (NULL != EXTRACTOR_getKeywordTypeAsString (i))
00584             {
00585               if ( (0 == strcmp (optarg, EXTRACTOR_getKeywordTypeAsString (i))) ||
00586                    (0 == strcmp (optarg, _(EXTRACTOR_getKeywordTypeAsString (i)))) )
00587                 {
00588                   print[i] = NO;
00589                   break;
00590                 }
00591               i++;
00592             }
00593           if (NULL == EXTRACTOR_getKeywordTypeAsString (i))
00594             {
00595               fprintf (stderr,
00596                        "Unknown keyword type `%s', use option `%s' to get a list.\n",
00597                        optarg,
00598                        "-L");
00599 #ifdef MINGW
00600                         ShutdownWinEnv();
00601 #endif
00602               return -1;
00603             }
00604           break;
00605         default:
00606           fprintf (stderr,
00607                    _("Use --help to get a list of options.\n"));
00608 #ifdef MINGW
00609         ShutdownWinEnv();
00610 #endif
00611           return -1;
00612         }                       /* end of parsing commandline */
00613     }                           /* while (1) */
00614 
00615   if (argc - optind < 1)
00616     {
00617       fprintf (stderr,
00618                "Invoke with list of filenames to extract keywords form!\n");
00619 #ifdef MINGW
00620                 ShutdownWinEnv();
00621 #endif
00622       free (print);
00623       return -1;
00624     }
00625 
00626   /* build list of libraries */
00627   if (nodefault == NO)
00628     extractors = EXTRACTOR_loadDefaultLibraries ();
00629   else
00630     extractors = NULL;
00631   if (useFilename == YES)
00632     extractors = EXTRACTOR_addLibrary (extractors,
00633                                        "libextractor_filename");
00634   if (libraries != NULL)
00635     extractors = EXTRACTOR_loadConfigLibraries (extractors, libraries);
00636 
00637   if (binary != NULL) {
00638     char * name;
00639     name = malloc(strlen(binary) + strlen("libextractor_printable_") + 1);
00640     strcpy(name, "libextractor_printable_");
00641     strcat(name, binary);
00642     extractors = EXTRACTOR_addLibraryLast(extractors,
00643                                           name);
00644     free(name);
00645   }
00646   if (hash != NULL) {
00647     char * name;
00648     name = malloc(strlen(hash) + strlen("libextractor_hash_") + 1);
00649     strcpy(name, "libextractor_hash_");
00650     strcat(name, hash);
00651     extractors = EXTRACTOR_addLibraryLast(extractors,
00652                                           name);
00653     free(name);
00654   }
00655 
00656   if (splitKeywords == YES)
00657     extractors = EXTRACTOR_addLibraryLast(extractors,
00658                                           "libextractor_split");
00659 
00660   /* extract keywords */
00661   if ( bibtex == YES )
00662     fprintf(stdout,
00663             _("%% BiBTeX file\n"));
00664   for (i = optind; i < argc; i++) {
00665     errno = 0;
00666     keywords = EXTRACTOR_getKeywords (extractors, argv[i]);
00667     if (0 != errno) {
00668       if (verbose > 0) {
00669         fprintf(stderr,
00670                 "%s: %s: %s\n",
00671                 argv[0], argv[i], strerror(errno));
00672       }
00673       ret = 1;
00674       continue;
00675     }
00676     if ( (duplicates != -1) || (bibtex == YES))
00677       keywords = EXTRACTOR_removeDuplicateKeywords (keywords, duplicates);
00678     if ( (verbose > 0) 
00679          && (bibtex == NO) ) {
00680       if (grepfriendly == YES)
00681         printf ("%s ", argv[i]);
00682       else
00683         printf (_("Keywords for file %s:\n"),
00684                 argv[i]);
00685     }
00686     if (bibtex == YES)
00687       printSelectedKeywordsBibtex (stdout, keywords, print, argv[i]);
00688     else if (grepfriendly == YES)
00689       printSelectedKeywordsGrepFriendly(stdout, keywords, print, verbose);
00690     else
00691       printSelectedKeywords (stdout, keywords, print, verbose);
00692     if (verbose > 0 && bibtex == NO)
00693       printf ("\n");
00694     EXTRACTOR_freeKeywords (keywords);
00695   }
00696   free (print);
00697   EXTRACTOR_removeAll (extractors);
00698 
00699 #ifdef MINGW
00700   ShutdownWinEnv();
00701 #endif
00702 
00703   return ret;
00704 }

Generated on Fri Jan 9 16:44:28 2009 for libextractor by  doxygen 1.5.1