00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "platform.h"
00022 #include "extractor.h"
00023 #include "getopt.h"
00024
00025 #define YES 1
00026 #define NO 0
00027
00028
/**
 * One row of the command-line help table.  A row whose description
 * is NULL marks the end of a table.
 */
typedef struct {
  /* single-character option name; 0 if there is no short form */
  char  shortArg;
  /* long option name, printed after "--" */
  char *longArg;
  /* placeholder for the option's argument, printed after "=";
     NULL if the option takes no argument */
  char *mandatoryArg;
  /* help text for the option */
  char *description;
} Help;
00035
00036 #define BORDER 29
00037
00038 static void formatHelp(const char * general,
00039 const char * description,
00040 const Help * opt) {
00041 int slen;
00042 int i;
00043 int j;
00044 int ml;
00045 int p;
00046 char * scp;
00047 const char * trans;
00048
00049 printf(_("Usage: %s\n%s\n\n"),
00050 gettext(general),
00051 gettext(description));
00052 printf(_("Arguments mandatory for long options are also mandatory for short options.\n"));
00053 slen = 0;
00054 i = 0;
00055 while (opt[i].description != NULL) {
00056 if (opt[i].shortArg == 0)
00057 printf(" ");
00058 else
00059 printf(" -%c, ",
00060 opt[i].shortArg);
00061 printf("--%s",
00062 opt[i].longArg);
00063 slen = 8 + strlen(opt[i].longArg);
00064 if (opt[i].mandatoryArg != NULL) {
00065 printf("=%s",
00066 opt[i].mandatoryArg);
00067 slen += 1+strlen(opt[i].mandatoryArg);
00068 }
00069 if (slen > BORDER) {
00070 printf("\n%*s", BORDER, "");
00071 slen = BORDER;
00072 }
00073 if (slen < BORDER) {
00074 printf("%*s", BORDER-slen, "");
00075 slen = BORDER;
00076 }
00077 trans = gettext(opt[i].description);
00078 ml = strlen(trans);
00079 p = 0;
00080 OUTER:
00081 while (ml - p > 78 - slen) {
00082 for (j=p+78-slen;j>p;j--) {
00083 if (isspace(trans[j])) {
00084 scp = malloc(j-p+1);
00085 memcpy(scp,
00086 &trans[p],
00087 j-p);
00088 scp[j-p] = '\0';
00089 printf("%s\n%*s",
00090 scp,
00091 BORDER+2,
00092 "");
00093 free(scp);
00094 p = j+1;
00095 slen = BORDER+2;
00096 goto OUTER;
00097 }
00098 }
00099
00100 scp = malloc(78 - slen + 1);
00101 memcpy(scp,
00102 &trans[p],
00103 78 - slen);
00104 scp[78 - slen] = '\0';
00105 printf("%s\n%*s",
00106 scp,
00107 BORDER+2,
00108 "");
00109 free(scp);
00110 slen = BORDER+2;
00111 p = p + 78 - slen;
00112 }
00113
00114 if (p < ml)
00115 printf("%s\n",
00116 &trans[p]);
00117 i++;
00118 }
00119 }
00120
00121 static void
00122 printHelp ()
00123 {
00124 static Help help[] = {
00125 { 'a', "all", NULL,
00126 gettext_noop("do not remove any duplicates") },
00127 { 'b', "bibtex", NULL,
00128 gettext_noop("print output in bibtex format") },
00129 { 'B', "binary", "LANG",
00130 gettext_noop("use the generic plaintext extractor for the language with the 2-letter language code LANG") },
00131 { 'd', "duplicates", NULL,
00132 gettext_noop("remove duplicates only if types match") },
00133 { 'f', "filename", NULL,
00134 gettext_noop("use the filename as a keyword (loads filename-extractor plugin)") },
00135 { 'g', "grep-friendly", NULL,
00136 gettext_noop("produce grep-friendly output (all results on one line per file)") },
00137 { 'h', "help", NULL,
00138 gettext_noop("print this help") },
00139 { 'H', "hash", "ALGORITHM",
00140 gettext_noop("compute hash using the given ALGORITHM (currently sha1 or md5)") },
00141 { 'l', "library", "LIBRARY",
00142 gettext_noop("load an extractor plugin named LIBRARY") },
00143 { 'L', "list", NULL,
00144 gettext_noop("list all keyword types") },
00145 { 'n', "nodefault", NULL,
00146 gettext_noop("do not use the default set of extractor plugins") },
00147 { 'p', "print", "TYPE",
00148 gettext_noop("print only keywords of the given TYPE (use -L to get a list)") },
00149 { 'r', "remove-duplicates", NULL,
00150 gettext_noop("remove duplicates even if keyword types do not match") },
00151 { 's', "split", NULL,
00152 gettext_noop("use keyword splitting (loads split-extractor plugin)") },
00153 { 'v', "version", NULL,
00154 gettext_noop("print the version number") },
00155 { 'V', "verbose", NULL,
00156 gettext_noop("be verbose") },
00157 { 'x', "exclude", "TYPE",
00158 gettext_noop("do not print keywords of the given TYPE") },
00159 { 0, NULL, NULL, NULL },
00160 };
00161 formatHelp(_("extract [OPTIONS] [FILENAME]*"),
00162 _("Extract metadata from files."),
00163 help);
00164
00165 }
00166
00167 #include "iconv.c"
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177 static void
00178 printSelectedKeywords(FILE * handle,
00179 EXTRACTOR_KeywordList * keywords,
00180 const int * print,
00181 const int verbose)
00182 {
00183 char * keyword;
00184 iconv_t cd;
00185
00186 cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
00187 while (keywords != NULL) {
00188 if (EXTRACTOR_isBinaryType(keywords->keywordType)) {
00189 fprintf (handle,
00190 _("%s - (binary)\n"),
00191 _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)));
00192 } else {
00193 if (cd != (iconv_t) -1)
00194 keyword = iconvHelper(cd,
00195 keywords->keyword);
00196 else
00197 keyword = strdup(keywords->keyword);
00198 if (NULL == EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)) {
00199 if (verbose == YES) {
00200 fprintf(handle,
00201 _("INVALID TYPE - %s\n"),
00202 keyword);
00203 }
00204 } else if (print[keywords->keywordType] == YES)
00205 fprintf (handle,
00206 "%s - %s\n",
00207 _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)),
00208 keyword);
00209 free(keyword);
00210 }
00211 keywords = keywords->next;
00212 }
00213 if (cd != (iconv_t) -1)
00214 iconv_close(cd);
00215 }
00216
00217
00218
00219
00220
00221
00222
00223
00224 static void
00225 printSelectedKeywordsGrepFriendly(FILE * handle,
00226 EXTRACTOR_KeywordList * keywords,
00227 const int * print,
00228 const int verbose)
00229 {
00230 char * keyword;
00231 iconv_t cd;
00232 size_t pos;
00233
00234 cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
00235 while (keywords != NULL) {
00236 if ( (EXTRACTOR_isBinaryType(EXTRACTOR_THUMBNAIL_DATA)) &&
00237 (print[keywords->keywordType] == YES) ) {
00238 if (verbose > 1)
00239 fprintf(handle,
00240 "%s: ",
00241 _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)));
00242 if (cd != (iconv_t) -1)
00243 keyword = iconvHelper(cd,
00244 keywords->keyword);
00245 else
00246 keyword = strdup(keywords->keyword);
00247 pos = 0;
00248 while (keyword[pos] != '\0') {
00249 if (iscntrl(keyword[pos]))
00250 keyword[pos] = ' ';
00251 pos++;
00252 }
00253 fprintf (handle,
00254 (keywords->next == NULL) ? "%s" : "%s ",
00255 keyword);
00256 free(keyword);
00257 }
00258 keywords = keywords->next;
00259 }
00260 fprintf(handle, "\n");
00261 if (cd != (iconv_t) -1)
00262 iconv_close(cd);
00263 }
00264
00265
00266
00267
/**
 * Build a short identifier from the first five characters each of
 * the author, the year and the title (in that order).  Characters
 * that are not alphanumeric are replaced by '_', all others are
 * converted to lower case.
 *
 * @param title title string, may be NULL (treated as empty)
 * @param auth author string, may be NULL (treated as empty)
 * @param year year string, may be NULL (treated as empty)
 * @return newly allocated string of at most 15 characters that the
 *         caller must free(); NULL if memory allocation failed
 */
static char *
str_splice(const char * title,
	   const char * auth,
	   const char * year) {
  enum {
    PART_LEN = 5,                 /* matches the "%.5s" precisions below */
    KEY_SIZE = 3 * PART_LEN + 1   /* three parts plus the terminator */
  };
  char * temp;
  char * pos;
  unsigned char ch;

  temp = malloc(KEY_SIZE);
  if (temp == NULL)
    return NULL;
  /* pass the full buffer size so that the 15th character is kept */
  snprintf(temp,
	   KEY_SIZE,
	   "%.5s%.5s%.5s",
	   (auth != NULL) ? auth : "",
	   (year != NULL) ? year : "",
	   (title != NULL) ? title : "");
  for (pos = temp; *pos != '\0'; pos++) {
    /* <ctype.h> functions require an unsigned char value */
    ch = (unsigned char) *pos;
    if (! isalnum(ch))
      *pos = '_';
    else
      *pos = (char) tolower(ch);
  }
  return temp;
}
00289
00290
00291
00292
00293
00294
00295
00296
00297 static void
00298 printSelectedKeywordsBibtex (FILE * handle,
00299 EXTRACTOR_KeywordList * keywords,
00300 const int * print,
00301 const char * filename)
00302 {
00303 const char * last = NULL;
00304 if (keywords == NULL)
00305 return;
00306 if (print[keywords->keywordType] == YES)
00307 {
00308 const char * title = NULL;
00309 const char * author = NULL;
00310 const char * note = NULL;
00311 const char * date = NULL;
00312 const char * publisher = NULL;
00313 const char * organization = NULL;
00314 const char * key = NULL;
00315 const char * pages = NULL;
00316 char * year = NULL;
00317 char * month = NULL;
00318 char * tmp;
00319
00320 title = EXTRACTOR_extractLastByString(_("title"), keywords);
00321 if ( !title )
00322 title = EXTRACTOR_extractLastByString(_("filename"), keywords);
00323 if ( !title )
00324 title = (char*)filename;
00325 last = title;
00326
00327 author = EXTRACTOR_extractLastByString(_("author"), keywords);
00328 if ( author )
00329 last = author;
00330
00331 note = EXTRACTOR_extractLastByString(_("description"), keywords);
00332 if ( !note )
00333 note = EXTRACTOR_extractLastByString(_("keywords"), keywords);
00334 if ( !note )
00335 note = EXTRACTOR_extractLastByString(_("comment"), keywords);
00336 if ( note )
00337 last = note;
00338
00339 date = EXTRACTOR_extractLastByString(_("date"), keywords);
00340 if ( !date )
00341 date = EXTRACTOR_extractLastByString(_("creation date"), keywords);
00342 if ( date ) {
00343 if ( strlen(keywords->keyword) >= 7 ) {
00344 year = (char*)malloc(sizeof(char)*5);
00345 memset(year, 0, sizeof(char)*5);
00346 month = (char*)malloc(sizeof(char)*3);
00347 memset(month, 0, sizeof(char)*3);
00348 year[0] = keywords->keyword[0];
00349 year[1] = keywords->keyword[1];
00350 year[2] = keywords->keyword[2];
00351 year[3] = keywords->keyword[3];
00352 month[0] = keywords->keyword[4];
00353 month[1] = keywords->keyword[5];
00354 } else if ( strlen(keywords->keyword) >= 4 ) {
00355 year = (char*)malloc(sizeof(char)*5);
00356 memset(year, 0, sizeof(char)*5);
00357 year[0] = keywords->keyword[0];
00358 year[1] = keywords->keyword[1];
00359 year[2] = keywords->keyword[2];
00360 year[3] = keywords->keyword[3];
00361 }
00362 }
00363 if ( year )
00364 last = year;
00365
00366 if ( month )
00367 last = month;
00368
00369 publisher = EXTRACTOR_extractLastByString(_("publisher"), keywords);
00370 if ( publisher )
00371 last = publisher;
00372
00373 organization = EXTRACTOR_extractLastByString(_("organization"), keywords);
00374 if ( organization )
00375 last = organization;
00376
00377 key = EXTRACTOR_extractLastByString(_("subject"), keywords);
00378 if ( key )
00379 last = key;
00380
00381 pages = EXTRACTOR_extractLastByString(_("page count"), keywords);
00382 if ( pages )
00383 last = pages;
00384
00385 tmp = str_splice(title, author, year);
00386 fprintf(handle,
00387 "@misc{ %s,\n",
00388 tmp);
00389 free(tmp);
00390 if ( title )
00391 fprintf(handle, " title = \"%s\"%s\n", title,
00392 (last == title)?"":",");
00393 if ( author )
00394 fprintf(handle, " author = \"%s\"%s\n", author,
00395 (last == author)?"":",");
00396 if ( note )
00397 fprintf(handle, " note = \"%s\"%s\n", note,
00398 (last == note)?"":",");
00399 if ( year )
00400 fprintf(handle, " year = \"%s\"%s\n", year,
00401 (last == year)?"":",");
00402 if ( month )
00403 fprintf(handle, " month = \"%s\"%s\n", month,
00404 (last == month)?"":",");
00405 if ( publisher )
00406 fprintf(handle, " publisher = \"%s\"%s\n", publisher,
00407 (last == publisher)?"":",");
00408 if ( organization )
00409 fprintf(handle, " organization = \"%s\"%s\n", organization,
00410 (last == organization)?"":",");
00411 if ( key )
00412 fprintf(handle, " key = \"%s\"%s\n", key,
00413 (last == key)?"":",");
00414 if ( pages )
00415 fprintf(handle, " pages = \"%s\"%s\n", pages,
00416 (last == pages)?"":",");
00417 if (month != NULL)
00418 free(month);
00419 if (year != NULL)
00420 free(year);
00421 fprintf(handle, "}\n\n");
00422 }
00423 }
00424
00425
00426
00427
00428
00429
00430
00431
00432 int
00433 main (int argc, char *argv[])
00434 {
00435 int i;
00436 EXTRACTOR_ExtractorList *extractors;
00437 EXTRACTOR_KeywordList *keywords;
00438 int option_index;
00439 int c;
00440 char * libraries = NULL;
00441 char * hash = NULL;
00442 int splitKeywords = NO;
00443 int verbose = 0;
00444 int useFilename = NO;
00445 int nodefault = NO;
00446 int *print;
00447 int defaultAll = YES;
00448 int duplicates = EXTRACTOR_DUPLICATES_REMOVE_UNKNOWN;
00449 int bibtex = NO;
00450 int grepfriendly = NO;
00451 char * binary = NULL;
00452 int ret = 0;
00453
00454 #ifdef MINGW
00455 InitWinEnv();
00456 #endif
00457 #if ENABLE_NLS
00458 setlocale(LC_ALL, "");
00459 textdomain("libextractor");
00460 BINDTEXTDOMAIN("libextractor", LOCALEDIR);
00461 #endif
00462 print = malloc (sizeof (int) * EXTRACTOR_getHighestKeywordTypeNumber ());
00463 for (i = 0; i < EXTRACTOR_getHighestKeywordTypeNumber (); i++)
00464 print[i] = YES;
00465
00466 while (1)
00467 {
00468 static struct option long_options[] = {
00469 {"all", 0, 0, 'a'},
00470 {"binary", 1, 0, 'B'},
00471 {"bibtex", 0, 0, 'b'},
00472 {"duplicates", 0, 0, 'd'},
00473 {"filename", 0, 0, 'f'},
00474 {"grep-friendly", 0, 0, 'g'},
00475 {"help", 0, 0, 'h'},
00476 {"hash", 1, 0, 'H'},
00477 {"list", 0, 0, 'L'},
00478 {"library", 1, 0, 'l'},
00479 {"nodefault", 0, 0, 'n'},
00480 {"print", 1, 0, 'p'},
00481 {"remove-duplicates", 0, 0, 'r'},
00482 {"split", 0, 0, 's'},
00483 {"verbose", 0, 0, 'V'},
00484 {"version", 0, 0, 'v'},
00485 {"exclude", 1, 0, 'x'},
00486 {0, 0, 0, 0}
00487 };
00488 option_index = 0;
00489 c = getopt_long (argc,
00490 argv, "vhbgl:nsH:fp:x:LVdraB:",
00491 long_options,
00492 &option_index);
00493
00494 if (c == -1)
00495 break;
00496 switch (c)
00497 {
00498 case 'a':
00499 duplicates = -1;
00500 break;
00501 case 'b':
00502 bibtex = YES;
00503 break;
00504 case 'B':
00505 binary = optarg;
00506 break;
00507 case 'd':
00508 duplicates = 0;
00509 break;
00510 case 'f':
00511 useFilename = YES;
00512 break;
00513 case 'g':
00514 grepfriendly = YES;
00515 break;
00516 case 'h':
00517 printHelp();
00518 return 0;
00519 case 'H':
00520 hash = optarg;
00521 break;
00522 case 'l':
00523 libraries = optarg;
00524 break;
00525 case 'L':
00526 i = 0;
00527 while (NULL != EXTRACTOR_getKeywordTypeAsString (i))
00528 printf ("%s\n",
00529 _(EXTRACTOR_getKeywordTypeAsString (i++)));
00530 return 0;
00531 case 'n':
00532 nodefault = YES;
00533 break;
00534 case 'p':
00535 if (optarg == NULL) {
00536 fprintf(stderr,
00537 _("You must specify an argument for the `%s' option (option ignored).\n"),
00538 "-p");
00539 break;
00540 }
00541 if (defaultAll == YES)
00542 {
00543 defaultAll = NO;
00544 i = 0;
00545 while (NULL != EXTRACTOR_getKeywordTypeAsString (i))
00546 print[i++] = NO;
00547 }
00548 i = 0;
00549 while (NULL != EXTRACTOR_getKeywordTypeAsString (i))
00550 {
00551 if ( (0 == strcmp (optarg, EXTRACTOR_getKeywordTypeAsString (i))) ||
00552 (0 == strcmp (optarg, _(EXTRACTOR_getKeywordTypeAsString (i)))) )
00553
00554 {
00555 print[i] = YES;
00556 break;
00557 }
00558 i++;
00559 }
00560 if (NULL == EXTRACTOR_getKeywordTypeAsString (i))
00561 {
00562 fprintf(stderr,
00563 "Unknown keyword type `%s', use option `%s' to get a list.\n",
00564 optarg,
00565 "-L");
00566 return -1;
00567 }
00568 break;
00569 case 'r':
00570 duplicates = EXTRACTOR_DUPLICATES_TYPELESS;
00571 break;
00572 case 's':
00573 splitKeywords = YES;
00574 break;
00575 case 'v':
00576 printf ("extract v%s\n", PACKAGE_VERSION);
00577 return 0;
00578 case 'V':
00579 verbose++;
00580 break;
00581 case 'x':
00582 i = 0;
00583 while (NULL != EXTRACTOR_getKeywordTypeAsString (i))
00584 {
00585 if ( (0 == strcmp (optarg, EXTRACTOR_getKeywordTypeAsString (i))) ||
00586 (0 == strcmp (optarg, _(EXTRACTOR_getKeywordTypeAsString (i)))) )
00587 {
00588 print[i] = NO;
00589 break;
00590 }
00591 i++;
00592 }
00593 if (NULL == EXTRACTOR_getKeywordTypeAsString (i))
00594 {
00595 fprintf (stderr,
00596 "Unknown keyword type `%s', use option `%s' to get a list.\n",
00597 optarg,
00598 "-L");
00599 #ifdef MINGW
00600 ShutdownWinEnv();
00601 #endif
00602 return -1;
00603 }
00604 break;
00605 default:
00606 fprintf (stderr,
00607 _("Use --help to get a list of options.\n"));
00608 #ifdef MINGW
00609 ShutdownWinEnv();
00610 #endif
00611 return -1;
00612 }
00613 }
00614
00615 if (argc - optind < 1)
00616 {
00617 fprintf (stderr,
00618 "Invoke with list of filenames to extract keywords form!\n");
00619 #ifdef MINGW
00620 ShutdownWinEnv();
00621 #endif
00622 free (print);
00623 return -1;
00624 }
00625
00626
00627 if (nodefault == NO)
00628 extractors = EXTRACTOR_loadDefaultLibraries ();
00629 else
00630 extractors = NULL;
00631 if (useFilename == YES)
00632 extractors = EXTRACTOR_addLibrary (extractors,
00633 "libextractor_filename");
00634 if (libraries != NULL)
00635 extractors = EXTRACTOR_loadConfigLibraries (extractors, libraries);
00636
00637 if (binary != NULL) {
00638 char * name;
00639 name = malloc(strlen(binary) + strlen("libextractor_printable_") + 1);
00640 strcpy(name, "libextractor_printable_");
00641 strcat(name, binary);
00642 extractors = EXTRACTOR_addLibraryLast(extractors,
00643 name);
00644 free(name);
00645 }
00646 if (hash != NULL) {
00647 char * name;
00648 name = malloc(strlen(hash) + strlen("libextractor_hash_") + 1);
00649 strcpy(name, "libextractor_hash_");
00650 strcat(name, hash);
00651 extractors = EXTRACTOR_addLibraryLast(extractors,
00652 name);
00653 free(name);
00654 }
00655
00656 if (splitKeywords == YES)
00657 extractors = EXTRACTOR_addLibraryLast(extractors,
00658 "libextractor_split");
00659
00660
00661 if ( bibtex == YES )
00662 fprintf(stdout,
00663 _("%% BiBTeX file\n"));
00664 for (i = optind; i < argc; i++) {
00665 errno = 0;
00666 keywords = EXTRACTOR_getKeywords (extractors, argv[i]);
00667 if (0 != errno) {
00668 if (verbose > 0) {
00669 fprintf(stderr,
00670 "%s: %s: %s\n",
00671 argv[0], argv[i], strerror(errno));
00672 }
00673 ret = 1;
00674 continue;
00675 }
00676 if ( (duplicates != -1) || (bibtex == YES))
00677 keywords = EXTRACTOR_removeDuplicateKeywords (keywords, duplicates);
00678 if ( (verbose > 0)
00679 && (bibtex == NO) ) {
00680 if (grepfriendly == YES)
00681 printf ("%s ", argv[i]);
00682 else
00683 printf (_("Keywords for file %s:\n"),
00684 argv[i]);
00685 }
00686 if (bibtex == YES)
00687 printSelectedKeywordsBibtex (stdout, keywords, print, argv[i]);
00688 else if (grepfriendly == YES)
00689 printSelectedKeywordsGrepFriendly(stdout, keywords, print, verbose);
00690 else
00691 printSelectedKeywords (stdout, keywords, print, verbose);
00692 if (verbose > 0 && bibtex == NO)
00693 printf ("\n");
00694 EXTRACTOR_freeKeywords (keywords);
00695 }
00696 free (print);
00697 EXTRACTOR_removeAll (extractors);
00698
00699 #ifdef MINGW
00700 ShutdownWinEnv();
00701 #endif
00702
00703 return ret;
00704 }