00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #include "platform.h"
00029 #include "extractor.h"
00030 #include "convert.h"
00031
00032 #include <glib-object.h>
00033 #include <string.h>
00034 #include <stdio.h>
00035 #include <ctype.h>
00036
00037 #include <gsf/gsf-utils.h>
00038 #include <gsf/gsf-input-memory.h>
00039 #include <gsf/gsf-infile.h>
00040 #include <gsf/gsf-infile-msole.h>
00041 #include <gsf/gsf-msole-utils.h>
00042
00043 #define DEBUG_OLE2 0
00044
00045
00046
00047 static struct EXTRACTOR_Keywords *
00048 addKeyword(EXTRACTOR_KeywordList *oldhead,
00049 const char *phrase,
00050 EXTRACTOR_KeywordType type) {
00051 EXTRACTOR_KeywordList * keyword;
00052
00053 if (strlen(phrase) == 0)
00054 return oldhead;
00055 if (0 == strcmp(phrase, "\"\""))
00056 return oldhead;
00057 if (0 == strcmp(phrase, "\" \""))
00058 return oldhead;
00059 if (0 == strcmp(phrase, " "))
00060 return oldhead;
00061 keyword = malloc(sizeof(EXTRACTOR_KeywordList));
00062 keyword->next = oldhead;
00063 keyword->keyword = strdup(phrase);
00064 keyword->keywordType = type;
00065 return keyword;
00066 }
00067
00068
/*
 * Three 16-byte identifier constants.  None of them is referenced by the
 * code visible in this file.
 * NOTE(review): presumably these are the OLE2 property-set format IDs
 * (summary information, document summary information and user-defined
 * properties) in their on-disk byte order -- confirm before relying on it.
 */
static guint8 const component_guid [] = {
  0xe0, 0x85, 0x9f, 0xf2, 0xf9, 0x4f, 0x68, 0x10,
  0xab, 0x91, 0x08, 0x00, 0x2b, 0x27, 0xb3, 0xd9
};

/* Differs from user_guid only in its first byte (0x02 vs. 0x05). */
static guint8 const document_guid [] = {
  0x02, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
  0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
};

/* Differs from document_guid only in its first byte (0x05 vs. 0x02). */
static guint8 const user_guid [] = {
  0x05, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
  0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
};
00083
00084 typedef struct {
00085 char * text;
00086 EXTRACTOR_KeywordType type;
00087 } Matches;
00088
00089 static Matches tmap[] = {
00090 { "Title", EXTRACTOR_TITLE },
00091 { "PresentationFormat", EXTRACTOR_FORMAT },
00092 { "Category", EXTRACTOR_DESCRIPTION },
00093 { "Manager", EXTRACTOR_MANAGER },
00094 { "Company", EXTRACTOR_COMPANY },
00095 { "Subject", EXTRACTOR_SUBJECT },
00096 { "Author", EXTRACTOR_AUTHOR },
00097 { "Keywords", EXTRACTOR_KEYWORDS },
00098 { "Comments", EXTRACTOR_COMMENT },
00099 { "Template", EXTRACTOR_TEMPLATE },
00100 { "NumPages", EXTRACTOR_PAGE_COUNT },
00101 { "AppName", EXTRACTOR_SOFTWARE },
00102 { "RevisionNumber", EXTRACTOR_VERSIONNUMBER },
00103 { "Dictionary", EXTRACTOR_LANGUAGE },
00104 { "NumBytes", EXTRACTOR_SIZE },
00105 { "CreatedTime", EXTRACTOR_CREATION_DATE },
00106 { "LastSavedTime" , EXTRACTOR_MODIFICATION_DATE },
00107 { "gsf:company", EXTRACTOR_COMPANY },
00108
00109 { "gsf:character-count", EXTRACTOR_CHARACTER_COUNT },
00110 { "gsf:page-count", EXTRACTOR_PAGE_COUNT },
00111 { "gsf:line-count", EXTRACTOR_LINE_COUNT },
00112 { "gsf:word-count", EXTRACTOR_WORD_COUNT },
00113 { "gsf:paragraph-count", EXTRACTOR_PARAGRAPH_COUNT },
00114 { "gsf:last-saved-by", EXTRACTOR_LAST_SAVED_BY },
00115
00116 { "gsf:manager", EXTRACTOR_MANAGER },
00117 { "dc:title", EXTRACTOR_TITLE },
00118 { "dc:creator", EXTRACTOR_CREATOR },
00119 { "dc:date", EXTRACTOR_DATE },
00120 { "dc:subject", EXTRACTOR_SUBJECT },
00121 { "dc:keywords", EXTRACTOR_KEYWORDS },
00122 { "dc:last-printed", EXTRACTOR_LAST_PRINTED },
00123 { "dc:description", EXTRACTOR_DESCRIPTION },
00124 { "meta:creation-date", EXTRACTOR_CREATION_DATE },
00125
00126 { "meta:generator", EXTRACTOR_GENERATOR },
00127 { "meta:template", EXTRACTOR_TEMPLATE },
00128
00129
00130 { NULL, 0 },
00131 };
00132
00133 static void processMetadata(gpointer key,
00134 gpointer value,
00135 gpointer user_data) {
00136 struct EXTRACTOR_Keywords ** pprev = user_data;
00137 const char * type = key;
00138 const GsfDocProp * prop = value;
00139 const GValue * gval;
00140 char * contents;
00141 int pos;
00142
00143 if ( (key == NULL) ||
00144 (value == NULL) )
00145 return;
00146 gval = gsf_doc_prop_get_val(prop);
00147
00148 if (G_VALUE_TYPE(gval) == G_TYPE_STRING) {
00149 contents = strdup(g_value_get_string(gval));
00150 } else {
00151
00152 contents = g_strdup_value_contents(gval);
00153 }
00154 if (contents == NULL)
00155 return;
00156 if ( (strlen(contents) > 0) &&
00157 (contents[strlen(contents)-1] == '\n') )
00158 contents[strlen(contents)-1] = '\0';
00159 pos = 0;
00160 while (tmap[pos].text != NULL) {
00161 if (0 == strcmp(tmap[pos].text,
00162 type))
00163 break;
00164 pos++;
00165 }
00166 if (tmap[pos].text != NULL)
00167 *pprev = addKeyword(*pprev,
00168 contents,
00169 tmap[pos].type);
00170 #if DEBUG_OLE2
00171 else
00172 printf("No match for type `%s'\n",
00173 type);
00174 #endif
00175 free(contents);
00176 }
00177
00178
00179 static struct EXTRACTOR_Keywords *
00180 process(GsfInput * in,
00181 struct EXTRACTOR_Keywords * prev) {
00182 GsfDocMetaData * sections;
00183 GError * error;
00184
00185 sections = gsf_doc_meta_data_new();
00186 error = gsf_msole_metadata_read(in, sections);
00187 if (error == NULL) {
00188 gsf_doc_meta_data_foreach(sections,
00189 &processMetadata,
00190 &prev);
00191 }
00192 g_object_unref(G_OBJECT(sections));
00193 return prev;
00194 }
00195
00196 static struct EXTRACTOR_Keywords *
00197 processSO(GsfInput * src,
00198 struct EXTRACTOR_Keywords * prev) {
00199 off_t size;
00200 char * buf;
00201
00202 size = gsf_input_size(src);
00203 if (size < 0x374)
00204 return prev;
00205 buf = malloc(size);
00206 gsf_input_read(src, size, (unsigned char*) buf);
00207 if ( (buf[0] != 0x0F) ||
00208 (buf[1] != 0x0) ||
00209 (0 != strncmp(&buf[2],
00210 "SfxDocumentInfo",
00211 strlen("SfxDocumentInfo"))) ||
00212 (buf[0x11] != 0x0B) ||
00213 (buf[0x13] != 0x00) ||
00214 (buf[0x12] != 0x00) ) {
00215 free(buf);
00216 return prev;
00217 }
00218 buf[0xd3] = '\0';
00219 if (buf[0x94] + buf[0x93] > 0)
00220 prev = addKeyword(prev,
00221 &buf[0x95],
00222 EXTRACTOR_TITLE);
00223 buf[0x114] = '\0';
00224 if (buf[0xd5] + buf[0xd4] > 0)
00225 prev = addKeyword(prev,
00226 &buf[0xd6],
00227 EXTRACTOR_SUBJECT);
00228 buf[0x215] = '\0';
00229 if (buf[0x115] + buf[0x116] > 0)
00230 prev = addKeyword(prev,
00231 &buf[0x117],
00232 EXTRACTOR_COMMENT);
00233 buf[0x296] = '\0';
00234 if (buf[0x216] + buf[0x217] > 0)
00235 prev = addKeyword(prev,
00236 &buf[0x218],
00237 EXTRACTOR_KEYWORDS);
00238
00239
00240
00241 free(buf);
00242 return prev;
00243 }
00244
00245
00246
/* Translate a language name using the "iso-639" message domain. */
#define __(a) dgettext("iso-639", (a))

/**
 * Map a Microsoft language identifier (LID) to a human-readable,
 * translated language name.
 *
 * Names translated with __() are looked up in the "iso-639" domain;
 * names translated with _() use the default domain.
 * NOTE(review): a few names passed to __() ("Brazilian Portuguese",
 * "Bahasa", "Byelorussian") may not exist in the iso-639 catalogue and
 * would then come back untranslated -- confirm against the catalogue.
 *
 * @param lid language identifier
 * @return the language name, or NULL if the identifier is not known
 */
static const char * lidToLanguage( unsigned int lid ) {
  switch ( lid ) {
  case 0x0400: return _("No Proofing");
  case 0x0401: return __("Arabic");
  case 0x0402: return __("Bulgarian");
  case 0x0403: return __("Catalan");
  case 0x0404: return _("Traditional Chinese");
  case 0x0804: return _("Simplified Chinese");
  case 0x0405: return __("Czech");  /* was mislabelled "Chechen" */
  case 0x0406: return __("Danish");
  case 0x0407: return __("German");
  case 0x0807: return _("Swiss German");
  case 0x0408: return __("Greek");
  case 0x0409: return _("U.S. English");
  case 0x0809: return _("U.K. English");
  case 0x0c09: return _("Australian English");
  case 0x040a: return _("Castilian Spanish");
  case 0x080a: return _("Mexican Spanish");
  case 0x040b: return __("Finnish");
  case 0x040c: return __("French");
  case 0x080c: return _("Belgian French");
  case 0x0c0c: return _("Canadian French");
  case 0x100c: return _("Swiss French");
  case 0x040d: return __("Hebrew");
  case 0x040e: return __("Hungarian");
  case 0x040f: return __("Icelandic");
  case 0x0410: return __("Italian");
  case 0x0810: return _("Swiss Italian");
  case 0x0411: return __("Japanese");
  case 0x0412: return __("Korean");
  case 0x0413: return __("Dutch");
  case 0x0813: return _("Belgian Dutch");
  case 0x0414: return _("Norwegian Bokmal");
  case 0x0814: return __("Norwegian Nynorsk");
  case 0x0415: return __("Polish");
  case 0x0416: return __("Brazilian Portuguese");
  case 0x0816: return __("Portuguese");
  case 0x0417: return _("Rhaeto-Romanic");
  case 0x0418: return __("Romanian");
  case 0x0419: return __("Russian");
  case 0x041a: return _("Croato-Serbian (Latin)");
  case 0x081a: return _("Serbo-Croatian (Cyrillic)");
  case 0x041b: return __("Slovak");
  case 0x041c: return __("Albanian");
  case 0x041d: return __("Swedish");
  case 0x041e: return __("Thai");
  case 0x041f: return __("Turkish");
  case 0x0420: return __("Urdu");
  case 0x0421: return __("Bahasa");
  case 0x0422: return __("Ukrainian");
  case 0x0423: return __("Byelorussian");
  case 0x0424: return __("Slovenian");
  case 0x0425: return __("Estonian");
  case 0x0426: return __("Latvian");
  case 0x0427: return __("Lithuanian");
  case 0x0429: return _("Farsi");
  case 0x042D: return __("Basque");
  case 0x042F: return __("Macedonian");
  case 0x0436: return __("Afrikaans");
  case 0x043E: return __("Malay");  /* was mislabelled "Malayalam" */
  default:
    return NULL;
  }
}
00371
00372
00373 static struct EXTRACTOR_Keywords *
00374 history_extract(GsfInput * stream,
00375 unsigned int lcbSttbSavedBy,
00376 unsigned int fcSttbSavedBy,
00377 struct EXTRACTOR_Keywords * prev) {
00378 unsigned int where = 0;
00379 unsigned char * lbuffer;
00380 unsigned int i;
00381 unsigned int length;
00382 char * author;
00383 char * filename;
00384 char * rbuf;
00385 unsigned int nRev;
00386
00387
00388 gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET);
00389 if (gsf_input_remaining(stream) < lcbSttbSavedBy)
00390 return prev;
00391 lbuffer = malloc(lcbSttbSavedBy);
00392
00393 gsf_input_read(stream, lcbSttbSavedBy, lbuffer);
00394
00395 nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
00396 where = 6;
00397 for (i=0; i < nRev; i++) {
00398 if (where >= lcbSttbSavedBy)
00399 break;
00400 length = lbuffer[where++];
00401 if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
00402 (where + 2 * length + 2 <= where) )
00403 break;
00404 author = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
00405 length * 2,
00406 "UTF-16BE");
00407 where += length * 2 + 1;
00408 length = lbuffer[where++];
00409 if ( (where + 2 * length >= lcbSttbSavedBy) ||
00410 (where + 2 * length + 1 <= where) ) {
00411 free(author);
00412 break;
00413 }
00414 filename = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
00415 length * 2,
00416 "UTF-16BE");
00417 where += length * 2 + 1;
00418 rbuf = malloc(strlen(author) + strlen(filename) + 512);
00419 snprintf(rbuf, 512 + strlen(author) + strlen(filename),
00420 _("Revision #%u: Author '%s' worked on '%s'"),
00421 i, author, filename);
00422 free(author);
00423 free(filename);
00424 prev = addKeyword(prev,
00425 rbuf,
00426 EXTRACTOR_REVISION_HISTORY);
00427 free(rbuf);
00428 }
00429 free(lbuffer);
00430 return prev;
00431 }
00432
00433
00434
00435
00436 struct EXTRACTOR_Keywords *
00437 libextractor_ole2_extract(const char * filename,
00438 const char * data,
00439 size_t size,
00440 struct EXTRACTOR_Keywords * prev) {
00441 GsfInput * input;
00442 GsfInfile * infile;
00443 GsfInput * src;
00444 const char * name;
00445 const char * generator = NULL;
00446 int i;
00447 unsigned int lcb;
00448 unsigned int fcb;
00449 const unsigned char * data512;
00450 unsigned int lid;
00451 const char * lang;
00452
00453 if (size < 512 + 898)
00454 return prev;
00455 input = gsf_input_memory_new((const guint8 *) data,
00456 (gsf_off_t) size,
00457 FALSE);
00458 if (input == NULL)
00459 return prev;
00460
00461 infile = gsf_infile_msole_new(input, NULL);
00462 if (infile == NULL) {
00463 g_object_unref(G_OBJECT(input));
00464 return prev;
00465 }
00466 lcb = 0;
00467 fcb = 0;
00468 for (i=0;i<gsf_infile_num_children(infile);i++) {
00469 name = gsf_infile_name_by_index (infile, i);
00470 src = NULL;
00471 if (name == NULL)
00472 continue;
00473 if ( (0 == strcmp(name, "\005SummaryInformation"))
00474 || (0 == strcmp(name, "\005DocumentSummaryInformation")) ) {
00475 src = gsf_infile_child_by_index (infile, i);
00476 if (src != NULL)
00477 prev = process(src,
00478 prev);
00479 }
00480 if (0 == strcmp(name, "SfxDocumentInfo")) {
00481 src = gsf_infile_child_by_index (infile, i);
00482 if (src != NULL)
00483 prev = processSO(src,
00484 prev);
00485 }
00486 if (src != NULL)
00487 g_object_unref(G_OBJECT(src));
00488 }
00489
00490 data512 = (const unsigned char*) &data[512];
00491 lid = data512[6] + (data512[7] << 8);
00492 lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24);
00493 fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24);
00494 lang = lidToLanguage(lid);
00495 if (lang != NULL) {
00496 prev = addKeyword(prev,
00497 lang,
00498 EXTRACTOR_LANGUAGE);
00499 }
00500 if (lcb >= 6) {
00501 for (i=0;i<gsf_infile_num_children(infile);i++) {
00502 name = gsf_infile_name_by_index (infile, i);
00503 if (name == NULL)
00504 continue;
00505 if ( (0 == strcmp(name, "1Table")) ||
00506 (0 == strcmp(name, "0Table")) ) {
00507 src = gsf_infile_child_by_index (infile, i);
00508 if (src != NULL) {
00509 prev = history_extract(src,
00510 lcb,
00511 fcb,
00512 prev);
00513 g_object_unref(G_OBJECT(src));
00514 }
00515 }
00516 }
00517 }
00518 g_object_unref(G_OBJECT(infile));
00519 g_object_unref(G_OBJECT(input));
00520
00521
00522
00523
00524 generator = EXTRACTOR_extractLast(EXTRACTOR_GENERATOR, prev);
00525 if (NULL == generator) {
00526
00527
00528
00529 if ( (8 < size)
00530 && (0 == memcmp(data, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) )
00531 generator = "Microsoft Office";
00532 }
00533
00534 if(NULL != generator) {
00535 const char * mimetype = "application/vnd.ms-files";
00536
00537 if((0 == strncmp(generator, "Microsoft Word", 14)) ||
00538 (0 == strncmp(generator, "Microsoft Office Word", 21)))
00539 mimetype = "application/msword";
00540 else if((0 == strncmp(generator, "Microsoft Excel", 15)) ||
00541 (0 == strncmp(generator, "Microsoft Office Excel", 22)))
00542 mimetype = "application/vnd.ms-excel";
00543 else if((0 == strncmp(generator, "Microsoft PowerPoint", 20)) ||
00544 (0 == strncmp(generator, "Microsoft Office PowerPoint", 27)))
00545 mimetype = "application/vnd.ms-powerpoint";
00546 else if(0 == strncmp(generator, "Microsoft Project", 17))
00547 mimetype = "application/vnd.ms-project";
00548 else if(0 == strncmp(generator, "Microsoft Visio", 15))
00549 mimetype = "application/vnd.visio";
00550 else if(0 == strncmp(generator, "Microsoft Office", 16))
00551 mimetype = "application/vnd.ms-office";
00552
00553 prev = addKeyword(prev, mimetype, EXTRACTOR_MIMETYPE);
00554 }
00555
00556 return prev;
00557 }
00558 static void nolog (const gchar *log_domain,
00559 GLogLevelFlags log_level,
00560 const gchar *message,
00561 gpointer user_data) {
00562 }
00563
00564 void __attribute__ ((constructor)) ole2_ltdl_init() {
00565 g_type_init();
00566 #ifdef HAVE_GSF_INIT
00567 gsf_init();
00568 #endif
00569
00570 g_log_set_handler ("libgsf:msole", G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING, &nolog, NULL);
00571
00572 }
00573
/**
 * Library destructor: shut libgsf down again when HAVE_GSF_INIT is
 * defined; otherwise this does nothing.
 */
void __attribute__ ((destructor)) ole2_ltdl_fini(void) {
#ifdef HAVE_GSF_INIT
  gsf_shutdown();
#endif
}
00580
00581
00582