#include "platform.h"
#include "extractor.h"
#include "convert.h"
#include <glib-object.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include <gsf/gsf-utils.h>
#include <gsf/gsf-input-memory.h>
#include <gsf/gsf-infile.h>
#include <gsf/gsf-infile-msole.h>
#include <gsf/gsf-msole-utils.h>

Go to the source code of this file.
Data Structures | |
| struct | Matches |
Defines | |
| #define | DEBUG_OLE2 0 |
| #define | __(a) dgettext("iso-639", a) |
Functions | |
| static struct EXTRACTOR_Keywords * | addKeyword (EXTRACTOR_KeywordList *oldhead, const char *phrase, EXTRACTOR_KeywordType type) |
| static void | processMetadata (gpointer key, gpointer value, gpointer user_data) |
| static struct EXTRACTOR_Keywords * | process (GsfInput *in, struct EXTRACTOR_Keywords *prev) |
| static struct EXTRACTOR_Keywords * | processSO (GsfInput *src, struct EXTRACTOR_Keywords *prev) |
| static const char * | lidToLanguage (unsigned int lid) |
| static struct EXTRACTOR_Keywords * | history_extract (GsfInput *stream, unsigned int lcbSttbSavedBy, unsigned int fcSttbSavedBy, struct EXTRACTOR_Keywords *prev) |
| EXTRACTOR_Keywords * | libextractor_ole2_extract (const char *filename, const char *data, size_t size, struct EXTRACTOR_Keywords *prev) |
| static void | nolog (const gchar *log_domain, GLogLevelFlags log_level, const gchar *message, gpointer user_data) |
| void | __attribute__ ((constructor)) |
| void | __attribute__ ((destructor)) |
Variables | |
| static guint8 const | component_guid [] |
| static guint8 const | document_guid [] |
| static guint8 const | user_guid [] |
| static Matches | tmap [] |
| #define __ | ( | a | ) | dgettext("iso-639", a) |
| #define DEBUG_OLE2 0 |
Definition at line 43 of file ole2extractor.c.
| void __attribute__ | ( | (destructor) | ) |
Definition at line 574 of file ole2extractor.c.
00574 { 00575 #ifdef HAVE_GSF_INIT 00576 gsf_shutdown(); 00577 #endif 00578 // gsf_shutdown_dynamic(NULL); 00579 }
| void __attribute__ | ( | (constructor) | ) |
Definition at line 564 of file ole2extractor.c.
00564 { 00565 g_type_init(); 00566 #ifdef HAVE_GSF_INIT 00567 gsf_init(); 00568 #endif 00569 /* disable logging -- thanks, Jody! */ 00570 g_log_set_handler ("libgsf:msole", G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING, &nolog, NULL); 00571 // gsf_init_dynamic(NULL); 00572 }
| static struct EXTRACTOR_Keywords* addKeyword | ( | EXTRACTOR_KeywordList * | oldhead, | |
| const char * | phrase, | |||
| EXTRACTOR_KeywordType | type | |||
| ) | [static] |
Definition at line 48 of file ole2extractor.c.
References EXTRACTOR_Keywords::keyword, and malloc.
00050 { 00051 EXTRACTOR_KeywordList * keyword; 00052 00053 if (strlen(phrase) == 0) 00054 return oldhead; 00055 if (0 == strcmp(phrase, "\"\"")) 00056 return oldhead; 00057 if (0 == strcmp(phrase, "\" \"")) 00058 return oldhead; 00059 if (0 == strcmp(phrase, " ")) 00060 return oldhead; 00061 keyword = malloc(sizeof(EXTRACTOR_KeywordList)); 00062 keyword->next = oldhead; 00063 keyword->keyword = strdup(phrase); 00064 keyword->keywordType = type; 00065 return keyword; 00066 }
| static struct EXTRACTOR_Keywords* history_extract | ( | GsfInput * | stream, | |
| unsigned int | lcbSttbSavedBy, | |||
| unsigned int | fcSttbSavedBy, | |||
| struct EXTRACTOR_Keywords * | prev | |||
| ) | [static] |
Definition at line 374 of file ole2extractor.c.
References _, addKeyword(), EXTRACTOR_common_convert_to_utf8(), EXTRACTOR_REVISION_HISTORY, filename, free, and malloc.
Referenced by libextractor_ole2_extract().
00377 { 00378 unsigned int where = 0; 00379 unsigned char * lbuffer; 00380 unsigned int i; 00381 unsigned int length; 00382 char * author; 00383 char * filename; 00384 char * rbuf; 00385 unsigned int nRev; 00386 00387 // goto offset of revision 00388 gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET); 00389 if (gsf_input_remaining(stream) < lcbSttbSavedBy) 00390 return prev; 00391 lbuffer = malloc(lcbSttbSavedBy); 00392 // read all the revision history 00393 gsf_input_read(stream, lcbSttbSavedBy, lbuffer); 00394 // there are n strings, so n/2 revisions (author & file) 00395 nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2; 00396 where = 6; 00397 for (i=0; i < nRev; i++) { 00398 if (where >= lcbSttbSavedBy) 00399 break; 00400 length = lbuffer[where++]; 00401 if ( (where + 2 * length + 2 >= lcbSttbSavedBy) || 00402 (where + 2 * length + 2 <= where) ) 00403 break; 00404 author = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where], 00405 length * 2, 00406 "UTF-16BE"); 00407 where += length * 2 + 1; 00408 length = lbuffer[where++]; 00409 if ( (where + 2 * length >= lcbSttbSavedBy) || 00410 (where + 2 * length + 1 <= where) ) { 00411 free(author); 00412 break; 00413 } 00414 filename = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where], 00415 length * 2, 00416 "UTF-16BE"); 00417 where += length * 2 + 1; 00418 rbuf = malloc(strlen(author) + strlen(filename) + 512); 00419 snprintf(rbuf, 512 + strlen(author) + strlen(filename), 00420 _("Revision #%u: Author '%s' worked on '%s'"), 00421 i, author, filename); 00422 free(author); 00423 free(filename); 00424 prev = addKeyword(prev, 00425 rbuf, 00426 EXTRACTOR_REVISION_HISTORY); 00427 free(rbuf); 00428 } 00429 free(lbuffer); 00430 return prev; 00431 }
| struct EXTRACTOR_Keywords* libextractor_ole2_extract | ( | const char * | filename, | |
| const char * | data, | |||
| size_t | size, | |||
| struct EXTRACTOR_Keywords * | prev | |||
| ) |
Definition at line 437 of file ole2extractor.c.
References addKeyword(), EXTRACTOR_extractLast(), EXTRACTOR_GENERATOR, EXTRACTOR_LANGUAGE, EXTRACTOR_MIMETYPE, history_extract(), lidToLanguage(), name, NULL, process(), processSO(), and src.
00440 { 00441 GsfInput * input; 00442 GsfInfile * infile; 00443 GsfInput * src; 00444 const char * name; 00445 const char * generator = NULL; 00446 int i; 00447 unsigned int lcb; 00448 unsigned int fcb; 00449 const unsigned char * data512; 00450 unsigned int lid; 00451 const char * lang; 00452 00453 if (size < 512 + 898) 00454 return prev; /* can hardly be OLE2 */ 00455 input = gsf_input_memory_new((const guint8 *) data, 00456 (gsf_off_t) size, 00457 FALSE); 00458 if (input == NULL) 00459 return prev; 00460 00461 infile = gsf_infile_msole_new(input, NULL); 00462 if (infile == NULL) { 00463 g_object_unref(G_OBJECT(input)); 00464 return prev; 00465 } 00466 lcb = 0; 00467 fcb = 0; 00468 for (i=0;i<gsf_infile_num_children(infile);i++) { 00469 name = gsf_infile_name_by_index (infile, i); 00470 src = NULL; 00471 if (name == NULL) 00472 continue; 00473 if ( (0 == strcmp(name, "\005SummaryInformation")) 00474 || (0 == strcmp(name, "\005DocumentSummaryInformation")) ) { 00475 src = gsf_infile_child_by_index (infile, i); 00476 if (src != NULL) 00477 prev = process(src, 00478 prev); 00479 } 00480 if (0 == strcmp(name, "SfxDocumentInfo")) { 00481 src = gsf_infile_child_by_index (infile, i); 00482 if (src != NULL) 00483 prev = processSO(src, 00484 prev); 00485 } 00486 if (src != NULL) 00487 g_object_unref(G_OBJECT(src)); 00488 } 00489 00490 data512 = (const unsigned char*) &data[512]; 00491 lid = data512[6] + (data512[7] << 8); 00492 lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24); 00493 fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24); 00494 lang = lidToLanguage(lid); 00495 if (lang != NULL) { 00496 prev = addKeyword(prev, 00497 lang, 00498 EXTRACTOR_LANGUAGE); 00499 } 00500 if (lcb >= 6) { 00501 for (i=0;i<gsf_infile_num_children(infile);i++) { 00502 name = gsf_infile_name_by_index (infile, i); 00503 if (name == NULL) 00504 continue; 00505 if ( (0 == strcmp(name, "1Table")) || 00506 (0 == 
strcmp(name, "0Table")) ) { 00507 src = gsf_infile_child_by_index (infile, i); 00508 if (src != NULL) { 00509 prev = history_extract(src, 00510 lcb, 00511 fcb, 00512 prev); 00513 g_object_unref(G_OBJECT(src)); 00514 } 00515 } 00516 } 00517 } 00518 g_object_unref(G_OBJECT(infile)); 00519 g_object_unref(G_OBJECT(input)); 00520 00521 /* 00522 * Hack to return an appropriate mimetype 00523 */ 00524 generator = EXTRACTOR_extractLast(EXTRACTOR_GENERATOR, prev); 00525 if (NULL == generator) { 00526 /* 00527 * when very puzzled, just look at file magic number 00528 */ 00529 if ( (8 < size) 00530 && (0 == memcmp(data, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) ) 00531 generator = "Microsoft Office"; 00532 } 00533 00534 if(NULL != generator) { 00535 const char * mimetype = "application/vnd.ms-files"; 00536 00537 if((0 == strncmp(generator, "Microsoft Word", 14)) || 00538 (0 == strncmp(generator, "Microsoft Office Word", 21))) 00539 mimetype = "application/msword"; 00540 else if((0 == strncmp(generator, "Microsoft Excel", 15)) || 00541 (0 == strncmp(generator, "Microsoft Office Excel", 22))) 00542 mimetype = "application/vnd.ms-excel"; 00543 else if((0 == strncmp(generator, "Microsoft PowerPoint", 20)) || 00544 (0 == strncmp(generator, "Microsoft Office PowerPoint", 27))) 00545 mimetype = "application/vnd.ms-powerpoint"; 00546 else if(0 == strncmp(generator, "Microsoft Project", 17)) 00547 mimetype = "application/vnd.ms-project"; 00548 else if(0 == strncmp(generator, "Microsoft Visio", 15)) 00549 mimetype = "application/vnd.visio"; 00550 else if(0 == strncmp(generator, "Microsoft Office", 16)) 00551 mimetype = "application/vnd.ms-office"; 00552 00553 prev = addKeyword(prev, mimetype, EXTRACTOR_MIMETYPE); 00554 } 00555 00556 return prev; 00557 }
| static const char* lidToLanguage | ( | unsigned int | lid | ) | [static] |
Definition at line 249 of file ole2extractor.c.
Referenced by libextractor_ole2_extract().
/**
 * Map a language identifier (LID) to a human-readable language name.
 *
 * Names wrapped in __() are looked up in the "iso-639" message domain;
 * names wrapped in _() go through the translation macro that is defined
 * outside this listing.
 *
 * @param lid language identifier read from the document
 * @return the (translated) language name, or NULL if the identifier is
 *         not in the table
 */
static const char *
lidToLanguage(unsigned int lid)
{
  switch ( lid ) {
  case 0x0400:
    return _("No Proofing");
  case 0x0401:
    return __("Arabic");
  case 0x0402:
    return __("Bulgarian");
  case 0x0403:
    return __("Catalan");
  case 0x0404:
    return _("Traditional Chinese");
  case 0x0804:
    return _("Simplified Chinese");
  case 0x0405:
    /* 0x0405 is Czech; it was previously reported as "Chechen" */
    return __("Czech");
  case 0x0406:
    return __("Danish");
  case 0x0407:
    return __("German");
  case 0x0807:
    return _("Swiss German");
  case 0x0408:
    return __("Greek");
  case 0x0409:
    return _("U.S. English");
  case 0x0809:
    return _("U.K. English");
  case 0x0c09:
    return _("Australian English");
  case 0x040a:
    return _("Castilian Spanish");
  case 0x080a:
    return _("Mexican Spanish");
  case 0x040b:
    return __("Finnish");
  case 0x040c:
    return __("French");
  case 0x080c:
    return _("Belgian French");
  case 0x0c0c:
    return _("Canadian French");
  case 0x100c:
    return _("Swiss French");
  case 0x040d:
    return __("Hebrew");
  case 0x040e:
    return __("Hungarian");
  case 0x040f:
    return __("Icelandic");
  case 0x0410:
    return __("Italian");
  case 0x0810:
    return _("Swiss Italian");
  case 0x0411:
    return __("Japanese");
  case 0x0412:
    return __("Korean");
  case 0x0413:
    return __("Dutch");
  case 0x0813:
    return _("Belgian Dutch");
  case 0x0414:
    return _("Norwegian Bokmal");
  case 0x0814:
    return __("Norwegian Nynorsk");
  case 0x0415:
    return __("Polish");
  case 0x0416:
    return __("Brazilian Portuguese");
  case 0x0816:
    return __("Portuguese");
  case 0x0417:
    return _("Rhaeto-Romanic");
  case 0x0418:
    return __("Romanian");
  case 0x0419:
    return __("Russian");
  case 0x041a:
    return _("Croato-Serbian (Latin)");
  case 0x081a:
    return _("Serbo-Croatian (Cyrillic)");
  case 0x041b:
    return __("Slovak");
  case 0x041c:
    return __("Albanian");
  case 0x041d:
    return __("Swedish");
  case 0x041e:
    return __("Thai");
  case 0x041f:
    return __("Turkish");
  case 0x0420:
    return __("Urdu");
  case 0x0421:
    return __("Bahasa");
  case 0x0422:
    return __("Ukrainian");
  case 0x0423:
    return __("Byelorussian");
  case 0x0424:
    return __("Slovenian");
  case 0x0425:
    return __("Estonian");
  case 0x0426:
    return __("Latvian");
  case 0x0427:
    return __("Lithuanian");
  case 0x0429:
    return _("Farsi");
  case 0x042D:
    return __("Basque");
  case 0x042F:
    return __("Macedonian");
  case 0x0436:
    return __("Afrikaans");
  case 0x043E:
    /* 0x043E is Malay; it was previously reported as "Malayalam" */
    return __("Malay");
  default:
    return NULL;
  }
}
| static void nolog | ( | const gchar * | log_domain, | |
| GLogLevelFlags | log_level, | |||
| const gchar * | message, | |||
| gpointer | user_data | |||
| ) | [static] |
| static struct EXTRACTOR_Keywords* process | ( | GsfInput * | in, | |
| struct EXTRACTOR_Keywords * | prev | |||
| ) | [static] |
Definition at line 180 of file ole2extractor.c.
References error(), NULL, and processMetadata().
Referenced by libextractor_ole2_extract(), and testKeyword().
00181 { 00182 GsfDocMetaData * sections; 00183 GError * error; 00184 00185 sections = gsf_doc_meta_data_new(); 00186 error = gsf_msole_metadata_read(in, sections); 00187 if (error == NULL) { 00188 gsf_doc_meta_data_foreach(sections, 00189 &processMetadata, 00190 &prev); 00191 } 00192 g_object_unref(G_OBJECT(sections)); 00193 return prev; 00194 }
| static void processMetadata | ( | gpointer | key, | |
| gpointer | value, | |||
| gpointer | user_data | |||
| ) | [static] |
Definition at line 133 of file ole2extractor.c.
References addKeyword(), free, NULL, printf, tmap, and type.
Referenced by process().
00135 { 00136 struct EXTRACTOR_Keywords ** pprev = user_data; 00137 const char * type = key; 00138 const GsfDocProp * prop = value; 00139 const GValue * gval; 00140 char * contents; 00141 int pos; 00142 00143 if ( (key == NULL) || 00144 (value == NULL) ) 00145 return; 00146 gval = gsf_doc_prop_get_val(prop); 00147 00148 if (G_VALUE_TYPE(gval) == G_TYPE_STRING) { 00149 contents = strdup(g_value_get_string(gval)); 00150 } else { 00151 /* convert other formats? */ 00152 contents = g_strdup_value_contents(gval); 00153 } 00154 if (contents == NULL) 00155 return; 00156 if ( (strlen(contents) > 0) && 00157 (contents[strlen(contents)-1] == '\n') ) 00158 contents[strlen(contents)-1] = '\0'; 00159 pos = 0; 00160 while (tmap[pos].text != NULL) { 00161 if (0 == strcmp(tmap[pos].text, 00162 type)) 00163 break; 00164 pos++; 00165 } 00166 if (tmap[pos].text != NULL) 00167 *pprev = addKeyword(*pprev, 00168 contents, 00169 tmap[pos].type); 00170 #if DEBUG_OLE2 00171 else 00172 printf("No match for type `%s'\n", 00173 type); 00174 #endif 00175 free(contents); 00176 }
| static struct EXTRACTOR_Keywords* processSO | ( | GsfInput * | src, | |
| struct EXTRACTOR_Keywords * | prev | |||
| ) | [static] |
Definition at line 197 of file ole2extractor.c.
References addKeyword(), EXTRACTOR_COMMENT, EXTRACTOR_KEYWORDS, EXTRACTOR_SUBJECT, EXTRACTOR_TITLE, free, malloc, and size.
Referenced by libextractor_ole2_extract().
00198 { 00199 off_t size; 00200 char * buf; 00201 00202 size = gsf_input_size(src); 00203 if (size < 0x374) /* == 0x375?? */ 00204 return prev; 00205 buf = malloc(size); 00206 gsf_input_read(src, size, (unsigned char*) buf); 00207 if ( (buf[0] != 0x0F) || 00208 (buf[1] != 0x0) || 00209 (0 != strncmp(&buf[2], 00210 "SfxDocumentInfo", 00211 strlen("SfxDocumentInfo"))) || 00212 (buf[0x11] != 0x0B) || 00213 (buf[0x13] != 0x00) || /* pw protected! */ 00214 (buf[0x12] != 0x00) ) { 00215 free(buf); 00216 return prev; 00217 } 00218 buf[0xd3] = '\0'; 00219 if (buf[0x94] + buf[0x93] > 0) 00220 prev = addKeyword(prev, 00221 &buf[0x95], 00222 EXTRACTOR_TITLE); 00223 buf[0x114] = '\0'; 00224 if (buf[0xd5] + buf[0xd4] > 0) 00225 prev = addKeyword(prev, 00226 &buf[0xd6], 00227 EXTRACTOR_SUBJECT); 00228 buf[0x215] = '\0'; 00229 if (buf[0x115] + buf[0x116] > 0) 00230 prev = addKeyword(prev, 00231 &buf[0x117], 00232 EXTRACTOR_COMMENT); 00233 buf[0x296] = '\0'; 00234 if (buf[0x216] + buf[0x217] > 0) 00235 prev = addKeyword(prev, 00236 &buf[0x218], 00237 EXTRACTOR_KEYWORDS); 00238 /* fixme: do timestamps, 00239 mime-type, user-defined info's */ 00240 00241 free(buf); 00242 return prev; 00243 }
guint8 const component_guid[] [static] |
Initial value:
{
0xe0, 0x85, 0x9f, 0xf2, 0xf9, 0x4f, 0x68, 0x10,
0xab, 0x91, 0x08, 0x00, 0x2b, 0x27, 0xb3, 0xd9
}
Definition at line 69 of file ole2extractor.c.
guint8 const document_guid[] [static] |
Initial value:
{
0x02, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
}
Definition at line 74 of file ole2extractor.c.
Definition at line 89 of file ole2extractor.c.
guint8 const user_guid[] [static] |
Initial value:
{
0x05, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
}
Definition at line 79 of file ole2extractor.c.
1.5.1