ole2extractor.c File Reference

#include "platform.h"
#include "extractor.h"
#include "convert.h"
#include <glib-object.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include <gsf/gsf-utils.h>
#include <gsf/gsf-input-memory.h>
#include <gsf/gsf-infile.h>
#include <gsf/gsf-infile-msole.h>
#include <gsf/gsf-msole-utils.h>

Go to the source code of this file.

Data Structures

struct  Matches

Defines

#define DEBUG_OLE2   0
#define __(a)   dgettext("iso-639", a)

Functions

static struct EXTRACTOR_Keywords * addKeyword (EXTRACTOR_KeywordList *oldhead, const char *phrase, EXTRACTOR_KeywordType type)
static void processMetadata (gpointer key, gpointer value, gpointer user_data)
static struct EXTRACTOR_Keywords * process (GsfInput *in, struct EXTRACTOR_Keywords *prev)
static struct EXTRACTOR_Keywords * processSO (GsfInput *src, struct EXTRACTOR_Keywords *prev)
static const char * lidToLanguage (unsigned int lid)
static struct EXTRACTOR_Keywords * history_extract (GsfInput *stream, unsigned int lcbSttbSavedBy, unsigned int fcSttbSavedBy, struct EXTRACTOR_Keywords *prev)
struct EXTRACTOR_Keywords * libextractor_ole2_extract (const char *filename, const char *data, size_t size, struct EXTRACTOR_Keywords *prev)
static void nolog (const gchar *log_domain, GLogLevelFlags log_level, const gchar *message, gpointer user_data)
void __attribute__ ((constructor))
void __attribute__ ((destructor))

Variables

static guint8 const component_guid []
static guint8 const document_guid []
static guint8 const user_guid []
static Matches tmap []


Define Documentation

#define __ ( a )     dgettext("iso-639", a)

Definition at line 247 of file ole2extractor.c.

Referenced by lidToLanguage().

#define DEBUG_OLE2   0

Definition at line 43 of file ole2extractor.c.


Function Documentation

void __attribute__ ( (destructor)   ) 

Definition at line 574 of file ole2extractor.c.

00574                                                    {
00575 #ifdef HAVE_GSF_INIT
00576   gsf_shutdown();
00577 #endif
00578   // gsf_shutdown_dynamic(NULL);
00579 }

void __attribute__ ( (constructor)   ) 

Definition at line 564 of file ole2extractor.c.

References nolog(), and NULL.

00564                                                     {
00565  g_type_init();
00566 #ifdef HAVE_GSF_INIT
00567   gsf_init();
00568 #endif
00569   /* disable logging -- thanks, Jody! */
00570   g_log_set_handler ("libgsf:msole", G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING,  &nolog, NULL);
00571   // gsf_init_dynamic(NULL);
00572 }

static struct EXTRACTOR_Keywords* addKeyword ( EXTRACTOR_KeywordList *  oldhead,
const char *  phrase,
EXTRACTOR_KeywordType  type 
) [static]

Definition at line 48 of file ole2extractor.c.

References EXTRACTOR_Keywords::keyword, and malloc.

00050                                        {
00051   EXTRACTOR_KeywordList * keyword;
00052 
00053   if (strlen(phrase) == 0)
00054     return oldhead;
00055   if (0 == strcmp(phrase, "\"\""))
00056     return oldhead;
00057   if (0 == strcmp(phrase, "\" \""))
00058     return oldhead;
00059   if (0 == strcmp(phrase, " "))
00060     return oldhead;
00061   keyword = malloc(sizeof(EXTRACTOR_KeywordList));
00062   keyword->next = oldhead;
00063   keyword->keyword = strdup(phrase);
00064   keyword->keywordType = type;
00065   return keyword;
00066 }

static struct EXTRACTOR_Keywords* history_extract ( GsfInput *  stream,
unsigned int  lcbSttbSavedBy,
unsigned int  fcSttbSavedBy,
struct EXTRACTOR_Keywords *  prev 
) [static]

Definition at line 374 of file ole2extractor.c.

References _, addKeyword(), EXTRACTOR_common_convert_to_utf8(), EXTRACTOR_REVISION_HISTORY, filename, free, and malloc.

Referenced by libextractor_ole2_extract().

00377                                                   {
00378   unsigned int where = 0;
00379   unsigned char * lbuffer;
00380   unsigned int i;
00381   unsigned int length;
00382   char * author;
00383   char * filename;
00384   char * rbuf;
00385   unsigned int nRev;
00386 
00387   // goto offset of revision
00388   gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET);
00389   if (gsf_input_remaining(stream) < lcbSttbSavedBy)
00390     return prev;
00391   lbuffer = malloc(lcbSttbSavedBy);
00392   // read all the revision history
00393   gsf_input_read(stream, lcbSttbSavedBy, lbuffer);
00394   // there are n strings, so n/2 revisions (author & file)
00395   nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
00396   where = 6;
00397   for (i=0; i < nRev; i++) {
00398     if (where >= lcbSttbSavedBy)
00399       break;
00400     length = lbuffer[where++];
00401     if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
00402          (where + 2 * length + 2 <= where) )
00403       break;
00404     author = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
00405                            length * 2,
00406                            "UTF-16BE");
00407     where += length * 2 + 1;
00408     length = lbuffer[where++];
00409     if ( (where + 2 * length >= lcbSttbSavedBy) ||
00410          (where + 2 * length + 1 <= where) ) {
00411       free(author);
00412       break;
00413     }
00414     filename = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
00415                              length * 2,
00416                              "UTF-16BE");
00417     where += length * 2 + 1;
00418     rbuf = malloc(strlen(author) + strlen(filename) + 512);
00419     snprintf(rbuf, 512 + strlen(author) + strlen(filename),
00420              _("Revision #%u: Author '%s' worked on '%s'"),
00421              i, author, filename);
00422     free(author);
00423     free(filename);
00424     prev = addKeyword(prev,
00425                       rbuf,
00426                       EXTRACTOR_REVISION_HISTORY);
00427     free(rbuf);
00428   }
00429   free(lbuffer);
00430   return prev;
00431 }

struct EXTRACTOR_Keywords* libextractor_ole2_extract ( const char *  filename,
const char *  data,
size_t  size,
struct EXTRACTOR_Keywords *  prev 
)

Definition at line 437 of file ole2extractor.c.

References addKeyword(), EXTRACTOR_extractLast(), EXTRACTOR_GENERATOR, EXTRACTOR_LANGUAGE, EXTRACTOR_MIMETYPE, history_extract(), lidToLanguage(), name, NULL, process(), processSO(), and src.

00440                                                             {
00441   GsfInput * input;
00442   GsfInfile * infile;
00443   GsfInput * src;
00444   const char * name;
00445   const char * generator = NULL;
00446   int i;
00447   unsigned int lcb;
00448   unsigned int fcb;
00449   const unsigned char * data512;
00450   unsigned int lid;
00451   const char * lang;
00452 
00453   if (size < 512 + 898)
00454     return prev; /* can hardly be OLE2 */
00455   input = gsf_input_memory_new((const guint8 *) data,
00456                                (gsf_off_t) size,
00457                                FALSE);
00458   if (input == NULL)
00459     return prev;
00460 
00461   infile = gsf_infile_msole_new(input, NULL);
00462   if (infile == NULL) {
00463     g_object_unref(G_OBJECT(input));
00464     return prev;
00465   }
00466   lcb = 0;
00467   fcb = 0;
00468   for (i=0;i<gsf_infile_num_children(infile);i++) {
00469     name = gsf_infile_name_by_index (infile, i);
00470     src = NULL;
00471     if (name == NULL)
00472       continue;
00473     if ( (0 == strcmp(name, "\005SummaryInformation"))
00474          || (0 == strcmp(name, "\005DocumentSummaryInformation")) ) {
00475       src = gsf_infile_child_by_index (infile, i);
00476       if (src != NULL)
00477         prev = process(src,
00478                        prev);
00479     }
00480     if (0 == strcmp(name, "SfxDocumentInfo")) {
00481       src = gsf_infile_child_by_index (infile, i);
00482       if (src != NULL)
00483         prev = processSO(src,
00484                          prev);
00485     }
00486     if (src != NULL)
00487       g_object_unref(G_OBJECT(src));
00488   }
00489 
00490   data512 = (const unsigned char*) &data[512];
00491   lid = data512[6] + (data512[7] << 8);
00492   lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24);
00493   fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24);
00494   lang = lidToLanguage(lid);
00495   if (lang != NULL) {
00496     prev = addKeyword(prev,
00497                       lang,
00498                       EXTRACTOR_LANGUAGE);
00499   }
00500   if (lcb >= 6) {
00501     for (i=0;i<gsf_infile_num_children(infile);i++) {
00502       name = gsf_infile_name_by_index (infile, i);
00503       if (name == NULL)
00504         continue;
00505       if ( (0 == strcmp(name, "1Table")) ||
00506            (0 == strcmp(name, "0Table")) ) {
00507         src = gsf_infile_child_by_index (infile, i);
00508         if (src != NULL) {
00509           prev = history_extract(src,
00510                                  lcb,
00511                                  fcb,
00512                                  prev);
00513           g_object_unref(G_OBJECT(src));
00514         }
00515       }
00516     }
00517   }
00518   g_object_unref(G_OBJECT(infile));
00519   g_object_unref(G_OBJECT(input));
00520 
00521   /*
00522    * Hack to return an appropriate mimetype
00523    */
00524   generator = EXTRACTOR_extractLast(EXTRACTOR_GENERATOR, prev);
00525   if (NULL == generator) {
00526      /*
00527       * when very puzzled, just look at file magic number
00528       */
00529     if ( (8 < size)
00530          && (0 == memcmp(data, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) )
00531       generator = "Microsoft Office";
00532   }
00533 
00534   if(NULL != generator) {
00535     const char * mimetype = "application/vnd.ms-files";
00536 
00537     if((0 == strncmp(generator, "Microsoft Word", 14)) ||
00538        (0 == strncmp(generator, "Microsoft Office Word", 21)))
00539       mimetype = "application/msword";
00540     else if((0 == strncmp(generator, "Microsoft Excel", 15)) ||
00541             (0 == strncmp(generator, "Microsoft Office Excel", 22)))
00542       mimetype = "application/vnd.ms-excel";
00543     else if((0 == strncmp(generator, "Microsoft PowerPoint", 20)) ||
00544             (0 == strncmp(generator, "Microsoft Office PowerPoint", 27)))
00545       mimetype = "application/vnd.ms-powerpoint";
00546     else if(0 == strncmp(generator, "Microsoft Project", 17))
00547       mimetype = "application/vnd.ms-project";
00548     else if(0 == strncmp(generator, "Microsoft Visio", 15))
00549       mimetype = "application/vnd.visio";
00550     else if(0 == strncmp(generator, "Microsoft Office", 16))
00551       mimetype = "application/vnd.ms-office";
00552 
00553     prev = addKeyword(prev, mimetype, EXTRACTOR_MIMETYPE);
00554   }
00555 
00556   return prev;
00557 }

static const char* lidToLanguage ( unsigned int  lid  )  [static]

Definition at line 249 of file ole2extractor.c.

References _, __, and NULL.

Referenced by libextractor_ole2_extract().

00249                                                       {
00250   switch ( lid ) {
00251   case 0x0400:
00252     return _("No Proofing");
00253   case 0x0401:
00254     return __("Arabic");
00255   case 0x0402:
00256     return __("Bulgarian");
00257   case 0x0403:
00258     return __("Catalan");
00259   case 0x0404:
00260     return _("Traditional Chinese");
00261   case 0x0804:
00262     return _("Simplified Chinese");
00263   case 0x0405:
00264     return __("Chechen");
00265   case 0x0406:
00266     return __("Danish");
00267   case 0x0407:
00268     return __("German");
00269   case 0x0807:
00270     return _("Swiss German");
00271   case 0x0408:
00272     return __("Greek");
00273   case 0x0409:
00274     return _("U.S. English");
00275   case 0x0809:
00276     return _("U.K. English");
00277   case 0x0c09:
00278     return _("Australian English");
00279   case 0x040a:
00280     return _("Castilian Spanish");
00281   case 0x080a:
00282     return _("Mexican Spanish");
00283   case 0x040b:
00284     return __("Finnish");
00285   case 0x040c:
00286     return __("French");
00287   case 0x080c:
00288     return _("Belgian French");
00289   case 0x0c0c:
00290     return _("Canadian French");
00291   case 0x100c:
00292     return _("Swiss French");
00293   case 0x040d:
00294     return __("Hebrew");
00295   case 0x040e:
00296     return __("Hungarian");
00297   case 0x040f:
00298     return __("Icelandic");
00299   case 0x0410:
00300     return __("Italian");
00301   case 0x0810:
00302     return _("Swiss Italian");
00303   case 0x0411:
00304     return __("Japanese");
00305   case 0x0412:
00306     return __("Korean");
00307   case 0x0413:
00308     return __("Dutch");
00309   case 0x0813:
00310     return _("Belgian Dutch");
00311   case 0x0414:
00312     return _("Norwegian Bokmal");
00313   case 0x0814:
00314     return __("Norwegian Nynorsk");
00315   case 0x0415:
00316     return __("Polish");
00317   case 0x0416:
00318     return __("Brazilian Portuguese");
00319   case 0x0816:
00320     return __("Portuguese");
00321   case 0x0417:
00322     return _("Rhaeto-Romanic");
00323   case 0x0418:
00324     return __("Romanian");
00325   case 0x0419:
00326     return __("Russian");
00327   case 0x041a:
00328     return _("Croato-Serbian (Latin)");
00329   case 0x081a:
00330     return _("Serbo-Croatian (Cyrillic)");
00331   case 0x041b:
00332     return __("Slovak");
00333   case 0x041c:
00334     return __("Albanian");
00335   case 0x041d:
00336     return __("Swedish");
00337   case 0x041e:
00338     return __("Thai");
00339   case 0x041f:
00340     return __("Turkish");
00341   case 0x0420:
00342     return __("Urdu");
00343   case 0x0421:
00344     return __("Bahasa");
00345   case 0x0422:
00346     return __("Ukrainian");
00347   case 0x0423:
00348     return __("Byelorussian");
00349   case 0x0424:
00350     return __("Slovenian");
00351   case 0x0425:
00352     return __("Estonian");
00353   case 0x0426:
00354     return __("Latvian");
00355   case 0x0427:
00356     return __("Lithuanian");
00357   case 0x0429:
00358     return _("Farsi");
00359   case 0x042D:
00360     return __("Basque");
00361   case 0x042F:
00362     return __("Macedonian");
00363   case 0x0436:
00364     return __("Afrikaans");
00365   case 0x043E:
00366     return __("Malayalam");
00367   default:
00368     return NULL;
00369   }
00370 }

static void nolog ( const gchar *  log_domain,
GLogLevelFlags  log_level,
const gchar *  message,
gpointer  user_data 
) [static]

Definition at line 558 of file ole2extractor.c.

Referenced by __attribute__().

00561                                        {
00562 }

static struct EXTRACTOR_Keywords* process ( GsfInput *  in,
struct EXTRACTOR_Keywords *  prev 
) [static]

Definition at line 180 of file ole2extractor.c.

References error(), NULL, and processMetadata().

Referenced by libextractor_ole2_extract(), and testKeyword().

00181                                           {
00182   GsfDocMetaData * sections;
00183   GError * error;
00184 
00185   sections = gsf_doc_meta_data_new();
00186   error = gsf_msole_metadata_read(in, sections);
00187   if (error == NULL) {
00188     gsf_doc_meta_data_foreach(sections,
00189                               &processMetadata,
00190                               &prev);
00191   }
00192   g_object_unref(G_OBJECT(sections));
00193   return prev;
00194 }

static void processMetadata ( gpointer  key,
gpointer  value,
gpointer  user_data 
) [static]

Definition at line 133 of file ole2extractor.c.

References addKeyword(), free, NULL, printf, tmap, and type.

Referenced by process().

00135                                                 {
00136   struct EXTRACTOR_Keywords ** pprev = user_data;
00137   const char * type = key;
00138   const GsfDocProp * prop = value;
00139   const GValue * gval;
00140   char * contents;
00141   int pos;
00142 
00143   if ( (key == NULL) ||
00144        (value == NULL) )
00145     return;
00146   gval = gsf_doc_prop_get_val(prop);
00147 
00148   if (G_VALUE_TYPE(gval) == G_TYPE_STRING) {
00149     contents = strdup(g_value_get_string(gval));
00150   } else {
00151     /* convert other formats? */
00152     contents = g_strdup_value_contents(gval);
00153   }
00154   if (contents == NULL)
00155     return;
00156   if ( (strlen(contents) > 0) &&
00157        (contents[strlen(contents)-1] == '\n') )
00158     contents[strlen(contents)-1] = '\0';
00159   pos = 0;
00160   while (tmap[pos].text != NULL) {
00161     if (0 == strcmp(tmap[pos].text,
00162                     type))
00163       break;
00164     pos++;
00165   }
00166   if (tmap[pos].text != NULL)
00167     *pprev = addKeyword(*pprev,
00168                         contents,
00169                         tmap[pos].type);
00170 #if DEBUG_OLE2
00171   else
00172     printf("No match for type `%s'\n",
00173            type);
00174 #endif
00175   free(contents);
00176 }

static struct EXTRACTOR_Keywords* processSO ( GsfInput *  src,
struct EXTRACTOR_Keywords *  prev 
) [static]

Definition at line 197 of file ole2extractor.c.

References addKeyword(), EXTRACTOR_COMMENT, EXTRACTOR_KEYWORDS, EXTRACTOR_SUBJECT, EXTRACTOR_TITLE, free, malloc, and size.

Referenced by libextractor_ole2_extract().

00198                                             {
00199   off_t size;
00200   char * buf;
00201 
00202   size = gsf_input_size(src);
00203   if (size < 0x374) /* == 0x375?? */
00204     return prev;
00205   buf = malloc(size);
00206   gsf_input_read(src, size, (unsigned char*) buf);
00207   if ( (buf[0] != 0x0F) ||
00208        (buf[1] != 0x0) ||
00209        (0 != strncmp(&buf[2],
00210                      "SfxDocumentInfo",
00211                      strlen("SfxDocumentInfo"))) ||
00212        (buf[0x11] != 0x0B) ||
00213        (buf[0x13] != 0x00) || /* pw protected! */
00214        (buf[0x12] != 0x00) ) {
00215     free(buf);
00216     return prev;
00217   }
00218   buf[0xd3] = '\0';
00219   if (buf[0x94] + buf[0x93] > 0)
00220     prev = addKeyword(prev,
00221                       &buf[0x95],
00222                       EXTRACTOR_TITLE);
00223   buf[0x114] = '\0';
00224   if (buf[0xd5] + buf[0xd4] > 0)
00225     prev = addKeyword(prev,
00226                       &buf[0xd6],
00227                       EXTRACTOR_SUBJECT);
00228   buf[0x215] = '\0';
00229   if (buf[0x115] + buf[0x116] > 0)
00230     prev = addKeyword(prev,
00231                       &buf[0x117],
00232                       EXTRACTOR_COMMENT);
00233   buf[0x296] = '\0';
00234   if (buf[0x216] + buf[0x217] > 0)
00235     prev = addKeyword(prev,
00236                       &buf[0x218],
00237                       EXTRACTOR_KEYWORDS);
00238   /* fixme: do timestamps,
00239      mime-type, user-defined info's */
00240 
00241   free(buf);
00242   return prev;
00243 }


Variable Documentation

guint8 const component_guid[] [static]

Initial value:

 {
        0xe0, 0x85, 0x9f, 0xf2, 0xf9, 0x4f, 0x68, 0x10,
        0xab, 0x91, 0x08, 0x00, 0x2b, 0x27, 0xb3, 0xd9
}

Definition at line 69 of file ole2extractor.c.

guint8 const document_guid[] [static]

Initial value:

 {
        0x02, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
        0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
}

Definition at line 74 of file ole2extractor.c.

Matches tmap[] [static]

Definition at line 89 of file ole2extractor.c.

guint8 const user_guid[] [static]

Initial value:

 {
        0x05, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
        0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
}

Definition at line 79 of file ole2extractor.c.


Generated on Thu Nov 20 10:45:47 2008 for libextractor by  doxygen 1.5.1