ole2extractor.c

Go to the documentation of this file.
00001 /*
00002      This file is part of libextractor.
00003      (C) 2004, 2005, 2006, 2007 Vidyut Samanta and Christian Grothoff
00004 
00005      libextractor is free software; you can redistribute it and/or modify
00006      it under the terms of the GNU General Public License as published
00007      by the Free Software Foundation; either version 2, or (at your
00008      option) any later version.
00009 
00010      libextractor is distributed in the hope that it will be useful, but
00011      WITHOUT ANY WARRANTY; without even the implied warranty of
00012      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013      General Public License for more details.
00014 
00015      You should have received a copy of the GNU General Public License
00016      along with libextractor; see the file COPYING.  If not, write to the
00017      Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00018      Boston, MA 02111-1307, USA.
00019 
00020      This code makes extensive use of libgsf
00021      -- the Gnome Structured File Library
00022      Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org)
00023 
00024      Part of this code was borrowed from wordleaker.cpp. See also
00025      the README file in this directory.
00026 */
00027 
00028 #include "platform.h"
00029 #include "extractor.h"
00030 #include "../convert.h"
00031 
00032 #include <glib-object.h>
00033 #include <string.h>
00034 #include <stdio.h>
00035 #include <ctype.h>
00036 
00037 #include <gsf/gsf-utils.h>
00038 #include <gsf/gsf-input-memory.h>
00039 #include <gsf/gsf-infile.h>
00040 #include <gsf/gsf-infile-msole.h>
00041 #include <gsf/gsf-msole-utils.h>
00042 
00043 #define DEBUG_OLE2 0
00044 
00045 /* ******************************** main extraction code ************************ */
00046 
00047 static struct EXTRACTOR_Keywords *
00048 addKeyword(EXTRACTOR_KeywordList *oldhead,
00049            const char *phrase,
00050            EXTRACTOR_KeywordType type) {
00051   EXTRACTOR_KeywordList * keyword;
00052 
00053   if (strlen(phrase) == 0)
00054     return oldhead;
00055   if (0 == strcmp(phrase, "\"\""))
00056     return oldhead;
00057   if (0 == strcmp(phrase, "\" \""))
00058     return oldhead;
00059   if (0 == strcmp(phrase, " "))
00060     return oldhead;
00061   keyword = malloc(sizeof(EXTRACTOR_KeywordList));
00062   keyword->next = oldhead;
00063   keyword->keyword = strdup(phrase);
00064   keyword->keywordType = type;
00065   return keyword;
00066 }
00067 
00068 
00069 static guint8 const component_guid [] = {
00070         0xe0, 0x85, 0x9f, 0xf2, 0xf9, 0x4f, 0x68, 0x10,
00071         0xab, 0x91, 0x08, 0x00, 0x2b, 0x27, 0xb3, 0xd9
00072 };
00073 
00074 static guint8 const document_guid [] = {
00075         0x02, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
00076         0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
00077 };
00078 
00079 static guint8 const user_guid [] = {
00080         0x05, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
00081         0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
00082 };
00083 
00084 typedef struct {
00085   char * text;
00086   EXTRACTOR_KeywordType type;
00087 } Matches;
00088 
00089 static Matches tmap[] = {
00090   { "Title", EXTRACTOR_TITLE },
00091   { "PresentationFormat", EXTRACTOR_FORMAT },
00092   { "Category", EXTRACTOR_DESCRIPTION },
00093   { "Manager", EXTRACTOR_MANAGER },
00094   { "Company", EXTRACTOR_COMPANY },
00095   { "Subject", EXTRACTOR_SUBJECT },
00096   { "Author", EXTRACTOR_AUTHOR },
00097   { "Keywords", EXTRACTOR_KEYWORDS },
00098   { "Comments", EXTRACTOR_COMMENT },
00099   { "Template", EXTRACTOR_TEMPLATE },
00100   { "NumPages", EXTRACTOR_PAGE_COUNT },
00101   { "AppName", EXTRACTOR_SOFTWARE },
00102   { "RevisionNumber", EXTRACTOR_VERSIONNUMBER },
00103   { "Dictionary", EXTRACTOR_LANGUAGE },
00104   { "NumBytes", EXTRACTOR_SIZE },
00105   { "CreatedTime", EXTRACTOR_CREATION_DATE },
00106   { "LastSavedTime" , EXTRACTOR_MODIFICATION_DATE },
00107   { "gsf:company", EXTRACTOR_COMPANY },
00108   /*  { "gsf:security", EXTRACTOR_SECURITY }, */
00109   { "gsf:character-count", EXTRACTOR_CHARACTER_COUNT },
00110   { "gsf:page-count", EXTRACTOR_PAGE_COUNT },
00111   { "gsf:line-count", EXTRACTOR_LINE_COUNT },
00112   { "gsf:word-count", EXTRACTOR_WORD_COUNT },
00113   { "gsf:paragraph-count", EXTRACTOR_PARAGRAPH_COUNT },
00114   { "gsf:last-saved-by", EXTRACTOR_LAST_SAVED_BY },
00115   /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */
00116   { "gsf:manager", EXTRACTOR_MANAGER },
00117   { "dc:title", EXTRACTOR_TITLE },
00118   { "dc:creator", EXTRACTOR_CREATOR },
00119   { "dc:date", EXTRACTOR_DATE },
00120   { "dc:subject", EXTRACTOR_SUBJECT },
00121   { "dc:keywords", EXTRACTOR_KEYWORDS },
00122   { "dc:last-printed", EXTRACTOR_LAST_PRINTED },
00123   { "dc:description", EXTRACTOR_DESCRIPTION },
00124   { "meta:creation-date", EXTRACTOR_CREATION_DATE },
00125   /* { "meta:editing-duration", EXTRACTOR_TOTAL_EDITING_TIME }, // encoding? */
00126   { "meta:generator", EXTRACTOR_GENERATOR },
00127   { "meta:template", EXTRACTOR_TEMPLATE },
00128   /* { "meta:editing-cycles", EXTRACTOR_EDITING_CYCLES }, // usually "FALSE" */
00129   /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */
00130   { NULL, 0 },
00131 };
00132 
00133 static void processMetadata(gpointer key,
00134                             gpointer value,
00135                             gpointer user_data) {
00136   struct EXTRACTOR_Keywords ** pprev = user_data;
00137   const char * type = key;
00138   const GsfDocProp * prop = value;
00139   const GValue * gval;
00140   char * contents;
00141   int pos;
00142 
00143   if ( (key == NULL) ||
00144        (value == NULL) )
00145     return;
00146   gval = gsf_doc_prop_get_val(prop);
00147 
00148   if (G_VALUE_TYPE(gval) == G_TYPE_STRING) {
00149     contents = strdup(g_value_get_string(gval));
00150   } else {
00151     /* convert other formats? */
00152     contents = g_strdup_value_contents(gval);
00153   }
00154   if (contents == NULL)
00155     return;
00156   if ( (strlen(contents) > 0) &&
00157        (contents[strlen(contents)-1] == '\n') )
00158     contents[strlen(contents)-1] = '\0';
00159   pos = 0;
00160   while (tmap[pos].text != NULL) {
00161     if (0 == strcmp(tmap[pos].text,
00162                     type))
00163       break;
00164     pos++;
00165   }
00166   if (tmap[pos].text != NULL)
00167     *pprev = addKeyword(*pprev,
00168                         contents,
00169                         tmap[pos].type);
00170 #if DEBUG_OLE2
00171   else
00172     printf("No match for type `%s'\n",
00173            type);
00174 #endif
00175   free(contents);
00176 }
00177 
00178 
00179 static struct EXTRACTOR_Keywords *
00180 process(GsfInput * in,
00181         struct EXTRACTOR_Keywords * prev) {
00182   GsfDocMetaData * sections;
00183   GError * error;
00184 
00185   sections = gsf_doc_meta_data_new();
00186   error = gsf_msole_metadata_read(in, sections);
00187   if (error == NULL) {
00188     gsf_doc_meta_data_foreach(sections,
00189                               &processMetadata,
00190                               &prev);
00191   }
00192   g_object_unref(G_OBJECT(sections));
00193   return prev;
00194 }
00195 
00196 static struct EXTRACTOR_Keywords *
00197 processSO(GsfInput * src,
00198           struct EXTRACTOR_Keywords * prev) {
00199   off_t size;
00200   char * buf;
00201 
00202   size = gsf_input_size(src);
00203   if (size < 0x374) /* == 0x375?? */
00204     return prev;
00205   buf = malloc(size);
00206   gsf_input_read(src, size, (unsigned char*) buf);
00207   if ( (buf[0] != 0x0F) ||
00208        (buf[1] != 0x0) ||
00209        (0 != strncmp(&buf[2],
00210                      "SfxDocumentInfo",
00211                      strlen("SfxDocumentInfo"))) ||
00212        (buf[0x11] != 0x0B) ||
00213        (buf[0x13] != 0x00) || /* pw protected! */
00214        (buf[0x12] != 0x00) ) {
00215     free(buf);
00216     return prev;
00217   }
00218   buf[0xd3] = '\0';
00219   if (buf[0x94] + buf[0x93] > 0)
00220     prev = addKeyword(prev,
00221                       &buf[0x95],
00222                       EXTRACTOR_TITLE);
00223   buf[0x114] = '\0';
00224   if (buf[0xd5] + buf[0xd4] > 0)
00225     prev = addKeyword(prev,
00226                       &buf[0xd6],
00227                       EXTRACTOR_SUBJECT);
00228   buf[0x215] = '\0';
00229   if (buf[0x115] + buf[0x116] > 0)
00230     prev = addKeyword(prev,
00231                       &buf[0x117],
00232                       EXTRACTOR_COMMENT);
00233   buf[0x296] = '\0';
00234   if (buf[0x216] + buf[0x217] > 0)
00235     prev = addKeyword(prev,
00236                       &buf[0x218],
00237                       EXTRACTOR_KEYWORDS);
00238   /* fixme: do timestamps,
00239      mime-type, user-defined info's */
00240 
00241   free(buf);
00242   return prev;
00243 }
00244 
00245 /* *************** wordleaker stuff *************** */
00246 
00247 #define __(a) dgettext("iso-639", a)
00248 
/**
 * Translate a Windows language identifier into a language name.
 *
 * Names wrapped in __() are looked up in the "iso-639" message domain;
 * names wrapped in _() use a macro defined outside this file.
 *
 * @param lid language identifier
 * @return the (translated) language name, or NULL for an identifier
 *         that is not in the table
 */
static const char * lidToLanguage( unsigned int lid ) {
  switch ( lid ) {
  case 0x0400:
    return _("No Proofing");
  case 0x0401:
    return __("Arabic");
  case 0x0402:
    return __("Bulgarian");
  case 0x0403:
    return __("Catalan");
  case 0x0404:
    return _("Traditional Chinese");
  case 0x0804:
    return _("Simplified Chinese");
  case 0x0405:
    /* 0x0405 is Czech; the table used to say "Chechen" */
    return __("Czech");
  case 0x0406:
    return __("Danish");
  case 0x0407:
    return __("German");
  case 0x0807:
    return _("Swiss German");
  case 0x0408:
    return __("Greek");
  case 0x0409:
    return _("U.S. English");
  case 0x0809:
    return _("U.K. English");
  case 0x0c09:
    return _("Australian English");
  case 0x040a:
    return _("Castilian Spanish");
  case 0x080a:
    return _("Mexican Spanish");
  case 0x040b:
    return __("Finnish");
  case 0x040c:
    return __("French");
  case 0x080c:
    return _("Belgian French");
  case 0x0c0c:
    return _("Canadian French");
  case 0x100c:
    return _("Swiss French");
  case 0x040d:
    return __("Hebrew");
  case 0x040e:
    return __("Hungarian");
  case 0x040f:
    return __("Icelandic");
  case 0x0410:
    return __("Italian");
  case 0x0810:
    return _("Swiss Italian");
  case 0x0411:
    return __("Japanese");
  case 0x0412:
    return __("Korean");
  case 0x0413:
    return __("Dutch");
  case 0x0813:
    return _("Belgian Dutch");
  case 0x0414:
    return _("Norwegian Bokmal");
  case 0x0814:
    return __("Norwegian Nynorsk");
  case 0x0415:
    return __("Polish");
  case 0x0416:
    return __("Brazilian Portuguese");
  case 0x0816:
    return __("Portuguese");
  case 0x0417:
    return _("Rhaeto-Romanic");
  case 0x0418:
    return __("Romanian");
  case 0x0419:
    return __("Russian");
  case 0x041a:
    return _("Croato-Serbian (Latin)");
  case 0x081a:
    return _("Serbo-Croatian (Cyrillic)");
  case 0x041b:
    return __("Slovak");
  case 0x041c:
    return __("Albanian");
  case 0x041d:
    return __("Swedish");
  case 0x041e:
    return __("Thai");
  case 0x041f:
    return __("Turkish");
  case 0x0420:
    return __("Urdu");
  case 0x0421:
    return __("Bahasa");
  case 0x0422:
    return __("Ukrainian");
  case 0x0423:
    return __("Byelorussian");
  case 0x0424:
    return __("Slovenian");
  case 0x0425:
    return __("Estonian");
  case 0x0426:
    return __("Latvian");
  case 0x0427:
    return __("Lithuanian");
  case 0x0429:
    return _("Farsi");
  case 0x042D:
    return __("Basque");
  case 0x042F:
    return __("Macedonian");
  case 0x0436:
    return __("Afrikaans");
  case 0x043E:
    /* 0x043E is Malay (Malaysia); Malayalam is a different identifier */
    return __("Malay");
  default:
    return NULL;
  }
}
00371 
00372 
00373 static struct EXTRACTOR_Keywords *
00374 history_extract(GsfInput * stream,
00375                 unsigned int lcbSttbSavedBy,
00376                 unsigned int fcSttbSavedBy,
00377                 struct EXTRACTOR_Keywords * prev) {
00378   unsigned int where = 0;
00379   unsigned char * lbuffer;
00380   unsigned int i;
00381   unsigned int length;
00382   char * author;
00383   char * filename;
00384   char * rbuf;
00385   unsigned int nRev;
00386 
00387   // goto offset of revision
00388   gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET);
00389   if (gsf_input_remaining(stream) < lcbSttbSavedBy)
00390     return prev;
00391   lbuffer = malloc(lcbSttbSavedBy);
00392   // read all the revision history
00393   gsf_input_read(stream, lcbSttbSavedBy, lbuffer);
00394   // there are n strings, so n/2 revisions (author & file)
00395   nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
00396   where = 6;
00397   for (i=0; i < nRev; i++) {    
00398     if (where >= lcbSttbSavedBy)
00399       break;
00400     length = lbuffer[where++];
00401     if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
00402          (where + 2 * length + 2 <= where) )
00403       break;
00404     author = convertToUtf8((const char*) &lbuffer[where],
00405                            length * 2,
00406                            "UTF-16BE");
00407     where += length * 2 + 1;
00408     length = lbuffer[where++];
00409     if ( (where + 2 * length >= lcbSttbSavedBy) ||
00410          (where + 2 * length + 1 <= where) ) {
00411       free(author);
00412       break;
00413     }
00414     filename = convertToUtf8((const char*) &lbuffer[where],
00415                              length * 2,
00416                              "UTF-16BE");       
00417     where += length * 2 + 1;
00418     rbuf = malloc(strlen(author) + strlen(filename) + 512);
00419     snprintf(rbuf, 512 + strlen(author) + strlen(filename),
00420              _("Revision #%u: Author '%s' worked on '%s'"),
00421              i, author, filename);
00422     free(author);
00423     free(filename);
00424     prev = addKeyword(prev,
00425                       rbuf,
00426                       EXTRACTOR_REVISION_HISTORY);
00427     free(rbuf);
00428   }
00429   free(lbuffer);
00430   return prev;
00431 }
00432 
00433 
00434 /* ************** main method *********** */
00435 
00436 struct EXTRACTOR_Keywords *
00437 libextractor_ole2_extract(const char * filename,
00438                           const char * data,
00439                           size_t size,
00440                           struct EXTRACTOR_Keywords * prev) {
00441   GsfInput * input;
00442   GsfInfile * infile;
00443   GsfInput * src;
00444   const char * name;
00445   const char * generator = NULL;
00446   int i;
00447   unsigned int lcb;
00448   unsigned int fcb;
00449   const unsigned char * data512;
00450   unsigned int lid;
00451   const char * lang;
00452 
00453   if (size < 512 + 898)
00454     return prev; /* can hardly be OLE2 */
00455   input = gsf_input_memory_new((const guint8 *) data,
00456                                (gsf_off_t) size,
00457                                FALSE);
00458   if (input == NULL)
00459     return prev;
00460 
00461   infile = gsf_infile_msole_new(input, NULL);
00462   if (infile == NULL) {
00463     g_object_unref(G_OBJECT(input));
00464     return prev;
00465   }
00466   lcb = 0;
00467   fcb = 0;
00468   for (i=0;i<gsf_infile_num_children(infile);i++) {
00469     name = gsf_infile_name_by_index (infile, i);
00470     src = NULL;
00471     if (name == NULL)
00472       continue;
00473     if ( (0 == strcmp(name, "\005SummaryInformation"))
00474          || (0 == strcmp(name, "\005DocumentSummaryInformation")) ) {
00475       src = gsf_infile_child_by_index (infile, i);
00476       if (src != NULL)
00477         prev = process(src,
00478                        prev);
00479     }
00480     if (0 == strcmp(name, "SfxDocumentInfo")) {
00481       src = gsf_infile_child_by_index (infile, i);
00482       if (src != NULL)
00483         prev = processSO(src,
00484                          prev);
00485     }
00486     if (src != NULL)
00487       g_object_unref(G_OBJECT(src));
00488   }
00489 
00490   data512 = (const unsigned char*) &data[512];
00491   lid = data512[6] + (data512[7] << 8);
00492   lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24);
00493   fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24);
00494   lang = lidToLanguage(lid);
00495   if (lang != NULL) {
00496     prev = addKeyword(prev,
00497                       lang,
00498                       EXTRACTOR_LANGUAGE);
00499   }
00500   if (lcb >= 6) {
00501     for (i=0;i<gsf_infile_num_children(infile);i++) {
00502       name = gsf_infile_name_by_index (infile, i);
00503       if (name == NULL)
00504         continue;
00505       if ( (0 == strcmp(name, "1Table")) ||
00506            (0 == strcmp(name, "0Table")) ) {
00507         src = gsf_infile_child_by_index (infile, i);
00508         if (src != NULL) {
00509           prev = history_extract(src,
00510                                  lcb,
00511                                  fcb,
00512                                  prev);
00513           g_object_unref(G_OBJECT(src));
00514         }
00515       }
00516     }
00517   }
00518   g_object_unref(G_OBJECT(infile));
00519   g_object_unref(G_OBJECT(input));
00520 
00521   /*
00522    * Hack to return an appropriate mimetype
00523    */
00524   generator = EXTRACTOR_extractLast(EXTRACTOR_GENERATOR, prev);
00525   if (NULL == generator) {
00526      /*
00527       * when very puzzled, just look at file magic number
00528       */
00529     if ( (8 < size)
00530          && (0 == memcmp(data, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) )
00531       generator = "Microsoft Office";
00532   }
00533 
00534   if(NULL != generator) {
00535     const char * mimetype = "application/vnd.ms-files";
00536 
00537     if((0 == strncmp(generator, "Microsoft Word", 14)) ||
00538        (0 == strncmp(generator, "Microsoft Office Word", 21)))
00539       mimetype = "application/msword";
00540     else if((0 == strncmp(generator, "Microsoft Excel", 15)) ||
00541             (0 == strncmp(generator, "Microsoft Office Excel", 22)))
00542       mimetype = "application/vnd.ms-excel";
00543     else if((0 == strncmp(generator, "Microsoft PowerPoint", 20)) ||
00544             (0 == strncmp(generator, "Microsoft Office PowerPoint", 27)))
00545       mimetype = "application/vnd.ms-powerpoint";
00546     else if(0 == strncmp(generator, "Microsoft Project", 17))
00547       mimetype = "application/vnd.ms-project";
00548     else if(0 == strncmp(generator, "Microsoft Visio", 15))
00549       mimetype = "application/vnd.visio";
00550     else if(0 == strncmp(generator, "Microsoft Office", 16))
00551       mimetype = "application/vnd.ms-office";
00552 
00553     prev = addKeyword(prev, mimetype, EXTRACTOR_MIMETYPE);
00554   }
00555 
00556   return prev;
00557 }
00558 static void nolog (const gchar *log_domain,
00559                    GLogLevelFlags log_level,
00560                    const gchar *message,
00561                    gpointer user_data) {
00562 }
00563 
00564 void __attribute__ ((constructor)) ole2_ltdl_init() {
00565  g_type_init();
00566 #ifdef HAVE_GSF_INIT
00567   gsf_init();
00568 #endif
00569   /* disable logging -- thanks, Jody! */
00570   g_log_set_handler ("libgsf:msole", G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING,  &nolog, NULL);
00571   // gsf_init_dynamic(NULL);
00572 }
00573 
/*
 * Destructor: shuts libgsf down again when HAVE_GSF_INIT is defined;
 * otherwise it does nothing.
 */
void __attribute__ ((destructor)) ole2_ltdl_fini() {
#if defined(HAVE_GSF_INIT)
  gsf_shutdown();
#endif
  // gsf_shutdown_dynamic(NULL);
}
00580 
00581 /* end of ole2extractor.c */
00582 

Generated on Thu Aug 28 16:44:24 2008 for libextractor by  doxygen 1.5.1