dviextractor.c

Go to the documentation of this file.
00001 /*
00002      This file is part of libextractor.
00003      (C) 2002, 2003, 2004 Vidyut Samanta and Christian Grothoff
00004 
00005      libextractor is free software; you can redistribute it and/or modify
00006      it under the terms of the GNU General Public License as published
00007      by the Free Software Foundation; either version 2, or (at your
00008      option) any later version.
00009 
00010      libextractor is distributed in the hope that it will be useful, but
00011      WITHOUT ANY WARRANTY; without even the implied warranty of
00012      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013      General Public License for more details.
00014 
00015      You should have received a copy of the GNU General Public License
00016      along with libextractor; see the file COPYING.  If not, write to the
00017      Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00018      Boston, MA 02111-1307, USA.
00019  */
00020 
00021 #include "platform.h"
00022 #include "extractor.h"
00023 
00024 static EXTRACTOR_KeywordList *
00025 addKeyword (EXTRACTOR_KeywordType type,
00026             char *keyword, EXTRACTOR_KeywordList * next)
00027 {
00028   EXTRACTOR_KeywordList *result;
00029 
00030   if (keyword == NULL)
00031     return next;
00032   result = malloc (sizeof (EXTRACTOR_KeywordList));
00033   result->next = next;
00034   result->keyword = keyword;
00035   result->keywordType = type;
00036   return result;
00037 }
00038 
00039 typedef struct
00040 {
00041   char *text;
00042   EXTRACTOR_KeywordType type;
00043 } Matches;
00044 
00045 static Matches tmap[] = {
00046   {"/Title (", EXTRACTOR_TITLE},
00047   {"/Subject (", EXTRACTOR_SUBJECT},
00048   {"/Author (", EXTRACTOR_AUTHOR},
00049   {"/Keywords (", EXTRACTOR_KEYWORDS},
00050   {"/Creator (", EXTRACTOR_CREATOR},
00051   {"/Producer (", EXTRACTOR_PRODUCER},
00052   {NULL, 0},
00053 };
00054 
00055 static struct EXTRACTOR_Keywords *
00056 parseZZZ (const char *data,
00057           size_t pos, size_t len, struct EXTRACTOR_Keywords *prev)
00058 {
00059   size_t slen;
00060   size_t end;
00061   int i;
00062   char *value;
00063 
00064   end = pos + len;
00065   slen = strlen ("ps:SDict begin [");
00066   if (len <= slen)
00067     return prev;
00068   if (0 != strncmp ("ps:SDict begin [ ", &data[pos], slen))
00069     return prev;
00070   pos += slen;
00071   while (pos < end)
00072     {
00073       i = 0;
00074       while (tmap[i].text != NULL)
00075         {
00076           slen = strlen (tmap[i].text);
00077           if (pos + slen < end)
00078             {
00079               if (0 == strncmp (&data[pos], tmap[i].text, slen))
00080                 {
00081                   pos += slen;
00082                   slen = pos;
00083                   while ((slen < end) && (data[slen] != ')'))
00084                     slen++;
00085                   slen = slen - pos;
00086                   value = malloc (slen + 1);
00087                   value[slen] = '\0';
00088                   memcpy (value, &data[pos], slen);
00089                   prev = addKeyword (tmap[i].type, value, prev);
00090                   pos += slen + 1;
00091                 }
00092             }
00093           i++;
00094         }
00095       pos++;
00096     }
00097   return prev;
00098 }
00099 
/**
 * Read an unsigned int from a possibly unaligned address.
 *
 * The bytes are copied unchanged, so the result is in the byte order in
 * which they are stored; callers in this file convert with ntohl().
 * Like the code it replaces, this assumes a 4-byte unsigned int.
 *
 * @param data address of the first byte to read
 * @return the bytes at data reinterpreted as an unsigned int
 */
static unsigned int
getIntAt (const void *data)
{
  unsigned int value;

  /* Copy into a real unsigned int: a char array is neither guaranteed to
     be suitably aligned nor may it be read through an unsigned int *. */
  memcpy (&value, data, sizeof (value));
  return value;
}
00108 
/**
 * Read an unsigned short from a possibly unaligned address.
 *
 * The bytes are copied unchanged, so the result is in the byte order in
 * which they are stored; callers in this file convert with ntohs().
 * Like the code it replaces, this assumes a 2-byte unsigned short.
 *
 * @param data address of the first byte to read
 * @return the bytes at data reinterpreted as an unsigned short, widened to
 *         unsigned int
 */
static unsigned int
getShortAt (const void *data)
{
  unsigned short value;

  /* Copy into a real unsigned short: a char array is neither guaranteed to
     be suitably aligned nor may it be read through an unsigned short *. */
  memcpy (&value, data, sizeof (value));
  return value;
}
00117 
00118 struct EXTRACTOR_Keywords *
00119 libextractor_dvi_extract (const char *filename,
00120                           const unsigned char *data,
00121                           size_t size, struct EXTRACTOR_Keywords *prev)
00122 {
00123   unsigned int klen;
00124   char *comment;
00125   unsigned int pos;
00126   unsigned int opos;
00127   unsigned int len;
00128   unsigned int pageCount;
00129   char *pages;
00130 
00131   if (size < 40)
00132     return prev;
00133   if ((data[0] != 247) || (data[1] != 2))
00134     return prev;                /* cannot be dvi or unsupported version */
00135   klen = data[14];
00136 
00137   pos = size - 1;
00138   while ((data[pos] == 223) && (pos > 0))
00139     pos--;
00140   if ((data[pos] != 2) || (pos < 40))
00141     return prev;
00142   pos--;
00143   pos -= 4;
00144   /* assert pos at 'post_post tag' */
00145   if (data[pos] != 249)
00146     return prev;
00147   opos = pos;
00148   pos = ntohl (getIntAt (&data[opos + 1]));
00149   if (pos + 25 > size)
00150     return prev;
00151   /* assert pos at 'post' command */
00152   if (data[pos] != 248)
00153     return prev;
00154   pageCount = 0;
00155   opos = pos;
00156   pos = ntohl (getIntAt (&data[opos + 1]));
00157   while (1)
00158     {
00159       if (pos == (unsigned int) -1)
00160         break;
00161       if (pos + 45 > size)
00162         return prev;
00163       if (data[pos] != 139)     /* expect 'bop' */
00164         return prev;
00165       pageCount++;
00166       opos = pos;
00167       pos = ntohl (getIntAt (&data[opos + 41]));
00168       if (pos == (unsigned int) -1)
00169         break;
00170       if (pos >= opos)
00171         return prev;            /* invalid! */
00172     }
00173   /* ok, now we believe it's a dvi... */
00174   pages = malloc (16);
00175   snprintf (pages, 16, "%u", pageCount);
00176   comment = malloc (klen + 1);
00177   comment[klen] = '\0';
00178   memcpy (comment, &data[15], klen);
00179   prev = addKeyword (EXTRACTOR_MIMETYPE, strdup ("application/x-dvi"), prev);
00180   prev = addKeyword (EXTRACTOR_COMMENT, comment, prev);
00181   prev = addKeyword (EXTRACTOR_PAGE_COUNT, pages, prev);
00182   /* try to find PDF/ps special */
00183   pos = opos;
00184   while (pos < size - 100)
00185     {
00186       switch (data[pos])
00187         {
00188         case 139:              /* begin page 'bop', we typically have to skip that one to
00189                                    find the zzz's */
00190           pos += 45;            /* skip bop */
00191           break;
00192         case 239:              /* zzz1 */
00193           len = data[pos + 1];
00194           if (pos + 2 + len < size)
00195             prev = parseZZZ ((const char *) data, pos + 2, len, prev);
00196           pos += len + 2;
00197           break;
00198         case 240:              /* zzz2 */
00199           len = ntohs (getShortAt (&data[pos + 1]));
00200           if (pos + 3 + len < size)
00201             prev = parseZZZ ((const char *) data, pos + 3, len, prev);
00202           pos += len + 3;
00203           break;
00204         case 241:              /* zzz3, who uses that? */
00205           len = (ntohs (getShortAt (&data[pos + 1]))) + 65536 * data[pos + 3];
00206           if (pos + 4 + len < size)
00207             prev = parseZZZ ((const char *) data, pos + 4, len, prev);
00208           pos += len + 4;
00209           break;
00210         case 242:              /* zzz4, hurray! */
00211           len = ntohl (getIntAt (&data[pos + 1]));
00212           if (pos + 1 + len < size)
00213             prev = parseZZZ ((const char *) data, pos + 5, len, prev);
00214           pos += len + 5;
00215           break;
00216         default:               /* unsupported opcode, abort scan */
00217           return prev;
00218         }
00219     }
00220   return prev;
00221 }

Generated on Fri Jan 9 16:44:28 2009 for libextractor by  doxygen 1.5.1