qtextractor.c

Go to the documentation of this file.
00001 /*
00002      This file is part of libextractor.
00003      (C) 2002, 2003, 2006 Vidyut Samanta and Christian Grothoff
00004 
00005      libextractor is free software; you can redistribute it and/or modify
00006      it under the terms of the GNU General Public License as published
00007      by the Free Software Foundation; either version 2, or (at your
00008      option) any later version.
00009 
00010      libextractor is distributed in the hope that it will be useful, but
00011      WITHOUT ANY WARRANTY; without even the implied warranty of
00012      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013      General Public License for more details.
00014 
00015      You should have received a copy of the GNU General Public License
00016      along with libextractor; see the file COPYING.  If not, write to the
00017      Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00018      Boston, MA 02111-1307, USA.
00019  */
00020 
00021 #include "platform.h"
00022 #include "extractor.h"
00023 #include <zlib.h>
00024 #include <math.h>
00025 
00026 #define DEBUG 0
00027 
/*
 * Genre name table, copied verbatim from mp3extractor.
 *
 * Every entry is wrapped in gettext_noop() (presumably so that the strings
 * are picked up for translation without being translated here -- confirm
 * against platform.h).  processDataAtom() converts the 16-bit value of an
 * iTunes "gnre" atom into a name by indexing this table with (value - 1).
 */
static const char *const genre_names[] = {
  gettext_noop ("Blues"),
  gettext_noop ("Classic Rock"),
  gettext_noop ("Country"),
  gettext_noop ("Dance"),
  gettext_noop ("Disco"),
  gettext_noop ("Funk"),
  gettext_noop ("Grunge"),
  gettext_noop ("Hip-Hop"),
  gettext_noop ("Jazz"),
  gettext_noop ("Metal"),
  gettext_noop ("New Age"),
  gettext_noop ("Oldies"),
  gettext_noop ("Other"),
  gettext_noop ("Pop"),
  gettext_noop ("R&B"),
  gettext_noop ("Rap"),
  gettext_noop ("Reggae"),
  gettext_noop ("Rock"),
  gettext_noop ("Techno"),
  gettext_noop ("Industrial"),
  gettext_noop ("Alternative"),
  gettext_noop ("Ska"),
  gettext_noop ("Death Metal"),
  gettext_noop ("Pranks"),
  gettext_noop ("Soundtrack"),
  gettext_noop ("Euro-Techno"),
  gettext_noop ("Ambient"),
  gettext_noop ("Trip-Hop"),
  gettext_noop ("Vocal"),
  gettext_noop ("Jazz+Funk"),
  gettext_noop ("Fusion"),
  gettext_noop ("Trance"),
  gettext_noop ("Classical"),
  gettext_noop ("Instrumental"),
  gettext_noop ("Acid"),
  gettext_noop ("House"),
  gettext_noop ("Game"),
  gettext_noop ("Sound Clip"),
  gettext_noop ("Gospel"),
  gettext_noop ("Noise"),
  gettext_noop ("Alt. Rock"),
  gettext_noop ("Bass"),
  gettext_noop ("Soul"),
  gettext_noop ("Punk"),
  gettext_noop ("Space"),
  gettext_noop ("Meditative"),
  gettext_noop ("Instrumental Pop"),
  gettext_noop ("Instrumental Rock"),
  gettext_noop ("Ethnic"),
  gettext_noop ("Gothic"),
  gettext_noop ("Darkwave"),
  gettext_noop ("Techno-Industrial"),
  gettext_noop ("Electronic"),
  gettext_noop ("Pop-Folk"),
  gettext_noop ("Eurodance"),
  gettext_noop ("Dream"),
  gettext_noop ("Southern Rock"),
  gettext_noop ("Comedy"),
  gettext_noop ("Cult"),
  gettext_noop ("Gangsta Rap"),
  gettext_noop ("Top 40"),
  gettext_noop ("Christian Rap"),
  gettext_noop ("Pop/Funk"),
  gettext_noop ("Jungle"),
  gettext_noop ("Native American"),
  gettext_noop ("Cabaret"),
  gettext_noop ("New Wave"),
  gettext_noop ("Psychedelic"),
  gettext_noop ("Rave"),
  gettext_noop ("Showtunes"),
  gettext_noop ("Trailer"),
  gettext_noop ("Lo-Fi"),
  gettext_noop ("Tribal"),
  gettext_noop ("Acid Punk"),
  gettext_noop ("Acid Jazz"),
  gettext_noop ("Polka"),
  gettext_noop ("Retro"),
  gettext_noop ("Musical"),
  gettext_noop ("Rock & Roll"),
  gettext_noop ("Hard Rock"),
  gettext_noop ("Folk"),
  gettext_noop ("Folk/Rock"),
  gettext_noop ("National Folk"),
  gettext_noop ("Swing"),
  gettext_noop ("Fast-Fusion"),
  gettext_noop ("Bebob"),
  gettext_noop ("Latin"),
  gettext_noop ("Revival"),
  gettext_noop ("Celtic"),
  gettext_noop ("Bluegrass"),
  gettext_noop ("Avantgarde"),
  gettext_noop ("Gothic Rock"),
  gettext_noop ("Progressive Rock"),
  gettext_noop ("Psychedelic Rock"),
  gettext_noop ("Symphonic Rock"),
  gettext_noop ("Slow Rock"),
  gettext_noop ("Big Band"),
  gettext_noop ("Chorus"),
  gettext_noop ("Easy Listening"),
  gettext_noop ("Acoustic"),
  gettext_noop ("Humour"),
  gettext_noop ("Speech"),
  gettext_noop ("Chanson"),
  gettext_noop ("Opera"),
  gettext_noop ("Chamber Music"),
  gettext_noop ("Sonata"),
  gettext_noop ("Symphony"),
  gettext_noop ("Booty Bass"),
  gettext_noop ("Primus"),
  gettext_noop ("Porn Groove"),
  gettext_noop ("Satire"),
  gettext_noop ("Slow Jam"),
  gettext_noop ("Club"),
  gettext_noop ("Tango"),
  gettext_noop ("Samba"),
  gettext_noop ("Folklore"),
  gettext_noop ("Ballad"),
  gettext_noop ("Power Ballad"),
  gettext_noop ("Rhythmic Soul"),
  gettext_noop ("Freestyle"),
  gettext_noop ("Duet"),
  gettext_noop ("Punk Rock"),
  gettext_noop ("Drum Solo"),
  gettext_noop ("A Cappella"),
  gettext_noop ("Euro-House"),
  gettext_noop ("Dance Hall"),
  gettext_noop ("Goa"),
  gettext_noop ("Drum & Bass"),
  gettext_noop ("Club-House"),
  gettext_noop ("Hardcore"),
  gettext_noop ("Terror"),
  gettext_noop ("Indie"),
  gettext_noop ("BritPop"),
  gettext_noop ("Negerpunk"),
  gettext_noop ("Polsk Punk"),
  gettext_noop ("Beat"),
  gettext_noop ("Christian Gangsta Rap"),
  gettext_noop ("Heavy Metal"),
  gettext_noop ("Black Metal"),
  gettext_noop ("Crossover"),
  gettext_noop ("Contemporary Christian"),
  gettext_noop ("Christian Rock"),
  gettext_noop ("Merengue"),
  gettext_noop ("Salsa"),
  gettext_noop ("Thrash Metal"),
  gettext_noop ("Anime"),
  gettext_noop ("JPop"),
  gettext_noop ("Synthpop"),
};
00179 
/* Number of entries in the genre_names table. */
#define GENRE_NAME_COUNT \
    ((unsigned int) (sizeof (genre_names) / sizeof (genre_names[0])))
00182 
00183 
/**
 * Standard atom header as it appears in the input buffer.
 * The code in this file overlays this struct directly onto the raw bytes.
 */
typedef struct
{
  /* total size of the atom in bytes, header included; decoded with
     ntohl().  The value 1 signals that a LongAtom header is used. */
  unsigned int size;
  /* four-character atom type code (compared with memcmp) */
  unsigned int type;
} Atom;
00189 
/**
 * Extended atom header, used when the 32-bit size field of the standard
 * header holds the value 1.
 */
typedef struct
{
  /* occupies the place of Atom.size; holds 1 (network byte order) */
  unsigned int one;
  /* four-character atom type code */
  unsigned int type;
  /* total size of the atom in bytes, header included; decoded with
     ntohll() */
  unsigned long long size;
} LongAtom;
00196 
/**
 * Convert a 64-bit value from network (big-endian) byte order to host
 * byte order.
 *
 * The in-memory bytes of n are folded most-significant byte first, so the
 * result is correct on any host and does not rely on the __BYTE_ORDER /
 * __BIG_ENDIAN macros being defined (an undefined pair compares equal in
 * #if and would silently select the "no swap" path).
 *
 * @param n value whose memory representation is big-endian
 * @return the same quantity in host byte order
 */
static unsigned long long
ntohll (unsigned long long n)
{
  const unsigned char *bytes = (const unsigned char *) &n;
  unsigned long long host = 0;
  unsigned int i;

  for (i = 0; i < sizeof (n); i++)
    host = (host << 8) | bytes[i];
  return host;
}
00206 
00207 static void
00208 addKeyword (EXTRACTOR_KeywordType type,
00209             const char *keyword, struct EXTRACTOR_Keywords **list)
00210 {
00211   EXTRACTOR_KeywordList *result;
00212 
00213   if (keyword == NULL)
00214     return;
00215   result = malloc (sizeof (EXTRACTOR_KeywordList));
00216   result->next = *list;
00217   result->keyword = strdup (keyword);
00218   result->keywordType = type;
00219   *list = result;
00220 }
00221 
00222 
00223 /**
00224  * Check if at position pos there is a valid atom.
00225  * @return 0 if the atom is invalid, 1 if it is valid
00226  */
00227 static int
00228 checkAtomValid (const char *buffer, size_t size, size_t pos)
00229 {
00230   unsigned long long atomSize;
00231   const Atom *atom;
00232   const LongAtom *latom;
00233   if ((pos >= size) ||
00234       (pos + sizeof (Atom) > size) || (pos + sizeof (Atom) < pos))
00235     return 0;
00236   atom = (const Atom *) &buffer[pos];
00237   if (ntohl (atom->size) == 1)
00238     {
00239       if ((pos + sizeof (LongAtom) > size) || (pos + sizeof (LongAtom) < pos))
00240         return 0;
00241       latom = (const LongAtom *) &buffer[pos];
00242       atomSize = ntohll (latom->size);
00243       if ((atomSize < sizeof (LongAtom)) ||
00244           (atomSize + pos > size) || (atomSize + pos < atomSize))
00245         return 0;
00246     }
00247   else
00248     {
00249       atomSize = ntohl (atom->size);
00250       if ((atomSize < sizeof (Atom)) ||
00251           (atomSize + pos > size) || (atomSize + pos < atomSize))
00252         return 0;
00253     }
00254   return 1;
00255 }
00256 
00257 /**
00258  * Assumes that checkAtomValid has already been called.
00259  */
00260 static unsigned long long
00261 getAtomSize (const char *buf)
00262 {
00263   const Atom *atom;
00264   const LongAtom *latom;
00265   atom = (const Atom *) buf;
00266   if (ntohl (atom->size) == 1)
00267     {
00268       latom = (const LongAtom *) buf;
00269       return ntohll (latom->size);
00270     }
00271   return ntohl (atom->size);
00272 }
00273 
00274 /**
00275  * Assumes that checkAtomValid has already been called.
00276  */
00277 static unsigned int
00278 getAtomHeaderSize (const char *buf)
00279 {
00280   const Atom *atom;
00281 
00282   atom = (const Atom *) buf;
00283   if (ntohl (atom->size) == 1)
00284     return sizeof (const LongAtom);
00285   return sizeof (Atom);
00286 }
00287 
/**
 * Signature shared by all atom handlers.
 * Assumes that checkAtomValid has already been called for the atom
 * located at input[pos].
 *
 * @param input buffer containing the atom
 * @param size  number of bytes in input
 * @param pos   offset of the atom's header within input
 * @param list  keyword list that extracted metadata is added to
 * @return 0 on error, 1 for success, -1 for unknown atom type
 */
typedef int (*AtomHandler) (const char *input,
                            size_t size,
                            size_t pos, struct EXTRACTOR_Keywords ** list);
00294 
/**
 * One row of a handler table: associates an atom type with the function
 * that processes atoms of that type.
 */
typedef struct
{
  /* four-character atom type code, e.g. "moov" */
  char *name;
  /* function invoked for atoms of that type */
  AtomHandler handler;
} HandlerEntry;
00300 
/**
 * Call the handler for the atom at the given position.
 * Will check validity of the given atom.
 *
 * @param handlers table used to look up the handler for the atom
 * @param input    buffer containing the atom
 * @param size     number of bytes in input
 * @param pos      offset of the atom within input
 * @param list     keyword list that extracted metadata is added to
 * @return 0 on error, 1 for success, -1 for unknown atom type
 */
static int handleAtom (HandlerEntry *handlers,
                       const char *input,
                       size_t size,
                       size_t pos, struct EXTRACTOR_Keywords **list);

/* Forward declarations of the handler tables: the handler functions below
   refer to them before the tables themselves are defined. */
static HandlerEntry all_handlers[];
static HandlerEntry ilst_handlers[];
00314 
00315 /**
00316  * Process atoms.
00317  * @return 0 on error, 1 for success, -1 for unknown atom type
00318  */
00319 static int
00320 processAtoms (HandlerEntry *handlers, const char *input,
00321                  size_t size, struct EXTRACTOR_Keywords **list)
00322 {
00323   size_t pos;
00324 
00325   if (size < sizeof (Atom))
00326     return 1;
00327   pos = 0;
00328   while (pos < size - sizeof (Atom))
00329     {
00330       if (0 == handleAtom (handlers, input, size, pos, list))
00331         return 0;
00332       pos += getAtomSize (&input[pos]);
00333     }
00334   return 1;
00335 }
00336 
/**
 * Process all atoms in input using the all_handlers table.
 * Thin wrapper around processAtoms.
 *
 * @param input buffer holding a sequence of atoms
 * @param size  number of bytes in input
 * @param list  keyword list that extracted metadata is added to
 * @return the result of processAtoms: 0 on error, 1 for success
 */
static int
processAllAtoms (const char *input,
                 size_t size, struct EXTRACTOR_Keywords **list)
{
  return processAtoms(all_handlers, input, size, list);
}
00347 
/**
 * Handle the moov atom: a container whose payload is itself a sequence
 * of atoms, processed with the all_handlers table.
 * @return 0 on error, 1 for success
 */
static int
moovHandler (const char *input,
             size_t size, size_t pos, struct EXTRACTOR_Keywords **list)
{
  const char *atom = &input[pos];
  unsigned int headerLen = getAtomHeaderSize (atom);
  unsigned long long payloadLen = getAtomSize (atom) - headerLen;

  return processAllAtoms (atom + headerLen, payloadLen, list);
}
00360 
/**
 * Layout of the start of an 'ftyp' (file type) atom.
 * see http://developer.apple.com/documentation/QuickTime/QTFF/QTFFChap1/chapter_2_section_5.html
 */
typedef struct
{
  /* standard atom header */
  Atom header;
  /* major brand */
  char type[4];
  /* minor version */
  unsigned int version;
  /* compatible brands (only the first entry is represented here) */
  char compatibility[4];
} FileType;
00372 
/**
 * Maps a four-character 'ftyp' major brand to the MIME type to report.
 */
typedef struct
{
  /* four-character brand code, compared against FileType.type */
  const char *ext;
  /* MIME type reported for that brand */
  const char *mime;
} C2M;
00378 
/*
 * Known major brands and their MIME types; terminated by a {NULL, NULL}
 * entry.
 * see http://www.mp4ra.org/filetype.html
 *     http://www.ftyps.com/
 */
static C2M ftMap[] = {
  {"qt  ", "video/quicktime"},
  {"isom", "video/mp4"},        /* ISO Base Media files */
  {"iso2", "video/mp4"},
  {"mp41", "video/mp4"},        /* MPEG-4 (ISO/IEC 14496-1) version 1 */
  {"mp42", "video/mp4"},        /* MPEG-4 (ISO/IEC 14496-1) version 2 */
  {"3gp1", "video/3gpp"},
  {"3gp2", "video/3gpp"},
  {"3gp3", "video/3gpp"},
  {"3gp4", "video/3gpp"},
  {"3gp5", "video/3gpp"},
  {"3g2a", "video/3gpp2"},
  {"mmp4", "video/mp4"},        /* Mobile MPEG-4 */
  {"M4A ", "audio/mp4"},
  {"M4B ", "audio/mp4"},
  {"M4P ", "audio/mp4"},
  {"M4V ", "video/mp4"},
  {"mj2s", "video/mj2"},        /* Motion JPEG 2000 */
  {"mjp2", "video/mj2"},
  {NULL, NULL},
};
00402 
00403 static int
00404 ftypHandler (const char *input,
00405              size_t size, size_t pos, struct EXTRACTOR_Keywords **list)
00406 {
00407   const FileType *ft;
00408   int i;
00409 
00410   if (getAtomSize (&input[pos]) < sizeof (FileType)) {
00411     return 0;
00412   }
00413   ft = (const FileType *) &input[pos];
00414 
00415   i = 0;
00416   while ((ftMap[i].ext != NULL) && (0 != memcmp (ft->type, ftMap[i].ext, 4)))
00417     i++;
00418   if (ftMap[i].ext != NULL)
00419     addKeyword (EXTRACTOR_MIMETYPE, ftMap[i].mime, list);
00420   return 1;
00421 }
00422 
/**
 * Layout of the 'mvhd' (movie header) atom as read by mvhdHandler.
 * Multi-byte fields are decoded with ntohl() where they are used.
 */
typedef struct
{
  Atom hdr;
  unsigned char version;
  unsigned char flags[3];
  /* in seconds since midnight, January 1, 1904 */
  unsigned int creationTime;
  /* in seconds since midnight, January 1, 1904 */
  unsigned int modificationTime;
  /* number of time units that pass per second in the movies time
     coordinate system */
  unsigned int timeScale;
  /* A time value that indicates the duration of the movie in time
     scale units. */
  unsigned int duration;
  unsigned int preferredRate;
  /* A 16-bit fixed-point number that specifies how loud to
     play. 1.0 indicates full volume */
  unsigned short preferredVolume;
  unsigned char reserved[10];
  unsigned char matrix[36];
  unsigned int previewTime;
  unsigned int previewDuration;
  unsigned int posterTime;
  unsigned int selectionTime;
  unsigned int selectionDuration;
  unsigned int currentTime;
  unsigned int nextTrackId;
} MovieHeaderAtom;
00452 
00453 static int
00454 mvhdHandler (const char *input,
00455              size_t size, size_t pos, struct EXTRACTOR_Keywords **list)
00456 {
00457   const MovieHeaderAtom *m;
00458   char duration[16];
00459   if (getAtomSize (&input[pos]) != sizeof (MovieHeaderAtom))
00460     return 0;
00461   m = (const MovieHeaderAtom *) &input[pos];
00462   snprintf (duration, 16, "%us", ntohl (m->duration) / ntohl (m->timeScale));
00463   addKeyword (EXTRACTOR_DURATION, duration, list);
00464   return 1;
00465 }
00466 
/**
 * Layout of the start of a 'cmov' (compressed movie) atom as read by
 * cmovHandler.  The compressed data follows directly after this struct.
 */
typedef struct
{
  /* header of the enclosing 'cmov' atom */
  Atom cmovAtom;
  /* header of the 'dcom' atom naming the compressor */
  Atom dcomAtom;
  /* compressor id; cmovHandler only accepts "zlib" */
  char compressor[4];
  /* header of the 'cmvd' atom holding the compressed data */
  Atom cmvdAtom;
  /* size of the data after decompression, in bytes (network order) */
  unsigned int decompressedSize;
} CompressedMovieHeaderAtom;
00475 
00476 static int
00477 cmovHandler (const char *input,
00478              size_t size, size_t pos, struct EXTRACTOR_Keywords **list)
00479 {
00480   const CompressedMovieHeaderAtom *c;
00481   unsigned int s;
00482   char *buf;
00483   int ret;
00484   z_stream z_state;
00485   int z_ret_code;
00486 
00487 
00488   if (getAtomSize (&input[pos]) < sizeof (CompressedMovieHeaderAtom))
00489     return 0;
00490   c = (const CompressedMovieHeaderAtom *) &input[pos];
00491   if ((ntohl (c->dcomAtom.size) != 12) ||
00492       (0 != memcmp (&c->dcomAtom.type, "dcom", 4)) ||
00493       (0 != memcmp (c->compressor, "zlib", 4)) ||
00494       (0 != memcmp (&c->cmvdAtom.type, "cmvd", 4)) ||
00495       (ntohl (c->cmvdAtom.size) !=
00496        getAtomSize (&input[pos]) - sizeof (Atom) * 2 - 4))
00497     {
00498       return 0;                 /* dcom must be 12 bytes */
00499     }
00500   s = ntohl (c->decompressedSize);
00501   if (s > 16 * 1024 * 1024)
00502     return 1;                   /* ignore, too big! */
00503   buf = malloc (s);
00504   if (buf == NULL)
00505     return 1;                   /* out of memory, handle gracefully */
00506 
00507   z_state.next_in = (unsigned char *) &c[1];
00508   z_state.avail_in = ntohl (c->cmvdAtom.size);
00509   z_state.avail_out = s;
00510   z_state.next_out = (unsigned char *) buf;
00511   z_state.zalloc = (alloc_func) 0;
00512   z_state.zfree = (free_func) 0;
00513   z_state.opaque = (voidpf) 0;
00514   z_ret_code = inflateInit (&z_state);
00515   if (Z_OK != z_ret_code)
00516     {
00517       free (buf);
00518       return 0;                 /* crc error? */
00519     }
00520   z_ret_code = inflate (&z_state, Z_NO_FLUSH);
00521   if ((z_ret_code != Z_OK) && (z_ret_code != Z_STREAM_END))
00522     {
00523       free (buf);
00524       return 0;                 /* decode error? */
00525     }
00526   z_ret_code = inflateEnd (&z_state);
00527   if (Z_OK != z_ret_code)
00528     {
00529       free (buf);
00530       return 0;                 /* decode error? */
00531     }
00532   ret = handleAtom (all_handlers, buf, s, 0, list);
00533   free (buf);
00534   return ret;
00535 }
00536 
/**
 * Fixed-point number split into two 16-bit halves, as used by TrackAtom.
 */
typedef struct
{
  /* integer part (tkhdHandler decodes it with ntohs) */
  short integer;
  /* fractional part (not read by the code shown here) */
  short fraction;
} Fixed;
00542 
/**
 * Layout of the 'tkhd' (track header) atom as read by tkhdHandler.
 */
typedef struct
{
  Atom hdr;
  unsigned int flags;           /* 1 byte of version, 3 bytes of flags */
  /* in seconds since midnight, January 1, 1904 */
  unsigned int creationTime;
  /* in seconds since midnight, January 1, 1904 */
  unsigned int modificationTime;
  unsigned int trackID;
  unsigned int reserved_0;
  unsigned int duration;
  unsigned int reserved_1;
  unsigned int reserved_2;
  unsigned short layer;
  unsigned short alternate_group;
  unsigned short volume;
  unsigned short reserved_3;
  Fixed matrix[3][3];
  /* in pixels */
  Fixed track_width;
  /* in pixels */
  Fixed track_height;
} TrackAtom;
00566 
00567 static int
00568 tkhdHandler (const char *input,
00569              size_t size, size_t pos, struct EXTRACTOR_Keywords **list)
00570 {
00571   const TrackAtom *m;
00572   char dimensions[40];
00573 
00574   if (getAtomSize (&input[pos]) < sizeof (TrackAtom))
00575     return 0;
00576   m = (const TrackAtom *) &input[pos];
00577   if (ntohs (m->track_width.integer) != 0)
00578     {
00579       /* if actually a/the video track */
00580       snprintf (dimensions,
00581                 40,
00582                 "%dx%d",
00583                 ntohs (m->track_width.integer),
00584                 ntohs (m->track_height.integer));
00585       addKeyword (EXTRACTOR_FORMAT, dimensions, list);
00586     }
00587   return 1;
00588 }
00589 
/**
 * Handle the 'trak' atom: a container whose payload is processed as a
 * sequence of atoms with the all_handlers table.
 * @return 0 on error, 1 for success
 */
static int
trakHandler (const char *input,
             size_t size, size_t pos, struct EXTRACTOR_Keywords **list)
{
  const char *atom = &input[pos];
  unsigned int headerLen = getAtomHeaderSize (atom);
  unsigned long long payloadLen = getAtomSize (atom) - headerLen;

  return processAllAtoms (atom + headerLen, payloadLen, list);
}
00598 
/**
 * Handle the 'meta' atom: skip the atom header plus the 4 bytes that
 * follow it, then process the remainder as a sequence of atoms.
 *
 * @return 0 if the atom is too short to hold those 4 bytes or a child
 *         reports an error, 1 for success
 */
static int
metaHandler (const char *input,
             size_t size, size_t pos, struct EXTRACTOR_Keywords **list)
{
  const char *atom = &input[pos];
  unsigned int skip = getAtomHeaderSize (atom) + 4;
  unsigned long long total = getAtomSize (atom);

  if (total < skip)
    return 0;
  return processAllAtoms (atom + skip, total - skip, list);
}
00609 
/**
 * Start of a user-data text atom ("international text"): the atom header
 * followed by a 16-bit text length and a 16-bit language code.  The text
 * bytes follow directly after this struct.
 */
typedef struct
{
  Atom header;
  /* length of the text in bytes (decoded with ntohs) */
  unsigned short length;
  /* language code, used as index into the languages table (ntohs) */
  unsigned short language;
} InternationalText;
00616 
/*
 * Language names, indexed by the 16-bit language code found in an
 * InternationalText header (see processTextTag).
 */
static const char *languages[] = {
  "English",
  "French",
  "German",
  "Italian",
  "Dutch",
  "Swedish",
  "Spanish",
  "Danish",
  "Portuguese",
  "Norwegian",
  "Hebrew",
  "Japanese",
  "Arabic",
  "Finnish",
  "Greek",
  "Icelandic",
  "Maltese",
  "Turkish",
  "Croatian",
  "Traditional Chinese",
  "Urdu",
  "Hindi",
  "Thai",
  "Korean",
  "Lithuanian",
  "Polish",
  "Hungarian",
  "Estonian",
  "Lettish",
  "Saamisk",
  "Lappish",
  "Faeroese",
  "Farsi",
  "Russian",
  "Simplified Chinese",
  "Flemish",
  "Irish",
  "Albanian",
  "Romanian",
  "Czech",
  "Slovak",
  "Slovenian",
  "Yiddish",
  "Serbian",
  "Macedonian",
  "Bulgarian",
  "Ukrainian",
  "Byelorussian",
  "Uzbek",
  "Kazakh",
  "Azerbaijani",
  "AzerbaijanAr",
  "Armenian",
  "Georgian",
  "Moldavian",
  "Kirghiz",
  "Tajiki",
  "Turkmen",
  "Mongolian",
  "MongolianCyr",
  "Pashto",
  "Kurdish",
  "Kashmiri",
  "Sindhi",
  "Tibetan",
  "Nepali",
  "Sanskrit",
  "Marathi",
  "Bengali",
  "Assamese",
  "Gujarati",
  "Punjabi",
  "Oriya",
  "Malayalam",
  "Kannada",
  "Tamil",
  "Telugu",
  "Sinhalese",
  "Burmese",
  "Khmer",
  "Lao",
  "Vietnamese",
  "Indonesian",
  "Tagalog",
  "MalayRoman",
  "MalayArabic",
  "Amharic",
  "Tigrinya",
  "Galla",
  "Oromo",
  "Somali",
  "Swahili",
  "Ruanda",
  "Rundi",
  "Chewa",
  "Malagasy",
  "Esperanto",
  "Welsh",
  "Basque",
  "Catalan",
  "Latin",
  "Quechua",
  "Guarani",
  "Aymara",
  "Tatar",
  "Uighur",
  "Dzongkha",
  "JavaneseRom",
};
00727 
00728 /*
00729  * see http://developer.apple.com/documentation/QuickTime/QTFF/QTFFChap2/chapter
00730 _3_section_2.html
00731  *   "User Data Text Strings and Language Codes"
00732  * TODO: make conformant
00733  */
00734 static int
00735 processTextTag (const char *input,
00736                 size_t size,
00737                 size_t pos,
00738                 EXTRACTOR_KeywordType type, struct EXTRACTOR_Keywords **list)
00739 {
00740   unsigned long long as;
00741   unsigned short len;
00742   unsigned short lang;
00743   const InternationalText *txt;
00744   char *meta;
00745   int i;
00746 
00747   /* contains "international text":
00748      16-bit size + 16 bit language code */
00749   as = getAtomSize (&input[pos]);
00750   if (as < sizeof (InternationalText))
00751     return 0;                   /* invalid */
00752   txt = (const InternationalText *) &input[pos];
00753   len = ntohs (txt->length);
00754   if (len + sizeof (InternationalText) > as)
00755     return 0;                   /* invalid */
00756   lang = ntohs (txt->language);
00757   if (lang >= sizeof (languages) / sizeof (char *))
00758     return 0;                   /* invalid */
00759   addKeyword (EXTRACTOR_LANGUAGE, languages[lang], list);
00760 
00761   meta = malloc (len + 1);
00762   memcpy (meta, &txt[1], len);
00763   meta[len] = '\0';
00764   for (i = 0; i < len; i++)
00765     if (meta[i] == '\r')
00766       meta[i] = '\n';
00767   addKeyword (type, meta, list);
00768   free (meta);
00769   return 1;
00770 }
00771 
/**
 * Maps the three characters that follow the first byte of a user-data
 * atom type to the keyword type used for its text (see c_Handler).
 */
typedef struct CHE
{
  /* three-character suffix of the atom type */
  const char *pfx;
  /* keyword type reported for atoms with that suffix */
  EXTRACTOR_KeywordType type;
} CHE;
00777 
/* Lookup table used by c_Handler; terminated by an entry with a NULL
   pfx. */
static CHE cHm[] = {
  {"aut", EXTRACTOR_AUTHOR,},
  {"cpy", EXTRACTOR_COPYRIGHT,},
  {"day", EXTRACTOR_CREATION_DATE,},
  {"cmt", EXTRACTOR_COMMENT,},
  {"hst", EXTRACTOR_BUILDHOST,},
  {"inf", EXTRACTOR_INFORMATION,},
  {"nam", EXTRACTOR_FULL_NAME,},
  {"mak", EXTRACTOR_CAMERA_MAKE,},
  {"mod", EXTRACTOR_CAMERA_MODEL,},
  {"des", EXTRACTOR_DESCRIPTION,},
  {"dis", EXTRACTOR_DISCLAIMER,},
  {"dir", EXTRACTOR_MOVIE_DIRECTOR,},
  {"src", EXTRACTOR_CONTRIBUTOR,},
  {"prf", EXTRACTOR_ARTIST,},   /* performer */
  {"req", EXTRACTOR_CREATED_FOR,},      /* hardware requirements */
  {"fmt", EXTRACTOR_FORMAT,},
  {"prd", EXTRACTOR_PRODUCER,},
  {"PRD", EXTRACTOR_PRODUCTVERSION,},   /* just product */
  {"swr", EXTRACTOR_SOFTWARE,},
  {"wrt", EXTRACTOR_AUTHOR,},   /* writer */
  {"wrn", EXTRACTOR_WARNING,},
  {"ed1", EXTRACTOR_REVISION_HISTORY,},
  {"ed2", EXTRACTOR_REVISION_HISTORY,},
  {"ed3", EXTRACTOR_REVISION_HISTORY,},
  {"ed4", EXTRACTOR_REVISION_HISTORY,},
  {"ed5", EXTRACTOR_REVISION_HISTORY,},
  {"ed6", EXTRACTOR_REVISION_HISTORY,},
  {"ed7", EXTRACTOR_REVISION_HISTORY,},
  {"ed8", EXTRACTOR_REVISION_HISTORY,},
  {"ed9", EXTRACTOR_REVISION_HISTORY,},
  {"chp", EXTRACTOR_CHAPTER,},
  {NULL, EXTRACTOR_UNKNOWN},
};
00812 
00813 static int
00814 c_Handler (const char *input,
00815            size_t size, size_t pos, struct EXTRACTOR_Keywords **list)
00816 {
00817   int i;
00818 
00819   i = 0;
00820   while ((cHm[i].pfx != NULL) && (0 != memcmp (&input[pos+5], cHm[i].pfx, 3)))
00821     i++;
00822   if (cHm[i].pfx != NULL)
00823     return processTextTag (input, size, pos, cHm[i].type, list);
00824   return -1;                    /* not found */
00825 }
00826 
/**
 * Handle the 'udta' atom: a container whose payload is processed as a
 * sequence of atoms with the all_handlers table.
 * @return 0 on error, 1 for success
 */
static int
udtaHandler (const char *input,
             size_t size, size_t pos, struct EXTRACTOR_Keywords **list)
{
  const char *atom = &input[pos];
  unsigned int headerLen = getAtomHeaderSize (atom);
  unsigned long long payloadLen = getAtomSize (atom) - headerLen;

  return processAllAtoms (atom + headerLen, payloadLen, list);
}
00835 
00836 static int
00837 processDataAtom (const char *input,
00838                 size_t size, /* parent atom size */
00839                 size_t pos,
00840                 const char *patom,
00841                 EXTRACTOR_KeywordType type,
00842                 struct EXTRACTOR_Keywords **list)
00843 {
00844   char *meta;
00845   unsigned char version;
00846   unsigned int flags;
00847   unsigned long long asize;
00848   unsigned int len;
00849   unsigned int hdr;
00850   int i;
00851 
00852   hdr = getAtomHeaderSize (&input[pos]);
00853   asize = getAtomSize (&input[pos]);
00854   if (memcmp(&input[pos+4], "data", 4) != 0)
00855     return -1;
00856 
00857   if (asize < hdr + 8 || /* header + u32 flags + u32 reserved */
00858       asize > (getAtomSize(&patom[0]) - 8))
00859     return 0;
00860 
00861   len = (unsigned int)(asize - (hdr + 8));
00862 
00863   version = input[pos+8];
00864   flags = ((unsigned char)input[pos+9]<<16) |
00865           ((unsigned char)input[pos+10]<<8) | 
00866           (unsigned char)input[pos+11];
00867 #if DEBUG
00868   printf("[data] version:%02x flags:%08x txtlen:%d\n", version, flags, len);
00869 #endif
00870 
00871   if (version != 0)
00872     return -1;
00873 
00874   if (flags == 0x0) { /* binary data */
00875     if (memcmp(&patom[4], "gnre", 4) == 0) {
00876       if (len >= 2) {
00877         unsigned short genre = ((unsigned char)input[pos+16] << 8) |
00878                                 (unsigned char)input[pos+17];
00879         if (genre > 0 && genre < GENRE_NAME_COUNT)
00880           addKeyword(type, genre_names[genre-1], list);
00881       }
00882       return 1;
00883     }
00884     else if ((memcmp(&patom[4], "trkn", 4) == 0) || 
00885         (memcmp(&patom[4], "disk", 4) == 0)) {
00886       if (len >= 4) {
00887         unsigned short n = ((unsigned char)input[pos+18] << 8) |
00888                             (unsigned char)input[pos+19];
00889         char s[8];
00890         snprintf(s, 8, "%d", n);
00891         addKeyword(type, s, list);
00892       }
00893     }
00894     else {
00895       return -1;
00896     }
00897   }
00898   else if (flags == 0x1) { /* text data */
00899     meta = malloc (len + 1);
00900     memcpy (meta, &input[pos+16], len);
00901     meta[len] = '\0';
00902     for (i = 0; i < len; i++)
00903       if (meta[i] == '\r')
00904         meta[i] = '\n';
00905     addKeyword (type, meta, list);
00906     free (meta);
00907     return 1;
00908   }
00909 
00910   return -1;
00911 }
00912 
/**
 * Maps the four-character type of an iTunes tag atom to the keyword type
 * under which its value is reported.
 */
typedef struct
{
  /* four-character atom type, compared with memcmp */
  const char *atom_type;
  /* keyword type reported for that atom */
  EXTRACTOR_KeywordType type;
} ITTagConversionEntry;
00918 
/* iTunes Tags:
 * see http://atomicparsley.sourceforge.net/mpeg-4files.html
 *
 * Lookup table used by iTunesTagHandler; terminated by an entry with a
 * NULL atom_type.  "\xa9" is the first byte of several tag names and is
 * kept as a separate literal so that the hex escape does not swallow the
 * letters that follow it. */
static ITTagConversionEntry it_to_extr_table[] = {
  {"\xa9" "alb", EXTRACTOR_ALBUM,},
  {"\xa9" "ART", EXTRACTOR_ARTIST,},
  {"aART", EXTRACTOR_ARTIST,},
  {"\xa9" "cmt", EXTRACTOR_COMMENT,},
  {"\xa9" "day", EXTRACTOR_YEAR,},
  {"\xa9" "nam", EXTRACTOR_TITLE,},
  {"trkn", EXTRACTOR_TRACK_NUMBER,},
  {"disk", EXTRACTOR_DISC_NUMBER,},
  {"\xa9" "gen", EXTRACTOR_GENRE,},
  {"gnre", EXTRACTOR_GENRE,},
  {"\xa9" "wrt", EXTRACTOR_AUTHOR,},
  {"\xa9" "too", EXTRACTOR_ENCODED_BY,},
  {"cprt", EXTRACTOR_COPYRIGHT,},
  {"\xa9" "grp", EXTRACTOR_GROUP,},
  {"catg", EXTRACTOR_CATEGORY,},
  {"keyw", EXTRACTOR_KEYWORDS,},
  {"desc", EXTRACTOR_DESCRIPTION,},
  {"tvnn", EXTRACTOR_PUBLISHER,}, /* TV Network Name */
  {"tvsh", EXTRACTOR_TITLE,}, /* TV Show Name */
/*  {"tven", EXTRACTOR_i,},*/ /* TV Network Name */
  {NULL, EXTRACTOR_UNKNOWN},
};
00944 
00945 /* NOTE: iTunes tag processing should, in theory, be limited to iTunes
00946  * file types (from ftyp), but, in reality, it seems that there are other
00947  * files, like 3gpp, out in the wild with iTunes tags. */
00948 static int
00949 iTunesTagHandler (const char *input,
00950            size_t size, size_t pos, struct EXTRACTOR_Keywords **list)
00951 {
00952   unsigned long long asize;
00953   unsigned int hdr;
00954   int i;
00955 
00956   hdr = getAtomHeaderSize (&input[pos]);
00957   asize = getAtomSize (&input[pos]);
00958 
00959   if (asize < hdr + 8) /* header + at least one atom */
00960     return 0;
00961 
00962   i = 0;
00963   while ((it_to_extr_table[i].atom_type != NULL) && 
00964          (0 != memcmp (&input[pos+4], it_to_extr_table[i].atom_type, 4)))
00965     i++;
00966   if (it_to_extr_table[i].atom_type != NULL)
00967     return processDataAtom(input, asize, pos+hdr, &input[pos],  
00968                            it_to_extr_table[i].type, list);
00969 
00970   return -1;
00971 }
00972 
00973 
00974 static int 
00975 ilstHandler (const char *input,
00976              size_t size, size_t pos, struct EXTRACTOR_Keywords **list)
00977 {
00978   unsigned int hdr = getAtomHeaderSize (&input[pos]);
00979   return processAtoms(ilst_handlers, &input[pos + hdr],
00980                       getAtomSize(&input[pos]) - hdr, list);
00981 }
00982 
00983 
00984 static HandlerEntry all_handlers[] = {
00985   {"moov", &moovHandler},
00986   {"cmov", &cmovHandler},
00987   {"mvhd", &mvhdHandler},
00988   {"trak", &trakHandler},
00989   {"tkhd", &tkhdHandler},
00990   {"ilst", &ilstHandler},
00991   {"meta", &