1"""Guess the MIME type of a file.
2
3This module defines two useful functions:
4
5guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.
6
7guess_extension(type, strict=True) -- guess the extension for a given MIME type.
8
9It also contains the following, for tuning the behavior:
10
11Data:
12
13knownfiles -- list of files to parse
14inited -- flag set when init() has been called
15suffix_map -- dictionary mapping suffixes to suffixes
16encodings_map -- dictionary mapping suffixes to encodings
17types_map -- dictionary mapping suffixes to types
18
19Functions:
20
21init([files]) -- parse a list of files, default knownfiles (on Windows, the
22  default values are taken from the registry)
23read_mime_types(file) -- parse one file, return a dictionary or None
24"""
25
26import os
27import sys
28import posixpath
29import urllib.parse
30try:
31    import winreg as _winreg
32except ImportError:
33    _winreg = None
34
35__all__ = [
36    "knownfiles", "inited", "MimeTypes",
37    "guess_type", "guess_all_extensions", "guess_extension",
38    "add_type", "init", "read_mime_types",
39    "suffix_map", "encodings_map", "types_map", "common_types"
40]
41
# Candidate mime.types files.  init() reads the ones that exist, in this
# order, so an extension defined in a later file replaces the type given to
# it by an earlier one.  Each path is listed once so that no file is parsed
# twice.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Set to True by init(); MimeTypes.__init__() consults it to decide whether
# init() still has to run.
inited = False
# Shared MimeTypes instance used by the module-level functions; assigned at
# the end of init().
_db = None
56
57
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- dictionary mapping suffixes to encodings
    suffix_map -- dictionary mapping suffixes to suffixes
    types_map -- pair of dictionaries (non-strict, strict) mapping
                 suffixes to types; index it with the `strict' flag
    types_map_inv -- pair of dictionaries (non-strict, strict) mapping
                     types to lists of suffixes
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded with the module-level default tables.

        The module is initialised first (via init()) if that has not
        happened yet.  Each file named in `filenames' is then read with
        read(); `strict' selects whether the entries of those files go
        to the standard or to the non-standard types.
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = ({}, {}) # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, type) in types_map.items():
            self.add_type(type, ext, True)
        for (ext, type) in common_types.items():
            self.add_type(type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        # Split off a leading "scheme:" by hand instead of going through the
        # deprecated urllib.parse.splittype().  A scheme is the non-empty
        # text before the first ':' provided it contains no '/'; it is
        # compared in lower case.
        scheme = None
        colon = url.find(':')
        if colon > 0 and '/' not in url[:colon]:
            scheme, url = url[:colon].lower(), url[colon + 1:]
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                type = url[:semi]
            else:
                type = url[:comma]
            if '=' in type or '/' not in type:
                type = 'text/plain'
            return type, None           # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until the
        # remaining suffix is no longer an alias.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        # Standard types are always consulted first.
        types_map = self.types_map[True]
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        elif strict:
            return None, encoding
        # Not strict: fall back to the non-standard types.
        types_map = self.types_map[False]
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        else:
            return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().  The
        list is empty when no extension is known for `type'; it is a new
        list on every call, so the caller may modify it freely.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Work on a copy: appending the non-standard extensions to the list
        # stored in types_map_inv[True] would leak them into every later
        # strict lookup.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        The file is opened as UTF-8 text and handed to readfp().

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename, encoding='utf-8') as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' only needs a readline() method.  On each line everything
        from the first word starting with '#' onwards is ignored; of what
        remains, the first word is the type and every following word is
        an extension (without the leading dot) registered for it.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop a trailing comment: the first word beginning with '#'
            # and everything after it.
            for i, word in enumerate(words):
                if word[0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when the winreg module is not available.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        # Windows only
        if not _winreg:
            return

        def enum_types(mimedb):
            # Yield the names of the subkeys of `mimedb' until EnumKey()
            # reports (with OSError) that there are no more, leaving out
            # names containing a NUL character.
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except OSError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                # Only check file extensions; other keys are skipped
                # without being opened at all.
                if not subkeyname.startswith("."):
                    continue
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # raises OSError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        self.add_type(mimetype, subkeyname, strict)
                except OSError:
                    continue
270
def guess_type(url, strict=True):
    """Guess the MIME type and encoding of a file from its URL.

    Convenience wrapper around the shared module-level database: the
    database is created with init() on first use and the question is then
    passed on to its guess_type() method.

    The result is a tuple (type, encoding).  type is a string of the form
    type/subtype, usable for a MIME Content-type header, or None when the
    suffix is missing or unknown.  encoding is the name of the program
    used to encode the file (e.g. compress or gzip), or None when there is
    no encoding.  All mappings are table driven.  Encoding suffixes are
    matched case sensitively; type suffixes are tried case sensitively
    first and case insensitively afterwards.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all treated as
    ".tar.gz"; this too is table driven, through the dictionary
    suffix_map.

    When the optional `strict' argument is false, a bunch of commonly
    found but non-standard types are considered as well.
    """
    db = _db
    if db is None:
        # First use: build the shared database, then pick it up.
        init()
        db = _db
    return db.guess_type(url, strict)
292
293
def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', the
    list is empty.

    The lookup is delegated to the shared module-level database, which is
    created with init() on first use.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)
310
def guess_extension(type, strict=True):
    """Guess a filename extension for the MIME type `type'.

    Convenience wrapper around the shared module-level database, which is
    created with init() on first use.

    The result is a string holding a filename extension, leading dot ('.')
    included, or None when no extension can be guessed for `type'.  The
    extension need not ever have been associated with a particular data
    stream, but guess_type() would map it to the MIME type `type'.

    When the optional `strict' argument is false, a bunch of commonly
    found but non-standard types are considered as well.
    """
    db = _db
    if db is None:
        # First use: build the shared database, then pick it up.
        init()
        db = _db
    return db.guess_extension(type, strict)
326
def add_type(type, ext, strict=True):
    """Register the extension `ext' for the MIME type `type'.

    Convenience wrapper around the shared module-level database, which is
    created with init() on first use.

    An extension that is already known gets the new type in place of its
    old one.  A type that is already known gets the extension added to
    its list of known extensions.

    With a true `strict' the mapping is added to the standard types,
    otherwise to the non-standard types.
    """
    db = _db
    if db is None:
        # First use: build the shared database, then pick it up.
        init()
        db = _db
    return db.add_type(type, ext, strict)
342
343
def init(files=None):
    """Build a new MIME database and publish it through the module globals.

    A new MimeTypes instance is created.  When `files' is None the
    instance is first fed from the Windows registry (if the winreg module
    is available) and `files' defaults to knownfiles.  Every entry of
    `files' that names an existing regular file is then read.

    Afterwards the globals encodings_map, suffix_map, types_map and
    common_types are rebound to the tables of the new instance, and the
    instance becomes the shared database used by the module-level
    functions.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    # Raise the flag before instantiating: MimeTypes.__init__() calls
    # init() whenever it finds the flag unset.
    inited = True
    fresh = MimeTypes()
    if files is None:
        if _winreg:
            fresh.read_windows_registry()
        files = knownfiles
    # filter() is lazy, so each path is tested right before it is read.
    for path in filter(os.path.isfile, files):
        fresh.read(path)
    # types_map on the instance is the pair (non-strict, strict).
    common_types, types_map = fresh.types_map
    suffix_map = fresh.suffix_map
    encodings_map = fresh.encodings_map
    # Expose the database only now that it is fully initialized.
    _db = fresh
362
363
def read_mime_types(file):
    """Parse one mime.types-format file, return a dictionary or None.

    `file' is the pathname of the file.  It is read into a fresh MimeTypes
    datastore as standard (strict) types, and that datastore's dictionary
    mapping suffixes (with the leading dot) to types is returned.  None is
    returned when the file cannot be opened.
    """
    try:
        # Decode as UTF-8, like MimeTypes.read(), rather than with whatever
        # the locale's default encoding happens to be.
        f = open(file, encoding='utf-8')
    except OSError:
        return None
    with f:
        db = MimeTypes()
        db.readfp(f, True)
        return db.types_map[True]
373
374
def _default_mime_types():
    """(Re)bind the four module-level default tables.

    Sets the globals suffix_map, encodings_map, types_map and common_types
    to freshly built dictionaries.
    """
    global suffix_map, encodings_map, types_map, common_types

    # Shorthand suffixes and the compound suffix each one stands for.
    suffix_map = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
    }

    # Suffixes that name an encoding rather than a type.
    encodings_map = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
    }

    # Standard types.  A new entry must either be registered with IANA
    # (see http://www.iana.org/assignments/media-types) or be an extension
    # type, i.e. one using the x- prefix.
    #
    # Keep the entries sorted when adding to them.
    types_map = {
        '.a': 'application/octet-stream',
        '.ai': 'application/postscript',
        '.aif': 'audio/x-aiff',
        '.aifc': 'audio/x-aiff',
        '.aiff': 'audio/x-aiff',
        '.au': 'audio/basic',
        '.avi': 'video/x-msvideo',
        '.bat': 'text/plain',
        '.bcpio': 'application/x-bcpio',
        '.bin': 'application/octet-stream',
        '.bmp': 'image/bmp',
        '.c': 'text/plain',
        '.cdf': 'application/x-netcdf',
        '.cpio': 'application/x-cpio',
        '.csh': 'application/x-csh',
        '.css': 'text/css',
        '.csv': 'text/csv',
        '.dll': 'application/octet-stream',
        '.doc': 'application/msword',
        '.dot': 'application/msword',
        '.dvi': 'application/x-dvi',
        '.eml': 'message/rfc822',
        '.eps': 'application/postscript',
        '.etx': 'text/x-setext',
        '.exe': 'application/octet-stream',
        '.gif': 'image/gif',
        '.gtar': 'application/x-gtar',
        '.h': 'text/plain',
        '.hdf': 'application/x-hdf',
        '.htm': 'text/html',
        '.html': 'text/html',
        '.ico': 'image/vnd.microsoft.icon',
        '.ief': 'image/ief',
        '.jpe': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.jpg': 'image/jpeg',
        '.js': 'application/javascript',
        '.json': 'application/json',
        '.ksh': 'text/plain',
        '.latex': 'application/x-latex',
        '.m1v': 'video/mpeg',
        '.m3u': 'application/vnd.apple.mpegurl',
        '.m3u8': 'application/vnd.apple.mpegurl',
        '.man': 'application/x-troff-man',
        '.me': 'application/x-troff-me',
        '.mht': 'message/rfc822',
        '.mhtml': 'message/rfc822',
        '.mif': 'application/x-mif',
        '.mjs': 'application/javascript',
        '.mov': 'video/quicktime',
        '.movie': 'video/x-sgi-movie',
        '.mp2': 'audio/mpeg',
        '.mp3': 'audio/mpeg',
        '.mp4': 'video/mp4',
        '.mpa': 'video/mpeg',
        '.mpe': 'video/mpeg',
        '.mpeg': 'video/mpeg',
        '.mpg': 'video/mpeg',
        '.ms': 'application/x-troff-ms',
        '.nc': 'application/x-netcdf',
        '.nws': 'message/rfc822',
        '.o': 'application/octet-stream',
        '.obj': 'application/octet-stream',
        '.oda': 'application/oda',
        '.p12': 'application/x-pkcs12',
        '.p7c': 'application/pkcs7-mime',
        '.pbm': 'image/x-portable-bitmap',
        '.pdf': 'application/pdf',
        '.pfx': 'application/x-pkcs12',
        '.pgm': 'image/x-portable-graymap',
        '.pl': 'text/plain',
        '.png': 'image/png',
        '.pnm': 'image/x-portable-anymap',
        '.pot': 'application/vnd.ms-powerpoint',
        '.ppa': 'application/vnd.ms-powerpoint',
        '.ppm': 'image/x-portable-pixmap',
        '.pps': 'application/vnd.ms-powerpoint',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.ps': 'application/postscript',
        '.pwz': 'application/vnd.ms-powerpoint',
        '.py': 'text/x-python',
        '.pyc': 'application/x-python-code',
        '.pyo': 'application/x-python-code',
        '.qt': 'video/quicktime',
        '.ra': 'audio/x-pn-realaudio',
        '.ram': 'application/x-pn-realaudio',
        '.ras': 'image/x-cmu-raster',
        '.rdf': 'application/xml',
        '.rgb': 'image/x-rgb',
        '.roff': 'application/x-troff',
        '.rtx': 'text/richtext',
        '.sgm': 'text/x-sgml',
        '.sgml': 'text/x-sgml',
        '.sh': 'application/x-sh',
        '.shar': 'application/x-shar',
        '.snd': 'audio/basic',
        '.so': 'application/octet-stream',
        '.src': 'application/x-wais-source',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc': 'application/x-sv4crc',
        '.svg': 'image/svg+xml',
        '.swf': 'application/x-shockwave-flash',
        '.t': 'application/x-troff',
        '.tar': 'application/x-tar',
        '.tcl': 'application/x-tcl',
        '.tex': 'application/x-tex',
        '.texi': 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.tif': 'image/tiff',
        '.tiff': 'image/tiff',
        '.tr': 'application/x-troff',
        '.tsv': 'text/tab-separated-values',
        '.txt': 'text/plain',
        '.ustar': 'application/x-ustar',
        '.vcf': 'text/x-vcard',
        '.wav': 'audio/x-wav',
        '.webm': 'video/webm',
        '.wiz': 'application/msword',
        '.wsdl': 'application/xml',
        '.xbm': 'image/x-xbitmap',
        '.xlb': 'application/vnd.ms-excel',
        '.xls': 'application/vnd.ms-excel',
        '.xml': 'text/xml',
        '.xpdl': 'application/xml',
        '.xpm': 'image/x-xpixmap',
        '.xsl': 'application/xml',
        '.xwd': 'image/x-xwindowdump',
        '.zip': 'application/zip',
    }

    # Non-standard types that are nevertheless common in the wild.  They
    # only match when the API methods are called with strict=0.
    #
    # Keep these sorted as well.
    common_types = {
        '.jpg': 'image/jpg',
        '.mid': 'audio/midi',
        '.midi': 'audio/midi',
        '.pct': 'image/pict',
        '.pic': 'image/pict',
        '.pict': 'image/pict',
        '.rtf': 'application/rtf',
        '.xul': 'text/xul',
    }


# Populate the default tables as soon as the module is loaded.
_default_mime_types()
550
551
# Command-line interface: guess the type (or, with -e, the extension) for
# every argument given.
if __name__ == '__main__':
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        # Print the help text, then the optional message, and exit with
        # `code' as the status.
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'hle', ['help', 'lenient', 'extension'])
    except getopt.error as msg:
        usage(1, msg)

    # Defaults: strict lookups, guessing types rather than extensions.
    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1

    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if guess:
                print(guess)
            else:
                print("I don't know anything about type", gtype)
        else:
            guess, encoding = guess_type(gtype, strict)
            if guess:
                print('type:', guess, 'encoding:', encoding)
            else:
                print("I don't know anything about type", gtype)
596