"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.

guess_extension(type, strict=True) -- guess the extension for a given MIME type.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types

Functions:

init([files]) -- parse a list of files, default knownfiles (on Windows, the
  default values are taken from the registry)
read_mime_types(file) -- parse one file, return a dictionary or None
"""

import os
import sys
import posixpath
import urllib.parse
try:
    import winreg as _winreg
except ImportError:
    _winreg = None

__all__ = [
    "knownfiles", "inited", "MimeTypes",
    "guess_type", "guess_all_extensions", "guess_extension",
    "add_type", "init", "read_mime_types",
    "suffix_map", "encodings_map", "types_map", "common_types"
]

knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

inited = False
_db = None


class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- private copy of the module-level encodings_map
    suffix_map -- private copy of the module-level suffix_map
    types_map -- pair of dicts (non-strict, strict) mapping ext -> type
    types_map_inv -- pair of dicts (non-strict, strict) mapping
        type -> list of extensions, in registration order
    """

    def __init__(self, filenames=(), strict=True):
        """Seed the store from the module tables, then read `filenames'.

        Each name in `filenames' is parsed with read(); `strict' selects
        which table (standard or non-standard) those files feed.
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        # Both tuples are indexed with the boolean `strict' flag:
        # index 0 (False) holds non-standard types, index 1 (True) the
        # standard ones.
        self.types_map = ({}, {})
        self.types_map_inv = ({}, {})
        for (ext, type) in types_map.items():
            self.add_type(type, ext, True)
        for (ext, type) in common_types.items():
            self.add_type(type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file based on its URL.

        `url' may be a string or a path-like object.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        When the argument carries a URL scheme, only the path component
        is examined; the host name, query string and fragment are
        ignored.  Arguments without a scheme are treated as plain file
        names and are used verbatim.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        url = os.fspath(url)
        try:
            parts = urllib.parse.urlsplit(url)
        except ValueError:
            # Not parseable as a URL (e.g. unbalanced IPv6 brackets);
            # fall back to treating the whole string as a file name.
            parts = None
        # A one-letter "scheme" is a Windows drive letter, not a URL scheme.
        if parts is not None and len(parts.scheme) > 1:
            scheme = parts.scheme
        else:
            scheme = None
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            #
            # Work on the raw text after "data:" -- '?' and '#' may be
            # part of the payload, so the parsed path must not be used.
            url = url.partition(':')[2]
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                type = url[:semi]
            else:
                type = url[:comma]
            if '=' in type or '/' not in type:
                type = 'text/plain'
            return type, None           # never compressed, so encoding is None
        if scheme:
            # Drop netloc, query and fragment so they cannot be mistaken
            # for (or glued onto) the file extension.
            url = parts.path
        base, ext = posixpath.splitext(url)
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        # Standard types always win; the non-standard table is consulted
        # only when the caller asked for a lenient lookup.
        tables = (True,) if strict else (True, False)
        for table in tables:
            mapping = self.types_map[table]
            if ext in mapping:
                return mapping[ext], encoding
            lowered = ext.lower()
            if lowered in mapping:
                return mapping[lowered], encoding
        return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().
        The list is empty when nothing is known about `type'; it is always
        a fresh list that the caller may modify freely.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Copy: appending the non-standard extensions below must not
        # modify the list stored in the strict inverse map.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        The file is decoded as UTF-8.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename, encoding='utf-8') as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' only needs a readline() method returning text.  Each
        non-comment line has the form "type ext1 ext2 ..."; everything
        from the first word starting with '#' to the end of the line is
        ignored.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            for i, word in enumerate(words):
                if word.startswith('#'):
                    # Comment: discard it and the rest of the line.
                    del words[i:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when the winreg module is unavailable.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        # Windows only
        if not _winreg:
            return

        def enum_types(mimedb):
            # Yield subkey names until EnumKey signals the end with OSError.
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except OSError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                # Only check file extensions; test the name first so that
                # unrelated keys are never opened.
                if not subkeyname.startswith("."):
                    continue
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # raises OSError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        self.add_type(mimetype, subkeyname, strict)
                except OSError:
                    continue


def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_type(url, strict)


def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', an empty
    list is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)


def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_extension(type, strict)


def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    When the extension is already known, the new
    type will replace the old one. When the type
    is already known the extension will be added
    to the list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    if _db is None:
        init()
    return _db.add_type(type, ext, strict)


def init(files=None):
    """(Re)build the module-level database and publish its tables.

    A new MimeTypes instance is seeded from the current module tables.
    When `files' is None the Windows registry (if available) and every
    existing file in knownfiles are read; otherwise only the existing
    files named in `files' are read.  Afterwards the module globals
    encodings_map, suffix_map, types_map and common_types refer to the
    new database's tables.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again
    db = MimeTypes()
    if files is None:
        if _winreg:
            db.read_windows_registry()
        files = knownfiles
    for file in files:
        if os.path.isfile(file):
            db.read(file)
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    common_types = db.types_map[False]
    # Make the DB a global variable now that it is fully initialized
    _db = db


def read_mime_types(file):
    """Parse one mime.types-format file and return its strict type map.

    Return a dictionary mapping extensions (with leading dot) to MIME
    types, containing the default standard types plus those read from
    `file', or None when the file cannot be opened.  The file is
    decoded as UTF-8, the same as MimeTypes.read().
    """
    try:
        f = open(file, encoding='utf-8')
    except OSError:
        return None
    with f:
        db = MimeTypes()
        db.readfp(f, True)
        return db.types_map[True]


def _default_mime_types():
    """Install the built-in default tables into the module globals."""
    global suffix_map
    global encodings_map
    global types_map
    global common_types

    suffix_map = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
        }

    encodings_map = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.iana.org/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted!
    types_map = {
        '.a'      : 'application/octet-stream',
        '.ai'     : 'application/postscript',
        '.aif'    : 'audio/x-aiff',
        '.aifc'   : 'audio/x-aiff',
        '.aiff'   : 'audio/x-aiff',
        '.au'     : 'audio/basic',
        '.avi'    : 'video/x-msvideo',
        '.bat'    : 'text/plain',
        '.bcpio'  : 'application/x-bcpio',
        '.bin'    : 'application/octet-stream',
        '.bmp'    : 'image/bmp',
        '.c'      : 'text/plain',
        '.cdf'    : 'application/x-netcdf',
        '.cpio'   : 'application/x-cpio',
        '.csh'    : 'application/x-csh',
        '.css'    : 'text/css',
        '.csv'    : 'text/csv',
        '.dll'    : 'application/octet-stream',
        '.doc'    : 'application/msword',
        '.dot'    : 'application/msword',
        '.dvi'    : 'application/x-dvi',
        '.eml'    : 'message/rfc822',
        '.eps'    : 'application/postscript',
        '.etx'    : 'text/x-setext',
        '.exe'    : 'application/octet-stream',
        '.gif'    : 'image/gif',
        '.gtar'   : 'application/x-gtar',
        '.h'      : 'text/plain',
        '.hdf'    : 'application/x-hdf',
        '.htm'    : 'text/html',
        '.html'   : 'text/html',
        '.ico'    : 'image/vnd.microsoft.icon',
        '.ief'    : 'image/ief',
        '.jpe'    : 'image/jpeg',
        '.jpeg'   : 'image/jpeg',
        '.jpg'    : 'image/jpeg',
        '.js'     : 'application/javascript',
        '.json'   : 'application/json',
        '.ksh'    : 'text/plain',
        '.latex'  : 'application/x-latex',
        '.m1v'    : 'video/mpeg',
        '.m3u'    : 'application/vnd.apple.mpegurl',
        '.m3u8'   : 'application/vnd.apple.mpegurl',
        '.man'    : 'application/x-troff-man',
        '.me'     : 'application/x-troff-me',
        '.mht'    : 'message/rfc822',
        '.mhtml'  : 'message/rfc822',
        '.mif'    : 'application/x-mif',
        '.mjs'    : 'application/javascript',
        '.mov'    : 'video/quicktime',
        '.movie'  : 'video/x-sgi-movie',
        '.mp2'    : 'audio/mpeg',
        '.mp3'    : 'audio/mpeg',
        '.mp4'    : 'video/mp4',
        '.mpa'    : 'video/mpeg',
        '.mpe'    : 'video/mpeg',
        '.mpeg'   : 'video/mpeg',
        '.mpg'    : 'video/mpeg',
        '.ms'     : 'application/x-troff-ms',
        '.nc'     : 'application/x-netcdf',
        '.nws'    : 'message/rfc822',
        '.o'      : 'application/octet-stream',
        '.obj'    : 'application/octet-stream',
        '.oda'    : 'application/oda',
        '.p12'    : 'application/x-pkcs12',
        '.p7c'    : 'application/pkcs7-mime',
        '.pbm'    : 'image/x-portable-bitmap',
        '.pdf'    : 'application/pdf',
        '.pfx'    : 'application/x-pkcs12',
        '.pgm'    : 'image/x-portable-graymap',
        '.pl'     : 'text/plain',
        '.png'    : 'image/png',
        '.pnm'    : 'image/x-portable-anymap',
        '.pot'    : 'application/vnd.ms-powerpoint',
        '.ppa'    : 'application/vnd.ms-powerpoint',
        '.ppm'    : 'image/x-portable-pixmap',
        '.pps'    : 'application/vnd.ms-powerpoint',
        '.ppt'    : 'application/vnd.ms-powerpoint',
        '.ps'     : 'application/postscript',
        '.pwz'    : 'application/vnd.ms-powerpoint',
        '.py'     : 'text/x-python',
        '.pyc'    : 'application/x-python-code',
        '.pyo'    : 'application/x-python-code',
        '.qt'     : 'video/quicktime',
        '.ra'     : 'audio/x-pn-realaudio',
        '.ram'    : 'application/x-pn-realaudio',
        '.ras'    : 'image/x-cmu-raster',
        '.rdf'    : 'application/xml',
        '.rgb'    : 'image/x-rgb',
        '.roff'   : 'application/x-troff',
        '.rtx'    : 'text/richtext',
        '.sgm'    : 'text/x-sgml',
        '.sgml'   : 'text/x-sgml',
        '.sh'     : 'application/x-sh',
        '.shar'   : 'application/x-shar',
        '.snd'    : 'audio/basic',
        '.so'     : 'application/octet-stream',
        '.src'    : 'application/x-wais-source',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc' : 'application/x-sv4crc',
        '.svg'    : 'image/svg+xml',
        '.swf'    : 'application/x-shockwave-flash',
        '.t'      : 'application/x-troff',
        '.tar'    : 'application/x-tar',
        '.tcl'    : 'application/x-tcl',
        '.tex'    : 'application/x-tex',
        '.texi'   : 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.tif'    : 'image/tiff',
        '.tiff'   : 'image/tiff',
        '.tr'     : 'application/x-troff',
        '.tsv'    : 'text/tab-separated-values',
        '.txt'    : 'text/plain',
        '.ustar'  : 'application/x-ustar',
        '.vcf'    : 'text/x-vcard',
        '.wav'    : 'audio/x-wav',
        '.webm'   : 'video/webm',
        '.wiz'    : 'application/msword',
        '.wsdl'   : 'application/xml',
        '.xbm'    : 'image/x-xbitmap',
        '.xlb'    : 'application/vnd.ms-excel',
        '.xls'    : 'application/vnd.ms-excel',
        '.xml'    : 'text/xml',
        '.xpdl'   : 'application/xml',
        '.xpm'    : 'image/x-xpixmap',
        '.xsl'    : 'application/xml',
        '.xwd'    : 'image/x-xwindowdump',
        '.zip'    : 'application/zip',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = {
        '.jpg' : 'image/jpg',
        '.mid' : 'audio/midi',
        '.midi': 'audio/midi',
        '.pct' : 'image/pict',
        '.pic' : 'image/pict',
        '.pict': 'image/pict',
        '.rtf' : 'application/rtf',
        '.xul' : 'text/xul'
        }


_default_mime_types()


if __name__ == '__main__':
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        # Print the help text (plus an optional error message) and exit.
        print(USAGE)
        if msg: print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error as msg:
        usage(1, msg)

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1
    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if not guess: print("I don't know anything about type", gtype)
            else: print(guess)
        else:
            guess, encoding = guess_type(gtype, strict)
            if not guess: print("I don't know anything about type", gtype)
            else: print('type:', guess, 'encoding:', encoding)