Merge branch '18-libmagic-mime-type-detection' into staging-unstable

This commit is contained in:
s3lph 2018-07-20 13:04:57 +02:00
commit 63aef74e10
2 changed files with 63 additions and 9 deletions

View file

@ -4,6 +4,7 @@ from typing import Any, Callable, Dict, Tuple, Type, Union
import logging import logging
import os import os
import socket import socket
import mimetypes
import magic import magic
from socketserver import TCPServer from socketserver import TCPServer
from http.server import HTTPServer, BaseHTTPRequestHandler from http.server import HTTPServer, BaseHTTPRequestHandler
@ -336,7 +337,11 @@ class HttpHandler(BaseHTTPRequestHandler):
except ValueError: except ValueError:
mimetype = 'application/octet-stream' mimetype = 'application/octet-stream'
charset = 'binary' charset = 'binary'
headers['Content-Type'] = f'{mimetype}; charset={charset}' # Only append the charset if it is not "binary"
if charset == 'binary':
headers['Content-Type'] = mimetype
else:
headers['Content-Type'] = f'{mimetype}; charset={charset}'
# Send all headers set by the pagelet # Send all headers set by the pagelet
for name, value in headers.items(): for name, value in headers.items():
self.send_header(name, value) self.send_header(name, value)
@ -374,16 +379,21 @@ class HttpHandler(BaseHTTPRequestHandler):
data = f.read() data = f.read()
# File read successfully, send 'OK' header # File read successfully, send 'OK' header
self.send_response(200) self.send_response(200)
# Guess the MIME type and encoding using libmagic # Guess the MIME type by file extension, or use libmagic as fallback
# Use libmagic to guess the charset
try: try:
exttype: str = mimetypes.guess_type(filepath)[0]
filemagic: magic.FileMagic = magic.detect_from_filename(filepath) filemagic: magic.FileMagic = magic.detect_from_filename(filepath)
mimetype: str = filemagic.mime_type mimetype: str = exttype if exttype is not None else filemagic.mime_type
charset: str = filemagic.encoding charset: str = filemagic.encoding
except ValueError: except ValueError:
mimetype = 'application/octet-stream' mimetype = 'application/octet-stream'
charset = 'binary' charset = 'binary'
# Send content type and length header # Send content type and length header. Only set the charset if it's not "binary"
self.send_header('Content-Type', f'{mimetype}; charset={charset}') if charset == 'binary':
self.send_header('Content-Type', mimetype)
else:
self.send_header('Content-Type', f'{mimetype}; charset={charset}')
self.send_header('Content-Length', str(len(data))) self.send_header('Content-Length', str(len(data)))
self.send_header('Last-Modified', fileage.strftime('%a, %d %b %Y %H:%M:%S GMT')) self.send_header('Last-Modified', fileage.strftime('%a, %d %b %Y %H:%M:%S GMT'))
self.send_header('Cache-Control', 'max-age=1') self.send_header('Cache-Control', 'max-age=1')

View file

@ -17,7 +17,6 @@ def serve_test_pagelet_str(method: str,
session_vars: Dict[str, Any], session_vars: Dict[str, Any],
headers: Dict[str, str], headers: Dict[str, str],
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]: pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
headers['Content-Type'] = 'text/plain'
return 'serve test pagelet str' return 'serve test pagelet str'
@ -28,7 +27,7 @@ def serve_test_pagelet_bytes(method: str,
session_vars: Dict[str, Any], session_vars: Dict[str, Any],
headers: Dict[str, str], headers: Dict[str, str],
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]: pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
headers['Content-Type'] = 'application/octet-stream' headers['Content-Type'] = 'application/x-foo-bar'
return b'serve\x80test\xffpagelet\xfebytes' return b'serve\x80test\xffpagelet\xfebytes'
@ -49,7 +48,6 @@ def serve_test_pagelet_template(method: str,
session_vars: Dict[str, Any], session_vars: Dict[str, Any],
headers: Dict[str, str], headers: Dict[str, str],
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]: pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
headers['Content-Type'] = 'text/plain'
return TemplateResponse('test.txt', what='World') return TemplateResponse('test.txt', what='World')
@ -62,7 +60,6 @@ def serve_test_pagelet_fail(method: str,
headers: Dict[str, str], headers: Dict[str, str],
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]: pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
session_vars['test'] = 'hello, world!' session_vars['test'] = 'hello, world!'
headers['Content-Type'] = 'text/plain'
raise HttpException(599, 'Error expected during unit testing') raise HttpException(599, 'Error expected during unit testing')
@ -81,6 +78,15 @@ class TestServe(AbstractHttpdTest):
with open(forbidden, 'w') as f: with open(forbidden, 'w') as f:
f.write('This should not be readable') f.write('This should not be readable')
os.chmod(forbidden, 0) os.chmod(forbidden, 0)
# Create a CSS resource whose MIME type should be detected by file extension
with open(os.path.join(self.tempdir.name, 'teststyle.css'), 'w') as f:
f.write('.ninja { display: none; }\n')
# Create a file without extension (containing UTF-16 text with BOM); libmagic should take over
with open(os.path.join(self.tempdir.name, 'testdata'), 'wb') as f:
f.write(b'\xFE\xFFH\x00e\x00l\x00l\x00o\x00,\x00 \x00w\x00o\x00r\x00l\x00d\x00!\x00\n\x00')
# Create a file that will yield "text/plain; charset=binary"
with open(os.path.join(self.tempdir.name, 'testbin.txt'), 'wb') as f:
f.write(b'\x00\x00\x00\x00\x00\x00\x00\x00')
def test_serve_pagelet_str(self): def test_serve_pagelet_str(self):
# Call the test pagelet that produces a 200 OK result # Call the test pagelet that produces a 200 OK result
@ -228,3 +234,41 @@ class TestServe(AbstractHttpdTest):
self.assertIsNone(packet.pagelet) self.assertIsNone(packet.pagelet)
# Make sure a 405 Method Not Allowed header is served # Make sure a 405 Method Not Allowed header is served
self.assertEqual(405, packet.statuscode) self.assertEqual(405, packet.statuscode)
def test_serve_pagelet_libmagic(self):
# The correct Content-Type header must be guessed, if a pagelet does not provide one
self.client_sock.set_request(b'GET /just/testing/serve_pagelet_str HTTP/1.1\r\n\r\n')
HttpHandler(self.client_sock, ('::1', 45678), self.server)
packet = self.client_sock.get_response()
self.assertEqual('text/plain; charset=us-ascii', packet.headers['Content-Type'])
def test_serve_pagelet_libmagic_skipped(self):
# The Content-Type set by a pagelet should not be overwritten
self.client_sock.set_request(b'GET /just/testing/serve_pagelet_bytes HTTP/1.1\r\n\r\n')
HttpHandler(self.client_sock, ('::1', 45678), self.server)
packet = self.client_sock.get_response()
self.assertEqual('application/x-foo-bar', packet.headers['Content-Type'])
def test_serve_static_mime_extension(self):
# The correct Content-Type should be guessed by file extension primarily
self.client_sock.set_request(b'GET /teststyle.css HTTP/1.1\r\n\r\n')
HttpHandler(self.client_sock, ('::1', 45678), self.server)
packet = self.client_sock.get_response()
# libmagic would say text/plain instead
self.assertEqual('text/css; charset=us-ascii', packet.headers['Content-Type'])
def test_serve_static_mime_magic(self):
# The correct Content-Type should be guessed by file extension primarily
self.client_sock.set_request(b'GET /testdata HTTP/1.1\r\n\r\n')
HttpHandler(self.client_sock, ('::1', 45678), self.server)
packet = self.client_sock.get_response()
# Extension-based would fail, as there is no extension
self.assertEqual('text/plain; charset=utf-16be', packet.headers['Content-Type'])
def test_serve_static_mime_magic_binary(self):
# The correct Content-Type should be guessed by file extension primarily
self.client_sock.set_request(b'GET /testbin.txt HTTP/1.1\r\n\r\n')
HttpHandler(self.client_sock, ('::1', 45678), self.server)
packet = self.client_sock.get_response()
# No charset should be in the header. Yes, this is a stupid example
self.assertEqual('text/plain', packet.headers['Content-Type'])