From 8b0e871dc78c74445b415cfd2870a747913da695 Mon Sep 17 00:00:00 2001 From: s3lph Date: Fri, 20 Jul 2018 13:01:33 +0200 Subject: [PATCH] Only use libmagic for static resources if extension-based guessing fails. --- matemat/webserver/httpd.py | 20 +++++++++++---- matemat/webserver/test/test_serve.py | 37 ++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/matemat/webserver/httpd.py b/matemat/webserver/httpd.py index 795e0df..e2bde4f 100644 --- a/matemat/webserver/httpd.py +++ b/matemat/webserver/httpd.py @@ -4,6 +4,7 @@ from typing import Any, Callable, Dict, Tuple, Type, Union import logging import os import socket +import mimetypes import magic from socketserver import TCPServer from http.server import HTTPServer, BaseHTTPRequestHandler @@ -336,7 +337,11 @@ class HttpHandler(BaseHTTPRequestHandler): except ValueError: mimetype = 'application/octet-stream' charset = 'binary' - headers['Content-Type'] = f'{mimetype}; charset={charset}' + # Only append the charset if it is not "binary" + if charset == 'binary': + headers['Content-Type'] = mimetype + else: + headers['Content-Type'] = f'{mimetype}; charset={charset}' # Send all headers set by the pagelet for name, value in headers.items(): self.send_header(name, value) @@ -374,16 +379,21 @@ class HttpHandler(BaseHTTPRequestHandler): data = f.read() # File read successfully, send 'OK' header self.send_response(200) - # Guess the MIME type and encoding using libmagic + # Guess the MIME type by file extension, or use libmagic as fallback + # Use libmagic to guess the charset try: + exttype: str = mimetypes.guess_type(filepath)[0] filemagic: magic.FileMagic = magic.detect_from_filename(filepath) - mimetype: str = filemagic.mime_type + mimetype: str = exttype if exttype is not None else filemagic.mime_type charset: str = filemagic.encoding except ValueError: mimetype = 'application/octet-stream' charset = 'binary' - # Send content type and length header - 
self.send_header('Content-Type', f'{mimetype}; charset={charset}') + # Send content type and length header. Only set the charset if it's not "binary" + if charset == 'binary': + self.send_header('Content-Type', mimetype) + else: + self.send_header('Content-Type', f'{mimetype}; charset={charset}') self.send_header('Content-Length', str(len(data))) self.send_header('Last-Modified', fileage.strftime('%a, %d %b %Y %H:%M:%S GMT')) self.send_header('Cache-Control', 'max-age=1') diff --git a/matemat/webserver/test/test_serve.py b/matemat/webserver/test/test_serve.py index 2dd7e7e..b7ec850 100644 --- a/matemat/webserver/test/test_serve.py +++ b/matemat/webserver/test/test_serve.py @@ -78,6 +78,15 @@ class TestServe(AbstractHttpdTest): with open(forbidden, 'w') as f: f.write('This should not be readable') os.chmod(forbidden, 0) + # Create a CSS resource whose MIME type should be detected by file extension + with open(os.path.join(self.tempdir.name, 'teststyle.css'), 'w') as f: + f.write('.ninja { display: none; }\n') + # Create a file without extension (containing UTF-16 text with BOM); libmagic should take over + with open(os.path.join(self.tempdir.name, 'testdata'), 'wb') as f: + f.write(b'\xFE\xFFH\x00e\x00l\x00l\x00o\x00,\x00 \x00w\x00o\x00r\x00l\x00d\x00!\x00\n\x00') + # Create a file that will yield "text/plain; charset=binary" + with open(os.path.join(self.tempdir.name, 'testbin.txt'), 'wb') as f: + f.write(b'\x00\x00\x00\x00\x00\x00\x00\x00') def test_serve_pagelet_str(self): # Call the test pagelet that produces a 200 OK result @@ -226,16 +235,40 @@ class TestServe(AbstractHttpdTest): # Make sure a 405 Method Not Allowed header is served self.assertEqual(405, packet.statuscode) - def test_serve_static_libmagic(self): + def test_serve_pagelet_libmagic(self): # The correct Content-Type header must be guessed, if a pagelet does not provide one self.client_sock.set_request(b'GET /just/testing/serve_pagelet_str HTTP/1.1\r\n\r\n') HttpHandler(self.client_sock, ('::1', 
45678), self.server) packet = self.client_sock.get_response() self.assertEqual('text/plain; charset=us-ascii', packet.headers['Content-Type']) - def test_serve_static_libmagic_skipped(self): + def test_serve_pagelet_libmagic_skipped(self): # The Content-Type set by a pagelet should not be overwritten self.client_sock.set_request(b'GET /just/testing/serve_pagelet_bytes HTTP/1.1\r\n\r\n') HttpHandler(self.client_sock, ('::1', 45678), self.server) packet = self.client_sock.get_response() self.assertEqual('application/x-foo-bar', packet.headers['Content-Type']) + + def test_serve_static_mime_extension(self): + # The correct Content-Type should be guessed by file extension primarily + self.client_sock.set_request(b'GET /teststyle.css HTTP/1.1\r\n\r\n') + HttpHandler(self.client_sock, ('::1', 45678), self.server) + packet = self.client_sock.get_response() + # libmagic would say text/plain instead + self.assertEqual('text/css; charset=us-ascii', packet.headers['Content-Type']) + + def test_serve_static_mime_magic(self): + # If the file has no extension, libmagic should be used as fallback to guess the Content-Type + self.client_sock.set_request(b'GET /testdata HTTP/1.1\r\n\r\n') + HttpHandler(self.client_sock, ('::1', 45678), self.server) + packet = self.client_sock.get_response() + # Extension-based would fail, as there is no extension + self.assertEqual('text/plain; charset=utf-16be', packet.headers['Content-Type']) + + def test_serve_static_mime_magic_binary(self): + # The charset should be omitted from the Content-Type if libmagic reports it as "binary" + self.client_sock.set_request(b'GET /testbin.txt HTTP/1.1\r\n\r\n') + HttpHandler(self.client_sock, ('::1', 45678), self.server) + packet = self.client_sock.get_response() + # No charset should be in the header. Yes, this is a stupid example + self.assertEqual('text/plain', packet.headers['Content-Type'])