From 5912d1e6241c59bfcf19828a4e65439aae9f963c Mon Sep 17 00:00:00 2001 From: s3lph Date: Fri, 20 Jul 2018 00:32:51 +0200 Subject: [PATCH 1/7] Use a libmagic wrapper to guess Content-Type headers. --- README.md | 1 + matemat/webserver/httpd.py | 18 ++++++++++-------- matemat/webserver/test/test_serve.py | 19 +++++++++++++++---- requirements.txt | 1 + 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 95aa201..49816f3 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ This project intends to provide a well-tested and maintainable alternative to - Python 3 (>=3.6) - Python dependencies: + - file-magic - jinja2 ## Usage diff --git a/matemat/webserver/httpd.py b/matemat/webserver/httpd.py index e962a8c..23045bc 100644 --- a/matemat/webserver/httpd.py +++ b/matemat/webserver/httpd.py @@ -4,7 +4,7 @@ from typing import Any, Callable, Dict, Tuple, Type, Union import logging import os import socket -import mimetypes +import magic from socketserver import TCPServer from http.server import HTTPServer, BaseHTTPRequestHandler from http.cookies import SimpleCookie @@ -308,7 +308,6 @@ class HttpHandler(BaseHTTPRequestHandler): if path in _PAGELET_PATHS: # Prepare some headers. Those can still be overwritten by the pagelet headers: Dict[str, str] = { - 'Content-Type': 'text/html', 'Cache-Control': 'no-cache' } # Call the pagelet function @@ -328,6 +327,10 @@ class HttpHandler(BaseHTTPRequestHandler): f'matemat_session_id={session_id}; expires={expires}') # Compute the body length and add the appropriate header headers['Content-Length'] = str(len(data)) + # If the pagelet did not set its own Content-Type header, use libmagic to guess an appropriate one + if 'Content-Type' not in headers: + filemagic: magic.FileMagic = magic.detect_from_content(data) + headers['Content-Type'] = f'{filemagic.mime_type}; charset={filemagic.encoding}' # Send all headers set by the pagelet for name, value in headers.items(): self.send_header(name, value) @@ -365,13 +368,12 @@ class HttpHandler(BaseHTTPRequestHandler): data = f.read() # File read successfully, send 'OK' header self.send_response(200) - # TODO: Guess the MIME type. Unfortunately this call solely relies on the file extension, not ideal? - mimetype, _ = mimetypes.guess_type(filepath) - # Fall back to octet-stream type, if unknown - if mimetype is None: - mimetype = 'application/octet-stream' + # Guess the MIME type and encoding using libmagic + filemagic: magic.FileMagic = magic.detect_from_filename(filepath) + mimetype: str = filemagic.mime_type + charset: str = filemagic.encoding # Send content type and length header - self.send_header('Content-Type', mimetype) + self.send_header('Content-Type', f'{mimetype}; charset={charset}') self.send_header('Content-Length', str(len(data))) self.send_header('Last-Modified', fileage.strftime('%a, %d %b %Y %H:%M:%S GMT')) self.send_header('Cache-Control', 'max-age=1') diff --git a/matemat/webserver/test/test_serve.py b/matemat/webserver/test/test_serve.py index c67e06e..2dd7e7e 100644 --- a/matemat/webserver/test/test_serve.py +++ b/matemat/webserver/test/test_serve.py @@ -17,7 +17,6 @@ def serve_test_pagelet_str(method: str, session_vars: Dict[str, Any], headers: Dict[str, str], pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]: - headers['Content-Type'] = 'text/plain' return 'serve test pagelet str' @@ -28,7 +27,7 @@ def serve_test_pagelet_bytes(method: str, session_vars: Dict[str, Any], headers: Dict[str, str], pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]: - headers['Content-Type'] = 'application/octet-stream' + headers['Content-Type'] = 'application/x-foo-bar' return b'serve\x80test\xffpagelet\xfebytes' @@ -49,7 +48,6 @@ def serve_test_pagelet_template(method: str, session_vars: Dict[str, Any], headers: Dict[str, str], pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]: - headers['Content-Type'] = 'text/plain' return TemplateResponse('test.txt', what='World') @@ -62,7 +60,6 @@ def serve_test_pagelet_fail(method: str, headers: Dict[str, str], pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]: session_vars['test'] = 'hello, world!' - headers['Content-Type'] = 'text/plain' raise HttpException(599, 'Error expected during unit testing') @@ -228,3 +225,17 @@ class TestServe(AbstractHttpdTest): self.assertIsNone(packet.pagelet) # Make sure a 405 Method Not Allowed header is served self.assertEqual(405, packet.statuscode) + + def test_serve_static_libmagic(self): + # The correct Content-Type header must be guessed, if a pagelet does not provide one + self.client_sock.set_request(b'GET /just/testing/serve_pagelet_str HTTP/1.1\r\n\r\n') + HttpHandler(self.client_sock, ('::1', 45678), self.server) + packet = self.client_sock.get_response() + self.assertEqual('text/plain; charset=us-ascii', packet.headers['Content-Type']) + + def test_serve_static_libmagic_skipped(self): + # The Content-Type set by a pagelet should not be overwritten + self.client_sock.set_request(b'GET /just/testing/serve_pagelet_bytes HTTP/1.1\r\n\r\n') + HttpHandler(self.client_sock, ('::1', 45678), self.server) + packet = self.client_sock.get_response() + self.assertEqual('application/x-foo-bar', packet.headers['Content-Type']) diff --git a/requirements.txt b/requirements.txt index 7f7afbf..4913234 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ +file-magic jinja2 From e2f2ab7727b825117f2858a0e9197da5c3d9f624 Mon Sep 17 00:00:00 2001 From: s3lph Date: Fri, 20 Jul 2018 00:45:15 +0200 Subject: [PATCH 2/7] Added libmagic to testing and staging Dockerfiles. --- .gitlab-ci.yml | 2 +- Dockerfile | 1 + testing/Dockerfile | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 72bd21c..5bfad1b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,5 +1,5 @@ --- -image: s3lph/matemat-ci:20180711-02 +image: s3lph/matemat-ci:20180720-01 stages: - test diff --git a/Dockerfile b/Dockerfile index 83514a4..d084dfb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM python:3.6-alpine RUN mkdir -p /var/matemat/db /var/matemat/upload +RUN apk add libmagic=5.33-r0 ADD . / RUN pip3 install -r /requirements.txt diff --git a/testing/Dockerfile b/testing/Dockerfile index c51e01b..94644e6 100644 --- a/testing/Dockerfile +++ b/testing/Dockerfile @@ -5,7 +5,7 @@ RUN useradd -d /home/matemat -m matemat RUN mkdir -p /var/matemat/db && chown matemat:matemat -R /var/matemat/db RUN mkdir -p /var/matemat/upload && chown matemat:matemat -R /var/matemat/upload RUN apt-get update -qy -RUN apt-get install -y --no-install-recommends sudo openssh-client git docker.io python3-dev python3-pip python3-coverage python3-setuptools build-essential +RUN apt-get install -y --no-install-recommends file sudo openssh-client git docker.io python3-dev python3-pip python3-coverage python3-setuptools build-essential RUN pip3 install wheel pycodestyle mypy WORKDIR /home/matemat From 8598daf3b08752066096b3db4dab20c9a98722d1 Mon Sep 17 00:00:00 2001 From: s3lph Date: Fri, 20 Jul 2018 00:49:29 +0200 Subject: [PATCH 3/7] Catch potential file-magic exception. --- matemat/webserver/httpd.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/matemat/webserver/httpd.py b/matemat/webserver/httpd.py index 23045bc..795e0df 100644 --- a/matemat/webserver/httpd.py +++ b/matemat/webserver/httpd.py @@ -329,8 +329,14 @@ class HttpHandler(BaseHTTPRequestHandler): headers['Content-Length'] = str(len(data)) # If the pagelet did not set its own Content-Type header, use libmagic to guess an appropriate one if 'Content-Type' not in headers: - filemagic: magic.FileMagic = magic.detect_from_content(data) - headers['Content-Type'] = f'{filemagic.mime_type}; charset={filemagic.encoding}' + try: + filemagic: magic.FileMagic = magic.detect_from_content(data) + mimetype: str = filemagic.mime_type + charset: str = filemagic.encoding + except ValueError: + mimetype = 'application/octet-stream' + charset = 'binary' + headers['Content-Type'] = f'{mimetype}; charset={charset}' # Send all headers set by the pagelet for name, value in headers.items(): self.send_header(name, value) @@ -369,9 +375,13 @@ class HttpHandler(BaseHTTPRequestHandler): # File read successfully, send 'OK' header self.send_response(200) # Guess the MIME type and encoding using libmagic - filemagic: magic.FileMagic = magic.detect_from_filename(filepath) - mimetype: str = filemagic.mime_type - charset: str = filemagic.encoding + try: + filemagic: magic.FileMagic = magic.detect_from_filename(filepath) + mimetype: str = filemagic.mime_type + charset: str = filemagic.encoding + except ValueError: + mimetype = 'application/octet-stream' + charset = 'binary' # Send content type and length header self.send_header('Content-Type', f'{mimetype}; charset={charset}') self.send_header('Content-Length', str(len(data))) From e6d5aef2b223bce1d4b16d92f251a93d5c8c4f79 Mon Sep 17 00:00:00 2001 From: s3lph Date: Fri, 20 Jul 2018 00:57:30 +0200 Subject: [PATCH 4/7] Removed libmagic dependency from staging Dockerfile; already present in the python:3.6-alpine base image. --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index d084dfb..83514a4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,6 @@ FROM python:3.6-alpine RUN mkdir -p /var/matemat/db /var/matemat/upload -RUN apk add libmagic=5.33-r0 ADD . / RUN pip3 install -r /requirements.txt From eb4504a7eabb504ce58cfa2df9df30a0fee6b227 Mon Sep 17 00:00:00 2001 From: s3lph Date: Fri, 20 Jul 2018 11:09:43 +0200 Subject: [PATCH 5/7] Added missing libmagic dependency to alpine-based image. --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 83514a4..044faee 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM python:3.6-alpine RUN mkdir -p /var/matemat/db /var/matemat/upload +RUN apk --update add libmagic ADD . / RUN pip3 install -r /requirements.txt From fd7ff591377b1829299b85848848600933828b6f Mon Sep 17 00:00:00 2001 From: s3lph Date: Fri, 20 Jul 2018 11:38:16 +0200 Subject: [PATCH 6/7] Wrapped command for running matemat in the staging container in a shell script --- Dockerfile | 2 +- run.sh | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100755 run.sh diff --git a/Dockerfile b/Dockerfile index 044faee..c427a2e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,4 +7,4 @@ ADD . / RUN pip3 install -r /requirements.txt EXPOSE 80/tcp -CMD [ "/usr/local/bin/python3", "-m", "matemat", "/etc/matemat.conf", "/matemat.docker.conf" ] +CMD [ "/run.sh" ] diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..ca8c643 --- /dev/null +++ b/run.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +export LD_PRELOAD=/usr/lib/libmagic.so.1 + +/usr/local/bin/python3 -m matemat /etc/matemat.conf /matemat.docker.conf From 8b0e871dc78c74445b415cfd2870a747913da695 Mon Sep 17 00:00:00 2001 From: s3lph Date: Fri, 20 Jul 2018 13:01:33 +0200 Subject: [PATCH 7/7] Only use libmagic for static resources if extension-based guessing fails. --- matemat/webserver/httpd.py | 20 +++++++++++---- matemat/webserver/test/test_serve.py | 37 ++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/matemat/webserver/httpd.py b/matemat/webserver/httpd.py index 795e0df..e2bde4f 100644 --- a/matemat/webserver/httpd.py +++ b/matemat/webserver/httpd.py @@ -4,6 +4,7 @@ from typing import Any, Callable, Dict, Tuple, Type, Union import logging import os import socket +import mimetypes import magic from socketserver import TCPServer from http.server import HTTPServer, BaseHTTPRequestHandler @@ -336,7 +337,11 @@ class HttpHandler(BaseHTTPRequestHandler): except ValueError: mimetype = 'application/octet-stream' charset = 'binary' - headers['Content-Type'] = f'{mimetype}; charset={charset}' + # Only append the charset if it is not "binary" + if charset == 'binary': + headers['Content-Type'] = mimetype + else: + headers['Content-Type'] = f'{mimetype}; charset={charset}' # Send all headers set by the pagelet for name, value in headers.items(): self.send_header(name, value) @@ -374,16 +379,21 @@ class HttpHandler(BaseHTTPRequestHandler): data = f.read() # File read successfully, send 'OK' header self.send_response(200) - # Guess the MIME type and encoding using libmagic + # Guess the MIME type by file extension, or use libmagic as fallback + # Use libmagic to guess the charset try: + exttype: str = mimetypes.guess_type(filepath)[0] filemagic: magic.FileMagic = magic.detect_from_filename(filepath) - mimetype: str = filemagic.mime_type + mimetype: str = exttype if exttype is not None else filemagic.mime_type charset: str = filemagic.encoding except ValueError: mimetype = 'application/octet-stream' charset = 'binary' - # Send content type and length header - self.send_header('Content-Type', f'{mimetype}; charset={charset}') + # Send content type and length header. Only set the charset if it's not "binary" + if charset == 'binary': + self.send_header('Content-Type', mimetype) + else: + self.send_header('Content-Type', f'{mimetype}; charset={charset}') self.send_header('Content-Length', str(len(data))) self.send_header('Last-Modified', fileage.strftime('%a, %d %b %Y %H:%M:%S GMT')) self.send_header('Cache-Control', 'max-age=1') diff --git a/matemat/webserver/test/test_serve.py b/matemat/webserver/test/test_serve.py index 2dd7e7e..b7ec850 100644 --- a/matemat/webserver/test/test_serve.py +++ b/matemat/webserver/test/test_serve.py @@ -78,6 +78,15 @@ class TestServe(AbstractHttpdTest): with open(forbidden, 'w') as f: f.write('This should not be readable') os.chmod(forbidden, 0) + # Create a CSS resource whose MIME type should be detected by file extension + with open(os.path.join(self.tempdir.name, 'teststyle.css'), 'w') as f: + f.write('.ninja { display: none; }\n') + # Create a file without extension (containing UTF-16 text with BOM); libmagic should take over + with open(os.path.join(self.tempdir.name, 'testdata'), 'wb') as f: + f.write(b'\xFE\xFFH\x00e\x00l\x00l\x00o\x00,\x00 \x00w\x00o\x00r\x00l\x00d\x00!\x00\n\x00') + # Create a file that will yield "text/plain; charset=binary" + with open(os.path.join(self.tempdir.name, 'testbin.txt'), 'wb') as f: + f.write(b'\x00\x00\x00\x00\x00\x00\x00\x00') def test_serve_pagelet_str(self): # Call the test pagelet that produces a 200 OK result @@ -226,16 +235,40 @@ class TestServe(AbstractHttpdTest): # Make sure a 405 Method Not Allowed header is served self.assertEqual(405, packet.statuscode) - def test_serve_static_libmagic(self): + def test_serve_pagelet_libmagic(self): # The correct Content-Type header must be guessed, if a pagelet does not provide one self.client_sock.set_request(b'GET /just/testing/serve_pagelet_str HTTP/1.1\r\n\r\n') HttpHandler(self.client_sock, ('::1', 45678), self.server) packet = self.client_sock.get_response() self.assertEqual('text/plain; charset=us-ascii', packet.headers['Content-Type']) - def test_serve_static_libmagic_skipped(self): + def test_serve_pagelet_libmagic_skipped(self): # The Content-Type set by a pagelet should not be overwritten self.client_sock.set_request(b'GET /just/testing/serve_pagelet_bytes HTTP/1.1\r\n\r\n') HttpHandler(self.client_sock, ('::1', 45678), self.server) packet = self.client_sock.get_response() self.assertEqual('application/x-foo-bar', packet.headers['Content-Type']) + + def test_serve_static_mime_extension(self): + # The correct Content-Type should be guessed by file extension primarily + self.client_sock.set_request(b'GET /teststyle.css HTTP/1.1\r\n\r\n') + HttpHandler(self.client_sock, ('::1', 45678), self.server) + packet = self.client_sock.get_response() + # libmagic would say text/plain instead + self.assertEqual('text/css; charset=us-ascii', packet.headers['Content-Type']) + + def test_serve_static_mime_magic(self): + # The correct Content-Type should be guessed by file extension primarily + self.client_sock.set_request(b'GET /testdata HTTP/1.1\r\n\r\n') + HttpHandler(self.client_sock, ('::1', 45678), self.server) + packet = self.client_sock.get_response() + # Extension-based would fail, as there is no extension + self.assertEqual('text/plain; charset=utf-16be', packet.headers['Content-Type']) + + def test_serve_static_mime_magic_binary(self): + # The correct Content-Type should be guessed by file extension primarily + self.client_sock.set_request(b'GET /testbin.txt HTTP/1.1\r\n\r\n') + HttpHandler(self.client_sock, ('::1', 45678), self.server) + packet = self.client_sock.get_response() + # No charset should be in the header. Yes, this is a stupid example + self.assertEqual('text/plain', packet.headers['Content-Type'])