Merge branch '18-libmagic-mime-type-detection' into 'master'

Resolve "Use a libmagic wrapper for MIME type detection"

Closes #18

See merge request s3lph/matemat!19
This commit is contained in:
s3lph 2018-07-20 11:39:31 +00:00
commit 7f4dc078ec
8 changed files with 88 additions and 14 deletions

View file

@ -1,5 +1,5 @@
--- ---
image: s3lph/matemat-ci:20180711-02 image: s3lph/matemat-ci:20180720-01
stages: stages:
- test - test

View file

@ -2,8 +2,9 @@
FROM python:3.6-alpine FROM python:3.6-alpine
RUN mkdir -p /var/matemat/db /var/matemat/upload RUN mkdir -p /var/matemat/db /var/matemat/upload
RUN apk --update add libmagic
ADD . / ADD . /
RUN pip3 install -r /requirements.txt RUN pip3 install -r /requirements.txt
EXPOSE 80/tcp EXPOSE 80/tcp
CMD [ "/usr/local/bin/python3", "-m", "matemat", "/etc/matemat.conf", "/matemat.docker.conf" ] CMD [ "/run.sh" ]

View file

@ -18,6 +18,7 @@ This project intends to provide a well-tested and maintainable alternative to
- Python 3 (>=3.6) - Python 3 (>=3.6)
- Python dependencies: - Python dependencies:
- file-magic
- jinja2 - jinja2
## Usage ## Usage

View file

@ -5,6 +5,7 @@ import logging
import os import os
import socket import socket
import mimetypes import mimetypes
import magic
from socketserver import TCPServer from socketserver import TCPServer
from http.server import HTTPServer, BaseHTTPRequestHandler from http.server import HTTPServer, BaseHTTPRequestHandler
from http.cookies import SimpleCookie from http.cookies import SimpleCookie
@ -308,7 +309,6 @@ class HttpHandler(BaseHTTPRequestHandler):
if path in _PAGELET_PATHS: if path in _PAGELET_PATHS:
# Prepare some headers. Those can still be overwritten by the pagelet # Prepare some headers. Those can still be overwritten by the pagelet
headers: Dict[str, str] = { headers: Dict[str, str] = {
'Content-Type': 'text/html',
'Cache-Control': 'no-cache' 'Cache-Control': 'no-cache'
} }
# Call the pagelet function # Call the pagelet function
@ -328,6 +328,20 @@ class HttpHandler(BaseHTTPRequestHandler):
f'matemat_session_id={session_id}; expires={expires}') f'matemat_session_id={session_id}; expires={expires}')
# Compute the body length and add the appropriate header # Compute the body length and add the appropriate header
headers['Content-Length'] = str(len(data)) headers['Content-Length'] = str(len(data))
# If the pagelet did not set its own Content-Type header, use libmagic to guess an appropriate one
if 'Content-Type' not in headers:
try:
filemagic: magic.FileMagic = magic.detect_from_content(data)
mimetype: str = filemagic.mime_type
charset: str = filemagic.encoding
except ValueError:
mimetype = 'application/octet-stream'
charset = 'binary'
# Only append the charset if it is not "binary"
if charset == 'binary':
headers['Content-Type'] = mimetype
else:
headers['Content-Type'] = f'{mimetype}; charset={charset}'
# Send all headers set by the pagelet # Send all headers set by the pagelet
for name, value in headers.items(): for name, value in headers.items():
self.send_header(name, value) self.send_header(name, value)
@ -365,13 +379,21 @@ class HttpHandler(BaseHTTPRequestHandler):
data = f.read() data = f.read()
# File read successfully, send 'OK' header # File read successfully, send 'OK' header
self.send_response(200) self.send_response(200)
# TODO: Guess the MIME type. Unfortunately this call solely relies on the file extension, not ideal? # Guess the MIME type by file extension, or use libmagic as fallback
mimetype, _ = mimetypes.guess_type(filepath) # Use libmagic to guess the charset
# Fall back to octet-stream type, if unknown try:
if mimetype is None: exttype: str = mimetypes.guess_type(filepath)[0]
filemagic: magic.FileMagic = magic.detect_from_filename(filepath)
mimetype: str = exttype if exttype is not None else filemagic.mime_type
charset: str = filemagic.encoding
except ValueError:
mimetype = 'application/octet-stream' mimetype = 'application/octet-stream'
# Send content type and length header charset = 'binary'
# Send content type and length header. Only set the charset if it's not "binary"
if charset == 'binary':
self.send_header('Content-Type', mimetype) self.send_header('Content-Type', mimetype)
else:
self.send_header('Content-Type', f'{mimetype}; charset={charset}')
self.send_header('Content-Length', str(len(data))) self.send_header('Content-Length', str(len(data)))
self.send_header('Last-Modified', fileage.strftime('%a, %d %b %Y %H:%M:%S GMT')) self.send_header('Last-Modified', fileage.strftime('%a, %d %b %Y %H:%M:%S GMT'))
self.send_header('Cache-Control', 'max-age=1') self.send_header('Cache-Control', 'max-age=1')

View file

@ -17,7 +17,6 @@ def serve_test_pagelet_str(method: str,
session_vars: Dict[str, Any], session_vars: Dict[str, Any],
headers: Dict[str, str], headers: Dict[str, str],
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]: pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
headers['Content-Type'] = 'text/plain'
return 'serve test pagelet str' return 'serve test pagelet str'
@ -28,7 +27,7 @@ def serve_test_pagelet_bytes(method: str,
session_vars: Dict[str, Any], session_vars: Dict[str, Any],
headers: Dict[str, str], headers: Dict[str, str],
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]: pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
headers['Content-Type'] = 'application/octet-stream' headers['Content-Type'] = 'application/x-foo-bar'
return b'serve\x80test\xffpagelet\xfebytes' return b'serve\x80test\xffpagelet\xfebytes'
@ -49,7 +48,6 @@ def serve_test_pagelet_template(method: str,
session_vars: Dict[str, Any], session_vars: Dict[str, Any],
headers: Dict[str, str], headers: Dict[str, str],
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]: pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
headers['Content-Type'] = 'text/plain'
return TemplateResponse('test.txt', what='World') return TemplateResponse('test.txt', what='World')
@ -62,7 +60,6 @@ def serve_test_pagelet_fail(method: str,
headers: Dict[str, str], headers: Dict[str, str],
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]: pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
session_vars['test'] = 'hello, world!' session_vars['test'] = 'hello, world!'
headers['Content-Type'] = 'text/plain'
raise HttpException(599, 'Error expected during unit testing') raise HttpException(599, 'Error expected during unit testing')
@ -81,6 +78,15 @@ class TestServe(AbstractHttpdTest):
with open(forbidden, 'w') as f: with open(forbidden, 'w') as f:
f.write('This should not be readable') f.write('This should not be readable')
os.chmod(forbidden, 0) os.chmod(forbidden, 0)
# Create a CSS resource whose MIME type should be detected by file extension
with open(os.path.join(self.tempdir.name, 'teststyle.css'), 'w') as f:
f.write('.ninja { display: none; }\n')
# Create a file without extension (containing UTF-16 text with BOM); libmagic should take over
with open(os.path.join(self.tempdir.name, 'testdata'), 'wb') as f:
f.write(b'\xFE\xFFH\x00e\x00l\x00l\x00o\x00,\x00 \x00w\x00o\x00r\x00l\x00d\x00!\x00\n\x00')
# Create a file that will yield "text/plain; charset=binary"
with open(os.path.join(self.tempdir.name, 'testbin.txt'), 'wb') as f:
f.write(b'\x00\x00\x00\x00\x00\x00\x00\x00')
def test_serve_pagelet_str(self): def test_serve_pagelet_str(self):
# Call the test pagelet that produces a 200 OK result # Call the test pagelet that produces a 200 OK result
@ -228,3 +234,41 @@ class TestServe(AbstractHttpdTest):
self.assertIsNone(packet.pagelet) self.assertIsNone(packet.pagelet)
# Make sure a 405 Method Not Allowed header is served # Make sure a 405 Method Not Allowed header is served
self.assertEqual(405, packet.statuscode) self.assertEqual(405, packet.statuscode)
def test_serve_pagelet_libmagic(self):
# The correct Content-Type header must be guessed, if a pagelet does not provide one
self.client_sock.set_request(b'GET /just/testing/serve_pagelet_str HTTP/1.1\r\n\r\n')
HttpHandler(self.client_sock, ('::1', 45678), self.server)
packet = self.client_sock.get_response()
self.assertEqual('text/plain; charset=us-ascii', packet.headers['Content-Type'])
def test_serve_pagelet_libmagic_skipped(self):
# The Content-Type set by a pagelet should not be overwritten
self.client_sock.set_request(b'GET /just/testing/serve_pagelet_bytes HTTP/1.1\r\n\r\n')
HttpHandler(self.client_sock, ('::1', 45678), self.server)
packet = self.client_sock.get_response()
self.assertEqual('application/x-foo-bar', packet.headers['Content-Type'])
def test_serve_static_mime_extension(self):
# The correct Content-Type should be guessed by file extension primarily
self.client_sock.set_request(b'GET /teststyle.css HTTP/1.1\r\n\r\n')
HttpHandler(self.client_sock, ('::1', 45678), self.server)
packet = self.client_sock.get_response()
# libmagic would say text/plain instead
self.assertEqual('text/css; charset=us-ascii', packet.headers['Content-Type'])
def test_serve_static_mime_magic(self):
# If a file has no extension, libmagic should be used as fallback to guess the Content-Type
self.client_sock.set_request(b'GET /testdata HTTP/1.1\r\n\r\n')
HttpHandler(self.client_sock, ('::1', 45678), self.server)
packet = self.client_sock.get_response()
# Extension-based would fail, as there is no extension
self.assertEqual('text/plain; charset=utf-16be', packet.headers['Content-Type'])
def test_serve_static_mime_magic_binary(self):
# If libmagic reports the charset as "binary", the charset must be omitted from the Content-Type header
self.client_sock.set_request(b'GET /testbin.txt HTTP/1.1\r\n\r\n')
HttpHandler(self.client_sock, ('::1', 45678), self.server)
packet = self.client_sock.get_response()
# No charset should be in the header. Yes, this is a stupid example
self.assertEqual('text/plain', packet.headers['Content-Type'])

View file

@ -1 +1,2 @@
file-magic
jinja2 jinja2

5
run.sh Executable file
View file

@ -0,0 +1,5 @@
#!/bin/sh
export LD_PRELOAD=/usr/lib/libmagic.so.1
/usr/local/bin/python3 -m matemat /etc/matemat.conf /matemat.docker.conf

View file

@ -5,7 +5,7 @@ RUN useradd -d /home/matemat -m matemat
RUN mkdir -p /var/matemat/db && chown matemat:matemat -R /var/matemat/db RUN mkdir -p /var/matemat/db && chown matemat:matemat -R /var/matemat/db
RUN mkdir -p /var/matemat/upload && chown matemat:matemat -R /var/matemat/upload RUN mkdir -p /var/matemat/upload && chown matemat:matemat -R /var/matemat/upload
RUN apt-get update -qy RUN apt-get update -qy
RUN apt-get install -y --no-install-recommends sudo openssh-client git docker.io python3-dev python3-pip python3-coverage python3-setuptools build-essential RUN apt-get install -y --no-install-recommends file sudo openssh-client git docker.io python3-dev python3-pip python3-coverage python3-setuptools build-essential
RUN pip3 install wheel pycodestyle mypy RUN pip3 install wheel pycodestyle mypy
WORKDIR /home/matemat WORKDIR /home/matemat