Merge branch '18-libmagic-mime-type-detection' into 'master'
Resolve "Use a libmagic wrapper for MIME type detection". Closes #18. See merge request s3lph/matemat!19
This commit is contained in:
commit
7f4dc078ec
8 changed files with 88 additions and 14 deletions
|
@ -1,5 +1,5 @@
|
||||||
---
|
---
|
||||||
image: s3lph/matemat-ci:20180711-02
|
image: s3lph/matemat-ci:20180720-01
|
||||||
|
|
||||||
stages:
|
stages:
|
||||||
- test
|
- test
|
||||||
|
|
|
@ -2,8 +2,9 @@
|
||||||
FROM python:3.6-alpine
|
FROM python:3.6-alpine
|
||||||
|
|
||||||
RUN mkdir -p /var/matemat/db /var/matemat/upload
|
RUN mkdir -p /var/matemat/db /var/matemat/upload
|
||||||
|
RUN apk --update add libmagic
|
||||||
ADD . /
|
ADD . /
|
||||||
RUN pip3 install -r /requirements.txt
|
RUN pip3 install -r /requirements.txt
|
||||||
|
|
||||||
EXPOSE 80/tcp
|
EXPOSE 80/tcp
|
||||||
CMD [ "/usr/local/bin/python3", "-m", "matemat", "/etc/matemat.conf", "/matemat.docker.conf" ]
|
CMD [ "/run.sh" ]
|
||||||
|
|
|
@ -18,6 +18,7 @@ This project intends to provide a well-tested and maintainable alternative to
|
||||||
|
|
||||||
- Python 3 (>=3.6)
|
- Python 3 (>=3.6)
|
||||||
- Python dependencies:
|
- Python dependencies:
|
||||||
|
- file-magic
|
||||||
- jinja2
|
- jinja2
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
|
@ -5,6 +5,7 @@ import logging
|
||||||
import os
|
import os
|
||||||
import socket
|
import socket
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
import magic
|
||||||
from socketserver import TCPServer
|
from socketserver import TCPServer
|
||||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||||
from http.cookies import SimpleCookie
|
from http.cookies import SimpleCookie
|
||||||
|
@ -308,7 +309,6 @@ class HttpHandler(BaseHTTPRequestHandler):
|
||||||
if path in _PAGELET_PATHS:
|
if path in _PAGELET_PATHS:
|
||||||
# Prepare some headers. Those can still be overwritten by the pagelet
|
# Prepare some headers. Those can still be overwritten by the pagelet
|
||||||
headers: Dict[str, str] = {
|
headers: Dict[str, str] = {
|
||||||
'Content-Type': 'text/html',
|
|
||||||
'Cache-Control': 'no-cache'
|
'Cache-Control': 'no-cache'
|
||||||
}
|
}
|
||||||
# Call the pagelet function
|
# Call the pagelet function
|
||||||
|
@ -328,6 +328,20 @@ class HttpHandler(BaseHTTPRequestHandler):
|
||||||
f'matemat_session_id={session_id}; expires={expires}')
|
f'matemat_session_id={session_id}; expires={expires}')
|
||||||
# Compute the body length and add the appropriate header
|
# Compute the body length and add the appropriate header
|
||||||
headers['Content-Length'] = str(len(data))
|
headers['Content-Length'] = str(len(data))
|
||||||
|
# If the pagelet did not set its own Content-Type header, use libmagic to guess an appropriate one
|
||||||
|
if 'Content-Type' not in headers:
|
||||||
|
try:
|
||||||
|
filemagic: magic.FileMagic = magic.detect_from_content(data)
|
||||||
|
mimetype: str = filemagic.mime_type
|
||||||
|
charset: str = filemagic.encoding
|
||||||
|
except ValueError:
|
||||||
|
mimetype = 'application/octet-stream'
|
||||||
|
charset = 'binary'
|
||||||
|
# Only append the charset if it is not "binary"
|
||||||
|
if charset == 'binary':
|
||||||
|
headers['Content-Type'] = mimetype
|
||||||
|
else:
|
||||||
|
headers['Content-Type'] = f'{mimetype}; charset={charset}'
|
||||||
# Send all headers set by the pagelet
|
# Send all headers set by the pagelet
|
||||||
for name, value in headers.items():
|
for name, value in headers.items():
|
||||||
self.send_header(name, value)
|
self.send_header(name, value)
|
||||||
|
@ -365,13 +379,21 @@ class HttpHandler(BaseHTTPRequestHandler):
|
||||||
data = f.read()
|
data = f.read()
|
||||||
# File read successfully, send 'OK' header
|
# File read successfully, send 'OK' header
|
||||||
self.send_response(200)
|
self.send_response(200)
|
||||||
# TODO: Guess the MIME type. Unfortunately this call solely relies on the file extension, not ideal?
|
# Guess the MIME type by file extension, or use libmagic as fallback
|
||||||
mimetype, _ = mimetypes.guess_type(filepath)
|
# Use libmagic to guess the charset
|
||||||
# Fall back to octet-stream type, if unknown
|
try:
|
||||||
if mimetype is None:
|
exttype: str = mimetypes.guess_type(filepath)[0]
|
||||||
|
filemagic: magic.FileMagic = magic.detect_from_filename(filepath)
|
||||||
|
mimetype: str = exttype if exttype is not None else filemagic.mime_type
|
||||||
|
charset: str = filemagic.encoding
|
||||||
|
except ValueError:
|
||||||
mimetype = 'application/octet-stream'
|
mimetype = 'application/octet-stream'
|
||||||
# Send content type and length header
|
charset = 'binary'
|
||||||
|
# Send content type and length header. Only set the charset if it's not "binary"
|
||||||
|
if charset == 'binary':
|
||||||
self.send_header('Content-Type', mimetype)
|
self.send_header('Content-Type', mimetype)
|
||||||
|
else:
|
||||||
|
self.send_header('Content-Type', f'{mimetype}; charset={charset}')
|
||||||
self.send_header('Content-Length', str(len(data)))
|
self.send_header('Content-Length', str(len(data)))
|
||||||
self.send_header('Last-Modified', fileage.strftime('%a, %d %b %Y %H:%M:%S GMT'))
|
self.send_header('Last-Modified', fileage.strftime('%a, %d %b %Y %H:%M:%S GMT'))
|
||||||
self.send_header('Cache-Control', 'max-age=1')
|
self.send_header('Cache-Control', 'max-age=1')
|
||||||
|
|
|
@ -17,7 +17,6 @@ def serve_test_pagelet_str(method: str,
|
||||||
session_vars: Dict[str, Any],
|
session_vars: Dict[str, Any],
|
||||||
headers: Dict[str, str],
|
headers: Dict[str, str],
|
||||||
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
|
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
|
||||||
headers['Content-Type'] = 'text/plain'
|
|
||||||
return 'serve test pagelet str'
|
return 'serve test pagelet str'
|
||||||
|
|
||||||
|
|
||||||
|
@ -28,7 +27,7 @@ def serve_test_pagelet_bytes(method: str,
|
||||||
session_vars: Dict[str, Any],
|
session_vars: Dict[str, Any],
|
||||||
headers: Dict[str, str],
|
headers: Dict[str, str],
|
||||||
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
|
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
|
||||||
headers['Content-Type'] = 'application/octet-stream'
|
headers['Content-Type'] = 'application/x-foo-bar'
|
||||||
return b'serve\x80test\xffpagelet\xfebytes'
|
return b'serve\x80test\xffpagelet\xfebytes'
|
||||||
|
|
||||||
|
|
||||||
|
@ -49,7 +48,6 @@ def serve_test_pagelet_template(method: str,
|
||||||
session_vars: Dict[str, Any],
|
session_vars: Dict[str, Any],
|
||||||
headers: Dict[str, str],
|
headers: Dict[str, str],
|
||||||
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
|
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
|
||||||
headers['Content-Type'] = 'text/plain'
|
|
||||||
return TemplateResponse('test.txt', what='World')
|
return TemplateResponse('test.txt', what='World')
|
||||||
|
|
||||||
|
|
||||||
|
@ -62,7 +60,6 @@ def serve_test_pagelet_fail(method: str,
|
||||||
headers: Dict[str, str],
|
headers: Dict[str, str],
|
||||||
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
|
pagelet_variables: Dict[str, str]) -> Union[bytes, str, PageletResponse]:
|
||||||
session_vars['test'] = 'hello, world!'
|
session_vars['test'] = 'hello, world!'
|
||||||
headers['Content-Type'] = 'text/plain'
|
|
||||||
raise HttpException(599, 'Error expected during unit testing')
|
raise HttpException(599, 'Error expected during unit testing')
|
||||||
|
|
||||||
|
|
||||||
|
@ -81,6 +78,15 @@ class TestServe(AbstractHttpdTest):
|
||||||
with open(forbidden, 'w') as f:
|
with open(forbidden, 'w') as f:
|
||||||
f.write('This should not be readable')
|
f.write('This should not be readable')
|
||||||
os.chmod(forbidden, 0)
|
os.chmod(forbidden, 0)
|
||||||
|
# Create a CSS resource whose MIME type should be detected by file extension
|
||||||
|
with open(os.path.join(self.tempdir.name, 'teststyle.css'), 'w') as f:
|
||||||
|
f.write('.ninja { display: none; }\n')
|
||||||
|
# Create a file without extension (containing UTF-16 text with BOM); libmagic should take over
|
||||||
|
with open(os.path.join(self.tempdir.name, 'testdata'), 'wb') as f:
|
||||||
|
f.write(b'\xFE\xFFH\x00e\x00l\x00l\x00o\x00,\x00 \x00w\x00o\x00r\x00l\x00d\x00!\x00\n\x00')
|
||||||
|
# Create a file that will yield "text/plain; charset=binary"
|
||||||
|
with open(os.path.join(self.tempdir.name, 'testbin.txt'), 'wb') as f:
|
||||||
|
f.write(b'\x00\x00\x00\x00\x00\x00\x00\x00')
|
||||||
|
|
||||||
def test_serve_pagelet_str(self):
|
def test_serve_pagelet_str(self):
|
||||||
# Call the test pagelet that produces a 200 OK result
|
# Call the test pagelet that produces a 200 OK result
|
||||||
|
@ -228,3 +234,41 @@ class TestServe(AbstractHttpdTest):
|
||||||
self.assertIsNone(packet.pagelet)
|
self.assertIsNone(packet.pagelet)
|
||||||
# Make sure a 405 Method Not Allowed header is served
|
# Make sure a 405 Method Not Allowed header is served
|
||||||
self.assertEqual(405, packet.statuscode)
|
self.assertEqual(405, packet.statuscode)
|
||||||
|
|
||||||
|
def test_serve_pagelet_libmagic(self):
|
||||||
|
# The correct Content-Type header must be guessed, if a pagelet does not provide one
|
||||||
|
self.client_sock.set_request(b'GET /just/testing/serve_pagelet_str HTTP/1.1\r\n\r\n')
|
||||||
|
HttpHandler(self.client_sock, ('::1', 45678), self.server)
|
||||||
|
packet = self.client_sock.get_response()
|
||||||
|
self.assertEqual('text/plain; charset=us-ascii', packet.headers['Content-Type'])
|
||||||
|
|
||||||
|
def test_serve_pagelet_libmagic_skipped(self):
|
||||||
|
# The Content-Type set by a pagelet should not be overwritten
|
||||||
|
self.client_sock.set_request(b'GET /just/testing/serve_pagelet_bytes HTTP/1.1\r\n\r\n')
|
||||||
|
HttpHandler(self.client_sock, ('::1', 45678), self.server)
|
||||||
|
packet = self.client_sock.get_response()
|
||||||
|
self.assertEqual('application/x-foo-bar', packet.headers['Content-Type'])
|
||||||
|
|
||||||
|
def test_serve_static_mime_extension(self):
|
||||||
|
# The correct Content-Type should be guessed by file extension primarily
|
||||||
|
self.client_sock.set_request(b'GET /teststyle.css HTTP/1.1\r\n\r\n')
|
||||||
|
HttpHandler(self.client_sock, ('::1', 45678), self.server)
|
||||||
|
packet = self.client_sock.get_response()
|
||||||
|
# libmagic would say text/plain instead
|
||||||
|
self.assertEqual('text/css; charset=us-ascii', packet.headers['Content-Type'])
|
||||||
|
|
||||||
|
def test_serve_static_mime_magic(self):
|
||||||
|
# If a file has no extension, the Content-Type should be guessed by libmagic as a fallback
|
||||||
|
self.client_sock.set_request(b'GET /testdata HTTP/1.1\r\n\r\n')
|
||||||
|
HttpHandler(self.client_sock, ('::1', 45678), self.server)
|
||||||
|
packet = self.client_sock.get_response()
|
||||||
|
# Extension-based would fail, as there is no extension
|
||||||
|
self.assertEqual('text/plain; charset=utf-16be', packet.headers['Content-Type'])
|
||||||
|
|
||||||
|
def test_serve_static_mime_magic_binary(self):
|
||||||
|
# If libmagic reports the charset as "binary", no charset should be appended to the Content-Type
|
||||||
|
self.client_sock.set_request(b'GET /testbin.txt HTTP/1.1\r\n\r\n')
|
||||||
|
HttpHandler(self.client_sock, ('::1', 45678), self.server)
|
||||||
|
packet = self.client_sock.get_response()
|
||||||
|
# No charset should be in the header. Yes, this is a stupid example
|
||||||
|
self.assertEqual('text/plain', packet.headers['Content-Type'])
|
||||||
|
|
|
@ -1 +1,2 @@
|
||||||
|
file-magic
|
||||||
jinja2
|
jinja2
|
||||||
|
|
5
run.sh
Executable file
5
run.sh
Executable file
|
@ -0,0 +1,5 @@
|
||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
export LD_PRELOAD=/usr/lib/libmagic.so.1
|
||||||
|
|
||||||
|
/usr/local/bin/python3 -m matemat /etc/matemat.conf /matemat.docker.conf
|
|
@ -5,7 +5,7 @@ RUN useradd -d /home/matemat -m matemat
|
||||||
RUN mkdir -p /var/matemat/db && chown matemat:matemat -R /var/matemat/db
|
RUN mkdir -p /var/matemat/db && chown matemat:matemat -R /var/matemat/db
|
||||||
RUN mkdir -p /var/matemat/upload && chown matemat:matemat -R /var/matemat/upload
|
RUN mkdir -p /var/matemat/upload && chown matemat:matemat -R /var/matemat/upload
|
||||||
RUN apt-get update -qy
|
RUN apt-get update -qy
|
||||||
RUN apt-get install -y --no-install-recommends sudo openssh-client git docker.io python3-dev python3-pip python3-coverage python3-setuptools build-essential
|
RUN apt-get install -y --no-install-recommends file sudo openssh-client git docker.io python3-dev python3-pip python3-coverage python3-setuptools build-essential
|
||||||
RUN pip3 install wheel pycodestyle mypy
|
RUN pip3 install wheel pycodestyle mypy
|
||||||
|
|
||||||
WORKDIR /home/matemat
|
WORKDIR /home/matemat
|
||||||
|
|
Loading…
Reference in a new issue