matemat/matemat/webserver/util.py


from typing import Dict, List, Tuple, Optional

import urllib.parse

from matemat.webserver import RequestArguments, RequestArgument


def _parse_multipart(body: bytes, boundary: str) -> List[RequestArgument]:
    """
    Given a HTTP body with form-data in multipart form, and the multipart-boundary, parse the multipart items and
    return them as a list of request arguments.

    Parts that share the same field name are appended to the same RequestArgument, in the order they appear in the
    body.  Parts without a Content-Type header are treated as application/octet-stream.

    :param body: The HTTP multipart/form-data body.
    :param boundary: The multipart boundary.
    :return: A list with one RequestArgument per distinct field name; each holds the content type and raw content
             of every part submitted under that name.
    :raises ValueError: If the body is not well-formed multipart/form-data, e.g. a boundary is missing, a part
                        header is malformed, or a part has no form-data Content-Disposition with a name.
    """
    # Prepend a CRLF for the first boundary to match
    body = b'\r\n' + body
    # Generate item header boundary and terminating boundary from general boundary string
    _boundary = f'\r\n--{boundary}\r\n'.encode('utf-8')
    _end_boundary = f'\r\n--{boundary}--\r\n'.encode('utf-8')
    # Split at the end boundary and make sure there comes nothing after it
    allparts = body.split(_end_boundary, 1)
    if len(allparts) != 2 or allparts[1] != b'':
        raise ValueError('Last boundary missing or corrupted')
    # Split remaining body into its parts; anything before the first boundary is an error
    parts: List[bytes] = allparts[0].split(_boundary)
    if parts[0] != b'':
        raise ValueError('First boundary missing or corrupted')
    # Remove the first, empty part
    parts = parts[1:]

    # Results go into this dict, keyed by field name
    args: Dict[str, RequestArgument] = dict()

    # Parse each multipart part
    for part in parts:
        # Parse multipart headers; header names are case-insensitive, so they are stored in lower case
        hdr: Dict[str, str] = dict()
        while True:
            head, delim, part = part.partition(b'\r\n')
            if not delim:
                # Ran out of data before reaching the empty line that separates headers from content
                raise ValueError('Multipart part without header/body delimiter')
            # Break on header/body delimiter
            if head == b'':
                break
            # Split at the first colon only - header values may legitimately contain colons themselves
            hk, delim, hv = head.decode('utf-8').partition(':')
            if not delim:
                raise ValueError('Malformed multipart header line')
            hdr[hk.strip().lower()] = hv.strip()
        # No content type set - set broadest possible type
        content_type: str = hdr.get('content-type', 'application/octet-stream')
        # At least Content-Disposition must be present
        if 'content-disposition' not in hdr:
            raise ValueError('Missing Content-Type or Content-Disposition header')
        # Extract Content-Disposition header value and its arguments
        cd, *cdargs = hdr['content-disposition'].split(';')
        # Content-Disposition MUST be form-data; everything else is rejected
        if cd.strip() != 'form-data':
            raise ValueError(f'Unknown Content-Disposition: {cd}')
        # Extract the "name" header argument
        has_name = False
        for cdarg in cdargs:
            # Tolerate empty arguments, e.g. those produced by a trailing semicolon
            if not cdarg.strip():
                continue
            k, delim, v = cdarg.partition('=')
            if not delim:
                raise ValueError('Malformed Content-Disposition argument')
            if k.strip() != 'name':
                continue
            has_name = True
            name: str = v.strip()
            # Remove quotation marks around the (already whitespace-stripped) name value
            if len(name) >= 2 and name.startswith('"') and name.endswith('"'):
                name = name[1:-1]
            # Add the Content-Type and the content to the argument with the provided name
            if name not in args:
                args[name] = RequestArgument(name)
            args[name].append(content_type.strip(), part)
        if not has_name:
            # Content-Disposition header without name attribute
            raise ValueError('mutlipart/form-data part without name attribute')

    return list(args.values())


def _multipart_boundary(enctype: str) -> str:
    """
    Extract the boundary parameter from a multipart/form-data Content-Type header value.

    :param enctype: The full Content-Type header value, including its parameters.
    :return: The boundary string, with surrounding quotation marks removed.
    :raises ValueError: If there is no boundary parameter, or if it is empty.
    """
    # The first ';'-separated item is the media type itself, the remaining items are its parameters
    for param in enctype.split(';')[1:]:
        # Split at the first '=' only, as '=' is a valid character inside a boundary
        key, delim, value = param.partition('=')
        if not delim or key.strip().lower() != 'boundary':
            continue
        value = value.strip()
        # The boundary may be given as a quoted string
        if len(value) >= 2 and value.startswith('"') and value.endswith('"'):
            value = value[1:-1]
        if value:
            return value
        break
    raise ValueError('Multipart boundary in header not set or corrupted')


def parse_args(request: str, postbody: Optional[bytes] = None, enctype: str = 'text/plain') \
        -> Tuple[str, RequestArguments]:
    """
    Given a HTTP request path, and optionally a HTTP POST body in application/x-www-form-urlencoded or
    multipart/form-data form, parse the arguments and return them together with the base path.

    If a key is used both in GET and in POST, the POST value takes precedence, and the GET value is discarded.

    :param request: The request string to parse.
    :param postbody: The POST body to parse, defaults to None.
    :param enctype: Content-Type of the POST body; supported media types are application/x-www-form-urlencoded and
                    multipart/form-data.  Parameters following the media type (e.g. charset or boundary) are
                    permitted, and the media type is matched case-insensitively.
    :return: A tuple consisting of the base path and the parsed request arguments, each value being stored along
             with its content type.
    :raises ValueError: If the query string or the POST body is malformed, if the multipart boundary is missing,
                        or if a POST body is present with an unsupported Content-Type.
    """
    # Parse the request "URL" (i.e. only the path)
    tokens = urllib.parse.urlparse(request)
    # Parse the GET arguments; parse_qs in strict mode rejects an empty string, so skip it in that case
    getargs: Dict[str, List[str]] = dict()
    if tokens.query:
        getargs = urllib.parse.parse_qs(tokens.query, strict_parsing=True, keep_blank_values=True, errors='strict')

    args = RequestArguments()
    for k, vs in getargs.items():
        args[k].clear()
        for v in vs:
            args[k].append('text/plain', v)

    # Without a POST body, the GET arguments are all there is
    if postbody is None:
        return tokens.path, args

    # Compare the bare media type only, so that parameters such as "; charset=UTF-8" do not break the match
    mediatype: str = enctype.split(';', 1)[0].strip().lower()
    if mediatype == 'application/x-www-form-urlencoded':
        # Parse the POST body
        pb: str = postbody.decode('utf-8')
        postargs: Dict[str, List[str]] = dict()
        if pb:
            postargs = urllib.parse.parse_qs(pb, strict_parsing=True, keep_blank_values=True, errors='strict')
        # Write all POST values into the dict, overriding potential duplicates from GET
        for k, vs in postargs.items():
            args[k].clear()
            for v in vs:
                args[k].append('text/plain', v)
    elif mediatype == 'multipart/form-data':
        # Parse the multipart boundary from the Content-Type header
        boundary: str = _multipart_boundary(enctype)
        # Parse the multipart body
        mpargs = _parse_multipart(postbody, boundary)
        # Write all multipart values into the dict, overriding potential duplicates from GET
        for ra in mpargs:
            args[ra.name].clear()
            for a in ra:
                args[ra.name].append(a.get_content_type(), bytes(a))
    else:
        raise ValueError(f'Unsupported Content-Type: {enctype}')
    # Return the path and the parsed arguments
    return tokens.path, args
No results found.