Django: Validate file type of uploaded file

Solution 1:

Validating files is a common challenge, so I would like to use a validator:

import magic

from django.core.exceptions import ValidationError
from django.template.defaultfilters import filesizeformat
from django.utils.deconstruct import deconstructible


@deconstructible
class FileValidator(object):
    """Validate an uploaded file's size and sniffed content type.

    Instances are callables, so they can be placed in the ``validators``
    list of a file field.

    :param max_size: largest accepted size in bytes, or None for no upper bound
    :param min_size: smallest accepted size in bytes, or None for no lower bound
    :param content_types: collection of accepted MIME type strings; an empty
        collection disables the content-type check
    """
    error_messages = {
     'max_size': ("Ensure this file size is not greater than %(max_size)s."
                  " Your file size is %(size)s."),
     'min_size': ("Ensure this file size is not less than %(min_size)s. "
                  "Your file size is %(size)s."),
     'content_type': "Files of type %(content_type)s are not supported.",
    }

    def __init__(self, max_size=None, min_size=None, content_types=()):
        self.max_size = max_size
        self.min_size = min_size
        self.content_types = content_types

    def __call__(self, data):
        """Check ``data`` against every configured constraint.

        :param data: the uploaded file; must expose ``size``, ``read()``
            and ``seek()``
        :raise: ValidationError with code ``max_size``, ``min_size`` or
            ``content_type`` for the first constraint that fails
        """
        if self.max_size is not None and data.size > self.max_size:
            params = {
                'max_size': filesizeformat(self.max_size),
                'size': filesizeformat(data.size),
            }
            raise ValidationError(self.error_messages['max_size'],
                                  'max_size', params)

        if self.min_size is not None and data.size < self.min_size:
            params = {
                'min_size': filesizeformat(self.min_size),
                'size': filesizeformat(data.size),
            }
            raise ValidationError(self.error_messages['min_size'],
                                  'min_size', params)

        if self.content_types:
            # Rewind before sniffing: if the stream has already been
            # (partly) consumed, read() would hand back a truncated or empty
            # buffer and the type would be detected from that instead.
            data.seek(0)
            content_type = magic.from_buffer(data.read(), mime=True)
            # Rewind again so whoever handles the file next sees all of it.
            data.seek(0)

            if content_type not in self.content_types:
                params = {'content_type': content_type}
                raise ValidationError(self.error_messages['content_type'],
                                      'content_type', params)

    def __eq__(self, other):
        """Two validators are equal when all their constraints are equal."""
        return (
            isinstance(other, FileValidator) and
            self.max_size == other.max_size and
            self.min_size == other.min_size and
            self.content_types == other.content_types
        )

Then you can use FileValidator in your models.FileField or forms.FileField as follows:

# Accept only XML uploads of at most 100 KiB.
validate_file = FileValidator(
    max_size=100 * 1024,
    content_types=('application/xml',),
)
file = models.FileField(
    upload_to=settings.XML_ROOT,
    validators=[validate_file],
)

Solution 2:

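From Django 1.11 onward, you can also use the built-in FileExtensionValidator. Keep in mind that it only checks the extension of the file name; it does not inspect the file's contents.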

from django.core.validators import FileExtensionValidator
class UploadedFile(models.Model):
    """Model whose ``file`` field is validated against the ``xml`` extension."""

    file = models.FileField(
        upload_to=settings.XML_ROOT,
        validators=[
            FileExtensionValidator(allowed_extensions=['xml']),
        ],
    )

Note this must be used on a FileField and won't work on a CharField (for example), since the validator validates on value.name.

ref: https://docs.djangoproject.com/en/dev/ref/validators/#fileextensionvalidator

Solution 3:

For posterity: the solution is to call the uploaded file's read() method and pass the resulting bytes to magic.from_buffer.

class UploadedFileForm(ModelForm):
    """Model form that accepts ``file`` only when libmagic describes it as XML."""

    def clean_file(self):
        """Return the uploaded file after checking its sniffed description.

        :raise: ValidationError if the libmagic description of the contents
            does not contain "XML"
        """
        upload = self.cleaned_data.get("file", False)
        if not upload:
            # No file was submitted, so there is nothing to read or sniff.
            return upload

        # Sniff from the start of the stream, then rewind so later
        # consumers of the file do not find it positioned at EOF.
        upload.seek(0)
        filetype = magic.from_buffer(upload.read())
        upload.seek(0)

        if "XML" not in filetype:
            raise ValidationError("File is not XML.")
        return upload

    class Meta:
        model = models.UploadedFile
        exclude = ('project',)

Solution 4:

I think what you want to do is clean the uploaded file in the form's clean_<your_field_name>() method. By the time that method runs, the data is already available on your server, provided it was submitted as a normal HTTP POST request.

Also, if you consider this inefficient, explore the different Django file upload backends and how to process uploads as a stream.

If you need to consider the security of the system when dealing with uploads:

  • Make sure uploaded file has correct extension

  • Make sure the mimetype matches the file extension

If you are worried about users uploading exploit files (to attack your site):

  • Rewrite all the file contents on save to get rid of possible extra (exploit) payload (so you cannot embed HTML in XML which the browser would interpret as a site-origin HTML file when downloading)

  • Make sure you send a Content-Disposition header when serving the file for download

Some more info here: http://opensourcehacker.com/2013/07/31/secure-user-uploads-and-exploiting-served-user-content/

Below is an example of how I sanitize uploaded images:

class Example(models.Model):
    """Model with an optional ``image`` upload."""

    image = models.ImageField(
        upload_to=filename_gen("participant-images/"),
        blank=True,
        null=True,
    )


class Example(forms.ModelForm):
    def clean_image(self):
        """Run the uploaded image through the safety checks and return it."""
        uploaded = self.cleaned_data.get('image', False)
        # Raises on unacceptable content; rewrites the upload in place otherwise.
        utils.ensure_safe_user_image(uploaded)
        return uploaded


def ensure_safe_user_image(image):
    """ Perform various checks to sanitize user uploaded image data.

    Checks the upload size, sniffs the real content type from the bytes,
    then re-encodes the picture through PIL so that only image data
    survives, and finally makes sure the file name carries an extension
    matching the detected type. The upload object is modified in place.

    :param: InMemoryUploadedFile instance (Django form field value);
            a falsy value (nothing uploaded) is ignored

    :raise: ValidationError in the case the image content has issues
            (too large, or not a JPEG/PNG). Data PIL cannot decode surfaces
            as whatever exception PIL raises.
    """

    if not image:
        return

    # Function-scope import so the block does not depend on a file-level one.
    from io import BytesIO

    assert isinstance(image, InMemoryUploadedFile), "Image rewrite has been only tested on in-memory upload backend"

    # Make sure the image is not too big, so that PIL does not trash the
    # server. Use the public ``size`` attribute rather than the private
    # ``_size`` backing field.
    if image.size > 4*1024*1024:
        raise ValidationError("Image file too large - the limit is 4 megabytes")

    # Then peek at the header to see what the content really is
    image.file.seek(0)
    mime = magic.from_buffer(image.file.getvalue(), mime=True)
    if mime not in ("image/png", "image/jpeg"):
        raise ValidationError("Image is not valid. Please upload a JPEG or PNG image.")

    doc_type = mime.split("/")[-1].upper()

    # Decode the original upload from its first byte
    image.file.seek(0)
    pil_image = Image.open(image.file)

    # Prefer the LANCZOS name; fall back to ANTIALIAS only where the
    # installed imaging library does not offer LANCZOS.
    resample = getattr(Image, "LANCZOS", None)
    if resample is None:
        resample = Image.ANTIALIAS

    # Rewrite the image contents in memory
    # (bails out with exception on bad data).
    # Encoded image data is binary, so collect it in a bytes buffer.
    buf = BytesIO()
    pil_image.thumbnail((2048, 2048), resample)
    pil_image.save(buf, doc_type)

    # Hand over the rewritten data rewound, with a size that matches it,
    # so later readers neither start at EOF nor see the pre-rewrite size.
    buf.seek(0)
    image.file = buf
    image.size = len(buf.getvalue())

    # Make sure the image has valid extension (can't upload .htm image).
    # The comparison is case-insensitive and ".jpg" counts for JPEG, so
    # "photo.JPG" is left alone instead of becoming "photo.JPG.jpeg".
    extensions = {
        "PNG": (u".png",),
        "JPEG": (u".jpeg", u".jpg"),
    }[doc_type]
    if not image.name.lower().endswith(extensions):
        image.name = image.name + extensions[0]