import io
import mimetypes
import os
import posixpath
import threading
from datetime import datetime, timedelta
from gzip import GzipFile
from tempfile import SpooledTemporaryFile
from urllib.parse import parse_qsl, urlsplit

from django.core.exceptions import ImproperlyConfigured, SuspiciousOperation
from django.core.files.base import File
from django.utils.deconstruct import deconstructible
from django.utils.encoding import filepath_to_uri, force_bytes
from django.utils.timezone import is_naive, make_naive

from storages.base import BaseStorage
from storages.utils import (
    check_location,
    get_available_overwrite_name,
    lookup_env,
    safe_join,
    setting,
)

try:
    import boto3.session
    from botocore.client import Config
    from botocore.exceptions import ClientError
    from botocore.signers import CloudFrontSigner
except ImportError as e:
    raise ImproperlyConfigured("Could not load Boto3's S3 bindings. %s" % e)


# NOTE: these are defined as functions so both can be tested
def _use_cryptography_signer():
    # https://cryptography.io as an RSA backend
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import hashes
    from cryptography.hazmat.primitives.asymmetric import padding
    from cryptography.hazmat.primitives.serialization import (
        load_pem_private_key
    )

    def _cloud_front_signer_from_pem(key_id, pem):
        key = load_pem_private_key(
            pem, password=None, backend=default_backend())

        return CloudFrontSigner(
            key_id, lambda x: key.sign(x, padding.PKCS1v15(), hashes.SHA1()))

    return _cloud_front_signer_from_pem


def _use_rsa_signer():
    # https://stuvel.eu/rsa as an RSA backend
    import rsa

    def _cloud_front_signer_from_pem(key_id, pem):
        key = rsa.PrivateKey.load_pkcs1(pem)
        return CloudFrontSigner(key_id, lambda x: rsa.sign(x, key, 'SHA-1'))

    return _cloud_front_signer_from_pem


for _signer_factory in (_use_cryptography_signer, _use_rsa_signer):
    try:
        _cloud_front_signer_from_pem = _signer_factory()
        break
    except ImportError:
        pass
else:
    def _cloud_front_signer_from_pem(key_id, pem):
        raise ImproperlyConfigured(
            'An RSA backend is required for signing cloudfront URLs.\n'
            'Supported backends are packages: cryptography and rsa.')


@deconstructible
class S3Boto3StorageFile(File):
    """
    The default file object used by the S3Boto3Storage backend.

    This file implements file streaming using boto's multipart
    uploading functionality. The file can be opened in read or
    write mode.

    This class extends Django's File class. However, the contained
    data is only the data contained in the current buffer. So you
    should not access the contained file object directly. You should
    access the data via this class.

    Warning: This file *must* be closed using the close() method in
    order to properly write the file to S3. Be sure to close the file
    in your application.
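
    Illustrative usage (a sketch only; the storage instance and key name are
    placeholders): data written to the file is buffered locally and uploaded
    to S3 in parts, and the upload is only completed by close():

        f = storage.open("uploads/archive.bin", mode="wb")
        f.write(b"some data")
        f.close()  # flushes the buffer and completes the upload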
    """
    buffer_size = setting('AWS_S3_FILE_BUFFER_SIZE', 5242880)

    def __init__(self, name, mode, storage, buffer_size=None):
        if 'r' in mode and 'w' in mode:
            raise ValueError("Can't combine 'r' and 'w' in mode.")
        self._storage = storage
        self.name = name[len(self._storage.location):].lstrip('/')
        self._mode = mode
        self._force_mode = (lambda b: b) if 'b' in mode else (lambda b: b.decode())
        self.obj = storage.bucket.Object(name)
        if 'w' not in mode:
            # Force early RAII-style exception if object does not exist
            self.obj.load()
        self._is_dirty = False
        self._raw_bytes_written = 0
        self._file = None
        self._multipart = None
        # 5 MB is the minimum part size (if there is more than one part).
        # Amazon allows up to 10,000 parts. The default supports uploads
        # up to roughly 50 GB. Increase the part size to accommodate
        # files larger than this.
        if buffer_size is not None:
            self.buffer_size = buffer_size
        self._write_counter = 0

    @property
    def size(self):
        return self.obj.content_length

    def _get_file(self):
        if self._file is None:
            self._file = SpooledTemporaryFile(
                max_size=self._storage.max_memory_size,
                suffix=".S3Boto3StorageFile",
                dir=setting("FILE_UPLOAD_TEMP_DIR")
            )
            if 'r' in self._mode:
                self._is_dirty = False
                self.obj.download_fileobj(self._file)
                self._file.seek(0)
            if self._storage.gzip and self.obj.content_encoding == 'gzip':
                self._file = GzipFile(mode=self._mode, fileobj=self._file, mtime=0.0)
        return self._file

    def _set_file(self, value):
        self._file = value

    file = property(_get_file, _set_file)

    def read(self, *args, **kwargs):
        if 'r' not in self._mode:
            raise AttributeError("File was not opened in read mode.")
        return self._force_mode(super().read(*args, **kwargs))

    def readline(self, *args, **kwargs):
        if 'r' not in self._mode:
            raise AttributeError("File was not opened in read mode.")
        return self._force_mode(super().readline(*args, **kwargs))

    def write(self, content):
        if 'w' not in self._mode:
            raise AttributeError("File was not opened in write mode.")
        self._is_dirty = True
        if self._multipart is None:
            self._multipart = self.obj.initiate_multipart_upload(
                **self._storage._get_write_parameters(self.obj.key)
            )
        if self.buffer_size <= self._buffer_file_size:
            self._flush_write_buffer()
        bstr = force_bytes(content)
        self._raw_bytes_written += len(bstr)
        return super().write(bstr)

    @property
    def _buffer_file_size(self):
        pos = self.file.tell()
        self.file.seek(0, os.SEEK_END)
        length = self.file.tell()
        self.file.seek(pos)
        return length

    def _flush_write_buffer(self):
        """
        Flushes the write buffer.
        """
        if self._buffer_file_size:
            self._write_counter += 1
            self.file.seek(0)
            part = self._multipart.Part(self._write_counter)
            part.upload(Body=self.file.read())
            self.file.seek(0)
            self.file.truncate()

    def _create_empty_on_close(self):
        """
        Attempt to create an empty file for this key when this File is closed
        if no bytes have been written and no object already exists on S3 for
        this key.

        This behavior is meant to mimic the behavior of Django's builtin
        FileSystemStorage, where files are always created after they are
        opened in write mode:

            f = storage.open("file.txt", mode="w")
            f.close()
        """
        assert "w" in self._mode
        assert self._raw_bytes_written == 0

        try:
            # Check if the object exists on the server; if so, don't do anything
            self.obj.load()
        except ClientError as err:
            if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
                self.obj.put(
                    Body=b"", **self._storage._get_write_parameters(self.obj.key)
                )
            else:
                raise

    def close(self):
        if self._is_dirty:
            self._flush_write_buffer()
            # TODO: Possibly cache the part ids as they're being uploaded
            # instead of requesting parts from server. For now, emulating
            # s3boto's behavior.
            parts = [
                {'ETag': part.e_tag, 'PartNumber': part.part_number}
                for part in self._multipart.parts.all()
            ]
            self._multipart.complete(
                MultipartUpload={'Parts': parts})
        else:
            if self._multipart is not None:
                self._multipart.abort()
            if 'w' in self._mode and self._raw_bytes_written == 0:
                self._create_empty_on_close()
        if self._file is not None:
            self._file.close()
            self._file = None


@deconstructible
class S3Boto3Storage(BaseStorage):
    """
    Amazon Simple Storage Service using Boto3

    This storage backend supports opening files in read or write mode and
    supports streaming (buffering) data in chunks to S3 when writing.
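
    A minimal configuration sketch (the bucket and region values here are
    purely illustrative); the backend reads its configuration from Django
    settings such as:

        AWS_STORAGE_BUCKET_NAME = "my-bucket"
        AWS_S3_REGION_NAME = "us-east-1"

    and is then used through Django's regular storage API, for example
    default_storage.save() and default_storage.url().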
""" default_content_type = 'application/octet-stream' # If config provided in init, signature_version and addressing_style settings/args are ignored. config = None # used for looking up the access and secret key from env vars access_key_names = ['AWS_S3_ACCESS_KEY_ID', 'AWS_ACCESS_KEY_ID'] secret_key_names = ['AWS_S3_SECRET_ACCESS_KEY', 'AWS_SECRET_ACCESS_KEY'] security_token_names = ['AWS_SESSION_TOKEN', 'AWS_SECURITY_TOKEN'] security_token = None def __init__(self, **settings): super().__init__(**settings) check_location(self) # Backward-compatibility: given the anteriority of the SECURE_URL setting # we fall back to https if specified in order to avoid the construction # of unsecure urls. if self.secure_urls: self.url_protocol = 'https:' self._bucket = None self._connections = threading.local() self.access_key, self.secret_key = self._get_access_keys() self.security_token = self._get_security_token() if not self.config: self.config = Config( s3={'addressing_style': self.addressing_style}, signature_version=self.signature_version, proxies=self.proxies, ) def get_cloudfront_signer(self, key_id, key): return _cloud_front_signer_from_pem(key_id, key) def get_default_settings(self): cloudfront_key_id = setting('AWS_CLOUDFRONT_KEY_ID') cloudfront_key = setting('AWS_CLOUDFRONT_KEY') if bool(cloudfront_key_id) ^ bool(cloudfront_key): raise ImproperlyConfigured( 'Both AWS_CLOUDFRONT_KEY_ID and AWS_CLOUDFRONT_KEY must be ' 'provided together.' ) if cloudfront_key_id: cloudfront_signer = self.get_cloudfront_signer(cloudfront_key_id, cloudfront_key) else: cloudfront_signer = None return { "access_key": setting('AWS_S3_ACCESS_KEY_ID', setting('AWS_ACCESS_KEY_ID')), "secret_key": setting('AWS_S3_SECRET_ACCESS_KEY', setting('AWS_SECRET_ACCESS_KEY')), "file_overwrite": setting('AWS_S3_FILE_OVERWRITE', True), "object_parameters": setting('AWS_S3_OBJECT_PARAMETERS', {}), "bucket_name": setting('AWS_STORAGE_BUCKET_NAME'), "bucket_acl": setting('AWS_BUCKET_ACL', 'public-read'), "querystring_auth": setting('AWS_QUERYSTRING_AUTH', True), "querystring_expire": setting('AWS_QUERYSTRING_EXPIRE', 3600), "signature_version": setting('AWS_S3_SIGNATURE_VERSION'), "location": setting('AWS_LOCATION', ''), "custom_domain": setting('AWS_S3_CUSTOM_DOMAIN'), "cloudfront_signer": cloudfront_signer, "addressing_style": setting('AWS_S3_ADDRESSING_STYLE'), "secure_urls": setting('AWS_S3_SECURE_URLS', True), "file_name_charset": setting('AWS_S3_FILE_NAME_CHARSET', 'utf-8'), "gzip": setting('AWS_IS_GZIPPED', False), "gzip_content_types": setting('GZIP_CONTENT_TYPES', ( 'text/css', 'text/javascript', 'application/javascript', 'application/x-javascript', 'image/svg+xml', )), "url_protocol": setting('AWS_S3_URL_PROTOCOL', 'http:'), "endpoint_url": setting('AWS_S3_ENDPOINT_URL'), "proxies": setting('AWS_S3_PROXIES'), "region_name": setting('AWS_S3_REGION_NAME'), "use_ssl": setting('AWS_S3_USE_SSL', True), "verify": setting('AWS_S3_VERIFY', None), "max_memory_size": setting('AWS_S3_MAX_MEMORY_SIZE', 0), } def __getstate__(self): state = self.__dict__.copy() state.pop('_connections', None) state.pop('_bucket', None) return state def __setstate__(self, state): state['_connections'] = threading.local() state['_bucket'] = None self.__dict__ = state @property def connection(self): connection = getattr(self._connections, 'connection', None) if connection is None: session = boto3.session.Session() self._connections.connection = session.resource( 's3', aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key, 
    @property
    def bucket(self):
        """
        Get the current bucket. If there is no current bucket object
        create it.
        """
        if self._bucket is None:
            self._bucket = self.connection.Bucket(self.bucket_name)
        return self._bucket

    def _get_access_keys(self):
        """
        Gets the access keys to use when accessing S3. If none are provided in
        the settings then get them from the environment variables.
        """
        access_key = self.access_key or lookup_env(S3Boto3Storage.access_key_names)
        secret_key = self.secret_key or lookup_env(S3Boto3Storage.secret_key_names)
        return access_key, secret_key

    def _get_security_token(self):
        """
        Gets the security token to use when accessing S3. Get it from the
        environment variables.
        """
        security_token = self.security_token or lookup_env(S3Boto3Storage.security_token_names)
        return security_token

    def _clean_name(self, name):
        """
        Cleans the name so that Windows style paths work
        """
        # Normalize Windows style paths
        clean_name = posixpath.normpath(name).replace('\\', '/')

        # posixpath.normpath() can strip trailing slashes so we implement
        # a workaround here.
        if name.endswith('/') and not clean_name.endswith('/'):
            # Add a trailing slash as it was stripped.
            clean_name += '/'

        return clean_name

    def _normalize_name(self, name):
        """
        Normalizes the name so that paths like /path/to/ignored/../something.txt
        work. We check to make sure that the path pointed to is not outside
        the directory specified by the LOCATION setting.
        """
        try:
            return safe_join(self.location, name)
        except ValueError:
            raise SuspiciousOperation("Attempted access to '%s' denied." % name)

    def _compress_content(self, content):
        """Gzip a given string content."""
        content.seek(0)
        zbuf = io.BytesIO()
        # The GZIP header has a modification time attribute (see http://www.zlib.org/rfc-gzip.html).
        # This means each time a file is compressed it changes, even if the other contents don't change.
        # For S3 this defeats detection of changes using MD5 sums on gzipped files.
        # Fixing the mtime at 0.0 at compression time avoids this problem.
        zfile = GzipFile(mode='wb', fileobj=zbuf, mtime=0.0)
        try:
            zfile.write(force_bytes(content.read()))
        finally:
            zfile.close()
        zbuf.seek(0)
        # Boto 2 returned the InMemoryUploadedFile with the file pointer replaced,
        # but Boto 3 seems to have issues with that. No need for fp.name in Boto3,
        # so just return the BytesIO directly.
        return zbuf
    def _open(self, name, mode='rb'):
        name = self._normalize_name(self._clean_name(name))
        try:
            f = S3Boto3StorageFile(name, mode, self)
        except ClientError as err:
            if err.response['ResponseMetadata']['HTTPStatusCode'] == 404:
                raise FileNotFoundError('File does not exist: %s' % name)
            raise  # Let it bubble up if it was some other error
        return f

    def _save(self, name, content):
        cleaned_name = self._clean_name(name)
        name = self._normalize_name(cleaned_name)
        params = self._get_write_parameters(name, content)

        if (self.gzip and
                params['ContentType'] in self.gzip_content_types and
                'ContentEncoding' not in params):
            content = self._compress_content(content)
            params['ContentEncoding'] = 'gzip'

        obj = self.bucket.Object(name)
        content.seek(0, os.SEEK_SET)
        obj.upload_fileobj(content, ExtraArgs=params)
        return cleaned_name

    def delete(self, name):
        name = self._normalize_name(self._clean_name(name))
        self.bucket.Object(name).delete()

    def exists(self, name):
        name = self._normalize_name(self._clean_name(name))
        try:
            self.connection.meta.client.head_object(Bucket=self.bucket_name, Key=name)
            return True
        except ClientError:
            return False

    def listdir(self, name):
        path = self._normalize_name(self._clean_name(name))
        # The path needs to end with a slash, but if the root is empty, leave it.
        if path and not path.endswith('/'):
            path += '/'

        directories = []
        files = []
        paginator = self.connection.meta.client.get_paginator('list_objects')
        pages = paginator.paginate(Bucket=self.bucket_name, Delimiter='/', Prefix=path)
        for page in pages:
            for entry in page.get('CommonPrefixes', ()):
                directories.append(posixpath.relpath(entry['Prefix'], path))
            for entry in page.get('Contents', ()):
                files.append(posixpath.relpath(entry['Key'], path))
        return directories, files

    def size(self, name):
        name = self._normalize_name(self._clean_name(name))
        return self.bucket.Object(name).content_length

    def _get_write_parameters(self, name, content=None):
        params = {}

        _type, encoding = mimetypes.guess_type(name)
        content_type = getattr(content, 'content_type', None)
        content_type = content_type or _type or self.default_content_type

        params['ContentType'] = content_type
        if encoding:
            params['ContentEncoding'] = encoding

        params.update(self.get_object_parameters(name))
        return params

    def get_object_parameters(self, name):
        """
        Returns a dictionary that is passed to the file upload. Override this
        method to adjust upload parameters on a per-object basis, e.g. to set
        ContentDisposition.

        By default, returns the value of AWS_S3_OBJECT_PARAMETERS.

        Setting ContentEncoding will prevent objects from being automatically
        gzipped.
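
        For example (a sketch only; the subclass name and header value below
        are illustrative, not part of this backend), PDFs could be served as
        downloads by overriding this method:

            class AttachmentStorage(S3Boto3Storage):
                def get_object_parameters(self, name):
                    params = super().get_object_parameters(name)
                    if name.endswith(".pdf"):
                        params["ContentDisposition"] = "attachment"
                    return params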
        """
        return self.object_parameters.copy()

    def get_modified_time(self, name):
        """
        Returns an (aware) datetime object containing the last modified time if
        USE_TZ is True, otherwise returns a naive datetime in the local timezone.
        """
        name = self._normalize_name(self._clean_name(name))
        entry = self.bucket.Object(name)
        if setting('USE_TZ'):
            # boto3 returns TZ aware timestamps
            return entry.last_modified
        else:
            return make_naive(entry.last_modified)

    def modified_time(self, name):
        """Returns a naive datetime object containing the last modified time."""
        # If USE_TZ=False then get_modified_time will return a naive datetime,
        # so we just return that; else we have to localize and strip the tz.
        mtime = self.get_modified_time(name)
        return mtime if is_naive(mtime) else make_naive(mtime)

    def _strip_signing_parameters(self, url):
        # Boto3 does not currently support generating URLs that are unsigned.
        # Instead we take the signed URLs and strip any querystring params
        # related to signing and expiration.
        # Note that this may end up with URLs that are still invalid, especially
        # if params are passed in that only work with signed URLs, e.g. response
        # header params.
        # The code attempts to strip all query parameters that match names of
        # known parameters from v2 and v4 signatures, regardless of the actual
        # signature version used.
        split_url = urlsplit(url)
        qs = parse_qsl(split_url.query, keep_blank_values=True)
        blacklist = {
            'x-amz-algorithm', 'x-amz-credential', 'x-amz-date',
            'x-amz-expires', 'x-amz-signedheaders', 'x-amz-signature',
            'x-amz-security-token', 'awsaccesskeyid', 'expires', 'signature',
        }
        filtered_qs = ((key, val) for key, val in qs if key.lower() not in blacklist)
        # Note: Parameters that did not have a value in the original query string
        # will have an '=' sign appended to them, e.g. ?foo&bar becomes ?foo=&bar=
        joined_qs = ('='.join(keyval) for keyval in filtered_qs)
        split_url = split_url._replace(query="&".join(joined_qs))
        return split_url.geturl()

    def url(self, name, parameters=None, expire=None, http_method=None):
        # Preserve the trailing slash after normalizing the path.
        name = self._normalize_name(self._clean_name(name))
        if expire is None:
            expire = self.querystring_expire

        if self.custom_domain:
            url = "{}//{}/{}".format(
                self.url_protocol, self.custom_domain, filepath_to_uri(name))

            if self.querystring_auth and self.cloudfront_signer:
                expiration = datetime.utcnow() + timedelta(seconds=expire)
                return self.cloudfront_signer.generate_presigned_url(
                    url, date_less_than=expiration)

            return url

        params = parameters.copy() if parameters else {}
        params['Bucket'] = self.bucket.name
        params['Key'] = name
        url = self.bucket.meta.client.generate_presigned_url(
            'get_object', Params=params, ExpiresIn=expire, HttpMethod=http_method)
        if self.querystring_auth:
            return url
        return self._strip_signing_parameters(url)

    def get_available_name(self, name, max_length=None):
        """Overwrite existing file with the same name."""
        name = self._clean_name(name)
        if self.file_overwrite:
            return get_available_overwrite_name(name, max_length)
        return super().get_available_name(name, max_length)
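
# Illustrative behavior of S3Boto3Storage.url() (a sketch; the domain, bucket
# and key names are placeholders, and the exact URL shape varies by region and
# addressing style): with AWS_QUERYSTRING_AUTH = True (the default) a presigned
# URL is returned, e.g.
#
#     storage.url("docs/report.pdf")
#     # -> "https://my-bucket.s3.amazonaws.com/docs/report.pdf?X-Amz-Signature=..."
#
# With AWS_QUERYSTRING_AUTH = False the same call returns the URL with the
# signing-related query parameters stripped by _strip_signing_parameters().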