import codecs
import hashlib
import logging
import os
import urlparse
import mimetypes

from ClonedResourceDetails import ClonedResourceDetails

# Snippet injected right before the closing </head> tag of the original page.
# It loads /session.js and POSTs the `session` object, serialized as JSON, to
# "<current location>additional_data".
CLIENT_SIDE_FORENSICS_CODE = (
    '<script src="/session.js"></script> '
    '<script> '
    'var xhr = new XMLHttpRequest(); '
    'xhr.open("POST", window.location.href + "additional_data", true); '
    "xhr.setRequestHeader('Content-Type', 'application/json'); "
    "xhr.send(JSON.stringify(session, null, '\t')); "
    '</script>'
)


class PostProcessor(object):
    """Post-process cloned web resources and write them to a directory tree.

    ``run()`` rewrites absolute links to relative ones, maps every resource
    URL to a file-system friendly path, injects the client-side forensics
    snippet into the original page, adds a redirecting index file to every
    directory lacking one, and finally writes all resources to disk.
    """

    # Mime types registered in addition to the ones `mimetypes` already knows.
    MISSING_MIME_TYPES = {
        'text/javascript': ".js",
    }
    DEFAULT_FILE_EXTENSION = ".html"
    FILE_PATH_MAX_LEN = 255
    # Characters removed from generated file paths. '/' is intentionally not
    # listed because it is the separator of the URL path being processed.
    ILLEGAL_WINDOWS_FILE_PATH_CHARS = ['~', '*', '\\', ':', '<', '>', '|', '?', '"']
    CLOSE_HEAD_TAG = "</head>"
    SERVER_DEFAULT_SERVE_FILE_NAMES = ["index.html", "index.htm"]
    REDIRECTION_URL_PLACE_HOLDER = "$REDIRECTON_URL$"
    REDIRECTION_TEMPLATE_FILE_NAME = "redirect.html"

    def __init__(self, original_url, cloned_resources, output_directory):
        """Store the inputs and make sure the output directory exists.

        :param original_url: key of the main page inside ``cloned_resources``.
        :param cloned_resources: mapping of resource key to a
            ``ClonedResourceDetails``-like object.
        :param output_directory: directory the resources are written to; it
            is created (including missing parents) when it does not exist.
        """
        self.logger = logging.getLogger(__name__)
        self.original_url = original_url
        self.cloned_resources = cloned_resources
        self.output_directory = os.path.abspath(output_directory)
        if not os.path.exists(self.output_directory):
            # makedirs (rather than mkdir) so a nested, not yet existing
            # output path does not fail on its missing parents.
            os.makedirs(self.output_directory)
        self._init_mimetypes()

    def _init_mimetypes(self):
        """Initialise ``mimetypes`` and register ``MISSING_MIME_TYPES``."""
        mimetypes.init()
        for mime_type, extension in self.MISSING_MIME_TYPES.iteritems():
            mimetypes.add_type(mime_type, extension)

    def run(self):
        """Run every post-processing step, ending with writing the files."""
        self._remove_full_path_links()
        self._update_url_query_paths()
        self._add_client_side_forensic()
        self._add_indexfiles_to_directories()
        self._save_resource_to_files()

    def _add_client_side_forensic(self):
        """Inject ``CLIENT_SIDE_FORENSICS_CODE`` before ``</head>`` of the
        resource stored under ``original_url``."""
        cloned_resource = self.cloned_resources[self.original_url]
        # Use the resource's own charset (as patch_resources does) so pages
        # that are not utf-8 can still be decoded and patched.
        patched_resource_data = self.patch_resource(
            cloned_resource.resource_data,
            self.CLOSE_HEAD_TAG,
            CLIENT_SIDE_FORENSICS_CODE + self.CLOSE_HEAD_TAG,
            encoding=cloned_resource.charset or 'utf-8')
        self.cloned_resources[self.original_url] = cloned_resource._replace(
            resource_data=patched_resource_data)

    def _get_resource_file_path(self, cloned_resource):
        """Return the resource's file path under the output directory."""
        return os.path.join(self.output_directory,
                            cloned_resource.get_relative_file_path())

    def _create_resource_file_path(self, cloned_resource):
        """Create, level by level, the directories leading to the resource."""
        directory = self.output_directory
        for part in cloned_resource.get_directory().split("/"):
            directory = os.path.join(directory, part)
            if not os.path.exists(directory):
                os.mkdir(directory)

    def _save_resource_to_files(self):
        """Write the data of every cloned resource to its file."""
        for cloned_resource in self.cloned_resources.itervalues():
            self._create_resource_file_path(cloned_resource)
            with open(self._get_resource_file_path(cloned_resource), 'wb') as file_h:
                file_h.write(cloned_resource.resource_data)

    def patch_resources(self, substring, new_substr):
        """Replace ``substring`` with ``new_substr`` in every resource, using
        each resource's ``charset`` for decoding and re-encoding."""
        # Only existing keys are re-assigned, so the dict size never changes
        # while it is being iterated.
        for resource_path, cloned_resource in self.cloned_resources.iteritems():
            patched_resource_data = self.patch_resource(
                cloned_resource.resource_data, substring, new_substr,
                encoding=cloned_resource.charset)
            self.cloned_resources[resource_path] = cloned_resource._replace(
                resource_data=patched_resource_data)

    def patch_resource(self, data, substring, replacement, encoding='utf-8'):
        """Return ``data`` with ``substring`` replaced by ``replacement``.

        The data is decoded with ``encoding``, patched, and encoded back to
        the same encoding. When the encoding is unknown or invalid, or the
        data cannot be decoded with it, a warning is logged and ``data`` is
        returned unchanged.
        """
        try:
            codec = codecs.lookup(encoding)
        except (LookupError, TypeError) as error:
            # TypeError covers a non-string encoding such as None.
            self.logger.warning(
                'failed to decode data from web response, %s', error)
            return data

        try:
            decoded_data = codec.decode(data)[0]
        except ValueError as error:
            self.logger.warning(
                "failed to decode data from web response "
                "(%s) using encoding %s", error.__class__.__name__, encoding)
            return data

        return codec.encode(decoded_data.replace(substring, replacement))[0]

    def _remove_full_path_links(self):
        """Strip ``http(s)://<original host>`` from all resources so links
        to the original host become host-relative."""
        parsed_original_url = urlparse.urlparse(self.original_url)
        hostname_url_path_http = "http://" + parsed_original_url.netloc
        hostname_url_path_https = "https://" + parsed_original_url.netloc
        self.patch_resources(hostname_url_path_http, "")
        self.patch_resources(hostname_url_path_https, "")

    def mimetype_to_file_extension(self, mime_type):
        """Return the file extension guessed for ``mime_type``, or
        ``DEFAULT_FILE_EXTENSION`` when none can be guessed."""
        if not mime_type:
            return self.DEFAULT_FILE_EXTENSION
        guessed_file_extension = mimetypes.guess_extension(mime_type)
        return guessed_file_extension or self.DEFAULT_FILE_EXTENSION

    def _compress_file_path(self, input_url_path):
        """Shorten ``input_url_path`` when its on-disk path would exceed
        ``FILE_PATH_MAX_LEN``.

        Inner directory names (never the first directory nor the file name)
        are removed from the end of the path and replaced by one directory
        named after the md5 hex digest of their concatenation. Paths that
        already fit are returned unchanged.
        """
        file_path = input_url_path.lstrip('/')
        full_file_path_len = len(os.path.join(self.output_directory, file_path))
        if self.FILE_PATH_MAX_LEN >= full_file_path_len:
            return input_url_path

        # Number of chars that must be removed from the path, plus the size
        # of the hashed directory that is added instead (32 hex chars of the
        # md5 digest; the extra 1 is presumably its path separator).
        deviation_in_path = full_file_path_len - self.FILE_PATH_MAX_LEN
        deviation_in_path = deviation_in_path + 33

        splitted_file_path = file_path.split("/")
        file_name = splitted_file_path[-1]

        # Collect directory names, innermost first, until enough characters
        # were removed. Index 0 (the first directory) is never collected.
        paths_removing = []
        length_removed = 0
        for i in xrange(len(splitted_file_path) - 2, 0, -1):
            paths_removing.append(splitted_file_path[i])
            length_removed += len(splitted_file_path[i])
            if length_removed > deviation_in_path:
                break

        hashed_path = hashlib.md5(''.join(paths_removing)).hexdigest()
        # Drop the collected directories and the file name from the tail.
        remaining_path = splitted_file_path[:(len(paths_removing) + 1) * -1]
        remaining_path.append(hashed_path)
        remaining_path.append(file_name)
        return "/" + "/".join(remaining_path)

    def _strip_file_path_from_invalid_characters(self, file_path):
        """Return ``file_path`` without ``ILLEGAL_WINDOWS_FILE_PATH_CHARS``."""
        return ''.join([x for x in file_path
                        if x not in self.ILLEGAL_WINDOWS_FILE_PATH_CHARS])

    def _fix_file_name(self, cloned_resource):
        """Build the local path for a resource from its URL and mime type.

        A URL with a query string becomes ``<path>_<md5 of query><ext>``;
        otherwise the extension is appended to the URL path unless the path
        already ends with it.
        """
        parsed_resource_url = urlparse.urlparse(cloned_resource.resource_url)
        # Setting a new file extension based on the file's mime type
        new_file_extension = self.mimetype_to_file_extension(
            cloned_resource.mime_type)
        new_file_name = parsed_resource_url.path

        if parsed_resource_url.query != "":
            query_hashed = hashlib.md5(parsed_resource_url.query).hexdigest()
            return new_file_name + "_" + query_hashed + new_file_extension
        if new_file_name.endswith(new_file_extension):
            return new_file_name
        return new_file_name + new_file_extension

    def _update_url_query_paths(self):
        """Give every resource its file-system friendly path and rewrite the
        links pointing at it inside all resources."""
        substrings_to_replace = []
        for resource_path, cloned_resource in self.cloned_resources.iteritems():
            resource_url = urlparse.urlparse(cloned_resource.resource_url)
            fixed_file_name_and_path = self._fix_file_name(cloned_resource)
            stripped_file_path = self._strip_file_path_from_invalid_characters(
                fixed_file_name_and_path)
            fixed_resource_path = self._compress_file_path(stripped_file_path)

            substrings_to_locate = [resource_url.path]
            if resource_url.query != "":
                raw_link = resource_url.path + "?" + resource_url.query
                substrings_to_locate = [raw_link]
                # Markup may carry the query with HTML-escaped ampersands,
                # so that form of the link is located as well.
                escaped_link = resource_url.path + "?" + \
                    resource_url.query.replace("&", "&amp;")
                if escaped_link != raw_link:
                    substrings_to_locate.append(escaped_link)

            if substrings_to_locate[0] != fixed_resource_path:
                self.cloned_resources[resource_path] = cloned_resource._replace(
                    resource_url=fixed_resource_path)
                for substring_to_locate in substrings_to_locate:
                    substrings_to_replace.append(
                        (substring_to_locate, fixed_resource_path))

        # Replace the longest links first, so a shorter link that is a
        # prefix of a longer one does not get replaced inside it.
        substrings_to_replace.sort(key=lambda tup: len(tup[0]), reverse=True)
        for (fullpath_link, replacement) in substrings_to_replace:
            self.patch_resources(fullpath_link, replacement)

    def _get_directories_without_default_files(self):
        """Map every directory (and each of its parents) used by the
        resources to whether it holds one of
        ``SERVER_DEFAULT_SERVE_FILE_NAMES``.

        Keys are directory paths that start and end with "/".
        """
        directories = {}
        # First pass: register every directory level as lacking a default file.
        for cloned_resource in self.cloned_resources.itervalues():
            resource_directory = cloned_resource.get_directory()
            directory = "/"
            directories[directory] = False
            for part in resource_directory.split("/"):
                if part:
                    directory = directory + part + "/"
                    directories[directory] = False

        # Second pass: mark the directories that do hold a default file.
        for cloned_resource in self.cloned_resources.itervalues():
            resource_directory = "/" + cloned_resource.get_directory()
            if resource_directory != "/":
                resource_directory += "/"
            if cloned_resource.get_filename() in self.SERVER_DEFAULT_SERVE_FILE_NAMES:
                directories[resource_directory] = True
        return directories

    def _add_indexfiles_to_directories(self):
        """Add a redirecting index file to each directory lacking a default
        file.

        The index file is built from ``REDIRECTION_TEMPLATE_FILE_NAME``
        (opened relative to the current working directory) with
        ``REDIRECTION_URL_PLACE_HOLDER`` replaced by the path of the
        original page.
        """
        directories = self._get_directories_without_default_files()

        # Setting up the data of the redirection file
        with open(self.REDIRECTION_TEMPLATE_FILE_NAME, "rb") as template_file:
            redirection_file_data = template_file.read()
        dest_redirection_url = "/" + \
            self.cloned_resources[self.original_url].get_relative_file_path()
        redirection_file_data = redirection_file_data.replace(
            self.REDIRECTION_URL_PLACE_HOLDER, dest_redirection_url)

        # A default file is the file an HTTP server serves when a directory
        # itself is browsed.
        for directory, is_default_file_exist in directories.iteritems():
            if is_default_file_exist:
                continue
            resource_url_path = directory + self.SERVER_DEFAULT_SERVE_FILE_NAMES[0]
            redirection_cloned_resource = ClonedResourceDetails(
                resource=resource_url_path,
                mime_type="text/html",
                resource_data=redirection_file_data,
                resource_url=resource_url_path,
                charset="utf-8",
                query="")
            self.cloned_resources[resource_url_path] = redirection_cloned_resource