from __future__ import unicode_literals

from collections import defaultdict
import io
import os
import platform
import re
import sys
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import XMLParser

import six

from . import (ChatThread, ChatMessage, FacebookChatHistory)
from .name_resolver import DummyNameResolver
from .utils import yellow, magenta
from .time import parse_timestamp


class UnsuitableParserError(Exception):
    """Raised when a parser decides the input is not in the format it handles."""


class MissingReferenceError(Exception):
    """Raised when a thread file referenced by the manifest cannot be opened."""


if six.PY2:
    FileNotFoundError = OSError


class SafeXMLStream(object):
    """
    Let's implement our own stream filter to remove the inexplicably
    present control characters for us. We will analyze the incoming
    stream and remove any instances of the offending characters.
    """

    # The XML parser is super basic and can't understand special HTML-specific
    # aliases like &nbsp;. This header is artificially prepended to each XML
    # stream to guide the parser on what the `nbsp` token signifies.
    HTML_ENTITY_DEF = b"<!DOCTYPE html [<!ENTITY nbsp ' '>]>"

    def __init__(self, stream):
        """
        stream -- the underlying file-like object; `read()` expects it to
                  return text, since the result is scrubbed with a text
                  regex and then encoded to UTF-8.
        """
        # Create a regex for matching all illegal characters within the
        # XML 1.1 spec so that we can filter them out.
        # Note: the trailing (0x0B, 0x1F) range overlaps earlier entries and
        # additionally covers 0x0D (carriage return).
        illegal_unichrs = [(0x00, 0x08), (0x0B, 0x0C), (0x0E, 0x1F),
                           (0x7F, 0x84), (0x86, 0x9F),
                           (0xFDD0, 0xFDDF), (0xFFFE, 0xFFFF),
                           (0xD800, 0xDFFF), (0x0B, 0x1F)]
        if sys.maxunicode >= 0x10000:  # not narrow build
            illegal_unichrs.extend([(0x1FFFE, 0x1FFFF), (0x2FFFE, 0x2FFFF),
                                    (0x3FFFE, 0x3FFFF), (0x4FFFE, 0x4FFFF),
                                    (0x5FFFE, 0x5FFFF), (0x6FFFE, 0x6FFFF),
                                    (0x7FFFE, 0x7FFFF), (0x8FFFE, 0x8FFFF),
                                    (0x9FFFE, 0x9FFFF), (0xAFFFE, 0xAFFFF),
                                    (0xBFFFE, 0xBFFFF), (0xCFFFE, 0xCFFFF),
                                    (0xDFFFE, 0xDFFFF), (0xEFFFE, 0xEFFFF),
                                    (0xFFFFE, 0xFFFFF), (0x10FFFE, 0x10FFFF)])

        uni = chr if sys.version_info >= (3, 0) else unichr
        illegal_ranges = ["%s-%s" % (uni(low), uni(high))
                          for (low, high) in illegal_unichrs]
        self.scrubber = re.compile('[%s]' % ''.join(illegal_ranges))
        self.stream = stream
        self.returned_dtd = False

    def read(self, size=-1):
        """
        Returns the next chunk of the stream as UTF-8 encoded bytes.

        The very first call ignores `size` and returns only the DOCTYPE
        header; later calls read `size` characters from the wrapped stream
        and strip the illegal characters from them.
        """
        if not self.returned_dtd:
            self.returned_dtd = True
            return self.HTML_ENTITY_DEF
        buff = self.stream.read(size)
        # The XML parser is dumb and seems to only utilize UTF-8
        # encoders/decoders if we hand it a byte stream. Fortunately, it
        # doesn't seem to care if it got more or less bytes than it asked for.
        return self.scrubber.sub('', buff).encode('utf-8')


def _truncate(string, length=60):
    """
    Shortens `string` to `length` characters followed by '...' when it is
    longer than `length`; otherwise returns it unchanged.
    """
    # Compare against `length` (not a hard-coded 60) so the parameter is
    # actually honoured.
    if len(string) > length:
        return "%s..." % string[:length]
    return string


def _tag_and_class_attr(element):
    """
    Returns a `(tag, class_attr)` pair for an element.

    The class attribute falls back to `[]` when absent, and the tag falls
    back to `element.name` when `element.tag` is falsy.
    """
    if element.attrib:
        class_attr = element.attrib.get('class', [])
    else:
        class_attr = element.get('class', [])
    tag = element.tag if element.tag else element.name
    return tag, class_attr


class ChatThreadParser(object):
    """
    Consumes `(event, element)` pairs from an iterator and assembles the
    messages of a single chat thread.
    """

    def __init__(self, element_iter, timezone_hints=None, use_utc=True,
                 name_resolver=None, no_sender_warning_status=True, seq_num=0):
        self.name_resolver = name_resolver or DummyNameResolver()
        self.current_sender = None
        self.current_timestamp = None
        self.current_text = None
        self.element_iter = element_iter
        self.seq_num = seq_num
        self.timezone_hints = timezone_hints or {}
        self.use_utc = use_utc
        self.no_sender_warning_status = no_sender_warning_status
        self.messages = []
        self.messages_started = False

    def parse(self, participants):
        """
        Reads elements until the end of the thread and builds a `ChatThread`.

        Returns a `(no_sender_warning_status, thread)` tuple so the caller
        can carry the "warning already shown" flag over to the next thread.
        """
        self.messages = []
        self.current_sender = None
        self.current_timestamp = None

        for pos, element in self.element_iter:
            finished = self._process_element(pos, element)
            if finished:
                break

        thread = ChatThread(participants)
        for m in self.messages:
            thread.add_message(m)
        return self.no_sender_warning_status, thread

    def skip(self):
        """
        Eats through the input iterator without recording the content.
        """
        for pos, element in self.element_iter:
            tag, class_attr = _tag_and_class_attr(element)
            if tag == "div" and "thread" in class_attr and pos == "end":
                break

    def _process_element(self, pos, e):
        """
        Parses an incoming HTML element/node for data.

        pos -- the part of the element being parsed (start/end)
        e   -- the element being parsed

        Returns True once the end of the thread `div` has been reached.
        Raises UnsuitableParserError when a message is about to be emitted
        but no timestamp was collected for it.
        """
        tag, class_attr = _tag_and_class_attr(e)
        start_of_message = (tag == 'div' and class_attr == 'message'
                            and pos == 'start')
        end_of_thread = (tag == 'div' and 'thread' in class_attr
                         and pos == 'end')

        if start_of_message and not self.messages_started:
            # First message header: nothing has been collected yet, so there
            # is no previous message to flush.
            self.messages_started = True
        elif tag == "span" and pos == "end":
            if "user" in class_attr:
                self.current_sender = self.name_resolver.resolve(e.text)
            elif "meta" in class_attr:
                self.current_timestamp = \
                    parse_timestamp(e.text, self.use_utc, self.timezone_hints)
        elif tag == 'p' and pos == 'end':
            # This is only necessary because of accidental double <p> nesting
            # on Facebook's end. Clearly, QA and testing is one of Facebook's
            # strengths ;)
            if not self.current_text:
                self.current_text = e.text.strip() if e.text else ''
        elif tag == 'img' and pos == 'start':
            self.current_text = '(image reference: {})'.format(e.attrib['src'])
        elif (start_of_message or end_of_thread) and self.messages_started:
            # A new message header (or the end of the thread) closes off the
            # message whose fields were collected so far.
            if not self.current_timestamp:
                # This is the typical error when the new Facebook format is
                # used with the legacy parser.
                raise UnsuitableParserError

            if not self.current_sender:
                if not self.no_sender_warning_status:
                    sys.stderr.write(
                        "\rWARNING: The sender was missing in one or more parsed messages. "
                        "This is an error on Facebook's end that unfortunately cannot be "
                        "recovered from. Some or all messages in the output may show the "
                        "sender as 'Unknown' within each thread.\n")
                    self.no_sender_warning_status = True
                self.current_sender = "Unknown"

            cm = ChatMessage(timestamp=self.current_timestamp,
                             sender=self.current_sender,
                             content=self.current_text or '',
                             seq_num=self.seq_num)
            self.messages += [cm]

            self.seq_num -= 1
            self.current_sender, self.current_timestamp, self.current_text = \
                None, None, None

        return end_of_thread


class MessageHtmlParser(object):
    """
    Base class for the archive parsers. Subclasses implement `parse_impl`;
    this class supplies thread filtering, progress output and thread storage.
    """

    def __init__(self, handle, timezone_hints=None, use_utc=True,
                 progress_output=False, thread_filter=None,
                 name_resolver=None):
        self.name_resolver = name_resolver or DummyNameResolver()
        self.chat_threads = dict()
        self.message_cache = None
        self.user = None
        self.last_line_len = 0
        self.handle = SafeXMLStream(handle)
        self.progress_output = progress_output
        self.thread_filter = (
            tuple(p.lower() for p in thread_filter) if thread_filter else None)
        self.seq_num = 0
        self.thread_signatures = set()
        self.timezone_hints = timezone_hints or {}
        self.use_utc = use_utc
        self.no_sender_warning = False

    def should_record_thread(self, participants):
        """
        Determines if the thread should be parsed based on the
        participants and the filter given.

        For example, if the filter states ['jack', 'billy joel'], then only
        threads with exactly two participants (excluding the owner of the
        chat history) containing someone with the first or last name 'Jack'
        and someone named 'Billy Joel' will be included.

        Any of the following would match that criteria:

            - Jack Stevenson, Billy Joel
            - Billy Joel, Jack Stevens
            - Jack Jenson, Billy Joel
            - Jack Jack, Billy Joel

        participants -- the participants of the thread
                        (excluding the history owner)
        """
        if not self.thread_filter:
            return True
        if len(participants) != len(self.thread_filter):
            return False

        # Each participant is matched on the full lowercase name or on any
        # single space-separated part of it.
        candidates = [[p.lower()] + p.lower().split(" ") for p in participants]
        matches = defaultdict(set)
        for index, name_parts in enumerate(candidates):
            for term in self.thread_filter:
                if term in name_parts:
                    matches[term].add(index)

        matched = set()
        # Walk the filter terms themselves (not just the ones that matched),
        # so a term that matched nobody rejects the thread.
        for term in self.thread_filter:
            if not matches[term]:
                return False
            matched |= matches[term]
        return len(matched) == len(candidates)

    def parse(self):
        """
        Runs the format-specific parser and returns a `FacebookChatHistory`
        built from the discovered user and threads.
        """
        self.parse_impl()
        self._clear_output()
        return FacebookChatHistory(self.user, self.chat_threads)

    def parse_impl(self):
        #
        # Implementation details:
        #
        # 1. Load the file/manifest.
        # 2. Parse the user.
        # 3. Facilitate the parsing of a thread by identifying
        #    participants and providing an element iterator.
        # 4. Save the thread.
        # 5. Return the history object.
        #
        raise NotImplementedError

    def parse_thread(self, participants, element_iter, require_flush):
        """
        Parses a thread with appropriate CLI feedback.

        :param participants: The participants in this thread.
        :param element_iter: The XML iterator to parse the data from.
        :param require_flush: Whether the iterator needs to be flushed if it is
               determined that the thread should be skipped.
        :return: A `ChatThread` object if not skipped, otherwise `None`.
        """
        # Very rarely threads may lack information on who the
        # participants are. We will consider those threads corrupted
        # and skip them.
        participants_text = _truncate(', '.join(participants), 60)

        if participants:
            skip_thread = not self.should_record_thread(participants)
            participants_text = yellow("[%s]" % participants_text)
        else:
            participants_text = "unknown participants"
            skip_thread = True

        if skip_thread:
            line = "\rSkipping chat thread with %s..." % \
                   yellow(participants_text)
        else:
            participants_key = ", ".join(participants)
            if participants_key in self.chat_threads:
                thread_current_len = len(self.chat_threads[participants_key])
                line = "\rContinuing chat thread with %s %s..." \
                       % (yellow(participants_text),
                          magenta("<@%d messages>" % thread_current_len))
            else:
                line = "\rDiscovered chat thread with %s..." \
                       % yellow(participants_text)

        if self.progress_output:
            # Pad to the previous line's width so leftovers are overwritten.
            sys.stderr.write(line.ljust(self.last_line_len))
            sys.stderr.flush()

        self.last_line_len = len(line)

        parser = ChatThreadParser(
            element_iter, self.timezone_hints, self.use_utc,
            self.name_resolver, self.no_sender_warning, self.seq_num)

        if skip_thread:
            if require_flush:
                parser.skip()
        else:
            self.no_sender_warning, thread = parser.parse(participants)
            return thread

    def save_thread(self, thread):
        """
        Stores a parsed thread, merging its messages into an existing thread
        with the same participants. `None` and threads whose signature was
        already seen are ignored.
        """
        if thread is None:
            return
        signature = thread.signature
        if signature in self.thread_signatures:
            # FIXME: Suppressed until use of a logging library is
            # implemented
            # error("Duplicate thread detected: %s\n "
            #       % str(self.current_thread.participants))
            return
        participants = ", ".join(thread.participants)
        self.thread_signatures.add(signature)
        if participants not in self.chat_threads:
            self.chat_threads[participants] = thread
        else:
            existing_thread = self.chat_threads[participants]
            for m in thread.messages:
                existing_thread.add_message(m)

    def parse_participants(self, participants):
        """
        Turns a participant listing into a sorted tuple of resolved names,
        with the history owner (`self.user`) removed.

        participants -- either a ', '-separated string of names, or an
                        element whose text holds such a string

        Returns an empty tuple when there is no participant text.
        """
        if isinstance(participants, six.string_types):
            if not participants:
                return ()
        else:
            # Judge an element by its text: `len()` of an ElementTree element
            # counts child elements, so it is 0 for a text-only node that
            # does carry participant names.
            if not getattr(participants, 'text', None):
                return ()
            if participants.attrib:
                participants = participants.text.strip()
            else:
                participants = participants.contents[0].strip()

        names = [self.name_resolver.resolve(p)
                 for p in participants.split(", ")]
        names.sort()
        if self.user in names:
            names.remove(self.user)
        return tuple(names)

    def _clear_output(self):
        """
        Clears progress output (if any) that was written to the screen.
        """
        # If progress output was being written, clear it from the screen.
        if self.progress_output:
            sys.stderr.write("\r".ljust(self.last_line_len))
            sys.stderr.write("\r")
            sys.stderr.flush()


def using_windows():
    """Returns True when the platform string contains 'windows'."""
    return 'windows' in platform.platform().lower()


class LegacyMessageHtmlParser(MessageHtmlParser):
    """
    A parser for the original archive format Facebook used until October 2017.
    """

    def parse_impl(self):
        """
        Parses the HTML content as a stream. This is far less memory
        intensive than loading the entire HTML file into memory, like
        BeautifulSoup does.
        """
        # Cast to str to ensure not unicode under Python 2, as the parser
        # doesn't like that.
        parser = XMLParser(encoding=str('UTF-8'))
        element_iter = ET.iterparse(
            self.handle, events=("start", "end"), parser=parser)

        for pos, element in element_iter:
            tag, class_attr = _tag_and_class_attr(element)
            if tag == "h1" and pos == "end":
                if not self.user:
                    self.user = element.text.strip()
            elif tag == "div" and "thread" in class_attr and pos == "start":
                participants = self.parse_participants(element)
                thread = self.parse_thread(participants, element_iter, True)
                self.save_thread(thread)


class SplitMessageHtmlParser(MessageHtmlParser):
    """
    A parser for the archive format Facebook started using around October 2017.
    """

    def __init__(self, handle, *args, **kwargs):
        super(SplitMessageHtmlParser, self).__init__(handle, *args, **kwargs)
        # The archive root is two path components above the manifest file.
        self.root = os.path.realpath(handle.name)
        delimiter = '\\' if using_windows() else '/'
        self.root = delimiter.join(self.root.split(delimiter)[:-2])

    def parse_impl(self):
        """Reads the manifest, then parses every thread file it references."""
        self.user, thread_references = self._get_manifest_data()
        for participants, thread_path in thread_references:
            self.process_thread(participants, thread_path)
        self._clear_output()

    def _get_manifest_data(self):
        """
        Scans the manifest for the user's name and the thread links.

        Returns a `(user, thread_references)` tuple, where each reference is
        a `(participants, path)` pair. Raises UnsuitableParserError when no
        thread anchor is found.
        """
        user, thread_references = None, []
        ignore_anchors = True
        saw_anchor = False

        # Cast to str to ensure not unicode under Python 2, as the parser
        # doesn't like that.
        parser = XMLParser(encoding=str('UTF-8'))
        element_iter = ET.iterparse(
            self.handle, events=("start", "end"), parser=parser)

        for pos, element in element_iter:
            tag, class_attr = _tag_and_class_attr(element)
            if tag == "h1" and pos == "end":
                # NOTE(review): this guards on `self.user`, which this method
                # never assigns — confirm whether `user` was intended.
                if not self.user:
                    user = element.text.strip()
            elif tag == "div" and "content" in class_attr and pos == "start":
                # Anchors only count once the content section has begun.
                ignore_anchors = False
            elif tag == "a" and pos == "start":
                if ignore_anchors:
                    continue
                saw_anchor = True
                participants = self.parse_participants(element)
                # Strip a literal leading "../" only; the dots are escaped so
                # arbitrary two-character prefixes are left alone.
                thread_path = re.sub(r'^\.\./', '', element.attrib['href'])
                if using_windows():
                    thread_path = thread_path.replace('/', '\\')
                thread_references += [
                    (participants, os.path.join(self.root, thread_path))]

        if not saw_anchor:
            # Indicator of a `messages.htm` file that is probably in the
            # legacy format.
            raise UnsuitableParserError
        return user, thread_references

    def process_thread(self, participants, thread_path):
        """
        Opens a single thread file, parses it and stores the result.

        Raises MissingReferenceError when the file cannot be found.
        """
        file_path = os.path.join(self.root, thread_path)
        try:
            with io.open(file_path, 'rt', encoding='utf8') as thread_file:
                parser = XMLParser(encoding=str('UTF-8'))
                element_iter = ET.iterparse(
                    SafeXMLStream(thread_file), events=("start", "end"),
                    parser=parser)
                thread = self.parse_thread(participants, element_iter, False)
        except FileNotFoundError:
            raise MissingReferenceError(file_path)
        self.save_thread(thread)


class SplitMessageHtmlWithImagesParser(SplitMessageHtmlParser):
    """
    A parser for the archive format Facebook started using around January 2018.
    """

    _PARTICIPANT_PARSER = re.compile(r'</h3>Participants: ([^<]+)<div')

    def parse_impl(self):
        # Trying to correlate the manifest to anything is janky as hell and
        # more or less a lost cause at this point. Facebook made the
        # participant information lossy in the manifest, so we have to get it
        # from the thread files. To maintain backwards compatibility (and
        # honestly sanity...), let's just dredge the lossless participant data
        # directly from the message HTML files with regex and then parse them.
        self.user, thread_references = self._get_manifest_data()
        unknown_user_count = 0
        for participants, thread_path in thread_references:
            with io.open(thread_path, 'rt', encoding='utf8') as f:
                # Let's just read enough for the preamble (~5,000 characters
                # is probably sufficient).
                m = self._PARTICIPANT_PARSER.search(f.read(5000))

            if m:
                # Un-escape any HTML entities.
                import bs4
                unescaped = six.text_type(
                    bs4.BeautifulSoup(m.group(1), 'html.parser'))
                participants = self.parse_participants(unescaped)
            else:
                # Sometimes threads will appear without participants. These
                # appear to be users who have deleted themselves or blocked
                # you. Not sure why this occurs. We will throw them in with
                # the "Facebook User"s and deal with it downstream.
                if participants:
                    raise UnsuitableParserError
                participants = ('Facebook User',)

            # Under certain circumstances, conversation history for disabled
            # users, or users who have blocked you, will be saved under the
            # name "Facebook User". We should artificially differentiate these
            # threads so all the messages don't get lumped into a single chat
            # thread.
            if participants == ('Facebook User',):
                participants = (
                    'Unknown user #{:03d}'.format(unknown_user_count),)
                unknown_user_count += 1

            self.process_thread(participants, thread_path)


def parse(handle, *args, **kwargs):
    """
    Parses a chat archive, trying each known format from newest to oldest.

    Extra arguments are forwarded to the parser constructors. Raises
    UnsuitableParserError when none of the parsers accepts the input.
    """
    # We support every archive format since Facebook invented the
    # 'Download your Data' feature. We successively back-pedal
    # until we find a parser that works.
    for parser in (SplitMessageHtmlWithImagesParser,
                   SplitMessageHtmlParser,
                   LegacyMessageHtmlParser):
        try:
            return parser(handle, *args, **kwargs).parse()
        except UnsuitableParserError:
            # Rewind for the next parser.
            handle.seek(0)
    raise UnsuitableParserError("no suitable parser found")