import itertools import json import logging import re from datetime import datetime from json import JSONDecodeError from typing import Any, Dict, Optional from . import utils from .constants import FB_MOBILE_BASE_URL from .fb_types import RawPost, Options, Post, RequestFunction try: from youtube_dl import YoutubeDL from youtube_dl.utils import ExtractorError except ImportError: YoutubeDL = None logger = logging.getLogger(__name__) # Typing PartialPost = Optional[Dict[str, Any]] def extract_post(raw_post: RawPost, options: Options, request_fn: RequestFunction) -> Post: return PostExtractor(raw_post, options, request_fn).extract_post() def extract_group_post(raw_post: RawPost, options: Options, request_fn: RequestFunction) -> Post: return GroupPostExtractor(raw_post, options, request_fn).extract_post() class PostExtractor: """Class for Extracting fields from a FacebookPost""" likes_regex = re.compile(r'like_def[^>]*>([0-9,.]+)') comments_regex = re.compile(r'cmt_def[^>]*>([0-9,.]+)') shares_regex = re.compile(r'([0-9,.]+)\s+Shares', re.IGNORECASE) link_regex = re.compile(r"href=\"https:\/\/lm\.facebook\.com\/l\.php\?u=(.+?)\&h=") photo_link = re.compile(r'href=\"(/[^\"]+/photos/[^\"]+?)\"') image_regex = re.compile( r'<a href=\"([^\"]+?)\" target=\"_blank\" class=\"sec\">View Full Size<\/a>', re.IGNORECASE, ) image_regex_lq = re.compile(r"background-image: url\('(.+)'\)") post_url_regex = re.compile(r'/story.php\?story_fbid=') shares_and_reactions_regex = re.compile( r'<script>.*bigPipe.onPageletArrive\((?P<data>\{.*RelayPrefetchedStreamCache.*\})\);' '.*</script>' ) bad_json_key_regex = re.compile(r'(?P<prefix>[{,])(?P<key>\w+):') more_url_regex = re.compile(r'(?<=…\s)<a href="([^"]+)') post_story_regex = re.compile(r'href="(\/story[^"]+)" aria') def __init__(self, element, options, request_fn): self.element = element self.options = options self.request = request_fn self._data_ft = None def make_new_post(self) -> Post: return { 'post_id': None, 'text': None, 'post_text': None, 'shared_text': None, 'time': None, 'image': None, 'video': None, 'likes': None, 'comments': None, 'shares': None, 'post_url': None, 'link': None, } def extract_post(self) -> Post: """Parses the element into self.item""" methods = [ self.extract_post_id, self.extract_text, self.extract_time, self.extract_image, self.extract_likes, self.extract_comments, self.extract_shares, self.extract_post_url, self.extract_link, self.extract_video, ] post = self.make_new_post() # TODO: this is just used by `extract_reactions`, probably should not be acceded from self self.post = post for method in methods: try: partial_post = method() if partial_post is None: logger.warning("Extract method %s didn't return anything", method.__name__) continue post.update(partial_post) except Exception as ex: logger.warning("Exception while running %s: %r", method.__name__, ex) if 'reactions' in self.options: reactions = self.extract_reactions() post.update(reactions) return post def extract_post_id(self) -> PartialPost: return {'post_id': self.data_ft.get('mf_story_key')} # TODO: this method needs test for the 'has more' case and shared content def extract_text(self) -> PartialPost: # Open this article individually because not all content is fully loaded when skimming # through pages. # This ensures the full content can be read. element = self.element has_more = self.more_url_regex.search(element.html) if has_more: match = self.post_story_regex.search(element.html) if match: url = utils.urljoin(FB_MOBILE_BASE_URL, match.groups()[0].replace("&", "&")) response = self.request(url) element = response.html.find('.story_body_container', first=True) nodes = element.find('p, header') if nodes: post_text = [] shared_text = [] ended = False for node in nodes[1:]: if node.tag == 'header': ended = True # Remove '... More' # This button is meant to display the hidden text that is already loaded # Not to be confused with the 'More' that opens the article in a new page if node.tag == 'p': node = utils.make_html_element( html=node.html.replace('>… <', '><', 1).replace('>More<', '', 1) ) if not ended: post_text.append(node.text) else: shared_text.append(node.text) text = '\n'.join(itertools.chain(post_text, shared_text)) post_text = '\n'.join(post_text) shared_text = '\n'.join(shared_text) return { 'text': text, 'post_text': post_text, 'shared_text': shared_text, } return None # TODO: Add the correct timezone def extract_time(self) -> PartialPost: page_insights = self.data_ft['page_insights'] for page in page_insights.values(): try: timestamp = page['post_context']['publish_time'] return { 'time': datetime.fromtimestamp(timestamp), } except (KeyError, ValueError): continue return None def extract_image(self) -> PartialPost: image_link = self.extract_photo_link() if image_link is not None: return image_link return self.extract_image_lq() def extract_image_lq(self) -> PartialPost: story_container = self.element.find('div.story_body_container', first=True) if story_container is None: return None other_containers = story_container.xpath('div/div') for container in other_containers: image_container = container.find('.img', first=True) if image_container is None: continue style = image_container.attrs.get('style', '') match = self.image_regex_lq.search(style) if match: return {'image': utils.decode_css_url(match.groups()[0])} return None def extract_link(self) -> PartialPost: match = self.link_regex.search(self.element.html) if match: return {'link': utils.unquote(match.groups()[0])} return None def extract_post_url(self) -> PartialPost: query_params = ('story_fbid', 'id') elements = self.element.find('header a') for element in elements: href = element.attrs.get('href', '') match = self.post_url_regex.match(href) if match: path = utils.filter_query_params(href, whitelist=query_params) url = utils.urljoin(FB_MOBILE_BASE_URL, path) return {'post_url': url} return None # TODO: Remove `or 0` from this methods def extract_likes(self) -> PartialPost: return { 'likes': utils.find_and_search( self.element, 'footer', self.likes_regex, utils.parse_int ) or 0, } def extract_comments(self) -> PartialPost: return { 'comments': utils.find_and_search( self.element, 'footer', self.comments_regex, utils.parse_int ) or 0, } def extract_shares(self) -> PartialPost: return { 'shares': utils.find_and_search( self.element, 'footer', self.shares_regex, utils.parse_int ) or 0, } def extract_photo_link(self) -> PartialPost: match = self.photo_link.search(self.element.html) if not match: return None url = utils.urljoin(FB_MOBILE_BASE_URL, match.groups()[0]) response = self.request(url) html = response.text match = self.image_regex.search(html) if match: return { 'image': match.groups()[0].replace("&", "&"), } return None def extract_reactions(self) -> PartialPost: """Fetch share and reactions information with a existing post obtained by `get_posts`. Return a merged post that has some new fields including `reactions`, `w3_fb_url`, `fetched_time`, and reactions fields `LIKE`, `ANGER`, `SORRY`, `WOW`, `LOVE`, `HAHA` if exist. Note that this method will raise one http request per post, use it when you want some more information. Example: ``` for post in get_posts('fanpage'): more_info_post = fetch_share_and_reactions(post) print(more_info_post) ``` """ url = self.post.get('post_url') post_id = self.post.get('post_id') if url: w3_fb_url = utils.urlparse(url)._replace(netloc='www.facebook.com').geturl() resp = self.request(w3_fb_url) for item in self.parse_share_and_reactions(resp.text): data = item['jsmods']['pre_display_requires'][0][3][1]['__bbox']['result'][ 'data' ]['feedback'] if data['subscription_target_id'] == post_id: return { 'shares': data['share_count']['count'], 'likes': data['reactors']['count'], 'reactions': { reaction['node']['reaction_type'].lower(): reaction['reaction_count'] for reaction in data['top_reactions']['edges'] }, 'comments': data['comment_count']['total_count'], 'w3_fb_url': data['url'], 'fetched_time': datetime.now(), } return None def extract_video(self): video_data_element = self.element.find('[data-sigil="inlineVideo"]', first=True) if video_data_element is None: return None if 'youtube_dl' in self.options: vid = self.extract_video_highres() if vid: return vid return self.extract_video_lowres(video_data_element) def extract_video_lowres(self, video_data_element): try: data = json.loads(video_data_element.attrs['data-store']) return {'video': data.get('src')} except JSONDecodeError as ex: logger.error("Error parsing data-store JSON: %r", ex) except KeyError: logger.error("data-store attribute not found") return None def extract_video_highres(self): if not YoutubeDL: raise ModuleNotFoundError( "youtube-dl must be installed to download videos in high resolution." ) ydl_opts = { 'format': 'best', 'quiet': True, } try: post_id = self.post.get('post_id') video_page = 'https://www.facebook.com/' + post_id with YoutubeDL(ydl_opts) as ydl: url = ydl.extract_info(video_page, download=False)['url'] return {'video': url} except ExtractorError as ex: logger.error("Error extracting video with youtube-dl: %r", ex) return None def parse_share_and_reactions(self, html: str): bad_jsons = self.shares_and_reactions_regex.findall(html) for bad_json in bad_jsons: good_json = self.bad_json_key_regex.sub(r'\g<prefix>"\g<key>":', bad_json) yield json.loads(good_json) @property def data_ft(self) -> dict: if self._data_ft is not None: return self._data_ft self._data_ft = {} try: data_ft_json = self.element.attrs['data-ft'] self._data_ft = json.loads(data_ft_json) except JSONDecodeError as ex: logger.error("Error parsing data-ft JSON: %r", ex) except KeyError: logger.error("data-ft attribute not found") return self._data_ft class GroupPostExtractor(PostExtractor): """Class for extracting posts from Facebook Groups rather than Pages""" # TODO: This might need to be aware of the timezone and locale (?) def extract_time(self) -> PartialPost: # This is a string with this format 'April 3, 2018 at 8:02 PM' time = self.element.find('abbr', first=True).text time = datetime.strptime(time, '%B %d, %Y at %I:%M %p') return { 'time': time, }