import re
import os
import requests
import logging
import csv
import json
import random
import time
from lxml import etree
from urllib import parse
from datetime import datetime
try:
    from json.decoder import JSONDecodeError
except ImportError:
    JSONDecodeError = ValueError

from tweetscrape.model.tweet_model import TweetInfo
from tweetscrape.model.user_model import UserInfo

logger = logging.getLogger(__name__)


class TweetScrapper:
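    """
    Scraper for Twitter's legacy HTML timeline endpoints.

    Callers supply the request URL, headers and query parameters; this class
    handles cursor-based pagination (min_position/max_position), optional
    user-agent and proxy rotation, and persists each batch of tweets to a
    CSV or JSON file.

    Illustrative use (the concrete URL and params depend on the endpoint
    being scraped):

        scrapper = TweetScrapper(request_url, None, request_params)
        count, last_id, last_time, path = scrapper.execute_twitter_request(username="jack")
    """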
    __twitter_request_url__ = None
    __twitter_request_header__ = None
    __twitter_request_params__ = None

    _tweets_pattern_ = '''//li[contains(@class,"stream-item")]'''
    _tweet_stream_max_ = '''//*[@id="timeline" or @id="descendants"]/div'''

    _tweet_min_position = '''data-min-position'''
    _tweet_content_pattern_ = '''./div[@class="content"]'''
    _tweet_time_ms_pattern_ = ('./div[@class="stream-item-header"]/'
                               'small[@class="time"]/a[contains(@class,"tweet-timestamp")]/span')
    _tweet_text_pattern_ = '''./div[@class="js-tweet-text-container"]//text()'''
    _tweet_links_list_pattern_ = '''./div[@class="js-tweet-text-container"]//a'''

    _tweet_reply_count_pattern_ = ('./div[@class="stream-item-footer"]/div/'
                                   'span[contains(@class, "ProfileTweet-action--reply")]/span')
    _tweet_like_count_pattern_ = ('./div[@class="stream-item-footer"]/div/'
                                  'span[contains(@class, "ProfileTweet-action--favorite")]/span')
    _tweet_retweet_count_pattern_ = ('./div[@class="stream-item-footer"]/div/'
                                     'span[contains(@class, "ProfileTweet-action--retweet")]/span')

    _tweet_user_profile_sidebar_ = '''//div[contains(@class, "ProfileSidebar")]'''
    _tweet_user_profile_canopy_ = '''//div[contains(@class, "ProfileCanopy-navBar")]'''
    _tweet_user_tweets_count_ = '''//li[contains(@class, "ProfileNav-item--tweets")]/a/span[3]'''
    _tweet_user_following_count_ = '''//li[contains(@class, "ProfileNav-item--following")]/a/span[3]'''
    _tweet_user_followers_count_ = '''//li[contains(@class, "ProfileNav-item--followers")]/a/span[3]'''
    _tweet_user_favorites_count_ = '''//li[contains(@class, "ProfileNav-item--favorites")]/a/span[3]'''
    _tweet_user_lists_count_ = '''//li[contains(@class, "ProfileNav-item--lists")]/a/span[3]'''
    _tweet_user_name_ = '''//h1[contains(@class, "ProfileHeaderCard-name")]/a/text()'''
    _tweet_user_bio_ = '''//p[contains(@class, "ProfileHeaderCard-bio")]//text()'''
    _tweet_user_location_ = '''//div[contains(@class, "ProfileHeaderCard-location")]/span[2]/a'''
    _tweet_user_url_ = '''//div[contains(@class, "ProfileHeaderCard-url")]/span[2]/a'''

    _tweet_hastag_pattern_ = r'''/hashtag/([0-9a-zA-Z_]*)\?src=hash'''

    __twitter_user_agent__ = [
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
        'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
        'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre'
    ]

    __twitter_request_header__ = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-language': 'en-US,en;q=0.8',
        'user-agent': random.choice(__twitter_user_agent__),
        'x-requested-with': 'XMLHttpRequest',
        'x-twitter-active-user': 'yes',
        'x-twitter-polling': 'true',
    }

    # Random sleep intervals (in seconds) used to throttle paginated requests.
    __twitter_request_delays__ = [2, 3, 4, 5, 6, 7]

    __twitter_search_url__ = 'https://twitter.com/i/search/timeline'
    __twitter_conversation_url__ = 'https://twitter.com/i/{username}/conversation/{parent_tweet_id}'

    twitter_date_format = '%Y-%m-%d'
    current_cursor = None
    scrape_pages = 2
    scraped_user_info = None

    def __init__(self, twitter_request_url, twitter_request_header,
                 twitter_request_params=None, twitter_request_proxies=None, scrape_pages=2,
                 twitter_file_path=None, twitter_file_format='csv'):
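        """
        :param twitter_request_url: endpoint to scrape first (profile, search or conversation)
        :param twitter_request_header: HTTP headers; the class default is kept when None
        :param twitter_request_params: query parameters for the first request
        :param twitter_request_proxies: requests-style proxy mapping, e.g. {"https": "..."}
        :param scrape_pages: number of pages to fetch; None or a negative value streams until exhausted
        :param twitter_file_path: output file path; defaults to tweets_dump.csv in the working directory
        :param twitter_file_format: 'csv' or 'json'
        """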

        self.__twitter_request_url__ = twitter_request_url
        if twitter_request_header is not None:
            self.__twitter_request_header__ = twitter_request_header
        self.__twitter_request_params__ = twitter_request_params
        self.__twitter_request_proxies__ = twitter_request_proxies
        self.scrape_pages = scrape_pages
        self.__twitter_tweet_persist_file_path__ = twitter_file_path
        self.__twitter_tweet_persist_file_format__ = twitter_file_format

        self.hashtag_capture = re.compile(self._tweet_hastag_pattern_)

        self.html_parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
        self.proxy_json = None

    def set_proxy_list(self, proxy_json=None):
        """
        :param proxy_json: {
            "http": ["http://username:password@address:port", ...],
            "https: ["https://username:password@address:port", ...]
        }
        """
        if proxy_json is not None and (isinstance(proxy_json.get("http"), list)
                                       or isinstance(proxy_json.get("https"), list)):
            self.proxy_json = proxy_json

    def switch_request_user_agent(self):
        """
        User-Agents: https://udger.com/resources/ua-list
        """
        logger.info("Switching user-agent")
        self.__twitter_request_header__['user-agent'] = random.choice(self.__twitter_user_agent__)

    def switch_request_proxy(self):
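        """Pick a random proxy pair from the list set via set_proxy_list()."""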
        logger.info("Switching proxy")
        if self.proxy_json is not None:
            request_proxy = {
                "http": random.choice(self.proxy_json.get("http")),
                "https": random.choice(self.proxy_json.get("https"))
            }
            self.__twitter_request_proxies__ = request_proxy

    def update_request_params(self, twitter_request_url, twitter_request_params, update_refer=False):
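        """Swap in new query parameters, optionally refreshing the Referer header."""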
        if twitter_request_params is not None:
            self.__twitter_request_params__ = twitter_request_params
            if update_refer and twitter_request_url != "":
                self.update_request_refer(twitter_request_url, self.__twitter_request_params__)

    def update_request_url(self, twitter_request_url):
        if twitter_request_url is not None:
            self.__twitter_request_url__ = twitter_request_url

    def update_request_refer(self, twitter_request_url, twitter_request_params):
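        """Set the Referer header to the fully-encoded URL of the previous request."""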
        twitter_request_refer = twitter_request_url + '?' + parse.urlencode(twitter_request_params, quote_via=parse.quote)
        self.__twitter_request_header__['referer'] = twitter_request_refer

    def clear_old_cursor(self):
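        """Forget the stored pagination cursor so the next run starts from the top."""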
        self.current_cursor = None

    def execute_twitter_request(self, username=None, search_term=None, conversation_id=None, log_output=False, log_file=None,
                                add_delay=False, delay_tweet_count=100):
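        """
        Page through the timeline, persisting every batch of extracted tweets.

        search_term, or a username/conversation_id pair, selects the endpoint
        used for follow-up pages. When add_delay is set, a random pause is
        inserted after every delay_tweet_count tweets.

        :return: (tweet_count, last_tweet_id, last_tweet_time, persist_file_path)
        """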
        tweet_count = 0
        last_tweet_id, last_tweet_time = '', ''

        if self.scrape_pages is None or self.scrape_pages < 0:
            is_stream = True
            self.scrape_pages = -1
        else:
            is_stream = False

        total_pages = self.scrape_pages

        if self.current_cursor is not None:
            # Resume from the cursor left behind by a previous call.
            self.__twitter_request_params__['reset_error_state'] = 'false'
            self.__twitter_request_params__['max_position'] = self.current_cursor

        while is_stream or self.scrape_pages > 0:
            current_tweet_count = 0
            min_position = None

            twitter_request_params_encoded = parse.urlencode(self.__twitter_request_params__, quote_via=parse.quote)

            response = requests.get(self.__twitter_request_url__,
                                    headers=self.__twitter_request_header__,
                                    params=twitter_request_params_encoded,
                                    proxies=self.__twitter_request_proxies__)

            if response.status_code == 200:
                # Subsequent pages are fetched from the AJAX timeline endpoints.
                if search_term is not None:
                    self.__twitter_request_url__ = self.__twitter_search_url__
                elif username is not None and conversation_id is not None:
                    self.__twitter_request_url__ = self.__twitter_conversation_url__\
                        .format(username=username, parent_tweet_id=conversation_id)

                logger.debug("Page {0} request: {1}".format(abs(self.scrape_pages), response.status_code))

                try:
                    tweet_json = response.json()

                    if tweet_json.get('has_more_items'):
                        min_position = tweet_json.get('min_position')
                    else:
                        logger.info("No more items...!!!")

                    if 'items_html' in tweet_json:
                        tweets_html = tweet_json.get('items_html')
                    else:
                        tweets_html = tweet_json.get('page')

                except JSONDecodeError:
                    # Some endpoints answer with raw HTML instead of JSON.
                    tweets_html = response.text

                if tweets_html is None:
                    # dict.get() never raises KeyError, so a missing payload is
                    # detected explicitly instead of via an exception handler.
                    if search_term is not None:
                        raise ValueError("Oops! Something went wrong while searching {0}.".format(search_term))
                    elif username is not None:
                        raise ValueError("Oops! Either {0} does not exist or is private.".format(username))
                    else:
                        raise ValueError("Received no arguments")

                if log_output and log_file is not None:
                    save_output_log(log_file + '.html', tweets_html)

                html_tree = etree.fromstring(tweets_html, self.html_parser)

                if html_tree is not None:
                    if username is not None and conversation_id is None:
                        profile_sidebar = html_tree.xpath(self._tweet_user_profile_sidebar_)
                        profile_canopy = html_tree.xpath(self._tweet_user_profile_canopy_)
                        if profile_sidebar is not None and len(profile_sidebar) > 0 and \
                                profile_canopy is not None and len(profile_canopy) > 0:
                            self.extract_user_data(username, profile_sidebar, profile_canopy)

                    tweet_stream = html_tree.xpath(self._tweet_stream_max_)
                    if tweet_stream is not None and len(tweet_stream) > 0:
                        min_position = tweet_stream[0].attrib.get(self._tweet_min_position)
                    tweet_list = html_tree.xpath(self._tweets_pattern_)

                    tweets_generator = self.extract_tweets_data(tweet_list)
                    tweet_id, tweet_time, current_tweet_count = self.persist_tweets(tweets_generator)

                    if tweet_time is not None and tweet_time != "":
                        last_tweet_time = tweet_time
                    if tweet_id is not None and tweet_id != "":
                        last_tweet_id = tweet_id
                    tweet_count += current_tweet_count

                    logger.debug(
                        "Extracting {0} tweets of {1} page...".format(len(tweet_list),
                                                                      total_pages - self.scrape_pages + 1))

                if not is_stream:
                    self.scrape_pages -= 1

                self.current_cursor = min_position

                if current_tweet_count > 0 and min_position is not None:
                    # Other fields seen alongside the cursor in the response:
                    # composed_count: 0, interval: 30000, latent_count: 0
                    self.__twitter_request_params__['reset_error_state'] = 'false'
                    self.__twitter_request_params__['max_position'] = self.current_cursor

                    if conversation_id is not None:
                        # Conversation paging only needs the cursor parameters.
                        self.__twitter_request_params__ = {
                            'include_available_features': 1,
                            'include_entities': 1,
                            'max_position': self.current_cursor,
                            'reset_error_state': 'false'
                        }

                    if add_delay and tweet_count % delay_tweet_count == 0:
                        delay = random.choice(self.__twitter_request_delays__)
                        time.sleep(delay)
                else:
                    logger.info("End of tweet stream...")
                    return tweet_count, last_tweet_id, last_tweet_time, self.__twitter_tweet_persist_file_path__
            else:
                # Without this branch a failed request would be retried forever.
                logger.warning("Twitter request failed with status {0}".format(response.status_code))
                break

        logger.info("Total {0} tweets extracted.".format(tweet_count))
        return tweet_count, last_tweet_id, last_tweet_time, self.__twitter_tweet_persist_file_path__

    def extract_tweets_data(self, tweet_list):
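        """Yield a TweetInfo object for every tweet item in the scraped stream."""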
        if tweet_list is not None:
            for tweet in tweet_list:
                if 'data-item-type' in tweet.attrib and tweet.attrib.get('data-item-type') == "tweet":
                    item_id = tweet.attrib.get('data-item-id')
                    item_type = tweet.attrib.get('data-item-type')
                    tweet_data = TweetInfo(item_id, item_type)

                    if len(tweet.getchildren()) > 0:
                        tweet_meta = tweet.getchildren()[0]
                        tweet_id = tweet_meta.attrib.get('data-tweet-id')
                        tweet_author = tweet_meta.attrib.get('data-screen-name')
                        tweet_author_name = tweet_meta.attrib.get('data-name')
                        tweet_author_id = tweet_meta.attrib.get('data-user-id')
                        if "data-conversation-id" in tweet_meta.attrib:
                            tweet_has_parent = tweet_meta.attrib.get('data-has-parent-tweet', False)
                            tweet_conversation_id = tweet_meta.attrib.get('data-conversation-id', None)
                            tweet_data.set_tweet_conversation(tweet_conversation_id, tweet_has_parent)
                        if "data-retweet-id" in tweet_meta.attrib:
                            tweet_retweeter = tweet_meta.attrib.get('data-retweeter')
                            tweet_data.set_retweeter(tweet_retweeter)
                        tweet_data.set_tweet_author(tweet_author, tweet_author_name, tweet_author_id)

                        tweet_content = tweet_meta.xpath(self._tweet_content_pattern_)
                        if len(tweet_content) > 0:
                            tweet_time_ms = tweet_content[0].xpath(self._tweet_time_ms_pattern_)[0] \
                                .attrib.get('data-time-ms')
                            tweet_data.set_tweet_time_ms(tweet_time_ms)

                            tweet_text = tweet_content[0].xpath(self._tweet_text_pattern_)
                            tweet_text = ''.join(tweet_text).replace('\n', '')
                            tweet_text = tweet_text.strip()
                            tweet_data.set_tweet_text(tweet_text)

                            tweet_links_raw = tweet_content[0].xpath(self._tweet_links_list_pattern_)

                            for raw_link in tweet_links_raw:
                                raw_url = raw_link.attrib.get('href')
                                if raw_url.startswith('https://') or raw_url.startswith('http://'):
                                    tweet_data.set_tweet_links(raw_url)
                                elif raw_url.startswith('/hashtag/'):
                                    hash_tag_group = self.hashtag_capture.match(raw_url)
                                    if hash_tag_group is not None and hash_tag_group.group(1) is not None:
                                        hash_tag = "#" + hash_tag_group.group(1)
                                        tweet_data.set_tweet_hashtags(hash_tag)
                                else:
                                    mention = raw_url.replace('/', '@')
                                    tweet_data.set_tweet_mentions(mention)

                            tweet_replies = tweet_content[0].xpath(self._tweet_reply_count_pattern_)
                            tweet_replies_count = tweet_replies[0].attrib.get('data-tweet-stat-count')
                            tweet_likes = tweet_content[0].xpath(self._tweet_like_count_pattern_)
                            tweet_likes_count = tweet_likes[0].attrib.get('data-tweet-stat-count')
                            tweet_retweets = tweet_content[0].xpath(self._tweet_retweet_count_pattern_)
                            tweet_retweets_count = tweet_retweets[0].attrib.get('data-tweet-stat-count')

                            tweet_data.set_tweet_interactions(tweet_replies_count, tweet_likes_count,
                                                              tweet_retweets_count)

                            yield tweet_data

    def persist_tweets(self, tweets_generator, dump_mode='a'):
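        """
        Write the generated tweets to the dump file and return
        (last_tweet_id, last_tweet_timestamp, tweet_count) for the batch.
        CSV dumps get a header row on first write; JSON dumps are stitched
        into one array across successive calls.
        """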
        if self.__twitter_tweet_persist_file_path__ is None or self.__twitter_tweet_persist_file_path__ == "":
            self.__twitter_tweet_persist_file_format__ = 'csv'
            self.__twitter_tweet_persist_file_path__ = os.path.join(
                os.getcwd(), 'tweets_dump.' + self.__twitter_tweet_persist_file_format__)

        with open(self.__twitter_tweet_persist_file_path__, dump_mode, encoding="utf-8") as tweet_fp:
            tweet_count = 0
            last_tweet_id = ''
            last_tweet_timestamp = ''

            tweet_csv_writer = csv.DictWriter(tweet_fp, fieldnames=TweetInfo.tweet_fields)

            if self.__twitter_tweet_persist_file_format__.lower() != 'csv' and tweet_fp.tell() != 0:
                # Re-opening an existing JSON dump: drop the trailing "]" and
                # continue the array with a comma separator.
                tweet_fp.seek(tweet_fp.tell() - 1, os.SEEK_SET)
                tweet_fp.truncate()
                tweet_fp.write(",")

            for tweet in tweets_generator:
                last_tweet_id = tweet.get_tweet_id()
                last_tweet_timestamp = tweet.get_tweet_time_ms()
                tweet_count += 1
                if self.__twitter_tweet_persist_file_format__.lower() == 'csv':
                    if tweet_fp.tell() == 0:
                        tweet_csv_writer.writeheader()
                    tweet_csv_writer.writerow(tweet.get_json())
                else:
                    if tweet_fp.tell() == 0:
                        tweet_fp.write("[")
                    json.dump(tweet.get_json(), tweet_fp)
                    tweet_fp.write(",")
            if self.__twitter_tweet_persist_file_format__.lower() != 'csv' and tweet_fp.tell() > 0:
                # Replace the trailing comma with the closing "]".
                tweet_fp.seek(tweet_fp.tell() - 1, os.SEEK_SET)
                tweet_fp.truncate()
                tweet_fp.write("]")

            try:
                last_datetime = datetime.fromtimestamp(int(last_tweet_timestamp) // 1000)
                last_tweet_timestamp = datetime.strftime(last_datetime, self.twitter_date_format)
            except (ValueError, TypeError):
                last_tweet_timestamp = ""
                logger.warning("Unable to get last tweet timestamp")

            logger.debug("Batch written to file:{0}".format(self.__twitter_tweet_persist_file_path__))
            return last_tweet_id, last_tweet_timestamp, tweet_count

    def extract_user_data(self, user_handle, profile_sidebar, profile_canopy):
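        """Populate self.scraped_user_info from the profile sidebar and nav bar."""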

        user_display_name = profile_sidebar[0].xpath(self._tweet_user_name_)
        user_display_name_val = ''.join(user_display_name) if user_display_name else None

        user_bio = profile_sidebar[0].xpath(self._tweet_user_bio_)
        user_bio_val = ''.join(user_bio).replace('\xa0', '') if user_bio else None

        user_location = profile_sidebar[0].xpath(self._tweet_user_location_)
        if user_location:
            user_location_id_val = user_location[0].attrib.get('data-place-id')
            user_location_val = user_location[0].text
        else:
            user_location_id_val, user_location_val = None, None

        user_url = profile_sidebar[0].xpath(self._tweet_user_url_)
        user_url_val = user_url[0].attrib.get('title') if user_url else None

        user_tweets_count = profile_canopy[0].xpath(self._tweet_user_tweets_count_)
        user_count_val = user_tweets_count[0].attrib.get('data-count') if user_tweets_count else None
        user_following = profile_canopy[0].xpath(self._tweet_user_following_count_)
        user_following_val = user_following[0].attrib.get('data-count') if user_following else None
        user_follower = profile_canopy[0].xpath(self._tweet_user_followers_count_)
        user_follower_val = user_follower[0].attrib.get('data-count') if user_follower else None
        user_favorites = profile_canopy[0].xpath(self._tweet_user_favorites_count_)
        user_favorites_val = user_favorites[0].attrib.get('data-count') if user_favorites else None

        self.scraped_user_info = UserInfo(
            user_handle,
            user_display_name_val,
            user_bio_val,
            user_location_val,
            user_location_id_val,
            user_url_val,
            user_count_val,
            user_following_val,
            user_follower_val,
            user_favorites_val
        )

    def get_user_info(self):
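        """Return the scraped profile as a dict, or None if no profile was scraped."""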
        if self.scraped_user_info is not None:
            return self.scraped_user_info.get_json()
        return None


def save_output_log(filename, data):
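    """Write a raw response payload next to this module, for debugging."""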
    if filename is not None and data is not None:
        file_dir = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(file_dir, filename), 'w', encoding='utf-8') as fp:
            fp.write(data)