# Python source code for scraping tweets

import re
import os
import requests
import logging
import csv
import json
import random
import time
from lxml import etree
from urllib import parse
from datetime import datetime
try:
    from json.decoder import JSONDecodeError
except ImportError:
    JSONDecodeError = ValueError

from tweetscrape.model.tweet_model import TweetInfo
from tweetscrape.model.user_model import UserInfo

logger = logging.getLogger(__name__)


class TweetScrapper:
    """
    Scraper that downloads Twitter timeline pages with ``requests``, parses the
    returned markup with ``lxml`` and writes the extracted tweets to a CSV or
    JSON file.

    The class-level attributes below hold the XPath expressions, default HTTP
    headers and endpoint URLs shared by all instances.
    """
    # Request target, headers and query parameters; the real values are set per
    # instance in __init__ (the header default is re-assigned further down).
    __twitter_request_url__ = None
    __twitter_request_header__ = None
    __twitter_request_params__ = None

    # XPath for the individual stream items and for the container element that
    # carries the paging cursor attribute.
    _tweets_pattern_ = '''//li[contains(@class,"stream-item")]'''
    _tweet_stream_max_ = '''//*[@id="timeline" or @id="descendants"]/div'''

    # XPaths evaluated relative to one tweet (or its content div) to read the
    # timestamp, the text and the links embedded in the text.
    _tweet_min_position = '''data-min-position'''
    _tweet_content_pattern_ = '''./div[@class="content"]'''
    _tweet_time_ms_pattern_ = '''./div[@class="stream-item-header"]/
                                        small[@class="time"]/a[contains(@class,"tweet-timestamp")]/span'''
    _tweet_text_pattern_ = '''./div[@class="js-tweet-text-container"]//text()'''
    _tweet_links_list_pattern_ = '''./div[@class="js-tweet-text-container"]//a'''

    # XPaths of the footer elements whose 'data-tweet-stat-count' attribute holds
    # the reply / like / retweet counters.
    _tweet_reply_count_pattern_ = '''./div[@class="stream-item-footer"]/div/
                                            span[contains(@class, "ProfileTweet-action--reply")]/span'''
    _tweet_like_count_pattern_ = '''./div[@class="stream-item-footer"]/div/
                                            span[contains(@class, "ProfileTweet-action--favorite")]/span'''
    _tweet_retweet_count_pattern_ = '''./div[@class="stream-item-footer"]/div/
                                            span[contains(@class, "ProfileTweet-action--retweet")]/span'''

    # XPaths used by extract_user_data to read the profile header and counters.
    # NOTE(review): _tweet_user_lists_count_ is not referenced by any method visible in this file.
    _tweet_user_profile_sidebar_ = '''//div[contains(@class, "ProfileSidebar")]'''
    _tweet_user_profile_canopy_ = '''//div[contains(@class, "ProfileCanopy-navBar")]'''
    _tweet_user_tweets_count_ = '''//li[contains(@class, "ProfileNav-item--tweets")]/a/span[3]'''
    _tweet_user_following_count_ = '''//li[contains(@class, "ProfileNav-item--following")]/a/span[3]'''
    _tweet_user_followers_count_ = '''//li[contains(@class, "ProfileNav-item--followers")]/a/span[3]'''
    _tweet_user_favorites_count_ = '''//li[contains(@class, "ProfileNav-item--favorites")]/a/span[3]'''
    _tweet_user_lists_count_ = '''//li[contains(@class, "ProfileNav-item--lists")]/a/span[3]'''
    _tweet_user_name_ = '''//h1[contains(@class, "ProfileHeaderCard-name")]/a/text()'''
    _tweet_user_bio_ = '''//p[contains(@class, "ProfileHeaderCard-bio")]//text()'''
    _tweet_user_location_ = '''//div[contains(@class, "ProfileHeaderCard-location")]/span[2]/a'''
    _tweet_user_url_ = '''//div[contains(@class, "ProfileHeaderCard-url")]/span[2]/a'''

    # Regex capturing the tag name from a relative hashtag link; compiled in __init__.
    _tweet_hastag_pattern_ = r'''/hashtag/([0-9a-zA-Z_]*)\?src=hash'''

    # Pool of user-agent strings that switch_request_user_agent chooses from.
    __twitter_user_agent__ = [
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
        'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
        'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre'
    ]

    # Default request headers. The user-agent is picked once, when the class body
    # is executed; this dict object is shared by every instance that does not
    # receive its own header in __init__.
    __twitter_request_header__ = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-language': 'en-US,en;q=0.8',
        'user-agent': random.choice(__twitter_user_agent__),
        'x-requested-with': 'XMLHttpRequest',
        'x-twitter-active-user': 'yes',
        'x-twitter-polling': 'true',
    }

    # Candidate pause lengths (passed to time.sleep, i.e. seconds) between requests.
    __twitter_request_delays__ = [2, 3, 4, 5, 6, 7]

    # Endpoints that follow-up pages are requested from for searches and conversations.
    __twitter_search_url__ = 'https://twitter.com/i/search/timeline'
    __twitter_conversation_url__ = 'https://twitter.com/i/{username}/conversation/{parent_tweet_id}'

    # strftime format of the "last tweet time" returned by persist_tweets.
    twitter_date_format = '%Y-%m-%d'
    # Paging cursor of the last fetched page; sent as 'max_position' on the next request.
    current_cursor = None
    # Number of pages still to fetch; None or a negative value means no page limit.
    scrape_pages = 2
    # UserInfo built by extract_user_data, or None when no profile was scraped.
    scraped_user_info = None

    def __init__(self, twitter_request_url, twitter_request_header,
                 twitter_request_params=None, twitter_request_proxies=None, scrape_pages=2,
                 twitter_file_path=None, twitter_file_format='csv'):
        """
        Configure a scraper for one Twitter endpoint.

        :param twitter_request_url: URL requested for the first page.
        :param twitter_request_header: mapping of HTTP headers, or None to start from the
            class-level default headers.
        :param twitter_request_params: mapping of query parameters sent with each request.
        :param twitter_request_proxies: proxies mapping handed to ``requests.get``.
        :param scrape_pages: number of pages to fetch; None or a negative value means no limit.
        :param twitter_file_path: file the tweets are written to; a default path is chosen
            by `persist_tweets` when it is None or empty.
        :param twitter_file_format: 'csv' for CSV output, anything else for JSON output.
        """
        self.__twitter_request_url__ = twitter_request_url

        # Always work on a private copy of the headers. The header dict is mutated
        # later (user-agent switch, referer update); without the copy those writes
        # would land in the class-level default shared by all instances, or in the
        # dict object owned by the caller.
        if twitter_request_header is None:
            twitter_request_header = self.__twitter_request_header__
        self.__twitter_request_header__ = dict(twitter_request_header)

        self.__twitter_request_params__ = twitter_request_params
        self.__twitter_request_proxies__ = twitter_request_proxies
        self.scrape_pages = scrape_pages
        self.__twitter_tweet_persist_file_path__ = twitter_file_path
        self.__twitter_tweet_persist_file_format__ = twitter_file_format

        self.hashtag_capture = re.compile(self._tweet_hastag_pattern_)

        self.html_parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
        self.proxy_json = None

    def set_proxy_list(self, proxy_json=None):
        """
        :param proxy_json: {
            "http": ["http://username:password@address:port", ...],
            "https: ["https://username:password@address:port", ...]
        }
        """
        if isinstance(proxy_json.get("http"), (list,)) or isinstance(proxy_json.get("https"), (list,)):
            self.proxy_json = proxy_json

    def switch_request_user_agent(self):
        """
        Replace the 'user-agent' request header with a randomly chosen entry
        from the class's user-agent pool.

        User-Agents: https://udger.com/resources/ua-list
        """
        logger.info("Switching user-agent")
        replacement_agent = random.choice(self.__twitter_user_agent__)
        self.__twitter_request_header__['user-agent'] = replacement_agent

    def switch_request_proxy(self):
        """
        Pick a random proxy per scheme from the pools registered through
        `set_proxy_list` and use it for subsequent requests.

        Only schemes that have a non-empty pool are included; when no pool is
        usable the current proxy configuration is left unchanged.
        """
        logger.info("Switching proxy")
        if self.proxy_json is None:
            return

        request_proxy = {}
        for scheme in ("http", "https"):
            candidates = self.proxy_json.get(scheme)
            # set_proxy_list accepts a mapping with only one of the two schemes,
            # so a pool may be missing (None) or empty; random.choice would raise
            # TypeError / IndexError on those.
            if candidates:
                request_proxy[scheme] = random.choice(candidates)

        if request_proxy:
            self.__twitter_request_proxies__ = request_proxy

    def update_request_params(self, twitter_request_url, twitter_request_params, update_refer=False):
        if twitter_request_params is not None:
            self.__twitter_request_params__ = twitter_request_params
            if update_refer and twitter_request_url != "":
                self.update_request_refer(twitter_request_url, self.__twitter_request_params__)

    def update_request_url(self, twitter_request_url):
        if twitter_request_url is not None:
            self.__twitter_request_url__ = twitter_request_url

    def update_request_refer(self, twitter_request_url, twitter_request_params):
        twitter_request_refer = twitter_request_url + '?' + parse.urlencode(twitter_request_params, quote_via=parse.quote)
        self.__twitter_request_header__['referer'] = twitter_request_refer

    def clear_old_cursor(self):
        self.current_cursor = None

    def execute_twitter_request(self, username=None, search_term=None, conversation_id=None, log_output=False, log_file=None,
                                add_delay=False, delay_tweet_count=100):
        """
        Download timeline pages one by one and persist the tweets found on each.

        Paging stops when `scrape_pages` pages have been fetched, when a page
        yields no tweets or no cursor for the next page, or when a request does
        not come back with HTTP 200. A `scrape_pages` of None or a negative
        number means "no page limit". `self.scrape_pages` is counted down in
        place and `self.current_cursor` is updated after every page.

        :param username: profile owner; while `conversation_id` is None the
            profile header of the page is also handed to `extract_user_data`.
        :param search_term: when set, follow-up pages are requested from the
            search timeline URL.
        :param conversation_id: parent tweet id; together with `username` it
            switches follow-up pages to the conversation URL.
        :param log_output: dump the raw markup of every page through `save_output_log`.
        :param log_file: file name (without extension) for that dump; nothing is
            dumped when it is None.
        :param add_delay: sleep for a randomly chosen delay whenever the running
            tweet total passes a multiple of `delay_tweet_count`.
        :param delay_tweet_count: number of tweets between two pauses.
        :return: tuple ``(tweet_count, last_tweet_id, last_tweet_time, persist_file_path)``.
        :raises ValueError: when a JSON response carries neither 'items_html' nor 'page'.
        """
        tweet_count = 0
        last_tweet_id, last_tweet_time = '', ''

        if self.scrape_pages is None or self.scrape_pages < 0:
            is_stream = True
            self.scrape_pages = -1
        else:
            is_stream = False

        total_pages = self.scrape_pages

        # urlencode() and the cursor bookkeeping below both need a real mapping.
        if self.__twitter_request_params__ is None:
            self.__twitter_request_params__ = {}

        if self.current_cursor is not None:
            self.__twitter_request_params__['reset_error_state'] = 'false'
            self.__twitter_request_params__['max_position'] = self.current_cursor

        while is_stream or self.scrape_pages > 0:
            current_tweet_count = 0

            twitter_request_params_encoded = parse.urlencode(self.__twitter_request_params__, quote_via=parse.quote)

            response = requests.get(self.__twitter_request_url__,
                                    headers=self.__twitter_request_header__,
                                    params=twitter_request_params_encoded,
                                    proxies=self.__twitter_request_proxies__)

            if not (response.ok and response.status_code == 200):
                # Nothing below changes the loop state for a failed request, so
                # carrying on would repeat the identical request forever.
                logger.warning("Request to {0} failed with status {1}; stopping."
                               .format(self.__twitter_request_url__, response.status_code))
                break

            # After the first successful page, paging continues on the AJAX endpoints.
            if search_term is not None:
                self.__twitter_request_url__ = self.__twitter_search_url__
            elif username is not None and conversation_id is not None:
                self.__twitter_request_url__ = self.__twitter_conversation_url__\
                    .format(username=username, parent_tweet_id=conversation_id)

            logger.debug("Page {0} request: {1}".format(abs(self.scrape_pages), response.status_code))

            tweets_html, min_position = self._read_timeline_payload(response)
            if tweets_html is None:
                # The JSON carried no markup at all; report it with a meaningful
                # message instead of letting the HTML parser choke on None.
                if search_term is not None:
                    raise ValueError("Oops! Something went wrong while searching {0}.".format(search_term))
                elif username is not None:
                    raise ValueError("Oops! Either {0} does not exist or is private.".format(username))
                else:
                    raise ValueError("Received no arguments")

            if log_output and log_file is not None:
                save_output_log(log_file + '.html', tweets_html)

            # NOTE(review): blank markup is skipped up front on the assumption that
            # lxml rejects (or returns None for) an empty document — confirm.
            html_tree = None
            if tweets_html.strip():
                html_tree = etree.fromstring(tweets_html, self.html_parser)

            if html_tree is not None:
                if username is not None and conversation_id is None:
                    profile_sidebar = html_tree.xpath(self._tweet_user_profile_sidebar_)
                    profile_canopy = html_tree.xpath(self._tweet_user_profile_canopy_)
                    if profile_sidebar and profile_canopy:
                        self.extract_user_data(username, profile_sidebar, profile_canopy)

                tweet_stream = html_tree.xpath(self._tweet_stream_max_)
                if tweet_stream:
                    # Prefer the cursor embedded in the markup; keep the JSON one
                    # when the container does not carry the attribute.
                    min_position = tweet_stream[0].attrib.get(self._tweet_min_position, min_position)
                tweet_list = html_tree.xpath(self._tweets_pattern_)

                tweets_generator = self.extract_tweets_data(tweet_list)
                tweet_id, tweet_time, current_tweet_count = self.persist_tweets(tweets_generator)

                if tweet_time is not None and tweet_time != "":
                    last_tweet_time = tweet_time
                if tweet_id is not None and tweet_id != "":
                    last_tweet_id = tweet_id
                tweet_count += current_tweet_count

                logger.debug(
                    "Extracting {0} tweets of {1} page...".format(len(tweet_list),
                                                                  total_pages - self.scrape_pages + 1))

            if not is_stream:
                self.scrape_pages -= 1

            self.current_cursor = min_position

            if current_tweet_count == 0 or min_position is None:
                logger.info("End of tweet stream...")
                return tweet_count, last_tweet_id, last_tweet_time, self.__twitter_tweet_persist_file_path__

            if conversation_id is not None:
                # Conversation pages are requested with this fixed parameter set.
                self.__twitter_request_params__ = {
                    'include_available_features': 1,
                    'include_entities': 1,
                    'max_position': self.current_cursor,
                    'reset_error_state': 'false'
                }
            else:
                self.__twitter_request_params__['reset_error_state'] = 'false'
                self.__twitter_request_params__['max_position'] = self.current_cursor

            # Pause once the running total has passed another multiple of
            # delay_tweet_count; pages rarely end exactly on a multiple.
            if add_delay and delay_tweet_count and delay_tweet_count > 0:
                previous_count = tweet_count - current_tweet_count
                if tweet_count // delay_tweet_count > previous_count // delay_tweet_count:
                    time.sleep(random.choice(self.__twitter_request_delays__))

        logger.info("Total {0} tweets extracted.".format(tweet_count))
        return tweet_count, last_tweet_id, last_tweet_time, self.__twitter_tweet_persist_file_path__

    @staticmethod
    def _read_timeline_payload(response):
        """Return ``(tweets_html, min_position)`` read from one HTTP response."""
        try:
            tweet_json = response.json()
        except ValueError:
            # Not JSON (every JSON decode error is a ValueError): the body is the
            # HTML page itself and carries no JSON cursor.
            return response.text, None

        min_position = None
        if tweet_json.get('has_more_items'):
            min_position = tweet_json.get('min_position')
        else:
            logger.info("No more items...!!!")

        if 'items_html' in tweet_json:
            return tweet_json.get('items_html'), min_position
        return tweet_json.get('page'), min_position

    def extract_tweets_data(self, tweet_list):
        if tweet_list is not None:
            for tweet in tweet_list:
                if 'data-item-type' in tweet.attrib and tweet.attrib.get('data-item-type') == "tweet":
                    item_id = tweet.attrib.get('data-item-id')
                    item_type = tweet.attrib.get('data-item-type')
                    tweet_data = TweetInfo(item_id, item_type)

                    if len(tweet.getchildren()) > 0:
                        tweet_meta = tweet.getchildren()[0]
                        tweet_id = tweet_meta.attrib.get('data-tweet-id')
                        tweet_author = tweet_meta.attrib.get('data-screen-name')
                        tweet_author_name = tweet_meta.attrib.get('data-name')
                        tweet_author_id = tweet_meta.attrib.get('data-user-id')
                        if "data-conversation-id" in tweet_meta.attrib:
                            tweet_has_parent = tweet_meta.attrib.get('data-has-parent-tweet', False)
                            tweet_conversation_id = tweet_meta.attrib.get('data-conversation-id', None)
                            tweet_data.set_tweet_conversation(tweet_conversation_id, tweet_has_parent)
                        if "data-retweet-id" in tweet_meta.attrib:
                            tweet_retweeter = tweet_meta.attrib.get('data-retweeter')
                            tweet_data.set_retweeter(tweet_retweeter)
                        tweet_data.set_tweet_author(tweet_author, tweet_author_name, tweet_author_id)

                        tweet_content = tweet_meta.xpath(self._tweet_content_pattern_)
                        if len(tweet_content) > 0:
                            tweet_time_ms = tweet_content[0].xpath(self._tweet_time_ms_pattern_)[0] \
                                .attrib.get('data-time-ms')
                            tweet_data.set_tweet_time_ms(tweet_time_ms)

                            tweet_text = tweet_content[0].xpath(self._tweet_text_pattern_)
                            tweet_text = ''.join(tweet_text).replace('\n', '')
                            tweet_text = tweet_text.strip()
                            tweet_data.set_tweet_text(tweet_text)

                            tweet_links_raw = tweet_content[0].xpath(self._tweet_links_list_pattern_)

                            for raw_link in tweet_links_raw:
                                raw_url = raw_link.attrib.get('href')
                                if raw_url.startswith('https://') or raw_url.startswith('http://'):
                                    tweet_data.set_tweet_links(raw_url)
                                elif raw_url.startswith('/hashtag/'):
                                    hash_tag_group = re.match(self.hashtag_capture, raw_url)
                                    if hash_tag_group is not None and hash_tag_group.group(1) is not None:
                                        hash_tag = "#" + hash_tag_group.group(1)
                                        tweet_data.set_tweet_hashtags(hash_tag)
                                else:
                                    mention = raw_url.replace('/', '@')
                                    tweet_data.set_tweet_mentions(mention)

                            tweet_replies = tweet_content[0].xpath(self._tweet_reply_count_pattern_)
                            tweet_replies_count = tweet_replies[0].attrib.get('data-tweet-stat-count')
                            tweet_likes = tweet_content[0].xpath(self._tweet_like_count_pattern_)
                            tweet_likes_count = tweet_likes[0].attrib.get('data-tweet-stat-count')
                            tweet_retweets = tweet_content[0].xpath(self._tweet_retweet_count_pattern_)
                            tweet_retweets_count = tweet_retweets[0].attrib.get('data-tweet-stat-count')

                            tweet_data.set_tweet_interactions(tweet_replies_count, tweet_likes_count,
                                                              tweet_retweets_count)

                            yield tweet_data

    def persist_tweets(self, tweets_generator, dump_mode='a'):
        """
        Write every tweet produced by `tweets_generator` to the persist file.

        With the 'csv' format each tweet becomes one row (a header row is
        written first when the file is empty). With any other format the file
        holds a single JSON array: new tweets are appended inside the existing
        array, and the file is left untouched when there is nothing to write.
        When no file path is configured, 'tweets_dump.csv' in the current
        working directory is used.

        :param tweets_generator: iterable of `TweetInfo` objects.
        :param dump_mode: mode string passed to `open` (append by default).
        :return: tuple ``(last_tweet_id, last_tweet_date, tweet_count)`` where
            `last_tweet_date` is formatted with `twitter_date_format`, or "" when
            the last timestamp cannot be converted.
        """
        if self.__twitter_tweet_persist_file_path__ is None or self.__twitter_tweet_persist_file_path__ == "":
            self.__twitter_tweet_persist_file_format__ = 'csv'
            # os.path.join supplies the separator that plain concatenation dropped.
            self.__twitter_tweet_persist_file_path__ = os.path.join(
                os.getcwd(), 'tweets_dump.' + self.__twitter_tweet_persist_file_format__)

        is_csv = self.__twitter_tweet_persist_file_format__.lower() == 'csv'
        tweet_count = 0
        last_tweet_id = ''
        last_tweet_timestamp = ''

        # newline='' lets the csv writer control line endings itself, as the csv
        # module documentation requires; the JSON branch writes no newlines.
        with open(self.__twitter_tweet_persist_file_path__, dump_mode, encoding="utf-8", newline='') as tweet_fp:
            tweet_csv_writer = csv.DictWriter(tweet_fp, fieldnames=TweetInfo.tweet_fields)

            for tweet in tweets_generator:
                last_tweet_id = tweet.get_tweet_id()
                last_tweet_timestamp = tweet.get_tweet_time_ms()
                if is_csv:
                    if tweet_fp.tell() == 0:
                        tweet_csv_writer.writeheader()
                    tweet_csv_writer.writerow(tweet.get_json())
                else:
                    if tweet_count > 0:
                        tweet_fp.write(",")
                    elif tweet_fp.tell() == 0:
                        tweet_fp.write("[")
                    else:
                        # Re-open the existing array: swap its closing "]" for a ",".
                        tweet_fp.seek(tweet_fp.tell() - 1, os.SEEK_SET)
                        tweet_fp.truncate()
                        tweet_fp.write(",")
                    json.dump(tweet.get_json(), tweet_fp)
                tweet_count += 1

            # Close the array only if something was written; the old unconditional
            # seek(-1) crashed on an empty file with an empty batch.
            if not is_csv and tweet_count > 0:
                tweet_fp.write("]")

        try:
            last_datetime = datetime.fromtimestamp(int(last_tweet_timestamp) // 1000)
            last_tweet_timestamp = datetime.strftime(last_datetime, self.twitter_date_format)
        except (TypeError, ValueError, OverflowError, OSError):
            # '' (no tweets), None (tweet without timestamp) or an out-of-range value.
            last_tweet_timestamp = ""
            logger.warning("Unable to get last tweet timestamp")

        logger.debug("Batch written to file:{0}".format(self.__twitter_tweet_persist_file_path__))
        return last_tweet_id, last_tweet_timestamp, tweet_count

    def extract_user_data(self, user_handle, profile_sidebar, profile_canopy):
        """
        Build `self.scraped_user_info` from the profile header of a timeline page.

        Every field that cannot be found in the markup is stored as None.

        :param user_handle: screen name the profile belongs to.
        :param profile_sidebar: non-empty list of sidebar elements; the first one is used.
        :param profile_canopy: non-empty list of nav-bar elements; the first one is used.
        """
        sidebar = profile_sidebar[0]
        canopy = profile_canopy[0]

        user_display_name = sidebar.xpath(self._tweet_user_name_)
        user_display_name_val = ''.join(user_display_name) if user_display_name else None

        user_bio = sidebar.xpath(self._tweet_user_bio_)
        user_bio_val = ''.join(user_bio).replace('\xa0', '') if user_bio else None

        user_location = sidebar.xpath(self._tweet_user_location_)
        if user_location:
            user_location_id_val = user_location[0].attrib.get('data-place-id')
            user_location_val = user_location[0].text
        else:
            user_location_id_val, user_location_val = None, None

        user_url = sidebar.xpath(self._tweet_user_url_)
        user_url_val = user_url[0].attrib.get('title') if user_url else None

        # All counters go through the same guarded lookup; previously only the
        # favorites counter tolerated a missing nav item, the others raised IndexError.
        user_count_val = self._first_data_count(canopy, self._tweet_user_tweets_count_)
        user_following_val = self._first_data_count(canopy, self._tweet_user_following_count_)
        user_follower_val = self._first_data_count(canopy, self._tweet_user_followers_count_)
        user_favorites_val = self._first_data_count(canopy, self._tweet_user_favorites_count_)

        self.scraped_user_info = UserInfo(
            user_handle,
            user_display_name_val,
            user_bio_val,
            user_location_val,
            user_location_id_val,
            user_url_val,
            user_count_val,
            user_following_val,
            user_follower_val,
            user_favorites_val
        )

    @staticmethod
    def _first_data_count(node, count_xpath):
        """Return the 'data-count' attribute of the first `count_xpath` match under `node`, or None."""
        matches = node.xpath(count_xpath)
        return matches[0].attrib.get('data-count') if matches else None

    def get_user_info(self):
        if self.scraped_user_info is not None:
            return self.scraped_user_info.get_json()
        return None


def save_output_log(filename, data):
    """
    Write `data` to `filename` inside the directory that contains this module.

    Nothing is written when either argument is None.

    :param filename: name of the log file, relative to this module's directory.
    :param data: text to write; any existing file content is replaced.
    """
    if filename is None or data is None:
        return

    file_path = os.path.dirname(os.path.realpath(__file__))
    # os.path.join supplies the separator that plain concatenation dropped, and
    # an explicit UTF-8 encoding keeps non-ASCII text from failing on platforms
    # whose default encoding cannot represent it.
    with open(os.path.join(file_path, filename), 'w', encoding='utf-8') as fp:
        fp.write(data)