"""SQLAlchemy models and helpers for persisting tweets and scraping a user timeline.

Importing this module creates the SQLite engine for ``TWITTER_TRAINING_DB_PATH``,
creates any missing tables, and configures the shared scoped ``Session``.
"""
import datetime
from typing import List, Tuple

import tweepy
from sqlalchemy import Column, Integer, DateTime, BigInteger, String, BLOB
from sqlalchemy import func
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, scoped_session
from tweepy import Status

from config.twitter import TWITTER_TRAINING_DB_PATH, TwitterApiCredentials
from storage.storage_common import TrainingDataManager

Base = declarative_base()


class ScraperStatus(Base):
    """Per-screen-name scraping progress."""

    __tablename__ = "scraperstatus"

    id = Column(Integer, index=True, primary_key=True)
    screen_name = Column(String, nullable=False)
    # Highest status id already processed for this screen name; 0 means "nothing yet".
    since_id = Column(BigInteger, nullable=False)


class Tweet(Base):
    """A single stored status."""

    __tablename__ = "tweet"

    id = Column(Integer, index=True, primary_key=True)
    # Twitter's own status id; unique so the same status is never stored twice.
    status_id = Column(BigInteger, nullable=False, index=True, unique=True)
    user_id = Column(BigInteger, nullable=False)
    in_reply_to_status_id = Column(BigInteger)
    in_reply_to_user_id = Column(BigInteger)
    # Boolean flag stored as an integer (0/1).
    retweeted = Column(Integer, nullable=False)
    timestamp = Column(DateTime, nullable=False, default=datetime.datetime.utcnow)
    # Defaults to 0; presumably flipped once the row has been used for training -- TODO confirm.
    trained = Column(Integer, nullable=False, default=0)
    # Tweet text stored as encoded bytes (see the ``.encode()`` calls at insert time).
    text = Column(BLOB, nullable=False)

    def __repr__(self):
        """Return the decoded tweet text."""
        return self.text.decode()


engine = create_engine('sqlite:///%s' % TWITTER_TRAINING_DB_PATH)
Base.metadata.create_all(engine)
session_factory = sessionmaker()
session_factory.configure(bind=engine)
Session = scoped_session(session_factory)


class TwitterTrainingDataManager(TrainingDataManager):
    """Training data manager that persists tweepy statuses as ``Tweet`` rows."""

    def __init__(self):
        super().__init__(Tweet)
        self._session = Session()

    def store(self, data: Status) -> None:
        """Persist ``data`` as a ``Tweet`` row unless its status id is already stored.

        Args:
            data: The tweepy status to store.
        """
        status = data

        existing = self._session.query(Tweet).filter(Tweet.status_id == status.id).first()
        if existing is not None:
            # status_id is unique; nothing to do for an already-known status.
            return

        tweet = Tweet(
            status_id=status.id,
            user_id=status.user.id,
            in_reply_to_user_id=status.in_reply_to_user_id,
            in_reply_to_status_id=status.in_reply_to_status_id,
            retweeted=int(status.retweeted),
            timestamp=status.created_at,
            text=status.text.encode(),
        )
        self._session.add(tweet)
        self._session.commit()


class TwitterScraper(object):
    """Scrapes one user's timeline into the ``tweet`` table and tracks progress."""

    def __init__(self, credentials: TwitterApiCredentials, screen_name: str):
        """Load, or create and commit, the ``ScraperStatus`` row for ``screen_name``.

        Args:
            credentials: API keys and tokens used to authenticate with Twitter.
            screen_name: Screen name of the account whose timeline is scraped.
        """
        self._credentials = credentials
        self.screen_name = screen_name
        self.session = Session()

        # Seed the high-water mark from the largest status id already stored.
        # NOTE(review): this maximum spans every row in the tweet table, not
        # only this screen name -- confirm that is intended when several
        # accounts share the database.
        row = self.session.query(func.max(Tweet.status_id)).first()
        since_id = row[0] if row is not None and row[0] is not None else 0
        self._latest_tweet_processed_id = since_id

        self.scraper_status = self.session.query(ScraperStatus).filter(
            ScraperStatus.screen_name == self.screen_name).first()
        if self.scraper_status is None:
            self.scraper_status = ScraperStatus(screen_name=screen_name, since_id=since_id)
            self.session.add(self.scraper_status)
            self.session.commit()

    def _auth(self):
        """Build a tweepy OAuth handler from the stored credentials."""
        auth = tweepy.OAuthHandler(self._credentials.consumer_key,
                                   self._credentials.consumer_secret)
        auth.set_access_token(self._credentials.access_token,
                              self._credentials.access_token_secret)
        return auth

    def scrape(self, wait_on_rate_limit=True, learn_retweets=False) -> None:
        """Fetch the user's timeline and store every status not yet in the database.

        Args:
            wait_on_rate_limit: Passed through to ``tweepy.API``.
            learn_retweets: When true, statuses flagged ``retweeted`` are stored
                as well; otherwise they are skipped.
        """
        api = tweepy.API(self._auth(), wait_on_rate_limit=wait_on_rate_limit)

        timeline_kwargs = {"screen_name": self.screen_name, "count": 100, "lang": "en"}
        # A since_id of 0 means nothing was scraped before, so the parameter is
        # omitted rather than sent as 0.
        if self.scraper_status.since_id != 0:
            timeline_kwargs["since_id"] = self.scraper_status.since_id
        tweets = tweepy.Cursor(api.user_timeline, **timeline_kwargs).items()

        for tweet in tweets:
            tweet_row = self.session.query(Tweet).filter(Tweet.status_id == tweet.id).first()
            if tweet_row is None:
                # NOTE(review): Twitter documents `retweeted` as "retweeted by
                # the authenticating user"; retweets in a timeline are usually
                # detected via `retweeted_status` -- confirm this filter.
                if not tweet.retweeted or learn_retweets:
                    tweet_row = Tweet(
                        status_id=tweet.id,
                        user_id=tweet.author.id,
                        in_reply_to_status_id=tweet.in_reply_to_status_id,
                        in_reply_to_user_id=tweet.in_reply_to_user_id,
                        # Cast to int to match the Integer column and store().
                        retweeted=int(tweet.retweeted),
                        timestamp=tweet.created_at,
                        text=tweet.text.encode(),
                    )
                    self.session.add(tweet_row)

            # Store the highest ID so we can set it to since_id later
            if self._latest_tweet_processed_id is None or tweet.id > self._latest_tweet_processed_id:
                self._latest_tweet_processed_id = tweet.id

            # Normally it would be asinine to commit every insert, but we are rate limited by twitter anyway
            self.session.commit()

        # Complete scraper progress
        self.scraper_status.since_id = self._latest_tweet_processed_id
        self.session.commit()