"""
Script containing functions to populate friend and follower User objects from
a collection of seed users.

@auth dpb
@date 12/01/2014
"""

import logging
import argparse
import warnings
from tweepy import Cursor
from datetime import datetime
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
from smappPy.tweepy_pool import APIPool
from smappPy.tweepy_error_handling import call_with_error_handling
from smappPy.user_collection.userdocs import ensure_userdoc_indexes, create_userdoc
from smappPy.user_collection.network_edges import create_edge_doc, ensure_edge_indexes

# BSON type code for Null. Used with the "$type" query operator in
# _get_user_sample to select users whose update-timestamp field holds null.
BSON_NULL = 10

# Module-level logger. Handlers and level are only configured in the
# __main__ block below; importers are expected to configure logging themselves.
logger = logging.getLogger(__name__)

#TODO: def get_friend_ids_sample(api, user_id, user_doc=None, sample=0.1)
#TODO: Gets sample proportion of friends. Takes userdoc: if None, queries
#TODO: rest API for user, get num friends, iterates over cursor until sample
#TODO: number is hit

#TODO: Add friends_sample param to populate_.... Requires get_friend_ids_sample
#TODO: Add followers_sample param to populate_.... Requires get_followers_ids_sample

#TODO: def get_friends(api, user_id)
#TODO: Gets fully-hydrated friend user docs via Tweepy 'friends' method

#TODO: Add functionality for saving a user status of 'no longer valid' or 'not authorized
#TODO: to view', based on twitter's response (so these users can be skipped)


def get_friends_ids(api, user_id):
    """
    Fetch the twitter IDs of the friends of the given user_id.

    Walks the api's 'friends_ids' endpoint with a Tweepy Cursor and collects every
    item into a list. Only IDs are requested (no user objects).
    Parameters:
        api     - fully authenticated Tweepy api or smappPy TweepyPool api
        user_id - twitter user id
    Returns tuple: (return code, list of IDs). Any non-zero return code is logged as
    a failed request; the second element is then whatever call_with_error_handling
    produced (expected to be None when the call fails).
    """
    id_iterator = Cursor(api.friends_ids, user_id=user_id).items()

    # list() drives the cursor; the wrapper turns API errors into a return code
    friend_ids, status = call_with_error_handling(list, id_iterator)

    if status != 0:
        logger.warning("User {0}: Friends request failed".format(user_id))

    return status, friend_ids


def get_followers_ids(api, user_id):
    """
    Fetch the twitter IDs of the followers of the given user_id.

    Walks the api's 'followers_ids' endpoint with a Tweepy Cursor and collects every
    item into a list. Only IDs are requested (no user objects).
    Parameters:
        api     - fully authenticated Tweepy api or smappPy TweepyPool api
        user_id - twitter user id
    Returns tuple: (return code, list of IDs). Any non-zero return code is logged as
    a failed request; the second element is then whatever call_with_error_handling
    produced (expected to be None when the call fails).
    """
    id_iterator = Cursor(api.followers_ids, user_id=user_id).items()

    # list() drives the cursor; the wrapper turns API errors into a return code
    follower_ids, status = call_with_error_handling(list, id_iterator)

    if status != 0:
        logger.warning("User {0}: Followers request failed".format(user_id))

    return status, follower_ids


def populate_friends_from_collection(api, seed_collection, friend_collection, edge_collection=None,
    user_sample=1.0, friends_threshold=20000, update_threshold=None, requery=True,
    print_progress_every=1000):
    """
    Populates given 'friend_collection' with local user documents representing the friends
    of each user in given 'seed_collection'.
    Note: populated documents are NOT fully hydrated twitter user objects, just documents
    built from IDs by create_userdoc.
    The 'friend_ids' field of each processed seed user doc is updated with the union of its
    previous value and the IDs fetched from the twitter API, and the doc's
    'updated_timestamp' and 'friends_updated' fields are set to the current time.
    Users flagged 'deleted' or 'private' are skipped; users for which the API reports
    one of the codes handled by _check_return_set_user are flagged and skipped.
    Parameters:
        api     - fully authenticated Tweepy api or smappPy TweepyPool api
        seed_collection    - fully authenticated (read/write) mongo collection
        friend_collection  - fully authenticated (read/write) mongo collection
        edge_collection    - [OPTIONAL] collection to store simple edge: {to: ID, from: ID}
        user_sample        - proportion of seed users to fetch friends for
        friends_threshold  - If user has > friends_threshold, DO NOT QUERY for friends. If user's
                             doc does not contain 'friends_count', ignore and query anyway
        update_threshold   - Datetime threshold on users to update. Only queries friends of users
                             with 'friends_updated' field LT 'update_threshold'
        requery            - If False, only query for user's friends if 'friend_ids' field is
                             empty or missing
        print_progress_every - Print a progress line every Nth user (0/None disables it)
    Returns None.
    """
    # Ensure indexes. Compare with None: pymongo collections should not be
    # truth-tested.
    ensure_userdoc_indexes(seed_collection)
    ensure_userdoc_indexes(friend_collection)
    if edge_collection is not None:
        ensure_edge_indexes(edge_collection)

    # Create cursor over users (sample and date restriction possible)
    users = _get_user_sample(seed_collection, user_sample, "friends_updated", update_threshold)

    friend_request_failed_for = []
    try:
        user_count = users.count(with_limit_and_skip=True)
        logger.info("Friends: Considering total {0} users".format(user_count))

        # Iterate over users, get friends, save user and friends
        for user_it, user in enumerate(users, 1):
            # Print progress (guarded so a value of 0 cannot divide by zero)
            if print_progress_every and user_it % print_progress_every == 0:
                print(".. Processing user {0} of {1}".format(user_it, user_count))

            # Check user private/deleted fields - don't requery if unreachable
            if user.get("deleted"):
                logger.info("User {0} deleted, skipping".format(user["id"]))
                continue
            if user.get("private"):
                logger.info("User {0} private, skipping".format(user["id"]))
                continue

            # Check requery. If false, and user has friend_ids, skip user
            known_friend_ids = user.get("friend_ids") or []
            if not requery and known_friend_ids:
                logger.debug("User {0} has friends, not re-querying".format(user["id"]))
                continue

            # Check friends count for threshold (missing/null count: query anyway)
            friends_count = user.get("friends_count")
            if friends_count is not None and friends_count > friends_threshold:
                logger.info("User {0} has friends {1} above threshold {2}, skipping".format(
                    user["id"], friends_count, friends_threshold))
                continue

            r, friend_ids = get_friends_ids(api, user["id"])
            if _check_return_set_user(r, user, seed_collection):
                logger.info("User {0} unreachable, skipping".format(user["id"]))
                continue

            if friend_ids is None:
                friend_request_failed_for.append(user["id"])
                continue

            # Merge previously stored and freshly fetched IDs, de-duplicated
            user["friend_ids"] = list(set(known_friend_ids) | set(friend_ids))

            # Save all friends as userdocs in friends collection
            _save_userdocs(friend_ids, friend_collection)

            # Optionally save "edge" documents
            if edge_collection is not None:
                _save_friend_edges(user["id"], friend_ids, edge_collection)

            # Update user doc's timestamps (same instant for both) and save
            now = datetime.now()
            user["updated_timestamp"] = now
            user["friends_updated"] = now
            seed_collection.save(user)
    finally:
        # The cursor is opened with no_cursor_timeout, so the server will not
        # reap it on its own if we bail out early - always close it.
        users.close()

    # Log failure numbers
    logger.info("Failed to find friends for {0} users".format(len(friend_request_failed_for)))


def populate_followers_from_collection(api, seed_collection, follower_collection, edge_collection=None,
    user_sample=1.0, followers_threshold=20000, update_threshold=None, requery=True,
    print_progress_every=1000):
    """
    Populates given 'follower_collection' with local user documents representing the
    followers of each user in given 'seed_collection'. Mirrors
    'populate_friends_from_collection', but for followers.
    The 'follower_ids' field of each processed seed user doc is updated with the union of
    its previous value and the IDs fetched from the twitter API, and the doc's
    'updated_timestamp' and 'followers_updated' fields are set to the current time.
    Users flagged 'deleted' or 'private' are skipped; users for which the API reports
    one of the codes handled by _check_return_set_user are flagged and skipped.
    Parameters:
        api     - fully authenticated Tweepy api or smappPy TweepyPool api
        seed_collection     - fully authenticated (read/write) mongo collection
        follower_collection - fully authenticated (read/write) mongo collection
        edge_collection     - [OPTIONAL] collection to store simple edge documents
        user_sample         - proportion of seed users to fetch followers for
        followers_threshold - If user has > followers_threshold, DO NOT QUERY for followers.
                              If user's doc does not contain 'followers_count', query anyway
        update_threshold    - Datetime threshold on users to update. Only queries followers of
                              users with 'followers_updated' field LT 'update_threshold'
        requery             - If False, only query for user's followers if 'follower_ids'
                              field is empty or missing
        print_progress_every - Print a progress line every Nth user (0/None disables it)
    Returns None.
    """
    # Ensure indexes. Compare with None: pymongo collections should not be
    # truth-tested.
    ensure_userdoc_indexes(seed_collection)
    ensure_userdoc_indexes(follower_collection)
    if edge_collection is not None:
        ensure_edge_indexes(edge_collection)

    # Create cursor over users (sample and date restriction possible)
    users = _get_user_sample(seed_collection, user_sample, "followers_updated", update_threshold)

    follower_request_failed_for = []
    try:
        user_count = users.count(with_limit_and_skip=True)
        logger.info("Considering total {0} users".format(user_count))

        # Iterate over users, get followers, save user and followers
        for user_it, user in enumerate(users, 1):
            # Print progress (guarded so a value of 0 cannot divide by zero)
            if print_progress_every and user_it % print_progress_every == 0:
                print(".. Processing user {0} of {1}".format(user_it, user_count))

            # Check user private/deleted fields - don't requery if unreachable
            if user.get("deleted"):
                logger.info("User {0} deleted, skipping".format(user["id"]))
                continue
            if user.get("private"):
                logger.info("User {0} private, skipping".format(user["id"]))
                continue

            # Check requery. If false, and user has follower_ids, skip user
            known_follower_ids = user.get("follower_ids") or []
            if not requery and known_follower_ids:
                logger.debug("User {0} has followers, not re-querying".format(user["id"]))
                continue

            # Check followers count for threshold (missing/null count: query anyway)
            followers_count = user.get("followers_count")
            if followers_count is not None and followers_count > followers_threshold:
                logger.info("User {0} has followers {1} above threshold {2}, skipping".format(
                    user["id"], followers_count, followers_threshold))
                continue

            r, follower_ids = get_followers_ids(api, user["id"])
            if _check_return_set_user(r, user, seed_collection):
                logger.info("User {0} unreachable, skipping".format(user["id"]))
                continue

            if follower_ids is None:
                follower_request_failed_for.append(user["id"])
                continue

            # Merge previously stored and freshly fetched IDs, de-duplicated
            user["follower_ids"] = list(set(known_follower_ids) | set(follower_ids))

            # Save all followers as userdocs in followers collection
            _save_userdocs(follower_ids, follower_collection)

            # Optionally save "edge" documents
            if edge_collection is not None:
                _save_follower_edges(user["id"], follower_ids, edge_collection)

            # Update user doc's timestamps (same instant for both) and save
            now = datetime.now()
            user["updated_timestamp"] = now
            user["followers_updated"] = now
            seed_collection.save(user)
    finally:
        # The cursor is opened with no_cursor_timeout, so the server will not
        # reap it on its own if we bail out early - always close it.
        users.close()

    # Log failure numbers
    logger.info("Failed to find followers for {0} users".format(len(follower_request_failed_for)))


def _get_user_sample(user_collection, user_sample, update_field, update_threshold):
    """
    Build a cursor over a sample of the userdocs in 'user_collection'.

    The cursor is limited to int(total document count * user_sample) documents and is
    opened with no_cursor_timeout=True, so the caller is responsible for exhausting
    or closing it.
    Parameters:
        user_collection  - mongo collection of userdocs
        user_sample      - proportion of the collection to return
        update_field     - name of the timestamp field to filter on
        update_threshold - if truthy, only return docs whose 'update_field' is less than
                           this value or is of BSON type null; if falsy, no filter
    Returns the pymongo cursor.
    """
    # NOTE(review): the limit is derived from the size of the whole collection, not
    # from the number of docs matching the filter. Also, a computed limit of 0 is
    # passed straight to find(), where 0 means "no limit" - confirm that is intended.
    sample_size = int(user_collection.count() * user_sample)

    if not update_threshold:
        return user_collection.find(limit=sample_size, no_cursor_timeout=True)

    stale_or_never_updated = {"$or": [
        {update_field: {"$lt": update_threshold}},
        {update_field: {"$type": BSON_NULL}},
    ]}
    return user_collection.find(stale_or_never_updated,
                                limit=sample_size,
                                no_cursor_timeout=True)

def _check_return_set_user(ret_val, user_doc, collection):
    """
    Inspect the return code of a get_friends/followers call and flag unreachable users.

    For return code 34 the userdoc's 'deleted' field is set to True; for 179 its
    'private' field is set to True. In both cases the doc is saved to 'collection'
    and True is returned. Any other return code leaves the doc untouched and
    returns False.
    """
    # (return code, userdoc flag to set, log message template)
    unreachable_cases = (
        (34, "deleted", "User {0} no longer exists, updating userdoc"),
        (179, "private", "User {0} has private account, updating userdoc"),
    )

    for code, flag_field, message in unreachable_cases:
        if ret_val == code:
            logger.info(message.format(
                user_doc["id"]))
            user_doc[flag_field] = True
            collection.save(user_doc)
            return True

    return False


def _save_userdocs(user_ids, collection):
    """
    Given a list of user IDs, save userdocs built from IDs to given collection.

    Best-effort: a DuplicateKeyError is logged as a warning, any other exception is
    logged as an error, and processing continues with the next ID either way.
    """
    for uid in user_ids:
        user_doc = create_userdoc(uid)
        try:
            collection.save(user_doc)
        except DuplicateKeyError:
            # logger.warning, not the deprecated logger.warn alias
            logger.warning("User {0} already in collection {1}".format(uid, collection.full_name))
        except Exception as e:
            # Deliberately broad: one bad document must not abort the whole batch
            logger.error("Storing User {0} in DB {1} failed".format(uid, collection.full_name))
            logger.error("Exception: {0}".format(e))

def _save_friend_edges(seed_id, friend_ids, collection):
    """
    Given the seed user and a list of friends, save all 'edges' to collection.

    One edge doc is built per friend via create_edge_doc(seed_id, friend_id); note the
    argument order is the reverse of the one used in _save_follower_edges.
    Best-effort: a DuplicateKeyError is logged as a warning, any other exception is
    logged as an error, and processing continues with the next friend either way.
    """
    for fid in friend_ids:
        edge_doc = create_edge_doc(seed_id, fid)
        try:
            # Write to the collection passed in. This used to reference a global
            # 'edge_collection', which only exists when the file is run as a script.
            collection.save(edge_doc)
        except DuplicateKeyError:
            logger.warning("Edge {0} alread in DB {1}, skipping".format(edge_doc, collection.full_name))
        except Exception as e:
            # Deliberately broad: one bad edge must not abort the whole batch
            logger.error("Storing Edge {0} in DB {1} failed".format(edge_doc, collection.full_name))
            logger.error("Exception: {0}".format(e))

def _save_follower_edges(seed_id, follower_ids, collection):
    """
    Given the seed user and a list of followers, save all 'edges' to collection.

    One edge doc is built per follower via create_edge_doc(follower_id, seed_id); note
    the argument order is the reverse of the one used in _save_friend_edges.
    Best-effort: a DuplicateKeyError is logged as a warning, any other exception is
    logged as an error, and processing continues with the next follower either way.
    """
    for fid in follower_ids:
        edge_doc = create_edge_doc(fid, seed_id)
        try:
            # Write to the collection passed in. This used to reference a global
            # 'edge_collection', which only exists when the file is run as a script.
            collection.save(edge_doc)
        except DuplicateKeyError:
            logger.warning("Edge {0} alread in DB {1}, skipping".format(edge_doc, collection.full_name))
        except Exception as e:
            # Deliberately broad: one bad edge must not abort the whole batch
            logger.error("Storing Edge {0} in DB {1} failed".format(edge_doc, collection.full_name))
            logger.error("Exception: {0}".format(e))


if __name__ == "__main__":
    # Command-line entry point: parse options, set up file logging, connect to
    # twitter and MongoDB, then populate friends and/or followers for the seed
    # collection depending on which target collections were given.
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--server", default="smapp.politics.fas.nyu.edu",
        help="Database server host [smapp.politics.fas.nyu.edu]")
    parser.add_argument("-p", "--port", type=int, default=27011,
        help="Database server port [27011]")
    parser.add_argument("-u", "--user", dest="user", default="smapp_readWrite",
        help="Database username [smapp_readWrite]")
    parser.add_argument("-w", "--password", default=None,
        help="Database password [None]")
    parser.add_argument("-d", "--database", required=True,
        help="Database to store data in")
    parser.add_argument("-sc", "--seed_collection", required=True,
        help="Collection from which seed users are taken")
    parser.add_argument("-frc", "--friends_collection", default=None,
        help="Collection in which to store friends [None]")
    parser.add_argument("-foc", "--followers_collection", default=None,
        help="Collection in which to store followers [None]")
    parser.add_argument("-ec", "--edge_collection", default=None,
        help="Collection in which to store network edges [None]")
    parser.add_argument("-o", "--oauthsfile", required=True,
        help="Twitter oauths file. JSON file w/ LIST of app documents")
    # Adjacent string literals are joined verbatim, so each fragment below ends
    # with a space to keep words from running together in the --help output.
    parser.add_argument("-rq", "--requery", action="store_true", default=False,
        help="Whether to query Twitter for frs/fols of users that already have "
        "frs/fols [False]")
    parser.add_argument("-ppe", "--print_progress_every", type=int,
        default=1000, help="Print progress every Nth user [1000]")
    parser.add_argument("-frt", "--friends_threshold", type=int, default=20000,
        help="Threshold. Do not query for friends of users with more friends than "
        "this number [20000]")
    parser.add_argument("-fot", "--followers_threshold", type=int, default=20000,
        help="Threshold. Do not query for followers of users with more followers than "
        "this number [20000]")
    parser.add_argument("-ut", "--update_threshold", type=int, nargs=5, default=None,
        help="If present, only users with friends/followers_updated timestamp BEFORE "
        "given value will be updated. Format is five numbers, space-separated: "
        "Year Month Day Hour Minute. EG: 2014 3 15 12 0. (Time in 24-hour format) "
        "[None]")
    args = parser.parse_args()
    # Turn the five integers (Y M D h m) into a datetime, or leave as None
    args.update_threshold = datetime(*args.update_threshold) if args.update_threshold else None

    # Set up logging: <database>.<seed_collection>[.Friends][.Followers].log
    logfile = "{0}.{1}".format(args.database, args.seed_collection)
    logfile += ".Friends" if args.friends_collection else ""
    logfile += ".Followers" if args.followers_collection else ""
    logfile += ".log"

    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    fm = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s",
                           datefmt="%m/%d/%Y %H:%M:%S")
    fh.setFormatter(fm)
    logger.addHandler(fh)

    logger.info("Friend/Follower collection started on {0}.{1}".format(args.database,
        args.seed_collection))
    # NOTE(review): this writes the parsed arguments, including any database
    # password given with -w, to the log file - confirm that is acceptable.
    logger.info("Passed arguments: {0}".format(args))

    # Set up TweepyPool API
    logger.debug("Loading twitter OAUTHs from {0}".format(args.oauthsfile))
    api = APIPool(oauths_filename=args.oauthsfile, debug=True)

    # Set up DB connection (authenticate only when both user and password are given)
    logger.debug("Connecting to MongoDB")
    client = MongoClient(args.server, args.port)
    database = client[args.database]
    if args.user and args.password:
        database.authenticate(args.user, args.password)
    seed_collection = database[args.seed_collection]
    edge_collection = database[args.edge_collection] if args.edge_collection else None

    # Attempt friends
    if args.friends_collection:
        logger.info("Populating Friends from {0}".format(seed_collection.full_name))
        friend_collection = database[args.friends_collection]
        populate_friends_from_collection(api, seed_collection, friend_collection,
            edge_collection=edge_collection, user_sample=1.0, requery=args.requery,
            friends_threshold=args.friends_threshold,
            update_threshold=args.update_threshold,
            print_progress_every=args.print_progress_every)
        logger.info("Friends complete")

    # Attempt followers
    if args.followers_collection:
        logger.info("Populating Followers from {0}".format(seed_collection.full_name))
        follower_collection = database[args.followers_collection]
        populate_followers_from_collection(api, seed_collection, follower_collection,
            edge_collection=edge_collection, user_sample=1.0, requery=args.requery,
            followers_threshold=args.followers_threshold,
            update_threshold=args.update_threshold,
            print_progress_every=args.print_progress_every)
        logger.info("Followers complete")