# Python source code of the app

# -*- coding: utf-8 -*-
##
# Copyright (c) 2017-2020 Ivan Semkin.
#
# This file is part of VK-Scraper
# (see https://github.com/vanyasem/VK-Scraper).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
##

import argparse
import concurrent.futures
import errno
import glob
import logging
import os
import re
import sys
import textwrap
import time

import requests
import tqdm
import vk_api
import youtube_dl


class VkScraper(object):
    """VkScraper scrapes and downloads an VK user's photos, saved pictures, videos, and stories"""

    def __init__(self, **kwargs):
        self.maximum = None
        self.media_types = None
        self.latest = None
        self.retain_username = None
        self.destination = None
        self.quiet = None
        self.usernames = None
        self.login_pass = None
        self.login_user = None

        default_attr = dict(username='', usernames=[], filename=None,
                            login_user=None, login_pass=None,
                            destination='./', retain_username=False,
                            quiet=False, maximum=0,
                            latest=False,
                            media_types=['image'],
                            verbose=0,
                            )

        allowed_attr = list(default_attr.keys())
        default_attr.update(kwargs)

        for key in default_attr:
            if key in allowed_attr:
                self.__dict__[key] = kwargs.get(key)

        # Set up a logger
        self.logger = VkScraper.get_logger(level=logging.DEBUG,
                                           verbose=default_attr.get('verbose'))

        self.session = requests.Session()

        self.logged_in = False
        self.vk = None
        self.vk_session = None
        self.tools = None
        self.last_scraped_file_time = 0

    def login(self):
        """Authenticate against VK with the configured credentials.

        On success ``logged_in`` is set and the API handle (``vk``) and the
        ``VkTools`` helper (``tools``) are created. On an authentication error
        the error is printed and logged and those attributes stay untouched.
        """
        session = vk_api.VkApi(
            self.login_user, self.login_pass,
            auth_handler=self.two_factor_handler,
            captcha_handler=self.captcha_handler,
            app_id=6036185,
            api_version='5.101',
        )
        self.vk_session = session

        try:
            session.auth()
        except vk_api.AuthError as auth_error:
            print(auth_error)
            self.logger.error('Login failed for ' + self.login_user)
        else:
            self.logged_in = True
            self.vk = session.get_api()
            self.tools = vk_api.VkTools(session)

    @staticmethod
    def two_factor_handler():
        """Ask the user for a two-factor authentication code.

        Returns a ``(code, remember_device)`` tuple; ``remember_device`` is
        always True.
        """
        return input("Enter authentication code: "), True

    @staticmethod
    def captcha_handler(captcha):
        """Show the captcha URL, read the answer and retry the request with it.

        The answer is stripped of surrounding whitespace before it is passed to
        ``captcha.try_again``; that call's result is returned.
        """
        prompt = "Enter captcha code {0}: ".format(captcha.get_url())
        answer = input(prompt)
        return captcha.try_again(answer.strip())

    @staticmethod
    def get_logger(level=logging.DEBUG, verbose=0):
        """Returns a logger"""
        logger = logging.getLogger(__name__)

        fh = logging.FileHandler('vk-scraper.log', 'w')
        fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        fh.setLevel(level)
        logger.addHandler(fh)

        sh = logging.StreamHandler(sys.stdout)
        sh.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
        sh_levels = [logging.ERROR, logging.WARNING, logging.INFO]
        sh.setLevel(sh_levels[verbose])
        logger.addHandler(sh)

        return logger

    @staticmethod
    def parse_file_usernames(usernames_file):
        """Parses a file containing a list of usernames"""
        users = []

        try:
            with open(usernames_file) as user_file:
                for line in user_file.readlines():
                    # Find all usernames delimited by ,; or whitespace
                    users += re.findall(r'[^,;\s]+', line.split("#")[0])
        except IOError as err:
            raise ValueError('File not found ' + err)
        return users

    @staticmethod
    def parse_delimited_str(inp):
        """Parse the string input as a list of delimited tokens"""
        return re.findall(r'[^,;\s]+', inp)

    def scrape(self, executor=concurrent.futures.ThreadPoolExecutor(max_workers=10)):
        """Crawls through and downloads user's media"""
        if self.login_user and self.login_pass:
            self.login()
            if not self.logged_in:
                self.logger.warning('Fallback anonymous scraping disabled')
                return

        for username in self.usernames:
            self.last_scraped_file_time = 0
            future_to_item = {}

            dst = self.make_dst_dir(username)

            # Get the user metadata.
            try:
                user_id = self.check_user(username)
            except:
                print('Error getting user details for {0}'.format(username))
                continue

            if user_id:
                self.get_photos(dst, executor, future_to_item, user_id)
                self.get_saved(dst, executor, future_to_item, user_id)
                self.get_videos(dst, executor, future_to_item, user_id)
                self.get_stories(dst, executor, future_to_item, user_id)

            # Displays the progress bar of completed downloads. Might not even pop up if all media is downloaded while
            # the above loop finishes.
            if future_to_item:
                for future in tqdm.tqdm(concurrent.futures.as_completed(future_to_item),
                                        total=len(future_to_item), desc='Downloading', disable=self.quiet):
                    item = future_to_item[future]

                    if future.exception() is not None:
                        self.logger.warning(
                            '\nMedia with ID {0} generated an exception: {1}'.format(item['id'], future.exception()))


    def make_dst_dir(self, username):
        """Creates the destination directory."""
        if self.destination == './':
            destination = './' + username
        else:
            if self.retain_username:
                destination = self.destination + '/' + username
            else:
                destination = self.destination

        try:
            os.makedirs(destination)
        except OSError as err:
            if err.errno == errno.EEXIST and os.path.isdir(destination):
                # Directory already exists
                self.get_last_scraped_file_time(destination)
                pass
            else:
                # Target dir exists as a file, or a different error
                raise

        return destination

    def get_last_scraped_file_time(self, dst):
        """Stores the last modified time of newest file in a directory."""
        list_of_files = []
        file_types = ('*.jpg', '*.mp4')

        for file_type in file_types:
            list_of_files.extend(glob.glob(dst + '/' + file_type))

        if list_of_files:
            latest_file = max(list_of_files, key=os.path.getmtime)
            self.last_scraped_file_time = int(os.path.getmtime(latest_file))

    def check_user(self, username):
        """Checks whether a user or community exists"""
        try:
            response = self.vk.users.get(user_ids=username)
            if response:
                try:
                    return response[0]['id']
                except:
                    raise ValueError('User {0} does not exist'.format(username))
        except vk_api.exceptions.ApiError:
            response = self.vk.groups.getById(group_id=-int(username))
            if response:
                try:
                    return -response[0]['id']
                except:
                    raise ValueError('Community {0} does not exist'.format(username))
            else:
                raise ValueError('Community {0} does not exist'.format(username))

    def is_new_media(self, item):
        """Returns True if the media is new"""
        return self.latest is False or self.last_scraped_file_time == 0 or \
            ('date' not in item) or item.get('date') > self.last_scraped_file_time

    @staticmethod
    def determine_max_media_res(item, save_dir):
        if 'duration' in item:  # Video
            return VkScraper.determine_max_video_res(item, save_dir)
        elif 'video' in item and 'duration' in item['video']:  # Video story
            return VkScraper.determine_max_video_res(item['video'], save_dir)
        elif 'sizes' in item:  # Photo
            return VkScraper.determine_max_photo_res(item)
        elif 'photo' in item and 'sizes' in item['photo']:  # Photo story
            return VkScraper.determine_max_photo_res(item['photo'])

    class VideoLogger(object):
        """Logger object passed to youtube-dl: drops debug and warning output, prints errors."""

        def debug(self, msg):
            """Ignore debug messages."""

        def warning(self, msg):
            """Ignore warnings."""

        def error(self, msg):
            """Print the error message, preceded by a line break."""
            print('\n' + msg)

    @staticmethod
    def determine_max_video_res(item, save_dir):
        if 'files' in item:
            if 'mp4_1080' in item['files']:
                return item['files']['mp4_1080']
            if 'mp4_720' in item['files']:
                return item['files']['mp4_720']
            if 'mp4_480' in item['files']:
                return item['files']['mp4_480']
            if 'mp4_360' in item['files']:
                return item['files']['mp4_360']
            if 'mp4_240' in item['files']:
                return item['files']['mp4_240']
        elif 'player' in item:  # TODO: parse VK videos here to download user-owned private files
            ydl_opts = {
                'outtmpl': save_dir + '/%(title)s.%(ext)s',
                'noplaylist': True,
                'logger': VkScraper.VideoLogger(),
            }
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([item['player']])

    @staticmethod
    def determine_max_photo_res(item):
        sizes = []
        for size in item['sizes']:
            sizes.append(size['type'])
        if 'w' in sizes:
            return item['sizes'][sizes.index('w')]['url']
        elif 'z' in sizes:
            return item['sizes'][sizes.index('z')]['url']
        elif 'y' in sizes:
            return item['sizes'][sizes.index('y')]['url']
        elif 'x' in sizes:
            return item['sizes'][sizes.index('x')]['url']
        elif 'm' in sizes:
            return item['sizes'][sizes.index('m')]['url']
        elif 's' in sizes:
            return item['sizes'][sizes.index('s')]['url']

    def download(self, item, save_dir='./'):
        """Downloads the media file"""
        url = self.determine_max_media_res(item, save_dir)
        base_name = url.split('/')[-1].split('?')[0]
        file_path = os.path.join(save_dir, base_name)

        if not os.path.isfile(file_path):
            with open(file_path, 'wb') as media_file:
                try:
                    content = self.session.get(url).content
                except requests.exceptions.ConnectionError:
                    time.sleep(5)
                    content = self.session.get(url).content

                media_file.write(content)

            file_time = item.get('date', time.time())
            os.utime(file_path, (file_time, file_time))

    def photos_gen(self, user_id):
        """Generator of all user's photos"""
        try:
            photos = self.tools.get_all('photos.getAll', 200, {'owner_id': user_id})

            for item in photos['items']:
                yield item
        except ValueError:
            self.logger.exception('Failed to get photos for ' + user_id)

    def get_photos(self, dst, executor, future_to_item, username):
        """Scrapes the user's albums for photos"""
        if 'image' not in self.media_types:
            return

        iterator = 0
        for item in tqdm.tqdm(self.photos_gen(username), desc='Searching {0} for photos'.format(username),
                              unit=' photos', disable=self.quiet):
            if self.is_new_media(item):
                future = executor.submit(self.download, item, dst)
                future_to_item[future] = item

            iterator += 1
            if self.maximum != 0 and iterator >= self.maximum:
                break

    def saved_gen(self, user_id):
        """Generator of all user's saved pictures"""
        try:
            photos = self.tools.get_all('photos.get', 200, {'owner_id': user_id, 'album_id': 'saved'})

            for item in photos['items']:
                yield item
        except ValueError:
            self.logger.exception('Failed to get saved pictures for ' + user_id)

    def get_saved(self, dst, executor, future_to_item, username):
        """Scrapes the user's saved pictures for photos"""
        if 'saved' not in self.media_types:
            return

        iterator = 0
        for item in tqdm.tqdm(self.saved_gen(username), desc='Searching {0} for saved pictures'.format(username),
                              unit=' pictures', disable=self.quiet):
            if self.is_new_media(item):
                future = executor.submit(self.download, item, dst)
                future_to_item[future] = item

            iterator += 1
            if self.maximum != 0 and iterator >= self.maximum:
                break

    def videos_gen(self, user_id):
        """Generator of all user's videos"""
        try:
            videos = self.tools.get_all('video.get', 200, {'owner_id': user_id})

            for item in videos['items']:
                if item['owner_id'] == user_id:
                    yield item

        except ValueError:
            self.logger.exception('Failed to get videos for ' + user_id)

    def get_videos(self, dst, executor, future_to_item, username):
        """Scrapes the user's videos"""
        if 'video' not in self.media_types:
            return

        iterator = 0
        for item in tqdm.tqdm(self.videos_gen(username), desc='Searching {0} for videos'.format(username),
                              unit=' videos', disable=self.quiet):
            if self.is_new_media(item):
                future = executor.submit(self.download, item, dst)
                future_to_item[future] = item

            iterator += 1
            if self.maximum != 0 and iterator >= self.maximum:
                break

    def stories_gen(self, user_id):
        """Generator of user's stories"""
        try:
            stories = self.tools.get_all('stories.get', 200, {'owner_id': user_id})

            for item in stories['items'][0]:
                yield item
        except ValueError:
            self.logger.exception('Failed to get stories for ' + user_id)

    def get_stories(self, dst, executor, future_to_item, username):
        """Scrapes the user's stories"""
        if 'story' not in self.media_types:
            return

        iterator = 0
        for item in tqdm.tqdm(self.stories_gen(username), desc='Searching {0} for stories'.format(username),
                              unit=' stories', disable=self.quiet):
            if self.is_new_media(item):
                future = executor.submit(self.download, item, dst)
                future_to_item[future] = item

            iterator += 1
            if self.maximum != 0 and iterator >= self.maximum:
                break


def main():
    """Command-line entry point.

    Parses the arguments, checks that credentials and exactly one source of
    usernames (positional arguments or a file) were given, normalises the
    username and media-type lists and runs the scraper.

    :raises ValueError: if credentials are missing, or if usernames were given
        neither or both ways.
    """
    usage_notes = textwrap.dedent("""
            You can hide your credentials from the history, by reading your
            username from a local file:
            $ vk-scraper @vk_args.txt user_to_scrape
            with vk_args.txt looking like this:
            -u=my_username
            -p=my_password
            You can add all arguments you want to that file, just remember to have
            one argument per line.
            """)
    parser = argparse.ArgumentParser(
        description="VK-Scraper scrapes and downloads an VK user's photos and videos.",
        epilog=usage_notes,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        fromfile_prefix_chars='@')

    add_option = parser.add_argument
    add_option('username', help='VK user(s) to scrape', nargs='*')
    add_option('--destination', '-d', default='./', help='Download destination')
    add_option('--login-user', '--login_user', '-u', default=None, help='VK username')
    add_option('--login-pass', '--login_pass', '-p', default=None, help='VK password')
    add_option('--filename', '-f', help='Path to a file containing a list of users to scrape')
    add_option('--quiet', '-q', default=False, action='store_true', help='Be quiet while scraping')
    add_option('--maximum', '-m', type=int, default=0, help='Maximum number of items to scrape')
    add_option('--retain-username', '--retain_username', '-n', action='store_true', default=False,
               help='Creates username subdirectory when destination flag is set')
    add_option('--media-types', '--media_types', '-t', nargs='+',
               default=['image'],
               help='Specify media types to scrape')
    add_option('--latest', action='store_true', default=False, help='Scrape new media since the last scrape')
    add_option('--verbose', '-v', type=int, default=0, help='Logging verbosity level')

    args = parser.parse_args()

    if args.login_user is None or args.login_pass is None:
        parser.print_help()
        raise ValueError('You must provide both username and password')

    # Exactly one source of usernames is allowed.
    if not args.username and args.filename is None:
        parser.print_help()
        raise ValueError('You must either provide username(s) or a file containing username(s)')
    if args.username and args.filename:
        parser.print_help()
        raise ValueError('You must either provide username(s) OR a file containing username(s)')

    args.usernames = (VkScraper.parse_file_usernames(args.filename)
                      if args.filename
                      else VkScraper.parse_delimited_str(','.join(args.username)))

    # A single media-types value may itself be a delimited list, e.g. "image,video".
    requested = args.media_types
    if requested and len(requested) == 1 and re.search(r'[,;\s]+', requested[0]):
        args.media_types = VkScraper.parse_delimited_str(requested[0])

    VkScraper(**vars(args)).scrape()


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()