# Python source code of urlextract

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# Copyright (C) 2015-2016 Zhuyifei1999
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>
#

"""video2commons url extracter."""

from collections import OrderedDict
import re

import emoji
import guess_language
import pywikibot
import youtube_dl

# Shared pywikibot site object, created once when the module is imported.
# do_validate_filedesc() uses it to issue its API requests.
# NOTE(review): presumably this resolves to the wiki configured in the
# pywikibot user config -- confirm against the deployment.
SITE = pywikibot.Site()

# File extensions are probably alphanumeric with 0 to 4 chars.
# Because zero characters are allowed, an empty suffix (a name ending in
# ".") also counts as an extension.
RE_EXTENSION = re.compile(r'^[a-z0-9]{0,4}$', re.IGNORECASE)

# Wikitext placed in the license section when no license can be determined.
DEFAULT_LICENSE = '{{subst:nld|<!--replace this template with the license-->}}'
# Wikitext skeleton of a file description page.  It is filled in with the
# %-operator and a mapping that provides the keys desc, date, source,
# uploader and license; the users in this module strip the surrounding
# blank lines from the result.
FILEDESC_TEMPLATE = """
=={{int:filedesc}}==
{{Information
|description=%(desc)s
|date=%(date)s
|source=%(source)s
|author=%(uploader)s
|permission=
|other_versions=
|other_fields=
}}

=={{int:license-header}}==
%(license)s
{{LicenseReview}}

[[Category:Uploaded with video2commons]]
"""


def make_dummy_desc(filename):
    """Build a placeholder description for a directly uploaded file.

    The description template is filled with empty fields and the default
    license.  The suggested title is ``filename`` without its extension,
    passed through ``sanitize``.

    Returns a dict with the keys ``extractor`` (always ``'(uploads)'``),
    ``filedesc`` and ``filename``.
    """
    blank_fields = {
        'desc': '',
        'date': '',
        'source': '',
        'uploader': '',
        'license': DEFAULT_LICENSE
    }
    description = FILEDESC_TEMPLATE % blank_fields

    # Drop the part after the last dot only when it looks like a file
    # extension; otherwise the dot belongs to the title and is kept.
    stem, dot, suffix = filename.rpartition('.')
    if dot and RE_EXTENSION.match(suffix):
        title = stem
    else:
        title = filename

    return {
        'extractor': '(uploads)',
        'filedesc': description.strip(),
        'filename': sanitize(title)
    }


def do_extract_url(url):
    """Extract a video url.

    Queries youtube_dl for the metadata of ``url`` (nothing is downloaded)
    and turns it into a file description and a suggested file name.

    Returns a dict with the keys ``url`` (the webpage url reported by
    youtube_dl, falling back to the given one), ``extractor``,
    ``filedesc`` and ``filename``.

    Raises AssertionError if the metadata contains neither a ``formats``
    entry nor a truthy ``direct`` flag.
    """
    params = {
        'format': 'bestvideo+bestaudio/best',
        'outtmpl': '/dev/null',
        'writedescription': True,
        'writeinfojson': True,
        'writesubtitles': False,
        'subtitlesformat': 'srt/ass/vtt/best',
        'cachedir': '/tmp/',
        'noplaylist': True,  # not implemented in video2commons
    }
    info = youtube_dl.YoutubeDL(params).extract_info(url, download=False)

    # Raised explicitly instead of using an ``assert`` statement so that the
    # check is not stripped under ``python -O``.  The exception type is kept
    # so existing callers keep working.
    if not ('formats' in info or info.get('direct')):
        raise AssertionError('Your url cannot be processed correctly')

    ie_key = info['extractor_key']
    title = (info.get('title') or '').strip()
    url = info.get('webpage_url') or url

    filedesc = FILEDESC_TEMPLATE % {
        'desc': _desc(url, ie_key, title, info),
        'date': _date(url, ie_key, title, info),
        'source': _source(url, ie_key, title, info),
        'uploader': _uploader(url, ie_key, title, info),
        'license': _license(url, ie_key, title, info)
    }

    return {
        'url': url,
        'extractor': ie_key,
        'filedesc': filedesc.strip(),
        'filename': sanitize(title)
    }


def _date(url, ie_key, title, info):
    """Return the upload date for the description template.

    An eight-digit ``upload_date`` is reformatted as ``YYYY-MM-DD``.  Any
    other value is returned stripped but otherwise unchanged, and a missing
    or empty value gives an empty string.  Only ``info`` is used.
    """
    raw = info.get('upload_date') or ''
    raw = raw.strip()
    if not re.match(r'^[0-9]{8}$', raw):
        return raw
    year, month, day = raw[:4], raw[4:6], raw[6:]
    return '%s-%s-%s' % (year, month, day)


def _source(url, ie_key, title, info):
    """Return the wikitext for the source field.

    YouTube and Vimeo videos with a video id use their dedicated source
    templates.  The generic extractor yields the bare url.  Everything else
    becomes an external link labelled with the title and the extractor key.
    """
    site_templates = {
        'Youtube': '{{From YouTube|1=%(id)s|2=%(title)s}}',
        'Vimeo': '{{From Vimeo|1=%(id)s|2=%(title)s}}',
    }
    if info['id'] and ie_key in site_templates:
        return site_templates[ie_key] % \
            {'id': info['id'], 'title': escape_wikitext(title)}

    if ie_key == 'Generic':
        return url

    extractor_name = info['extractor']
    if ':' in extractor_name:
        # Try to find the name of the 'owner' of this sub-ie
        owner_name = extractor_name.split(':', 1)[0]
        for candidate in youtube_dl.gen_extractors():
            if candidate.IE_NAME == owner_name:
                ie_key = candidate.ie_key()
                break

    return '[%(url)s %(title)s - %(extractor)s]' % \
        {'url': url, 'title': escape_wikitext(title), 'extractor': ie_key}


def _desc(url, ie_key, title, info):
    """Return the escaped text for the description field.

    The stripped ``description`` from ``info`` is used, falling back to
    ``title`` when it is missing or empty.  When the unescaped text is
    longer than 100 characters, its language is guessed, and unless the
    guess is ``'UNKNOWN'`` the text is wrapped in the matching language
    template.
    """
    raw_text = (info.get('description') or '').strip() or title
    escaped = escape_wikitext(raw_text)

    # Short texts are returned as-is without a language guess.
    if len(raw_text) <= 100:
        return escaped

    language = guess_language.guessLanguage(raw_text)
    if language == 'UNKNOWN':
        return escaped
    return u'{{' + language + u'|1=' + escaped + u'}}'


def _uploader(url, ie_key, title, info):
    """Return the wikitext for the author field.

    The result is the escaped uploader name.  When ``info`` carries an
    ``uploader_url`` the name is turned into an external link to it.
    """
    name = escape_wikitext((info.get('uploader') or '').strip())
    link = info.get('uploader_url') or ''
    if not link:
        return name

    if ie_key == 'Youtube':
        # HACK: YouTube outputs http:// atm (issue #80)
        link = link.replace('http://', 'https://')
    return u'[%s %s]' % (link, name)


def _license(url, ie_key, title, info):
    """Return the license wikitext for the license section.

    The ``license`` value reported for YouTube, Flickr and Vimeo is mapped
    to the corresponding license template.  Where the template takes one,
    the escaped uploader name is appended as a parameter.  Any other
    extractor or license value yields ``DEFAULT_LICENSE``.
    """
    author = info.get('uploader')
    if author:
        author_arg = '|' + escape_wikitext(author.strip())
    else:
        author_arg = ''

    fallback = DEFAULT_LICENSE

    if ie_key == 'Youtube':
        cc_by_label = 'Creative Commons Attribution license (reuse allowed)'
        if info.get('license') == cc_by_label:
            return '{{YouTube CC-BY%s}}' % author_arg
        return fallback

    if ie_key == 'Flickr':
        templates = {
            'Attribution':
                '{{cc-by-2.0%s}}' % author_arg,
            'Attribution-ShareAlike':
                '{{cc-by-sa-2.0%s}}' % author_arg,
            'No known copyright restrictions':
                '{{Flickr-no known copyright restrictions}}',
            'United States government work':
                '{{PD-USGov}}',
            'Public Domain Dedication (CC0)':
                '{{cc-zero}}',
            'Public Domain Work':
                '{{safesubst:Flickr-public domain mark/subst}}',
            'Public Domain Mark':
                '{{safesubst:Flickr-public domain mark/subst}}',
        }
    elif ie_key == 'Vimeo':
        templates = {
            'by':
                '{{cc-by-3.0%s}}' % author_arg,
            'by-sa':
                '{{cc-by-sa-3.0%s}}' % author_arg,
            'cc0':
                '{{cc-zero}}',
        }
    else:
        return fallback

    return templates.get(info.get('license'), fallback)


def escape_wikitext(wikitext):
    """Escape wikitext for use in file description.

    Pipes, braces and double square brackets are replaced so that they are
    no longer interpreted as wiki markup.  All other text is returned
    unchanged.
    """
    # Order matters: the regex alternation tries the entries left to right,
    # so multi-character tokens must come before their single-character
    # counterparts ('{{' before '{', '||' before '|', ...).  The pattern is
    # built straight from this list; going through a plain dict would lose
    # the order on interpreters whose dicts are unordered.
    replacements = [
        ('{|', '{{(}}&#124;'),
        ('|}', '&#124;{{)}}'),
        ('||', '&#124;&#124;'),
        ('|', '&#124;'),
        ('[[', '{{!((}}'),
        (']]', '{{))!}}'),
        ('{{', '{{((}}'),
        ('}}', '{{))}}'),
        ('{', '{{(}}'),
        ('}', '{{)}}'),
    ]
    lookup = dict(replacements)
    pattern = re.compile(
        '|'.join(re.escape(token) for token, _ in replacements))
    return pattern.sub(lambda match: lookup[match.group(0)], wikitext)


# Source: mediawiki.Title.js@9df363d
#
# Ordered list of filename rules.  Each rule is a dict holding a compiled
# regex under 'pattern' and its substitution under 'replace'.  sanitize()
# applies the rules one after another in list order, so later rules see the
# output of earlier ones.  do_validate_filename() instead searches every
# pattern in the unmodified filename.
#
# The ur'' literals are Python 2 syntax; in such raw unicode literals the
# \uXXXX escapes are still interpreted by the parser.
sanitationRules = [
    # issue #101
    # Emoji, as matched by the regex from the emoji package, are removed.
    {
        'pattern': emoji.get_emoji_regexp(),
        'replace': ''
    },
    # "signature"
    # Three consecutive tildes are removed.
    {
        'pattern': re.compile(ur'~{3}'),
        'replace': ''
    },
    # Space, underscore, tab, NBSP and other unusual spaces
    # A whole run of them collapses into one plain space.
    {
        'pattern': re.compile(ur'[ _\u0009\u00A0\u1680\u180E\u2000-\u200A'
                              ur'\u2028\u2029\u202F\u205F\u3000\s]+'),
        'replace': ' '
    },
    # issue #96
    # U+200B is removed.
    {
        'pattern': re.compile(ur'\u200B'),
        'replace': ''
    },
    # unicode bidi override characters: Implicit, Embeds, Overrides
    {
        'pattern': re.compile(ur'[\u200E\u200F\u202A-\u202E]'),
        'replace': ''
    },
    # control characters
    {
        'pattern': re.compile(ur'[\x00-\x1f\x7f]'),
        'replace': ''
    },
    # URL encoding (possibly)
    # A space is inserted after '%' when two hex digits follow it.
    {
        'pattern': re.compile(ur'%([0-9A-Fa-f]{2})'),
        'replace': r'% \1'
    },
    # HTML-character-entities
    # A space is inserted after '&' when a named or numeric entity follows.
    {
        'pattern': re.compile(ur'&(([A-Za-z0-9\x80-\xff]+|'
                              ur'#[0-9]+|#x[0-9A-Fa-f]+);)'),
        'replace': r'& \1'
    },
    # slash, colon (not supported by file systems like NTFS/Windows,
    # Mac OS 9 [:], ext4 [/])
    # The character class also covers '#'.
    {
        'pattern': re.compile(ur'[:/#]'),
        'replace': '-'
    },
    # brackets, greater than
    {
        'pattern': re.compile(ur'[\]\}>]'),
        'replace': ')'
    },
    # brackets, lower than
    {
        'pattern': re.compile(ur'[\[\{<]'),
        'replace': '('
    },
    # directory structures
    # The pattern is anchored at both ends, so it only matches when the
    # whole name has one of these forms; the name is then emptied.
    {
        'pattern': re.compile(ur'^(\.|\.\.|\./.*|\.\./.*|.*/\./.*|'
                              ur'.*/\.\./.*|.*/\.|.*/\.\.)$'),
        'replace': ''
    },
    # everything that wasn't covered yet
    # Several of these characters ('#', ':', '/' and the control range) are
    # already handled by earlier rules when the list is applied in order.
    {
        'pattern': re.compile(ur'[|#+?:/\\\u0000-\u001f\u007f]'),
        'replace': '-'
    },
    # titleblacklist-custom-double-apostrophe
    # Two or more consecutive apostrophes become one double quote.
    {
        'pattern': re.compile(ur"'{2,}"),
        'replace': '"'
    },
]


def sanitize(filename):
    """Sanitize a filename for uploading.

    Every rule in ``sanitationRules`` is applied in list order, each one
    working on the output of the previous rule.  Returns the cleaned name.
    """
    cleaned = filename
    for entry in sanitationRules:
        pattern, replacement = entry['pattern'], entry['replace']
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned


def do_validate_filename(filename):
    """Validate filename for invalid characters/parts.

    The name must be shorter than 250 characters.  For every rule in
    ``sanitationRules`` the first match in ``filename`` (if any) must be a
    single plain space.  Only that first match per rule is inspected.

    Returns the filename with underscores replaced by spaces.

    Raises AssertionError with a user-facing message when the name is too
    long or contains an illegal part.
    """
    # The checks raise explicitly instead of using ``assert`` statements so
    # that they are not stripped under ``python -O``.  The exception type is
    # kept so existing callers keep working.
    if len(filename) >= 250:
        raise AssertionError('Your filename is too long')

    for rule in sanitationRules:
        reobj = rule['pattern'].search(filename)
        if reobj and reobj.group(0) != ' ':
            raise AssertionError(
                'Your filename contains an illegal part: %r' %
                reobj.group(0))

    return filename.replace('_', ' ')


def do_validate_filedesc(filedesc):
    """Validate a file description against the spam blacklist.

    The description is sent to the wiki's ``parse`` API to collect its
    external links.  If there are any, they are checked with the
    ``spamblacklist`` API.

    Returns ``filedesc`` unchanged when nothing is blacklisted.

    Raises AssertionError listing the matches when the API reports the
    links as blacklisted.
    """
    parse = SITE._simple_request(
        action='parse',
        text=filedesc,
        prop='externallinks'
    ).submit()

    externallinks = parse.get('parse', {}).get('externallinks', [])

    if externallinks:
        spam = SITE._simple_request(
            action='spamblacklist',
            url=externallinks
        ).submit()

        verdict = spam.get('spamblacklist', {})
        # Raised explicitly instead of using an ``assert`` statement so the
        # check is not stripped under ``python -O``.  The exception type is
        # kept so existing callers keep working.
        if verdict.get('result') == 'blacklisted':
            raise AssertionError(
                'Your file description matches spam blacklist! Matches: %s' %
                ', '.join(verdict.get('matches', [])))

    return filedesc