python source code of utils

# coding=utf-8
import os
import re
import zlib
import base64
from fnmatch import fnmatch
from html import escape as html_escape
from urllib.parse import urljoin, urlsplit, urlunsplit, quote_plus
from flask import make_response, Response

try:
    from typing import Union, Tuple  # for python 3.5+ type hint
except:
    pass

try:  # lru_cache的c语言实现, 比Python内置lru_cache更快
    from fastcache import lru_cache  # lru_cache用于缓存函数的执行结果
except:
    from functools import lru_cache

from . import CONSTS

from config_default import *
from config import *


def zmirror_root(filename):
    return os.path.join(CONSTS.ZMIRROR_ROOT, filename)


@lru_cache(maxsize=1024)
def s_esc(s):
    """
    equivalent to s.replace("/",r"\/")
    :type s: str
    :rtype: str
    """
    return s.replace("/", r"\/")


def extract_root_domain(domain):
    """
    提取出一个域名的根域名
    支持二级顶级域名, 允许包含端口(端口会被舍去)

    :param domain: eg: dwn.cdn.google.co.jp[:233]
    :type domain: str
    :return: root_domain, sub_domain
    :rtype: Tuple[str, str]
    """
    domain = domain.rstrip("0123456789").rstrip(":").strip(".")
    temp = domain.split('.')

    # 粗略判断是否是二级顶级域名
    is_level2_tld = len(temp[-1]) <= 3 and temp[-2] in ('com', 'net', 'org', 'co', 'edu', 'mil', 'gov', 'ac')

    if len(temp) <= 2 or len(temp) == 3 and is_level2_tld:
        # 它本身就是一个根域名
        return target_domain, ""
    elif is_level2_tld:
        return ".".join(temp[-3:]), ".".join(temp[:-3])
    else:
        return '.'.join(temp[-2:]), ".".join(temp[:-2])


# noinspection PyShadowingNames
def calc_domain_replace_prefix(_domain):
    """生成各种形式的scheme变体
    :type _domain: str
    :rtype: bool
    """
    return dict(
        # normal
        slash='//' + _domain,
        http='http://' + _domain,
        https='https://' + _domain,
        double_quoted='"%s"' % _domain,
        single_quoted="'%s'" % _domain,
        # hex
        hex_lower=('//' + _domain).replace('/', r'\x2f'),
        hex_upper=('//' + _domain).replace('/', r'\x2F'),
        # escape slash
        slash_esc=s_esc('//' + _domain),
        http_esc=s_esc('http://' + _domain),
        https_esc=s_esc('https://' + _domain),
        double_quoted_esc=r'\"%s\"' % _domain,
        single_quoted_esc=r"\'%s\'" % _domain,
        # double escape slash
        slash_double_esc=('//' + _domain).replace('/', r'\\/'),
        http_double_esc=('http://' + _domain).replace('/', r'\\/'),
        https_double_esc=('https://' + _domain).replace('/', r'\\/'),
        # triple escape slash
        slash_triple_esc=('//' + _domain).replace('/', r'\\\/'),
        http_triple_esc=('http://' + _domain).replace('/', r'\\\/'),
        https_triple_esc=('https://' + _domain).replace('/', r'\\\/'),
        # urlencoded
        slash_ue=quote_plus('//' + _domain),
        http_ue=quote_plus('http://' + _domain),
        https_ue=quote_plus('https://' + _domain),
        double_quoted_ue=quote_plus('"%s"' % _domain),
        single_quoted_ue=quote_plus("'%s'" % _domain),
        # escaped and urlencoded
        slash_esc_ue=quote_plus(s_esc('//' + _domain)),
        http_esc_ue=quote_plus(s_esc('http://' + _domain)),
        https_esc_ue=quote_plus(s_esc('https://' + _domain)),
    )


def current_line_number():
    """Returns the current line number in our program.
    :return: current line number
    :rtype: int
    """
    import inspect
    return inspect.currentframe().f_back.f_lineno


def generate_simple_resp_page(errormsg=b'We Got An Unknown Error', error_code=500):
    """

    :type errormsg: bytes
    :type error_code: int
    :rtype: Response
    """
    return make_response(errormsg, error_code)


@lru_cache(maxsize=128)
def is_mime_represents_text(input_mime):
    """
    Determine whether an mime is text (eg: text/html: True, image/png: False)
    :param input_mime: str
    :return: bool
    """
    input_mime_l = input_mime.lower()
    for text_word in text_like_mime_keywords:
        if text_word in input_mime_l:
            return True
    return False


@lru_cache(maxsize=128)
def extract_mime_from_content_type(_content_type):
    """从content-type中提取出mime, 如 'text/html; encoding=utf-8' --> 'text/html'
    :rtype: str
    """
    c = _content_type.find(';')
    if c == -1:
        return _content_type.lower()
    else:
        return _content_type[:c].lower()


@lru_cache(maxsize=128)
def is_content_type_using_cdn(_content_type):
    """根据content-type确定该资源是否使用CDN"""
    _mime = extract_mime_from_content_type(_content_type)
    if _mime in mime_to_use_cdn:
        # dbgprint(content_type, 'Should Use CDN')
        return _mime
    else:
        # dbgprint(content_type, 'Should NOT CDN')
        return False


def get_group(name, match_obj):
    """return a blank string if the match group is None"""
    try:
        obj = match_obj.group(name)
    except:
        return ''
    else:
        if obj is not None:
            return obj
        else:
            return ''


def get_ext_domain_inurl_scheme_prefix(ext_domain, force_https=None):
    """旧版本遗留函数, 已经不再需要, 永远返回空字符串"""
    return ''


def strx(*args):
    """
    :return: str
    """
    output = ''
    for arg in args:
        output += str(arg) + ' '
    output.rstrip(' ')
    return output


@lru_cache(maxsize=1024)
def check_global_ua_pass(ua_str):
    """该user-agent是否满足全局白名单"""
    if ua_str is None or not global_ua_white_name:
        return False
    ua_str = ua_str.lower()
    if global_ua_white_name in ua_str:
        return True
    else:
        return False


@lru_cache(maxsize=1024)
def is_domain_match_glob_whitelist(domain):
    """
    域名是否匹配 `domains_whitelist_auto_add_glob_list` 中设置的通配符
    :type domain: str
    :rtype: bool
    """
    for domain_glob in domains_whitelist_auto_add_glob_list:
        if fnmatch(domain, domain_glob):
            return True
    return False


@lru_cache(maxsize=128)
def is_mime_streamed(mime):
    """
    根据content-type判断是否应该用stream模式传输(服务器下载的同时发送给用户)
     视频/音频/图片等二进制内容默认用stream模式传输
     :param mime: mime or content-type, eg: "plain/text; encoding=utf-8"
     :type mime: str
     :rtype: bool
    """
    for streamed_keyword in steamed_mime_keywords:
        if streamed_keyword in mime:
            return True
    return False


def generate_html_redirect_page(target_url, msg='', delay_sec=1):
    """生成一个HTML重定向页面
    某些浏览器在301/302页面不接受cookies, 所以需要用html重定向页面来传cookie
    :type target_url: str
    :type msg: str
    :type delay_sec: int
    :rtype: Response
    """
    resp_content = r"""<!doctype html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>重定向 (Page Redirect)</title>
<meta http-equiv="refresh" content="%d; url=%s">
<script>setTimeout(function(){location.href="%s"} , %d000);</script>
</head>
<body>
<pre>%s</pre>
<hr />
You are now redirecting to <a href="%s">%s</a>, if it didn't redirect automatically, please click that link.
</body>
</html>""" % (
        delay_sec, html_escape(target_url), html_escape(target_url), delay_sec + 1,
        html_escape(msg), html_escape(target_url), html_escape(target_url)
    )
    resp_content = resp_content.encode('utf-8')
    return Response(response=resp_content)


# 在 cdn_redirect_encode_query_str_into_url 中用于标示编码进url的分隔串
cdn_url_query_encode_salt = 'zm26'
_url_salt = re.escape(cdn_url_query_encode_salt)

regex_extract_base64_from_embedded_url = re.compile(
    r'_' + _url_salt + r'(?P<gzip>z?)_\.(?P<b64>[a-zA-Z0-9-_]+=*)\._' + _url_salt + r'_\.[a-zA-Z\d]+\b')


@lru_cache(maxsize=1024)
def extract_real_url_from_embedded_url(embedded_url):
    """
    将 embed_real_url_to_embedded_url() 编码后的url转换为原来的带有参数的url
    `cdn_redirect_encode_query_str_into_url`设置依赖于本函数, 详细说明请看配置文件中这个参数的部分

    eg: https://cdn.domain.com/a.php_zm24_.cT1zb21ldGhpbmc=._zm24_.css
        ---> https://foo.com/a.php?q=something (assume it returns an css) (base64 only)
    eg2: https://cdn.domain.com/a/b/_zm24_.bG92ZT1saXZl._zm24_.jpg
        ---> https://foo.com/a/b/?love=live (assume it returns an jpg) (base64 only)
    eg3: https://cdn.domain.com/a/b/_zm24z_.[some long long base64 encoded string]._zm24_.jpg
        ---> https://foo.com/a/b/?love=live[and a long long query string] (assume it returns an jpg) (gzip + base64)
    eg4:https://cdn.domain.com/a  (no change)
        ---> (no query string): https://foo.com/a (assume it returns an png) (no change)
    :param embedded_url: 可能被编码的URL
    :return: 如果传入的是编码后的URL, 则返回解码后的URL, 否则返回None
    :type embedded_url: str
    :rtype: Union[str, None]
    """
    if '._' + cdn_url_query_encode_salt + '_.' not in embedded_url[-15:]:  # check url mark
        return None
    m = regex_extract_base64_from_embedded_url.search(embedded_url)
    b64 = get_group('b64', m)

    # 'https://cdn.domain.com/a.php_zm24_.cT1zb21ldGhpbmc=._zm24_.css'
    # real_request_url_no_query ---> 'https://cdn.domain.com/a.php'
    real_request_url_no_query = embedded_url[:m.span()[0]]

    query_string_byte = base64.urlsafe_b64decode(b64)
    is_gzipped = get_group('gzip', m)
    if is_gzipped:
        query_string_byte = zlib.decompress(query_string_byte)
    query_string = query_string_byte.decode(encoding='utf-8')

    result = urljoin(real_request_url_no_query, '?' + query_string)
    # dbgprint('extract:', embedded_url, 'to', result)
    return result


@lru_cache(maxsize=1024)
def embed_real_url_to_embedded_url(real_url_raw, url_mime, escape_slash=False):
    """
    将url的参数(?q=some&foo=bar)编码到url路径中, 并在url末添加一个文件扩展名
    在某些对url参数支持不好的CDN中, 可以减少错误
    `cdn_redirect_encode_query_str_into_url`设置依赖于本函数, 详细说明可以看配置文件中的对应部分
    解码由 extract_real_url_from_embedded_url() 函数进行, 对应的例子也请看这个函数
    :rtype: str
    """
    # dbgprint(real_url_raw, url_mime, escape_slash)
    if escape_slash:
        real_url = real_url_raw.replace(r'\/', '/')
    else:
        real_url = real_url_raw
    url_sp = urlsplit(real_url)
    if not url_sp.query:  # no query, needn't rewrite
        return real_url_raw

    byte_query = url_sp.query.encode()
    if len(byte_query) > 128:  # 当查询参数太长时, 进行gzip压缩
        gzip_label = 'z'  # 进行压缩后的参数, 会在标识区中添加一个z
        byte_query = zlib.compress(byte_query)
    else:
        gzip_label = ''

    b64_query = base64.urlsafe_b64encode(byte_query).decode()
    # dbgprint(url_mime)
    mixed_path = url_sp.path + '_' + _url_salt + gzip_label + '_.' \
                 + b64_query \
                 + '._' + _url_salt + '_.' + mime_to_use_cdn[url_mime]
    result = urlunsplit((url_sp.scheme, url_sp.netloc, mixed_path, '', ''))

    if escape_slash:
        result = s_esc(result)
        # dbgprint('embed:', real_url_raw, 'to:', result)
    return result


@lru_cache(maxsize=64)
def guess_colon_from_slash(slash):
    """根据 slash(/) 的格式, 猜测最有可能与之搭配的 colon(:) 格式"""
    if "%" not in slash:
        return ":"  # slash没有转义, 直接原文
    elif "%25" in slash:
        # %252F %252f
        if "F" in slash:
            return "%253A"
        else:
            return "%253a"
    else:
        # %2F %2f
        if "F" in slash:
            return "%3A"
        else:
            return "%3a"


def attributes(var, to_dict=False, max_len=1024):
    output = {} if to_dict else ""
    for name in dir(var):
        if name[0] != '_' and name[-2:] != '__':
            value = str(getattr(var, name))

            if max_len:
                length = len(value)
                if length > max_len:
                    value = value[:max_len] + "....(total:{})".format(length)

            if to_dict:
                output[name] = value
            else:
                output += strx(name, ":", value, "\n")
    return output


def inject_content(position, html, content):
    """
    将文本内容注入到html中
    详见 default_config.py 的 `Custom Content Injection` 部分
    :param position: 插入位置
    :type position: str
    :param html: 原始html
    :type html: str
    :param content: 等待插入的自定义文本内容
    :type content: str
    :return: 处理后的html
    :rtype: str
    """
    if position == "head_first":
        return inject_content_head_first(html, content)
    elif position == "head_last":
        return inject_content_head_last(html, content)
    else:  # coverage: exclude
        raise ValueError("Unknown Injection Position: {}".format(position))


def inject_content_head_first(html, content):
    """
    将文本内容插入到head中第一个现有<script>之前
    如果head中不存在<script>, 则加在</head>标签之前

    :type html: str
    :type content: str
    :rtype: str
    """
    head_end_pos = html.find("</head")  # 找到 </head> 标签结束的位置
    script_begin_pos = html.find("<script")  # 找到第一个 <script> 开始的地方

    if head_end_pos == -1:  # coverage: exclude
        # 如果没有 </head> 就不进行插入
        return html

    if script_begin_pos != -1 and script_begin_pos < head_end_pos:
        # 如果<head>中存在<script>标签, 则插入到第一个 <script> 标签之前
        return html[:script_begin_pos] + content + html[script_begin_pos:]

    else:
        # 如果<head>中 *不* 存在<script>标签, 则插入到 </head> 之前
        return html[:head_end_pos] + content + html[head_end_pos:]


def inject_content_head_last(html, content):
    """
    将文本内容插入到head的尾部

    :type html: str
    :type content: str
    :rtype: str
    """
    head_end_pos = html.find("</head")  # 找到 </head> 标签结束的位置

    if head_end_pos == -1:
        # 如果没有 </head> 就不进行插入
        return html

    return html[:head_end_pos] + content + html[head_end_pos:]