python source code of util

# coding: utf-8

import re
import zipfile
import tempfile
import subprocess
from os import path
from io import BytesIO
from shutil import get_terminal_size

import rarfile
from guessit import guessit

from getsub.constants import SUB_FORMATS, ARCHIVE_TYPES


class ProgressBar:
    def __init__(self, prefix_info, title="", total="", count_time=0):
        self.title = title
        self.total = total
        self.prefix_info = prefix_info

    def refresh(self, cur_len):
        terminal_width = get_terminal_size().columns  # 获取终端宽度
        info = "%s '%s'...  %.2f%%" % (
            self.prefix_info,
            self.title,
            cur_len / self.total * 100,
        )
        while len(info) > terminal_width - 20:
            self.title = self.title[0:-4] + "..."
            info = "%s '%s'...  %.2f%%" % (
                self.prefix_info,
                self.title,
                cur_len / self.total * 100,
            )
        end_str = "\r" if cur_len < self.total else "\n"
        print(info, end=end_str)


def num_to_cn(number):
    """
    转化 1-99 的数字至中文
    """
    assert number.isdigit() and 1 <= int(number) <= 99

    trans_map = {n: c for n, c in zip(("123456789"), ("一二三四五六七八九"))}

    if len(number) == 1:
        return trans_map[number]
    else:
        part1 = "十" if number[0] == "1" else trans_map[number[0]] + "十"
        part2 = trans_map[number[1]] if number[1] != "0" else ""
        return part1 + part2


def extract_name(name, en=False):
    """
    提取文字
    若 en 为 True，提取 name 中英文
    若 en 为 False，提取 name 中占比大的语言

    params:
        name: str, name to be processed
    return:
        new_name: str, extracted name
    """
    name, suffix = path.splitext(name)
    c_pattern = "[\u4e00-\u9fff]"
    e_pattern = "[a-zA-Z]"
    c_indices = [m.start(0) for m in re.finditer(c_pattern, name)]
    e_indices = [m.start(0) for m in re.finditer(e_pattern, name)]

    if en or len(c_indices) <= len(e_indices):
        target, discard = e_indices, c_indices
    else:
        target, discard = c_indices, e_indices

    if len(target) == 0:
        return ""

    first_target, last_target = target[0], target[-1]
    first_discard = discard[0] if discard else -1
    last_discard = discard[-1] if discard else -1
    if last_discard < first_target:
        new_name = name[first_target:]
    elif last_target < first_discard:
        new_name = name[:first_discard]
    else:
        # try to find maximum continous part
        result, start, end = [0, 1], -1, 0
        while end < len(name):
            while end not in e_indices and end < len(name):
                end += 1
            if end == len(name):
                break
            start = end
            while end not in c_indices and end < len(name):
                end += 1
            if end - start > result[1] - result[0]:
                result = [start, end]
            start = end
            end += 1
        new_name = name[result[0] : result[1]]
    new_name = new_name.strip() + suffix
    return new_name


def _print_and_choose(items):

    for i, item in enumerate(items):
        print("%3s) %s" % (i, item))

    choice = None
    while choice is None:
        try:
            print()
            choice = input(" choose: ")
            choice = int(choice)
            assert choice < len(items)
        except ValueError:
            print(" only numbers accepted")
            choice = None
        except AssertionError:
            print(" ", end="\r")
            print("choice %d not within the range" % choice)
            choice = None
    print()

    return choice


def choose_archive(sub_dict, sub_num=5, query=True):
    """
    传入候选字幕字典，返回选择的字幕包名称，字幕包下载地址

    params:
        sub_dict: dict, check downloader.py
        sub_num: int, maximum number of subtitles
        query: bool, return first sub if False
    return:
        exit: bool
        chosen_subs: str, subtitle name
    """

    exit = False

    if not query:
        chosen_sub = list(sub_dict.keys())[0]
        return exit, chosen_sub

    items = []
    items.append("Exit. Not downloading any subtitles.")
    for i, key in enumerate(sub_dict.keys()):
        if i == sub_num:
            break
        lang_info = ""
        lang_info += "【简】" if 4 & sub_dict[key]["lan"] else "      "
        lang_info += "【繁】" if 2 & sub_dict[key]["lan"] else "      "
        lang_info += "【英】" if 1 & sub_dict[key]["lan"] else "      "
        lang_info += "【双】" if 8 & sub_dict[key]["lan"] else "      "
        sub_info = "%s  %s" % (lang_info, key)
        items.append(sub_info)

    choice = _print_and_choose(items)
    if choice == 0:
        exit = True
        return exit, []

    return exit, list(sub_dict.keys())[choice - 1]


def choose_subtitle(subtitles):
    """
    传入字幕列表，返回选择字幕名

    params:
        subtitles: list of str
    return:
        subname: str
    """

    items = []
    for subtitle in subtitles:
        try:
            # zipfile: Historical ZIP filename encoding
            subtitle = subtitle.encode("cp437").decode("gbk")
        except Exception:
            pass
        items.append(subtitle)

    choice = _print_and_choose(items)

    return subtitles[choice]


def compute_subtitle_score(video_detail, subname, match_episode=True):
    """
    计算字幕分数

    params:
        video_detail: dict, result of guessit
        subname: str
        match_episode: bool, whether episode number
                       needed to match if video is a TV show
    return:
        score: int, return -1 if not match with videos
    """

    video_name = video_detail["title"].lower()
    season = str(video_detail.get("season"))
    episode = str(video_detail.get("episode"))
    year = str(video_detail.get("year"))
    vtype = str(video_detail.get("type"))

    subname = subname.lower()
    score = 0

    sub_name_info = guessit(subname)
    if sub_name_info.get("title"):
        sub_title = sub_name_info["title"].lower()
    else:
        sub_title = ""
    sub_season = str(sub_name_info.get("season"))
    sub_episode = str(sub_name_info.get("episode"))
    sub_year = str(sub_name_info.get("year"))

    if vtype == "movie":
        if year == sub_year:
            score += 1
        if video_name == sub_title:
            score += 1
        elif sub_title != "":
            return -1
    else:
        if video_name == sub_title:
            if season != sub_season:
                return -1  # title match, season not match
            elif episode != sub_episode and match_episode:
                return -1  # title match, episode not match
            else:
                score += 1  # title and episode match
        elif season == sub_season and episode == sub_episode:
            # title not match, episode match
            if sub_title != "":
                return -1
        else:
            return -1  # title and episode not match

    if "简体" in subname or "chs" in subname or ".gb." in subname:
        score += 2
    if "繁体" in subname or "cht" in subname or ".big5." in subname:
        pass
    if "chs.eng" in subname or "chs&eng" in subname:
        score += 2
    if "中英" in subname or "简英" in subname or "双语" in subname or "简体&英文" in subname:
        score += 4

    score += ("ass" in subname or "ssa" in subname) * 2
    score += ("srt" in subname) * 1

    return score


def guess_subtitle(sublist, video_detail):
    """
    传入字幕列表，视频信息，返回得分最高字幕名

    params:
        sublist: list of str
        video_detail: result of guessit
    return:
        success: bool
        subname: str
    """

    if not sublist:
        return False, None

    scores, subs = [], []
    for one_sub in sublist:
        _, ftype = path.splitext(one_sub)
        if ftype not in SUB_FORMATS:
            continue
        subs.append(one_sub)
        subname = path.split(one_sub)[-1]  # extract subtitle name
        try:
            # zipfile:/Lib/zipfile.py:1211 Historical ZIP filename encoding
            # try cp437 encoding
            subname = subname.encode("cp437").decode("gbk")
        except Exception:
            pass
        score = compute_subtitle_score(video_detail, subname)
        scores.append(score)

    max_score = max(scores)
    max_pos = scores.index(max_score)
    return max_score > 0, subs[max_pos]


def get_file_list(data, datatype):
    """
    传入一个压缩文件控制对象，读取对应压缩文件内文件列表

    params:
        data: binary data of an archive file
        datatype: str, file type
    return:
        sub_lists_dict: dict, {subname: file_handler}
    """

    sub_buff = BytesIO(data)

    if datatype == ".7z":
        try:
            sub_buff.seek(0)
            file_handler = P7ZIP(sub_buff)
        except Exception:
            datatype = ".zip"  # try with zipfile
    if datatype == ".zip":
        try:
            sub_buff.seek(0)
            file_handler = zipfile.ZipFile(sub_buff, mode="r")
        except Exception:
            datatype = ".rar"  # try with rarfile
    if datatype == ".rar":
        sub_buff.seek(0)
        file_handler = rarfile.RarFile(sub_buff, mode="r")

    sub_lists_dict = dict()

    for one_file in file_handler.namelist():

        if path.splitext(one_file)[-1] in SUB_FORMATS:
            sub_lists_dict[one_file] = file_handler
            continue

        if path.splitext(one_file)[-1] in ARCHIVE_TYPES:
            data = file_handler.read(one_file)
            datatype = path.splitext(one_file)[-1]
            sub_lists_dict.update(get_file_list(data, datatype))

    return sub_lists_dict


def run_command(cmd):
    process = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
    )
    output, error = process.communicate()
    return output.decode(), error.decode(), process.returncode


class P7ZIP:
    def __init__(self, file):
        self.data = file.read()
        # test if it is a valid 7zip file
        self.namelist()

    def _parse_list_output(self, output):
        header_pattern = r"\s+Date\s+Time\s+Attr\s+Size\s+Compressed\s+Name\s+"
        body = re.split(header_pattern, output)[-1]
        file_names = []
        for line in body.split("\n")[1:]:
            if line.startswith("-----"):  # reach end
                break
            parts = re.split(r"\s", line.strip())
            file_name = parts[-1].strip()
            if path.basename(file_name) == file_name:  # root dir
                continue
            file_names.append(file_name)
        return file_names

    def namelist(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            file_path = path.join(tmp_dir, "archive.7z")
            with open(file_path, "wb") as f:
                f.write(self.data)
            cmd = "7z l " + file_path
            output, err, status = run_command(cmd)
            if status != 0:
                raise ValueError(err)
            file_names = self._parse_list_output(output)
        return file_names

    def read(self, name):
        with tempfile.TemporaryDirectory() as tmp_dir:
            file_path = path.join(tmp_dir, "archive.7z")
            with open(file_path, "wb") as f:
                f.write(self.data)
            cmd_lists = ["7z", "e", file_path, "-o" + tmp_dir, name]
            cmd = " ".join(cmd_lists)
            output, err, status = run_command(cmd)
            if status != 0:
                raise ValueError(err)
            sub_file_path = path.join(tmp_dir, path.basename(name))
            with open(sub_file_path, "rb") as f:
                sub_data = f.read()
        return sub_data