# Python source code of multiThreadDownload
# -*- coding: utf-8 -*-
# @Author: LC
# @Date:   2016-03-26 22:42:39
# @Last modified by:   LC
# @Last Modified time: 2016-04-11 15:22:01
# @Email: liangchaowu5@gmail.com

# 功能：多线程下载QQ输入法词库文件,使用时把主函数中的baseDir改成自己的下载目录即可，注意baseDir末尾不能有/

import threading
import Queue
import re
import urllib2
import os
import time

import downloadSingleFile
import getQQCategory

# 全局变量
queue = Queue.Queue()   # 存储待访问的url
visited = set()     # 存储已经访问的url
downloaded = set()  # 存储已经下载过的文件
logFile = ''      # 日志文件
downloadDir = ''  # 下载的路径
smallCateURL = ''      # 类别url
threadingLock = threading.Lock()  # 保护除Queue以外的共享区域的线程锁

pagePattern = re.compile('&page=(\d+)"')  # 找到其他page的正则匹配模式
filePattern = re.compile('<a href="/dict_detail\?dict_id=(\d+)">(.*?)</a>')  # 找到下载文件的正则匹配模式

# Worker thread: crawls listing pages taken from the shared queue and
# downloads every dictionary file linked from them.
class downloadThread(threading.Thread):
    """Worker that consumes listing-page URLs from the module-level ``queue``.

    For each URL it fetches the page, enqueues the other result pages it
    links to, and downloads every dictionary file found on it. Shared state
    (``visited``, ``downloaded``) is only touched while holding
    ``threadingLock``.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        print(self.name + 'is created!')

    def run(self):
        """Process queued URLs forever; the thread is expected to be a daemon."""
        while True:
            # Queue.get() blocks until an item is available, so there is no
            # need to poll queue.empty() in a busy loop.
            currentURL = queue.get()
            try:
                self._crawlPage(currentURL)
            finally:
                # Every get() must be matched by exactly one task_done(),
                # including on error paths; otherwise queue.join() in the
                # main thread would block forever.
                queue.task_done()

    def _appendLog(self, message):
        """Append one line to the log file."""
        with open(logFile.decode('utf8'), 'a') as f:
            f.write(message + '\n')

    def _crawlPage(self, currentURL):
        """Fetch one listing page, enqueue its sibling pages, download its files.

        :param currentURL: URL of the listing page to process
        :return: None
        """
        # Skip URLs that another thread has already handled.
        with threadingLock:
            if currentURL in visited:
                return
            visited.add(currentURL)

        # Fetch the page; failures are logged and the URL is abandoned.
        try:
            response = urllib2.urlopen(currentURL)
        except urllib2.HTTPError as e:
            self._appendLog(str(e.code) + ' while parsing url ' + currentURL)
            return
        except Exception:
            self._appendLog('unexcepted error while parsing url ' + currentURL)
            return

        try:
            data = response.read()
        finally:
            response.close()

        # Enqueue links to the other pages of the current category.
        for pageNumber in pagePattern.findall(data):
            queue.put(smallCateURL + '&page=' + pageNumber)

        # Download the files listed on this page.
        for dictId, name in filePattern.findall(data):
            fileURL = 'http://dict.qq.pinyin.cn/download?dict_id=' + dictId
            fileName = name.decode('gbk') + '.qpyd'
            filePath = downloadDir.decode('utf8') + '/' + fileName

            # Claim the file under the lock so that it is downloaded once.
            with threadingLock:
                if fileURL in downloaded:
                    continue
                downloaded.add(fileURL)

            print(self.name + ' is downloading ' + fileURL + ' ' + fileName + '  ...........')
            downloadSingleFile.downloadSingleFile(fileURL, filePath, logFile)
            print(filePath + ' is downloaded!!')


# 下载某一类的词库
def downloadSingleType(bigCate,smallCate,baseDir):
    """
    下载某一分类的词库，实际作用是修改全局变量让多线程可以获取到正确的下载路径和存储目录
    :param bigCate:   一级分类
    :param smallCate: 二级分类
    :param baseDir:   下载目录
    :return: None
    """
    global smallCateURL, downloadDir, queue, logFile
    smallCateURL = 'http://dict.qq.pinyin.cn/dict_list?sort1=%s&sort2=%s' %(urllib2.quote(bigCate), urllib2.quote(smallCate))  # url编码
    if baseDir[-1] == '/':
        print '路径 '+baseDir+' 末尾不能有/'
        return
    downloadDir = baseDir+'/'+bigCate+'/'+smallCate
    logFile = baseDir+'/download.log'
    if not os.path.exists(downloadDir.decode('utf8')):  # 目录不存在的时候创建目录
        os.makedirs(downloadDir.decode('utf8'))
    queue.put(smallCateURL)

if __name__ == '__main__':
    startTime = time.time()

    # Download root directory; it must not end with '/'.
    baseDir = 'G:/各大输入法词库/QQ/多线程下载'
    category = getQQCategory.getCategory()

    # Start the pool of daemon worker threads.
    threadNum = 5
    for _ in range(threadNum):
        worker = downloadThread()
        worker.daemon = True
        worker.start()

    # Process the categories one at a time: the workers read the current
    # category from module-level globals, so wait for the queue to drain
    # before switching to the next one.
    for bigCate in category:
        smallCates = category[bigCate]
        for smallCate in smallCates:
            downloadSingleType(bigCate.encode('utf8'), smallCate.encode('utf8'), baseDir)
            queue.join()

    elapsed = time.time() - startTime
    print('process time: %s seconds' % elapsed)