Python source code of the crawler

#!/usr/bin/env python3
# coding=utf-8
import json
import logging
import re
import time
from json import decoder
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.options import Options

class Crawler(object):
    """Headless-Chrome scraper for JD item pages and the huihui price endpoint.

    The driver is created with ``pageLoadStrategy = "none"``, so ``get()`` does
    not wait for the page to finish loading; the public methods therefore poll
    for the content they need before reading it.

    Instances can be used as context managers; leaving the ``with`` block calls
    :meth:`close`.

    NOTE(review): the ``find_element_by_*`` helpers and the
    ``chrome_options=`` / ``desired_capabilities=`` keyword arguments are the
    Selenium 3 spellings -- confirm the pinned Selenium version before upgrading.
    """

    # Seconds slept between two polls while waiting for content to appear.
    POLL_INTERVAL = 2
    # Poll budgets: 30 polls (about 60 s) for JD, 15 polls (about 30 s) for huihui.
    JD_RETRIES = 30
    HUIHUI_RETRIES = 15
    # Pattern used to pull the first number out of a price string.
    _NUMBER_RE = re.compile(r'-?\d+\.?\d*e?-?\d*?')

    def __init__(self, proxy=None):
        """Start a headless Chrome session.

        Args:
            proxy: Optional mapping whose ``'https'`` entry is passed to Chrome
                through the ``--proxy-server`` flag.
        """
        chrome_options = Options()
        for flag in ('--headless',
                     '--disable-gpu',
                     '--no-sandbox',
                     '--disable-dev-shm-usage',
                     '--ignore-certificate-errors',
                     '--ignore-ssl-errors'):
            chrome_options.add_argument(flag)
        # Presumably disables image loading to speed up page loads -- TODO confirm.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        if proxy:
            proxy_address = proxy['https']
            chrome_options.add_argument('--proxy-server=%s' % proxy_address)
            logging.info('Chrome using proxy: %s', proxy_address)
        # Do not wait for the page to load completely.  Work on a copy so the
        # CHROME dict that lives on DesiredCapabilities itself is not mutated.
        caps = DesiredCapabilities.CHROME.copy()
        caps["pageLoadStrategy"] = "none"
        self.chrome = webdriver.Chrome(chrome_options=chrome_options, desired_capabilities=caps)
        # JD pages sometimes spend a long time loading pictures; cap the wait.
        self.chrome.set_page_load_timeout(30)
        # Same cap for asynchronous scripts.
        self.chrome.set_script_timeout(30)

    def __enter__(self):
        """Return the crawler itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser on leaving the ``with`` block; never suppress errors."""
        self.close()
        return False

    def close(self):
        """Quit the Chrome session."""
        self.chrome.quit()

    @classmethod
    def _first_number(cls, text):
        """Return the first number found in ``text`` as a string, or ``None``.

        ``None`` is returned both for empty/``None`` text and for text that
        contains no digits.
        """
        if not text:
            return None
        match = cls._NUMBER_RE.search(text)
        return match.group(0) if match else None

    def _first_text(self, label, *xpaths):
        """Return the text of the first element matched by any of ``xpaths``.

        The XPaths are tried in order.  When none of them yields an element a
        warning naming ``label`` is logged and ``None`` is returned.
        """
        last_error = None
        for xpath in xpaths:
            try:
                return self.chrome.find_element_by_xpath(xpath).text
            except (AttributeError, NoSuchElementException, StaleElementReferenceException) as e:
                # AttributeError carries no ``msg`` attribute, so fall back to
                # the exception object itself for the log line.
                last_error = getattr(e, 'msg', e)
        logging.warning('Crawl %s failure: %s', label, last_error)
        return None

    def _wait_for_jd_price(self):
        """Poll until the JD price span has text; return whether it appeared."""
        for _ in range(self.JD_RETRIES):
            try:
                price_text = self.chrome.find_element_by_xpath("//*[@class='p-price']/span[2]").text
            except (NoSuchElementException, StaleElementReferenceException):
                # Element missing, or replaced while the page is still loading.
                logging.info("价格元素未出现")
                time.sleep(self.POLL_INTERVAL)
                continue
            if price_text:
                logging.info("爬取价格数据")
                logging.info('Found price element: {}'.format(price_text))
                return True
            logging.info("价格元素出现，价格未出现重试2秒")
            time.sleep(self.POLL_INTERVAL)
        return False

    def _wait_for_huihui_body(self):
        """Poll until the page body has text; return it, or ``''`` on give-up."""
        for _ in range(self.HUIHUI_RETRIES):
            try:
                body_text = self.chrome.find_element_by_tag_name('body').text
            except (NoSuchElementException, StaleElementReferenceException):
                time.sleep(self.POLL_INTERVAL)
                continue
            if body_text:
                logging.info('Found body element: {}'.format(body_text))
                return body_text
            logging.info("huihui body元素出现，内容未出现重试2秒")
            time.sleep(self.POLL_INTERVAL)
        return ''

    def get_jd_item(self, item_id):
        """Scrape name, price, PLUS price and subtitle of a JD item.

        Args:
            item_id: JD item identifier; it is formatted into the item URL, so
                both strings and integers are accepted.

        Returns:
            dict with the keys ``name``, ``price``, ``plus_price`` and
            ``subtitle``.  A value stays ``None`` when the matching element is
            missing or, for the two prices, when its text contains no number.
        """
        item_info_dict = {"name": None, "price": None, "plus_price": None, "subtitle": None}
        url = 'https://item.jd.com/{}.html'.format(item_id)
        try:
            self.chrome.get(url)
            logging.info('Crawl: {}'.format(url))
            # Extraction is attempted even if the price never shows up.
            self._wait_for_jd_price()
        except TimeoutException as e:
            logging.warning('Crawl failure: {}'.format(e.msg))
            return item_info_dict

        # Item name; fuel cards (e.g. 200117841739) use class "name" instead.
        item_info_dict['name'] = self._first_text(
            'name', "//*[@class='sku-name']", "//*[@class='name']")

        # PLUS price; overseas items (e.g. 32533649560) have no p-price-plus
        # element, in which case the value simply stays None.
        item_info_dict['plus_price'] = self._first_number(
            self._first_text('plus_price', "//*[@class='p-price-plus']"))

        # Subtitle; fuel cards (e.g. 200117841739) use class "name-s" instead.
        item_info_dict['subtitle'] = self._first_text(
            'subtitle', "//*[@id='p-ad']", "//*[@class='name-s']")

        # Regular price.  A missing price element ends the crawl without the
        # success log line.
        price_text = self._first_text('price', "//*[@class='p-price']")
        if price_text is None:
            return item_info_dict
        item_info_dict['price'] = self._first_number(price_text)

        logging.info('Crawl SUCCESS: {}'.format(item_info_dict))
        return item_info_dict

    def get_huihui_item(self, item_id):
        """Fetch the highest and lowest price huihui reports for a JD item.

        Args:
            item_id: JD item identifier; it is formatted into the query URL, so
                both strings and integers are accepted.

        Returns:
            dict with the keys ``max_price`` and ``min_price``.  Both are
            ``None`` when the page cannot be read or is not a JSON object; a
            single value is ``None`` when its key is absent from the payload.
        """
        huihui_info_dict = {"max_price": None, "min_price": None}
        url = 'https://zhushou.huihui.cn/productSense?phu=https://item.jd.com/{}.html'.format(item_id)
        try:
            self.chrome.get(url)
            body_text = self._wait_for_huihui_body()
            if not body_text:
                # One last read after the poll budget is spent.
                body_text = self.chrome.find_element_by_tag_name('body').text
            info = json.loads(body_text)
        except decoder.JSONDecodeError as e:
            logging.warning('Crawl failure: {}'.format(e.msg))
        except (NoSuchElementException, StaleElementReferenceException, TimeoutException) as e:
            logging.warning('Crawl failure: {}'.format(e.msg))
        else:
            if isinstance(info, dict):
                huihui_info_dict = {"max_price": info.get('max'), "min_price": info.get('min')}
                logging.info(huihui_info_dict)
            else:
                logging.warning('Crawl failure: unexpected huihui payload: %r', info)
        return huihui_info_dict


# Manual smoke run: crawl one JD item and one huihui price record, then print
# the elapsed wall-clock time.
if __name__ == '__main__':
    logging.basicConfig(format="%(asctime)s | %(levelname)s | %(filename)s %(lineno)s | %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S",
                        level=logging.INFO)
    start = time.time()

    # To go through a proxy, pass a mapping with an 'https' entry, e.g.
    # Crawler({'http': '125.105.32.168:7305', 'https': '171.211.32.79:2456'}).
    c = Crawler()
    try:
        # With the level set to INFO these debug records are not emitted; the
        # crawl methods log their own results at INFO level.
        logging.debug(c.get_jd_item('6287165'))
        logging.debug(c.get_huihui_item('30445355110'))
    finally:
        # Always quit the browser, even when a crawl raises, so no Chrome
        # process is left behind.
        c.close()

    end = time.time()
    print("执行时间:", end - start)