# Python source code for parsing Amazon review HTML pages.

from lxml import etree

from AmazonReviewSpider.baidu_trans import BaiduTrans


class HtmlParser(object):
    """Extract review links and review data from Amazon HTML pages.

    Links found in a page are turned into full URLs by prefixing the
    ``href`` value with the base URL given to the constructor.
    """

    # Spellings of the HTML line-break tag that are stripped from a review
    # page before parsing, so that a review body is not split into several
    # text nodes by the ``text()`` XPath step.
    _LINE_BREAK_TAGS = ("<br>", "<br/>", "<br />")

    def __init__(self, base_url):
        """Store the site base URL and create the translator.

        Args:
            base_url: Prefix prepended to every ``href`` extracted from a page.
        """
        self.amazon_base_url = base_url
        self.trans = BaiduTrans()

    @staticmethod
    def _to_text(content):
        """Return ``content`` as text.

        ``str()`` applied to a ``bytes`` object yields its ``"b'...'"`` repr
        (escaped newlines and non-ASCII bytes) rather than the markup itself,
        so raw bytes are decoded as UTF-8 instead; undecodable bytes are
        replaced rather than raising. Any other object goes through ``str()``.
        """
        if isinstance(content, (bytes, bytearray)):
            return content.decode("utf-8", errors="replace")
        return str(content)

    def _first_link_url(self, content, xpath):
        """Return the base URL plus the ``href`` of the first ``xpath`` match.

        Returns an empty string when nothing matches or when the matched
        element has no ``href`` attribute.
        """
        html = etree.HTML(self._to_text(content))
        matches = html.xpath(xpath)
        if not matches:
            return ""

        href = matches[0].get('href')
        if href is None:
            return ""
        return self.amazon_base_url + href

    def parse_main_page_reviews_url(self, content):
        """Get the URL of the "See all ... reviews" link from a product main page.

        Args:
            content: HTML of the product main page (text or UTF-8 bytes).

        Returns:
            The base URL concatenated with the link's ``href``, or ``""`` when
            the link is not present in the page.
        """
        return self._first_link_url(
            content,
            '//a[@id="dp-summary-see-all-reviews" and @class="a-link-emphasis"]')

    def get_next_reviews_url(self, content):
        """Get the URL of the next page of reviews.

        Args:
            content: HTML of a reviews page (text or UTF-8 bytes).

        Returns:
            The base URL concatenated with the "next page" link's ``href``,
            or ``""`` when there is no next page.
        """
        return self._first_link_url(content, '//li[@class="a-last"]/a')

    def get_reviews_info(self, content):
        """Collect the reviews rated 4 stars or higher from a reviews page.

        Args:
            content: HTML of a reviews page (text or UTF-8 bytes).

        Returns:
            A list of dicts, one per kept review, with the keys ``"star"``
            (the first character of the rating text), ``"title"``, ``"body"``
            and ``"trans"`` (the result of ``self.trans.transEn2Zh(body)`` --
            presumably an English-to-Chinese translation; confirm against
            ``BaiduTrans``).

        Raises:
            ValueError: If a rating text does not start with a digit.
            IndexError: If fewer titles or bodies than ratings were found.
        """
        content = self._to_text(content)
        for tag in self._LINE_BREAK_TAGS:
            content = content.replace(tag, "")
        html = etree.HTML(content)

        star_list = html.xpath('//a/i[@data-hook="review-star-rating"]/span[@class="a-icon-alt"]/text()')
        title_list = html.xpath('//div[@class="a-row"]/a[@data-hook="review-title"]/text()')

        review_body_list = html.xpath('//div[@class="a-row review-data"]/span['
                                      '@data-hook="review-body"]/text()')

        # NOTE(review): the three lists are paired purely by position. A review
        # whose title or body yields zero or several text nodes shifts the
        # pairing for every later review (or raises IndexError); extracting
        # per review container would be more robust but needs the page markup.
        all_review_list = []
        for index, star_text in enumerate(star_list):
            # The rating is taken from the first character of the star text.
            star_num = star_text[:1]
            if int(star_num) < 4:
                continue

            title = title_list[index]
            body = review_body_list[index]
            all_review_list.append({
                "star": star_num,
                "title": title,
                "body": body,
                'trans': self.trans.transEn2Zh(body),
            })

        return all_review_list


if __name__ == '__main__':
    # Offline smoke check of the rating extraction used by
    # HtmlParser.get_reviews_info: the first character of the star text is
    # taken as the rating. (A stale, disabled live-download example that
    # referenced an undefined Downloader and called HtmlParser() without its
    # required base_url was removed from here.)
    star = '1.0 of 5'
    print(star[:1])