# -*- coding:utf-8 -*-
"""
Crawl a Google Arts & Culture asset page with headless Chrome and stitch the
downloaded image tiles into a single picture.

Version: 1.0
Author: Helixcs
Time: 2019/12/15

Reference :
https://github.com/SeleniumHQ/selenium/wiki/ChromeDriver#requirements
https://www.srcmake.com/home/selenium-python-chromedriver-ubuntu

Installing Chrome with apt:

wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | sudo apt-key add -
echo 'deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main' | sudo tee /etc/apt/sources.list.d/google-chrome.list
sudo apt-get update
sudo apt-get install google-chrome-stable
"""
import base64
import collections
import os
import re
import shutil
import sys
import time
from typing import Optional, Union
from zipfile import ZipFile

import numpy as np
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from slugify import slugify
from urllib3 import PoolManager
from urllib3.contrib.socks import SOCKSProxyManager
from urllib3.util.url import parse_url

WINDOWS = os.name == 'nt'
LINUX = sys.platform.startswith('linux')
DARWIN = sys.platform.startswith('darwin')


def is_blank(value: Optional[Union[int, str, dict, list, bytes, tuple, set, object]]) -> bool:
    """
    Tell whether ``value`` carries no content.

    Blank values are: ``None``; an empty or whitespace-only ``str``; an empty
    ``dict``, ``list``, ``set`` or ``bytes``; and a ``tuple`` that is empty or
    contains only ``None`` items.  Anything else (numbers, arbitrary objects)
    is never blank.

    :param value: the value to inspect.
    :return: True when the value is blank, False otherwise.
    """
    if value is None:
        return True
    if isinstance(value, str):
        # strip() without arguments removes whitespace; strip('') removes nothing.
        return value.strip() == ''
    if isinstance(value, (dict, list, set)):
        return len(value) < 1
    if isinstance(value, bytes):
        return value == b''
    if isinstance(value, tuple):
        # () ==> True, (None, None) ==> True, (None, 1) ==> False
        return all(item is None for item in value)
    return False


def is_not_blank(value: Optional[Union[int, str, dict, list, tuple,]]) -> bool:
    """Return the negation of :func:`is_blank` for ``value``."""
    return not is_blank(value=value)


DEFAULT_GCO_SIZE = 12000
DEFAULT_GCO_OUTPUT_PATH = 'output'
DEFAULT_GCO_PARTIAL_PATH = 'partial'
DEFAULT_GCO_INIT_DELAY = 5

# Location and version of the chromedriver archive fetched on demand.
_CHROMEDRIVER_VERSION = "78.0.3904.70"
_CHROMEDRIVER_URL_TEMPLATE = "https://chromedriver.storage.googleapis.com/{version}/{archive}"
# Working directories (relative to the current working directory).
_WEBDRIVER_DIR = "webdriver"
_WEBDRIVER_DOWNLOAD_TMP = "tmp"


class GoogleArtsCrawlerOption(object):
    def __init__(self,
                 url: str = None,
                 chrome_options: ChromeOptions = None,
                 webdriver_execute_path: str = None,
                 size: int = DEFAULT_GCO_SIZE,
                 init_delay_time: int = DEFAULT_GCO_INIT_DELAY,
                 blob_loading_delay_time: int = 2,
                 output_path: str = DEFAULT_GCO_OUTPUT_PATH,
                 output_filename: str = None,
                 partial_tmp_path: str = DEFAULT_GCO_PARTIAL_PATH,
                 need_download_webdrive: bool = False,
                 need_clear_cache: bool = True,
                 is_debug: bool = False):
        """
        GoogleArtsCrawlerOption

        Usage:
        ```
        GoogleArtsCrawlerOption()
            .set_url("https://artsandculture.google.com/asset/madame-moitessier/hQFUe-elM1npbw")
            .set_chrome_options(chrome_option)
            .set_need_download_webdrive(True)
            # .set_webdriver_execute_path("webdriver/chromedriver")
            .set_partial_tmp_path("custom_partial_dir")
            .set_output_path("custom_output_dir")
            .set_output_filename("custom.jpg")
            .set_need_clear_cache(True)
            .prepare_options()
        ```

        :param url: google arts url.
        :param chrome_options: chrome options, visit `https://chromedriver.chromium.org/capabilities` for detail.
                               A fresh `ChromeOptions()` is created by `prepare_options` when omitted.
        :param webdriver_execute_path: webdriver executable path; when not set it is looked up / downloaded
                                       if `need_download_webdrive` is True.
        :param size: webdriver simulated device size (width and height), default 12000.
        :param init_delay_time: seconds to wait after requesting the url so the page can render.
        :param blob_loading_delay_time: seconds to wait between retries when an image blob is not available yet.
        :param output_path: custom output dir, default `output`.
        :param output_filename: custom output filename, default is the slugified page title.
        :param need_download_webdrive: download the webdriver automatically when True, default False.
        :param partial_tmp_path: custom partial tmp path, deleted after finishing when
                                 `need_clear_cache` is True, default `partial`.
        :param need_clear_cache: auto clear the webdriver download tmp dir and the partial images when finished.
        :param is_debug: print the resolved options and run Chrome with a visible window (no `--headless`).
        """
        self._url = url
        self._chrome_options = chrome_options
        self._webdriver_execute_path = webdriver_execute_path
        self._size = size
        self._init_delay_time = init_delay_time
        self._blob_loading_delay_time = blob_loading_delay_time
        self._output_path = output_path
        self._output_filename = output_filename
        self._partial_tmp_path = partial_tmp_path
        self._is_debug: bool = is_debug
        self._need_download_webdrive = need_download_webdrive
        self._need_clear_cache = need_clear_cache

    def prepare_options(self) -> "GoogleArtsCrawlerOption":
        """
        Validate the options, resolve the webdriver executable and configure Chrome.

        Steps: normalise the url (must be on `artsandculture.google.com`), fill in
        defaults, locate or download the webdriver when requested, make it
        executable on Linux / macOS, add the mobile emulation and command line
        switches to the chrome options, and create the output / partial dirs.

        :return: self, so calls can be chained.
        :raises Exception: when the url is blank or on another host, the platform
                           is unknown, the webdriver download fails, or no
                           webdriver executable file can be found.
        """
        if is_blank(self._url):
            raise Exception("GoogleArtsCrawlerOption , url is blank!")
        uprs = parse_url(url=self._url)
        if not uprs.host == 'artsandculture.google.com':
            raise Exception("GoogleArtsCrawlerOption, url netloc is not `artsandculture.google.com`")
        # parse_url yields path=None for a bare host; avoid formatting the text "None" into the url.
        self._url = "https://{0}{1}".format(uprs.host, uprs.path or '')

        # Apply defaults before anything below reads these values
        # (the device metrics depend on the size).
        self._output_path = DEFAULT_GCO_OUTPUT_PATH if self._output_path is None else self._output_path
        self._partial_tmp_path = DEFAULT_GCO_PARTIAL_PATH if self._partial_tmp_path is None \
            else self._partial_tmp_path
        self._size = DEFAULT_GCO_SIZE if self._size is None or self._size < 1 else self._size
        self._init_delay_time = DEFAULT_GCO_INIT_DELAY \
            if self._init_delay_time is None or self._init_delay_time < 1 else self._init_delay_time

        # download webdriver
        if self._webdriver_execute_path is None and self._need_download_webdrive:
            self._webdriver_execute_path = self._find_local_webdriver()
            if self._webdriver_execute_path is None:
                self._webdriver_execute_path = self._download_webdriver()

        if is_blank(self._webdriver_execute_path):
            raise Exception("GoogleArtsCrawlerOption , webdriver_execute_path is blank!")
        if not os.path.isfile(self._webdriver_execute_path):
            raise Exception("GoogleArtsCrawlerOption , webdriver_execute_path is not exist, this is file!")
        if LINUX or DARWIN:
            # Executable for everyone, writable by the owner only.
            os.chmod(self._webdriver_execute_path, 0o755)

        self._configure_chrome_options()

        if not os.path.isdir(self._output_path):
            os.makedirs(self._output_path)
        if not os.path.isdir(self._partial_tmp_path):
            os.makedirs(self._partial_tmp_path)

        if self._is_debug:
            print("GoogleArtsCrawlerOptions:")
            print("==> url:{0}".format(self._url))
            print("==> webdriver_execute_path:{0}".format(os.path.abspath(self._webdriver_execute_path)))
            print("==> output :{0}".format(os.path.abspath(self._output_path)))
        return self

    def _find_local_webdriver(self) -> Optional[str]:
        """Return the first entry of the local webdriver dir when it is a file, else None."""
        if not os.path.isdir(_WEBDRIVER_DIR):
            return None
        webdriver_files = os.listdir(_WEBDRIVER_DIR)
        if len(webdriver_files) < 1:
            return None
        candidate = os.path.join(_WEBDRIVER_DIR, webdriver_files[0])
        if not os.path.isfile(candidate):
            return None
        print("==> webdriver has exist at {0}".format(candidate))
        return candidate

    def _download_webdriver(self) -> str:
        """Download (unless already cached) and extract the chromedriver archive; return the executable path."""
        if WINDOWS:
            os_name, archive = "Windows", "chromedriver_win32.zip"
        elif DARWIN:
            os_name, archive = "Mac OS", "chromedriver_mac64.zip"
        elif LINUX:
            os_name, archive = "Linux", "chromedriver_linux64.zip"
        else:
            raise Exception("GoogleArtsCrawlerOptions, unknown platform !")
        # HTTPS: the archive contains an executable that is run afterwards.
        webdriver_download_url = _CHROMEDRIVER_URL_TEMPLATE.format(version=_CHROMEDRIVER_VERSION,
                                                                   archive=archive)
        print("==> current operation system : {0}".format(os_name))
        print("==> prepare download webdriver : {0}".format(webdriver_download_url))

        webdriver_local_zip_filepath = os.path.join(_WEBDRIVER_DOWNLOAD_TMP, archive)
        if not os.path.isfile(webdriver_local_zip_filepath):
            # http = SOCKSProxyManager('socks5://localhost:1086/')
            http = PoolManager()
            response = http.request('GET', webdriver_download_url, preload_content=False)
            try:
                if response.status != 200:
                    raise Exception("GoogleArtsCrawlerOption , webdriver download failed with status {0}"
                                    .format(response.status))
                if not os.path.isdir(_WEBDRIVER_DOWNLOAD_TMP):
                    os.makedirs(_WEBDRIVER_DOWNLOAD_TMP)
                # Write to a temporary name first so an interrupted download is
                # never mistaken for a complete archive on the next run.
                partial_zip_filepath = webdriver_local_zip_filepath + ".part"
                with open(partial_zip_filepath, mode="wb") as fd:
                    while True:
                        data = response.read(65536)
                        if not data:
                            break
                        fd.write(data)
                os.replace(partial_zip_filepath, webdriver_local_zip_filepath)
            finally:
                response.release_conn()
            print("==> webdriver zip file download finished , location at : {0}".format(
                os.path.abspath(webdriver_local_zip_filepath)))
        else:
            print("==> webdriver zip file has existed at {0}".format(webdriver_local_zip_filepath))

        with ZipFile(webdriver_local_zip_filepath, 'r') as zipfile:
            zipfile.extractall(path=_WEBDRIVER_DIR)
        if self._need_clear_cache:
            shutil.rmtree(_WEBDRIVER_DOWNLOAD_TMP)
        return os.path.join(_WEBDRIVER_DIR, os.listdir(_WEBDRIVER_DIR)[0])

    def _configure_chrome_options(self) -> None:
        """Add the mobile emulation settings and command line switches to the chrome options."""
        if self._chrome_options is None:
            self._chrome_options = ChromeOptions()
        # self._chrome_options.binary_location = os.path.abspath(self._webdriver_execute_path)
        mobile_emulation = {
            "deviceMetrics": {"width": self._size, "height": self._size, "pixelRatio": 1.0},
            "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 "
                         "(KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
        self._chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
        for argument in ('--no-sandbox',
                         '--disable-dev-shm-usage',
                         '--disable-gpu',
                         "start-maximized",
                         "disable-infobars",
                         "--disable-extensions"):
            self._chrome_options.add_argument(argument)
        if not self._is_debug:
            self._chrome_options.add_argument("--headless")

    @property
    def url(self) -> str:
        return self._url

    def set_url(self, url) -> "GoogleArtsCrawlerOption":
        self._url = url
        return self

    @property
    def chrome_options(self) -> ChromeOptions:
        return self._chrome_options

    def set_chrome_options(self, chrome_options: ChromeOptions) -> "GoogleArtsCrawlerOption":
        self._chrome_options = chrome_options
        return self

    @property
    def webdriver_execute_path(self) -> str:
        return self._webdriver_execute_path

    def set_webdriver_execute_path(self, webdriver_execute_path: str) -> "GoogleArtsCrawlerOption":
        self._webdriver_execute_path = webdriver_execute_path
        return self

    @property
    def size(self) -> int:
        return self._size

    def set_size(self, size: int) -> "GoogleArtsCrawlerOption":
        self._size = size
        return self

    @property
    def init_delay(self) -> int:
        return self._init_delay_time

    def set_init_delay(self, init_delay_time: int) -> "GoogleArtsCrawlerOption":
        self._init_delay_time = init_delay_time
        return self

    @property
    def blob_loading_delay_time(self) -> int:
        return self._blob_loading_delay_time

    def set_blob_loading_delay_time(self, blob_loading_delay_time: int) -> "GoogleArtsCrawlerOption":
        self._blob_loading_delay_time = blob_loading_delay_time
        return self

    @property
    def output_path(self) -> str:
        return self._output_path

    def set_output_path(self, output_path: str) -> "GoogleArtsCrawlerOption":
        self._output_path = output_path
        return self

    @property
    def output_filename(self) -> str:
        return self._output_filename

    def set_output_filename(self, output_filename) -> "GoogleArtsCrawlerOption":
        self._output_filename = output_filename
        return self

    @property
    def partial_tmp_path(self):
        return self._partial_tmp_path

    def set_partial_tmp_path(self, partial_tmp_path: str) -> "GoogleArtsCrawlerOption":
        self._partial_tmp_path = partial_tmp_path
        return self

    @property
    def need_download_webdrive(self):
        return self._need_download_webdrive

    def set_need_download_webdrive(self, need_download_webdrive: bool) -> "GoogleArtsCrawlerOption":
        self._need_download_webdrive = need_download_webdrive
        return self

    @property
    def need_clear_cache(self):
        return self._need_clear_cache

    def set_need_clear_cache(self, need_clear_cache: bool) -> "GoogleArtsCrawlerOption":
        self._need_clear_cache = need_clear_cache
        return self

    @property
    def is_debug(self):
        return self._is_debug

    def set_debug(self, is_debug: bool) -> "GoogleArtsCrawlerOption":
        self._is_debug = is_debug
        return self


class GoogleArtsCrawlerProcess(object):
    def __init__(self, gaco: GoogleArtsCrawlerOption):
        """
        GoogleArtsCrawlerProcess

        Starts a Chrome webdriver session from the given options.

        Usage:
        ```
        GoogleArtsCrawlerProcess(gaco=GoogleArtsCrawlerOption()).process()
        ```

        :param gaco: GoogleArtsCrawlerOption
        """
        self._gaco = gaco
        print(os.path.abspath(self._gaco.webdriver_execute_path))
        self._browser = webdriver.Chrome(options=self._gaco.chrome_options,
                                         executable_path=self._gaco.webdriver_execute_path)
        # Directory holding the downloaded tiles; set by _generate_image.
        self._local_partial_tmp = None

    @property
    def gaco(self):
        return self._gaco

    def process(self):
        """Download and assemble the image, then remove the tile directory when `need_clear_cache` is set."""
        self._generate_image()
        if self._gaco.need_clear_cache:
            self._cleanup()

    # get blob content from blob:https://xxxxx
    def _get_blob_content(self, uri):
        """
        Fetch ``uri`` inside the browser and return its content as bytes.

        The page-side script requests the uri with XMLHttpRequest, base64-encodes
        the response and hands it back; on a request error it hands back the
        numeric status instead.

        :param uri: the `blob:` uri of an image.
        :return: the decoded bytes.
        :raises Exception: when the in-browser request reports an error status.
        """
        result = self._browser.execute_async_script("""
            var uri = arguments[0];
            var callback = arguments[1];
            var toBase64 = function(buffer){for(var r,n=new Uint8Array(buffer),t=n.length,a=new Uint8Array(4*Math.ceil(t/3)),i=new Uint8Array(64),o=0,c=0;64>c;++c)i[c]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".charCodeAt(c);for(c=0;t-t%3>c;c+=3,o+=4)r=n[c]<<16|n[c+1]<<8|n[c+2],a[o]=i[r>>18],a[o+1]=i[r>>12&63],a[o+2]=i[r>>6&63],a[o+3]=i[63&r];return t%3===1?(r=n[t-1],a[o]=i[r>>2],a[o+1]=i[r<<4&63],a[o+2]=61,a[o+3]=61):t%3===2&&(r=(n[t-2]<<8)+n[t-1],a[o]=i[r>>10],a[o+1]=i[r>>4&63],a[o+2]=i[r<<2&63],a[o+3]=61),new TextDecoder("ascii").decode(a)};
            var xhr = new XMLHttpRequest();
            xhr.responseType = 'arraybuffer';
            xhr.onload = function(){ callback(toBase64(xhr.response)) };
            xhr.onerror = function(){ callback(xhr.status) };
            xhr.open('GET', uri);
            xhr.send();
            """, uri)
        if isinstance(result, int):
            raise Exception("Request failed with status %s" % result)
        return base64.b64decode(result)

    def _pil_grid(self, images: list, max_horiz: int = np.iinfo(int).max) -> Image.Image:
        """
        Paste ``images`` onto one white canvas, row by row.

        Each row holds at most ``max_horiz`` images.  Every column is as wide as
        its widest image and every row as tall as its tallest image.  The last
        row may be incomplete.

        :param images: PIL images in row-major order.
        :param max_horiz: maximum number of images per row.
        :return: the combined RGB image.
        :raises ValueError: when ``images`` is empty or ``max_horiz`` is below 1.
        """
        n_images = len(images)
        if n_images < 1:
            raise ValueError("_pil_grid needs at least one image")
        if max_horiz < 1:
            raise ValueError("_pil_grid needs max_horiz >= 1")
        n_horiz = min(n_images, max_horiz)
        # Ceiling division so a partially filled last row still gets a slot.
        n_vert = -(-n_images // n_horiz)
        h_sizes, v_sizes = [0] * n_horiz, [0] * n_vert
        for index, im in enumerate(images):
            v, h = divmod(index, n_horiz)
            h_sizes[h] = max(h_sizes[h], im.size[0])
            v_sizes[v] = max(v_sizes[v], im.size[1])
        # Running totals give each column's left edge / each row's top edge;
        # tolist() turns the numpy integers into plain ints for Pillow.
        h_offsets = np.cumsum([0] + h_sizes).tolist()
        v_offsets = np.cumsum([0] + v_sizes).tolist()
        im_grid = Image.new('RGB', (h_offsets[-1], v_offsets[-1]), color='white')
        for index, im in enumerate(images):
            v, h = divmod(index, n_horiz)
            im_grid.paste(im, (h_offsets[h], v_offsets[v]))
        return im_grid

    def _cleanup(self):
        """Remove the directory holding the downloaded tiles, if one was created."""
        if self._local_partial_tmp is not None and os.path.isdir(self._local_partial_tmp):
            shutil.rmtree(self._local_partial_tmp)

    @staticmethod
    def _parse_tile_position(style):
        """Return the first two integers of a tile's `translate3d(...)` style as a tuple, or None."""
        style = style or ''
        style_end_index = style.find(');')
        # -4 removes "z" translation
        style = style[:style_end_index - 4]
        style = style.replace('transform: translate3d(', '')
        positions = list(map(int, re.findall(r'\d+', style)))
        if len(positions) < 2:
            return None
        return positions[0], positions[1]

    # Generate the tile images, then combine them into one complete picture.
    def _generate_image(self):
        """
        Download every image tile of the page and save the assembled picture.

        The tiles are written to `<partial_tmp_path>/<slugified title>/` and the
        result to `<output_path>/<output_filename or "<title>.jpg">`.  The browser
        session is always ended, and the opened tile images are always closed,
        whether or not the run succeeds.

        :raises Exception: when no tile, or fewer tiles than the detected grid
                           needs, could be downloaded.
        """
        columns = []
        rows = []
        pil_images = []
        try:
            try:
                print("==> staring request:{0}".format(self._gaco.url))
                self._browser.get(self._gaco.url)
                if self._gaco.init_delay is not None and self._gaco.init_delay > 0:
                    time.sleep(self._gaco.init_delay)
                blobs = self._browser.find_elements_by_tag_name('img')
                print("==> get total partial images:{0}".format(len(blobs) - 2))
                title = slugify(self._browser.title)

                # Recreate the tile folder.
                local_tmp_path = os.path.join(self._gaco.partial_tmp_path, title)
                self._local_partial_tmp = local_tmp_path
                if os.path.exists(local_tmp_path):
                    shutil.rmtree(local_tmp_path)
                os.makedirs(local_tmp_path)

                for index, blob in enumerate(blobs):
                    # The first three <img> elements are skipped.
                    # NOTE(review): presumably they are not tiles - confirm against the page.
                    if index < 3:
                        continue
                    # Get number of rows and columns
                    position = self._parse_tile_position(blob.get_attribute('style'))
                    if position is None:
                        # The positions are not available for this image - skip
                        continue
                    columns.append(position[0])
                    rows.append(position[1])

                    # Save blob to file
                    # NOTE(review): this waits for as long as the element has no src.
                    partial_image_src = blob.get_attribute('src')
                    while partial_image_src is None:
                        if self._gaco.blob_loading_delay_time and self._gaco.blob_loading_delay_time > 0:
                            time.sleep(self._gaco.blob_loading_delay_time)
                        partial_image_src = blob.get_attribute('src')
                    partial_image_content = self._get_blob_content(partial_image_src)
                    local_partial_filename = os.path.join(local_tmp_path, "{0}.jpg".format(index))
                    print("===> got blob content:{0} to {1}".format(partial_image_src, local_partial_filename))
                    with open(local_partial_filename, 'wb') as fd:
                        fd.write(partial_image_content)
                    # Create PIL objects list
                    pil_images.append(Image.open(local_partial_filename))
            finally:
                # quit() ends the session and the driver process; close() only closes the window.
                self._browser.quit()

            print("==> partial images has downloaded, total:{0}".format(len(pil_images)))
            n_columns = len(collections.Counter(columns).keys())
            n_rows = len(collections.Counter(rows).keys())
            if len(pil_images) < 1:
                raise Exception("GoogleArtsCrawlerProcess , no partial images found at {0}".format(self._gaco.url))
            if len(pil_images) < n_columns * n_rows:
                raise Exception("GoogleArtsCrawlerProcess , expected {0} partial images but got {1}".format(
                    n_columns * n_rows, len(pil_images)))

            # by default images are crawled in vertical direction
            # we re-arrange list to create horizontally sorted list
            inverted_pil_images = [pil_images[(column * n_rows) + row]
                                   for row in range(n_rows)
                                   for column in range(n_columns)]
            grid = self._pil_grid(inverted_pil_images, n_columns)
            output_filename = "{title}.jpg".format(title=title) \
                if self._gaco.output_filename is None else self._gaco.output_filename
            local_full_output_path = os.path.join(self._gaco.output_path, output_filename)
            grid.save(local_full_output_path)
            print("==> Image location: {0}".format(local_full_output_path))
        finally:
            # Release the tile file handles so the tile directory can be removed afterwards.
            for im in pil_images:
                im.close()


# NOTE(review): relative import of this module's own names; it only resolves when
# the file is loaded as part of a package - confirm it belongs in this file.
from . import GoogleArtsCrawlerProcess, GoogleArtsCrawlerOption