"""
    pdf2image is a light wrapper for the poppler-utils tools that can convert your
    PDFs into Pillow images.
"""

import os
import platform
import tempfile
import types
import shutil
import pathlib
from subprocess import Popen, PIPE
from PIL import Image

from .generators import uuid_generator, counter_generator, ThreadSafeGenerator

from .parsers import (
    parse_buffer_to_pgm,
    parse_buffer_to_ppm,
    parse_buffer_to_jpeg,
    parse_buffer_to_png,
)

from .exceptions import (
    PopplerNotInstalledError,
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError,
)

# Formats for which the "-transp" flag is passed when `transparent` is requested
TRANSPARENT_FILE_TYPES = ["png", "tiff"]
# pdfinfo fields whose value is converted to int by pdfinfo_from_path
PDFINFO_CONVERT_TO_INT = ["Pages"]


def convert_from_path(
    pdf_path,
    dpi=200,
    output_folder=None,
    first_page=None,
    last_page=None,
    fmt="ppm",
    jpegopt=None,
    thread_count=1,
    userpw=None,
    use_cropbox=False,
    strict=False,
    transparent=False,
    single_file=False,
    output_file=uuid_generator(),
    poppler_path=None,
    grayscale=False,
    size=None,
    paths_only=False,
    use_pdftocairo=False,
):
    """
        Description: Convert a PDF file to a list of images (or image paths)
        Parameters:
            pdf_path -> Path to the PDF that you want to convert
            dpi -> Image quality in DPI (default 200)
            output_folder -> Write the resulting images to a folder (instead of directly in memory)
            first_page -> First page to process
            last_page -> Last page to process before stopping
            fmt -> Output image format
            jpegopt -> jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format)
            thread_count -> How many threads we are allowed to spawn for processing
            userpw -> PDF's password
            use_cropbox -> Use cropbox instead of mediabox
            strict -> When a Syntax Error is thrown, it will be raised as an Exception
            transparent -> Output with a transparent background instead of a white one.
            single_file -> Uses the -singlefile option from pdftoppm/pdftocairo
            output_file -> What is the output filename or generator
            poppler_path -> Path to look for poppler binaries
            grayscale -> Output grayscale image(s)
            size -> Size of the resulting image(s), uses the Pillow (width, height) standard
            paths_only -> Don't load image(s), return paths instead (requires output_folder)
            use_pdftocairo -> Use pdftocairo instead of pdftoppm, may help performance
        Returns:
            A list of images (or of paths when paths_only is set); an empty list
            when first_page is greater than last_page
        Raises:
            PDFInfoNotInstalledError, PDFPageCountError -> Raised by pdfinfo_from_path
            PDFSyntaxError -> When `strict` is set and poppler reports a Syntax Error
    """

    if use_pdftocairo and fmt == "ppm":
        fmt = "png"

    # We make sure that if passed arguments are Path objects, they're converted to strings
    if isinstance(pdf_path, pathlib.PurePath):
        pdf_path = pdf_path.as_posix()

    if isinstance(output_folder, pathlib.PurePath):
        output_folder = output_folder.as_posix()

    if isinstance(poppler_path, pathlib.PurePath):
        poppler_path = poppler_path.as_posix()

    page_count = pdfinfo_from_path(pdf_path, userpw, poppler_path=poppler_path)["Pages"]

    # We start by getting the output format, the buffer processing function and if we need pdftocairo
    parsed_fmt, final_extension, parse_buffer_func, use_pdfcairo_format = _parse_format(
        fmt, grayscale
    )

    # We use pdftocairo if the format requires it OR we need a transparent output
    use_pdfcairo = (
        use_pdftocairo
        or use_pdfcairo_format
        or (transparent and parsed_fmt in TRANSPARENT_FILE_TYPES)
    )

    command_name = "pdftocairo" if use_pdfcairo else "pdftoppm"

    poppler_version = _get_poppler_version_info(command_name, poppler_path=poppler_path)

    # -jpegopt is only passed for poppler newer than 0.57. The major number has
    # to be compared as well: year-based releases (e.g. 20.08) have a small
    # minor number but do support the option.
    if poppler_version < (0, 58):
        jpegopt = None

    # If output_file isn't a generator, it will be turned into one
    if not isinstance(output_file, (types.GeneratorType, ThreadSafeGenerator)):
        if single_file:
            output_file = iter([output_file])
        else:
            output_file = counter_generator(output_file)

    if thread_count < 1:
        thread_count = 1

    if first_page is None:
        first_page = 1

    if last_page is None or last_page > page_count:
        last_page = page_count

    if first_page > last_page:
        return []

    auto_temp_dir = False
    if output_folder is None and use_pdfcairo:
        auto_temp_dir = True
        output_folder = tempfile.mkdtemp()

    succeeded = False
    try:
        # Recalculate page count based on first and last page
        page_count = last_page - first_page + 1

        if thread_count > page_count:
            thread_count = page_count

        # The executable and the environment are the same for every process
        executable = _get_command_path(command_name, poppler_path)
        env = _get_poppler_env(poppler_path)

        remainder = page_count % thread_count
        current_page = first_page
        processes = []
        for _ in range(thread_count):
            thread_output_file = next(output_file)

            # Get the number of pages the thread will be processing; the first
            # `remainder` processes each take one extra page
            thread_page_count = page_count // thread_count + int(remainder > 0)
            # Build the command accordingly
            args = [executable] + _build_command(
                ["-r", str(dpi), pdf_path],
                output_folder,
                current_page,
                current_page + thread_page_count - 1,
                parsed_fmt,
                jpegopt,
                thread_output_file,
                userpw,
                use_cropbox,
                transparent,
                single_file,
                grayscale,
                size,
            )

            # Update page values
            current_page = current_page + thread_page_count
            remainder -= int(remainder > 0)
            # Spawn the process and save its uuid
            processes.append(
                (thread_output_file, Popen(args, env=env, stdout=PIPE, stderr=PIPE))
            )

        images = []

        for uid, proc in processes:
            data, err = proc.communicate()

            if b"Syntax Error" in err and strict:
                raise PDFSyntaxError(err.decode("utf8", "ignore"))

            if output_folder is not None:
                images += _load_from_output_folder(
                    output_folder,
                    uid,
                    final_extension,
                    paths_only,
                    in_memory=auto_temp_dir,
                )
            else:
                images += parse_buffer_func(data)

        succeeded = True
    finally:
        # Always remove the temporary folder we created, even when an exception
        # is propagating. Cleanup errors are only ignored on the failure path so
        # that they cannot mask the original exception.
        if auto_temp_dir:
            shutil.rmtree(output_folder, ignore_errors=not succeeded)

    return images


def convert_from_bytes(
    pdf_file,
    dpi=200,
    output_folder=None,
    first_page=None,
    last_page=None,
    fmt="ppm",
    jpegopt=None,
    thread_count=1,
    userpw=None,
    use_cropbox=False,
    strict=False,
    transparent=False,
    single_file=False,
    output_file=uuid_generator(),
    poppler_path=None,
    grayscale=False,
    size=None,
    paths_only=False,
    use_pdftocairo=False,
):
    """
        Description: Convert PDF bytes to images by writing them to a temporary
        file and delegating to convert_from_path
        Parameters:
            pdf_file -> Bytes representing the PDF file
            dpi -> Image quality in DPI
            output_folder -> Write the resulting images to a folder (instead of directly in memory)
            first_page -> First page to process
            last_page -> Last page to process before stopping
            fmt -> Output image format
            jpegopt -> jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format)
            thread_count -> How many threads we are allowed to spawn for processing
            userpw -> PDF's password
            use_cropbox -> Use cropbox instead of mediabox
            strict -> When a Syntax Error is thrown, it will be raised as an Exception
            transparent -> Output with a transparent background instead of a white one.
            single_file -> Uses the -singlefile option from pdftoppm/pdftocairo
            output_file -> What is the output filename or generator
            poppler_path -> Path to look for poppler binaries
            grayscale -> Output grayscale image(s)
            size -> Size of the resulting image(s), uses the Pillow (width, height) standard
            paths_only -> Don't load image(s), return paths instead (requires output_folder)
            use_pdftocairo -> Use pdftocairo instead of pdftoppm, may help performance
    """

    fh, temp_filename = tempfile.mkstemp()
    try:
        with open(temp_filename, "wb") as f:
            f.write(pdf_file)
            f.flush()
            return convert_from_path(
                f.name,
                dpi=dpi,
                output_folder=output_folder,
                first_page=first_page,
                last_page=last_page,
                fmt=fmt,
                jpegopt=jpegopt,
                thread_count=thread_count,
                userpw=userpw,
                use_cropbox=use_cropbox,
                strict=strict,
                transparent=transparent,
                single_file=single_file,
                output_file=output_file,
                poppler_path=poppler_path,
                grayscale=grayscale,
                size=size,
                paths_only=paths_only,
                use_pdftocairo=use_pdftocairo,
            )
    finally:
        # Release the descriptor returned by mkstemp and delete the temporary PDF
        os.close(fh)
        os.remove(temp_filename)


def _build_command(
    args,
    output_folder,
    first_page,
    last_page,
    fmt,
    jpegopt,
    output_file,
    userpw,
    use_cropbox,
    transparent,
    single_file,
    grayscale,
    size,
):
    """
        Description: Append the poppler command line options matching the given
        settings to `args` (modified in place) and return it
        Raises:
            ValueError -> When `size` is not None, a number, or a tuple of length 1 or 2
    """

    if use_cropbox:
        args.append("-cropbox")

    if transparent and fmt in TRANSPARENT_FILE_TYPES:
        args.append("-transp")

    if first_page is not None:
        args.extend(["-f", str(first_page)])

    if last_page is not None:
        args.extend(["-l", str(last_page)])

    # pgm/ppm get no format flag
    if fmt not in ["pgm", "ppm"]:
        args.append("-" + fmt)

    if fmt in ["jpeg", "jpg"] and jpegopt:
        args.extend(["-jpegopt", _parse_jpegopt(jpegopt)])

    if single_file:
        args.append("-singlefile")

    if output_folder is not None:
        args.append(os.path.join(output_folder, output_file))

    if userpw is not None:
        args.extend(["-upw", userpw])

    if grayscale:
        args.append("-gray")

    if size is None:
        pass
    elif isinstance(size, tuple) and len(size) == 2:
        # A None component is passed to poppler as -1
        width, height = size
        args.extend(["-scale-to-x", str(int(width)) if width is not None else str(-1)])
        args.extend(["-scale-to-y", str(int(height)) if height is not None else str(-1)])
    elif isinstance(size, tuple) and len(size) == 1:
        args.extend(["-scale-to", str(int(size[0]))])
    elif isinstance(size, (int, float)):
        args.extend(["-scale-to", str(int(size))])
    else:
        raise ValueError("Size {} is not a tuple or an integer".format(size))

    return args


def _parse_format(fmt, grayscale=False):
    """
        Description: Normalize the requested output format
        Returns:
            A tuple (poppler format, file extension, buffer parsing function,
            whether pdftocairo is required). Unknown or empty formats fall back
            to ppm.
    """

    fmt = fmt.lower()
    # startswith is safe on an empty string, unlike fmt[0]
    if fmt.startswith("."):
        fmt = fmt[1:]
    if fmt in ("jpeg", "jpg"):
        return "jpeg", "jpg", parse_buffer_to_jpeg, False
    if fmt == "png":
        return "png", "png", parse_buffer_to_png, False
    if fmt in ("tif", "tiff"):
        return "tiff", "tif", None, True
    if fmt == "ppm" and grayscale:
        return "pgm", "pgm", parse_buffer_to_pgm, False
    # Unable to parse the format so we'll use the default
    return "ppm", "ppm", parse_buffer_to_ppm, False


def _parse_jpegopt(jpegopt):
    """
        Description: Serialize the jpegopt dict as comma separated `key=value`
        pairs, with True/False written as y/n
    """

    parts = []
    for k, v in jpegopt.items():
        if v is True:
            v = "y"
        if v is False:
            v = "n"
        parts.append("{}={}".format(k, v))
    return ",".join(parts)


def _get_command_path(command, poppler_path=None):
    """
        Description: Return the executable name for `command`, with ".exe"
        appended on Windows and prefixed with `poppler_path` when one is given
    """

    if platform.system() == "Windows":
        command = command + ".exe"

    if poppler_path is not None:
        command = os.path.join(poppler_path, command)

    return command


def _get_poppler_env(poppler_path=None):
    """
        Description: Return a copy of the current environment with
        `poppler_path` prepended to LD_LIBRARY_PATH when one is given
    """

    env = os.environ.copy()
    if poppler_path is not None:
        env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
    return env


def _get_poppler_version_info(command, poppler_path=None):
    """
        Description: Run `command -v` and return the (major, minor) version
        parsed from the last word of the first line written to stderr.
        Returns (0, 17) when the version cannot be parsed.
    """

    command = [_get_command_path(command, poppler_path), "-v"]

    proc = Popen(command, env=_get_poppler_env(poppler_path), stdout=PIPE, stderr=PIPE)

    out, err = proc.communicate()

    try:
        # TODO: Make this more robust
        version = err.decode("utf8", "ignore").split("\n")[0].split(" ")[-1].split(".")
        return int(version[0]), int(version[1])
    except (IndexError, ValueError):
        # Lowest version that includes pdftocairo (2011)
        return 0, 17


def _get_poppler_version(command, poppler_path=None):
    """
        Description: Return only the minor version number of `command`.
        Kept for backward compatibility; prefer _get_poppler_version_info,
        which also exposes the major number.
    """

    return _get_poppler_version_info(command, poppler_path)[1]


def pdfinfo_from_path(pdf_path, userpw=None, poppler_path=None):
    """
        Description: Run pdfinfo on a PDF file and return its output as a dict
        Parameters:
            pdf_path -> Path to the PDF that you want to inspect
            userpw -> PDF's password
            poppler_path -> Path to look for poppler binaries
        Returns:
            A dict mapping each pdfinfo field to its value; "Pages" is an int
        Raises:
            PDFInfoNotInstalledError -> When pdfinfo cannot be executed
            PDFPageCountError -> When the page count is missing or not a number
    """

    # Defined up front so the ValueError handler can always report it, even if
    # the failure happens before pdfinfo produced any output
    err = b""
    try:
        command = [_get_command_path("pdfinfo", poppler_path), pdf_path]

        if userpw is not None:
            command.extend(["-upw", userpw])

        # Add poppler path to LD_LIBRARY_PATH
        env = _get_poppler_env(poppler_path)
        proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE)

        out, err = proc.communicate()

        d = {}
        for field in out.decode("utf8", "ignore").split("\n"):
            # Split on the first colon only; values may themselves contain colons
            key, _, value = field.partition(":")
            if key != "":
                d[key] = (
                    int(value.strip())
                    if key in PDFINFO_CONVERT_TO_INT
                    else value.strip()
                )

        if "Pages" not in d:
            raise ValueError

        return d

    except OSError:
        raise PDFInfoNotInstalledError(
            "Unable to get page count. Is poppler installed and in PATH?"
        )
    except ValueError:
        raise PDFPageCountError(
            "Unable to get page count.\n%s" % err.decode("utf8", "ignore")
        )


def pdfinfo_from_bytes(pdf_file, userpw=None, poppler_path=None):
    """
        Description: Write PDF bytes to a temporary file and return the result
        of pdfinfo_from_path for it
        Parameters:
            pdf_file -> Bytes representing the PDF file
            userpw -> PDF's password
            poppler_path -> Path to look for poppler binaries
    """

    fh, temp_filename = tempfile.mkstemp()
    try:
        with open(temp_filename, "wb") as f:
            f.write(pdf_file)
            f.flush()
        return pdfinfo_from_path(temp_filename, userpw=userpw, poppler_path=poppler_path)
    finally:
        # Release the descriptor returned by mkstemp and delete the temporary PDF
        os.close(fh)
        os.remove(temp_filename)


def _load_from_output_folder(
    output_folder, output_file, ext, paths_only, in_memory=False
):
    """
        Description: Collect, in sorted filename order, the files of
        `output_folder` whose name starts with `output_file` and whose
        extension is `ext`
        Returns:
            A list of paths when `paths_only` is set, otherwise a list of opened
            images; with `in_memory` each image is loaded right after opening
    """

    images = []
    for f in sorted(os.listdir(output_folder)):
        if f.startswith(output_file) and f.split(".")[-1] == ext:
            if paths_only:
                images.append(os.path.join(output_folder, f))
            else:
                images.append(Image.open(os.path.join(output_folder, f)))
                if in_memory:
                    images[-1].load()
    return images