#!/usr/bin/env python
"""
A utility to fetch urls from the resume
"""

import logging

# for fetching URLs from pdf
from pdfminer.pdfpage import PDFPage

logging.basicConfig(level=logging.DEBUG)


def _resolve(pdf_object):
    """Return the target of an indirect PDF reference, or the object itself.

    pdfminer hands back either the concrete value or a reference object that
    exposes ``resolve()``; anything without that method is returned unchanged.
    """
    resolver = getattr(pdf_object, 'resolve', None)
    return resolver() if callable(resolver) else pdf_object


def _annotation_uri(link_object):
    """Return the URI carried by one annotation, or None if it has none."""
    annotation = _resolve(link_object)
    # A dangling reference may resolve to something that is not a dictionary.
    if not isinstance(annotation, dict):
        return None

    # Only annotations with an action dictionary ('A') can carry a URI;
    # other annotation kinds simply have no such entry and are skipped.
    action = _resolve(annotation.get('A'))
    if not isinstance(action, dict):
        return None

    return _resolve(action.get('URI')) or None


def fetch_pdf_urls(file_name):
    """
    Utility Function to fetch URLs from pdf.

    Walks every page of the PDF and collects the URI of each annotation that
    has a URI action. Annotations without such an action are ignored. The
    URI values are returned as pdfminer provides them (no decoding applied).

    Params: file_name type: string (path to the PDF file)
    returns: list of URLs; an empty list when the file has no links or when
             any error occurs (the error is logged, never raised)
    """
    try:
        links = []
        # The context manager closes the file even if parsing fails midway.
        with open(file_name, 'rb') as file_pointer:
            # Setting up pdf document and fetching URLs page by page
            for page in PDFPage.get_pages(file_pointer):
                if 'Annots' not in page.attrs:
                    continue

                # Due to implementation of pdfminer the link_object_list can
                # either be the list directly or a PDF Object reference
                link_object_list = _resolve(page.attrs['Annots'])
                if not isinstance(link_object_list, list):
                    continue

                for link_object in link_object_list:
                    uri = _annotation_uri(link_object)
                    if uri:
                        links.append(uri)
        return links
    except Exception as exception_instance:
        logging.error('Error while fetching URLs : %s', exception_instance)
        # Keep the documented return type on failure.
        return []