#!/usr/bin/env python # -*- coding: utf-8 -*- #Libraries import xlsxwriter import json #Analyze metadata pdf import PyPDF2 from PyPDF2 import PdfFileReader #Analyze metadata docx import docx import time import wget import os from modules.createdir import * #Global var's metadata_files=[] meta_author_array = [] meta_creator_array = [] meta_producer_array = [] #Count's count_pdf = 0 count_word =0 count_others = 0 ####### FUNCTION AnalyzeMetadata pdf ###### def Analyze_Metadata_pdf(filename): ####### FUNCTION AnalyzeMetadata ###### pdfFile = PdfFileReader(file(filename, 'rb')) metadata = pdfFile.getDocumentInfo() print ' - Document: ' + str(filename) for meta in metadata: value=(metadata[meta]) print ' - ' + meta + ':' + metadata[meta] if meta == "/Author": if value not in meta_author_array: meta_author_array.append(value) elif meta =="/Producer": if value not in meta_producer_array: meta_producer_array.append(value) elif meta == "/Creator": if value not in meta_creator_array: meta_creator_array.append(value) #Group the different arrays in one with all metadata metadata_files.append(meta_author_array) metadata_files.append(meta_producer_array) metadata_files.append(meta_creator_array) ####### FUNCTION AnalyzeMetadata doc ###### def Analyze_Metadata_doc(fileName): #Open file docxFile = docx.Document(file(fileName,'rb')) #Get the structure docxInfo= docxFile.core_properties #Print the metadata which it wants to display attribute = ["author", "category", "comments", "content_status", "created", "identifier", "keywords", "language", "last_modified_by", "last_printed", "modified", "revision", "subject", "title", "version"] #run the list in a for loop to print the value of each metadata print ' - Document: ' + str(fileName) for meta in attribute: metadata = getattr(docxInfo,meta) value = metadata([meta]) if metadata: if meta =="/Author": if value not in meta_author_array: meta_author_array.append(value) elif meta == "/Producer": if value not in meta_producer_array: meta_producer_array.append(value) elif meta =="/Creator": if value not in meta_creator_array: meta_creator_array.append(value) #Separate the values unicode and time date if isinstance(metadata, unicode): print " \n\t" + str(meta)+": " + str(metadata) elif isinstance(metadata, datetime.datetime): print " \n\t" + str(meta)+": " + str(metadata) ####### FUNCTION CATEGORY FILE TO EXTRACT METADATA ###### def Analyze_Metadata(filename): global count_pdf global count_word global count_word try: #Verify the ext to know the type of the file to diference of the analysis ext=filename.lower().rsplit(".",1)[-1] if ext =="pdf" or ext == "PDF": count_pdf += 1 #call the function analyze metadata pdf Analyze_Metadata_pdf(filename) elif ((ext =="doc") or (ext=="docx")): count_word += 1 Analyze_Metadata_doc(filename) else: count_word += 1 print "\nIt can't obtain the metadata. Skip the next!\n" except Exception as e: print e ####### FUNCTION DOWNLOADFILES ###### def Display_Export_Metadata(data,output,target): try: print "-----------------------------------------------" print "METADATA RESULTS BY CATEGORY" print "\n################################################\n" total_indexed =int(count_pdf + count_word + count_others) print "Documents indexed found: "+ str(total_indexed) print "\n PDF files: " + str(count_pdf) print "\n DOC/x files: " + str(count_word) print "\n Others files: " + str(count_others) print "\nUsers - Documents Author" for user in data[0]: print " " + str(user).encode('utf8') print "\n##################################################\n" print "Producer" #print "Producer"+ str(data[1]) for producer in data[1]: print "\t " + str(producer).encode('utf8') print "\n################################################\n" #print "Creator"+ str(data[2]) print "Creator" for creator in data[2]: print " " + str(creator).encode('utf8') print "\n################################################\n" print "-----------------------------------------------" # Start from the first cell. Rows and columns are zero indexed. row = 0 col = 0 #json if (output == "js"): print "Exporting the results in a metadata-json" with open("metadata.json", 'w') as f: json.dump(data, f) #excel if (output =="xl"): print "Exporting the results in an excel" # Create a workbook and add a worksheet. workbook = xlsxwriter.Workbook('metadata.xlsx') worksheet = workbook.add_worksheet() worksheet.write(row, col, "Users") worksheet.write(row, col+1, "Producer") worksheet.write(row, col+2, "Creator") row+=1 # Iterate over the data and write it out row by row. for users in meta_author_array: col = 0 worksheet.write(row, col, users) row += 1 #update row row=1 for producer in meta_producer_array: col = 1 worksheet.write(row, col, producer) row += 1 #update row row=1 for creator in meta_creator_array: col = 2 worksheet.write(row, col, creator) row += 1 #close the excel workbook.close() os.system ('mv metadata.xlsx '+str(target) +'/metadata.xlsx') except Exception as e: print str(e) ####### FUNCTION DOWNLOADFILES ###### def Downloadfiles(urls_metadata,output,target): path = None try: filename = None print "\nDo you like downloading these files to analyze metadata(Y/N)?" #Convert to lower the input resp = raw_input().lower() if (resp == 'n'): print "Exiting" exit(1) if ((resp != 'y') and (resp != 'n')): print "The option is not valided. Please, try again it" if (resp =='y'): path = str(target) + '/temp' for url in urls_metadata: filename = wget.download(url,str(path)) Analyze_Metadata(filename) time.sleep(3) #Delete temp folder os.system('rm -r ' + str(path)) Display_Export_Metadata(metadata_files,output,target) except Exception as e: print str(e)