#!/usr/bin/env python ################################################################################################### # # Copyright (c) 2015, Armin Buescher (armin.buescher@googlemail.com) # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # ################################################################################################### # # File: iocp.py # Description: IOC Parser is a tool to extract indicators of compromise from security reports # in PDF format. 
# Usage: iocp.py [-h] [-p INI] [-f FORMAT] PDF
# Author: Armin Buescher (@armbues)
# Contributors: Angelo Dell'Aera (@angelodellaera)
# Thanks to: Jose Ramon Palanco
#            Koen Van Impe (@cudeso)
#
###################################################################################################

import os
import sys
import fnmatch
import glob
import re

try:
    import configparser as ConfigParser
except ImportError:
    import ConfigParser

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

# BytesIO is used for binary payloads fetched over HTTP (io exists on Py2.6+).
from io import BytesIO

# Py2/Py3 compatibility: 'unicode' does not exist on Python 3; parse_html()
# relies on it, so alias it to str where it is missing.
try:
    unicode
except NameError:
    unicode = str

# Import optional third-party libraries.  Each successful import registers a
# capability name in IMPORTS so __init__ can validate the requested backend.
IMPORTS = []
try:
    from PyPDF2 import PdfFileReader
    IMPORTS.append('pypdf2')
except ImportError:
    pass
try:
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.converter import TextConverter
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.layout import LAParams
    IMPORTS.append('pdfminer')
except ImportError:
    pass
try:
    from bs4 import BeautifulSoup
    IMPORTS.append('beautifulsoup')
except ImportError:
    pass
try:
    import requests
    IMPORTS.append('requests')
except ImportError:
    pass

# Import project source files
import iocp
from iocp import Output


class Parser(object):
    """Extract indicators of compromise (IOCs) from documents.

    A Parser is configured with a set of regex patterns (from an INI file),
    per-type whitelists, an input format (pdf/txt/html), and an output
    handler.  Call parse() with a file path, directory path or URL.
    """

    # Kept for backward compatibility with code that referenced the class
    # attributes; every instance gets its own dicts in __init__ so state is
    # no longer shared between Parser instances.
    patterns = {}
    defang = {}

    def __init__(self, patterns_ini=None, input_format='pdf', dedup=False,
                 library='pdfminer', output_format='csv', output_handler=None):
        """Configure patterns, whitelists, input parser and output handler.

        :param patterns_ini:   path to the patterns INI file
                               (default: <basedir>/data/patterns.ini)
        :param input_format:   'pdf', 'txt' or 'html'
        :param dedup:          suppress duplicate (type, value) matches
        :param library:        PDF backend: 'pdfminer' or 'pypdf2'
        :param output_format:  name passed to Output.getHandler()
        :param output_handler: pre-built handler; overrides output_format
        :raises NotImplementedError: unknown input_format
        :raises ImportError: requested parser library is not installed
        """
        basedir = iocp.get_basedir()
        # Instance-level state: the original class-level dicts would be
        # shared (and accumulated) across all Parser instances.
        self.patterns = {}
        self.defang = {}
        self.dedup_store = set()
        if patterns_ini is None:
            patterns_ini = os.path.join(basedir, 'data/patterns.ini')
        self.load_patterns(patterns_ini)
        wldir = os.path.join(basedir, 'data/whitelists')
        self.whitelist = self.load_whitelists(wldir)
        self.dedup = dedup
        if output_handler:
            self.handler = output_handler
        else:
            self.handler = Output.getHandler(output_format)
        self.ext_filter = "*." + input_format
        parser_format = "parse_" + input_format
        try:
            self.parser_func = getattr(self, parser_format)
        except AttributeError:
            e = 'Selected parser format is not supported: %s' % (input_format)
            raise NotImplementedError(e)
        self.library = library
        if input_format == 'pdf':
            if library not in IMPORTS:
                e = 'Selected PDF parser library not found: %s' % (library)
                raise ImportError(e)
        elif input_format == 'html':
            if 'beautifulsoup' not in IMPORTS:
                e = 'HTML parser library not found: BeautifulSoup'
                raise ImportError(e)

    def load_patterns(self, fpath):
        """Load indicator regex patterns (and defang flags) from an INI file.

        Each section name is an indicator type; its 'pattern' option is
        compiled into self.patterns and an optional 'defang' option marks
        the type for de-fanging in parse_page().
        """
        config = ConfigParser.ConfigParser()
        with open(fpath) as f:
            # read_file() replaces readfp(), which is deprecated on Py3 and
            # removed in 3.12; fall back for old Python 2 ConfigParser.
            if hasattr(config, 'read_file'):
                config.read_file(f)
            else:
                config.readfp(f)
        for ind_type in config.sections():
            try:
                ind_pattern = config.get(ind_type, 'pattern')
            except ConfigParser.Error:
                # Section without a usable 'pattern' option -> skip it.
                continue
            if ind_pattern:
                self.patterns[ind_type] = re.compile(ind_pattern)
            try:
                ind_defang = config.get(ind_type, 'defang')
            except ConfigParser.Error:
                continue
            if ind_defang:
                self.defang[ind_type] = True

    def load_whitelists(self, fpath):
        """Return {indicator_type: [compiled regex, ...]} from whitelist_*.ini
        files found in directory *fpath*."""
        whitelist = {}
        searchdir = os.path.join(fpath, "whitelist_*.ini")
        for wl_path in glob.glob(searchdir):
            # "whitelist_URL.ini" -> type "URL"
            t = os.path.splitext(os.path.split(wl_path)[1])[0].split('_', 1)[1]
            # 'with' closes the handle (the original leaked it).
            with open(wl_path) as f:
                patterns = [line.strip() for line in f]
            whitelist[t] = [re.compile(p) for p in patterns]
        return whitelist

    def is_whitelisted(self, ind_match, ind_type):
        """Return True if *ind_match* matches any whitelist regex for
        *ind_type*; types without a whitelist are never whitelisted."""
        try:
            for w in self.whitelist[ind_type]:
                if w.findall(ind_match):
                    return True
        except KeyError:
            pass
        return False

    def parse_page(self, fpath, data, page_num):
        """Run every indicator regex over *data* (one page of text) and emit
        matches through the output handler, applying whitelist, defang and
        optional de-duplication."""
        for ind_type, ind_regex in self.patterns.items():
            matches = ind_regex.findall(data)
            for ind_match in matches:
                # A regex with groups returns tuples; take the first group.
                if isinstance(ind_match, tuple):
                    ind_match = ind_match[0]
                if self.is_whitelisted(ind_match, ind_type):
                    continue
                if ind_type in self.defang:
                    # Re-fang indicators written as e.g. "example[.]com".
                    ind_match = re.sub(r'\[\.\]', '.', ind_match)
                if self.dedup:
                    if (ind_type, ind_match) in self.dedup_store:
                        continue
                    self.dedup_store.add((ind_type, ind_match))
                self.handler.print_match(fpath, page_num, ind_type, ind_match)

    def parse_pdf_pypdf2(self, f, fpath):
        """Extract and scan text from PDF file object *f* using PyPDF2."""
        try:
            pdf = PdfFileReader(f, strict=False)
            if self.dedup:
                self.dedup_store = set()
            self.handler.print_header(fpath)
            page_num = 0
            for page in pdf.pages:
                page_num += 1
                data = page.extractText()
                self.parse_page(fpath, data, page_num)
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise

    def parse_pdf_pdfminer(self, f, fpath):
        """Extract and scan text from PDF file object *f* using pdfminer."""
        try:
            laparams = LAParams()
            laparams.all_texts = True
            rsrcmgr = PDFResourceManager()
            pagenos = set()
            if self.dedup:
                self.dedup_store = set()
            self.handler.print_header(fpath)
            page_num = 0
            for page in PDFPage.get_pages(f, pagenos, check_extractable=True):
                page_num += 1
                retstr = StringIO()
                device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                       laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                data = retstr.getvalue()
                # Release per-page resources (the original never closed the
                # TextConverter device).
                device.close()
                retstr.close()
                self.parse_page(fpath, data, page_num)
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise

    def parse_pdf(self, f, fpath):
        """Dispatch PDF parsing to the backend selected at construction."""
        parser_format = "parse_pdf_" + self.library
        try:
            self.parser_func = getattr(self, parser_format)
        except AttributeError:
            e = 'Selected PDF parser library is not supported: %s' % (self.library)
            raise NotImplementedError(e)
        self.parser_func(f, fpath)

    def parse_txt(self, f, fpath):
        """Scan the whole content of text file object *f* as a single page."""
        try:
            if self.dedup:
                self.dedup_store = set()
            data = f.read()
            self.handler.print_header(fpath)
            self.parse_page(fpath, data, 1)
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise

    def parse_html(self, f, fpath):
        """Strip markup from HTML file object *f* with BeautifulSoup and scan
        the visible text as a single page."""
        try:
            if self.dedup:
                self.dedup_store = set()
            data = f.read()
            # NOTE(review): no explicit parser is passed to BeautifulSoup, so
            # the best available one is auto-selected (emits a warning on
            # modern bs4); changing it could alter extraction results.
            soup = BeautifulSoup(data)
            html = soup.findAll(text=True)
            text = u''
            for elem in html:
                # Skip non-visible elements and HTML comments.
                if elem.parent.name in ['style', 'script', '[document]',
                                        'head', 'title']:
                    continue
                elif re.match('<!--.*-->', unicode(elem)):
                    continue
                else:
                    text += unicode(elem)
            self.handler.print_header(fpath)
            self.parse_page(fpath, text, 1)
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise

    def parse(self, path):
        """Parse a URL, a single file, or every matching file in a directory.

        Errors (other than KeyboardInterrupt/SystemExit) are reported through
        the output handler rather than raised.
        """
        try:
            if path.startswith('http://') or path.startswith('https://'):
                if 'requests' not in IMPORTS:
                    e = 'HTTP library not found: requests'
                    raise ImportError(e)
                headers = {'User-Agent': 'Mozilla/5.0 Gecko Firefox'}
                r = requests.get(path, headers=headers)
                r.raise_for_status()
                # r.content is bytes; BytesIO keeps the URL path consistent
                # with local files (opened 'rb') and avoids the Py3
                # TypeError that io.StringIO(bytes) raises.
                f = BytesIO(r.content)
                self.parser_func(f, path)
                return
            elif os.path.isfile(path):
                with open(path, 'rb') as f:
                    self.parser_func(f, path)
                return
            elif os.path.isdir(path):
                for walk_root, walk_dirs, walk_files in os.walk(path):
                    for walk_file in fnmatch.filter(walk_files, self.ext_filter):
                        fpath = os.path.join(walk_root, walk_file)
                        with open(fpath, 'rb') as f:
                            self.parser_func(f, fpath)
                return
            e = 'File path is not a file, directory or URL: %s' % (path)
            raise IOError(e)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(path, e)