""" A standalone script to download and parse edgar 10k MDA section """ import argparse import csv import concurrent.futures import itertools import os import time import re import unicodedata from collections import namedtuple from functools import wraps from glob import glob import requests from bs4 import BeautifulSoup SEC_GOV_URL = 'https://www.sec.gov/Archives' FORM_INDEX_URL = os.path.join( SEC_GOV_URL, 'edgar', 'full-index', '{}', 'QTR{}', 'form.idx') # Used to combine form 10k index files. Adds URL column for lookup INDEX_HEADERS = ["Form Type", "Company Name", "CIK", "Date Filed", "File Name", "Url"] def create_parser(): """Argument Parser""" parser = argparse.ArgumentParser() parser.add_argument('-s', '--start_year', type=int, required=True, help="year to start") parser.add_argument('-e', '--end_year', type=int, required=True, help="year to end") parser.add_argument('-q', '--quarters', type=int, nargs="+", default=[1, 2, 3, 4], help="quarters to download for start to end years") parser.add_argument('-d', '--data_dir', type=str, default="./data", help="path to save data") parser.add_argument('--overwrite', action="store_true", help="If True, overwrites downloads and processed files.") parser.add_argument('--debug', action="store_true", help="Debug mode") return parser def main(): # Parse arguments parser = create_parser() args = parser.parse_args() # Download indices index_dir = os.path.join(args.data_dir, "index") download_indices(args.start_year, args.end_year, args.quarters, index_dir, args.overwrite) # Combine indices to csv combine_indices_to_csv(index_dir) # Download forms form_dir = os.path.join(args.data_dir, "form10k") download_forms(index_dir, form_dir, args.overwrite, args.debug) # Normalize forms parsed_form_dir = os.path.join(args.data_dir, "form10k.parsed") parse_html_multiprocess(form_dir, parsed_form_dir, args.overwrite) # Parse MDA mda_dir = os.path.join(args.data_dir, "mda") parse_mda_multiprocess(parsed_form_dir, mda_dir, args.overwrite) def download_file(url: str, download_path: str, overwrite: bool = False): """ Downloads file to disk Args: url (str) download_path (str) Returns: True if success else False """ if not overwrite and os.path.exists(download_path): print("{} already exists. Skipping download...".format(download_path)) return True try: print("Requesting {}".format(url)) res = requests.get(url) write_content(res.text, download_path) print("Write to {}".format(download_path)) return True except Exception as e: print(e) return False def write_content(content, output_path): """ Writes content to file Args: content (str) output_path (str): path to output file """ with open(output_path, "w", encoding="utf-8") as fout: fout.write(content) def timeit(f): @wraps(f) def wrapper(*args, **kw): start_time = time.time() result = f(*args, **kw) end_time = time.time() print("{} took {:.2f} seconds." 
def download_file(url: str, download_path: str, overwrite: bool = False):
    """Downloads a file to disk

    Args:
        url (str)
        download_path (str)
    Returns:
        True if success else False
    """
    if not overwrite and os.path.exists(download_path):
        print("{} already exists. Skipping download...".format(download_path))
        return True
    try:
        print("Requesting {}".format(url))
        # SEC's fair-access policy expects a descriptive User-Agent that
        # identifies the requester; replace with your own contact details
        headers = {"User-Agent": "Sample Research user@example.com"}
        res = requests.get(url, headers=headers)
        write_content(res.text, download_path)
        print("Write to {}".format(download_path))
        return True
    except Exception as e:
        print(e)
        return False


def write_content(content, output_path):
    """Writes content to file

    Args:
        content (str)
        output_path (str): path to output file
    """
    with open(output_path, "w", encoding="utf-8") as fout:
        fout.write(content)


def timeit(f):
    """Decorator that reports a function's wall-clock running time"""
    @wraps(f)
    def wrapper(*args, **kw):
        start_time = time.time()
        result = f(*args, **kw)
        end_time = time.time()
        print("{} took {:.2f} seconds.".format(f.__name__, end_time - start_time))
        return result
    return wrapper


@timeit
def download_indices(start_year: int, end_year: int, quarters: list,
                     index_dir: str, overwrite: bool):
    """Downloads EDGAR 10-K form indices

    Args:
        start_year (int): starting year
        end_year (int): ending year
        quarters (list): quarters to download for each year
        index_dir (str): directory to save index files
        overwrite (bool)
    """
    # Create output directory
    os.makedirs(index_dir, exist_ok=True)

    # Prepare arguments
    years = range(start_year, end_year + 1)
    urls = [FORM_INDEX_URL.format(year, qtr)
            for year, qtr in itertools.product(years, quarters)]
    download_paths = [os.path.join(index_dir, "year{}.qtr{}.idx".format(year, qtr))
                      for year, qtr in itertools.product(years, quarters)]

    # Download indices
    for url, download_path in zip(urls, download_paths):
        download_file(url, download_path, overwrite)


def parse_line_to_record(line, fields_begin):
    """Slices a fixed-width index line into fields at the given column offsets

    Example:
        10-K    1347 Capital Corp    1606163    2016-03-21    edgar/data/1606163/0001144204-16-089184.txt
    Returns:
        ["10-K", "1347 Capital Corp", "1606163", "2016-03-21",
         "edgar/data/1606163/0001144204-16-089184.txt"]
    """
    record = []
    fields_indices = fields_begin + [len(line)]
    for begin, end in zip(fields_indices[:-1], fields_indices[1:]):
        field = line[begin:end].rstrip()
        field = field.strip('\"')
        record.append(field)
    return record
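# A worked example for parse_line_to_record. The column offsets below are
# illustrative; real offsets vary per form.idx file, which is why
# combine_indices_to_csv derives them from each file's header line:
#
#   >>> line = ("10-K        1347 Capital Corp    1606163  2016-03-21  "
#               "edgar/data/1606163/0001144204-16-089184.txt")
#   >>> parse_line_to_record(line, [0, 12, 33, 42, 54])
#   ['10-K', '1347 Capital Corp', '1606163', '2016-03-21', 'edgar/data/1606163/0001144204-16-089184.txt']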
@timeit
def combine_indices_to_csv(index_dir):
    """Combines the index files in index_dir into a csv file for lookup

    Args:
        index_dir (str)
    """
    # Reads all rows into memory
    rows = []
    for index_path in sorted(glob(os.path.join(index_dir, "*.idx"))):
        with open(index_path, 'r') as fin:
            arrived = False
            fields_begin = None
            for line in fin.readlines():
                if line.startswith("Form Type"):
                    # Column offsets vary between files, so derive them from
                    # the header line of each index file
                    fields_begin = [line.find("Form Type"),
                                    line.find("Company Name"),
                                    line.find('CIK'),
                                    line.find('Date Filed'),
                                    line.find("File Name")]
                elif line.startswith("10-K "):
                    assert fields_begin is not None
                    arrived = True
                    row = parse_line_to_record(line, fields_begin)
                    filename = row[-1]
                    url = '/'.join([SEC_GOV_URL, filename])
                    row = row + [url]
                    rows.append(row)
                elif arrived:
                    # form.idx is sorted by form type, so the 10-K block is
                    # contiguous; stop once we have passed it
                    break

    # Write to output file
    csv_file = os.path.join(index_dir, "combined.csv")
    with open(csv_file, "w") as fout:
        writer = csv.writer(fout, delimiter=",",
                            quotechar='\"', quoting=csv.QUOTE_ALL)
        writer.writerow(INDEX_HEADERS)
        writer.writerows(rows)


@timeit
def download_forms(index_dir: str, form_dir: str,
                   overwrite: bool = False, debug: bool = False):
    """Reads the combined index and downloads forms

    Args:
        index_dir (str)
        form_dir (str)
    """
    # Create output directory
    os.makedirs(form_dir, exist_ok=True)

    # Prepare arguments
    combined_csv = os.path.join(index_dir, "combined.csv")
    print("Reading combined index file {}".format(combined_csv))
    urls = read_url_from_combined_csv(combined_csv)
    download_paths = []
    for url in urls:
        download_name = "_".join(url.split('/')[-2:])
        download_path = os.path.join(form_dir, download_name)
        download_paths.append(download_path)

    # Debug
    if debug:
        print("Debug: download only 10 forms")
        urls = urls[:10]
        download_paths = download_paths[:10]

    # Download forms
    nforms = len(download_paths)
    for idx, (url, download_path) in enumerate(zip(urls, download_paths), 1):
        print("Download form {}/{}".format(idx, nforms))
        download_file(url, download_path, overwrite)


def read_url_from_combined_csv(csv_path):
    """Reads urls from the combined csv file

    Args:
        csv_path (str): path to the combined index file
    Returns:
        urls: urls in the combined csv
    """
    urls = []
    with open(csv_path, 'r') as fin:
        reader = csv.reader(fin, delimiter=",",
                            quotechar='\"', quoting=csv.QUOTE_ALL)
        # Skip header
        next(reader)
        for row in reader:
            url = row[-1]
            urls.append(url)
    return urls


def parse_html_multiprocess(form_dir, parsed_form_dir, overwrite=False):
    """Parses html forms with multiple processes

    Args:
        form_dir (str)
        parsed_form_dir (str)
        overwrite (bool)
    """
    # Create output directory
    os.makedirs(parsed_form_dir, exist_ok=True)

    # Prepare arguments
    form_paths = sorted(glob(os.path.join(form_dir, "*.txt")))
    parsed_form_paths = []
    for form_path in form_paths:
        form_name = os.path.basename(form_path)
        parsed_form_path = os.path.join(parsed_form_dir, form_name)
        parsed_form_paths.append(parsed_form_path)

    # Multiprocess
    with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
        for form_path, parsed_form_path in zip(form_paths, parsed_form_paths):
            executor.submit(parse_html, form_path, parsed_form_path, overwrite)


def parse_html(input_file, output_file, overwrite=False):
    """Parses text from html with BeautifulSoup

    Args:
        input_file (str)
        output_file (str)
    """
    if not overwrite and os.path.exists(output_file):
        print("{} already exists. Skipping parse html...".format(output_file))
        return
    print("Parsing html {}".format(input_file))
    with open(input_file, 'r', encoding="utf-8") as fin:
        content = fin.read()

    # Parse html with BeautifulSoup
    soup = BeautifulSoup(content, "html.parser")
    text = soup.get_text("\n")
    write_content(text, output_file)

    # Log message
    print("Write to {}".format(output_file))


def normalize_text(text):
    """Normalizes text for MDA parsing"""
    text = unicodedata.normalize("NFKD", text)  # Normalize unicode
    text = '\n'.join(text.splitlines())  # Unify line breaks
    text = text.upper()  # Convert to upper case

    # Take care of breakline & whitespace combinations due to BeautifulSoup parsing
    text = re.sub(r'[ ]+\n', '\n', text)
    text = re.sub(r'\n[ ]+', '\n', text)
    text = re.sub(r'\n+', '\n', text)

    # To find the MDA section, reformat item headers
    text = text.replace('\n.\n', '.\n')  # Move period to the previous line
    text = text.replace('\nI\nTEM', '\nITEM')
    text = text.replace('\nITEM\n', '\nITEM ')
    text = text.replace(':\n', '.\n')

    # Math symbols for clearer looks
    text = text.replace('$\n', '$')
    text = text.replace('\n%', '%')

    # Reformat by additional breakline
    text = text.replace('\n', '\n\n')
    return text
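# A minimal illustration of normalize_text (toy input chosen for this sketch;
# real filings are much noisier):
#
#   >>> normalize_text("Part II\nItem 7.   \n   Management's Discussion and Analysis")
#   "PART II\n\nITEM 7.\n\nMANAGEMENT'S DISCUSSION AND ANALYSIS"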
def parse_mda_multiprocess(form_dir: str, mda_dir: str, overwrite: bool = False):
    """Parses the MDA section from forms with multiple processes

    Args:
        form_dir (str)
        mda_dir (str)
    """
    # Create output directory
    os.makedirs(mda_dir, exist_ok=True)

    # Prepare arguments
    form_paths = sorted(glob(os.path.join(form_dir, "*")))
    mda_paths = []
    for form_path in form_paths:
        form_name = os.path.basename(form_path)
        root, _ = os.path.splitext(form_name)
        mda_path = os.path.join(mda_dir, '{}.mda'.format(root))
        mda_paths.append(mda_path)

    # Multiprocess
    with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
        for form_path, mda_path in zip(form_paths, mda_paths):
            executor.submit(parse_mda, form_path, mda_path, overwrite)


def parse_mda(form_path, mda_path, overwrite=False):
    """Reads a form and parses its MDA section

    Args:
        form_path (str)
        mda_path (str)
    """
    if not overwrite and os.path.exists(mda_path):
        print("{} already exists. Skipping parse mda...".format(mda_path))
        return

    # Read
    print("Parse MDA {}".format(form_path))
    with open(form_path, "r", encoding="utf-8") as fin:
        text = fin.read()

    # Normalize text here
    text = normalize_text(text)

    # Parse MDA. If the first match is suspiciously short, it is probably the
    # table-of-contents entry; parse a second time starting after it
    mda, end = find_mda_from_text(text)
    if mda and len(mda.encode('utf-8')) < 1000:
        mda, _ = find_mda_from_text(text, start=end)

    if mda:
        print("Write MDA to {}".format(mda_path))
        write_content(mda, mda_path)
    else:
        print("Parse MDA failed {}".format(form_path))


def find_mda_from_text(text, start=0):
    """Finds the MDA section in normalized text

    Args:
        text (str)
        start (int): offset to start searching from
    Returns:
        (mda, end): the MDA section (empty string if not found) and the
        offset where it ends, relative to `start`
    """
    debug = False

    mda = ""
    end = 0

    # Define start & end signals for parsing
    item7_begins = [
        '\nITEM 7.', '\nITEM 7 –', '\nITEM 7:', '\nITEM 7 ', '\nITEM 7\n'
    ]
    item7_ends = ['\nITEM 7A']
    if start != 0:
        item7_ends.append('\nITEM 7')  # Case: ITEM 7A does not exist
    item8_begins = ['\nITEM 8']

    # Parsing code section
    text = text[start:]

    # Get begin
    for item7 in item7_begins:
        begin = text.find(item7)
        if debug:
            print(item7, begin)
        if begin != -1:
            break

    if begin != -1:  # Begin found
        for item7A in item7_ends:
            end = text.find(item7A, begin + 1)
            if debug:
                print(item7A, end)
            if end != -1:
                break

        if end == -1:  # ITEM 7A does not exist
            for item8 in item8_begins:
                end = text.find(item8, begin + 1)
                if debug:
                    print(item8, end)
                if end != -1:
                    break

        # Get MDA
        if end > begin:
            mda = text[begin:end].strip()
        else:
            end = 0

    return mda, end


if __name__ == "__main__":
    main()
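# A minimal illustration of find_mda_from_text on already-normalized text
# (toy input for this sketch):
#
#   >>> find_mda_from_text("\nITEM 7.\nMDA BODY\nITEM 7A\nMARKET RISK")
#   ('ITEM 7.\nMDA BODY', 17)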