import xlrd from xlrd.xldate import xldate_as_datetime from collections import defaultdict from pprint import pprint # noqa from datetime import datetime from normality import slugify from followthemoney import model from ftmstore.memorious import EntityEmitter from opensanctions.util import normalize_country URL = 'http://dfat.gov.au/international-relations/security/sanctions/Pages/sanctions.aspx' # noqa def clean_reference(ref): if isinstance(ref, (int, float)): return int(ref) number = ref while len(number): try: return int(number) except Exception: number = number[:-1] raise ValueError() def parse_reference(emitter, reference, rows): entity = emitter.make('LegalEntity') entity.make_id(reference) entity.add('sourceUrl', URL) sanction = emitter.make('Sanction') sanction.make_id(entity.id) sanction.add('authority', 'Australian Department of Foreign Affairs and Trade Consolidated Sanctions') # noqa sanction.add('entity', entity) for row in rows: if row.pop('type') == 'Individual': entity.schema = model.get('Person') name = row.pop('name_of_individual_or_entity', None) if row.pop('name_type') == 'aka': entity.add('alias', name) else: entity.add('name', name) entity.add('address', row.pop('address')) entity.add('notes', row.pop('additional_information')) sanction.add('program', row.pop('committees')) nationality = normalize_country(row.pop('citizenship')) entity.add('nationality', nationality, quiet=True) entity.add('birthDate', row.pop('date_of_birth'), quiet=True) entity.add('birthPlace', row.pop('place_of_birth'), quiet=True) entity.add('status', row.pop('listing_information'), quiet=True) control_date = int(row.pop('control_date')) base_date = datetime(1900, 1, 1).toordinal() dt = datetime.fromordinal(base_date + control_date - 2) sanction.add('modifiedAt', dt.date()) entity.add('modifiedAt', dt.date()) emitter.emit(entity) emitter.emit(sanction) def parse(context, data): emitter = EntityEmitter(context) references = defaultdict(list) with context.http.rehash(data) as res: xls = xlrd.open_workbook(res.file_path) ws = xls.sheet_by_index(0) headers = [slugify(h, sep='_') for h in ws.row_values(0)] for r in range(1, ws.nrows): row = ws.row(r) row = dict(zip(headers, row)) for header, cell in row.items(): if cell.ctype == 2: row[header] = str(int(cell.value)) elif cell.ctype == 3: date = xldate_as_datetime(cell.value, xls.datemode) row[header] = date.isoformat() elif cell.ctype == 0: row[header] = None row[header] = cell.value reference = clean_reference(row.get('reference')) references[reference].append(row) for ref, rows in references.items(): parse_reference(emitter, ref, rows) emitter.finalize()