# Python source code of the parser.

# coding: utf-8
import os
import six
import struct
import csv
import unicodedata
from itertools import count
from normality import normalize

from cronos.constants import KOD, PK_SENTINEL, RECORD_SEP, ENC, NULL


class CronosException(Exception):
    """Raised when a Cronos database file cannot be parsed."""


def vword(data):
    """Split the 32-bit little-endian word at the start of ``data``.

    Returns a ``(num, flags)`` tuple: ``num`` is the low 24 bits of the
    word and ``flags`` is its highest byte.
    """
    (raw,) = struct.unpack_from('<I', data)
    # The word is an unsigned 32-bit value, so shifting right by 24 leaves
    # exactly the top byte.
    return raw & 0xffffff, raw >> 24


def decode_text(text):
    """Decode a raw byte string using the ``ENC`` codec.

    Every character whose Unicode general category starts with ``C``
    (control, format, unassigned, ...) is replaced by a single space.

    Returns the cleaned text, or ``None`` when ``text`` cannot be decoded
    (invalid byte sequence, or a value that is not a decodable string).
    """
    # Only the decode step can fail; keep the ``try`` body minimal and catch
    # just the decode-related errors so that KeyboardInterrupt, SystemExit
    # and genuine programming errors are no longer silently swallowed.
    try:
        decoded = text.decode(ENC)
    except (UnicodeError, AttributeError, TypeError):
        return None
    return u''.join(
        u' ' if unicodedata.category(character)[0] == 'C' else character
        for character in decoded
    )


def encode_cell(value):
    """Encode a cell value as UTF-8 bytes for CSV output.

    ``None`` is passed through unchanged. Values that are not strings
    (e.g. integers) are converted to text before being encoded.
    """
    if value is None:
        return None
    if not isinstance(value, six.string_types):
        # ``six.text_type`` is ``unicode`` on Python 2 and ``str`` on
        # Python 3, so this no longer depends on the Python 2-only builtin.
        value = six.text_type(value)
    return value.encode('utf-8')


def align_sections(data):
    """Guess the decodable sections of a CroStru file.

    Not all of the file format is understood, so every possible offset
    (0-255) is tried: the whole input is decoded with that offset and the
    result is kept if it contains ``PK_SENTINEL`` (the key phrase used to
    recognise a successfully decoded section).

    Returns a list of dicts with the keys ``text`` (decoded string),
    ``buf`` (decoded byte values), ``offset`` (the offset that was used)
    and ``index`` (position of the sentinel in ``text``), sorted by
    ``index``.
    """
    raw = [ord(ch) for ch in data]
    found = []
    for offset in range(256):
        # Decoding rule taken from the web (CRO.H):
        # buf[i] = kod[buf[i]] - (unsigned char) i - (unsigned char) offset
        decoded = [
            (KOD[value] - position - offset) % 256
            for position, value in enumerate(raw)
        ]
        text = ''.join(map(chr, decoded))
        index = text.find(PK_SENTINEL)
        if index == -1:
            continue
        found.append({
            'text': text,
            'buf': decoded,
            'offset': offset,
            'index': index
        })
    found.sort(key=lambda section: section['index'])
    return found


def parse_columns(text, base, count):
    """Parse up to ``count`` column definitions starting at ``base``.

    Each column entry begins with a short length prefix, followed by the
    type and column id (each a short) and the length-prefixed column name.

    Returns a list of dicts with the keys ``id``, ``name`` and ``type``.
    Parsing stops early when the remaining data is too short for another
    entry.
    """
    columns = []
    cursor = base
    for _ in range(count):
        if len(text[cursor:]) < 8:
            break
        (record_len,) = struct.unpack_from('H', text, cursor)
        cursor += 2
        if len(text[cursor:]) < record_len:
            break
        # NOTE(review): the record slice starts one byte *before* the
        # cursor, so the big-endian fields below overlap the last byte of
        # the length prefix - presumably deliberate; confirm against the
        # file format.
        start = cursor - 1
        record = text[start:start + record_len]
        type_, col_id = struct.unpack_from('>HH', record, 0)
        (name_len,) = struct.unpack_from('>I', record, 4)
        name = decode_text(record[8:8 + name_len])
        if name is not None:
            columns.append({
                'id': col_id,
                'name': name,
                'type': type_
            })
            cursor += record_len
        # When the name cannot be decoded the entry is skipped and the
        # cursor has only moved past the two-byte length prefix.
    return columns


def parse_table(text, next_byte):
    """Try to parse a table definition whose name length is at ``next_byte``.

    Expected layout, starting at ``next_byte``::

        NAME_LEN (1) + NAME + 0x02 + ABBR (2) + 0x01 + 3 bytes + COL_COUNT (4)

    followed by the column definitions.

    Returns a dict with the keys ``name``, ``abbr``, ``columns`` and
    ``column_count``, or ``None`` when the data at this position does not
    look like a table definition.
    """
    # The signature found by the caller may sit at the very end of the
    # section, in which case there is no length byte left to read.
    if next_byte >= len(text):
        return None
    name_len = ord(text[next_byte])
    name_start = next_byte + 1
    name_end = name_start + name_len
    # After the name we read: marker (1) + abbreviation (2) + the 0x01
    # marker and three skipped bytes (4) + column count (4) = 11 bytes.
    # Requiring fewer would let struct.unpack_from run past the buffer.
    if len(text) < name_end + 11:
        return None
    if ord(text[name_end]) != 2:
        return None
    # Get the table name.
    table_name = decode_text(text[name_start:name_end])
    if table_name is None:
        return None
    # Get the two-letter table abbreviation.
    abbr_start = name_end + 1
    table_abbr = decode_text(text[abbr_start:abbr_start + 2])
    if table_abbr is None:
        return None
    marker_pos = abbr_start + 2
    if ord(text[marker_pos]) != 1:
        # Table ID not ended by 0x01, so this is not a table definition.
        return None
    # Get the number of columns for the table, stored four bytes after the
    # 0x01 marker.
    count_pos = marker_pos + 4
    col_count, = struct.unpack_from('I', text, count_pos)
    return {
        'name': table_name,
        'abbr': table_abbr,
        'columns': parse_columns(text, count_pos + 4, col_count),
        'column_count': col_count
    }


def parse_table_section(section, table_id):
    """Yield every table definition for ``table_id`` found in ``section``.

    Table definitions are located heuristically by scanning the section
    text for the pattern that introduces them:

        TABLE_ID + NULL + NULL + NULL + NAME_LEN + NAME
        + 0x02 + ABBR1 + ABBR2 + 0x01 + NUM_TABLES

    Each candidate position is handed to ``parse_table``; successfully
    parsed tables get their ``id`` set to ``table_id`` and are yielded.
    """
    text = section['text']
    sig = chr(table_id) + NULL + NULL + NULL
    sig_len = len(sig)
    index = text.find(sig)
    while index != -1:
        table = parse_table(text, index + sig_len)
        if table is not None:
            table['id'] = table_id
            yield table
        # Resume one character further on so overlapping matches are seen.
        index = text.find(sig, index + 1)


def parse_metadata(section):
    """Extract the ``BankId`` and ``BankName`` values from a section.

    Each field is located by searching for its length-prefixed name; the
    value follows as a ``vword`` length and the encoded text.

    Returns a dict mapping the two field names to their decoded values.
    Raises ``CronosException`` when a field name is not present.
    """
    text = section['text']
    out = {}
    for field in ('BankId', 'BankName'):
        encoded_name = field.encode(ENC)
        sentinel = chr(len(encoded_name)) + encoded_name
        position = text.find(sentinel)
        if position == -1:
            raise CronosException('Missing %s in structure!' % field)
        length_at = position + len(sentinel)
        length, _ = vword(text[length_at:])
        # The value starts right after the four-byte length word.
        value_at = length_at + 4
        out[field] = decode_text(text[value_at:value_at + length])
    return out


def parse_structure(file_name):
    """Read the structure file and return ``(meta, tables)``.

    The structure file holds metadata such as table and column
    definitions. ``meta`` comes from the first recovered section;
    ``tables`` collects every table definition found in any section for
    any table id from 0 to 255.

    Raises ``CronosException`` when the file lacks the ``CroFile``
    signature or when no section can be recovered.
    """
    with open(file_name, 'rb') as fh:
        data = fh.read()
    if not data.startswith('CroFile'):
        raise CronosException('Not a CroStru.dat file.')

    sections = align_sections(data)
    if not sections:
        raise CronosException('Could not recover CroStru.dat sections.')

    meta = parse_metadata(sections[0])
    tables = [
        table
        for table_section in sections
        for table_id in range(256)
        for table in parse_table_section(table_section, table_id)
    ]
    return meta, tables


def parse_record(meta, dat_fh):
    """Read one record from ``dat_fh`` as described by ``meta``.

    ``meta`` is a 12-byte index entry holding the offset and length of the
    first fragment plus the offset and total length of the continuation
    chain. Each continuation chunk is at most 252 bytes and starts with a
    four-byte offset pointing to the next chunk.

    Returns the concatenated record data, or ``None`` for an entry with
    neither a first fragment nor a continuation.
    """
    offset, length, next_offset, remaining = struct.unpack('<IHIH', meta)
    dat_fh.seek(offset)
    # A remaining length of 0 or 0xffff means there is no continuation.
    if length == 0 and remaining in (0, 0xffff):
        return None
    data = dat_fh.read(length)
    while remaining not in (0, 0xffff):
        dat_fh.seek(next_offset)
        chunk = dat_fh.read(min(252, remaining))
        if len(chunk) < 4:
            # Truncated chunk: not even the next-offset header is present.
            break
        (next_offset,) = struct.unpack_from('<I', chunk)
        data += chunk[4:]
        remaining = max(remaining - 252, 0)
    return data


def parse_data(data_tad, data_dat, table_id, columns):
    """Yield all records of ``table_id`` stored in the DAT file.

    The TAD file is read as a sequence of 12-byte index entries (after an
    8-byte header); each entry is resolved against the DAT file with
    ``parse_record``. Records whose first byte does not equal ``table_id``
    are skipped. Every yielded record is a list of decoded cell values;
    when its length differs from ``len(columns)`` the 1-based index entry
    number is prepended.

    Raises ``CronosException`` when the DAT file lacks the ``CroFile``
    signature.
    """
    # ``with`` guarantees both handles are closed even when the signature
    # check raises or the consumer abandons the generator early.
    with open(data_tad, 'rb') as tad_fh, open(data_dat, 'rb') as dat_fh:
        # Check the file signature.
        sig = dat_fh.read(7)
        if sig != 'CroFile':
            raise CronosException('Not a CroBank.dat file.')

        # Skip the TAD header; its meaning is not yet known.
        tad_fh.seek(8)
        for i in count(1):
            meta = tad_fh.read(12)
            if len(meta) != 12:
                break
            record = parse_record(meta, dat_fh)
            if record is None or len(record) < 2:
                continue
            # First byte is the table ID.
            if table_id != ord(record[0]):
                continue
            record = record[1:]
            record = record.split(RECORD_SEP)
            # TODO: figure out how to detect password-encrypted columns.
            record = [decode_text(c) for c in record]
            if len(record) != len(columns):
                record = [i] + record
            yield record


def make_csv_file_name(meta, table, out_folder):
    """Build the CSV output path for ``table`` inside ``out_folder``.

    The file name has the form ``<bank name> - <abbr> - <table name>.csv``
    with each part passed through ``normalize`` (case preserved). A bank
    name that normalizes to ``None`` becomes ``Untitled Database``.
    """
    bank_name = normalize(meta['BankName'], lowercase=False)
    if bank_name is None:
        bank_name = 'Untitled Database'
    parts = (
        bank_name,
        normalize(table['abbr'], lowercase=False),
        normalize(table['name'], lowercase=False),
    )
    return os.path.join(out_folder, '%s - %s - %s.csv' % parts)


def get_file(db_folder, file_name):
    """Find ``file_name`` in ``db_folder``, ignoring case and outer spaces.

    Returns the path of the first matching directory entry, or ``None``
    when ``db_folder`` is not a directory or nothing matches.
    """
    if not os.path.isdir(db_folder):
        return None
    wanted = file_name.lower().strip()
    matches = (
        entry for entry in os.listdir(db_folder)
        if entry.lower().strip() == wanted
    )
    found = next(matches, None)
    if found is None:
        return None
    return os.path.join(db_folder, found)


def parse(db_folder, out_folder):
    """
    Parse a cronos database.

    Convert the database located in ``db_folder`` into CSV files in the
    directory ``out_folder``. One CSV file is written per table; the first
    row holds the column names. The "FL"/"Files" table is not exported.

    Raises ``CronosException`` when any of the three database files
    (CroStru.dat, CroBank.tad, CroBank.dat) cannot be found.
    """
    # The database structure, containing table and column definitions as
    # well as other data.
    stru_dat = get_file(db_folder, 'CroStru.dat')
    # Index file for the database, which contains offsets for each record.
    data_tad = get_file(db_folder, 'CroBank.tad')
    # Actual data records, can only be decoded using CroBank.tad.
    data_dat = get_file(db_folder, 'CroBank.dat')
    if None in [stru_dat, data_tad, data_dat]:
        raise CronosException("Not all database files are present.")

    meta, tables = parse_structure(stru_dat)

    for table in tables:
        # TODO: do we want to export the "FL" table?
        if table['abbr'] == 'FL' and table['name'] == 'Files':
            continue
        columns = table.get('columns')
        csv_path = make_csv_file_name(meta, table, out_folder)
        # ``with`` closes the output file even if parsing or writing a row
        # raises part-way through the table.
        # NOTE(review): under Python 2 on Windows the csv module expects
        # files opened in binary mode - confirm the target platforms.
        with open(csv_path, 'w') as fh:
            writer = csv.writer(fh)
            writer.writerow([encode_cell(c['name']) for c in columns])
            rows = parse_data(data_tad, data_dat, table.get('id'), columns)
            for row in rows:
                writer.writerow([encode_cell(c) for c in row])