# addressset.py -- btcrecover AddressSet library
# Copyright (C) 2017 Christopher Gurnee
#
# This file is part of btcrecover.
#
# btcrecover is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version
# 2 of the License, or (at your option) any later version.
#
# btcrecover is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see http://www.gnu.org/licenses/

# If you find this program helpful, please consider a small
# donation to the developer at the following Bitcoin address:
#
#           3Au8ZodNHPei7MQiSVAWb7NB2yqsb48GW4
#
#                      Thank You!

# (all optional futures for 2.7 except unicode_literals)
from __future__ import print_function, absolute_import, division

__version__ =  "0.1.3"

import struct, base64, io, mmap, ast, itertools, sys, gc, glob
from os import path


def bytes_to_int(bytes_rep):
    """convert a string of bytes (in big-endian order) to an integer

    :param bytes_rep: the raw bytes
    :type bytes_rep: str
    :return: the unsigned integer
    :rtype: int or long
    """
    bytes_len = len(bytes_rep)
    if bytes_len <= 4:
        return struct.unpack(">I", (4-bytes_len)*b"\0" + bytes_rep)[0]
    return long(base64.b16encode(bytes_rep), 16)
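
# A few worked examples of the conversion above (illustrative only):
#     bytes_to_int(b"\x00\x01")  == 1
#     bytes_to_int(b"\x01\x00")  == 256
#     bytes_to_int(b"\xff" * 5)  == 2**40 - 1   (more than 4 bytes, so the long() path is taken)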


class AddressSet(object):
    """
    A set-like collection optimized for testing membership of Bitcoin addresses
    from their raw hash160 format with support for serializing to/from files
    """
    VERSION    = 1
    MAGIC      = b"seedrecover address database\r\n"  # file magic
    HEADER_LEN = 65536
    assert HEADER_LEN % mmap.ALLOCATIONGRANULARITY == 0
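
    # A minimal usage sketch (illustrative only; hash160_bytes stands for any 20-byte
    # RIPEMD160(SHA256(pubkey)) digest, e.g. the payload of a P2PKH address):
    #
    #     addrs = AddressSet(1 << 16)
    #     addrs.add(hash160_bytes)
    #     assert hash160_bytes in addrs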

    def __init__(self, table_len, bytes_per_addr = 8, max_load = 0.75):
        """
        :param table_len: hash table size in count of addresses; must be a power of 2
        :type table_len: int
        :param bytes_per_addr: number of bytes of each address to store in the hash table
        :type bytes_per_addr: int
        :param max_load: max permissible load factor before an exception is raised
        :type max_load: float
        """
        if table_len < 1 or 1 << (table_len.bit_length()-1) != table_len:
            raise ValueError("table_len must be a positive power of 2")
        if not 1 <= bytes_per_addr <= 19:
            raise ValueError("bytes_per_addr must be between 1 and 19 inclusive")
        if not 0.0 < max_load < 1.0:
            raise ValueError("max_load must be between 0.0 and 1.0 exclusive")
        self._table_bytes    = table_len * bytes_per_addr         # len of hash table in bytes
        self._bytes_per_addr = bytes_per_addr                     # number of bytes per address to store
        self._null_addr      = b"\0" * bytes_per_addr             # all 0s is an empty hash table slot
        self._len            = 0                                  # count of addresses in the set
        self._max_len        = int(table_len * max_load)          # beyond this violates the load factor
        self._hash_bytes     = (table_len.bit_length() + 6) // 8  # number of bytes required for the mask
        self._hash_mask      = table_len - 1                      # mask used for the hash function
        self._data           = bytearray(self._table_bytes)       # the table itself
        self._dbfile         = None                               # file object, its .name is req'd for pickling
        self._mmap_access    = None                               # also required for pickling
        self.last_filenum    = None                               # will be serialized if set by the user
        if self._bytes_per_addr + self._hash_bytes > 20:
            raise ValueError("not enough bytes for both hashing and storage; "
                             "reduce either the bytes_per_addr or table_len")

    def __getstate__(self):
        # mmaps can't be pickled, so save only what's needed to recreate the object from scratch later
        if isinstance(self._data, mmap.mmap):
            return {"dbfilename": self._dbfile.name, "mmap_access": self._mmap_access}
        else:
            return self.__dict__

    def __setstate__(self, state):
        # If the object contained an mmap, recreate it from scratch
        if "dbfilename" in state:
            new = self.fromfile(open(state["dbfilename"], "r+b" if state["mmap_access"]==mmap.ACCESS_WRITE else "rb"),
                                mmap_access=state["mmap_access"], preload=False)
            self.__dict__ = new.__dict__.copy()
            new._dbfile = new._data = None  # ensure new's __del__() doesn't close() anything
        else:
            self.__dict__ = state

    def __len__(self):
        return self._len

    def __contains__(self, address):
        return self._find(address) is True

    def add(self, address):
        """Adds the address to the set

        :param address: the address in hash160 (length 20) format to add
        :type address: bytes or str
        """
        pos = self._find(address)
        if pos is not True:
            bytes_to_add = address[ -(self._bytes_per_addr+self._hash_bytes) : -self._hash_bytes]
            if bytes_to_add.endswith(self._null_addr):
                return  # ignore these invalid addresses
            if self._len >= self._max_len:
                raise ValueError("addition to AddressSet exceeds load factor")
            self._data[pos : pos+self._bytes_per_addr] = bytes_to_add
            self._len += 1

    # Hash table with open addressing and linear probing:
    # The hash function is simply some of the address's least significant bits (since
    # most addresses are random hashes, this plus linear probing should be sufficient).
    # To further save space, only the least significant _bytes_per_addr (typ. 8) bytes--
    # excluding those bytes already used for the "hash" above--are stored in the table.
    # This lets distinct addresses collide and produce false positives, but (with high
    # probability) such collisions occur only for invalid addresses (those without private keys).
    def _find(self, addr_to_find):
        pos = self._bytes_per_addr * (bytes_to_int(addr_to_find[ -self._hash_bytes :]) & self._hash_mask)
        while True:
            cur_addr = self._data[pos : pos+self._bytes_per_addr]
            if cur_addr == self._null_addr:
                return pos  # the position this element could be inserted at
            if len(addr_to_find) > self._bytes_per_addr:
                addr_to_find = addr_to_find[ -(self._bytes_per_addr+self._hash_bytes) : -self._hash_bytes]
            if cur_addr == addr_to_find:
                return True
            pos += self._bytes_per_addr  # linear probing
            if pos >= self._table_bytes:
                pos = 0
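
    # A concrete illustration of the scheme above, assuming the defaults used by
    # create_address_db() (table_len = 2**29, so _hash_bytes = 4 and _bytes_per_addr = 8):
    # for a 20-byte address h[0:20], the slot index is the low 29 bits of the last 4 bytes
    # h[16:20], and the 8 bytes h[8:16] are what actually get stored in that slot.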

    def __iter__(self):
        """Iterates over the set returning the bytes_per_addr stored for each address
        """
        pos = 0
        while pos < self._table_bytes:
            cur_addr = self._data[pos : pos+self._bytes_per_addr]
            if cur_addr != self._null_addr:
                yield cur_addr
            pos += self._bytes_per_addr

    def __reversed__(self):
        pos = self._table_bytes - self._bytes_per_addr
        while pos >= 0:
            cur_addr = self._data[pos : pos+self._bytes_per_addr]
            if cur_addr != self._null_addr:
                yield cur_addr
            pos -= self._bytes_per_addr

    @staticmethod
    def _remove_nonheader_attribs(attrs):
        del attrs["_data"], attrs["_dbfile"], attrs["_mmap_access"]

    def _header(self):
        # Construct a 64K header with the file magic, this object's attributes, plus the version
        header_dict = self.__dict__.copy()
        self._remove_nonheader_attribs(header_dict)
        header_dict["version"] = self.VERSION
        header = repr(header_dict) + b"\r\n"
        assert ast.literal_eval(header) == header_dict
        header = self.MAGIC + header
        header_len = len(header)
        assert header_len < self.HEADER_LEN
        return header + b"\0" * (self.HEADER_LEN - header_len)  # appends at least one nul

    def tofile(self, dbfile):
        """Save the address set to a file

        :param dbfile: an open file object where the set is saved (overwriting it)
        :type dbfile: io.FileIO or file
        """
        if dbfile.tell() % mmap.ALLOCATIONGRANULARITY != 0:
            print("AddressSet: warning: if header position in file isn't a multiple of {}, it probably can't be loaded with fromfile()"
                  .format(mmap.ALLOCATIONGRANULARITY), file=sys.stderr)
        if "b" not in dbfile.mode:
            raise ValueError("must open file in binary mode")
        # Windows Python 2 file objects can't handle writes >= 4GiB. Objects returned
        # by io.open() work around this issue, see https://bugs.python.org/issue9611
        if not isinstance(dbfile, io.BufferedIOBase) and self._table_bytes >= 1 << 32:
            raise ValueError("must open file with io.open if size >= 4GiB")
        dbfile.truncate(dbfile.tell() + self.HEADER_LEN + self._table_bytes)
        dbfile.write(self._header())
        dbfile.write(self._data)
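
    # Saving an AddressSet such as addrs above is then just (illustrative only; "addresses.db"
    # is a hypothetical filename, and io.open() is used for the >= 4GiB reason noted above):
    #
    #     with io.open("addresses.db", "wb") as f:
    #         addrs.tofile(f)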

    @classmethod
    def fromfile(cls, dbfile, mmap_access = mmap.ACCESS_READ, preload = True):
        """Load the address set from a file

        :param dbfile: an open file object from which the set is loaded;
                       it will be closed by AddressSet when no longer needed
        :type dbfile: io.FileIO or file
        :param mmap_access: mmap.ACCESS_READ, .ACCESS_WRITE, or .ACCESS_COPY
        :type mmap_access: int
        :param preload: True to preload the entire address set, False to load on demand
        :type preload: bool
        """
        if "b" not in dbfile.mode:
            raise ValueError("must open file in binary mode")
        header_pos = dbfile.tell()
        if header_pos % mmap.ALLOCATIONGRANULARITY != 0:
            raise ValueError("header position in file must be a multiple of {}".format(mmap.ALLOCATIONGRANULARITY))
        #
        # Read in the header safely (ast.literal_eval() is safe for untrusted data)
        header = dbfile.read(cls.HEADER_LEN)
        if not header.startswith(cls.MAGIC):
            raise ValueError("unrecognized file format (invalid magic)")
        magic_len  = len(cls.MAGIC)
        config_end = header.find(b"\0", magic_len, cls.HEADER_LEN)
        assert config_end > 0
        config = ast.literal_eval(header[magic_len:config_end])
        if config["version"] != cls.VERSION:
            raise ValueError("can't load address database version {} (only supports {})"
                             .format(config["version"], cls.VERSION))
        #
        # Create an AddressSet object and replace its attributes
        self = cls(1)  # (size is irrelevant since it's getting replaced)
        cls._remove_nonheader_attribs(self.__dict__)
        for attr in self.__dict__.keys():  # only load expected attributes from untrusted data
            self.__dict__[attr] = config[attr]
        self._mmap_access = mmap_access
        #
        # The hash table is memory-mapped directly from the file instead of being loaded
        self._data = mmap.mmap(dbfile.fileno(), self._table_bytes, access=mmap_access,
                                offset= header_pos + cls.HEADER_LEN)
        if mmap_access == mmap.ACCESS_WRITE:
            dbfile.seek(header_pos)  # prepare for writing an updated header in close()
        else:
            dbfile.close()
        self._dbfile = dbfile
        #
        # Most of the time it makes sense to load the file serially instead of letting
        # the OS load each page as it's touched in random order, especially with HDDs;
        # reading a byte from each page is sufficient (CPython doesn't optimize this away)
        if preload:
            for i in xrange(self._table_bytes // mmap.PAGESIZE):
                self._data[i * mmap.PAGESIZE]
        #
        return self
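
    # Loading for read-only lookups vs. updating in place (illustrative only; "addresses.db"
    # is a hypothetical filename; in write mode, close() below writes back the updated header):
    #
    #     addrs = AddressSet.fromfile(open("addresses.db", "rb"))      # read-only lookups
    #     addrs = AddressSet.fromfile(open("addresses.db", "r+b"),
    #                                 mmap_access=mmap.ACCESS_WRITE)   # updatable in place
    #     addrs.add(hash160_bytes)
    #     addrs.close()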

    def close(self, flush = True):
        if self._dbfile:                 # if present, self._data is an mmap
            if not self._dbfile.closed:  # if not closed, the mmap was opened in write/update mode
                self._dbfile.write(self._header())  # update the header
                self._dbfile.close()
                if flush:
                    self._data.flush()
            self._data.close()
            self._dbfile = None
        elif isinstance(self._data, bytearray) and self._data:
            self._data = bytearray()
        if flush:
            gc.collect()

    def __del__(self):
        if hasattr(self, "_dbfile"):
            self.close(flush=False)


# Decodes a Bitcoin-style variable-length integer (CompactSize) and
# returns a tuple containing its value and the incremented offset
def varint(data, offset):
    b = ord(data[offset])
    if b <= 252:
        return b, offset + 1
    if b == 253:
        return struct.unpack_from("<H", data, offset + 1)[0], offset + 3
    if b == 254:
        return struct.unpack_from("<I", data, offset + 1)[0], offset + 5
    if b == 255:
        return struct.unpack_from("<Q", data, offset + 1)[0], offset + 9
    assert False
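
# A few worked examples of the encoding handled above (illustrative only):
#     varint(b"\x6a", 0)                 == (106, 1)         # single byte for values 0-252
#     varint(b"\xfd\x34\x12", 0)         == (0x1234, 3)      # 0xfd prefix + 2-byte little-endian
#     varint(b"\xfe\x78\x56\x34\x12", 0) == (0x12345678, 5)  # 0xfe prefix + 4-byte little-endian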


def create_address_db(dbfilename, blockdir, update = False, progress_bar = True):
    """Creates an AddressSet database and saves it to a file

    :param dbfilename: the file name where the database is saved (overwriting it)
    :type dbfilename: str
    :param blockdir: the data directory where the Bitcoin block files reside
    :type blockdir: str
    :param update: if True, the existing database file is updated from new txs
    :type update: bool
    :param progress_bar: True to enable the progress bar
    :type progress_bar: bool
    """

    for filename in glob.iglob(path.join(blockdir, "blk*.dat")):
        if path.isfile(filename): break
    else:
        raise ValueError("no block files exist in blocks directory '{}'".format(blockdir))

    if update:
        print("Loading address database ...")
        address_set   = AddressSet.fromfile(open(dbfilename, "r+b"), mmap_access=mmap.ACCESS_WRITE)
        first_filenum = address_set.last_filenum
        print()
    else:
        first_filenum = 0

    filename = "blk{:05}.dat".format(first_filenum)
    if not path.isfile(path.join(blockdir, filename)):
        raise ValueError("first block file '{}' doesn't exist in blocks directory '{}'".format(filename, blockdir))

    if not update:
        # Open the file early to make sure we can, but don't overwrite it yet
        # (see AddressSet.tofile() for why io.open() instead of open() is used)
        try:
            dbfile = io.open(dbfilename, "r+b")
        except IOError:
            dbfile = io.open(dbfilename, "wb")
        # With the default bytes_per_addr and max_load, this allocates
        # about 4 GiB which is room for a little over 400 million addresses
        address_set = AddressSet(1 << 29)

    if progress_bar:
        try:
            import progressbar
        except ImportError:
            progress_bar = False

    if progress_bar:
        print("Parsing block files ...")
        for filenum in itertools.count(first_filenum):
            filename = path.join(blockdir, "blk{:05}.dat".format(filenum))
            if not path.isfile(filename):
                break
        progress_label = progressbar.FormatLabel(" {:11,} addrs. %(elapsed)s, ".format(len(address_set)))
        progress_bar = progressbar.ProgressBar(maxval=filenum-first_filenum, widgets=[
            progressbar.SimpleProgress(), " ",
            progressbar.Bar(left="[", fill="-", right="]"),
            progress_label,
            progressbar.ETA()
        ])
        progress_bar.start()
    else:
        print("Block file   Address count")
        print("------------ -------------")
        # e.g. blk00943.dat   255,212,706

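    # Each blk*.dat file is (as the parsing below assumes) a simple concatenation of records:
    # 4 magic bytes (\xf9\xbe\xb4\xd9 on mainnet), a 4-byte little-endian length, and then the
    # serialized block itself (80-byte header, varint tx count, then the txs); any unused
    # preallocated space at the end of a file is zero-filled.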
    for filenum in itertools.count(first_filenum):
        filename = path.join(blockdir, "blk{:05}.dat".format(filenum))
        if not path.isfile(filename):
            break
        address_set.last_filenum = filenum

        with open(filename, "rb") as blockfile:
            if not progress_bar:
                print(path.basename(filename), end=" ")

            header = blockfile.read(8)  # read the 4-byte magic plus the 4-byte length of the block which follows
            while len(header) == 8 and header[4:] != b"\0\0\0\0":
                assert header[:4] == b"\xf9\xbe\xb4\xd9"                        # magic

                block = blockfile.read(struct.unpack_from("<I", header, 4)[0])  # read in the rest of the block
                tx_count, offset = varint(block, 80)                            # skips 80 bytes of header
                for tx_num in xrange(tx_count):
                    offset += 4                                                 # skips 4-byte tx version
                    is_bip144 = block[offset] == b"\0"                          # bip-144 marker
                    if is_bip144:
                        offset += 2                                             # skips 1-byte marker & 1-byte flag
                    txin_count, offset = varint(block, offset)
                    for txin_num in xrange(txin_count):
                        sigscript_len, offset = varint(block, offset + 36)      # skips 32-byte tx id & 4-byte tx index
                        offset += sigscript_len + 4                             # skips sequence number & sigscript
                    txout_count, offset = varint(block, offset)
                    for txout_num in xrange(txout_count):
                        pkscript_len, offset = varint(block, offset + 8)        # skips 8-byte satoshi count

                        # If this is a P2PKH script (OP_DUP OP_HASH160 PUSH(20) <20 address bytes> OP_EQUALVERIFY OP_CHECKSIG)
                        if pkscript_len == 25 and block[offset:offset+3] == b"\x76\xa9\x14" and block[offset+23:offset+25] == b"\x88\xac":
                            # Add the discovered address to the address set
                            address_set.add(block[offset+3:offset+23])

                        offset += pkscript_len                                  # advances past the pubkey script
                    if is_bip144:
                        for txin_num in xrange(txin_count):
                            stackitem_count, offset = varint(block, offset)
                            for stackitem_num in xrange(stackitem_count):
                                stackitem_len, offset = varint(block, offset)
                                offset += stackitem_len                         # skips this stack item
                    offset += 4                                                 # skips the 4-byte locktime
                header = blockfile.read(8)  # read in the next magic and remaining block length

        if progress_bar:
            progress_label.format = " {:11,} addrs. %(elapsed)s, ".format(len(address_set))  # updates address count
            nextval = progress_bar.currval + 1
            if nextval > progress_bar.maxval:  # can happen if the bitcoin client is left running
                progress_bar.maxval = nextval
            progress_bar.update(nextval)
        else:
            print("{:13,}".format(len(address_set)))

    if progress_bar:
        progress_bar.widgets.pop()  # remove the ETA
        progress_bar.finish()
    if update:
        print("\nSaving changes to address database ...")
        address_set.close()
    else:
        print("\nSaving address database ...")
        dbfile.truncate(0)
        address_set.tofile(dbfile)
        dbfile.close()

    print("\nDone.")