python source code of sync

"""Hive sync manager."""

import logging
import glob
from time import perf_counter as perf
import os
import ujson as json

from funcy.seqs import drop
from toolz import partition_all

from hive.db.db_state import DbState

from hive.utils.timer import Timer
from hive.steem.block.stream import MicroForkException

from hive.indexer.blocks import Blocks
from hive.indexer.accounts import Accounts
from hive.indexer.cached_post import CachedPost
from hive.indexer.feed_cache import FeedCache
from hive.indexer.follow import Follow
from hive.indexer.community import Community
from hive.server.common.mutes import Mutes

#from hive.indexer.jobs import audit_cache_missing, audit_cache_deleted

log = logging.getLogger(__name__)

class Sync:
    """Manages the sync/index process.

    Responsible for initial sync, fast sync, and listen (block-follow).
    """

    def __init__(self, conf):
        self._conf = conf
        self._db = conf.db()
        self._steem = conf.steem()

    def run(self):
        """Initialize state; setup/recovery checks; sync and runloop."""

        # ensure db schema up to date, check app status
        DbState.initialize()

        # prefetch id->name and id->rank memory maps
        Accounts.load_ids()
        Accounts.fetch_ranks()

        # load irredeemables
        mutes = Mutes(self._conf.get('muted_accounts_url'))
        Mutes.set_shared_instance(mutes)

        # community stats
        Community.recalc_pending_payouts()

        if DbState.is_initial_sync():
            # resume initial sync
            self.initial()
            DbState.finish_initial_sync()

        else:
            # recover from fork
            Blocks.verify_head(self._steem)

            # perform cleanup if process did not exit cleanly
            CachedPost.recover_missing_posts(self._steem)

        #audit_cache_missing(self._db, self._steem)
        #audit_cache_deleted(self._db)

        self._update_chain_state()

        if self._conf.get('test_max_block'):
            # debug mode: partial sync
            return self.from_steemd()
        if self._conf.get('test_disable_sync'):
            # debug mode: no sync, just stream
            return self.listen()

        while True:
            # sync up to irreversible block
            self.from_steemd()

            # take care of payout backlog
            CachedPost.dirty_paidouts(Blocks.head_date())
            CachedPost.flush(self._steem, trx=True)

            try:
                # listen for new blocks
                self.listen()
            except MicroForkException as e:
                # attempt to recover by restarting stream
                log.error("microfork: %s", repr(e))

    def initial(self):
        """Initial sync routine."""
        assert DbState.is_initial_sync(), "already synced"

        log.info("[INIT] *** Initial fast sync ***")
        self.from_checkpoints()
        self.from_steemd(is_initial_sync=True)

        log.info("[INIT] *** Initial cache build ***")
        CachedPost.recover_missing_posts(self._steem)
        FeedCache.rebuild()
        Follow.force_recount()

    def from_checkpoints(self, chunk_size=1000):
        """Initial sync strategy: read from blocks on disk.

        This methods scans for files matching ./checkpoints/*.json.lst
        and uses them for hive's initial sync. Each line must contain
        exactly one block in JSON format.
        """
        # pylint: disable=no-self-use
        last_block = Blocks.head_num()

        tuplize = lambda path: [int(path.split('/')[-1].split('.')[0]), path]
        basedir = os.path.dirname(os.path.realpath(__file__ + "/../.."))
        files = glob.glob(basedir + "/checkpoints/*.json.lst")
        tuples = sorted(map(tuplize, files), key=lambda f: f[0])

        last_read = 0
        for (num, path) in tuples:
            if last_block < num:
                log.info("[SYNC] Load %s. Last block: %d", path, last_block)
                with open(path) as f:
                    # each line in file represents one block
                    # we can skip the blocks we already have
                    skip_lines = last_block - last_read
                    remaining = drop(skip_lines, f)
                    for lines in partition_all(chunk_size, remaining):
                        Blocks.process_multi(map(json.loads, lines), True)
                last_block = num
            last_read = num

    def from_steemd(self, is_initial_sync=False, chunk_size=1000):
        """Fast sync strategy: read/process blocks in batches."""
        steemd = self._steem
        lbound = Blocks.head_num() + 1
        ubound = self._conf.get('test_max_block') or steemd.last_irreversible()

        count = ubound - lbound
        if count < 1:
            return

        log.info("[SYNC] start block %d, +%d to sync", lbound, count)
        timer = Timer(count, entity='block', laps=['rps', 'wps'])
        while lbound < ubound:
            timer.batch_start()

            # fetch blocks
            to = min(lbound + chunk_size, ubound)
            blocks = steemd.get_blocks_range(lbound, to)
            lbound = to
            timer.batch_lap()

            # process blocks
            Blocks.process_multi(blocks, is_initial_sync)
            timer.batch_finish(len(blocks))

            _prefix = ("[SYNC] Got block %d @ %s" % (
                to - 1, blocks[-1]['timestamp']))
            log.info(timer.batch_status(_prefix))

        if not is_initial_sync:
            # This flush is low importance; accounts are swept regularly.
            Accounts.flush(steemd, trx=True)

            # If this flush fails, all that could potentially be lost here is
            # edits and pre-payout votes. If the post has not been paid out yet,
            # then the worst case is it will be synced upon payout. If the post
            # is already paid out, worst case is to lose an edit.
            CachedPost.flush(steemd, trx=True)

    def listen(self):
        """Live (block following) mode."""
        trail_blocks = self._conf.get('trail_blocks')
        assert trail_blocks >= 0
        assert trail_blocks <= 100

        # debug: no max gap if disable_sync in effect
        max_gap = None if self._conf.get('test_disable_sync') else 100

        steemd = self._steem
        hive_head = Blocks.head_num()

        for block in steemd.stream_blocks(hive_head + 1, trail_blocks, max_gap):
            start_time = perf()

            self._db.query("START TRANSACTION")
            num = Blocks.process(block)
            follows = Follow.flush(trx=False)
            accts = Accounts.flush(steemd, trx=False, spread=8)
            CachedPost.dirty_paidouts(block['timestamp'])
            cnt = CachedPost.flush(steemd, trx=False)
            self._db.query("COMMIT")

            ms = (perf() - start_time) * 1000
            log.info("[LIVE] Got block %d at %s --% 4d txs,% 3d posts,% 3d edits,"
                     "% 3d payouts,% 3d votes,% 3d counts,% 3d accts,% 3d follows"
                     " --% 5dms%s", num, block['timestamp'], len(block['transactions']),
                     cnt['insert'], cnt['update'], cnt['payout'], cnt['upvote'],
                     cnt['recount'], accts, follows, ms, ' SLOW' if ms > 1000 else '')

            if num % 1200 == 0: #1hr
                log.warning("head block %d @ %s", num, block['timestamp'])
                log.info("[LIVE] hourly stats")
                Accounts.fetch_ranks()
                #Community.recalc_pending_payouts()
            if num % 200 == 0: #10min
                Community.recalc_pending_payouts()
            if num % 100 == 0: #5min
                log.info("[LIVE] 5-min stats")
                Accounts.dirty_oldest(500)
            if num % 20 == 0: #1min
                self._update_chain_state()

    # refetch dynamic_global_properties, feed price, etc
    def _update_chain_state(self):
        """Update basic state props (head block, feed price) in db."""
        state = self._steem.gdgp_extended()
        self._db.query("""UPDATE hive_state SET block_num = :block_num,
                       steem_per_mvest = :spm, usd_per_steem = :ups,
                       sbd_per_steem = :sps, dgpo = :dgpo""",
                       block_num=state['dgpo']['head_block_number'],
                       spm=state['steem_per_mvest'],
                       ups=state['usd_per_steem'],
                       sps=state['sbd_per_steem'],
                       dgpo=json.dumps(state['dgpo']))
        return state['dgpo']['head_block_number']