python source code of tasks

#  Polkascan PRE Harvester
#
#  Copyright 2018-2020 openAware BV (NL).
#  This file is part of Polkascan.
#
#  Polkascan is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  Polkascan is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with Polkascan. If not, see <http://www.gnu.org/licenses/>.
#
#  tasks.py

import os
from time import sleep

import celery
from celery.result import AsyncResult

from app import settings
from scalecodec.base import ScaleDecoder, ScaleBytes

from sqlalchemy import create_engine, text, distinct
from sqlalchemy.exc import SQLAlchemyError, IntegrityError
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.sql import func

from app.models.data import Extrinsic, Block, BlockTotal, Account, AccountInfoSnapshot, SearchIndex
from app.models.harvester import Status
from app.processors.converters import PolkascanHarvesterService, HarvesterCouldNotAddBlock, BlockAlreadyAdded, \
    BlockIntegrityError
from scalecodec.exceptions import RemainingScaleBytesNotEmptyException
from substrateinterface import SubstrateInterface, xxh128

from app.settings import DB_CONNECTION, DEBUG, SUBSTRATE_RPC_URL, TYPE_REGISTRY, FINALIZATION_ONLY, TYPE_REGISTRY_FILE

CELERY_BROKER = os.environ.get('CELERY_BROKER')
CELERY_BACKEND = os.environ.get('CELERY_BACKEND')

app = celery.Celery('tasks', broker=CELERY_BROKER, backend=CELERY_BACKEND)

app.conf.beat_schedule = {
    'check-head-10-seconds': {
        'task': 'app.tasks.start_harvester',
        'schedule': 10.0,
        'args': ()
    },
}

app.conf.timezone = 'UTC'


class BaseTask(celery.Task):

    def __init__(self):
        self.metadata_store = {}

    def __call__(self, *args, **kwargs):
        self.engine = create_engine(DB_CONNECTION, echo=DEBUG, isolation_level="READ_UNCOMMITTED")
        session_factory = sessionmaker(bind=self.engine, autoflush=False, autocommit=False)
        self.session = scoped_session(session_factory)

        return super().__call__(*args, **kwargs)

    def after_return(self, status, retval, task_id, args, kwargs, einfo):
        if hasattr(self, 'session'):
            self.session.remove()
        if hasattr(self, 'engine'):
            self.engine.engine.dispose()


@app.task(base=BaseTask, bind=True)
def accumulate_block_recursive(self, block_hash, end_block_hash=None):

    harvester = PolkascanHarvesterService(
        db_session=self.session,
        type_registry=TYPE_REGISTRY,
        type_registry_file=TYPE_REGISTRY_FILE
    )

    harvester.metadata_store = self.metadata_store
    harvester.substrate.metadata_cache = self.metadata_store

    # If metadata store isn't initialized yet, perform some tests
    if not harvester.metadata_store:
        print('Init: create entrypoints')
        # Check if blocks exists
        max_block_id = self.session.query(func.max(Block.id)).one()[0]

        if not max_block_id:
            # Speed up accumulating by creating several entry points
            substrate = SubstrateInterface(SUBSTRATE_RPC_URL)
            block_nr = substrate.get_block_number(block_hash)
            if block_nr > 100:
                for entry_point in range(0, block_nr, block_nr // 4)[1:-1]:
                    entry_point_hash = substrate.get_block_hash(entry_point)
                    accumulate_block_recursive.delay(entry_point_hash)

    block = None
    max_sequenced_block_id = False

    add_count = 0

    try:

        for nr in range(0, 10):
            if not block or block.id > 0:
                # Process block
                block = harvester.add_block(block_hash)

                print('+ Added {} '.format(block_hash))

                add_count += 1

                self.session.commit()

                # Break loop if targeted end block hash is reached
                if block_hash == end_block_hash or block.id == 0:
                    break

                # Continue with parent block hash
                block_hash = block.parent_hash

        # Update persistent metadata store in Celery task
        self.metadata_store = harvester.metadata_store

        if block_hash != end_block_hash and block and block.id > 0:
            accumulate_block_recursive.delay(block.parent_hash, end_block_hash)

    except BlockAlreadyAdded as e:
        print('. Skipped {} '.format(block_hash))
    except IntegrityError as e:
        print('. Skipped duplicate {} '.format(block_hash))
    except Exception as exc:
        print('! ERROR adding {}'.format(block_hash))
        raise HarvesterCouldNotAddBlock(block_hash) from exc

    return {
        'result': '{} blocks added'.format(add_count),
        'lastAddedBlockHash': block_hash,
        'sequencerStartedFrom': max_sequenced_block_id
    }


@app.task(base=BaseTask, bind=True)
def start_sequencer(self):
    sequencer_task = Status.get_status(self.session, 'SEQUENCER_TASK_ID')

    if sequencer_task.value:
        task_result = AsyncResult(sequencer_task.value)
        if not task_result or task_result.ready():
            sequencer_task.value = None
            sequencer_task.save(self.session)

    if sequencer_task.value is None:
        sequencer_task.value = self.request.id
        sequencer_task.save(self.session)

        harvester = PolkascanHarvesterService(
            db_session=self.session,
            type_registry=TYPE_REGISTRY,
            type_registry_file=TYPE_REGISTRY_FILE
        )
        try:
            result = harvester.start_sequencer()
        except BlockIntegrityError as e:
            result = {'result': str(e)}

        sequencer_task.value = None
        sequencer_task.save(self.session)

        self.session.commit()

        # Check if analytics data need to be generated
        #start_generate_analytics.delay()

        return result
    else:
        return {'result': 'Sequencer already running'}


@app.task(base=BaseTask, bind=True)
def rebuilding_search_index(self, search_index_id=None, truncate=False):
    if truncate:
        # Clear search index table
        self.session.execute('delete from analytics_search_index where index_type_id={}'.format(search_index_id))
        self.session.commit()

    harvester = PolkascanHarvesterService(
        db_session=self.session,
        type_registry=TYPE_REGISTRY,
        type_registry_file=TYPE_REGISTRY_FILE
    )
    harvester.rebuild_search_index()

    return {'result': 'index rebuilt'}


@app.task(base=BaseTask, bind=True)
def start_harvester(self, check_gaps=False):

    substrate = SubstrateInterface(SUBSTRATE_RPC_URL)

    block_sets = []

    if check_gaps:
        # Check for gaps between already harvested blocks and try to fill them first
        remaining_sets_result = Block.get_missing_block_ids(self.session)

        for block_set in remaining_sets_result:

            # Get start and end block hash
            end_block_hash = substrate.get_block_hash(int(block_set['block_from']))
            start_block_hash = substrate.get_block_hash(int(block_set['block_to']))

            # Start processing task
            accumulate_block_recursive.delay(start_block_hash, end_block_hash)

            block_sets.append({
                'start_block_hash': start_block_hash,
                'end_block_hash': end_block_hash
            })

    # Start sequencer
    sequencer_task = start_sequencer.delay()

    # Continue from current (finalised) head
    if FINALIZATION_ONLY == 1:
        start_block_hash = substrate.get_chain_finalised_head()
    else:
        start_block_hash = substrate.get_chain_head()

    end_block_hash = None

    accumulate_block_recursive.delay(start_block_hash, end_block_hash)

    block_sets.append({
        'start_block_hash': start_block_hash,
        'end_block_hash': end_block_hash
    })

    return {
        'result': 'Harvester job started',
        'block_sets': block_sets,
        'sequencer_task_id': sequencer_task.task_id
    }


@app.task(base=BaseTask, bind=True)
def start_generate_analytics(self):
    self.session.execute('CALL generate_analytics_data()')
    self.session.commit()
    return {'status': 'OK'}


@app.task(base=BaseTask, bind=True)
def rebuild_search_index(self):
    harvester = PolkascanHarvesterService(
        db_session=self.session,
        type_registry=TYPE_REGISTRY,
        type_registry_file=TYPE_REGISTRY_FILE
    )
    harvester.rebuild_search_index()

    return {'result': 'search index rebuilt'}


@app.task(base=BaseTask, bind=True)
def rebuild_account_info_snapshot(self):
    harvester = PolkascanHarvesterService(
        db_session=self.session,
        type_registry=TYPE_REGISTRY,
        type_registry_file=TYPE_REGISTRY_FILE
    )

    last_full_snapshot_block_nr = 0

    self.session.execute('truncate table {}'.format(AccountInfoSnapshot.__tablename__))

    for account_id, block_id in self.session.query(SearchIndex.account_id, SearchIndex.block_id).filter(
            SearchIndex.block_id >= settings.BALANCE_SYSTEM_ACCOUNT_MIN_BLOCK
    ).order_by('block_id').group_by(SearchIndex.account_id, SearchIndex.block_id).yield_per(1000):

        if block_id > last_full_snapshot_block_nr + settings.BALANCE_FULL_SNAPSHOT_INTERVAL:

            last_full_snapshot_block_nr = block_id - block_id % settings.BALANCE_FULL_SNAPSHOT_INTERVAL
            harvester.create_full_balance_snaphot(last_full_snapshot_block_nr)
            self.session.commit()
        else:
            harvester.create_balance_snapshot(block_id, account_id)
            self.session.commit()

    # set balances according to most recent snapshot
    account_info = self.session.execute("""
            select
               a.account_id, 
               a.balance_total,
               a.balance_free,
               a.balance_reserved,
               a.nonce
        from
             data_account_info_snapshot as a
        inner join (
            select 
                account_id, max(block_id) as max_block_id 
            from data_account_info_snapshot 
            group by account_id
        ) as b
        on a.account_id = b.account_id and a.block_id = b.max_block_id
        """)

    for account_id, balance_total, balance_free, balance_reserved, nonce in account_info:
        Account.query(self.session).filter_by(id=account_id).update(
            {
                Account.balance_total: balance_total,
                Account.balance_free: balance_free,
                Account.balance_reserved: balance_reserved,
                Account.nonce: nonce,
            }, synchronize_session='fetch'
        )
    self.session.commit()


    return {'result': 'account info snapshots rebuilt'}


@app.task(base=BaseTask, bind=True)
def balance_snapshot(self, account_id=None, block_start=1, block_end=None, block_ids=None):
    if account_id:
        accounts = [account_id]
    else:
        accounts = [account.id for account in Account.query(self.session)]

    harvester = PolkascanHarvesterService(
        db_session=self.session,
        type_registry=TYPE_REGISTRY,
        type_registry_file=TYPE_REGISTRY_FILE
    )

    if block_ids:
        block_range = block_ids
    else:

        if block_end is None:
            # Set block end to chaintip
            substrate = SubstrateInterface(SUBSTRATE_RPC_URL)
            block_end = substrate.get_block_number(substrate.get_chain_finalised_head())

        block_range = range(block_start, block_end + 1)

    for block_id in block_range:
        for account in accounts:
            harvester.create_balance_snapshot(block_id, account)
            self.session.commit()

    return {
        'message': 'Snapshop created',
        'account_id': account_id,
        'block_start': block_start,
        'block_end': block_end,
        'block_ids': block_ids
    }


@app.task(base=BaseTask, bind=True)
def update_balances_in_block(self, block_id):
    harvester = PolkascanHarvesterService(
        db_session=self.session,
        type_registry=TYPE_REGISTRY,
        type_registry_file=TYPE_REGISTRY_FILE
    )

    harvester.create_full_balance_snaphot(block_id)
    self.session.commit()

    harvester.update_account_balances()
    self.session.commit()

    return 'Snapshot created for block {}'.format(block_id)