import datetime
import logging
import sys
import gzip
import argparse
import os
import json
from datetime import datetime, timedelta
from time import time, sleep
from SolrClient import SolrClient, IndexQ

class Reindexer():
    """
    Initiates the re-indexer.

    :param source: An instance of SolrClient, or a directory path with json files.
    :param dest: An instance of SolrClient or an instance of IndexQ.
    :param string source_coll: Source collection name.
    :param string dest_coll: Destination collection name; only required if destination is SolrClient.
    :param int rows: Number of items to get in each query; default is 1000, however you will probably want to increase it.
    :param string date_field: String name of a Solr date field to use in sort and resume.
    :param bool devel: Whether to turn on super verbose logging for development. Standard DEBUG should suffice for most development.
    :param bool per_shard: Will add distrib=false to each query to get the data. Use this only if you will be running multiple instances of this to get the rest of the shards.
    :param list ignore_fields: What fields to exclude from Solr queries. This is important since if you pull them out, you won't be able to index the documents in.
        By default, it will try to determine and exclude copy fields as well as _version_. Pass in your own list to override or set it to False to prevent it from doing anything.
    """

    def __init__(self, source=None, dest=None, source_coll=None, dest_coll=None,
                 rows=1000, date_field=None, devel=False, per_shard=False,
                 ignore_fields=None):
        self.log = logging.getLogger('reindexer')

        self._source = source
        self._source_coll = source_coll
        self._dest = dest
        self._dest_coll = dest_coll
        self._rows = rows
        self._date_field = date_field
        self._per_shard = per_shard
        self._items_processed = 0
        self._devel = devel
        # None means "use the default"; building the list here avoids the
        # mutable-default-argument trap (the list is extended below).
        self._ignore_fields = ['_version_'] if ignore_fields is None else ignore_fields

        # Determine what source and destination should be
        if type(source) is SolrClient and source_coll:
            self._getter = self._from_solr
            # Maybe break this out later for the sake of testing
            if type(self._ignore_fields) is list and len(self._ignore_fields) == 1:
                # Only the default exclusion is set; also exclude copy-field destinations.
                self._ignore_fields.extend(self._get_copy_fields())
        elif type(source) is str and os.path.isdir(source):
            self._getter = self._from_json
        else:
            raise ValueError("Incorrect Source Specified. Pass either a directory with json files or source SolrClient \
                            instance with the name of the collection.")

        if type(self._dest) is SolrClient and self._dest_coll:
            self._putter = self._to_solr
        elif type(dest) is IndexQ:
            self._putter = self._to_IndexQ
        else:
            raise ValueError("Incorrect Destination Specified. Pass either a directory with json files or destination SolrClient \
                            instance with the name of the collection.")
        self.log.info("Reindexer created succesfully. ")

    def _get_copy_fields(self):
        if self._devel:
            self.log.debug("Getting additional copy fields to exclude")
        fields =  [field['dest'] for field in self._source.schema.get_schema_copyfields(self._source_coll)]"Field exclusions are: {}".format(", ".join(fields)))
        return fields

    def reindex(self, fq=[], **kwargs):
        """
        Starts the reindexing process. All keyword arguments are passed down to
        the getter function.

        :param list fq: FilterQuery to pass to source Solr to retrieve items.
            This can be used to limit the results.
        """
        # Pull batches from the source and push each one to the destination.
        for items in self._getter(fq=fq, **kwargs):
            self._putter(items)
        if type(self._dest) is SolrClient and self._dest_coll:
            self.log.info("Finished Indexing, sending a commit")
            self._dest.commit(self._dest_coll, openSearcher=True)

    def _from_solr(self, fq=[], report_frequency = 25):
        Method for retrieving batch data from Solr.
        cursor = '*'
        stime =
        query_count = 0
        while True:
            #Get data with starting cursorMark
            query = self._get_query(cursor)
            #Add FQ to the query. This is used by resume to filter on date fields and when specifying document subset.
            #Not included in _get_query for more flexibiilty.

            if fq:
                if 'fq' in query:
                    [query['fq'].append(x) for x in fq]
                    query['fq'] = fq

            results = self._source.query(self._source_coll, query)
            query_count += 1
            if query_count % report_frequency == 0:
      "Processed {} Items in {} Seconds. Apprximately {} items/minute".format(
                            self._items_processed, int((,
                            str(int(self._items_processed / ((

            if results.get_results_count():
                #If we got items back, get the new cursor and yield the docs
                self._items_processed += results.get_results_count()
                cursor = results.get_cursor()
                #Remove ignore fields
                docs = self._trim_fields(
                yield docs
                if results.get_results_count() < self._rows:
                    #Less results than asked, probably done
                #No Results, probably done :)
                self.log.debug("Got zero Results with cursor: {}".format(cursor))

    def _trim_fields(self, docs):
        Removes ignore fields from the data that we got from Solr.
        for doc in docs:
            for field in self._ignore_fields:
                if field in doc:
        return docs

    def _get_query(self, cursor):
        Query tempalte for source Solr, sorts by id by default.
        query = {'q':'*:*',
                'sort':'id desc',
        if self._date_field:
            query['sort'] = "{} asc, id desc".format(self._date_field)
        if self._per_shard:
            query['distrib'] = 'false'
        return query

    def _to_IndexQ(self, data):
        Sends data to IndexQ instance.

    def _to_solr(self, data):
        Sends data to a Solr instance.
        return self._dest.index_json(self._dest_coll, json.dumps(data,sort_keys=True))

    def _get_date_range_query(self, start_date, end_date, timespan= 'DAY', date_field= None):
        Gets counts of items per specified date range.
        :param collection: Solr Collection to use.
        :param timespan: Solr Date Math compliant value for faceting ex HOUR, MONTH, DAY
        if date_field is None:
            date_field = self._date_field
        query ={'q':'*:*',
                'facet.range': date_field,
                '': '+1{}'.format(timespan),
                'facet.range.end': '{}'.format(end_date),
                'facet.range.start': '{}'.format(start_date),
                'facet.range.include': 'all'
        if self._per_shard:
            query['distrib'] = 'false'
        return query

    def _get_edge_date(self, date_field, sort):
        This method is used to get start and end dates for the collection.
        return self._source.query(self._source_coll, {
                'sort':'{} {}'.format(date_field, sort)}).docs[0][date_field]

    def _get_date_facet_counts(self, timespan, date_field, start_date=None, end_date=None):
        Returns Range Facet counts based on
        if 'DAY' not in timespan:
            raise ValueError("At this time, only DAY date range increment is supported. Aborting..... ")

        #Need to do this a bit better later. Don't like the string and date concatenations.
        if not start_date:
            start_date = self._get_edge_date(date_field, 'asc')
            start_date = datetime.strptime(start_date,'%Y-%m-%dT%H:%M:%S.%fZ').date().isoformat()+'T00:00:00.000Z'
            start_date = start_date+'T00:00:00.000Z'

        if not end_date:
            end_date = self._get_edge_date(date_field, 'desc')
            end_date = datetime.strptime(end_date,'%Y-%m-%dT%H:%M:%S.%fZ').date()
            end_date += timedelta(days=1)
            end_date = end_date.isoformat()+'T00:00:00.000Z'
            end_date = end_date+'T00:00:00.000Z'
"Processing Items from {} to {}".format(start_date, end_date))

        #Get facet counts for source and destination collections
        source_facet = self._source.query(self._source_coll,
            self._get_date_range_query(timespan=timespan, start_date=start_date, end_date=end_date)
        dest_facet = self._dest.query(
            self._dest_coll, self._get_date_range_query(
                    timespan=timespan, start_date=start_date, end_date=end_date
        return source_facet, dest_facet

    def resume(self, start_date=None, end_date=None, timespan='DAY', check=False):
        """
        This method may help if the original run was interrupted for some reason. It will only work under the following conditions:

        * You have a date field that you can facet on
        * Indexing was stopped for the duration of the copy

        The way this tries to resume re-indexing is by running a date range facet on the source and destination collections. It then compares
        the counts in both collections for each timespan specified. If the counts are different, it will re-index items for each range where
        the counts are off. You can also pass in a start_date to only get items after a certain time period. Note that each date range will be indexed in
        its entirety, even if there is only one item missing.

        Keep in mind this only checks the counts and not actual data. So make sure the indexes weren't modified between the reindexing execution and
        running the resume operation.

        :param start_date: Date to start indexing from. If not specified there will be no restrictions and all data will be processed. Note that
            this value will be passed to Solr directly and not modified.
        :param end_date: The date to index items up to.
        :param timespan: Solr Date Math compliant value for faceting; currently only DAY is supported.
        :param check: If set to True it will only log differences between the two collections without actually modifying the destination.
        """
        if type(self._source) is not SolrClient or type(self._dest) is not SolrClient:
            raise ValueError("To resume, both source and destination need to be Solr.")

        source_facet, dest_facet = self._get_date_facet_counts(timespan, self._date_field, start_date=start_date, end_date=end_date)

        for dt_range in sorted(source_facet):
            if dt_range in dest_facet:
                self.log.info("Date Range: {} Source: {} Destination:{} Difference:{}".format(
                        dt_range, source_facet[dt_range], dest_facet[dt_range], (source_facet[dt_range] - dest_facet[dt_range])))
                if check:
                    # Reporting only; don't touch the destination.
                    continue
                if source_facet[dt_range] > dest_facet[dt_range]:
                    # Kicks off reindexing with an additional FQ limiting it to this date range
                    self.reindex(fq=['{}:[{} TO {}]'.format(self._date_field, dt_range, dt_range + '+1{}'.format(timespan))])
                    self.log.info("Complete Date Range {}".format(dt_range))
            else:
                # The destination is missing a whole range the source has; counts can't be compared.
                self.log.error("Something went wrong; Source: {}".format(source_facet))
                self.log.error("Destination: {}".format(dest_facet))
                raise ValueError("Date Ranges don't match up")
        self._dest.commit(self._dest_coll, openSearcher=True)