# coding:utf-8
"""
Created on 2015-09-24

@author: likaiguo
"""
from __future__ import division, unicode_literals

from _collections import defaultdict
import collections
import datetime
import hashlib
import logging
import re
import sys
import time

from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz

from utils.timer import Timer

from .simcache import SIMCACHE
from .simhash_model import SimHashCache, SimhashInvertedIndex

def add_dup_simhash_caches(simhashcache, dup_obj_ids):
    # Compare `simhashcache.text` against the cached text of every candidate
    # in `dup_obj_ids` with fuzz.partial_ratio, then store a duplicate count
    # and id list on `simhashcache`.
    #
    # simhashcache: cache record; `text` and `obj_id` are read, `dup_count`
    #     and `dup_obj_ids` are assigned at the end.
    # dup_obj_ids: candidate obj_ids, each looked up through SimHashCache.
    #
    # NOTE(review): this function does not parse as written. Several lines
    # appear to be missing (flagged inline below); restore them from version
    # control rather than guessing at their content.

    # NOTE(review): this guard has no body -- presumably an early return; confirm.
    if not dup_obj_ids:
    old_dup_obj_ids = set(dup_obj_ids)
    start_time = time.time()
    # Candidates are numbered from 1 for the timer label.
    for i, dup_obj_id in enumerate(dup_obj_ids, 1):
        with Timer(msg='fuzzy-like:%d %s' % (i, dup_obj_id)):
            logging.info('--' * 100)
            # NOTE(review): the `try:` that pairs with the `except` below is missing.
                dup_simhash = SimHashCache.objects.get(obj_id=dup_obj_id)
            # Python 2 `except` syntax; the error is only printed, so the
            # following lines would still run without a fresh `dup_simhash`
            # -- NOTE(review): confirm whether a `continue` was lost here.
            except Exception, e:
                print e
            # Fuzzy partial-match score between the two cached texts.
            sim_ratio = fuzz.partial_ratio(s1=simhashcache.text, s2=dup_simhash.text)
            logging.info('--' * 20)
            logging.info("%d %s %s" % (sim_ratio, simhashcache.obj_id, dup_simhash.obj_id))

            # NOTE(review): this tests the record object against a set built
            # from ids; `dup_obj_id` may have been intended -- confirm.
            if dup_simhash not in old_dup_obj_ids:
                # NOTE(review): both branches below have lost their bodies.
                if sim_ratio > 50:
                if sim_ratio <= 50:
    # NOTE(review): both operands of `or` are the same name.
    if dup_obj_ids or dup_obj_ids:
        simhashcache.dup_count = len(old_dup_obj_ids)
        simhashcache.dup_obj_ids = list(old_dup_obj_ids)
    # Python 2 print statement: elapsed seconds multiplied by 100.
    print (time.time() - start_time) * 100

class Simhash(object):
    """Simhash fingerprint of a text, a list of features or a raw integer.

    The fingerprint is kept in `value` as an integer; only its lowest `f`
    bits are compared by `distance`.
    """

    def __init__(self, value, f=64, reg=r'[\w\u4e00-\u9fcc]+', hashfunc=None):
        """
        Build the fingerprint from `value`.

        `value` may be another Simhash (its fingerprint is copied), a
        string (tokenized with `reg` and hashed), an iterable of string
        features, or an integer that is used as the fingerprint as-is.

        `f` is the dimensions of fingerprints

        `reg` is meaningful only when `value` is a string and describes
        what is considered to be a letter inside the parsed string. A
        compiled pattern object can also be given (for example one
        compiled with re.UNICODE to handle any letters).

        `hashfunc` accepts a utf-8 encoded string and returns an unsigned
        integer in at least `f` bits. When omitted, an md5-based function
        backed by the SIMCACHE lookup table is used.

        Raises Exception when `value` has an unsupported type.
        """
        self.f = f
        self.reg = reg
        self.value = None

        if hashfunc is None:
            def _hashfunc(x):
                # Lookup cache; it can be grown further.
                if x in SIMCACHE:
                    return SIMCACHE[x]
                return int(hashlib.md5(x).hexdigest(), 16)

            self.hashfunc = _hashfunc
        else:
            # Without this branch a caller-supplied function would be
            # ignored and the default would be overwritten with None.
            self.hashfunc = hashfunc

        if isinstance(value, Simhash):
            self.value = value.value
        elif isinstance(value, basestring):
            # The string check must precede the Iterable check, because
            # strings are iterable too.
            self.build_by_text(unicode(value))
        elif isinstance(value, collections.Iterable):
            self.build_by_features(value)
        elif isinstance(value, (int, long)):
            self.value = value
        else:
            raise Exception('Bad parameter with type {}'.format(type(value)))

    def _slide(self, content, width=4):
        """Return every `width`-character window of `content`.

        Content shorter than `width` yields a single (shorter) window, so
        the result is never empty.
        """
        return [content[i:i + width] for i in range(max(len(content) - width + 1, 1))]

    def _tokenize(self, content):
        """Lower-case `content`, keep only the runs matched by `self.reg`
        and split the joined result into sliding windows."""
        content = content.lower()
        content = ''.join(re.findall(self.reg, content))
        return self._slide(content)

    def build_by_text(self, content):
        """Tokenize `content` and set `self.value` from the tokens."""
        features = self._tokenize(content)
        return self.build_by_features(features)

    def build_by_features(self, features):
        """Set `self.value` from an iterable of string features.

        Each feature is utf-8 encoded and hashed; every one of the `f` bit
        positions is then decided by majority vote over all hashes.
        """
        masks = [1 << i for i in range(self.f)]
        votes = [0] * self.f
        for feature in features:
            h = self.hashfunc(feature.encode('utf-8'))
            for i, mask in enumerate(masks):
                votes[i] += 1 if h & mask else -1

        fingerprint = 0
        for i, mask in enumerate(masks):
            # A tie (including "no features at all") sets the bit.
            if votes[i] >= 0:
                fingerprint |= mask
        self.value = fingerprint

    def distance(self, another):
        """Return the Hamming distance to `another`.

        The two fingerprints are XOR-ed, truncated to `f` bits, and the
        1 bits of the result are counted.
        """
        assert self.f == another.f
        x = (self.value ^ another.value) & ((1 << self.f) - 1)
        ans = 0
        while x:
            ans += 1
            x &= x - 1  # clears the lowest set bit
        return ans

class SimhashIndex(object):
    """In-memory inverted index for finding near-duplicate simhashes.

    Every fingerprint is cut into `k + 1` bit blocks; each block is a
    bucket key, and a bucket holds '<hex fingerprint>,<obj_id>' strings.
    """

    def __init__(self, objs, f=64, k=2):
        """
        `objs` is a list of (obj_id, simhash)
        obj_id is a string, simhash is an instance of Simhash
        `f` is the same with the one for Simhash
        `k` is the tolerance
        """
        self.k = k
        self.f = f
        count = len(objs)
        logging.info('Initializing %s data.', count)
        # Key point: the whole index is kept in memory.
        self.bucket = {}

        for i, q in enumerate(objs):
            # Progress is logged every 10000 items and on the last one.
            if i % 10000 == 0 or i == count - 1:
                logging.info('%s/%s', i + 1, count)

            self.add(*q)

    def get_near_dups(self, simhash):
        """
        `simhash` is an instance of Simhash
        return a list of obj_id, which is in type of str

        An obj_id is returned when its stored fingerprint shares at least
        one bucket key with `simhash` and is within distance `k` of it.
        """
        assert simhash.f == self.f

        ans = set()

        for key in self.get_keys(simhash):
            dups = self.bucket.get(key, set())
            logging.debug('key:%s', key)
            if len(dups) > 200:
                logging.warning('Big bucket found. key:%s, len:%s', key, len(dups))

            for dup in dups:
                # Entries are '<hex fingerprint>,<obj_id>'; split once so
                # that commas inside obj_id survive.
                sim2, obj_id = dup.split(',', 1)
                sim2 = Simhash(long(sim2, 16), self.f)

                d = simhash.distance(sim2)
                if d <= self.k:
                    ans.add(obj_id)
        return list(ans)

    def add(self, obj_id, simhash):
        """
        `obj_id` is a string
        `simhash` is an instance of Simhash
        """
        assert simhash.f == self.f

        v = '%x,%s' % (simhash.value, obj_id)
        for key in self.get_keys(simhash):
            self.bucket.setdefault(key, set()).add(v)

    def delete(self, obj_id, simhash):
        """
        `obj_id` is a string
        `simhash` is an instance of Simhash

        Entries that are not present are ignored; emptied buckets are kept.
        """
        assert simhash.f == self.f

        v = '%x,%s' % (simhash.value, obj_id)
        for key in self.get_keys(simhash):
            if v in self.bucket.get(key, set()):
                self.bucket[key].remove(v)

    @property
    def offsets(self):
        """
        Bit offsets at which the `k + 1` blocks start.

        You may optimize this method according to <http://www.wwwconference.org/www2007/papers/paper215.pdf>
        """
        return [self.f // (self.k + 1) * i for i in range(self.k + 1)]

    def get_keys(self, simhash):
        """
        Split the hash value into blocks and yield one inverted-index key
        per block, formatted as '<block value in hex>:<block number in hex>'.
        """
        offsets = self.offsets
        last = len(offsets) - 1
        for i, offset in enumerate(offsets):
            # The last block absorbs the bits left over by the integer
            # division in `offsets`.
            width = (self.f if i == last else offsets[i + 1]) - offset
            m = (1 << width) - 1
            c = simhash.value >> offset & m
            yield '%x:%x' % (c, i)

    def bucket_size(self):
        """Return the number of bucket keys currently in the index."""
        return len(self.bucket)

class SimhashIndexWithMongo(object):
    """Simhash near-duplicate index stored through the SimHashCache and
    SimhashInvertedIndex models instead of an in-memory dict.

    NOTE(review): the reviewed source of this class had lost a number of
    lines (`try:`/`else:` keywords, call continuations, statement bodies).
    They were reconstructed from the sibling SimhashIndex class and from
    other call sites inside this class; every reconstructed statement is
    marked NOTE(review) and should be confirmed against version control.
    """

    def __init__(self, objs=(), f=64, k=2, hash_type='resume'):
        """
        `objs` is a list of (obj_id, origin_text)
         obj_id is a string
        `f` is the same with the one for Simhash
        `k` is the tolerance; the original comment attributes the default
         of 2 to Charikar's paper on 64-bit simhash (that note is cut off
         in the reviewed source)
        `hash_type` is the hash type of the text
        """
        self.k = k
        self.f = f
        self.hash_type = hash_type
        count = len(objs)
        logging.info('Initializing %s data.', count)

        for i, q in enumerate(objs):
            # Progress is logged every 10000 items and on the last one.
            if i % 10000 == 0 or i == count - 1:
                logging.info('%s/%s', i + 1, count)

            # NOTE(review): loop body reconstructed by analogy with
            # SimhashIndex.__init__ (`self.add(*q)`).
            self.insert(*q)

    def insert(self, obj_id=None, value=None):
        """
        Insert one hash value.

        `value` is either the text to hash or a ready Simhash instance.
        The cache record for `obj_id` is created or updated, one
        inverted-index entry is written per key, and the cache record is
        returned. Nothing is stored, and None is returned, when `obj_id`
        is empty.

        Raises TypeError when `value` is neither text nor a Simhash.
        """
        assert value is not None
        if isinstance(value, (str, unicode)):
            simhash = Simhash(value=value, f=self.f)
        elif isinstance(value, Simhash):
            simhash = value
        else:
            # The original raised a bare string, which Python reports as
            # a TypeError anyway; raise it explicitly with the message.
            raise TypeError('value not text or simhash')
        assert simhash.f == self.f

        if not obj_id:
            return None

        # Cache the original text.
        with Timer(msg='add_simhash_cache'):
            # Store or update the cache record.
            # NOTE(review): the `hash_type` arguments below are restored
            # from the matching lookup in delete().
            simhashcaches = SimHashCache.objects.filter(obj_id=obj_id,
                                                        hash_type=self.hash_type)
            if simhashcaches:
                simhashcache = simhashcaches[0]
            else:
                simhashcache = SimHashCache(obj_id=obj_id,
                                            hash_type=self.hash_type)
            if isinstance(value, (str, unicode)):
                simhashcache.text = value
            simhashcache.update_time = datetime.datetime.now()
            simhashcache.hash_value = "%x" % simhash.value
            # NOTE(review): persisting call reconstructed -- confirm.
            simhashcache.save()

        with Timer(msg='add_invert_index'):
            # Store the inverted index. The fingerprint is hex-encoded to
            # save space and converted back when querying.
            v = '%x,%s' % (simhash.value, obj_id)
            for key in self.get_keys(simhash):
                with Timer(msg='add_invert_index-update_index-insert'):
                    try:
                        # NOTE(review): `simhash_value_obj_id` is the field
                        # find() reads; the keyword and the save() call
                        # are reconstructed -- confirm.
                        invert_index = SimhashInvertedIndex(key=key, hash_type=self.hash_type,
                                                            simhash_value_obj_id=v)
                        invert_index.save()
                    except Exception as e:
                        # Best effort: a failed entry must not abort the
                        # remaining keys.
                        logging.warning('%s,%s,%s', e, key, v)

        return simhashcache

    def find(self, value, k=2, exclude_obj_ids=(), exclude_obj_id_contain=None):
        """
        Find the obj_ids whose stored fingerprint is close to `value`.

        `value` is either text or a Simhash instance. For every key of
        its fingerprint the inverted index is queried, each stored
        fingerprint is compared, and obj_ids with a distance below `k`
        are collected.

        `exclude_obj_ids` is a container of obj_ids to skip.
        `exclude_obj_id_contain`, when given, skips every stored entry
        whose '<hex>,<obj_id>' string contains it.

        Returns a list of obj_ids in no particular order.
        Raises TypeError when `value` is neither text nor a Simhash.

        NOTE(review): the comparison here is `d < k`, while
        SimhashIndex.get_near_dups uses `d <= k` -- confirm which one is
        intended.
        """
        assert value is not None

        if isinstance(value, (str, unicode)):
            simhash = Simhash(value=value, f=self.f)
        elif isinstance(value, Simhash):
            simhash = value
        else:
            raise TypeError('value not text or simhash')
        assert simhash.f == self.f

        ans = set()
        for key in self.get_keys(simhash):
            with Timer(msg='==query: %s' % key):
                candidates = [sim_index.simhash_value_obj_id
                              for sim_index in SimhashInvertedIndex.objects.filter(key=key)]
            # NOTE(review): the branch for "no entries for this key" was
            # missing; skipping the key is assumed.
            if not candidates:
                continue

            with Timer(msg='find d < k %d' % (k)):
                if len(candidates) > 200:
                    logging.warning('Big bucket found. key:%s, len:%s', key, len(candidates))
                for simhash_cache in candidates:
                    try:
                        sim2, obj_id = simhash_cache.split(',', 1)
                        if obj_id in exclude_obj_ids or \
                                (exclude_obj_id_contain and exclude_obj_id_contain in simhash_cache):
                            continue

                        sim2 = Simhash(long(sim2, 16), self.f)
                        d = simhash.distance(sim2)
                        if d < k:
                            ans.add(obj_id)
                    except Exception as e:
                        # A malformed entry is logged and skipped.
                        logging.warning('not exists %s' % (e))
        return list(ans)

    @staticmethod
    def query_simhash_cache(obj_id):
        """
        Return the SimHashCache records whose obj_id contains `obj_id`.

        Declared static because the original signature takes no `self`.
        """
        simhash_caches = SimHashCache.objects.filter(obj_id__contains=obj_id)

        return simhash_caches

    @staticmethod
    def find_similiar(obj_id):
        """
        Return the SimHashCache records whose obj_id contains `obj_id`
        (same query as query_simhash_cache).
        """
        simhash_caches = SimHashCache.objects.filter(obj_id__contains=obj_id)
        return simhash_caches

    def delete(self, obj_id, simhash):
        """
        `obj_id` is a string
        `simhash` is an instance of Simhash

        Removes the cache record of `obj_id` from the inverted-index
        entry of every key of `simhash`. Lookup failures are logged and
        skipped.

        NOTE(review): this method reads `simhash_caches_index` on the
        inverted-index record, while find() reads `simhash_value_obj_id`;
        the two look like different schemas -- confirm against the model.
        """
        assert simhash.f == self.f
        try:
            simhashcache = SimHashCache.objects.get(obj_id=obj_id, hash_type=self.hash_type)
        except Exception as e:
            logging.warning('not exists %s' % (e))
            # Without a cache record there is nothing to look for below.
            return

        for key in self.get_keys(simhash):
            try:
                simhash_invertindex = SimhashInvertedIndex.objects.get(key=key)
                if simhashcache in simhash_invertindex.simhash_caches_index:
                    # NOTE(review): removal and save() reconstructed by
                    # analogy with SimhashIndex.delete -- confirm.
                    simhash_invertindex.simhash_caches_index.remove(simhashcache)
                    simhash_invertindex.save()
            except Exception as e:
                logging.warning('not exists %s' % (e))

    def add(self, obj_id, simhash):
        """
        `obj_id` is a string
        `simhash` is an instance of Simhash (text is accepted as well,
        because the value is handed to insert() unchanged)
        """
        return self.insert(obj_id=obj_id, value=simhash)

    def add_and_find_dup(self, obj_id, value, k=16):
        """
        Add one (obj_id, document) pair, find the most similar stored
        documents and write them into the simhash cache record.

        Purpose: link related simhashes as early as possible while the
        index is being built.

        `value` is parsed with BeautifulSoup and only its text is hashed.
        Entries containing the part of `obj_id` before the first '_' are
        excluded from the search. Returns the cache record from add().
        """
        text = BeautifulSoup(value, "lxml").get_text('\n')
        simhashcache = self.add(obj_id=obj_id, simhash=text)
        with Timer(msg='find'):
            dup_obj_ids = self.find(value=text, k=k, exclude_obj_id_contain=obj_id.split('_')[0])
        if dup_obj_ids:
            with Timer(msg='add_dup_simhash_caches'):
                add_dup_simhash_caches(simhashcache, dup_obj_ids)

        return simhashcache

    def get_near_dups(self, simhash):
        """
        `simhash` is an instance of Simhash
        return a list of obj_id, which is in type of str
        """
        return self.find(simhash, self.k)

    @property
    def offsets(self):
        """
        Bit offsets at which the `k + 1` blocks start.

        You may optimize this method according to <http://www.wwwconference.org/www2007/papers/paper215.pdf>
        """
        return [self.f // (self.k + 1) * i for i in range(self.k + 1)]

    def get_keys(self, simhash):
        """
        Split the hash value into blocks and yield one inverted-index key
        per block, formatted as '<block value in hex>:<block number in hex>'.
        """
        offsets = self.offsets
        last = len(offsets) - 1
        for i, offset in enumerate(offsets):
            # The last block absorbs the bits left over by the integer
            # division in `offsets`.
            width = (self.f if i == last else offsets[i + 1]) - offset
            m = (1 << width) - 1
            c = simhash.value >> offset & m
            yield '%x:%x' % (c, i)

    def bucket_size(self):
        """Return the number of stored SimhashInvertedIndex records."""
        return SimhashInvertedIndex.objects.count()