Python whoosh.index.create_in() Examples

The following are 11 code examples of whoosh.index.create_in(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module whoosh.index , or try the search function .
Example #1
Source File: engine.py    From txtorg with MIT License 6 votes vote down vote up
def _init_index(self):

        if not os.path.exists(self.corpus.path):
            os.mkdir(self.corpus.path)

        analyzer = self.corpus.analyzer
        self.analyzer = self.corpus.analyzer
        
        if exists_in(self.corpus.path):
            ix = open_dir(self.corpus.path)
        else:
            # may need to remove this?  how can we have a schema if we don't know the...uh...schema?
            schema = Schema(title=TEXT(stored=True,analyzer=analyzer), content=TEXT(analyzer=analyzer),
                            path=ID(stored=True))
            ix = create_in(self.corpus.path,schema)
            writer = ix.writer()            
            writer.commit()

        self.index = ix
        self.searcher = ix.searcher();
        #self.reader = IndexReader.open(self.lucene_index, True)
        self.reader = ix.reader();
        #self.analyzer = self.corpus.analyzer 
Example #2
Source File: indexfiles.py    From txtorg with MIT License 6 votes vote down vote up
def __init__(self, root, storeDir, analyzer, args_dir = None):
        self.args_dir = args_dir
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        schema = Schema(name=TEXT(stored=True),
                    path=ID(stored=True),
                    txtorg_id=ID(stored=True),
                    contents=TEXT(stored=False,vector=True,analyzer=analyzer()))
        ix = create_in(storeDir, schema)
        writer = ix.writer()
        
        print 'document dir is', root
        self.indexDocs(root, writer)

        print 'optimizing index',
        writer.commit(optimize=True)
        print 'done'
        self.index = ix
        self.writer = writer
        self.reader = ix.reader() 
Example #3
Source File: whoosh_write.py    From Penny-Dreadful-Tools with GNU General Public License v3.0 5 votes vote down vote up
def rewrite_index(self, cards: List[Card]) -> None:
        print('Rewriting index in {d}'.format(d=WhooshConstants.index_dir))
        ensure_dir_exists(WhooshConstants.index_dir)
        ix = create_in(WhooshConstants.index_dir, self.schema)
        update_index(ix, cards)

    # pylint: disable=no-self-use 
Example #4
Source File: search.py    From markdown-search with GNU General Public License v2.0 5 votes vote down vote up
def open_index(self, index_folder, create_new=False):
        self.index_folder = index_folder
        if create_new:
            if os.path.exists(index_folder):
                shutil.rmtree(index_folder)
                print "deleted index folder: " + index_folder

        if not os.path.exists(index_folder):
            os.mkdir(index_folder)

        exists = index.exists_in(index_folder)
        stemming_analyzer = StemmingAnalyzer()

        schema = Schema(
            path=ID(stored=True, unique=True)
            , filename=TEXT(stored=True, field_boost=100.0)
            , tags=KEYWORD(stored=True, scorable=True, field_boost=80.0)
            , headlines=KEYWORD(stored=True, scorable=True, field_boost=60.0)
            , doubleemphasiswords=KEYWORD(stored=True, scorable=True, field_boost=40.0)
            , emphasiswords=KEYWORD(stored=True, scorable=True, field_boost=20.0)
            , content=TEXT(stored=True, analyzer=stemming_analyzer)
            , time=STORED
        )
        if not exists:
            self.ix = index.create_in(index_folder, schema)
        else:
            self.ix = index.open_dir(index_folder) 
Example #5
Source File: whoosh_backend.py    From flask-msearch with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def init(self):
        ix_path = os.path.join(self.path, self.name)
        if whoosh_index.exists_in(ix_path):
            return whoosh_index.open_dir(ix_path)
        if not os.path.exists(ix_path):
            os.makedirs(ix_path)
        return whoosh_index.create_in(ix_path, self.schema) 
Example #6
Source File: search.py    From databrewer with MIT License 5 votes vote down vote up
def __init__(self, index_dir, schema=DEFAULT_SCHEMA, force_create=False):
        self.schema = schema
        if exists_in(index_dir) and not force_create:
            self.index = open_dir(index_dir, schema=schema)
        else:
            self.index = create_in(index_dir, schema=schema) 
Example #7
Source File: whooshsearch.py    From pySINDy with MIT License 5 votes vote down vote up
def __init__(self, db_path):
        ensuredir(db_path)
        if index.exists_in(db_path):
            self.index = index.open_dir(db_path)
        else:
            self.index = index.create_in(db_path, schema=self.schema)
        self.qparser = QueryParser('text', self.schema) 
Example #8
Source File: models.py    From realms-wiki with GNU General Public License v2.0 5 votes vote down vote up
def __init__(self, index_path, language):
        from whoosh import index as whoosh_index
        from whoosh.fields import Schema, TEXT, ID
        from whoosh import qparser
        from whoosh.highlight import UppercaseFormatter
        from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
        from whoosh.lang import has_stemmer, has_stopwords
        import os

        if not has_stemmer(language) or not has_stopwords(language):
            # TODO Display a warning?
            analyzer = SimpleAnalyzer()
        else:
            analyzer = LanguageAnalyzer(language)

        self.schema = Schema(path=ID(unique=True, stored=True), body=TEXT(analyzer=analyzer))
        self.formatter = UppercaseFormatter()

        self.index_path = index_path

        if not os.path.exists(index_path):
            try:
                os.mkdir(index_path)
            except OSError as e:
                sys.exit("Error creating Whoosh index: %s" % e)

        if whoosh_index.exists_in(index_path):
            try:
                self.search_index = whoosh_index.open_dir(index_path)
            except whoosh_index.IndexError as e:
                sys.exit("Error opening whoosh index: {0}".format(e))
        else:
            self.search_index = whoosh_index.create_in(index_path, self.schema)

        self.query_parser = qparser.MultifieldParser(["body", "path"], schema=self.schema)
        self.query_parser.add_plugin(qparser.FuzzyTermPlugin()) 
Example #9
Source File: models.py    From realms-wiki with GNU General Public License v2.0 5 votes vote down vote up
def delete_index(self, index):
        from whoosh import index as whoosh_index
        self.search_index.close()
        self.search_index = whoosh_index.create_in(self.index_path, schema=self.schema) 
Example #10
Source File: index_whoosh.py    From BREDS with GNU Lesser General Public License v3.0 5 votes vote down vote up
def create_index():
    regex_tokenize = re.compile('\w+(?:-\w+)+|<[A-Z]+>[^<]+</[A-Z]+>|\w+', re.U)
    tokenizer = RegexTokenizer(regex_tokenize)
    schema = Schema(sentence=TEXT(stored=True, analyzer=tokenizer))
    if not os.path.exists("index_full"):
        os.mkdir("index_full")
        idx = create_in("index_full", schema)
    else:
        idx = open_dir("index_full")
    return idx 
Example #11
Source File: get_template_based_result.py    From DualRL with MIT License 4 votes vote down vote up
def cal_sim(train_data_path, test_data_path, dst_result_path=None, save_n_best_search=1):
    schema = Schema(context=TEXT(stored=True), response=STORED, post=TEXT(stored=True))
    index_i = re.findall('\d', train_data_path)[0]

    index_path = "../tmp/ix_index/" + index_i
    if not os.path.exists(index_path):
        os.makedirs(index_path)

    ix = create_in(index_path, schema)
    writer = ix.writer()

    def get_cpr(line):
        lines = line.lower().strip().split('\t')
        context = ''
        post = lines[0]
        response = lines[1]
        return context.strip().decode('utf-8'), response.decode('utf-8'), post.decode('utf-8')

    def load_train_data(file_name, writer):
        f = open(file_name)
        for line in f:
            context, response, post = get_cpr(line)
            if context != '':
                writer.add_document(context=context, response=response, post=post)
            else:
                writer.add_document(response=response, post=post)
        writer.commit()

    def get_query(line, ix):
        lines = line.strip().split('\t')
        post = lines[0].decode('utf-8')
        q2 = QueryParser("post", ix.schema).parse(post)
        terms = list(q2.all_terms())
        query = Or([Term(*x) for x in terms])
        return query

    load_train_data(train_data_path, writer)

    f = open(test_data_path, 'r')
    fw_search = open(dst_result_path, 'w')
    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        c = searcher.collector(limit=10)
        tlc = TimeLimitCollector(c, timelimit=10.0)
        for line in f:
            try:
                query = get_query(line, ix)
                searcher.search_with_collector(query, tlc)
                results = tlc.results()
                for i in range(min(len(results), save_n_best_search)):
                    fw_search.write(
                        line.strip() + '\t' + str(results[i]["post"]) + '\t' + str(results[i]["response"]) + '\n')
            except Exception as e:
                print('TimeLimit, ignore it!')
                print(line)
    fw_search.close()