Python whoosh.index.create_in() Examples

The following are 11 code examples of whoosh.index.create_in(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module whoosh.index, or try the search function.

Example #1

Source File: engine.py From txtorg with MIT License

6 votes

def _init_index(self):
        """Open the index stored at ``self.corpus.path``, creating it if needed.

        The corpus directory is created when it does not exist yet. On return
        ``self.analyzer``, ``self.index``, ``self.searcher`` and ``self.reader``
        are all set.
        """
        index_dir = self.corpus.path
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)

        analyzer = self.analyzer = self.corpus.analyzer

        if not exists_in(index_dir):
            # NOTE: open question carried over from the original author -- this
            # branch may need to go, since the real schema is not known here.
            fields = Schema(
                title=TEXT(stored=True, analyzer=analyzer),
                content=TEXT(analyzer=analyzer),
                path=ID(stored=True)
            )
            index = create_in(index_dir, fields)
            # Commit an empty writer straight after creating the index.
            index.writer().commit()
        else:
            index = open_dir(index_dir)

        self.index = index
        self.searcher = index.searcher()
        self.reader = index.reader()

Example #2

Source File: indexfiles.py From txtorg with MIT License

6 votes

def __init__(self, root, storeDir, analyzer, args_dir = None):
        self.args_dir = args_dir
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        schema = Schema(name=TEXT(stored=True),
                    path=ID(stored=True),
                    txtorg_id=ID(stored=True),
                    contents=TEXT(stored=False,vector=True,analyzer=analyzer()))
        ix = create_in(storeDir, schema)
        writer = ix.writer()
        
        print 'document dir is', root
        self.indexDocs(root, writer)

        print 'optimizing index',
        writer.commit(optimize=True)
        print 'done'
        self.index = ix
        self.writer = writer
        self.reader = ix.reader()

Example #3

Source File: whoosh_write.py From Penny-Dreadful-Tools with GNU General Public License v3.0

5 votes

def rewrite_index(self, cards: List[Card]) -> None:
        """Create a new index in ``WhooshConstants.index_dir`` and hand it, with ``cards``, to ``update_index``."""
        target_dir = WhooshConstants.index_dir
        print('Rewriting index in {d}'.format(d=target_dir))
        ensure_dir_exists(target_dir)
        fresh_index = create_in(target_dir, self.schema)
        update_index(fresh_index, cards)

    # pylint: disable=no-self-use

Example #4

Source File: search.py From markdown-search with GNU General Public License v2.0

5 votes

def open_index(self, index_folder, create_new=False):
        self.index_folder = index_folder
        if create_new:
            if os.path.exists(index_folder):
                shutil.rmtree(index_folder)
                print "deleted index folder: " + index_folder

        if not os.path.exists(index_folder):
            os.mkdir(index_folder)

        exists = index.exists_in(index_folder)
        stemming_analyzer = StemmingAnalyzer()

        schema = Schema(
            path=ID(stored=True, unique=True)
            , filename=TEXT(stored=True, field_boost=100.0)
            , tags=KEYWORD(stored=True, scorable=True, field_boost=80.0)
            , headlines=KEYWORD(stored=True, scorable=True, field_boost=60.0)
            , doubleemphasiswords=KEYWORD(stored=True, scorable=True, field_boost=40.0)
            , emphasiswords=KEYWORD(stored=True, scorable=True, field_boost=20.0)
            , content=TEXT(stored=True, analyzer=stemming_analyzer)
            , time=STORED
        )
        if not exists:
            self.ix = index.create_in(index_folder, schema)
        else:
            self.ix = index.open_dir(index_folder)

Example #5

Source File: whoosh_backend.py From flask-msearch with BSD 3-Clause "New" or "Revised" License

5 votes

def init(self):
        """Return the index located at ``<self.path>/<self.name>``.

        An existing index is opened; otherwise the directory is created when
        missing and a new index with ``self.schema`` is created in it.
        """
        index_dir = os.path.join(self.path, self.name)

        if not whoosh_index.exists_in(index_dir):
            if not os.path.exists(index_dir):
                os.makedirs(index_dir)
            return whoosh_index.create_in(index_dir, self.schema)

        return whoosh_index.open_dir(index_dir)

Example #6

Source File: search.py From databrewer with MIT License

5 votes

def __init__(self, index_dir, schema=DEFAULT_SCHEMA, force_create=False):
        """Open the index in ``index_dir``, or create a new one there.

        Args:
            index_dir: Directory that holds (or will hold) the index.
            schema: Schema kept on ``self.schema`` and passed to Whoosh when
                opening or creating the index.
            force_create: When true, a new index is created even if one
                already exists in ``index_dir``.
        """
        import os

        self.schema = schema
        if exists_in(index_dir) and not force_create:
            self.index = open_dir(index_dir, schema=schema)
            return

        # create_in() needs the directory to be present already, so create
        # it on first use instead of failing on a missing path.
        if not os.path.isdir(index_dir):
            os.makedirs(index_dir)
        self.index = create_in(index_dir, schema=schema)

Example #7

Source File: whooshsearch.py From pySINDy with MIT License

5 votes

def __init__(self, db_path):
        """Open the index at ``db_path`` (creating it when absent) and build the query parser.

        ``ensuredir(db_path)`` is called first. The parser targets the
        ``'text'`` field of ``self.schema``.
        """
        ensuredir(db_path)
        already_indexed = index.exists_in(db_path)
        self.index = (
            index.open_dir(db_path)
            if already_indexed
            else index.create_in(db_path, schema=self.schema)
        )
        self.qparser = QueryParser('text', self.schema)

Example #8

Source File: models.py From realms-wiki with GNU General Public License v2.0

5 votes

def __init__(self, index_path, language):
        """Set up the schema, the on-disk index and the query parser.

        A ``LanguageAnalyzer`` is used when Whoosh reports both a stemmer and
        stop words for ``language``; otherwise a ``SimpleAnalyzer`` is used.
        The process exits through ``sys.exit`` when the index directory
        cannot be created or an existing index cannot be opened.
        """
        import os
        from whoosh import index as whoosh_index
        from whoosh import qparser
        from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
        from whoosh.fields import Schema, TEXT, ID
        from whoosh.highlight import UppercaseFormatter
        from whoosh.lang import has_stemmer, has_stopwords

        if has_stemmer(language) and has_stopwords(language):
            analyzer = LanguageAnalyzer(language)
        else:
            # TODO Display a warning?
            analyzer = SimpleAnalyzer()

        self.schema = Schema(
            path=ID(unique=True, stored=True),
            body=TEXT(analyzer=analyzer)
        )
        self.formatter = UppercaseFormatter()
        self.index_path = index_path

        if not os.path.exists(index_path):
            try:
                os.mkdir(index_path)
            except OSError as e:
                sys.exit("Error creating Whoosh index: %s" % e)

        if not whoosh_index.exists_in(index_path):
            self.search_index = whoosh_index.create_in(index_path, self.schema)
        else:
            try:
                self.search_index = whoosh_index.open_dir(index_path)
            except whoosh_index.IndexError as e:
                sys.exit("Error opening whoosh index: {0}".format(e))

        # Queries run over both fields, with the fuzzy-term plugin added.
        self.query_parser = parser = qparser.MultifieldParser(
            ["body", "path"], schema=self.schema
        )
        parser.add_plugin(qparser.FuzzyTermPlugin())

Example #9

Source File: models.py From realms-wiki with GNU General Public License v2.0

5 votes

def delete_index(self, index):
        """Close the current search index and replace it with a newly created one.

        The new index is created in ``self.index_path`` with ``self.schema``.
        The ``index`` argument is not used by this method.
        """
        from whoosh import index as whoosh_index

        stale_index = self.search_index
        stale_index.close()
        self.search_index = whoosh_index.create_in(
            self.index_path, schema=self.schema
        )

Example #10

Source File: index_whoosh.py From BREDS with GNU Lesser General Public License v3.0

5 votes

def create_index():
    """Return the sentence index stored in ``index_full``, creating it on first use.

    The ``sentence`` field is tokenized with a regex that keeps hyphenated
    words and ``<TAG>...</TAG>`` spans together as single tokens.

    Returns:
        The opened or newly created index.
    """
    from whoosh.index import exists_in

    index_dir = "index_full"
    # Raw string: the pattern is unchanged, but "\w" is no longer an invalid
    # string escape.
    regex_tokenize = re.compile(r'\w+(?:-\w+)+|<[A-Z]+>[^<]+</[A-Z]+>|\w+', re.U)
    tokenizer = RegexTokenizer(regex_tokenize)
    schema = Schema(sentence=TEXT(stored=True, analyzer=tokenizer))

    if not os.path.exists(index_dir):
        os.mkdir(index_dir)

    # Decide on the presence of an index, not just of the directory, so a
    # leftover empty directory no longer makes open_dir() fail.
    if exists_in(index_dir):
        return open_dir(index_dir)
    return create_in(index_dir, schema)

Example #11

Source File: get_template_based_result.py From DualRL with MIT License

4 votes

def cal_sim(train_data_path, test_data_path, dst_result_path=None, save_n_best_search=1):
    """Index the training pairs, then write the best matches for every test line.

    Training lines are tab-separated ``post<TAB>response`` records. Each test
    line's first column is parsed as a query against the ``post`` field, and
    up to ``save_n_best_search`` hits are written to ``dst_result_path`` as
    ``<test line>\\t<post>\\t<response>``.

    Args:
        train_data_path: Training file; the first digit in this path names
            the index sub-directory under ``../tmp/ix_index/``.
        test_data_path: File with one query per line.
        dst_result_path: Output file. Although it defaults to ``None``, it is
            passed straight to ``open``, so a real path is required.
        save_n_best_search: Maximum number of hits written per test line.

    Raises:
        IndexError: If ``train_data_path`` contains no digit.
    """
    schema = Schema(context=TEXT(stored=True), response=STORED, post=TEXT(stored=True))
    # Raw string so "\d" is a regex class rather than an invalid string escape.
    index_i = re.findall(r'\d', train_data_path)[0]

    index_path = "../tmp/ix_index/" + index_i
    if not os.path.exists(index_path):
        os.makedirs(index_path)

    ix = create_in(index_path, schema)
    writer = ix.writer()

    def get_cpr(line):
        """Split a training line into decoded (context, response, post); context is always empty."""
        lines = line.lower().strip().split('\t')
        context = ''
        post = lines[0]
        response = lines[1]
        return context.strip().decode('utf-8'), response.decode('utf-8'), post.decode('utf-8')

    def load_train_data(file_name, writer):
        """Add every training line to the writer, then commit."""
        # The context manager closes the file even if a line is malformed.
        with open(file_name) as f:
            for line in f:
                context, response, post = get_cpr(line)
                if context != '':
                    writer.add_document(context=context, response=response, post=post)
                else:
                    writer.add_document(response=response, post=post)
        writer.commit()

    def get_query(line, ix):
        """Build an OR query over all terms parsed from the line's first column."""
        lines = line.strip().split('\t')
        post = lines[0].decode('utf-8')
        q2 = QueryParser("post", ix.schema).parse(post)
        terms = list(q2.all_terms())
        query = Or([Term(*x) for x in terms])
        return query

    load_train_data(train_data_path, writer)

    # Both files are now closed on every exit path, including errors.
    with open(test_data_path, 'r') as f, open(dst_result_path, 'w') as fw_search:
        with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
            c = searcher.collector(limit=10)
            tlc = TimeLimitCollector(c, timelimit=10.0)
            for line in f:
                try:
                    query = get_query(line, ix)
                    searcher.search_with_collector(query, tlc)
                    results = tlc.results()
                    for i in range(min(len(results), save_n_best_search)):
                        fw_search.write(
                            line.strip() + '\t' + str(results[i]["post"]) + '\t' + str(results[i]["response"]) + '\n')
                except Exception:
                    # Deliberately best-effort: any failure on a line is
                    # reported and the line is skipped.
                    print('TimeLimit, ignore it!')
                    print(line)