Python elasticsearch.helpers.streaming_bulk() Examples

The following are 10 code examples of elasticsearch.helpers.streaming_bulk(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module elasticsearch.helpers, or try the search function.
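As a quick orientation before the examples: streaming_bulk() takes a client and an iterable of actions, sends them to Elasticsearch in chunks, and yields one (success, info) tuple per action. A minimal sketch follows; the host, index name, and document generator are placeholders, not taken from any example below.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

es = Elasticsearch()  # assumes a node reachable on localhost:9200

def generate_actions():
    # each action is a plain dict; metadata keys like _index and _id are optional
    for i in range(100):
        yield {'_index': 'my-index', '_id': i, 'value': i}

for ok, info in streaming_bulk(es, generate_actions(), raise_on_error=False):
    if not ok:
        print('failed:', info)

With raise_on_error=False, failures are reported through the yielded tuples instead of aborting the whole run, which is the pattern most of the examples below rely on.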
Example #1
Source File: commands.py    From udata with GNU Affero General Public License v3.0
def index_model(index_name, adapter, timeout=None):
    '''Index all objects for a given model.'''
    model = adapter.model
    log.info('Indexing %s objects', model.__name__)
    qs = model.objects
    if hasattr(model.objects, 'visible'):
        qs = qs.visible()
    if adapter.exclude_fields:
        qs = qs.exclude(*adapter.exclude_fields)

    docs = iter_qs(qs, adapter)
    docs = iter_for_index(docs, index_name)

    for ok, info in streaming_bulk(es.client, docs, raise_on_error=False,
                                   request_timeout=timeout):
        if not ok:
            log.error('Unable to index %s "%s": %s', model.__name__,
                      info['index']['_id'], info['index']['error']) 
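The iter_qs and iter_for_index helpers are not shown in this snippet. Judging by how the docs are consumed, iter_for_index presumably just stamps the target index onto each serialized document. A hypothetical reconstruction, not udata's actual code:

def iter_for_index(docs, index_name):
    # hypothetical helper: route every document to the given index
    for doc in docs:
        doc['_index'] = index_name
        yield doc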
Example #2
Source File: views.py    From texta with GNU General Public License v3.0
def insert_multiple_documents(self, list_of_documents):
        actions = [{"_source": document, "_index": self.index, "_type": self.doc_type} for document in list_of_documents]

        for success, response in elastic_parallelbulk(client=self.es, actions=actions):
            self.logger.info("BulkAPI response: {0}".format(str(response)))
            if not success:
                self.logger.error(str(response))
                raise ValueError(str(response)) 
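Note that elastic_parallelbulk is not a name defined by elasticsearch-py; it is presumably an import alias for helpers.parallel_bulk, which exposes the same (success, info) iteration contract as streaming_bulk but sends chunks from a thread pool. The import likely looks something like this (an assumption, not shown in the snippet):

from elasticsearch.helpers import parallel_bulk as elastic_parallelbulk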
Example #3
Source File: index_data.py    From es-django-example with Apache License 2.0
def verbose_run(self, model, report_every=100):
        name = model._meta.verbose_name
        print('Indexing %s: ' % name, end='')
        start = time.time()
        cnt = 0
        for _ in streaming_bulk(
                self.es,
                (m.to_search().to_dict(True) for m in model.objects.all().iterator()),
                index=settings.ES_INDEX,
                doc_type=name.lower(),
            ):
            cnt += 1
            # print a progress dot every `report_every` documents
            if cnt % report_every == 0:
                print('.', end='')
                sys.stdout.flush()
        print('DONE\nIndexing %d %s in %.2f seconds' % (
            cnt, name, time.time() - start
        )) 
Example #4
Source File: indexer.py    From georef-ar-api with MIT License
def _insert_documents(self, es, index, docs, count, verbose=False):
        """Inserta documentos dentro de un índice.

        Args:
            es (Elasticsearch): Cliente Elasticsearch.
            index (str): Nombre de índice.
            docs (Iterator[dict]): Iterator de documentos a insertar.
            count (int): Cantidad de documentos a insertar.
            verbose (bool): Mostrar más información en pantalla.

        """
        operations = self._bulk_update_generator(docs, index)
        creations, errors = 0, 0

        logger.info('Inserting documents...')

        iterator = helpers.streaming_bulk(es, operations, raise_on_error=False,
                                          request_timeout=ES_TIMEOUT)

        if verbose:
            iterator = tqdm.tqdm(iterator, total=count, file=sys.stderr)

        for ok, response in iterator:
            if ok and response['create']['result'] == 'created':
                creations += 1
            else:
                errors += 1
                identifier = response['create']['_id']
                error = response['create']['error']

                logger.warning(
                    'Error processing document ID {}:'.format(identifier))
                logger.warning(json.dumps(error, indent=4, ensure_ascii=False))
                logger.warning('')

        logger.info('Summary:')
        logger.info(' + Documents processed: {}'.format(count))
        logger.info(' + Documents created: {}'.format(creations))
        logger.info(' + Errors: {}'.format(errors))
        logger.info('') 
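The _bulk_update_generator method is not shown here. Since the result loop reads response['create'], it presumably emits create-op bulk actions, roughly like the hypothetical sketch below (not the actual georef-ar-api code):

def _bulk_update_generator(self, docs, index):
    # hypothetical: wrap each document in a 'create' bulk action
    for doc in docs:
        yield {
            '_op_type': 'create',
            '_index': index,
            '_id': doc['id'],  # assumes each document carries its own ID
            '_source': doc,
        }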
Example #5
Source File: _elasticsearch_helpers.py    From mindmeld with Apache License 2.0
def version_compatible_streaming_bulk(
    es_client, docs, index, chunk_size, raise_on_error, doc_type
):

    if is_es_version_7(es_client):
        return streaming_bulk(
            es_client,
            docs,
            index=index,
            chunk_size=chunk_size,
            raise_on_error=raise_on_error,
        )
    else:
        return streaming_bulk(
            es_client,
            docs,
            index=index,
            doc_type=doc_type,
            chunk_size=chunk_size,
            raise_on_error=raise_on_error,
        ) 
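A possible call site for this wrapper; is_es_version_7 is defined elsewhere in mindmeld's helpers, and the client, index name, and document generator here are placeholders:

from elasticsearch import Elasticsearch

es = Elasticsearch()
docs = ({'_id': i, 'text': 'doc %d' % i} for i in range(1000))

for ok, info in version_compatible_streaming_bulk(
    es, docs, index='my-index', chunk_size=500,
    raise_on_error=False, doc_type='document',
):
    if not ok:
        print(info)

The wrapper exists because Elasticsearch 7 removed mapping types, so doc_type must only be passed to older clusters.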
Example #6
Source File: es_load.py    From openslack-crawler with Apache License 2.0
def load_repo(client, path=None, index='git'):
    """
    Parse a git repository with all its commits and load it into elasticsearch
    using `client`. If the index doesn't exist it will be created.
    """
    path = dirname(dirname(abspath(__file__))) if path is None else path
    repo_name = basename(path)
    repo = git.Repo(path)

    create_git_index(client, index)

    # create the parent document in case it doesn't exist
    client.create(
        index=index,
        doc_type='repos',
        id=repo_name,
        body={},
        ignore=409 # 409 - conflict - would be returned if the document is already there
    )

    # we let the streaming bulk continuously process the commits as they come
    # in - since the `parse_commits` function is a generator this will avoid
    # loading all the commits into memory
    for ok, result in streaming_bulk(
            client,
            parse_commits(repo.refs.master.commit, repo_name),
            index=index,
            doc_type='commits',
            chunk_size=50 # keep the batch sizes small for appearances only
        ):
        action, result = result.popitem()
        doc_id = '/%s/commits/%s' % (index, result['_id'])
        # process the information from ES whether the document has been
        # successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)


# we manually create the es repo document and update elasticsearch-py to include metadata
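parse_commits is defined elsewhere in the file. Based on how it is consumed above (a generator of per-commit documents), a minimal reconstruction with GitPython might look like the sketch below; this is an illustration, not the project's actual implementation:

from datetime import datetime

def parse_commits(head, repo_name):
    # hypothetical: walk the commit graph and yield one document per commit
    for commit in head.traverse():
        yield {
            '_id': commit.hexsha,
            'repository': repo_name,
            'committed_date': datetime.fromtimestamp(commit.committed_date),
            'author': commit.author.name,
            'description': commit.message,
        }

Because this is a generator, streaming_bulk pulls commits as it needs them, so the whole history never has to fit in memory, as the comment in the example points out.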
Example #7
Source File: catalog_meta_indexer.py    From series-tiempo-ar-api with MIT License
def index(self) -> bool:
        if not self.get_available_fields().count():
            self.task.info(self.task, "No hay series para indexar en este catálogo")
            return False

        index_ok = False
        for success, info in streaming_bulk(self.elastic, self.generate_actions()):
            if not success:
                self.task.info(self.task, 'Error indexing: {}'.format(info))
            else:
                index_ok = True

        return index_ok 
Example #8
Source File: elastic2_doc_manager.py    From elastic2-doc-manager with Apache License 2.0
def handle_command(self, doc, namespace, timestamp):
        # Flush buffer before handle command
        self.commit()
        db = namespace.split(".", 1)[0]
        if doc.get("dropDatabase"):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get("renameCollection"):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping."
            )

        if doc.get("create"):
            db, coll = self.command_helper.map_collection(db, doc["create"])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(), doc_type=coll, body={"_source": {"enabled": True}}
                )

        if doc.get("drop"):
            db, coll = self.command_helper.map_collection(db, doc["drop"])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                warnings.warn(
                    "Deleting all documents of type %s on index %s. "
                    "The mapping definition will persist and must be "
                    "removed manually." % (coll, db)
                )
                responses = streaming_bulk(
                    self.elastic,
                    (
                        dict(result, _op_type="delete")
                        for result in scan(
                            self.elastic, index=db.lower(), doc_type=coll
                        )
                    ),
                )
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch "
                            "document during handling of 'drop' command: %r" % resp
                        )
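The pattern here, feeding scan() hits back into streaming_bulk() with _op_type="delete", is a generic way to empty a mapping without removing it: scan() yields each hit with its _index, _type, and _id metadata, which is exactly what a delete action needs. Stripped of the doc-manager context, the same idea looks like the sketch below (index and type names are placeholders; the doc_type argument matches the older elasticsearch-py versions these examples target):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan, streaming_bulk

es = Elasticsearch()
hits = scan(es, index='my-index', doc_type='my-type')
deletes = (dict(hit, _op_type='delete') for hit in hits)
for ok, resp in streaming_bulk(es, deletes, raise_on_error=False):
    if not ok:
        print(resp)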
Example #9
Source File: elastic2_doc_manager.py    From elastic2-doc-manager with Apache License 2.0
def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch."""

        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = str(doc.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc),
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {"ns": namespace, "_ts": timestamp},
                }
                yield document_action
                yield document_meta
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search"
                )

        try:
            kw = {}
            if self.chunk_size > 0:
                kw["chunk_size"] = self.chunk_size

            responses = streaming_bulk(
                client=self.elastic, actions=docs_to_upsert(), **kw
            )

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r" % resp
                    )
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up with no
            # config file and there is nothing to dump
            pass 
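Two details worth noting in this example: docs_to_upsert() yields two actions per source document (the document itself plus a bookkeeping record destined for a separate meta index), and it uses a sentinel variable to detect an empty input iterable after the loop finishes. The sentinel idiom in isolation, as a generic sketch:

def check_not_empty(items):
    # yield everything, then raise if the input produced nothing
    item = None
    for item in items:
        yield item
    if item is None:
        raise ValueError('empty sequence')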
Example #10
Source File: views.py    From autocompeter with Mozilla Public License 2.0
def bulk(request, domain):
    assert domain

    try:
        documents = json.loads(request.body.decode('utf-8'))['documents']
    except KeyError:
        return http.JsonResponse({'error': "Missing 'documents'"}, status=400)

    def iterator():
        for document in documents:
            url = document.get('url', '').strip()
            if not url:
                continue
            title = document.get('title', '').strip()
            if not title:
                continue
            yield TitleDoc(
                meta={'id': make_id(domain.name, url)},
                **{
                    'domain': domain.name,
                    'url': url,
                    'title': title,
                    'group': document.get('group', '').strip(),
                    'popularity': float(document.get('popularity', 0.0)),
                }
            ).to_dict(True)

    count = failures = 0

    t0 = time.time()
    for success, doc in streaming_bulk(
        connections.get_connection(),
        iterator(),
        index=settings.ES_INDEX,
        doc_type='title_doc',
    ):
        if not success:
            print("NOT SUCCESS!", doc)
            failures += 1
        count += 1
    t1 = time.time()

    return http.JsonResponse({
        'message': 'OK',
        'count': count,
        'failures': failures,
        'took': t1 - t0,
    }, status=201)
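A client for this view would POST a JSON body with a 'documents' list. The exact URL depends on autocompeter's routing, so the one below is a placeholder:

import requests  # third-party HTTP client

payload = {'documents': [
    {'url': 'https://example.com/a', 'title': 'Page A', 'popularity': 10},
]}
resp = requests.post(
    'https://autocompeter.example/v1/bulk',  # hypothetical endpoint URL
    json=payload,
)
print(resp.json())  # e.g. {'message': 'OK', 'count': 1, 'failures': 0, 'took': ...}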