Python more_itertools.chunked() Examples

The following are 23 code examples of more_itertools.chunked(), collected from open-source projects. You can go to the original project or source file by following the link above each example. You may also want to check out the other functions and classes available in the more_itertools module.
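Before diving into the project examples, here is a minimal sketch of what chunked() does: it breaks any iterable into lists of at most n items, with the final list holding whatever is left over.

import more_itertools

numbers = range(1, 8)
print(list(more_itertools.chunked(numbers, 3)))
# [[1, 2, 3], [4, 5, 6], [7]]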
Example #1
Source File: filter.py    From pygreynoise with MIT License
def filter(self, text, noise_only):
        """Filter lines that contain IP addresses from a given text.

        :param text: Text input
        :type text: file-like | str
        :param noise_only:
            If set, return only lines that contain IP addresses classified as noise,
            otherwise, return lines that contain IP addresses not classified as noise.
        :type noise_only: bool
        :return: Iterator that yields lines in chunks
        :rtype: iterable

        """
        if isinstance(text, str):
            text = text.splitlines(True)
        chunks = more_itertools.chunked(text, self.FILTER_TEXT_CHUNK_SIZE)
        for chunk in chunks:
            yield self._filter_chunk(chunk, noise_only) 
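The core pattern here, stripped of the GreyNoise-specific filtering, is: split a string into lines, chunk the lines, and process one chunk at a time. A minimal sketch of that pattern follows, where FILTER_TEXT_CHUNK_SIZE and the handle_chunk callable are stand-ins for the class attribute and the _filter_chunk() method used above.

import more_itertools

FILTER_TEXT_CHUNK_SIZE = 1000  # hypothetical stand-in for the class attribute

def process_in_chunks(text, handle_chunk):
    """Yield the result of handle_chunk() for each chunk of lines in text."""
    if isinstance(text, str):
        text = text.splitlines(True)
    for chunk in more_itertools.chunked(text, FILTER_TEXT_CHUNK_SIZE):
        yield handle_chunk(chunk)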
Example #2
Source File: utils.py    From vidreid_cosegmentation with Apache License 2.0
def get_spatial_features(model, imgs, test_num_tracks):
    """to handle higher seq length videos due to OOM error
    specifically used during test
    
    Arguments:
        model -- model under test
        imgs -- imgs to get features for
    
    Returns:
        features 
    """

    # handle chunked data
    all_features, all_spatial_features = [], []

    for test_imgs in mit.chunked(imgs, test_num_tracks):
        current_test_imgs = torch.stack(test_imgs)
        num_current_test_imgs = current_test_imgs.shape[0]
        features, spatial_feats = model(current_test_imgs)
        features = features.view(num_current_test_imgs, -1)

        all_spatial_features.append(spatial_feats)
        all_features.append(features)

    return torch.cat(all_features), torch.cat(all_spatial_features) 
Example #3
Source File: utils.py    From vidreid_cosegmentation with Apache License 2.0
def get_features(model, imgs, test_num_tracks):
    """to handle higher seq length videos due to OOM error
    specifically used during test
    
    Arguments:
        model -- model under test
        imgs -- imgs to get features for
    
    Returns:
        features 
    """

    # handle chunked data
    all_features = []

    for test_imgs in mit.chunked(imgs, test_num_tracks):
        current_test_imgs = torch.stack(test_imgs)
        num_current_test_imgs = current_test_imgs.shape[0]
        # print(current_test_imgs.shape)
        features = model(current_test_imgs)
        features = features.view(num_current_test_imgs, -1)
        all_features.append(features)

    return torch.cat(all_features) 
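Both of these helpers follow the same memory-bounding pattern: process a long sequence in fixed-size chunks and concatenate the per-chunk results at the end. A framework-free sketch of that pattern with NumPy, where the batch size and the per-batch function are placeholders:

import more_itertools as mit
import numpy as np

def batched_apply(rows, fn, batch_size=32):
    """Apply fn to rows in fixed-size batches and concatenate the results."""
    outputs = []
    for batch in mit.chunked(rows, batch_size):
        outputs.append(fn(np.stack(batch)))
    return np.concatenate(outputs)

# Example: square 100 vectors in batches of 32 instead of all at once.
data = [np.arange(4, dtype=float) for _ in range(100)]
result = batched_apply(data, lambda x: x ** 2)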
Example #4
Source File: displacements.py    From langchangetrack with BSD 3-Clause "New" or "Revised" License
def calculate_words_displacement(self, column_names, n_jobs = 1):
        """ Calculate word displacements for each word in the Pandas data frame. """

        words = self.get_word_list()
        # Create chunks of the words to be processed.
        # chunked() expects an integer chunk size, so cast the ceiling to int.
        chunk_sz = int(np.ceil(len(words) / float(n_jobs)))
        chunks = list(more_itertools.chunked(words, chunk_sz))

        # Calculate the displacements
        chunksL = Parallel(n_jobs=n_jobs, verbose=20)(delayed(process_chunk)(chunk, process_word_source, self) for chunk in chunks)
        chunksH = Parallel(n_jobs=n_jobs, verbose=20)(delayed(process_chunk)(chunk, process_word_dest, self) for chunk in chunks)
        L = more_itertools.flatten(chunksL)
        H = more_itertools.flatten(chunksH)
        flattendL = [x for sublist in L for x in sublist]
        flattendH = [x for sublist in H for x in sublist]

        # Store the results in a nice pandas data frame
        dfo, dfn = self.create_data_frames(flattendL, flattendH, column_names)
        return flattendL, flattendH, dfo, dfn 
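The chunking here exists purely to split the word list into one slice per worker. A minimal sketch of the same split-then-parallelize idea, assuming joblib is available and using a trivial per-chunk function in place of the project's process_chunk:

import math

import more_itertools
from joblib import Parallel, delayed

def process_chunk(chunk):
    # Stand-in for the real per-chunk work.
    return [word.upper() for word in chunk]

words = ["alpha", "beta", "gamma", "delta", "epsilon"]
n_jobs = 2
chunk_sz = int(math.ceil(len(words) / float(n_jobs)))
chunks = list(more_itertools.chunked(words, chunk_sz))
results = Parallel(n_jobs=n_jobs)(delayed(process_chunk)(c) for c in chunks)
flattened = list(more_itertools.flatten(results))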
Example #5
Source File: dataset.py    From machine-learning-for-programming-samples with MIT License
def get_minibatch_iterator(
    token_seqs: np.ndarray,
    batch_size: int,
    is_training: bool,
    drop_remainder: bool = True,
) -> Iterator[np.ndarray]:
    indices = np.arange(token_seqs.shape[0])
    if is_training:
        np.random.shuffle(indices)

    for minibatch_indices in chunked(indices, batch_size):
        if len(minibatch_indices) < batch_size and drop_remainder:
            break  # Drop last, smaller batch

        minibatch_seqs = token_seqs[minibatch_indices]
        yield minibatch_seqs 
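With a toy token array, the iterator above might be driven like this (the sketch assumes the function above, NumPy, and chunked are already in scope; shapes are illustrative):

import numpy as np

token_seqs = np.arange(50).reshape(10, 5)  # 10 sequences of length 5
for batch in get_minibatch_iterator(token_seqs, batch_size=4, is_training=True):
    print(batch.shape)  # (4, 5) for the full batches; the final batch of 2 is dropped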
Example #6
Source File: test_more.py    From python-netsurv with MIT License
def test_even(self):
        """Test when ``n`` divides evenly into the length of the iterable."""
        self.assertEqual(
            list(mi.chunked('ABCDEF', 3)), [['A', 'B', 'C'], ['D', 'E', 'F']]
        ) 
Example #7
Source File: graph_sequence.py    From article-0 with MIT License
def __init__(self, args, batch_size=32, test=False):
		self.batch_size = batch_size
		
		self.query = """
			MATCH p=
				(person:PERSON) 
					-[:WROTE]-> 
				(review:REVIEW {dataset_name:{dataset_name}, test:{test}}) 
					-[:OF]-> 
				(product:PRODUCT)
			RETURN person.style_preference + product.style as x, review.score as y
		"""

		self.query_params = {
			"dataset_name": "article_0",
			"test": test
		}

		with open('./settings.json') as f:
			self.settings = json.load(f)[args.database]

		driver = GraphDatabase.driver(
			self.settings["neo4j_url"], 
			auth=(self.settings["neo4j_user"], self.settings["neo4j_password"]))

		with driver.session() as session:
			data = session.run(self.query, **self.query_params).data()
			data = [ (np.array(i["x"]), i["y"]) for i in data]
			
			# Split the data up into "batches"
			data = more_itertools.chunked(data, self.batch_size)

			# Format our batches in the way Keras expects them:
			# An array of tuples (x_batch, y_batch)

			# An x_batch is a numpy array of shape (batch_size, 12), 
			# containing the concatenated style and style_preference vectors. 

			# A y_batch is a numpy array of shape (batch_size,1) containing the review scores.

			self.data = [ (np.array([j[0] for j in i]), np.array([j[1] for j in i])) for i in data] 
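The last line above does two things at once: it chunks the (x, y) pairs and restacks each chunk into a pair of arrays. A standalone sketch of that reshaping step, using made-up feature vectors in place of the Neo4j query results:

import more_itertools
import numpy as np

# Hypothetical (x, y) pairs standing in for the Neo4j query results.
pairs = [(np.random.rand(12), float(i % 5)) for i in range(100)]

batches = [
    (np.array([x for x, _ in batch]), np.array([y for _, y in batch]))
    for batch in more_itertools.chunked(pairs, 32)
]
# batches[0][0].shape == (32, 12); batches[0][1].shape == (32,)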
Example #8
Source File: biodati_client.py    From pybel with MIT License
def post_graph_chunked(
        self,
        graph: BELGraph,
        chunksize: int,
        *,
        use_tqdm: bool = True,
        collections: Optional[Iterable[str]] = None,
        overwrite: bool = False,
        validate: bool = True,
        email: Union[bool, str] = False
    ) -> requests.Response:
        """Post the graph to BioDati in chunks, when the graph is too big for a normal upload.

        :param graph: A BEL graph
        :param chunksize: The size of the chunks of nanopubs to upload
        :param use_tqdm: Should tqdm be used when iterating?
        :param collections: Tags to add to the nanopubs for lookup on BioDati
        :param overwrite: Set the BioDati upload "overwrite" setting
        :param validate: Set the BioDati upload "validate" setting
        :param email: Who should get emailed with results about the upload? If true, emails to user
         used for login. If string, emails to that user. If false, no email.
        :return: Last response from upload
        """
        metadata_extras = dict()
        if collections is not None:
            metadata_extras.update(collections=list(collections))
        iterable = _iter_graphdati(graph, use_tqdm=use_tqdm, metadata_extras=metadata_extras)
        res = None
        for chunk in chunked(iterable, chunksize):
            res = self.post_graph_json(
                chunk,
                overwrite=overwrite,
                validate=validate,
                email=email,
            )
        return res 
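Stripped to its essentials, the upload loop above walks an iterable in fixed-size chunks and keeps only the last response. A schematic version, with a placeholder post_chunk callable standing in for post_graph_json:

from more_itertools import chunked

def post_in_chunks(items, post_chunk, chunksize=100):
    """Post items in chunks of chunksize; return the last response (None if items is empty)."""
    response = None
    for chunk in chunked(items, chunksize):
        response = post_chunk(chunk)
    return response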
Example #9
Source File: test_more.py    From pipenv with MIT License
def test_even(self):
        """Test when ``n`` divides evenly into the length of the iterable."""
        self.assertEqual(
            list(mi.chunked('ABCDEF', 3)), [['A', 'B', 'C'], ['D', 'E', 'F']]
        ) 
Example #10
Source File: test_more.py    From pipenv with MIT License
def test_odd(self):
        """Test when ``n`` does not divide evenly into the length of the
        iterable.

        """
        self.assertEqual(
            list(mi.chunked('ABCDE', 3)), [['A', 'B', 'C'], ['D', 'E']]
        ) 
Example #11
Source File: abuse_ch.py    From n6 with GNU Affero General Public License v3.0
def publish_iteratively(self):
        for chunk in more_itertools.chunked(self._selected_data, 20000):
            rk, body, prop_kwargs = self.get_output_components(selected_data="\n".join(chunk))
            self.publish_output(rk, body, prop_kwargs)
            yield
        yield self.FLUSH_OUT
        self.save_state(self._state) 
Example #12
Source File: varmisuse_data_splitter.py    From tf-gnn-samples with MIT License
def _write_data(out_dir: RichPath, window_idx: int, chunk_size: int, data_window: List[Any]):
    np.random.shuffle(data_window)
    for chunk_idx, data_chunk in enumerate(chunked(data_window, chunk_size)):
        out_file = out_dir.join('chunk_%i-%i.jsonl.gz' % (window_idx, chunk_idx))
        out_file.save_as_compressed_file(data_chunk) 
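A plain-Python sketch of the same idea, writing each chunk to its own gzipped JSON-lines file with the standard library instead of RichPath (file names and the chunk size are arbitrary):

import gzip
import json
import os

from more_itertools import chunked

def write_chunks(records, out_dir, chunk_size=100):
    """Write records to out_dir as numbered .jsonl.gz files, chunk_size records per file."""
    os.makedirs(out_dir, exist_ok=True)
    for chunk_idx, chunk in enumerate(chunked(records, chunk_size)):
        path = os.path.join(out_dir, "chunk_%i.jsonl.gz" % chunk_idx)
        with gzip.open(path, "wt", encoding="utf-8") as f:
            for record in chunk:
                f.write(json.dumps(record) + "\n")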
Example #13
Source File: score_processor.py    From revscoring with MIT License
def score(self, rev_ids, caches=None, cache=None):
        if isinstance(rev_ids, int):
            rev_ids = [rev_ids]

        batches = batch_rev_caches(chunked(rev_ids, self.batch_size), caches,
                                   cache)

        for batch_scores in self.scores_ex.map(self._score_batch, batches):
            for score in batch_scores:
                yield score 
Example #14
Source File: test_more.py    From python-netsurv with MIT License
def test_odd(self):
        """Test when ``n`` does not divide evenly into the length of the
        iterable.

        """
        self.assertEqual(
            list(mi.chunked('ABCDE', 3)), [['A', 'B', 'C'], ['D', 'E']]
        ) 
Example #15
Source File: test_more.py    From python-netsurv with MIT License
def test_even(self):
        """Test when ``n`` divides evenly into the length of the iterable."""
        self.assertEqual(
            list(mi.chunked('ABCDEF', 3)), [['A', 'B', 'C'], ['D', 'E', 'F']]
        ) 
Example #16
Source File: test_more.py    From python-netsurv with MIT License
def test_odd(self):
        """Test when ``n`` does not divide evenly into the length of the
        iterable.

        """
        self.assertEqual(
            list(mi.chunked('ABCDE', 3)), [['A', 'B', 'C'], ['D', 'E']]
        ) 
Example #17
Source File: pydataapi.py    From py-data-api with MIT License
def batch_execute(
        self,
        query: Union[Query, Insert, Update, Delete, Select, str],
        parameter_sets: Optional[List[Dict[str, Any]]],
        transaction_id: Optional[str] = None,
        database: Optional[str] = None,
    ) -> UpdateResults:

        if self.transaction_id:
            start_transaction: bool = False
        else:
            self.begin(database=database)
            start_transaction = True
        try:
            results_sets = list(
                flatten(
                    self.client.batch_execute_statement(
                        **Options(
                            resourceArn=self.resource_arn,
                            secretArn=self.secret_arn,
                            database=database or self.database,
                            transactionId=transaction_id or self.transaction_id,
                            parameterSets=chunked_parameter_sets,  # type: ignore
                            sql=query,
                        ).build()
                    )["updateResults"]
                    for chunked_parameter_sets in chunked(
                        parameter_sets or [], MAX_RECORDS
                    )
                )
            )
        except:
            if start_transaction:
                self.rollback()
            raise
        if start_transaction:
            self.commit()
        return UpdateResults(results_sets) 
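The nested generator above is dense; the chunking part boils down to splitting the parameter sets into groups no larger than MAX_RECORDS and collecting the per-group results. A simplified sketch of just that step, where the MAX_RECORDS value and the execute_batch callable are stand-ins rather than the library's own definitions:

from more_itertools import chunked, flatten

MAX_RECORDS = 1000  # stand-in for the library constant

def execute_in_batches(execute_batch, parameter_sets):
    """Call execute_batch() once per chunk of parameter sets and flatten the results."""
    return list(
        flatten(
            execute_batch(chunk)
            for chunk in chunked(parameter_sets or [], MAX_RECORDS)
        )
    )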
Example #18
Source File: dump.py    From promnesia with MIT License
def dump_histories(all_histories: List[Tuple[str, List[DbVisit]]]) -> None:
    logger = get_logger()
    output_dir = Path(config.get().output_dir)
    db_path = output_dir / 'promnesia.sqlite'

    def iter_visits():
        for e, h in all_histories:
            # TODO sort them somehow for determinism?
            # TODO what do we do with errors?
            # TODO maybe conform them to schema and dump too?
            # TODO or, dump to a separate table?
            yield from h

    tpath = Path(get_tmpdir().name) / 'promnesia.tmp.sqlite'
    engine = create_engine(f'sqlite:///{tpath}')
    binder = NTBinder.make(DbVisit)
    meta = MetaData(engine)
    table = Table('visits', meta, *binder.columns)
    meta.create_all()

    with engine.begin() as conn:
        for chunk in chunked(iter_visits(), n=_CHUNK_BY):
            bound = [binder.to_row(x) for x in chunk]
            # pylint: disable=no-value-for-parameter
            conn.execute(table.insert().values(bound))

    shutil.move(str(tpath), str(db_path))

    logger.info('saved database to %s', db_path)
    # TODO log error count 
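The pattern of inserting rows in chunks rather than one at a time (or all at once) also works with the standard-library sqlite3 module. A minimal sketch, with a throwaway in-memory table and an arbitrary chunk size:

import sqlite3

from more_itertools import chunked

rows = [(i, "visit-%d" % i) for i in range(10000)]

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE visits (id INTEGER, url TEXT)")
with conn:
    for chunk in chunked(rows, 1000):
        conn.executemany("INSERT INTO visits VALUES (?, ?)", chunk)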
Example #19
Source File: test_more.py    From Tautulli with GNU General Public License v3.0
def test_even(self):
        """Test when ``n`` divides evenly into the length of the iterable."""
        self.assertEqual(
            list(mi.chunked('ABCDEF', 3)), [['A', 'B', 'C'], ['D', 'E', 'F']]
        ) 
Example #20
Source File: test_more.py    From Tautulli with GNU General Public License v3.0
def test_odd(self):
        """Test when ``n`` does not divide evenly into the length of the
        iterable.

        """
        self.assertEqual(
            list(mi.chunked('ABCDE', 3)), [['A', 'B', 'C'], ['D', 'E']]
        ) 
Example #21
Source File: analyzer.py    From pygreynoise with MIT License
def analyze(self, text):
        """Aggregate stats related to IP addresses from a given text.

        :param text: Text input
        :type text: file-like | str
        :return: Aggregated stats for all the IP addresses found.
        :rtype: dict

        """
        if isinstance(text, str):
            text = text.splitlines(True)
        chunks = more_itertools.chunked(text, self.ANALYZE_TEXT_CHUNK_SIZE)
        text_stats = {
            "query": [],
            "count": 0,
            "stats": {},
        }
        text_ip_addresses = set()
        chunks_stats = [
            self._analyze_chunk(chunk, text_ip_addresses) for chunk in chunks
        ]
        functools.reduce(self._aggregate_stats, chunks_stats, text_stats)

        # This maps section dictionaries to list of dictionaries
        # (undoing mapping done previously to keep track of count values)
        for section_key, section_value in text_stats["stats"].items():
            section_element_key = self.SECTION_KEY_TO_ELEMENT_KEY[section_key]
            text_stats["stats"][section_key] = sorted(
                [
                    {section_element_key: element_key, "count": element_count}
                    for element_key, element_count in section_value.items()
                ],
                key=lambda element: (-element["count"], element[section_element_key]),
            )

        if text_ip_addresses:
            noise_ip_addresses = {
                result["ip"]
                for result in self.api.quick(text_ip_addresses)
                if result["noise"]
            }
        else:
            noise_ip_addresses = set()

        ip_count = len(text_ip_addresses)
        noise_ip_count = len(noise_ip_addresses)
        not_noise_ip_count = ip_count - noise_ip_count
        if ip_count > 0:
            noise_ip_ratio = float(noise_ip_count) / ip_count
        else:
            noise_ip_ratio = 0

        text_stats["summary"] = {
            "ip_count": ip_count,
            "noise_ip_count": noise_ip_count,
            "not_noise_ip_count": not_noise_ip_count,
            "noise_ip_ratio": noise_ip_ratio,
        }

        return text_stats 
Example #22
Source File: dump_timeseries.py    From langchangetrack with BSD 3-Clause "New" or "Revised" License
def main(args):
    # get the arguments
    method = args.method
    win_size = args.win_size
    step = args.step
    metric_name = args.metric_name
    n_jobs = args.workers

    # Load the data.
    L, H, olddf, newdf = pickle.load(open(args.filename, 'rb'))  # pickle files must be opened in binary mode
    words = pd.Series(olddf.word.values.ravel()).unique()
    oldrows = []
    newrows = []
    sourcexrange = np.arange(args.mint, args.maxt, step)
    destxrange = np.arange(args.mint, args.maxt, step)
    if method == 'win':
        sourcexrange = sourcexrange[win_size:]
        destxrange = destxrange[:-win_size]

    if args.interpolate:
        sourcexinter = np.arange(sourcexrange[0], sourcexrange[-1] + 1, 1)
        destxinter = np.arange(destxrange[0], destxrange[-1] + 1, 1)
    else:
        sourcexinter = sourcexrange
        destxinter = destxrange

    # Construct the series
    assert(len(sourcexinter) == len(destxinter))
    # chunked() expects an integer chunk size, so cast the ceiling to int.
    chunk_sz = int(np.ceil(len(words) / float(n_jobs)))
    words_chunks = more_itertools.chunked(words, chunk_sz)
    timeseries_chunks = Parallel(n_jobs=n_jobs, verbose=20)(delayed(process_chunk)(chunk, create_word_time_series, olddf, newdf,
                                                                               sourcexinter, destxinter,
                                                                               metric_name=metric_name,
                                                                               interpolate=args.interpolate) for chunk in words_chunks)

    timeseries = list(more_itertools.flatten(timeseries_chunks))

    # Dump the data frame
    for orow, newrow in timeseries:
        if orow and newrow:
            oldrows.append(orow)
            newrows.append(newrow)

    oldtimeseries = pd.DataFrame()
    newtimeseries = pd.DataFrame()
    header = ['word']
    header.extend(sourcexinter)
    newheader = ['word']
    newheader.extend(destxinter)
    oldtimeseries = oldtimeseries.from_records(oldrows, columns=header)
    oldtimeseries = oldtimeseries.fillna(method='backfill', axis=1)
    newtimeseries = newtimeseries.from_records(newrows, columns=newheader)
    newtimeseries = newtimeseries.fillna(method='backfill', axis=1)
    oldtimeseries.to_csv(args.sourcetimef, encoding='utf-8')
    newtimeseries.to_csv(args.endtimef, encoding='utf-8') 
Example #23
Source File: image_viewer_widget.py    From CvStudio with MIT License
def load_image(self):
        @work_exception
        def do_work():
            return dask.compute(*[
                self.load_image_label(),
                self.load_image_annotations()
            ]), None

        @gui_exception
        def done_work(args):
            result, error = args
            if result:
                label, annotations = result
                if label:
                    self._class_label.setVisible(True)
                    self._class_label.setText(label)
                else:
                    self._class_label.setVisible(False)
                    self._class_label.setText("")

                if annotations:
                    img_bbox: QRectF = self.image_viewer.pixmap.sceneBoundingRect()
                    offset = QPointF(img_bbox.width() / 2, img_bbox.height() / 2)
                    for entry in annotations:
                        try:
                            vo: AnnotaVO = entry
                            points = map(float, vo.points.split(","))
                            points = list(more_itertools.chunked(points, 2))
                            if vo.kind == "box" or vo.kind == "ellipse":
                                x = points[0][0] - offset.x()
                                y = points[0][1] - offset.y()
                                w = math.fabs(points[0][0] - points[1][0])
                                h = math.fabs(points[0][1] - points[1][1])
                                roi: QRectF = QRectF(x, y, w, h)
                                if vo.kind == "box":
                                    item = EditableBox(roi)
                                else:
                                    item = EditableEllipse()
                                item.tag = self.tag.dataset
                                item.setRect(roi)
                                item.label = vo.label
                                self.image_viewer.scene().addItem(item)
                            elif vo.kind == "polygon":
                                item = EditablePolygon()
                                item.label = vo.label
                                item.tag = self.tag.dataset
                                self.image_viewer.scene().addItem(item)
                                for p in points:
                                    item.addPoint(QPoint(p[0] - offset.x(), p[1] - offset.y()))
                        except Exception as ex:
                            GUIUtilities.show_error_message("Error loading the annotations: {}".format(ex), "Error")

        self.image_viewer.remove_annotations()
        worker = Worker(do_work)
        worker.signals.result.connect(done_work)
        self._thread_pool.start(worker)
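The chunked(points, 2) call above is a compact way to turn a flat, comma-separated coordinate string into (x, y) pairs; the core of it in isolation:

import more_itertools

raw = "10.0,20.0,30.0,40.0,50.0,60.0"
points = list(more_itertools.chunked(map(float, raw.split(",")), 2))
# [[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]]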