"""This module holds classes that can be used as data soures. Note that it is easy to create other data sources: A data source must be iterable and provide dicts that map from attribute names to attribute values. """ # Copyright (c) 2009-2020, Aalborg University (pygrametl@cs.aau.dk) # All rights reserved. # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # - Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from csv import DictReader import sys if sys.platform.startswith('java'): # Jython specific code from pygrametl.jythonmultiprocessing import Queue, Process else: from multiprocessing import Queue, Process try: from Queue import Empty # Python 2 except ImportError: from queue import Empty # Python 3 __author__ = "Christian Thomsen" __maintainer__ = "Christian Thomsen" __version__ = '2.6' __all__ = ['CSVSource', 'TypedCSVSource', 'SQLSource', 'PandasSource', 'JoiningSource', 'HashJoiningSource', 'MergeJoiningSource', 'BackgroundSource', 'ProcessSource', 'MappingSource', 'TransformingSource', 'UnionSource', 'CrossTabbingSource', 'FilteringSource', 'DynamicForEachSource', 'RoundRobinSource'] CSVSource = DictReader class TypedCSVSource(DictReader): """A class for iterating a CSV file and type cast the values.""" def __init__(self, f, casts, fieldnames=None, restkey=None, restval=None, dialect='excel', *args, **kwds): """Arguments: - f: An iterable object such as as file. Passed on to csv.DictReader - casts: A dict mapping from attribute names to functions to apply to these names, e.g., {'id':int, 'salary':float} - fieldnames: Passed on to csv.DictReader - restkey: Passed on to csv.DictReader - restval: Passed on to csv.DictReader - dialect: Passed on to csv.DictReader - *args: Passed on to csv.DictReader - **kwds: Passed on to csv.DictReader """ DictReader.__init__(self, f, fieldnames=fieldnames, restkey=restkey, restval=restval, dialect=dialect, *args, **kwds) if not type(casts) == dict: raise TypeError("The casts argument must be a dict") for v in casts.values(): if not callable(v): raise TypeError("The values in casts must be callable") self._casts = casts def __next__(self): # For Python 3 row = DictReader.__next__(self) for (att, func) in self._casts.items(): row[att] = func(row[att]) return row def next(self): # For Python 2 row = DictReader.next(self) for (att, func) in self._casts.items(): row[att] = func(row[att]) return row class SQLSource(object): """A class for iterating the result set of a single SQL query.""" def __init__(self, connection, query, names=(), initsql=None, cursorarg=None, parameters=None): """Arguments: - connection: the PEP 249 connection to use. NOT a ConnectionWrapper! - query: the query that generates the result - names: names of attributes in the result. If not set, the names from the database are used. Default: () - initsql: SQL that is executed before the query. The result of this initsql is not returned. Default: None. - cursorarg: if not None, this argument is used as an argument when the connection's cursor method is called. Default: None. - parameters: if not None, this sequence or mapping of parameters will be sent when the query is executed. """ self.connection = connection if cursorarg is not None: self.cursor = connection.cursor(cursorarg) else: self.cursor = connection.cursor() if initsql: self.cursor.execute(initsql) self.query = query self.names = names self.executed = False self.parameters = parameters def __iter__(self): try: if not self.executed: if self.parameters: self.cursor.execute(self.query, self.parameters) else: self.cursor.execute(self.query) names = None if self.names or self.cursor.description: names = self.names or \ [t[0] for t in self.cursor.description] while True: data = self.cursor.fetchmany(500) if not data: break if not names: # We do this to support cursor objects that only have # a meaningful .description after data has been fetched. # This is, for example, the case when using a named # psycopg2 cursor. names = [t[0] for t in self.cursor.description] if len(names) != len(data[0]): raise ValueError( "Incorrect number of names provided. " + "%d given, %d needed." % (len(names), len(data[0]))) for row in data: yield dict(zip(names, row)) finally: try: self.cursor.close() except Exception: pass class PandasSource(object): """A source for iterating a Pandas DataFrame and cast each row to a dict.""" def __init__(self, dataFrame): """Arguments: - dataFrame: A Pandas DataFrame """ self._dataFrame = dataFrame def __iter__(self): for (_, series) in self._dataFrame.iterrows(): row = series.to_dict() yield row class ProcessSource(object): """A class for iterating another source in a separate process""" def __init__(self, source, batchsize=500, queuesize=20): """Arguments: - source: the source to iterate - batchsize: the number of rows passed from the worker process each time it passes on a batch of rows. Must be positive. Default: 500 - queuesize: the maximum number of batches that can wait in a queue between the processes. 0 means unlimited. Default: 20 """ if not isinstance(batchsize, int) or batchsize < 1: raise ValueError('batchsize must be a positive integer') self.__source = source self.__batchsize = batchsize self.__queue = Queue(queuesize) p = Process(target=self.__worker) p.name = "Process for ProcessSource" p.start() def __worker(self): batch = [] try: for row in self.__source: batch.append(row) if len(batch) == self.__batchsize: self.__queue.put(batch) batch = [] # We're done. Send the batch if it has any data and a signal if batch: self.__queue.put(batch) self.__queue.put('STOP') except Exception: # Jython 2.5.X does not support the as syntax required by Python 3 e = sys.exc_info()[1] if batch: self.__queue.put(batch) self.__queue.put('EXCEPTION') self.__queue.put(e) def __iter__(self): while True: data = self.__queue.get() if data == 'STOP': break elif data == 'EXCEPTION': exc = self.__queue.get() raise exc # else we got a list of rows from the other process for row in data: yield row BackgroundSource = ProcessSource # for compatability # The old thread-based BackgroundSource has been removed and # replaced by ProcessSource class HashJoiningSource(object): """A class for equi-joining two data sources.""" def __init__(self, src1, key1, src2, key2): """Arguments: - src1: the first source. This source is iterated row by row. - key1: the attribute of the first source to use in the join - src2: the second source. The rows of this source are all loaded into memory. - key2: the attriubte of the second source to use in the join. """ self.__hash = {} self.__src1 = src1 self.__key1 = key1 self.__src2 = src2 self.__key2 = key2 def __buildhash(self): for row in self.__src2: keyval = row[self.__key2] l = self.__hash.get(keyval, []) l.append(row) self.__hash[keyval] = l self.__ready = True def __iter__(self): self.__buildhash() for row in self.__src1: matches = self.__hash.get(row[self.__key1], []) for match in matches: newrow = row.copy() newrow.update(match) yield newrow JoiningSource = HashJoiningSource # for compatability class MergeJoiningSource(object): """A class for merge-joining two sorted data sources""" def __init__(self, src1, key1, src2, key2): """Arguments: - src1: a data source - key1: the attribute to use from src1 - src2: a data source - key2: the attribute to use from src2 """ self.__src1 = src1 self.__key1 = key1 self.__src2 = src2 self.__key2 = key2 self.__next = None def __iter__(self): iter1 = self.__src1.__iter__() iter2 = self.__src2.__iter__() row1 = next(iter1) keyval1 = row1[self.__key1] rows2 = self.__getnextrows(iter2) keyval2 = rows2[0][self.__key2] try: while True: # At one point there will be a StopIteration if keyval1 == keyval2: # Output rows for part in rows2: resrow = row1.copy() resrow.update(part) yield resrow row1 = next(iter1) keyval1 = row1[self.__key1] elif keyval1 < keyval2: row1 = next(iter1) keyval1 = row1[self.__key1] else: # k1 > k2 rows2 = self.__getnextrows(iter2) keyval2 = rows2[0][self.__key2] except StopIteration: return # Needed in Python 3.7+ due to PEP 479 def __getnextrows(self, iterval): res = [] keyval = None if self.__next is not None: res.append(self.__next) keyval = self.__next[self.__key2] self.__next = None while True: try: row = next(iterval) except StopIteration: if res: return res else: raise if keyval is None: keyval = row[self.__key2] # for the first row in this round if row[self.__key2] == keyval: res.append(row) else: self.__next = row return res class MappingSource(object): """A class for iterating a source and applying a function to each column.""" def __init__(self, source, callables): """Arguments: - source: A data source - callables: A dict mapping from attribute names to functions to apply to these names, e.g. type casting {'id':int, 'salary':float} """ if not type(callables) == dict: raise TypeError("The callables argument must be a dict") for v in callables.values(): if not callable(v): raise TypeError("The values in callables must be callable") self._source = source self._callables = callables def __iter__(self): for row in self._source: for (att, func) in self._callables.items(): row[att] = func(row[att]) yield row class TransformingSource(object): """A source that applies functions to the rows from another source""" def __init__(self, source, *transformations): """Arguments: - source: a data source - *transformations: the transformations to apply. Must be callables of the form func(row) where row is a dict. Will be applied in the given order. """ self.__source = source self.__transformations = transformations def __iter__(self): for row in self.__source: for func in self.__transformations: func(row) yield row class CrossTabbingSource(object): """A source that produces a crosstab from another source""" def __init__(self, source, rowvaluesatt, colvaluesatt, values, aggregator=None, nonevalue=0, sortrows=False): """Arguments: - source: the data source to pull data from - rowvaluesatt: the name of the attribute that holds the values that appear as rows in the result - colvaluesatt: the name of the attribute that holds the values that appear as columns in the result - values: the name of the attribute that holds the values to aggregate - aggregator: the aggregator to use (see pygrametl.aggregators). If not given, pygrametl.aggregators.Sum is used to sum the values - nonevalue: the value to return when there is no data to aggregate. Default: 0 - sortrows: A boolean deciding if the rows should be sorted. Default: False """ self.__source = source self.__rowvaluesatt = rowvaluesatt self.__colvaluesatt = colvaluesatt self.__values = values if aggregator is None: from pygrametl.aggregators import Sum self.__aggregator = Sum() else: self.__aggregator = aggregator self.__nonevalue = nonevalue self.__sortrows = sortrows self.__allcolumns = set() self.__allrows = set() def __iter__(self): for data in self.__source: # first we iterate over all source data ... row = data[self.__rowvaluesatt] col = data[self.__colvaluesatt] self.__allrows.add(row) self.__allcolumns.add(col) self.__aggregator.process((row, col), data[self.__values]) # ... and then we build result rows for row in (self.__sortrows and sorted(self.__allrows) or self.__allrows): res = {self.__rowvaluesatt: row} for col in self.__allcolumns: res[col] = \ self.__aggregator.finish((row, col), self.__nonevalue) yield res class FilteringSource(object): """A source that applies a filter to another source""" def __init__(self, source, filter=bool): """Arguments: - source: the source to filter - filter: a callable f(row). If the result is a True value, the row is passed on. If not, the row is discarded. Default: bool, i.e., Python's standard boolean conversion which removes empty rows. """ self.__source = source self.__filter = filter def __iter__(self): for row in self.__source: if self.__filter(row): yield row class UnionSource(object): """A source to union other sources (possibly with different types of rows). All rows are read from the 1st source before rows are read from the 2nd source and so on (to interleave the rows, use a RoundRobinSource) """ def __init__(self, *sources): """Arguments: - *sources: The sources to union in the order they should be used. """ self.__sources = sources def __iter__(self): for src in self.__sources: for row in src: yield row class RoundRobinSource(object): """A source that reads sets of rows from sources in round robin-fashion""" def __init__(self, sources, batchsize=500): """Arguments: - sources: a sequence of data sources - batchsize: the amount of rows to read from a data source before going to the next data source. Must be positive (to empty a source before going to the next, use UnionSource) """ self.__sources = [iter(src) for src in sources] self.__sources.reverse() # we iterate it from the back in __iter__ if not batchsize > 0: raise ValueError("batchsize must be positive") self.__batchsize = batchsize def __iter__(self): while self.__sources: # iterate from back for i in range(len(self.__sources) - 1, -1, -1): cursrc = self.__sources[i] # now return up to __batchsize from cursrc try: for _ in range(self.__batchsize): yield next(cursrc) except StopIteration: # we're done with this source and can delete it since # we iterate the list as we do del self.__sources[i] return class DynamicForEachSource(object): """A source that for each given argument creates a new source that will be iterated by this source. For example, useful for directories where a CSVSource should be created for each file. The user must provide a function that when called with a single argument, returns a new source to iterate. A DynamicForEachSource instance can be given to several ProcessSource instances. """ def __init__(self, seq, callee): """Arguments: - seq: a sequence with the elements for each of which a unique source must be created. the elements are given (one by one) to callee. - callee: a function f(e) that must accept elements as those in the seq argument. the function should return a source which then will be iterated by this source. the function is called once for every element in seq. """ self.__queue = Queue() # a multiprocessing.Queue if not callable(callee): raise TypeError('callee must be callable') self.__callee = callee for e in seq: # put them in a safe queue such that this object can be used from # different fork'ed processes self.__queue.put(e) def __iter__(self): while True: try: arg = self.__queue.get(False) src = self.__callee(arg) for row in src: yield row except Empty: return