"""Main data-model classes for the Humanitarian Exchange Language (HXL). This module defines the basic classes for working with HXL data. Other modules have classes derived from these (e.g. in [hxl.filters](filters.html) or [hxl.io](io.html)). The core class is [Dataset](#hxl.model.Dataset), which defines the operations available on a HXL dataset, including convenience methods for chaining filters. Typical usage: source = hxl.data("https://example.org/data.csv") # returns a hxl.model.Dataset object result = source.with_lines("#country+name=Kenya").sort() # a filtered/sorted view of the data This code is released into the Public Domain and comes with NO WARRANTY. """ import abc, copy, csv, dateutil, hashlib, json, logging, operator, re, six import hxl logger = logging.getLogger(__name__) class TagPattern(object): """Pattern for matching a HXL hashtag and attributes - the pattern "#*" matches any hashtag/attribute combination - the pattern "#*+foo" matches any hashtag with the foo attribute - the pattern "#tag" matches #tag with any attributes - the pattern "#tag+foo" matches #tag with foo among its attributes - the pattern "#tag-foo" matches #tag with foo *not* among its attributes - the pattern "#tag+foo-bar" matches #tag with foo but not bar - the pattern "#tag+foo+bar!" matches #tag with exactly the attributes foo and bar, but *no others* The normal way to create a tag pattern is using the [parse()](#hxl.model.TagPattern.parse) method rather than the constructor: pattern = hxl.model.TagPattern.parse("#affected+f-children") Args: tag: the basic hashtag (without attributes) include_attributes: a list of attributes that must be present exclude_attributes: a list of attributes that must not be present is_absolute: if True, no attributes are allowed except those in _include_attributes_ """ PATTERN = r'^\s*#?({token}|\*)((?:\s*[+-]{token})*)\s*(!)?\s*$'.format(token=hxl.datatypes.TOKEN_PATTERN) """Constant: regular expression to match a HXL tag pattern. """ def __init__(self, tag, include_attributes=[], exclude_attributes=[], is_absolute=False): self.tag = tag self.include_attributes = set(include_attributes) """Set of all attributes that must be present""" self.exclude_attributes = set(exclude_attributes) """Set of all attributes that must not be present""" self.is_absolute = is_absolute """True if this pattern is absolute (no extra attributes allowed)""" def is_wildcard(self): return self.tag == '#*' def match(self, column): """Check whether a Column matches this pattern. @param column: the column to check @returns: True if the column is a match """ if column.tag and (self.is_wildcard() or self.tag == column.tag): # all include_attributes must be present if self.include_attributes: for attribute in self.include_attributes: if attribute not in column.attributes: return False # all exclude_attributes must be absent if self.exclude_attributes: for attribute in self.exclude_attributes: if attribute in column.attributes: return False # if absolute, then only specified attributes may be present if self.is_absolute: for attribute in column.attributes: if attribute not in self.include_attributes: return False return True else: return False def get_matching_columns(self, columns): """Return a list of columns that match the pattern. @param columns: a list of L{hxl.model.Column} objects @returns: a list (possibly empty) """ result = [] for column in columns: if self.match(column): result.append(column) return result def find_column_index(self, columns): """Get the index of the first matching column. @param columns: a list of columns to check @returns: the 0-based index of the first matching column, or None for no match """ for i in range(len(columns)): if self.match(columns[i]): return i return None def find_column(self, columns): """Check whether there is a match in a list of columns.""" for column in columns: if self.match(column): return column return None def __repr__(self): s = self.tag if self.include_attributes: for attribute in self.include_attributes: s += '+' + attribute if self.exclude_attributes: for attribute in self.exclude_attributes: s += '-' + attribute return s __str__ = __repr__ @staticmethod def parse(s): """Parse a single tag-pattern string. pattern = TagPattern.parse("#affected+f-children") The [parse_list()](#hxl.model.TagPattern.parse_list) method will call this method to parse multiple patterns at once. Args: s: the tag-pattern string to parse Returns: A TagPattern object """ if not s: # edge case: null value raise hxl.HXLException('Attempt to parse empty tag pattern') elif isinstance(s, TagPattern): # edge case: already parsed return s result = re.match(TagPattern.PATTERN, s) if result: tag = '#' + result.group(1).lower() include_attributes = set() exclude_attributes = set() attribute_specs = re.split(r'\s*([+-])', result.group(2)) for i in range(1, len(attribute_specs), 2): if attribute_specs[i] == '+': include_attributes.add(attribute_specs[i + 1].lower()) else: exclude_attributes.add(attribute_specs[i + 1].lower()) if result.group(3) == '!': is_absolute = True if exclude_attributes: raise ValueError('Exclusions not allowed in absolute patterns') else: is_absolute = False return TagPattern( tag, include_attributes=include_attributes, exclude_attributes=exclude_attributes, is_absolute=is_absolute ) else: raise hxl.HXLException('Malformed tag: ' + s) @staticmethod def parse_list(specs): """Parse a list of tag-pattern strings. If _specs_ is a list of already-parsed TagPattern objects, do nothing. If it's a list of strings, apply [parse()](#hxl.model.TagPattern.parse) to each one. If it's a single string with multiple patterns separated by commas, split the string, then parse the patterns. patterns = TagPattern.parse_list("#affected+f,#inneed+f") # or patterns = TagPattern.parse_list("#affected+f", "#inneed+f") Args: specs: the raw input (a list of strings, or a single string with commas separating the patterns) Returns: A list of TagPattern objects. """ if not specs: return [] if isinstance(specs, six.string_types): specs = specs.split(',') return [TagPattern.parse(spec) for spec in specs] @staticmethod def match_list(column, patterns): """Test if a column matches any of the patterns in a list. This is convenient to use together with [parse_list()](hxl.model.TagPattern.parse_list): patterns = TagPattern.parse_list(["#affected+f", "#inneed+f"]) if TagPattern.match_list(column, patterns): print("The column matched one of the patterns") Args: column: the column to test patterns: a list of zero or more patterns. Returns: True if there is a match """ for pattern in patterns: if pattern.match(column): return True return False class Dataset(object): """Abstract base class for a HXL data source. Any source of parsed HXL data inherits from this class: that includes Dataset, HXLReader, and the various filters in the hxl.old_filters package. The contract of a Dataset is that it will provide a columns property and a next() method to read through the rows. The child class must implement the columns() method as a property and the __iter__() method to make itself iterable. """ __metaclass__ = abc.ABCMeta def __init__(self): """Constructor.""" super().__init__() @abc.abstractmethod def __iter__(self): """Get the iterator over the rows. @returns: an iterator that returns L{hxl.model.Row} objects """ raise RuntimeException("child class must implement __iter__() method") @property def is_cached(self): """Test whether the source data is cached (replayable). By default, this is False, but some subclasses may override. @returns: C{True} if the input is cached (replayable); C{False} otherwise. """ return False @property @abc.abstractmethod def columns(self): """Get the column definitions for the dataset. @returns: a list of Column objects. """ raise RuntimeException("child class must implement columns property method") @property def columns_hash(self): """Generate a hash across all of the columns in the dataset. This function helps detect whether two HXL documents are of the same type, even if they contain different data (e.g. the HXL API output for the same humanitarian dataset in two different months or two different countries). It takes into account text headers, hashtags, the order of attributes, and the order of columns. Whitespace is normalised, and null values are treated as empty strings. The MD5 hash digest is generated from a UTF-8 encoded version of each header. @returns: a 32-character hex-formatted MD5 hash string """ md5 = hashlib.md5() for column in self.columns: md5.update(hxl.datatypes.normalise_space(column.header).encode('utf-8')) for column in self.columns: md5.update(hxl.datatypes.normalise_space(column.display_tag).encode('utf-8')) return md5.hexdigest() @property def data_hash(self): """Generate a hash for the entire dataset. This function allows checking if two HXL datasets are functionally identical. It takes into account text headers, hashtags, the order of attributes, and the order of columns. Whitespace is normalised, and null values are treated as empty strings. The MD5 hash digest is generated from a UTF-8 encoded version of each header and data cell. @returns: a 32-character hex-formatted MD5 hash string """ md5 = hashlib.md5() # text header row for column in self.columns: md5.update(hxl.datatypes.normalise_space(column.header).encode('utf-8')) # hashtag row for column in self.columns: md5.update(hxl.datatypes.normalise_space(column.display_tag).encode('utf-8')) # data rows for row in self: for value in row: md5.update(hxl.datatypes.normalise_space(value).encode('utf-8')) return md5.hexdigest() @property def headers(self): """Return a list of header strings (for a spreadsheet row). """ return [column.header if column else '' for column in self.columns] @property def tags(self): """Get all hashtags (without attributes) as a list @returns: a list of base hashtags for the dataset columns """ return [column.tag if column else '' for column in self.columns] @property def display_tags(self): """Return a list of display tags. @returns: a list of strings containing the hashtag and attributes for each column """ return [column.display_tag if column else '' for column in self.columns] @property def has_headers(self): """Report whether any non-empty header strings exist. @returns: C{True} if there is at least one column with a non-empty header string """ for column in self.columns: if column.header: return True return False @property def values(self): """Get all values for the dataset at once, in an array of arrays. This method can be highly inefficient for large datasets. @returns: an array of arrays of scalar values """ return [row.values for row in self] def get_value_set(self, tag_pattern=None, normalise=False): """Return the set of all values in a dataset (optionally matching a tag pattern for a single column) Warning: this method can be highly inefficient for large datasets. @param tag_pattern: (optional) return values only for columns matching this tag pattern. @param normalise: (optional) normalise the strings with hxl.datatypes.normalise (default: False) @returns: a Python set of values """ value_set = set([]) if tag_pattern: tag_pattern = TagPattern.parse(tag_pattern) for row in self: if tag_pattern: new_values = row.get_all(tag_pattern) else: new_values = row.values if normalise: new_values = [hxl.datatypes.normalise(s) for s in new_values] else: new_values = [hxl.datatypes.normalise_space(s) for s in new_values] value_set.update(new_values) return value_set def get_column_indices(self, tag_patterns, columns): """Get a list of indices that match the tag patterns provided @param tag_patterns: a list of tag patterns or a string version of the list @param columns: a list of columns @returns: a (possibly-empty) list of 0-based indices """ patterns = TagPattern.parse_list(tag_patterns) indices = [] for i, column in enumerate(columns): for pattern in patterns: if pattern.match(column): indices.push(i) return indices # # Aggregates # def _get_minmax(self, pattern, op): """Calculate the extreme min/max value for a tag pattern Will iterate through the dataset, and use values from multiple matching columns. Uses numbers, dates, or strings for comparison, based on the first non-empty value found. @param pattern: the L{hxl.model.TagPattern} to match @param op: operator_lt or operator_gt @returns: the extreme value according to operator supplied, or None if no values found """ pattern = TagPattern.parse(pattern) result_raw = None # what's actually in the dataset result_normalised = None # normalised version for comparison # Look at every row for row in self: # Look at every matching value in every row for i, value in enumerate(row.get_all(pattern)): # ignore empty values if hxl.datatypes.is_empty(value): continue # make a normalised value for comparison normalised = hxl.datatypes.normalise(value, row.columns[i]) # first non-empty value is always a match if result_normalised is None: result_raw = value result_normalised = normalised else: # try comparing the normalised types first, then strings on failure try: if op(normalised, result_normalised): result_raw = value result_normalised = normalised except TypeError: if op(str(normalised), str(result_normalised)): result_raw = value result_normalised = normalised return result_raw def min(self, pattern): """Calculate the minimum value for a tag pattern Will iterate through the dataset, and use values from multiple matching columns. Uses numbers, dates, or strings for comparison, based on the first non-empty value found. @param pattern: the L{hxl.model.TagPattern} to match @returns: the minimum value according to the '<' operator, or None if no values found """ return self._get_minmax(pattern, operator.lt) def max(self, pattern): """Calculate the maximum value for a tag pattern Will iterate through the dataset, and use values from multiple matching columns. @param pattern: the L{hxl.model.TagPattern} to match @returns: the minimum value according to the '<' operator, or None if no values found """ return self._get_minmax(pattern, operator.gt) # # Utility # def validate(self, schema=None, callback=None): """ Validate the current dataset. @param schema (optional) the pre-compiled schema, schema filename, URL, file object, etc. Defaults to a built-in schema. @param callback (optional) a function to call with each error or warning. Defaults to collecting errors in an array and returning them. """ return hxl.schema(schema, callback).validate(self) def recipe(self, recipe): """Parse a recipe (JSON or a list of dicts) and create the appropriate filters. @param recipe: a list of dicts, a single dict, or a JSON literal string. @return: the new end filter. """ import hxl.filters return hxl.filters.from_recipe(self, recipe) # # Filters # def append(self, append_sources, add_columns=True, queries=[]): """Append additional datasets. @param append_sources: a list of sources to append @param add_columns: if True (default), include any extra columns in the append sources @param queries: a list of row queries to select rows for inclusion from the append sources. @returns: a new HXL source for chaining """ import hxl.filters return hxl.filters.AppendFilter(self, append_sources, add_columns=add_columns, queries=queries) def append_external_list(self, source_list_url, add_columns=True, queries=[]): """Append additional datasets from an external list @param source_list_url: URL of a HXL dataset containing a list of sources to append. @param add_columns: if True (default), include any extra columns in the append sources. @param queries: a list of row queries to select rows for inclusion from the append sources. @returns: a new HXL source for chaining """ import hxl.filters logger.debug("Loading append list from %s...", source_list_url) append_sources = hxl.filters.AppendFilter.parse_external_source_list(source_list_url) logger.debug("Done loading") return hxl.filters.AppendFilter(self, append_sources, add_columns=add_columns, queries=queries) def cache(self): """Add a caching filter to the dataset.""" import hxl.filters return hxl.filters.CacheFilter(self) def dedup(self, patterns=[], queries=[]): """Deduplicate a dataset.""" import hxl.filters return hxl.filters.DeduplicationFilter(self, patterns=patterns, queries=queries) def with_columns(self, whitelist): """Select matching columns.""" import hxl.filters return hxl.filters.ColumnFilter(self, include_tags=whitelist) def without_columns(self, blacklist=None, skip_untagged=False): """Select non-matching columns.""" import hxl.filters return hxl.filters.ColumnFilter(self, exclude_tags=blacklist, skip_untagged=skip_untagged) def with_rows(self, queries, mask=[]): """Select matching rows. @param queries: a predicate or list of predicates for rows to include @param mask: a predicate or list of predicates for rows to test (default: [] to test all) @return: a filtered version of the source """ import hxl.filters return hxl.filters.RowFilter(self, queries=queries, reverse=False, mask=mask) def without_rows(self, queries, mask=[]): """Select non-matching rows. @param queries: a predicate or list of predicates for rows to ignore @param mask: a predicate or list of predicates for rows to test (default: [] to test all) @return: a filtered version of the source """ import hxl.filters return hxl.filters.RowFilter(self, queries=queries, reverse=True, mask=mask) def sort(self, keys=None, reverse=False): """Sort the dataset (caching).""" import hxl.filters return hxl.filters.SortFilter(self, tags=keys, reverse=reverse) def count(self, patterns=[], aggregators=None, queries=[]): """Count values in the dataset (caching).""" import hxl.filters return hxl.filters.CountFilter( self, patterns=patterns, aggregators=aggregators, queries=queries ) def row_counter(self, queries=[]): """Count the number of rows while streaming.""" import hxl.filters return hxl.filters.RowCountFilter(self, queries=queries) def replace_data(self, original, replacement, pattern=None, use_regex=False, queries=[]): """Replace values in a HXL dataset.""" import hxl.filters replacement = hxl.filters.ReplaceDataFilter.Replacement(original, replacement, pattern, use_regex) return hxl.filters.ReplaceDataFilter(self, [replacement], queries=queries) def replace_data_map(self, map_source, queries=[]): """Replace values in a HXL dataset.""" import hxl.filters replacements = hxl.filters.ReplaceDataFilter.Replacement.parse_map(hxl.data(map_source)) return hxl.filters.ReplaceDataFilter(self, replacements, queries=queries) def add_columns(self, specs, before=False): """Add fixed-value columns to a HXL dataset.""" import hxl.filters return hxl.filters.AddColumnsFilter(self, specs=specs, before=before) def rename_columns(self, specs): """Changes headers and tags on a column.""" import hxl.filters return hxl.filters.RenameFilter(self, specs) def clean_data( self, whitespace=[], upper=[], lower=[], date=[], date_format=None, number=[], number_format=None, latlon=[], purge=False, queries=[] ): """Clean data fields.""" import hxl.filters return hxl.filters.CleanDataFilter( self, whitespace=whitespace, upper=upper, lower=lower, date=date, date_format=date_format, number=number, number_format=number_format, latlon=latlon, purge=purge, queries=queries ) def merge_data(self, merge_source, keys, tags, replace=False, overwrite=False, queries=[]): """Merges values from a second dataset.""" import hxl.filters return hxl.filters.MergeDataFilter(self, merge_source, keys, tags, replace, overwrite, queries=queries) def explode(self, header_attribute='header', value_attribute='value'): """Explodes a wide dataset into a long datasets. @param header_attribute: the attribute to add to the hashtag of the column with the former header (default 'header') @param value_attribute: the attribute to add to the hashtag of the column with the former value (default 'value') @return: filtered dataset. @see hxl.filters.ExplodeFilter """ import hxl.filters return hxl.filters.ExplodeFilter(self, header_attribute, value_attribute) def implode(self, label_pattern, value_pattern): """Implodes a long dataset into a wide dataset @param label_pattern: the tag pattern to match the label column @param value_pattern: the tag pattern to match the @return: filtered dataset. @see hxl.filters.ImplodeFilter """ import hxl.filters return hxl.filters.ImplodeFilter(self, label_pattern=label_pattern, value_pattern=value_pattern) def jsonpath(self, path, patterns=[], queries=[], use_json=True): """Parse the value as a JSON expression and extract data from it. See http://goessner.net/articles/JsonPath/ @param path: a JSONPath expression for extracting data @param patterns: a tag pattern or list of patterns for the columns to use (default to all) @param queries: a predicate or list of predicates for the rows to consider. @param use_json: if True, serialise multiple results as JSON lists. @returns: filtered dataset @see: hxl.filters.JSONPathFilter """ import hxl.filters return hxl.filters.JSONPathFilter(self, path, patterns=patterns, queries=queries, use_json=use_json) def fill_data(self, patterns=[], queries=[]): """Fills empty cells in a column using the last non-empty value. @param patterns: a tag pattern or list of patterns for the columns to fill (default to all) @param queries: a predicate or list of predicates for rows to fill (leave any blank that don't match). @return filtered dataset @see hxl.filters.FillFilter """ import hxl.filters return hxl.filters.FillDataFilter(self, patterns=patterns, queries=queries) # # Generators # def gen_raw(self, show_headers=True, show_tags=True): """Generate an array representation of a HXL dataset, one at a time.""" if show_headers: yield self.headers if show_tags: yield self.display_tags for row in self: yield row.values def gen_csv(self, show_headers=True, show_tags=True): """Generate a CSV representation of a HXL dataset, one row at a time.""" class TextOut: """Simple string output source to capture CSV""" def __init__(self): self.data = '' def write(self, s): self.data += s def get(self): data = self.data self.data = '' return data output = TextOut() writer = csv.writer(output) for raw in self.gen_raw(show_headers, show_tags): writer.writerow(raw) yield output.get() def gen_json(self, show_headers=True, show_tags=True, use_objects=False): """Generate a JSON representation of a HXL dataset, one row at a time.""" is_first = True yield "[\n" if use_objects: for row in self: if is_first: is_first = False yield json.dumps(row.dictionary, sort_keys=True, indent=2) else: yield ",\n" + json.dumps(row.dictionary, sort_keys=True, indent=2) else: for raw in self.gen_raw(show_headers, show_tags): if is_first: is_first = False yield json.dumps(raw) else: yield ",\n" + json.dumps(raw) yield "\n]\n" class Column(object): """ The definition of a logical column in the HXL data. """ # Regular expression to match a HXL tag PATTERN = r'^\s*(#{token})((?:\s*\+{token})*)\s*$'.format(token=hxl.datatypes.TOKEN_PATTERN) # To tighten debugging (may reconsider later -- not really a question of memory efficiency here) __slots__ = ['tag', 'attributes', 'attribute_list', 'header', 'column_number'] def __init__(self, tag=None, attributes=(), header=None, column_number=None): """ Initialise a column definition. @param tag: the HXL hashtag for the column (default: None) @param attributes: (optional) a sequence of attributes (default: ()) @param header: (optional) the original plaintext header for the column (default: None) @param column_number: (optional) the zero-based column number """ if tag: tag = tag.lower() self.tag = tag self.header = header self.column_number = column_number self.attributes = set([a.lower() for a in attributes]) self.attribute_list = [a.lower() for a in attributes] # to preserve order @property def display_tag(self): """Default display version of a HXL hashtag. Attributes are not sorted. """ return self.get_display_tag(sort_attributes=False) def get_display_tag(self, sort_attributes=False): """ Generate a display version of the column hashtag @param sort_attributes: if True, sort attributes; otherwise, preserve the original order @return the reassembled HXL hashtag string, including language code """ if self.tag: s = self.tag for attribute in sorted(self.attribute_list) if sort_attributes else self.attribute_list: s += '+' + attribute return s else: return '' def has_attribute(self, attribute): """Check if an attribute is present.""" return (attribute in self.attribute_list) def add_attribute(self, attribute): """Add an attribute to the column.""" if attribute not in self.attributes: self.attributes.add(attribute) self.attribute_list.append(attribute) return self def remove_attribute(self, attribute): """Remove an attribute from the column.""" if attribute in self.attributes: self.attributes.remove(attribute) self.attribute_list.remove(attribute) return self def __hash__(self): """Make columns usable in a dictionary. Only the hashtag and attributes are used. """ hash_value = hash(self.tag) for attribute in self.attributes: hash_value += hash(attribute) return hash_value def __eq__(self, other): """Test for comparison with another object. For equality, only the hashtag and attributes have to be the same.""" try: return (self.tag == other.tag and self.attributes == other.attributes) except: return False def __repr__(self): return self.display_tag __str__ = __repr__ @staticmethod def parse(raw_string, header=None, use_exception=False, column_number=None): """ Attempt to parse a full hashtag specification. @param raw_string: the string representation of the tagspec @param header: the text header to include @param use_exception: if True, throw an exception for a malformed tagspec @returns: None if the string is empty, False if it's malformed (and use_exception is False), or a Column object otherwise """ # Already parsed? if isinstance(raw_string, Column): return raw_string # Empty string? if hxl.datatypes.is_empty(raw_string): return None # Pattern for a single tag result = re.match(Column.PATTERN, raw_string) if result: tag = result.group(1) attribute_string = result.group(2) if attribute_string: attributes = re.split(r'\s*\+', attribute_string.strip().strip('+')) else: attributes = [] return Column(tag=tag, attributes=attributes, header=header, column_number=column_number) else: if use_exception: raise hxl.HXLException("Malformed tag expression: " + raw_string) else: logger.debug("Not a HXL hashtag spec: %s", raw_string) return False @staticmethod def parse_spec(raw_string, default_header=None, use_exception=False, column_number=None): """Attempt to parse a single-string header/hashtag spec""" # Already parsed? if isinstance(raw_string, Column): return raw_string matches = re.match(r'^(.*)(#.*)$', raw_string) if matches: header = matches.group(1) if matches.group(1) else default_header return Column.parse(matches.group(2), header=header, column_number=column_number) else: return Column.parse('#' + raw_string, header=default_header, column_number=column_number) class Row(object): """ An iterable row of values in a HXL dataset. """ # Predefine the slots for efficiency (may reconsider later) __slots__ = ['columns', 'values', 'row_number', 'source_row_number'] def __init__(self, columns, values=[], row_number=None, source_row_number=None): """ Set up a new row. @param columns: The column definitions (array of Column objects). @param values: (optional) The string values for the row (default: []) @param row_number: (optional) The zero-based logical row number in the input dataset, if available (default: None) @param source_row_number: (optional) The zero-based source row number in the input dataset, if available (default: None) """ self.columns = columns self.values = copy.copy(values) self.row_number = row_number self.source_row_number = source_row_number def append(self, value): """ Append a value to the row. @param value The new value to append. @return The new value """ self.values.append(value) return value def get(self, tag, index=None, default=None, parsed=False): """ Get a single value for a tag in a row. If no index is provided ("None"), return the first non-empty value. @param tag: A TagPattern or a string value for a tag. @param index: The zero-based index if there are multiple values for the tag (default: None) @param default: The default value if not found (default: None). Never parsed, even if parsed=True @param parsed: If true, use attributes as hints to try to parse the value (e.g. number, list, date) @return The value found, or the default value provided. If parsed=True, the return value will be a list (default: False) """ # FIXME - move externally, use for get_all as well, and support numbers and dates def parse(column, value): if parsed: if column.has_attribute('list'): return re.split("\s*,\s*", value) else: return [value] return value if type(tag) is TagPattern: pattern = tag else: pattern = TagPattern.parse(tag) for i, column in enumerate(self.columns): if i >= len(self.values): break if pattern.match(column): if index is None: # None (the default) is a special case: it means look # for the first truthy value if self.values[i]: return parse(column, self.values[i]) else: # Otherwise, look for a specific index if index == 0: return parse(column, self.values[i]) else: index = index - 1 return default def get_all(self, tag, default=None): """ Get all values for a specific tag in a row @param tag A TagPattern or a string value for a tag. @return An array of values for the HXL hashtag. """ if type(tag) is TagPattern: pattern = tag else: pattern = TagPattern.parse(tag) result = [] for i, column in enumerate(self.columns): if i >= len(self.values): break if pattern.match(column): value = self.values[i] if default is not None and not value: value = default result.append(value) return result def key(self, patterns=None, indices=None): """Generate a unique key tuple for the row, based on a list of tag patterns @param patterns: a list of L{TagPattern} objects, or a parseable string @returns: the key as a tuple (might be empty) """ key = [] # if the user doesn't provide indices, get indices from the pattern if not indices and patterns: indices = get_column_indices(patterns, self.columns) if indices: # if we have indices, use them to build the key for i in indices: if i < len(self.values): key.append(hxl.datatypes.normalise(self.values[i], self.columns[i])) else: # if there are still no indices, use the whole row for the key for i, value in enumerate(self.values): key.append(hxl.datatypes.normalise(value, self.columns[i])) return tuple(key) # make it into a tuple so that it's hashable @property def dictionary(self): """Return the row as a Python dict. The keys will be HXL hashtags and attributes, normalised per HXL 1.1. If two or more columns have the same hashtags and attributes, only the first will be included. @return: The row as a Python dictionary. """ data = {} for i, col in enumerate(self.columns): key = col.get_display_tag(sort_attributes=True) if key and (not key in data) and (i < len(self.values)): data[key] = self.values[i] return data def __getitem__(self, index): """ Array-access method to make this class iterable. @param index The zero-based index of a value to look up. @return The value if it exists. @exception IndexError if the index is out of range. """ return self.values[index] def __str__(self): """ Create a string representation of a row for debugging. """ s = '<Row'; for column_number, value in enumerate(self.values): s += "\n " + str(self.columns[column_number]) + "=" + str(value) s += "\n>" return s class RowQuery(object): """Query to execute against a row of HXL data.""" def __init__(self, pattern, op, value, is_aggregate=False): """Constructor @param pattern: the L{TagPattern} to match in the row @param op: the operator function to use for comparison @param value: the value to compare against @param is_aggregate: if True, the value is a special placeholder like "min" or "max" that needs to be calculated """ self.pattern = TagPattern.parse(pattern) self.op = op self.value = value # if the value is a formula, extract it self.formula = None result = re.match(r'^{{(.+)}}$', hxl.datatypes.normalise_space(value)) if result: self.formula = result.group(1) self.is_aggregate=is_aggregate self.needs_aggregate = False """Need to calculate an aggregate value""" if is_aggregate: self.needs_aggregate = True # calculate later self.date_value = None self.number_value = None self._saved_indices = None def calc_aggregate(self, dataset): """Calculate the aggregate value that we need for the row query Substitute the special values "min" and "max" with aggregates. @param dataset: the HXL dataset to use (must be cached) """ if not self.needs_aggregate: logger.warning("no aggregate calculation needed") return # no need to calculate if not dataset.is_cached: raise HXLException("need a cached dataset for calculating an aggregate value") if self.value == 'min': self.value = dataset.min(self.pattern) self.op = operator.eq elif self.value == 'max': self.value = dataset.max(self.pattern) self.op = operator.eq elif self.value == 'not min': self.value = dataset.min(self.pattern) self.op = operator.ne elif self.value == 'not max': self.value = dataset.max(self.pattern) self.op = operator.ne else: raise HXLException("Unrecognised aggregate: {}".format(value)) self.needs_aggregate = False def match_row(self, row): """Check if a key-value pair appears in a HXL row""" # fail if we need an aggregate and haven't calculated it if self.needs_aggregate and not self.aggregate_is_calculated: raise HXLException("must call calc_aggregate before matching an 'is min' or 'is max' condition") # initialise is this is the first time matching for the row query if self._saved_indices is None or self.formula: # if it's a row formula, evaluate first if self.formula: value = hxl.formulas.eval.eval(row, self.formula) else: value = self.value if self.pattern.tag == '#date': try: self.date_value = hxl.datatypes.normalise_date(value) except ValueError: self.date_value = None try: self.number_value = hxl.datatypes.normalise_number(value) except ValueError: self.number_value = None self.string_value = hxl.datatypes.normalise_string(value) # try all the matching column values indices = self._get_saved_indices(row.columns) for i in indices: if i < len(row.values) and self.match_value(row.values[i], self.op): return True return False def match_value(self, value, op): """Try matching as dates, then as numbers, then as simple strings""" if self.date_value is not None: try: return op(hxl.datatypes.normalise_date(value), self.date_value) except ValueError: pass if self.number_value is not None: try: return op(hxl.datatypes.normalise_number(value), self.number_value) except: pass return self.op(hxl.datatypes.normalise_string(value), self.string_value) def _get_saved_indices(self, columns): """Cache the column tests, so that we run them only once.""" # FIXME - assuming that the columns never change self._saved_indices = [] for i in range(len(columns)): if self.pattern.match(columns[i]): self._saved_indices.append(i) return self._saved_indices @staticmethod def parse(query): """Parse a filter expression""" if isinstance(query, RowQuery): # already parsed return query parts = re.split(r'([<>]=?|!?=|!?~|\bis\b)', hxl.datatypes.normalise_string(query), maxsplit=1) pattern = TagPattern.parse(parts[0]) op_name = hxl.datatypes.normalise_string(parts[1]) op = RowQuery.OPERATOR_MAP.get(op_name) value = hxl.datatypes.normalise_string(parts[2]) is_aggregate = False # special handling for aggregates (FIXME) if op_name == 'is' and value in ('min', 'max', 'not min', 'not max'): is_aggregate = True return RowQuery(pattern, op, value, is_aggregate) @staticmethod def parse_list(queries): """Parse a single query spec or a list of specs.""" if queries: if not hasattr(queries, '__len__') or isinstance(queries, six.string_types): # make a list if needed queries = [queries] return [hxl.model.RowQuery.parse(query) for query in queries] else: return [] @staticmethod def match_list(row, queries=None, reverse=False): """See if any query in a list matches a row.""" if not queries: # no queries = pass return True else: # otherwise, must match at least one for query in queries: if query.match_row(row): return not reverse return reverse @staticmethod def operator_re(s, pattern): """Regular-expression comparison operator.""" return re.search(pattern, s) @staticmethod def operator_nre(s, pattern): """Regular-expression negative comparison operator.""" return not re.search(pattern, s) @staticmethod def operator_is(s, condition): """Advanced tests Note: this won't be called for aggregate values like "is min" or "is not max"; for these, the aggregate will already be calculated, and a simple comparison operator substituted by L{calc_aggregate}. """ if condition == 'empty': return hxl.datatypes.is_empty(s) elif condition == 'not empty': return not hxl.datatypes.is_empty(s) elif condition == 'number': return hxl.datatypes.is_number(s) elif condition == 'not number': return not hxl.datatypes.is_number(s) elif condition == 'date': return (hxl.datatypes.is_date(s)) elif condition == 'not date': return (hxl.datatypes.is_date(s) is False) else: raise hxl.HXLException('Unknown is condition: {}'.format(condition)) # Constant map of comparison operators OPERATOR_MAP = { '=': operator.eq, '!=': operator.ne, '<': operator.lt, '<=': operator.le, '>': operator.gt, '>=': operator.ge, } # Static functions def get_column_indices(tag_patterns, columns): """Get a list of column indices that match the tag patterns provided @param tag_patterns: a list of tag patterns or a string version of the list @param columns: a list of columns @returns: a (possibly-empty) list of 0-based indices """ tag_patterns = TagPattern.parse_list(tag_patterns) columns = [Column.parse(column) for column in columns] indices = [] for i, column in enumerate(columns): for pattern in tag_patterns: if pattern.match(column): indices.append(i) return indices # Extra static initialisation RowQuery.OPERATOR_MAP['~'] = RowQuery.operator_re RowQuery.OPERATOR_MAP['!~'] = RowQuery.operator_nre RowQuery.OPERATOR_MAP['is'] = RowQuery.operator_is # end