#!/usr/bin/python2 # # Copyright 2018 Google LLC # # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file or at # https://developers.google.com/open-source/licenses/bsd """Implementation of the DSPL data model in Python. Note that not all DSPL features are currently supported (e.g., topics, concept properties, etc.). """ from __future__ import print_function __author__ = 'Benjamin Yolken <yolken@google.com>' import csv import os import xml.dom.minidom import xml.etree.ElementTree _VALUE_LANGUAGE = 'en' class DSPLModelError(Exception): """Base class for exceptions in the dspl_model module.""" pass def _ValueOrPlaceHolder(value_string, description): """Embeds a string inside an XML <value>...</value> element. If the string is empty or None, an alternate string is used instead. Args: value_string: String to embed description: String to be used if the value string is empty or None. Returns: An ElementTree Element object. """ value_element = xml.etree.ElementTree.Element('value') value_element.set('xml:lang', _VALUE_LANGUAGE) if value_string: value_element.text = value_string else: value_element.text = '** INSERT %s **' % description return value_element class DataSet(object): """Top-level representation of a DSPL dataset.""" def __init__(self, namespace='', name='', description='', url='', provider_name='', provider_url='', imports=(), topics=(), concepts=(), slices=(), tables=(), verbose=True): """Create a new DataSet object. Args: namespace: Namespace for the dataset name: Name of the dataset description: Dataset description url: Dataset URL provider_name: Name of dataset provider provider_url: Provider URL imports: Sequence of Import objects topics: Sequence of Topic objects concepts: Sequence of Concept objects slices: Sequence of Slice objects tables: Sequence of Table objects verbose: Print out status messages to stdout """ self.namespace = namespace self.name = name self.description = description self.url = url self.provider_name = provider_name self.provider_url = provider_url self.imports = list(imports) self.topics = list(topics) self.concepts = list(concepts) self.slices = list(slices) self.tables = list(tables) self.verbose = verbose def AddImport(self, import_obj): """Add an import to this dataset.""" self.imports.append(import_obj) def GetImport(self, namespace_id): """Get the import matching the argument namespace id.""" for import_obj in self.imports: if import_obj.namespace_id == namespace_id: return import_obj return None def AddTopic(self, topic_obj): """Add a top-level topic to this dataset.""" self.topics.append(topic_obj) def _TopicSearchHelper(self, topic_list, topic_id): """Recursively search a list for the topic with the argument id.""" for topic_obj in topic_list: if topic_obj.topic_id == topic_id: return topic_obj elif topic_obj.children: children_result = self._TopicSearchHelper(topic_obj.children, topic_id) if children_result: return children_result return None def GetTopic(self, topic_id): """Get the topic matching the argument topic id.""" return self._TopicSearchHelper(self.topics, topic_id) def AddConcept(self, concept): """Add a concept to this dataset.""" self.concepts.append(concept) def GetConcept(self, concept_id): """Find the concept matching the argument ID.""" for concept in self.concepts: if concept.concept_id == concept_id: return concept return None def AddSlice(self, data_slice): """Add a slice to this dataset.""" self.slices.append(data_slice) def GetSlice(self, slice_id): """Find slice matching the argument ID.""" for data_slice in self.slices: if data_slice.slice_id == slice_id: return data_slice return None def AddTable(self, table): """Add a table to this dataset.""" self.tables.append(table) def GetTable(self, table_id): """Find the table matching the argument ID.""" for table in self.tables: if table.table_id == table_id: return table return None def Materialize(self, output_path): """Write the dataset XML and CSV files to the argument output path.""" output_file_name = os.path.join(output_path, 'dataset.xml') if self.verbose: print('Writing file: %s' % output_file_name) # Write XML file xml_file = open(output_file_name, 'w') xml_file.write(str(self)) xml_file.close() # Write CSV files for table in self.tables: table.MaterializeData(output_path) def ToXMLElement(self): """Convert object to its ElementTree XML representation. Recursively calls the ToXMLElement method for all of its concept, slice, and table children. TODO(yolken): Cache results for better performance. Returns: An ElementTree Element. """ root_element = xml.etree.ElementTree.Element('dspl') if self.namespace: root_element.set('targetNamespace', self.namespace) # Add namespace and imports root_element.set('xmlns', 'http://schemas.google.com/dspl/2010') for import_obj in self.imports: root_element.set('xmlns:%s' % import_obj.namespace_id, import_obj.namespace_url) root_element.append(import_obj.ToXMLElement()) # Basic dataset information dataset_info = xml.etree.ElementTree.Element('info') dataset_name = xml.etree.ElementTree.Element('name') dataset_name.append(_ValueOrPlaceHolder(self.name, 'DATASET NAME')) dataset_info.append(dataset_name) dataset_description = xml.etree.ElementTree.Element('description') dataset_description.append( _ValueOrPlaceHolder(self.description, 'DATASET DESCRIPTION')) dataset_info.append(dataset_description) dataset_url = xml.etree.ElementTree.Element('url') dataset_url.append( _ValueOrPlaceHolder(self.url, 'DATASET URL')) dataset_info.append(dataset_url) root_element.append(dataset_info) # Provider information provider_info = xml.etree.ElementTree.Element('provider') provider_name = xml.etree.ElementTree.Element('name') provider_name.append( _ValueOrPlaceHolder(self.provider_name, 'PROVIDER NAME')) provider_info.append(provider_name) provider_url = xml.etree.ElementTree.Element('url') provider_url.append( _ValueOrPlaceHolder(self.provider_url, 'PROVIDER URL')) provider_info.append(provider_url) root_element.append(provider_info) # Add topic info if self.topics: topic_elements = xml.etree.ElementTree.Element('topics') for topic in self.topics: topic_elements.append(topic.ToXMLElement()) root_element.append(topic_elements) # Add concept info concept_elements = xml.etree.ElementTree.Element('concepts') for concept in self.concepts: if not concept.concept_reference: concept_elements.append(concept.ToXMLElement()) root_element.append(concept_elements) # Add slices slice_elements = xml.etree.ElementTree.Element('slices') for data_slice in self.slices: slice_elements.append(data_slice.ToXMLElement(self)) root_element.append(slice_elements) # Add table info table_elements = xml.etree.ElementTree.Element('tables') for table in self.tables: table_elements.append(table.ToXMLElement()) root_element.append(table_elements) return root_element def __str__(self): """Make a 'pretty' version of the dataset XML, with two-space indents. TODO(yolken): Cache results for better performance. Returns: A string of the dataset XML """ result = xml.dom.minidom.parseString( xml.etree.ElementTree.tostring( self.ToXMLElement(), encoding='utf-8')).toprettyxml(indent=' ') return result class Import(object): """Representation of a DSPL dataset import.""" def __init__(self, namespace_id='', namespace_url=''): """Create a new Import object. Args: namespace_id: Identifier for the dataset namespace_url: URL for the imported dataset """ self.namespace_id = namespace_id self.namespace_url = namespace_url def ToXMLElement(self): """Convert object to its ElementTree XML representation. Returns: An ElementTree Element. """ import_element = xml.etree.ElementTree.Element('import') import_element.set('namespace', self.namespace_url) return import_element class Topic(object): """Representation of a DSPL topic.""" def __init__(self, topic_id='', topic_name='', children=()): """Create a new Topic object. Args: topic_id: Identifier for this topic topic_name: Name of this topic children: Sequence of topics that are the children of this one """ self.topic_id = topic_id self.topic_name = topic_name self.children = children def ToXMLElement(self): """Convert object to its ElementTree XML representation. Returns: An ElementTree Element. """ topic_element = xml.etree.ElementTree.Element('topic') topic_element.set('id', self.topic_id) topic_info = xml.etree.ElementTree.Element('info') topic_name = xml.etree.ElementTree.Element('name') topic_name.append( _ValueOrPlaceHolder( self.topic_name, 'NAME for topic: %s' % self.topic_id)) topic_info.append(topic_name) topic_element.append(topic_info) for child_topic in self.children: topic_element.append(child_topic.ToXMLElement()) return topic_element class Concept(object): """Representation of a DSPL concept.""" def __init__(self, concept_id='', concept_name='', concept_description='', data_type='', table_ref='', concept_reference='', concept_extension_reference='', topic_references=(), attributes=(), properties=()): """Create a new Concept object. Args: concept_id: ID string for the concept concept_name: Name of the concept concept_description: Description of the concept data_type: One of {'boolean', 'date', 'float', 'integer', 'string'} table_ref: ID string for the concept's table concept_reference: ID string for the (external) concept that this object represents; including a value here means that the metadata will not be materialized to XML concept_extension_reference: ID string for the concept this one extends topic_references: List of string topic IDs for this concept attributes: A list of Attribute instances associated with this concept properties: A list of Property instances associated with this concept """ self.concept_id = concept_id self.concept_name = concept_name self.concept_description = concept_description self.data_type = data_type self.table_ref = table_ref self.concept_reference = concept_reference self.concept_extension_reference = concept_extension_reference self.topic_references = list(topic_references) self.attributes = list(attributes) self.properties = list(properties) def ToXMLElement(self): """Convert object to its ElementTree XML representation. Returns: An ElementTree Element. """ concept_element = xml.etree.ElementTree.Element('concept') concept_element.set('id', self.concept_id) if self.concept_extension_reference: concept_element.set('extends', self.concept_extension_reference) concept_info = xml.etree.ElementTree.Element('info') concept_name = xml.etree.ElementTree.Element('name') concept_name.append( _ValueOrPlaceHolder( self.concept_name, 'NAME for concept: %s' % self.concept_id)) concept_info.append(concept_name) concept_description = xml.etree.ElementTree.Element('description') concept_description.append( _ValueOrPlaceHolder( self.concept_description, 'DESCRIPTION for concept: %s' % self.concept_id)) concept_info.append(concept_description) concept_element.append(concept_info) for topic_reference in self.topic_references: topic_element = xml.etree.ElementTree.Element('topic') topic_element.set('ref', topic_reference) concept_element.append(topic_element) concept_type = xml.etree.ElementTree.Element('type') concept_type.set('ref', self.data_type) concept_element.append(concept_type) for concept_attribute in self.attributes: concept_element.append(concept_attribute.toXMLElement()) for concept_property in self.properties: concept_element.append(concept_property.toXMLElement()) if self.table_ref: concept_table = xml.etree.ElementTree.Element('table') concept_table.set('ref', self.table_ref) concept_element.append(concept_table) return concept_element class Attribute(object): """Representation of a simple DSPL concept attribute. For now, this representation is limited to attributes with just a concept reference and value. """ def __init__(self, concept_ref='', value=''): """Create a new Attribute instance. Args: concept_ref: String reference to concept value: String value for this attribute """ self.concept_ref = concept_ref self.value = value def toXMLElement(self): """Convert object to its ElementTree XML representation. Returns: An ElementTree Element. """ attribute_element = xml.etree.ElementTree.Element('attribute') attribute_element.set('concept', self.concept_ref) if self.value: value_element = xml.etree.ElementTree.Element('value') value_element.text = self.value attribute_element.append(value_element) return attribute_element class Property(object): """Representation of a simple DSPL concept property. For now, this representation is limited to properties with just a concept reference and (optional) isParent attribute. """ def __init__(self, concept_ref='', is_parent=False): """Create a new Property instance. Args: concept_ref: String reference to concept is_parent: Boolean representing whether the previous is this concept's parent """ self.concept_ref = concept_ref self.is_parent = is_parent def toXMLElement(self): """Convert object to its ElementTree XML representation. Returns: An ElementTree Element. """ property_element = xml.etree.ElementTree.Element('property') property_element.set('concept', self.concept_ref) if self.is_parent: property_element.set('isParent', 'true') return property_element class Slice(object): """Representation of a DSPL slice.""" def __init__(self, slice_id='', dimension_refs=(), metric_refs=(), dimension_map=(), metric_map=(), table_ref=''): """Create a new Slice object. Args: slice_id: ID string for this slice dimension_refs: Sequence of concept ids (immutable after initialization) metric_refs: Sequence of concept ids (immutable after initialization) dimension_map: Map of dimension IDs to column IDs (if not the same) metric_map: Map of metric IDs to column IDs (if not the same) table_ref: String ID of this slice's table """ self.slice_id = slice_id self.dimension_refs = tuple(dimension_refs) self.metric_refs = tuple(metric_refs) self.dimension_map = dict(dimension_map) self.metric_map = dict(metric_map) self.table_ref = table_ref def ToXMLElement(self, dataset): """Convert object to its ElementTree XML representation. Args: dataset: DataSet object that this slice belongs to. Returns: An ElementTree Element. """ slice_element = xml.etree.ElementTree.Element('slice') slice_element.set('id', self.slice_id) dimension_mapping_elements = [] metric_mapping_elements = [] for dimension_ref in self.dimension_refs: dimension = dataset.GetConcept(dimension_ref) new_dimension = xml.etree.ElementTree.Element('dimension') new_dimension.set('concept', dimension.concept_id) slice_element.append(new_dimension) # Handle dimension->column mappings if dimension.concept_id in self.dimension_map: dimension_mapping_element = ( xml.etree.ElementTree.Element('mapDimension')) dimension_mapping_element.set('concept', dimension.concept_id) dimension_mapping_element.set('toColumn', self.dimension_map[dimension.concept_id]) dimension_mapping_elements.append(dimension_mapping_element) for metric_ref in self.metric_refs: metric = dataset.GetConcept(metric_ref) new_metric = xml.etree.ElementTree.Element('metric') new_metric.set('concept', metric.concept_id) slice_element.append(new_metric) # Handle metric->column metrics if metric.concept_id in self.metric_map: metric_mapping_element = ( xml.etree.ElementTree.Element('mapMetric')) metric_mapping_element.set('concept', metric.concept_id) metric_mapping_element.set('toColumn', self.metric_map[metric.concept_id]) metric_mapping_elements.append(metric_mapping_element) if self.table_ref: slice_table = xml.etree.ElementTree.Element('table') slice_table.set('ref', self.table_ref) for mapping_element in ( dimension_mapping_elements + metric_mapping_elements): slice_table.append(mapping_element) slice_element.append(slice_table) return slice_element class TableColumn(object): """A column in a DSPL table.""" def __init__(self, column_id='', data_type='', data_format='', constant_value=''): """Create a new TableColumn object. Args: column_id: String ID for the column data_type: One of {'boolean', 'date', 'float', 'integer', 'string'} data_format: Formatting string for this column constant_value: A constant value for this column """ self.column_id = column_id self.data_type = data_type self.data_format = data_format self.constant_value = constant_value def ToXMLElement(self): """Convert object to its ElementTree XML representation. Returns: An ElementTree Element. """ column_element = xml.etree.ElementTree.Element('column') column_element.set('id', self.column_id) column_element.set('type', self.data_type) if self.data_format: column_element.set('format', self.data_format) if self.constant_value: column_value_element = xml.etree.ElementTree.Element('value') column_value_element.text = self.constant_value column_element.append(column_value_element) return column_element class Table(object): """Representation of a DSPL table.""" def __init__(self, table_id='', columns=(), file_name='', table_data=(), verbose=True): """Create a new Table object. Args: table_id: String ID for the table columns: Sequence of TableColumn objects file_name: Name of the file associated with this table table_data: Sequence of sequences, one for each row in the table verbose: Print out status messages to stdout """ self.table_id = table_id self.columns = list(columns) self.file_name = file_name self.table_data = list(table_data) self.verbose = verbose def MaterializeData(self, output_path): """Write table data to CSV, using argument path.""" output_file_name = os.path.join(output_path, self.file_name) if self.verbose: print('Writing file: %s' % output_file_name) csv_output_file = open(output_file_name, 'wb') csv_writer = csv.writer(csv_output_file) for row in self.table_data: csv_writer.writerow(row) csv_output_file.close() def ToXMLElement(self): """Convert object to its ElementTree XML representation. Returns: An ElementTree Element. """ table_element = xml.etree.ElementTree.Element('table') table_element.set('id', self.table_id) for column in self.columns: table_element.append(column.ToXMLElement()) table_data = xml.etree.ElementTree.Element('data') table_data_file = xml.etree.ElementTree.Element('file') table_data_file.set('encoding', 'utf-8') table_data_file.set('format', 'csv') table_data_file.text = self.file_name table_data.append(table_data_file) table_element.append(table_data) return table_element