# -*- encoding: utf8 -*- """Olapy's main module, this module manipulate MDX Queries and executes them. Execution needs two main objects: - table_loaded: which contains all tables needed to construct a cube - star_schema: which is the cube Those two objects are constructed in three ways: - manually with a config file, see :mod:`cube_loader_custom` - automatically from csv files, if they respect olapy's `start schema model <http://datawarehouse4u.info/Data-warehouse-schema-architecture-star-schema.html>`_, see :mod:`cube_loader` - automatically from database, also if they respect the start schema model, see :mod:`cube_loader_db` """ from __future__ import absolute_import, division, print_function, \ unicode_literals import itertools import os from collections import OrderedDict from os.path import expanduser from typing import Any, Dict, List, Optional, Text import attr import numpy as np import pandas as pd from olapy.core.mdx.parser.parse import Parser # Needed because SQLAlchemy doesn't work under pyiodide # FIXME: find another way try: from ..tools.connection import get_dialect, get_dialect_name from .cube_loader_db import CubeLoaderDB from .cube_loader_custom import CubeLoaderCustom except ImportError: pass @attr.s class MdxEngine(object): """The main class for executing a query. :param cube: It must be under ~/olapy-data/cubes/{cube_name}. example: if cube = 'sales' then full path -> *home_directory/olapy-data/cubes/sales* :param cubes_folder: which is under olapy-data, and contains all csv cubes by default * ~/olapy-data/cubes/... :param olapy_data_location: olapy-data path :param cube_config: cube-config.yml parsing file result (dict for creating customized cube) :param sql_engine: sql_alchemy engine if you don't want to use any database config file :param source_type: source data input, Default csv :param parser: mdx query parser :param facts: facts table name, Default **Facts** """ cube = attr.ib(default=None) facts = attr.ib(default="Facts") source_type = attr.ib(default="csv") parser = attr.ib(default=attr.Factory(Parser)) csv_files_cubes = attr.ib(default=attr.Factory(list)) db_cubes = attr.ib(default=attr.Factory(list)) sqla_engine = attr.ib(default=None) olapy_data_location = attr.ib( default=os.path.join( os.environ.get("OLAPY_PATH", expanduser("~")), "olapy-data" ) ) cube_config = attr.ib(default=None) tables_loaded = attr.ib(default=None) star_schema_dataframe = attr.ib(default=None) measures = attr.ib(default=None) selected_measures = attr.ib(default=None) cubes_folder = attr.ib(default="cubes") # @olapy_data_location.default # def get_default_cubes_directory(self): # # with OLAPY_PATH env var, we can inject olap_web's flask instance_path olapy-data, # home_directory = os.environ.get("OLAPY_PATH", expanduser("~")) # if "olapy-data" not in home_directory: # home_directory = os.path.join(home_directory, "olapy-data") # # return home_directory def _get_db_cubes_names(self): """Get databases cubes names.""" # get databases names first , and them instantiate MdxEngine with this database, thus \ # it will try to construct the star schema either automatically or manually dialect = get_dialect(self.sqla_engine) if not self.sqla_engine or str(self.sqla_engine) != str(dialect.engine): self.sqla_engine = dialect.engine return dialect.get_all_databases() @staticmethod def _get_csv_cubes_names(cubes_location): """Get csv folder names. :param cubes_location: OlaPy cubes path, which is under olapy-data, by default ~/olapy-data/cubes/... """ return [ file for file in os.listdir(cubes_location) if os.path.isdir(os.path.join(cubes_location, file)) ] def get_cubes_names(self): """List all cubes (by default from csv folder only). You can explicitly specify csv folder and databases with *self.source_type = ('csv','db')* :return: list of all cubes """ if "db" in self.source_type: self.db_cubes = self._get_db_cubes_names() if "csv" in self.source_type: cubes_folder_path = os.path.join( self.olapy_data_location, self.cubes_folder ) self.csv_files_cubes = self._get_csv_cubes_names(cubes_folder_path) return self.db_cubes + self.csv_files_cubes def load_cube( self, cube_name, fact_table_name="Facts", sep=";", measures=None, **kwargs ): """After instantiating MdxEngine(), load_cube construct the cube and load all tables. :param cube_name: cube name :param fact_table_name: facts table name, Default **Facts** :param sep: separator used in csv files :param measures: if you want to explicitly specify measures """ self.cube = cube_name self.facts = fact_table_name # load cubes names self.get_cubes_names() # necessary, it fills csv_files_cubes and db_cubes # load tables self.tables_loaded = self.load_tables(sep=sep) if measures: self.measures = measures else: self.measures = self.get_measures() if self.measures: # default measure is the first one self.selected_measures = [self.measures[0]] # construct star_schema if self.tables_loaded: self.star_schema_dataframe = self.get_star_schema_dataframe(sep=sep) def load_tables(self, sep): # type: (Text) -> Dict[Text, pd.DataFrame] """ Load all tables as dict of { Table_name : DataFrame } for the current cube instance. :param sep: csv files separator. :return: dict with table names as keys and DataFrames as values. """ cubes_folder_path = self.get_cube_path() cube_loader = None # type: Any if ( self.cube_config and self.cube_config["facts"] and self.cube == self.cube_config["name"] ): cube_loader = CubeLoaderCustom( cube_config=self.cube_config, cube_path=cubes_folder_path, sqla_engine=self.sqla_engine, sep=sep, ) elif self.cube in self.db_cubes: dialect_name = get_dialect_name(str(self.sqla_engine)) if "postgres" in dialect_name: self.facts = self.facts.lower() cube_loader = CubeLoaderDB(self.sqla_engine) # if not tables: # raise Exception( # 'unable to load tables, check that the database is not empty', # ) # elif self.cube in self.csv_files_cubes: else: # force reimport CubeLoader every instance call (MdxEngine or SparkMdxEngine) from . import CubeLoader cube_loader = CubeLoader(cubes_folder_path, sep) return cube_loader.load_tables() def get_measures(self): """:return: all numerical columns in Facts table.""" # from postgres and oracle databases , all tables names are lowercase # col.lower()[-2:] != 'id' to ignore any id column if self.facts in list(self.tables_loaded.keys()): not_id_columns = [ column for column in self.tables_loaded[self.facts].columns if "id" not in column ] cleaned_facts = self.clean_data( self.tables_loaded[self.facts], not_id_columns ) return [ col for col in cleaned_facts.select_dtypes(include=[np.number]).columns if col.lower()[-2:] != "id" ] @staticmethod def clean_data(star_schema_df, measures): """measure like this: 1 349 is not numeric so we try to transform it to 1349. :param star_schema_df: start schema dataframe :param measures: list of measures columns names :return: cleaned columns """ if measures: for measure in measures: if star_schema_df[measure].dtype == object: star_schema_df[measure] = star_schema_df[measure].str.replace( " ", "" ) try: star_schema_df[measure] = star_schema_df[measure].astype( "float" ) except ValueError: star_schema_df = star_schema_df.drop(measure, 1) return star_schema_df def get_star_schema_dataframe(self, sep, with_id_columns=False): """Merge all DataFrames as star schema. :param sep: csv files separator. :param with_id_columns: start schema dataFrame contains id columns or not :return: star schema DataFrame """ # force reimport CubeLoader every instance call (MdxEngine or SparkMdxEngine) from ..executor import CubeLoader if ( self.cube_config and self.cube_config["facts"] and self.cube == self.cube_config["name"] ): self.facts = self.cube_config["facts"]["table_name"] # measures in config-file only if self.cube_config["facts"]["measures"]: self.measures = self.cube_config["facts"]["measures"] cube_path = self.get_cube_path() cube_loader = CubeLoaderCustom( cube_config=self.cube_config, cube_path=cube_path, sqla_engine=self.sqla_engine, sep=sep, ) elif self.cube in self.db_cubes: cube_loader = CubeLoaderDB(self.sqla_engine) # elif self.cube in self.csv_files_cubes: else: cube_path = self.get_cube_path() cube_loader = CubeLoader(cube_path) fusion = cube_loader.construct_star_schema(self.facts) star_schema_df = self.clean_data(fusion, self.measures) if with_id_columns: return star_schema_df return star_schema_df[ [col for col in fusion.columns if col.lower()[-3:] != "_id"] ] def get_all_tables_names(self, ignore_fact=False): """Get list of tables names. :param ignore_fact: return all table name with or without facts table name :return: all tables names """ if ignore_fact: return [tab for tab in self.tables_loaded if self.facts not in tab] return list(self.tables_loaded) def get_cube_path(self): """Get path to the cube ( ~/olapy-data/cubes ). :return: path to the cube """ if self.source_type.upper() == "CSV": return os.path.join(self.olapy_data_location, self.cubes_folder, self.cube) return None @staticmethod def change_measures(tuples_on_mdx): """Set measures to which exists in the query. :param tuples_on_mdx: List of tuples example :: [ '[Measures].[Amount]' , '[Geography].[Geography].[Continent]' ] :return: measures column's names *(Amount for the example)* """ list_measures = [] for tple in tuples_on_mdx: if tple[0].upper() == "MEASURES" and tple[-1] not in list_measures: list_measures.append(tple[-1]) return list_measures @staticmethod def _df_column_values_exist(tupl, df): """for [Geography].[Geography].[Continent].[America].[Los Angeles] check if America exist in Country column and check if Los Angeles exist in City column ... :return: """ for idx, column_value in enumerate(tupl[2:]): # for numeric column values, pandas convert them to unicode, so try to convert to int # ( exple Time dimension, with Year column value 2010 -> 2010 unicode) try: column_value = int(column_value) except ValueError: pass if column_value not in df[df.columns[idx]].unique(): return False return True def get_tables_and_columns(self, tuple_as_list): """Get used dimensions and columns in the MDX Query. Useful for DataFrame -> xmla response transformation. :param tuple_as_list: list of tuples example:: [ '[Measures].[Amount]', '[Product].[Product].[Crazy Development]', '[Geography].[Geography].[Continent]' ] :return: dimension and columns dict example:: { Geography : ['Continent','Country'], Product : ['Company'] Facts : ['Amount','Count'] } """ axes = {} for axis, tuples in tuple_as_list.items(): measures = [] tables_columns = OrderedDict() # if we have measures in columns or rows axes like : # SELECT {[Measures].[Amount],[Measures].[Count]} ON COLUMNS # we have to add measures directly to tables_columns for tupl in tuples: if tupl[0].upper() == "MEASURES": measures.append(tupl[-1]) tables_columns.update({self.facts: measures}) else: df = self.tables_loaded[tupl[0]] if len(tables_columns.get(tupl[0], [])) < len( df.columns[: len(tupl[2:])] ): tables_columns.update({tupl[0]: df.columns[: len(tupl[2:])]}) axes.update({axis: tables_columns}) return axes def _get_df_subset_from_tuple(self, df, tupl): if tupl[-1] in df.columns: return df[(df[tupl[-1]].notnull())] else: column_from_value = self.tables_loaded[tupl[0]].columns[len(tupl[3:])] return df[(df[column_from_value] == tupl[-1])] def _get_column_name_from_value(self, df, column_value): for column in df.columns: if column_value in df[column].unique(): return column def execute_one_tuple(self, tuple_as_list, dataframe_in, columns_to_keep): """Filter a DataFrame (Dataframe_in) with one tuple. Example :: tuple = ['Geography','Geography','Continent','Europe','France','olapy'] Dataframe_in in : +-------------+----------+---------+---------+---------+ | Continent | Country | Company | Article | Amount | +=============+==========+=========+=========+=========+ | America | US | MS | SSAS | 35150 | +-------------+----------+---------+---------+---------+ | Europe | France | AB | olapy | 41239 | +-------------+----------+---------+---------+---------+ | ..... | ..... | ...... | ..... | ..... | +-------------+----------+---------+---------+---------+ out : +-------------+----------+---------+---------+---------+ | Continent | Country | Company | Article | Amount | +=============+==========+=========+=========+=========+ | Europe | France | AB | olapy | 41239 | +-------------+----------+---------+---------+---------+ :param tuple_as_list: tuple as list :param dataframe_in: DataFrame in with you want to execute tuple :param columns_to_keep: (useful for executing many tuples, for instance execute_mdx) other columns to keep in the execution except the current tuple :return: Filtered DataFrame """ df = dataframe_in # tuple_as_list like ['Geography','Geography','Continent'] # return df with Continent column non empty # if len(tuple_as_list) == 3: # df = self._get_df_subset_from_tuple(df, tuple_as_list) # tuple_as_list like['Geography', 'Geography', 'Continent' , 'America','US'] # execute : df[(df['Continent'] == 'America')] and # df[(df['Country'] == 'US')] # elif len(tuple_as_list) > 3: for tup_att in tuple_as_list[2:]: # df[(df['Year'] == 2010)] # 2010 must be as int, otherwise , pandas generate exception if tup_att.isdigit(): tup_att = int(tup_att) if tup_att in df.columns: df = df[(df[tup_att].notnull())] else: # todo check ex time column_from_value = self._get_column_name_from_value( self.tables_loaded[tuple_as_list[0]], tup_att ) df = df[(df[column_from_value] == tup_att)] cols = list(itertools.chain.from_iterable(columns_to_keep)) return df[cols + self.selected_measures] @staticmethod def add_missed_column(dataframe1, dataframe2): """if you want to concat two Dataframes with different columns like : +-------------+---------+ | Continent | Amount | +=============+=========+ | America | 35150 | +-------------+---------+ | Europe | 41239 | +-------------+---------+ and : +-------------+---------------+---------+ | Continent | Country_code | Amount | +=============+===============+=========+ | America | 1111 | 35150 | +-------------+---------------+---------+ result : +-------------+--------------+---------+ | Continent | Country_code | Amount | +=============+==============+=========+ | America | 1111.0 |35150 | +-------------+--------------+---------+ | Europe | NaN |41239 | +-------------+--------------+---------+ Country_code is converted to float, so the solution is to add a column to the fist DataFrame filled with -1, thus +-------------+---------------+---------+ | Continent | Country_code | Amount | +=============+===============+=========+ | America | -1 | 35150 | +-------------+---------------+---------+ | Europe | -1 | 41239 | +-------------+---------------+---------+ and : +-------------+---------------+---------+ | Continent | Country_code | Amount | +=============+===============+=========+ | America | 1111 | 35150 | +-------------+---------------+---------+ result : +-------------+--------------+---------+ | Continent | Country_code | Amount | +=============+==============+=========+ | America | 1111 |35150 | +-------------+--------------+---------+ | Europe | -1 |41239 | +-------------+--------------+---------+ :param dataframe1: first DataFrame :param dataframe2: second DataFrame :return: Two DataFrames with same columns """ df_with_less_columns = dataframe1 df_with_more_columns = dataframe2 if len(list(dataframe1.columns)) != len(list(dataframe2.columns)): if len(list(dataframe1.columns)) > len(list(dataframe2.columns)): df_with_more_columns = dataframe1 df_with_less_columns = dataframe2 missed_columns = [ col for col in list(df_with_more_columns.columns) if col not in list(df_with_less_columns.columns) ] for missed_column in missed_columns: df_with_less_columns[missed_column] = -1 return [df_with_less_columns, df_with_more_columns] def update_columns_to_keep(self, tuple_as_list, columns_to_keep): """If we have multiple dimensions, with many columns like:: columns_to_keep : Geo -> Continent,Country Prod -> Company Time -> Year,Month,Day we have to use only dimension's columns of current dimension that exist in tuple_as_list and keep other dimensions columns. So if tuple_as_list = ['Geography','Geography','Continent'] then columns_to_keep will be:: columns_to_keep : Geo -> Continent Prod -> Company Time -> Year,Month,Day we need columns_to_keep for grouping our columns in the DataFrame :param tuple_as_list: example:: ['Geography','Geography','Continent'] :param columns_to_keep: example:: { 'Geography': ['Continent','Country'], 'Time': ['Year','Month','Day'] } :return: updated columns_to_keep """ df = self.tables_loaded[tuple_as_list[0]] if self.parser.hierarchized_tuples() or self._df_column_values_exist( tuple_as_list, df ): start_columns_used = 2 else: start_columns_used = 3 # todo change to simplest solution if len(tuple_as_list) == 3 and tuple_as_list[-1] in df.columns: columns_to_keep.update({tuple_as_list[0]: [tuple_as_list[-1]]}) else: # if len(columns_to_keep.get(tuple_as_list[0], [])) < len( # df.columns[: len(tuple_as_list[start_columns_used:])]): columns_to_keep.update( { tuple_as_list[0]: df.columns[ : len(tuple_as_list[start_columns_used:]) ] } ) @staticmethod def _uniquefy_tuples(tuples): """Remove redundant tuples. :param tuples: list of string of tuples. :return: list of string of unique tuples. """ unique_tuples = [] for tupl in tuples: if tupl not in unique_tuples and tupl[0].upper() != "MEASURES": unique_tuples.append(tupl) return unique_tuples def tuples_to_dataframes(self, tuples_on_mdx_query, columns_to_keep): """Construct DataFrame of many groups mdx query. many groups mdx query is something like: example with 3 groups:: SELECT{ ([A].[A].[A]) ([B].[B].[B]) ([C].[C].[C]) } FROM [D] :param tuples_on_mdx_query: list of string of tuples. :param columns_to_keep: (useful for executing many tuples, for instance execute_mdx). :return: Pandas DataFrame. """ # get only used columns and dimensions for all query star_df = self.star_schema_dataframe df_to_fusion = [] table_name = tuples_on_mdx_query[0][0] # in every tuple for tupl in tuples_on_mdx_query: # if we have measures in columns or rows axes like : # SELECT {[Measures].[Amount],[Measures].[Count], [Customers].[Geography].[All Regions]} ON COLUMNS # we use only used columns for dimension in that tuple and keep # other dimension's columns self.update_columns_to_keep(tupl, columns_to_keep) # a tuple with new dimension if tupl[0] != table_name: # if we change dimension , we have to work on the # exection's result on previous DataFrames df = df_to_fusion[0] for next_df in df_to_fusion[1:]: df = pd.concat(self.add_missed_column(df, next_df)) table_name = tupl[0] df_to_fusion = [] star_df = df df_to_fusion.append( self.execute_one_tuple(tupl, star_df, columns_to_keep.values()) ) return df_to_fusion def fusion_dataframes(self, df_to_fusion): # type: (List[pd.DataFrame]) -> pd.DataFrame """Concat chunks of DataFrames. :param df_to_fusion: List of Pandas DataFrame. :return: a Pandas DataFrame. """ df = df_to_fusion[0] for next_df in df_to_fusion[1:]: df = pd.concat(self.add_missed_column(df, next_df), sort=False) return df def check_nested_select(self): # type: () -> bool """Check if the MDX Query is Hierarchized and contains many tuples groups.""" return ( not self.parser.hierarchized_tuples() and len(self.parser.get_nested_select()) >= 2 ) def nested_tuples_to_dataframes(self, columns_to_keep): """Construct DataFrame of many groups. :param columns_to_keep: :func:`columns_to_keep` (useful for executing many tuples, for instance execute_mdx). :return: a list of Pandas DataFrame. """ dfs = [] grouped_tuples = self.parser.get_nested_select() for tuple_groupe in grouped_tuples: transformed_tuple_groups = [] for tuple in self.parser.split_group(tuple_groupe): tuple = tuple.split("].[") tuple[0] = tuple[0].replace("[", "") tuple[-1] = tuple[-1].replace("]", "") if tuple[0].upper() != "MEASURES": transformed_tuple_groups.append(tuple) dfs.append( self.tuples_to_dataframes(transformed_tuple_groups, columns_to_keep)[0] ) return dfs def clean_mdx_query(self, mdx_query): try: clean_query = ( mdx_query.decode("utf-8").strip().replace("\n", "").replace("\t", "") ) except AttributeError: clean_query = mdx_query.strip().replace("\n", "").replace("\t", "") # todo property in parser self.parser.mdx_query = clean_query return clean_query def execute_mdx(self, mdx_query): """Execute an MDX Query. Usage :: executor = MdxEngine() executor.load_cube('sales') query = "SELECT FROM [sales] WHERE ([Measures].[Amount])" executor.execute_mdx(query) :param mdx_query: Mdx Query :return: dict with DataFrame execution result and (dimension and columns used as dict) example:: { 'result': DataFrame result 'columns_desc': dict of dimension and columns used } """ query = self.clean_mdx_query(mdx_query) # use measures that exists on where or insides axes query_axes = self.parser.decorticate_query(query) if self.change_measures(query_axes["all"]): self.selected_measures = self.change_measures(query_axes["all"]) tables_n_columns = self.get_tables_and_columns(query_axes) columns_to_keep = OrderedDict( (table, columns) for table, columns in tables_n_columns["all"].items() if table != self.facts ) tuples_on_mdx_query = self._uniquefy_tuples(query_axes["all"]) # tuples_on_mdx_query.sort(key=lambda x: x[0]) # sort by tuple length # tuples_on_mdx_query.sort(key=len) # # sort by tuple dimensions, we dont want something like [X].[Y].[X] But instead [X].[X].[Y] # tuples_on_mdx_query.sort(key=lambda x: x[0]) tuples_on_mdx_query.sort(key=lambda tupl: (tupl[0], len(tupl))) # if we have tuples in axes # to avoid prob with query like this: # SELECT FROM [Sales] WHERE ([Measures].[Amount]) if tuples_on_mdx_query: if self.check_nested_select(): df_to_fusion = self.nested_tuples_to_dataframes(columns_to_keep) else: df_to_fusion = self.tuples_to_dataframes( tuples_on_mdx_query, columns_to_keep ) df = self.fusion_dataframes(df_to_fusion) cols = list(itertools.chain.from_iterable(columns_to_keep.values())) sort = self.parser.hierarchized_tuples() # margins=True for columns total !!!!! result = df.groupby(cols, sort=sort).sum()[self.selected_measures] else: result = ( self.star_schema_dataframe[self.selected_measures].sum().to_frame().T ) return {"result": result, "columns_desc": tables_n_columns}