Python pandas.DataFrame() Examples

The following are 40 code examples showing how to use pandas.DataFrame(). They are extracted from open source projects; the project, author, file, and license for each are listed above the example.


You may also want to check out all available functions/classes of the module pandas.
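
Before diving into the project examples, here is a minimal, self-contained sketch of the constructor's two most common call patterns (the column names are made up for illustration):

import pandas as pd

# From a dict of equal-length columns.
df = pd.DataFrame({"x": [1, 2, 3], "y": [4.0, 5.0, 6.0]})

# From a list of rows with explicit column names.
df2 = pd.DataFrame([[1, 4.0], [2, 5.0], [3, 6.0]], columns=["x", "y"])

print(df.equals(df2))  # True: both forms build the same frame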

Example 1
Project: interpret-text   Author: interpretml   File: glove_preprocessor.py   License: MIT License
def preprocess(self, data) -> pd.DataFrame:
        """ Convert a list of text into a dataframe containing padded token ids,
        masks distinguishing word tokens from pads, and word token counts for
        each text in the list.
        :param data: list of strings (e.g. sentences)
        :type data: list
        :return: tokens (pd.DataFrame): a dataframe containing
            lists of word token ids, pad/word masks, and token counts
            for each string in the list
        :rtype: pd.DataFrame
        """
        token_lists = []
        masks = []
        counts = []
        for sentence in data:
            token_list, mask = self.generate_tokens(sentence)
            token_lists.append(token_list)
            masks.append(mask)
            counts.append(np.sum(mask))
        tokens = pd.DataFrame(
            {"tokens": token_lists, "mask": masks, "counts": counts}
        )
        return tokens 
Example 2
Project: code-for-the-world   Author: jennirinker   File: io.py   License: MIT License
def read_selig(path):
    """Read a Selig-style airfoil file

    Parameters
    -----------
    path : str
        Path to the Selig-style .dat file.

    Returns
    -------
    air_df : pd.DataFrame
        pandas DataFrame containing x- and y-coordinates of airfoil data.
    """
    air_df = pd.read_csv(path, delim_whitespace=True,
                         header=0)
    air_df.columns = ['x', 'y']
    return air_df 
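
A hypothetical usage sketch (the file name naca2412.dat is an assumption; in a Selig-style file the first line holds the airfoil name, which is why header=0 consumes it before the columns are renamed):

air_df = read_selig("naca2412.dat")
print(air_df.head())  # two columns, 'x' and 'y'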
Example 3
Project: prophet   Author: facebook   File: forecaster.py   License: MIT License
def add_group_component(self, components, name, group):
        """Adds a component with given name that contains all of the components
        in group.

        Parameters
        ----------
        components: DataFrame with components.
        name: Name of new group component.
        group: List of components that form the group.

        Returns
        -------
        DataFrame with components.
        """
        new_comp = components[components['component'].isin(set(group))].copy()
        group_cols = new_comp['col'].unique()
        if len(group_cols) > 0:
            new_comp = pd.DataFrame({'col': group_cols, 'component': name})
            components = components.append(new_comp)
        return components 
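
Note that DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. On current pandas the same row-wise concatenation can be written with pd.concat; a minimal sketch of the equivalent call:

import pandas as pd

# equivalent to the deprecated components.append(new_comp)
components = pd.concat([components, new_comp])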
Example 4
Project: prophet   Author: facebook   File: forecaster.py   License: MIT License
def predictive_samples(self, df):
        """Sample from the posterior predictive distribution.

        Parameters
        ----------
        df: DataFrame with dates for predictions (column ds), and capacity
            (column cap) if logistic growth.

        Returns
        -------
        Dictionary with keys "trend" and "yhat" containing
        posterior predictive samples for that component.
        """
        df = self.setup_dataframe(df.copy())
        sim_values = self.sample_posterior_predictive(df)
        return sim_values 
Example 5
Project: prophet   Author: facebook   File: forecaster.py   License: MIT License
def predict_uncertainty(self, df):
        """Prediction intervals for yhat and trend.

        Parameters
        ----------
        df: Prediction dataframe.

        Returns
        -------
        DataFrame with uncertainty intervals.
        """
        sim_values = self.sample_posterior_predictive(df)

        lower_p = 100 * (1.0 - self.interval_width) / 2
        upper_p = 100 * (1.0 + self.interval_width) / 2

        series = {}
        for key in ['yhat', 'trend']:
            series['{}_lower'.format(key)] = self.percentile(
                sim_values[key], lower_p, axis=1)
            series['{}_upper'.format(key)] = self.percentile(
                sim_values[key], upper_p, axis=1)

        return pd.DataFrame(series) 
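
The interval arithmetic is easy to verify in isolation: with interval_width = 0.8 the bounds are the 10th and 90th percentiles taken across simulations (axis=1). A self-contained sketch with made-up sample values:

import numpy as np

interval_width = 0.8
lower_p = 100 * (1.0 - interval_width) / 2   # 10.0
upper_p = 100 * (1.0 + interval_width) / 2   # 90.0

# 5 dates x 1000 posterior samples of yhat
sim = np.random.default_rng(0).normal(size=(5, 1000))
lower = np.percentile(sim, lower_p, axis=1)  # one bound per date
upper = np.percentile(sim, upper_p, axis=1)
print(lower.shape, upper.shape)  # (5,) (5,)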
Example 6
Project: nistats   Author: nilearn   File: glm_reporter.py   License: BSD 3-Clause "New" or "Revised" License
def _dataframe_to_html(df, precision, **kwargs):
    """ Makes HTML table from provided dataframe.
    Removes HTML5 non-compliant attributes (ex: `border`).

    Parameters
    ----------
    df: pandas.DataFrame
        DataFrame to be converted into HTML table.

    precision: int
        The display precision for float values in the table.

    **kwargs: keyword arguments
        Supplies keyword arguments for pandas.DataFrame.to_html()

    Returns
    -------
    html_table: String
        Code for HTML table.
    """
    with pd.option_context('display.precision', precision):
        html_table = df.to_html(**kwargs)
    html_table = html_table.replace('border="1" ', '')
    return html_table 
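
A minimal usage sketch, assuming the function above is in scope (index=False is just one example of a to_html() keyword being passed through):

import pandas as pd

df = pd.DataFrame({"beta": [0.123456, 1.987654]})
html = _dataframe_to_html(df, precision=3, index=False)
print('border="1"' in html)  # False: the non-compliant attribute was stripped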
Example 7
Project: QUANTAXIS   Author: QUANTAXIS   File: QADataStruct.py   License: MIT License
def __init__(self, DataFrame):
        """Stock Transaction

        Arguments:
            DataFrame {pd.DataFrame} -- [input is one/multi day transaction]
        """

        self.type = 'stock_transaction'

        self.data = DataFrame
        if 'amount' not in DataFrame.columns:
            if 'vol' in DataFrame.columns:
                self.data['amount'] = self.data.vol * self.data.price * 100
            elif 'volume' in DataFrame.columns:
                self.data['amount'] = self.data.volume * self.data.price * 100
        if '_id' in DataFrame.columns:
            self.data = self.data.drop(["_id"], axis=1)
        self.mongo_coll = DATABASE.stock_transaction 
Example 8
Project: QUANTAXIS   Author: QUANTAXIS   File: QADataStruct.py   License: MIT License
def __init__(self, DataFrame):
        """Index Transaction

        Arguments:
            DataFrame {pd.DataFrame} -- [input is one/multi day transaction]
        """

        self.type = 'index_transaction'

        self.data = DataFrame
        if 'amount' not in DataFrame.columns:
            if 'vol' in DataFrame.columns:
                self.data['amount'] = self.data.vol * self.data.price * 100
            elif 'volume' in DataFrame.columns:
                self.data['amount'] = self.data.volume * self.data.price * 100
        if '_id' in DataFrame.columns:
            self.data = self.data.drop(["_id"], axis=1)
        self.mongo_coll = DATABASE.index_transaction 
Example 9
Project: pyiron   Author: pyiron   File: generic.py   License: BSD 3-Clause "New" or "Revised" License
def get_jobs(self, recursive=True, columns=None):
        """
        Internal function to return the jobs as dictionary rather than a pandas.DataFrame

        Args:
            recursive (bool): search subprojects [True/False]
            columns (list): by default only the columns ['id', 'project'] are selected, but the user can select a subset
                            of ['id', 'status', 'chemicalformula', 'job', 'subjob', 'project', 'projectpath',
                            'timestart', 'timestop', 'totalcputime', 'computer', 'hamilton', 'hamversion', 'parentid',
                            'masterid']

        Returns:
            dict: columns are used as keys and point to a list of the corresponding values
        """
        if not isinstance(self.db, FileTable):
            return get_jobs(
                database=self.db,
                sql_query=self.sql_query,
                user=self.user,
                project_path=self.project_path,
                recursive=recursive,
                columns=columns,
            )
        else:
            return self.db.get_jobs(project=self.project_path, recursive=recursive, columns=columns) 
Example 10
Project: pyiron   Author: pyiron   File: generic.py   License: BSD 3-Clause "New" or "Revised" License
def queue_table(self, project_only=True, recursive=True, full_table=False):
        """
        Display the queuing system table as pandas.DataFrame

        Args:
            project_only (bool): Query only for jobs within the current project - True by default
            recursive (bool): Include jobs from sub projects
            full_table (bool): Whether to show the entire pandas table

        Returns:
            pandas.DataFrame: Output from the queuing system - optimized for the Sun grid engine
        """
        return queue_table(
            job_ids=self.get_job_ids(recursive=recursive), project_only=project_only,
            full_table=full_table
        ) 
Example 11
Project: pyiron   Author: pyiron   File: generic.py   License: BSD 3-Clause "New" or "Revised" License
def queue_table_global(self, full_table=False):
        """
        Display the queuing system table as pandas.DataFrame

        Args:
            full_table (bool): Whether to show the entire pandas table

        Returns:
            pandas.DataFrame: Output from the queuing system - optimized for the Sun grid engine
        """
        df = queue_table(job_ids=[], project_only=False, full_table=full_table)
        if len(df) != 0 and self.db is not None:
            return pandas.DataFrame(
                [
                    self.db.get_item_by_id(
                        int(str(queue_ID).replace("pi_", "").replace(".sh", ""))
                    )
                    for queue_ID in df["jobname"]
                    if str(queue_ID).startswith("pi_")
                ]
            )
        else:
            return None 
Example 12
Project: pyiron   Author: pyiron   File: hdfio.py   License: BSD 3-Clause "New" or "Revised" License
def get_from_table(self, path, name):
        """
        Get a specific value from a pandas.DataFrame

        Args:
            path (str): relative path to the data object
            name (str): parameter key

        Returns:
            dict, list, float, int: the value associated to the specific parameter key
        """
        df_table = self.get(path)
        keys = df_table["Parameter"]
        if name in keys:
            job_id = keys.index(name)
            return df_table["Value"][job_id]
        raise ValueError("Unknown name: {0}".format(name)) 
Example 13
Project: pyiron   Author: pyiron   File: parallel.py   License: BSD 3-Clause "New" or "Revised" License
def output_to_pandas(self, sort_by=None, h5_path="output"):
        """
        Convert output of all child jobs to a pandas DataFrame object.

        Args:
            sort_by (str): sort the output using pandas.DataFrame.sort_values(by=sort_by)
            h5_path (str): select child output to include - default='output'

        Returns:
            pandas.DataFrame: output as dataframe
        """
        # TODO: The output to pandas function should no longer be required
        with self.project_hdf5.open(h5_path) as hdf:
            for key in hdf.list_nodes():
                self._output[key] = hdf[key]
        df = pandas.DataFrame(self._output)
        if sort_by is not None:
            df = df.sort_values(by=sort_by)
        return df

    # TODO: make it more general and move it then into genericJob 
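
The pattern at the end of the method (dict of node arrays -> DataFrame -> sort_values) also works standalone; a sketch with made-up output keys:

import pandas as pd

output = {"step": [2, 0, 1], "energy": [-3.1, -1.0, -2.2]}
df = pd.DataFrame(output).sort_values(by="step")
print(df)  # rows reordered so 'step' runs 0, 1, 2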
Example 14
Project: CityEnergyAnalyst   Author: architecture-building-systems   File: schemas.py   License: MIT License
def validate(self, df):
        """Check to make sure the Dataframe conforms to the schema"""
        expected_columns = set(self.schema["schema"]["columns"].keys())
        found_columns = set(df.columns.values)

        # handle some extra cases
        if "PIPE0" in expected_columns:
            found_columns = {c for c in found_columns if not c.startswith("PIPE")}
            found_columns.add("PIPE0")

        # handle some extra cases
        if "NODE0" in expected_columns:
            found_columns = {c for c in found_columns if not c.startswith("NODE")}
            found_columns.add("NODE0")

        if found_columns != expected_columns:
            missing_columns = expected_columns - found_columns
            extra_columns = found_columns - expected_columns

            warnings.warn("Dataframe does not conform to schemas.yml specification for {lm}"
                          "(missing: {missing_columns}, extra: {extra_columns}".format(
                lm=self.lm, missing_columns=missing_columns, extra_columns=extra_columns)) 
Example 15
Project: autoimpute   Author: kearnz   File: logistic_regressor.py   License: MIT License
def predict_proba(self, X):
        """Predict probabilities of class membership for logistic regression.

        The regression uses the pooled parameters from each of the imputed
        datasets to generate a set of single predictions. The pooled params
        come from multiply imputed datasets, but the predictions themselves
        follow the same rules as a logistic regression. Because this is
        logistic regression, the sigmoid function is applied to the result
        of the normal equation, giving us probabilities between 0 and 1 for
        each prediction. This method returns those probabilities.

        Args:
            X (pd.DataFrame): predictors to predict response

        Returns:
            np.array: prob of class membership for predicted observations.
        """

        # run validation first
        X = self._predict_strategy_validator(self, X)

        # get the alpha and betas, then create linear equation for predictions
        alpha = self.statistics_["coefs"].values[0]
        betas = self.statistics_["coefs"].values[1:]
        return self._sigmoid(alpha + np.dot(X, betas)) 
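
The last line is simply the logistic link applied to a linear predictor. A self-contained sketch of that step (alpha and betas are made-up pooled coefficients, not values produced by the library):

import numpy as np

def _sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

alpha = -0.5                    # hypothetical pooled intercept
betas = np.array([0.8, -1.2])   # hypothetical pooled slopes
X = np.array([[1.0, 0.5], [0.2, 2.0]])

probs = _sigmoid(alpha + np.dot(X, betas))
print(probs)  # probabilities strictly between 0 and 1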
Example 16
Project: pysystemtrade   Author: robcarver17   File: pdutils.py   License: GNU General Public License v3.0
def dataframe_pad(starting_df, column_list, padwith=0.0):
    """
    Takes a dataframe and adds extra columns if necessary so we end up with columns named column_list

    :param starting_df: A pd.DataFrame with named columns
    :param column_list: A list of column names
    :param padwith: The value to pad missing columns with
    :return: pd.DataFrame
    """

    def _pad_column(column_name, starting_df, padwith):
        if column_name in starting_df.columns:
            return starting_df[column_name]
        else:
            # pad missing columns with the requested fill value
            return pd.Series([padwith] * len(starting_df.index), starting_df.index)

    new_data = [
        _pad_column(column_name, starting_df, padwith)
        for column_name in column_list
    ]

    new_df = pd.concat(new_data, axis=1)
    new_df.columns = column_list

    return new_df 
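
A usage sketch, assuming the function above is in scope:

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0]})
padded = dataframe_pad(df, ["a", "b", "c"], padwith=0.0)
print(list(padded.columns))  # ['a', 'b', 'c']; 'b' and 'c' are filled with 0.0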
Example 17
Project: OptimalPortfolio   Author: VivekPa   File: invariants.py   License: MIT License
def stock_invariants(prices, no_assets):
    """
    Calculates stock price invariants, which are the compounded returns
    :param prices: stock prices data of the various tickers
    :type prices: pd.DataFrame
    :param no_assets: number of assets in data
    :type no_assets: int
    :return: dataframe of stock invariants
    :rtype: pd.DataFrame
    """
    if not isinstance(prices, pd.DataFrame):
        warnings.warn("prices are not a pd Dataframe", RuntimeWarning)

    asset_ret = pd.DataFrame()
    for j in range(no_assets):
        returns = []
        for i in range(1, len(prices)):
            log_ret = np.log(prices.iloc[i, j] / prices.iloc[i-1, j])
            returns.append(log_ret)
        asset_ret = pd.concat([pd.DataFrame(returns), asset_ret], axis=1, ignore_index=True)
    return asset_ret 
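
The nested loops compute column-wise compounded (log) returns. On a purely numeric price DataFrame the same invariants fall out of one vectorized expression; a sketch of the equivalent, up to column ordering and indexing:

import numpy as np
import pandas as pd

def log_returns(prices: pd.DataFrame) -> pd.DataFrame:
    # log(p_t / p_{t-1}) for every column; drop the first row of NaNs
    return np.log(prices / prices.shift(1)).dropna().reset_index(drop=True)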
Example 18
Project: OptimalPortfolio   Author: VivekPa   File: invariants.py   License: MIT License
def forex_invariants(prices, no_assets):
    """
    Calculates forex price invariants, which are the compounded returns
    :param prices: forex prices data of the various tickers
    :type prices: pd.DataFrame
    :param no_assets: number of assets in data
    :type no_assets: int
    :return: dataframe of forex invariants
    :rtype: pd.DataFrame
    """
    if not isinstance(prices, pd.DataFrame):
        warnings.warn("prices are not a pd Dataframe", RuntimeWarning)

    asset_ret = pd.DataFrame()
    for j in range(no_assets):
        returns = []
        for i in range(1, len(prices)):
            log_ret = np.log(prices.iloc[i, j] / prices.iloc[i-1, j])
            returns.append(log_ret)
        asset_ret = pd.concat([pd.DataFrame(returns), asset_ret], axis=1, ignore_index=True)
    return asset_ret 
Example 19
Project: OptimalPortfolio   Author: VivekPa   File: moment_est.py   License: MIT License
def sample_coM3(invariants):
    """
    Calculates sample third order co-moment matrix
    Taps into the R package PerformanceAnalytics through rpy2

    :param invariants: sample data of market invariants
    :type invariants: pd.DataFrame
    :return: sample third order co-moment matrix
    """
    
    importr('PerformanceAnalytics')
    if not isinstance(invariants, pd.DataFrame):
        warnings.warn("invariants not a pd.Dataframe", RuntimeWarning)
        invariants = pd.DataFrame(invariants)
    p = invariants.shape[1]
    coskew_function = robjects.r('M3.MM')
    r_inv_vec = robjects.FloatVector(np.concatenate(invariants.values))
    r_invariants = robjects.r.matrix(r_inv_vec, nrow=p, ncol=p)
    r_M3 = coskew_function(r_invariants)
    
    return np.matrix(r_M3) 
Example 20
Project: OptimalPortfolio   Author: VivekPa   File: moment_est.py   License: MIT License
def sample_coM4(invariants):
    """
    Calculates sample fourth order co-moment matrix
    Taps into the R package PerformanceAnalytics through rpy2

    :param invariants: sample data of market invariants
    :type invariants: pd.DataFrame
    :return: sample fourth order co-moment matrix
    """
    
    importr('PerformanceAnalytics')
    if not isinstance(invariants, pd.DataFrame):
        warnings.warn("invariants not a pd.Dataframe", RuntimeWarning)
        invariants = pd.DataFrame(invariants)
    p = invariants.shape[1]
    coskew_function = robjects.r('M4.MM')
    r_inv_vec = robjects.FloatVector(np.concatenate(invariants.values))
    r_invariants = robjects.r.matrix(r_inv_vec, nrow=p, ncol=p)
    r_M4 = coskew_function(r_invariants)
    
    return np.matrix(r_M4) 
Example 21
Project: OptimalPortfolio   Author: VivekPa   File: moment_est.py   License: MIT License
def sample_moment(invariants, order, frequency=252):
    """
    Calculates nth moment of sample data.

    :param invariants: sample data of market invariants
    :type invariants: pd.DataFrame
    :param order: order of moment
    :type order: int
    :param frequency: time horizon of projection
    :type frequency: int
    :return: nth moment of sample invariants
    """
    if not isinstance(invariants, pd.DataFrame):
        warnings.warn("invariants not a pd.Dataframe", RuntimeWarning)
        invariants = pd.DataFrame(invariants)
    daily_moment = moment(invariants, moment=order)
    return daily_moment*frequency 
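
A numeric check of the annualization with made-up data (scipy.stats.moment computes central moments column by column, so the result is one value per asset):

import numpy as np
import pandas as pd
from scipy.stats import moment

rng = np.random.default_rng(1)
invariants = pd.DataFrame(rng.normal(0, 0.01, size=(500, 2)), columns=["A", "B"])
print(moment(invariants, moment=2) * 252)  # annualized second central moments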
Example 22
Project: OptimalPortfolio   Author: VivekPa   File: moment_est.py   License: MIT License
def exp_cov(invariants, span=180, frequency=252):
    """
    Calculates sample exponentially weighted covariance

    :param invariants: sample data of market invariants
    :type invariants: pd.DataFrame
    :param frequency: time horizon of projection
    :type frequency: int
    :param span: the span for exponential weights
    :return: sample exponentially weighted covariance dataframe
    """
    if not isinstance(invariants, pd.DataFrame):
        warnings.warn("invariants not a pd.Dataframe", RuntimeWarning)
        invariants = pd.DataFrame(invariants)
    assets = invariants.columns
    daily_cov = invariants.ewm(span=span).cov().iloc[-len(assets):, -len(assets):]
    return pd.DataFrame(daily_cov*frequency) 
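
A self-contained sketch of the ewm-covariance slice with made-up data (the trailing iloc keeps only the covariance block for the last timestamp):

import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
inv = pd.DataFrame(rng.normal(0, 0.01, size=(400, 3)), columns=list("ABC"))
daily_cov = inv.ewm(span=180).cov().iloc[-3:, -3:]
print((daily_cov * 252).shape)  # (3, 3) annualized covariance block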
Example 23
Project: OptimalPortfolio   Author: VivekPa   File: moment_est.py   License: MIT License
def __init__(self, invariants, n, dist="normal"):
        """

        :param invariants: sample data of market invariants
        :type invariants: pd.DataFrame
        :param n: number of assets
        :type n: int
        :param dist: choice of distribution: "normal"
        :type dist: str
        """
        self.invariants = invariants
        self.dist = dist
        self.n = n
        self.mean = None
        self.cov = None
        self.skew = None
        self.kurt = None 
Example 24
Project: OptimalPortfolio   Author: VivekPa   File: moment_est.py   License: MIT License
def __init__(self, invariants, n, frequency=252):
        """

        :param invariants: sample data of market invariants
        :type invariants: pd.DataFrame
        :param n: number of assets
        :type n: int
        :param frequency: time horizon of projection
        :type frequency: int
        """
        if not isinstance(invariants, pd.DataFrame):
            warnings.warn("invariants is not pd.Dataframe", RuntimeWarning)
        self.invariants = invariants
        self.S = self.invariants.cov()
        self.frequency = frequency
        self.n = n 
Example 25
Project: xalpha   Author: refraction-ray   File: indicator.py   License: MIT License
def comparison(self, date=yesterdayobj()):
        """
        :returns: tuple of two pd.DataFrame, the first for the aim and the second for the benchmark index;
            all netvalues are normalized and set equal to 1.00 on the self.start date
        """
        # copy so the normalization below does not mutate the underlying frames
        partp = self.price[self.price["date"] <= date].copy()
        partm = self.bmprice[self.bmprice["date"] <= date].copy()
        normp = partp.iloc[0].netvalue
        normm = partm.iloc[0].netvalue
        partp["netvalue"] = partp["netvalue"] / normp
        partm["netvalue"] = partm["netvalue"] / normm
        return (partp, partm) 
Example 26
Project: scattertext   Author: JasonKessler   File: TermDocMatrixWithoutCategories.py   License: Apache License 2.0
def apply_ranker(self, term_ranker, use_non_text_features):
        '''
        Parameters
        ----------
        term_ranker : TermRanker

        Returns
        -------
        pd.DataFrame
        '''
        if use_non_text_features:
            return term_ranker(self).use_non_text_features().get_ranks()
        return term_ranker(self).get_ranks() 
Example 27
Project: scattertext   Author: JasonKessler   File: TermCategoryFrequencies.py   License: Apache License 2.0
def apply_ranker(self, term_ranker):
		'''
		Parameters
		----------
		term_ranker : TermRanker
			We'll ignore this

		Returns
		-------
		pd.DataFrame
		'''
		return self.get_term_category_frequencies(None) 
Example 28
Project: prophet   Author: facebook   File: forecaster.py   License: MIT License
def predict_seasonal_components(self, df):
        """Predict seasonality components, holidays, and added regressors.

        Parameters
        ----------
        df: Prediction dataframe.

        Returns
        -------
        DataFrame with seasonal components.
        """
        seasonal_features, _, component_cols, _ = (
            self.make_all_seasonality_features(df)
        )
        if self.uncertainty_samples:
            lower_p = 100 * (1.0 - self.interval_width) / 2
            upper_p = 100 * (1.0 + self.interval_width) / 2

        X = seasonal_features.values
        data = {}
        for component in component_cols.columns:
            beta_c = self.params['beta'] * component_cols[component].values

            comp = np.matmul(X, beta_c.transpose())
            if component in self.component_modes['additive']:
                comp *= self.y_scale
            data[component] = np.nanmean(comp, axis=1)
            if self.uncertainty_samples:
                data[component + '_lower'] = self.percentile(
                    comp, lower_p, axis=1,
                )
                data[component + '_upper'] = self.percentile(
                    comp, upper_p, axis=1,
                )
        return pd.DataFrame(data) 
Example 29
Project: prophet   Author: facebook   File: forecaster.py   License: MIT License
def make_future_dataframe(self, periods, freq='D', include_history=True):
        """Simulate the trend using the extrapolated generative model.

        Parameters
        ----------
        periods: Int number of periods to forecast forward.
        freq: Any valid frequency for pd.date_range, such as 'D' or 'M'.
        include_history: Boolean to include the historical dates in the data
            frame for predictions.

        Returns
        -------
        pd.DataFrame that extends forward from the end of self.history for the
        requested number of periods.
        """
        if self.history_dates is None:
            raise Exception('Model has not been fit.')
        last_date = self.history_dates.max()
        dates = pd.date_range(
            start=last_date,
            periods=periods + 1,  # An extra in case we include start
            freq=freq)
        dates = dates[dates > last_date]  # Drop start if equals last_date
        dates = dates[:periods]  # Return correct number of periods

        if include_history:
            dates = np.concatenate((np.array(self.history_dates), dates))

        return pd.DataFrame({'ds': dates}) 
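
The date logic is worth seeing in isolation: one extra period is requested in case date_range starts exactly at last_date, then that start is dropped and the result trimmed. A standalone sketch:

import pandas as pd

last_date = pd.Timestamp("2020-01-31")
periods, freq = 3, "D"

dates = pd.date_range(start=last_date, periods=periods + 1, freq=freq)
dates = dates[dates > last_date][:periods]
print(dates)  # DatetimeIndex: 2020-02-01, 2020-02-02, 2020-02-03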
Example 30
Project: nistats   Author: nilearn   File: glm_reporter.py   License: BSD 3-Clause "New" or "Revised" License
def _dmtx_to_svg_url(design_matrices):
    """ Accepts a FirstLevelModel or SecondLevelModel object
    with fitted design matrices & generates SVG Image URL,
    which can be inserted into an HTML template.

    Parameters
    ----------
    design_matrices: List[pd.DataFrame]
        Design matrices computed in the model.

    Returns
    -------
    svg_url_design_matrices: String
        SVG image URL for the plotted design matrices.
    """
    html_design_matrices = []
    dmtx_template_path = os.path.join(HTML_TEMPLATE_ROOT_PATH,
                                      'design_matrix_template.html'
                                      )
    with open(dmtx_template_path) as html_template_obj:
        dmtx_template_text = html_template_obj.read()

    for dmtx_count, design_matrix in enumerate(design_matrices, start=1):
        dmtx_text_ = string.Template(dmtx_template_text)
        dmtx_plot = plot_design_matrix(design_matrix)
        dmtx_title = 'Session {}'.format(dmtx_count)
        plt.title(dmtx_title, y=0.987)
        dmtx_plot = _resize_plot_inches(dmtx_plot, height_change=.3)
        url_design_matrix_svg = plot_to_svg(dmtx_plot)
        # prevents sphinx-gallery & jupyter from scraping & inserting plots
        plt.close()
        dmtx_text_ = dmtx_text_.safe_substitute(
            {'design_matrix': url_design_matrix_svg,
             'dmtx_title': dmtx_title,
             }
        )
        html_design_matrices.append(dmtx_text_)
    svg_url_design_matrices = ''.join(html_design_matrices)
    return svg_url_design_matrices 
Example 31
Project: retentioneering-tools   Author: retentioneering   File: utils.py   License: Mozilla Public License 2.0
def get_adjacency(self, cols=None, edge_attributes='event_count', norm=True, **kwargs):
        """
        Creates the edge graph in matrix format. This method is similar to ``BaseTrajectory.retention.get_edgelist()`` but returns a matrix: row indices are ``event_col`` values from which the transition occurred, column names are ``event_col`` values to which the transition occurred, and the values are the weights of the edges defined with the ``edge_col``, ``edge_attributes`` and ``norm`` parameters.

        Parameters
        -------
        index_col: str, optional
            Name of custom index column, for more information refer to ``init_config``. For instance, if in config you have defined ``index_col`` as ``user_id``, but want to use function over sessions. By default the column defined in ``init_config`` will be used as ``index_col``.
        event_col: str, optional
            Name of custom event column, for more information refer to ``init_config``. For instance, you may want to aggregate some events or rename and use it as new event column. By default the column defined in ``init_config`` will be used as ``event_col``.
        edge_col: str, optional
            Aggregation column for edge weighting. For instance, you may set it to the same value as in ``index_col`` and define ``edge_attributes='users_unique'`` to calculate unique users passed through edge. Default: ``None``
        edge_attributes: str, optional
            Edge weighting function and the name of the resulting field are defined with this parameter. It is set with two parts separated by an underscore: ``[this_column_name]_[aggregation_function]``. The first part is the custom name of this field. The second part after ``_`` should be a valid ``pandas.groupby.agg()`` parameter, e.g. ``count``, ``sum``, ``nunique``, etc. Default: ``event_count``.
        cols: list, optional
            List of source and target columns, e.g. ``event_name`` and ``next_event``. ``next_event`` column is created automatically during ``BaseTrajectory.retention.prepare()`` method execution. Default: ``None`` which corresponds to ``event_col`` value from ``retention_config`` and 'next_event'.
        norm: bool, optional
            Normalize values over aggregation used in the second part of ``edge_attributes``. For example, if you set ``edge_col='user_id'`` and ``edge_attributes='users_nunique'``, then if ``norm=True``, edge values will be weighted by the unique number of users and will represent the percentage of unique users passed through a given edge. Default: ``True``.

        Returns
        -------
        DataFrame with the number of columns and rows equal to the number of unique ``event_col`` values.

        Return type
        -------
        pd.DataFrame
        """
        self._init_cols(locals())
        agg = self.get_edgelist(cols=cols, edge_attributes=edge_attributes, norm=norm, **kwargs)
        G = nx.DiGraph()
        G.add_weighted_edges_from(agg.values)
        return nx.to_pandas_adjacency(G).round(2) 
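
The final conversion is plain networkx; a minimal sketch with a made-up weighted edge list:

import networkx as nx

G = nx.DiGraph()
G.add_weighted_edges_from([("start", "cart", 0.6), ("cart", "pay", 0.4)])
print(nx.to_pandas_adjacency(G).round(2))  # square matrix indexed by event names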
Example 32
Project: retentioneering-tools   Author: retentioneering   File: utils.py   License: Mozilla Public License 2.0
def filter_cluster(self, cluster_name, index_col=None):
        """
        Filters dataset against one or several clusters.

        Parameters
        --------
        cluster_name: int or list
            Cluster ID or list of cluster IDs for filtering.
        index_col: str, optional
            Name of custom index column, for more information refer to ``init_config``. For instance, if in config you have defined ``index_col`` as ``user_id``, but want to use function over sessions. If ``None``, the column defined in ``init_config`` will be used as ``index_col``. Default: ``None``

        Returns
        --------
        Filtered dataset

        Return type
        --------
        pd.DataFrame
        """
        self._init_cols(locals())
        ids = []
        if type(cluster_name) is list:
            for i in cluster_name:
                ids.extend(self.cluster_mapping[i])
        else:
            ids = self.cluster_mapping[cluster_name]
        return self._obj[self._obj[self._index_col()].isin(ids)].copy().reset_index(drop=True) 
Example 33
Project: retentioneering-tools   Author: retentioneering   File: utils.py   License: Mozilla Public License 2.0
def select_bbox_from_tsne(self, bbox, plotting=True, **kwargs):
        """
        Selects data filtered by coordinates of the TSNE plot.

        Parameters
        ---------
        bbox: list
            List of lists that contains the corners of the bbox.
                ```bbox = [
                    [0, 10],  # [min x, max x]
                    [0, 10]   # [min y, max y]
                ]```
        plotting: bool, optional
            If ``True``, then visualize graph of selected users.

        Returns
        --------
        Dataframe with filtered clickstream of users in bbox.

        Return type
        -------
        pd.DataFrame
        """
        self._init_cols(locals())
        if not hasattr(self, '_tsne'):
            raise ValueError('Please, use `learn_tsne` before selection of specific bbox')

        f = self._tsne.index.values[(self._tsne.iloc[:, 0] >= bbox[0][0])
                                    & (self._tsne.iloc[:, 0] <= bbox[0][1])
                                    & (self._tsne.iloc[:, 1] >= bbox[1][0])
                                    & (self._tsne.iloc[:, 1] <= bbox[1][1])]

        filtered = self._obj[self._obj[self._index_col()].isin(f)]
        if plotting:
            filtered.retention.plot_graph(**kwargs)
        return filtered.reset_index(drop=True) 
Example 34
Project: pytorch-widedeep   Author: jrzaurin   File: dense_utils.py   License: MIT License
def label_encoder(
    df_inp: pd.DataFrame,
    cols: Optional[List[str]] = None,
    val_to_idx: Optional[Dict[str, Dict[str, int]]] = None,
):
    r"""
    Label-encode some features of a given dataset.

    Parameters:
    -----------
    df_inp: pd.DataFrame
        input dataframe
    cols: List, Optional
        columns to be label-encoded
    val_to_idx: Dict, Optional
        dictionary with the encodings

    Returns:
    --------
    df: pd.DataFrame
        df with Label-encoded features.
    val_to_idx: Dict
        Dictionary with the encoding information
    """

    df = df_inp.copy()
    if cols is None:
        cols = list(df.select_dtypes(include=["object"]).columns)

    if not val_to_idx:
        val_types = dict()
        for c in cols:  # type: ignore
            val_types[c] = df[c].unique()
        val_to_idx = dict()
        for k, v in val_types.items():
            val_to_idx[k] = {o: i for i, o in enumerate(v)}

    for k, v in val_to_idx.items():
        df[k] = df[k].apply(lambda x: v[x])

    return df, val_to_idx 
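
A usage sketch, assuming the function above is in scope (both object-dtype columns are encoded when cols is omitted):

import pandas as pd

df = pd.DataFrame({"color": ["red", "blue", "red"], "size": ["S", "M", "S"]})
encoded, mapping = label_encoder(df)
print(mapping["color"])            # {'red': 0, 'blue': 1}
print(encoded["color"].tolist())   # [0, 1, 0]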
Example 35
Project: pyiron   Author: pyiron   File: jobtable.py   License: BSD 3-Clause "New" or "Revised" License
def get_jobs(database, sql_query, user, project_path, recursive=True, columns=None):
    """
    Internal function to return the jobs as dictionary rather than a pandas.DataFrame

    Args:
        database (DatabaseAccess): Database object
        sql_query (str): SQL query to enter a more specific request
        user (str): username of the user whose user space should be searched
        project_path (str): root_path - this is in contrast to the project_path in GenericPath
        recursive (bool): search subprojects [True/False]
        columns (list): by default only the columns ['id', 'project'] are selected, but the user can select a subset
                        of ['id', 'status', 'chemicalformula', 'job', 'subjob', 'project', 'projectpath', 'timestart',
                        'timestop', 'totalcputime', 'computer', 'hamilton', 'hamversion', 'parentid', 'masterid']

    Returns:
        dict: columns are used as keys and point to a list of the corresponding values
    """
    if columns is None:
        columns = ["id", "project"]
    df = job_table(database, sql_query, user, project_path, recursive, columns=columns)
    if len(df) == 0:
        dictionary = {}
        for key in columns:
            dictionary[key] = list()
        return dictionary
        # return {key: list() for key in columns}
    dictionary = {}
    for key in df.keys():
        dictionary[key] = df[
            key
        ].tolist()  # ToDo: Check difference of tolist and to_list
    return dictionary
    # return {key: df[key].tolist() for key in df.keys()} 
Example 36
Project: pyiron   Author: pyiron   File: queuestatus.py   License: BSD 3-Clause "New" or "Revised" License
def queue_table(job_ids=[], project_only=True, full_table=False):
    """
    Display the queuing system table as pandas.DataFrame

    Args:
        job_ids (list): check for a specific list of job IDs - empty list by default
        project_only (bool): Query only for jobs within the current project - True by default
        full_table (bool): Whether to show the entire pandas table - False by default

    Returns:
        pandas.DataFrame: Output from the queuing system - optimized for the Sun grid engine
    """
    if project_only and not job_ids:
        return []
    if s.queue_adapter is not None:
        if full_table:
            pandas.set_option('display.max_rows', None)
            pandas.set_option('display.max_columns', None)
        df = s.queue_adapter.get_status_of_my_jobs()
        if not project_only:
            return df[
                [QUEUE_SCRIPT_PREFIX in job_name for job_name in df.jobname]
            ]
        else:
            job_name_lst = [QUEUE_SCRIPT_PREFIX + str(job_id) for job_id in job_ids]
            return df[
                [job_name in job_name_lst for job_name in df.jobname]
            ]
    else:
        return None 
Example 37
Project: pyiron   Author: pyiron   File: datamining.py   License: BSD 3-Clause "New" or "Revised" License
def __repr__(self):
        """
        Human readable string representation

        Returns:
            str: pandas DataFrame structure as string
        """
        return self._df.__repr__() 
Example 38
Project: pyiron   Author: pyiron   File: datamining.py   License: BSD 3-Clause "New" or "Revised" License
def get_dataframe(self):
        """

        Returns:
            pandas.DataFrame
        """
        return self.pyiron_table._df 
Example 39
Project: CityEnergyAnalyst   Author: architecture-building-systems   File: schemas.py   License: MIT License
def read(self, *args, **kwargs):
        """
        Open the file indicated by the locator method and return it as a DataFrame.
        args and kwargs are passed to the original (undecorated) locator method to figure out the location of the
        file.

        :param args:
        :param kwargs:
        :rtype: pd.DataFrame
        """
        raise AttributeError("{lm}: don't know how to read file_type {file_type}".format(
            lm=self.lm, file_type=self.schema["file_type"])) 
Example 40
Project: CityEnergyAnalyst   Author: architecture-building-systems   File: schemas.py   License: MIT License
def new(self):
        raise AttributeError("{lm}: don't know how to create a new Dataframe for file_type {file_type}".format(
            lm=self.lm, file_type=self.schema["file_type"]))