Python pandas.DataFrame() Examples

The following code examples show how to use pandas.DataFrame(). They are taken from open source Python projects.
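
Before the project examples, here is a minimal, self-contained sketch of the most common ways to call the constructor: from a dict of columns, from a list of records, and from a NumPy array. The column and index labels are arbitrary placeholders.

import numpy as np
import pandas as pd

# From a dict of equal-length columns
df_cols = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

# From a list of records (dicts); missing keys become NaN
df_rows = pd.DataFrame([{"a": 1, "b": 2}, {"a": 3}])

# From a 2-D NumPy array with explicit index and column labels
df_arr = pd.DataFrame(np.arange(6).reshape(3, 2),
                      index=["r0", "r1", "r2"],
                      columns=["x", "y"])

print(df_cols.shape, df_rows.shape, df_arr.shape)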

Example 1
Project: navitia_client   Author: leonardbinet   File: parser.py    MIT License
def write_all(self, directory):
        # Get results
        unnested = pd.DataFrame(self.unnested_items)  # df
        nested = self.nested_items  # dict
        # Write item csv
        unnested.to_csv(os.path.join(directory, self.item_name + ".csv"))
        # Write item json
        with open(os.path.join(directory, self.item_name + ".json"), 'w') as f:
            json.dump(nested, f, ensure_ascii=False)
        # Write links (of first page)
        with open(os.path.join(directory, "links.json"), 'w') as f:
            json.dump(self.links, f, ensure_ascii=False)
        # Write disruptions (if item different)
        if self.item_name != "disruptions":
            unnested_dis = pd.DataFrame(self.disruptions)  # df
            unnested_dis.to_csv(os.path.join(directory, "disruptions.csv"))
        # Write logs
        with open(os.path.join(directory, "parse_log.json"), 'w') as f:
            json.dump(self.log, f, ensure_ascii=False) 
Example 2
Project: PEAKachu   Author: tbischler   File: window.py    ISC License
def _plot_initial_windows(self, unsig_base_means, unsig_fcs,
                              sig_base_means, sig_fcs):
        # create plot folder if it does not exist
        plot_folder = "{}/plots".format(self._output_folder)
        if not exists(plot_folder):
            makedirs(plot_folder)
        # MA plot
        plt.plot(np.log10(unsig_base_means),
                 np.log2(unsig_fcs), ".",
                 markersize=2.0, alpha=0.3)
        plt.plot(np.log10(sig_base_means),
                 np.log2(sig_fcs), ".",
                 markersize=2.0, color="red", alpha=0.3)
        plt.axhline(y=np.median(np.log2(unsig_fcs.append(sig_fcs))))
        plt.axvline(x=np.median(np.log10(unsig_base_means.append(
                                         sig_base_means))))
        plt.title("Initial_windows_MA_plot")
        plt.xlabel("log10 base mean")
        plt.ylabel("log2 fold-change")
        plt.savefig("{}/Initial_windows_MA_plot.png".format(plot_folder),
                    dpi=600)
        plt.close()
        # HexBin plot
        df = pd.DataFrame({'log10 base mean': np.log10(unsig_base_means.append(
            sig_base_means)), 'log2 fold-change': np.log2(unsig_fcs.append(
                sig_fcs))})
        df.plot(kind='hexbin', x='log10 base mean',
                y='log2 fold-change', gridsize=50, bins='log')
        plt.axhline(y=np.median(np.log2(unsig_fcs.append(sig_fcs))))
        plt.axvline(x=np.median(np.log10(unsig_base_means.append(
                                         sig_base_means))))
        plt.title("Initial_windows_HexBin_plot")
        plt.savefig("{}/Initial_windows_HexBin_plot.pdf".format(plot_folder))
        plt.close() 
Example 3
Project: PEAKachu   Author: tbischler   File: window.py    ISC License
def _run_deseq2_peaks(self):
        peak_df = pd.DataFrame()
        for replicon in sorted(self._replicon_dict):
            if self._replicon_dict[replicon]["peak_df"].empty:
                continue
            peak_df = peak_df.append(self._replicon_dict[replicon]["peak_df"],
                                     ignore_index=True)
        count_df = peak_df.loc[:, self._exp_lib_list + self._ctr_lib_list]
        deseq2_runner = DESeq2Runner(count_df)
        result_df, self._size_factors = deseq2_runner.run_deseq2(
            self._exp_lib_list, self._ctr_lib_list, self._size_factors,
            self._pairwise_replicates)
        # normalize counts
        peak_df[self._lib_names_list] = peak_df[
            self._lib_names_list].div(self._size_factors, axis='columns')
        # append DESeq2 output
        peak_df = pd.concat([peak_df, result_df], axis=1)
        for replicon in sorted(self._replicon_dict):
            self._replicon_dict[replicon]["peak_df"] = peak_df[
                peak_df.replicon == replicon] 
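
Note that the DataFrame.append used in this example (and several below) was deprecated in pandas 1.4 and removed in pandas 2.0. A minimal sketch of the same accumulation with pd.concat, using hypothetical per-replicon frames in place of self._replicon_dict:

import pandas as pd

# Hypothetical stand-ins for self._replicon_dict[replicon]["peak_df"]
replicon_frames = {
    "chr1": pd.DataFrame({"peak_start": [10], "peak_end": [50]}),
    "chr2": pd.DataFrame({"peak_start": [200], "peak_end": [260]}),
}

# Collect the non-empty frames and concatenate once, instead of appending in a loop
parts = [df for df in replicon_frames.values() if not df.empty]
peak_df = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()
print(peak_df)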
Example 4
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License
def generate_peaks_from_blockbuster(self, min_cluster_expr_frac,
                                        min_block_overlap,
                                        min_max_block_expr_frac):
        for replicon in self._replicon_dict:
            self._replicon_dict[replicon]["peak_df"] = pd.DataFrame()
        cluster = {}
        for line in self._blockbuster_output.rstrip().split('\n'):
            if line.startswith('>'):
                if cluster:
                    self._call_cluster_peaks(cluster, min_cluster_expr_frac,
                                             min_block_overlap,
                                             min_max_block_expr_frac)
                    cluster = {}
                cluster["header"] = line
                cluster["blocks"] = []
            else:
                cluster["blocks"].append(line)
        if cluster:
            self._call_cluster_peaks(cluster, min_cluster_expr_frac,
                                     min_block_overlap,
                                     min_max_block_expr_frac) 
Example 5
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License
def calculate_peak_expression(self):
        self._generate_peak_counts()
        self._peak_df = pd.DataFrame()
        for replicon in sorted(self._replicon_dict):
            if self._replicon_dict[replicon]["peak_df"].empty:
                continue
            self._replicon_dict[replicon]["peak_df"]["replicon"] = replicon
            for lib_name, lib in self._lib_dict.items():
                self._replicon_dict[replicon][
                    "peak_df"][lib_name] = lib.replicon_dict[
                        replicon]["peak_counts"]
                del lib.replicon_dict[replicon]["peak_counts"]
            # add pseudocounts
            # self._replicon_dict[
            #    replicon]["peak_df"].loc[:, self._lib_names_list] += 1.0
            self._peak_df = self._peak_df.append(self._replicon_dict[replicon][
                "peak_df"], ignore_index=True) 
Example 6
Project: backtrader-cn   Author: pandalibin   File: utils.py    GNU General Public License v3.0
def write_daily_alert(cls, symbol, stock_id, action):
        """
        write daily stock alert to MongoDB.
        :param symbol: Arctic symbol
        :param data: dict, like: {'stock': '000651', 'action': 'buy/sell'}
        :return: None
        """

        lib = get_or_create_library(conf.DAILY_STOCK_ALERT_LIBNAME)

        data = {
            'stock': stock_id,
            'action': action
        }
        df = pd.DataFrame([data], columns=data.keys())
        if symbol in lib.list_symbols():
            lib.append(symbol, df)
        else:
            lib.write(symbol, df) 
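
The list wrapping in pd.DataFrame([data]) above matters: a dict of scalar values needs an explicit index, otherwise the constructor refuses to guess the number of rows. A small illustration with a made-up record:

import pandas as pd

record = {"stock": "000651", "action": "buy"}

# A list of dicts gives one row per dict
df1 = pd.DataFrame([record])

# A dict of scalars needs an explicit index to define the single row;
# pd.DataFrame(record) alone raises "If using all scalar values, you must pass an index"
df2 = pd.DataFrame(record, index=[0])

print(df1.equals(df2))  # True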
Example 7
Project: backtrader-cn   Author: pandalibin   File: test_datas_utils.py    GNU General Public License v3.0
def _test_strip_unused_cols(self):
        data = pd.DataFrame({
            'name': ['tom', 'jack'],
            'age': [24, 56],
            'gender': ['male', 'male'],
            'address': ['cn', 'us']
        })
        data.index = pd.date_range(start='2017-01-01', periods=2)

        origin_cols = ['name', 'age', 'gender', 'address']
        unused_cols = ['address', 'gender']
        new_cols = ['name', 'age']

        self.assertEqual(sorted(data.columns), sorted(origin_cols))

        bdu.Utils.strip_unused_cols(data, *unused_cols)

        self.assertEqual(sorted(data.columns), sorted(new_cols))
Example 8
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def write_absorption_data_to_txt(filename,dataframe):
    """ Write absorption data Sij to txt file.
    
        Args:
            filename (str): name of output file
          
            dataframe (DataFrame): pandas DataFrame
        
        Returns:
            None

    """
    f = open(filename,'w')
    for i in dataframe.index:
        for j in dataframe.columns:
            f.write("{0} {1} {2}\n".format(i,j,dataframe[j][i]))
    f.close() 
Example 9
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def write_concentration_data_to_txt(filename,dataframe):
    """ Write concentration data Cij to txt file.
    
        Args:
            filename (str): name of output file
          
            dataframe (DataFrame): pandas DataFrame
        
        Returns:
            None

    """
    f = open(filename,'w')
    for i in dataframe.index:
        for j in dataframe.columns:
            f.write("{0} {1} {2}\n".format(i,j,dataframe[j][i]))
    f.close() 
Example 10
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def generate_absorbance_data(wl_span,parameters_dict):
    """
    helper function to generate absorption data based on 
    lorentzian parameters
    """
    components = parameters_dict.keys()
    n_components = len(components)
    n_lambdas = len(wl_span)
    array = np.zeros((n_lambdas,n_components))
    for i,l in enumerate(wl_span):
        j = 0
        for k,p in six.iteritems(parameters_dict):
            alphas = p['alphas']
            betas  = p['betas']
            gammas = p['gammas']
            array[i,j] = absorbance(l,alphas,betas,gammas)
            j+=1

    data_frame = pd.DataFrame(data=array,
                              columns = components,
                              index=wl_span)
    return data_frame 
Example 11
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def add_noise_to_signal(signal, size):
    """
    Adds random, normally distributed noise to a clean signal. Used mostly in Kipet
    to add noise to absorbances or concentration profiles obtained from simulations.
    All values that are negative after the noise is added are set to zero.
    Args:
        signal (data): the Z or S matrix to have noise added to it
        size (scalar): sigma (or size of distribution)
    Returns:
        pandas dataframe
    """
    clean_sig = signal    
    noise = np.random.normal(0,size,clean_sig.shape)
    sig = clean_sig+noise    
    df= pd.DataFrame(data=sig)
    df[df<0]=0
    return df

#=============================================================================
#---------------------------PRE-PROCESSING TOOLS------------------------
#============================================================================= 
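
A hypothetical usage of the add_noise_to_signal function from Example 11, assuming it is in scope, just to show the shapes involved; the component names, time grid, and sigma are made up:

import numpy as np
import pandas as pd

# Hypothetical clean concentration profiles, indexed by time
clean = pd.DataFrame({"A": np.linspace(1.0, 0.1, 5),
                      "B": np.linspace(0.0, 0.9, 5)},
                     index=np.linspace(0, 10, 5))

noisy = add_noise_to_signal(clean, 0.05)  # 0.05 is the sigma of the added noise
print(noisy.shape)        # same shape as the input
print((noisy < 0).sum())  # no negative entries remain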
Example 12
Project: kipet   Author: salvadorgarciamunoz   File: TemplateBuilder.py    GNU General Public License v3.0
def add_absorption_data(self, data):
        """Add absorption data

        Args:
            data (DataFrame): DataFrame with wavelengths as
                              indices and mixture components as columns.

        Returns:
            None

        """
        if isinstance(data, pd.DataFrame):
            self._absorption_data = data
        else:
            raise RuntimeError('Spectral data format not supported. Try pandas.DataFrame')

    # For inclusion of discrete jumps 
Example 13
Project: pybench   Author: pentschev   File: utils.py    Apache License 2.0
def benchmark_json_to_pandas(path):
    """Convert the json "benchmarks" field of a pytest-benchmark json file into
    a pandas.DataFrame.

    Parameters
    ----------
    path: str
        path to json file

    Returns
    -------
    pandas.DataFrame
        A pandas DataFrame containing benchmarks extracted from a
        pytest-benchmark json file

    Example
    -------
    >>> benchmarks_df = benchmark_json_to_pandas(
    >>>     "/path/to/pytest_benchmark.json")
    """
    data = json.load(open(path))

    return pd.io.json.json_normalize(data=data["benchmarks"]) 
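
pd.io.json.json_normalize still works in older pandas, but since pandas 1.0 the same function is exposed at the top level as pd.json_normalize. A small sketch with inline records instead of a pytest-benchmark file:

import pandas as pd

records = [
    {"name": "bench_a", "stats": {"mean": 0.12, "stddev": 0.01}},
    {"name": "bench_b", "stats": {"mean": 0.34, "stddev": 0.02}},
]

# Nested dicts are flattened into dotted column names such as "stats.mean"
df = pd.json_normalize(records)
print(df.columns.tolist())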
Example 14
Project: pybench   Author: pentschev   File: benchmark_ml.py    Apache License 2.0
def load_mortgage(d):
    kwargs = {"nrows": d["shape"][0], "ncols": d["shape"][1], "cached": d["data"]}

    if "train_split" in d:
        kwargs["train_split"] = d["train_split"]
    if "label_col" in d:
        kwargs["label_col"] = d["label_col"]

    data = load_data(**kwargs)

    if d["module"] == "cuml":
        import cudf

        if isinstance(data, dict):
            for k, v in data.items():
                data[k] = cudf.DataFrame.from_pandas(v)

            data["y_train"] = cudf.Series(data["y_train"]["fea0"])
        else:
            data = cudf.DataFrame.from_pandas(data)

    return {"module": d["module"], "data": data} 
Example 15
Project: pandas-technical-indicators   Author: Crypto-toolbox   File: technical_indicators.py    MIT License
def average_true_range(df, n):
    """
    
    :param df: pandas.DataFrame
    :param n: 
    :return: pandas.DataFrame
    """
    i = 0
    TR_l = [0]
    while i < df.index[-1]:
        TR = max(df.loc[i + 1, 'High'], df.loc[i, 'Close']) - min(df.loc[i + 1, 'Low'], df.loc[i, 'Close'])
        TR_l.append(TR)
        i = i + 1
    TR_s = pd.Series(TR_l)
    ATR = pd.Series(TR_s.ewm(span=n, min_periods=n).mean(), name='ATR_' + str(n))
    df = df.join(ATR)
    return df 
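
average_true_range addresses rows by integer label (df.loc[i + 1, ...] while i runs up to df.index[-1]), so it assumes a default RangeIndex and 'High'/'Low'/'Close' columns. A hypothetical call, assuming the function above is in scope:

import pandas as pd

ohlc = pd.DataFrame({
    "High":  [11.0, 11.5, 12.0, 11.8, 12.3],
    "Low":   [10.2, 10.8, 11.1, 11.0, 11.6],
    "Close": [10.9, 11.4, 11.6, 11.5, 12.1],
})  # default RangeIndex 0..4

with_atr = average_true_range(ohlc, n=3)
print(with_atr["ATR_3"])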
Example 16
Project: pandas-technical-indicators   Author: Crypto-toolbox   File: technical_indicators.py    MIT License
def ppsr(df):
    """Calculate Pivot Points, Supports and Resistances for given data
    
    :param df: pandas.DataFrame
    :return: pandas.DataFrame
    """
    PP = pd.Series((df['High'] + df['Low'] + df['Close']) / 3)
    R1 = pd.Series(2 * PP - df['Low'])
    S1 = pd.Series(2 * PP - df['High'])
    R2 = pd.Series(PP + df['High'] - df['Low'])
    S2 = pd.Series(PP - df['High'] + df['Low'])
    R3 = pd.Series(df['High'] + 2 * (PP - df['Low']))
    S3 = pd.Series(df['Low'] - 2 * (df['High'] - PP))
    psr = {'PP': PP, 'R1': R1, 'S1': S1, 'R2': R2, 'S2': S2, 'R3': R3, 'S3': S3}
    PSR = pd.DataFrame(psr)
    df = df.join(PSR)
    return df 
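
In ppsr, pd.DataFrame(psr) is built from a dict of Series; the constructor aligns them on their index, which is also what lets the later df.join(PSR) line up row for row. A tiny standalone illustration:

import pandas as pd

s1 = pd.Series([1, 2, 3], index=["a", "b", "c"])
s2 = pd.Series([30, 10], index=["c", "a"])  # different order, missing "b"

df = pd.DataFrame({"s1": s1, "s2": s2})
print(df)
#    s1    s2
# a   1  10.0
# b   2   NaN
# c   3  30.0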
Example 17
Project: navitia_client   Author: leonardbinet   File: parser.py    MIT License
def get_unnested_items(self):
        df = pd.DataFrame(self.nested_items[self.item_name])
        flatten_dataframe(df, drop=True, max_depth=5)
        self.unnested_items = df.to_dict() 
Example 18
Project: navitia_client   Author: leonardbinet   File: parser.py    MIT License
def count_nbr_collected_items(self):
        unnested = pd.DataFrame(self.unnested_items)  # df
        self.nbr_collected_items = len(unnested.index) 
Example 19
Project: navitia_client   Author: leonardbinet   File: parser.py    MIT License
def parse_log(self):
        log = {}
        log["number_requests"] = len(self.results)
        log["number_parsed"] = len(self.parsed)
        log["keys"] = self.keys
        log["nbr_announced_items"] = self.nbr_expected_items
        log["nbr_collected_items"] = self.nbr_collected_items
        log["item_columns"] = list(pd.DataFrame(
            self.unnested_items).columns.values)
        self.log = log
        log["parsing_errors"] = self.parsing_errors 
Example 20
Project: PEAKachu   Author: tbischler   File: window.py    ISC License
def _convert_to_data_frame(self):
        self._window_df = pd.DataFrame()
        for replicon in sorted(self._replicon_dict):
            for strand in ["+", "-"]:
                # add window positions to data frame
                row_number = len(self._replicon_dict[replicon]["window_list"])
                df = pd.concat([
                    pd.Series([replicon] * row_number),
                    pd.Series([strand] * row_number),
                    pd.Series([window[0]+1 for window in
                               self._replicon_dict[
                                   replicon]["window_list"]]),
                    pd.Series([window[1] for window in
                               self._replicon_dict[
                        replicon]["window_list"]])], axis=1)
                df.columns = ["replicon", "strand", "w_start", "w_end"]
                # add library counts to data frame
                for lib_name, lib in self._lib_dict.items():
                    df[lib_name] = (pd.Series(lib.replicon_dict[
                        replicon]["window_counts"].loc[:, strand]))
                self._window_df = self._window_df.append(df,
                                                         ignore_index=True)
            del self._replicon_dict[replicon]["window_list"]
        # remove windows without expression in any library
        print("Removing empty windows from DataFrame with {} rows...".format(
            len(self._window_df.index)), flush=True)
        t_start = time()
        self._window_df = self._window_df.loc[
            (self._window_df.loc[:, self._lib_names_list].sum(axis=1) > 0), :]
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(self._window_df.index)), flush=True)
        if self._window_df.empty:
            print("**Dataframe empty**", flush=True)
            return
        if self._stat_test == "gtest":
            self._run_gtest_preprocessing()
        elif self._stat_test == "deseq":
            self._run_deseq_preprocessing() 
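
In _convert_to_data_frame the four Series are concatenated first and the columns renamed afterwards; passing keys= to pd.concat does both in one step. A small sketch with dummy values:

import pandas as pd

df = pd.concat([pd.Series(["chr1", "chr1"]),
                pd.Series(["+", "+"]),
                pd.Series([101, 201]),
                pd.Series([150, 250])],
               axis=1, keys=["replicon", "strand", "w_start", "w_end"])
print(df)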
Example 21
Project: PEAKachu   Author: tbischler   File: window.py    ISC License
def _prefilter_windows_deseq(self, df):
        print("Removing windows where not all experiment libs show "
              "expression from DataFrame with {} rows...".format(len(df)),
              flush=True)
        t_start = time()
        for exp_lib in self._exp_lib_list:
            exp_lib_zero_count = 0.0
            df = df.loc[(df.loc[:, exp_lib] > exp_lib_zero_count), :]
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        if df.empty:
            return df
        initial_window_df = df.copy()
        # normalize counts on initial windows
        initial_window_df[self._lib_names_list] = initial_window_df[
            self._lib_names_list].div(self._size_factors, axis='columns')
        # minimum expression cutoff based on mean over experiment libraries
        print("Removing windows based on mad cutoff from DataFrame "
              "with {} rows...".format(len(df)), flush=True)
        t_start = time()
        median_abs_dev_from_zero = mad(initial_window_df.loc[
            :, self._exp_lib_list].mean(axis=1), center=0.0)
        min_expr = (self._mad_multiplier * median_abs_dev_from_zero)
        print("Minimal window expression based on mean over RIP/CLIP "
              "libraries: {} (MAD from zero: {})".format(
                  min_expr, median_abs_dev_from_zero), flush=True)
        df = df.loc[initial_window_df.loc[:, self._exp_lib_list].mean(
            axis=1) >= min_expr, :]
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        return df 
Example 22
Project: PEAKachu   Author: tbischler   File: window.py    ISC License
def _combine_windows(self, df):
        peak_list = []
        peak = {"peak_start": None, "peak_end": None}
        for index, window in df.iterrows():
            # significant window
            if window.loc["significant"]:
                # start new peak region if no peak region was started before
                if peak["peak_start"] is None:
                    peak["peak_start"] = window.loc["w_start"]
                    peak["peak_end"] = window.loc["w_end"]
                # start new peak region while still inside previous peak region
                #   *this is due to gaps in the windows caused by pre-filtering
                #   *add previous peak region to output
                # +1 -> combine adjacent peaks
                elif window.loc["w_start"] > peak["peak_end"] + 1:
                    peak_list.append(deepcopy(peak))
                    peak["peak_start"] = window.loc["w_start"]
                    peak["peak_end"] = window.loc["w_end"]
                # elongate peak if window overlaps
                else:
                    peak["peak_end"] = window.loc["w_end"]
            # non-significant window
            else:
                # jump to next window if outside of peak region
                # or current position upstream of peak end
                # +1 -> combine adjacent peaks
                if (peak["peak_start"] is None or window.loc[
                        "w_start"] <= peak["peak_end"] + 1):
                    continue
                # otherwise end peak region and append to output list
                peak_list.append(deepcopy(peak))
                peak["peak_start"] = None
                peak["peak_end"] = None
        # append peak if last window in data frame was significant
        if peak["peak_start"] is not None:
            peak_list.append(deepcopy(peak))
        peak_df = pd.DataFrame(peak_list, columns=["peak_start", "peak_end"])
        return peak_df 
Example 23
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License
def _call_cluster_peaks(self, cluster, min_cluster_expr_frac,
                            min_block_overlap, min_max_block_expr_frac):
        cluster_entries = cluster["header"].strip().split('\t')
        cluster_expr = float(cluster_entries[5])
        cluster_strand = cluster_entries[4]
        cluster_replicon = cluster_entries[1]
        peak_df = pd.DataFrame()

        if len(cluster["blocks"]) == 1:
            block_entries = cluster["blocks"][0].strip().split('\t')
            peak_start = int(block_entries[2]) + 1
            peak_end = int(block_entries[3])
            peak_df = peak_df.append(pd.Series([peak_start, peak_end], index=[
                "peak_start", "peak_end"]), ignore_index=True)
        else:
            blocks = [block.strip().split('\t') for block in cluster["blocks"]]
            block_df = pd.DataFrame(
                blocks, columns=["blockNb", "blockChrom", "blockStart",
                                 "blockEnd", "blockStrand", "blockExpression",
                                 "readCount"])
            block_df[["blockNb", "blockStart", "blockEnd", "blockExpression",
                      "readCount"]] = block_df[
                    ["blockNb", "blockStart", "blockEnd", "blockExpression",
                     "readCount"]].apply(pd.to_numeric)
            peak_df = self._split_cluster_peaks(block_df, cluster_expr,
                                                peak_df, min_cluster_expr_frac,
                                                min_block_overlap,
                                                min_max_block_expr_frac)
        if peak_df.empty:
            return
        peak_df = peak_df.astype(np.int64)
        peak_df["peak_strand"] = cluster_strand
        self._replicon_dict[cluster_replicon]["peak_df"] = self._replicon_dict[
            cluster_replicon]["peak_df"].append(peak_df, ignore_index=True) 
Example 24
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License
def _filter_peaks(self, df):
        # calculate mad for original data frame
        median_abs_dev_from_zero = mad(df.loc[:, self._exp_lib_list].mean(
            axis=1), center=0.0)
        # padj filter
        print("Removing peaks based on padj from DataFrame with {} rows...".
              format(len(df)), flush=True)
        t_start = time()
        df = df.query('padj < @self._padj_threshold')
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        if df.empty:
            return df
        # minimum expression cutoff based on mean over experiment libraries
        print("Removing peaks based on mad cutoff from DataFrame "
              "with {} rows...".format(len(df)), flush=True)
        t_start = time()
        min_expr = (self._mad_multiplier * median_abs_dev_from_zero)
        print("Minimal peak expression based on mean over RIP/CLIP "
              "libraries:" "{} (MAD from zero: {})".format(
                  min_expr, median_abs_dev_from_zero), flush=True)
        df = df.loc[df.loc[:, self._exp_lib_list].mean(axis=1) >= min_expr, :]
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        if df.empty:
            return df
        # minimum fold change
        print("Removing peaks based on minimum fold change "
              "from DataFrame with {} rows...".format(len(df)), flush=True)
        t_start = time()
        log2_fc_cutoff = np.log2(self._fc_cutoff)
        df = df.query('log2FoldChange >= @log2_fc_cutoff')
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        return df 
Example 25
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License
def _filter_peaks_without_replicates(self, df):
        # calculate mad for original data frame
        median_abs_dev_from_zero = mad(df.loc[:, self._exp_lib_list].mean(
            axis=1), center=0.0)
        # minimum expression cutoff based on mean over experiment libraries
        print("Removing peaks based on mad cutoff from DataFrame "
              "with {} rows...".format(len(df)), flush=True)
        t_start = time()
        min_expr = (self._mad_multiplier * median_abs_dev_from_zero)
        print("Minimal peak expression based on mean over RIP/CLIP "
              "libraries:" "{} (MAD from zero: {})".format(
                  min_expr, median_abs_dev_from_zero), flush=True)
        df = df.loc[df.loc[:, self._exp_lib_list].mean(axis=1) >= min_expr, :]
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        if df.empty:
            return df
        # minimum fold change
        print("Removing windows based on minimum fold change from DataFrame "
              "with {} rows...".format(len(df)), flush=True)
        t_start = time()
        df = df.query('fold_change >= @self._fc_cutoff')
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        return df 
Example 26
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License
def _plot_initial_peaks(self, unsig_base_means, unsig_fcs,
                            sig_base_means, sig_fcs):
        # create plot folder if it does not exist
        plot_folder = "{}/plots".format(self._output_folder)
        if not exists(plot_folder):
            makedirs(plot_folder)
        # MA plot
        plt.plot(np.log10(unsig_base_means),
                 np.log2(unsig_fcs), ".",
                 markersize=2.0, alpha=0.3)
        plt.plot(np.log10(sig_base_means),
                 np.log2(sig_fcs), ".",
                 markersize=2.0, color="red", alpha=0.3)
        plt.axhline(y=np.median(np.log2(unsig_fcs.append(sig_fcs))))
        plt.axvline(x=np.median(np.log10(unsig_base_means.append(
                                         sig_base_means))))
        plt.title("Initial_peaks_MA_plot")
        plt.xlabel("log10 base mean")
        plt.ylabel("log2 fold-change")
        plt.savefig("{}/Initial_peaks_MA_plot.png".format(plot_folder),
                    dpi=600)
        plt.close()
        # HexBin plot
        df = pd.DataFrame({'log10 base mean': np.log10(unsig_base_means.append(
            sig_base_means)), 'log2 fold-change': np.log2(unsig_fcs.append(
                sig_fcs))})
        df.plot(kind='hexbin', x='log10 base mean',
                y='log2 fold-change', gridsize=50, bins='log')
        plt.axhline(y=np.median(np.log2(unsig_fcs.append(sig_fcs))))
        plt.axvline(x=np.median(np.log10(unsig_base_means.append(
                                         sig_base_means))))
        plt.title("Initial_peaks_HexBin_plot")
        plt.savefig("{}/Initial_peaks_HexBin_plot.pdf".format(plot_folder))
        plt.close() 
Example 27
Project: PEAKachu   Author: tbischler   File: gtest.py    ISC License
def __init__(self, ctr_rep_counts, tagged_rep_counts,
                 pairwise_replicates=False):
        self._rep_df = pd.DataFrame({"ctr_counts": ctr_rep_counts,
                                     "tagged_counts": tagged_rep_counts})
        self._pairwise_replicates = pairwise_replicates
        self._pooled_g_res = {}
        self._total_g_res = {}
        self._heterogenous_g_res = {}
        self._single_g_res = {} 
Example 28
Project: PEAKachu   Author: tbischler   File: library.py    ISC License
def count_reads_for_windows(self):
        read_counter = ReadCounter(self.paired_end, self.max_insert_size,
                                   self.bam_file)
        for replicon in self.replicon_dict:
            self.replicon_dict[replicon]['window_counts'] = pd.DataFrame()
            for strand in ['+', '-']:
                window_counts = read_counter.count_reads_for_windows(
                            replicon,
                            strand,
                            self.replicon_dict[replicon]["window_list"])
                self.replicon_dict[replicon]['window_counts'][
                    strand] = window_counts
        read_counter.close_bam() 
Example 29
Project: PEAKachu   Author: tbischler   File: consensus_peak.py    ISC License
def plot_consensus_peak(self):
        # create plot folder if it does not exist
        plot_folder = "{}/plots".format(self._project_folder)
        if not exists(plot_folder):
            makedirs(plot_folder)
        self._store_peaks()
        comb_cons_value_dict = self._get_peak_coverage()
        df = pd.DataFrame(comb_cons_value_dict, columns=sorted(
            comb_cons_value_dict))
        ax = df.plot(title="Consensus peak per library")
        ax.set_xlabel("Nucleotide position")
        ax.set_ylabel("Relative expression")
        plt.savefig("{}/plots/consensus_peaks.pdf".format(
            self._project_folder)) 
Example 30
Project: incubator-spot   Author: apache   File: dns_oa.py    Apache License 2.0
def _ingest_summary(self):
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        self._logger.info("Getting ingest summary data for the day")
        
        ingest_summary_cols = ["date","total"]		
        result_rows = []        
        df_filtered =  pd.DataFrame()

        query_to_load = ("""
            SELECT frame_time, COUNT(*) as total FROM {0}.{1}
            WHERE y={2} AND m={3} AND d={4} AND unix_tstamp IS NOT NULL
            AND frame_time IS NOT NULL AND frame_len IS NOT NULL
            AND dns_qry_name IS NOT NULL AND ip_src IS NOT NULL
            AND (dns_qry_class IS NOT NULL AND dns_qry_type IS NOT NULL
            AND dns_qry_rcode IS NOT NULL ) GROUP BY frame_time;
        """).format(self._db,self._table_name, yr, mn, dy)

        results = impala.execute_query_as_list(query_to_load)
        df = pd.DataFrame(results)

        # Forms a new dataframe splitting the minutes from the time column
        df_new = pd.DataFrame([["{0}-{1}-{2} {3}:{4}".format(yr, mn, dy,\
            val['frame_time'].replace("  "," ").split(" ")[3].split(":")[0].zfill(2),\
            val['frame_time'].replace("  "," ").split(" ")[3].split(":")[1].zfill(2)),\
            int(val['total']) if not math.isnan(val['total']) else 0 ] for key,val in df.iterrows()],columns = ingest_summary_cols)

        #Groups the data by minute
        sf = df_new.groupby(by=['date'])['total'].sum()
        df_per_min = pd.DataFrame({'date':sf.index, 'total':sf.values})

        df_final = df_filtered.append(df_per_min, ignore_index=True).to_records(False,False)

        if len(df_final) > 0:
            query_to_insert=("""
                INSERT INTO {0}.dns_ingest_summary PARTITION (y={1}, m={2}, d={3}) VALUES {4};
            """).format(self._db, yr, mn, dy, tuple(df_final))
            impala.execute_query(query_to_insert) 
Example 31
Project: rhodonite   Author: nestauk   File: tabular.py    MIT License
def vertices_to_dataframe(g, vectors=False, keys=None):
    """vertices_to_dataframe
    Transforms a graph's vertices and their properties into a tabular format.

    Parameters
    ----------
        g : :obj:`Graph`
            A graph.
        keys : :obj:`iter` of :obj:`str`, optional
            A list of property map keys to convert in to columns. If None, all 
            property maps are converted. Default is None.
    Returns
    -------
        vertex_df : :obj:`DataFrame` A dataframe where each row represents a 
            vertex and the columns are properties of those vertices. By default, 
            the dataframe will contain a column with the vertex id.
    """
    vertex_df = pd.DataFrame(list(g.vertices()), columns=['v'], dtype='int')
    filt = g.get_vertex_filter()
    if filt[0] is not None:
        filt = filt[0].a > 0
        filtered = True
    else:
        filtered=False
        filt = np.array([True] * g.num_vertices())
    if keys is None:
        keys = list(g.vp.keys())
    for k, vp in g.vp.items():
        if k in keys:
            vt = vp.value_type()
            if ('vector' not in vt) & ('string' not in vt) & ('object' not in vt):
                if ('int' in vt) | ('bool' in vt):
                    vertex_df[k] = vp.get_array()[filt]
                    vertex_df[k] = vertex_df[k].astype(int)
                elif 'double' in vt:
                    vertex_df[k] = vp.get_array()[filt]
                    vertex_df[k] = vertex_df[k].astype(float)
            elif ('vector' in vt) & ('string' not in vt) & (vectors == True):
                vertex_df[k] = [[i for i in vp[v]] for v, f in zip(g.vertices(), filt) if f]
    return vertex_df 
Example 32
Project: rhodonite   Author: nestauk   File: tabular.py    MIT License
def edges_to_dataframe(g, keys=None, drop_keys=[], sort=None, vectors=False):
    """edges_to_dataframe
    Transforms a graph's edges and their properties into a tabular format.

    Parameters
    ----------
        g : :obj:`Graph` 
            A graph.
        keys : :obj:`iter` of :obj:`str`, optional
            A list of property map keys to convert in
            to columns. If None, all property maps are converted. Default is
            None.
    Returns
    -------
        edge_df : :obj:`DataFrame` 
            A dataframe where each row represents an edge and the columns are 
            properties of those edges. By default, the dataframe will contain 
            a column for the source vertices and another for the target 
            vertices.
    """
    edge_df = pd.DataFrame(list(g.edges()), columns=['s', 't'], dtype='int')
    edge_df = pd.DataFrame(g.get_edges(), columns=['s', 't', 'e_index'], dtype='int')
    indices = edge_df['e_index'].values
    if keys is None:
        keys = list(g.ep.keys())
    for k, ep in g.ep.items():
        if k in keys:
            vt = ep.value_type()
            if ('vector' not in vt) & ('string' not in vt) & ('object' not in vt):
                if ('int' in vt) | ('bool' in vt):
                    edge_df[k] = ep.get_array()[indices]
                    edge_df[k] = edge_df[k].astype(int)
                elif 'double' in vt:
                    edge_df[k] = ep.get_array()[indices]
                    edge_df[k] = edge_df[k].astype(float)
            elif ('vector' in vt) & ('string' not in vt) & (vectors == True):
                edge_df[k] = [[i for i in ep[e]] for e in g.edges()]
    if sort:
        edge_df.sort_values(sort, inplace=True)
    return edge_df 
Example 33
Project: backtrader-cn   Author: pandalibin   File: utils.py    GNU General Public License v3.0
def split_data(cls, data, percent=0.3):
        """
        Split the data into training data and test data.
        :param data(DataFrame): data to be split.
        :param percent(float): percent of data used as training data.
        :return: training data(DataFrame) and testing data(DataFrame)
        """

        rows = len(data)
        train_rows = math.floor(rows * percent)
        test_rows = rows - train_rows

        return data.iloc[:train_rows], data.iloc[-test_rows:] 
Example 34
Project: backtrader-cn   Author: pandalibin   File: utils.py    GNU General Public License v3.0
def get_best_params(cls, al_results):
        """
        Get the best params, current algorithm is the largest total return rate.
        :param al_results(list): all the optional params and corresponding analysis data.
        :return: best params and corresponding analysis data(dict)
        """
        al_results_df = pd.DataFrame.from_dict(al_results)
        al_results_df = al_results_df.sort_values('total_return_rate', ascending=False)

        al_result_dict = al_results_df.iloc[0].to_dict()

        return al_result_dict 
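
pd.DataFrame.from_dict is mainly interesting when the input is a dict and you need to choose whether its keys become rows or columns (the orient argument). A brief illustration with made-up analysis results keyed by run name:

import pandas as pd

al_results = {
    "run_1": {"total_return_rate": 0.12, "max_drawdown": 0.05},
    "run_2": {"total_return_rate": 0.31, "max_drawdown": 0.09},
}

# orient="index": outer keys become the row index, inner keys become columns
df = pd.DataFrame.from_dict(al_results, orient="index")
best = df.sort_values("total_return_rate", ascending=False).iloc[0].to_dict()
print(best)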
Example 35
Project: backtrader-cn   Author: pandalibin   File: models.py    GNU General Public License v3.0
def save_training_params(symbol, params):
    """
    save training params to library.
    :param symbol: str, arctic symbol
    :param params: dict, e.g.: {"ma_period_s": 1, "ma_period_l": 2, "stock_id": "600909"}
    :return: None
    """

    stock_id = params.ma_periods['stock_id']
    params_to_save = dict(params=params)
    df = pd.DataFrame([params_to_save], columns=params_to_save.keys(), index=[stock_id])

    # write to database
    # if library does not exist, create it
    lib = get_or_create_library(conf.STRATEGY_PARAMS_LIBNAME)

    if lib.has_symbol(symbol):
        logger.debug(
            f'symbol: {symbol} already exists, '
            f'change the params of stock {stock_id}, '
            f'then delete and write symbol: {symbol}.'
        )
        params_df = lib.read(symbol).data
        params_df.loc[stock_id, 'params'] = params
        lib.delete(symbol)
        lib.write(symbol, params_df)
    else:
        logger.debug(
            f'write the params of stock {stock_id} to symbol: {symbol}'
        )
        lib.write(symbol, df) 
Example 36
Project: backtrader-cn   Author: pandalibin   File: test_strategies_ma.py    GNU General Public License v3.0
def _test_get_params_list(self):
        training_data = pd.DataFrame(np.random.rand(100, 2))
        params_list = bsm.MATrendStrategy.get_params_list(training_data, '000651')

        self.assertEqual(len(params_list), 29) 
Example 37
Project: backtrader-cn   Author: pandalibin   File: test_datas_tushare.py    GNU General Public License v3.0
def _test_download_delta_data_initial_no_data(self, mock_get_hist_data):
        mock_hist_data = pd.DataFrame()

        mock_get_hist_data.return_value = mock_hist_data

        coll_name = '000651'
        ts_his_data = bdt.TsHisData(coll_name)

        lib = models.get_library(conf.CN_STOCK_LIBNAME)
        lib.delete(coll_name)

        ts_his_data.download_delta_data()

        self.assertFalse(lib.has_symbol(coll_name)) 
Example 38
Project: backtrader-cn   Author: pandalibin   File: test_datas_tushare.py    GNU General Public License v3.0
def _test_download_delta_data_initial(self, mock_get_hist_data):
        mock_hist_data = pd.DataFrame(data={
            'open': [10, 11],
            'high': [12, 13],
            'close': [14, 15],
            'low': [16, 17],
            'volume': [18, 19],
            'price_change': [20, 21],
            'p_change': [22, 23],
            'ma5': [24, 25],
            'ma10': [26, 27],
            'ma20': [28, 29],
            'v_ma5': [30, 31],
            'v_ma10': [32, 33],
            'v_ma20': [34, 35],
            'turnover': [36, 37]
        }, index=['2017-01-01', '2017-01-02'])

        mock_get_hist_data.return_value = mock_hist_data

        coll_name = '000651'
        ts_his_data = bdt.TsHisData(coll_name)

        lib = models.get_library(conf.CN_STOCK_LIBNAME)
        lib.delete(coll_name)

        ts_his_data.download_delta_data()

        hist_data_000651 = ts_his_data.get_data()

        self.assertEqual(len(hist_data_000651), 2) 
Example 39
Project: backtrader-cn   Author: pandalibin   File: test_datas_tushare.py    GNU General Public License v3.0
def _test_download_delta_data_no_data(self, mock_get_hist_data):
        coll_name = '000651'
        ts_his_data = bdt.TsHisData(coll_name)

        mock_delta_data = pd.DataFrame()
        mock_get_hist_data.return_value = mock_delta_data

        ts_his_data.download_delta_data()

        hist_data_000651 = ts_his_data.get_data()

        self.assertEqual(len(hist_data_000651), 2) 
Example 40
Project: backtrader-cn   Author: pandalibin   File: test_datas_tushare.py    GNU General Public License v3.0
def _test_download_delta_data(self, mock_get_hist_data):
        coll_name = '000651'
        ts_his_data = bdt.TsHisData(coll_name)

        yesterday = dt.datetime.now() - dt.timedelta(days=1)
        mock_delta_data = pd.DataFrame(data={
            'open': 38,
            'high': 39,
            'close': 40,
            'low': 41,
            'volume': 42,
            'price_change': 43,
            'p_change': 44,
            'ma5': 45,
            'ma10': 46,
            'ma20': 47,
            'v_ma5': 48,
            'v_ma10': 49,
            'v_ma20': 50,
            'turnover': 51
        }, index=[dt.datetime.strftime(yesterday, '%Y-%m-%d')])

        mock_get_hist_data.return_value = mock_delta_data

        ts_his_data.download_delta_data()

        hist_data_000651 = ts_his_data.get_data()

        self.assertEqual(len(hist_data_000651), 3)
        self.assertEqual(dt.datetime.strftime(hist_data_000651.index[-1], '%Y-%m-%d'),
                         dt.datetime.strftime(yesterday, '%Y-%m-%d'))
        lib = models.get_library(conf.CN_STOCK_LIBNAME)
        lib.delete(coll_name) 
Example 41
Project: osqf2015   Author: mvaz   File: model.py    MIT License
def from_blaze(cls, filename, date_col='Date', value_col='Close'):
        df = bz.odo(filename, pd.DataFrame)[[date_col, value_col]] #[1000:1100]
        df = df.rename(columns = {value_col: 'Value'})
        ts = df.set_index(date_col)
        return cls(ts) 
Example 42
Project: osqf2015   Author: mvaz   File: model.py    MIT License
def __init__(self):
        super(StockModel, self).__init__()
        file_name = "notebooks/db2.bcolz"
        self.df = bz.odo(file_name, pd.DataFrame)[['Date', 'Close']] #[1000:1100]
        self.devol()
        self.returns_df = None 
Example 43
Project: kipet   Author: salvadorgarciamunoz   File: validate_templatebuilder.py    GNU General Public License v3.0
def test_add_spectral_data(self):

        builder = TemplateBuilder()
        d_frame = pd.DataFrame()
        self.assertFalse(builder.has_spectral_data())
        builder.add_spectral_data(d_frame)
        self.assertTrue(builder.has_spectral_data()) 
Example 44
Project: kipet   Author: salvadorgarciamunoz   File: validate_templatebuilder.py    GNU General Public License v3.0
def test_add_absorption_data(self):
        builder = TemplateBuilder()
        s_frame = pd.DataFrame()
        self.assertFalse(builder.has_adsorption_data())
        builder.add_absorption_data(s_frame)
        self.assertTrue(builder.has_adsorption_data()) 
Example 45
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def write_spectral_data_to_csv(filename,dataframe):
    """ Write spectral data Dij to csv file.
    
        Args:
            filename (str): name of output file
          
            dataframe (DataFrame): pandas DataFrame
        
        Returns:
            None

    """
    dataframe.to_csv(filename) 
Example 46
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def write_absorption_data_to_csv(filename,dataframe):
    """ Write absorption data Sij to csv file.
    
        Args:
            filename (str): name of output file
          
            dataframe (DataFrame): pandas DataFrame
        
        Returns:
            None

    """
    dataframe.to_csv(filename) 
Example 47
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def write_concentration_data_to_csv(filename,dataframe):
    """ Write concentration data Cij to csv file.
    
        Args:
            filename (str): name of output file
          
            dataframe (DataFrame): pandas DataFrame
        
        Returns:
            None

    """
    dataframe.to_csv(filename) 
Example 48
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def read_concentration_data_from_txt(filename):
    """ Reads txt with concentration data
    
        Args:
            filename (str): name of input file
          
        Returns:
            DataFrame

    """

    f = open(filename,'r')
    data_dict = dict()
    set_index = set()
    set_columns = set()

    for line in f:
        if line not in ['','\n','\t','\t\n']:
            l=line.split()
            i = float(l[0])
            j = l[1]
            k = float(l[2])
            set_index.add(i)
            set_columns.add(j)
            data_dict[i,j] = k
    f.close()
    
    data_array = np.zeros((len(set_index),len(set_columns)))
    sorted_index = sorted(set_index)
    sorted_columns = set_columns

    for i,idx in enumerate(sorted_index):
        for j,jdx in enumerate(sorted_columns):
            data_array[i,j] = data_dict[idx,jdx]

    return pd.DataFrame(data=data_array,columns=sorted_columns,index=sorted_index) 
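
An alternative (not the KIPET implementation) for the same whitespace-separated time/component/value triples is to let pandas parse the file and pivot it. A sketch, assuming a hypothetical file 'concentrations.txt' whose rows look like '0.0 A 1.25':

import pandas as pd

# Each row of the file: <time> <component> <value>
long_df = pd.read_csv("concentrations.txt", sep=r"\s+", header=None,
                      names=["time", "component", "value"])

# Pivot into one row per time point and one column per component
wide_df = long_df.pivot(index="time", columns="component", values="value")
print(wide_df.head())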
Example 49
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def read_spectral_data_from_csv(filename, instrument = False, negatives_to_zero = False):
    """ Reads csv with spectral data
    
        Args:
            filename (str): name of input file
            instrument (bool): if data is direct from instrument
            negatives_to_zero (bool): if data contains negatives and baseline shift is not
                                        done then this forces negative values to zero.

        Returns:
            DataFrame

    """

    data = pd.read_csv(filename,index_col=0)
    if instrument:
        #this means we probably have a date/timestamp on the columns
        data = pd.read_csv(filename,index_col=0, parse_dates = True)
        data = data.T
        for n in data.index:
            h,m,s = n.split(':')
            sec = (float(h)*60+float(m))*60+float(s)
            data.rename(index={n:sec}, inplace=True)
        data.index = [float(n) for n in data.index]
    else:
        data.columns = [float(n) for n in data.columns]

    #If we have negative values then this makes them equal to zero
    if negatives_to_zero:
        for t in (data.index):
            for l in data.columns:
                if data.loc[t,l] < 0:
                    data.loc[t,l] = 0.0

    return data 
Example 50
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def read_absorption_data_from_csv(filename):
    """ Reads csv with spectral data
    
        Args:
            filename (str): name of input file
          
        Returns:
            DataFrame

    """
    data = pd.read_csv(filename,index_col=0)
    return data 
Example 51
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def read_spectral_data_from_txt(filename):
    """ Reads txt with spectral data
    
        Args:
            filename (str): name of input file
          
        Returns:
            DataFrame

    """

    f = open(filename,'r')
    data_dict = dict()
    set_index = set()
    set_columns = set()

    for line in f:
        if line not in ['','\n','\t','\t\n']:
            l=line.split()
            i = float(l[0])
            j = float(l[1])
            k = float(l[2])
            set_index.add(i)
            set_columns.add(j)
            data_dict[i,j] = k
    f.close()
    
    data_array = np.zeros((len(set_index),len(set_columns)))
    sorted_index = sorted(set_index)
    sorted_columns = sorted(set_columns)

    for i,idx in enumerate(sorted_index):
        for j,jdx in enumerate(sorted_columns):
            data_array[i,j] = data_dict[idx,jdx]

    return pd.DataFrame(data=data_array,columns=sorted_columns,index=sorted_index) 
Example 52
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def read_absorption_data_from_txt(filename):
    """ Reads txt with absorption data
    
        Args:
            filename (str): name of input file
          
        Returns:
            DataFrame

    """

    f = open(filename,'r')
    data_dict = dict()
    set_index = set()
    set_columns = set()

    for line in f:
        if line not in ['','\n','\t','\t\n']:
            l=line.split()
            i = float(l[0])
            j = l[1]
            k = float(l[2])
            set_index.add(i)
            set_columns.add(j)
            data_dict[i,j] = k
    f.close()
    
    data_array = np.zeros((len(set_index),len(set_columns)))
    sorted_index = sorted(set_index)
    sorted_columns = set_columns

    for i,idx in enumerate(sorted_index):
        for j,jdx in enumerate(sorted_columns):
            data_array[i,j] = data_dict[idx,jdx]

    return pd.DataFrame(data=data_array,columns=sorted_columns,index=sorted_index) 
Example 53
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def plot_spectral_data(dataFrame,dimension='2D'):
    """ Plots spectral data
    
        Args:
            dataFrame (DataFrame): spectral data
          
        Returns:
            None

    """
    if dimension=='3D':
        lambdas = dataFrame.columns
        times = dataFrame.index
        D = np.array(dataFrame)
        L, T = np.meshgrid(lambdas, times)
        fig = plt.figure()
        #ax = fig.add_subplot(111, projection='3d')
        #ax.plot_wireframe(L, T, D, rstride=10, cstride=10)
        ax = fig.gca(projection='3d')
        ax.plot_surface(L, T, D, rstride=10, cstride=10, alpha=0.2)
        #cset = ax.contour(L, T, D, zdir='z',offset=-10)
        cset = ax.contour(L, T, D, zdir='x',offset=-20,cmap='coolwarm')
        cset = ax.contour(L, T, D, zdir='y',offset=times[-1]*1.1,cmap='coolwarm')
        
        ax.set_xlabel('Wavelength')
        ax.set_xlim(lambdas[0]-20, lambdas[-1])
        ax.set_ylabel('time')
        ax.set_ylim(0, times[-1]*1.1)
        ax.set_zlabel('Spectra')
        #ax.set_zlim(-10, )


    else:
        plt.figure()
        plt.plot(dataFrame)

#=============================================================================
#--------------------------- DIAGNOSTIC TOOLS ------------------------
#============================================================================= 
Example 54
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def decrease_wavelengths(original_dataset, A_set = 2, specific_subset = None):
    '''
    Takes in the original, full dataset and removes specific wavelengths, or only keeps every
    multiple of A_set. Returns a new, smaller dataset that should be easier to solve.
    
    Args:
        original_dataset (DataFrame):   the data to be processed
        A_set (float, optional):  optional user-provided multiple of wavelengths to keep. i.e. if
                                    3, every third value is kept. Default is 2.
        specific_subset (list or dict, optional): If the user already knows which wavelengths they would like to
                                    remove, then a list containing these can be included.
        
    Returns:
        DataFrame with the smaller dataset
    
    '''
    if specific_subset != None:
        if not isinstance(specific_subset, (list, dict)):
            raise RuntimeError("subset must be of type list or dict!")
             
        if isinstance(specific_subset, dict):
            lists1 = sorted(specific_subset.items())
            x1, y1 = zip(*lists1)
            specific_subset = list(x1)
            
        new_D = pd.DataFrame(np.nan, index=original_dataset.index, columns=specific_subset)
        for t in original_dataset.index:
            for l in original_dataset.columns.values:
                if l in specific_subset:
                    new_D.at[t, l] = original_dataset.at[t, l]
    else:
        # keep every A_set-th wavelength column
        new_D = original_dataset[original_dataset.columns[::A_set]]
    return new_D 
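
For the else branch above, positional slicing gives the same result in one step. A minimal sketch with a hypothetical spectra DataFrame whose columns are wavelengths:

import numpy as np
import pandas as pd

# Hypothetical spectra: 5 time points x 10 wavelength columns
D = pd.DataFrame(np.random.rand(5, 10),
                 columns=np.linspace(1600, 1690, 10))

A_set = 2
smaller = D.iloc[:, ::A_set]         # keep every 2nd wavelength column
print(D.shape, "->", smaller.shape)  # (5, 10) -> (5, 5)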
Example 55
Project: kipet   Author: salvadorgarciamunoz   File: TemplateBuilder.py    GNU General Public License v3.0
def add_smoothparameter(self, *args, **kwds): #added for smoothingparameter (CS)
        """Add a kinetic parameter(s) to the model.

        Note:
            Plan to change this method add parameters as PYOMO variables

            This method tries to mimic a template implementation. Depending
            on the argument type it will behave differently

        Args:
            param1 (boolean): Mutable value. Creates a variable mutable

        Returns:
            None

        """
        # bounds = kwds.pop('bounds', None)
        # init = kwds.pop('init', None)
        mutable = kwds.pop('mutable', False)

        if len(args) == 2:
            first = args[0]
            second = args[1]
            if isinstance(first, six.string_types):
                self._smoothparameters[first] = second
                if mutable is not False and isinstance(second, pd.DataFrame):
                    self._smoothparameters_mutable[first] = mutable  # added for mutable parameters CS
            else:
                raise RuntimeError('Parameter argument not supported. Try pandas.DataFrame and mutable=True')
        else:
            raise RuntimeError('Parameter argument not supported. Try pandas.DataFrame and mutable=True')
Example 56
Project: kipet   Author: salvadorgarciamunoz   File: TemplateBuilder.py    GNU General Public License v3.0
def add_huplc_data(self, data): #added for the inclusion of h/uplc data CS
        """Add HPLC or UPLC data

                Args:
                    data (DataFrame): DataFrame with measurement times as
                                      indices and wavelengths as columns.

                Returns:
                    None
        """
        if isinstance(data, pd.DataFrame):
            dfDhat = pd.DataFrame(index=self._feed_times, columns=data.columns)
            for t in self._feed_times:
                if t not in data.index:  # for points that are the same in original meas times and feed times
                    dfDhat.loc[t] = [0.0 for n in range(len(data.columns))]
            dfallDhat = data.append(dfDhat)
            dfallDhat.sort_index(inplace=True)
            dfallDhat.index = dfallDhat.index.to_series().apply(
                lambda x: np.round(x, 6))  # time from data rounded to 6 digits
            ##############Filter out NaN###############
            count = 0
            for j in dfallDhat.index:
                if count >= 1 and count < len(dfallDhat.index):
                    if dfallDhat.index[count] == dfallDhat.index[count - 1]:
                        dfallDhat = dfallDhat.dropna()
                count += 1
            ###########################################
            self._huplc_data = dfallDhat
        else:
            raise RuntimeError('HUPLC data format not supported. Try pandas.DataFrame')
        Dhat = np.array(dfallDhat)
        for t in range(len(dfallDhat.index)):
            for l in range(len(dfallDhat.columns)):
                if Dhat[t, l] >= 0:
                    pass
                else:
                    self._is_Dhat_deriv = True
        if self._is_Dhat_deriv == True:
            print(
                "Warning! Since Dhat-matrix contains negative values Kipet is assuming a derivative of C has been inputted") 
Example 57
Project: kipet   Author: salvadorgarciamunoz   File: TemplateBuilder.py    GNU General Public License v3.0 5 votes vote down vote up
def add_smoothparam_data(self, data): #added for mutable smoothing parameter option CS
        """Add HPLC or UPLC data

                Args:
                    data (DataFrame): DataFrame with measurement times as
                                      indices and wavelengths as columns.

                Returns:
                    None
        """
        if isinstance(data, pd.DataFrame):
            dfPs = pd.DataFrame(index=self._feed_times, columns=data.columns)
            for t in self._feed_times:
                if t not in data.index:  # for points that are the same in original meas times and feed times
                    dfPs.loc[t] = [0.0 for n in range(len(data.columns))]
            dfallPs = data.append(dfPs)
            dfallPs.sort_index(inplace=True)
            dfallPs.index = dfallPs.index.to_series().apply(
                lambda x: np.round(x, 6))  # time from data rounded to 6 digits
            ##############Filter out NaN###############
            count = 0
            for j in dfallPs.index:
                if count >= 1 and count < len(dfallPs.index):
                    if dfallPs.index[count] == dfallPs.index[count - 1]:
                        dfallPs = dfallPs.dropna()
                count += 1
            ###########################################
            self._smoothparam_data = dfallPs
        else:
            raise RuntimeError('Smooth parameter data format not supported. Try pandas.DataFrame')
        Ps = np.array(dfallPs) 
Example 58
Project: prediction-constrained-topic-models   Author: dtak   File: select_best_runs_and_snapshots.py    MIT License 5 votes vote down vote up
def load_df_from_all_folders_matching_list_of_patterns(
        list_of_path_patterns=None,
        legend_name=None,
        y_ind=0,
        column_names=None,
        query_str=None,
        task_ids=None,
        **kwargs):
    pprint(">>> BEGIN load_df_from_all_folders_that_match_pattern")
    list_of_match_df = list()
    for path_pattern in list_of_path_patterns:
        cur_alg_df = load_df_from_all_folders_that_match_pattern(
            path_pattern,
            y_ind=y_ind,
            task_ids=task_ids,
            column_names=column_names)
        if query_str is not None:
            cur_alg_df = cur_alg_df.query(query_str).copy()

        # Append to list of all matching dataframes
        list_of_match_df.append(cur_alg_df)



    # Create all matching DataFrame
    all_matching_runs_df = pd.concat(list_of_match_df)
    pprint("<<< END   load_df_from_all_folders_that_match_pattern")
    return all_matching_runs_df


######################
## Funcs that select best df 
Example 59
Project: prediction-constrained-topic-models   Author: dtak   File: select_best_runs_and_snapshots.py    MIT License 5 votes vote down vote up
def load_df_from_all_folders_that_match_pattern(
        src_path_pattern='',
        task_ids='1',
        when_task_path_does_not_exist='continue',
        when_split_csv_does_not_exist='raise_error',
        y_ind=0,
        column_names=None,
        output_data_type='binary',
        engine=None,
        csv_pattern='snapshot_perf_metrics_%s.csv'):
    ''' Load results from many folders that match a pattern into data frame.

    Aggregates results from many pipelines.

    Returns
    -------
    df : pandas DataFrame
    '''
    src_path_list = sorted(glob.glob(src_path_pattern))
    if len(src_path_list) == 0:
        raise ValueError("ERROR: No snapshot csv files for provided pattern:%s" % src_path_pattern)
    mega_df = None
    df_list = list()
    column_names = load_default_column_name_dict(
        output_data_type=output_data_type)
    for src_path in src_path_list:
        df = load_df_from_training_results_folder(
            src_path=src_path, 
            task_ids=task_ids,
            when_task_path_does_not_exist=when_task_path_does_not_exist,
            when_split_csv_does_not_exist=when_split_csv_does_not_exist,
            column_names=column_names,
            engine=engine,
            csv_pattern=csv_pattern,
            y_ind=y_ind)
        df_list.append(df)
    mega_df = pd.concat(df_list)
    return mega_df 
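Stripped of the project-specific helpers (task subfolders, column-name dictionaries), the two loaders above boil down to a glob-then-concat aggregation. A hedged sketch of that core, with an illustrative path pattern that is assumed to match some per-run CSV files:

import glob
import pandas as pd

# Illustrative pattern; any per-run CSV files matching it are stacked into one frame
src_path_pattern = "results/run_*/snapshot_perf_metrics_test.csv"
csv_paths = sorted(glob.glob(src_path_pattern))
if not csv_paths:
    raise ValueError("No CSV files found for pattern: %s" % src_path_pattern)
df_list = [pd.read_csv(p) for p in csv_paths]
mega_df = pd.concat(df_list, ignore_index=True)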
Example 60
Project: pybench   Author: pentschev   File: utils.py    Apache License 2.0 5 votes vote down vote up
def filter_by_string_in_column(df, column, value):
    """Filter pandas DataFrame by value, where value is a subsequence of the
    of the string contained in a column.

    Parameters
    ----------
    df: pandas.DataFrame
        A pandas DataFrame containing data from pytest-benchmark.
    column: str
        Column name where to check for value.
    value: str
        String to be checked if is part of column's content.

    Returns
    -------
    pandas.DataFrame
        A pandas DataFrame containing only rows for which value is contained in
        column content.

    Example
    -------
    >>> numpy_df = filter_by_string_in_column(df, 'name', 'numpy')
    """
    return df.loc[df[column].str.contains(value)] 
Example 61
Project: pybench   Author: pentschev   File: utils.py    Apache License 2.0 5 votes vote down vote up
def split_params_list(df, params_name, columns=None):
    """Split a parameter list into multiple columns.

    Parameters
    ----------
    df: pandas.DataFrame
        A pandas DataFrame containing data from pytest-benchmark.
    params_name: str
        The name of the pytest parameter column to split.
    columns: list of str
        If specified, the resulting columns will have the names in this list,
        otherwise resulting columns will have the element number as a suffix to
        the value of ``params_name``, such as ``params_name.0``,
        ``params_name.1``, and so on.

    Returns
    -------
    pandas.DataFrame
        A pandas DataFrame containing additional columns for the parameters
        split.

    Examples
    --------
    >>> split_params_list(df, "params.shape")
    >>> split_params_list(df,
    >>>     "params.shape",
    >>>     ["params.shape.0", "params.shape.1"])
    >>> split_params_list(df, "params.shape", ["shape0", "shape1"])
    """
    lst = df[params_name].to_list()
    lst = [[l] if not isinstance(l, list) else l for l in lst]
    ncols = max([len(l) for l in lst])
    if columns is None:
        columns = [params_name + "." + str(i) for i in range(ncols)]
    split_param = pd.DataFrame(lst, columns=columns)
    return df.join(split_param, how="outer") 
Example 62
Project: pandas-technical-indicators   Author: Crypto-toolbox   File: technical_indicators.py    MIT License 5 votes vote down vote up
def moving_average(df, n):
    """Calculate the moving average for the given data.
    
    :param df: pandas.DataFrame
    :param n: window size in periods
    :return: pandas.DataFrame
    """
    MA = pd.Series(df['Close'].rolling(n, min_periods=n).mean(), name='MA_' + str(n))
    df = df.join(MA)
    return df 
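The join-a-named-Series pattern above repeats throughout this indicator module; a standalone illustration of the same rolling-mean computation on a made-up Close series:

import pandas as pd

df = pd.DataFrame({'Close': [10.0, 11.0, 12.0, 13.0, 14.0, 15.0]})
n = 3
MA = pd.Series(df['Close'].rolling(n, min_periods=n).mean(), name='MA_' + str(n))
df = df.join(MA)
print(df)  # MA_3 is NaN for the first two rows, then 11.0, 12.0, 13.0, 14.0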
Example 63
Project: pandas-technical-indicators   Author: Crypto-toolbox   File: technical_indicators.py    MIT License 5 votes vote down vote up
def exponential_moving_average(df, n):
    """
    
    :param df: pandas.DataFrame
    :param n: 
    :return: pandas.DataFrame
    """
    EMA = pd.Series(df['Close'].ewm(span=n, min_periods=n).mean(), name='EMA_' + str(n))
    df = df.join(EMA)
    return df 
Example 64
Project: pandas-technical-indicators   Author: Crypto-toolbox   File: technical_indicators.py    MIT License 5 votes vote down vote up
def momentum(df, n):
    """
    
    :param df: pandas.DataFrame 
    :param n: 
    :return: pandas.DataFrame
    """
    M = pd.Series(df['Close'].diff(n), name='Momentum_' + str(n))
    df = df.join(M)
    return df 
Example 65
Project: pandas-technical-indicators   Author: Crypto-toolbox   File: technical_indicators.py    MIT License 5 votes vote down vote up
def rate_of_change(df, n):
    """
    
    :param df: pandas.DataFrame
    :param n: 
    :return: pandas.DataFrame
    """
    M = df['Close'].diff(n - 1)
    N = df['Close'].shift(n - 1)
    ROC = pd.Series(M / N, name='ROC_' + str(n))
    df = df.join(ROC)
    return df 
Example 66
Project: pandas-technical-indicators   Author: Crypto-toolbox   File: technical_indicators.py    MIT License 5 votes vote down vote up
def stochastic_oscillator_k(df):
    """Calculate stochastic oscillator %K for given data.
    
    :param df: pandas.DataFrame
    :return: pandas.DataFrame
    """
    SOk = pd.Series((df['Close'] - df['Low']) / (df['High'] - df['Low']), name='SO%k')
    df = df.join(SOk)
    return df 
Example 67
Project: pandas-technical-indicators   Author: Crypto-toolbox   File: technical_indicators.py    MIT License 5 votes vote down vote up
def stochastic_oscillator_d(df, n):
    """Calculate stochastic oscillator %D for given data.
    :param df: pandas.DataFrame
    :param n: span used to smooth %K into %D
    :return: pandas.DataFrame
    """
    SOk = pd.Series((df['Close'] - df['Low']) / (df['High'] - df['Low']), name='SO%k')
    SOd = pd.Series(SOk.ewm(span=n, min_periods=n).mean(), name='SO%d_' + str(n))
    df = df.join(SOd)
    return df 
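A self-contained illustration of the two lines above on a toy OHLC frame (all values made up): %K is computed first and then smoothed into %D with an exponentially weighted mean:

import pandas as pd

df = pd.DataFrame({'High': [12.0, 13.0, 14.0, 15.0],
                   'Low': [9.0, 10.0, 11.0, 12.0],
                   'Close': [10.0, 12.0, 13.0, 14.0]})
n = 2
SOk = pd.Series((df['Close'] - df['Low']) / (df['High'] - df['Low']), name='SO%k')
SOd = pd.Series(SOk.ewm(span=n, min_periods=n).mean(), name='SO%d_' + str(n))
print(df.join(SOk).join(SOd))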
Example 68
Project: incubator-spot   Author: apache   File: flow_oa.py    Apache License 2.0 4 votes vote down vote up
def _ingest_summary(self):
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        self._logger.info("Getting ingest summary data for the day")
        
        ingest_summary_cols = ["date","total"]		
        result_rows = []        
        df_filtered =  pd.DataFrame()

        # get ingest summary.

        query_to_load=("""
                SELECT tryear, trmonth, trday, trhour, trminute, COUNT(*) as total
                FROM {0}.{1} WHERE y={2} AND m={3} AND d={4}
                AND unix_tstamp IS NOT NULL
                AND sip IS NOT NULL
                AND sport IS NOT NULL
                AND dip IS NOT NULL
                AND dport IS NOT NULL
                AND ibyt IS NOT NULL
                AND ipkt IS NOT NULL
                AND tryear={2}
                AND cast(treceived as timestamp) IS NOT NULL
                GROUP BY tryear, trmonth, trday, trhour, trminute;
        """).format(self._db,self._table_name, yr, mn, dy)
        
        results = impala.execute_query(query_to_load) 
 
        if results:
            df_results = as_pandas(results) 
            
            #Forms a new dataframe splitting the minutes from the time column
            df_new = pd.DataFrame([["{0}-{1}-{2} {3}:{4}".format(val['tryear'],val['trmonth'],val['trday'], val['trhour'], val['trminute']), int(val['total']) if not math.isnan(val['total']) else 0 ] for key,val in df_results.iterrows()],columns = ingest_summary_cols)
            value_string = ''
            #Groups the data by minute 

            sf = df_new.groupby(by=['date'])['total'].sum()
            df_per_min = pd.DataFrame({'date':sf.index, 'total':sf.values})
            
            df_final = df_filtered.append(df_per_min, ignore_index=True).to_records(False,False) 
            if len(df_final) > 0:
                query_to_insert=("""
                    INSERT INTO {0}.flow_ingest_summary PARTITION (y={1}, m={2}, d={3}) VALUES {4};
                """).format(self._db, yr, mn, dy, tuple(df_final))

                impala.execute_query(query_to_insert)
                
        else:
            self._logger.info("No data found for the ingest summary") 
Example 69
Project: incubator-spot   Author: apache   File: proxy_oa.py    Apache License 2.0 4 votes vote down vote up
def _ingest_summary(self): 
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        self._logger.info("Getting ingest summary data for the day")
        
        ingest_summary_cols = ["date","total"]		
        result_rows = []        
        df_filtered =  pd.DataFrame()

        # get ingest summary.

        query_to_load=("""
                SELECT p_date, p_time, COUNT(*) as total
                FROM {0}.{1} WHERE y='{2}' AND m='{3}' AND d='{4}'
                AND p_date IS NOT NULL AND p_time IS NOT NULL
                AND clientip IS NOT NULL AND p_time != ''
                AND host IS NOT NULL AND fulluri IS NOT NULL
                GROUP BY p_date, p_time;
        """).format(self._db,self._table_name, yr, mn, dy)
        
        results = impala.execute_query(query_to_load) 
 
        if results:
            df_results = as_pandas(results)
            #Forms a new dataframe splitting the minutes from the time column/
            df_new = pd.DataFrame([["{0} {1}:{2}".format(val['p_date'], val['p_time'].split(":")[0].zfill(2), val['p_time'].split(":")[1].zfill(2)), int(val['total']) if not math.isnan(val['total']) else 0 ] for key,val in df_results.iterrows()],columns = ingest_summary_cols)
            value_string = ''
            #Groups the data by minute 
            sf = df_new.groupby(by=['date'])['total'].sum()
            df_per_min = pd.DataFrame({'date':sf.index, 'total':sf.values})
            
            df_final = df_filtered.append(df_per_min, ignore_index=True).to_records(False,False) 
            if len(df_final) > 0:
                query_to_insert=("""
                    INSERT INTO {0}.proxy_ingest_summary PARTITION (y={1}, m={2}, d={3}) VALUES {4};
                """).format(self._db, yr, mn, dy, tuple(df_final))

                impala.execute_query(query_to_insert) 
                
        else:
            self._logger.info("No data found for the ingest summary") 
Example 70
Project: Wide-Residual-Nets-for-SETI   Author: sgrvinod   File: test.py    Apache License 2.0 4 votes vote down vote up
def test(test_loader, model):
    """
    Perform testing.
    """

    model.eval()  # eval mode

    all_probs = []
    all_uuids = []

    batch_time = AverageMeter()  # forward prop. time this batch

    start = time.time()

    softmax = torch.nn.Softmax()  # need this, since there is no longer a loss layer

    for i, (input, uuids) in enumerate(test_loader):

        softmax.zero_grad()

        # Store UUIDs associated with this batch, in the right order
        uuids = list(uuids.numpy().ravel())
        all_uuids.extend(uuids)

        input_var = torch.autograd.Variable(input, volatile=True).cuda()

        output = model(input_var)
        probs = softmax(output)
        all_probs.append(probs.data)

        batch_time.update(time.time() - start)
        start = time.time()

        if i % args.print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'.format(i, len(test_loader),
                                                                                    batch_time=batch_time))
    all_probs = torch.cat(all_probs).cpu()  # concatenate probs from all batches, move to CPU
    all_uuids = [uuid_index_mapping[i] for i in all_uuids]  # convert UUID indices to UUIDs

    # Create dataframe and store as CSV
    df1 = pd.DataFrame({'UUIDs': pd.Series(all_uuids)})
    df2 = pd.DataFrame(all_probs.numpy())
    df = pd.concat([df1, df2], axis=1)
    csv_path = './TESTRESULTS__' + args.checkpoint.split('/')[-1] + '__' + args.h5data.split('/')[-1] + '.csv'
    df.to_csv(csv_path, header=False, index=False)
    print("\nSaved results to {0}\n".format(csv_path)) 
Example 71
Project: Wide-Residual-Nets-for-SETI   Author: sgrvinod   File: test_cpu.py    Apache License 2.0 4 votes vote down vote up
def test(test_loader, model):
    """
    Perform testing.
    """

    print('Perform testing')

    model.eval()  # eval mode

    all_probs = []
    all_uuids = []

    batch_time = AverageMeter()  # forward prop. time this batch

    start = time.time()

    softmax = torch.nn.Softmax()  # need this, since there is no longer a loss layer

    for i, (input, uuids) in enumerate(test_loader):

        softmax.zero_grad()

        # Store UUIDs associated with this batch, in the right order
        uuids = list(uuids.numpy().ravel())
        all_uuids.extend(uuids)

        input_var = torch.autograd.Variable(input, volatile=True).cpu()

        output = model(input_var)
        probs = softmax(output)
        
        all_probs.append(probs.data)

        batch_time.update(time.time() - start)
        start = time.time()

        if i % args.print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'.format(i, len(test_loader),
                                                                                    batch_time=batch_time))
    all_probs = torch.cat(all_probs).cpu()  # concatenate probs from all batches, move to CPU
    all_uuids = [uuid_index_mapping[i] for i in all_uuids]  # convert UUID indices to UUIDs

    # Create dataframe and store as CSV
    df1 = pd.DataFrame({'UUIDs': pd.Series(all_uuids)})
    df2 = pd.DataFrame(all_probs.numpy())
    df = pd.concat([df1, df2], axis=1)
    csv_path = './TESTRESULTS__' + args.checkpoint.split('/')[-1] + '__' + args.h5data.split('/')[-1] + '.csv'
    df.to_csv(csv_path, header=False, index=False)
    print("\nSaved results to {0}\n".format(csv_path)) 
Example 72
Project: kipet   Author: salvadorgarciamunoz   File: MultipleExperimentsEstimator.py    GNU General Public License v3.0 4 votes vote down vote up
def __init__(self,datasets):
        super(MultipleExperimentsEstimator, self).__init__()
        self.block_models = dict()
        self._idx_to_variable = dict()
        
        if datasets is not None:
            if isinstance(datasets, dict):
                for key, val in datasets.items():
                    if not isinstance(key, str):
                        raise RuntimeError('The key for the dictionary must be a str')
                    if not isinstance(val, pd.DataFrame):
                        raise RuntimeError('The value in the dictionary must be the experimental dataset as a pandas DataFrame')

        else:
            raise RuntimeError("datasets not given, add datasets as a dict to use this class")
            
        self.datasets = datasets
        self.experiments = list()
        for key,val in self.datasets.items():
            self.experiments.append(key)
        
        self._variance_solved = False
        #added for new initialization options (CS):
        self._sim_solved = False
        self.sim_results = dict()
        self.cloneopt_model = dict()
        # self.clonecloneopt_model = dict()
        
        self.variances = dict()
        self.variance_results = dict()
        self.start_time = dict()
        self.end_time = dict()
        self.builder = dict()
        self.opt_model = dict()
        
        self.initialization_model = dict()
        self._sublist_components = dict()

        self._n_meas_times = 0
        self._n_meas_lambdas = 0
        self._n_actual = 0
        self._n_params = 0
        
        self._spectra_given = True
        self._concentration_given = False
        
        self.global_params = None
        
        # set of flags to mark how many times and wavelengths are in each dataset
        self.l_mark = dict()
        self.t_mark = dict()
        self.n_mark = dict()
        self.p_mark = dict() 
Example 73
Project: kipet   Author: salvadorgarciamunoz   File: ResultsObject.py    GNU General Public License v3.0 4 votes vote down vote up
def load_from_pyomo_model(self,instance,to_load=[]):

        model_variables = set()
        for block in instance.block_data_objects():
            block_map = block.component_map(Var)
            for name in six.iterkeys(block_map):
                model_variables.add(name)
                
        user_variables = set(to_load)

        if user_variables:
            variables_to_load = user_variables.intersection(model_variables)
        else:
            variables_to_load = model_variables

        diff = user_variables.difference(model_variables)
        if diff:
            print("WARNING: The following variables are not part of the model:")
            print(diff) 
        
        for block in instance.block_data_objects():
            block_map = block.component_map(Var)
            for name in variables_to_load:
                v = block_map[name]
                if v.dim()==0:
                    setattr(self,name,v.value)
                elif v.dim()==1:
                    setattr(self,name,pd.Series(v.get_values()))
                elif v.dim()==2:
                    d = v.get_values()
                    keys = d.keys()
                    if keys:
                        split_keys = v._implicit_subsets
                        # split_keys = zip(*keys)
                        # print(split_keys)
                        first_set = set(split_keys[0])
                        second_set = set(split_keys[1])
                        s_first_set = sorted(first_set)
                        s_second_set = sorted(second_set)
                        m = len(first_set)
                        n = len(second_set)

                        v_values = np.zeros((m,n))
                        for i,w in enumerate(s_first_set):
                            for j,k in enumerate(s_second_set):
                                v_values[i,j] = d[w,k]

                        data_frame = pd.DataFrame(data=v_values,
                                                  columns = s_second_set,
                                                  index=s_first_set)
                    else:
                        data_frame = pd.DataFrame(data=[],
                                                  columns = [],
                                                  index=[])
                    setattr(self,name,data_frame)        
                else:
                    raise RuntimeError('load_from_pyomo_model function not supported for models with variables with dimension>2') 
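The dim() == 2 branch above reshapes a dict keyed by (row, column) tuples into a DataFrame. A standalone sketch of that reshaping; the label sets here are derived from the dict keys rather than from the Pyomo variable's _implicit_subsets:

import numpy as np
import pandas as pd

# Hypothetical values of a 2-D indexed variable, keyed by (time, component)
d = {(0.0, 'A'): 1.0, (0.0, 'B'): 2.0,
     (1.0, 'A'): 3.0, (1.0, 'B'): 4.0}

s_first_set = sorted({k[0] for k in d})   # row labels, e.g. times
s_second_set = sorted({k[1] for k in d})  # column labels, e.g. components

v_values = np.zeros((len(s_first_set), len(s_second_set)))
for i, w in enumerate(s_first_set):
    for j, k in enumerate(s_second_set):
        v_values[i, j] = d[w, k]

data_frame = pd.DataFrame(data=v_values, columns=s_second_set, index=s_first_set)
print(data_frame)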
Example 74
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0 4 votes vote down vote up
def basic_pca(dataFrame,n=None,with_plots=False):
    """ Runs basic component analysis based on SVD
    
        Args:
            dataFrame (DataFrame): spectral data
            
            n (int): number of largest singular-values
            to plot
            
            with_plots (bool): if True, the diagnostic plots are shown (can be disabled for testing)

        Returns:
            None

    """
            
    times = np.array(dataFrame.index)
    lambdas = np.array(dataFrame.columns)
    D = np.array(dataFrame)
    #print("D shape: ", D.shape)
    U, s, V = np.linalg.svd(D, full_matrices=True)
    #print("U shape: ", U.shape)
    #print("s shape: ", s.shape)
    #print("V shape: ", V.shape)
    #print("sigma/singular values", s)
    if n is None:
        print("WARNING: since no number of components is specified, all components are used")
        print("It is advised to specify the number of components via n")
        n_shape = s.shape
        n = n_shape[0]
        
    u_shape = U.shape
    #print("u_shape[0]",u_shape[0])
    n_l_vector = n if u_shape[0]>=n else u_shape[0]
    n_singular = n if len(s)>=n else len(s)
    idxs = range(n_singular)
    vals = [s[i] for i in idxs]
    v_shape = V.shape
    n_r_vector = n if v_shape[0]>=n else v_shape[0]
    
    if with_plots:
        for i in range(n_l_vector):
            plt.plot(times,U[:,i])
        plt.xlabel("time")
        plt.ylabel("Components U[:,i]")
        plt.show()
        
        plt.semilogy(idxs,vals,'o')
        plt.xlabel("i")
        plt.ylabel("singular values")
        plt.show()
        
        for i in range(n_r_vector):
            plt.plot(lambdas,V[i,:])
        plt.xlabel("wavelength")
        plt.ylabel("Components V[i,:]")
        plt.show() 
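Stripped of the plotting, the core of basic_pca is a single SVD call; a tiny runnable sketch on synthetic spectral data:

import numpy as np
import pandas as pd

# Synthetic "spectra": rows are times, columns are wavelengths (values are random)
rng = np.random.default_rng(0)
dataFrame = pd.DataFrame(rng.random((6, 4)),
                         index=np.arange(6, dtype=float),
                         columns=[1800, 1810, 1820, 1830])

D = np.array(dataFrame)
U, s, V = np.linalg.svd(D, full_matrices=True)
print("singular values:", s)             # one value per component, largest first
print("left vectors shape:", U.shape)    # (n_times, n_times)
print("right vectors shape:", V.shape)   # (n_wavelengths, n_wavelengths)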
Example 75
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0 4 votes vote down vote up
def perform_data_analysis(dataFrame, pseudo_equiv_matrix, rank_data):  
    """ Runs the analysis by Chen, et al, 2018, based upon the pseudo-equivalency
    matrix. User provides the data and the pseudo-equivalency matrix and the analysis
    provides suggested number of absorbing components as well as whether there are
    likely to be unwanted spectral contributions.
    
        Args:
            dataFrame (DataFrame): spectral data
            
            pseudo_equiv_matrix (list of lists): list containing the rows of the pseudo-equivalency
                                matrix.
            
            rank_data (int): rank of the data matrix, as determined from SVD (number of coloured species)

        Returns:
            None

    """  
    if not isinstance(dataFrame, pd.DataFrame):
        raise TypeError("data must be inputted as a pandas DataFrame, try using read_spectral_data_from_txt or similar function first")
    
    if not isinstance(pseudo_equiv_matrix, list):
        raise TypeError("The Pseudo-equivalency matrix must be inputted as a list containing lists with each row of the pseudo-equivalency matrix")
    PEM = np.matrix(pseudo_equiv_matrix)
    rkp = rank(PEM)
    print("Rank of pseudo-equivalency matrix is ", rkp)
    
    ns = nullspace(PEM)
    print("Nullspace/kernel of pseudo-equivalency matrix is ", ns)
    if ns.size == 0:
        print("Null space of pseudo-equivalency matrix is null")
        rankns = 0
    else:
        rankns = rank(ns)
    
    print("the rank of the nullspace/kernel of pseudo-equivalency matrix is ", rankns)
    
    num_components = PEM.shape[1]
    if rankns > 0:
        ncr = num_components - rankns
        print("Choose the following number of absorbing species:", ncr)
    else:
        ncr = num_components
    ncms = rank_data
    
    if ncr == ncms:
        print("Solve standard problem assuming no unwanted contributions")
    elif ncr == ncms - 1:
        print("Solve with unwanted contributions")
    else:
        print("There may be uncounted for species in the model, or multiple sources of unknown contributions")
    
#=============================================================================
#---------------------------PROBLEM GENERATION TOOLS------------------------
#============================================================================= 
Example 76
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0 4 votes vote down vote up
def snv(dataFrame, offset=0):
    """
    Implementation of the Standard Normal Variate (SNV) filter for Kipet. SNV is a weighted normalization
    method commonly used to remove scatter effects in spectroscopic data. This pre-processing
    step can be applied before the SG filter or used on its own. SNV can be sensitive to noisy entries
    in the spectra and can increase nonlinear behaviour between S and C, as it is not a linear transformation.
    
    
    Args:
        dataFrame (DataFrame): the data to be processed (either concentration or spectral data)
        offset (float): user-defined offset which can be used to avoid over-normalization for samples
                        with near-zero standard deviation. A value near the expected noise level is a
                        reasonable choice. Default value is zero.
        
    Returns:
        DataFrame containing pre-processed data
    
    References:

    """
    # data checks
    if not isinstance(dataFrame, pd.DataFrame):
        raise TypeError("data must be inputted as a pandas DataFrame, try using read_spectral_data_from_txt or similar function first")
    print("Applying the SNV pre-processing")    

    D = np.array(dataFrame)
    snv_proc = np.array(dataFrame)
    for t in range(len(dataFrame.index)):
        row = list()
        sum_spectra = 0
        for l in range(len(dataFrame.columns)):
            row.append(D[t,l])
            sum_spectra += D[t,l]
        mean_spectra = sum_spectra/(len(dataFrame.columns))
        std = 0
        for l in range(len(dataFrame.columns)):
            std += (mean_spectra-D[t,l])**2
        new_row = list()
        for l in range(len(dataFrame.columns)):
            if offset ==0:
                w = (D[t,l]-mean_spectra)*(std/(len(dataFrame.columns)-1))**0.5
            else:
                w = (D[t,l]-mean_spectra)*(std/(len(dataFrame.columns)-1))**0.5 + 1/offset
            new_row.append(w)
                
        snv_proc[t]=new_row

    data_frame = pd.DataFrame(data=snv_proc,
                              columns = dataFrame.columns,
                              index=dataFrame.index)
    return data_frame 
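For the offset == 0 case, the row-wise loop in snv reduces to a couple of NumPy operations. This sketch reproduces the same arithmetic on toy data (note that, as in the loop above, each centred row is multiplied by its sample standard deviation):

import pandas as pd

# Toy spectra: 3 time points x 4 wavelengths
dataFrame = pd.DataFrame([[1.0, 2.0, 3.0, 4.0],
                          [2.0, 2.0, 2.0, 2.0],
                          [0.0, 1.0, 0.0, 1.0]],
                         columns=[1800, 1810, 1820, 1830])

D = dataFrame.to_numpy(dtype=float)
row_mean = D.mean(axis=1, keepdims=True)
row_std = D.std(axis=1, ddof=1, keepdims=True)  # same (N - 1) denominator as the loop
snv_proc = (D - row_mean) * row_std

data_frame = pd.DataFrame(snv_proc, columns=dataFrame.columns, index=dataFrame.index)
print(data_frame)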
Example 77
Project: kipet   Author: salvadorgarciamunoz   File: TemplateBuilder.py    GNU General Public License v3.0 4 votes vote down vote up
def add_spectral_data(self, data):
        """Add spectral data

        Args:
            data (DataFrame): DataFrame with measurement times as
                              indices and wavelengths as columns.

        Returns:
            None

        """
        if isinstance(data, pd.DataFrame):
            # add zero rows for feed times that are not in original measurements in D-matrix (CS):
            df = pd.DataFrame(index=self._feed_times, columns=data.columns)
            for t in self._feed_times:
                if t not in data.index:  # for points that are the same in original measurement times and feed times (CS)
                    df.loc[t] = [0.0 for n in range(len(data.columns))]
            dfall = data.append(df)
            dfall.sort_index(inplace=True)
            dfall.index = dfall.index.to_series().apply(lambda x: np.round(x, 6))  # time from data rounded to 6 digits
            ##############Filter out NaN############### points that are the same in original measurement times and feed times (CS)
            count = 0
            for j in dfall.index:
                if count >= 1 and count < len(dfall.index):
                    if dfall.index[count] == dfall.index[count - 1]:
                        dfall = dfall.dropna()
                count += 1
            ###########################################
            self._spectral_data = dfall
        else:
            raise RuntimeError('Spectral data format not supported. Try pandas.DataFrame')

        D = np.array(dfall)

        for t in range(len(dfall.index)):
            for l in range(len(dfall.columns)):
                if D[t, l] >= 0:
                    pass
                else:
                    self._is_D_deriv = True
        if self._is_D_deriv == True:
            print(
                "Warning! Since D-matrix contains negative values Kipet is assuming a derivative of D has been inputted") 
Example 78
Project: prediction-constrained-topic-models   Author: dtak   File: eval_pretrained_sklearn_binary_classifier.py    MIT License 4 votes vote down vote up
def eval_pretrained_clf(
        classifier_path='/tmp/',
        classifier_name='logistic_regression',
        datasets_by_split=None,
        verbose=True,
        feat_colnames=None,
        y_col_id=0,
        y_orig_col_id=0,
        y_col_name='',
        output_path='/tmp/',
        seed_bootstrap=42,
        n_bootstraps=5000,
        bootstrap_stratify_pos_and_neg=True,
        ):
    start_time = time.time()
    (make_classifier, score_classifier, calc_best_idx,
        make_clf_report, make_csv_row_dict, make_interp_report) = \
            make_constructor_and_evaluator_funcs(
                classifier_name,
                n_bootstraps=n_bootstraps,
                seed_bootstrap=seed_bootstrap,
                bootstrap_stratify_pos_and_neg=bootstrap_stratify_pos_and_neg)

    # Read classifier obj from disk
    clf_path = os.path.join(
        classifier_path,
        'clf_%d_object.dump' % (y_orig_col_id))
    best_clf = joblib.load(clf_path)

    if os.path.exists(output_path):
        n_keys = len(datasets_by_split.keys())
        for ss, split in enumerate(datasets_by_split.keys()):
            csv_fpath = os.path.join(
                output_path,
                'clf_%d_callback_%s.csv' % (y_orig_col_id, split))
            row_dict = make_csv_row_dict(
                best_clf,
                datasets_by_split[split]['x'],
                datasets_by_split[split]['y'][:, y_col_id],
                y_col_name,
                split,
                classifier_name)
            csv_df = pd.DataFrame([row_dict])
            csv_df.to_csv(
                csv_fpath,
                index=False)
            if verbose:
                elapsed_time = time.time() - start_time
                print("eval %d/%d on %5s split done after %11.2f sec" % (ss, n_keys, split, elapsed_time))
                print("wrote csv file: " + csv_fpath)
    return best_clf 
Example 79
Project: OpenAPS   Author: medicinexlab   File: bgdata.py    MIT License 4 votes vote down vote up
def get_new_df_entries_every_5_minutes(bg_df, start_index, end_index, set_str):
    """
    Function to take the given bg_df and make a new_bg_df within the start and end indices
    such that entries are spaced out by 5 minutes (MIN_ENTRY_SPACING_MINUTE).

    Input:      bg_df                           Pandas dataframe of all of the data from ./data/[id_str]/devicestatus.json
                start_index                     The starting index of the data in the bg_df
                end_index                       The ending index of the data in the bg_df
                set_str                         String of the set name (i.e. "Training", "Validation", "Testing")
    Output:     new_bg_df                       New Pandas dataframe with the data from the original start and end indices
                                                    such that the entries are spaced out by 5 minutes.
                start_index                     New starting index for the new_bg_df
                end_index                       New ending index for the new_bg_df
    Usage:      new_bg_df, start_index, end_index = get_new_df_entries_every_5_minutes(bg_df, start_index, end_index, "Training")
    """
    new_bg_df = pd.DataFrame()
    last_time = 0
    starting_df = True

    for df_index in range(end_index, start_index):
        add_entry = False
        try:
            time = int((bg_df.iloc[df_index]['created_at'] - bg_df.iloc[start_index]['created_at']) / np.timedelta64(1, 'm'))
            test_if_has_enacted = bg_df.iloc[df_index]['openaps']['enacted']['bg'] #Test to see if df entry has suggested and enacted functioning
            test_if_has_suggested = bg_df.iloc[df_index]['openaps']['suggested']['bg']

            if last_time - time >= MIN_ENTRY_SPACING_MINUTE or starting_df: #check if spaced out by 5 minutes or just starting the df
                starting_df = False
                last_time = time

                #If it has both enacted and suggested and is spaced out by MIN_ENTRY_SPACING_MINUTE from last entry, then set boolean to be true
                add_entry = True
        except:
            add_entry = False

        if add_entry:
            new_bg_df = new_bg_df.append(bg_df.iloc[df_index], ignore_index=True) #if boolean is true, add it to the new dataframe

    start_index = len(new_bg_df) - 1
    end_index = 0

    #Print the number of entries, the start index, and the end index
    print("{}: {} number entries".format(set_str, start_index - end_index + 1))
    print("{} Start Index = {} and End Index = {}".format(set_str, start_index, end_index))
    print

    return new_bg_df, start_index, end_index 
Example 80
Project: pybench   Author: pentschev   File: benchmark_ml.py    Apache License 2.0 4 votes vote down vote up
def load_data(nrows, ncols, cached, train_split=1.0, label_col=None):
    import gzip
    import os
    import numpy as np
    import pandas as pd

    train_rows = int(nrows * train_split)

    if os.path.exists(cached):
        with gzip.open(cached) as f:
            X = np.load(f)

        if train_split < 1.0 and label_col is not None:
            # extract the label column before it is removed from the feature matrix
            y = X[:, label_col : label_col + 1]
            X = X[:, [i for i in range(X.shape[1]) if i != label_col]]
            rindices = np.random.randint(0, X.shape[0] - 1, nrows)
            X = X[rindices, :ncols]
            y = y[rindices]
            df_y_train = pd.DataFrame(
                {"fea%d" % i: y[0:train_rows, i] for i in range(y.shape[1])}
            )
            df_y_test = pd.DataFrame(
                {"fea%d" % i: y[train_rows:, i] for i in range(y.shape[1])}
            )
        else:
            X = X[np.random.randint(0, X.shape[0] - 1, nrows), :ncols]

    else:
        # throws FileNotFoundError error if mortgage dataset is not present
        raise FileNotFoundError(
            "Please download the required dataset or check the path"
        )

    if train_split < 1.0 and label_col is not None:
        df_X_train = pd.DataFrame(
            {"fea%d" % i: X[0:train_rows, i] for i in range(X.shape[1])}
        )
        df_X_test = pd.DataFrame(
            {"fea%d" % i: X[train_rows:, i] for i in range(X.shape[1])}
        )

        return {
            "X_train": df_X_train,
            "X_test": df_X_test,
            "y_train": df_y_train,
            "y_test": df_y_test,
        }
    else:
        df = pd.DataFrame({"fea%d" % i: X[:, i] for i in range(X.shape[1])})
        return df
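The column-wise dict comprehension used above to wrap NumPy arrays into DataFrames is worth a standalone look; a minimal sketch:

import numpy as np
import pandas as pd

X = np.arange(12, dtype=float).reshape(4, 3)  # hypothetical feature matrix
df = pd.DataFrame({"fea%d" % i: X[:, i] for i in range(X.shape[1])})
print(df.columns.tolist())  # ['fea0', 'fea1', 'fea2']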