Python pandas.DataFrame() Examples

The following are code examples showing how to use pandas.DataFrame(). They are taken from open source Python projects. You can vote up the examples you like or vote down the ones you don't like.
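As a quick, self-contained orientation before the project code below, pandas.DataFrame() accepts a dict of equal-length columns or a list of row records (the column names and values here are only illustrative):

import pandas as pd

# construct from a dict of columns
df = pd.DataFrame({'name': ['tom', 'jack'], 'age': [24, 56]})

# construct from a list of row dicts with an explicit column order
df2 = pd.DataFrame([{'name': 'tom', 'age': 24}, {'name': 'jack', 'age': 56}],
                   columns=['name', 'age'])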

Example 1
Project: navitia_client   Author: leonardbinet   File: parser.py    MIT License 6 votes
def write_all(self, directory):
        # Get results
        unnested = pd.DataFrame(self.unnested_items)  # df
        nested = self.nested_items  # dict
        # Write item csv
        unnested.to_csv(os.path.join(directory, self.item_name + ".csv"))
        # Write item json
        with open(os.path.join(directory, self.item_name + ".json"), 'w') as f:
            json.dump(nested, f, ensure_ascii=False)
        # Write links (of first page)
        with open(os.path.join(directory, "links.json"), 'w') as f:
            json.dump(self.links, f, ensure_ascii=False)
        # Write disruptions (if item different)
        if self.item_name != "disruptions":
            unnested_dis = pd.DataFrame(self.disruptions)  # df
            unnested_dis.to_csv(os.path.join(directory, "disruptions.csv"))
        # Write logs
        with open(os.path.join(directory, "parse_log.json"), 'w') as f:
            json.dump(self.log, f, ensure_ascii=False) 
Example 2
Project: PEAKachu   Author: tbischler   File: window.py    ISC License 6 votes
def _run_deseq2_peaks(self):
        peak_df = pd.DataFrame()
        for replicon in sorted(self._replicon_dict):
            if self._replicon_dict[replicon]["peak_df"].empty:
                continue
            peak_df = peak_df.append(self._replicon_dict[replicon]["peak_df"],
                                     ignore_index=True)
        count_df = peak_df.loc[:, self._exp_lib_list + self._ctr_lib_list]
        deseq2_runner = DESeq2Runner(count_df)
        result_df, self._size_factors = deseq2_runner.run_deseq2(
            self._exp_lib_list, self._ctr_lib_list, self._size_factors,
            self._pairwise_replicates)
        # normalize counts
        peak_df[self._lib_names_list] = peak_df[
            self._lib_names_list].div(self._size_factors, axis='columns')
        # append DESeq2 output
        peak_df = pd.concat([peak_df, result_df], axis=1)
        for replicon in sorted(self._replicon_dict):
            self._replicon_dict[replicon]["peak_df"] = peak_df[
                peak_df.replicon == replicon] 
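A note on the accumulation pattern above: DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0, so on newer pandas the same per-replicon collection is typically written with pd.concat(). A minimal sketch (the column name and values are only illustrative):

import pandas as pd

# collect the per-replicon frames in a list, then concatenate once
parts = [pd.DataFrame({'count': [1, 2]}), pd.DataFrame({'count': [3]})]
combined = pd.concat(parts, ignore_index=True)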
Example 3
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License 6 votes
def generate_peaks_from_blockbuster(self, min_cluster_expr_frac,
                                        min_block_overlap,
                                        min_max_block_expr_frac):
        for replicon in self._replicon_dict:
            self._replicon_dict[replicon]["peak_df"] = pd.DataFrame()
        cluster = {}
        for line in self._blockbuster_output.rstrip().split('\n'):
            if line.startswith('>'):
                if cluster:
                    self._call_cluster_peaks(cluster, min_cluster_expr_frac,
                                             min_block_overlap,
                                             min_max_block_expr_frac)
                    cluster = {}
                cluster["header"] = line
                cluster["blocks"] = []
            else:
                cluster["blocks"].append(line)
        if cluster:
            self._call_cluster_peaks(cluster, min_cluster_expr_frac,
                                     min_block_overlap,
                                     min_max_block_expr_frac) 
Example 4
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License 6 votes
def calculate_peak_expression(self):
        self._generate_peak_counts()
        self._peak_df = pd.DataFrame()
        for replicon in sorted(self._replicon_dict):
            if self._replicon_dict[replicon]["peak_df"].empty:
                continue
            self._replicon_dict[replicon]["peak_df"]["replicon"] = replicon
            for lib_name, lib in self._lib_dict.items():
                self._replicon_dict[replicon][
                    "peak_df"][lib_name] = lib.replicon_dict[
                        replicon]["peak_counts"]
                del lib.replicon_dict[replicon]["peak_counts"]
            # add pseudocounts
            # self._replicon_dict[
            #    replicon]["peak_df"].loc[:, self._lib_names_list] += 1.0
            self._peak_df = self._peak_df.append(self._replicon_dict[replicon][
                "peak_df"], ignore_index=True) 
Example 5
Project: backtrader-cn   Author: pandalibin   File: utils.py    GNU General Public License v3.0 6 votes
def write_daily_alert(cls, symbol, stock_id, action):
        """
        write daily stock alert to MongoDB.
        :param symbol: Arctic symbol
        :param data: dict, like: {'stock': '000651', 'action': 'buy/sell'}
        :return: None
        """

        lib = get_or_create_library(conf.DAILY_STOCK_ALERT_LIBNAME)

        data = {
            'stock': stock_id,
            'action': action
        }
        df = pd.DataFrame([data], columns=data.keys())
        if symbol in lib.list_symbols():
            lib.append(symbol, df)
        else:
            lib.write(symbol, df) 
Example 6
Project: backtrader-cn   Author: pandalibin   File: test_datas_utils.py    GNU General Public License v3.0 6 votes
def _test_strip_unused_cols(self):
        data = pd.DataFrame({
            'name': ['tom', 'jack'],
            'age': [24, 56],
            'gender': ['male', 'male'],
            'address': ['cn', 'us']
        })
        data.index = pd.date_range(start='2017-01-01', periods=2)

        origin_cols = ['name', 'age', 'gender', 'address']
        unused_cols = ['address', 'gender']
        new_cols = ['name', 'age']

        self.assertEqual(sorted(data.columns), sorted(origin_cols))

        bdu.Utils.strip_unused_cols(data, *unused_cols)

        self.assertEqual(sorted(data.columns), sorted(new_cols))
Example 7
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0 6 votes
def write_absorption_data_to_txt(filename,dataframe):
    """ Write absorption data Sij to txt file.
    
        Args:
            filename (str): name of output file
          
            dataframe (DataFrame): pandas DataFrame
        
        Returns:
            None

    """
    f = open(filename,'w')
    for i in dataframe.index:
        for j in dataframe.columns:
            f.write("{0} {1} {2}\n".format(i,j,dataframe[j][i]))
    f.close() 
Example 8
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0 6 votes
def write_concentration_data_to_txt(filename,dataframe):
    """ Write concentration data Cij to txt file.
    
        Args:
            filename (str): name of output file
          
            dataframe (DataFrame): pandas DataFrame
        
        Returns:
            None

    """
    f = open(filename,'w')
    for i in dataframe.index:
        for j in dataframe.columns:
            f.write("{0} {1} {2}\n".format(i,j,dataframe[j][i]))
    f.close() 
Example 9
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0 6 votes
def generate_absorbance_data(wl_span,parameters_dict):
    """
    helper function to generate absorption data based on 
    lorentzian parameters
    """
    components = parameters_dict.keys()
    n_components = len(components)
    n_lambdas = len(wl_span)
    array = np.zeros((n_lambdas,n_components))
    for i,l in enumerate(wl_span):
        j = 0
        for k,p in six.iteritems(parameters_dict):
            alphas = p['alphas']
            betas  = p['betas']
            gammas = p['gammas']
            array[i,j] = absorbance(l,alphas,betas,gammas)
            j+=1

    data_frame = pd.DataFrame(data=array,
                              columns = components,
                              index=wl_span)
    return data_frame 
Example 10
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0 6 votes
def add_noise_to_signal(signal, size):
    """
    Adds random, normally distributed noise to a clean signal. Used mostly in Kipet
    to add noise to absorbances or concentration profiles obtained from simulations.
    All values that are negative after the noise is added are set to zero.
    Args:
        signal (data): the Z or S matrix to have noise added to it
        size (scalar): sigma (or size of distribution)
    Returns:
        pandas dataframe
    """
    clean_sig = signal    
    noise = np.random.normal(0,size,clean_sig.shape)
    sig = clean_sig+noise    
    df= pd.DataFrame(data=sig)
    df[df<0]=0
    return df

#=============================================================================
#---------------------------PRE-PROCESSING TOOLS------------------------
#============================================================================= 
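As a rough illustration of the pattern in add_noise_to_signal (the profile values and sigma below are made up), adding clipped Gaussian noise to a small frame looks like this:

import numpy as np
import pandas as pd

clean = pd.DataFrame({'A': [1.0, 0.8, 0.5], 'B': [0.0, 0.2, 0.5]})
noisy = clean + np.random.normal(0, 0.05, clean.shape)
noisy[noisy < 0] = 0  # negatives introduced by the noise are set to zero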
Example 11
Project: kipet   Author: salvadorgarciamunoz   File: TemplateBuilder.py    GNU General Public License v3.0 6 votes
def add_absorption_data(self, data):
        """Add absorption data

        Args:
            data (DataFrame): DataFrame with wavelengths as
                              indices and mixture components as columns.

        Returns:
            None

        """
        if isinstance(data, pd.DataFrame):
            self._absorption_data = data
        else:
            raise RuntimeError('Spectral data format not supported. Try pandas.DataFrame')

    # For inclusion of discrete jumps 
Example 12
Project: pybench   Author: pentschev   File: utils.py    Apache License 2.0 6 votes
def benchmark_json_to_pandas(path):
    """Convert the json "benchmarks" field of a pytest-benchmark json file into
    a pandas.DataFrame.

    Parameters
    ----------
    path: str
        path to json file

    Returns
    -------
    pandas.DataFrame
        A pandas DataFrame containing benchmarks extracted from a
        pytest-benchmark json file

    Example
    -------
    >>> benchmarks_df = benchmark_json_to_pandas(
    ...     "/path/to/pytest_benchmark.json")
    """
    with open(path) as f:
        data = json.load(f)

    return pd.io.json.json_normalize(data=data["benchmarks"]) 
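pd.io.json.json_normalize() has since been promoted to the top-level namespace (pandas.json_normalize, pandas >= 1.0). A small sketch of the flattening it performs on nested records (the record contents are made up, not real pytest-benchmark output):

import pandas as pd

records = [{"name": "bench_a", "stats": {"mean": 0.12, "stddev": 0.01}},
           {"name": "bench_b", "stats": {"mean": 0.34, "stddev": 0.02}}]
df = pd.json_normalize(records)  # columns: name, stats.mean, stats.stddev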
Example 13
Project: pybench   Author: pentschev   File: utils.py    Apache License 2.0 6 votes
def filter_by_string_in_column(df, column, value):
    """Filter pandas DataFrame by value, where value is a subsequence of the
    of the string contained in a column.

    Parameters
    ----------
    df: pandas.DataFrame
        A pandas DataFrame containing data from pytest-benchmark.
    column: str
        Column name where to check for value.
    value: str
        String to be checked if is part of column's content.

    Returns
    -------
    pandas.DataFrame
        A pandas DataFrame containing only rows for which value is contained in
        column content.

    Example
    -------
    >>> numpy_df = filter_by_string_in_column(df, 'name', 'numpy')
    """
    return df.loc[df[column].str.contains(value)] 
Example 14
Project: pybench   Author: pentschev   File: benchmark_ml.py    Apache License 2.0 6 votes
def load_mortgage(d):
    kwargs = {"nrows": d["shape"][0], "ncols": d["shape"][1], "cached": d["data"]}

    if "train_split" in d:
        kwargs["train_split"] = d["train_split"]
    if "label_col" in d:
        kwargs["label_col"] = d["label_col"]

    data = load_data(**kwargs)

    if d["module"] == "cuml":
        import cudf

        if isinstance(data, dict):
            for k, v in data.items():
                data[k] = cudf.DataFrame.from_pandas(v)

            data["y_train"] = cudf.Series(data["y_train"]["fea0"])
        else:
            data = cudf.DataFrame.from_pandas(data)

    return {"module": d["module"], "data": data} 
Example 15
Project: pandas-technical-indicators   Author: Crypto-toolbox   File: technical_indicators.py    MIT License 6 votes
def average_true_range(df, n):
    """
    
    :param df: pandas.DataFrame
    :param n: 
    :return: pandas.DataFrame
    """
    i = 0
    TR_l = [0]
    while i < df.index[-1]:
        TR = max(df.loc[i + 1, 'High'], df.loc[i, 'Close']) - min(df.loc[i + 1, 'Low'], df.loc[i, 'Close'])
        TR_l.append(TR)
        i = i + 1
    TR_s = pd.Series(TR_l)
    ATR = pd.Series(TR_s.ewm(span=n, min_periods=n).mean(), name='ATR_' + str(n))
    df = df.join(ATR)
    return df 
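The loop above relies on a zero-based integer index (df.index[-1] is used as a row count). A vectorized sketch of the same true-range idea, assuming the same 'High', 'Low' and 'Close' columns, could look like this:

import pandas as pd

def average_true_range_vectorized(df, n):
    prev_close = df['Close'].shift(1)
    tr = (pd.concat([df['High'], prev_close], axis=1).max(axis=1)
          - pd.concat([df['Low'], prev_close], axis=1).min(axis=1))
    tr.iloc[0] = 0  # the loop above pins the first true range to 0
    atr = tr.ewm(span=n, min_periods=n).mean().rename('ATR_' + str(n))
    return df.join(atr)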
Example 16
Project: pandas-technical-indicators   Author: Crypto-toolbox   File: technical_indicators.py    MIT License 6 votes
def ppsr(df):
    """Calculate Pivot Points, Supports and Resistances for given data
    
    :param df: pandas.DataFrame
    :return: pandas.DataFrame
    """
    PP = pd.Series((df['High'] + df['Low'] + df['Close']) / 3)
    R1 = pd.Series(2 * PP - df['Low'])
    S1 = pd.Series(2 * PP - df['High'])
    R2 = pd.Series(PP + df['High'] - df['Low'])
    S2 = pd.Series(PP - df['High'] + df['Low'])
    R3 = pd.Series(df['High'] + 2 * (PP - df['Low']))
    S3 = pd.Series(df['Low'] - 2 * (df['High'] - PP))
    psr = {'PP': PP, 'R1': R1, 'S1': S1, 'R2': R2, 'S2': S2, 'R3': R3, 'S3': S3}
    PSR = pd.DataFrame(psr)
    df = df.join(PSR)
    return df 
Example 17
Project: navitia_client   Author: leonardbinet   File: parser.py    MIT License 5 votes
def get_unnested_items(self):
        df = pd.DataFrame(self.nested_items[self.item_name])
        flatten_dataframe(df, drop=True, max_depth=5)
        self.unnested_items = df.to_dict() 
Example 18
Project: navitia_client   Author: leonardbinet   File: parser.py    MIT License 5 votes
def count_nbr_collected_items(self):
        unnested = pd.DataFrame(self.unnested_items)  # df
        self.nbr_collected_items = len(unnested.index) 
Example 19
Project: navitia_client   Author: leonardbinet   File: parser.py    MIT License 5 votes
def parse_log(self):
        log = {}
        log["number_requests"] = len(self.results)
        log["number_parsed"] = len(self.parsed)
        log["keys"] = self.keys
        log["nbr_announced_items"] = self.nbr_expected_items
        log["nbr_collected_items"] = self.nbr_collected_items
        log["item_columns"] = list(pd.DataFrame(
            self.unnested_items).columns.values)
        log["parsing_errors"] = self.parsing_errors
        self.log = log
Example 20
Project: PEAKachu   Author: tbischler   File: window.py    ISC License 5 votes
def _convert_to_data_frame(self):
        self._window_df = pd.DataFrame()
        for replicon in sorted(self._replicon_dict):
            for strand in ["+", "-"]:
                # add window positions to data frame
                row_number = len(self._replicon_dict[replicon]["window_list"])
                df = pd.concat([
                    pd.Series([replicon] * row_number),
                    pd.Series([strand] * row_number),
                    pd.Series([window[0]+1 for window in
                               self._replicon_dict[
                                   replicon]["window_list"]]),
                    pd.Series([window[1] for window in
                               self._replicon_dict[
                        replicon]["window_list"]])], axis=1)
                df.columns = ["replicon", "strand", "w_start", "w_end"]
                # add library counts to data frame
                for lib_name, lib in self._lib_dict.items():
                    df[lib_name] = (pd.Series(lib.replicon_dict[
                        replicon]["window_counts"].loc[:, strand]))
                self._window_df = self._window_df.append(df,
                                                         ignore_index=True)
            del self._replicon_dict[replicon]["window_list"]
        # remove windows without expression in any library
        print("Removing empty windows from DataFrame with {} rows...".format(
            len(self._window_df.index)), flush=True)
        t_start = time()
        self._window_df = self._window_df.loc[
            (self._window_df.loc[:, self._lib_names_list].sum(axis=1) > 0), :]
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(self._window_df.index)), flush=True)
        if self._window_df.empty:
            print("**Dataframe empty**", flush=True)
            return
        if self._stat_test == "gtest":
            self._run_gtest_preprocessing()
        elif self._stat_test == "deseq":
            self._run_deseq_preprocessing() 
Example 21
Project: PEAKachu   Author: tbischler   File: window.py    ISC License 5 votes
def _prefilter_windows_deseq(self, df):
        print("Removing windows where not all experiment libs show "
              "expression from DataFrame with {} rows...".format(len(df)),
              flush=True)
        t_start = time()
        for exp_lib in self._exp_lib_list:
            exp_lib_zero_count = 0.0
            df = df.loc[(df.loc[:, exp_lib] > exp_lib_zero_count), :]
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        if df.empty:
            return df
        initial_window_df = df.copy()
        # normalize counts on initial windows
        initial_window_df[self._lib_names_list] = initial_window_df[
            self._lib_names_list].div(self._size_factors, axis='columns')
        # minimum expression cutoff based on mean over experiment libraries
        print("Removing windows based on mad cutoff from DataFrame "
              "with {} rows...".format(len(df)), flush=True)
        t_start = time()
        median_abs_dev_from_zero = mad(initial_window_df.loc[
            :, self._exp_lib_list].mean(axis=1), center=0.0)
        min_expr = (self._mad_multiplier * median_abs_dev_from_zero)
        print("Minimal window expression based on mean over RIP/CLIP "
              "libraries: {} (MAD from zero: {})".format(
                  min_expr, median_abs_dev_from_zero), flush=True)
        df = df.loc[initial_window_df.loc[:, self._exp_lib_list].mean(
            axis=1) >= min_expr, :]
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        return df 
Example 22
Project: PEAKachu   Author: tbischler   File: window.py    ISC License 5 votes
def _plot_initial_windows(self, unsig_base_means, unsig_fcs,
                              sig_base_means, sig_fcs):
        # create plot folder if it does not exist
        plot_folder = "{}/plots".format(self._output_folder)
        if not exists(plot_folder):
            makedirs(plot_folder)
        # MA plot
        plt.plot(np.log10(unsig_base_means),
                 np.log2(unsig_fcs), ".",
                 markersize=2.0, alpha=0.3)
        plt.plot(np.log10(sig_base_means),
                 np.log2(sig_fcs), ".",
                 markersize=2.0, color="red", alpha=0.3)
        plt.axhline(y=np.median(np.log2(unsig_fcs.append(sig_fcs))))
        plt.axvline(x=np.median(np.log10(unsig_base_means.append(
                                         sig_base_means))))
        plt.title("Initial_windows_MA_plot")
        plt.xlabel("log10 base mean")
        plt.ylabel("log2 fold-change")
        plt.savefig("{}/Initial_windows_MA_plot.png".format(plot_folder),
                    dpi=600)
        plt.close()
        # HexBin plot
        df = pd.DataFrame({'log10 base mean': np.log10(unsig_base_means.append(
            sig_base_means)), 'log2 fold-change': np.log2(unsig_fcs.append(
                sig_fcs))})
        df.plot(kind='hexbin', x='log10 base mean',
                y='log2 fold-change', gridsize=50, bins='log')
        plt.axhline(y=np.median(np.log2(unsig_fcs.append(sig_fcs))))
        plt.axvline(x=np.median(np.log10(unsig_base_means.append(
                                         sig_base_means))))
        plt.title("Initial_windows_HexBin_plot")
        plt.savefig("{}/Initial_windows_HexBin_plot.pdf".format(plot_folder))
        plt.close() 
Example 23
Project: PEAKachu   Author: tbischler   File: window.py    ISC License 5 votes
def _combine_windows(self, df):
        peak_list = []
        peak = {"peak_start": None, "peak_end": None}
        for index, window in df.iterrows():
            # significant window
            if window.loc["significant"]:
                # start new peak region if no peak region was started before
                if peak["peak_start"] is None:
                    peak["peak_start"] = window.loc["w_start"]
                    peak["peak_end"] = window.loc["w_end"]
                # the window starts beyond the previous peak region (gaps can
                #   occur because windows were removed during pre-filtering),
                #   so add the previous peak region to the output and start a new one
                # +1 -> combine adjacent peaks
                elif window.loc["w_start"] > peak["peak_end"] + 1:
                    peak_list.append(deepcopy(peak))
                    peak["peak_start"] = window.loc["w_start"]
                    peak["peak_end"] = window.loc["w_end"]
                # elongate peak if window overlaps
                else:
                    peak["peak_end"] = window.loc["w_end"]
            # non-significant window
            else:
                # jump to next window if outside of peak region
                # or current position upstream of peak end
                # +1 -> combine adjacent peaks
                if (peak["peak_start"] is None or window.loc[
                        "w_start"] <= peak["peak_end"] + 1):
                    continue
                # otherwise end peak region and append to output list
                peak_list.append(deepcopy(peak))
                peak["peak_start"] = None
                peak["peak_end"] = None
        # append peak if last window in data frame was significant
        if peak["peak_start"] is not None:
            peak_list.append(deepcopy(peak))
        peak_df = pd.DataFrame(peak_list, columns=["peak_start", "peak_end"])
        return peak_df 
Example 24
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License 5 votes
def _call_cluster_peaks(self, cluster, min_cluster_expr_frac,
                            min_block_overlap, min_max_block_expr_frac):
        cluster_entries = cluster["header"].strip().split('\t')
        cluster_expr = float(cluster_entries[5])
        cluster_strand = cluster_entries[4]
        cluster_replicon = cluster_entries[1]
        peak_df = pd.DataFrame()

        if len(cluster["blocks"]) == 1:
            block_entries = cluster["blocks"][0].strip().split('\t')
            peak_start = int(block_entries[2]) + 1
            peak_end = int(block_entries[3])
            peak_df = peak_df.append(pd.Series([peak_start, peak_end], index=[
                "peak_start", "peak_end"]), ignore_index=True)
        else:
            blocks = [block.strip().split('\t') for block in cluster["blocks"]]
            block_df = pd.DataFrame(
                blocks, columns=["blockNb", "blockChrom", "blockStart",
                                 "blockEnd", "blockStrand", "blockExpression",
                                 "readCount"])
            block_df[["blockNb", "blockStart", "blockEnd", "blockExpression",
                      "readCount"]] = block_df[
                    ["blockNb", "blockStart", "blockEnd", "blockExpression",
                     "readCount"]].apply(pd.to_numeric)
            peak_df = self._split_cluster_peaks(block_df, cluster_expr,
                                                peak_df, min_cluster_expr_frac,
                                                min_block_overlap,
                                                min_max_block_expr_frac)
        if peak_df.empty:
            return
        peak_df = peak_df.astype(np.int64)
        peak_df["peak_strand"] = cluster_strand
        self._replicon_dict[cluster_replicon]["peak_df"] = self._replicon_dict[
            cluster_replicon]["peak_df"].append(peak_df, ignore_index=True) 
Example 25
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License 5 votes
def _filter_peaks(self, df):
        # calculate mad for original data frame
        median_abs_dev_from_zero = mad(df.loc[:, self._exp_lib_list].mean(
            axis=1), center=0.0)
        # padj filter
        print("Removing peaks based on padj from DataFrame with {} rows...".
              format(len(df)), flush=True)
        t_start = time()
        df = df.query('padj < @self._padj_threshold')
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        if df.empty:
            return df
        # minimum expression cutoff based on mean over experiment libraries
        print("Removing peaks based on mad cutoff from DataFrame "
              "with {} rows...".format(len(df)), flush=True)
        t_start = time()
        min_expr = (self._mad_multiplier * median_abs_dev_from_zero)
        print("Minimal peak expression based on mean over RIP/CLIP "
              "libraries:" "{} (MAD from zero: {})".format(
                  min_expr, median_abs_dev_from_zero), flush=True)
        df = df.loc[df.loc[:, self._exp_lib_list].mean(axis=1) >= min_expr, :]
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        if df.empty:
            return df
        # minimum fold change
        print("Removing peaks based on minimum fold change "
              "from DataFrame with {} rows...".format(len(df)), flush=True)
        t_start = time()
        log2_fc_cutoff = np.log2(self._fc_cutoff)
        df = df.query('log2FoldChange >= @log2_fc_cutoff')
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        return df 
Example 26
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License 5 votes
def _filter_peaks_without_replicates(self, df):
        # calculate mad for original data frame
        median_abs_dev_from_zero = mad(df.loc[:, self._exp_lib_list].mean(
            axis=1), center=0.0)
        # minimum expression cutoff based on mean over experiment libraries
        print("Removing peaks based on mad cutoff from DataFrame "
              "with {} rows...".format(len(df)), flush=True)
        t_start = time()
        min_expr = (self._mad_multiplier * median_abs_dev_from_zero)
        print("Minimal peak expression based on mean over RIP/CLIP "
              "libraries:" "{} (MAD from zero: {})".format(
                  min_expr, median_abs_dev_from_zero), flush=True)
        df = df.loc[df.loc[:, self._exp_lib_list].mean(axis=1) >= min_expr, :]
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        if df.empty:
            return df
        # minimum fold change
        print("Removing windows based on minimum fold change from DataFrame "
              "with {} rows...".format(len(df)), flush=True)
        t_start = time()
        df = df.query('fold_change >= @self._fc_cutoff')
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(df)), flush=True)
        return df 
Example 27
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License 5 votes
def _plot_initial_peaks(self, unsig_base_means, unsig_fcs,
                            sig_base_means, sig_fcs):
        # create plot folder if it does not exist
        plot_folder = "{}/plots".format(self._output_folder)
        if not exists(plot_folder):
            makedirs(plot_folder)
        # MA plot
        plt.plot(np.log10(unsig_base_means),
                 np.log2(unsig_fcs), ".",
                 markersize=2.0, alpha=0.3)
        plt.plot(np.log10(sig_base_means),
                 np.log2(sig_fcs), ".",
                 markersize=2.0, color="red", alpha=0.3)
        plt.axhline(y=np.median(np.log2(unsig_fcs.append(sig_fcs))))
        plt.axvline(x=np.median(np.log10(unsig_base_means.append(
                                         sig_base_means))))
        plt.title("Initial_peaks_MA_plot")
        plt.xlabel("log10 base mean")
        plt.ylabel("log2 fold-change")
        plt.savefig("{}/Initial_peaks_MA_plot.png".format(plot_folder),
                    dpi=600)
        plt.close()
        # HexBin plot
        df = pd.DataFrame({'log10 base mean': np.log10(unsig_base_means.append(
            sig_base_means)), 'log2 fold-change': np.log2(unsig_fcs.append(
                sig_fcs))})
        df.plot(kind='hexbin', x='log10 base mean',
                y='log2 fold-change', gridsize=50, bins='log')
        plt.axhline(y=np.median(np.log2(unsig_fcs.append(sig_fcs))))
        plt.axvline(x=np.median(np.log10(unsig_base_means.append(
                                         sig_base_means))))
        plt.title("Initial_peaks_HexBin_plot")
        plt.savefig("{}/Initial_peaks_HexBin_plot.pdf".format(plot_folder))
        plt.close() 
Example 28
Project: PEAKachu   Author: tbischler   File: gtest.py    ISC License 5 votes
def __init__(self, ctr_rep_counts, tagged_rep_counts,
                 pairwise_replicates=False):
        self._rep_df = pd.DataFrame({"ctr_counts": ctr_rep_counts,
                                     "tagged_counts": tagged_rep_counts})
        self._pairwise_replicates = pairwise_replicates
        self._pooled_g_res = {}
        self._total_g_res = {}
        self._heterogenous_g_res = {}
        self._single_g_res = {} 
Example 29
Project: PEAKachu   Author: tbischler   File: library.py    ISC License 5 votes
def count_reads_for_windows(self):
        read_counter = ReadCounter(self.paired_end, self.max_insert_size,
                                   self.bam_file)
        for replicon in self.replicon_dict:
            self.replicon_dict[replicon]['window_counts'] = pd.DataFrame()
            for strand in ['+', '-']:
                window_counts = read_counter.count_reads_for_windows(
                            replicon,
                            strand,
                            self.replicon_dict[replicon]["window_list"])
                self.replicon_dict[replicon]['window_counts'][
                    strand] = window_counts
        read_counter.close_bam() 
Example 30
Project: PEAKachu   Author: tbischler   File: consensus_peak.py    ISC License 5 votes
def plot_consensus_peak(self):
        # create plot folder if it does not exist
        plot_folder = "{}/plots".format(self._project_folder)
        if not exists(plot_folder):
            makedirs(plot_folder)
        self._store_peaks()
        comb_cons_value_dict = self._get_peak_coverage()
        df = pd.DataFrame(comb_cons_value_dict, columns=sorted(
            comb_cons_value_dict))
        ax = df.plot(title="Consensus peak per library")
        ax.set_xlabel("Nucleotide position")
        ax.set_ylabel("Relative expression")
        plt.savefig("{}/plots/consensus_peaks.pdf".format(
            self._project_folder)) 
Example 31
Project: incubator-spot   Author: apache   File: dns_oa.py    Apache License 2.0 5 votes
def _ingest_summary(self):
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        self._logger.info("Getting ingest summary data for the day")
        
        ingest_summary_cols = ["date", "total"]
        result_rows = []
        df_filtered = pd.DataFrame()

        query_to_load = ("""
            SELECT frame_time, COUNT(*) as total FROM {0}.{1}
            WHERE y={2} AND m={3} AND d={4} AND unix_tstamp IS NOT NULL
            AND frame_time IS NOT NULL AND frame_len IS NOT NULL
            AND dns_qry_name IS NOT NULL AND ip_src IS NOT NULL
            AND (dns_qry_class IS NOT NULL AND dns_qry_type IS NOT NULL
            AND dns_qry_rcode IS NOT NULL ) GROUP BY frame_time;
        """).format(self._db,self._table_name, yr, mn, dy)

        results = impala.execute_query_as_list(query_to_load)
        df = pd.DataFrame(results)

        # Forms a new dataframe splitting the minutes from the time column
        df_new = pd.DataFrame([["{0}-{1}-{2} {3}:{4}".format(yr, mn, dy,\
            val['frame_time'].replace("  "," ").split(" ")[3].split(":")[0].zfill(2),\
            val['frame_time'].replace("  "," ").split(" ")[3].split(":")[1].zfill(2)),\
            int(val['total']) if not math.isnan(val['total']) else 0 ] for key,val in df.iterrows()],columns = ingest_summary_cols)

        #Groups the data by minute
        sf = df_new.groupby(by=['date'])['total'].sum()
        df_per_min = pd.DataFrame({'date':sf.index, 'total':sf.values})

        df_final = df_filtered.append(df_per_min, ignore_index=True).to_records(False,False)

        if len(df_final) > 0:
            query_to_insert=("""
                INSERT INTO {0}.dns_ingest_summary PARTITION (y={1}, m={2}, d={3}) VALUES {4};
            """).format(self._db, yr, mn, dy, tuple(df_final))
            impala.execute_query(query_to_insert) 
Example 32
Project: rhodonite   Author: nestauk   File: tabular.py    MIT License 5 votes
def vertices_to_dataframe(g, vectors=False, keys=None):
    """vertices_to_dataframe
    Transforms a graph's vertices and their properties into a tabular format.

    Parameters
    ----------
        g : :obj:`Graph`
            A graph.
        keys : :obj:`iter` of :obj:`str`, optional
            A list of property map keys to convert in to columns. If None, all 
            property maps are converted. Default is None.
    Returns
    -------
        vertex_df : :obj:`DataFrame` A dataframe where each row represents a 
            vertex and the columns are properties of those vertices. By default,
            the dataframe will contain a column with the vertex id.
    """
    vertex_df = pd.DataFrame(list(g.vertices()), columns=['v'], dtype='int')
    filt = g.get_vertex_filter()
    if filt[0] is not None:
        filt = filt[0].a > 0
        filtered = True
    else:
        filtered=False
        filt = np.array([True] * g.num_vertices())
    if keys is None:
        keys = list(g.vp.keys())
    for k, vp in g.vp.items():
        if k in keys:
            vt = vp.value_type()
            if ('vector' not in vt) & ('string' not in vt) & ('object' not in vt):
                if ('int' in vt) | ('bool' in vt):
                    vertex_df[k] = vp.get_array()[filt]
                    vertex_df[k] = vertex_df[k].astype(int)
                elif 'double' in vt:
                    vertex_df[k] = vp.get_array()[filt]
                    vertex_df[k] = vertex_df[k].astype(float)
            elif ('vector' in vt) & ('string' not in vt) & (vectors == True):
                vertex_df[k] = [[i for i in vp[v]] for v, f in zip(g.vertices(), filt) if f]
    return vertex_df 
Example 33
Project: rhodonite   Author: nestauk   File: tabular.py    MIT License 5 votes
def edges_to_dataframe(g, keys=None, drop_keys=[], sort=None, vectors=False):
    """edges_to_dataframe
    Transforms a graph's edges and their properties into a tabular format.

    Parameters
    ----------
        g : :obj:`Graph` 
            A graph.
        keys : :obj:`iter` of :obj:`str`, optional
            A list of property map keys to convert in
            to columns. If None, all property maps are converted. Default is
            None.
    Returns
    -------
        edge_df : :obj:`DataFrame` 
            A dataframe where each row represents an edge and the columns are 
            properties of those edges. By default, the dataframe will contain 
            a column for the source vertices and another for the target 
            vertices.
    """
    edge_df = pd.DataFrame(g.get_edges(), columns=['s', 't', 'e_index'], dtype='int')
    indices = edge_df['e_index'].values
    if keys is None:
        keys = list(g.ep.keys())
    for k, ep in g.ep.items():
        if k in keys:
            vt = ep.value_type()
            if ('vector' not in vt) & ('string' not in vt) & ('object' not in vt):
                if ('int' in vt) | ('bool' in vt):
                    edge_df[k] = ep.get_array()[indices]
                    edge_df[k] = edge_df[k].astype(int)
                elif 'double' in vt:
                    edge_df[k] = ep.get_array()[indices]
                    edge_df[k] = edge_df[k].astype(float)
            elif ('vector' in vt) & ('string' not in vt) & (vectors == True):
                edge_df[k] = [[i for i in ep[e]] for e in g.edges()]
    if sort:
        edge_df.sort_values(sort, inplace=True)
    return edge_df 
Example 34
Project: backtrader-cn   Author: pandalibin   File: utils.py    GNU General Public License v3.0 5 votes
def split_data(cls, data, percent=0.3):
        """
        Split the data into training data and test data.
        :param data(DataFrame): data to be split.
        :param percent(float): percent of data used as training data.
        :return: training data(DataFrame) and testing data(DataFrame)
        """

        rows = len(data)
        train_rows = math.floor(rows * percent)
        test_rows = rows - train_rows

        return data.iloc[:train_rows], data.iloc[-test_rows:] 
Example 35
Project: backtrader-cn   Author: pandalibin   File: utils.py    GNU General Public License v3.0 5 votes
def get_best_params(cls, al_results):
        """
        Get the best params, current algorithm is the largest total return rate.
        :param al_results(list): all the optional params and corresponding analysis data.
        :return: best params and corresponding analysis data(dict)
        """
        al_results_df = pd.DataFrame.from_dict(al_results)
        al_results_df = al_results_df.sort_values('total_return_rate', ascending=False)

        al_result_dict = al_results_df.iloc[0].to_dict()

        return al_result_dict 
Example 36
Project: backtrader-cn   Author: pandalibin   File: models.py    GNU General Public License v3.0 5 votes
def save_training_params(symbol, params):
    """
    save training params to library.
    :param symbol: str, arctic symbol
    :param params: dict, e.g.: {"ma_period_s": 1, "ma_period_l": 2, "stock_id": "600909"}
    :return: None
    """

    stock_id = params.ma_periods['stock_id']
    params_to_save = dict(params=params)
    df = pd.DataFrame([params_to_save], columns=params_to_save.keys(), index=[stock_id])

    # write to database
    # if library does not exist, create it
    lib = get_or_create_library(conf.STRATEGY_PARAMS_LIBNAME)

    if lib.has_symbol(symbol):
        logger.debug(
            f'symbol: {symbol} already exists, '
            f'change the params of stock {stock_id}, '
            f'then delete and write symbol: {symbol}.'
        )
        params_df = lib.read(symbol).data
        params_df.loc[stock_id, 'params'] = params
        lib.delete(symbol)
        lib.write(symbol, params_df)
    else:
        logger.debug(
            f'write the params of stock {stock_id} to symbol: {symbol}'
        )
        lib.write(symbol, df) 
Example 37
Project: backtrader-cn   Author: pandalibin   File: test_strategies_ma.py    GNU General Public License v3.0 5 votes
def _test_get_params_list(self):
        training_data = pd.DataFrame(np.random.rand(100, 2))
        params_list = bsm.MATrendStrategy.get_params_list(training_data, '000651')

        self.assertEqual(len(params_list), 29) 
Example 38
Project: backtrader-cn   Author: pandalibin   File: test_datas_tushare.py    GNU General Public License v3.0 5 votes
def _test_download_delta_data_initial_no_data(self, mock_get_hist_data):
        mock_hist_data = pd.DataFrame()

        mock_get_hist_data.return_value = mock_hist_data

        coll_name = '000651'
        ts_his_data = bdt.TsHisData(coll_name)

        lib = models.get_library(conf.CN_STOCK_LIBNAME)
        lib.delete(coll_name)

        ts_his_data.download_delta_data()

        self.assertFalse(lib.has_symbol(coll_name)) 
Example 39
Project: backtrader-cn   Author: pandalibin   File: test_datas_tushare.py    GNU General Public License v3.0 5 votes
def _test_download_delta_data_initial(self, mock_get_hist_data):
        mock_hist_data = pd.DataFrame(data={
            'open': [10, 11],
            'high': [12, 13],
            'close': [14, 15],
            'low': [16, 17],
            'volume': [18, 19],
            'price_change': [20, 21],
            'p_change': [22, 23],
            'ma5': [24, 25],
            'ma10': [26, 27],
            'ma20': [28, 29],
            'v_ma5': [30, 31],
            'v_ma10': [32, 33],
            'v_ma20': [34, 35],
            'turnover': [36, 37]
        }, index=['2017-01-01', '2017-01-02'])

        mock_get_hist_data.return_value = mock_hist_data

        coll_name = '000651'
        ts_his_data = bdt.TsHisData(coll_name)

        lib = models.get_library(conf.CN_STOCK_LIBNAME)
        lib.delete(coll_name)

        ts_his_data.download_delta_data()

        hist_data_000651 = ts_his_data.get_data()

        self.assertEqual(len(hist_data_000651), 2) 
Example 40
Project: backtrader-cn   Author: pandalibin   File: test_datas_tushare.py    GNU General Public License v3.0 5 votes
def _test_download_delta_data_no_data(self, mock_get_hist_data):
        coll_name = '000651'
        ts_his_data = bdt.TsHisData(coll_name)

        mock_delta_data = pd.DataFrame()
        mock_get_hist_data.return_value = mock_delta_data

        ts_his_data.download_delta_data()

        hist_data_000651 = ts_his_data.get_data()

        self.assertEqual(len(hist_data_000651), 2) 
Example 41
Project: backtrader-cn   Author: pandalibin   File: test_datas_tushare.py    GNU General Public License v3.0 5 votes
def _test_download_delta_data(self, mock_get_hist_data):
        coll_name = '000651'
        ts_his_data = bdt.TsHisData(coll_name)

        yesterday = dt.datetime.now() - dt.timedelta(days=1)
        mock_delta_data = pd.DataFrame(data={
            'open': 38,
            'high': 39,
            'close': 40,
            'low': 41,
            'volume': 42,
            'price_change': 43,
            'p_change': 44,
            'ma5': 45,
            'ma10': 46,
            'ma20': 47,
            'v_ma5': 48,
            'v_ma10': 49,
            'v_ma20': 50,
            'turnover': 51
        }, index=[dt.datetime.strftime(yesterday, '%Y-%m-%d')])

        mock_get_hist_data.return_value = mock_delta_data

        ts_his_data.download_delta_data()

        hist_data_000651 = ts_his_data.get_data()

        self.assertEqual(len(hist_data_000651), 3)
        self.assertEqual(dt.datetime.strftime(hist_data_000651.index[-1], '%Y-%m-%d'),
                         dt.datetime.strftime(yesterday, '%Y-%m-%d'))
        lib = models.get_library(conf.CN_STOCK_LIBNAME)
        lib.delete(coll_name) 
Example 42
Project: osqf2015   Author: mvaz   File: model.py    MIT License 5 votes
def from_blaze(cls, filename, date_col='Date', value_col='Close'):
        df = bz.odo(filename, pd.DataFrame)[[date_col, value_col]] #[1000:1100]
        df = df.rename(columns = {value_col: 'Value'})
        ts = df.set_index(date_col)
        return cls(ts) 
Example 43
Project: osqf2015   Author: mvaz   File: model.py    MIT License 5 votes
def __init__(self):
        super(StockModel, self).__init__()
        file_name = "notebooks/db2.bcolz"
        self.df = bz.odo(file_name, pd.DataFrame)[['Date', 'Close']] #[1000:1100]
        self.devol()
        self.returns_df = None 
Example 44
Project: kipet   Author: salvadorgarciamunoz   File: validate_templatebuilder.py    GNU General Public License v3.0 5 votes
def test_add_spectral_data(self):

        builder = TemplateBuilder()
        d_frame = pd.DataFrame()
        self.assertFalse(builder.has_spectral_data())
        builder.add_spectral_data(d_frame)
        self.assertTrue(builder.has_spectral_data()) 
Example 45
Project: kipet   Author: salvadorgarciamunoz   File: validate_templatebuilder.py    GNU General Public License v3.0 5 votes
def test_add_absorption_data(self):
        builder = TemplateBuilder()
        s_frame = pd.DataFrame()
        self.assertFalse(builder.has_adsorption_data())
        builder.add_absorption_data(s_frame)
        self.assertTrue(builder.has_adsorption_data()) 
Example 46
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0 5 votes
def write_spectral_data_to_csv(filename,dataframe):
    """ Write spectral data Dij to csv file.
    
        Args:
            filename (str): name of output file
          
            dataframe (DataFrame): pandas DataFrame
        
        Returns:
            None

    """
    dataframe.to_csv(filename) 
Example 47
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0 5 votes
def write_absorption_data_to_csv(filename,dataframe):
    """ Write absorption data Sij to csv file.
    
        Args:
            filename (str): name of output file
          
            dataframe (DataFrame): pandas DataFrame
        
        Returns:
            None

    """
    dataframe.to_csv(filename) 
Example 48
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0 5 votes
def write_concentration_data_to_csv(filename,dataframe):
    """ Write concentration data Cij to csv file.
    
        Args:
            filename (str): name of output file
          
            dataframe (DataFrame): pandas DataFrame
        
        Returns:
            None

    """
    dataframe.to_csv(filename) 
Example 49
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0 5 votes
def read_concentration_data_from_txt(filename):
    """ Reads txt with concentration data
    
        Args:
            filename (str): name of input file
          
        Returns:
            DataFrame

    """

    f = open(filename,'r')
    data_dict = dict()
    set_index = set()
    set_columns = set()

    for line in f:
        if line not in ['','\n','\t','\t\n']:
            l=line.split()
            i = float(l[0])
            j = l[1]
            k = float(l[2])
            set_index.add(i)
            set_columns.add(j)
            data_dict[i,j] = k
    f.close()
    
    data_array = np.zeros((len(set_index),len(set_columns)))
    sorted_index = sorted(set_index)
    sorted_columns = sorted(set_columns)

    for i,idx in enumerate(sorted_index):
        for j,jdx in enumerate(sorted_columns):
            data_array[i,j] = data_dict[idx,jdx]

    return pd.DataFrame(data=data_array,columns=sorted_columns,index=sorted_index) 
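The read/write helpers in this file exchange data through a simple whitespace-delimited long format, one "index column value" triple per line. A self-contained round trip (the file name and values are only illustrative):

import pandas as pd

df = pd.DataFrame({'A': [1.0, 0.5], 'B': [0.0, 0.5]}, index=[0.0, 1.0])

# write: one "index column value" line per cell
with open('C_example.txt', 'w') as f:
    for i in df.index:
        for j in df.columns:
            f.write("{0} {1} {2}\n".format(i, j, df.loc[i, j]))

# read back: parse the long format and pivot to a wide frame
long_df = pd.read_csv('C_example.txt', sep=' ', names=['time', 'component', 'value'])
wide_df = long_df.pivot(index='time', columns='component', values='value')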
Example 50
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0 5 votes
def read_spectral_data_from_csv(filename, instrument = False, negatives_to_zero = False):
    """ Reads csv with spectral data
    
        Args:
            filename (str): name of input file
            instrument (bool): if data is direct from instrument
            negatives_to_zero (bool): if data contains negatives and baseline shift is not
                                        done then this forces negative values to zero.

        Returns:
            DataFrame

    """

    data = pd.read_csv(filename,index_col=0)
    if instrument:
        #this means we probably have a date/timestamp on the columns
        data = pd.read_csv(filename,index_col=0, parse_dates = True)
        data = data.T
        for n in data.index:
            h,m,s = n.split(':')
            sec = (float(h)*60+float(m))*60+float(s)
            data.rename(index={n:sec}, inplace=True)
        data.index = [float(n) for n in data.index]
    else:
        data.columns = [float(n) for n in data.columns]

    #If we have negative values then this makes them equal to zero
    if negatives_to_zero:
        for t in (data.index):
            for l in data.columns:
                if data.loc[t,l] < 0:
                    data.loc[t,l] = 0.0

    return data