Python pandas.concat() Examples

The following code examples show how to use pandas.concat(). They are taken from open source Python projects.
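
Before diving into the project examples, here is a minimal, self-contained sketch (using made-up toy frames, not code from any of the projects below) of the two pandas.concat() patterns you will see repeated throughout: stacking DataFrames row-wise and aligning them column-wise.

import pandas as pd

# toy frames for illustration only
a = pd.DataFrame({"x": [1, 2], "y": [3, 4]})
b = pd.DataFrame({"x": [5, 6], "y": [7, 8]})

# stack rows; ignore_index=True builds a fresh 0..n-1 index
rows = pd.concat([a, b], ignore_index=True)

# align on the index and place the frames side by side as new columns (axis=1)
cols = pd.concat([a, b.add_suffix("_b")], axis=1)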

Example 1
Project: PEAKachu   Author: tbischler   File: window.py    ISC License
def perform_g_test_with_repl_for_windows(self):
        if self._window_df.empty:
            return
        p_values = self._window_df.apply(self._single_g_test, axis=1)
        padj_values = p_values.loc[
            :, ["pooled_G_p_value", "total_G_p_value"]].apply(
                self._correct_p_values, axis=0)
        padj_values.columns = [col_name.replace("p_value", "padj_value")
                               for col_name in padj_values.columns]
        padj_values.index = p_values.index
        significance = pd.concat([p_values, padj_values], axis=1).apply(
            self._check_significance_with_repl, axis=1)
        significance.name = "significant"
        self._window_df = pd.concat([self._window_df, p_values, padj_values,
                                     significance], axis=1)
        self._plot_and_write_windows_gfold() 
Example 2
Project: PEAKachu   Author: tbischler   File: window.py    ISC License
def perform_g_test_without_repl_for_windows(self):
        if self._window_df.empty:
            return
        p_values = self._window_df.apply(self._single_g_test, axis=1)
        padj_values = p_values.apply(self._correct_p_values, axis=0)
        padj_values.columns = [
            col_name.replace("p_value", "padj_value")
            for col_name in padj_values.columns]
        padj_values.index = p_values.index
        significance = pd.concat(
            [p_values, padj_values],
            axis=1).apply(self._check_significance_without_repl, axis=1)
        significance.name = "significant"
        self._window_df = pd.concat([self._window_df, p_values, padj_values,
                                     significance], axis=1)
        self._plot_and_write_windows_gfold() 
Example 3
Project: PEAKachu   Author: tbischler   File: window.py    ISC License
def run_deseq2_for_windows(self):
        count_df = self._window_df.loc[:, self._exp_lib_list +
                                       self._ctr_lib_list]
        deseq2_runner = DESeq2Runner(count_df)
        result_df, self._size_factors = deseq2_runner.run_deseq2(
            self._exp_lib_list, self._ctr_lib_list, self._size_factors,
            self._pairwise_replicates)
        # normalize counts
        self._window_df[self._lib_names_list] = self._window_df[
            self._lib_names_list].div(self._size_factors, axis='columns')
        # append DESeq2 output
        self._window_df = pd.concat([self._window_df, result_df], axis=1)
        # check significance
        self._window_df["significant"] = self._window_df.apply(
            self._check_significance_with_deseq, axis=1)
        self._plot_and_write_windows_deseq() 
Example 4
Project: PEAKachu   Author: tbischler   File: window.py    ISC License
def _run_deseq2_peaks(self):
        peak_df = pd.DataFrame()
        for replicon in sorted(self._replicon_dict):
            if self._replicon_dict[replicon]["peak_df"].empty:
                continue
            peak_df = peak_df.append(self._replicon_dict[replicon]["peak_df"],
                                     ignore_index=True)
        count_df = peak_df.loc[:, self._exp_lib_list + self._ctr_lib_list]
        deseq2_runner = DESeq2Runner(count_df)
        result_df, self._size_factors = deseq2_runner.run_deseq2(
            self._exp_lib_list, self._ctr_lib_list, self._size_factors,
            self._pairwise_replicates)
        # normalize counts
        peak_df[self._lib_names_list] = peak_df[
            self._lib_names_list].div(self._size_factors, axis='columns')
        # append DESeq2 output
        peak_df = pd.concat([peak_df, result_df], axis=1)
        for replicon in sorted(self._replicon_dict):
            self._replicon_dict[replicon]["peak_df"] = peak_df[
                peak_df.replicon == replicon] 
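
A side note on the .append(...) calls in this and several later examples: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and the recommended replacement is the function this page covers. A minimal toy sketch (made-up frames, not project code):

import pandas as pd

pieces = [pd.DataFrame({"v": [1, 2]}), pd.DataFrame({"v": [3]})]
combined = pd.concat(pieces, ignore_index=True)  # replaces repeated df.append(...)
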
Example 5
Project: PEAKachu   Author: tbischler   File: gtest.py    ISC License
def _total_gtest(self):
        '''Use the maximum over all g-value sums for all possible replicate
           combinations. Using the mean instead would also be possible.
        '''
        g_value_dict = {}
        ctr_counts = self._rep_df.loc[:, "ctr_counts"]
        for comb_iter in range(len(self._rep_df.index)):
            tagged_counts = self._rep_df.loc[:, "tagged_counts"]
            comb_df = pd.concat([ctr_counts.reset_index(drop=True),
                                 tagged_counts.iloc[comb_iter:].append(
                                     tagged_counts.iloc[
                                         0:comb_iter]).reset_index(drop=True)],
                                axis=1)
            rep_g_values = comb_df.apply(self._gtest, axis=1)
            rep_p_values = rep_g_values.apply(self._g_to_p_value, args=(1,))
            tot_g_value = rep_g_values.sum()
            g_value_dict[tot_g_value] = rep_p_values
        self._total_g_res["g_value"] = max(g_value_dict.keys())
        self._replicate_p_values = g_value_dict[self._total_g_res["g_value"]]
        # Degrees of freedom = replicate number
        self._total_g_res["dof"] = len(self._rep_df.index)
        # Calculate p-value
        self._total_g_res["p_value"] = self._g_to_p_value(
            self._total_g_res["g_value"], self._total_g_res["dof"]) 
Example 6
Project: GreenGuard   Author: D3-AI   File: data.py    MIT License
def _load_turbine_readings(readings_path, target_times, signals):
    turbine_id = target_times.turbine_id.iloc[0]
    turbine_path = os.path.join(readings_path, turbine_id)
    filenames = sorted(os.listdir(turbine_path))
    filenames = _filter_by_filename(target_times, filenames)

    readings = list()
    for readings_file in filenames:
        readings_file_path = os.path.join(turbine_path, readings_file)
        data = _load_readings_file(readings_file_path)
        data = _filter_by_signal(data, signals)
        data = _filter_by_timestamp(data, target_times)

        readings.append(data)

    if readings:
        readings = pd.concat(readings)
    else:
        readings = pd.DataFrame(columns=['timestamp', 'signal_id', 'value', 'turbine_id'])

    LOGGER.info('Loaded %s readings from turbine %s', len(readings), turbine_id)

    return readings 
Example 7
Project: GreenGuard   Author: D3-AI   File: csv.py    MIT License
def load(self, target_times, window_size, signals=None, debug=False):
        if isinstance(target_times, str):
            target_times = pd.read_csv(target_times)
            target_times['cutoff_time'] = pd.to_datetime(target_times['cutoff_time'])

        if isinstance(signals, pd.DataFrame):
            signals = signals.signal_id

        window_size = pd.to_timedelta(window_size)
        timestamps = self._get_timestamps(target_times, window_size)

        readings = list()
        for turbine_id in timestamps.turbine_id.unique():
            readings.append(self._load_turbine(turbine_id, timestamps, signals))

        dask_scheduler = 'single-threaded' if debug else None
        computed = dask.compute(*readings, scheduler=dask_scheduler)
        readings = pd.concat((c for c in computed if len(c)), ignore_index=True, sort=False)

        LOGGER.info('Loaded %s turbine readings', len(readings))

        return readings 
Example 8
Project: tensorflow-DeepFM   Author: ChenglongChen   File: DataReader.py    MIT License
def gen_feat_dict(self):
        if self.dfTrain is None:
            dfTrain = pd.read_csv(self.trainfile)
        else:
            dfTrain = self.dfTrain
        if self.dfTest is None:
            dfTest = pd.read_csv(self.testfile)
        else:
            dfTest = self.dfTest
        df = pd.concat([dfTrain, dfTest])
        self.feat_dict = {}
        tc = 0
        for col in df.columns:
            if col in self.ignore_cols:
                continue
            if col in self.numeric_cols:
                # map to a single index
                self.feat_dict[col] = tc
                tc += 1
            else:
                us = df[col].unique()
                self.feat_dict[col] = dict(zip(us, range(tc, len(us)+tc)))
                tc += len(us)
        self.feat_dim = tc 
Example 9
Project: models   Author: kipoi   File: dataloader_m.py    MIT License
def prepro_pos_table(pos_tables):
    """Extracts unique positions and sorts them."""
    if not isinstance(pos_tables, list):
        pos_tables = [pos_tables]

    pos_table = None
    for next_pos_table in pos_tables:
        if pos_table is None:
            pos_table = next_pos_table
        else:
            pos_table = pd.concat([pos_table, next_pos_table])
        pos_table = pos_table.groupby('chromo').apply(
            lambda df: pd.DataFrame({'pos': np.unique(df['pos'])}))
        pos_table.reset_index(inplace=True)
        pos_table = pos_table[['chromo', 'pos']]
        pos_table.sort_values(['chromo', 'pos'], inplace=True)
    return pos_table 
Example 10
Project: AlitaNet   Author: iFe1er   File: Alita_DeepFM_MFM.py    MIT License
def AutoInt(self,embedding,AutoInt_Weights,layer):
        querys=tf.tensordot(embedding,AutoInt_Weights['W_query_'+str(layer+1)],axes=[[2],[0]])#(N,f,k) * (k,d*head)= N,f,d*head
        keys=  tf.tensordot(embedding,AutoInt_Weights['W_key_'+str(layer+1)],axes=[[2],[0]])#N,f,d*head
        values = tf.tensordot(embedding, AutoInt_Weights['W_value_'+str(layer+1)], axes=[[2], [0]])#N,f,d*head

        #head,N,f,d
        querys = tf.stack(tf.split(querys, self.autoint_head, axis=2))#default axis=0 (head,N,f,d)
        keys = tf.stack(tf.split(keys, self.autoint_head, axis=2))
        values = tf.stack(tf.split(values, self.autoint_head, axis=2))

        self.normalize_autoint_att_score=tf.nn.softmax(tf.matmul(querys,keys,transpose_b=True))#heads,N,f,f
        out=tf.matmul(self.normalize_autoint_att_score,values)#heads,N,f,d
        out = tf.concat(tf.split(out, self.autoint_head, axis=0), axis=-1)  # heads tensors of (N,f,d) --concat--> (1,N,f,d*heads)
        out = tf.squeeze(out, axis=0)  # (N,f,d*heads)

        if self.autoint_params['use_res']:
            #                 first round:N,f,k    k,d*heads  | others:  N,f,d*heads  d*heads,d*heads
            out+=tf.tensordot(embedding,AutoInt_Weights['W_res_'+str(layer+1)],axes=[[2],[0]])

        if self.autoint_params['relu']:
            out=tf.nn.relu(out)
        return out 
Example 11
Project: AlitaNet   Author: iFe1er   File: Alita_DeepFM.py    MIT License
def AutoInt(self,embedding,AutoInt_Weights,layer):
        querys=tf.tensordot(embedding,AutoInt_Weights['W_query_'+str(layer+1)],axes=[[2],[0]])#(N,f,k) * (k,d*head)= N,f,d*head
        keys=  tf.tensordot(embedding,AutoInt_Weights['W_key_'+str(layer+1)],axes=[[2],[0]])#N,f,d*head
        values = tf.tensordot(embedding, AutoInt_Weights['W_value_'+str(layer+1)], axes=[[2], [0]])#N,f,d*head

        #head,N,f,d
        querys = tf.stack(tf.split(querys, self.autoint_head, axis=2))#default axis=0 (head,N,f,d)
        keys = tf.stack(tf.split(keys, self.autoint_head, axis=2))
        values = tf.stack(tf.split(values, self.autoint_head, axis=2))

        self.normalize_autoint_att_score=tf.nn.softmax(tf.matmul(querys,keys,transpose_b=True))#heads,N,f,f
        out=tf.matmul(self.normalize_autoint_att_score,values)#heads,N,f,d
        out = tf.concat(tf.split(out, self.autoint_head, axis=0), axis=-1)  # heads tensors of (N,f,d) --concat--> (1,N,f,d*heads)
        out = tf.squeeze(out, axis=0)  # (N,f,d*heads)

        if self.autoint_params['use_res']:
            #                 first round:N,f,k    k,d*heads  | others:  N,f,d*heads  d*heads,d*heads
            out+=tf.tensordot(embedding,AutoInt_Weights['W_res_'+str(layer+1)],axes=[[2],[0]])

        if self.autoint_params['relu']:
            out=tf.nn.relu(out)
        return out 
Example 12
Project: pymapd-examples   Author: omnisci   File: OKR_techsup_ga.py    Apache License 2.0
def format_data(response):
    reports = response['reports'][0]
    columnHeader = reports['columnHeader']['dimensions']
    metricHeader = reports['columnHeader']['metricHeader']['metricHeaderEntries']

    columns = columnHeader
    for metric in metricHeader:
        columns.append(metric['name'])
    data = json_normalize(reports['data']['rows'])
    data_dimensions = pd.DataFrame(data['dimensions'].tolist())
    data_metrics = pd.DataFrame(data['metrics'].tolist())
    data_metrics = data_metrics.applymap(lambda x: x['values'])
    data_metrics = pd.DataFrame(data_metrics[0].tolist())
    result = pd.concat([data_dimensions, data_metrics], axis=1, ignore_index=True)
    result.columns = ["blog_title", "blog_url", "referral_path", "c1_timestamp", "geo_city_code", "unique_pageviews", "time_on_page"] # set the column names
    return (result) 
Example 13
Project: nba_scraper   Author: mcbarlowe   File: scrape_functions.py    GNU General Public License v3.0
def main_scrape(game_id):
    """
    This is the main function that runs and ties everything together. It is
    structured this way to make it easier to write tests that work on Travis CI,
    whose IPs are blacklisted by NBA.com.

    Inputs:
    game_id     - NBA game id of game to be scraped

    Outputs:
    game_df     - pandas dataframe of the play by play
    """

    v2_dict = get_pbp_api(game_id)
    game_df = scrape_pbp(v2_dict)
    periods = []
    for period in range(1, game_df["period"].max() + 1):
        lineups = get_lineup_api(game_id, period)
        periods.append(
            get_lineup(game_df[game_df["period"] == period].copy(), lineups, game_df,)
        )
    game_df = pd.concat(periods).reset_index(drop=True)

    return game_df 
Example 14
Project: LHMP   Author: hydrogo   File: gr4j_cemaneige.py    GNU General Public License v3.0
def interaction(river_name, path_to_scheme, path_to_observations,\
    X1, X2, X3, X4, X5, X6):

    # simulate our modeled hydrograph
    data = dataframe_construction(path_to_scheme)
    data['Qsim'] = simulation(data, [X1, X2, X3, X4, X5, X6])

    # read observations
    obs = pd.read_csv(path_to_observations, index_col=0, parse_dates=True,
                      squeeze=True, header=None, names=['Date', 'Qobs'])

    # concatenate data
    data = pd.concat([data, obs], axis=1)

    # calculate efficiency criterion
    # slice data only for observational period and drop NA values
    data_for_obs = data.loc[obs.index, ['Qsim', 'Qobs']].dropna()
    eff = NS(data_for_obs['Qobs'], data_for_obs['Qsim'])

    # plot
    ax = data.loc[obs.index, ['Qsim', 'Qobs']].plot(figsize=(10, 7), style=['b-', 'k.'])
    ax.set_title(river_name + ' daily runoff modelling, ' + 'Nash-Sutcliffe efficiency: {}'.format(np.round(eff, 2))) 
Example 15
Project: LHMP   Author: hydrogo   File: simhyd_cemaneige.py    GNU General Public License v3.0
def interaction(river_name, path_to_scheme, path_to_observations,\
    INSC, COEFF, SQ, SMSC, SUB, CRAK, K, etmul, DELAY, X_m, X5, X6):

    # simulate our modeled hydrograph
    data = dataframe_construction(path_to_scheme)
    data['Qsim'] = simulation(data, [INSC, COEFF, SQ, SMSC, SUB, CRAK, K,\
    etmul, DELAY, X_m, X5, X6])

    # read observations
    obs = pd.read_csv(path_to_observations, index_col=0, parse_dates=True,
                      squeeze=True, header=None, names=['Date', 'Qobs'])

    # concatenate data
    data = pd.concat([data, obs], axis=1)

    # calculate efficiency criterion
    # slice data only for observational period and drop NA values
    data_for_obs = data.loc[obs.index, ['Qsim', 'Qobs']].dropna()
    eff = NS(data_for_obs['Qobs'], data_for_obs['Qsim'])

    # plot
    ax = data.loc[obs.index, ['Qsim', 'Qobs']].plot(figsize=(10, 7), style=['b-', 'k.'])
    ax.set_title(river_name + ' daily runoff modelling, ' + 'Nash-Sutcliffe efficiency: {}'.format(np.round(eff, 2))) 
Example 16
Project: gullikson-scripts   Author: kgullikson88   File: HDF5_Helpers.py    MIT License
def _make_cache(self, addmode='all', update_cache=True, cache_fname='CCF_metadata.csv'):
        """ Read through all the datasets in each CCF interface, pulling the metadata.
        """
        if self._cache is not None:
            logging.info('Cache already loaded! Not reloading!')
            return

        if not update_cache and os.path.exists(cache_fname):
            logging.info('Reading pre-made cache from {}'.format(cache_fname))
            self._cache = pd.read_csv(cache_fname)
            return

        logging.info('Reading HDF5 metadata for faster access later')
        dataframes = []
        for inst in self._interfaces.keys():
            logging.debug('Generating cache for instrument {}'.format(inst))
            interface = self._interfaces[inst]
            data = interface._compile_data(starname=None, date=None, addmode=addmode, read_ccf=False)
            data['Instrument'] = inst
            dataframes.append(data)

        self._cache = pd.concat(dataframes)
        self._cache.to_csv(cache_fname) 
Example 17
Project: IPToCC   Author: roniemartinez   File: __init__.py    MIT License
def get_rir_database():
    global lock
    global _rir_database
    global _countries
    if _rir_database is None:
        with lock:
            if _rir_database is None:
                logger.info('Loading RIR databases')
                _rir_database = pandas.concat(read_rir_databases())
                _rir_database = _rir_database[((_rir_database['Type'] == 'ipv4') | (_rir_database['Type'] == 'ipv6')) &
                                              (_rir_database['Type'] != '*')]
                _rir_database[['Start', 'End']] = _rir_database.apply(convert_to_ip_object, axis=1,
                                                                      result_type='expand')
                countries = pandas.read_csv(
                    os.path.join(os.path.dirname(os.path.abspath(__file__)), 'iso3166.csv'),
                    names=['country_code', 'country_name']
                )
                _countries = dict(zip(countries['country_code'].values, countries['country_name'].values))
                logger.info('RIR databases loaded')
    return _rir_database 
Example 18
Project: wrangle   Author: autonomio   File: wrangler_utils.py    MIT License
def string_contains_to_binary(data, col_that_contains, col_contains_strings):

    '''Convert String to Binary

    WHAT: Deals with cases where string values are converted into binary based
    on a value contained in the string.

    '''

    temp_contains = pd.DataFrame()

    if type(col_that_contains) is str:
        col_that_contains = [col_that_contains]

    for col in col_that_contains:
        count = 0
        for string in col_contains_strings:
            temp_contains = pd.concat([temp_contains, data[col].str.contains(string)], axis=1)
            count += 1

        temp_contains.columns = col_contains_strings
        temp_contains = temp_contains.astype(float)

    return temp_contains 
Example 19
Project: Feature-Stuff   Author: hiflyin   File: categorical.py    MIT License
def add_dummies(df, cols = None, drop = True):
    '''
    Inputs:
        df: a pandas Dataframe containing the columns to add dummies for.
        cols: a list or array of the names of the columns to dummy. If None (default), all object columns are taken.
        drop: if the categorical columns are to be dropped after adding the dummies. Default = True.

    Output: the dataframe with the added dummies. NaNs will be ignored rather than considered a distinct category.

    TO DO: TypeErrors?
    '''

    if cols is None:
        cols = [col for col in df.columns if df[col].dtype == 'object']

    for col in cols:
        dummies = pd.get_dummies(df[col], prefix=col).astype(np.int8)
        df = pd.concat([df, dummies], axis=1)
        if drop:
            df.drop([col], inplace=True, axis=1)

    del dummies
    gc.collect()
    return(df) 
Example 20
Project: PEAKachu   Author: tbischler   File: window.py    ISC License
def _convert_to_data_frame(self):
        self._window_df = pd.DataFrame()
        for replicon in sorted(self._replicon_dict):
            for strand in ["+", "-"]:
                # add window positions to data frame
                row_number = len(self._replicon_dict[replicon]["window_list"])
                df = pd.concat([
                    pd.Series([replicon] * row_number),
                    pd.Series([strand] * row_number),
                    pd.Series([window[0]+1 for window in
                               self._replicon_dict[
                                   replicon]["window_list"]]),
                    pd.Series([window[1] for window in
                               self._replicon_dict[
                        replicon]["window_list"]])], axis=1)
                df.columns = ["replicon", "strand", "w_start", "w_end"]
                # add library counts to data frame
                for lib_name, lib in self._lib_dict.items():
                    df[lib_name] = (pd.Series(lib.replicon_dict[
                        replicon]["window_counts"].loc[:, strand]))
                self._window_df = self._window_df.append(df,
                                                         ignore_index=True)
            del self._replicon_dict[replicon]["window_list"]
        # remove windows without expression in any library
        print("Removing empty windows from DataFrame with {} rows...".format(
            len(self._window_df.index)), flush=True)
        t_start = time()
        self._window_df = self._window_df.loc[
            (self._window_df.loc[:, self._lib_names_list].sum(axis=1) > 0), :]
        t_end = time()
        print("Removal took {} seconds. DataFrame contains now {} rows.".
              format((t_end-t_start), len(self._window_df.index)), flush=True)
        if self._window_df.empty:
            print("**Dataframe empty**", flush=True)
            return
        if self._stat_test == "gtest":
            self._run_gtest_preprocessing()
        elif self._stat_test == "deseq":
            self._run_deseq_preprocessing() 
Example 21
Project: PEAKachu   Author: tbischler   File: window.py    ISC License
def combine_peaks_and_recalculate_values(self):
        for replicon in self._replicon_dict:
            # forward strand
            peak_df_forward = self._combine_windows(
                self._window_df[
                    (self._window_df["replicon"] == replicon) & (
                        self._window_df["strand"] == "+")])
            if not peak_df_forward.empty:
                peak_df_forward["peak_strand"] = '+'
            # reverse strand
            peak_df_reverse = self._combine_windows(self._window_df[
                (self._window_df["replicon"] == replicon) & (
                    self._window_df["strand"] == "-")])
            if not peak_df_reverse.empty:
                peak_df_reverse["peak_strand"] = '-'
            # combined
            self._replicon_dict[replicon]["peak_df"] = pd.concat([
                peak_df_forward, peak_df_reverse], axis=0, ignore_index=True,
                sort=False)
        del self._window_df
        self._generate_peak_counts()
        for replicon in sorted(self._replicon_dict):
            for lib_name, lib in self._lib_dict.items():
                self._replicon_dict[
                    replicon]["peak_df"][lib_name] = lib.replicon_dict[
                        replicon]["peak_counts"]
            self._replicon_dict[replicon]["peak_df"]["replicon"] = replicon
        if self._stat_test == "gtest":
            self._run_gtest_peaks()
        elif self._stat_test == "deseq":
            self._run_deseq2_peaks() 
Example 22
Project: PEAKachu   Author: tbischler   File: window.py    ISC License
def _perform_g_test_without_repl_for_peaks(self, df):
        if df.empty:
            return df
        p_values = df.apply(self._single_g_test, axis=1)
        padj_values = p_values.apply(self._correct_p_values, axis=0)
        padj_values.columns = [
            col_name.replace("p_value", "padj_value")
            for col_name in padj_values.columns]
        padj_values.index = p_values.index
        df = pd.concat([df, p_values, padj_values], axis=1)
        return df 
Example 23
Project: Wide-Residual-Nets-for-SETI   Author: sgrvinod   File: average_scores.py    Apache License 2.0
def average_scores(input_folder, output_path):
    """
    Averages scores of several CSV files generated by test.py

    Args:
        input_folder (path): folder with models' scores' CSVs in it.
        output_path (path): path of output CSV file with averaged scores, ready for submission to SETI scoreboards
    """
    csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]
    model_scores = []
    for i, csv in enumerate(csv_files):
        df = pd.read_csv(os.path.join(input_folder, csv), index_col=0, header=None)
        if i == 0:
            index = df.index
        else:
            assert index.equals(df.index), "Indices of one or more files do not match!"
        model_scores.append(df)
    print "Read %d files. Averaging..." % len(model_scores)

    concat_scores = pd.concat(model_scores)
    averaged_scores = concat_scores.groupby(level=0).mean()
    assert averaged_scores.shape[0] == len(list(index)), "Something went wrong when concatenating/averaging!"
    averaged_scores = averaged_scores.reindex(index)

    averaged_scores.to_csv(output_path, header=False, index=True)
    print "Averaged scores saved to %s" % output_path 
Example 24
Project: prediction-constrained-topic-models   Author: dtak   File: select_best_runs_and_snapshots.py    MIT License
def load_df_from_all_folders_matching_list_of_patterns(
        list_of_path_patterns=None,
        legend_name=None,
        y_ind=0,
        column_names=None,
        query_str=None,
        task_ids=None,
        **kwargs):
    pprint(">>> BEGIN load_df_from_all_folders_that_match_pattern")
    list_of_match_df = list()
    for path_pattern in list_of_path_patterns:
        cur_alg_df = load_df_from_all_folders_that_match_pattern(
            path_pattern,
            y_ind=y_ind,
            task_ids=task_ids,
            column_names=column_names)
        if query_str is not None:
            cur_alg_df = cur_alg_df.query(query_str).copy()

        # Append to list of all matching dataframes
        list_of_match_df.append(cur_alg_df)



    # Create all matching DataFrame
    all_matching_runs_df = pd.concat(list_of_match_df)
    pprint("<<< END   load_df_from_all_folders_that_match_pattern")
    return all_matching_runs_df


######################
## Funcs that select best df 
Example 25
Project: prediction-constrained-topic-models   Author: dtak   File: select_best_runs_and_snapshots.py    MIT License
def make_best_task_df(
        df,
        target_query="SPLIT_NAME == 'VALID' and LAP > 50",
        score_colname='Y_ERROR_RATE',
        score_ranking_func=np.argmin,
        default_score=None,
        verbose=False):
    ''' Find best task for each unique job in provided df.

    Returns
    -------
    best_df : dataframe of best tasks for each unique job
    '''
    if default_score is None:
        default_score = fetch_default_score(score_ranking_func.__name__)
    best_task_df_list = list()
    job_paths = np.unique(df['JOB_PATH'].values)
    for job_path in job_paths:
        if job_path is None:
            continue
        job_df = df.query("JOB_PATH == '%s'" % job_path)
        taskids = np.unique(job_df['TASKID'].values)
        best_score_idx = np.zeros_like(taskids, dtype=np.int32)
        best_score = default_score * np.ones_like(taskids, dtype=np.float64)
        for tt, taskidstr in enumerate(taskids):
            task_df = job_df.query(target_query + " and TASKID == '%s'" % taskidstr)
            if task_df.shape[0] < 1:
                continue
            if not np.all(np.isfinite(task_df[score_colname].values)):
                pprint(task_df[score_colname].values)
            best_score_idx[tt] = score_ranking_func(task_df[score_colname].values)
            best_score[tt] = task_df[score_colname].values[best_score_idx[tt]]
        best_task_idx = score_ranking_func(best_score)
        best_task_df = job_df.query("TASKID == '%s'" % taskids[best_task_idx])
        best_task_df_list.append(best_task_df)
        if verbose:
            pprint(job_path)
            pprint("best task: %s" % best_task_idx)
    return pd.concat(best_task_df_list) 
Example 26
Project: prediction-constrained-topic-models   Author: dtak   File: select_best_runs_and_snapshots.py    MIT License
def load_df_from_all_folders_that_match_pattern(
        src_path_pattern='',
        task_ids='1',
        when_task_path_does_not_exist='continue',
        when_split_csv_does_not_exist='raise_error',
        y_ind=0,
        column_names=None,
        output_data_type='binary',
        engine=None,
        csv_pattern='snapshot_perf_metrics_%s.csv'):
    ''' Load results from many folders that match a pattern into data frame.

    Aggregates results from many pipelines.

    Returns
    -------
    df : pandas DataFrame
    '''
    src_path_list = [s for s in sorted(glob.glob(src_path_pattern))]
    if len(src_path_list) == 0:
        raise ValueError("ERROR: No snapshot csv files for provided pattern:%s" % src_path_pattern)
    mega_df = None
    df_list = list()
    column_names = load_default_column_name_dict(
        output_data_type=output_data_type)
    for src_path in src_path_list:
        df = load_df_from_training_results_folder(
            src_path=src_path, 
            task_ids=task_ids,
            when_task_path_does_not_exist=when_task_path_does_not_exist,
            when_split_csv_does_not_exist=when_split_csv_does_not_exist,
            column_names=column_names,
            engine=engine,
            csv_pattern=csv_pattern,
            y_ind=y_ind)
        df_list.append(df)
    mega_df = pd.concat(df_list)
    return mega_df 
Example 27
Project: face-attendance-machine   Author: matiji66   File: facerec_from_webcam_mult_thread.py    Apache License 2.0
def vote_class(face_encoding, tolerance=0.3, topN=5):
    """
    When several candidates fall below the tolerance, take the topN matches and
    vote to decide the final class; distances are not weighted here.
    :param face_encoding: face encoding
    :param tolerance: distance threshold; smaller means more similar
    :param topN: maximum number of matches taking part in the vote
    :return: detected name
    """
    myprint('vote start ', time.time())
    # compute the distances to all known faces
    distance_ = face_recognition.face_distance(known_face_encodings, face_encoding)
    df = pd.DataFrame(distance_, columns=["dis"])  # convert to a DataFrame
    topDF = df[df['dis'] <= tolerance].nsmallest(topN, columns=['dis'])  # filter the result set
    namedf = NAME_DF.loc[topDF.index]  # look up the names matching the face distances
    con = pd.concat([topDF, namedf], axis=1)  # concat name and distance
    # print('con', con)
    group = con.groupby(["name"])['dis'].sum()
    gp = group.reset_index()
    print('vote -- ', gp)
    if len(gp) == 0:
        print("------unknown -----")
        return "Unknown", 10
    import numpy as np  # TODO  optimize
    arr = np.array(gp)
    name1 = arr[0, 0]
    dis1 = arr[0, 1]
    print("get top one:", name1, dis1)
    myprint('vote end', time.time())
    return name1, dis1 
Example 28
Project: face-attendance-machine   Author: matiji66   File: facerec_from_webcam_faster.py    Apache License 2.0
def vote_class(face_encoding, tolerance=0.3, topN=5):
    """
    When several candidates fall below the tolerance, take the topN matches and
    vote to decide the final class; distances are not weighted here.
    :param face_encoding: face encoding
    :param tolerance: distance threshold; smaller means more similar
    :param topN: maximum number of matches taking part in the vote
    :return: detected name
    """
    myprint('vote start ', time.time())
    # compute the distances to all known faces
    distance_ = face_recognition.face_distance(known_face_encodings, face_encoding)
    df = pd.DataFrame(distance_, columns=["dis"])  # convert to a DataFrame
    topDF = df[df['dis'] <= tolerance].nsmallest(topN, columns=['dis'])  # filter the result set
    namedf = NAME_DF.loc[topDF.index]  # look up the names matching the face distances
    con = pd.concat([topDF, namedf], axis=1)  # concat name and distance
    # print('con', con)
    group = con.groupby(["name"])['dis'].sum()
    gp = group.reset_index()
    print('vote -- ', gp)
    if len(gp) == 0:
        print("------unknown -----")
        return "Unknown", 10
    import numpy as np  # TODO  optimize
    arr = np.array(gp)
    name1 = arr[0, 0]
    dis1 = arr[0, 1]
    print("get top one:", name1, dis1)
    myprint('vote end', time.time())
    return name1, dis1 
Example 29
Project: DataComp   Author: Cojabi   File: stats.py    Apache License 2.0
def p_correction(p_values):
    """
    Corrects p_values for multiple testing.

    :param p_values: Dictionary storing p_values with corresponding feature names as keys.
    :return: DataFrame which shows the results of the analysis; p-value, corrected p-value and boolean indicating \
    significance.
    """

    p_trans = _transform_p_dict(p_values)

    # get and drop features which are NaN to skip them in multitest correction
    nan_features = p_trans[pd.isnull(p_trans[0])]
    p_trans = p_trans.dropna(axis=0, subset=[0])

    # extract p_value column to pass into multiple testing correction
    p_val_col = p_trans[0].sort_values()

    # add NaN features back to p_trans to include them into result table later on
    p_trans = pd.concat([p_trans, nan_features])

    # raise an error if no p_values were calculated that can be passed into multiple test correction
    if p_val_col.values.size == 0:
        # unpack the p_values which are stored in 2 layer nested dicts.
        nested_values = []
        for value in p_values.values():
            nested_values.append(*value.values())

        # if all p_values are nan, return an all nan result table
        if pd.isnull(nested_values).all():
            result_table = _create_result_table(None, p_val_col, p_trans, conf_invs, counts)
            return result_table.sort_index()

        raise ValueError("No p_values have been submitted into multiple test correction.")

    # correct p-values
    result = multipletests(p_val_col.values)

    return result, p_val_col, p_trans 
Example 30
Project: Kaggle-Statoil-Challenge   Author: adodd202   File: utils.py    MIT License
def MinMaxBestBaseStacking(input_folder, best_base, output_path):
    sub_base = pd.read_csv(best_base)
    all_files = os.listdir(input_folder)

    # Read and concatenate submissions
    outs = [pd.read_csv(os.path.join(input_folder, f), index_col=0) for f in all_files]
    concat_sub = pd.concat(outs, axis=1)
    cols = list(map(lambda x: "is_iceberg_" + str(x), range(len(concat_sub.columns))))
    concat_sub.columns = cols
    concat_sub.reset_index(inplace=True)

    # get the data fields ready for stacking
    concat_sub['is_iceberg_max'] = concat_sub.iloc[:, 1:6].max(axis=1)
    concat_sub['is_iceberg_min'] = concat_sub.iloc[:, 1:6].min(axis=1)
    concat_sub['is_iceberg_mean'] = concat_sub.iloc[:, 1:6].mean(axis=1)
    concat_sub['is_iceberg_median'] = concat_sub.iloc[:, 1:6].median(axis=1)

    # set up cutoff thresholds for lower and upper bounds, easy to tweak
    cutoff_lo = 0.67
    cutoff_hi = 0.33

    concat_sub['is_iceberg_base'] = sub_base['is_iceberg']
    concat_sub['is_iceberg'] = np.where(np.all(concat_sub.iloc[:, 1:6] > cutoff_lo, axis=1),
                                        concat_sub['is_iceberg_max'],
                                        np.where(np.all(concat_sub.iloc[:, 1:6] < cutoff_hi, axis=1),
                                                 concat_sub['is_iceberg_min'],
                                                 concat_sub['is_iceberg_base']))
    concat_sub[['id', 'is_iceberg']].to_csv(output_path,
                                            index=False, float_format='%.12f') 
Example 31
Project: Kaggle-Statoil-Challenge   Author: adodd202   File: utils.py    MIT License
def ensembleVer2(input_folder, output_path):
    print('Out:' + output_path)
    csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]
    model_scores = []
    for i, csv in enumerate(csv_files):
        df = pd.read_csv(os.path.join(input_folder, csv), index_col=0)
        if i == 0:
            index = df.index
        else:
            assert index.equals(df.index), "Indices of one or more files do not match!"
        model_scores.append(df)
    print("Read %d files. Averaging..." % len(model_scores))

    # print(model_scores)
    concat_scores = pd.concat(model_scores)
    print(concat_scores.head())
    concat_scores['is_iceberg'] = concat_scores['is_iceberg'].astype(np.float32)

    averaged_scores = concat_scores.groupby(level=0).mean()
    assert averaged_scores.shape[0] == len(list(index)), "Something went wrong when concatenating/averaging!"
    averaged_scores = averaged_scores.reindex(index)

    stacked_1 = pd.read_csv('statoil-submission-template.csv')  # for the header
    print(stacked_1.shape)
    sub = pd.DataFrame()
    sub['id'] = stacked_1['id']

    sub['is_iceberg'] = np.exp(np.mean(
        [
            averaged_scores['is_iceberg'].apply(lambda x: np.log(x))
        ], axis=0))

    print(sub.shape)
    sub.to_csv(output_path, index=False, float_format='%.9f')
    print("Averaged scores saved to %s" % output_path)


# Convert the np arrays into the correct dimention and type
# Note that BCEloss requires Float in X as well as in y 
Example 32
Project: GreenGuard   Author: D3-AI   File: data.py    MIT License
def _load_readings(readings_path, target_times, signals, window_size):
    turbine_ids = target_times.turbine_id.unique()

    target_times = _get_times(target_times, window_size)

    readings = list()
    for turbine_id in sorted(turbine_ids):
        turbine_target_times = target_times[target_times['turbine_id'] == turbine_id]
        LOGGER.info('Loading turbine %s readings', turbine_id)
        turbine_readings = _load_turbine_readings(readings_path, turbine_target_times, signals)
        turbine_readings['turbine_id'] = turbine_id
        readings.append(turbine_readings)

    return pd.concat(readings) 
Example 33
Project: GreenGuard   Author: D3-AI   File: csv.py    MIT License
def __consolidate(self, readings, turbine_id):
        readings = pd.concat(readings, ignore_index=True)
        try:
            readings['value'] = readings['value'].astype(float)
        except ValueError:
            signals = readings[~readings['value'].str.isnumeric()].signal_id.unique()
            raise ValueError('Signals contain non-numerical values: {}'.format(signals))

        readings['turbine_id'] = turbine_id

        LOGGER.info('Loaded %s readings from turbine %s', len(readings), turbine_id)

        return readings 
Example 34
Project: pymapd-examples   Author: omnisci   File: OKR_oss_git.py    Apache License 2.0
def get_views(r):
    # retrieve views information
    r_views = r.get_views_traffic()
    df = pd.DataFrame.from_dict(r_views)
    # iterate through individual view objects nested in the contents
    i = 0
    ts = pd.Series('ts', index=[i])
    cnt = pd.Series('cnt', index=[i])
    uni = pd.Series('uni', index=[i])
    repo = pd.Series('repo', index=[i])
    for view in df['views']: # this column contains a list of view objects
        i += 1
        repo[i] = r.name
        ts[i] = getattr(view, 'timestamp')
        ts[i] = ts[i]/1000000000
        ts[i] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts[i]))
        cnt[i] = getattr(view, 'count')
        uni[i] = getattr(view, 'uniques')

    # setup dataframe by concatenating the series together as columns
    list_of_series = [repo, ts, cnt, uni]
    # drop the placeholder entries at index 0 before concatenating
    repo.drop([0], inplace = True)
    ts.drop([0], inplace = True)
    cnt.drop([0], inplace = True)
    uni.drop([0], inplace = True)
    df_views = pd.concat(list_of_series, axis=1, ignore_index=True)
    # rename the columns to useful labels
    columns = ['repo', 'view_timestamp', 'view_count', 'view_unique']
    print (df_views)
    df_views.columns = columns

    if df_views.empty:
        print ("no views")
    else:
        print (str(df_views['view_count'].sum()) + ' views for ' + r.name)
        return df_views 
Example 35
Project: deep-learning-note   Author: wdxtub   File: 13_house_price.py    MIT License
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    net = get_net(train_features.shape[1])
    train_ls, _ = train(net, train_features, train_labels, None, None, num_epochs, lr, weight_decay, batch_size)
    utils.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse')
    print('train rmse %f' % train_ls[-1])
    preds = net(test_features).detach().numpy()
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('./data/HousePrice/submission.csv', index=False) 
Example 36
Project: nba_scraper   Author: mcbarlowe   File: nba_scraper.py    GNU General Public License v3.0
def scrape_date_range(
    date_from, date_to, data_format="pandas", data_dir=f"{Path.home()}/nbadata.csv"
):
    """
    Function scrapes all `regular-season` nba games between two dates

    Inputs:
    date_from   - Date to scrape from
    date_to     - Date to scrape to
    data_format - the format of the data the user wants returned. This is either
                  a pandas dataframe or a csv file
    data_dir    - a filepath which to write the csv file if that option is chosen.
                  If no filepath is passed then it will attempt to write to the
                  user's home directory

    Outputs:
    nba_df     - If pandas is chosen then this function will
                 return this pandas dataframe object. If csv then
                 a csv file will be written but None will be returned
    """
    check_format(data_format)
    check_valid_dates(date_from, date_to)

    game_ids = sf.get_date_games(date_from, date_to)
    scraped_games = []

    for game in game_ids:
        print(f"Scraping game id: {game}")
        scraped_games.append(sf.main_scrape(game))

    if data_format == "pandas":
        return pd.concat(scraped_games)
    else:
        pd.concat(scraped_games).to_csv(data_dir, index=False)
        return None 
Example 37
Project: nba_scraper   Author: mcbarlowe   File: nba_scraper.py    GNU General Public License v3.0
def scrape_game(game_ids, data_format="pandas", data_dir=f"{Path.home()}/"):
    """
    function scrapes nba games and returns them in the data format requested
    by the user.

    Inputs:
    game_ids    - list of nba game ids to scrape
    data_format - the format of the data the user wants returned. This is either
                  a pandas dataframe or a csv file
    data_dir    - a filepath which to write the csv file if that option is chosen.
                  If no filepath is passed then it will attempt to write to the
                  user's home directory

    Outputs:
    nba_df     - If pandas is chosen then this function will
                 return this pandas dataframe object. If csv then
                 a csv file will be written but None will be returned
    """
    check_format(data_format)

    scraped_games = []
    for game in game_ids:
        print(f"Scraping game id: 00{game}")
        scraped_games.append(sf.main_scrape(f"00{game}"))

    nba_df = pd.concat(scraped_games)

    if data_format == "pandas":
        return nba_df
    else:
        nba_df.to_csv(f"{data_dir}/{game_ids[0]}.csv", index=False)
        return None 
Example 38
Project: nba_scraper   Author: mcbarlowe   File: nba_scraper.py    GNU General Public License v3.0
def scrape_season(season, data_format="pandas", data_dir=f"{Path.home()}/nbadata.csv"):
    """
    This function scrapes an entire season and either returns it as a pandas
    dataframe or writes it to file as a csv file

    Inputs:
    season      - season to be scraped must be an integer
    data_format - the format of the data the user wants returned. This is either
                  a pandas dataframe or a csv file
    data_dir    - a filepath which to write the csv file if that option is chosen.
                  If no filepath is passed then it will attempt to write to the
                  user's home directory

    Outputs:
    nba_df     - If pandas is chosen then this function will
                 return this pandas dataframe object. If csv then
                 a csv file will be written but None will be returned
    """
    check_format(data_format)

    scraped_games = []
    game_ids = list(range(int(f"2{season-2001}00001"), int(f"2{season-2001}01231")))

    for game in game_ids:
        print(f"Scraping game id: 00{game}")
        scraped_games.append(sf.main_scrape(f"00{game}"))

    nba_df = pd.concat(scraped_games)

    if data_format == "pandas":
        return nba_df
    else:
        nba_df.to_csv(f"{data_dir}/nba{season}.csv", index=False)
        return None 
Example 39
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: kaggle_k_fold_cross_validation.py    Apache License 2.0
def k_fold_cross_valid(k, epochs, verbose_epoch, X_train, y_train,
                       learning_rate, weight_decay, batch_size):
    """Conducts k-fold cross validation for the model."""
    assert k > 1
    fold_size = X_train.shape[0] // k

    train_loss_sum = 0.0
    test_loss_sum = 0.0
    for test_idx in range(k):
        X_val_test = X_train[test_idx * fold_size: (test_idx + 1) *
                                                   fold_size, :]
        y_val_test = y_train[test_idx * fold_size: (test_idx + 1) * fold_size]
        val_train_defined = False
        for i in range(k):
            if i != test_idx:
                X_cur_fold = X_train[i * fold_size: (i + 1) * fold_size, :]
                y_cur_fold = y_train[i * fold_size: (i + 1) * fold_size]
                if not val_train_defined:
                    X_val_train = X_cur_fold
                    y_val_train = y_cur_fold
                    val_train_defined = True
                else:
                    X_val_train = nd.concat(X_val_train, X_cur_fold, dim=0)
                    y_val_train = nd.concat(y_val_train, y_cur_fold, dim=0)
        net = get_net()
        train_loss = train(net, X_val_train, y_val_train, epochs, verbose_epoch,
                           learning_rate, weight_decay, batch_size)
        train_loss_sum += train_loss
        test_loss = get_rmse_log(net, X_val_test, y_val_test)
        print("Test loss: %f" % test_loss)
        test_loss_sum += test_loss
    return train_loss_sum / k, test_loss_sum / k

# The sets of parameters. Better results are obtained with modifications.
# These parameters can be fine-tuned with k-fold cross-validation. 
Example 40
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: kaggle_k_fold_cross_validation.py    Apache License 2.0
def learn(epochs, verbose_epoch, X_train, y_train, test, learning_rate,
          weight_decay, batch_size):
    """Trains the model and predicts on the test data set."""
    net = get_net()
    _ = train(net, X_train, y_train, epochs, verbose_epoch, learning_rate,
                 weight_decay, batch_size)
    preds = net(X_test).asnumpy()
    test['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test['Id'], test['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False) 
Example 41
Project: LHMP   Author: hydrogo   File: hbv.py    GNU General Public License v3.0
def interaction(river_name, path_to_scheme, path_to_observations,\
    parBETA, parCET, parFC, parK0, parK1, parK2, parLP, parMAXBAS,\
    parPERC, parUZL, parPCORR, parTT, parCFMAX, parSFCF, parCFR, parCWH):

    # simulate our modeled hydrograph
    data = dataframe_construction(path_to_scheme)
    data['Qsim'] = simulation(data, [parBETA, parCET, parFC, parK0, parK1,\
    parK2, parLP, parMAXBAS, parPERC, parUZL, parPCORR, parTT, parCFMAX,\
    parSFCF, parCFR, parCWH])

    # read observations
    obs = pd.read_csv(path_to_observations, index_col=0, parse_dates=True,
                      squeeze=True, header=None, names=['Date', 'Qobs'])

    # concatenate data
    data = pd.concat([data, obs], axis=1)

    # calculate efficiency criterion
    # slice data only for observational period and drop NA values
    data_for_obs = data.loc[obs.index, ['Qsim', 'Qobs']].dropna()
    eff = NS(data_for_obs['Qobs'], data_for_obs['Qsim'])

    # plot
    ax = data.loc[obs.index, ['Qsim', 'Qobs']].plot(figsize=(10, 7), style=['b-', 'k.'])
    ax.set_title(river_name + ' daily runoff modelling, ' + 'Nash-Sutcliffe efficiency: {}'.format(np.round(eff, 2)))
    #text_pos = np.max(data['Qobs'])
    #ax.text(obs.index[100], text_pos, 'NS: {}'.format(np.round(eff, 2)), size=14) 
Example 42
Project: OpenFermion-Cirq   Author: quantumlib   File: result.py    Apache License 2.0
def extend(self,
               results: Iterable[OptimizationResult]) -> None:
        new_data_frame = pandas.DataFrame(
                {'optimal_value': result.optimal_value,
                 'optimal_parameters': result.optimal_parameters,
                 'num_evaluations': result.num_evaluations,
                 'cost_spent': result.cost_spent,
                 'time': result.time,
                 'seed': result.seed,
                 'status': result.status,
                 'message': result.message}
                for result in results)
        self.data_frame = pandas.concat([self.data_frame, new_data_frame])
        self.results.extend(results) 
Example 43
Project: gullikson-scripts   Author: kgullikson88   File: Sensitivity.py    MIT License
def split_by_component(df):
    df['prim_comp'] = df.Comp.map(lambda s: s[0])
    df['sec_comp'] = df.Comp.map(lambda s: s[-1])
    comps = pd.concat((df[['prim_comp', 'Sp1']], df[['sec_comp', 'Sp2']]))
    prim = comps.loc[comps.prim_comp.notnull()].rename(columns={'Sp1': 'SpT', 'prim_comp': 'comp'})
    sec = comps.loc[comps.sec_comp.notnull()].rename(columns={'Sp2': 'SpT', 'sec_comp': 'comp'})
    return pd.concat((prim, sec))[['comp', 'SpT']].drop_duplicates(subset='comp') 
Example 44
Project: motion-tracking   Author: dansbecker   File: parse_alov_bb.py    MIT License
def calc_frame_pairs(frames_df):
    """Add columns denoting the relevant info for the current and previous frame.

    Each video has loads of frames, and we need to get information for subsequent
    frames in the same row in the DataFrame. This will make it easy for our image
    generator to easily cycle through pairs. We'll accomplish this by taking the
    `frames_df`, lopping off the last row, placing in a filler row, and merging it
    back onto the original `frames_df`. The rest will be cleanup.

    Args:
    ----
        frames_df: pandas DataFrame

    Returns:
    -------
        save_df: pandas DataFrame
    """

    filler_row = pd.DataFrame(np.zeros((1, frames_df.shape[1])),
                              columns=frames_df.columns)
    less_one_df = frames_df[:-1]
    lagged_df = pd.concat([filler_row, less_one_df], axis=0)

    lagged_cols = [col + '_start' for col in frames_df.columns]
    lagged_df.columns = lagged_cols
    lagged_df.reset_index(inplace=True, drop=True)

    end_cols = [col + '_end' for col in frames_df.columns]
    frames_df.columns = end_cols
    merged_df = pd.concat([lagged_df, frames_df], axis=1)

    max_frames_df = merged_df.groupby('filename_start')['frame_start'].max()
    max_frames_df.name = 'max_frame'
    temp_df = merged_df.join(max_frames_df, on='filename_start')
    save_df = temp_df.query('max_frame != frame_start')

    return save_df 
Example 45
Project: purple_air_api   Author: ReagentX   File: sensor.py    GNU General Public License v3.0
def get_historical(self, weeks_to_get: int) -> pd.DataFrame:
        '''Get data from the ThingSpeak API one week at a time up to weeks_to_get weeks in the past'''
        from_week = datetime.now()
        to_week = from_week - timedelta(weeks=1)
        url = f'https://thingspeak.com/channels/{self.tp_a}/feed.csv?api_key={self.tp_a_key}&offset=0&average=&round=2&start={to_week.strftime("%Y-%m-%d")}%2000:00:00&end={from_week.strftime("%Y-%m-%d")}%2000:00:00'
        df = pd.read_csv(url)
        if weeks_to_get > 1:
            for i in range(weeks_to_get):
                from_week = to_week  # DateTimes are immutable so this reference is not a problem
                to_week = to_week - timedelta(weeks=1)
                url = f'https://thingspeak.com/channels/{self.tp_a}/feed.csv?api_key={self.tp_a_key}&offset=0&average=&round=2&start={to_week.strftime("%Y-%m-%d")}%2000:00:00&end={from_week.strftime("%Y-%m-%d")}%2000:00:00'
                df = pd.concat([df, pd.read_csv(url)])

        # Handle formatting the DataFrame
        df.rename(columns={'field1': 'PM1 CF=ATM ug/m3',
                           'field2': 'PM25 CF=ATM ug/m3',
                           'field3': 'PM10 CF=ATM ug/m3',
                           'field4': 'Free HEAP memory',
                           'field5': 'ADC0 Voltage',
                           'field6': 'Sensor Firmware',
                           'field7': 'Unused',
                           'field8': 'PM25 CF=1 ug/m3'
                           }, inplace=True)
        df['created_at'] = pd.to_datetime(df['created_at'], format='%Y-%m-%d %H:%M:%S %Z')
        df.index = df.pop('entry_id')
        return df 
Example 46
Project: cs294-112_hws   Author: xuwd11   File: plot.py    MIT License
def plot_data(data, value="AverageReturn"):
    if isinstance(data, list):
        data = pd.concat(data, ignore_index=True)

    sns.set(style="darkgrid", font_scale=1.5)
    sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
    plt.legend(loc='best').draggable()
    plt.show() 
Example 47
Project: cs294-112_hws   Author: xuwd11   File: plot.py    MIT License
def plot_data(data, value="AverageReturn"):
    if isinstance(data, list):
        data = pd.concat(data, ignore_index=True)

    sns.set(style="darkgrid", font_scale=1.5)
    sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
    plt.legend(loc='best').draggable()
    plt.show() 
Example 48
Project: cs294-112_hws   Author: xuwd11   File: plot_3.py    MIT License
def get_datasets(fpath, condition=None):
    unit = 0
    datasets = []
    for root, dir, files in os.walk(fpath):
        if 'log.txt' in files:
            param_path = open(os.path.join(root,'params.json'))
            params = json.load(param_path)
            exp_name = params['exp_name']
            
            log_path = os.path.join(root,'log.txt')
            experiment_data = pd.read_table(log_path)

            experiment_data.insert(
                len(experiment_data.columns),
                'Unit',
                unit
                )        
            experiment_data.insert(
                len(experiment_data.columns),
                'Condition',
                condition or exp_name
                )

            datasets.append(experiment_data)
            unit += 1
    datasets = pd.concat(datasets, ignore_index=True)
    return datasets 
Example 49
Project: cs294-112_hws   Author: xuwd11   File: plot.py    MIT License
def plot_data(data, value="AverageReturn"):
    if isinstance(data, list):
        data = pd.concat(data, ignore_index=True)

    sns.set(style="darkgrid", font_scale=1.5)
    sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
    plt.legend(loc='best').draggable()
    plt.show() 
Example 50
Project: cs294-112_hws   Author: xuwd11   File: plot.py    MIT License
def plot_data(data, value="AverageReturn"):
    if isinstance(data, list):
        data = pd.concat(data, ignore_index=True)

    sns.set(style="darkgrid", font_scale=1.5)
    sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
    plt.legend(loc='best').draggable()
    plt.show()