Python pandas.concat() Examples

The following are 30 code examples of pandas.concat(), collected from open-source projects. The source file and project are listed above each example so you can trace the code back to its original context.
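Before the project examples, here is a minimal standalone sketch (not taken from any project below) of the two call patterns that recur throughout: stacking frames row-wise and aligning them column-wise.

import pandas as pd

a = pd.DataFrame({"x": [1, 2], "y": [3, 4]})
b = pd.DataFrame({"x": [5], "z": [6]})

# Row-wise concatenation (the default, axis=0): columns are unioned and
# missing values become NaN; ignore_index renumbers the rows.
rows = pd.concat([a, b], ignore_index=True, sort=False)

# Column-wise concatenation (axis=1): frames are aligned on their index.
cols = pd.concat([a, b], axis=1)

print(rows)
print(cols)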
Example #1
Source File: pre_submission.py    From MPContribs with MIT License
def get_table(results, letter):
    y = "Δ{}".format(letter)
    df = Table(
        RecursiveDict([("δ", results[0]), (y, results[1]), (y + "ₑᵣᵣ", results[2])])
    )
    x0, x1 = map(float, df["δ"].iloc[[0, -1]])
    pad = 0.15 * (x1 - x0)
    mask = (results[3] > x0 - pad) & (results[3] < x1 + pad)
    x, fit = results[3][mask], results[4][mask]
    df.set_index("δ", inplace=True)
    df2 = pd.DataFrame(RecursiveDict([("δ", x), (y + " Fit", fit)]))
    df2.set_index("δ", inplace=True)
    cols = ["δ", y, y + "ₑᵣᵣ", y + " Fit"]
    return (
        pd.concat([df, df2], sort=True)
        .sort_index()
        .reset_index()
        .rename(columns={"index": "δ"})
        .fillna("")[cols]
    ) 
Example #2
Source File: dataloader_m.py    From models with MIT License
def prepro_pos_table(pos_tables):
    """Extracts unique positions and sorts them."""
    if not isinstance(pos_tables, list):
        pos_tables = [pos_tables]

    pos_table = None
    for next_pos_table in pos_tables:
        if pos_table is None:
            pos_table = next_pos_table
        else:
            pos_table = pd.concat([pos_table, next_pos_table])
        pos_table = pos_table.groupby('chromo').apply(
            lambda df: pd.DataFrame({'pos': np.unique(df['pos'])}))
        pos_table.reset_index(inplace=True)
        pos_table = pos_table[['chromo', 'pos']]
        pos_table.sort_values(['chromo', 'pos'], inplace=True)
    return pos_table 
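As a hypothetical illustration (not part of the original project), calling the function above on two small position tables merges them into one sorted table with a single row per unique (chromo, pos) pair:

import pandas as pd

t1 = pd.DataFrame({"chromo": ["1", "1"], "pos": [30, 10]})
t2 = pd.DataFrame({"chromo": ["1", "2"], "pos": [10, 5]})
print(prepro_pos_table([t1, t2]))
#   chromo  pos
# 0      1   10
# 1      1   30
# 2      2    5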
Example #3
Source File: cbc_hb.py    From lifestyles with MIT License
def model(profiles, comparisons, selections, sample=2500, alpha_prior_std=10):
    all_attributes = pd.get_dummies(profiles).columns
    profiles_dummies = pd.get_dummies(profiles, drop_first=True)
    choices = pd.concat({profile: profiles_dummies.loc[comparisons[profile]].reset_index(drop=True) for profile in comparisons.columns}, axis=1)

    respondants = selections.columns
    n_attributes_in_model = profiles_dummies.shape[1]
    n_participants = selections.shape[1]

    with pm.Model():

        # https://www.sawtoothsoftware.com/download/ssiweb/CBCHB_Manual.pdf
        # need to include the covariance matrix as a parent of `partsworth`
        alpha = pm.Normal('alpha', 0, sd=alpha_prior_std, shape=n_attributes_in_model, testval=np.random.randn(n_attributes_in_model))
        partsworth = pm.MvNormal("partsworth", alpha, tau=np.eye(n_attributes_in_model), shape=(n_participants, n_attributes_in_model))

        cs = [_create_observation_variable(selection, choices, partsworth[i, :]) for i, (_, selection) in enumerate(selections.iteritems())]

        trace = pm.sample(sample)
    return transform_trace_to_individual_summary_statistics(trace, respondants, profiles_dummies.columns, all_attributes) 
Example #4
Source File: DataReader.py    From tensorflow-DeepFM with MIT License
def gen_feat_dict(self):
        if self.dfTrain is None:
            dfTrain = pd.read_csv(self.trainfile)
        else:
            dfTrain = self.dfTrain
        if self.dfTest is None:
            dfTest = pd.read_csv(self.testfile)
        else:
            dfTest = self.dfTest
        df = pd.concat([dfTrain, dfTest])
        self.feat_dict = {}
        tc = 0
        for col in df.columns:
            if col in self.ignore_cols:
                continue
            if col in self.numeric_cols:
                # map to a single index
                self.feat_dict[col] = tc
                tc += 1
            else:
                us = df[col].unique()
                self.feat_dict[col] = dict(zip(us, range(tc, len(us)+tc)))
                tc += len(us)
        self.feat_dim = tc 
Example #5
Source File: runTests.py    From svviz with MIT License
def saveTimingInfo(summary):
    timingsPath = "test_timings.csv"
    git_version = subprocess.check_output(["git", "describe"]).strip()
    
    new_row = summary[["timing"]].T
    new_row["date"] = [datetime.datetime.now()]
    new_row["version"] = git_version


    if os.path.exists(timingsPath):
        timings = pandas.read_csv(timingsPath, index_col=0)
        timings = pandas.concat([timings, new_row])
    else:
        timings = new_row

    timings.to_csv(timingsPath)

    print(timings) 
Example #6
Source File: chart.py    From ipygee with MIT License
def concat(*plots):
    """ Concatenate plots. The type of the resulting plot will be the type
        of the first parsed plot
    """
    first = plots[0]
    if isinstance(first, DateTimeLine):
        chart = DateTimeLine()
    else:
        chart = Line()

    y_data = {}
    for plot in plots:
        p_data = plot.y_data
        for serie, data in p_data.items():
            y_data[serie] = data
            chart.add(serie, data)

    chart.y_data = y_data
    return chart 
Example #7
Source File: test_statistics.py    From esmlab with Apache License 2.0
def test_weighted_mean(dim, level, wgts_name):
    res = esmlab.weighted_mean(dset, dim=dim, weights=wgts[wgts_name])
    df = dset.to_dataframe()
    df_w = wgts.to_dataframe()[wgts_name]
    if not dim:
        res = res.to_array().data
        d = pd.concat([df, df_w], axis=1)
        expected = d.apply(
            lambda x: np.ma.average(np.ma.MaskedArray(x, mask=np.isnan(x)), weights=d.t_s_wgts)
        )[['da1', 'da2']]
        expected = expected.to_xarray().data
        np.testing.assert_allclose(res, expected)
    else:

        expected = df.groupby(level=level).apply(
            wavg, weights=wgts[wgts_name].data, col_names=['da1', 'da2']
        )

        res = res.to_dataframe()
        assert_frame_equal(res.sort_index(), expected.sort_index()) 
Example #8
Source File: observation_history.py    From tensortrade with Apache License 2.0
def observe(self) -> np.array:
        """Returns the rows to be observed by the agent."""
        rows = self.rows.copy()

        if len(rows) < self.window_size:
            size = self.window_size - len(rows)
            padding = np.zeros((size, rows.shape[1]))
            padding = pd.DataFrame(padding, columns=self.rows.columns)
            rows = pd.concat([padding, rows], ignore_index=True, sort=False)

        if isinstance(rows, pd.DataFrame):
            rows = rows.fillna(0, axis=1)
            rows = rows.values

        rows = np.nan_to_num(rows)

        return rows 
Example #9
Source File: test_introspective_rationale_explainer.py    From interpret-text with MIT License
def test_bert_explain_local(self):
        train_data = get_ssts_dataset('train')
        test_data = get_ssts_dataset('test')
        X_train = train_data[TEXT_COL]
        X_test = test_data[TEXT_COL]
        preprocessor = BertPreprocessor()

        df_train = pd.concat([train_data[LABEL_COL], preprocessor.preprocess(X_train)], axis=1)
        df_test = pd.concat([test_data[LABEL_COL], preprocessor.preprocess(X_test)], axis=1)
        model_config = BERT_MODEL_CONFIG
        explainer = IntrospectiveRationaleExplainer(classifier_type=CLASSIFIER_TYPE_BERT, cuda=CUDA)
        explainer.build_model_config(model_config)
        explainer.set_preprocessor(preprocessor)
        explainer.load()
        explainer.fit(df_train, df_test)

        local_explanation = explainer.explain_local(SENTENCE)
        # BERT adds [CLS] at the beginning of a sentence and [SEP] at the end of each sentence, but we remove them here.
        assert len(local_explanation.local_importance_values) == len(SENTENCE.split()) 
Example #10
Source File: ciftify_postPINT2_sub2sub.py    From ciftify with MIT License
def calc_allroiidx_distances(vertices_df, roi, surfL, surfR, pvertex_colname):
    '''
    loop over all subjects calculating distances for one roi
    '''
    ## determine the surface for measurement
    hemi = vertices_df.loc[vertices_df.roiidx==roi,'hemi'].values[0]
    if hemi == "L": surf = surfL
    if hemi == "R": surf = surfR

    ## subset the dataframe
    roidf = vertices_df.loc[vertices_df.roiidx==roi,:]

    ## run all the subjects and collect the per-subject results in a generator
    all_dfs = (calc_subdistances_distances(roidf, surf, subid, pvertex_colname) for subid in vertices_df.subid.unique())
    ## concatenate all the results
    roi_sub2sub = pd.concat(all_dfs, ignore_index=True)
    return(roi_sub2sub) 
Example #11
Source File: analyze_estimates.py    From performance_tracker with GNU General Public License v3.0
def match_arrivals_with_schedule(estimated_trips, schedule_direction):
    schedule_direction.loc[:,"datetime_utc"] = pd.to_datetime(schedule_direction["datetime"], utc=True)
    estimated_trips.loc[:,"datetime_utc"] = pd.to_datetime(estimated_trips["datetime"], utc=True)
    schedule_direction = schedule_direction.set_index(pd.DatetimeIndex(schedule_direction["datetime_utc"])).sort_index()
    matched_estimates = [
        match_times(
            stop_id,
            stop_estimates,
            schedule_direction[schedule_direction["stop_id"] == stop_id],
        )
        for stop_id, stop_estimates in estimated_trips.groupby(["stop_id"])
    ]
    matched_estimates = [x for x in matched_estimates if x is not None]
    matched_estimates = pd.concat(matched_estimates)
    matched_estimates["since_scheduled"] = (
        matched_estimates["datetime_utc"] - matched_estimates["closest_scheduled"]
    )
    return matched_estimates 
Example #12
Source File: ecg_delineate.py    From NeuroKit with MIT License
def _ecg_delineate_check(waves, rpeaks):
    """This function replaces the delineated features with np.nan if its standardized distance from R-peaks is more than
    3."""
    df = pd.DataFrame.from_dict(waves)
    features_columns = df.columns

    df = pd.concat([df, pd.DataFrame({"ECG_R_Peaks": rpeaks})], axis=1)

    # loop through all columns to calculate the z distance
    for column in features_columns:  # pylint: disable=W0612
        df = _calculate_abs_z(df, features_columns)

    # Replace with nan if distance > 3
    for col in features_columns:
        for i in range(len(df)):
            if df["Dist_R_" + col][i] > 3:
                df[col][i] = np.nan

    # Return df without distance columns
    df = df[features_columns]
    waves = df.to_dict("list")
    return waves 
Example #13
Source File: movielens_preprocess.py    From striatum with BSD 2-Clause "Simplified" License
def movie_preprocessing(movie):
    movie_col = list(movie.columns)
    movie_tag = [doc.split('|') for doc in movie['tag']]
    tag_table = {token: idx for idx, token in enumerate(set(itertools.chain.from_iterable(movie_tag)))}
    movie_tag = pd.DataFrame(movie_tag)
    tag_table = pd.DataFrame(tag_table.items())
    tag_table.columns = ['Tag', 'Index']

    # use one-hot encoding for movie genres (here called tag)
    tag_dummy = np.zeros([len(movie), len(tag_table)])

    for i in range(len(movie)):
        for j in range(len(tag_table)):
            if tag_table['Tag'][j] in list(movie_tag.iloc[i, :]):
                tag_dummy[i, j] = 1

    # combine the tag_dummy one-hot encoding table with the original movie data
    movie = pd.concat([movie, pd.DataFrame(tag_dummy)], 1)
    movie_col.extend(['tag' + str(i) for i in range(len(tag_table))])
    movie.columns = movie_col
    movie = movie.drop('tag', 1)
    return movie 
Example #14
Source File: test_introspective_rationale_explainer.py    From interpret-text with MIT License
def test_rnn_explain_local(self):
        train_data = get_ssts_dataset('train')
        test_data = get_ssts_dataset('test')
        all_data = pd.concat([train_data, test_data])
        X_train = train_data[TEXT_COL]
        X_test = test_data[TEXT_COL]
        preprocessor = GlovePreprocessor(count_threshold=TOKEN_COUNT_THRESHOLD, token_cutoff=MAX_SENT_COUNT)
        preprocessor.build_vocab(all_data[TEXT_COL])

        df_train = pd.concat([train_data[LABEL_COL], preprocessor.preprocess(X_train)], axis=1)
        df_test = pd.concat([test_data[LABEL_COL], preprocessor.preprocess(X_test)], axis=1)
        model_config = RNN_MODEL_CONFIG
        explainer = IntrospectiveRationaleExplainer(classifier_type=CLASSIFIER_TYPE_RNN, cuda=CUDA)
        explainer.build_model_config(model_config)
        explainer.set_preprocessor(preprocessor)
        explainer.load()
        explainer.fit(df_train, df_test)

        local_explanation = explainer.explain_local(SENTENCE)
        assert len(local_explanation.local_importance_values) == len(SENTENCE.split()) 
Example #15
Source File: dataproc.py    From pydiogment with BSD 3-Clause "New" or "Revised" License
def balance_dataset(data):
    # define column names
    column_names = list(data.columns)

    # ensure an equal number of samples per class by downsampling to the smallest class
    samples_pro_emotion = {e: len(data[data.emotion == e]) for e in data.emotion.unique()}
    balanced_data = pd.concat([data[data.emotion == e].sample(min(samples_pro_emotion.values()))
                               for e in data.emotion.unique()],
                               axis=0,
                               keys=list(data.columns))

    # split data
    X = balanced_data.iloc[:, :-1]
    y = balanced_data.iloc[:, -1:].astype('category')
    # print("%25s : %s" % ("Data with balanced sets", str(balanced_data.shape)))
    return balanced_data, X, y, column_names 
Example #16
Source File: misc.py    From steppy-toolkit with MIT License
def transform(self, numerical_feature_list, categorical_feature_list):
        """
        Args:
            numerical_feature_list: list of numerical features
            categorical_feature_list: list of categorical features

        Returns:
            Dictionary with the following keys:
                features: DataFrame with concatenated features
                feature_names: list of features names
                categorical_features: list of categorical feature names
        """
        features = numerical_feature_list + categorical_feature_list
        for feature in features:
            feature = self._format_target(feature)
            feature.set_index(self.id_column, drop=True, inplace=True)
        features = pd.concat(features, axis=1).astype(np.float32).reset_index()

        outputs = dict()
        outputs['features'] = features
        outputs['feature_names'] = list(features.columns)
        outputs['categorical_features'] = self._get_feature_names(categorical_feature_list)
        return outputs 
Example #17
Source File: action_detector_diagnosis.py    From DETAD with MIT License
def _limit_prediction(self):
        """
            For each class J, limit the predictions to the top-scoring (N_j * self.limit_factor)
            predictions, where N_j is the number of ground truth instances for class J.
        """
        ground_truth_gbvn = self.ground_truth.groupby('label')
        prediction_gbvn = self.prediction.groupby('label')
        
        filtered_prediction_df_list = []
        for label, this_ground_truth in ground_truth_gbvn:
            try:
                # Check if there is at least one prediction for this class.
                this_prediction = prediction_gbvn.get_group(label)
            except Exception as e:
                continue
            
            # pick the top (len(this_ground_truth)*self.limit_factor) predictions
            filtered_prediction_df_list += [this_prediction.nlargest(n=int(len(this_ground_truth)*self.limit_factor),
                                                                     columns='score')]

        filtered_prediction = pd.concat(filtered_prediction_df_list, ignore_index=True)

        # reset prediction ids
        filtered_prediction['prediction-id'] = range(len(filtered_prediction))

        return filtered_prediction 
Example #18
Source File: utils.py    From thewarden with MIT License
def df_unpack(df, column, fillna=None):
    ret = None
    if fillna is None:
        ret = pd.concat([df, pd.DataFrame(
            (d for idx, d in df[column].iteritems()))], axis=1)
        del ret[column]
    else:
        ret = pd.concat([df, pd.DataFrame(
            (d for idx, d in df[column].iteritems())).fillna(fillna)], axis=1)
        del ret[column]
    return ret 
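A hypothetical usage sketch (not from the original project), assuming a pandas version where Series.iteritems() is still available: given a frame whose quote column holds dictionaries, df_unpack expands each key into its own column and drops the packed column.

import pandas as pd

df = pd.DataFrame({"ticker": ["BTC", "ETH"],
                   "quote": [{"price": 1.0, "vol": 2.0}, {"price": 3.0}]})
print(df_unpack(df, "quote", fillna=0))
#   ticker  price  vol
# 0    BTC    1.0  2.0
# 1    ETH    3.0  0.0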
Example #19
Source File: storageExt.py    From FINE with MIT License
def getDataForTimeSeriesAggregation(self):
        """ Function for getting the required data if a time series aggregation is requested. """
        weightDict, data = {}, []
        I = [(self.fullChargeOpRateFix, self.fullChargeOpRateMax, 'chargeRate_', self.chargeTsaWeight),
             (self.fullDischargeOpRateFix, self.fullDischargeOpRateMax, 'dischargeRate_', self.dischargeTsaWeight),
             (self.fullStateOfChargeOpRateFix, self.fullStateOfChargeOpRateMax, '_SOCRate_', self.stateOfChargeTsaWeight),
             (self.fullOpexPerChargeOpTimeSeries, None, '_opexPerChargeOp_', self.opexChargeOpTsaWeight)]

        for rateFix, rateMax, rateName, rateWeight in I:
            weightDict, data = self.prepareTSAInput(rateFix, rateMax, rateName, rateWeight, weightDict, data)

        return (pd.concat(data, axis=1), weightDict) if data else (None, {}) 
Example #20
Source File: conversion.py    From FINE with MIT License
def getDataForTimeSeriesAggregation(self):
        """ Function for getting the required data if a time series aggregation is requested. """
        weightDict, data = {}, []
        weightDict, data = self.prepareTSAInput(self.fullOperationRateFix, self.fullOperationRateMax,
                                                '_operationRate_', self.tsaWeight, weightDict, data)
        return (pd.concat(data, axis=1), weightDict) if data else (None, {}) 
Example #21
Source File: transmission.py    From FINE with MIT License
def getDataForTimeSeriesAggregation(self):
        """ Function for getting the required data if a time series aggregation is requested. """
        weightDict, data = {}, []
        weightDict, data = self.prepareTSAInput(self.fullOperationRateFix, self.fullOperationRateMax,
                                                '_operationRate_', self.tsaWeight, weightDict, data)
        return (pd.concat(data, axis=1), weightDict) if data else (None, {}) 
Example #22
Source File: storage.py    From FINE with MIT License
def getDataForTimeSeriesAggregation(self):
        """ Function for getting the required data if a time series aggregation is requested. """
        weightDict, data = {}, []
        I = [(self.fullChargeOpRateFix, self.fullChargeOpRateMax, 'chargeRate_', self.chargeTsaWeight),
             (self.fullDischargeOpRateFix, self.fullDischargeOpRateMax, 'dischargeRate_', self.dischargeTsaWeight)]

        for rateFix, rateMax, rateName, rateWeight in I:
            weightDict, data = self.prepareTSAInput(rateFix, rateMax, rateName, rateWeight, weightDict, data)
        return (pd.concat(data, axis=1), weightDict) if data else (None, {}) 
Example #23
Source File: utils.py    From FINE with MIT License
def buildFullTimeSeries(df, periodsOrder, axis=1):
    data = []
    for p in periodsOrder:
        data.append(df.loc[p])
    return pd.concat(data, axis=axis, ignore_index=True) 
Example #24
Source File: annotators.py    From EarthSim with BSD 3-Clause "New" or "Revised" License
def annotated_points(self):
        element = self.point_stream.element
        groups = []
        for g, idx in self._group_data.items():
            df = element.iloc[idx].dframe()
            df[self.column] = g
            groups.append(df)
        data = pd.concat(groups).sort_values(self.column) if groups else []
        return element.clone(data, vdims=self.column).opts(plot={'color_index': self.column},
                                                           style={'cmap': 'Category20'}) 
Example #25
Source File: 13_house_price.py    From deep-learning-note with MIT License
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    net = get_net(train_features.shape[1])
    train_ls, _ = train(net, train_features, train_labels, None, None, num_epochs, lr, weight_decay, batch_size)
    utils.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse')
    print('train rmse %f' % train_ls[-1])
    preds = net(test_features).detach().numpy()
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('./data/HousePrice/submission.csv', index=False) 
Example #26
Source File: ledger.py    From tensortrade with Apache License 2.0
def as_frame(self, sort_by_order_seq=False) -> pd.DataFrame:

        if not sort_by_order_seq:
            return pd.DataFrame(self.transactions)

        df = pd.DataFrame(self.transactions)
        frames = []
        for poid in df.poid.unique():
            frames += [df.loc[df.poid == poid, :]]

        return pd.concat(frames, ignore_index=True, axis=0) 
Example #27
Source File: test_statistics.py    From esmlab with Apache License 2.0
def test_weighted_std(dim, level, wgts_name):
    res = esmlab.weighted_std(dset, dim=dim, weights=wgts[wgts_name])
    df = dset.to_dataframe()
    df_w = wgts.to_dataframe()[wgts_name]

    if not dim:

        d = pd.concat([df, df_w], axis=1)
        df_w_mean = d.apply(
            lambda x: np.ma.average(np.ma.MaskedArray(x, mask=np.isnan(x)), weights=d.t_s_wgts)
        )[['da1', 'da2']]
        temp_df = (df - df_w_mean) ** 2
        temp_df = temp_df.multiply(df_w, axis='index').sum()
        total_weights_da1 = df_w[df['da1'].notnull()].sum()
        total_weights_da2 = df_w[df['da2'].notnull()].sum()
        res = res.to_array().to_pandas()
        expected = res.copy(True)
        expected['da1'] = np.sqrt(temp_df['da1'] / total_weights_da1)
        expected['da2'] = np.sqrt(temp_df['da2'] / total_weights_da2)
        np.testing.assert_allclose(res, expected)

    else:
        df_w_mean = df.groupby(level=level).apply(
            wavg, weights=wgts[wgts_name].data, col_names=['da1', 'da2']
        )
        temp_df = (df - df_w_mean) ** 2
        temp_df = temp_df.multiply(df_w, axis='index').sum(level=level)
        total_weights_da1 = df_w[df['da1'].notnull()].sum(level=level)
        total_weights_da2 = df_w[df['da2'].notnull()].sum(level=level)
        res = res.to_dataframe()
        expected = pd.DataFrame(columns=res.columns)
        expected['da1'] = np.sqrt(temp_df['da1'] / total_weights_da1)
        expected['da2'] = np.sqrt(temp_df['da2'] / total_weights_da2)
        assert_frame_equal(res.sort_index(), expected.sort_index()) 
Example #28
Source File: nba_scraper.py    From nba_scraper with GNU General Public License v3.0
def scrape_date_range(
    date_from, date_to, data_format="pandas", data_dir=f"{Path.home()}/nbadata.csv"
):
    """
    Function scrapes all regular-season NBA games between two dates

    Inputs:
    date_from   - Date to scrape from
    date_to     - Date to scrape to
    data_format - the format of the data the user wants returned. This is either
                  a pandas dataframe or a csv file
    data_dir    - a filepath which to write the csv file if that option is chosen.
                  If no filepath is passed then it will attempt to write to the
                  user's home directory

    Outputs:
    nba_df     - If pandas is chosen then this function will
                 return this pandas dataframe object. If csv then
                 a csv file will be written but None will be returned
    """
    check_format(data_format)
    check_valid_dates(date_from, date_to)

    game_ids = sf.get_date_games(date_from, date_to)
    scraped_games = []

    for game in game_ids:
        print(f"Scraping game id: {game}")
        scraped_games.append(sf.main_scrape(game))

    if data_format == "pandas":
        return pd.concat(scraped_games)
    else:
        pd.concat(scraped_games).to_csv(data_dir, index=False)
        return None 
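A hypothetical usage sketch based only on the signature and docstring above; the exact date string format and the accepted data_format values ("pandas" vs. "csv") are assumptions to be verified against the nba_scraper package itself.

# assuming scrape_date_range has been imported from the nba_scraper package
nba_df = scrape_date_range("2019-01-01", "2019-01-05")  # one concatenated dataframe

# or write the games to a csv file instead of returning a dataframe
scrape_date_range("2019-01-01", "2019-01-05",
                  data_format="csv", data_dir="/tmp/nbadata.csv")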
Example #29
Source File: nba_scraper.py    From nba_scraper with GNU General Public License v3.0
def scrape_season(season, data_format="pandas", data_dir=f"{Path.home()}/nbadata.csv"):
    """
    This function scrapes an entire season and either returns it as a pandas
    dataframe or writes it to file as a csv file

    Inputs:
    season      - season to be scraped must be an integer
    data_format - the format of the data the user wants returned. This is either
                  a pandas dataframe or a csv file
    data_dir    - a filepath which to write the csv file if that option is chosen.
                  If no filepath is passed then it will attempt to write to the
                  user's home directory

    Outputs:
    nba_df     - If pandas is chosen then this function will
                 return this pandas dataframe object. If csv then
                 a csv file will be written but None will be returned
    """
    check_format(data_format)

    scraped_games = []
    game_ids = list(range(int(f"2{season-2001}00001"), int(f"2{season-2001}01231")))

    for game in game_ids:
        if game == 21201214:
            print(f"Game {game} is not available")
            continue
        else:
            print(f"Scraping game id: 00{game}")
            scraped_games.append(sf.main_scrape(f"00{game}"))

    if len(scraped_games) == 0:
        return

    nba_df = pd.concat(scraped_games)

    if data_format == "pandas":
        return nba_df
    else:
        nba_df.to_csv(f"{data_dir}/nba{season}.csv", index=False)
        return None 
Example #30
Source File: stacking_val_predict.py    From argus-freesound with MIT License
def calc_lwlrap_on_val():
    probs_df_lst = []
    for fold in config.folds:
        fold_probs_path = PREDICTION_DIR / f'fold_{fold}' / 'val' / 'probs.csv'
        probs_df = pd.read_csv(fold_probs_path)
        probs_df.set_index('fname', inplace=True)
        probs_df_lst.append(probs_df)

    probs_df = pd.concat(probs_df_lst, axis=0)
    train_curated_df = pd.read_csv(config.train_curated_csv_path)

    lwlrap = LwlrapBase(config.classes)
    for i, row in train_curated_df.iterrows():
        target = np.zeros(len(config.classes))
        for label in row.labels.split(','):
            target[config.class2index[label]] = 1.

        pred = probs_df.loc[row.fname].values
        lwlrap.accumulate(target[np.newaxis], pred[np.newaxis])

    result = {
        'overall_lwlrap': lwlrap.overall_lwlrap(),
        'per_class_lwlrap': {cls: lwl for cls, lwl in zip(config.classes,
                                                          lwlrap.per_class_lwlrap())}
    }
    print(result)
    with open(PREDICTION_DIR / 'val_lwlrap.json', 'w') as file:
        json.dump(result, file, indent=2)