Python pandas.to_numeric() Examples

The following code examples show how to use pandas.to_numeric(). They are taken from open-source Python projects.
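As a quick orientation before the project examples, here is a minimal sketch of the three errors modes (the inline results are illustrative):

import pandas as pd

s = pd.Series(["1", "-3.14", "apple"])
pd.to_numeric(s, errors="coerce")  # unparseable values become NaN: [1.0, -3.14, NaN]
pd.to_numeric(s, errors="ignore")  # returns the input unchanged when parsing fails
pd.to_numeric(s, errors="raise")   # raises ValueError: Unable to parse string "apple"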

Example 1
Project: gullikson-scripts   Author: kgullikson88   File: Mamajek_Table.py    MIT License 7 votes
def __init__(self, filename=TABLE_FILENAME):
        MS = SpectralTypeRelations.MainSequence()

        # Read in the table.
        colspecs=[[0,7], [7,14], [14,21], [21,28], [28,34], [34,40], [40,47], [47,55],
                  [55,63], [63,70], [70,78], [78,86], [86,94], [94,103], [103,110],
                  [110,116], [116,122], [122,130], [130,137], [137,144], [144,151],
                  [151,158]]
        mam_df = pd.read_fwf(filename, header=20, colspecs=colspecs, na_values=['...'])[:92]

        # Strip the * from the logAge column. Probably shouldn't but...
        mam_df['logAge'] = mam_df['logAge'].map(lambda s: s.strip('*') if isinstance(s, str) else s)  # str, not Python 2's basestring

        # Convert everything to floats
        for col in mam_df.columns:
            mam_df[col] = pd.to_numeric(mam_df[col], errors='ignore')

        # Add the spectral type number for interpolation
        mam_df['SpTNum'] = mam_df['SpT'].map(MS.SpT_To_Number)
        
        self.mam_df = mam_df 
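The column loop above works because errors='ignore' hands back the input unchanged whenever a column cannot be parsed. That option is deprecated in recent pandas, so a rough equivalent with try/except (assuming the same mam_df) would be:

for col in mam_df.columns:
    try:
        mam_df[col] = pd.to_numeric(mam_df[col])
    except (ValueError, TypeError):
        pass  # leave non-numeric columns as they are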
Example 2
Project: pymapd-examples   Author: omnisci   File: OKR_techsup_discourse.py    Apache License 2.0 6 votes
def main():
    dfcreds = get_credentials(keyfile)
    str_authentication = "&api_key=" + dfcreds['access_token'] + "&api_username=" + dfcreds['api_username']
    for url, fn in endpoints:
        url_get = url + str_authentication
        df = pd.read_json(url_get, orient="columns")
        # isolate the list
        cell = df.iloc[3, 0]
        # format and clean up the data
        df = pd.DataFrame.from_dict(cell)  # turn the list into a dataframe
        dfnew = pd.DataFrame(df, columns=["c1_timestamp", "c2_value"])  # set the column names
        dfnew["c1_timestamp"] = pd.to_datetime(df["x"])
        dfnew["c2_value"] = pd.to_numeric(df["y"])
        # write to csv
        print("writing csv to " + fn)
        dfnew.to_csv(fn, index=False, date_format="%Y-%m-%d") 
Example 3
Project: scicast   Author: iandriver   File: matrix_filter.py    MIT License 6 votes
def find_top_common_genes(self):
        top_common_list = []
        count = 0
        done = False
        log2_df_by_gene = self.log2_df_cell.transpose()
        log2_df2_gene = log2_df_by_gene.apply(pd.to_numeric,errors='coerce')
        log_mean = log2_df2_gene.mean(axis=0).sort_values(ascending=False)
        try:
            # reindex_axis was removed in pandas 1.0; reindex(..., axis=1) is the equivalent
            log2_sorted_gene = log2_df_by_gene.reindex(log2_df_by_gene.mean(axis=0).sort_values(ascending=False).index, axis=1)
        except ValueError:
            overlap_list = [item for item, count in collections.Counter(self.log2_df_cell.index).items() if count > 1]
            print(overlap_list, len(overlap_list))
            sys.exit('Error: Duplicate GeneIDs are present.')
        for gene in log2_sorted_gene.columns.tolist():
            if sum(genes < 1 for genes in log2_df_by_gene[gene])<6:
                if count < 20:
                    count+=1
                    top_common_list.append(gene)
            if count == 20:
                done = True
                break
        if done:
            return log2_df_by_gene[top_common_list].transpose()
        else:
            return [0] 
Example 4
Project: scicast   Author: iandriver   File: cluster.py    MIT License 6 votes
def find_top_common_genes(log2_df_by_cell, num_common=25):
    top_common_list = []
    count = 0
    done = False
    log2_df_by_gene = log2_df_by_cell.transpose()
    log2_df2_gene = log2_df_by_gene.apply(pd.to_numeric,errors='coerce')
    log_mean = log2_df2_gene.mean(axis=0).sort_values(ascending=False)
    try:
        # reindex_axis was removed in pandas 1.0; reindex(..., axis=1) is the equivalent
        log2_sorted_gene = log2_df_by_gene.reindex(log2_df_by_gene.mean(axis=0).sort_values(ascending=False).index, axis=1)
    except ValueError:
        overlap_list = [item for item, count in collections.Counter(log2_df_by_cell.index).items() if count > 1]
        print(overlap_list, len(overlap_list))
        sys.exit('Error: Duplicate GeneIDs are present.')
    for gene in log2_sorted_gene.columns.tolist():
        if sum(genes < 1 for genes in log2_df_by_gene[gene])<6:
            if count < num_common:
                count+=1
                top_common_list.append(gene)
        if count == num_common:
            done = True
            break
    if done:
        return log2_df_by_gene[top_common_list].transpose()
    else:
        return [0] 
Example 5
Project: scicast   Author: iandriver   File: matrix_filter.py    MIT License 6 votes
def find_top_common_genes(log2_df_by_cell, num_common=25):
        top_common_list = []
        count = 0
        done = False
        log2_df_by_gene = log2_df_by_cell.transpose()
        # pd.to_numeric only accepts 1-d input, so convert column by column
        log2_df2_gene = log2_df_by_gene.apply(pd.to_numeric, errors='coerce')
        log_mean = log2_df2_gene.mean(axis=0).sort_values(ascending=False)
        # reindex_axis was removed in pandas 1.0; reindex(..., axis=1) is the equivalent
        log2_sorted_gene = log2_df_by_gene.reindex(log2_df_by_gene.mean(axis=0).sort_values(ascending=False).index, axis=1)
        for gene in log2_sorted_gene.columns.tolist():
            if sum(genes < 1 for genes in log2_df_by_gene[gene])<6:
                if count < num_common:
                    count+=1
                    top_common_list.append(gene)
            if count == num_common:
                done = True
                break
        if done:
            return log2_df_by_gene[top_common_list].transpose()
        else:
            return [0] 
Example 6
Project: thewarden   Author: pxsocs   File: pricing.py    MIT License 6 votes
def df_fx(self, currency, fx_provider):
        try:
            # First get the df from this currency
            if currency != 'USD':
                fx = PriceData(currency, fx_provider)
                fx.df = fx.df.rename(columns={'close': 'fx_close'})
                fx.df["fx_close"] = pd.to_numeric(fx.df.fx_close,
                                                  errors='coerce')
                # Merge the two dfs:
                merge_df = pd.merge(self.df, fx.df, on='date', how='inner')
                merge_df['close'] = merge_df['close'].astype(float)
                merge_df['close_converted'] = merge_df['close'] * merge_df[
                    'fx_close']
                return (merge_df)
            else:  # If currency is USD no conversion is needed - prices are all in USD
                self.df['fx_close'] = 1
                self.df['close_converted'] = self.df['close'].astype(float)
                return (self.df)
        except Exception as e:
            self.errors.append(e)
            return (None) 
Example 7
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 6 votes
def test_error(self):
        s = pd.Series([1, -3.14, 'apple'])
        msg = 'Unable to parse string "apple" at position 2'
        with pytest.raises(ValueError, match=msg):
            to_numeric(s, errors='raise')

        res = to_numeric(s, errors='ignore')
        expected = pd.Series([1, -3.14, 'apple'])
        tm.assert_series_equal(res, expected)

        res = to_numeric(s, errors='coerce')
        expected = pd.Series([1, -3.14, np.nan])
        tm.assert_series_equal(res, expected)

        s = pd.Series(['orange', 1, -3.14, 'apple'])
        msg = 'Unable to parse string "orange" at position 0'
        with pytest.raises(ValueError, match=msg):
            to_numeric(s, errors='raise') 
Example 8
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 6 votes
def test_numeric_lists_and_arrays(self):
        # Test to_numeric with embedded lists and arrays
        df = pd.DataFrame(dict(
            a=[[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1]
        ))
        df['a'] = df['a'].apply(to_numeric)
        expected = pd.DataFrame(dict(
            a=[[3.14, 1.0], 1.6, 0.1],
        ))
        tm.assert_frame_equal(df, expected)

        df = pd.DataFrame(dict(
            a=[np.array([decimal.Decimal(3.14), 1.0]), 0.1]
        ))
        df['a'] = df['a'].apply(to_numeric)
        expected = pd.DataFrame(dict(
            a=[[3.14, 1.0], 0.1],
        ))
        tm.assert_frame_equal(df, expected) 
Example 9
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 6 votes
def test_numeric_dtypes(self):
        idx = pd.Index([1, 2, 3], name='xxx')
        res = pd.to_numeric(idx)
        tm.assert_index_equal(res, idx)

        res = pd.to_numeric(pd.Series(idx, name='xxx'))
        tm.assert_series_equal(res, pd.Series(idx, name='xxx'))

        res = pd.to_numeric(idx.values)
        tm.assert_numpy_array_equal(res, idx.values)

        idx = pd.Index([1., np.nan, 3., np.nan], name='xxx')
        res = pd.to_numeric(idx)
        tm.assert_index_equal(res, idx)

        res = pd.to_numeric(pd.Series(idx, name='xxx'))
        tm.assert_series_equal(res, pd.Series(idx, name='xxx'))

        res = pd.to_numeric(idx.values)
        tm.assert_numpy_array_equal(res, idx.values) 
Example 10
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 6 votes
def test_str(self):
        idx = pd.Index(['1', '2', '3'], name='xxx')
        exp = np.array([1, 2, 3], dtype='int64')
        res = pd.to_numeric(idx)
        tm.assert_index_equal(res, pd.Index(exp, name='xxx'))

        res = pd.to_numeric(pd.Series(idx, name='xxx'))
        tm.assert_series_equal(res, pd.Series(exp, name='xxx'))

        res = pd.to_numeric(idx.values)
        tm.assert_numpy_array_equal(res, exp)

        idx = pd.Index(['1.5', '2.7', '3.4'], name='xxx')
        exp = np.array([1.5, 2.7, 3.4])
        res = pd.to_numeric(idx)
        tm.assert_index_equal(res, pd.Index(exp, name='xxx'))

        res = pd.to_numeric(pd.Series(idx, name='xxx'))
        tm.assert_series_equal(res, pd.Series(exp, name='xxx'))

        res = pd.to_numeric(idx.values)
        tm.assert_numpy_array_equal(res, exp) 
Example 11
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 6 votes
def test_coerce_uint64_conflict(self):
        # see gh-17007 and gh-17125
        #
        # Still returns float despite the uint64-nan conflict,
        # which would normally force the casting to object.
        df = pd.DataFrame({"a": [200, 300, "", "NaN", 30000000000000000000]})
        expected = pd.Series([200, 300, np.nan, np.nan,
                              30000000000000000000], dtype=float, name="a")
        result = to_numeric(df["a"], errors="coerce")
        tm.assert_series_equal(result, expected)

        s = pd.Series(["12345678901234567890", "1234567890", "ITEM"])
        expected = pd.Series([12345678901234567890,
                              1234567890, np.nan], dtype=float)
        result = to_numeric(s, errors="coerce")
        tm.assert_series_equal(result, expected)

        # For completeness, check against "ignore" and "raise"
        result = to_numeric(s, errors="ignore")
        tm.assert_series_equal(result, s)

        msg = "Unable to parse string"
        with pytest.raises(ValueError, match=msg):
            to_numeric(s, errors="raise") 
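The behavior pinned down here is easy to reproduce directly: a value above the int64 range can only coexist with NaN in a float column, so coercion falls back to float64 rather than object (illustrative output):

pd.to_numeric(pd.Series([200, "NaN", 30000000000000000000]), errors="coerce")
# 0    2.000000e+02
# 1             NaN
# 2    3.000000e+19
# dtype: float64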
Example 12
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License 5 votes
def _call_cluster_peaks(self, cluster, min_cluster_expr_frac,
                            min_block_overlap, min_max_block_expr_frac):
        cluster_entries = cluster["header"].strip().split('\t')
        cluster_expr = float(cluster_entries[5])
        cluster_strand = cluster_entries[4]
        cluster_replicon = cluster_entries[1]
        peak_df = pd.DataFrame()

        if len(cluster["blocks"]) == 1:
            block_entries = cluster["blocks"][0].strip().split('\t')
            peak_start = int(block_entries[2]) + 1
            peak_end = int(block_entries[3])
            peak_df = peak_df.append(pd.Series([peak_start, peak_end], index=[
                "peak_start", "peak_end"]), ignore_index=True)
        else:
            blocks = [block.strip().split('\t') for block in cluster["blocks"]]
            block_df = pd.DataFrame(
                blocks, columns=["blockNb", "blockChrom", "blockStart",
                                 "blockEnd", "blockStrand", "blockExpression",
                                 "readCount"])
            block_df[["blockNb", "blockStart", "blockEnd", "blockExpression",
                      "readCount"]] = block_df[
                    ["blockNb", "blockStart", "blockEnd", "blockExpression",
                     "readCount"]].apply(pd.to_numeric)
            peak_df = self._split_cluster_peaks(block_df, cluster_expr,
                                                peak_df, min_cluster_expr_frac,
                                                min_block_overlap,
                                                min_max_block_expr_frac)
        if peak_df.empty:
            return
        peak_df = peak_df.astype(np.int64)
        peak_df["peak_strand"] = cluster_strand
        self._replicon_dict[cluster_replicon]["peak_df"] = self._replicon_dict[
            cluster_replicon]["peak_df"].append(peak_df, ignore_index=True) 
Example 13
Project: Kaggle-Statoil-Challenge   Author: adodd202   File: utils.py    MIT License 5 votes
def getStatoilTrainValLoaders(args):
    fixSeed(args)
    local_data = pd.read_json('/home/adodd202/train.json')

    local_data = shuffle(local_data)  # otherwise same validation set each time!
    local_data = local_data.reindex(np.random.permutation(local_data.index))

    local_data['band_1'] = local_data['band_1'].apply(lambda x: np.array(x).reshape(75, 75))
    local_data['band_2'] = local_data['band_2'].apply(lambda x: np.array(x).reshape(75, 75))
    local_data['inc_angle'] = pd.to_numeric(local_data['inc_angle'], errors='coerce')
    local_data['inc_angle'].fillna(0, inplace=True)

    band_1 = np.concatenate([im for im in local_data['band_1']]).reshape(-1, 75, 75)
    band_2 = np.concatenate([im for im in local_data['band_2']]).reshape(-1, 75, 75)
    # band_3=(band_1+band_2)/2
    local_full_img = np.stack([band_1, band_2], axis=1)

    train_imgs = XnumpyToTensor(local_full_img, args)
    train_targets = YnumpyToTensor(local_data['is_iceberg'].values, args)
    dset_train = TensorDataset(train_imgs, train_targets)

    local_train_ds, local_val_ds = trainTestSplit(dset_train, args.validationRatio)
    local_train_loader = torch.utils.data.DataLoader(local_train_ds, batch_size=args.batch_size, shuffle=False,
                                                     num_workers=args.workers)
    local_val_loader = torch.utils.data.DataLoader(local_val_ds, batch_size=args.batch_size, shuffle=False,
                                                   num_workers=args.workers)
    return local_train_loader, local_val_loader, local_train_ds, local_val_ds 
Example 14
Project: Kaggle-Statoil-Challenge   Author: adodd202   File: utils.py    MIT License 5 votes
def BinaryInference(local_model, args):
    if args.use_cuda:
        local_model.cuda()
    local_model.eval()
    df_test_set = pd.read_json('/home/adodd202/test.json')
    df_test_set['band_1'] = df_test_set['band_1'].apply(lambda x: np.array(x).reshape(75, 75))
    df_test_set['band_2'] = df_test_set['band_2'].apply(lambda x: np.array(x).reshape(75, 75))
    df_test_set['inc_angle'] = pd.to_numeric(df_test_set['inc_angle'], errors='coerce')
    # df_test_set.head(3)
    print(df_test_set.shape)
    columns = ['id', 'is_iceberg']
    df_pred = pd.DataFrame(data=np.zeros((0, len(columns))), columns=columns)
    # df_pred.id.astype(int)
    for index, row in df_test_set.iterrows():
        rwo_no_id = row.drop('id')
        band_1_test = (rwo_no_id['band_1']).reshape(-1, 75, 75)
        band_2_test = (rwo_no_id['band_2']).reshape(-1, 75, 75)
        # band_3_test = (band_1_test + band_2_test) / 2
        full_img_test = np.stack([band_1_test, band_2_test], axis=1)

        x_data_np = np.array(full_img_test, dtype=np.float32)
        if args.use_cuda:
            X_tensor_test = Variable(torch.from_numpy(x_data_np).cuda())  # Note the conversion for pytorch
        else:
            X_tensor_test = Variable(torch.from_numpy(x_data_np))  # Note the conversion for pytorch

        # X_tensor_test=X_tensor_test.view(1, trainX.shape[1]) # does not work with 1d tensors
        predicted_val = (local_model(X_tensor_test).data).float()  # probabilities
        p_test = predicted_val.cpu().numpy().item()  # otherwise we get an array, we need a single float

        df_pred = df_pred.append({'id': row['id'], 'is_iceberg': p_test}, ignore_index=True)

    return df_pred 
Example 15
Project: pymapd-examples   Author: omnisci   File: parsing_utils.py    Apache License 2.0 5 votes
def format_int_col(df, col_list):
    if col_list != {}:
        for col in col_list: df[col] = pd.to_numeric(df[col], downcast='integer') 
Example 16
Project: pymapd-examples   Author: omnisci   File: parsing_utils.py    Apache License 2.0 5 votes
def format_flt_col(df, col_list):
    if col_list != {}:
        for col in col_list: df[col] = pd.to_numeric(df[col], downcast='float') 
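The two helpers above differ only in the downcast target. downcast picks the smallest dtype of the requested family that can hold the data without loss, for example:

pd.to_numeric(pd.Series([1, 2, 3]), downcast="integer")   # int8
pd.to_numeric(pd.Series([1, 2, 3]), downcast="unsigned")  # uint8
pd.to_numeric(pd.Series([1.0, 2.0]), downcast="float")    # float32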
Example 17
Project: gullikson-scripts   Author: kgullikson88   File: HelperFunctions.py    MIT License 5 votes
def read_observed_targets(target_filename=OBS_TARGET_FNAME):
    """
    Reads the observed targets excel file into a pandas dataframe
    :param target_filename: The filename to read. Has a very specific format!
    :return:
    """
    sample_names = ['identifier', 'RA/DEC (J2000)', 'plx', 'Vmag', 'Kmag', 'vsini', 'SpT', 'configuration',
                    'Instrument',
                    'Date',
                    'Temperature', 'Velocity', 'vsini_sec', '[Fe/H]', 'Significance', 'Sens_min', 'Sens_any',
                    'Comments',
                    'Rank', 'Keck', 'VLT', 'Gemini', 'Imaging_Detecton']

    def plx_convert(s):
        try:
            return float(s)
        except ValueError:
            return np.nan

    # the original used the pre-0.21 keyword "sheetname"
    sample = pd.read_excel(target_filename, sheet_name=0, na_values=['     ~'], names=sample_names,
                           converters=dict(plx=plx_convert))
    sample = sample.reset_index(drop=True)[1:]

    # Convert everything to floats
    for col in sample.columns:
        sample[col] = pd.to_numeric(sample[col], errors='ignore')

    return sample 
Example 18
Project: fmlpy   Author: crazywiden   File: bars.py    MIT License 5 votes
def volume_bar(data, size):
    '''
    Calculate HOLC for a certain volume
    @requires:
        pandas
    @parameters:
        data: input data with time, price and volume
        size: input volume bar, integer
    @return:
        dataframe (start_t, end_t, start_idx, end_idx, high, low, close, open)
    '''
    data = _preprocess(data, True)
    if not isinstance(size, int):
        raise TypeError("Size should be an integer")
    data.loc[:, 'vol'] = pd.to_numeric(data['vol'])
    data.loc[:, 'cumsum'] = data['vol'].cumsum()
    data['cumsum'] = data['cumsum'] // size
    data = data.assign(idx=list(range(data.shape[0])))
    data['idx'] = data['idx'].astype('int64')
    aggregated_data = data.groupby('cumsum').agg({'time': ['first', 'last'], 'idx': ['first', 'last'], 
        'price': ['min', 'max', 'first', 'last']})
    aggregated_data.columns = [' '.join(col).strip() for col in aggregated_data.columns.values]
    aggregated_data.columns = ['start_t', 'end_t', 'start_idx', 'end_idx','low', 'high', 'open', 'close']
    result = aggregated_data.reset_index()
    result = result.drop(['cumsum'], axis=1)
    result = result.dropna()
    return result 
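A hypothetical call, assuming _preprocess has already produced the time, price and vol columns the function expects:

bars = volume_bar(tick_data, size=1000)  # tick_data: raw trades DataFrame
# each resulting row covers roughly 1000 units of traded volume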
Example 19
Project: fmlpy   Author: crazywiden   File: bars.py    MIT License 5 votes
def dollar_bar(data, bar):
    '''
    Calculate HOLC for a certain dollar
    @requires:
        pandas
    @parameters:
        data: input data with time, price and volume
        bar: input dollar bar, integer
    @return:
        dataframe (start_t, end_t, start_idx, end_idx, high, low, close, open)
    '''
    data = _preprocess(data, True)
    if not isinstance(bar, int):
        raise TypeError("Dollar bar should be an integer")
    data.loc[:, 'vol'] = pd.to_numeric(data['vol'])
    data['dollar'] = data['price'] * data['vol']
    data.loc[:, 'cumsum'] = data['dollar'].cumsum()
    data['cumsum'] = data['cumsum'] // bar
    data = data.assign(idx=list(range(data.shape[0])))
    data['idx'] = data['idx'].astype('int64')
    aggregated_data = data.groupby('cumsum').agg({'time': ['first', 'last'], 'idx': ['first', 'last'],
        'price': ['min', 'max', 'first', 'last']})
    aggregated_data.columns = [' '.join(col).strip() for col in aggregated_data.columns.values]
    aggregated_data.columns = ['start_t', 'end_t', 'start_idx', 'end_idx','low', 'high', 'open', 'close']
    result = aggregated_data.reset_index()
    result = result.drop(['cumsum'], axis=1)
    result = result.dropna()
    return result 
Example 20
Project: scicast   Author: iandriver   File: matrix_filter.py    MIT License 5 votes
def log2_oulierfilter(df_by_cell, plot=False):
        log2_df = np.log2(df_by_cell+1)
        top_log2 = find_top_common_genes(log2_df)
        if all(top_log2) != 0:
            # pd.to_numeric only accepts 1-d input, so convert column by column
            log2_df2 = log2_df.apply(pd.to_numeric, errors='coerce')
            log_mean = top_log2.mean(axis=0).sort_values(ascending=False)
            # reindex_axis was removed in pandas 1.0; reindex(..., axis=1) is the equivalent
            log2_sorted = top_log2.reindex(top_log2.mean(axis=0).sort_values(ascending=False).index, axis=1)
            xticks = []
            keep_col= []
            log2_cutoff = np.average(np.average(log2_sorted))-2*np.average(np.std(log2_sorted))
            for col, m in zip(log2_sorted.columns.tolist(),log2_sorted.mean()):
                if m > log2_cutoff:
                    keep_col.append(col)
                    xticks.append(col+' '+str("%.2f" % m))
            excluded_cells = [x for x in log2_sorted.columns.tolist() if x not in keep_col]
            filtered_df_by_cell = df_by_cell[keep_col]
            filtered_df_by_gene = filtered_df_by_cell.transpose()
            filtered_log2 = np.log2(filtered_df_by_cell[filtered_df_by_cell>0])
            if plot:
                ax = sns.boxplot(data=filtered_log2, whis= .75, notch=True)
                ax = sns.stripplot(x=filtered_log2.columns.values, y=filtered_log2.mean(axis=0), size=4, jitter=True, edgecolor="gray")
                xtickNames = plt.setp(ax, xticklabels=xticks)
                plt.setp(xtickNames, rotation=90, fontsize=9)
                plt.show()
                plt.clf()
                sns.distplot(filtered_log2.mean())
                plt.show()
            log2_expdf_cell = np.log2(filtered_df_by_cell+1)
            log2_expdf_gene = log2_expdf_cell.transpose()
            return log2_expdf_cell, log2_expdf_gene
        else:
            print("no common genes found")
            return log2_df, log2_df.transpose() 
Example 21
Project: performance_tracker   Author: metro-ontime   File: process_vehicles.py    GNU General Public License v3.0 5 votes
def process_raw_vehicles(df, track):
    df = df.drop_duplicates(
        subset=["report_time", "latitude", "longitude", "vehicle_id"]
    )
    df = df[df["predictable"] == True]

    df["latitude"] = pd.to_numeric(df.latitude)
    df["longitude"] = pd.to_numeric(df.longitude)
    df = toGDF(df)

    mask_0 = (df["direction"] == 0) | (df["direction"] == 90)
    mask_1 = (df["direction"] == 180) | (df["direction"] == 270)
    df_0 = df.loc[mask_0]
    df_0 = df_0.assign(direction_id = 0)
    df_1 = df.loc[mask_1]
    df_1 = df_1.assign(direction_id = 1)
    df_0["relative_position"] = findRelativePositions(df_0, track[0])
    df_1["relative_position"] = findRelativePositions(df_1, track[1])
    df = pd.concat([df_0, df_1])

    df["datetime"] = pd.to_datetime(df["report_time"], utc=True)
    df["datetime_local_iso8601"] = df.report_time.apply(
        lambda dt: pendulum.parse(dt, tz="UTC")
        .in_tz("America/Los_Angeles")
        .to_iso8601_string()
    )
    df = df.reset_index(drop=True)  # necessary both before and after getTrips
    df = getTrips(df)
    df = df.reset_index(drop=True)  # necessary both before and after getTrips
    df["datetime"] = df["datetime_local_iso8601"]
    df = df[["datetime", "trip_id", "direction_id", "relative_position"]]
    return df 
Example 22
Project: performance_tracker   Author: metro-ontime   File: estimate_arrivals.py    GNU General Public License v3.0 5 votes
def estimate_arrivals(trip_id, trip, stations, direction):
    trip.loc[:, "estimate"] = False
    stations.loc[:, "estimate"] = True
    trip_est = stations
    trip_est.loc[:, "trip_id"] = trip_id
    trip_est.loc[:, "direction_id"] = direction
    combined = trip.append(trip_est)
    combined = combined.sort_values("relative_position")
    combined = combined.reset_index(drop=True)
    # shift vals to move adjacent position and date data into each row
    combined.loc[:, "previous_pos"] = combined.relative_position.shift()
    combined.loc[:, "next_pos"] = combined.relative_position.shift(-1)
    combined.loc[:, "previous_dt"] = combined.datetime.shift()
    combined.loc[:, "next_dt"] = combined.datetime.shift(-1)
    select = combined[combined["estimate"] == True]
    select.loc[:, "weight"] = (select.relative_position - select.previous_pos) / (
        select.next_pos - select.previous_pos
    )
    select.loc[:, "time_interpolation"] = (
        select.next_dt - select.previous_dt
    ) * select.weight
    select.loc[:, "datetime"] = select.previous_dt + select.time_interpolation
    select.loc[:, "datetime"] = pd.DatetimeIndex(select.datetime).round("S")
    select.loc[:, "stop_id"] = pd.to_numeric(select.stop_id, downcast="integer")
    # Some station estimates cannot be reliably estimated using this
    # technique and will have datetime = NaT, so we remove them.
    select = select.dropna(subset=["datetime"])
    return select 
Example 23
Project: respy   Author: OpenSourceEconomics   File: shared.py    MIT License 5 votes
def downcast_to_smallest_dtype(series):
    """Downcast the dtype of a :class:`pandas.Series` to the lowest possible dtype.

    Be aware that NumPy integers silently overflow, which is why conversion to low dtypes
    should be done after calculations. For example, using :class:`np.uint8` for an array
    and squaring the elements leads to silent overflows for numbers higher than 255.

    For more information on the boundaries, see the NumPy documentation under
    https://docs.scipy.org/doc/numpy-1.17.0/user/basics.types.html.

    """
    # We can skip integer as "unsigned" and "signed" will find the same dtypes.
    _downcast_options = ["unsigned", "signed", "float"]

    if series.dtype.name == "category":
        min_dtype = "category"

    elif series.dtype == bool:  # np.bool in the original; the alias was removed in NumPy 1.24
        min_dtype = np.dtype("uint8")

    else:
        min_dtype = np.dtype("float64")

        for dc_opt in _downcast_options:
            dtype = pd.to_numeric(series, downcast=dc_opt).dtype

            if dtype.itemsize == 1 and dtype.name.startswith("u"):
                min_dtype = dtype
                break
            elif dtype.itemsize == min_dtype.itemsize and dtype.name.startswith("u"):
                min_dtype = dtype
            elif dtype.itemsize < min_dtype.itemsize:
                min_dtype = dtype
            else:
                pass

    return series.astype(min_dtype) 
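A hypothetical call, to make the selection logic concrete: "unsigned" is tried first and wins ties at the same itemsize, so small non-negative integers end up unsigned:

s = pd.Series([0, 1, 255])
downcast_to_smallest_dtype(s).dtype  # uint8 (itemsize 1, kept via the early break)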
Example 24
Project: student-resources   Author: djgroen   File: make-animation.py    BSD 3-Clause "New" or "Revised" License 5 votes
def read_csv_to_df():
    # Reads data from data directory
    df_list = []
    
    num_files = len(glob.glob('%s/agents.*.csv' % data_path))
    for i in range(1,num_files+1):
        file_path = '%s/agents.%s.csv' % (data_path, i)
        print(file_path)
        dataframe = pd.read_csv(file_path, index_col='#id')
        dataframe = dataframe.apply(pd.to_numeric)  # apply returns a new frame; the original discarded the result
        df_list.append(dataframe)
    return df_list 
Example 25
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_empty(input_kwargs, result_kwargs):
    # see gh-16302
    ser = Series([], dtype=object)
    result = to_numeric(ser, **input_kwargs)

    expected = Series([], **result_kwargs)
    tm.assert_series_equal(result, expected) 
Example 26
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_series(last_val):
    ser = Series(["1", "-3.14", last_val])
    result = to_numeric(ser)

    expected = Series([1, -3.14, 7])
    tm.assert_series_equal(result, expected) 
Example 27
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_series_numeric(data):
    ser = Series(data, index=list("ABCD"), name="EFG")

    result = to_numeric(ser)
    tm.assert_series_equal(result, ser) 
Example 28
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_error(data, msg):
    ser = Series(data)

    with pytest.raises(ValueError, match=msg):
        to_numeric(ser, errors="raise") 
Example 29
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_ignore_error(errors, exp_data):
    ser = Series([1, -3.14, "apple"])
    result = to_numeric(ser, errors=errors)

    expected = Series(exp_data)
    tm.assert_series_equal(result, expected) 
Example 30
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_list():
    ser = ["1", "-3.14", "7"]
    res = to_numeric(ser)

    expected = np.array([1, -3.14, 7])
    tm.assert_numpy_array_equal(res, expected) 
Example 31
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_list_numeric(data, arr_kwargs):
    result = to_numeric(data)
    expected = np.array(data, **arr_kwargs)
    tm.assert_numpy_array_equal(result, expected) 
Example 32
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_numeric(kwargs):
    data = [1, -3.14, 7]

    ser = Series(data, **kwargs)
    result = to_numeric(ser)

    expected = Series(data)
    tm.assert_series_equal(result, expected) 
Example 33
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_numeric_df_columns(columns):
    # see gh-14827
    df = DataFrame(
        dict(
            a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"],
            b=[1.0, 2.0, 3.0, 4.0],
        )
    )

    expected = DataFrame(dict(a=[1.2, 3.14, np.inf, 0.1], b=[1.0, 2.0, 3.0, 4.0]))

    df_copy = df.copy()
    df_copy[columns] = df_copy[columns].apply(to_numeric)

    tm.assert_frame_equal(df_copy, expected) 
Example 34
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_numeric_embedded_arr_likes(data, exp_data):
    # Test to_numeric with embedded lists and arrays
    df = DataFrame(dict(a=data))
    df["a"] = df["a"].apply(to_numeric)

    expected = DataFrame(dict(a=exp_data))
    tm.assert_frame_equal(df, expected) 
Example 35
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_type_check(errors):
    # see gh-11776
    df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
    kwargs = dict(errors=errors) if errors is not None else dict()
    error_ctx = pytest.raises(TypeError, match="1-d array")

    with error_ctx:
        to_numeric(df, **kwargs) 
Example 36
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_scalar(val, signed, transform):
    val = -val if signed else val
    assert to_numeric(transform(val)) == float(val) 
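As the test exercises, to_numeric also accepts plain scalars and returns a scalar (illustrative outputs):

pd.to_numeric("3.14")  # 3.14
pd.to_numeric("42")    # 42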
Example 37
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_really_large_scalar(large_val, signed, transform, errors):
    # see gh-24910
    kwargs = dict(errors=errors) if errors is not None else dict()
    val = -large_val if signed else large_val

    val = transform(val)
    val_is_string = isinstance(val, str)

    if val_is_string and errors in (None, "raise"):
        msg = "Integer out of range. at position 0"
        with pytest.raises(ValueError, match=msg):
            to_numeric(val, **kwargs)
    else:
        expected = float(val) if (errors == "coerce" and val_is_string) else val
        tm.assert_almost_equal(to_numeric(val, **kwargs), expected) 
Example 38
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors):
    # see gh-24910
    kwargs = dict(errors=errors) if errors is not None else dict()
    val = -large_val if signed else large_val
    val = transform(val)

    extra_elt = "string"
    arr = [val] + multiple_elts * [extra_elt]

    val_is_string = isinstance(val, str)
    coercing = errors == "coerce"

    if errors in (None, "raise") and (val_is_string or multiple_elts):
        if val_is_string:
            msg = "Integer out of range. at position 0"
        else:
            msg = 'Unable to parse string "string" at position 1'

        with pytest.raises(ValueError, match=msg):
            to_numeric(arr, **kwargs)
    else:
        result = to_numeric(arr, **kwargs)

        exp_val = float(val) if (coercing and val_is_string) else val
        expected = [exp_val]

        if multiple_elts:
            if coercing:
                expected.append(np.nan)
                exp_dtype = float
            else:
                expected.append(extra_elt)
                exp_dtype = object
        else:
            exp_dtype = float if isinstance(exp_val, (int, float)) else object

        tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) 
Example 39
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors):
    # see gh-24910
    #
    # Even if we discover that we have to hold float, does not mean
    # we should be lenient on subsequent elements that fail to be integer.
    kwargs = dict(errors=errors) if errors is not None else dict()
    arr = [str(-large_val if signed else large_val)]

    if multiple_elts:
        arr.insert(0, large_val)

    if errors in (None, "raise"):
        index = int(multiple_elts)
        msg = "Integer out of range. at position {index}".format(index=index)

        with pytest.raises(ValueError, match=msg):
            to_numeric(arr, **kwargs)
    else:
        result = to_numeric(arr, **kwargs)

        if errors == "coerce":
            expected = [float(i) for i in arr]
            exp_dtype = float
        else:
            expected = arr
            exp_dtype = object

        tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) 
Example 40
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_numeric_dtypes(data, transform_assert_equal):
    transform, assert_equal = transform_assert_equal
    data = transform(data)

    result = to_numeric(data)
    assert_equal(result, data) 
Example 41
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_str(data, exp, transform_assert_equal):
    transform, assert_equal = transform_assert_equal
    result = to_numeric(transform(data))

    expected = transform(exp)
    assert_equal(result, expected) 
Example 42
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_datetime_like(tz_naive_fixture, transform_assert_equal):
    transform, assert_equal = transform_assert_equal
    idx = pd.date_range("20130101", periods=3, tz=tz_naive_fixture)

    result = to_numeric(transform(idx))
    expected = transform(idx.asi8)
    assert_equal(result, expected) 
Example 43
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_timedelta(transform_assert_equal):
    transform, assert_equal = transform_assert_equal
    idx = pd.timedelta_range("1 days", periods=3, freq="D")

    result = to_numeric(transform(idx))
    expected = transform(idx.asi8)
    assert_equal(result, expected) 
Example 44
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_period(transform_assert_equal):
    transform, assert_equal = transform_assert_equal

    idx = pd.period_range("2011-01", periods=3, freq="M", name="")
    inp = transform(idx)

    if isinstance(inp, Index):
        result = to_numeric(inp)
        expected = transform(idx.asi8)
        assert_equal(result, expected)
    else:
        # TODO: PeriodDtype, so support it in to_numeric.
        pytest.skip("Missing PeriodDtype support in to_numeric") 
Example 45
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_downcast_invalid_cast():
    # see gh-13352
    data = ["1", 2, 3]
    invalid_downcast = "unsigned-integer"
    msg = "invalid downcasting method provided"

    with pytest.raises(ValueError, match=msg):
        to_numeric(data, downcast=invalid_downcast) 
Example 46
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_errors_invalid_value():
    # see gh-26466
    data = ["1", 2, 3]
    invalid_error_value = "invalid"
    msg = "invalid error value specified"

    with pytest.raises(ValueError, match=msg):
        to_numeric(data, errors=invalid_error_value) 
Example 47
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_downcast_basic(data, kwargs, exp_dtype):
    # see gh-13352
    result = to_numeric(data, **kwargs)
    expected = np.array([1, 2, 3], dtype=exp_dtype)
    tm.assert_numpy_array_equal(result, expected) 
Example 48
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_signed_downcast(data, signed_downcast):
    # see gh-13352
    smallest_int_dtype = np.dtype(np.typecodes["Integer"][0])
    expected = np.array([1, 2, 3], dtype=smallest_int_dtype)

    res = to_numeric(data, downcast=signed_downcast)
    tm.assert_numpy_array_equal(res, expected) 
Example 49
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_ignore_downcast_invalid_data():
    # If we can't successfully cast the given
    # data to a numeric dtype, do not bother
    # with the downcast parameter.
    data = ["foo", 2, 3]
    expected = np.array(data, dtype=object)

    res = to_numeric(data, errors="ignore", downcast="unsigned")
    tm.assert_numpy_array_equal(res, expected) 
Example 50
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_ignore_downcast_cannot_convert_float(data, expected, downcast):
    # Cannot cast to an integer (signed or unsigned)
    # because we have a float number.
    res = to_numeric(data, downcast=downcast)
    tm.assert_numpy_array_equal(res, expected) 
Example 51
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_downcast_not8bit(downcast, expected_dtype):
    # the smallest integer dtype need not be np.(u)int8
    data = ["256", 257, 258]

    expected = np.array([256, 257, 258], dtype=expected_dtype)
    res = to_numeric(data, downcast=downcast)
    tm.assert_numpy_array_equal(res, expected) 
Example 52
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_downcast_limits(dtype, downcast, min_max):
    # see gh-14404: test the limits of each downcast.
    series = to_numeric(Series(min_max), downcast=downcast)
    assert series.dtype == dtype 
Example 53
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_coerce_uint64_conflict(data, exp_data):
    # see gh-17007 and gh-17125
    #
    # Still returns float despite the uint64-nan conflict,
    # which would normally force the casting to object.
    result = to_numeric(Series(data), errors="coerce")
    expected = Series(exp_data, dtype=float)
    tm.assert_series_equal(result, expected) 
Example 54
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numeric.py    MIT License 5 votes
def test_non_coerce_uint64_conflict(errors, exp):
    # see gh-17007 and gh-17125
    #
    # For completeness.
    ser = Series(["12345678901234567890", "1234567890", "ITEM"])

    if isinstance(exp, str):
        with pytest.raises(ValueError, match=exp):
            to_numeric(ser, errors=errors)
    else:
        result = to_numeric(ser, errors=errors)
        tm.assert_series_equal(result, ser) 
Example 55
Project: sthlm-bostad-vis   Author: ashwinvis   File: sssb.py    GNU General Public License v3.0 5 votes
def make_df_hist(self, store_deltas=True):
        col1 = self.cache_timestamp.strftime("%c")
        col2 = "No. of Applications"

        credit_days = self.df["Credit days"].str.split()
        df_tmp = credit_days.apply(pd.Series)
        df_tmp.columns = [col1, col2]

        # Format and change dtype
        series1 = df_tmp[col1].apply(pd.to_numeric)
        series2 = df_tmp[col2].str.lstrip("(").str.rstrip("st)").apply(pd.to_numeric)
        if self.df_hist is None:
            self.df_hist = pd.DataFrame({col2: series2, "Start": series1})
        else:
            self.df_hist = self.df_hist.T.dropna().T
            if store_deltas:
                self.df_hist[col2] = series2
                delta = series1 - self.df_hist.T.sum() + series2
                self.df_hist[col1] = delta
                if all(delta == 0):
                    return False
            else:
                self.df_hist[col1] = series1
                self.df_hist[col2] = series2

        return True 
Example 56
Project: car-park-prediction   Author: codeformuenster   File: utils_features.py    MIT License 5 votes
def engineer_features(df):
    """Feature engineering for regression."""
    # engineer simple features
    df['datetime'] = pd.to_datetime(df.timestamp)  # TODO: can be removed?
    df['cap'] = pd.to_numeric(df.free)
    df['time'] = df.datetime.dt.time  # time of the day
    df['date'] = df.datetime.dt.date  # date
    df['year'] = df.datetime.dt.year  # year
    df['month'] = df.datetime.dt.month  # month number
    df['weekday'] = df.datetime.dt.weekday  # weekday number
    df['weekend'] = (df.datetime.dt.weekday > 5).astype(int)  # weekend flag
    df['hour'] = df.datetime.dt.hour  # hour
    df['minute'] = df.datetime.dt.minute  # minute

    # lag features
    # TODO # 30 minutes ago (interpolated)  # TODO: speed up interpolation

    # engineer external features
    # TODO # 'bank holiday North-Rhine Westfalia'
    # TODO # 'bank holiday Niedersachsen'
    # TODO # 'bank holiday Netherlands'
    # TODO # avg. temperature of day
    # TODO # rain probability of day
    # TODO # X coordinate car park
    # TODO # Y coordinate car park
    # TODO # Send
    # TODO # Events from event API?
    # TODO # football match?
    # update database

    return df 
Example 57
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_empty(self):
        # see gh-16302
        s = pd.Series([], dtype=object)

        res = to_numeric(s)
        expected = pd.Series([], dtype=np.int64)

        tm.assert_series_equal(res, expected)

        # Original issue example
        res = to_numeric(s, errors='coerce', downcast='integer')
        expected = pd.Series([], dtype=np.int8)

        tm.assert_series_equal(res, expected) 
Example 58
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_series(self):
        s = pd.Series(['1', '-3.14', '7'])
        res = to_numeric(s)
        expected = pd.Series([1, -3.14, 7])
        tm.assert_series_equal(res, expected)

        s = pd.Series(['1', '-3.14', 7])
        res = to_numeric(s)
        tm.assert_series_equal(res, expected) 
Example 59
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_series_numeric(self):
        s = pd.Series([1, 3, 4, 5], index=list('ABCD'), name='XXX')
        res = to_numeric(s)
        tm.assert_series_equal(res, s)

        s = pd.Series([1., 3., 4., 5.], index=list('ABCD'), name='XXX')
        res = to_numeric(s)
        tm.assert_series_equal(res, s)

        # bool is regarded as numeric
        s = pd.Series([True, False, True, True],
                      index=list('ABCD'), name='XXX')
        res = to_numeric(s)
        tm.assert_series_equal(res, s) 
Example 60
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_error_seen_bool(self):
        s = pd.Series([True, False, 'apple'])
        msg = 'Unable to parse string "apple" at position 2'
        with pytest.raises(ValueError, match=msg):
            to_numeric(s, errors='raise')

        res = to_numeric(s, errors='ignore')
        expected = pd.Series([True, False, 'apple'])
        tm.assert_series_equal(res, expected)

        # coerces to float
        res = to_numeric(s, errors='coerce')
        expected = pd.Series([1., 0., np.nan])
        tm.assert_series_equal(res, expected) 
Example 61
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_list_numeric(self):
        s = [1, 3, 4, 5]
        res = to_numeric(s)
        tm.assert_numpy_array_equal(res, np.array(s, dtype=np.int64))

        s = [1., 3., 4., 5.]
        res = to_numeric(s)
        tm.assert_numpy_array_equal(res, np.array(s))

        # bool is regarded as numeric
        s = [True, False, True, True]
        res = to_numeric(s)
        tm.assert_numpy_array_equal(res, np.array(s)) 
Example 62
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_numeric(self):
        s = pd.Series([1, -3.14, 7], dtype='O')
        res = to_numeric(s)
        expected = pd.Series([1, -3.14, 7])
        tm.assert_series_equal(res, expected)

        s = pd.Series([1, -3.14, 7])
        res = to_numeric(s)
        tm.assert_series_equal(res, expected)

        # GH 14827
        df = pd.DataFrame(dict(
            a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), '0.1'],
            b=[1.0, 2.0, 3.0, 4.0],
        ))
        expected = pd.DataFrame(dict(
            a=[1.2, 3.14, np.inf, 0.1],
            b=[1.0, 2.0, 3.0, 4.0],
        ))

        # Test to_numeric over one column
        df_copy = df.copy()
        df_copy['a'] = df_copy['a'].apply(to_numeric)
        tm.assert_frame_equal(df_copy, expected)

        # Test to_numeric over multiple columns
        df_copy = df.copy()
        df_copy[['a', 'b']] = df_copy[['a', 'b']].apply(to_numeric)
        tm.assert_frame_equal(df_copy, expected) 
Example 63
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_all_nan(self):
        s = pd.Series(['a', 'b', 'c'])
        res = to_numeric(s, errors='coerce')
        expected = pd.Series([np.nan, np.nan, np.nan])
        tm.assert_series_equal(res, expected) 
Example 64
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_type_check(self, errors):
        # see gh-11776
        df = pd.DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
        kwargs = dict(errors=errors) if errors is not None else dict()
        error_ctx = pytest.raises(TypeError, match="1-d array")

        with error_ctx:
            to_numeric(df, **kwargs) 
Example 65
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_datetime_like(self, tz_naive_fixture):
        idx = pd.date_range("20130101", periods=3,
                            tz=tz_naive_fixture, name="xxx")
        res = pd.to_numeric(idx)
        tm.assert_index_equal(res, pd.Index(idx.asi8, name="xxx"))

        res = pd.to_numeric(pd.Series(idx, name="xxx"))
        tm.assert_series_equal(res, pd.Series(idx.asi8, name="xxx"))

        res = pd.to_numeric(idx.values)
        tm.assert_numpy_array_equal(res, idx.asi8) 
Example 66
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_timedelta(self):
        idx = pd.timedelta_range('1 days', periods=3, freq='D', name='xxx')
        res = pd.to_numeric(idx)
        tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx'))

        res = pd.to_numeric(pd.Series(idx, name='xxx'))
        tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx'))

        res = pd.to_numeric(idx.values)
        tm.assert_numpy_array_equal(res, idx.asi8) 
Example 67
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_period(self):
        idx = pd.period_range('2011-01', periods=3, freq='M', name='xxx')
        res = pd.to_numeric(idx)
        tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx'))

        # TODO: enable when we can support native PeriodDtype
        # res = pd.to_numeric(pd.Series(idx, name='xxx'))
        # tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx')) 
Example 68
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_downcast_basic(self, data):
        # see gh-13352
        invalid_downcast = "unsigned-integer"
        msg = "invalid downcasting method provided"

        with pytest.raises(ValueError, match=msg):
            pd.to_numeric(data, downcast=invalid_downcast)

        expected = np.array([1, 2, 3], dtype=np.int64)

        # Basic function tests.
        res = pd.to_numeric(data)
        tm.assert_numpy_array_equal(res, expected)

        res = pd.to_numeric(data, downcast=None)
        tm.assert_numpy_array_equal(res, expected)

        # Basic dtype support.
        smallest_uint_dtype = np.dtype(np.typecodes["UnsignedInteger"][0])

        # Support below np.float32 is rare and far between.
        float_32_char = np.dtype(np.float32).char
        smallest_float_dtype = float_32_char

        expected = np.array([1, 2, 3], dtype=smallest_uint_dtype)
        res = pd.to_numeric(data, downcast="unsigned")
        tm.assert_numpy_array_equal(res, expected)

        expected = np.array([1, 2, 3], dtype=smallest_float_dtype)
        res = pd.to_numeric(data, downcast="float")
        tm.assert_numpy_array_equal(res, expected) 
Example 69
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_signed_downcast(self, data, signed_downcast):
        # see gh-13352
        smallest_int_dtype = np.dtype(np.typecodes["Integer"][0])
        expected = np.array([1, 2, 3], dtype=smallest_int_dtype)

        res = pd.to_numeric(data, downcast=signed_downcast)
        tm.assert_numpy_array_equal(res, expected) 
Example 70
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_ignore_downcast_invalid_data(self):
        # If we can't successfully cast the given
        # data to a numeric dtype, do not bother
        # with the downcast parameter.
        data = ["foo", 2, 3]
        expected = np.array(data, dtype=object)

        res = pd.to_numeric(data, errors="ignore",
                            downcast="unsigned")
        tm.assert_numpy_array_equal(res, expected) 
Example 71
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_ignore_downcast_neg_to_unsigned(self):
        # Cannot cast to an unsigned integer
        # because we have a negative number.
        data = ["-1", 2, 3]
        expected = np.array([-1, 2, 3], dtype=np.int64)

        res = pd.to_numeric(data, downcast="unsigned")
        tm.assert_numpy_array_equal(res, expected) 
Example 72
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_ignore_downcast_cannot_convert_float(
            self, data, expected, downcast):
        # Cannot cast to an integer (signed or unsigned)
        # because we have a float number.
        res = pd.to_numeric(data, downcast=downcast)
        tm.assert_numpy_array_equal(res, expected) 
Example 73
Project: recruit   Author: Frank-qlu   File: test_numeric.py    Apache License 2.0 5 votes
def test_downcast_limits(self, dtype, downcast, min_max):
        # see gh-14404: test the limits of each downcast.
        series = pd.to_numeric(pd.Series(min_max), downcast=downcast)
        assert series.dtype == dtype 
Example 74
Project: ESA   Author: mzy2240   File: test_saw.py    MIT License 5 votes
def test_change_gen_voltage_set_points(self):
        """Set all generator voltages to 1, and ensure the command
        sticks.
        """
        # https://www.powerworld.com/WebHelp/#MainDocumentation_HTML/ChangeParametersMultipleElement_Sample_Code_Python.htm%3FTocPath%3DAutomation%2520Server%2520Add-On%2520(SimAuto)%7CAutomation%2520Server%2520Functions%7C_____9
        # Start by converting our generator data to a list of lists.
        value_list = self.gen_v_pu.values.tolist()

        # Loop over the values, set to 1.
        # noinspection PyTypeChecker
        for v in value_list:
            # Set voltage at 1.
            v[-1] = 1.0

        # Send in the command.
        # noinspection PyTypeChecker
        result = saw_14.ChangeParametersMultipleElement(
            ObjectType='gen', ParamList=self.params, ValueList=value_list)

        self.assertIsNone(result)

        # Check results.
        gen_v = saw_14.GetParametersMultipleElement(
            ObjectType='gen', ParamList=self.params)

        # Our present results should not be the same as the original.
        try:
            pd.testing.assert_frame_equal(gen_v, self.gen_v_pu)
        except AssertionError:
            # Frames are not equal. Success.
            pass
        else:
            self.fail('DataFrames are equal, but they should not be.')

        # Our current results should have all 1's for the GenRegPUVolt
        # column.
        # actual = pd.to_numeric(gen_v['GenRegPUVolt']).values
        actual = pd.to_numeric(gen_v['GenVoltSet']).values
        expected = np.array([1.0] * actual.shape[0])

        np.testing.assert_array_equal(actual, expected) 
Example 75
Project: PEAKachu   Author: tbischler   File: adaptive.py    ISC License 4 votes
def generate_combined_bed_file(self):
        # execute read conversion in parallel
        print("** Converting reads to bed format for {} libraries...".format(
            len(self._exp_lib_list)), flush=True)
        exp_lib_dict = {lib_name: self._lib_dict[lib_name] for lib_name in
                        self._exp_lib_list}
        t_start = time()
        with futures.ProcessPoolExecutor(
                max_workers=self._max_proc) as executor:
            future_to_lib_name = {
                executor.submit(lib.merge_reads):
                lib.lib_name for lib in exp_lib_dict.values()}
            for future in futures.as_completed(future_to_lib_name):
                lib_name = future_to_lib_name[future]
                try:
                    self._lib_dict[lib_name].replicon_dict = future.result()
                except Exception as exc:
                    print("{} generated an exception: {}".format(lib_name, exc),
                          flush=True)
        for replicon in sorted(self._replicon_dict):
            self._replicon_dict[replicon]["reads"] = pd.Series()
            for lib_name, lib in exp_lib_dict.items():
                self._replicon_dict[replicon]["reads"] = self._replicon_dict[
                    replicon]["reads"].add(lib.replicon_dict[replicon][
                        "reads"], fill_value=0)
            self._replicon_dict[replicon]["reads"] = self._replicon_dict[
                replicon]["reads"].reset_index(name="count")
            split_index = pd.DataFrame(list(self._replicon_dict[replicon][
                "reads"]["index"].str.split(',')), columns=[
                    "start", "end", "strand"])
            split_index.loc[:, ["start", "end"]] = split_index.loc[
                :, ["start", "end"]].apply(pd.to_numeric)
            del self._replicon_dict[replicon]["reads"]["index"]
            self._replicon_dict[replicon]["reads"] = split_index.join(
                self._replicon_dict[replicon]["reads"]).sort_values(
                    ["strand", "start", "end"], ascending=[False, True, True])
            self._replicon_dict[replicon]["reads"]["replicon"] = replicon
            self._replicon_dict[replicon]["reads"]["tag_id"] = (
                self._replicon_dict[replicon]["reads"].index + 1).map(
                'tag_{:.0f}'.format)
            self._replicon_dict[replicon]["reads"] = self._replicon_dict[
                replicon]["reads"].loc[:,
                                       ["replicon",
                                        "start",
                                        "end",
                                        "tag_id",
                                        "count",
                                        "strand"]]
            # create blockbuster input folder if it does not exist
            self._blockbuster_input_folder = "{}/blockbuster_input".format(
                    self._output_folder)
            if not exists(self._blockbuster_input_folder):
                makedirs(self._blockbuster_input_folder)
            self._replicon_dict[replicon]["reads"].to_csv(
                "{}/{}_sorted_reads_for_blockbuster.bed".format(
                    self._blockbuster_input_folder, replicon),
                sep='\t', header=False, index=False, encoding='utf-8')
        t_end = time()
        print("Reads converted to bed format in {} seconds.\n".format(
            t_end-t_start), flush=True) 
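The pd.to_numeric step above converts the split start/end strings column-wise via DataFrame.apply. A minimal sketch with invented coordinates:

import pandas as pd

# Stand-in for split_index: coordinates arrive as strings after str.split.
split_index = pd.DataFrame({"start": ["10", "200"],
                            "end": ["50", "240"],
                            "strand": ["+", "-"]})

# apply runs pd.to_numeric once per selected column.
converted = split_index[["start", "end"]].apply(pd.to_numeric)
print(converted.dtypes)  # start and end are now int64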
Example 76
Project: airqdata   Author: dr-1   File: luftdaten.py    GNU General Public License v3.0 4 votes
def get_metadata(self, **retrieval_kwargs):
        """Get sensor metadata and current measurements from cache or
        luftdaten.info API.

        Args:
            retrieval_kwargs: keyword arguments to pass to retrieve
                function

        Warns:
            UserWarning if sensor does not appear to be online
        """

        # Get and cache metadata and measurements of past five minutes
        filename = os.path.basename(self.metadata_url.rstrip("/")) + ".json"
        filepath = os.path.join(cache_dir, filename)
        parsed = retrieve(cache_file=filepath,
                          url=self.metadata_url,
                          label=("sensor {} metadata from luftdaten.info"
                                 .format(self.sensor_id)),
                          call_rate_limiter=call_rate_limiter,
                          **retrieval_kwargs)

        try:
            metadata = (parsed
                        .drop(columns=["sensordatavalues", "timestamp"])
                        .iloc[0])
        except (ValueError, AttributeError):
            warnings.warn("Sensor metadata could not be retrieved")
        else:
            metadata.name = "metadata"
            self.metadata = metadata

            # Extract metadata into corresponding properties
            self.sensor_type = metadata["sensor.sensor_type.name"]
            self.lat = float(metadata["location.latitude"])
            self.lon = float(metadata["location.longitude"])
            self.label = "at " + utils.label_coordinates(self.lat, self.lon)

            # Extract most current measurements
            current = parsed["sensordatavalues"].iloc[-1]
            current = (json_normalize(current)
                       .replace({"P1": "pm10", "P2": "pm2.5"})
                       .set_index("value_type")["value"])
            current = (pd.to_numeric(current)
                       .replace([999.9, 1999.9], pd.np.nan))
            self.current_measurements = dict(current)
            self.phenomena = list(current.index)
            self.units = {phenomenon: UNITS[phenomenon]
                          for phenomenon in UNITS
                          if phenomenon in self.phenomena} 
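The measurement cleanup at the end is the relevant pd.to_numeric step: string readings are converted and the sensors' error sentinels (999.9 and 1999.9) become NaN. A sketch with invented readings:

import numpy as np
import pandas as pd

# Stand-in for the indexed sensordatavalues: readings arrive as strings.
current = pd.Series({"pm10": "999.9", "pm2.5": "12.3"})

# Convert to floats, then map the error sentinels to NaN.
current = pd.to_numeric(current).replace([999.9, 1999.9], np.nan)
print(current)  # pm10 becomes NaN, pm2.5 stays 12.3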
Example 77
Project: airqdata   Author: dr-1   File: luftdaten.py    GNU General Public License v3.0 4 votes
def search_proximity(lat=50.848, lon=4.351, radius=8):
    """Find sensors within given radius from a location.

    Args:
        lat: latitude of the center of search, in decimal degrees
        lon: longitude of the center of search, in decimal degrees
        radius: maximum distance from center, in kilometers

    Default values are the approximate center and radius of Brussels.

    Returns:
        Dataframe of matching sensors, listing sensor types, locations
        and distances in kilometers from the search center, indexed by
        sensor ID

    Raises:
        requests.HTTPError if request failed
    """
    url = (API_ENDPOINTS["proximity search pattern"]
           .format(lat=lat, lon=lon, radius=radius))
    call_rate_limiter()
    response = requests.get(url)
    response.raise_for_status()
    sensors = json_normalize(response.json())
    if len(sensors) == 0:
        sensors = pd.DataFrame(columns=["sensor_type", "latitude", "longitude",
                                        "distance"])
        sensors.index.name = "sensor_id"
        return sensors
    sensors = (sensors[["sensor.id", "sensor.sensor_type.name",
                        "location.latitude", "location.longitude"]]
               .rename(columns={"sensor.id": "sensor_id",
                                "sensor.sensor_type.name": "sensor_type",
                                "location.latitude": "latitude",
                                "location.longitude": "longitude"}))
    for col in "latitude", "longitude":
        sensors[col] = pd.to_numeric(sensors[col], downcast="float")
    sensors.set_index("sensor_id", inplace=True)

    # Drop duplicates - sensors appear once for each measurement in past 5 mins
    sensors = sensors[~sensors.index.duplicated()]

    # Calculate distances from search center and sort by those distances
    sensors["distance"] = sensors.apply(lambda x:
                                        utils.haversine(lat, lon,
                                                        float(x["latitude"]),
                                                        float(x["longitude"])),
                                        axis=1)
    sensors.sort_values("distance", inplace=True)

    return sensors 
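downcast="float" is used here purely to save memory: the coordinates fit comfortably in float32. A one-line sketch with invented values:

import pandas as pd

# Coordinate strings parse to float64; downcast="float" shrinks the result
# to float32, halving memory at the cost of some precision.
lat = pd.Series(["50.8503", "50.8467"])
print(pd.to_numeric(lat, downcast="float").dtype)  # float32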
Example 78
Project: scicast   Author: iandriver   File: cluster.py    MIT License 4 votes
def log2_outlierfilter(df_by_cell, plot=False, already_log2=False):
    if not already_log2:
        log2_df = np.log2(df_by_cell+1)

    else:
        log2_df = df_by_cell
    top_log2 = find_top_common_genes(log2_df)
    if all(top_log2) != 0:
        log2_df2 = log2_df.apply(pd.to_numeric, errors='coerce')
        log_mean = top_log2.mean(axis=0).sort_values(ascending=False)
        log2_sorted = top_log2.reindex_axis(top_log2.mean(axis=0).sort_values(ascending=False).index, axis=1)
        xticks = []
        keep_col = []
        log2_cutoff = np.average(np.average(log2_sorted)) - 2 * np.average(np.std(log2_sorted))
        for col, m in zip(log2_sorted.columns.tolist(),log2_sorted.mean()):
            if m > log2_cutoff:
                keep_col.append(col)
                xticks.append(col+' '+str("%.2f" % m))
        excluded_cells = [x for x in log2_sorted.columns.tolist() if x not in keep_col]
        filtered_df_by_cell = df_by_cell[keep_col]
        filtered_df_by_gene = filtered_df_by_cell.transpose()
        if not already_log2:
            filtered_log2 = np.log2(filtered_df_by_cell[filtered_df_by_cell>0])
        else:
            filtered_log2 = filtered_df_by_cell[filtered_df_by_cell>0]
        if plot:
            ax = sns.boxplot(data=filtered_log2, whis=.75, notch=True)
            ax = sns.stripplot(x=filtered_log2.columns.values, y=filtered_log2.mean(axis=0), size=4, jitter=True, edgecolor="gray")
            xtickNames = plt.setp(ax, xticklabels=xticks)
            plt.setp(xtickNames, rotation=90, fontsize=9)
            plt.show()
            plt.clf()
            sns.distplot(filtered_log2.mean())
            plt.show()
        if not already_log2:
            log2_expdf_cell = np.log2(filtered_df_by_cell+1)
        else:
            log2_expdf_cell = filtered_df_by_cell
        log2_expdf_gene = log2_expdf_cell.transpose()
        return log2_expdf_cell, log2_expdf_gene
    else:
        print("no common genes found")
        return log2_df, log2_df.transpose() 
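The errors='coerce' call near the top of the function is what makes the filter robust to stray non-numeric cells. A sketch with a made-up miniature matrix:

import pandas as pd

# Stand-in for a small expression matrix with a few corrupt entries.
df = pd.DataFrame({"cell_1": ["1.5", "bad", "3.0"],
                   "cell_2": ["2.0", "4.0", "x"]})

# errors="coerce" turns unparseable cells into NaN instead of raising,
# so column means can still be computed (NaN is skipped by default).
numeric_df = df.apply(pd.to_numeric, errors="coerce")
print(numeric_df.mean(axis=0))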
Example 79
Project: recruit   Author: Frank-qlu   File: categorical.py    Apache License 2.0 4 votes
def _from_inferred_categories(cls, inferred_categories, inferred_codes,
                                  dtype, true_values=None):
        """
        Construct a Categorical from inferred values.

        For inferred categories (`dtype` is None) the categories are sorted.
        For explicit `dtype`, the `inferred_categories` are cast to the
        appropriate type.

        Parameters
        ----------
        inferred_categories : Index
        inferred_codes : Index
        dtype : CategoricalDtype or 'category'
        true_values : list, optional
            If none are provided, the default ones are
            "True", "TRUE", and "true."

        Returns
        -------
        Categorical
        """
        from pandas import Index, to_numeric, to_datetime, to_timedelta

        cats = Index(inferred_categories)
        known_categories = (isinstance(dtype, CategoricalDtype) and
                            dtype.categories is not None)

        if known_categories:
            # Convert to a specialized type with `dtype` if specified.
            if dtype.categories.is_numeric():
                cats = to_numeric(inferred_categories, errors="coerce")
            elif is_datetime64_dtype(dtype.categories):
                cats = to_datetime(inferred_categories, errors="coerce")
            elif is_timedelta64_dtype(dtype.categories):
                cats = to_timedelta(inferred_categories, errors="coerce")
            elif dtype.categories.is_boolean():
                if true_values is None:
                    true_values = ["True", "TRUE", "true"]

                cats = cats.isin(true_values)

        if known_categories:
            # Recode from observation order to dtype.categories order.
            categories = dtype.categories
            codes = _recode_for_categories(inferred_codes, cats, categories)
        elif not cats.is_monotonic_increasing:
            # Sort categories and recode for unknown categories.
            unsorted = cats.copy()
            categories = cats.sort_values()

            codes = _recode_for_categories(inferred_codes, unsorted,
                                           categories)
            dtype = CategoricalDtype(categories, ordered=False)
        else:
            dtype = CategoricalDtype(cats, ordered=False)
            codes = inferred_codes

        return cls(codes, dtype=dtype, fastpath=True) 
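A sketch of the numeric-categories branch, with an invented index of inferred category strings:

import pandas as pd

# errors="coerce" maps unparseable category labels to NaN.
inferred_categories = pd.Index(["1", "2", "bad"])
print(pd.to_numeric(inferred_categories, errors="coerce"))
# -> a float index holding [1.0, 2.0, nan]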
Example 80
Project: recruit   Author: Frank-qlu   File: cast.py    Apache License 2.0 4 votes
def soft_convert_objects(values, datetime=True, numeric=True, timedelta=True,
                         coerce=False, copy=True):
    """ if we have an object dtype, try to coerce dates and/or numbers """

    conversion_count = sum((datetime, numeric, timedelta))
    if conversion_count == 0:
        raise ValueError('At least one of datetime, numeric or timedelta must '
                         'be True.')
    elif conversion_count > 1 and coerce:
        raise ValueError("Only one of 'datetime', 'numeric' or "
                         "'timedelta' can be True when when coerce=True.")

    if isinstance(values, (list, tuple)):
        # List or scalar
        values = np.array(values, dtype=np.object_)
    elif not hasattr(values, 'dtype'):
        values = np.array([values], dtype=np.object_)
    elif not is_object_dtype(values.dtype):
        # If not object, do not attempt conversion
        values = values.copy() if copy else values
        return values

    # If 1 flag is coerce, ensure 2 others are False
    if coerce:
        # Immediate return if coerce
        if datetime:
            from pandas import to_datetime
            return to_datetime(values, errors='coerce', box=False)
        elif timedelta:
            from pandas import to_timedelta
            return to_timedelta(values, errors='coerce', box=False)
        elif numeric:
            from pandas import to_numeric
            return to_numeric(values, errors='coerce')

    # Soft conversions
    if datetime:
        # GH 20380, when datetime is beyond year 2262, hence outside
        # bound of nanosecond-resolution 64-bit integers.
        try:
            values = lib.maybe_convert_objects(values,
                                               convert_datetime=datetime)
        except OutOfBoundsDatetime:
            pass

    if timedelta and is_object_dtype(values.dtype):
        # Object check to ensure only run if previous did not convert
        values = lib.maybe_convert_objects(values, convert_timedelta=timedelta)

    if numeric and is_object_dtype(values.dtype):
        try:
            converted = lib.maybe_convert_numeric(values, set(),
                                                  coerce_numeric=True)
            # If everything is NaN, do not alter the input
            values = converted if not isna(converted).all() else values
            values = values.copy() if copy else values
        except Exception:
            pass

    return values
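A sketch of the numeric coercion branch (reached with coerce=True and only numeric enabled), with invented object values:

import numpy as np
import pandas as pd

# Object values go straight to to_numeric with errors="coerce":
# parseable entries become floats, everything else becomes NaN.
values = np.array(["1", "2.5", "apple"], dtype=object)
print(pd.to_numeric(values, errors="coerce"))  # [1.  2.5  nan]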