Python pandas.read_csv() Examples

The following code examples show how to use pandas.read_csv(). They are taken from open source Python projects.
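For orientation before the project-specific examples, here is a minimal, self-contained sketch of a typical pandas.read_csv() call using parameters that recur throughout this page (sep, header, index_col, parse_dates, dtype). The file name data.csv and the column names are placeholders for illustration only, not taken from any of the projects below.

import pandas as pd

# Minimal sketch: read a comma-separated file with a header row, use its
# first column as the row index, parse a "date" column as datetimes, and
# force an "id" column to integer dtype. "data.csv", "date" and "id" are
# placeholder names only.
df = pd.read_csv(
    "data.csv",
    sep=",",                # field delimiter (this is also the default)
    header=0,               # row number to use for the column names
    index_col=0,            # column to use as the row index
    parse_dates=["date"],   # parse this column as datetimes
    dtype={"id": "int64"},  # enforce a dtype for a specific column
)
print(df.head())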

Example 1
Project: comet-commonsense   Author: atcbosselut   File: atomic.py    Apache License 2.0
def load_data(self, path):
        if ".pickle" in path:
            print("Loading data from: {}".format(path))
            data_utils.load_existing_data_loader(self, path)

            return True

        for split in self.data:
            file_name = "v4_atomic_{}.csv".format(map_name(split))

            df = pandas.read_csv("{}/{}".format(path, file_name), index_col=0)
            df.iloc[:, :9] = df.iloc[:, :9].apply(
                lambda col: col.apply(json.loads))

            for cat in self.categories:
                attr = df[cat]
                self.data[split]["total"] += utils.zipped_flatten(zip(
                    attr.index, ["<{}>".format(cat)] * len(attr), attr.values))

        if do_take_partial_dataset(self.opt.data):
            self.data["train"]["total"] = select_partial_dataset(
                self.opt.data, self.data["train"]["total"])

        return False 
Example 2
Project: wikilinks   Author: trovdimi   File: redirects_candidates.py    MIT License
def export_data_unresolved():

    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()
    connection = db_work_view._db_connection


    df_clickstream = pn.read_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/2016_08_clickstream_unresolved.tsv', sep='\t', error_bad_lines=False)

    df_clickstream['prev']=df_clickstream['prev'].str.replace('_', ' ')
    df_clickstream['curr']=df_clickstream['curr'].str.replace('_', ' ')
    df_clickstream['curr_unresolved']=df_clickstream['curr_unresolved'].str.replace('_', ' ')


    df_redirects_candidates = pn.read_sql('select * from redirects_candidates_sample', connection)


    sample_unresoleved = pn.merge(df_redirects_candidates, df_clickstream, how='left', left_on= ['source_article_name','target_article_name'], right_on=['prev', 'curr_unresolved'])

    sample_unresoleved['n'].fillna(0, inplace=True)
    sample_unresoleved.to_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/data_unresolved.tsv', sep='\t',encoding="utf-8") 
Example 3
Project: ieml   Author: IEMLdev   File: ieml_database.py    GNU General Public License v3.0
def get_descriptors(self, files_list=None):
        if files_list is not None:
            p1 = subprocess.Popen(['echo', '-ne', r'\0'.join(files_list)],
                                  stdout=subprocess.PIPE, cwd=self.folder)
            # p1 = subprocess.Popen(['sed', '-e', r's/\n/\x0/g'], stdin=p0.stdout,
            #                       stdout=subprocess.PIPE, cwd=self.folder)
        else:
            p1 = subprocess.Popen("find -path *.desc -print0".split(),
                                  stdout=subprocess.PIPE, cwd=self.folder)

        p2 = subprocess.Popen("xargs -0 cat".split(), stdin=p1.stdout, stdout=subprocess.PIPE, cwd=self.folder)
        try:
            r = pandas.read_csv(p2.stdout, sep=' ', header=None)
            r.columns=['ieml', 'language', 'descriptor', 'value']
        except EmptyDataError:
            r = pandas.DataFrame(columns=['ieml', 'language', 'descriptor', 'value'])

        return Descriptors(r) 
Example 4
Project: DataHack2018   Author: InnovizTech   File: data_utils.py    BSD 3-Clause "New" or "Revised" License
def read_data(folder, index, data_type):
    """
    Reads data from files.
    :param str folder: folder with required video
    :param int index: frame index
    :param str data_type: data to be read. Options are: 'pointcloud', 'egomotion', 'labels'
    :return:
    """
    file_name = frame_to_filename(folder, index, data_type)
    cm_to_m_factor = 0.01
    if data_type == 'pointcloud':
        data = pd.read_csv(file_name, delimiter=',', header=None).values
        return data * cm_to_m_factor
    if data_type == 'egomotion':
        data = pd.read_csv(file_name, delimiter=',', header=None).values.ravel()
        return data
    if data_type == 'labels':
        return pd.read_csv(file_name, delimiter=',', dtype=int, header=None).values 
Example 5
Project: python-scripts-for-scanip-abaqus   Author: mngad   File: get_data_from_maks_output.py    GNU General Public License v3.0
def combine_csvs(folder):
    """Combine the csvs."""
    list_of_files = os.listdir(folder)
    list_of_files.sort()
    new_file_list = []
    for file in list_of_files:
        if 'MaskStatistics.csv' in file:
            new_file_list.append(file)

    data_dict = {}
    for file in new_file_list:
        print(folder + '\\' + file)
        data = pandas.read_csv(folder + '\\' + file, header=1,)
        # print(data.iloc[0])
        data_dict[file] = data
        header = list(data.columns.values)

    # print(header)
    return data_dict, header 
Example 6
Project: Price-prediction-and-recommendation-of-second-hand-housing-in-Shanghai   Author: tz28   File: preprocess.py    BSD 2-Clause "Simplified" License
def feature_encode():
	data = pd.read_csv('lianjia1.txt', sep=',', encoding='utf-8',
	                   names=['name', 'rooms', 'area', 'district', 'floor', 'uprice', 'sprice', 'age'])
	# encode the residential-community names as integer labels
	name_list = list(data['name'])
	arr_name = LabelEncoder().fit_transform(data['name'])
	data['name'] = arr_name
	file = open('name_encode_dict.txt', 'a', encoding='utf-8')
	for i in range(len(name_list)):
		file.write(name_list[i] + ',' + str(arr_name[i]) + '\n')
	file.close()
	# district_list = list(data['district'])
	arr_district = LabelEncoder().fit_transform(data['district'])
	data['district'] = arr_district
	# file = open('distric_encode_dict.txt','a',encoding='utf-8')
	# for i in range(len(district_list)):
	# 	file.write(district_list[i] + ',' + str(arr_district [i]) + '\n')
	# file.close()
	data.to_csv('lianjia3.txt', index=False, sep=',', header=False)
# normalize the features

# combine the features into a composite (struct-like) form, because MADlib in PostgreSQL requires this format
Example 7
Project: GreenGuard   Author: D3-AI   File: csv.py    MIT License
def load(self, target_times, window_size, signals=None, debug=False):
        if isinstance(target_times, str):
            target_times = pd.read_csv(target_times)
            target_times['cutoff_time'] = pd.to_datetime(target_times['cutoff_time'])

        if isinstance(signals, pd.DataFrame):
            signals = signals.signal_id

        window_size = pd.to_timedelta(window_size)
        timestamps = self._get_timestamps(target_times, window_size)

        readings = list()
        for turbine_id in timestamps.turbine_id.unique():
            readings.append(self._load_turbine(turbine_id, timestamps, signals))

        dask_scheduler = 'single-threaded' if debug else None
        computed = dask.compute(*readings, scheduler=dask_scheduler)
        readings = pd.concat((c for c in computed if len(c)), ignore_index=True, sort=False)

        LOGGER.info('Loaded %s turbine readings', len(readings))

        return readings 
Example 8
Project: tensorflow-DeepFM   Author: ChenglongChen   File: main.py    MIT License
def _load_data():

    dfTrain = pd.read_csv(config.TRAIN_FILE)
    dfTest = pd.read_csv(config.TEST_FILE)

    def preprocess(df):
        cols = [c for c in df.columns if c not in ["id", "target"]]
        df["missing_feat"] = np.sum((df[cols] == -1).values, axis=1)
        df["ps_car_13_x_ps_reg_03"] = df["ps_car_13"] * df["ps_reg_03"]
        return df

    dfTrain = preprocess(dfTrain)
    dfTest = preprocess(dfTest)

    cols = [c for c in dfTrain.columns if c not in ["id", "target"]]
    cols = [c for c in cols if (not c in config.IGNORE_COLS)]

    X_train = dfTrain[cols].values
    y_train = dfTrain["target"].values
    X_test = dfTest[cols].values
    ids_test = dfTest["id"].values
    cat_features_indices = [i for i,c in enumerate(cols) if c in config.CATEGORICAL_COLS]

    return dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices 
Example 9
Project: tensorflow-DeepFM   Author: ChenglongChen   File: DataReader.py    MIT License
def gen_feat_dict(self):
        if self.dfTrain is None:
            dfTrain = pd.read_csv(self.trainfile)
        else:
            dfTrain = self.dfTrain
        if self.dfTest is None:
            dfTest = pd.read_csv(self.testfile)
        else:
            dfTest = self.dfTest
        df = pd.concat([dfTrain, dfTest])
        self.feat_dict = {}
        tc = 0
        for col in df.columns:
            if col in self.ignore_cols:
                continue
            if col in self.numeric_cols:
                # map to a single index
                self.feat_dict[col] = tc
                tc += 1
            else:
                us = df[col].unique()
                self.feat_dict[col] = dict(zip(us, range(tc, len(us)+tc)))
                tc += len(us)
        self.feat_dim = tc 
Example 10
Project: Graphlib   Author: HamletWantToCode   File: Alchemy_dataset.py    MIT License
def process(self):
        if self.mode == 'dev':
            self.target = pd.read_csv(self.raw_paths[1], index_col=0,
                    usecols=['gdb_idx',] + ['property_%d' % x for x in range(12)])
            self.target = self.target[['property_%d' % x for x in range(12)]]
        
        _dataset = DatasetFolder(root=self.raw_paths[0], loader=self.sdf_graph_reader,
                                extensions='sdf', transform=self.pre_transform)

        data_list = []
        for alchemy_data, _ in _dataset:
            data_list.append(alchemy_data)

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])
        if self.mode == 'dev':
            torch.save(_dataset.targets, self.processed_paths[1]) 
Example 11
Project: pymapd-examples   Author: omnisci   File: OKR_techsup_docker_load.py    Apache License 2.0
def main():
    # connect to MapD
    dfcreds = get_credentials(omnisci_keyfile)
    connection = wake_and_connect_to_mapd(dfcreds['write_key_name'], dfcreds['write_key_secret'], mapdhost, mapddbname)
    # loop through tables
    if connection == 'RETRY':
        print('could not wake OmniSci; exiting')
    else:
        for csv_file, renamed_cols, int8_cols, int32_cols, ts_cols, tf, str_cols, bool_cols in file_names:
            #get the contents of the file and turn them into a dataframe
            print ("reading from file " + csv_file)
            dfnew = pd.read_csv(csv_file, index_col=False)
            #rename and recast columns
            parse_cols(dfnew, renamed_cols, int8_cols, int32_cols, ts_cols, tf, str_cols, bool_cols)
            #append the contents of this file to the existing table
            print ("appending file contents to table " + table_name)
            connection.load_table(table_name, dfnew, preserve_index=False, create=False) #load the new table into OmniSci
        # disconnect MapD
        disconnect_mapd(connection) 
Example 12
Project: pymapd-examples   Author: omnisci   File: OKR_techsup_higherlogic_load.py    Apache License 2.0
def load_new_table_mapd(connection, table_name, csv_file, dtcol, tfrmt, drop_cols, mapd_host, mapd_user):
    df = pd.read_csv(csv_file)
    df.reset_index(drop=True, inplace=True)
    format_date_cols(df, dtcol, tfrmt) #force the column containing datetime values to be recast from strings to datetimes
    # drop the big columns of text we don't need for metrics
    df = df.drop(columns=drop_cols)
    # drop the old table
    drop_table_mapd(connection, table_name) #drop the old table
    print ("creating table " + table_name)
    print ('with columns')
    print (list(df.columns.values))
    connection.create_table(table_name, df, preserve_index=False) #create the new table
    print ("loading table " + table_name)
    connection.load_table(table_name, df) #load the new table into OmniSci

# MAIN 
Example 13
Project: pymapd-examples   Author: omnisci   File: OKR_oss_git_load.py    Apache License 2.0
def load_new_table_mapd(connection, table_name, csv_file, ts_cols, ts_format, ts_units, int_cols):
    df = pd.read_csv(csv_file)
    format_int_col(df, int_cols)
    if ts_format == 'None':
        format_date_cols(df, ts_cols, un=ts_units)
    elif ts_units == 'None':
        format_date_cols(df, ts_cols, tf=ts_format)

    if df.empty:
        print ("no results to upload")
    else:
        df.reset_index(drop=True, inplace=True)
        print ("loading table " + table_name)
        connection.load_table(table_name, df, preserve_index=False, create=False) #append the data into the existing table in OmniSci

# Load CSV to dataframe and then copy to table using pymapd 
Example 14
Project: pymapd-examples   Author: omnisci   File: OKR_oss_git_load.py    Apache License 2.0
def append_new_table_mapd(connection, table_name, csv_file, ts_cols, ts_format, ts_units, int_cols):
    df = pd.read_csv(csv_file)
    format_int_col(df, int_cols)
    if ts_format == 'None':
        format_date_cols(df, ts_cols, un=ts_units)
    elif ts_units == 'None':
        format_date_cols(df, ts_cols, tf=ts_format)

    if df.empty:
        print ("no results to upload")
    else:
        #load the new rows
        df.reset_index(drop=True, inplace=True)
        print ("loading table " + table_name)
        connection.load_table(table_name, df, preserve_index=False, create=False) #append the data into the existing table in OmniSci

        #dedupe all of the rows
        command = "select CAST(view_timestamp as DATE) view_timestamp, MAX(view_unique) as view_unique, repo from oss_git_views where repo = 'mapd-core' group by view_timestamp, repo order by view_timestamp ASC"
        df_deduped = pd.read_sql_query(command, connection)
        print ("reloading table " + table_name)
        print (df_deduped)
        drop_table_mapd(connection, table_name)
        connection.load_table(table_name, df_deduped, preserve_index=False, create=True) #recreate the table with the deduplicated rows in OmniSci

# MAIN 
Example 15
Project: NiBetaSeries   Author: HBClab   File: nistats.py    MIT License
def _lsa_events_converter(events_file):
    """Make a model where each trial has its own regressor using least squares
    all (LSA)

    Parameters
    ----------
    events_file : str
        File that contains all events from the bold run

    Yields
    ------
    events : DataFrame
        A DataFrame in which each trial has its own trial_type
    """

    import pandas as pd
    events = pd.read_csv(events_file, sep='\t')
    events['original_trial_type'] = events['trial_type']
    for cond, cond_df in events.groupby('trial_type'):
        cond_idx = cond_df.index
        for i_trial, trial_idx in enumerate(cond_idx):
            trial_name = '{0}_{1:04d}'.format(cond, i_trial+1)
            events.loc[trial_idx, 'trial_type'] = trial_name
    return events 
Example 16
Project: keras-anomaly-detection   Author: chen0040   File: bidirectional_lstm_autoencoder.py    MIT License
def main():
    data_dir_path = './data'
    model_dir_path = './models'
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.as_matrix()
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = BidirectionalLstmAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path detect anomaly
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold) 
Example 17
Project: keras-anomaly-detection   Author: chen0040   File: cnn_lstm_autoencoder.py    MIT License
def main():
    data_dir_path = './data'
    model_dir_path = './models'
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.as_matrix()
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = CnnLstmAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path detect anomaly
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold) 
Example 18
Project: keras-anomaly-detection   Author: chen0040   File: lstm_autoencoder.py    MIT License
def main():
    data_dir_path = './data'
    model_dir_path = './models'
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.as_matrix()
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path detect anomaly
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold) 
Example 19
Project: synthetic-data-tutorial   Author: theodi   File: generate.py    MIT License
def generate_treatments() -> list:
    """ Generate and return sample of treatments patients received. 

    Reads data/treatment_codes_nhs_ae.csv file 

    NHS treatment codes:
    https://www.datadictionary.nhs.uk/web_site_content/supporting_information/clinical_coding/accident_and_emergency_treatment_tables.asp?shownav=1
    """

    treatment_codes_df = pd.read_csv(filepaths.nhs_ae_treatment_codes)
    treatments = treatment_codes_df['Treatment'].tolist()

    # likelihood of each of the treatments - make some more common
    weights = random.choices(range(1, 100), k=len(treatments))
    treatment_codes = random.choices(
        treatments, k=num_of_rows, weights=weights)
    return treatment_codes 
Example 20
Project: synthetic-data-tutorial   Author: theodi   File: deidentify.py    MIT License
def convert_postcodes_to_lsoa(hospital_ae_df: pd.DataFrame) -> pd.DataFrame:
    """Adds corresponding Lower layer super output area for each row
    depending on their postcode. Uses London postcodes dataset from
    https://www.doogal.co.uk/PostcodeDownloads.php 

    Keyword arguments:
    hospital_ae_df -- Hospitals A&E records dataframe
    """
    postcodes_df = pd.read_csv(filepaths.postcodes_london)
    hospital_ae_df = pd.merge(
        hospital_ae_df, 
        postcodes_df[['Postcode', 'Lower layer super output area']], 
        on='Postcode'
    )
    hospital_ae_df = hospital_ae_df.drop('Postcode', 1)
    return hospital_ae_df 
Example 21
Project: Wide-Residual-Nets-for-SETI   Author: sgrvinod   File: average_scores.py    Apache License 2.0
def average_scores(input_folder, output_path):
    """
    Averages scores of several CSV files generated by test.py

    Args:
        input_folder (path): folder with models' scores' CSVs in it.
        output_path (path): path of output CSV file with averaged scores, ready for submission to SETI scoreboards
    """
    csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]
    model_scores = []
    for i, csv in enumerate(csv_files):
        df = pd.read_csv(os.path.join(input_folder, csv), index_col=0, header=None)
        if i == 0:
            index = df.index
        else:
            assert index.equals(df.index), "Indices of one or more files do not match!"
        model_scores.append(df)
    print "Read %d files. Averaging..." % len(model_scores)

    concat_scores = pd.concat(model_scores)
    averaged_scores = concat_scores.groupby(level=0).mean()
    assert averaged_scores.shape[0] == len(list(index)), "Something went wrong when concatenating/averaging!"
    averaged_scores = averaged_scores.reindex(index)

    averaged_scores.to_csv(output_path, header=False, index=True)
    print "Averaged scores saved to %s" % output_path 
Example 22
Project: chainer-openai-transformer-lm   Author: soskek   File: analysis.py    MIT License
def rocstories(data_dir, pred_path, log_path):
    preds = pd.read_csv(pred_path, delimiter='\t')[
        'prediction'].values.tolist()
    _, _, _, labels = _rocstories(os.path.join(
        data_dir, 'cloze_test_test__spring2016 - cloze_test_ALL_test.csv'))
    test_accuracy = accuracy_score(labels, preds) * 100.
    logs = [json.loads(line) for line in open(log_path)][1:]
    best_validation_index = np.argmax([log['va_acc'] for log in logs])
    valid_accuracy = logs[best_validation_index]['va_acc']
    print('ROCStories Valid Accuracy: %.2f' % (valid_accuracy))
    print('ROCStories Test Accuracy:  %.2f' % (test_accuracy)) 
Example 23
Project: chainer-openai-transformer-lm   Author: soskek   File: analysis.py    MIT License
def sst(data_dir, pred_path, log_path):
    preds = pd.read_csv(pred_path, delimiter='\t')[
        'prediction'].values.tolist()
    test_url = 'https://raw.githubusercontent.com/harvardnlp/sent-conv-torch/master/data/stsa.binary.test'
    path = chainer.dataset.cached_download(test_url)
    teX, teY = _sst(path)
    labels = teY
    test_accuracy = accuracy_score(labels, preds) * 100.
    logs = [json.loads(line) for line in open(log_path)][1:]
    best_validation_index = np.argmax([log['va_acc'] for log in logs])
    valid_accuracy = logs[best_validation_index]['va_acc']
    print('SST Valid Accuracy: %.2f' % (valid_accuracy))
    print('SST Test Accuracy:  %.2f' % (test_accuracy)) 
Example 24
Project: sfcc   Author: kv-kunalvyas   File: auxiliary.py    MIT License
def initialise():
    data_frame = pd.read_csv('../data/train.csv', header=0)
    return data_frame 
Example 25
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def read_concentration_data_from_csv(filename):
    """ Reads csv with concentration data
    
        Args:
            filename (str): name of input file
         
        Returns:
            DataFrame

    """
    data = pd.read_csv(filename,index_col=0)
    data.columns = [n for n in data.columns]
    return data 
Example 26
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def read_spectral_data_from_csv(filename, instrument = False, negatives_to_zero = False):
    """ Reads csv with spectral data
    
        Args:
            filename (str): name of input file
            instrument (bool): if data is direct from instrument
            negatives_to_zero (bool): if data contains negatives and baseline shift is not
                                        done then this forces negative values to zero.

        Returns:
            DataFrame

    """

    data = pd.read_csv(filename,index_col=0)
    if instrument:
        #this means we probably have a date/timestamp on the columns
        data = pd.read_csv(filename,index_col=0, parse_dates = True)
        data = data.T
        for n in data.index:
            h,m,s = n.split(':')
            sec = (float(h)*60+float(m))*60+float(s)
            data.rename(index={n:sec}, inplace=True)
        data.index = [float(n) for n in data.index]
    else:
        data.columns = [float(n) for n in data.columns]

    #If we have negative values then this makes them equal to zero
    if negatives_to_zero:
        for t in (data.index):
            for l in data.columns:
                if data.loc[t,l] < 0:
                    data.loc[t,l] = 0.0

    return data 
Example 27
Project: kipet   Author: salvadorgarciamunoz   File: data_tools.py    GNU General Public License v3.0
def read_absorption_data_from_csv(filename):
    """ Reads csv with spectral data
    
        Args:
            filename (str): name of input file
          
        Returns:
            DataFrame

    """
    data = pd.read_csv(filename,index_col=0)
    return data 
Example 28
Project: wikilinks   Author: trovdimi   File: weighted_pagerank.py    MIT License
def correlations_ground_truth():
    print 'ground truth'
    #load network
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering.xml.gz")
    #read counts with zeros
    article_counts  =  pd.read_csv(TMP+'article_counts.tsv', sep='\t')
    cor = {}
    for damping in [0.8,0.9]:
        page_rank = pagerank(wikipedia, damping=damping)
        wikipedia.vertex_properties['page_rank_'+str(damping)] = page_rank
        page_rank_values = list()
        counts = list()
        correlations_values = {}
        for index, row in article_counts.iterrows():
            counts.append(float(row['counts']))
            page_rank_values.append(page_rank[wikipedia.vertex(int(row['target_article_id']))])
        print 'pearson'
        p = pearsonr(page_rank_values, counts)
        print p
        correlations_values['pearson']=p
        print 'spearmanr'
        s = spearmanr(page_rank_values, counts)
        print s
        correlations_values['spearmanr']=s
        print 'kendalltau'
        k = kendalltau(page_rank_values, counts)
        print k
        correlations_values['kendalltau']=k
        cor['page_rank_'+str(damping)]=correlations_values
    write_pickle(HOME+'output/correlations/correlations_pagerank.obj', cor) 
Example 29
Project: ieml   Author: IEMLdev   File: ieml_database.py    GNU General Public License v3.0
def get_structure(self):
        p1 = subprocess.Popen("find -path *.ieml -print0".split(), stdout=subprocess.PIPE, cwd=self.folder)
        p2 = subprocess.Popen("xargs -0 cat".split(), stdin=p1.stdout, stdout=subprocess.PIPE, cwd=self.folder)
        r = pandas.read_csv(p2.stdout, sep=' ', header=None)
        r.columns = ['ieml', 'key', 'value']
        return Structure(r) 
Example 30
Project: scrapy-soccerway   Author: tvl   File: parse_log.py    Apache License 2.0
def venues():
    l = []
    df = pd.read_csv('404.log', delimiter=' ', header=None)

    for row in df[7]:
        l.append(int((row.split('/')[6][1:])))
    print('Nonexistent ({}) venues:'.format(len(l)))
    print(str(l)) 
Example 31
Project: GeoLibs-Dator   Author: GeographicaGS   File: csv.py    MIT License
def extract(self, query=None):
        df = pd.read_csv(self.options['data']['location'])
        return df 
Example 32
Project: DataComp   Author: Cojabi   File: prop_matching.py    Apache License 2.0
def create_prop_matched_dfs(matched, datacol):
    """
    Creates a new DataCollection containing only the matched cases. A table listing the matched data points is required.

    :param matched: Either a path to a csv which contains the matched data or a dataframe containing the matches. \
    2 Columns: one lists the subjects of df1 and the other lists the matching sample from df2.
    :param datacol: DataCollection object
    :return: DataCollection object containing only the matched samples
    """

    # load matches and drop non matched ids
    if type(matched) == str:
        matched = pd.read_csv(matched)

    matched.dropna(inplace=True)

    # create dfs containing only matched data. Try to get the order of dataframes and matching columns correct
    # check if the matched labels are in the first or second position of the DataCollection

    if set(datacol[0].index.intersection(matched.iloc[:, 0])) == set(matched.iloc[:, 0]):
        prop_dfs = [datacol[0].loc[datacol[0].index.intersection(matched.iloc[:, 0])],
                    datacol[1].loc[datacol[1].index.intersection(matched.iloc[:, 1])]]

    elif set(datacol[1].index.intersection(matched.iloc[:, 0])) == set(matched.iloc[:, 0]):
        prop_dfs = [datacol[1].loc[datacol[1].index.intersection(matched.iloc[:, 0])],
                    datacol[0].loc[datacol[0].index.intersection(matched.iloc[:, 1])]]
    else:
        raise ValueError("Matched labels do not fit to either of the dataframes in the datacollection!")

    return DataCollection(prop_dfs, datacol.df_names, datacol.categorical_feats) 
Example 33
Project: DataComp   Author: Cojabi   File: prop_matching.py    Apache License 2.0
def create_prop_matched_dfs_longitudinal(matches_path, datacol, pat_col):
    """
    Creates a new Collection containing only the matched cases. A table listing the matched data points is required.

    :param matches_path: Path to a csv which contains the matched data. 2 Columns: one lists the subjects of df1 and \
    the other lists the matching sample from df2.
    :param datacol: DataCollection object
    :return: DataCollection object containing only the matched samples
    """

    # load matches and drop non matched ids
    matched = pd.read_csv(matches_path)
    matched.dropna(inplace=True)

    majority_inds = matched.iloc[:, 1]
    minority_inds = matched.iloc[:, 0]

    # create dfs containing only matched data. Try to get the order of dataframes and matching columns correct
    try:
        majority_pats = datacol[1].loc[majority_inds, pat_col]
        majority_df = datacol[1][datacol[1][pat_col].isin(majority_pats)]

        minority_pats = datacol[0].loc[minority_inds, pat_col]
        minority_df = datacol[0][datacol[0][pat_col].isin(minority_pats)]

        prop_dfs = [minority_df, majority_df]


    except KeyError:

        majority_pats = datacol[0].loc[majority_inds, pat_col]
        majority_df = datacol[0][datacol[0][pat_col].isin(majority_pats)]

        minority_pats = datacol[1].loc[minority_inds, pat_col]
        minority_df = datacol[1][datacol[1][pat_col].isin(minority_pats)]

        prop_dfs = [majority_df, minority_df]

    return DataCollection(prop_dfs, datacol.df_names, datacol.categorical_feats) 
Example 34
Project: Kaggle-Statoil-Challenge   Author: adodd202   File: utils.py    MIT License
def MinMaxBestBaseStacking(input_folder, best_base, output_path):
    sub_base = pd.read_csv(best_base)
    all_files = os.listdir(input_folder)

    # Read and concatenate submissions
    outs = [pd.read_csv(os.path.join(input_folder, f), index_col=0) for f in all_files]
    concat_sub = pd.concat(outs, axis=1)
    cols = list(map(lambda x: "is_iceberg_" + str(x), range(len(concat_sub.columns))))
    concat_sub.columns = cols
    concat_sub.reset_index(inplace=True)

    # get the data fields ready for stacking
    concat_sub['is_iceberg_max'] = concat_sub.iloc[:, 1:6].max(axis=1)
    concat_sub['is_iceberg_min'] = concat_sub.iloc[:, 1:6].min(axis=1)
    concat_sub['is_iceberg_mean'] = concat_sub.iloc[:, 1:6].mean(axis=1)
    concat_sub['is_iceberg_median'] = concat_sub.iloc[:, 1:6].median(axis=1)

    # set up cutoff threshold for lower and upper bounds, easy to twist
    cutoff_lo = 0.67
    cutoff_hi = 0.33

    concat_sub['is_iceberg_base'] = sub_base['is_iceberg']
    concat_sub['is_iceberg'] = np.where(np.all(concat_sub.iloc[:, 1:6] > cutoff_lo, axis=1),
                                        concat_sub['is_iceberg_max'],
                                        np.where(np.all(concat_sub.iloc[:, 1:6] < cutoff_hi, axis=1),
                                                 concat_sub['is_iceberg_min'],
                                                 concat_sub['is_iceberg_base']))
    concat_sub[['id', 'is_iceberg']].to_csv(output_path,
                                            index=False, float_format='%.12f') 
Example 35
Project: Kaggle-Statoil-Challenge   Author: adodd202   File: utils.py    MIT License
def ensembleVer2(input_folder, output_path):
    print('Out:' + output_path)
    csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]
    model_scores = []
    for i, csv in enumerate(csv_files):
        df = pd.read_csv(os.path.join(input_folder, csv), index_col=0)
        if i == 0:
            index = df.index
        else:
            assert index.equals(df.index), "Indices of one or more files do not match!"
        model_scores.append(df)
    print("Read %d files. Averaging..." % len(model_scores))

    # print(model_scores)
    concat_scores = pd.concat(model_scores)
    print(concat_scores.head())
    concat_scores['is_iceberg'] = concat_scores['is_iceberg'].astype(np.float32)

    averaged_scores = concat_scores.groupby(level=0).mean()
    assert averaged_scores.shape[0] == len(list(index)), "Something went wrong when concatenating/averaging!"
    averaged_scores = averaged_scores.reindex(index)

    stacked_1 = pd.read_csv('statoil-submission-template.csv')  # for the header
    print(stacked_1.shape)
    sub = pd.DataFrame()
    sub['id'] = stacked_1['id']

    sub['is_iceberg'] = np.exp(np.mean(
        [
            averaged_scores['is_iceberg'].apply(lambda x: np.log(x))
        ], axis=0))

    print(sub.shape)
    sub.to_csv(output_path, index=False, float_format='%.9f')
    print("Averaged scores saved to %s" % output_path)


# Convert the np arrays into the correct dimension and type
# Note that BCEloss requires Float in X as well as in y 
Example 36
Project: HushUtility   Author: Deathhush   File: StockDataFetcher.py    MIT License
def load_daily_df_by_year(self, symbol, year=2015):
        file_name = self.fetch_daily_bar_by_year(symbol, year)        
        return pd.read_csv(file_name) 
Example 37
Project: HushUtility   Author: Deathhush   File: StockEvaluation.py    MIT License
def generate_daily_df(symbol, year='', file_path = 'D:\\Testland\\stock_data\\'):
    year_part = ''  # default so the path stays valid when no year is given
    if (year != ''):
        year_part = '\\%s\\' % year
    result_df = pandas.read_csv('%s%s%s.csv' % (file_path, year_part, symbol), header=None, names=[u'date', u'time', u'open', u'high', u'low', u'close', u'volume',u'amount'])    
    result_df = result_df.groupby('date').agg({'high':np.max, 'low':np.min, 'volume':np.sum, 'amount':np.sum, 'open':'first', 'close':'last'})        
    result_df['ma5']=pd.rolling_mean(result_df['close'] , 5)
    result_df['ma10']=pd.rolling_mean(result_df['close'] , 10)
    analyzed_path = '%s\\analyzed\\%s.%s.daily.analyzed.csv' % (file_path, symbol, year)
    result_df.to_csv(analyzed_path)
    result_df = pandas.read_csv(analyzed_path)      
    return result_df 
Example 38
Project: GreenGuard   Author: D3-AI   File: data.py    MIT License
def _load_readings_file(turbine_file):
    LOGGER.info('Loading file %s', turbine_file)
    data = pd.read_csv(turbine_file)
    data.columns = data.columns.str.lower()
    data.rename(columns={'signal': 'signal_id'}, inplace=True)

    if 'unnamed: 0' in data.columns:
        # Someone forgot to drop the index before
        # storing the DataFrame as a CSV
        del data['unnamed: 0']

    LOGGER.info('Loaded %s readings from file %s', len(data), turbine_file)

    return data 
Example 39
Project: GreenGuard   Author: D3-AI   File: csv.py    MIT License
def __load_readings_file(self, turbine_file, timestamps, signals):
        LOGGER.debug('Loading file %s', turbine_file)
        data = pd.read_csv(turbine_file, low_memory=False)
        data.columns = data.columns.str.lower()
        data = data.rename(columns={'signal': 'signal_id'})

        if 'unnamed: 0' in data.columns:
            # Someone forgot to drop the index before
            # storing the DataFrame as a CSV
            del data['unnamed: 0']

        LOGGER.debug('Loaded %s readings from file %s', len(data), turbine_file)

        return data 
Example 40
Project: tensorflow-DeepFM   Author: ChenglongChen   File: DataReader.py    MIT License
def parse(self, infile=None, df=None, has_label=False):
        assert not ((infile is None) and (df is None)), "infile or df at least one is set"
        assert not ((infile is not None) and (df is not None)), "only one can be set"
        if infile is None:
            dfi = df.copy()
        else:
            dfi = pd.read_csv(infile)
        if has_label:
            y = dfi["target"].values.tolist()
            dfi.drop(["id", "target"], axis=1, inplace=True)
        else:
            ids = dfi["id"].values.tolist()
            dfi.drop(["id"], axis=1, inplace=True)
        # dfi for feature index
        # dfv for feature value which can be either binary (1/0) or float (e.g., 10.24)
        dfv = dfi.copy()
        for col in dfi.columns:
            if col in self.feat_dict.ignore_cols:
                dfi.drop(col, axis=1, inplace=True)
                dfv.drop(col, axis=1, inplace=True)
                continue
            if col in self.feat_dict.numeric_cols:
                dfi[col] = self.feat_dict.feat_dict[col]
            else:
                dfi[col] = dfi[col].map(self.feat_dict.feat_dict[col])
                dfv[col] = 1.

        # list of list of feature indices of each sample in the dataset
        Xi = dfi.values.tolist()
        # list of list of feature values of each sample in the dataset
        Xv = dfv.values.tolist()
        if has_label:
            return Xi, Xv, y
        else:
            return Xi, Xv, ids 
Example 41
Project: models   Author: kipoi   File: test_basset_model.py    MIT License
def test_ref_seq():
    # Get pure fasta predictions
    model_dir = model_root + "./"
    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")
    dataloader_arguments = {
        "fasta_file": "/nfs/research1/stegle/users/rkreuzhu/opt/manuscript_code/data/raw/dataloader_files/shared/hg19.fa",
        "intervals_file": "test_files/test_encode_roadmap.bed"
    }
    # predict using results
    preds = model.pipeline.predict(dataloader_arguments)
    #
    res_orig = pd.read_csv("/nfs/research1/stegle/users/rkreuzhu/deeplearning/Basset/data/test_encode_roadmap_short_pred.txt", "\t", header=None)
    assert np.isclose(preds, res_orig.values, atol=1e-3).all() 
Example 42
Project: models   Author: kipoi   File: dataloader.py    MIT License
def data(anno_file, fasta_file, meth_file, target_file=None):
    """
    Args:
        anno_file: file path; gtf file containing genes and exons
        fasta_file: file path; Genome sequence
        target_file: file path; path to the targets in the csv format
        meth_file: file path; methylation information
    """
    bt, genes = loadgene(anno_file)
    SEQ_WIDTH = 800
       
    if target_file is not None:
        targets = pd.read_csv(target_file, header=None, index_col=0)
        targets = targets.loc[genes]
        targets = targets.values
    else:
        targets = None
    # Run the fasta extractor
    seq = get_hot_seq_meth(meth_file, bt, fasta_file)
    # import pdb
    # pdb.set_trace()
    return {
            "inputs": seq,
            "targets": targets,
            "metadata": {
                "gene_id": np.array(genes),
             }
           } 
Example 43
Project: models   Author: kipoi   File: dataloader.py    MIT License
def _read_target(self, target_file):
        dt = pd.read_csv(target_file, index_col=0)
        event_names = dt[self.label_col].tolist()
        self._index = event_names
        dt = dt.drop(self.label_col, axis=1)
        tissues = dt.columns
        dt = dt.as_matrix()
        dt = np.stack((dt, 1 - dt), axis=2)  # might bug if only one tissue
        self.target = dt
        self.tissues = tissues 
Example 44
Project: programsynthesishunting   Author: flexgp   File: save_plots.py    GNU General Public License v3.0
def save_plot_from_file(filename, stat_name):
    """
    Saves a plot of a given stat from the stats file.

    :param filename: a full specified path to a .csv stats file.
    :param stat_name: the stat of interest for plotting.
    :return: Nothing.
    """

    # Read in the data
    data = pd.read_csv(filename, sep="\t")
    try:
        stat = list(data[stat_name])
    except KeyError:
        s = "utilities.stats.save_plots.save_plot_from_file\n" \
            "Error: stat %s does not exist" % stat_name
        raise Exception(s)

        # Set up the figure.
    fig = plt.figure()
    ax1 = fig.add_subplot(1, 1, 1)

    # Plot the data.
    ax1.plot(stat)

    # Plot title.
    plt.title(stat_name)

    # Get save path
    save_path = pathsep.join(filename.split(pathsep)[:-1])

    # Save plot and close.
    plt.savefig(path.join(save_path, (stat_name + '.pdf')))
    plt.close() 
Example 45
Project: oslodatascience-rl   Author: Froskekongen   File: common.py    MIT License
def readLogPong(filename, **kwargs):
    '''
    Get pong log file (LogPong) as a dataframe.
    filename: file name of log file.
    **kwargs: arguments passed to pd.read_csv.
    '''
    df = pd.read_csv(filename, sep=';', names=('time', 'episode', 'rewardSum'), **kwargs)
    df.time = pd.to_datetime(df.time)
    return df 
Example 46
Project: spleeter   Author: deezer   File: tensor.py    MIT License
def dataset_from_csv(csv_path, **kwargs):
    """ Load dataset from a CSV file using Pandas. kwargs if any are
    forwarded to the `pandas.read_csv` function.

    :param csv_path: Path of the CSV file to load dataset from.
    :returns: Loaded dataset.
    """
    df = pd.read_csv(csv_path, **kwargs)
    dataset = (
        tf.data.Dataset.from_tensor_slices(
            {key: df[key].values for key in df})
    )
    return dataset 
Example 47
Project: pymapd-examples   Author: omnisci   File: OKR_techsup_discourse_load.py    Apache License 2.0
def load_new_table_mapd(connection, table_name, csv_file, dtcol, tfrmt, mapd_host, mapd_user):
    df = pd.read_csv(csv_file)
    df.reset_index(drop=True, inplace=True)
    format_date_cols(df, dtcol, tfrmt) #force the column containing datetime values to be recast from strings to datetimes
    drop_table_mapd(connection, table_name) #drop the old table
    connection.create_table(table_name, df, preserve_index=False) #create the new table
    print ("loading table " + table_name)
    connection.load_table(table_name, df) #load the new table into OmniSci

# MAIN 
Example 48
Project: pymapd-examples   Author: omnisci   File: OKR_techsup_ga.py    Apache License 2.0
def parse_data(csvfile, dtcols, intcols, floatcols, strcols, renamings, tfrmt):
    df = pd.read_csv(csvfile)
    df.reset_index(drop=True, inplace=True)
    format_date_cols(df, dtcols, tfrmt) #force the column containing datetime values to be recast from strings to timestamps
    format_int_col(df, intcols)
    format_str_col(df, strcols)
    format_flt_col(df, floatcols)
    df = parse_geo_data(df)
    df = df.drop('geo_city_code', 1)
    df = df.drop('city_parent_code', 1)
    df = df.drop('city_target_type', 1)
    df = df.drop('city_status', 1)
    return df 
Example 49
Project: deep-learning-note   Author: wdxtub   File: 1_linear_basic.py    MIT License
def readData(path):
    """
    Read the data with pandas
    csv file
    x, y
    10, 7
    11, 8
    ...
    """
    data = pd.read_csv(path)
    return data 
Example 50
Project: deep-learning-note   Author: wdxtub   File: 2_linear_statsmodels.py    MIT License
def readData(path):
    """
    Read the data with pandas
    """
    data = pd.read_csv(path)
    return data