Python pandas.read_csv() Examples

The following are 30 code examples showing how to use pandas.read_csv(), extracted from open source projects. Each example lists the project, author, source file, and license it was taken from.

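Before the examples, here is a minimal sketch of the call itself. The file name and column names are made up; every keyword shown is a standard pandas.read_csv() parameter that recurs throughout the examples below.

import pandas as pd

df = pd.read_csv(
    "data.csv",          # hypothetical path; may also be a URL or file-like object
    sep=",",             # field delimiter (comma is the default)
    header=0,            # row number to use for the column names
    index_col=0,         # column to use as the row index
    parse_dates=True,    # parse the index column as dates
    na_values="n/a",     # additional string(s) to treat as NaN
    dtype={"id": str},   # force dtypes for specific columns
)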

Example 1
Project: svviz   Author: svviz   File: runTests.py    License: MIT License
def saveTimingInfo(summary):
    timingsPath = "test_timings.csv"
    # check_output returns bytes under Python 3; decode for a clean string
    git_version = subprocess.check_output(["git", "describe"]).strip().decode()

    new_row = summary[["timing"]].T
    new_row["date"] = [datetime.datetime.now()]
    new_row["version"] = git_version

    if os.path.exists(timingsPath):
        timings = pandas.read_csv(timingsPath, index_col=0)
        timings = pandas.concat([timings, new_row])
    else:
        timings = new_row

    timings.to_csv(timingsPath)

    print(timings) 
Example 2
Project: Financial-NLP   Author: Coldog2333   File: NLP.py    License: Apache License 2.0
def load_label(self):
        """
        load label dictionary into the object.
        the format must be like this:
            积极,消极
            p1,n1
            p2,n2
            ...,...
            pk,nk
        """
        table=pd.read_csv(self.label_file)
        pos=table.loc[:,'积极'].tolist()
        neg=table.loc[:,'消极'].tolist()
        self.Label_index=pos+neg
        self.Label_dict=dict(zip(pos,[1]*len(pos)))
        self.Label_dict.update(dict(zip(neg,[-1]*len(neg)))) 
Example 3
Project: comet-commonsense   Author: atcbosselut   File: atomic.py    License: Apache License 2.0
def load_data(self, path):
        if ".pickle" in path:
            print("Loading data from: {}".format(path))
            data_utils.load_existing_data_loader(self, path)

            return True

        for split in self.data:
            file_name = "v4_atomic_{}.csv".format(map_name(split))

            df = pandas.read_csv("{}/{}".format(path, file_name), index_col=0)
            df.iloc[:, :9] = df.iloc[:, :9].apply(
                lambda col: col.apply(json.loads))

            for cat in self.categories:
                attr = df[cat]
                self.data[split]["total"] += utils.zipped_flatten(zip(
                    attr.index, ["<{}>".format(cat)] * len(attr), attr.values))

        if do_take_partial_dataset(self.opt.data):
            self.data["train"]["total"] = select_partial_dataset(
                self.opt.data, self.data["train"]["total"])

        return False 
Example 4
Project: tensorflow-DeepFM   Author: ChenglongChen   File: main.py    License: MIT License
def _load_data():

    dfTrain = pd.read_csv(config.TRAIN_FILE)
    dfTest = pd.read_csv(config.TEST_FILE)

    def preprocess(df):
        cols = [c for c in df.columns if c not in ["id", "target"]]
        df["missing_feat"] = np.sum((df[cols] == -1).values, axis=1)
        df["ps_car_13_x_ps_reg_03"] = df["ps_car_13"] * df["ps_reg_03"]
        return df

    dfTrain = preprocess(dfTrain)
    dfTest = preprocess(dfTest)

    cols = [c for c in dfTrain.columns if c not in ["id", "target"]]
    cols = [c for c in cols if c not in config.IGNORE_COLS]

    X_train = dfTrain[cols].values
    y_train = dfTrain["target"].values
    X_test = dfTest[cols].values
    ids_test = dfTest["id"].values
    cat_features_indices = [i for i,c in enumerate(cols) if c in config.CATEGORICAL_COLS]

    return dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices 
Example 5
Project: tensorflow-DeepFM   Author: ChenglongChen   File: DataReader.py    License: MIT License
def gen_feat_dict(self):
        if self.dfTrain is None:
            dfTrain = pd.read_csv(self.trainfile)
        else:
            dfTrain = self.dfTrain
        if self.dfTest is None:
            dfTest = pd.read_csv(self.testfile)
        else:
            dfTest = self.dfTest
        df = pd.concat([dfTrain, dfTest])
        self.feat_dict = {}
        tc = 0
        for col in df.columns:
            if col in self.ignore_cols:
                continue
            if col in self.numeric_cols:
                # map to a single index
                self.feat_dict[col] = tc
                tc += 1
            else:
                us = df[col].unique()
                self.feat_dict[col] = dict(zip(us, range(tc, len(us)+tc)))
                tc += len(us)
        self.feat_dim = tc 
Example 6
Project: NiBetaSeries   Author: HBClab   File: nistats.py    License: MIT License
def _lsa_events_converter(events_file):
    """Make a model where each trial has its own regressor using least squares
    all (LSA)

    Parameters
    ----------
    events_file : str
        File that contains all events from the bold run

    Returns
    -------
    events : DataFrame
        A DataFrame in which each trial has its own trial_type
    """

    import pandas as pd
    events = pd.read_csv(events_file, sep='\t')
    events['original_trial_type'] = events['trial_type']
    for cond, cond_df in events.groupby('trial_type'):
        cond_idx = cond_df.index
        for i_trial, trial_idx in enumerate(cond_idx):
            trial_name = '{0}_{1:04d}'.format(cond, i_trial+1)
            events.loc[trial_idx, 'trial_type'] = trial_name
    return events 
Example 7
Project: NiBetaSeries   Author: HBClab   File: test_nistats.py    License: MIT License
def test_select_confounds_error(confounds_file, tmp_path):
    import pandas as pd
    import numpy as np

    confounds_df = pd.read_csv(str(confounds_file), sep='\t', na_values='n/a')

    # use .loc to avoid pandas' chained-assignment pitfall
    confounds_df.loc[0, 'white_matter'] = np.nan

    conf_file = tmp_path / "confounds.tsv"

    confounds_df.to_csv(str(conf_file), index=False, sep='\t', na_rep='n/a')

    with pytest.raises(ValueError) as val_err:
        _select_confounds(str(conf_file), ['white_matter', 'csf'])

    assert "The selected confounds contain nans" in str(val_err.value) 
Example 8
Project: NiBetaSeries   Author: HBClab   File: test_nistats.py    License: MIT License
def test_select_confounds(confounds_file, selected_confounds, nan_confounds,
                          expanded_confounds):
    import pandas as pd
    import numpy as np

    confounds_df = pd.read_csv(str(confounds_file), sep='\t', na_values='n/a')

    res_df = _select_confounds(str(confounds_file), selected_confounds)

    # check if the correct columns are selected
    assert set(expanded_confounds) == set(res_df.columns)

    # check if nans are being imputed when expected
    if nan_confounds:
        for nan_c in nan_confounds:
            vals = confounds_df[nan_c].values
            expected_result = np.nanmean(vals[vals != 0])
            assert res_df[nan_c][0] == expected_result 
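Examples 7 and 8 both rely on read_csv mapping the literal string 'n/a' (the BIDS sentinel) to NaN via na_values, and on to_csv writing NaN back out via na_rep. A minimal, self-contained sketch of that round trip (the column names are made up):

import io
import pandas as pd

tsv = "white_matter\tcsf\nn/a\t0.1\n0.5\t0.2\n"
df = pd.read_csv(io.StringIO(tsv), sep='\t', na_values='n/a')
print(df['white_matter'].isna().tolist())  # [True, False]

# write the NaN back out as the 'n/a' sentinel
df.to_csv('confounds.tsv', sep='\t', index=False, na_rep='n/a')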
Example 9
Project: neuropythy   Author: noahbenson   File: core.py    License: GNU Affero General Public License v3.0
def load_LUT(filename, to='data'):
    '''
    load_LUT(filename) loads the given filename as a FreeSurfer LUT.

    The optional argument to (default: 'data') specifies how the LUT should be interpreted; it can
    be any of the following:
      * 'data' specifies that a dataframe should be returned.
      * None specifies that the raw LUT dataframe should be returned as-is.
    '''
    from neuropythy.util import to_dataframe
    import pandas
    # start by slurping in the text:
    dat = pandas.read_csv(filename, comment='#', sep=r'\s+', names=['id', 'name', 'r','g','b','a'])
    # FreeSurfer LUTs store 0 in the alpha column, so invert it into a usable opacity
    # (not sure why freesurfer does this)
    dat['a'] = 255 - dat['a']
    if pimms.is_str(to): to = to.lower()
    if   to is None: return dat
    elif to == 'data':
        df = to_dataframe({'id': dat['id'].values, 'name': dat['name'].values})
        df['color'] = dat.apply(lambda r: [r[k]/255.0 for k in ['r','g','b','a']], axis=1)
        df.set_index('id', inplace=True)
        return df
    else: raise ValueError('Unknown to argument: %s' % to)
# A function to load in default data from the freesurfer home: e.g., the default LUTs 
Example 10
Project: keras-anomaly-detection   Author: chen0040   File: bidirectional_lstm_autoencoder.py    License: MIT License
def main():
    data_dir_path = './data'
    model_dir_path = './models'
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values  # DataFrame.as_matrix() was removed in pandas 1.0
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = BidirectionalLstmAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold) 
Example 11
Project: keras-anomaly-detection   Author: chen0040   File: cnn_lstm_autoencoder.py    License: MIT License
def main():
    data_dir_path = './data'
    model_dir_path = './models'
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values  # DataFrame.as_matrix() was removed in pandas 1.0
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = CnnLstmAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold) 
Example 12
Project: keras-anomaly-detection   Author: chen0040   File: lstm_autoencoder.py    License: MIT License
def main():
    data_dir_path = './data'
    model_dir_path = './models'
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values  # DataFrame.as_matrix() was removed in pandas 1.0
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold) 
Example 13
Project: DOTA_models   Author: ringringyi   File: data_utils.py    License: Apache License 2.0
def read_names(names_path):
    """read data from downloaded file. See SmallNames.txt for example format
    or go to https://www.kaggle.com/kaggle/us-baby-names for full lists

    Args:
        names_path: path to the csv file similar to the example type
    Returns:
        Dataset: a namedtuple of two elements: deduped names and their associated
            counts. The names contain only the 26 lowercase letters.
    """
    names_data = pd.read_csv(names_path)
    names_data.Name = names_data.Name.str.lower()

    name_data = names_data.groupby(by=["Name"])["Count"].sum()
    name_counts = np.array(name_data.tolist())
    names_deduped = np.array(name_data.index.tolist())

    Dataset = collections.namedtuple('Dataset', ['Name', 'Count'])
    return Dataset(names_deduped, name_counts) 
Example 14
Project: deep-summarization   Author: harpribot   File: data2tensor.py    License: MIT License
def generate_vocabulary(self, review_summary_file):
        """

        :param review_summary_file:
        :return:
        """
        self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values

        for review,summary in self.rev_sum_pair:
            rev_lst = wordpunct_tokenize(review)
            sum_lst = wordpunct_tokenize(summary)
            self.__add_list_to_dict(rev_lst)
            self.__add_list_to_dict(sum_lst)

        # Now store the "" empty string as the last word of the vocabulary
        self.map[""] = len(self.map)
        self.revmap[self.map[""]] = ""  # key on the index just assigned; len(self.map) has since grown by one
Example 15
Project: bioservices   Author: cokelaer   File: mappers.py    License: GNU General Public License v3.0
def get_data_from_biodbnet(self, df_hgnc):
        """keys are unique Gene names
        
        input is made of the df based on HGNC data web services

        uniprot accession are duplicated sometimes. If som this is actually the
        iprimary accession entry and all secondary ones.


        e.g. ,
        
        ABHD11 >>>> Q8N723;Q8NFV2;Q8NFV3;Q6PJU0;Q8NFV4;H7BYM8;Q8N722;Q9HBS8 ABHDB_HUMAN Alpha/beta hydrolase domain-containing protein 11
        correspond actually to the primary one : Q8NFV4

        """
        b = biodbnet.BioDBNet()
        res2 = b.db2db("Gene Symbol", ["HGNC ID", "UniProt Accession", "UniProt Entry Name", "UniProt Protein Name", "KEGG Gene ID", "Ensembl Gene ID"],
                list(res.keys())[0:2000])  # res is defined elsewhere in the class; keys() must be listed before slicing in Python 3

        import pandas as pd
        from io import StringIO  # Python 3 home of StringIO
        c = pd.read_csv(StringIO(res2), delimiter="\t", index_col="Gene Symbol")
        return c
Example 16
Project: lirpg   Author: Hwhitetooth   File: monitor.py    License: MIT License
def test_monitor():
    env = gym.make("CartPole-v1")
    env.seed(0)
    mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
    menv = Monitor(env, mon_file)
    menv.reset()
    for _ in range(1000):
        _, _, done, _ = menv.step(0)
        if done:
            menv.reset()

    f = open(mon_file, 'rt')

    firstline = f.readline()
    assert firstline.startswith('#')
    metadata = json.loads(firstline[1:])
    assert metadata['env_id'] == "CartPole-v1"
    assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'},  "Incorrect keys in monitor metadata"

    last_logline = pandas.read_csv(f, index_col=None)
    assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
    f.close()
    os.remove(mon_file) 
Example 17
Project: kuzushiji-recognition   Author: see--   File: data.py    License: MIT License
def load_gt(fn, label_key='labels', has_height_width=True):
  labels = pd.read_csv(fn, dtype={'image_id': str, label_key: str})
  labels = labels.fillna('')
  labels_ = defaultdict(list)
  all_labels = set()
  for img_id, label_str in zip(labels['image_id'], labels[label_key]):
    img_labels = label_str.split(' ')
    if has_height_width:
      l, x, y, h, w = img_labels[::5], img_labels[1::5], img_labels[2::5], \
          img_labels[3::5], img_labels[4::5]
      for ll, xx, yy, hh, ww in zip(l, x, y, h, w):
        labels_[img_id].append((int(xx), int(yy), int(hh), int(ww), ll))
        all_labels.add(ll)
    else:
      l, x, y = img_labels[::3], img_labels[1::3], img_labels[2::3]
      for ll, xx, yy in zip(l, x, y):
        labels_[img_id].append((int(xx), int(yy), ll))
        all_labels.add(ll)

  label_to_int = {v: k for k, v in enumerate(sorted(list(all_labels)))}
  labels = dict(labels_)
  return labels, label_to_int 
Example 18
Project: pywr   Author: pywr   File: test_control_curves.py    License: GNU General Public License v3.0
def test_control_curve_interpolated_json(use_parameters):
    # this is a little hack-y, as the parameters don't provide access to their
    # data once they've been initialised
    if use_parameters:
        model = load_model("reservoir_with_cc_param_values.json")
    else:
        model = load_model("reservoir_with_cc.json")
    reservoir1 = model.nodes["reservoir1"]
    model.setup()
    path = os.path.join(os.path.dirname(__file__), "models", "control_curve.csv")
    control_curve = pd.read_csv(path)["Control Curve"].values
    values = [-8, -6, -4]

    @assert_rec(model, reservoir1.cost)
    def expected_cost(timestep, si):
        # calculate expected cost manually and compare to parameter output
        volume_factor = reservoir1._current_pc[si.global_id]
        cc = control_curve[timestep.index]
        return np.interp(volume_factor, [0.0, cc, 1.0], values[::-1])
    model.run() 
Example 19
Project: pywr   Author: pywr   File: test_control_curves.py    License: GNU General Public License v3.0
def test_circular_control_curve_interpolated_json():
    # this is a little hack-y, as the parameters don't provide access to their
    # data once they've been initialised
    model = load_model("reservoir_with_circular_cc.json")
    reservoir1 = model.nodes["reservoir1"]
    model.setup()
    path = os.path.join(os.path.dirname(__file__), "models", "control_curve.csv")
    control_curve = pd.read_csv(path)["Control Curve"].values
    values = [-8, -6, -4]

    @assert_rec(model, reservoir1.cost)
    def expected_cost(timestep, si):
        # calculate expected cost manually and compare to parameter output
        volume_factor = reservoir1._current_pc[si.global_id]
        cc = control_curve[timestep.index]
        return np.interp(volume_factor, [0.0, cc, 1.0], values[::-1])
    model.run() 
Example 20
Project: pywr   Author: pywr   File: test_recorders.py    License: GNU General Public License v3.0
def test_seasonal_fdc_recorder(self):
        """
        Test the FlowDurationCurveRecorder
        """
        model = load_model("timeseries4.json")

        df = pandas.read_csv(os.path.join(os.path.dirname(__file__), 'models', 'timeseries3.csv'),
                             parse_dates=True, dayfirst=True, index_col=0)

        percentiles = np.linspace(20., 100., 5)

        summer_flows = df.loc[pandas.Timestamp("2014-06-01"):pandas.Timestamp("2014-08-31"), :]
        summer_fdc = np.percentile(summer_flows, percentiles, axis=0)

        model.run()

        rec = model.recorders["seasonal_fdc"]
        assert_allclose(rec.fdc, summer_fdc) 
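Note the dayfirst=True flag in Example 20: a date like 01/06/2014 would otherwise be parsed month-first as January 6. A short sketch with made-up inline data:

import io
import pandas as pd

csv = "Date,Flow\n01/06/2014,10.0\n02/06/2014,12.5\n"
df = pd.read_csv(io.StringIO(csv), parse_dates=True, dayfirst=True, index_col=0)
print(df.index[0])  # 2014-06-01, i.e. the first of June

# with a parsed DatetimeIndex, label slicing by Timestamp works as in the test
summer = df.loc[pd.Timestamp("2014-06-01"):pd.Timestamp("2014-08-31"), :]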
Example 21
Project: xalpha   Author: refraction-ray   File: info.py    License: MIT License
def _fetch_csv(self, path):
        """
        fetch the information and price table from path+code.csv. Not recommended for manual use;
        just set the fetch flag to True when initializing the object.

        :param path:  string of folder path
        """
        try:
            content = pd.read_csv(path + self.code + ".csv")
            pricetable = content.iloc[1:]
            datel = list(pd.to_datetime(pricetable.date))
            self.price = pricetable[["netvalue", "totvalue", "comment"]]
            self.price["date"] = datel
            saveinfo = json.loads(content.iloc[0].date)
            if not isinstance(saveinfo, dict):
                raise FundTypeError("This csv doesn't look like one from fundinfo")
            self.segment = saveinfo["segment"]
            self.feeinfo = saveinfo["feeinfo"]
            self.name = saveinfo["name"]
            self.rate = saveinfo["rate"]
        except FileNotFoundError as e:
            # print('no saved copy of fund %s' % self.code)
            raise e 
Example 22
Project: xalpha   Author: refraction-ray   File: info.py    License: MIT License
def _fetch_csv(self, path):
        """
        fetch the information and price table from path+code.csv. Not recommended for manual use;
        just set the fetch flag to True when initializing the object.

        :param path:  string of folder path
        """
        try:
            pricetable = pd.read_csv(path + self.code + ".csv")
            datel = list(pd.to_datetime(pricetable.date))
            self.price = pricetable[["netvalue", "totvalue", "comment"]]
            self.price["date"] = datel

        except FileNotFoundError as e:
            # print('no saved copy of %s' % self.code)
            raise e 
Example 23
Project: xalpha   Author: refraction-ray   File: info.py    License: MIT License
def _fetch_csv(self, path):
        """
        fetch the information and price table from path+code.csv. Not recommended for manual use;
        just set the fetch flag to True when initializing the object.

        :param path:  string of folder path
        """
        try:
            content = pd.read_csv(path + self.code + ".csv")
            pricetable = content.iloc[1:]
            datel = list(pd.to_datetime(pricetable.date))
            self.price = pricetable[["netvalue", "totvalue", "comment"]]
            self.price["date"] = datel
            self.name = content.iloc[0].comment
        except FileNotFoundError as e:
            # print('no saved copy of %s' % self.code)
            raise e 
Example 24
Project: xalpha   Author: refraction-ray   File: universal.py    License: MIT License
def fetch_backend(key):
    prefix = ioconf.get("prefix", "")
    key = prefix + key
    backend = ioconf.get("backend")
    path = ioconf.get("path")
    if backend == "csv":
        key = key + ".csv"

    try:
        if backend == "csv":
            df0 = pd.read_csv(os.path.join(path, key))
        elif backend == "sql":
            df0 = pd.read_sql(key, path)
        else:
            raise ValueError("no %s option for backend" % backend)

        return df0

    except (FileNotFoundError, exc.ProgrammingError, KeyError):
        return None 
Example 25
Project: HardRLWithYoutube   Author: MaxSobolMark   File: monitor.py    License: MIT License
def test_monitor():
    env = gym.make("CartPole-v1")
    env.seed(0)
    mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
    menv = Monitor(env, mon_file)
    menv.reset()
    for _ in range(1000):
        _, _, done, _ = menv.step(0)
        if done:
            menv.reset()

    f = open(mon_file, 'rt')

    firstline = f.readline()
    assert firstline.startswith('#')
    metadata = json.loads(firstline[1:])
    assert metadata['env_id'] == "CartPole-v1"
    assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'},  "Incorrect keys in monitor metadata"

    last_logline = pandas.read_csv(f, index_col=None)
    assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
    f.close()
    os.remove(mon_file) 
Example 26
Project: Financial-NLP   Author: Coldog2333   File: Senti.py    License: Apache License 2.0
def renew_word_score(self, Min_count, Size):
        common_words=pd.read_csv(self.word_score_filename)
        self.word_classifier = dict(zip(common_words.iloc[:,0], common_words.iloc[:,1])) 
Example 27
Project: Financial-NLP   Author: Coldog2333   File: Senti.py    License: Apache License 2.0
def score_of_common_words(self, Min_count, Size, saveflag=1, savefilename=''):
        """
        calculate scores of common words, and save the results as you like.
        
        p.s. please make sure you have set savefilename.
        """
        self.set_model_parameters(Min_count, Size)
        table = pd.read_csv(self.nlp_vector_filename)
        # table = table.abs()  # take the absolute value of the cosine similarity directly
        result = [''] * table.shape[0]
        score = [0] * table.shape[0]
        label_num = (table.shape[1] - 1) // 2  # integer division, so it can be used as a slice bound
        for i in range(table.shape[0]):
            w2v = table.iloc[i, 1:label_num+1]
            wn = table.iloc[i, label_num+1:label_num*2+1]  # was len(label_num), a TypeError on an int
            result[i] = self.get_topn_topm(w2v, wn, n=9, m=3)  # this is an Index of strings
            for reword in result[i]:
                score[i] += table.loc[i, reword+'w2v'] * self.nlp.Label_dict[reword]
            score[i] /= len(result[i])
        
        if saveflag:
            try:
                fp = open(self.valid_word_filename, 'r', encoding='utf-8')
                txtlist = fp.readlines()
            except UnicodeDecodeError:
                fp = open(self.valid_word_filename, 'r', encoding='gbk')
                txtlist = fp.readlines()
            valid_words = []
            for t in txtlist:
                t = t.split('\n')[0]
                valid_words.append(t)
            fp.close()
            rawdata = pd.DataFrame(score, valid_words)
            rawdata.to_csv(savefilename, encoding='gbk')  # 'gkb' was a typo for the gbk codec
Example 28
Project: MPContribs   Author: materialsproject   File: utils.py    License: MIT License
def read_csv(body, is_data_section=True, **kwargs):
    """run pandas.read_csv on (sub)section body"""
    csv_comment_char = "#"
    import pandas

    body = body.strip()
    if not body:
        return None
    from mpcontribs.io.core.components.tdata import Table

    if is_data_section:
        cur_line = 1
        while 1:
            body_split = body.split("\n", cur_line)
            first_line = body_split[cur_line - 1].strip()
            cur_line += 1
            if first_line and not first_line.startswith(csv_comment_char):
                break
        sep = kwargs.get("sep", ",")
        options = {"sep": sep, "header": 0}
        header = [col.strip() for col in first_line.split(sep)]
        body = "\n".join([sep.join(header), body_split[1]])
        if first_line.startswith("level_"):
            options.update({"index_col": [0, 1]})
        ncols = len(header)
    else:
        options = {"sep": ":", "header": None, "index_col": 0}
        ncols = 2
    options.update(**kwargs)
    converters = dict((col, strip_converter) for col in range(ncols))
    return Table(
        pandas.read_csv(
            StringIO(body),
            comment=csv_comment_char,
            skipinitialspace=True,
            squeeze=True,
            converters=converters,
            encoding="utf8",
            **options
        ).dropna(how="all")
    ) 
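The pattern worth noting in Example 28 is that read_csv accepts any file-like object, so a string body can be parsed by wrapping it in StringIO. A stripped-down sketch with made-up data:

from io import StringIO
import pandas as pd

body = """# a comment line that read_csv will skip
a, b, c
1, 2, 3
4, 5, 6"""
df = pd.read_csv(StringIO(body), comment='#', skipinitialspace=True)
print(df.columns.tolist())  # ['a', 'b', 'c']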
Example 29
Project: MPContribs   Author: materialsproject   File: translate_vicalloy.py    License: MIT License
def get_translate(workdir=None):

    filename = os.path.join(workdir, "Vicalloy/Fe-Co-V_140922a_META_DATA.csv")
    compdata_f = pd.read_csv(filename, sep="\t").dropna()
    print(compdata_f.head())
    x = compdata_f["Xnom (mm)"].values
    y = compdata_f["Ynom (mm)"].values
    Co_concentration = compdata_f["Co (at%)"].values
    Fe_concentration = compdata_f["Fe (at%)"].values
    V_concentration = compdata_f["V (at%)"].values
    method = "linear"
    # method = 'nearest'

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        Co_concI = interp2d(x, y, Co_concentration, kind=method)
        Fe_concI = interp2d(x, y, Fe_concentration, kind=method)
        V_concI = interp2d(x, y, V_concentration, kind=method)

    def translate(key):
        manip_z, manip_y = key
        sample_y = manip_z - 69.5
        sample_x = (manip_y + 8) * 2
        Co = Co_concI(sample_x, sample_y)[0] / 100.0
        Fe = Fe_concI(sample_x, sample_y)[0] / 100.0
        V = V_concI(sample_x, sample_y)[0] / 100.0
        return ("Fe{:.2f}Co{:.2f}V{:.2f}".format(Fe, Co, V), sample_x, sample_y)

    return translate 
Example 30
Project: MPContribs   Author: materialsproject   File: mspScan.py    License: MIT License
def read_scan(filename):
    scandata_f = pd.read_csv(filename, sep="\t", skiprows=12)
    if "Counter 1" not in scandata_f.columns:
        scandata_f = pd.read_csv(filename, sep="\t", skiprows=10)
    if "Counter 1" not in scandata_f.columns:
        raise ValueError("Check input file (tried skipping 12 or 10 lines)!")
    filedissection = dissect_filename(filename)
    for file_attr in filedissection.keys():
        scandata_f[file_attr] = filedissection[file_attr]
    return scandata_f