Python pandas.read_json() Examples

The following are code examples showing how to use pandas.read_json(). They are extracted from open source Python projects.
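
Before the project examples, here is a minimal, self-contained sketch of the basics (not taken from any project below). read_json() accepts a path, URL, file-like object, or JSON string, and returns a DataFrame by default, or a Series when typ='series' is passed:

import pandas as pd

# Parse a JSON array of records into a DataFrame with columns 'a' and 'b'.
df = pd.read_json('[{"a": 1, "b": 2}, {"a": 3, "b": 4}]')

# Parse a JSON object into a Series keyed by 'x' and 'y'.
s = pd.read_json('{"x": 1, "y": 2}', typ='series')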

Example 1
Project: sequana   Author: sequana   File: fastq_stats.py
def get_stats(self):
        import pandas as pd
        filenames, mode = self._get_files("*.json")
        if mode == "pe":
            df1 = pd.read_json(filenames[0])
            df2 = pd.read_json(filenames[1])
            df  = pd.concat([df1, df2])
            # Should have been sorted !
            df.index = ['R1', 'R2']
        else:
            df = pd.read_json(filenames[0])
            df.index = ['R1']
        df = df[["A", "C", "G", "T", "N", "n_reads", "mean quality", "GC content",
                "average read length", "total bases"]]
        for this in "ACGTN":
            df[this] /= df["total bases"] 
            df[this] *= 100
        return df 
Example 2
Project: Medium-crawler-with-data-analyzer   Author: lifei96   File: medium_topstories_analyzer.py
def read_stories_by_tags():
    tags = list()
    current_date = START_DATE
    while current_date <= END_DATE:
        file_in = open("./TopStories/%s.json" % current_date.isoformat(), 'r')
        raw_data = json.loads(str(file_in.read()))
        file_in.close()
        for raw_story in raw_data['stories']:
            for raw_tag in raw_story['tags']:
                tag = dict()
                tag['top_date'] = current_date.isoformat()
                tag['story_id'] = raw_story['story_id']
                tag['author'] = raw_story['author']
                tag['published_date'] = raw_story['published_date']
                tag['recommends'] = raw_story['recommends']
                tag['responses'] = raw_story['responses']
                tag['name'] = raw_tag['name']
                tag['post_count'] = raw_tag['postCount']
                tag['follower_count'] = raw_tag['metadata']['followerCount']
                tags.append(tag)
        print(current_date.isoformat())
        current_date = current_date + datetime.timedelta(days=1)
    return pd.read_json(json.dumps(tags)) 
Example 3
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_pandas.py
def test_frame_from_json_bad_data(self):
        self.assertRaises(ValueError, read_json, StringIO('{"key":b:a:d}'))

        # too few indices
        json = StringIO('{"columns":["A","B"],'
                        '"index":["2","3"],'
                        '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
        self.assertRaises(ValueError, read_json, json,
                          orient="split")

        # too many columns
        json = StringIO('{"columns":["A","B","C"],'
                        '"index":["1","2","3"],'
                        '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
        self.assertRaises(AssertionError, read_json, json,
                          orient="split")

        # bad key
        json = StringIO('{"badkey":["A","B"],'
                        '"index":["2","3"],'
                        '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
        with tm.assertRaisesRegexp(ValueError, r"unexpected key\(s\): badkey"):
            read_json(json, orient="split") 
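
The tests above feed read_json() malformed 'split' payloads; for contrast, a minimal well-formed one looks like this (a sketch, not part of the test suite):

import pandas as pd

good = '{"columns":["A","B"],"index":["1","2"],"data":[[1.0,"a"],[2.0,"b"]]}'
df = pd.read_json(good, orient='split')
# Two rows with columns A and B; with the default convert_axes=True,
# the string index ["1", "2"] is coerced to the integers [1, 2].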
Example 4
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_pandas.py
def test_v12_compat(self):
        df = DataFrame(
            [[1.56808523, 0.65727391, 1.81021139, -0.17251653],
             [-0.2550111, -0.08072427, -0.03202878, -0.17581665],
             [1.51493992, 0.11805825, 1.629455, -1.31506612],
             [-0.02765498, 0.44679743, 0.33192641, -0.27885413],
             [0.05951614, -2.69652057, 1.28163262, 0.34703478]],
            columns=['A', 'B', 'C', 'D'],
            index=pd.date_range('2000-01-03', '2000-01-07'))
        df['date'] = pd.Timestamp('19920106 18:21:32.12')
        df.ix[3, 'date'] = pd.Timestamp('20130101')
        df['modified'] = df['date']
        df.ix[1, 'modified'] = pd.NaT

        v12_json = os.path.join(self.dirpath, 'tsframe_v012.json')
        df_unser = pd.read_json(v12_json)
        assert_frame_equal(df, df_unser)

        df_iso = df.drop(['modified'], axis=1)
        v12_iso_json = os.path.join(self.dirpath, 'tsframe_iso_v012.json')
        df_unser_iso = pd.read_json(v12_iso_json)
        assert_frame_equal(df_iso, df_unser_iso) 
Example 5
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_pandas.py
def test_date_format_frame(self):
        df = self.tsframe.copy()

        def test_w_date(date, date_unit=None):
            df['date'] = Timestamp(date)
            df.ix[1, 'date'] = pd.NaT
            df.ix[5, 'date'] = pd.NaT
            if date_unit:
                json = df.to_json(date_format='iso', date_unit=date_unit)
            else:
                json = df.to_json(date_format='iso')
            result = read_json(json)
            assert_frame_equal(result, df)

        test_w_date('20130101 20:43:42.123')
        test_w_date('20130101 20:43:42', date_unit='s')
        test_w_date('20130101 20:43:42.123', date_unit='ms')
        test_w_date('20130101 20:43:42.123456', date_unit='us')
        test_w_date('20130101 20:43:42.123456789', date_unit='ns')

        self.assertRaises(ValueError, df.to_json, date_format='iso',
                          date_unit='foo') 
Example 6
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_pandas.py
def test_date_format_series(self):
        def test_w_date(date, date_unit=None):
            ts = Series(Timestamp(date), index=self.ts.index)
            ts.ix[1] = pd.NaT
            ts.ix[5] = pd.NaT
            if date_unit:
                json = ts.to_json(date_format='iso', date_unit=date_unit)
            else:
                json = ts.to_json(date_format='iso')
            result = read_json(json, typ='series')
            assert_series_equal(result, ts)

        test_w_date('20130101 20:43:42.123')
        test_w_date('20130101 20:43:42', date_unit='s')
        test_w_date('20130101 20:43:42.123', date_unit='ms')
        test_w_date('20130101 20:43:42.123456', date_unit='us')
        test_w_date('20130101 20:43:42.123456789', date_unit='ns')

        ts = Series(Timestamp('20130101 20:43:42.123'), index=self.ts.index)
        self.assertRaises(ValueError, ts.to_json, date_format='iso',
                          date_unit='foo') 
Example 7
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_pandas.py
def test_date_unit(self):
        df = self.tsframe.copy()
        df['date'] = Timestamp('20130101 20:43:42')
        df.ix[1, 'date'] = Timestamp('19710101 20:43:42')
        df.ix[2, 'date'] = Timestamp('21460101 20:43:42')
        df.ix[4, 'date'] = pd.NaT

        for unit in ('s', 'ms', 'us', 'ns'):
            json = df.to_json(date_format='epoch', date_unit=unit)

            # force date unit
            result = read_json(json, date_unit=unit)
            assert_frame_equal(result, df)

            # detect date unit
            result = read_json(json, date_unit=None)
            assert_frame_equal(result, df) 
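
The forced-versus-detected round trip exercised above can be reproduced outside the test harness; a minimal sketch:

import pandas as pd

df = pd.DataFrame({'date': pd.to_datetime(['2013-01-01', '2013-01-02'])})
json_s = df.to_json(date_format='epoch', date_unit='s')
forced = pd.read_json(json_s, date_unit='s')     # force the stored unit
detected = pd.read_json(json_s, date_unit=None)  # let pandas infer it
assert forced.equals(detected)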
Example 8
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_pandas.py
def test_weird_nested_json(self):
        # this used to core dump the parser
        s = r'''{
        "status": "success",
        "data": {
        "posts": [
            {
            "id": 1,
            "title": "A blog post",
            "body": "Some useful content"
            },
            {
            "id": 2,
            "title": "Another blog post",
            "body": "More content"
            }
           ]
          }
        }'''

        read_json(s) 
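
read_json() copes with the nested payload above, but it yields a frame keyed by the top-level fields rather than one row per post. When the records of interest live under a nested path, a normalizer is the more usual tool; a minimal sketch, assuming a pandas version that ships json_normalize (pandas.io.json.json_normalize in the versions these examples target, pandas.json_normalize in newer releases):

import json
from pandas.io.json import json_normalize

raw = '{"status": "success", "data": {"posts": [{"id": 1, "title": "A blog post"}]}}'
payload = json.loads(raw)
posts = json_normalize(payload['data']['posts'])  # one row per post: columns 'id' and 'title'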
Example 9
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_pandas.py
def test_misc_example(self):

        # parsing unordered input fails
        result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]', numpy=True)
        expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])

        error_msg = """DataFrame\\.index are different

DataFrame\\.index values are different \\(100\\.0 %\\)
\\[left\\]:  Index\\(\\[u?'a', u?'b'\\], dtype='object'\\)
\\[right\\]: RangeIndex\\(start=0, stop=2, step=1\\)"""
        with tm.assertRaisesRegexp(AssertionError, error_msg):
            assert_frame_equal(result, expected, check_index_type=False)

        result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]')
        expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        assert_frame_equal(result, expected) 
Example 10
Project: autoxd   Author: nessessary   File: backend_live_runner.py
def _getUserStrategy(self, downloadStrategyInterval=60):
        """Download the user strategies, re-fetching at most once per interval.
        downloadStrategyInterval: int default=60 (seconds)
        return: df"""
        k = "SignForWebUser_preLoadTime"
        preLoadTime = myredis.get_obj(k)
        if preLoadTime is None:
            preLoadTime = datetime.datetime(2015, 10, 19, 15, 33, 47, 53000)  # fallback timestamp used on first run
        # re-download only when the cached copy is older than the interval
        if (agl.curTime() - preLoadTime).total_seconds() > downloadStrategyInterval:
            url = "http://stocksign.sinaapp.com/query?cmd=query_strategy"
            result = Http().get(url)
            df_source = pd.read_json(result)
            df_source.columns = ['id', 'user_id', 'title', 'code']
            preLoadTime = agl.curTime()
            myredis.set_obj(k, preLoadTime)
            myredis.set_obj('mysource', df_source)
        else:
            df_source = myredis.get_obj('mysource')
            if df_source is None:
                df_source = pd.DataFrame([])
        return df_source 
Example 11
Project: market-predictor   Author: bsmitty5000   File: processingData.py
def read_scraped_jason(filename):
    df = pd.read_json(filename)
    
    for column in df.columns:
        df[column] = df[column].apply(unlist)
    # gets only first 10 characters of date: year/month/day
    df['date'] = df['date'].apply(lambda x: x[:10])
    df['date'] = pd.to_datetime(df['date'])
    
    # removes duplicate posts, if any
    df = df.drop_duplicates(subset=['keywords'])
    # sorts dataframe by post date
    df = df.sort_values(by='date')
 
    df = df.drop('body', 1)
    df = df.drop('title', 1)
    
    df['keywords'].replace('', np.nan, inplace=True)
    df = df.dropna()
    
    return df 
Example 12
Project: funing   Author: langzi1949   File: sauron.py
def get_sh_info(id_no_list,prod_code):
    uri = 'mongodb://nbread:[email protected]:20000/gather'
    client = MongoClient(uri)
    db = client['gather']
    collection = db['sauron_credit_logs']
    json_list = collection.find({"data.user_idcard":{"$in":id_no_list},"prodCode":prod_code})
    arr =[]
    key_arr=[]
    for line in json_list:
        sauron_dict = arrange_dict(line['data'])
        arr.append(sauron_dict)
    
    data = pd.read_json(json.dumps(arr))
    # pass a list (not a set) so the written column order is preserved
    data_ri = data.reindex(columns=['user_idcard','user_phone','user_name','last_appear_idcard','last_appear_phone','used_idcards_cnt','used_phones_cnt','sn_score','sn_order1_contacts_cnt','sn_order1_blacklist_contacts_cnt','sn_order2_blacklist_contacts_cnt','sn_order2_blacklist_routers_cnt','sn_order2_blacklist_routers_pct','idcard_in_blacklist','phone_in_blacklist','in_court_blacklist','in_p2p_blacklist','in_bank_blacklist','last_appear_idcard_in_blacklist','last_appear_phone_in_blacklist','online_installment_cnt','offline_installment_cnt','credit_card_repayment_cnt','payday_loan_cnt','online_cash_loan_cnt','offline_cash_loan_cnt','others_cnt','search_cnt','search_cnt_recent_7_days','search_cnt_recent_14_days','search_cnt_recent_30_days','search_cnt_recent_60_days','search_cnt_recent_90_days','search_cnt_recent_180_days','org_cnt','org_cnt_recent_7_days','org_cnt_recent_14_days','org_cnt_recent_30_days','org_cnt_recent_60_days','org_cnt_recent_90_days','org_cnt_recent_180_days'])
    data_ri.to_csv("D://desktop//text.csv", encoding='utf-8')
    #data_ri.to_csv("D://desktop//text.csv",header=True,index_label=['phone','id_card','STAN_FRD_LEVEL'])
    #print(data_ri) 
Example 13
Project: StockPredictor   Author: wallsbreaker   File: indicator_acquire.py
def extract_features_from_json():
    input_path = '../../data/20_5_from_2008/'
    df_list = []
    for json_file in os.listdir(input_path):
        train_data = pd.read_json(os.path.join(input_path, json_file), orient='columns')
        train_data.dropna(inplace=True)
        train_data.sort_index(ascending=False, inplace=True)
        train_data.index = range(len(train_data))
        if len(train_data) > 0:
            data_norm(train_data)

        values = train_data['real_up_after_240'].tolist()
        codes = train_data['code'].tolist()
        train_data.drop(['datetime', 'code', 'real_up_after_240'], axis=1, inplace=True)
        features = train_data.values.tolist()

        with open('../../data/20_5_from_2008/data', 'a') as f:
            for ix in xrange(len(codes)):
                if np.inf not in features[ix] and -np.inf not in features[ix]:
                    f.write('%s;0 %s;1 %f\n' % (codes[ix][2:], ' '.join([str(x) for x in features[ix]]), values[ix])) 
Example 14
Project: fitbit-analyzer   Author: 5agado   File: utils.py
def loadStepsData(dumpDir):
    """
    Load steps data from dumping done using the official Fitbit API.
    Check README file for further info on the scraping process and saved format
    :param dumpDir: the folder where the date has been dumped
    :return: a list of dataframes, one for each day, containing the intraday steps data
    """
    def loadFun(jsonData):
        intradayData = jsonData['activities-steps-intraday']['dataset']
        date = jsonData['activities-steps'][0]['dateTime']
        if not intradayData:
            return None
        df = pd.read_json(json.dumps(intradayData))
        df['datetime'] = pd.to_datetime(date + ' ' + df['time'])
        df.drop('time', inplace=True, axis=1)
        return df

    return _loadData(dumpDir, 'steps', loadFun) 
Example 15
Project: bitrader   Author: jr-minnaar   File: bitx.py    (MIT License)
def get_orders_frame(self, state=None, kind='auth'):
        q = self.get_orders(state, kind)
        tj = json.dumps(q['orders'])
        df = pd.read_json(tj, convert_dates=['creation_timestamp', 'expiration_timestamp'])
        df.index = df.creation_timestamp
        return df 
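
The convert_dates argument used here deserves a note: by default read_json() only auto-parses columns whose names look date-like (for example, names ending in '_at' or '_time', or named 'date'), so a column such as creation_timestamp has to be listed explicitly. A minimal sketch with a hypothetical payload:

import pandas as pd

tj = '[{"creation_timestamp": 1461349200000, "volume": 0.5}]'
df = pd.read_json(tj, convert_dates=['creation_timestamp'])
# creation_timestamp is parsed as datetime64[ns] instead of a plain integer.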
Example 16
Project: berlin-devfest-2016-backend   Author: giansegato   File: backend.py    (MIT License)
def processData(data):
    df = pd.DataFrame.transpose(pd.read_json(json.dumps(data)))
    df = df.dropna(subset = [key for key in df.keys() if "x_" in key])
    df = df[pd.notnull(df['y_observed'])]
    
    X = df[[key for key in df.keys() if "x_" in key]].values
    y = df["y_observed"].values

    return X, y

# 5th: initial model 
Example 17
Project: fabric8-analytics-license-analysis   Author: fabric8-analytics   File: s3_data_store.py
def read_json_file_into_pandas_df(self, filename, index_col=False):
        json_string = self.read_json_file(filename=filename)
        return pd.read_json(json_string) 
Example 18
Project: crema   Author: bmcfee   File: 03-evaluate.py
def evaluate(input_path, n_jobs):

    aud, ann = zip(*crema.utils.get_ann_audio(input_path))

    test_idx = set(pd.read_json('index_test.json')['id'])

    # drop anything not in the test set
    ann = [ann_i for ann_i in ann if crema.utils.base(ann_i) in test_idx]
    aud = [aud_i for aud_i in aud if crema.utils.base(aud_i) in test_idx]

    stream = tqdm(zip(ann, aud), desc='Evaluating test set', total=len(ann))

    results = Parallel(n_jobs=n_jobs)(delayed(track_eval)(ann_i, aud_i)
                                      for ann_i, aud_i in stream)
    df = pd.DataFrame.from_dict(dict(results), orient='index')

    print('Results')
    print('-------')
    print(df.describe())

    df.to_json(os.path.join(OUTPUT_PATH, 'test_scores.json')) 
Example 19
Project: OpenAPS   Author: medicinexlab   File: bgdata.py
def get_bg_dataframe(id_str):
    """
    Function to convert the json file to a pandas dataframe.
    It takes in the string of the id and looks for the devicestatus.json file.
    All data should be stored such that in the directory where main.py lies,
    there is a directory called "data". Inside this directory,
    there is another directory with just the ID Number. Inside this data folder lies the
    devicestatus.json file, which contains the data. If the file is not in the path given,
    it raises an IOError. The path should look like the following example:

    ./data/12345678/devicestatus.json

    Input:      id_str                          ID number as a string
    Output:     bg_df                           Pandas dataframe of all of the data from ./data/[id_str]/devicestatus.json
    Usage:      bg_df = get_bg_dataframe("12345678")
    """

    try:
        file_location = "./data/" + id_str + "/devicestatus.json"
        bg_df = pd.read_json(file_location) #Opens the data file and reads in the data into a dataFrame
    except:
        raise IOError(file_location + " is not a valid file.")

    print
    print("{} total entries.".format(len(bg_df)))

    return bg_df


#Function to find the indices for the given start and end date strings 
Example 20
Project: geekbook   Author: mmagnus   File: find_files.py
def file_search(filename, verbose):
    """Search for filename. Returns dirname of the filename's path, and the full path.
    
    170107 add cache. If the db is not found, create an empty pandas df 
    and populate this df with append later. If the filename is not in the db
    run g/locate. Then, save the found path to the db (using pandas, via df, to json)"""

    # cache
    if os.path.isfile(JSON_DB):
        df = pd.read_json(JSON_DB, orient='records')
        #filename = 'x.pse'
        pathdf = df[df['fn'] == filename]['path']
        if not pathdf.empty:
            path = pathdf.to_string(index=False)
            logger.info('find file [from the db]:' + filename)
            return os.path.dirname(path), path
    else:
        df = pd.DataFrame()

    # if filename is not found in the db
    logger.info('find file:' + filename)

    if platform.system() == "Linux":
        out = commands.getoutput('locate ' + filename)
    if platform.system() == "Darwin":
        out = commands.getoutput('glocate ' + filename)
    first_hit = out.split('\n')[0]
    logger.info('# of hits ' + str(len(out.split('\n'))) + " " + out.replace('\n',', '))
    if not first_hit:
        logger.info('not found')
    else:
        logger.info('hit ' + first_hit)

    # update cache
    dffile = pd.DataFrame([[filename, first_hit],], columns=['fn', 'path'])
    df = df.append(dffile, ignore_index=True)
    # save to json
    df.to_json(JSON_DB, orient='records')
    ##
    return os.path.dirname(first_hit), first_hit 
Example 21
Project: slaveo   Author: lamter   File: future.py
def get_holiday_json(self):
        """
        Load the holiday calendar from holiday.json.
        :return:
        """
        path = os.path.join(pwd, 'holiday.json')
        return pd.read_json(path, typ="series").sort_index() 
Example 22
Project: sci-pype   Author: jay-johnson   File: pycore.py
def pd_json_to_df(self, data_json, sorted_by_key="Date", in_ascending=True):
        import pandas as pd
        new_df  = pd.read_json(data_json).sort_values(by=sorted_by_key, ascending=in_ascending)
        return new_df
    # end of pd_json_to_df 
Example 23
Project: visualizations   Author: ContentMine   File: preprocessing.py
def get_raw(filename):
    with open(filename) as infile:
        raw = infile.read()
        # the next line needs rewriting as soon as the zenodo-dump conforms to 'records'-format
        # [{k:v}, {k:v},...]
        rawfacts = pd.read_json('[%s]' % ','.join(raw.splitlines()), orient='records')
    return rawfacts


### functions for ingesting from CProject



### functions for preprocessing 
Example 24
Project: quickdraw_prediction_model   Author: keisukeirie   File: feature_engineering_func.py
def load_json(filename):
    '''
    Function:
        - opens json file and store information in a pandas dataframe
        - also prints out aggregated df with counts of picture by countrycode
    Input:
        1. filename/path ex: ./data/filename.json
    Output:
        1. new dataframe containing json info
    '''
    df = pd.read_json(filename, lines=True)
    test = df.groupby(df['countrycode']).count()
    print test.sort(columns='drawing', ascending=False).head(15)
    return df 
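
The lines=True flag used here (available since pandas 0.19) reads newline-delimited JSON, where every line is a standalone record; a minimal sketch with inline data:

import pandas as pd

jsonl = '{"countrycode": "US", "drawing": 1}\n{"countrycode": "JP", "drawing": 2}'
df = pd.read_json(jsonl, lines=True)  # one DataFrame row per input line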
Example 25
Project: kaggle-cooking   Author: fpoli   File: utils.py
def read_data(project_path):
    print "Reading data..."
    train = pd.read_json(project_path + "/data/train.json")
    test = pd.read_json(project_path + "/data/test.json")

    print "Train size:", len(train.id)
    print "Test size:", len(test.id)

    return train, test 
Example 26
Project: fabric8-analytics-stack-analysis   Author: fabric8-analytics   File: local_filesystem.py
def read_json_file_into_pandas_df(self, filename):
        return pd.read_json(os.path.join(self.src_dir, filename), dtype=np.int8) 
Example 27
Project: fabric8-analytics-stack-analysis   Author: fabric8-analytics   File: s3_data_store.py
def read_json_file_into_pandas_df(self, filename):
        json_string = self.read_json_file(filename=filename)
        return pd.read_json(json_string, dtype=np.int8) 
Example 28
Project: IntroPython2016   Author: UWPCE-PythonCert   File: crimeHeatMap.py
def apiResults(locationInfo):
	query = ("https://data.seattle.gov/resource/pu5n-trf4.json?$limit={}&$where=within_circle(incident_location,{},{},{})"
		.format(locationInfo['limit'],
				locationInfo['latitude'],
				locationInfo['longitude'],
				locationInfo['radius']))
	return pd.read_json(query) 
Example 29
Project: catalyst   Author: enigmampc   File: poloniex.py
def fetch_raw_metadata_frame(self, api_key, page_number):
        if page_number > 1:
            return pd.DataFrame([])

        raw = pd.read_json(
            self._format_metadata_url(
              api_key,
              page_number,
            ),
            orient='index',
        )

        raw = raw.sort_index().reset_index()
        raw.rename(
            columns={'index': 'symbol'},
            inplace=True,
        )

        raw = raw[raw['isFrozen'] == 0]
        return raw 
Example 30
Project: catalyst   Author: enigmampc   File: poloniex.py
def fetch_raw_symbol_frame(self,
                               api_key,
                               symbol,
                               calendar,
                               start_date,
                               end_date,
                               frequency):

        # TODO: replace this with direct exchange call
        # The end date and frequency should be used to
        # calculate the number of bars
        if frequency == 'minute':
            pc = PoloniexCurator()
            raw = pc.onemin_to_dataframe(symbol, start_date, end_date)

        else:
            raw = pd.read_json(
                self._format_data_url(
                    api_key,
                    symbol,
                    start_date,
                    end_date,
                    frequency,
                ),
                orient='records',
            )
            raw.set_index('date', inplace=True)

        # BcolzDailyBarReader introduces a 1/1000 factor in the way
        # pricing is stored on disk, which we compensate here to get
        # the right pricing amounts
        # ref: data/us_equity_pricing.py
        scale = 1
        raw.loc[:, 'open'] /= scale
        raw.loc[:, 'high'] /= scale
        raw.loc[:, 'low'] /= scale
        raw.loc[:, 'close'] /= scale
        raw.loc[:, 'volume'] *= scale

        return raw 
Example 31
Project: bigquery-bokeh-dashboard   Author: GoogleCloudPlatform   File: utils.py
def run_query(query, cache_key, expire=3600, dialect='legacy'):
    memcached_client = memcached_discovery.get_client()
    if memcached_client is None:
        return _run(query, dialect=dialect)
    else:
        json = memcached_client.get(cache_key)
        if json is not None:
            df = pd.read_json(json, orient='records')
        else:
            df = _run(query, dialect=dialect)
            memcached_client.set(cache_key, df.to_json(orient='records'), expire=expire)
        return df 
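
The caching pattern above relies on to_json()/read_json() being a faithful round trip for orient='records'; a minimal sketch of that invariant, without the memcached layer:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
payload = df.to_json(orient='records')             # '[{"a":1,"b":"x"},{"a":2,"b":"y"}]'
restored = pd.read_json(payload, orient='records')
assert df.equals(restored)  # values survive; the default integer index is rebuilt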
Example 32
Project: Guess-Genre-By-Lyrics   Author: ormatt   File: build_model.py
def main():
    start_time = time.time()
    args = parse_args()
    logger.setLevel(getattr(logging, args.verbosity.upper()))
    logger.info("Started")

    build_constants()

    df = pd.read_json(path_or_buf=DATA_PATH, orient='records', encoding="UTF8")
    logger.debug("Loaded {} rows into df".format(len(df)))

    df = utils.get_data_subset.crop(df, None, None)
    df = utils.get_data_subset.filter_rows_by_string(df,
                                                     [TARGET_COL],
                                                     ['Rock',
                                                      'Hip Hop'])
    df = utils.clean_data.execute_cleaners(df)
    df = utils.normalize_data.normalize_genres(df, TARGET_COL)
    X, y = utils.get_data_subset.get_x_y(df, SAMPLE_COL, TARGET_COL)

    clf = model_pipeline.get_pipeline(SAMPLE_COL)

    utils.persistence.dump(DF_DUMP_NAME, df)
    utils.persistence.dump(CLF_DUMP_NAME, clf)

    if args.train:
        train_and_test.train_and_dump(X, y, clf)
    elif args.test:
        train_and_test.test_using_kfold(X, y, clf)

    logger.info("Finished in {0:.2f} seconds".format(time.time() - start_time)) 
Example 33
Project: cjworkbench   Author: CJWorkbench   File: enigma.py
def handle_dotio_url(wf_module, url, split_url, num_rows):
    """
    Processes response for any request to enigma.io. Here, we assume that the API key is provided,
    because, at least at first glance (or two or three) there doesn't seem to be any provisions for
    accessing dataset endpoints sans API key.
    """

    if num_rows > 500:
        wf_module.set_error("You can request a maximum of 500 rows.")
        return

    if "/limit/" not in url:
        if url.endswith('/'):
            url += "limit/{}".format(num_rows)
        else:
            url += "/limit/{}".format(num_rows)

    response = requests.get(url)
    if response.status_code != 200:
        error = json.loads(response.text)
        if "message" in error:
            message = error["message"]
        else:
            message = error["info"]["message"]
            if "additional" in error["info"]:
                message += ": " + error["info"]["additional"]["message"]
        wf_module.set_error("Unable to retrieve data from Enigma. Received {} status, with message {}"
            .format(response.status_code, message))
        return
    try:
        json_text = json.loads(response.text)
        table = pd.read_json(json.dumps(json_text['result']))
        return table
    except Exception as ex: # Generic exceptions suck, but is it the most pragmatic/all-encompassing here?
        wf_module.set_error("Unable to process request: {}".format(str(ex)))
        return 
Example 34
Project: jupyter-handsontables   Author: techmuch   File: __init__.py
def _from_json(self, value, obj=None):
        if value is not None:
            df = pd.read_json(json.dumps(value), orient="split")
        else:
            df = pd.DataFrame()
        return df 
Example 35
Project: jupyter-handsontables   Author: techmuch   File: __init__.py
def _from_json(self, value, obj=None):
        if value is not None:
            df = pd.read_json(json.dumps(value), orient="split")
        else:
            df = pd.DataFrame()
        return df 
Example 36
Project: datanode   Author: jay-johnson   File: pycore.py
def pd_json_to_df(self, data_json, sorted_by_key="Date", in_ascending=True):
        import pandas as pd
        new_df  = pd.read_json(data_json).sort_values(by=sorted_by_key, ascending=in_ascending)
        return new_df
    # end of pd_json_to_df 
Example 37
Project: Medium-crawler-with-data-analyzer   Author: lifei96   File: medium_posts_data_reader.py
def read_posts():
    posts = list()
    file_in = open('./post_list.txt', 'r')
    post_list = str(file_in.read()).split(' ')
    file_in.close()
    num = 0
    for post_id in post_list:
        if not post_id:
            continue
        if not os.path.exists('./data/Posts/%s.json' % post_id):
            continue
        try:
            file_in = open('./data/Posts/%s.json' % post_id, 'r')
            raw_data = json.loads(str(file_in.read()))
            file_in.close()
            post = dict()
            post['post_id'] = post_id
            post['published_date'] = raw_data['published_date']
            post['recommends'] = raw_data['recommends']
            post['responses'] = raw_data['responses']
            posts.append(post)
        except:
            continue
        num += 1
        print(post_id)
        print(num)
    return pd.read_json(json.dumps(posts)) 
Example 38
Project: Medium-crawler-with-data-analyzer   Author: lifei96   File: medium_tags_data_reader.py
def read_posts():
    posts = list()
    file_in = open('./post_list.txt', 'r')
    post_list = str(file_in.read()).split(' ')
    file_in.close()
    num = 0
    for post_id in post_list:
        if not post_id:
            continue
        if not os.path.exists('./data/Posts/%s.json' % post_id):
            continue
        try:
            file_in = open('./data/Posts/%s.json' % post_id, 'r')
            raw_data = json.loads(str(file_in.read()))
            file_in.close()
            for tag in raw_data['tags']:
                post = dict()
                post['post_id'] = post_id
                post['published_date'] = raw_data['published_date']
                post['recommends'] = raw_data['recommends']
                post['responses'] = raw_data['responses']
                post['tag'] = tag['name']
                posts.append(post)
                print(post)
        except:
            continue
        num += 1
        print(post_id)
        print(num)
    return pd.read_json(json.dumps(posts)) 
Example 39
Project: Medium-crawler-with-data-analyzer   Author: lifei96   File: medium_topstories_analyzer.py
def read_stories_without_tags():
    stories = list()
    current_date = START_DATE
    while current_date <= END_DATE:
        file_in = open("./TopStories/%s.json" % current_date.isoformat(), 'r')
        raw_data = json.loads(str(file_in.read()))
        file_in.close()
        for raw_story in raw_data['stories']:
            story = dict()
            story['top_date'] = current_date.isoformat()
            story['story_id'] = raw_story['story_id']
            story['author'] = raw_story['author']
            story['published_date'] = raw_story['published_date']
            story['recommends'] = raw_story['recommends']
            story['responses'] = raw_story['responses']
            story['tags_count'] = len(raw_story['tags'])
            stories.append(story)
        print(current_date.isoformat())
        current_date = current_date + datetime.timedelta(days=1)
    return pd.read_json(json.dumps(stories)) 
Example 40
Project: Medium-crawler-with-data-analyzer   Author: lifei96   File: medium_users_data_reader.py
def read_users():
    users = list()
    file_in = open('./username_list.txt', 'r')
    username_list = str(file_in.read()).split(' ')
    file_in.close()
    num = 0
    for username in username_list:
        if not username:
            continue
        if not os.path.exists('./data/Users/%s.json' % username):
            continue
        try:
            file_in = open('./data/Users/%s.json' % username, 'r')
            raw_data = json.loads(str(file_in.read()))
            file_in.close()
            user = dict()
            user['username'] = username
            user['reg_date'] = datetime.date.fromtimestamp(raw_data['profile']['user']['createdAt']/1000.0).isoformat()
            if not raw_data['profile']['user']['lastPostCreatedAt']:
                raw_data['profile']['user']['lastPostCreatedAt'] = raw_data['profile']['user']['createdAt']
            user['last_post_date'] = datetime.date.fromtimestamp(raw_data['profile']['user']['lastPostCreatedAt']/1000.0).isoformat()
            user['posts_count'] = raw_data['profile']['numberOfPostsPublished']
            user['following_count'] = raw_data['profile']['user']['socialStats']['usersFollowedCount']
            user['followers_count'] = raw_data['profile']['user']['socialStats']['usersFollowedByCount']
            users.append(user)
        except:
            continue
        num += 1
        print(username)
        print(num)
    return pd.read_json(json.dumps(users)) 
Example 41
Project: PythonTrading   Author: F2011B   File: realtimeProCon.py
def data_received(self, data):
        updateOZ_event.data = pd.read_json(data.decode())
        updateOZ_event.set() 
Example 42
Project: PythonTrading   Author: F2011B   File: realtimeProCon.py
def handle_OZServer(loop):
    reader, writer = yield from asyncio.open_connection('127.0.0.1', 2222, loop=loop)
    symbolList = list()
    while True:
        if updateOZ_event.is_set():
            print('In Server send')
            updateOZ_event.clear()
            for element in updateOZ_event.data:
                writer.write(('Add_'+ element+'_End').encode())
            writer.write('Send'.encode())

            outputbuffer = StringIO()
            condition = True
            while condition:
                data = yield from reader.read(1024)
                message=data.decode()
                if message.find('!ENDMSG!') != -1:
                    message = message.replace('!ENDMSG!', '')
                    condition = False
                    print('End found')

                outputbuffer.write(message)

            outputbuffer.seek(0)
            DF=pd.read_json(outputbuffer)
            #print(DF)
            yield from updateOZ_queue.put(DF)
        yield None

    writer.close()
    reader.close() 
Example 43
Project: spotlight   Author: maciejkula   File: helpers.py
def _load_data(filename, columns=None):

    data = pd.read_json(filename, lines=True)
    data = data.sort_values('validation_mrr', ascending=False)

    mrr_cols = ['validation_mrr', 'test_mrr']

    if columns is None:
        columns = [x for x in data.columns if
                   (x not in mrr_cols and x != 'hash')]

    cols = data.columns
    cols = mrr_cols + columns

    return data[cols] 
Example 44
Project: spotlight   Author: maciejkula   File: helpers.py
def _load_data(filename, columns=None):

    data = pd.read_json(filename, lines=True)
    data = data.sort_values('validation_mrr', ascending=False)

    mrr_cols = ['validation_mrr', 'test_mrr']

    if columns is None:
        columns = [x for x in data.columns if
                   (x not in mrr_cols and x != 'hash')]

    cols = data.columns
    cols = mrr_cols + columns

    return data[cols] 
Example 45
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_pandas.py
def test_frame_double_encoded_labels(self):
        df = DataFrame([['a', 'b'], ['c', 'd']],
                       index=['index " 1', 'index / 2'],
                       columns=['a \\ b', 'y / z'])

        assert_frame_equal(df, read_json(df.to_json(orient='split'),
                                         orient='split'))
        assert_frame_equal(df, read_json(df.to_json(orient='columns'),
                                         orient='columns'))
        assert_frame_equal(df, read_json(df.to_json(orient='index'),
                                         orient='index'))
        df_unser = read_json(df.to_json(orient='records'), orient='records')
        assert_index_equal(df.columns, df_unser.columns)
        np.testing.assert_equal(df.values, df_unser.values) 
Example 46
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_pandas.py
def test_frame_non_unique_index(self):
        df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 1],
                       columns=['x', 'y'])

        self.assertRaises(ValueError, df.to_json, orient='index')
        self.assertRaises(ValueError, df.to_json, orient='columns')

        assert_frame_equal(df, read_json(df.to_json(orient='split'),
                                         orient='split'))
        unser = read_json(df.to_json(orient='records'), orient='records')
        self.assertTrue(df.columns.equals(unser.columns))
        np.testing.assert_equal(df.values, unser.values)
        unser = read_json(df.to_json(orient='values'), orient='values')
        np.testing.assert_equal(df.values, unser.values) 
Example 47
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_pandas.py
def test_frame_non_unique_columns(self):
        df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 2],
                       columns=['x', 'x'])

        self.assertRaises(ValueError, df.to_json, orient='index')
        self.assertRaises(ValueError, df.to_json, orient='columns')
        self.assertRaises(ValueError, df.to_json, orient='records')

        assert_frame_equal(df, read_json(df.to_json(orient='split'),
                                         orient='split', dtype=False))
        unser = read_json(df.to_json(orient='values'), orient='values')
        np.testing.assert_equal(df.values, unser.values)

        # GH4377; duplicate columns not processing correctly
        df = DataFrame([['a', 'b'], ['c', 'd']], index=[
                       1, 2], columns=['x', 'y'])
        result = read_json(df.to_json(orient='split'), orient='split')
        assert_frame_equal(result, df)

        def _check(df):
            result = read_json(df.to_json(orient='split'), orient='split',
                               convert_dates=['x'])
            assert_frame_equal(result, df)

        for o in [[['a', 'b'], ['c', 'd']],
                  [[1.5, 2.5], [3.5, 4.5]],
                  [[1, 2.5], [3, 4.5]],
                  [[Timestamp('20130101'), 3.5],
                   [Timestamp('20130102'), 4.5]]]:
            _check(DataFrame(o, index=[1, 2], columns=['x', 'x'])) 
Example 48
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_pandas.py
def test_frame_from_json_nones(self):
        df = DataFrame([[1, 2], [4, 5, 6]])
        unser = read_json(df.to_json())
        self.assertTrue(np.isnan(unser[2][0]))

        df = DataFrame([['1', '2'], ['4', '5', '6']])
        unser = read_json(df.to_json())
        self.assertTrue(np.isnan(unser[2][0]))
        unser = read_json(df.to_json(), dtype=False)
        self.assertTrue(unser[2][0] is None)
        unser = read_json(df.to_json(), convert_axes=False, dtype=False)
        self.assertTrue(unser['2']['0'] is None)

        unser = read_json(df.to_json(), numpy=False)
        self.assertTrue(np.isnan(unser[2][0]))
        unser = read_json(df.to_json(), numpy=False, dtype=False)
        self.assertTrue(unser[2][0] is None)
        unser = read_json(df.to_json(), numpy=False,
                          convert_axes=False, dtype=False)
        self.assertTrue(unser['2']['0'] is None)

        # infinities get mapped to nulls which get mapped to NaNs during
        # deserialisation
        df = DataFrame([[1, 2], [4, 5, 6]])
        df.loc[0, 2] = np.inf
        unser = read_json(df.to_json())
        self.assertTrue(np.isnan(unser[2][0]))
        unser = read_json(df.to_json(), dtype=False)
        self.assertTrue(np.isnan(unser[2][0]))

        df.loc[0, 2] = np.NINF
        unser = read_json(df.to_json())
        self.assertTrue(np.isnan(unser[2][0]))
        unser = read_json(df.to_json(), dtype=False)
        self.assertTrue(np.isnan(unser[2][0])) 
Example 49
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_pandas.py
def test_frame_empty_mixedtype(self):
        # mixed type
        df = DataFrame(columns=['jim', 'joe'])
        df['joe'] = df['joe'].astype('i8')
        self.assertTrue(df._is_mixed_type)
        assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df,
                           check_index_type=False) 
Example 50
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_pandas.py
def test_frame_mixedtype_orient(self):  # GH10289
        vals = [[10, 1, 'foo', .1, .01],
                [20, 2, 'bar', .2, .02],
                [30, 3, 'baz', .3, .03],
                [40, 4, 'qux', .4, .04]]

        df = DataFrame(vals, index=list('abcd'),
                       columns=['1st', '2nd', '3rd', '4th', '5th'])

        self.assertTrue(df._is_mixed_type)
        right = df.copy()

        for orient in ['split', 'index', 'columns']:
            inp = df.to_json(orient=orient)
            left = read_json(inp, orient=orient, convert_axes=False)
            assert_frame_equal(left, right)

        right.index = np.arange(len(df))
        inp = df.to_json(orient='records')
        left = read_json(inp, orient='records', convert_axes=False)
        assert_frame_equal(left, right)

        right.columns = np.arange(df.shape[1])
        inp = df.to_json(orient='values')
        left = read_json(inp, orient='values', convert_axes=False)
        assert_frame_equal(left, right)