Python pandas.io.parsers.read_csv() Examples

The following are 30 code examples of pandas.io.parsers.read_csv(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas.io.parsers, or try the search function.
Example #1
Source File: test_excel.py    From Computable with MIT License 7 votes vote down vote up
def test_parse_cols_int(self):
    """Check Excel parsing with an integer ``parse_cols`` against the CSV.

    For each of the 'xls', 'xlsx' and 'xlsm' test files, 'Sheet1' and
    'Sheet2' are parsed with ``parse_cols=3`` and compared with the CSV
    reference reindexed to columns A, B and C.
    """
    _skip_if_no_openpyxl()
    _skip_if_no_xlrd()

    for ext in ['xls', 'xlsx', 'xlsm']:
        workbook = ExcelFile(
            os.path.join(self.dirpath, 'test.{0}'.format(ext)))
        from_sheet1 = workbook.parse('Sheet1', index_col=0,
                                     parse_dates=True, parse_cols=3)
        reference = self.read_csv(
            self.csv1, index_col=0, parse_dates=True
        ).reindex(columns=['A', 'B', 'C'])
        from_sheet2 = workbook.parse('Sheet2', skiprows=[1], index_col=0,
                                     parse_dates=True, parse_cols=3)
        # TODO: add index to xls file
        tm.assert_frame_equal(from_sheet1, reference, check_names=False)
        tm.assert_frame_equal(from_sheet2, reference, check_names=False)
Example #2
Source File: test_stata.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_read_write_reread_dta14(self, file, parsed_114, version):
    """Read a .dta file and round-trip ``parsed_114`` through ``to_stata``.

    ``file`` names the attribute on ``self`` that holds the .dta file to
    read; ``version`` is forwarded to ``to_stata``.
    """
    dta_source = getattr(self, file)
    reread = self.read_dta(dta_source)
    reread.index.name = 'index'

    # NOTE(review): ``expected`` is built from the CSV but is never compared
    # with anything below -- confirm whether an assertion is missing.
    expected = self.read_csv(self.csv14)
    for name in ('byte_', 'int_', 'long_', 'float_', 'double_'):
        converted = expected[name]._convert(datetime=True, numeric=True)
        expected[name] = converted
    expected['float_'] = expected['float_'].astype(np.float32)
    expected['date_td'] = pd.to_datetime(expected['date_td'],
                                         errors='coerce')

    tm.assert_frame_equal(parsed_114, reread)

    with tm.ensure_clean() as path:
        parsed_114.to_stata(path, {'date_td': 'td'}, version=version)
        round_tripped = self.read_dta(path).set_index('index')
        tm.assert_frame_equal(round_tripped, parsed_114)
Example #3
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_read_write_reread_dta14(self, file, parsed_114, version):
    """Read a .dta file and round-trip ``parsed_114`` through ``to_stata``.

    ``file`` names the attribute on ``self`` that holds the .dta file to
    read; ``version`` is forwarded to ``to_stata``.
    """
    dta_source = getattr(self, file)
    reread = self.read_dta(dta_source)
    reread.index.name = 'index'

    # NOTE(review): ``expected`` is built from the CSV but is never compared
    # with anything below -- confirm whether an assertion is missing.
    expected = self.read_csv(self.csv14)
    for name in ('byte_', 'int_', 'long_', 'float_', 'double_'):
        converted = expected[name]._convert(datetime=True, numeric=True)
        expected[name] = converted
    expected['float_'] = expected['float_'].astype(np.float32)
    expected['date_td'] = pd.to_datetime(expected['date_td'],
                                         errors='coerce')

    tm.assert_frame_equal(parsed_114, reread)

    with tm.ensure_clean() as path:
        parsed_114.to_stata(path, {'date_td': 'td'}, version=version)
        round_tripped = self.read_dta(path).set_index('index')
        tm.assert_frame_equal(round_tripped, parsed_114)
Example #4
Source File: test_network.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_read_csv_chunked_download(self, s3_resource, caplog):
    """Upload a large CSV to the test bucket and read its first rows.

    While 's3fs.core' logging is captured at DEBUG level, the test asserts
    that one log record ends with the argument pair ``(0, 5505024)``.
    """
    # 8 MB, S3FS uses 5MB chunks
    df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
    str_buf = StringIO()

    df.to_csv(str_buf)

    # The upload body must be bytes, so encode the CSV text once.
    buf = BytesIO(str_buf.getvalue().encode('utf-8'))

    s3_resource.Bucket("pandas-test").put_object(
        Key="large-file.csv",
        Body=buf)

    with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
        read_csv("s3://pandas-test/large-file.csv", nrows=5)
        # log of fetch_range (start, stop)
        assert ((0, 5505024) in {x.args[-2:] for x in caplog.records})
Example #5
Source File: test_read_fwf.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_read_csv_compat():
    """Check that ``read_fwf`` with explicit colspecs matches ``read_csv``.

    The same five rows are supplied once as comma-separated text and once
    as fixed-width text; both parsed frames must be equal.
    """
    csv_text = """\
A,B,C,D,E
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
    expected = read_csv(StringIO(csv_text), engine="python")

    fwf_text = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    # (start, stop) character offsets of each fixed-width column.
    column_bounds = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(fwf_text), colspecs=column_bounds)
    tm.assert_frame_equal(result, expected)
Example #6
Source File: test_excel.py    From Computable with MIT License 6 votes vote down vote up
def test_parse_cols_list(self):
    """Check Excel parsing with a list ``parse_cols`` against the CSV.

    For each of the 'xls', 'xlsx' and 'xlsm' test files, 'Sheet1' and
    'Sheet2' are parsed with ``parse_cols=[0, 2, 3]`` and compared with the
    CSV reference reindexed to columns B and C.
    """
    _skip_if_no_openpyxl()
    _skip_if_no_xlrd()

    for ext in ['xls', 'xlsx', 'xlsm']:
        workbook = ExcelFile(
            os.path.join(self.dirpath, 'test.{0}'.format(ext)))
        from_sheet1 = workbook.parse('Sheet1', index_col=0,
                                     parse_dates=True,
                                     parse_cols=[0, 2, 3])
        reference = self.read_csv(
            self.csv1, index_col=0, parse_dates=True
        ).reindex(columns=['B', 'C'])
        from_sheet2 = workbook.parse('Sheet2', skiprows=[1], index_col=0,
                                     parse_dates=True,
                                     parse_cols=[0, 2, 3])
        # TODO: add index to xls file
        tm.assert_frame_equal(from_sheet1, reference, check_names=False)
        tm.assert_frame_equal(from_sheet2, reference, check_names=False)
Example #7
Source File: test_excel.py    From Computable with MIT License 6 votes vote down vote up
def check_excel_table_sheet_by_index(self, filename, csvfile):
    """Parse sheets of an Excel file by position and compare with a CSV.

    Parameters
    ----------
    filename : str
        Excel file name, joined onto ``self.dirpath``.
    csvfile : path or buffer
        CSV reference passed to ``self.read_csv``.

    Sheets 0 and 1 must match the CSV frame, ``skipfooter=1`` and
    ``skip_footer=1`` must both drop the last row, and parsing the sheet
    name 'asdf' must raise ``xlrd.XLRDError``.
    """
    import xlrd

    pth = os.path.join(self.dirpath, filename)
    xls = ExcelFile(pth)
    df = xls.parse(0, index_col=0, parse_dates=True)
    df2 = self.read_csv(csvfile, index_col=0, parse_dates=True)
    df3 = xls.parse(1, skiprows=[1], index_col=0, parse_dates=True)
    tm.assert_frame_equal(df, df2, check_names=False)
    tm.assert_frame_equal(df3, df2, check_names=False)

    df4 = xls.parse(0, index_col=0, parse_dates=True, skipfooter=1)
    df5 = xls.parse(0, index_col=0, parse_dates=True, skip_footer=1)
    # Positional slice dropping the last row; ``.iloc`` replaces ``.ix``,
    # which no longer exists in current pandas.
    tm.assert_frame_equal(df4, df.iloc[:-1])
    tm.assert_frame_equal(df4, df5)

    self.assertRaises(xlrd.XLRDError, xls.parse, 'asdf')
Example #8
Source File: test_network.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_parse_public_s3_bucket_chunked_python(self, tips_df):
    """Read the tips CSV from S3 in chunks using the Python parser.

    Each of the plain, gzip and bz2 variants is opened with ``chunksize=5``
    and its first three chunks are compared with the matching rows of
    ``tips_df``.
    """
    rows_per_chunk = 5
    variants = [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]
    for suffix, codec in variants:
        reader = read_csv('s3://pandas-test/tips.csv' + suffix,
                          chunksize=rows_per_chunk, compression=codec,
                          engine='python')
        assert reader.chunksize == rows_per_chunk
        # Read a couple of chunks and make sure we see them properly.
        for chunk_no in range(3):
            chunk = reader.get_chunk()
            assert isinstance(chunk, DataFrame)
            assert not chunk.empty
            start = rows_per_chunk * chunk_no
            stop = start + rows_per_chunk
            tm.assert_frame_equal(tips_df.iloc[start:stop], chunk)
Example #9
Source File: test_network.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_parse_public_s3_bucket_chunked(self, tips_df):
    """Read the tips CSV from S3 in chunks.

    Each of the plain, gzip and bz2 variants is opened with ``chunksize=5``
    and its first three chunks are compared with the matching rows of
    ``tips_df``.
    """
    rows_per_chunk = 5
    variants = [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]
    for suffix, codec in variants:
        reader = read_csv('s3://pandas-test/tips.csv' + suffix,
                          chunksize=rows_per_chunk, compression=codec)
        assert reader.chunksize == rows_per_chunk
        # Read a couple of chunks and make sure we see them properly.
        for chunk_no in range(3):
            chunk = reader.get_chunk()
            assert isinstance(chunk, DataFrame)
            assert not chunk.empty
            start = rows_per_chunk * chunk_no
            stop = start + rows_per_chunk
            tm.assert_frame_equal(tips_df.iloc[start:stop], chunk)
Example #10
Source File: test_network.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_parse_public_s3_bucket(self, tips_df):
    """Read the tips CSV from S3 and compare every result with ``tips_df``.

    Covers the plain, gzip and bz2 files in 'pandas-test', plus the file in
    the 'cant_get_it' bucket.
    """
    pytest.importorskip('s3fs')

    def verify(frame):
        # Checks applied to every frame read in this test.
        assert isinstance(frame, DataFrame)
        assert not frame.empty
        tm.assert_frame_equal(frame, tips_df)

    # more of an integration test due to the not-public contents portion
    # can probably mock this though.
    for suffix, codec in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        verify(read_csv('s3://pandas-test/tips.csv' + suffix,
                        compression=codec))

    # Read public file from bucket with not-public contents
    verify(read_csv('s3://cant_get_it/tips.csv'))
Example #11
Source File: test_excel.py    From Computable with MIT License 6 votes vote down vote up
def test_xlsx_table(self):
    """Parse 'test.xlsx' by sheet name and compare with the CSV reference.

    'Sheet1' and 'Sheet2' must match the CSV frame, and both
    ``skipfooter=1`` and ``skip_footer=1`` must drop the last row.
    """
    _skip_if_no_xlrd()
    _skip_if_no_openpyxl()

    pth = os.path.join(self.dirpath, 'test.xlsx')
    xlsx = ExcelFile(pth)
    df = xlsx.parse('Sheet1', index_col=0, parse_dates=True)
    df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True)
    df3 = xlsx.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True)

    # TODO add index to xlsx file
    tm.assert_frame_equal(df, df2, check_names=False)
    tm.assert_frame_equal(df3, df2, check_names=False)

    df4 = xlsx.parse('Sheet1', index_col=0, parse_dates=True,
                     skipfooter=1)
    df5 = xlsx.parse('Sheet1', index_col=0, parse_dates=True,
                     skip_footer=1)
    # Positional slice dropping the last row; ``.iloc`` replaces ``.ix``,
    # which no longer exists in current pandas.
    tm.assert_frame_equal(df4, df.iloc[:-1])
    tm.assert_frame_equal(df4, df5)
Example #12
Source File: test_network.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_parse_public_s3_bucket(self, tips_df):
    """Read the tips CSV from S3 and compare every result with ``tips_df``.

    Covers the plain, gzip and bz2 files in 'pandas-test', plus the file in
    the 'cant_get_it' bucket.
    """
    pytest.importorskip('s3fs')

    def verify(frame):
        # Checks applied to every frame read in this test.
        assert isinstance(frame, DataFrame)
        assert not frame.empty
        tm.assert_frame_equal(frame, tips_df)

    # more of an integration test due to the not-public contents portion
    # can probably mock this though.
    for suffix, codec in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        verify(read_csv('s3://pandas-test/tips.csv' + suffix,
                        compression=codec))

    # Read public file from bucket with not-public contents
    verify(read_csv('s3://cant_get_it/tips.csv'))
Example #13
Source File: test_network.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_parse_public_s3_bucket(self, tips_df):
    """Read the tips CSV from S3 and compare every result with ``tips_df``.

    Covers the plain, gzip and bz2 files in 'pandas-test', plus the file in
    the 'cant_get_it' bucket.
    """
    pytest.importorskip('s3fs')

    def verify(frame):
        # Checks applied to every frame read in this test.
        assert isinstance(frame, DataFrame)
        assert not frame.empty
        tm.assert_frame_equal(frame, tips_df)

    # more of an integration test due to the not-public contents portion
    # can probably mock this though.
    for suffix, codec in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        verify(read_csv('s3://pandas-test/tips.csv' + suffix,
                        compression=codec))

    # Read public file from bucket with not-public contents
    verify(read_csv('s3://cant_get_it/tips.csv'))
Example #14
Source File: test_network.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_parse_public_s3_bucket_chunked(self, tips_df):
    """Read the tips CSV from S3 in chunks.

    Each of the plain, gzip and bz2 variants is opened with ``chunksize=5``
    and its first three chunks are compared with the matching rows of
    ``tips_df``.
    """
    rows_per_chunk = 5
    variants = [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]
    for suffix, codec in variants:
        reader = read_csv('s3://pandas-test/tips.csv' + suffix,
                          chunksize=rows_per_chunk, compression=codec)
        assert reader.chunksize == rows_per_chunk
        # Read a couple of chunks and make sure we see them properly.
        for chunk_no in range(3):
            chunk = reader.get_chunk()
            assert isinstance(chunk, DataFrame)
            assert not chunk.empty
            start = rows_per_chunk * chunk_no
            stop = start + rows_per_chunk
            tm.assert_frame_equal(tips_df.iloc[start:stop], chunk)
Example #15
Source File: test_network.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_parse_public_s3_bucket_chunked_python(self, tips_df):
    """Read the tips CSV from S3 in chunks using the Python parser.

    Each of the plain, gzip and bz2 variants is opened with ``chunksize=5``
    and its first three chunks are compared with the matching rows of
    ``tips_df``.
    """
    rows_per_chunk = 5
    variants = [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]
    for suffix, codec in variants:
        reader = read_csv('s3://pandas-test/tips.csv' + suffix,
                          chunksize=rows_per_chunk, compression=codec,
                          engine='python')
        assert reader.chunksize == rows_per_chunk
        # Read a couple of chunks and make sure we see them properly.
        for chunk_no in range(3):
            chunk = reader.get_chunk()
            assert isinstance(chunk, DataFrame)
            assert not chunk.empty
            start = rows_per_chunk * chunk_no
            stop = start + rows_per_chunk
            tm.assert_frame_equal(tips_df.iloc[start:stop], chunk)
Example #16
Source File: test_network.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_parse_public_s3_bucket_chunked(self, tips_df):
    """Read the tips CSV from S3 in chunks.

    Each of the plain, gzip and bz2 variants is opened with ``chunksize=5``
    and its first three chunks are compared with the matching rows of
    ``tips_df``.
    """
    rows_per_chunk = 5
    variants = [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]
    for suffix, codec in variants:
        reader = read_csv('s3://pandas-test/tips.csv' + suffix,
                          chunksize=rows_per_chunk, compression=codec)
        assert reader.chunksize == rows_per_chunk
        # Read a couple of chunks and make sure we see them properly.
        for chunk_no in range(3):
            chunk = reader.get_chunk()
            assert isinstance(chunk, DataFrame)
            assert not chunk.empty
            start = rows_per_chunk * chunk_no
            stop = start + rows_per_chunk
            tm.assert_frame_equal(tips_df.iloc[start:stop], chunk)
Example #17
Source File: test_read_fwf.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_read_csv_compat():
    """Check that ``read_fwf`` with explicit colspecs matches ``read_csv``.

    The same five rows are supplied once as comma-separated text and once
    as fixed-width text; both parsed frames must be equal.
    """
    csv_text = """\
A,B,C,D,E
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
    expected = read_csv(StringIO(csv_text), engine="python")

    fwf_text = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    # (start, stop) character offsets of each fixed-width column.
    column_bounds = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(fwf_text), colspecs=column_bounds)
    tm.assert_frame_equal(result, expected)
Example #18
Source File: test_network.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_parse_public_s3_bucket_chunked_python(self, tips_df):
    """Read the tips CSV from S3 in chunks using the Python parser.

    Each of the plain, gzip and bz2 variants is opened with ``chunksize=5``
    and its first three chunks are compared with the matching rows of
    ``tips_df``.
    """
    rows_per_chunk = 5
    variants = [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]
    for suffix, codec in variants:
        reader = read_csv('s3://pandas-test/tips.csv' + suffix,
                          chunksize=rows_per_chunk, compression=codec,
                          engine='python')
        assert reader.chunksize == rows_per_chunk
        # Read a couple of chunks and make sure we see them properly.
        for chunk_no in range(3):
            chunk = reader.get_chunk()
            assert isinstance(chunk, DataFrame)
            assert not chunk.empty
            start = rows_per_chunk * chunk_no
            stop = start + rows_per_chunk
            tm.assert_frame_equal(tips_df.iloc[start:stop], chunk)
Example #19
Source File: test_excel.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def get_csv_refdf(self, basename):
    """
    Load the reference data with read_csv using the Python engine.

    Parameters
    ----------

    basename : str
        File base name, excluding file extension; '.csv' is appended and
        the result is looked up under ``self.dirpath``.

    Returns
    -------

    dfref : DataFrame
        Frame read with the first column as index and ``parse_dates=True``.
    """
    csv_name = basename + '.csv'
    return read_csv(os.path.join(self.dirpath, csv_name),
                    index_col=0, parse_dates=True, engine='python')
Example #20
Source File: test_stata.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_read_write_reread_dta14(self, file, parsed_114, version):
    """Read a .dta file and round-trip ``parsed_114`` through ``to_stata``.

    ``file`` names the attribute on ``self`` that holds the .dta file to
    read; ``version`` is forwarded to ``to_stata``.
    """
    dta_source = getattr(self, file)
    reread = self.read_dta(dta_source)
    reread.index.name = 'index'

    # NOTE(review): ``expected`` is built from the CSV but is never compared
    # with anything below -- confirm whether an assertion is missing.
    expected = self.read_csv(self.csv14)
    for name in ('byte_', 'int_', 'long_', 'float_', 'double_'):
        converted = expected[name]._convert(datetime=True, numeric=True)
        expected[name] = converted
    expected['float_'] = expected['float_'].astype(np.float32)
    expected['date_td'] = pd.to_datetime(expected['date_td'],
                                         errors='coerce')

    tm.assert_frame_equal(parsed_114, reread)

    with tm.ensure_clean() as path:
        parsed_114.to_stata(path, {'date_td': 'td'}, version=version)
        round_tripped = self.read_dta(path).set_index('index')
        tm.assert_frame_equal(round_tripped, parsed_114)
Example #21
Source File: test_network.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_read_csv_chunked_download(self, s3_resource, caplog):
    """Upload a large CSV to the test bucket and read its first rows.

    While 's3fs.core' logging is captured at DEBUG level, the test asserts
    that one log record ends with the argument pair ``(0, 5505024)``.
    """
    # 8 MB, S3FS uses 5MB chunks
    df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
    str_buf = StringIO()

    df.to_csv(str_buf)

    # The upload body must be bytes, so encode the CSV text once.
    buf = BytesIO(str_buf.getvalue().encode('utf-8'))

    s3_resource.Bucket("pandas-test").put_object(
        Key="large-file.csv",
        Body=buf)

    with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
        read_csv("s3://pandas-test/large-file.csv", nrows=5)
        # log of fetch_range (start, stop)
        assert ((0, 5505024) in {x.args[-2:] for x in caplog.records})
Example #22
Source File: test_network.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def tips_df(datapath):
    """Return the tips dataset as a DataFrame.

    ``datapath`` is called with the path components of 'tips.csv' and its
    result is handed to ``read_csv``.
    """
    tips_csv = datapath('io', 'parser', 'data', 'tips.csv')
    return read_csv(tips_csv)
Example #23
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_read_write_reread_dta15(self, file):
    """Compare a parsed .dta file with the ``csv15`` reference frame.

    ``file`` names the attribute on ``self`` that holds the .dta file to
    read.
    """
    expected = self.read_csv(self.csv15)
    # Cast each numeric column of the reference to its target dtype.
    for column, dtype in (('byte_', np.int8),
                          ('int_', np.int16),
                          ('long_', np.int32),
                          ('float_', np.float32),
                          ('double_', np.float64)):
        expected[column] = expected[column].astype(dtype)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))

    parsed = self.read_dta(getattr(self, file))

    tm.assert_frame_equal(expected, parsed)
Example #24
Source File: test_excel.py    From Computable with MIT License 5 votes vote down vote up
def read_csv(self, *args, **kwds):
    """Forward to the module-level ``read_csv`` with ``engine='python'``.

    Any ``engine`` supplied by the caller is overridden; the caller's
    keyword mapping is not modified.
    """
    forwarded = dict(kwds, engine='python')
    return read_csv(*args, **forwarded)
Example #25
Source File: test_stata.py    From Computable with MIT License 5 votes vote down vote up
def test_read_dta8(self):
    """Compare the parsed ``dta8`` file with its tab-separated reference."""
    reference = read_csv(self.csv8, sep='\t', parse_dates=True)
    from_dta = self.read_dta(self.dta8)
    tm.assert_frame_equal(from_dta, reference)
Example #26
Source File: test_stata.py    From Computable with MIT License 5 votes vote down vote up
def test_read_dta7(self):
    """Compare the parsed ``dta7`` file with its tab-separated reference."""
    reference = read_csv(self.csv7, sep='\t', parse_dates=True)
    from_dta = self.read_dta(self.dta7)
    tm.assert_frame_equal(from_dta, reference)
Example #27
Source File: test_stata.py    From Computable with MIT License 5 votes vote down vote up
def test_write_dta6(self):
    """Write the ``csv3`` frame with ``to_stata`` and read it back.

    The frame read back, re-indexed on its 'index' column, must equal the
    frame that was written.
    """
    skip_if_not_little_endian()

    source = self.read_csv(self.csv3)
    source.index.name = 'index'

    with tm.ensure_clean() as path:
        source.to_stata(path, None, False)
        round_tripped = self.read_dta(path).set_index('index')
        tm.assert_frame_equal(round_tripped, source)
Example #28
Source File: test_stata.py    From Computable with MIT License 5 votes vote down vote up
def test_read_dta3(self):
    """Compare the ``dta3`` and ``dta3_13`` files with the ``csv3`` frame."""
    parsed = self.read_dta(self.dta3)
    parsed_13 = self.read_dta(self.dta3_13)

    # match stata here: float32 everywhere, then integer year/quarter
    expected = self.read_csv(self.csv3).astype(np.float32)
    for column, dtype in (('year', np.int32), ('quarter', np.int16)):
        expected[column] = expected[column].astype(dtype)

    for actual in (parsed, parsed_13):
        tm.assert_frame_equal(actual, expected)
Example #29
Source File: test_stata.py    From Computable with MIT License 5 votes vote down vote up
def read_csv(self, file):
    """Read *file* via the module-level ``read_csv`` with ``parse_dates=True``."""
    parsed = read_csv(file, parse_dates=True)
    return parsed
Example #30
Source File: test_network.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def test_read_csv_handles_boto_s3_object(self,
                                         s3_resource,
                                         tips_file):
    """Read the body of a fetched S3 object and compare with ``tips_file``."""
    # see gh-16135

    client = s3_resource.meta.client
    s3_object = client.get_object(Bucket='pandas-test', Key='tips.csv')

    # Wrap the raw bytes of the object body in an in-memory buffer.
    payload = BytesIO(s3_object["Body"].read())
    result = read_csv(payload, encoding='utf8')
    assert isinstance(result, DataFrame)
    assert not result.empty

    tm.assert_frame_equal(result, read_csv(tips_file))