Python pandas.read_html() Examples

The following are code examples showing how to use pandas.read_html(). They are extracted from open source Python projects.
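Before the project examples, here is a minimal, self-contained sketch of the pattern they all share; the HTML snippet and column names below are invented purely for illustration. pandas.read_html() parses every <table> it finds in its input and returns a list of DataFrames, so callers index into that list and typically pass header, index_col, match or skiprows to shape the result.

import pandas as pd
from io import StringIO

# Hypothetical two-column table, used only to demonstrate the call pattern.
html = """
<table>
  <tr><th>code</th><th>price</th></tr>
  <tr><td>000001</td><td>12.3</td></tr>
  <tr><td>600000</td><td>45.6</td></tr>
</table>
"""

# read_html() always returns a list of DataFrames, one per <table> found.
tables = pd.read_html(StringIO(html), header=0)
df = tables[0]
print(df)

Note that numeric-looking cells are parsed as numbers, which is why several of the examples below re-pad stock codes with str(x).zfill(6).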

Example 1
Project: bitrader   Author: jr-minnaar   File: arbitrage_tools.py    (MIT License)
def get_forex_buy_quote(currency_code: str = 'EUR', source: str = 'FNB', order_type: str = 'buy'):
    """Get latest forex from FNB website

    """
    if source == 'FNB':
        tables = pd.read_html(
            'https://www.fnb.co.za/Controller?nav=rates.forex.list.ForexRatesList',
            index_col=1, header=0, match=currency_code)

        df = tables[0]

        types = {
            'buy': 'Bank Selling Rate',
            'sell': 'Bank Buying Rate',
        }

        exchange_rate = df.loc[currency_code, types[order_type]]

        return Decimal("%.4f" % float(exchange_rate))
Example 2
Project: TuShare   Author: andyzsf   File: reference.py    (license)
def _sz_hz(date='', retry_count=3, pause=0.001):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            request = Request(rv.MAR_SZ_HZ_URL%(ct.P_TYPE['http'], ct.DOMAINS['szse'],
                                    ct.PAGES['szsefc'], date))
            lines = urlopen(request, timeout = 10).read()
            if len(lines) <= 200:
                return pd.DataFrame()
            df = pd.read_html(lines, skiprows=[0])[0]
            df.columns = rv.MAR_SZ_HZ_COLS
            df['opDate'] = date
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG) 
Example 3
Project: spylon   Author: maxpoint   File: update_spark_params.py    (license)
def _fetch_documentation(version, base_url="https://spark.apache.org/docs"):
    doc_urls = [
        "{base_url}/{version}/configuration.html",
        "{base_url}/{version}/sql-programming-guide.html",
        "{base_url}/{version}/monitoring.html",
        "{base_url}/{version}/spark-standalone.html",
        "{base_url}/{version}/running-on-mesos.html",
        "{base_url}/{version}/running-on-yarn.html",
    ]

    for url in doc_urls:
        doc_url = url.format(version=version, base_url=base_url)
        # print(url)
        print("Loading spark properties from %s", doc_url)
        dfs = pd.read_html(doc_url, header=0)
        desired_cols = ["Property Name", "Default", "Meaning"]
        for df in dfs:
            if ("Property Name" in df) and ('Default' in df):
                for pn, default, desc in df[desired_cols].itertuples(index=False):
                    if type(default) == numpy.bool_:
                        default = bool(default)
                    yield pn, default, desc 
Example 4
Project: PyTrader   Author: didw   File: webreader.py    (license)
def get_financial_statements(code):
    url = "http://companyinfo.stock.naver.com/v1/company/ajax/cF1001.aspx?cmp_cd=%s&fin_typ=0&freq_typ=Y" % (code)
    html = requests.get(url).text

    html = html.replace('<th class="bg r01c02 endLine line-bottom"colspan="8">??</th>', "")
    html = html.replace("<span class='span-sub'>(IFRS??)</span>", "")
    html = html.replace("<span class='span-sub'>(IFRS??)</span>", "")
    html = html.replace('\t', '')
    html = html.replace('\n', '')
    html = html.replace('\r', '')

    html = html.replace('2011/12', '2011')
    html = html.replace('2012/03', '2011')
    html = html.replace('2012/12', '2012')
    html = html.replace('2013/03', '2012')
    html = html.replace('2013/12', '2013')
    html = html.replace('2014/03', '2013')
    html = html.replace('2014/12', '2014')
    html = html.replace('2015/03', '2014')
    html = html.replace('2015/12', '2015')

    df_list = pd.read_html(html, index_col='??????')
    df = df_list[0]
    return df 
Example 5
Project: NBA-prediction   Author: christopherjenness   File: model.py    (license)
def get_stats(self, url):
        """
        Extracts statistics from URL

        Args:
            url (str): basketball-reference.com box score

        Returns:
            stats (pd.DataFrame): DataFrame of statistics from game
        """
        response = urllib2.urlopen(url)
        html = response.read()
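        # the site wraps several box-score tables inside HTML comments;
        # stripping the comment markers lets read_html see them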
        stat_html = html.replace('<!--', "")
        stat_html = stat_html.replace('-->', "")
        stats = pd.read_html(stat_html)
        return stats[-5] 
Example 6
Project: pyiem   Author: rheineke   File: pricehistory.py    (license)
def price_history_frame(mkt_id, year, month):
    """Returns price history as a DataFrame"""
    url = _build_url('pricehistory/PriceHistory_GetData.cfm')
    data = dict(Market_ID=mkt_id, Month='{:02d}'.format(month), Year=year)
    response = requests.post(url=url, data=data)
    index_cols = [iem.DATE, iem.CONTRACT]
    kwargs = dict(header=0, parse_dates=[iem.DATE], index_col=index_cols)
    try:
        dfs = pd.read_html(response.text, **kwargs)
    except ValueError:
        dfs = [pd.DataFrame()]

    # Expect a singleton list
    assert len(dfs) == 1

    # Remove duplicates, if any
    df = dfs[0]
    if len(df.index.unique()) != len(df.index):
        df = df.groupby(level=df.index.names).first()

    return df 
Example 7
Project: pyiem   Author: rheineke   File: pricehistory.py    (license)
def history_dates(mkt_id):
    url = _build_url('pricehistory/pricehistory_selectcontract.cfm')
    response = requests.get(url=url, params={'Market_ID': mkt_id})
    dfs = pd.read_html(response.text, index_col=0)

    # Expect a singleton list
    assert len(dfs) == 1

    df = dfs[0]

    mon_str = df.ix['Month:'][1]
    months = [dt.datetime.strptime(s[:3], '%b').month for s in mon_str.split()]
    year_str = df.ix['Year'][1]
    years = [int(s) for s in year_str.split()]

    return itertools.product(years, months) 
Example 8
Project: open-house-crawler   Author: data-skeptic   File: mass-populate.py    (license)
def run(query):
	r = requests.get('https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population')
	soup = BeautifulSoup(r.content)
	#
	tbl = soup.find_all('table')[3]
	df = pd.read_html(str(tbl))[0]
	#
	df.columns = df.iloc[0]
	#
	cities = df['City'].tolist()
	#
	for city in cities:
		i = city.find('[')
		if i != -1:
			city = city[0:i]
		city = city + ' ' + query
		print(city)
		populate.query_and_post(city)
		time.sleep(1) 
Example 9
Project: ShiPanE-Python-SDK   Author: sinall   File: client.py    (license)
def __query_new_stocks(self):
        DATA_URL = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
        html = lxml.html.parse(DATA_URL)
        res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
        if six.PY2:
            sarr = [etree.tostring(node) for node in res]
        else:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('<font color="red">*</font>', '')
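        # the XPath above yields bare <tr> rows, so re-wrap them in a <table>
        # element before handing the fragment to read_html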
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
        df = df.select(lambda x: x in [0, 1, 2, 3, 7], axis=1)
        df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
        df['code'] = df['code'].map(lambda x: str(x).zfill(6))
        df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
        return df 
Example 10
Project: stock   Author: datablood   File: reference.py    (license)
def _sz_hz(date='', retry_count=3, pause=0.001):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            request = Request(rv.MAR_SZ_HZ_URL%(ct.P_TYPE['http'], ct.DOMAINS['szse'],
                                    ct.PAGES['szsefc'], date))
            lines = urlopen(request, timeout = 10).read()
            if len(lines) <= 200:
                return pd.DataFrame()
            df = pd.read_html(lines, skiprows=[0])[0]
            df.columns = rv.MAR_SZ_HZ_COLS
            df['opDate'] = date
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG) 
Example 11
Project: cfb-comparison   Author: pythonforsports   File: app.py    (license)
def generate_sched_table(team, year, max_rows=20):
    df = pd.DataFrame(teams)
    filter_team = df.loc[df["TeamAlt"] == team]
    filter_team['ESPNID'] = "http://www.espn.com/college-football/team/fpi/_/id/" \
                            + filter_team.ESPNID.map(str) + "/year/" + str(year)
    link = filter_team.tail(1)['ESPNID'].values[0]
    sched_dataframe = pd.read_html(link, header=1)[4]
    sched_dataframe.columns = ['Date', 'Opponent', 'Result/Proj', 'Opp FPI', 'Game Rating']

    return html.Table(
        # Header1
        [html.Tr([
            html.Th(html.H6([team + ' ' + str(year) + ' ' + 'Schedule']), colSpan=5, style=dict(textAlign="center")),
        ])] +

        # Header2
        [html.Tr([html.Td(col) for col in sched_dataframe.columns], style=dict(fontWeight="bold"))] +

        # Body
        [html.Tr([
            html.Td(sched_dataframe.iloc[i][col]) for col in sched_dataframe.columns
        ]) for i in range(min(len(sched_dataframe), max_rows))]
    ) 
Example 12
Project: TuShare   Author: andyzsf   File: reference.py    (license)
def _dist_cotent(year, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            if pageNo > 0:
                ct._write_console()
            html = lxml.html.parse(rv.DP_163_URL%(ct.P_TYPE['http'], ct.DOMAINS['163'],
                     ct.PAGES['163dp'], year, pageNo))  
            res = html.xpath('//div[@class=\"fn_rp_list\"]/table')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows=[0])[0]
            df = df.drop(df.columns[0], axis=1)
            df.columns = rv.DP_163_COLS
            df['divi'] = df['plan'].map(_fun_divi)
            df['shares'] = df['plan'].map(_fun_into)
            df = df.drop('plan', axis=1)
            df['code'] = df['code'].astype(object)
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            pages = []
            if pageNo == 0:
                page = html.xpath('//div[@class=\"mod_pages\"]/a')
                if len(page)>1:
                    asr = page[len(page)-2]
                    pages = asr.xpath('text()')
        except Exception as e:
            print(e)
        else:
            if pageNo == 0:
                return df, pages[0] if len(pages)>0 else 0
            else:
                return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG) 
Example 13
Project: TuShare   Author: andyzsf   File: reference.py    (license)
def _get_forecast_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        html = lxml.html.parse(ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], 
                                                ct.PAGES['fd'], year, quarter, pageNo,
                                                ct.PAGE_NUM[1]))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('--', '0')
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df = df.drop([4, 5, 8], axis=1)
        df.columns = ct.FORECAST_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+',nextPage[0])[0]
            return _get_forecast_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
            print(e) 
Example 14
Project: TuShare   Author: andyzsf   File: reference.py    (license)
def _newstocks(data, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            html = lxml.html.parse(rv.NEW_STOCKS_URL%(ct.P_TYPE['http'],ct.DOMAINS['vsf'],
                         ct.PAGES['newstock'], pageNo))
            res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = sarr.replace('<font color="red">*</font>', '')
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
            df = df.drop([df.columns[idx] for idx in [1, 12, 13, 14]], axis=1)
            df.columns = rv.NEW_STOCKS_COLS
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()')
            tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8')
            hasNext = True if tag in res else False 
            data = data.append(df, ignore_index=True)
            pageNo += 1
            if hasNext:
                data = _newstocks(data, pageNo, retry_count, pause)
        except Exception as ex:
            print(ex)
        else:
            return data 
Example 15
Project: TuShare   Author: andyzsf   File: trading.py    (license)
def _parse_fq_data(url, index, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows = [0, 1])[0]
            if len(df) == 0:
                return pd.DataFrame()
            if index:
                df.columns = ct.HIST_FQ_COLS[0:7]
            else:
                df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == np.object:
                df['date'] = df['date'].astype(np.datetime64)
            df = df.drop_duplicates('date')
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG) 
Example 16
Project: TuShare   Author: andyzsf   File: billboard.py    (license)
def _cap_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):   
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[0],
                                               ct.PAGES['fd'], last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_GGTJ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _cap_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e) 
Example 17
Project: TuShare   Author: andyzsf   File: billboard.py    (license)
def _broker_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):   
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[1],
                                               ct.PAGES['fd'], last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_YYTJ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _broker_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e) 
Example 18
Project: TuShare   Author: andyzsf   File: billboard.py    (license)
def _inst_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):   
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[2],
                                               ct.PAGES['fd'], last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df = df.drop([2,3], axis=1)
            df.columns = rv.LHB_JGZZ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e) 
Example 19
Project: TuShare   Author: andyzsf   File: billboard.py    (license)
def _inst_detail(pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):   
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[3],
                                               ct.PAGES['fd'], '', pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_JGMX_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_detail(pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e) 
Example 20
Project: TuShare   Author: andyzsf   File: fundamental.py    (license)
def _get_profit_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.PROFIT_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                              ct.PAGES['fd'], year,
                                              quarter, pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns=ct.PROFIT_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_profit_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass 
Example 21
Project: TuShare   Author: andyzsf   File: fundamental.py    (license)
def _get_operation_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.OPERATION_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                 ct.PAGES['fd'], year,
                                                 quarter, pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns=ct.OPERATION_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_operation_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e) 
Example 22
Project: TuShare   Author: andyzsf   File: fundamental.py    (license)
def _get_growth_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.GROWTH_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                              ct.PAGES['fd'], year,
                                              quarter, pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns=ct.GROWTH_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_growth_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e) 
Example 23
Project: TuShare   Author: andyzsf   File: fundamental.py    (license)
def _get_debtpaying_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.DEBTPAYING_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                  ct.PAGES['fd'], year,
                                                  quarter, pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.DEBTPAYING_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_debtpaying_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e) 
Example 24
Project: TuShare   Author: andyzsf   File: fundamental.py    (license)
def _get_cashflow_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.CASHFLOW_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                ct.PAGES['fd'], year,
                                                quarter, pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.CASHFLOW_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_cashflow_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e) 
Example 25
Project: InplusTrader_Linux   Author: zhengwsh   File: func.py    (license)
def _parse_fq_data(url, index, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')
            
            sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            if sarr == '':
                return None
            df = pd.read_html(sarr, skiprows = [0, 1])[0]
            if len(df) == 0:
                return pd.DataFrame()
            if index:
                df.columns = ct.HIST_FQ_COLS[0:7]
            else:
                df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == np.object:
                df['date'] = df['date'].astype(np.datetime64)
            df = df.drop_duplicates('date')
        except ValueError as e:
            # read_html found no matching table (no data for this period)
            return None
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG) 
Example 26
Project: SSieve   Author: davidimprovz   File: daily.py    (license)
def checkStockNameChanges():
        """
        Handles the renaming of stocks in the DB that have been renamed. It will switch 
        both the name and the symbol for every table in the database. 

        Uses PANDAS to capture html tables from the NASDAQ listings containing symbol changes. 

        Returns the changes in a pandas DataFrame, if any. If there is a problem scraping NASDAQ,
        returns an error message. On a code error, returns a tuple of (False, error).
        """
        try:

            path = 'http://www.nasdaq.com/markets/stocks/symbol-change-history.aspx?page='
            ticker_changes = pd.DataFrame()
            for i in np.arange(100): # set the number high enough to catch all pages
                page = str(i+1)
                full_path = ''.join([path, page])
                symbol_changes = pd.read_html(full_path, header=0)[3] # note: index could change in future if html is restructured
                # concat all of the changes together
                if 'No records found.' not in symbol_changes.iloc[0][0]:
                    ticker_changes = pd.concat([ticker_changes, symbol_changes], ignore_index=True)
                else: break # drop out of loop if there's nothing left to capture

            ticker_changes.rename(columns={'Old Symbol': 'Old', 'New Symbol': 'New', 'Effective Date': 'Date'}, inplace=True)

            # check returned value
            assert isinstance(ticker_changes, pd.DataFrame), "Expected stock name changes to return pandas DataFrame. Got %r instead" % type(ticker_changes)

            return ticker_changes

        except Exception as e:
            return False, e

    # renameStocks()
    # ************** # 
Example 27
Project: coinbin.org   Author: kennethreitz   File: scraper.py    (license)
def get_coins():
    coins_db = OrderedDict()

    print(crayons.yellow('Scraping CoinMarketCap...'))

    r = session.get(url)
    html = pq(pq(r.content)('table')[0]).html()
    df = pandas.read_html("<table>{}</table>".format(html))
    df = pandas.concat(df)

    btc_value = float(df.to_dict()['Price'][0][1:].replace(',', ''))

    for row in df.itertuples():

        rank = int(row[1])
        name = ' '.join(row[2].split()[1:])
        ticker = row[3].lower()
        try:
            usd = float(row[5][1:].replace(',', ''))
        except ValueError:
            usd = 0
        finally:
            pass

        btc = convert_to_decimal(usd / btc_value)

        coins_db.update({ticker: {'rank': rank, 'name': name, 'ticker': ticker, 'usd': usd, 'btc': btc}})

    return coins_db 
Example 28
Project: esdata   Author: gjacopo   File: session.py    (license)
def read_html_table(self, url, **kwargs): 
        try:
            self.get_status(url)
        except:
            return None
        # set some default values (some are already default values for read_table)
        kwargs.update({'encoding': kwargs.get('encoding') or None})
        # run pandas...
        df = pd.read_html(url, **kwargs)
        return df
               
    #/************************************************************************/ 
Example 29
Project: Verum   Author: Data4Democracy   File: vhr_spider.py    (license)
def parse_rep_vote_history(self, response):
        # Some reps did not vote during a session. Test for the "Vote data is unavailable" message.
        # We capture the base information about the rep for later matching
        if "Vote data is unavailable" in response.css("#mainBody::text").extract()[3]:
            cur_url = response.url
            session_id, chamber, rep_id = self.get_session_chamber_rep_id(cur_url)
            url = cur_url
            self.rep_info.append([rep_id, session_id, chamber])
        
        #Otherwise, we process the body of text.
        else:
            title = response.xpath("""//*[@id="title"]/text()""").extract_first()

            rep_title, rep_short_name, rep_district = self.get_name_district(title)
            #Fetch the main table - they use nested tables, so have to use a direct reference.
            table_rows = response.css('#mainBody > table').extract()[0]
            
            #Parse the html table and select relevant info for the vote.
            pd_table = pd.read_html(table_rows, header=0, match="Doc.", attrs={'cellspacing':0})[0][['RCS\xa0#', 'Doc.','Vote','Result']]
            
            #Get session and chamber id from URL and assign to each row
            cur_url = response.url
            session_id, chamber, rep_id = self.get_session_chamber_rep_id(cur_url)
            pd_table['session_id'] = session_id
            pd_table['chamber'] = chamber
            pd_table['rep_id'] = rep_id
            pd_table['rep_title'] = rep_title
            pd_table['rep_short_name'] = rep_short_name
            pd_table['district'] = rep_district

            #Reorder columns
            pd_table = pd_table.reindex_axis(['session_id', 'chamber', 'rep_id', 'rep_short_name', 'rep_title', 'district', 'RCS\xa0#', 'Doc.', 'Vote', 'Result'], axis=1)

            return pd_table.to_dict(orient='records') 
Example 30
Project: electricitymap   Author: tmrowco   File: CA_AB.py    (license)
def fetch_price(country_code='CA-AB', session=None):
    """Requests the last known power price of a given country

    Arguments:
    country_code (optional) -- used in case a parser is able to fetch multiple countries
    session (optional)      -- request session passed in order to re-use an existing session

    Return:
    A dictionary in the form:
    {
      'countryCode': 'FR',
      'currency': EUR,
      'datetime': '2017-01-01T00:00:00Z',
      'price': 0.0,
      'source': 'mysource.com'
    }
    """

    r = session or requests.session()
    url = 'http://ets.aeso.ca/ets_web/ip/Market/Reports/SMPriceReportServlet?contentType=html/'
    response = r.get(url)
    df_prices = pd.read_html(response.text, match='Price', index_col=0, header=0)
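    # match='Price' keeps only tables whose text contains 'Price';
    # index 1 selects the second matching table, which holds the per-hour prices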
    prices = df_prices[1]

    data = {}

    for rowIndex, row in prices.iterrows():
        price = row['Price ($)']
        if (isfloat(price)):
            hours = int(rowIndex.split(' ')[1]) - 1
            data[rowIndex] = {
                'datetime': arrow.get(rowIndex, 'MM/DD/YYYY').replace(hours=hours, tzinfo=ab_timezone).datetime,
                'countryCode': country_code,
                'currency': 'CAD',
                'source': 'ets.aeso.ca',
                'price': float(price),
            }

    return [data[k] for k in sorted(data.keys())] 
Example 31
Project: electricitymap   Author: tmrowco   File: CA_AB.py    (license)
def fetch_exchange(country_code1='CA-AB', country_code2='CA-BC', session=None):
    """Requests the last known power exchange (in MW) between two countries

    Arguments:
    country_code (optional) -- used in case a parser is able to fetch multiple countries
    session (optional)      -- request session passed in order to re-use an existing session

    Return:
    A dictionary in the form:
    {
      'sortedCountryCodes': 'DK->NO',
      'datetime': '2017-01-01T00:00:00Z',
      'netFlow': 0.0,
      'source': 'mysource.com'
    }
    """

    r = session or requests.session()
    url = 'http://ets.aeso.ca/ets_web/ip/Market/Reports/CSDReportServlet'
    response = r.get(url)
    df_exchanges = pd.read_html(response.text, match='INTERCHANGE', skiprows=0, index_col=0)

    flows = {
        'CA-AB->CA-BC': df_exchanges[1][1]['British Columbia'],
        'CA-AB->CA-SK': df_exchanges[1][1]['Saskatchewan'],
        'CA-AB->US': df_exchanges[1][1]['Montana']
    }
    sortedCountryCodes = '->'.join(sorted([country_code1, country_code2]))
    if sortedCountryCodes not in flows:
        raise NotImplementedError('This exchange pair is not implemented')

    return {
        'datetime': arrow.now(tz=ab_timezone).datetime,
        'sortedCountryCodes': sortedCountryCodes,
        'netFlow': float(flows[sortedCountryCodes]),
        'source': 'ets.aeso.ca'
    } 
Example 32
Project: electricitymap   Author: tmrowco   File: CR.py    (license)
def fetch_production(country_code='CR', session=None):
    # Do not use an existing session because some caching takes place
    r = requests.session()
    url = 'https://appcenter.grupoice.com/CenceWeb/CencePosdespachoNacional.jsf'
    response = r.get(url)
    df_yesterday = pd.read_html(response.text, skiprows=1, index_col=0, header=0)[0]

    soup = BeautifulSoup(response.text, 'html.parser')
    yesterday_date = soup.select('#formPosdespacho:pickFechaInputDate')[0]['value']
    jsf_view_state = soup.select('#javax.faces.ViewState')[0]['value']

    yesterday = arrow.get(yesterday_date, 'DD/MM/YYYY', tzinfo=TIMEZONE)
    today = yesterday.shift(days=+1)

    data = [
        ('formPosdespacho', 'formPosdespacho'),
        ('formPosdespacho:pickFechaInputDate', today.format(DATE_FORMAT)),
        ('formPosdespacho:pickFechaInputCurrentDate', today.format(MONTH_FORMAT)),
        ('formPosdespacho:j_id35.x', ''),
        ('formPosdespacho:j_id35.y', ''),
        ('javax.faces.ViewState', jsf_view_state),
    ]
    response = r.post(url, cookies={}, data=data)
    df_today = pd.read_html(response.text, skiprows=1, index_col=0)[0]

    ydata = df_to_data(country_code, yesterday, df_yesterday)
    tdata = df_to_data(country_code, today, df_today)
    production = ydata + tdata
    unknown_plants()

    return production 
Example 33
Project: wikischolar   Author: evoapps   File: get.py    (license)
def convert_wiki_to_table(wiki_text, n_table=0):
    html_text = pypandoc.convert(wiki_text, 'html', 'mediawiki')
    tables = pandas.read_html(html_text)
    return tables[n_table] 
Example 34
Project: StockRecommendSystem   Author: doncat99   File: Fetch_Data_Stock_US_Short.py    (license)
def getSignleStockShortInfo(stock):
    df = pd.DataFrame()
    url = "http://shortsqueeze.com/?symbol=" + stock + "&submit=Short+Quote%E2%84%A2"
    repeat_times = 3
    downloadFailed = True

    for _ in range(repeat_times): 
        try:
            response = requests.get(url, timeout=15)
            downloadFailed = False
            break
        except Exception as e:
            print ("exception in get stock:" + stock, str(e))
            continue

    if downloadFailed:
        return "", df
    
    try:    
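        # attrs= restricts read_html to tables whose HTML attributes match the
        # given values, here the data tables on the shortsqueeze.com page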
        tables = pd.read_html(response.text, attrs={'cellpadding': '3', 'width': '100%'})
    except Exception as e:
        print ("exception in parse stock:" + stock, str(e))
        return "", df

    for table in tables:
        if df.empty:
            df = table
        else:
            df = pd.concat([df, table])
    df = df.reset_index(drop=True)
    #print(df)
        
    soup = BeautifulSoup(response.text, 'lxml')
    dateString = soup.find('span', {"style" : "color:#999999;font-family: verdana, arial, helvetica;font-size:10px"}).get_text()
    date = datetime.datetime.strptime(dateString, '%A %B %d, %Y')
    return date, df.T 
Example 35
Project: frenchy   Author: miroli   File: Commune.py    (license)
def __init__(self, code, year):
        self._geo = get_geo(code, year)
        self.url = url_resolver(code, year, self._geo['region_code'],
                                self._geo['department_code'])

        tables = pd.read_html(self.url, header=0, encoding='utf8', decimal=',',
                              thousands=' ')
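        # decimal=',' and thousands=' ' above let the French-formatted figures
        # parse as numbers instead of strings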
        self._parse(tables) 
Example 36
Project: pyiem   Author: rheineke   File: pricehistory.py    (license)
def read_quote_frames(mkt_conf):
    url = _market_quote_url(mkt_conf)
    response = requests.get(url=url)
    dfs = pd.read_html(response.text, index_col=0, header=0, na_values=['---'])

    # Data outside of the HTML tables
    table_headers = _table_headers(response.text)
    market_names = [_market_name(s) for s in table_headers]
    timestamps = [_timestamp(s) for s in table_headers]

    # Modify data frames
    mod_dfs = [_modify_frame(df, ts) for df, ts in zip(dfs, timestamps)]

    return OrderedDict((nm, df) for nm, df in zip(market_names, mod_dfs)) 
Example 37
Project: pyiem   Author: rheineke   File: session_test.py    (license)
def main():
    date_cols = [iem.ORDER_DATE, iem.EXPIRATION]
    kwargs = dict(index_col=iem.ORDER_DATE, parse_dates=date_cols)
    dfs = pd.read_html(table_text, **kwargs)
    df = dfs[0]

    oid_df = pd.DataFrame()
    cxl_o = iem.CANCEL_ORDER
    df[cxl_o] = df[cxl_o].combine_first(oid_df[cxl_o]) 
Example 38
Project: pyiem   Author: rheineke   File: session.py    (license)
def _frame(response, **kwargs):
    print(response.text)
    dfs = pd.read_html(response.text, **kwargs)

    # Expect a singleton list
    assert len(dfs) == 1

    return dfs[0] 
Example 39
Project: open-house-crawler   Author: data-skeptic   File: lambda_function.py    (license)
def parse_detail_page(b):
    prop = {'raw_address': '', 'bedrooms': -1, 'bathrooms': -1, "size_units": 'I', 'building_size': -1, 'price': -1, 'car_spaces': -1, 'listing_type': 'F', 'features': []}
    other_fields = ['Age', 'Association', 'Basement', 'Cooling', 'Fireplaces', 'Garages', 'Heating', 'Pool', 'Sewer', 'Taxes (Year)', 'Water']
    # TODO: use the extended fields, add them to the list of properties
    tables = b.findAll('table', {'class': 'cell'})
    if len(tables) > 0:
        prop['listing_timestamp'] = datetime.datetime.now()
        addr_rows = b.findAll('td', {'class': 'addr'})
        addr = ' '.join(map(lambda x: x.getText(), addr_rows))
        t = tables[0]
        df = pd.read_html(str(t))[0]
        data = dict(zip(df[0], df[1]))
        prop['raw_address'] = addr
        prop['bedrooms'] = int(data['Bedrooms'])
        prop['bathrooms'] = float(data['Full Baths'] + '.' + data['Partial Baths'])
        if data.has_key('Interior Sq Ft'):
            prop['building_size'] = int(data['Interior Sq Ft'])
        prop['price'] = float(data['Asking Price'].replace('$', '').replace(',', ''))
        if data.has_key('Parking'):
            try:
                prop['car_spaces'] = float(data['Parking'].replace('Cars', '').replace('Car', '').replace(' ', ''))
            except ValueError:
                prop['car_spaces'] = -1
        return [prop]
    else:
        return None

# Takes a string of the raw version of the page and extracts any links we might want to crawl 
Example 40
Project: open-house-crawler   Author: data-skeptic   File: example.py    (license)
def parse_detail_page(content):
    prop = {'raw_address': '', 'bedrooms': -1, 'bathrooms': -1, "size_units": 'I', 'building_size': -1, 'price': -1, 'car_spaces': -1, 'listing_type': 'F', 'features': []}
    other_fields = ['Age', 'Association', 'Basement', 'Cooling', 'Fireplaces', 'Garages', 'Heating', 'Pool', 'Sewer', 'Taxes (Year)', 'Water']
    # TODO: use the extended fields
    b = soup.BeautifulSoup(content)
    tables = b.findAll('table', {'class': 'cell'})
    if len(tables) > 0:
        prop['listing_timestamp'] = datetime.datetime.now()
        addr_rows = b.findAll('td', {'class': 'addr'})
        addr = ' '.join(map(lambda x: x.getText(), addr_rows))
        t = tables[0]
        df = pd.read_html(str(t))[0]
        data = dict(zip(df[0], df[1]))
        prop['raw_address'] = addr
        prop['bedrooms'] = int(data['Bedrooms'])
        prop['bathrooms'] = float(data['Full Baths'] + '.' + data['Partial Baths'])
        if data.has_key('Interior Sq Ft'):
            prop['building_size'] = int(data['Interior Sq Ft'])
        prop['price'] = float(data['Asking Price'].replace('$', '').replace(',', ''))
        if data.has_key('Parking'):
            try:
                prop['car_spaces'] = float(data['Parking'].replace('Cars', '').replace('Car', '').replace(' ', ''))
            except ValueError:
                prop['car_spaces'] = -1
        #for of in other_fields:
        #    if data.has_key(of):
        #        prop['features'].append({of: data[of]})
        return [prop]
    else:
        return None

# Takes a string of the raw version of the page and extracts any links we might want to crawl 
Example 41
Project: ShiPanE-Python-SDK   Author: sinall   File: stock.py    (license)
def new_stocks():
        url = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
        request = requests.get(url)
        doc = lxml.html.soupparser.fromstring(request.content, features='html.parser')
        table = doc.cssselect('table#NewStockTable')[0]
        table.remove(table.cssselect('thead')[0])
        table_html = lxml.html.etree.tostring(table).decode('utf-8')
        df = pd.read_html(table_html, skiprows=[0, 1])[0]
        df = df.select(lambda x: x in [0, 1, 2, 3, 7], axis=1)
        df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
        df['code'] = df['code'].map(lambda x: str(x).zfill(6))
        df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
        return df 
Example 42
Project: stock   Author: datablood   File: reference.py    (license)
def _dist_cotent(year, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            if pageNo > 0:
                ct._write_console()
            html = lxml.html.parse(rv.DP_163_URL%(ct.P_TYPE['http'], ct.DOMAINS['163'],
                     ct.PAGES['163dp'], year, pageNo))
            res = html.xpath('//div[@class=\"fn_rp_list\"]/table')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows=[0])[0]
            df = df.drop(df.columns[0], axis=1)
            df.columns = rv.DP_163_COLS
            df['divi'] = df['plan'].map(_fun_divi)
            df['shares'] = df['plan'].map(_fun_into)
            df = df.drop('plan', axis=1)
            df['code'] = df['code'].astype(object)
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            pages = []
            if pageNo == 0:
                page = html.xpath('//div[@class=\"mod_pages\"]/a')
                if len(page)>1:
                    asr = page[len(page)-2]
                    pages = asr.xpath('text()')
        except Exception as e:
            print(e)
        else:
            if pageNo == 0:
                return df, pages[0] if len(pages)>0 else 0
            else:
                return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG) 
Example 43
Project: stock   Author: datablood   File: reference.py    (license)
def _get_forecast_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        html = lxml.html.parse(ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                ct.PAGES['fd'], year, quarter, pageNo,
                                                ct.PAGE_NUM[1]))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('--', '0')
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df = df.drop([4, 5, 8], axis=1)
        df.columns = ct.FORECAST_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+',nextPage[0])[0]
            return _get_forecast_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
            print(e) 
Example 44
Project: stock   Author: datablood   File: reference.py    (license)
def _newstocks(data, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            html = lxml.html.parse(rv.NEW_STOCKS_URL%(ct.P_TYPE['http'],ct.DOMAINS['vsf'],
                         ct.PAGES['newstock'], pageNo))
            res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = sarr.replace('<font color="red">*</font>', '')
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
            df = df.drop([df.columns[idx] for idx in [1, 12, 13, 14]], axis=1)
            df.columns = rv.NEW_STOCKS_COLS
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()')
            tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8')
            hasNext = True if tag in res else False
            data = data.append(df, ignore_index=True)
            pageNo += 1
            if hasNext:
                data = _newstocks(data, pageNo, retry_count, pause)
        except Exception as ex:
            print(ex)
        else:
            return data 
Example 45
Project: stock   Author: datablood   File: trading.py    (license)
def _parse_fq_data(url, index, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows = [0, 1])[0]
            if len(df) == 0:
                return pd.DataFrame()
            if index:
                df.columns = ct.HIST_FQ_COLS[0:7]
            else:
                df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == np.object:
                df['date'] = df['date'].astype(np.datetime64)
            df = df.drop_duplicates('date')
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG) 
Example 46
Project: stock   Author: datablood   File: billboard.py    (license)
def _cap_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[0],
                                               ct.PAGES['fd'], last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_GGTJ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _cap_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e) 
Example 47
Project: stock   Author: datablood   File: billboard.py    (license)
def _broker_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[1],
                                               ct.PAGES['fd'], last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_YYTJ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _broker_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e) 
Example 48
Project: stock   Author: datablood   File: billboard.py    (license)
def _inst_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[2],
                                               ct.PAGES['fd'], last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df = df.drop([2,3], axis=1)
            df.columns = rv.LHB_JGZZ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e) 
Example 49
Project: stock   Author: datablood   File: billboard.py    (license)
def _inst_detail(pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[3],
                                               ct.PAGES['fd'], '', pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_JGMX_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_detail(pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e) 
Example 50
Project: stock   Author: datablood   File: fundamental.py    (license)
def _get_profit_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.PROFIT_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                              ct.PAGES['fd'], year,
                                              quarter, pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns=ct.PROFIT_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_profit_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass