Python pandas.read_html() Examples

The following are code examples showing how to use pandas.read_html(). They are taken from open source Python projects.
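pandas.read_html() accepts a URL, a file-like object, or a raw HTML string, parses every <table> element it can find, and returns a list of DataFrames, one per table. As a quick orientation before the project examples, here is a minimal sketch of the basic call pattern (the Wikipedia URL is only an illustration):

import pandas as pd

tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
print(len(tables))   # one DataFrame per <table> found on the page
df = tables[0]       # pick a table by position, or narrow the search with the match= argument
print(df.head())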

Example 1
Project: Stock_Market_Forecast   Author: cuevas1208   File: data_Load.py    MIT License
def getsp500():
    """
    List all S&P 500 companies.
    Returns a list of ticker symbols scraped from Wikipedia.
    Checks whether the cached CSV already exists; if not, it is created.
    """
    file_path = "../data/" + "sp500_list" + '.csv'
    if not (path.exists(file_path)):
        df = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")[0]
        df.columns = df.ix[0]
        df.drop(df.index[0], inplace=True)
        df.to_csv(file_path)
    else:
        # load list from CSV file
        # https://stackoverflow.com/questions/19699367/unicodedecodeerror-utf-8-codec-cant-decode-byte
        df = pd.read_csv(file_path, encoding = "ISO-8859-1")
    
    return df['Ticker symbol'].tolist() 
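Note that DataFrame.ix, used above to promote the first row to column labels, has been removed from modern pandas. A minimal alternative sketch that lets read_html promote the header row itself (it assumes the ticker column is still the first column of the Wikipedia table, which may change):

import pandas as pd

df = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies", header=0)[0]
tickers = df[df.columns[0]].tolist()   # assumed: the first column holds the ticker symbols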
Example 2
Project: open-house-crawler   Author: data-skeptic   File: mass-populate.py    GNU General Public License v3.0
def run(query):
	r = requests.get('https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population')
	soup = BeautifulSoup(r.content)
	#
	tbl = soup.find_all('table')[3]
	df = pd.read_html(str(tbl))[0]
	#
	df.columns = df.iloc[0]
	#
	cities = df['City'].tolist()
	#
	for city in cities:
		i = city.find('[')
		if i != -1:
			city = city[0:i]
		city = city + ' ' + query
		print(city)
		populate.query_and_post(city)
		time.sleep(1) 
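The BeautifulSoup-then-read_html hand-off in this example recurs throughout the page: isolate one table with BeautifulSoup, then let pandas parse it. A minimal self-contained sketch of that pattern (toy HTML, no network access):

from bs4 import BeautifulSoup
import pandas as pd

html = "<table><tr><th>City</th></tr><tr><td>Springfield</td></tr></table>"
soup = BeautifulSoup(html, "html.parser")
tbl = soup.find_all("table")[0]        # pick the table of interest
df = pd.read_html(str(tbl))[0]         # read_html still returns a list
print(df["City"].tolist())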
Example 3
Project: opt_trade   Author: ejpjapan   File: spx_data_update.py    MIT License
def scrape_sp5_div_yield():
    """Scrape S&P 500 dividend yield from www.multpl.com
    :rtype: pd.DataFrame
    """
    url = 'http://www.multpl.com/s-p-500-dividend-yield/table?f=m'
    # Package the request, send the request and catch the response: r
    raw_html_tbl = pd.read_html(url)
    dy_df = raw_html_tbl[0]
    # Clean up the dataframe
    dy_df.columns = dy_df.iloc[0]
    dy_df = dy_df.drop([0])
    dy_df[dy_df.columns[0]] = pd.to_datetime(dy_df.loc[:, dy_df.columns[0]],
                                             format='%b %d, %Y')
    dy_df = dy_df.set_index(dy_df.columns[0])
    dy_df = dy_df[dy_df.columns[0]]
    spx_dividend_yld = pd.to_numeric(dy_df.str.replace('%', '').str.replace('estimate', '').str.strip())
    spx_dividend_yld = spx_dividend_yld.reindex(spx_dividend_yld.index[::-1])
    spx_dividend_yld = spx_dividend_yld.resample('MS').bfill()
    return spx_dividend_yld 
Example 4
Project: QUANTAXIS   Author: QUANTAXIS   File: shipaneclient.py    MIT License
def __query_new_stocks(self):
        DATA_URL = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
        html = lxml.html.parse(DATA_URL)
        res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
        if six.PY2:
            sarr = [etree.tostring(node) for node in res]
        else:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('<font color="red">*</font>', '')
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
        df = df.select(lambda x: x in [0, 1, 2, 3, 7], axis=1)
        df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
        df['code'] = df['code'].map(lambda x: str(x).zfill(6))
        df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
        return df 
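The wrap-the-rows-in-a-<table> trick above (serialize the <tr> nodes scraped with lxml, re-wrap them in a table element, and hand the result to read_html via StringIO) is reused by many of the later snippets. A minimal self-contained sketch of the pattern (the row fragment is made up):

from io import StringIO
import pandas as pd

rows = '<tr><td>600000</td><td>10.50</td></tr>'   # stand-in for serialized <tr> nodes
df = pd.read_html(StringIO('<table>%s</table>' % rows))[0]
print(df)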
Example 5
Project: Pic2Brick   Author: lucaswo   File: pic2brick.py    MIT License
def get_colours():
    headers = {
        'User-Agent': "Mozilla/5.0"
    }

    r = requests.get("https://www.bricklink.com/catalogColors.asp", headers=headers)
    tree = BeautifulSoup(r.text, "lxml")
    html_table = tree.select("table")[3].select("table")[0]
    colour_table = pd.read_html(str(html_table), header=0)[0]
    colour_table = colour_table.drop(["Unnamed: 1", "Unnamed: 2"] , axis=1)
    rgb_table = pd.DataFrame([hextoint(td.attrs["bgcolor"]) for td in html_table.select("td[bgcolor]")], 
                             columns=["r", "g", "b"])
    colour_table = colour_table.merge(rgb_table, left_index=True, right_index=True)
    current_colours = colour_table[colour_table["For Sale"] > 600]

    return current_colours 
Example 6
Project: Pic2Brick   Author: lucaswo   File: pic2brick.py    MIT License
def get_part_list():
    headers = {
        'User-Agent': "Mozilla/5.0"
    }
    r = requests.get("https://www.bricklink.com/catalogList.asp?catType=P&catString=26", headers=headers)

    tree = BeautifulSoup(r.text, "lxml")

    html_table = tree.select("#ItemEditForm")[0].select("table")[1]
    part_table = pd.read_html(str(html_table), header=0)[0]
    part_table.drop("Image", axis=1, inplace=True)
    part_table.columns = ["ID", "Description"]
    part_table["Description"] = part_table["Description"].str.split("Cat").str[0].str[6:]
    part_table = part_table[part_table["Description"].str.len() < 10]

    return part_table 
Example 7
Project: xueqiu   Author: 1dot75cm   File: fund.py    MIT License
def get_fund_histories(code, begin: str = '-1m', end: str = arrow.now(), size: int = 40):
    """get fund history data.

    :param begin: the start date of the results.
            value: -1w -2w -1m -3m -6m -1y -2y -3y -5y cyear or YYYY-MM-DD
    :param end: (optional) the end date of the results, default is `now`.
    :param size: (optional) the number of results, default is `40`.
    """
    begin = str2date(begin).format('YYYY-MM-DD')
    end = arrow.get(end).format('YYYY-MM-DD')
    page = 1
    while True:
        params = dict(code=code, sdate=begin, edate=end, per=size, page=page)
        resp = sess.get(api.fund_history, params=params)
        data = js2obj(resp.text, 'apidata')
        navdf = pd.read_html(data['content'])[0].iloc[:,:4]
        navdf.iloc[:,0] = navdf.iloc[:,0].apply(lambda x: x.replace('*',''))
        if data['curpage'] == 1: df = navdf
        else: df = df.append(navdf)
        if data['pages'] == 1 or data['pages'] == page:
            break
        page += 1
    df.columns = ['date','nav','cnav','percent']
    df['date'] = pd.to_datetime(df['date'])
    return df.set_index('date').sort_index(axis=0) 
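This function, like several later ones, grows a DataFrame with df.append inside a loop; DataFrame.append was deprecated and then removed in pandas 2.0. A minimal sketch of the usual replacement, collecting the per-page frames in a list and concatenating once (the toy frames are only stand-ins for the per-page results above):

import pandas as pd

frames = [pd.DataFrame({'nav': [1.00]}), pd.DataFrame({'nav': [1.01]})]
combined = pd.concat(frames, ignore_index=True)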
Example 8
Project: spylon   Author: Valassis-Digital-Media   File: update_spark_params.py    BSD 3-Clause "New" or "Revised" License
def _fetch_documentation(version, base_url="https://spark.apache.org/docs"):
    doc_urls = [
        "{base_url}/{version}/configuration.html",
        "{base_url}/{version}/sql-programming-guide.html",
        "{base_url}/{version}/monitoring.html",
        "{base_url}/{version}/spark-standalone.html",
        "{base_url}/{version}/running-on-mesos.html",
        "{base_url}/{version}/running-on-yarn.html",
    ]

    for url in doc_urls:
        doc_url = url.format(version=version, base_url=base_url)
        # print(url)
        print("Loading spark properties from %s", doc_url)
        dfs = pd.read_html(doc_url, header=0)
        desired_cols = ["Property Name", "Default", "Meaning"]
        for df in dfs:
            if ("Property Name" in df) and ('Default' in df):
                for pn, default, desc in df[desired_cols].itertuples(index=False):
                    if type(default) == numpy.bool_:
                        default = bool(default)
                    yield pn, default, desc 
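A hedged usage sketch for the generator above; the version string "2.4.0" is only illustrative, and the Spark documentation pages must be reachable:

for prop_name, default, meaning in _fetch_documentation("2.4.0"):
    print(prop_name, "=", default)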
Example 9
Project: fund   Author: Frank-qlu   File: reference.py    Apache License 2.0
def _profit_divis(pageNo, dataArr, nextPage):
        ct._write_console()
        html = lxml.html.parse('%sdata.cfi.cn/%s'%(ct.P_TYPE['http'], nextPage))
        res = html.xpath("//table[@class=\"table_data\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('--', '0')
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr, skiprows=[0])[0]
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@id=\"content\"]/div[2]/a[last()]/@href')[0]
        np = nextPage.split('&')[2].split('=')[1]
        if pageNo < int(np):
            return _profit_divis(int(np), dataArr, nextPage)
        else:
            return dataArr 
Example 10
Project: fund   Author: Frank-qlu   File: trading.py    Apache License 2.0
def _today_ticks(symbol, tdate, pageNo, retry_count, pause):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            html = lxml.html.parse(ct.TODAY_TICKS_URL % (ct.P_TYPE['http'],
                                                         ct.DOMAINS['vsf'], ct.PAGES['t_ticks'],
                                                         symbol, tdate, pageNo
                                ))  
            res = html.xpath('//table[@id=\"datatbl\"]/tbody/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            sarr = sarr.replace('--', '0')
            df = pd.read_html(StringIO(sarr), parse_dates=False)[0]
            df.columns = ct.TODAY_TICK_COLUMNS
            df['pchange'] = df['pchange'].map(lambda x : x.replace('%', ''))
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG) 
Example 11
Project: tushare   Author: waditu   File: reference.py    BSD 3-Clause "New" or "Revised" License
def _profit_divis(pageNo, dataArr, nextPage):
        ct._write_console()
        html = lxml.html.parse('%sdata.cfi.cn/%s'%(ct.P_TYPE['http'], nextPage))
        res = html.xpath("//table[@class=\"table_data\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('--', '0')
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr, skiprows=[0])[0]
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@id=\"content\"]/div[2]/a[last()]/@href')[0]
        np = nextPage.split('&')[2].split('=')[1]
        if pageNo < int(np):
            return _profit_divis(int(np), dataArr, nextPage)
        else:
            return dataArr 
Example 12
Project: wanggeService   Author: pchaos   File: hsgtcg.py    MIT License
def scrap(url, browser, retryCount=2):
        """ 抓取网页table

        :param url: 网址
        :param browser: 浏览器
        :return: dataframe
        """
        try:
            while retryCount > 0:
                try:
                    browser.get(url)
                    time.sleep(random.random() / 4)
                    if 'thead' in browser.page_source:
                        break
                except Exception as e:
                    print(retryCount, e.args)
                    retryCount -= 1
                    if retryCount == 1:
                        mProxy.deleteProxy(myProxy)
            for x in ['lxml', 'xml', 'html5lib']:
                # With lxml versions above 4.1.1 the table may not be found, so try several parsers
                try:
                    soup = BeautifulSoup(browser.page_source, x)
                    table = soup.find_all(id='tb_cgtj')[0]
                    if table:
                        break
                except:
                    time.sleep(0.1)
                    print('using BeautifulSoup {}'.format(x))
            df = pd.read_html(str(table), header=1)[0]
            df.columns = ['tradedate', 'related', 'close', 'zd', 'hvol', 'hamount', 'hpercent', 'oneday', 'fiveday',
                          'tenday']
        except Exception as e:
            print(e.args)
            return pd.DataFrame()

        return df 
Example 13
Project: streamlit_finance_chart   Author: paduel   File: app.py    GNU General Public License v3.0
def load_data():
    components = pd.read_html('https://en.wikipedia.org/wiki/List_of_S'
                    '%26P_500_companies')[0]
    return components.drop('SEC filings', axis=1).set_index('Symbol') 
Example 14
Project: redfin_houses   Author: huangyunict   File: house_details.py    GNU Lesser General Public License v3.0
def parse_table(doc: pq) -> str:
    table = doc('table')
    table_html = "<table>{}</table>".format(table.html())
    df = pd.read_html(table_html)[0]
    return df.to_csv(index=False) 
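A hedged usage sketch for parse_table, assuming pyquery (imported as pq above) is installed; the HTML snippet is made up:

from pyquery import PyQuery as pq

doc = pq("<html><body><table><tr><th>Price</th></tr><tr><td>350000</td></tr></table></body></html>")
print(parse_table(doc))   # prints the table as CSV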
Example 15
Project: QFiPy   Author: kouzapo   File: fixed_income.py    MIT License
def get_yields():
	"""
	This function fetches the current yields of US Treasury securities
	and returns a dict with the values and the maturities as keys.
	Returns:
	-------
		yields: dict, a dictionary object containing the yield for each maturity.
	"""

	D = pd.read_html('https://www.treasury.gov/resource-center/data-chart-center/interest-rates/Pages/TextView.aspx?data=yield')[1]

	maturities_keys = ['1 mo', '2 mo', '3 mo', '6 mo', '1 yr', '2 yr', '3 yr', '5 yr', '7 yr', '10 yr', '20 yr', '30 yr']
	yields = {m: round(float(D[m].iloc[-1]) / 100, 4) for m in maturities_keys}

	return yields 
Example 16
Project: QFiPy   Author: kouzapo   File: utilities.py    MIT License
def get_DJI_symbols():
	"""
	This function downloads the symbols of the Dow Jones index and saves them in a .dat file.
	"""

	f = open('data/symbols_files/DJI_symbols.dat', 'w')
	DJI_list = pd.read_html('https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average')

	for symbol in DJI_list[1]['Symbol']:
		f.write(symbol + '\n')

	f.close() 
Example 17
Project: QFiPy   Author: kouzapo   File: utilities.py    MIT License
def get_GSPC_symbols():
	"""
	This function downloads the symbols of the S&P 500 index and saves them in a .dat file.
	"""

	f = open('data/symbols_files/GSPC_symbols.dat', 'w')
	GSPC_list = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')

	for symbol in GSPC_list[0]['Symbol']:
		f.write(symbol + '\n')

	f.close() 
Example 18
Project: QFiPy   Author: kouzapo   File: utilities.py    MIT License
def get_GDAXI_symbols():
	"""
	This function downloads the symbols of the DAX index and saves them in a .dat file.
	"""

	f = open('data/symbols_files/GDAXI_symbols.dat', 'w')
	GDAXI_list = pd.read_html('https://en.wikipedia.org/wiki/DAX')

	for symbol in GDAXI_list[2]['Ticker symbol']:
		f.write(symbol + '.DE\n')

	f.close() 
Example 19
Project: open-house-crawler   Author: data-skeptic   File: example.py    GNU General Public License v3.0
def parse_detail_page(content):
    prop = {'raw_address': '', 'bedrooms': -1, 'bathrooms': -1, "size_units": 'I', 'building_size': -1, 'price': -1, 'car_spaces': -1, 'listing_type': 'F', 'features': []}
    other_fields = ['Age', 'Association', 'Basement', 'Cooling', 'Fireplaces', 'Garages', 'Heating', 'Pool', 'Sewer', 'Taxes (Year)', 'Water']
    # TODO: use the extended fields
    b = soup.BeautifulSoup(content)
    tables = b.findAll('table', {'class': 'cell'})
    if len(tables) > 0:
        prop['listing_timestamp'] = datetime.datetime.now()
        addr_rows = b.findAll('td', {'class': 'addr'})
        addr = ' '.join(map(lambda x: x.getText(), addr_rows))
        t = tables[0]
        df = pd.read_html(str(t))[0]
        data = dict(zip(df[0], df[1]))
        prop['raw_address'] = addr
        prop['bedrooms'] = int(data['Bedrooms'])
        prop['bathrooms'] = float(data['Full Baths'] + '.' + data['Partial Baths'])
        if data.has_key('Interior Sq Ft'):
            prop['building_size'] = int(data['Interior Sq Ft'])
        prop['price'] = float(data['Asking Price'].replace('$', '').replace(',', ''))
        if data.has_key('Parking'):
            try:
                prop['car_spaces'] = float(data['Parking'].replace('Cars', '').replace('Car', '').replace(' ', ''))
            except ValueError:
                prop['car_spaces'] = -1
        #for of in other_fields:
        #    if data.has_key(of):
        #        prop['features'].append({of: data[of]})
        return [prop]
    else:
        return None

# Takes a string of the raw version of the page and extracts any links we might want to crawl 
Example 20
Project: open-house-crawler   Author: data-skeptic   File: lambda_function.py    GNU General Public License v3.0
def parse_detail_page(b):
    prop = {'raw_address': '', 'bedrooms': -1, 'bathrooms': -1, "size_units": 'I', 'building_size': -1, 'price': -1, 'car_spaces': -1, 'listing_type': 'F', 'features': []}
    other_fields = ['Age', 'Association', 'Basement', 'Cooling', 'Fireplaces', 'Garages', 'Heating', 'Pool', 'Sewer', 'Taxes (Year)', 'Water']
    # TODO: use the extended fields, add them to the list of properties
    tables = b.findAll('table', {'class': 'cell'})
    if len(tables) > 0:
        prop['listing_timestamp'] = datetime.datetime.now()
        addr_rows = b.findAll('td', {'class': 'addr'})
        addr = ' '.join(map(lambda x: x.getText(), addr_rows))
        t = tables[0]
        df = pd.read_html(str(t))[0]
        data = dict(zip(df[0], df[1]))
        prop['raw_address'] = addr
        prop['bedrooms'] = int(data['Bedrooms'])
        prop['bathrooms'] = float(data['Full Baths'] + '.' + data['Partial Baths'])
        if data.has_key('Interior Sq Ft'):
            prop['building_size'] = int(data['Interior Sq Ft'])
        prop['price'] = float(data['Asking Price'].replace('$', '').replace(',', ''))
        if data.has_key('Parking'):
            try:
                prop['car_spaces'] = float(data['Parking'].replace('Cars', '').replace('Car', '').replace(' ', ''))
            except ValueError:
                prop['car_spaces'] = -1
        return [prop]
    else:
        return None

# Takes a string of the raw version of the page and extracts any links we might want to crawl 
Example 21
Project: stock-analysis   Author: stefmolin   File: stock_reader.py    MIT License
def get_bitcoin_data(self):
        """
        Get bitcoin historical OHLC data from coinmarketcap.com for given date range.

        Returns:
            A pandas dataframe with the bitcoin data.
        """
        return pd.read_html(
            'https://coinmarketcap.com/currencies/bitcoin/historical-data/?'
            'start={}&end={}'.format(
                self.start, self.end
            ),
            parse_dates=[0],
            index_col=[0]
        )[0].sort_index() 
Example 22
Project: zvt   Author: zvtvz   File: china_index_list_spider.py    MIT License
def fetch_cni_index(self) -> None:
        """
        Fetch the list of CNI indices (cnindex.com.cn)
        """
        url = 'http://www.cnindex.com.cn/zstx/jcxl/'
        response = requests.get(url)
        response.encoding = 'utf-8'
        dfs = pd.read_html(response.text)

        # Tables after the 9th are non-equity indices
        dfs = dfs[1:9]

        result_df = pd.DataFrame()
        for df in dfs:
            header = df.iloc[0]
            df = df[1:]
            df.columns = header
            df.astype('str')

            result_df = pd.concat([result_df, df])

        result_df = result_df.drop('样本股数量', axis=1)
        result_df.columns = ['name', 'code', 'timestamp', 'base_point', 'list_date']
        result_df['timestamp'] = result_df['timestamp'].apply(lambda x: x.replace('-', ''))
        result_df['list_date'] = result_df['list_date'].apply(lambda x: x.replace('-', ''))
        result_df['category'] = 'csi'
        result_df = result_df.loc[result_df['code'].str.contains(r'^\d{6}$')]

        self.persist_index(result_df)
        self.logger.info('CNI index list fetch complete...')

        # Fetch the CNI index constituents
        self.fetch_cni_index_component(result_df)
        self.logger.info('CNI index constituents fetch complete...') 
Example 23
Project: igrfcoord   Author: space-physics   File: __init__.py    GNU General Public License v3.0
def _table(page: str, out: str) -> Tuple[float, float]:
    tab = pandas.read_html(page, header=0, index_col=0)[0]
    tab.dropna(axis=0, how='all', inplace=True)

    if out == 'g' or out == 'geodetic':
        tag = 'Geographic'
    elif out == 'm' or out.startswith('geomag'):
        tag = 'Geomagnetic'
    else:
        raise ValueError('coordinate type must be "geomag" or "geodetic"')

    s = tab.at[tag, 'Latitude']
    if s[-1] == 'N':
        mlat = float(s[:-1])
    elif s[-1] == 'S':
        mlat = -float(s[:-1])
    else:
        raise ValueError(f'I expected N or S but got {s[-1]}')

    s = tab.at[tag, 'Longitude']
    if s[-1] == 'W':
        mlon = -float(s[:-1])
    elif s[-1] == 'E':
        mlon = float(s[:-1])
    else:
        raise ValueError(f'I expected E or W but got {s[-1]}')

    return mlat, mlon  # float must be above for - operator 
Example 24
Project: create-database   Author: nst-guide   File: opencellid.py    GNU General Public License v3.0
def download_mobile_network_codes(self):
        url = 'https://en.wikipedia.org/wiki/Mobile_Network_Codes_in_ITU_region_3xx_(North_America)'
        # Get the Wikipedia table with a row that matches "Verizon Wireless"
        dfs = pd.read_html(url, match='Verizon Wireless')
        assert len(
            dfs) == 1, 'More than one match in wikipedia cell network tables'
        df = dfs[0]

        path = self.save_dir / 'network_codes.csv'
        df.to_csv(path, index=False) 
Example 25
Project: AutoLeague_ESPN   Author: asobey   File: parse_waiver.py    GNU General Public License v3.0
def create_waiver(self, waiver_source_dict):
        """Listing of "position" playerIds on the waiver. Excludes players not playing this week (BYE or real life FA)"""

        df = pd.DataFrame()

        for start_index in range(0, len(waiver_source_dict)):  #
            try:  # fix this as it loses out on the last page i think
                soup = BeautifulSoup(waiver_source_dict[start_index].content, 'html.parser')
                table = soup.find('table', class_='playerTableTable')
                tdf = pd.read_html(str(table), flavor='bs4')[0]  # returns a list of df's, grab first
                tdf = tdf.iloc[2:, [0, 2, 5, 6, 8, 9, 10, 11, 13, 14, 15, 16, 17]]  # Identify the columns you want to keep by deleting the useless columns
                table_line = str(soup.find_all("td", {"class": "playertablePlayerName"}))
                tdf['ID'] = list(map(int, re.findall('playername_(\d+)', table_line)))
                df = df.append(tdf, ignore_index=True,
                               sort=False)  # !!!! non-concatenation axis is not aligned. remove the "sort=false" to troubleshoot
            except:
                pass
        df.columns = ['Player', 'Waiver Day', 'Team', 'Game Time', 'PRK', 'PTS', 'AVG', 'LAST', 'PROJ', 'OPRK', '%ST',
                      '%OWN', '+/-', 'ID']
        df['POS'] = df['Player'].str.split(',').str[1]  # parse out the position, part 1
        # if df['POS'].str.split().str[2] it contains the injury status, i think (Q, O, etc.)
        df['POS'] = df['POS'].str.split().str[1]  # parse out the position (might be a better way of doing this)
        df['Player'] = df['Player'].str.split(',').str[0]  # keep just player name
        df.query('PROJ != "--"', inplace=True)  # Delete the players with "--", as these are not playing this week
        df['PROJ'] = df['PROJ'].fillna(0).astype('float')  #
        col_order = ['Player', 'POS', 'Waiver Day', 'Team', 'Game Time', 'PRK', 'PTS', 'AVG', 'LAST', 'PROJ', 'OPRK', '%ST',
                      '%OWN', '+/-', 'ID'] # Changing col order for user UPDATE!!!!
        dfaa = df[col_order]  # there is a better way to do this
        # print(tabulate(dfaa, headers='keys', tablefmt='psg1'))

        return dfaa 
Example 26
Project: AutoLeague_ESPN   Author: asobey   File: a2.py    GNU General Public License v3.0
def create_waiver(position='none'):
    """Listing of "position" playerIds on the waiver. Excludes players not playing this week (BYE or real life FA)"""
    cookies = {
        'espn_s2': privateData['espn_s2'],
        'SWID': privateData['SWID']
    }

    # slot codes used to get the right page
    slots = {'QB': 0, 'RB': 2, 'RB/WR': 3, 'WR': 4, 'TE': 6, 'D/ST': 16, 'K': 17, 'FLEX': 23}

    if position == 'none':
        parameters = {'leagueId': privateData['leagueid'], 'teamID': privateData['teamid'],
                      'avail': 1, 'injury': 2, 'context': 'freeagency', 'view': 'overview'}
    else:
        parameters = {'leagueId': privateData['leagueid'], 'teamID': privateData['teamid'],
                      'slotCategoryId': position, 'avail': 1, 'injury': 2, 'context': 'freeagency',
                      'view': 'overview'}

    df = pd.DataFrame(columns=['Projected', 'PlayerId'])

    for si in [0, 50, 100]:  # just use the first pages for now. not sure why the third isn't working with si 50 and 100
        try:
            parameters['startIndex'] = si
            r = requests.get('http://games.espn.com/ffl/freeagency',
                             params=parameters,
                             cookies=cookies)

            soup = BeautifulSoup(r.content, 'html.parser')
            table = soup.find('table', class_='playerTableTable')
            tdf = pd.read_html(str(table), flavor='bs4')[0]  # returns a list of df's, grab first
            tdf = tdf.iloc[2:, [13]]  # delete the useless columns
            tdf = add_player_id(tdf, soup)
            tdf.columns = ['Projected', 'PlayerId']
            df = df.append(tdf, ignore_index=True, sort=False)  # !!!! non-concatenation axis is not aligned. remove the "sort=false" to troubleshoot
        except:
            pass
    df.query('Projected != "--"', inplace=True)  # Delete the players with "--"
    df['Projected'] = df['Projected'].fillna(0).astype('float')  #

    return df 
Example 27
Project: AutoLeague_ESPN   Author: asobey   File: api_tests.py    GNU General Public License v3.0
def create_leaders():

    cookies = {
        'espn_s2': privateData['espn_s2'],
        'SWID': privateData['SWID']
    }

    r = requests.get('http://games.espn.com/ffl/leaders',
                     params={'leagueId': 413011, 'seasonId': 2018,
                             'scoringPeriodId': 1,
                             'slotCategoryId': 0},
                     cookies=cookies)

    soup = BeautifulSoup(r.content, 'html.parser')
    table = soup.find('table', class_='playerTableTable')
    tdf = pd.read_html(str(table), flavor='bs4')[0]  # returns a list of df's, grab first

    print(tabulate(tdf, headers='keys', tablefmt='psg1'))

    tdf = tdf.drop([1, 3, 4, 7, 12, 16, 21, 25], axis=1)  # remove useless rows and columns

    print(tabulate(tdf, headers='keys', tablefmt='psg1'))

    tdf = tdf.drop([0]).reset_index(drop=True)  # drop 1st row (now column headers) and reindex
    tdf.columns = tdf.iloc[0]  # make 1st row the column headers
    tdf = tdf.drop([0]).reset_index(drop=True)  # drop 1st row (now column headers) and reindex
    # fix column header labels with something like tdf[2][1] = 'POS'
    # consider tdf.dropna(subset=[1]) to drop nan columns?

    table_str = tabulate(tdf, headers='keys', tablefmt='psql')
    # print(table_str)

    return 0 
Example 28
Project: StockRecommendSystem   Author: doncat99   File: Fetch_Data_Stock_US_Short.py    MIT License
def getSignleStockShortInfo(stock):
    df = pd.DataFrame()
    url = "http://shortsqueeze.com/?symbol=" + stock + "&submit=Short+Quote%E2%84%A2"
    repeat_times = 3
    downloadFailed = True

    for _ in range(repeat_times): 
        try:
            response = requests.get(url, timeout=15)
            downloadFailed = False
            break
        except Exception as e:
            print ("exception in get stock:" + stock, str(e))
            continue

    if downloadFailed:
        return "", df
    
    try:    
        tables = pd.read_html(response.text, attrs={'cellpadding': '3', 'width': '100%'})
    except Exception as e:
        print ("exception in parse stock:" + stock, str(e))
        return "", df

    for table in tables:
        if df.empty:
            df = table
        else:
            df = pd.concat([df, table])
    df.reset_index(drop=True, inplace=True)  # reassigning the inplace call would set df to None
    #print(df)
        
    soup = BeautifulSoup(response.text, 'lxml')
    dateString = soup.find('span', {"style" : "color:#999999;font-family: verdana, arial, helvetica;font-size:10px"}).get_text()
    date = datetime.datetime.strptime(dateString, '%A %B %d, %Y')
    return date, df.T 
Example 29
Project: mvp-predict   Author: sidharthrajaram   File: dailypredictions.py    MIT License
def getPlayerDataFrames(name):
    player_name = name.lower()
    ln_fi = player_name.find(' ') + 1  # index of first initial of last name
    first = player_name[:2]
    last = player_name[ln_fi:ln_fi + 5]

    url = "https://www.basketball-reference.com/players/" + player_name[ln_fi] + "/" + last + first + "01.html"
    if(name=='Anthony Davis'):
        url = "https://www.basketball-reference.com/players/d/davisan02.html"

    with urllib.request.urlopen(url) as response:
        # UTF-8 doesn't support some initial character on the websites for some reason!
        r = response.read().decode('latin-1')

    content = re.sub(r'(?m)^\<!--.*\n?', '', r)
    content = re.sub(r'(?m)^\-->.*\n?', '', content)

    soup = BeautifulSoup(content, 'html.parser')
    tables = soup.findAll('table')

    # these table indices depend on the website format and may need changing
    reg_table = tables[0]
    adv_table = tables[4]

    reg_df = pd.read_html(str(reg_table))[0]
    adv_df = pd.read_html(str(adv_table))[0]

    reg_header = reg_df.columns.values.tolist()
    adv_header = adv_df.columns.values.tolist()
    return reg_df, adv_df 
Example 30
Project: mvp-predict   Author: sidharthrajaram   File: forecast.py    MIT License
def getPlayerStats(name, advanced=True):
    player_name = name.lower()
    ln_fi = player_name.find(' ') + 1  # index of first initial of last name
    first = player_name[:2]
    last = player_name[ln_fi:ln_fi + 5]

    url = "https://www.basketball-reference.com/players/" + player_name[ln_fi] + "/" + last + first + "01.html"
    if(name=='Anthony Davis'):
        url = "https://www.basketball-reference.com/players/d/davisan02.html"
    elif(name=='Clint Capela'):
        url = "https://www.basketball-reference.com/players/c/capelca01.html"
    elif(name=='D\'Angelo Russell'):
        url = "https://www.basketball-reference.com/players/r/russeda01.html"
    elif(name=='Kemba Walker'):
        url = "https://www.basketball-reference.com/players/w/walkeke02.html"
    print(url)
    with urllib.request.urlopen(url) as response:
        # UTF-8 doesn't support some initial character on the websites for some reason!
        r = response.read().decode('latin-1')

    content = re.sub(r'(?m)^\<!--.*\n?', '', r)
    content = re.sub(r'(?m)^\-->.*\n?', '', content)

    soup = BeautifulSoup(content, 'html.parser')
    tables = soup.findAll('table')

    if advanced:
        table = tables[4]
    else:
        table = tables[0]

    df = pd.read_html(str(table))[0]
    return df 
Example 31
Project: gugu   Author: TabQ   File: reference.py    Apache License 2.0
def __handleDistriPlan(self, year, pageNo, retry, pause):
        for _ in range(retry):
            time.sleep(pause)
            
            try:
                if pageNo > 0:
                    self._writeConsole()
                    
                # http://quotes.money.163.com/data/caibao/fpyg.html?reportdate=2018&sort=declaredate&order=desc&page=0
                html = lxml.html.parse(cf.DP_163_URL % (year, pageNo))  
                res = html.xpath('//table[@class=\"fn_cm_table\"]/tr')
                if self._PY3:
                    sarr = [etree.tostring(node).decode('utf-8') for node in res]
                else:
                    sarr = [etree.tostring(node) for node in res]
                sarr = ''.join(sarr)
                sarr = '<table>%s</table>' % sarr
                df = pd.read_html(sarr)[0]
                df = df.drop(0, axis=1)
                df.columns = cf.DP_163_COLS
                df['divi'] = df['plan'].map(self.__bonus)
                df['shares'] = df['plan'].map(self.__gift)
                df = df.drop('plan', axis=1)
                df['code'] = df['code'].astype(object)
                df['code'] = df['code'].map(lambda x : str(x).zfill(6))
                pages = []
                if pageNo == 0:
                    page = html.xpath('//div[@class=\"mod_pages\"]/a')
                    if len(page)>1:
                        asr = page[len(page)-2]
                        pages = asr.xpath('text()')
            except Exception as e:
                print(e)
            else:
                if pageNo == 0:
                    return df, pages[0] if len(pages)>0 else 0
                else:
                    return df
                
        raise IOError(cf.NETWORK_URL_ERROR_MSG) 
Example 32
Project: gugu   Author: TabQ   File: reference.py    Apache License 2.0
def __handleForecast(self, year, quarter, pageNo, dataArr, retry, pause):
        self._writeConsole()
        
        for _ in range(retry):
            time.sleep(pause)
            
            try:
                # http://vip.stock.finance.sina.com.cn/q/go.php/vFinanceAnalyze/kind/performance/index.phtml?s_i=&s_a=&s_c=&s_type=&reportdate=2018&quarter=3&p=1&num=60
                request = self._session.get( cf.FORECAST_URL%( year, quarter, pageNo, cf.PAGE_NUM[1]), timeout=10 )
                request.encoding = 'gbk'
                text = request.text.replace('--', '')
                html = lxml.html.parse(StringIO(text))
                res = html.xpath("//table[@class=\"list_table\"]/tr")
                if self._PY3:
                    sarr = [etree.tostring(node).decode('utf-8') for node in res]
                else:
                    sarr = [etree.tostring(node) for node in res]
                sarr = ''.join(sarr)
                sarr = '<table>%s</table>'%sarr
                df = pd.read_html(sarr)[0]
                df = df.drop([4, 5, 8], axis=1)
                df.columns = cf.FORECAST_COLS
                dataArr = dataArr.append(df, ignore_index=True)
                nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
                if len(nextPage)>0:
                    pageNo = re.findall(r'\d+',nextPage[0])[0]
                    return self.__handleForecast(year, quarter, pageNo, dataArr, retry, pause)
                else:
                    return dataArr
            except Exception as e:
                    print(e)
                    
        raise IOError(cf.NETWORK_URL_ERROR_MSG) 
Example 33
Project: gugu   Author: TabQ   File: reference.py    Apache License 2.0
def __handleIpo(self, data, pageNo, retry, pause):
        self._writeConsole()

        for _ in range(retry):
            time.sleep(pause)

            try:
                # http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc
                html = lxml.html.parse(cf.NEW_STOCKS_URL % pageNo)
                res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
                if not res:
                    return data
                
                if self._PY3:
                    sarr = [etree.tostring(node).decode('utf-8') for node in res]
                else:
                    sarr = [etree.tostring(node) for node in res]
                sarr = ''.join(sarr)
                sarr = sarr.replace('<font color="red">*</font>', '')
                sarr = '<table>%s</table>'%sarr
                df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
                df = df.drop([df.columns[idx] for idx in [12, 13, 14, 15]], axis=1)
                df.columns = cf.NEW_STOCKS_COLS
                df['code'] = df['code'].map(lambda x : str(x).zfill(6))
                df['xcode'] = df['xcode'].map(lambda x : str(x).zfill(6))
                res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()')
                tag = '下一页' if self._PY3 else unicode('下一页', 'utf-8')
                hasNext = True if tag in res else False 
                data = data.append(df, ignore_index=True)
                pageNo += 1
                if hasNext:
                    data = self.__handleIpo(data, pageNo, retry, pause)
            except Exception as ex:
                print(ex)
            else:
                return data 
Example 34
Project: gugu   Author: TabQ   File: stockinfo.py    Apache License 2.0
def __handleStockProfiles(self, dataArr, date, page, retry, pause):
        self._writeConsole()
        
        for _ in range(retry):
            time.sleep(pause)
            
            try:
                html = lxml.html.parse(cf.ALL_STOCK_PROFILES_URL % (date, page))
                res = html.xpath('//table[@id="myTable04"]/tbody/tr')
                if not res:
                    return dataArr
                
                if self._PY3:
                    sarr = [etree.tostring(node).decode('utf-8') for node in res]
                else:
                    sarr = [etree.tostring(node) for node in res]
                sarr = ''.join(sarr)
                sarr = '<table>%s</table>' % sarr
                
                df = pd.read_html(sarr)[0]
                df = df.drop([0, 3, 5, 6, 7, 10, 11], axis = 1)
                df.columns = cf.ALL_STOCK_PROFILES_COLS
                df['code'] = df['code'].map(lambda x: str(x).zfill(6))
                
                dataArr = dataArr.append(df, ignore_index=True)
            except Exception as e:
                print(e)
            else:
                return self.__handleStockProfiles(dataArr, date, page+1, retry, pause) 
Example 35
Project: gugu   Author: TabQ   File: stockinfo.py    Apache License 2.0
def __parsePage(self, url, year, quarter, page, column, dataArr, retry, pause, drop_column=None):
        self._writeConsole()
        
        for _ in range(retry):
            time.sleep(pause)
            
            try:
                request = self._session.get( url % (year, quarter, page, cf.PAGE_NUM[1]), timeout=10 )
                request.encoding = 'gbk'
                text = request.text.replace('--', '')
                html = lxml.html.parse(StringIO(text))
                res = html.xpath("//table[@class=\"list_table\"]/tr")
                if self._PY3:
                    sarr = [etree.tostring(node).decode('utf-8') for node in res]
                else:
                    sarr = [etree.tostring(node) for node in res]
                sarr = ''.join(sarr)
                sarr = '<table>%s</table>'%sarr
                df = pd.read_html(sarr)[0]
                if drop_column is not None:
                    df = df.drop(drop_column, axis=1)
                df.columns = column
                dataArr = dataArr.append(df, ignore_index=True)
                nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
                if len(nextPage) > 0:
                    page = re.findall(r'\d+', nextPage[0])[0]
                    return self.__parsePage(url, year, quarter, page, column, dataArr, retry, pause, drop_column)
                else:
                    return dataArr
            except Exception as e:
                print(e)
                
        raise IOError(cf.NETWORK_URL_ERROR_MSG) 
Example 36
Project: gugu   Author: TabQ   File: billboard.py    Apache License 2.0
def __parsePage(self, kind, last, column, dataArr, pageNo=1, retry=3, pause=0.001, drop_column=None):
        self._writeConsole()
        
        for _ in range(retry):
            time.sleep(pause)
            
            try:
                request = self._session.get( cf.LHB_SINA_URL % (kind, last, pageNo), timeout=10 )
                request.encoding = 'gbk'
                html = lxml.html.parse(StringIO(request.text))
                res = html.xpath("//table[@id=\"dataTable\"]/tr")
                if self._PY3:
                    sarr = [etree.tostring(node).decode('utf-8') for node in res]
                else:
                    sarr = [etree.tostring(node) for node in res]
                sarr = ''.join(sarr)
                sarr = '<table>%s</table>'%sarr
                df = pd.read_html(sarr)[0]
                if drop_column is not None:
                    df = df.drop(drop_column, axis=1)
                df.columns = column
                dataArr = dataArr.append(df, ignore_index=True)
                nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
                if len(nextPage) > 0:
                    pageNo = re.findall(r'\d+', nextPage[0])[0]
                    return self.__parsePage(kind, last, column, dataArr, pageNo, retry, pause, drop_column)
                else:
                    return dataArr
            except Exception as e:
                print(e) 
Example 37
Project: nakamoto-coefficient   Author: YazzyYaz   File: market.py    MIT License
def generate_market_data(self):
        session = HTMLSession()
        r = session.get(self.market_url)
        table = r.html.find('#markets-table', first=True)
        session.close()
        market_df = pd.read_html(table.html)[0]
        volume_data = market_df['Volume (24h)']
        volume_data = volume_data.str.replace('$', '', regex=False)
        volume_data = volume_data.str.replace('*** ', '', regex=False)
        volume_data = volume_data.str.replace('** ', '', regex=False)
        volume_data = volume_data.str.replace('* ', '', regex=False)
        volume_data = volume_data.str.replace(',', '', regex=False)
        volume_data = pd.to_numeric(volume_data)
        volume_data = volume_data.sort_values()
        self.data = np.array(volume_data) 
Example 38
Project: opt_trade   Author: ejpjapan   File: option_daily_prod.py    MIT License
def get_dividend_yield():
        """Gets latest dividend yield"""
        # TO DO: Add check on date of latest dividend yield
        # TO DO: Change to RSL2 dividend yield
        # dividend_yield_history = DividendYieldHistory()
        # dividend_yield = dividend_yield_history.dy_monthly[-1] / 100
        print('Warning: RSL2 Using Fixed Dividend yield')
        dividend_yield = 0.0134

        return dividend_yield

#
# class _emfOptionAsset(OptionAsset):
#    def __init__(self, mkt_symbol='MXEF', vol_symbol='VXEEM', exchange=('CBOE', 'CBOE'), \
#                  currency='USD', multiplier='100', sec_type='IND'):
#       super().__init__(mkt_symbol, vol_symbol, exchange, \
#                  currency, multiplier, sec_type)
#       self.listing_spread = 10
#
#    @staticmethod
#    def get_option_implied_dividend_yld():
#        """Returns latest dividend yield for market"""
#        url = 'http://www.wsj.com/mdc/public/page/2_3021-peyield.html'
#        # Package the request, send the request and catch the response: r
#        raw_html_tbl = pd.read_html(url)
#        dy_df = raw_html_tbl[2]
#        latest_dividend_yield = float(dy_df.iloc[2, 4]) /100
#        return latest_dividend_yield 
Example 39
Project: hedgedata   Author: timkpaine   File: etf.py    BSD 3-Clause "New" or "Revised" License
def composition(key):
    comp = pd.read_html(ETF_URL % key, attrs={'id': 'etfs-that-own'})[0]
    comp['% of Total'] = comp['% of Total'].str.rstrip('%').astype(float) / 100.0
    comp.columns = ['Symbol', 'Name', 'Percent']

    comp['Symbol'].apply(lambda x: symbols_map().get(x, x))
    return comp[['Symbol', 'Percent', 'Name']] 
Example 40
Project: NICERsoft   Author: paulray   File: ni_data_download.py    MIT License
def print_nicer_segment(url = 'https://heasarc.gsfc.nasa.gov/docs/nicer/team_schedule/nicer_seg_team.html',
                        username = None, password=None):
    """
    This prints out the segment detail table in text format

    usage: % print_nicer_segment(username="nicer_user_name", password="nicer password")

    outputs: prints the nicer segment table to the terminal

    :param url: location of the segment detail page
    :param username: nicer team username
    :param password: nicer team password
    :return:
    """
    from bs4 import BeautifulSoup
    import requests
    if (not username) or (not password):
        raise ValueError("must supply username and password to access the NICER obs page")
    req = requests.get(url, auth=(username, password))
    if req.status_code != 200:
        raise ValueError('Problem accessing {0} with ({1}, {2}) \nReturn code: {3}'.format(
            url, username, password, req.status_code))
        
    soup = BeautifulSoup(req.text, 'lxml')
    tabs = soup.find_all('table')[1]    
    df = pd.read_html(str(tabs))
    return df[0]

# ----------------------------------------------------------------------
#   file handling
# ---------------------------------------------------------------------- 
Example 41
Project: cryptory   Author: dashee87   File: cryptory.py    MIT License
def extract_coinmarketcap(self, coin, coin_col=False):
        """Retrieve basic historical information for a specific cryptocurrency from coinmarketcap.com
        
        Parameters
        ----------
        coin : the name of the cryptocurrency (e.g. 'bitcoin', 'ethereum', 'dentacoin')
        coin_col : whether to include the coin name as a column
            (default is False i.e. the column is not included)
            
        Returns
        -------
        pandas DataFrame
        """
        try:
            output = pd.read_html("https://coinmarketcap.com/currencies/{}/historical-data/?start={}&end={}".format(
                coin, self.from_date.replace("-", ""), self.to_date.replace("-", "")))[0]
        except:
            # future versions may split out the different exceptions (e.g. timeout)
            raise
        output = output.assign(Date=pd.to_datetime(output['Date']))
        for col in output.columns:
            if output[col].dtype == np.dtype('O'):
                output.loc[output[col]=="-",col]=0
                output[col] = output[col].astype('int64')
        output.columns = [re.sub(r"[^a-z]", "", col.lower()) for col in output.columns]
        if coin_col:
            output['coin'] = coin
        return output 
Example 42
Project: fastent   Author: fastent   File: api_utils.py    MIT License
def fasttext_list():
    """
    Return a dictionary of the available fastText pretrained models.

    Args:
        None
    Returns:
        diction_frac (dict): language-to-model-URL dictionary
    """

    diction_frac = {}
    try:
        content = requests.get("https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md").content
        webpage = LH.fromstring(content)
        allRefs = webpage.xpath('//a/@href')

        allRefs = [i for i in allRefs if 'amazonaws' in i and not 'zip' in i]
        allRefs

        df = pd.read_html(content)
        df = df[-1]

        assert(len(allRefs) ==  len(df['Unnamed: 0']) + len(df['Unnamed: 1'])+len(df['Unnamed: 2']))

        for i in range(len(allRefs)):
            if i%3 == 0:
                diction_frac[df['Unnamed: 0'][int(i/3)]] = allRefs[i]
            if i%3 == 1:
                diction_frac[df['Unnamed: 1'][int(i/3)]] = allRefs[i]
            if i%3 == 2:
                diction_frac[df['Unnamed: 2'][int(i/3)]] = allRefs[i]


    except Exception as e:
        print(e)
        return None

    return diction_frac 
Example 43
Project: electricitymap-contrib   Author: tmrowco   File: CA_AB.py    GNU General Public License v3.0
def fetch_exchange(zone_key1='CA-AB', zone_key2='CA-BC', session=None, target_datetime=None, logger=None):
    """Requests the last known power exchange (in MW) between two countries

    Arguments:
    zone_key (optional) -- used in case a parser is able to fetch multiple countries
    session (optional)      -- request session passed in order to re-use an existing session

    Return:
    A dictionary in the form:
    {
      'sortedZoneKeys': 'DK->NO',
      'datetime': '2017-01-01T00:00:00Z',
      'netFlow': 0.0,
      'source': 'mysource.com'
    }
    """
    if target_datetime:
        raise NotImplementedError('This parser is not yet able to parse past dates')

    r = session or requests.session()
    url = 'http://ets.aeso.ca/ets_web/ip/Market/Reports/CSDReportServlet'
    response = r.get(url)
    df_exchanges = pd.read_html(response.text, match='INTERCHANGE', skiprows=0, index_col=0)

    flows = {
        'CA-AB->CA-BC': df_exchanges[1][1]['British Columbia'],
        'CA-AB->CA-SK': df_exchanges[1][1]['Saskatchewan'],
        'CA-AB->US-MT': df_exchanges[1][1]['Montana']
    }
    sortedZoneKeys = '->'.join(sorted([zone_key1, zone_key2]))
    if sortedZoneKeys not in flows:
        raise NotImplementedError('This exchange pair is not implemented')

    return {
        'datetime': arrow.now(tz=ab_timezone).datetime,
        'sortedZoneKeys': sortedZoneKeys,
        'netFlow': float(flows[sortedZoneKeys]),
        'source': 'ets.aeso.ca'
    } 
Example 44
Project: fund   Author: Frank-qlu   File: reference.py    Apache License 2.0
def _dist_cotent(year, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            if pageNo > 0:
                ct._write_console()
            html = lxml.html.parse(rv.DP_163_URL%(ct.P_TYPE['http'], ct.DOMAINS['163'],
                     ct.PAGES['163dp'], year, pageNo))  
            res = html.xpath('//div[@class=\"fn_rp_list\"]/table')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows=[0])[0]
            df = df.drop(df.columns[0], axis=1)
            df.columns = rv.DP_163_COLS
            df['divi'] = df['plan'].map(_fun_divi)
            df['shares'] = df['plan'].map(_fun_into)
            df = df.drop('plan', axis=1)
            df['code'] = df['code'].astype(object)
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            pages = []
            if pageNo == 0:
                page = html.xpath('//div[@class=\"mod_pages\"]/a')
                if len(page)>1:
                    asr = page[len(page)-2]
                    pages = asr.xpath('text()')
        except Exception as e:
            print(e)
        else:
            if pageNo == 0:
                return df, pages[0] if len(pages)>0 else 0
            else:
                return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG) 
Example 45
Project: fund   Author: Frank-qlu   File: reference.py    Apache License 2.0
def _get_forecast_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        gparser = etree.HTMLParser(encoding='GBK')
        html = lxml.html.parse(ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], 
                                                ct.PAGES['fd'], year, quarter, pageNo,
                                                ct.PAGE_NUM[1]),
                               parser=gparser)
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('--', '0')
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df = df.drop([4, 5, 8], axis=1)
        df.columns = ct.FORECAST_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+',nextPage[0])[0]
            return _get_forecast_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
            print(e) 
Example 46
Project: fund   Author: Frank-qlu   File: reference.py    Apache License 2.0
def _newstocks(data, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            html = lxml.html.parse(rv.NEW_STOCKS_URL%(ct.P_TYPE['http'],ct.DOMAINS['vsf'],
                         ct.PAGES['newstock'], pageNo))
            res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
            if len(res) == 0:
                return data
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = sarr.replace('<font color="red">*</font>', '')
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
            df = df.drop([df.columns[idx] for idx in [12, 13, 14]], axis=1)
            df.columns = rv.NEW_STOCKS_COLS
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            df['xcode'] = df['xcode'].map(lambda x : str(x).zfill(6))
            res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()')
            tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8')
            hasNext = True if tag in res else False 
            data = data.append(df, ignore_index=True)
            pageNo += 1
            if hasNext:
                data = _newstocks(data, pageNo, retry_count, pause)
        except Exception as ex:
            print(ex)
        else:
            return data 
Example 47
Project: fund   Author: Frank-qlu   File: reference.py    Apache License 2.0
def _newcbonds(pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        if pageNo != 1:
            ct._write_console()
        try:
            html = lxml.html.parse(rv.NEW_CBONDS_URL%(ct.P_TYPE['http'],ct.DOMAINS['sstar'],
                         pageNo))
            res = html.xpath('//table/tr')
            if len(res) == 0:
                return None
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(StringIO(sarr), skiprows=[0])
            if len(df) < 1:
                return None
            df = df[0]
            df = df.drop([df.columns[14], df.columns[15]], axis=1)
            df.columns = rv.NEW_CBONDS_COLS
            df['scode'] = df['scode'].map(lambda x: str(x).zfill(6))
            df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
        except Exception as ex:
            print(ex)
        else:
            return df 
Example 48
Project: fund   Author: Frank-qlu   File: fundamental.py    Apache License 2.0
def _get_report_data(year, quarter, pageNo, dataArr,
                     retry_count=3, pause=0.001):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(ct.REPORT_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'],
                             year, quarter, pageNo, ct.PAGE_NUM[1]))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            text = text.replace('--', '')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@class=\"list_table\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df = df.drop(11, axis=1)
            df.columns = ct.REPORT_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _get_report_data(year, quarter, pageNo, dataArr)
            else:
                return dataArr
        except Exception as e:
            pass
    raise IOError(ct.NETWORK_URL_ERROR_MSG) 
Example 49
Project: fund   Author: Frank-qlu   File: fundamental.py    Apache License 2.0
def _get_profit_data(year, quarter, pageNo, dataArr,
                     retry_count=3, pause=0.001):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(ct.PROFIT_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                  ct.PAGES['fd'], year,
                                                  quarter, pageNo, ct.PAGE_NUM[1]))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            text = text.replace('--', '')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@class=\"list_table\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns=ct.PROFIT_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _get_profit_data(year, quarter, pageNo, dataArr)
            else:
                return dataArr
        except:
            pass
    raise IOError(ct.NETWORK_URL_ERROR_MSG) 
Example 50
Project: fund   Author: Frank-qlu   File: fundamental.py    Apache License 2.0
def _get_operation_data(year, quarter, pageNo, dataArr,
                        retry_count=3, pause=0.001):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(ct.OPERATION_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                     ct.PAGES['fd'], year,
                                                     quarter, pageNo, ct.PAGE_NUM[1]))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            text = text.replace('--', '')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@class=\"list_table\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns=ct.OPERATION_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _get_operation_data(year, quarter, pageNo, dataArr)
            else:
                return dataArr
        except Exception as e:
            pass
    raise IOError(ct.NETWORK_URL_ERROR_MSG)