Python requests_html.HTMLSession() Examples

The following are 21 code examples of requests_html.HTMLSession(), each drawn from an open-source project. You can go to the original project or source file via the attribution above each example, or browse the other available functions and classes of the requests_html module.
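Before looking at the project examples, here is a minimal, self-contained sketch of the typical HTMLSession workflow; the URL and the CSS selector are placeholders rather than code from any project below:

from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://example.com')
# r.html is the parsed page; find() takes a CSS selector.
headings = r.html.find('h1')
print([h.text for h in headings])
# absolute_links is the set of absolute URLs found on the page.
print(r.html.absolute_links)
session.close()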
Example #1
Source File: facebook_scraper.py    From facebook-scraper with MIT License
def __init__(self, session=None, requests_kwargs=None):
        if session is None:
            session = HTMLSession()
            session.headers.update(self.default_headers)

        if requests_kwargs is None:
            requests_kwargs = {}

        self.session = session
        self.requests_kwargs = requests_kwargs 
Example #2
Source File: instagram.py    From EagleEye with Do What The F*ck You Want To Public License
def getLinks(self):
        session = HTMLSession()
        r = session.get('https://instagram.com/' + self.username)
        # The profile data is embedded in an inline <script> tag as
        # "window._sharedData = {...};".
        l = r.html.find('body > script:nth-child(5)')[0].text
        json_str = l[21:]  # strip the leading "window._sharedData = "
        json_str = json_str[:-1]  # strip the trailing semicolon
        json_parsed = json.loads(json_str)
        shortcodes = []
        try:
            images = json_parsed['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']
            for image in images:
                node = image['node']
                shortcode = node['shortcode']
                shortcodes.append(shortcode)
            links = []
            for sc in shortcodes:
                r = session.get('https://instagram.com/p/' + sc + '/?taken-by=' + self.username)
                img = r.html.find('meta[property="og:image"]')
                if len(img) > 0:
                    img = img[0]
                    links.append(img.attrs['content'])
            return links
        except (KeyError, IndexError):  # profile JSON missing expected keys
            return []
Example #3
Source File: auth.py    From codechef-cli with GNU General Public License v3.0
def make_login_req(username, password, disconnect_sessions):
    with HTMLSession() as session:
        set_session_cookies(session)

        resp = request(session=session)
        token = get_csrf_token(resp.html, CSRF_TOKEN_INPUT_ID)
        if not token:
            return [{'data': CSRF_TOKEN_MISSING, 'code': 500}]

        data = {
            'name': username,
            'pass': password,
            'form_id': LOGIN_FORM_ID[1:],
            'csrfToken': token
        }

        resp = request(session=session, method='POST', data=data)
        resp_html = resp.html

        if resp.status_code == 200:
            if resp_html.find(SESSION_LIMIT_FORM_ID):
                if disconnect_sessions:
                    resps = disconnect_active_sessions(session, resp_html)
                    save_session_cookies(session, username)
                    return resps
                else:
                    logout(session=session)
                    return [{'data': SESSION_LIMIT_MSG, 'code': 400}]
            elif resp_html.find(LOGOUT_BUTTON_CLASS):
                save_session_cookies(session, username)
                return [{'data': LOGIN_SUCCESS_MSG}]
            return [{'data': INCORRECT_CREDS_MSG, 'code': 400}]
        return [{'code': 503}] 
Example #4
Source File: crawler.py    From administrative-divisions-of-China-on-Python with GNU General Public License v3.0
def __init__(self):
        self._headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
            'Accept-Encoding': ''
        }
        """
        头信息
        """

        self._session = HTMLSession()
        """
        HTMLSession 对象
        """ 
Example #5
Source File: test_config.py    From wikipron with Apache License 2.0
def test_american_english_dialect_selection():
    # Pick a word for which Wiktionary has dialect-specified pronunciations
    # for both US and non-US English.
    word = "mocha"
    html_session = requests_html.HTMLSession()
    response = html_session.get(_PAGE_TEMPLATE.format(word=word))
    # Construct two configs to demonstrate the US dialect (non-)selection.
    config_only_us = config_factory(key="en", dialect="US | American English")
    config_any_dialect = config_factory(key="en")
    # Apply each config's XPath selector.
    results_only_us = response.html.xpath(config_only_us.pron_xpath_selector)
    results_any_dialect = response.html.xpath(
        config_any_dialect.pron_xpath_selector
    )
    assert (
        len(results_any_dialect)  # containing both US and non-US results
        > len(results_only_us)  # containing only the US result
        > 0
    ) 
Example #6
Source File: helpers.py    From codechef-cli with GNU General Public License v3.0
def get_session():
    session = HTMLSession()

    if os.path.exists(COOKIES_FILE_PATH):
        set_session_cookies(session)
        session.cookies.load(ignore_discard=True, ignore_expires=True)
    return session 
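The set_session_cookies helper and COOKIES_FILE_PATH constant above come from codechef-cli and are not shown here. A minimal sketch of the same idea using only requests_html and the standard library, with a hypothetical cookie file path:

import os
from http.cookiejar import LWPCookieJar

from requests_html import HTMLSession

COOKIES_FILE = os.path.expanduser('~/.myapp/cookies.txt')  # hypothetical path

def get_session():
    session = HTMLSession()
    # Swap in a file-backed cookie jar so cookies persist across runs.
    session.cookies = LWPCookieJar(COOKIES_FILE)
    if os.path.exists(COOKIES_FILE):
        session.cookies.load(ignore_discard=True, ignore_expires=True)
    return session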
Example #7
Source File: imicrobe.py    From grabseqs with MIT License
def get_imicrobe_acc_metadata(pacc):
    """
    Function to get list of iMicrobe sample accession numbers from a particular
    project. Takes project accession number `pacc` and returns a list of iMicrobe
    accession numbers.
    """
    # Check accession format
    pacc = pacc.lower()
    if pacc.startswith("p"):
        pacc = pacc[1:]
    elif pacc.startswith("s"):
        return [pacc]
    else:
        raise Exception("iMicrobe accession numbers should be prefixed with 'p' (project) or 's' (sample)")

    # Grab sample info
    session = HTMLSession()
    r = session.get('https://www.imicrobe.us/#/projects/'+pacc)
    r.html.render(sleep=1)

    sample_list = []
    for l in r.html.element("a"):
        i = l.items()  # attribute (name, value) pairs; i[0][1] is the first attribute's value
        try:
            if i[0][1].startswith("#/samples/"):
                sample_list.append(i[0][1][10:]) # add sample ID only
        except IndexError:
            continue
    session.close()

    # Format and return sample accession numbers
    return ["s"+ sID for sID in sample_list] 
Example #8
Source File: channels.py    From telegram with MIT License
def extract_html(url, javascript_enabled=False):
    session = HTMLSession()
    response = session.get(url)
    if javascript_enabled:
        response.html.render()
        source_html = response.html.html
        return source_html
    else:
        return response.html.html


Example #9
Source File: live_recorder.py    From bilibili-live-recorder with MIT License
def __init__(self, cid, output_name='opt.mp4'):
        self.cid = cid
        self.api_url = 'http://api.live.bilibili.com/api/playurl?device=phone&platform=ios&scale=3&build=10000&' \
                       'cid={}&otype=json&platform=h5'.format(cid)
        self.output_dir = os.path.join(os.getcwd(), 'files')
        self._s = requests_html.HTMLSession() 
Example #10
Source File: worker.py    From scylla with Apache License 2.0
def __init__(self):
        """Initialize the worker object

        """

        self.session = HTMLSession() 
Example #11
Source File: scrape.py    From wikipron with Apache License 2.0
def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
    session = requests_html.HTMLSession()
    for member in data["query"]["categorymembers"]:
        word = member["title"]
        date = member["timestamp"]
        if _skip_word(word, config.no_skip_spaces_word) or _skip_date(
            date, config.cut_off_date
        ):
            continue
        request = session.get(_PAGE_TEMPLATE.format(word=word), timeout=10)
        for word, pron in config.extract_word_pron(word, request, config):
            yield word, pron 
Example #12
Source File: spider.py    From rxivist with GNU Affero General Public License v3.0
def __init__(self):
    self.connection = db.Connection(config.db["host"], config.db["db"], config.db["user"], config.db["password"])
    self.session = HTMLSession(mock_browser=False)
    self.session.headers['User-Agent'] = config.user_agent
    self.log = Logger() 
Example #13
Source File: check_music.py    From snippet with MIT License
def __init__(self):
        self._session = HTMLSession() 
Example #14
Source File: conftest.py    From kube-web-view with GNU General Public License v3.0
def session(populated_cluster):

    url = populated_cluster["url"].rstrip("/")

    s = HTMLSession()

    def new_request(prefix, f, method, url, *args, **kwargs):
        # Prepend the cluster URL so callers can pass relative paths.
        return f(method, prefix + url, *args, **kwargs)

    # Route every request through the prefixing wrapper.
    s.request = partial(new_request, url, s.request)
    return s
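The partial() wrapper monkey-patches the session's own request method so that tests can pass paths relative to the cluster URL. A hypothetical test using this fixture (the path is illustrative, not from the project):

def test_cluster_root(session):
    response = session.get('/')  # actually requests <cluster url>/
    assert response.status_code == 200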
Example #15
Source File: search.py    From SQL-scanner with MIT License
def find_links(self):

        session = HTMLSession()
        session.headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'

        url = self.base_url + self.parameters.format(self.query)

        while self.is_alive:
            try:
                html = session.get(url).html
            except Exception:  # network error or timeout: stop searching
                break

            for r in html.find('.b_algo'):
                a = r.find('h2', first=True).find('a', first=True)

                try:
                    link = a.attrs['href']
                except (AttributeError, KeyError):  # result block without a link
                    continue

                if self.is_valid(link):
                    self.links.put(link)

            next_page = self.next_page(html)

            if not next_page:
                break

            url = next_page

        with self.lock:
            self.is_searching = False 
Example #16
Source File: instagram_scraper.py    From instagram-scraper with MIT License
def scrape_instagram_tag(tag: str, total_count: int=50, existing: set=None):
    """
    Scrape and yield recently tagged instagram photos.
    """
    if existing is None:
        existing = set()

    url = f'https://www.instagram.com/explore/tags/{tag}'
    session = HTMLSession()
    req = session.get(url)

    imgs = set(existing)
    count = 0
    page = 0

    while count <= total_count:
        req.html.render(scrolldown=page)
        images = req.html.xpath('//img[@alt]')
        page += 1
        for image in images:
            if count > total_count:
                break
            try:
                url, caption = image.attrs['src'], image.attrs['alt']
            except KeyError:  # image missing src or alt attribute
                pass
            else:
                if url in imgs:
                    continue
                imgs.add(url)
                hashtags = set(REGEXES['hashtag'].findall(caption))
                mentions = set(REGEXES['username'].findall(caption))
                count += 1
                yield url, caption, hashtags, mentions 
Example #17
Source File: mtc.py    From crypto51 with GNU General Public License v3.0
def __init__(self):
        self._session = HTMLSession() 
Example #18
Source File: test_helpers.py    From codechef-cli with GNU General Public License v3.0
def test_get_session_no_cookies(self):
        """Should return requests_html.HTMLSession instance"""
        fake_logout()

        session = get_session()
        self.assertIsInstance(session, HTMLSession)
        self.assertEqual(len(session.cookies), 0) 
Example #19
Source File: test_helpers.py    From codechef-cli with GNU General Public License v3.0
def test_get_session_cookies(self):
        """Should return requests_html.HTMLSession instance preloaded with cookies"""
        fake_login()

        session = get_session()
        self.assertIsInstance(session, HTMLSession)
        self.assertTrue(len(session.cookies) > 0) 
Example #20
Source File: list.py    From terraenv with MIT License
def list_remote(args):
    """ lists terraform/terragrunt versions """
    program = args.program

    if program == "terraform":
        session = HTMLSession()
        terraform_url = session.get(
            "https://releases.hashicorp.com/terraform/")
        unstable_releases = '-'
        data = terraform_url.html.links
        data = filter(lambda x: program in x, data)
        data = filter(lambda x: unstable_releases not in x, data)
        available_versions = []

        for d in data:
            version = d.split('/')[2]
            available_versions.append(version)
        available_versions.sort(key=StrictVersion)

        if args.commands in validate_versions_commands:
            return available_versions

        for version in available_versions:
            print(version)

    elif program == "terragrunt":
        session = HTMLSession()
        terragrunt_url = session.get(
            "https://api.github.com/repos/gruntwork-io/terragrunt/tags?per_page=1000")
        data = terragrunt_url.html.full_text
        parsed_json = (json.loads(data))
        available_versions = []

        for version in parsed_json:
            available_versions.append(version['name'].lstrip('v'))
        available_versions.sort(key=StrictVersion)

        if args.commands in validate_versions_commands:
            return available_versions

        for version in available_versions:
            print(version)

    else:
        raise Exception(
            'Invalid argument! It should be either terraform or terragrunt')
Example #21
Source File: imicrobe.py    From grabseqs with MIT License
def _parse_imicrobe_readpath_metadata(acc, download_metadata, metadata_agg):
    """
    Helper function to parse sample download paths from a sample page.
    Takes an `acc` with no prefix. Returns a dictionary with download paths
    for one or two reads like: {1:"url"} or {1:"url1", 2:"url2"}. Also returns
    aggregated metadata.
    """
    acc = str(acc)
    session = HTMLSession()
    r = session.get('https://www.imicrobe.us/#/samples/'+acc)
    r.html.render(scrolldown=4, sleep=4)
    file_links = list(r.html.links)
    # Find one or two links immediately followed by a "Reads" column (or equivalent)
    reads_colnames = ["Reads FASTQ", "Reads", "FASTQ", "upload.fastq"]

    for c in reads_colnames:
        hits = [m.start() for m in re.finditer("<td>"+c+"</td>", r.html.html)]
        if len(hits) > 0:
            break
    link_indices = []
    working_file_links = []
    for l in file_links:
        try:
            link_indices.append(r.html.html.index('"'+l+'"'))
            working_file_links.append(l)
        except ValueError: # sometimes they are formatted differently (if added by the project owner?)
            continue
    read_links = {}
    for j in range(len(hits)):
        read_links[j+1] = working_file_links[_closest_below_index(link_indices, hits[j])].replace("http://datacommons.cyverse.org/browse", "https://de.cyverse.org/anon-files")

    if download_metadata:
        html_str = str(r.html.html)
        relevant_section = html_str[html_str.index("<h2>Attributes"):html_str.index("<h2>Files")]
        table_only = relevant_section[relevant_section.index("<tbody>")+7:relevant_section.index("</tbody>")].replace(',',';')
        formatted_table = table_only.replace('</tr><tr>', '\n').replace('</td><td>', ',').replace('<tr>','').replace('<td>','').replace('</tr>','').replace('</td>','')
        listed_table = [z.split(',') for z in formatted_table.split('\n')]
        transposed_table = [[z[0] for z in listed_table], [z[1] for z in listed_table]]
        formatted_table = ','.join(transposed_table[0]) + '\n' + ','.join(transposed_table[1])
        if metadata_agg is None:
            metadata_agg = pd.read_csv(StringIO(formatted_table))
        else:
            metadata_agg = metadata_agg.append(pd.read_csv(StringIO(formatted_table)),sort=True)
    return read_links, metadata_agg