Python six.moves.urllib.parse.urljoin() Examples

The following are code examples of six.moves.urllib.parse.urljoin(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other functions and classes available in the six.moves.urllib.parse module.
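
Since most of the examples below hinge on how urljoin() resolves a relative reference against a base URL, here is a minimal sketch of its behaviour (the URLs are illustrative only):

from six.moves.urllib.parse import urljoin

# A relative path is resolved against the base URL's last directory.
urljoin('https://example.com/api/v1/', 'users')                # 'https://example.com/api/v1/users'

# Without a trailing slash, the last segment is treated as a file and replaced.
urljoin('https://example.com/api/v1', 'users')                 # 'https://example.com/api/users'

# A path starting with '/' discards the base URL's path entirely.
urljoin('https://example.com/api/v1/', '/oauth2/token')        # 'https://example.com/oauth2/token'

# An absolute URL replaces the base URL altogether.
urljoin('https://example.com/api/v1/', 'https://other.org/x')  # 'https://other.org/x'
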
Example #1
Source File: monzo_api.py    From pymonzo with MIT License
def _get_oauth_token(self):
        """
        Get Monzo access token via OAuth2 `authorization code` grant type.

        Official docs:
            https://monzo.com/docs/#acquire-an-access-token

        :returns: OAuth 2 access token
        :rtype: dict
        """
        url = urljoin(self.api_url, '/oauth2/token')

        oauth = OAuth2Session(
            client_id=self._client_id,
            redirect_uri=config.REDIRECT_URI,
        )

        token = oauth.fetch_token(
            token_url=url,
            code=self._auth_code,
            client_secret=self._client_secret,
        )

        return token 
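
Note the leading slash in '/oauth2/token': urljoin() keeps only the scheme and host of api_url and discards any path that may already be on it. A minimal sketch, assuming a hypothetical api_url value (not taken from pymonzo):

from six.moves.urllib.parse import urljoin

api_url = 'https://api.monzo.com'                    # illustrative value only
urljoin(api_url, '/oauth2/token')                    # 'https://api.monzo.com/oauth2/token'
urljoin(api_url + '/some/prefix', '/oauth2/token')   # still 'https://api.monzo.com/oauth2/token'
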
Example #2
Source File: utils.py    From scrape with MIT License
def clean_url(url, base_url=None):
    """Add base netloc and path to internal URLs and remove www, fragments."""
    parsed_url = urlparse(url)

    fragment = "{url.fragment}".format(url=parsed_url)
    if fragment:
        url = url.split(fragment)[0]

    # Identify internal URLs and fix their format
    netloc = "{url.netloc}".format(url=parsed_url)
    if base_url is not None and not netloc:
        parsed_base = urlparse(base_url)
        split_base = "{url.scheme}://{url.netloc}{url.path}/".format(url=parsed_base)
        url = urljoin(split_base, url)
        netloc = "{url.netloc}".format(url=urlparse(url))

    if "www." in netloc:
        url = url.replace(netloc, netloc.replace("www.", ""))
    return url.rstrip(string.punctuation) 
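
A quick usage sketch with hypothetical URLs, showing how the base URL is applied to an internal link and how the fragment and the 'www.' prefix are stripped:

clean_url('/about', base_url='https://www.example.com/blog')
# 'https://example.com/about'

clean_url('https://www.example.com/page#section')
# 'https://example.com/page'
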
Example #3
Source File: regex.py    From learn_python3_spider with MIT License
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        def clean_text(text):
            return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()

        def clean_url(url):
            clean_url = ''
            try:
                clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
            except ValueError:
                pass
            return clean_url

        if base_url is None:
            base_url = get_base_url(response_text, response_url, response_encoding)

        links_text = linkre.findall(response_text)
        return [Link(clean_url(url).encode(response_encoding),
                     clean_text(text))
                for url, _, text in links_text] 
Example #4
Source File: lxmlhtml.py    From learn_python3_spider with MIT License
def _extract_links(self, selector, response_url, response_encoding, base_url):
        links = []
        # hacky way to get the underlying lxml parsed document
        for el, attr, attr_val in self._iter_links(selector.root):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            try:
                if self.strip:
                    attr_val = strip_html5_whitespace(attr_val)
                attr_val = urljoin(base_url, attr_val)
            except ValueError:
                continue  # skipping bogus links
            else:
                url = self.process_attr(attr_val)
                if url is None:
                    continue
            url = to_native_str(url, encoding=response_encoding)
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            link = Link(url, _collect_string_content(el) or u'',
                        nofollow=rel_has_nofollow(el.get('rel')))
            links.append(link)
        return self._deduplicate_if_needed(links) 
Example #5
Source File: sgml.py    From learn_python3_spider with MIT License
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        """ Do the real extraction work """
        self.reset()
        self.feed(response_text)
        self.close()

        ret = []
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            if isinstance(link.url, six.text_type):
                link.url = link.url.encode(response_encoding)
            try:
                link.url = urljoin(base_url, link.url)
            except ValueError:
                continue
            link.url = safe_url_string(link.url, response_encoding)
            link.text = to_unicode(link.text, response_encoding, errors='replace').strip()
            ret.append(link)

        return ret 
Example #6
Source File: htmlparser.py    From learn_python3_spider with MIT License
def _extract_links(self, response_text, response_url, response_encoding):
        self.reset()
        self.feed(response_text)
        self.close()

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            if isinstance(link.url, six.text_type):
                link.url = link.url.encode(response_encoding)
            try:
                link.url = urljoin(base_url, link.url)
            except ValueError:
                continue
            link.url = safe_url_string(link.url, response_encoding)
            link.text = link.text.decode(response_encoding)
            ret.append(link)

        return ret 
Example #7
Source File: lxmlhtml.py    From scrapy-cluster with MIT License
def _extract_links(self, selector, response_url, response_encoding, base_url):
        '''
        Pretty much the same function, just added 'ignore' to to_native_str()
        '''
        links = []
        # hacky way to get the underlying lxml parsed document
        for el, attr, attr_val in self._iter_links(selector.root):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            try:
                attr_val = urljoin(base_url, attr_val)
            except ValueError:
                continue # skipping bogus links
            else:
                url = self.process_attr(attr_val)
                if url is None:
                    continue
            # added 'ignore' to encoding errors
            url = to_native_str(url, encoding=response_encoding,
                                errors='ignore')
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            link = Link(url, _collect_string_content(el) or u'',
                        nofollow=rel_has_nofollow(el.get('rel')))
            links.append(link)
        return self._deduplicate_if_needed(links) 
Example #8
Source File: test_monzo_api.py    From pymonzo with MIT License
def test_class_get_oauth_token_method(self, mocker, mocked_monzo):
        """Test class `_get_oauth_token` method"""
        mocked_fetch_token = mocker.MagicMock()
        mocked_oauth2_session = mocker.patch('pymonzo.monzo_api.OAuth2Session')
        mocked_oauth2_session.return_value.fetch_token = mocked_fetch_token

        token = mocked_monzo._get_oauth_token()

        assert token == mocked_fetch_token.return_value

        mocked_oauth2_session.assert_called_once_with(
            client_id=mocked_monzo._client_id,
            redirect_uri=config.REDIRECT_URI,
        )
        mocked_fetch_token.assert_called_once_with(
            token_url=urljoin(mocked_monzo.api_url, '/oauth2/token'),
            code=mocked_monzo._auth_code,
            client_secret=mocked_monzo._client_secret,
        ) 
Example #9
Source File: connector.py    From designate with Apache License 2.0
def _construct_url(self, relative_path, query_params=None, extattrs=None):
        if query_params is None:
            query_params = {}
        if extattrs is None:
            extattrs = {}

        if not relative_path or relative_path[0] == '/':
            raise ValueError('Path in request must be relative.')
        query = ''
        if query_params or extattrs:
            query = '?'

        if extattrs:
            attrs_queries = []
            for key, value in extattrs.items():
                LOG.debug("key: %s, value: %s", key, value)
                attrs_queries.append('*' + key + '=' + value['value'])
            query += '&'.join(attrs_queries)
        if query_params:
            if len(query) > 1:
                query += '&'
            query += parse.urlencode(query_params)

        baseurl = parse.urljoin(self.wapi_url, parse.quote(relative_path))
        return baseurl + query 
Example #10
Source File: redirect.py    From learn_python3_spider with MIT License
def process_response(self, request, response, spider):
        if (request.meta.get('dont_redirect', False) or
                response.status in getattr(spider, 'handle_httpstatus_list', []) or
                response.status in request.meta.get('handle_httpstatus_list', []) or
                request.meta.get('handle_httpstatus_all', False)):
            return response

        allowed_status = (301, 302, 303, 307, 308)
        if 'Location' not in response.headers or response.status not in allowed_status:
            return response

        location = safe_url_string(response.headers['location'])

        redirected_url = urljoin(request.url, location)

        if response.status in (301, 307, 308) or request.method == 'HEAD':
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status) 
Example #11
Source File: uploads.py    From conda-concourse-ci with BSD 3-Clause "New" or "Revised" License
def get_upload_channels(upload_config_dir, subdir, channels=None):
    """thought here was to provide whatever channel you have set as an output also to be an input

    Killed this in favor of setting channels in condarc in the docker image.
    """
    configurations = load_yaml_config_dir(upload_config_dir)
    channels = channels or []

    for config in configurations:
        if 'token' in config:
            channels.append(config['user'])
        elif 'server' in config:
            channels.append(parse.urljoin('http://' + config['server'],
                            config['destination_path'].format(subdir=subdir)))
        else:
            channels.append(config['channel'])
    return channels 
Example #12
Source File: session.py    From hfut with MIT License
def prepare_request(self, request):
        parsed = parse.urlparse(request.url)
        # Check parameters for illegal characters
        if ENV['REQUEST_ARGUMENTS_CHECK'] and (not parsed.netloc or parsed.netloc == parse.urlparse(self.host).netloc):
            for k, v in reduce(lambda x, y: x + list(y.items()), (request.params, request.data), []):
                pattern = ENV['ILLEGAL_CHARACTERS_PATTERN']
                result = pattern.search(str(k)) or pattern.search(str(v))
                if result:
                    msg = ''.join(['Illegal character found in request parameters: ', result.group()])
                    raise ValidationError(msg)
        if not parsed.netloc:
            # requests parses the url while preparing the request, so it has to be replaced with the full address before prepare is called
            # requests.models.PreparedRequest#prepare_url
            request.url = parse.urljoin(self.host, request.url)

        return super(BaseSession, self).prepare_request(request) 
Example #13
Source File: interface.py    From pulsar with Apache License 2.0
def __init__(self, destination_params, transport):
        self.transport = transport
        remote_host = destination_params.get("url")
        assert remote_host is not None, "Failed to determine url for Pulsar client."
        if not remote_host.startswith("http"):
            remote_host = "http://%s" % remote_host
        manager = destination_params.get("manager", None)
        if manager:
            if "/managers/" in remote_host:
                log.warning("Ignoring manager tag '%s', Pulsar client URL already contains a \"/managers/\" path." % manager)
            else:
                remote_host = urljoin(remote_host, "managers/%s" % manager)
        if not remote_host.endswith("/"):
            remote_host = "%s/" % remote_host
        self.remote_host = remote_host
        self.private_token = destination_params.get("private_token", None) 
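
The trailing slash forced onto remote_host matters for later urljoin() calls: without it, joining a relative path would drop the last path segment. A minimal sketch with an illustrative host:

from six.moves.urllib.parse import urljoin

urljoin('http://pulsar.example.org:8913/managers/production/', 'jobs')
# 'http://pulsar.example.org:8913/managers/production/jobs'

urljoin('http://pulsar.example.org:8913/managers/production', 'jobs')
# 'http://pulsar.example.org:8913/managers/jobs'  (last segment lost)
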
Example #14
Source File: stockfighter.py    From stockfighter with ISC License
def place_new_order(self, stock, price, qty, direction, order_type):
        """Place an order for a stock.

        https://starfighter.readme.io/docs/place-new-order
        """
        url_fragment = 'venues/{venue}/stocks/{stock}/orders'.format(
            venue=self.venue,
            stock=stock,
        )
        data = {
          "stock": stock,
          "price": price,
          "venue": self.venue,
          "account": self.account,
          "qty": qty,
          "direction": direction,
          "orderType": order_type,
        }
        url = urljoin(self.base_url, url_fragment)
        resp = self.session.post(url, json=data)
        return resp.json() 
Example #15
Source File: sentinel.py    From sentinelsat with GNU General Public License v3.0
def is_online(self, id):
        """Returns whether a product is online

        Parameters
        ----------
        id : string
            UUID of the product, e.g. 'a8dd0cfd-613e-45ce-868c-d79177b916ed'

        Returns
        -------
        bool
            True if online, False if in LTA

        """
        # Check https://scihub.copernicus.eu/userguide/ODataAPI#Products_entity for more information

        url = urljoin(self.api_url, "odata/v1/Products('{}')/Online/$value".format(id))
        r = self.session.get(url, auth=self.session.auth, timeout=self.timeout)
        _check_scihub_response(r)
        return r.json() 
Example #16
Source File: form.py    From learn_python3_spider with MIT License
def _get_form_url(form, url):
    if url is None:
        action = form.get('action')
        if action is None:
            return form.base_url
        return urljoin(form.base_url, strip_html5_whitespace(action))
    return urljoin(form.base_url, url) 
Example #17
Source File: template_utils.py    From eclcli with Apache License 2.0
def get_file_contents(from_data, files, base_url=None,
                      ignore_if=None, recurse_if=None,
                      is_object=False, object_request=None):

    if recurse_if and recurse_if(from_data):
        if isinstance(from_data, dict):
            recurse_data = six.itervalues(from_data)
        else:
            recurse_data = from_data
        for value in recurse_data:
            get_file_contents(value, files, base_url, ignore_if, recurse_if,
                              is_object, object_request)

    if isinstance(from_data, dict):
        for key, value in six.iteritems(from_data):
            if ignore_if and ignore_if(key, value):
                continue

            if base_url and not base_url.endswith('/'):
                base_url = base_url + '/'

            str_url = parse.urljoin(base_url, value)
            if str_url not in files:
                if is_object and object_request:
                    file_content = object_request('GET', str_url)
                else:
                    file_content = utils.read_url_content(str_url)
                if is_template(file_content):
                    if is_object:
                        template = get_template_contents(
                            template_object=str_url, files=files,
                            object_request=object_request)[1]
                    else:
                        template = get_template_contents(
                            template_url=str_url, files=files)[1]
                    file_content = jsonutils.dumps(template)
                files[str_url] = file_content
            # replace the data value with the normalised absolute URL
            from_data[key] = str_url 
Example #18
Source File: utils.py    From eclcli with Apache License 2.0
def normalise_file_path_to_url(path):
    if parse.urlparse(path).scheme:
        return path
    path = os.path.abspath(path)
    return parse.urljoin('file:', request.pathname2url(path)) 
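
A usage sketch (POSIX path, illustrative): request.pathname2url() percent-encodes the absolute filesystem path and urljoin() prefixes the file: scheme, while paths that already carry a scheme are returned unchanged.

normalise_file_path_to_url('/tmp/templates/server.yaml')
# 'file:///tmp/templates/server.yaml'

normalise_file_path_to_url('http://example.com/server.yaml')
# 'http://example.com/server.yaml'
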
Example #19
Source File: utils.py    From eclcli with Apache License 2.0
def base_url_for_url(url):
    parsed = parse.urlparse(url)
    parsed_dir = os.path.dirname(parsed.path)
    return parse.urljoin(url, parsed_dir) 
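
A quick sketch of the result (illustrative URL): the file name is dropped and the containing directory is kept.

base_url_for_url('http://example.com/templates/env/server.yaml')
# 'http://example.com/templates/env'
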
Example #20
Source File: utils.py    From eclcli with Apache License 2.0
def resolve_param_get_file(file, base_url):
    if base_url and not base_url.endswith('/'):
        base_url = base_url + '/'
    str_url = parse.urljoin(base_url, file)
    return read_url_content(str_url) 
Example #21
Source File: sentinel.py    From sentinelsat with GNU General Public License v3.0
def get_product_odata(self, id, full=False):
        """Access OData API to get info about a product.

        Returns a dict containing the id, title, size, md5sum, date, footprint and download url
        of the product. The date field corresponds to the Start ContentDate value.

        If `full` is set to True, then the full, detailed metadata of the product is returned
        in addition to the above.

        Parameters
        ----------
        id : string
            The UUID of the product to query
        full : bool
            Whether to get the full metadata for the Product. False by default.

        Returns
        -------
        dict[str, Any]
            A dictionary with an item for each metadata attribute

        Notes
        -----
        For a full list of mappings between the OpenSearch (Solr) and OData attribute names
        see the following definition files:
        https://github.com/SentinelDataHub/DataHubSystem/blob/master/addon/sentinel-1/src/main/resources/META-INF/sentinel-1.owl
        https://github.com/SentinelDataHub/DataHubSystem/blob/master/addon/sentinel-2/src/main/resources/META-INF/sentinel-2.owl
        https://github.com/SentinelDataHub/DataHubSystem/blob/master/addon/sentinel-3/src/main/resources/META-INF/sentinel-3.owl
        """
        url = urljoin(self.api_url, "odata/v1/Products('{}')?$format=json".format(id))
        if full:
            url += "&$expand=Attributes"
        response = self.session.get(url, auth=self.session.auth, timeout=self.timeout)
        _check_scihub_response(response)
        values = _parse_odata_response(response.json()["d"])
        return values 
Example #22
Source File: sentinel.py    From sentinelsat with GNU General Public License v3.0
def _format_url(self, order_by=None, limit=None, offset=0):
        if limit is None:
            limit = self.page_size
        limit = min(limit, self.page_size)
        url = "search?format=json&rows={}".format(limit)
        url += "&start={}".format(offset)
        if order_by:
            url += "&orderby={}".format(order_by)
        return urljoin(self.api_url, url) 
Example #23
Source File: serve.py    From pipenv with MIT License
def join(self, url, allow_fragments=True):
        return urljoin(self.url, url, allow_fragments=allow_fragments) 
Example #24
Source File: allocator_remote.py    From universe with MIT License
def _get_request(self, route):
        url = urlparse.urljoin(self.base_url, route)
        extra_logger.info("[%s] GET %s", self.label, url)
        resp = self.session.get(url, auth=(self.api_key, ''), timeout=self.request_timeout)
        return self._handle_resp(resp) 
Example #25
Source File: allocator_remote.py    From universe with MIT License
def _delete_request(self, route):
        url = urlparse.urljoin(self.base_url, route)
        extra_logger.info("[%s] DELETE %s", self.label, url)
        resp = self.session.delete(url, auth=(self.api_key, ''), timeout=self.request_timeout)
        return self._handle_resp(resp) 
Example #26
Source File: allocator_remote.py    From universe with MIT License
def _post_request(self, route, data, description):
        url = urlparse.urljoin(self.base_url, route)
        extra_logger.info('[%s] %s: POST %s: %s', self.label, description, url, json.dumps(data))
        resp = self.session.post(urlparse.urljoin(self.base_url, route),
                                 data=json.dumps(data), auth=(self.api_key, ''),
                                 timeout=self.request_timeout,
        )
        return self._handle_resp(resp) 
Example #27
Source File: client.py    From osim-rl with MIT License
def _get_request(self, route):
        url = urlparse.urljoin(self.remote_base, route)
        logger.info("GET {}".format(url))
        resp = self.session.get(url)
        return self._parse_server_error_or_raise_for_status(resp)