Python scrapy.http.TextResponse() Examples

The following are 18 code examples of scrapy.http.TextResponse(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.http, or try the search function.
Example #1
Source File: test_pagestorage.py    From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_save_response_with_trim(self):
        """Check that a padded body is rejected untrimmed and saved once trimmed.

        The response body is an HTML document surrounded by CR/LF, space and
        NUL padding.  With ``maxitemsize`` set to 26 the first save must log
        the "body too large" warning; after ``trim_html`` is switched on, the
        second save must write the stripped HTML to the writer.
        """
        self.instance._writer.maxitemsize = 26
        self.instance.hsref.job.key = '123/45/67'
        padded_body = '\r\n\r\n<html><body></body></html>\r\n \0\0\0\0\0'
        response = TextResponse(
            'http://resp',
            request=Request('http://req'),
            encoding='cp1251',
            body=padded_body,
        )

        # First attempt: trimming is off, so only a warning is expected.
        with mock.patch.object(Spider, 'logger') as patched_logger:
            default_spider = Spider('default')
            self.instance.save_response(response, self.spider)
        patched_logger.warning.assert_called_with(
            "Page not saved, body too large: <http://resp>")

        # Second attempt: trimming is on, so the stripped page must be written.
        self.instance.trim_html = True
        self.instance.save_response(response, default_spider)
        expected_item = {
            u'body': u'<html><body></body></html>',
            u'_encoding': u'cp1251',
            u'_type': u'_pageitem',
            u'_key': u'9b4bed7e56103ddf63455ed39145f61f53b3c702',
            u'url': u'http://resp',
            '_jobid': '123/45/67',
        }
        self.instance._writer.write.assert_called_with(expected_item)
Example #2
Source File: iterators.py    From learn_python3_spider with MIT License 6 votes vote down vote up
def _body_or_str(obj, unicode=True):
    """Return the content of *obj* as text or as bytes.

    *obj* must be a ``Response``, a text string or a byte string; any other
    type fails the assertion below.  With ``unicode`` true the result is
    text, otherwise it is bytes.  Conversions between the two use UTF-8,
    except that a ``TextResponse`` provides its own ``text``.
    """
    accepted = (Response, six.text_type, six.binary_type)
    assert isinstance(obj, accepted), "obj must be %s, not %s" % (
        " or ".join(cls.__name__ for cls in accepted),
        type(obj).__name__,
    )

    if isinstance(obj, Response):
        if not unicode:
            return obj.body
        if isinstance(obj, TextResponse):
            return obj.text
        return obj.body.decode('utf-8')

    if isinstance(obj, six.text_type):
        if unicode:
            return obj
        return obj.encode('utf-8')

    # Only byte strings can reach this point.
    if unicode:
        return obj.decode('utf-8')
    return obj
Example #3
Source File: response.py    From learn_python3_spider with MIT License 6 votes vote down vote up
def open_in_browser(response, _openfunc=webbrowser.open):
    """Open the given response in a local web browser, populating the <base>
    tag for external links to work.

    The body is written to a temporary file (``.html`` for an
    ``HtmlResponse``, ``.txt`` for any other ``TextResponse``) and the
    ``file://`` URL of that file is passed to ``_openfunc``, whose return
    value is returned.

    Raises ``TypeError`` for any other response type.
    """
    from scrapy.http import HtmlResponse, TextResponse
    # XXX: this implementation is a bit dirty and could be improved
    body = response.body
    if isinstance(response, HtmlResponse):
        if b'<base' not in body:
            repl = '<head><base href="%s">' % response.url
            # Only the first <head> is rewritten; later occurrences of the
            # literal (e.g. inside scripts or comments) are left untouched.
            body = body.replace(b'<head>', to_bytes(repl), 1)
        ext = '.html'
    elif isinstance(response, TextResponse):
        ext = '.txt'
    else:
        raise TypeError("Unsupported response type: %s" %
                        response.__class__.__name__)
    fd, fname = tempfile.mkstemp(ext)
    # The file object takes ownership of the descriptor, so it is closed even
    # if the write fails, and a buffered write() does not stop at a short
    # write the way a single os.write() call can.
    with os.fdopen(fd, 'wb') as tmp_file:
        tmp_file.write(body)
    return _openfunc("file://%s" % fname)
Example #4
Source File: httpcompression.py    From learn_python3_spider with MIT License 6 votes vote down vote up
def process_response(self, request, response, spider):
        """Decode a response body according to its Content-Encoding header.

        Responses to ``HEAD`` requests, objects that are not ``Response``
        instances and responses without a ``Content-Encoding`` header are
        returned unchanged.  Otherwise the last listed encoding is decoded
        with ``self._decode`` and a replacement response is built around the
        decoded body, with its class re-selected by
        ``responsetypes.from_args``.  The header is deleted once no encodings
        remain in the list.
        """
        if request.method == 'HEAD':
            return response
        if not isinstance(response, Response):
            return response

        content_encoding = response.headers.getlist('Content-Encoding')
        if not content_encoding:
            return response

        # pop() consumes the last encoding listed in the header.
        encoding = content_encoding.pop()
        decoded_body = self._decode(response.body, encoding.lower())
        respcls = responsetypes.from_args(
            headers=response.headers, url=response.url, body=decoded_body)
        kwargs = {'cls': respcls, 'body': decoded_body}
        if issubclass(respcls, TextResponse):
            # force recalculating the encoding until we make sure the
            # responsetypes guessing is reliable
            kwargs['encoding'] = None
        response = response.replace(**kwargs)
        if not content_encoding:
            del response.headers['Content-Encoding']
        return response
Example #5
Source File: httpcompression.py    From learn_python3_spider with MIT License 6 votes vote down vote up
def process_response(self, request, response, spider):
        """Return ``response`` with one Content-Encoding layer decoded.

        Nothing is changed for ``HEAD`` requests, for objects that are not
        ``Response`` instances, or when no ``Content-Encoding`` header is
        present.  Otherwise the last encoding in the header list is decoded
        through ``self._decode``, the response class is re-selected with
        ``responsetypes.from_args`` and the response is replaced.  The header
        is removed when that was the only encoding listed.
        """
        # The HEAD test comes first so the isinstance check is skipped for it.
        untouched = request.method == 'HEAD' or not isinstance(response, Response)
        if untouched:
            return response

        pending_encodings = response.headers.getlist('Content-Encoding')
        if pending_encodings:
            last_encoding = pending_encodings.pop()
            decoded_body = self._decode(response.body, last_encoding.lower())
            respcls = responsetypes.from_args(
                headers=response.headers,
                url=response.url,
                body=decoded_body,
            )
            replace_args = dict(cls=respcls, body=decoded_body)
            if issubclass(respcls, TextResponse):
                # force recalculating the encoding until we make sure the
                # responsetypes guessing is reliable
                replace_args['encoding'] = None
            response = response.replace(**replace_args)
            if not pending_encodings:
                del response.headers['Content-Encoding']

        return response
Example #6
Source File: iterators.py    From learn_python3_spider with MIT License 6 votes vote down vote up
def _body_or_str(obj, unicode=True):
    """Normalise a response or string into text (default) or bytes.

    Accepts a ``Response``, a text string or a byte string and asserts on
    anything else.  ``unicode`` selects the result type: text when true,
    bytes when false.  UTF-8 is used whenever a conversion is needed, apart
    from ``TextResponse`` objects, whose ``text`` attribute is returned.
    """
    expected_types = (Response, six.text_type, six.binary_type)
    type_names = " or ".join(t.__name__ for t in expected_types)
    assert isinstance(obj, expected_types), \
        "obj must be %s, not %s" % (type_names, type(obj).__name__)

    want_text = unicode
    if isinstance(obj, Response):
        if not want_text:
            return obj.body
        return obj.text if isinstance(obj, TextResponse) \
            else obj.body.decode('utf-8')
    if isinstance(obj, six.text_type):
        return obj if want_text else obj.encode('utf-8')
    # Remaining case: a byte string.
    return obj.decode('utf-8') if want_text else obj
Example #7
Source File: response.py    From learn_python3_spider with MIT License 6 votes vote down vote up
def open_in_browser(response, _openfunc=webbrowser.open):
    """Open the given response in a local web browser, populating the <base>
    tag for external links to work.

    The body is saved to a temporary file whose suffix is ``.html`` for an
    ``HtmlResponse`` and ``.txt`` for any other ``TextResponse``; the
    ``file://`` URL of that file is then given to ``_openfunc`` and its
    return value is returned.

    Raises ``TypeError`` when the response is neither of those types.
    """
    from scrapy.http import HtmlResponse, TextResponse
    # XXX: this implementation is a bit dirty and could be improved
    body = response.body
    if isinstance(response, HtmlResponse):
        if b'<base' not in body:
            repl = '<head><base href="%s">' % response.url
            # Limit the substitution to the first <head>; any later
            # occurrence of the literal is not the document head.
            body = body.replace(b'<head>', to_bytes(repl), 1)
        ext = '.html'
    elif isinstance(response, TextResponse):
        ext = '.txt'
    else:
        raise TypeError("Unsupported response type: %s" %
                        response.__class__.__name__)
    fd, fname = tempfile.mkstemp(ext)
    # Wrapping the descriptor guarantees it is closed when the write raises
    # and that the whole body is written rather than a possible short write.
    with os.fdopen(fd, 'wb') as tmp_file:
        tmp_file.write(body)
    return _openfunc("file://%s" % fname)
Example #8
Source File: datauri.py    From learn_python3_spider with MIT License 5 votes vote down vote up
def download_request(self, request, spider):
        """Build a response object from the data URI in ``request.url``.

        The response class is picked from the URI's media type.  An
        ``encoding`` argument (the URI's ``charset`` parameter, which may be
        ``None``) is passed only when that class is a ``TextResponse``
        subclass and the media type's top-level type is ``text``.
        """
        uri = parse_data_uri(request.url)
        respcls = responsetypes.from_mimetype(uri.media_type)

        is_text = (issubclass(respcls, TextResponse)
                   and uri.media_type.split('/')[0] == 'text')
        if not is_text:
            return respcls(url=request.url, body=uri.data)

        charset = uri.media_type_parameters.get('charset')
        return respcls(url=request.url, body=uri.data, encoding=charset)
Example #9
Source File: test_pagestorage.py    From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_save_response(self):
        """Exercise ``save_response`` with three kinds of response.

        With ``maxitemsize`` set to 10, a plain ``Response`` and a
        ``TextResponse`` with a 17-character body must not reach the writer,
        while a ``TextResponse`` with a 4-character body must be written
        together with its cookies.
        """
        self.instance._writer = mock.MagicMock()
        self.instance._writer.maxitemsize = 10

        # wrong response type
        plain_response = Response('http://resp', request=Request('http://req'))
        self.instance.save_response(plain_response, self.spider)
        assert not self.instance._writer.write.called

        # get request with large body
        oversized = TextResponse(
            'http://resp1',
            request=Request('http://req1'),
            body='looong loong body',
            encoding='cp1251',
        )
        self.instance.save_response(oversized, self.spider)
        assert not self.instance._writer.write.called

        # get request with ok-body
        self.instance.hsref = mock.Mock()
        self.instance.hsref.job.key = '123/45/67'
        cookie_headers = {'Set-Cookie': [b'coo1=test;abc=1',
                                         b'coo2=tes1;cbd=2']}
        accepted = TextResponse(
            'http://resp2',
            request=Request('http://req2'),
            body='body',
            encoding='cp1251',
            headers=cookie_headers,
        )
        self.instance.save_response(accepted, self.spider)
        expected_item = {
            'body': u'body',
            '_encoding': 'cp1251',
            '_type': '_pageitem',
            '_key': 'bad42100b1d34e29973a79e512aabb4db885b712',
            'cookies': ['coo1=test', 'coo2=tes1'],
            'url': 'http://resp2',
            '_jobid': '123/45/67',
        }
        self.instance._writer.write.assert_called_with(expected_item)
Example #10
Source File: scrapy_pagestorage.py    From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def save_response(self, response, spider):
        """Write a text response to ``self._writer`` as a ``_pageitem``.

        Anything that is not a ``TextResponse`` is ignored.  The item holds
        the request fingerprint, the job key, the response encoding and URL,
        whatever ``self._set_cookies`` adds, the parsed form data for POST
        requests, and the body (stripped of surrounding spaces, CR/LF and NUL
        characters when ``self.trim_html`` is set).

        A body longer than ``self._writer.maxitemsize``, or a
        ``ValueTooLarge`` raised by the writer, is logged as a warning on the
        spider's logger instead of being saved.
        """
        if not isinstance(response, TextResponse):
            return

        payload = {
            "_key": request_fingerprint(response.request),
            "_jobid": self.hsref.job.key,
            "_type": "_pageitem",
            "_encoding": response.encoding,
            "url": response.url,
        }
        self._set_cookies(payload, response)

        if response.request.method == 'POST':
            form_pairs = parse_qsl(response.request.body.decode())
            payload["postdata"] = dict(form_pairs)

        page_body = response.body_as_unicode()
        if self.trim_html:
            page_body = page_body.strip(' \r\n\0')
        payload["body"] = page_body

        if len(page_body) > self._writer.maxitemsize:
            spider.logger.warning("Page not saved, body too large: <%s>" %
                                  response.url)
            return

        try:
            self._writer.write(payload)
        except ValueTooLarge as exc:
            spider.logger.warning("Page not saved, %s: <%s>" %
                                  (exc, response.url))
Example #11
Source File: middlewares.py    From hq-proxies with MIT License 5 votes vote down vote up
def process_exception(self, request, exception, spider):
        """Turn a non-retryable download error into a ``TextResponse``.

        When ``exception`` is an instance of ``self.DONT_RETRY_ERRORS`` a
        ``TextResponse`` built only from ``request.meta['proxy']`` (used as
        its URL) is returned; for any other exception the result is ``None``.
        """
        if not isinstance(exception, self.DONT_RETRY_ERRORS):
            return None
        proxy_url = request.meta['proxy']
        return TextResponse(url=proxy_url)
Example #12
Source File: datauri.py    From learn_python3_spider with MIT License 5 votes vote down vote up
def download_request(self, request, spider):
        """Turn the data URI held in ``request.url`` into a response.

        ``responsetypes.from_mimetype`` selects the response class from the
        URI's media type.  Only for ``TextResponse`` subclasses with a
        ``text/...`` media type is the URI's ``charset`` parameter forwarded
        as ``encoding`` (``None`` when the parameter is absent).
        """
        data_uri = parse_data_uri(request.url)
        media_type = data_uri.media_type
        respcls = responsetypes.from_mimetype(media_type)

        extra_args = {}
        if issubclass(respcls, TextResponse):
            top_level_type = media_type.partition('/')[0]
            if top_level_type == 'text':
                extra_args['encoding'] = \
                    data_uri.media_type_parameters.get('charset')

        return respcls(url=request.url, body=data_uri.data, **extra_args)
Example #13
Source File: test_utils.py    From scrapy-poet with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, response: TextResponse):
        """Store ``response`` on the instance as ``self.response``."""
        self.response = response
Example #14
Source File: test_utils.py    From scrapy-poet with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_is_provider_using_response():
    """Check ``is_provider_using_response`` against each sample provider."""
    expectations = (
        (PageObjectInputProvider, False),
        (ResponseDataProvider, True),
        # TextProductProvider wrongly annotates response dependency as
        # TextResponse, instead of using the Response type.
        (TextProductProvider, False),
        (DummyProductProvider, False),
        (FakeProductProvider, False),
        (StringProductProvider, False),
    )
    for provider_cls, expected in expectations:
        assert is_provider_using_response(provider_cls) is expected
Example #15
Source File: test_utils.py    From scrapy-poet with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def parse12(self, response: TextResponse, book_page: DummyProductPage):
        # Empty stub: the body does nothing, so only the annotated signature
        # carries information.
        # NOTE(review): presumably a fixture whose signature is inspected by
        # the tests -- confirm against the callers.
        pass
Example #16
Source File: test_utils.py    From scrapy-poet with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def parse11(self, response: TextResponse):
        # Empty stub: the body does nothing, so only the annotated signature
        # carries information.
        # NOTE(review): presumably a fixture whose signature is inspected by
        # the tests -- confirm against the callers.
        pass
Example #17
Source File: iterators.py    From learn_python3_spider with MIT License 4 votes vote down vote up
def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
    """Yield one dictionary per data row of the given csv object.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    delimiter is the character that separates fields in obj.

    headers is an iterable of keys for the yielded dictionaries; when it
    is not provided, the first row of obj supplies them.

    encoding is used to convert fields to unicode.  It defaults to utf-8
    and is replaced by the response's own encoding when obj is a
    TextResponse.

    quotechar is the character used to enclose fields in obj.

    Rows whose length differs from the number of headers are skipped and
    reported with a warning.
    """

    if isinstance(obj, TextResponse):
        encoding = obj.encoding
    else:
        encoding = encoding or 'utf-8'

    def decode_fields(fields):
        return [to_unicode(field, encoding) for field in fields]

    # Python 3 csv reader input object needs to return strings
    if six.PY3:
        lines = StringIO(_body_or_str(obj, unicode=True))
    else:
        lines = BytesIO(_body_or_str(obj, unicode=False))

    # Only options that were actually supplied are forwarded to csv.reader.
    reader_options = {}
    if delimiter:
        reader_options["delimiter"] = delimiter
    if quotechar:
        reader_options["quotechar"] = quotechar
    reader = csv.reader(lines, **reader_options)

    if not headers:
        try:
            first_row = next(reader)
        except StopIteration:
            # Empty input: there is nothing to yield.
            return
        headers = decode_fields(first_row)

    for raw_row in reader:
        fields = decode_fields(raw_row)
        if len(fields) == len(headers):
            yield dict(zip(headers, fields))
            continue
        logger.warning("ignoring row %(csvlnum)d (length: %(csvrow)d, "
                       "should be: %(csvheader)d)",
                       {'csvlnum': reader.line_num, 'csvrow': len(fields),
                        'csvheader': len(headers)})
Example #18
Source File: collector.py    From collectors with MIT License 4 votes vote down vote up
def collect(conf, conn):
    """Collect ICD-XX-CM conditions.

    Downloads the 2016 ICD-10-CM code-tables archive, reads ``Tabular.xml``
    from it and, for each selected ``<diag>`` element, creates a record with
    ``Record.create`` and writes it with ``record.write(conf, conn)``.

    ``conf`` and ``conn`` are passed through unchanged to ``record.write``.
    Progress is logged after every 100 written records.
    """

    # For more information see:
    # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-CM-and-GEMs.html
    URL = 'https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-CM-Code-Tables-and-Index.zip'
    FILE = 'Tabular.xml'
    VERSION = 'ICD-10-CM'
    LAST_UPDATED = '2015-10-01'

    # Prepare xml
    # The local is not called ``zip`` so the builtin stays usable, and both
    # the archive and the member handle are closed once the XML is read.
    archive_bytes = requests.get(URL).content
    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
        with archive.open(FILE) as member:
            xml = member.read()
    res = TextResponse(url=URL, body=xml, encoding='utf-8')

    count = 0
    for diag in res.xpath('//diag'):
        # We need only leafs
        # NOTE(review): the condition below skips <diag> elements that have
        # NO child <diag>, i.e. it skips the leaves, which contradicts the
        # comment above.  Behaviour is left unchanged -- confirm the intent.
        childs = diag.xpath('./diag')
        if not childs:
            continue

        # Get data
        data = {
            'name': diag.xpath('./name/text()').extract_first(),
            'desc': diag.xpath('./desc/text()').extract_first(),
            'terms': diag.xpath('.//note/text()').extract(),
            'version': VERSION,
            'last_updated': LAST_UPDATED,
        }

        # Create record
        record = Record.create(URL, data)

        # Write record
        record.write(conf, conn)

        # Log info
        count += 1
        if not count % 100:
            logger.info('Collected %s "%s" conditions', count, record.table)