Python tldextract.TLDExtract() Examples

The following are 13 code examples of tldextract.TLDExtract(). You can vote up the examples you like or vote down the ones you don't, and follow the link above each example to visit the original project or source file. You may also want to check out all available functions and classes of the tldextract module, or try the search function.
Example #1
Source File: action_handler.py    From scrapy-cluster with MIT License 6 votes vote down vote up
def setup(self, settings):
        '''
        Configure the tld extractor and open the Redis connection.

        Exits the process if Redis is unreachable, since this plugin is
        required for the cluster to function.
        '''
        self.extract = tldextract.TLDExtract()
        conn_kwargs = {
            'host': settings['REDIS_HOST'],
            'port': settings['REDIS_PORT'],
            'db': settings.get('REDIS_DB'),
        }
        self.redis_conn = redis.Redis(**conn_kwargs)

        try:
            self.redis_conn.info()
        except ConnectionError:
            self.logger.error("Failed to connect to Redis in ActionHandler")
            # plugin is essential to functionality
            sys.exit(1)
        else:
            self.logger.debug("Connected to Redis in ActionHandler")
Example #2
Source File: zookeeper_handler.py    From scrapy-cluster with MIT License 6 votes vote down vote up
def setup(self, settings):
        '''
        Configure the tld extractor and open the Redis connection.

        Exits the process if Redis is unreachable, since this plugin is
        required for the cluster to function.
        '''
        self.extract = tldextract.TLDExtract()
        conn_kwargs = {
            'host': settings['REDIS_HOST'],
            'port': settings['REDIS_PORT'],
            'db': settings.get('REDIS_DB'),
        }
        self.redis_conn = redis.Redis(**conn_kwargs)

        try:
            self.redis_conn.info()
        except ConnectionError:
            self.logger.error("Failed to connect to Redis in ZookeeperHandler")
            # plugin is essential to functionality
            sys.exit(1)
        else:
            self.logger.debug("Connected to Redis in ZookeeperHandler")
Example #3
Source File: scraper_handler.py    From scrapy-cluster with MIT License 6 votes vote down vote up
def setup(self, settings):
        '''
        Configure the tld extractor and open the Redis connection.

        Exits the process if Redis is unreachable, since this plugin is
        required for the cluster to function.
        '''
        self.extract = tldextract.TLDExtract()
        conn_kwargs = {
            'host': settings['REDIS_HOST'],
            'port': settings['REDIS_PORT'],
            'db': settings.get('REDIS_DB'),
        }
        self.redis_conn = redis.Redis(**conn_kwargs)

        try:
            self.redis_conn.info()
        except ConnectionError:
            self.logger.error("Failed to connect to Redis in ScraperHandler")
            # plugin is essential to functionality
            sys.exit(1)
        else:
            self.logger.debug("Connected to Redis in ScraperHandler")
Example #4
Source File: domain.py    From OneForAll with GNU General Public License v3.0 6 votes vote down vote up
def extract(self):
        """
        Extract the subdomain/domain/suffix parts of this object's domain.

        >>> d = Domain('www.example.com')
        <domain.Domain object>
        >>> d.extract()
        ExtractResult(subdomain='www', domain='example', suffix='com')

        :return: extracted domain results, or None when nothing matched
        """
        # cache the public suffix list under the data storage directory
        cache_path = setting.data_storage_dir.joinpath('public_suffix_list.dat')
        matched = self.match()
        if not matched:
            return None
        return tldextract.TLDExtract(cache_path)(matched)
Example #5
Source File: url_checker.py    From YaYaGen with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def __init__(self, jconfig, vtapikey):
        """
        Load the DOMAINS_WHITELIST and set up the tld-extractor.

        :param jconfig: configuration mapping; reads 'DOMAINS_WHITELIST'
            (path to a pickled whitelist) and 'TOP_DOMAINS_CACHE'
            (tldextract cache file path)
        :param vtapikey: VirusTotal API key, stored on the class for later use
        """
        try:
            with open(jconfig['DOMAINS_WHITELIST'], 'rb') as f_in:
                UrlChecker.whitelist = pickle.load(f_in)
        # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; any load failure falls back to an empty whitelist
        except Exception:
            log.error("URL whitelist loading error")
            UrlChecker.whitelist = list()
        cache_file = jconfig['TOP_DOMAINS_CACHE']
        UrlChecker.__tld = tldextract.TLDExtract(cache_file=cache_file)
        UrlChecker.__vtapikey = vtapikey
Example #6
Source File: test_plugins.py    From scrapy-cluster with MIT License 5 votes vote down vote up
def test_scrape_handler(self):
        """
        Verify ScraperHandler.handle() pushes valid requests into Redis
        (via zadd) and sets an expiry key when 'expires' is supplied.
        """
        valid = {
            "url": "www.stuff.com",
            "crawlid": "abc124",
            "appid": "testapp",
            "spiderid": "link",
            "priority": 5,
        }
        handler = ScraperHandler()
        handler.extract = tldextract.TLDExtract()
        handler.redis_conn = MagicMock()

        # check it is added to redis
        handler.redis_conn.zadd = MagicMock(side_effect=AssertionError("added"))
        try:
            handler.handle(valid)
            self.fail("Action not called")
        except AssertionError as e:
            # assertEquals was removed in Python 3.12; use assertEqual
            self.assertEqual("added", str(e))

        # check timeout is added
        handler.redis_conn.zadd = MagicMock()
        handler.redis_conn.set = MagicMock(side_effect=AssertionError("expires"))
        valid['expires'] = 124242
        try:
            handler.handle(valid)
            self.fail("Expires not called")
        except AssertionError as e:
            self.assertEqual("expires", str(e))
Example #7
Source File: distributed_scheduler.py    From scrapy-cluster with MIT License 5 votes vote down vote up
def __init__(self, server, persist, update_int, timeout, retries, logger,
                 hits, window, mod, ip_refresh, add_type, add_ip, ip_regex,
                 backlog_blacklist, queue_timeout):
        '''
        Initialize the scheduler state from the supplied settings.
        '''
        # core connection and logging handles
        self.redis_conn = server
        self.logger = logger

        # queue behavior
        self.persist = persist
        self.queue_dict = {}
        self.update_interval = update_int
        self.queue_timeout = queue_timeout
        self.backlog_blacklist = backlog_blacklist

        # throttle window settings
        self.hits = hits
        self.window = window
        self.moderated = mod

        # request bookkeeping
        self.rfp_timeout = timeout
        self.item_retires = retries  # NOTE: attribute name keeps the upstream spelling

        # ip address / request-type handling
        self.ip_update_interval = ip_refresh
        self.add_type = add_type
        self.add_ip = add_ip
        self.ip_regex = re.compile(ip_regex)

        # set up tldextract
        self.extract = tldextract.TLDExtract()

        self.update_ipaddress()

        # if we need better uuid's mod this line
        self.my_uuid = str(uuid.uuid4()).rsplit('-', 1)[-1]
Example #8
Source File: utils.py    From yeti with Apache License 2.0 5 votes vote down vote up
def tldextract_parser(url):
    """Parse *url* with a TLDExtract built from tld_extract_dict.

    Returns the extraction result, or None (with the error logged) if
    anything goes wrong.
    """
    try:
        extractor = TLDExtract(**tld_extract_dict)
        return extractor(url)
    except Exception as e:
        logging.error(e)
        return None
Example #9
Source File: web_utils.py    From luscan-devel with GNU General Public License v2.0 5 votes vote down vote up
def split_hostname(hostname):
    """
    Split a hostname into its subdomain, domain and TLD components.

    For example:

    >>> from golismero.api.net.web_utils import ParsedURL
    >>> d = ParsedURL("http://www.example.com/")
    >>> d.split_hostname()
    ('www', 'example', 'com')
    >>> d = ParsedURL("http://some.subdomain.of.example.co.uk/")
    >>> d.split_hostname()
    ('some.subdomain.of', 'example', 'co.uk')
    >>> '.'.join(d.split_hostname())
    'some.subdomain.of.example.co.uk'

    :param hostname: Hostname to split.
    :type hostname: str

    :returns: Subdomain, domain and TLD.
    :rtype: tuple(str, str, str)
    """
    # fetch=False keeps the suffix list lookup fully offline
    extractor = TLDExtract(fetch=False)
    parts = extractor(to_utf8(hostname))
    return (parts.subdomain, parts.domain, parts.suffix)


#------------------------------------------------------------------------------ 
Example #10
Source File: __init__.py    From recipe-scrapers with MIT License 5 votes vote down vote up
def get_domain(url):
    """Return the registered domain of *url* as 'domain.suffix'."""
    # suffix_list_urls=None keeps extraction offline; local renamed so it
    # no longer shadows the tldextract module name
    extractor = TLDExtract(suffix_list_urls=None)
    parts = extractor(url)
    return "{}.{}".format(parts.domain, parts.suffix)
Example #11
Source File: domaintools.py    From metadoc with MIT License 5 votes vote down vote up
def get_domain(self, url):
    """Store the registered domain ('domain.suffix') of *url* on self.domain."""
    # suffix_list_urls=None disables any network fetch of the suffix list
    extractor = tldextract.TLDExtract(suffix_list_urls=None)
    parts = extractor(url)
    self.domain = "{}.{}".format(parts.domain, parts.suffix)
Example #12
Source File: client.py    From lexicon with MIT License 4 votes vote down vote up
def __init__(self, config=None):
        """
        Build a lexicon Client from the given configuration.

        :param config: either None (non-interactive config is resolved),
            a legacy dict-like config (wrapped via legacy_config_resolver),
            or a ConfigResolver used as-is.
        """
        if not config:
            # If there is not config specified, we load a non-interactive configuration.
            self.config = non_interactive_config_resolver()
        elif not isinstance(config, ConfigResolver):
            # If config is not a ConfigResolver, we are in a legacy situation.
            # We protect this part of the Client API.
            self.config = legacy_config_resolver(config)
        else:
            self.config = config

        # Validate configuration
        self._validate_config()

        runtime_config = {}

        # Process domain, strip subdomain
        # NOTE(review): cache_file was replaced by cache_dir in newer
        # tldextract releases — this relies on an older version; confirm pin.
        domain_extractor = tldextract.TLDExtract(cache_file=TLDEXTRACT_CACHE_FILE,
                                                 include_psl_private_domains=True)
        domain_parts = domain_extractor(
            self.config.resolve('lexicon:domain'))
        # registered domain = second-level domain + public suffix
        runtime_config['domain'] = '{0}.{1}'.format(
            domain_parts.domain, domain_parts.suffix)

        if self.config.resolve('lexicon:delegated'):
            # handle delegated domain
            delegated = self.config.resolve('lexicon:delegated').rstrip('.')
            if delegated != runtime_config.get('domain'):
                # convert to relative name
                # NOTE(review): plain endswith() would also match
                # 'notmydomain.com' for domain 'mydomain.com'; a
                # '.'-prefixed check may be intended — verify upstream.
                if delegated.endswith(runtime_config.get('domain')):
                    delegated = delegated[:-len(runtime_config.get('domain'))]
                    delegated = delegated.rstrip('.')
                # update domain
                runtime_config['domain'] = '{0}.{1}'.format(
                    delegated, runtime_config.get('domain'))

        self.action = self.config.resolve('lexicon:action')
        self.provider_name = (self.config.resolve('lexicon:provider_name')
                              or self.config.resolve('lexicon:provider'))

        # Prepend runtime values so they take precedence over other sources
        self.config.add_config_source(DictConfigSource(runtime_config), 0)

        # Dynamically load the provider module named by the configuration
        provider_module = importlib.import_module(
            'lexicon.providers.' + self.provider_name)
        provider_class = getattr(provider_module, 'Provider')
        self.provider = provider_class(self.config)
Example #13
Source File: BTG.py    From BTG with GNU General Public License v3.0 4 votes vote down vote up
def extend_IOC(self, argument, observable_list):
        """
        Extend an URL IOC into URL + DOMAIN + IP observables.

        :param argument: raw URL/IOC string to expand
        :param observable_list: list extended in place with any new
            domains and resolved IPs (duplicates are skipped)
        """
        # `extract` stays None when offline and no TLDE_cache is configured;
        # the original code left it unbound in that case (NameError was
        # silently swallowed by bare excepts).
        extract = None
        if config['offline']:
            # Cache search
            # TODO
            if "TLDE_cache" in config:
                cache_file = "%s%s" % (config['temporary_cache_path'], config['TLDE_cache'])
                cache_extract = tldextract.TLDExtract(cache_file=cache_file)
                extract = cache_extract(argument)
        else:
            # Live search
            no_cache_extract = tldextract.TLDExtract(cache_file=False)
            extract = no_cache_extract(argument)

        if extract is not None:
            # `or None` normalizes empty-string parts so junk observables
            # like "" are never appended
            registered_domain = extract.registered_domain or None
            suffix_domain = extract.suffix or None
            complete_domain = '.'.join(part for part in extract if part) or None
        else:
            registered_domain = suffix_domain = complete_domain = None
        domains = [registered_domain, suffix_domain, complete_domain]

        IPs = []
        if not config["offline"]:
            for domain in domains:
                if domain is None:
                    continue
                try:
                    IP = socket.gethostbyname(domain)
                # narrowed from bare except: DNS failures and malformed
                # names are expected; anything else should surface
                except (socket.error, UnicodeError):
                    IP = None
                IPs.append(IP)

        for domain in domains:
            if domain is not None and domain not in observable_list:
                observable_list.append(domain)
        for IP in IPs:
            if IP is not None and IP not in observable_list:
                observable_list.append(IP)