Python BeautifulSoup.BeautifulSoup() Examples

The following are 30 code examples of BeautifulSoup.BeautifulSoup(). The original project and source file are noted above each example. You may also want to check out the other available functions and classes of the BeautifulSoup module.
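
For context, these examples use the legacy BeautifulSoup 3 API, where the parser is imported and called as BeautifulSoup.BeautifulSoup(); the newer bs4 package replaces this with bs4.BeautifulSoup. A minimal sketch of typical usage, assuming BeautifulSoup 3 is installed and using illustrative markup, looks like this:

from BeautifulSoup import BeautifulSoup

html = '<html><head><title>Demo</title></head>' \
       '<body><a href="http://example.com">a link</a></body></html>'

# BeautifulSoup 3 parses the raw markup string passed to the constructor.
soup = BeautifulSoup(html)

# Navigate the parse tree: the <title> text, then every anchor's href attribute.
print(soup.title.string)
for anchor in soup.findAll('a', href=True):
    print(anchor['href'])

The examples that follow combine this same interface (find, findAll, attribute access via tag['attr'], renderContents, prettify) with project-specific request and parsing logic.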
Example #1
Source File: crawl.py    From oxidizr with GNU General Public License v2.0
def extract_context(html, url):
    soup = BeautifulSoup(html)
    # Insert into Content (under this domain)
    texts = soup.findAll(text=True)
    try:
        Content.objects.create(
            url=url,
            title=soup.title.string,
            summary=helpers.strip_tags(" \n".join(filter(visible, texts)))[:4000],
            last_crawled_at=datetime.datetime.utcnow().replace(tzinfo=pytz.utc)
        )
    except IntegrityError:
        print('%s - already existed in Content' % url)
    soup.prettify()
    return [str(anchor['href'])
            for anchor in soup.findAll('a', attrs={'href': re.compile("^http://")}) if anchor['href']] 
Example #2
Source File: bot.py    From ogame-bot with BSD 2-Clause "Simplified" License
def update_planet_fleet(self, planet):
        resp = self.br.open(self._get_url('fleet', planet))
        soup = BeautifulSoup(resp)
        ships = {}
        for k, v in self.SHIPS.iteritems():
            available = 0
            try:
                s = soup.find(id='button' + v)
                available = int(s.find('span', 'textlabel').nextSibling.replace('.', ''))
            except:
                available = 0
            ships[k] = available

        #self.logger.info('Updating %s fleet' % planet)
        #self.logger.info('%s' % fleet)
        planet.ships = ships 
Example #3
Source File: sanitize_html.py    From anytask with MIT License
def sanitize_html(value):
    valid_tags = ALLOWED_TAGS.split()
    valid_attrs = 'href src target alt'.split()

    if not value:
        return ''

    soup = BeautifulSoup(value)

    if not (soup.find('div', 'not-sanitize')):
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        for tag in soup.findAll(True):
            if tag.name not in valid_tags:
                tag.hidden = True
            tag.attrs = [(attr, val) for attr, val in tag.attrs
                         if attr in valid_attrs]
        return '<p>' + soup.renderContents().decode('utf8').replace('javascript:', '').replace("\n", '</p><p>') + '</p>'
    return soup.renderContents().decode('utf8') 
Example #4
Source File: urlvoid.py    From omnibus with MIT License
def run(self):
        url = 'http://urlvoid.com/scan/%s/' % self.artifact['name']

        try:
            status, response = get(url, headers=self.headers)

            if status:
                data = BeautifulSoup(response.text)

                if data.findAll('div', attrs={'class': 'bs-callout bs-callout-info'}):
                    pass

                elif data.findAll('div', attrs={'class': 'bs-callout bs-callout-warning'}):
                    self.artifact['data']['urlvoid'] = {}
                    for each in data.findAll('img', alt='Alert'):
                        site = each.parent.parent.td.text.lstrip()
                        url = each.parent.a['href']
                        self.artifact['data']['urlvoid'][site] = url

        except Exception as err:
            warning('Caught exception in module (%s)' % str(err)) 
Example #5
Source File: get_proxy.py    From get_proxy with GNU Affero General Public License v3.0
def parse_checkerproxy(self, html):
        ''' Only get elite proxies from checkerproxy '''
        ips = []
        soup = BeautifulSoup(html)
        for tr in soup.findAll('tr'):
            if len(tr) == 19:
                ip_found = False
                elite = False
                ip_port = None
                tds = tr.findAll('td')
                for td in tds:
                    if ':' in td.text:
                        ip_found = True
                        ip_port_re = re.match('(\d{1,3}\.){3}\d{1,3}:\d{1,5}', td.text)
                        if ip_port_re:
                            ip_port = ip_port_re.group()
                        if not ip_port:
                            ip_found = False
                    if 'Elite' in td.text:
                        elite = True
                    if ip_found == True and elite == True:
                        ips.append(str(ip_port))
                        break
        return ips 
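
The IP:port detection above hinges on the re.match pattern. As a quick standalone check of that same regex (written here as a raw string; the sample address is illustrative):

import re

# re.match anchors at the start of the string, so the port digits end the match.
ip_port_re = re.match(r'(\d{1,3}\.){3}\d{1,3}:\d{1,5}', '203.0.113.7:8080 Elite proxy')
print(ip_port_re.group())  # prints: 203.0.113.7:8080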
Example #6
Source File: ipvoid.py    From omnibus with MIT License
def run(self):
        url = 'http://www.ipvoid.com/scan/%s/' % self.artifact['name']

        try:
            status, response = get(url, headers=self.headers)

            if status:
                data = BeautifulSoup(response.text)

                if data.findAll('span', attrs={'class': 'label label-success'}):
                    pass

                elif data.findAll('span', attrs={'class': 'label label-danger'}):
                    for each in data.findAll('img', alt='Alert'):
                        site = each.parent.parent.td.text.lstrip()
                        url = each.parent.a['href']
                        self.artifact['data']['ipvoid'] = {site: url}
        except Exception as err:
            warning('Caught exception in module (%s)' % str(err)) 
Example #7
Source File: log_download.py    From IRCLogParser with GNU General Public License v3.0
def ubuntu_url(start_date, end_date):
    """
    Args:
        start_date (date object): Starting date from which logs need to be fetched 
        end_date (date object) : Last date for which logs need to be fetched
    Returns:
        Yields channel name, current_date, and url at which log for returned
        channel and current_date is present.
    """
    
    for current_date in rrule(freq=DAILY, dtstart=start_date, until=end_date):
        url = UBUNTU_ENDPOINT.format(current_date.year,month=current_date.month, day=current_date.day)
        
        r = send_request(url)
        soup = BeautifulSoup(r)
        links = soup.findAll(href=re.compile(".txt"))
        
        for link in links:
            channel = link.string
            channel_ = channel[1:]
            
            yield channel, current_date, UBUNTU_CHANNEL_ENDPOINT.format(current_date.year, month=current_date.month, day=current_date.day, channel=channel_) 
Example #8
Source File: bot.py    From ogame-bot with BSD 2-Clause "Simplified" License
def get_player_status(self, destination, origin_planet=None):
        if not destination:
            return
            
        status = {}
        origin_planet = origin_planet or self.get_closest_planet(destination)
        galaxy, system, position = destination.split(':')

        url = self._get_url('galaxyCnt', origin_planet)
        data = urlencode({'galaxy': galaxy, 'system': system})
        resp = self.br.open(url, data=data)
        soup = BeautifulSoup(resp)

        soup.find(id='galaxytable')
        planets = soup.findAll('tr', {'class': 'row'})
        target_planet = planets[int(position)-1]
        name_el = target_planet.find('td', 'playername')
        status['name'] = name_el.find('span').text

        status['inactive'] = 'inactive' in name_el.get('class', '')
        return status 
Example #9
Source File: service.py    From xbmc-betaseries with GNU General Public License v2.0
def get_soup(content):
    # check if page content can be used
    pattern = "subtitles from the source! - Addic7ed.com"
    try:
        soup = BeautifulSoup(content)
        title = str(soup.findAll("title")[0])
        if title.find(pattern) > -1:
            return soup
        else:
            log("bad page, maybe index after 404")
            return False
    except:
        log("badly formatted content")
        if self_notify:
            xbmc.executebuiltin((u'Notification(%s,%s,%s,%s)' % (__addonname__, __language__(30009), 750, __icon__)).encode('utf-8', 'ignore'))
        return False 
Example #10
Source File: MrJattParser.py    From song-cli with MIT License
def list_of_all_href(self,html):
		'''
		It will return all hyper links found in the mr-jatt page for download
		'''	
		soup=BeautifulSoup(html)
		links=[]
		a_list=soup.findAll('a','touch')
		for x in xrange(len(a_list)-1):
			link = a_list[x].get('href')
			name = a_list[x]
			name = str(name)
			name=re.sub(r'<a.*/>|<span.*">|</span>|</a>|<a.*html">|<font.*">|</font>','',name)
			name=re.sub(r'^[0-9]+\.','',name)
			links.append([link,name])

		#quit()
		return links 
Example #11
Source File: wallbase.py    From spider with BSD 2-Clause "Simplified" License
def spider_image(url):
    user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers = headers)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup(response.read())
    result = {}

    img = soup.findAll("img", attrs={"class": re.compile("wall")}, limit=1)
    result['url'] = img[0]['src']
    
    for size in soup.findAll("div", "l1"):
        if size.parent.name == "a":
            result['size'] = size.text
            break
    
    return result 
Example #12
Source File: GoogleParser.py    From song-cli with MIT License
def parse_google(self,html):
		'''It will parse google html response
			and return the first url
		 '''
		soup = BeautifulSoup(html)
		href=soup.find('div','g').find('a').get('href')
		href_list=href.split('&')
		download_url=href_list[0]
		download_url=download_url.strip()
		download_url=download_url.replace('/url?q=','')
		return download_url 
Example #13
Source File: service.py    From xbmc-betaseries with GNU General Public License v2.0
def get_soup(content):
    # check if page content can be used
    pattern = "TVsubtitles.net - "
    try:
        soup = BeautifulSoup(content)
        title = str(soup.findAll("title")[0])
        if title.find(pattern) > -1:
            return soup
        else:
            log("bad page, maybe index after 404")
            return False
    except:
        log("badly formatted content")
        if self_notify:
            xbmc.executebuiltin((u'Notification(%s,%s,%s,%s)' % (__addonname__, __language__(30009), 750, __icon__)).encode('utf-8', 'ignore'))
        return False 
Example #14
Source File: Old_regression.py    From AILearners with Apache License 2.0
def scrapePage(inFile,outFile,yr,numPce,origPrc):
    from BeautifulSoup import BeautifulSoup
    fr = open(inFile); fw=open(outFile,'a') #a is append mode writing
    soup = BeautifulSoup(fr.read())
    i=1
    currentRow = soup.findAll('table', r="%d" % i)
    while(len(currentRow)!=0):
        currentRow = soup.findAll('table', r="%d" % i)
        title = currentRow[0].findAll('a')[1].text
        lwrTitle = title.lower()
        if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
            newFlag = 1.0
        else:
            newFlag = 0.0
        soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
        if len(soldUnicde)==0:
            print "item #%d did not sell" % i
        else:
            soldPrice = currentRow[0].findAll('td')[4]
            priceStr = soldPrice.text
            priceStr = priceStr.replace('$','') #strips out $
            priceStr = priceStr.replace(',','') #strips out ,
            if len(soldPrice)>1:
                priceStr = priceStr.replace('Free shipping', '') #strips out Free Shipping
            print "%s\t%d\t%s" % (priceStr,newFlag,title)
            fw.write("%d\t%d\t%d\t%f\t%s\n" % (yr,numPce,newFlag,origPrc,priceStr))
        i += 1
        currentRow = soup.findAll('table', r="%d" % i)
    fw.close() 
Example #15
Source File: bot.py    From ogame-bot with BSD 2-Clause "Simplified" License
def login(self, username=None, password=None):
        username = username or self.username
        password = password or self.password
        
        try:
            resp = self.br.open(self.MAIN_URL, timeout=10)
            soup = BeautifulSoup(resp)
        except:
            return False
        
        alert = soup.find(id='attack_alert')

        # no redirect on main page == user logged in
        if resp.geturl() != self.BASE_URL and alert:
            self.logged_in = True
            self.logger.info('Logged as: %s' % username)
            return True
        
        self.logger.info('Logging in..')
        self.br.select_form(name='loginForm')
        self.br.form['uni'] = ['s%s-pl.ogame.gameforge.com' % self.uni]
        self.br.form['login'] = username
        self.br.form['pass'] = password
        self.br.submit()

        if self.br.geturl().startswith(self.MAIN_URL):
            self.logged_in = True
            self.logger.info('Logged as: %s' % username)
            return True
        else:
            self.logged_in = False
            self.logger.error('Login failed!')
            return False 
Example #16
Source File: Bankmail.py    From lokun-record with GNU Affero General Public License v3.0
def isb_parse(self, payload):
        """Base64 encoded and missing the plaintext part of the message"""
        p = BeautifulSoup(b64decode(payload))
        amounttable = p.find('table').fetchNextSiblings()[5].find("tr")
        amount = int(re.sub(r"[^0-9]", "", amounttable.td.text))    
        skyring = amounttable.fetchNextSiblings()[0].td.text.encode("utf-8")
        return skyring, amount, "isb" 
Example #17
Source File: soup.py    From nzb-subliminal with GNU General Public License v3.0
def fragmentClass(self):
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None) 
Example #18
Source File: soup.py    From nzb-subliminal with GNU General Public License v3.0
def elementClass(self, name, namespace):
        if namespace is not None:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        return Element(Tag(self.soup, name), self.soup, namespace) 
Example #19
Source File: soup.py    From nzb-subliminal with GNU General Public License v3.0
def documentClass(self):
        self.soup = BeautifulSoup("")
        return Element(self.soup, self.soup, None) 
Example #20
Source File: bot.py    From ogame-bot with BSD 2-Clause "Simplified" License
def fetch_planets(self):
        self.logger.info('Fetching planets..')
        resp = self.br.open(self.PAGES['main']).read()

        self.calc_time(resp)

        soup = BeautifulSoup(resp)
        self.planets = []
        self.moons = []

        try:
            for i, c in enumerate(soup.findAll('a', 'planetlink')):
                name = c.find('span', 'planet-name').text
                coords = c.find('span', 'planet-koords').text[1:-1]
                url = c.get('href')
                p_id = int(c.parent.get('id').split('-')[1])
                construct_mode = len(c.parent.findAll('a', 'constructionIcon')) != 0
                p = Planet(p_id, name, coords, url, construct_mode)
                if i == 0:
                    p.mother = True
                self.planets.append(p)

                #check if planet has moon
                moon = c.parent.find('a', 'moonlink')
                if moon and 'moonlink' in moon['class']:
                    url = moon.get('href')
                    m_id = url.split('cp=')[1]
                    m = Moon(m_id, coords, url)
                    self.moons.append(m)
        except:
            self.logger.exception('Exception while fetching planets')
        else:
            self.check_attacks(soup) 
Example #21
Source File: wallbase.py    From spider with BSD 2-Clause "Simplified" License
def spider(url):
    user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers = headers)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup(response.read())
    results = []
    
    for thumb in soup.findAll("img", attrs={"class": re.compile("file"), "data-original": True}):
        thumb_parent = thumb.findParent("a")
        #print thumb_parent['href']
        result = spider_image(thumb_parent['href'])
        results.append(result)

    return results 
Example #22
Source File: soup.py    From nzb-subliminal with GNU General Public License v3.0
def __init__(self, namespaceHTMLElements):
        if namespaceHTMLElements:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        _base.TreeBuilder.__init__(self, namespaceHTMLElements) 
Example #23
Source File: soup.py    From nzb-subliminal with GNU General Public License v3.0
def getNodeDetails(self, node):
        if isinstance(node, BeautifulSoup): # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif isinstance(node, Declaration): # DocumentType
            string = unicode(node.string)
            #Slice needed to remove markup added during unicode conversion,
            #but only in some versions of BeautifulSoup/Python
            if string.startswith('<!') and string.endswith('>'):
                string = string[2:-1]
            m = self.doctype_regexp.match(string)
            #This regexp approach seems wrong and fragile,
            #but BeautifulSoup stores the doctype as a single string and we want the separate bits.
            #It should work as long as the tree was created by html5lib itself, but may be wrong if it
            #has been modified at all.
            #We could just feed it to an html5lib tokenizer, I guess...
            assert m is not None, "DOCTYPE did not match expected format"

            name = m.group('name')
            publicId = m.group('publicId')
            if publicId is not None:
                systemId = m.group('systemId1')
            else:
                systemId = m.group('systemId2')
            return _base.DOCTYPE, name, publicId or "", systemId or ""

        elif isinstance(node, Comment):
            string = unicode(node.string)
            if string.startswith('<!--') and string.endswith('-->'):
                string = string[4:-3]
            return _base.COMMENT, string

        elif isinstance(node, unicode): # TextNode
            return _base.TEXT, node

        elif isinstance(node, Tag): # Element
            return (_base.ELEMENT, namespaces["html"], node.name,
                    dict(node.attrs).items(), node.contents)
        else:
            return _base.UNKNOWN, node.__class__.__name__ 
Example #24
Source File: browser.py    From Hatkey with GNU General Public License v3.0
def get_soup(self):
        """Returns beautiful soup of the current document."""
        import BeautifulSoup
        return BeautifulSoup.BeautifulSoup(self.data) 
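
Example #24 uses the module-qualified form of the call; most of the other examples import the class directly. Both styles refer to the same BeautifulSoup 3 class, as this small sketch with throwaway markup shows:

# Style used in Example #24: import the module and qualify the class.
import BeautifulSoup
soup = BeautifulSoup.BeautifulSoup('<p>hello</p>')

# Style used in most other examples: import the class itself.
# Note that this rebinds the name BeautifulSoup from the module to the class.
from BeautifulSoup import BeautifulSoup
soup = BeautifulSoup('<p>hello</p>')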
Example #25
Source File: clientform.py    From NoobSec-Toolkit with GNU General Public License v2.0
def close(self):
        sgmllib.SGMLParser.close(self)
        self.end_body()


# sigh, must support mechanize by allowing dynamic creation of classes based on
# its bundled copy of BeautifulSoup (which was necessary because of dependency
# problems) 
Example #26
Source File: GitHubCodeSearch.py    From SimplyEmail with GNU General Public License v3.0
def process(self):
        dl = Download.Download(verbose=self.verbose)
        # Get all the USER code Repos
        # https://github.com/search?p=2&q=enron.com+&ref=searchresults&type=Code&utf8=✓
        UrlList = []
        while self.Counter <= self.Depth:
            if self.verbose:
                p = ' [*] GitHub Code Search on page: ' + str(self.Counter)
                print helpers.color(p, firewall=True)
            try:
                url = "https://github.com/search?p=" + str(self.Counter) + "&q=" + \
                    str(self.domain) + "+&ref=searchresults&type=Code&utf8=✓"
                r = dl.requesturl(url, useragent=self.UserAgent, raw=True, timeout=10)
                if r.status_code != 200:
                    break
            except Exception as e:
                error = " [!] Major isself.Counter += 1sue with GitHub Search:" + \
                    str(e)
                print helpers.color(error, warning=True)
            RawHtml = r.content
            # Parse the results for our URLs
            soup = BeautifulSoup(RawHtml)
            for a in soup.findAll('a', href=True):
                a = a['href']
                if a.startswith('/'):
                    UrlList.append(a)
            self.Counter += 1
        # Now take all gathered URL's and gather the HTML content needed
        for url in UrlList:
            try:
                url = "https://github.com" + url
                html = dl.requesturl(url, useragent=self.UserAgent, timeout=10)
                self.Html += html
            except Exception as e:
                error = " [!] Connection Timed out on Github Search:" + str(e)
                print helpers.color(error, warning=True) 
Example #27
Source File: GitHubGistSearch.py    From SimplyEmail with GNU General Public License v3.0
def process(self):
        dl = Download.Download(verbose=self.verbose)
        # Get all the USER code Repos
        # https://github.com/search?p=2&q=enron.com+&ref=searchresults&type=Code&utf8=✓
        UrlList = []
        while self.Counter <= self.Depth:
            if self.verbose:
                p = ' [*] GitHub Gist Search on page: ' + \
                    str(self.Counter)
                print helpers.color(p, firewall=True)
            try:
                # search?p=2&q=%40enron.com&ref=searchresults&utf8=✓
                url = "https://gist.github.com/search?p=" + str(self.Counter) + "&q=%40" + \
                    str(self.domain) + "+&ref=searchresults&utf8=✓"
                r = dl.requesturl(url, useragent=self.UserAgent, raw=True, timeout=10)
                if r.status_code != 200:
                    break
            except Exception as e:
                error = " [!] Major issue with GitHubGist Search:" + \
                    str(e)
                print helpers.color(error, warning=True)
            RawHtml = r.content
            # Parse the results for our URLs
            soup = BeautifulSoup(RawHtml)
            for a in soup.findAll('a', href=True):
                a = a['href']
                if a.startswith('/'):
                    UrlList.append(a)
            self.Counter += 1
        # Now take all gathered URL's and gather the HTML content needed
        for url in UrlList:
            try:
                url = "https://gist.github.com" + url
                html = dl.requesturl(url, useragent=self.UserAgent, timeout=10)
                self.Html += html
            except Exception as e:
                error = " [!] Connection Timed out on GithubGist Search:" + \
                    str(e)
                print helpers.color(error, warning=True) 
Example #28
Source File: Connect6.py    From SimplyEmail with GNU General Public License v3.0
def Connect6Download(self, url):
        '''
        Downloads raw source of Connect6 page.
        '''
        NameList = []
        try:
            if url.startswith('http') or url.startswith('https'):
                r = requests.get(url, headers=self.UserAgent)
            else:
                url = 'http://' + str(url)
                if self.verbose:
                    p = " [*] Now downloading Connect6 Source: " + str(url)
                    print helpers.color(p, firewall=True)
                r = requests.get(url, headers=self.UserAgent)
        except Exception as e:
            error = " [!] Major issue with Downloading Connect6 source:" + \
                str(e)
            print helpers.color(error, warning=True)
        try:
            if r:
                rawhtml = r.content
                soup = BeautifulSoup(rawhtml)
                try:
                    for utag in soup.findAll("ul", {"class": "directoryList"}):
                        for litag in utag.findAll('li'):
                            NameList.append(litag.text)
                            if self.verbose:
                                p = " [*] Connect6 Name Found: " + \
                                    str(litag.text)
                                print helpers.color(p, firewall=True)
                except:
                    pass
                return NameList
            # for a in soup.findAll('a', href=True):
        except Exception as e:
            print e 
Example #29
Source File: Connect6.py    From SimplyEmail with GNU General Public License v3.0
def Connect6AutoUrl(self):
        # Using startpage to attempt to get the URL
        # https://www.google.com/search?q=site:connect6.com+domain.com
        try:
            # This returns a JSON object
            urllist = []
            domain = self.domain.split('.')
            url = "https://www.google.com/search?q=site:connect6.com+%22" + \
                domain[0] + '%22'
            r = requests.get(url, headers=self.UserAgent)
        except Exception as e:
            error = "[!] Major issue with Google Search: for Connect6 URL" + \
                str(e)
            print helpers.color(error, warning=True)
        try:
            rawhtml = r.content
            soup = BeautifulSoup(rawhtml)
            for a in soup.findAll('a', href=True):
                try:
                    l = urlparse.parse_qs(
                        urlparse.urlparse(a['href']).query)['q']
                    if 'site:connect6.com' not in l[0]:
                        l = l[0].split(":")
                        urllist.append(l[2])
                except:
                    pass
            if urllist:
                y = 0
                s = 0
                for x in urllist:
                    if "/c" in x:
                        urllist.insert(s, urllist.pop(y))
                        s += 1
                    y += 1
            return urllist
        except Exception as e:
            print e
            return urllist 
Example #30
Source File: media.py    From flask_reddit with MIT License
def get_top_img(url, timeout=4):
    """
    Nothing fancy here, we merely check if the page author
    set a designated image or if the url itself is an image.

    This method could be much better, but we are favoring ease
    of installation and simplicity over speed.
    """
    if not url:
        return None

    url = clean_url(url)

    # if the url is referencing an img itself, return it
    if url.split('.')[-1].lower() in img_extensions:
        return url
    try:
        html = requests.get(url, timeout=timeout).text
        soup = BeautifulSoup.BeautifulSoup(html)

        og_image = (soup.find('meta', property='og:image') or
                    soup.find('meta', attrs={'name': 'og:image'}))

        if og_image and og_image['content']:
            src_url = og_image['content']
            return make_abs(url, src_url)

        # <link rel="image_src" href="http://...">
        thumbnail_spec = soup.find('link', rel='image_src')
        if thumbnail_spec and thumbnail_spec['href']:
            src_url = thumbnail_spec['href']
            return make_abs(url, src_url)

    except Exception, e:
        print 'FAILED WHILE EXTRACTING THREAD IMG', str(e)
        return None