import nltk
from newspaper import Article
from nltk.tag.stanford import POSTagger

class Extractor(object):
    """Extracts possible place names from text.

    Attributes:
        text (str or unicode): The text to parse.
        url (str): The url to parse, if there is one.
        places (list): The list of possible place names found.
    """

    def __init__(self, text=None, url=None):
        """Inits the parser.

        Args:
            text (str or unicode): The text to parse. Unicode is accepted.
            url (str): Alternatively pass a url, which will be downloaded and
                stripped of HTML.

        Raises:
            ValueError: If neither text nor url is provided.
        """
        if not text and not url:
            raise ValueError('text or url is required')

        self.text = text
        self.url = url
        self.places = []
        # Eagerly fetch the article so self.text is populated before any
        # entity extraction is attempted.
        if self.url is not None:
            self.download_text()

    def download_text(self):
        """Downloads text from self.url and strips HTML tags."""
        if not self.text and self.url:
            a = Article(self.url)
            # newspaper's Article.text stays empty until the article is
            # explicitly downloaded and parsed.
            a.download()
            a.parse()
            self.text = a.text

    def named_entities(self):
        """Tokenizes, POS-tags and NE-chunks self.text.

        Returns:
            nltk.tree.Tree: Chunk tree mixing (word, tag) tuples and
                named-entity subtrees.
        """
        # word_tokenize should work well for most non-CJK languages
        tokens = nltk.word_tokenize(self.text)
        # TODO: this works only for english. Stanford's pos tagger supports
        # more languages
        # PT corpus
        tagged = nltk.pos_tag(tokens)
        return nltk.ne_chunk(tagged)

    def find_entities(self):
        """Parses text and appends GPE/PERSON/ORGANIZATION names to self.places."""
        nes = self.named_entities()
        for ne in nes:
            # Entity chunks are subtrees; untagged tokens are plain tuples.
            if isinstance(ne, nltk.tree.Tree):
                if ne.label() in ['GPE', 'PERSON', 'ORGANIZATION']:
                    # Rejoin multi-word entity leaves into a single name.
                    self.places.append(u' '.join([i[0] for i in ne.leaves()]))