Python nltk.corpus.names.words() Examples

The following are 16 code examples of nltk.corpus.names.words(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk.corpus.names, or try the search function.
Example #1
Source File: Anaphora.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License 6 votes vote down vote up
def __init__(self):
        """Train a Naive Bayes gender classifier on the NLTK names corpus.

        Names from ``male.txt`` and ``female.txt`` are labelled (males
        first, then females), shuffled, converted to feature sets with
        ``self.feature`` and used to train the classifier that is stored
        on ``self._classifier``.
        """
        labelled = []
        for fileid, label in (('male.txt', 'male'), ('female.txt', 'female')):
            labelled.extend((name, label) for name in names.words(fileid))
        random.shuffle(labelled)

        # One (featureset, label) pair per shuffled name.
        featuresets = []
        for name, gender in labelled:
            featuresets.append((self.feature(name), gender))
        self._classifier = nltk.NaiveBayesClassifier.train(featuresets)
Example #2
Source File: svm.py    From luscan-devel with GNU General Public License v2.0 5 votes vote down vote up
def demo():
    """Train an SVM gender classifier on the names corpus and report on it.

    The labelled names are shuffled with a fixed seed; the first 500
    feature sets form the test set and the rest the training set.  The
    function prints the training-set size, the classifier's internal
    feature index size and label mapping, one worked example, and the
    overall accuracy on the test set.  Nothing is returned.

    Every ``print`` call takes a single pre-formatted string so the output
    is the same whether ``print`` is the Python 2 statement or the
    Python 3 function.
    """
    import random

    from nltk.classify import accuracy
    from nltk.corpus import names

    def gender_features(word):
        """Describe a name by its last and second-to-last letters."""
        return {'last_letter': word[-1], 'penultimate_letter': word[-2]}

    # Kept under its own name so the imported ``names`` corpus reader is
    # not rebound to a list half-way through the function.
    labelled_names = ([(name, 'male') for name in names.words('male.txt')] +
                      [(name, 'female') for name in names.words('female.txt')])
    random.seed(60221023)
    random.shuffle(labelled_names)

    featuresets = [(gender_features(n), g) for (n, g) in labelled_names]
    train_set, test_set = featuresets[500:], featuresets[:500]

    print('--- nltk.classify.svm demo ---')
    print('Number of training examples: ' + str(len(train_set)))
    classifier = SvmClassifier.train(train_set)
    print('Total SVM dimensions: ' + str(len(classifier._svmfeatureindex)))
    print('Label mapping: ' + str(classifier._labelmapping))
    print('--- Processing an example instance ---')
    # labelled_names[0] is the name behind test_set[0] (same shuffled order).
    print('Reference instance: ' + str(labelled_names[0]))
    print('NLTK-format features:\n    ' + str(test_set[0]))
    svm_instance = map_instance_to_svm(test_set[0],
                                       classifier._labelmapping,
                                       classifier._svmfeatureindex)
    print('SVMlight-format features:\n    ' + str(svm_instance))
    distr = classifier.prob_classify(test_set[0][0])
    print('Instance classification and confidence: '
          + str(distr.max()) + ' ' + str(distr.prob(distr.max())))
    print('--- Measuring classifier performance ---')
    print('Overall accuracy: ' + str(accuracy(classifier, test_set)))
Example #3
Source File: expand_EXPN.py    From normalise with GNU General Public License v3.0 5 votes vote down vote up
def maximum_overlap(w, i, text):
    """Return the candidate expansion with the highest overlap.

    Candidates come from ``tag_matches(i, text)``.  A single candidate is
    returned as-is when it is a key of ``brown_common``, otherwise ``w`` is
    returned.  With several candidates, those tied for the highest
    ``overlap`` score are collected in ``current`` and handed to a second
    selection loop based on ``brown_common``.  ``w`` is returned when there
    are no candidates or ``current`` ends up empty.
    """
    best = 0
    current = []
    curr = ''
    t_matches = tag_matches(i, text)
    if t_matches:
        if len(t_matches) == 1:
            if t_matches[0] in brown_common:
                return t_matches[0]
            else:
                return w
        # Collect every candidate that ties for the best overlap score.
        for cand in t_matches:
            olap = overlap(i, cand, text)
            if olap > best and cand in words:
                best = olap
                current = [cand]
            # NOTE(review): ties are appended without the `cand in words`
            # check applied to the leading candidate - confirm intended.
            elif olap == best and best != 0:
                current.append(cand)
        # Restart the running best for the brown_common comparison below.
        best = 0
        for c in current:
            if c in brown_common:
                freq = brown_common[c]
            else:
                freq = 0
            # NOTE(review): best is 0 at this point, so this branch only
            # fires for a negative brown_common value - confirm whether a
            # different comparison or starting value was intended.
            if freq < best:
                best = freq
                curr = c
            # NOTE(review): the single-candidate case already returned
            # above, so this looks unreachable unless tag_matches gives a
            # different result on this second call - confirm.
            elif freq == best and len(tag_matches(i, text)) == 1:
                best = freq
                curr = c
            # NOTE(review): this return is inside the loop, so only the
            # first entry of `current` is examined and '' can be returned
            # instead of falling through to the `w` fallback - confirm the
            # indentation.
            return curr
    if curr == '':
        return w
    else:
        return curr 
Example #4
Source File: expand_EXPN.py    From normalise with GNU General Public License v3.0 5 votes vote down vote up
def overlap(i, word, text):
    """Score how well a candidate's signature matches the local context.

    Each word of ``gen_context(i, text)`` that also occurs in
    ``gen_signature(word)`` contributes its ``brown_common`` value, or
    ``log(1161192 / 1)`` when it is not a key of ``brown_common``.  The sum
    of those contributions is returned (``0`` when nothing matches).
    """
    signature = gen_signature(word)
    return sum(
        brown_common[token] if token in brown_common else log(1161192 / 1)
        for token in gen_context(i, text)
        if token in signature
    )
Example #5
Source File: expand_EXPN.py    From normalise with GNU General Public License v3.0 5 votes vote down vote up
def gen_signature(word):
    """Generate a signature for each candidate expansion, using contextual
       information from the Brown corpus, as well as WordNet definitions and
       examples (if applicable).

       The signature is the set of context words seen more than once around
       the positions returned by ``find_matches(word)`` (at most 50 of them
       are sampled), excluding English stopwords and ``','``.  When ``word``
       is in WordNet, the non-stopword tokens of its first synset's examples
       and definition are added.  Results are memoised in
       ``gen_signature.dict``, which is not created here and must already
       exist as an attribute of this function.
    """
    if word in gen_signature.dict:
        return gen_signature.dict[word]
    inds = find_matches(word)
    if len(inds) > 50:
        # Keep 50 positions spread across the full list of matches.
        f = len(inds) / 50
        inds = [inds[int(i * f)] for i in range(50)]
    signature = defaultdict(int)
    for i in inds:
        for w in gen_context(i, brown):
            signature[w] += 1
    # Load the stopword list once; it used to be re-read for every token.
    stops = set(stopwords.words('english'))
    sig = {w for w in signature
           if signature[w] > 1 and w not in stops and w != ','}
    if word in wn.words():
        synsets = wn.synsets(word)
        # The two-apostrophe check comes from the former eval()-based
        # lookup; it is kept so the same synsets are skipped as before.
        if synsets and str(synsets[0]).count("'") == 2:
            first = synsets[0]
            # Call the synset directly rather than eval()-ing a string that
            # was built from its repr.
            define = first.definition()
            examples = first.examples()
            for ex in examples:
                sig.update(w for w in wt(ex) if w not in stops)
            if define:
                sig.update(w for w in wt(define) if w not in stops)
    gen_signature.dict[word] = sig
    return sig
Example #6
Source File: expand_EXPN.py    From normalise with GNU General Public License v3.0 5 votes vote down vote up
def gen_context(i, text):
    """Return up to nine tokens of sentence context around index ``i``.

    The sentence is delimited by the nearest ``'.'``, ``'!'`` or ``'?'``
    tokens on either side.  The window is four tokens either side of ``i``
    when the sentence allows it; otherwise it is pushed against the start
    or end of the sentence, or is the whole sentence when that is shorter
    than nine tokens.  ``text`` itself is never modified.

    When ``i`` is not an ``int`` it is truncated with ``int(i)`` and the
    token at that position is replaced, in a copy of ``text``, by the
    pieces produced by ``split``.
    """
    terminators = ('.', '!', '?')
    tokens = text[:]
    ind = i
    if not isinstance(i, int):
        ind = int(i)
        split_token = tokens[ind]
        del tokens[ind]
        parts = split({ind: (split_token, 'SPLT')}, verbose=False)
        # Insert in descending key order so the pieces end up ascending.
        for key in sorted(parts, reverse=True):
            tokens.insert(ind, parts[key][0])

    # Walk outwards to the sentence boundaries.
    start = ind
    while start > 0 and tokens[start - 1] not in terminators:
        start -= 1
    end = ind + 1
    while end <= len(tokens) - 1 and tokens[end] not in terminators:
        end += 1

    before = ind - start
    after = end - ind
    span = end - start
    if before >= 4 and after >= 5:
        window = tokens[ind - 4: ind + 5]
    elif span < 9:
        window = tokens[start: end]
    elif before < 4:
        window = tokens[start: start + 9]
    else:
        window = tokens[end - 9: end]
    return list(window)
Example #7
Source File: expand_EXPN.py    From normalise with GNU General Public License v3.0 5 votes vote down vote up
def gen_candidates(word):
    """Generate candidate expansions for an abbreviation.

    Every entry of ``words`` is tested against three patterns built from
    the alphabetic characters of ``word`` and placed in the first bucket
    that matches:

    * the letters in order, each optionally followed by vowels, up to the
      end of the entry;
    * the leading letters, then anything, ending with the last letter as
      reported by ``find_last_letter`` (the last two when the last is
      ``'s'``);
    * the letters as a plain prefix.

    Returns the three buckets as a tuple in the order
    ``(vowel_cands, start_and_end_cands, start_cands)``.
    """
    letters = [ch for ch in word.lower() if ch.isalpha()]
    regex_cons = re.compile(''.join(ch + '[aeiou]*' for ch in letters) + '$')
    regex_start = re.compile(''.join(letters))

    last = find_last_letter(word)
    if last == 's':
        last = find_last_letter(word[:word.rfind(last)]) + last
    head = ''.join(
        ch for ch in word[:word.rfind(last)].lower() if ch.isalpha())
    regex_start_and_end = re.compile(head + '.*{}$'.format(last))

    vowel_cands, start_and_end_cands, start_cands = [], [], []
    for entry in words:
        if regex_cons.match(entry):
            bucket = vowel_cands
        elif regex_start_and_end.match(entry):
            bucket = start_and_end_cands
        elif regex_start.match(entry):
            bucket = start_cands
        else:
            continue
        bucket.append(entry)
    return vowel_cands, start_and_end_cands, start_cands
Example #8
Source File: hobbs.py    From hobbs with MIT License 5 votes vote down vote up
def gender_match(tree, pos, pro):
    """ Takes a proposed antecedent and pronoun and checks whether
    they match in gender. Only checks for mismatches between singular
    proper name antecedents and singular pronouns.

    ``tree[pos]`` is the proposed antecedent and ``pro`` the pronoun
    string.  Returns False as soon as a child with a label in
    ``nominal_labels`` starts with a recognized male name, female name or
    numeral that conflicts with the pronoun; returns True otherwise.
    """
    # Sets rather than generators: a generator is consumed by its first
    # `in` test, so every later child would be checked against an
    # exhausted (or partly exhausted) name list.
    male_names = {name.lower() for name in names.words('male.txt')}
    female_names = {name.lower() for name in names.words('female.txt')}
    male_pronouns = ["he", "him", "himself"]
    female_pronouns = ["she", "her", "herself"]
    neuter_pronouns = ["it", "itself"]

    for c in tree[pos]:
        if not (isinstance(c, nltk.Tree) and c.label() in nominal_labels):
            continue
        first_leaf = c.leaves()[0]
        lowered = first_leaf.lower()
        # If the proposed antecedent is a recognized male name,
        # but the pronoun being resolved is either female or
        # neuter, they don't match
        if lowered in male_names:
            if pro in female_pronouns or pro in neuter_pronouns:
                return False
        # If the proposed antecedent is a recognized female name,
        # but the pronoun being resolved is either male or
        # neuter, they don't match
        elif lowered in female_names:
            if pro in male_pronouns or pro in neuter_pronouns:
                return False
        # If the proposed antecedent is a numeral, but the
        # pronoun being resolved is not neuter, they don't match
        elif first_leaf.isdigit():
            if pro in male_pronouns or pro in female_pronouns:
                return False

    return True
Example #9
Source File: testingNLP.py    From python-urbanPlanning with MIT License 5 votes vote down vote up
def splitter(data, num_words):
    """Split *data* into chunks of *num_words* space-separated words.

    *data* is split on single spaces and the words are regrouped into
    strings of ``num_words`` words each; the final chunk holds whatever is
    left over.  When ``num_words`` never equals the running count (for
    example ``0``), all words come back as a single chunk.

    Returns the list of chunk strings.
    """
    output = []
    cur_words = []
    for word in data.split(' '):
        cur_words.append(word)
        if len(cur_words) == num_words:
            output.append(' '.join(cur_words))
            cur_words = []
    # Emit the remainder only when there is one; appending unconditionally
    # produced a trailing '' whenever the word count was an exact multiple
    # of num_words.
    if cur_words:
        output.append(' '.join(cur_words))
    return output
Example #10
Source File: testingNLP.py    From python-urbanPlanning with MIT License 5 votes vote down vote up
def splitter(data, num_words):
    """Group the space-separated words of *data* into chunks.

    Each chunk is a string of ``num_words`` words joined by single spaces;
    the last chunk carries any leftover words.  If the running count never
    reaches ``num_words`` (for example when it is ``0``), the whole input is
    returned as one chunk.

    Returns a list of strings.
    """
    words = data.split(' ')
    output = []
    cur_count = 0
    cur_words = []
    for word in words:
        cur_words.append(word)
        cur_count += 1
        if cur_count == num_words:
            output.append(' '.join(cur_words))
            cur_words = []
            cur_count = 0
    # Skip the final append when nothing is pending, so an input whose word
    # count is an exact multiple of num_words no longer ends with ''.
    if cur_words:
        output.append(' '.join(cur_words))
    return output
Example #11
Source File: util.py    From razzy-spinner with GNU General Public License v3.0 4 votes vote down vote up
def names_demo(trainer, features=names_demo_features):
    """Train and evaluate a gender classifier on the NLTK names corpus.

    ``trainer`` is called with a list of ``(featureset, label)`` pairs and
    must return a classifier; ``features`` maps a name to its featureset.
    The names are shuffled with a fixed seed, the first 5000 are used for
    training and the next 500 for testing.  Accuracy is printed, and, when
    the classifier supports ``prob_classify_many``, the average log
    likelihood and five sample distributions as well.

    Returns the trained classifier.
    """
    from nltk.corpus import names
    import random

    # Label every corpus name: males first, then females.
    labelled = [(name, 'male') for name in names.words('male.txt')]
    labelled += [(name, 'female') for name in names.words('female.txt')]

    # Fixed seed, so the train/test split is reproducible.
    random.seed(123456)
    random.shuffle(labelled)
    train, test = labelled[:5000], labelled[5000:5500]

    print('Training classifier...')
    classifier = trainer([(features(name), label) for name, label in train])

    print('Testing classifier...')
    gold = [(features(name), label) for name, label in test]
    acc = accuracy(classifier, gold)
    print('Accuracy: %6.4f' % acc)

    # Probability reporting is optional: classifiers without it raise
    # NotImplementedError, which is deliberately ignored.
    try:
        pdists = classifier.prob_classify_many(
            [features(name) for name, _ in test])
        ll = [pdist.logprob(label)
              for (_, label), pdist in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
        print()
        print('Unseen Names      P(Male)  P(Female)\n'+'-'*40)
        for (name, gender), pdist in list(zip(test, pdists))[:5]:
            # The asterisk marks the column of the gold label.
            fmt = ('  %-15s *%6.4f   %6.4f' if gender == 'male'
                   else '  %-15s  %6.4f  *%6.4f')
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    return classifier
Example #12
Source File: util.py    From razzy-spinner with GNU General Public License v3.0 4 votes vote down vote up
def partial_names_demo(trainer, features=names_demo_features):
    """Train and evaluate a positive/unlabeled gender classifier.

    ``trainer`` is called as ``trainer(positive, unlabeled)``, where
    ``positive`` holds featuresets of 2000 male names and ``unlabeled``
    holds featuresets of 500 further male names plus 500 female names.
    The classifier is then scored on 250 male (label ``True``) and 250
    female (label ``False``) names.  Accuracy is printed, and, when the
    classifier supports ``prob_classify_many``, the average log likelihood
    and five sample distributions as well.

    Returns the trained classifier.
    """
    from nltk.corpus import names
    import random

    male_names = names.words('male.txt')
    female_names = names.words('female.txt')

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples
    # for training.  A real list (not a lazy ``map`` object) so the trainer
    # may iterate over it more than once.
    positive = [features(name) for name in male_names[:2000]]

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = [features(name)
                 for name in male_names[2000:2500] + female_names[:500]]

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] \
        + [(name, False) for name in female_names[500:750]]

    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
        print()
        print('Unseen Names      P(Male)  P(Female)\n'+'-'*40)
        # zip() is an iterator on Python 3 and cannot be sliced directly;
        # the TypeError it raised was not caught by the handler below.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
Example #13
Source File: util.py    From luscan-devel with GNU General Public License v2.0 4 votes vote down vote up
def names_demo(trainer, features=names_demo_features):
    """Train and evaluate a gender classifier on the NLTK names corpus.

    ``trainer`` is called with a list of ``(featureset, label)`` pairs and
    must return a classifier; ``features`` maps a name to its featureset.
    The names are shuffled with a fixed seed, the first 5000 are used for
    training and the next 500 for testing.  Accuracy is printed, and, when
    the classifier supports ``batch_prob_classify``, the average log
    likelihood and five sample distributions as well.

    Every ``print`` call takes a single pre-formatted string, so the output
    is the same whether ``print`` is the Python 2 statement or the
    Python 3 function.

    Returns the trained classifier.
    """
    from nltk.corpus import names
    import random

    # Construct a list of classified names, using the names corpus.
    namelist = ([(name, 'male') for name in names.words('male.txt')] +
                [(name, 'female') for name in names.words('female.txt')])

    # Randomly split the names into a test & train set.
    random.seed(123456)
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer([(features(n), g) for (n, g) in train])

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, g) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
        # An empty string prints a blank line under both print forms.
        print('')
        print('Unseen Names      P(Male)  P(Female)\n'+'-'*40)
        # list() makes the pairs sliceable even where zip() is lazy.
        for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
            if gender == 'male':
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
Example #14
Source File: util.py    From luscan-devel with GNU General Public License v2.0 4 votes vote down vote up
def partial_names_demo(trainer, features=names_demo_features):
    """Train and evaluate a positive/unlabeled gender classifier.

    ``trainer`` is called as ``trainer(positive, unlabeled)``, where
    ``positive`` holds featuresets of 2000 male names and ``unlabeled``
    holds featuresets of 500 further male names plus 500 female names.
    The classifier is then scored on 250 male (label ``True``) and 250
    female (label ``False``) names.  Accuracy is printed, and, when the
    classifier supports ``batch_prob_classify``, the average log likelihood
    and five sample distributions as well.

    Every ``print`` call takes a single pre-formatted string, so the output
    is the same whether ``print`` is the Python 2 statement or the
    Python 3 function.

    Returns the trained classifier.
    """
    from nltk.corpus import names
    import random

    male_names = names.words('male.txt')
    female_names = names.words('female.txt')

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples
    # for training (a list comprehension yields a list on every Python
    # version, unlike ``map``).
    positive = [features(name) for name in male_names[:2000]]

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = [features(name)
                 for name in male_names[2000:2500] + female_names[:500]]

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] \
        + [(name, False) for name in female_names[500:750]]

    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
        # An empty string prints a blank line under both print forms.
        print('')
        print('Unseen Names      P(Male)  P(Female)\n'+'-'*40)
        # list() makes the pairs sliceable even where zip() is lazy.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
Example #15
Source File: util.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 4 votes vote down vote up
def names_demo(trainer, features=names_demo_features):
    """Run a names-corpus gender classification demo with ``trainer``.

    The corpus names are labelled ``'male'`` or ``'female'``, shuffled with
    a fixed seed, and split into 5000 training and 500 test examples.
    ``trainer`` receives the training ``(featureset, label)`` pairs built
    with ``features`` and returns the classifier, whose accuracy on the
    test examples is printed.  If the classifier implements
    ``prob_classify_many``, the average log likelihood and the first five
    test distributions are printed too.

    Returns the trained classifier.
    """
    from nltk.corpus import names
    import random

    def featurize(pairs):
        """Turn (name, label) pairs into (featureset, label) pairs."""
        return [(features(name), label) for (name, label) in pairs]

    # Males first, then females, each tagged with its gender.
    namelist = []
    for fileid, label in (('male.txt', 'male'), ('female.txt', 'female')):
        namelist.extend((name, label) for name in names.words(fileid))

    # Seeded shuffle followed by a fixed-size train/test split.
    random.seed(123456)
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    print('Training classifier...')
    classifier = trainer(featurize(train))

    print('Testing classifier...')
    acc = accuracy(classifier, featurize(test))
    print('Accuracy: %6.4f' % acc)

    # Classifiers without probability support raise NotImplementedError,
    # in which case the extra report is skipped.
    try:
        test_featuresets = [features(name) for (name, _) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = []
        for (_, gold), pdist in zip(test, pdists):
            ll.append(pdist.logprob(gold))
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names      P(Male)  P(Female)\n' + '-' * 40)
        samples = list(zip(test, pdists))[:5]
        for (name, gender), pdist in samples:
            # The asterisk marks the column of the gold label.
            if gender == 'male':
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    return classifier
Example #16
Source File: util.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 4 votes vote down vote up
def partial_names_demo(trainer, features=names_demo_features):
    """Train and evaluate a positive/unlabeled gender classifier.

    ``trainer`` is called as ``trainer(positive, unlabeled)``, where
    ``positive`` holds featuresets of 2000 male names and ``unlabeled``
    holds featuresets of 500 further male names plus 500 female names.
    The classifier is then scored on 250 male (label ``True``) and 250
    female (label ``False``) names.  Accuracy is printed, and, when the
    classifier supports ``prob_classify_many``, the average log likelihood
    and five sample distributions as well.

    Returns the trained classifier.
    """
    from nltk.corpus import names
    import random

    male_names = names.words('male.txt')
    female_names = names.words('female.txt')

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples
    # for training.  A real list (not a lazy ``map`` object) so the trainer
    # may iterate over it more than once.
    positive = [features(name) for name in male_names[:2000]]

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = [
        features(name) for name in male_names[2000:2500] + female_names[:500]
    ]

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] + [
        (name, False) for name in female_names[500:750]
    ]

    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names      P(Male)  P(Female)\n' + '-' * 40)
        # zip() is an iterator on Python 3 and cannot be sliced directly;
        # the TypeError it raised was not caught by the handler below.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier