Python nltk.corpus.names.words() Examples
The following are 16 code examples of nltk.corpus.names.words().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module nltk.corpus.names, or try the search function.
Example #1
Source File: Anaphora.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 6 votes |
def __init__(self):
    """Train the gender classifier stored in ``self._classifier``.

    Every entry of ``names.words('male.txt')`` is labelled ``'male'`` and
    every entry of ``names.words('female.txt')`` is labelled ``'female'``.
    The labelled names are shuffled, converted with ``self.feature`` and
    passed to ``nltk.NaiveBayesClassifier.train``.
    """
    labelled = [(name, 'male') for name in names.words('male.txt')]
    labelled += [(name, 'female') for name in names.words('female.txt')]
    random.shuffle(labelled)

    training = []
    for name, gender in labelled:
        training.append((self.feature(name), gender))

    self._classifier = nltk.NaiveBayesClassifier.train(training)
Example #2
Source File: svm.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def demo():
    """Train an ``SvmClassifier`` on the names corpus and print a report.

    The names from ``male.txt`` and ``female.txt`` are labelled, shuffled
    with a fixed seed, and turned into last-letter / penultimate-letter
    feature dictionaries. The first 500 feature sets form the test set and
    the remainder the training set. The function prints the classifier's
    internals, one worked example, and the overall accuracy; it returns
    nothing.

    The ``print`` calls are written so that they produce the same text on
    Python 2 (where the original used print statements) and on Python 3.
    """
    import random

    from nltk.classify import accuracy
    from nltk.corpus import names

    def gender_features(word):
        """Describe a name by its last and second-to-last characters."""
        return {'last_letter': word[-1], 'penultimate_letter': word[-2]}

    # Keep the labelled list under its own name instead of rebinding the
    # imported ``names`` corpus reader.
    labelled_names = ([(name, 'male') for name in names.words('male.txt')] +
                      [(name, 'female') for name in names.words('female.txt')])
    random.seed(60221023)
    random.shuffle(labelled_names)

    featuresets = [(gender_features(n), g) for (n, g) in labelled_names]
    train_set, test_set = featuresets[500:], featuresets[:500]

    print('--- nltk.classify.svm demo ---')
    print('Number of training examples: %s' % (len(train_set),))
    classifier = SvmClassifier.train(train_set)
    print('Total SVM dimensions: %s' % (len(classifier._svmfeatureindex),))
    print('Label mapping: %s' % (classifier._labelmapping,))

    print('--- Processing an example instance ---')
    # Values are wrapped in 1-tuples so that tuples and dicts are formatted
    # as a single ``%s`` argument.
    print('Reference instance: %s' % (labelled_names[0],))
    print('NLTK-format features:\n ' + str(test_set[0]))
    print('SVMlight-format features:\n ' + str(map_instance_to_svm(
        test_set[0], classifier._labelmapping, classifier._svmfeatureindex)))
    distr = classifier.prob_classify(test_set[0][0])
    print('Instance classification and confidence: %s %s'
          % (distr.max(), distr.prob(distr.max())))

    print('--- Measuring classifier performance ---')
    print('Overall accuracy: %s' % (accuracy(classifier, test_set),))
Example #3
Source File: expand_EXPN.py From normalise with GNU General Public License v3.0 | 5 votes |
def maximum_overlap(w, i, text):
    """Return the candidate expansion with the highest overlap.

    Candidates come from ``tag_matches(i, text)``. When there are none, the
    original token ``w`` is returned. When there is exactly one, it is
    returned if it is a key of ``brown_common``, otherwise ``w`` is returned.
    With several candidates, each is scored by ``overlap(i, cand, text)`` and
    the top-scoring ones are then compared by their ``brown_common`` value.

    NOTE(review): this snippet was recovered from a single collapsed line,
    so the nesting below (in particular where ``return curr`` sits) is a
    best-effort reconstruction -- confirm against the upstream file.
    """
    best = 0
    current = []
    curr = ''
    t_matches = tag_matches(i, text)
    if t_matches:
        if len(t_matches) == 1:
            # A single candidate is accepted only if brown_common knows it.
            if t_matches[0] in brown_common:
                return t_matches[0]
            else:
                return w
        # Collect the candidate(s) with the strictly highest overlap score;
        # ties with a non-zero best score are kept alongside it.
        for cand in t_matches:
            olap = overlap(i, cand, text)
            if olap > best and cand in words:
                best = olap
                current = [cand]
            elif olap == best and best != 0:
                current.append(cand)
        # Break ties between the remaining candidates using brown_common.
        # NOTE(review): ``best`` restarts at 0 and is only replaced by a
        # smaller ``freq``, and the ``elif`` requires exactly one tag match
        # (a case that already returned above if tag_matches is
        # deterministic). Unless brown_common holds negative values, ``curr``
        # appears to stay '' here -- confirm the intended comparison.
        best = 0
        for c in current:
            if c in brown_common:
                freq = brown_common[c]
            else:
                freq = 0
            if freq < best:
                best = freq
                curr = c
            elif freq == best and len(tag_matches(i, text)) == 1:
                best = freq
                curr = c
        return curr
    # No tag matches: ``curr`` has not been reassigned on this path.
    if curr == '':
        return w
    else:
        return curr
Example #4
Source File: expand_EXPN.py From normalise with GNU General Public License v3.0 | 5 votes |
def overlap(i, word, text):
    """Score how well the context around position ``i`` supports ``word``.

    Each token returned by ``gen_context(i, text)`` that also appears in
    ``gen_signature(word)`` contributes to the score: its ``brown_common``
    value when it has one, otherwise ``log(1161192 / 1)``.

    Returns the accumulated score (``0`` when no context token is in the
    signature).
    """
    signature = gen_signature(word)
    total = 0
    for token in gen_context(i, text):
        if token not in signature:
            continue
        if token in brown_common:
            total += brown_common[token]
        else:
            total += log(1161192 / 1)
    return total
Example #5
Source File: expand_EXPN.py From normalise with GNU General Public License v3.0 | 5 votes |
def gen_signature(word):
    """Generate a signature (a set of related tokens) for a candidate word.

    The signature is built from:

    * tokens that occur more than once in the ``gen_context`` windows around
      the positions returned by ``find_matches(word)`` (at most 50 evenly
      spaced positions are used), excluding English stopwords and ``','``;
    * the tokenised definition and examples of the word's first WordNet
      synset, excluding English stopwords, when the word is in WordNet.

    Results are memoised in ``gen_signature.dict``, a function attribute
    that must be initialised outside this function (not visible here).

    Returns the signature as a ``set``.
    """
    if word in gen_signature.dict:
        return gen_signature.dict[word]

    inds = find_matches(word)
    if len(inds) > 50:
        # Down-sample to 50 evenly spaced match positions.
        f = len(inds) / 50
        inds = [inds[int(i * f)] for i in range(50)]

    signature = defaultdict(int)
    for i in inds:
        for w in gen_context(i, brown):
            signature[w] += 1

    # Load the stopword list once; the original re-read it for every token.
    stop_words = set(stopwords.words('english'))

    sig = {w for w in signature
           if signature[w] > 1 and w not in stop_words and w != ','}

    if word in wn.words():
        synsets = wn.synsets(word)
        # The two-apostrophe check comes from the original, which built
        # "wn.synset('<name>').definition()" as a string and eval()'d it.
        # It is kept so the same synsets are skipped as before, but the
        # methods are now called directly on the synset object, so no
        # generated code is executed.
        if synsets and str(synsets[0]).count("'") == 2:
            first = synsets[0]
            define = first.definition()
            examples = first.examples()
            if examples:
                for ex in examples:
                    sig.update([w for w in wt(ex) if w not in stop_words])
            if define:
                sig.update([w for w in wt(define) if w not in stop_words])

    gen_signature.dict[word] = sig
    return sig
Example #6
Source File: expand_EXPN.py From normalise with GNU General Public License v3.0 | 5 votes |
def gen_context(i, text):
    """Return the tokens surrounding position ``i`` within its sentence.

    The sentence is delimited by the nearest ``'.'``, ``'!'`` or ``'?'``
    tokens on either side. The result is a window of up to nine tokens:
    four on each side of the token when the sentence allows it, otherwise
    the first or last nine tokens of the sentence, or the whole sentence
    when it is shorter than nine tokens.

    When ``i`` is not an ``int`` it is truncated with ``int(i)``, and the
    token at that position is replaced in a copy of ``text`` by the pieces
    returned from ``split``. The caller's ``text`` is never modified.
    """
    sentence_enders = ['.', '!', '?']
    tokens = text[:]
    centre = i
    if not isinstance(i, int):
        centre = int(i)
        original_token = tokens[centre]
        del tokens[centre]
        pieces = split({centre: (original_token, 'SPLT')}, verbose=False)
        # Insert in descending key order at the same index so the pieces
        # end up in ascending key order.
        for key in sorted(pieces, reverse=True):
            tokens.insert(centre, pieces[key][0])

    # Walk left to the first token of the sentence.
    left = centre
    while left > 0 and tokens[left - 1] not in sentence_enders:
        left -= 1

    # Walk right to the sentence-ending token (or the end of the text).
    right = centre + 1
    while right <= len(tokens) - 1 and tokens[right] not in sentence_enders:
        right += 1

    whole_sentence = slice(left, right)
    long_enough = right - left >= 9
    if centre - left < 4:
        window = slice(left, left + 9) if long_enough else whole_sentence
    elif right - centre < 5:
        window = slice(right - 9, right) if long_enough else whole_sentence
    else:
        window = slice(centre - 4, centre + 5)

    context = []
    context.extend(tokens[window])
    return context
Example #7
Source File: expand_EXPN.py From normalise with GNU General Public License v3.0 | 5 votes |
def gen_candidates(word):
    """Generate lists of candidate expansions for an abbreviation.

    Three regular expressions are built from the alphabetic characters of
    the lower-cased ``word`` and matched (anchored at the start) against
    every entry of ``words``:

    * the letters with any run of vowels allowed after each one, anchored
      at the end;
    * the letters before the last letter, then anything, then the last
      letter (as given by ``find_last_letter``; when that is ``'s'`` the
      preceding last letter is included as well), anchored at the end;
    * the letters as a plain prefix.

    Each entry is placed in the list for the first pattern it matches.

    Returns a tuple ``(vowel_cands, start_and_end_cands, start_cands)``.

    NOTE(review): reconstructed from a collapsed line; the ``'[aeiou]*'``
    suffix is assumed to be added per alphabetic letter -- confirm upstream.
    """
    letters = [ch for ch in word.lower() if ch.isalpha()]

    regex_cons = re.compile(''.join(ch + '[aeiou]*' for ch in letters) + '$')
    regex_start = re.compile(''.join(letters))

    last = find_last_letter(word)
    if last == 's':
        last = find_last_letter(word[:word.rfind(last)]) + last
    head = ''.join(
        ch for ch in word[:word.rfind(last)].lower() if ch.isalpha()
    )
    regex_start_and_end = re.compile(head + '.*{}$'.format(last))

    vowel_cands, start_and_end_cands, start_cands = [], [], []
    for candidate in words:
        if regex_cons.match(candidate):
            vowel_cands.append(candidate)
            continue
        if regex_start_and_end.match(candidate):
            start_and_end_cands.append(candidate)
            continue
        if regex_start.match(candidate):
            start_cands.append(candidate)
    return vowel_cands, start_and_end_cands, start_cands
Example #8
Source File: hobbs.py From hobbs with MIT License | 5 votes |
def gender_match(tree, pos, pro):
    """Check whether a proposed antecedent and a pronoun agree in gender.

    Only mismatches are detected: a child of ``tree[pos]`` whose first leaf
    is a known male name, a known female name, or a string of digits rules
    out pronouns of the other categories.

    Args:
        tree: the parse tree being searched.
        pos: position in ``tree`` of the proposed antecedent node.
        pro: the pronoun being resolved; it is compared against lower-case
            pronoun lists, so it is presumably already lower-cased by the
            caller -- verify.

    Returns:
        ``False`` if a gender mismatch is found, ``True`` otherwise.
    """
    # Sets, not generator expressions: ``in`` on a generator consumes it, so
    # with generators only the first lookup saw the full name list and every
    # later child was compared against an exhausted stream.
    male_names = {name.lower() for name in names.words('male.txt')}
    female_names = {name.lower() for name in names.words('female.txt')}
    male_pronouns = ["he", "him", "himself"]
    female_pronouns = ["she", "her", "herself"]
    neuter_pronouns = ["it", "itself"]

    for c in tree[pos]:
        if isinstance(c, nltk.Tree) and c.label() in nominal_labels:
            first_leaf = c.leaves()[0]
            # If the proposed antecedent is a recognized male name,
            # but the pronoun being resolved is either female or
            # neuter, they don't match
            if first_leaf.lower() in male_names:
                if pro in female_pronouns:
                    return False
                elif pro in neuter_pronouns:
                    return False
            # If the proposed antecedent is a recognized female name,
            # but the pronoun being resolved is either male or
            # neuter, they don't match
            elif first_leaf.lower() in female_names:
                if pro in male_pronouns:
                    return False
                elif pro in neuter_pronouns:
                    return False
            # If the proposed antecedent is a numeral, but the
            # pronoun being resolved is not neuter, they don't match
            elif first_leaf.isdigit():
                if pro in male_pronouns:
                    return False
                elif pro in female_pronouns:
                    return False
    return True
Example #9
Source File: testingNLP.py From python-urbanPlanning with MIT License | 5 votes |
def splitter(data, num_words):
    """Split ``data`` into space-joined chunks of ``num_words`` words.

    ``data`` is split on single spaces. Every full group of ``num_words``
    words becomes one chunk; whatever remains afterwards is always appended
    as a final chunk, which is the empty string when the words divide
    evenly.
    """
    chunks = []
    pending = []
    for token in data.split(' '):
        pending.append(token)
        if len(pending) == num_words:
            chunks.append(' '.join(pending))
            pending = []
    # The remainder is appended unconditionally, even when it is empty.
    chunks.append(' '.join(pending))
    return chunks
Example #10
Source File: testingNLP.py From python-urbanPlanning with MIT License | 5 votes |
def splitter(data, num_words):
    """Break ``data`` into pieces of ``num_words`` space-separated words.

    The text is split on single spaces and regrouped ``num_words`` words at
    a time. The words left over after the last full group are always added
    as a final piece, so the result ends with ``''`` when nothing is left
    over.
    """
    tokens = data.split(' ')
    pieces = []
    piece_start = 0
    for stop in range(1, len(tokens) + 1):
        # ``stop - piece_start`` is the number of words gathered so far.
        if stop - piece_start == num_words:
            pieces.append(' '.join(tokens[piece_start:stop]))
            piece_start = stop
    pieces.append(' '.join(tokens[piece_start:]))
    return pieces
Example #11
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 4 votes |
def names_demo(trainer, features=names_demo_features):
    """Train and evaluate a gender classifier on the NLTK names corpus.

    Args:
        trainer: callable that takes a list of ``(featureset, label)``
            pairs and returns a classifier.
        features: callable that maps a name to its featureset.

    The labelled names are shuffled with a fixed seed; the first 5000 are
    used for training and the next 500 for testing. Accuracy is printed,
    and, for classifiers that support probability estimates, so are the
    average log likelihood and the distributions for five test names.

    Returns:
        The trained classifier.
    """
    import random
    from nltk.corpus import names

    # Label each name with the gender of the corpus file it comes from.
    male = [(name, 'male') for name in names.words('male.txt')]
    female = [(name, 'female') for name in names.words('female.txt')]
    namelist = male + female

    # Fixed seed so the train/test split is the same on every run.
    random.seed(123456)
    random.shuffle(namelist)
    train, test = namelist[:5000], namelist[5000:5500]

    print('Training classifier...')
    classifier = trainer([(features(name), label) for name, label in train])

    print('Testing classifier...')
    labelled_test = [(features(name), label) for name, label in test]
    acc = accuracy(classifier, labelled_test)
    print('Accuracy: %6.4f' % acc)

    # Probability reporting is optional: a NotImplementedError raised while
    # computing it is silently ignored.
    try:
        test_featuresets = [features(name) for name, _ in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for (_, gold), pdist in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
        print()
        print('Unseen Names P(Male) P(Female)\n'+'-'*40)
        for (name, gender), pdist in list(zip(test, pdists))[:5]:
            # The asterisk marks the column of the gold label.
            fmt = (' %-15s *%6.4f %6.4f' if gender == 'male'
                   else ' %-15s %6.4f *%6.4f')
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    return classifier
Example #12
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 4 votes |
def partial_names_demo(trainer, features=names_demo_features):
    """Train and evaluate a positive/unlabeled "is male" name classifier.

    Args:
        trainer: callable taking ``(positive, unlabeled)`` featureset
            collections and returning a classifier.
        features: callable that maps a name to its featureset.

    After shuffling with a fixed seed, 2000 male names are used as
    positive examples, 500 male plus 500 female names as unlabeled
    examples, and 250 further names of each gender (labelled ``True`` /
    ``False``) as the test set. Accuracy is printed, and, for classifiers
    that support probability estimates, so are the average log likelihood
    and the distributions for five test names.

    Returns:
        The trained classifier.
    """
    from nltk.corpus import names
    import random

    male_names = names.words('male.txt')
    female_names = names.words('female.txt')

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples
    # for training. Real lists are built (rather than lazy ``map`` objects)
    # so the trainer can iterate over or measure them more than once.
    positive = [features(name) for name in male_names[:2000]]

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = [features(name)
                 for name in male_names[2000:2500] + female_names[:500]]

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] \
        + [(name, False) for name in female_names[500:750]]

    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
        print()
        print('Unseen Names P(Male) P(Female)\n'+'-'*40)
        # zip() returns an iterator on Python 3 and cannot be sliced
        # directly; materialise it first.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            # The asterisk marks the column of the gold label.
            if is_male:
                fmt = ' %-15s *%6.4f %6.4f'
            else:
                fmt = ' %-15s %6.4f *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
Example #13
Source File: util.py From luscan-devel with GNU General Public License v2.0 | 4 votes |
def names_demo(trainer, features=names_demo_features):
    """Train and evaluate a gender classifier on the NLTK names corpus.

    Args:
        trainer: callable that takes a list of ``(featureset, label)``
            pairs and returns a classifier.
        features: callable that maps a name to its featureset.

    The labelled names are shuffled with a fixed seed; the first 5000 are
    used for training and the next 500 for testing. Accuracy is printed,
    and, for classifiers that support ``batch_prob_classify``, so are the
    average log likelihood and the distributions for five test names.

    The output statements are written as single-argument ``print(...)``
    calls, which produce the same text on Python 2 (where the original used
    print statements) and on Python 3.

    Returns:
        The trained classifier.
    """
    from nltk.corpus import names
    import random

    # Construct a list of classified names, using the names corpus.
    namelist = ([(name, 'male') for name in names.words('male.txt')] +
                [(name, 'female') for name in names.words('female.txt')])

    # Randomly split the names into a test & train set.
    random.seed(123456)
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer([(features(n), g) for (n, g) in train])

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, g) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
        # Blank separator line (a bare ``print`` statement in the original).
        print('')
        print('Unseen Names P(Male) P(Female)\n'+'-'*40)
        # list(...) keeps the slice valid where zip() returns an iterator.
        for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
            # The asterisk marks the column of the gold label.
            if gender == 'male':
                fmt = ' %-15s *%6.4f %6.4f'
            else:
                fmt = ' %-15s %6.4f *%6.4f'
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
Example #14
Source File: util.py From luscan-devel with GNU General Public License v2.0 | 4 votes |
def partial_names_demo(trainer, features=names_demo_features):
    """Train and evaluate a positive/unlabeled "is male" name classifier.

    Args:
        trainer: callable taking ``(positive, unlabeled)`` featureset
            lists and returning a classifier.
        features: callable that maps a name to its featureset.

    After shuffling with a fixed seed, 2000 male names are used as
    positive examples, 500 male plus 500 female names as unlabeled
    examples, and 250 further names of each gender (labelled ``True`` /
    ``False``) as the test set. Accuracy is printed, and, for classifiers
    that support ``batch_prob_classify``, so are the average log likelihood
    and the distributions for five test names.

    The output statements are written as single-argument ``print(...)``
    calls, which produce the same text on Python 2 (where the original used
    print statements) and on Python 3.

    Returns:
        The trained classifier.
    """
    from nltk.corpus import names
    import random

    male_names = names.words('male.txt')
    female_names = names.words('female.txt')

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples
    # for training. List comprehensions give a real list on every Python
    # version, which is what ``map`` returned on Python 2.
    positive = [features(name) for name in male_names[:2000]]

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = [features(name)
                 for name in male_names[2000:2500] + female_names[:500]]

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] \
        + [(name, False) for name in female_names[500:750]]

    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
        # Blank separator line (a bare ``print`` statement in the original).
        print('')
        print('Unseen Names P(Male) P(Female)\n'+'-'*40)
        # list(...) keeps the slice valid where zip() returns an iterator.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            # The asterisk marks the column of the gold label.
            if is_male:
                fmt = ' %-15s *%6.4f %6.4f'
            else:
                fmt = ' %-15s %6.4f *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
Example #15
Source File: util.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 4 votes |
def names_demo(trainer, features=names_demo_features):
    """Run a gender-classification demo on the NLTK names corpus.

    Args:
        trainer: callable that takes a list of ``(featureset, label)``
            pairs and returns a classifier.
        features: callable that maps a name to its featureset.

    Names are labelled by source file, shuffled with a fixed seed and
    split into 5000 training and 500 test items. The function prints the
    test accuracy and, when the classifier can produce probability
    distributions, the average log likelihood and a five-row sample table.

    Returns:
        The trained classifier.
    """
    import random
    from nltk.corpus import names

    # Build the labelled name list, male names first.
    namelist = [(name, 'male') for name in names.words('male.txt')]
    namelist = namelist + [
        (name, 'female') for name in names.words('female.txt')
    ]

    # Deterministic shuffle, then carve out the two subsets.
    random.seed(123456)
    random.shuffle(namelist)
    train, test = namelist[:5000], namelist[5000:5500]

    print('Training classifier...')
    classifier = trainer([(features(name), label) for name, label in train])

    print('Testing classifier...')
    labelled_test = [(features(name), label) for name, label in test]
    acc = accuracy(classifier, labelled_test)
    print('Accuracy: %6.4f' % acc)

    # Probability reporting is optional: a NotImplementedError raised while
    # computing it is silently ignored.
    try:
        test_featuresets = [features(name) for name, _ in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for (_, gold), pdist in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names P(Male) P(Female)\n' + '-' * 40)
        for (name, gender), pdist in list(zip(test, pdists))[:5]:
            # The asterisk marks the column of the gold label.
            fmt = (' %-15s *%6.4f %6.4f' if gender == 'male'
                   else ' %-15s %6.4f *%6.4f')
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    return classifier
Example #16
Source File: util.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 4 votes |
def partial_names_demo(trainer, features=names_demo_features):
    """Train and evaluate a positive/unlabeled "is male" name classifier.

    Args:
        trainer: callable taking ``(positive, unlabeled)`` featureset
            collections and returning a classifier.
        features: callable that maps a name to its featureset.

    After shuffling with a fixed seed, 2000 male names are used as
    positive examples, 500 male plus 500 female names as unlabeled
    examples, and 250 further names of each gender (labelled ``True`` /
    ``False``) as the test set. Accuracy is printed, and, for classifiers
    that support probability estimates, so are the average log likelihood
    and the distributions for five test names.

    Returns:
        The trained classifier.
    """
    from nltk.corpus import names
    import random

    male_names = names.words('male.txt')
    female_names = names.words('female.txt')

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples
    # for training. Real lists are built (rather than lazy ``map`` objects)
    # so the trainer can iterate over or measure them more than once.
    positive = [features(name) for name in male_names[:2000]]

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = [
        features(name) for name in male_names[2000:2500] + female_names[:500]
    ]

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] + [
        (name, False) for name in female_names[500:750]
    ]

    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names P(Male) P(Female)\n' + '-' * 40)
        # zip() returns an iterator on Python 3 and cannot be sliced
        # directly; materialise it first.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            # The asterisk marks the column of the gold label.
            if is_male:
                fmt = ' %-15s *%6.4f %6.4f'
            else:
                fmt = ' %-15s %6.4f *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier