import nltk
import numpy as np
import matplotlib.pyplot as plt

def preprocess_data(text):
    '''Split raw text into sentences and word-tokenize each sentence.'''
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    sentences = nltk.sent_tokenize(text)
    tokenized = [tokenizer.tokenize(s) for s in sentences]
    return sentences, tokenized

# import the data
guns_laws = '../../Data/Chapter09/ST_gunLaws.txt'

with open(guns_laws, 'r') as f:
    article = f.read()

# chunk into sentences and tokenize
sentences, tokenized = preprocess_data(article)

# part-of-speech tagging
tagged_sentences = [nltk.pos_tag(w) for w in tokenized]

# extract named entities -- regular expressions approach;
# the grammar matches an optional determiner followed by
# one or more proper nouns (singular or plural)
tagged = []

pattern = '''
    ENT: {<DT>?(<NNP|NNPS>)+}
'''

chunker = nltk.RegexpParser(pattern)

for sent in tagged_sentences:
    tagged.append(chunker.parse(sent))

# keep named entities together; lemmatize everything else
words = []
lemmatizer = nltk.WordNetLemmatizer()

for sentence in tagged:
    for pos in sentence:
        if isinstance(pos, nltk.tree.Tree):
            # a chunk: rejoin the entity's tokens into one string
            words.append(' '.join([w[0] for w in pos]))
        else:
            # an ordinary (word, tag) pair: keep the lemma
            words.append(lemmatizer.lemmatize(pos[0]))

# remove stopwords
stopwords = nltk.corpus.stopwords.words('english')
words = [w for w in words if w.lower() not in stopwords]

# and calculate frequencies
freq = nltk.FreqDist(words)

# sort descending on frequency
freq_sorted = sorted(freq.items(), key=lambda x: x[1], reverse=True)

# print the words that occur more than once
top_words = [w for w in freq_sorted if w[1] > 1]
print(top_words, len(top_words))

# plot the 10 top words
top_words_transposed = list(zip(*top_words))
y_pos = np.arange(len(top_words_transposed[0][:10]))[::-1]

plt.barh(y_pos, top_words_transposed[1][:10], align='center', alpha=0.5)
plt.yticks(y_pos, top_words_transposed[0][:10])
plt.xlabel('Frequency')
plt.ylabel('Top words')
plt.savefig('../../Data/Chapter09/charts/word_frequency.png', dpi=300)
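
# Setup note: sent_tokenize, pos_tag, WordNetLemmatizer, and the stopword
# list above all depend on NLTK data packages that ship separately from
# the library itself. A minimal one-time setup sketch, to run once before
# executing this script (the strings are the standard NLTK resource
# identifiers; adjust if your NLTK version names them differently):
#
#   import nltk
#   for resource in ['punkt',                       # sentence tokenizer models
#                    'averaged_perceptron_tagger',  # POS tagger model
#                    'wordnet',                     # lemmatizer dictionaries
#                    'stopwords']:                  # stopword corpus
#       nltk.download(resource, quiet=True)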