python source code of data_cleaning

Project: Hands-On-Ensemble-Learning-with-Python (GitHub Link)

Hands-On-Ensemble-Learning-with-Python-master
- Chapter08
  - kmeans_raw.py
  - oe_vote_tsne.py
  - oe_co_occurence.py
  - kmeans_cluster.py
  - oe_vote.py
  - kmeans_intro.py
  - oe_graph_closure.py
  - agglomerative.py
  - voting_example.py
- Chapter13
  - ensemble_cluster_normalized.py
  - ensemble_cluster.py
  - exploratory.py
  - insights.py
  - ensemble_cluster_tsne.py
  - clustering.py
- LICENSE
- Chapter01
  - algos_demo.py
  - datasets_demo.py
- Chapter12
  - ensemble_fc_models.py
  - exploratory.py
  - single_dense_model.py
  - single_dot_model.py
- Chapter10
  - voting_regressor.py
  - stacking_regressor.py
  - stacking.py
  - simulator.py
  - voting.py
  - regression.py
  - exploratory.py
  - boosting.py
  - bagging.py
  - simulator_plain.py
  - random_forest.py
- README.md
- Chapter02
  - motivation.py
  - bias_variance.py
- Chapter05
  - bagging_sklearn_classification.py
  - bootstrapping.py
  - bagging_custom.py
  - validation_curves.py
  - bagging_custom_parallel.py
  - bagging_sklearn_regression.py
- Chapter03
  - custom_voting_implementation.py
  - scikit_soft_voting_2knn.py
  - scikit_hard_voting.py
  - scikit_soft_voting_analysis.py
  - custom_voting_implementation_analysis.py
  - scikit_soft_voting.py
- Chapter07
  - rf_classification.py
  - extra_tree_regression.py
  - extra_tree_classification_validation_curves.py
  - rf_classification_validation_curves.py
  - extra_tree_classification.py
  - rf_regression.py
  - probability_to_choose.py
- Chapter11
  - comparisons.py
  - exploratory.py
  - stream_sentiment.py
  - data_cleaning.py
  - base_learners_twitter.py
- Chapter04
  - stacking_regression.py
  - stacking_classification.py
  - linear_nonlinear_example.py
  - stacking_classification_analysis.py
  - stacking_regression_analysis.py
  - stacking_classifiers.py
- Chapter06
  - boosting_overfit.py
  - adaboost_custom.py
  - gradient_boosting_custom.py
  - adaboost_sklearn_classification.py
  - xgb_regression.py
  - xgb_classification.py
  - adaboost_sklearn_regression.py
  - dataset_segmentation.py
  - gradient_boosting_sklearn_regression.py
  - gradient_boosting_sklearn_classification.py
- Chapter09
  - stacking.py
  - xgboosting.py
  - voting.py
  - adaboost.py
  - exploratory.py
  - bagging.py
  - stacking_classifier.py
  - dt_optimize.py
  - unrelated_presentation_phd.py
  - logistic_regression.py
  - random_forest.py
  - base.py

import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from string import punctuation

# Read the data and assign labels. The column names are passed explicitly
# via `names`, so the first row of sent140.csv is treated as data.
# NOTE(review): read_csv uses its default encoding here - confirm that
# sent140.csv is stored as UTF-8.
labels = ['polarity', 'id', 'date', 'query', 'user', 'text']
data = pd.read_csv("sent140.csv", names=labels)

# Keep only text and polarity, change polarity to 0-1.
# The replaced column is assigned back explicitly: calling
# replace(..., inplace=True) on `data.polarity` mutates an intermediate
# Series, which pandas warns about and which no longer updates `data`
# when Copy-on-Write is enabled.
data = data[['text', 'polarity']]
data['polarity'] = data['polarity'].replace(4, 1)

# Build the stop-word collection: the NLTK English list plus a variant of
# every apostrophe-containing entry with the apostrophe removed.
# A set is used because the collection is only consulted for membership
# tests, once per token of every tweet.
stops = set(stopwords.words("english"))
stops.update([word.replace("'", "") for word in stops if "'" in word])


# The patterns are compiled once at import time because clean_string is
# applied to every row of the data set.
_HTML_ENTITY_RE = re.compile(r'\&\w*;')
_MENTION_RE = re.compile(r'@(\w+)')
# Consume everything up to the next whitespace so that URLs containing
# characters such as '-', '_', '?' or '=' are removed whole instead of
# leaving their tail behind as spurious words.
_LINK_RE = re.compile(r'(?:https?|ftp)://\S+', re.IGNORECASE)
_HASHTAG_RE = re.compile(r'#(\w+)')
_REPEATED_CHAR_RE = re.compile(r'(.)\1+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')
_SHORT_WORD_RE = re.compile(r'\b\w{1,2}\b')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_string(string):
    """Return a lowercase, letters-only version of a raw tweet.

    The steps are applied in this order:

    1. drop HTML entities (``&amp;``), ``@user`` mentions and links;
    2. lowercase the text and drop ``#hashtags``;
    3. shorten any run of a repeated character to two occurrences;
    4. replace every non-letter character with a space;
    5. drop words of one or two letters;
    6. collapse whitespace runs to a single space and trim both ends.

    Args:
        string: The raw text to clean. Must be a ``str``.

    Returns:
        The cleaned text, with words separated by single spaces and no
        leading or trailing whitespace. May be the empty string.
    """
    tmp = _HTML_ENTITY_RE.sub('', string)
    tmp = _MENTION_RE.sub('', tmp)
    tmp = _LINK_RE.sub('', tmp)
    tmp = tmp.lower()
    tmp = _HASHTAG_RE.sub('', tmp)
    # "loooove" -> "loove": runs of two or more are cut down to exactly two
    tmp = _REPEATED_CHAR_RE.sub(r'\1\1', tmp)
    tmp = _NON_LETTER_RE.sub(' ', tmp)
    # Remove words that are shorter than three characters
    tmp = _SHORT_WORD_RE.sub('', tmp)
    # The removals above leave gaps behind; normalise them and trim the
    # edges so the result contains no empty tokens.
    tmp = _WHITESPACE_RE.sub(' ', tmp)
    return tmp.strip()



# Translation table that deletes every character of string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)
# Stemmer shared by all calls; created on first use so that a new
# PorterStemmer is not constructed for every row.
_stemmer = None


def preprocess(string):
    """Strip punctuation, drop stop words and stem the remaining words.

    Words are lowercased before they are compared against the module-level
    ``stops`` collection, so capitalised stop words are filtered as well.
    The text is split on any whitespace, which means that leading,
    trailing or repeated spaces never produce empty tokens.

    Args:
        string: The text to process. Must be a ``str``.

    Returns:
        The stemmed, lowercase words that are not stop words, joined by
        single spaces. May be the empty string.
    """
    global _stemmer
    if _stemmer is None:
        _stemmer = PorterStemmer()

    # Remove any punctuation character
    removed_punc = string.translate(_PUNCTUATION_TABLE)

    # Remove any stopword and stem what is left
    words = (word.lower() for word in removed_punc.split())
    return ' '.join(_stemmer.stem(word) for word in words if word not in stops)




# Put the rows in random order, then renumber the index from zero
data = data.sample(frac=1)
data = data.reset_index(drop=True)
# Clean every text with clean_string, then pass the result through preprocess
data['text'] = data['text'].apply(clean_string).apply(preprocess)
# Write the processed frame to disk without the index column
data.to_csv('sent140_preprocessed.csv', index=False)