python source code of psykey

# author: Asitang Mishra
# asitang.mishra@jpl.nasa.gov
# asitang@gmail.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.



import nltk
import string
import os
from stemming.porter2 import stem
import io
import sys
import argparse
import csv
import features as feat


# A class to do stylstic extractions from text: To use programatically, initialize the class. This will calculate different kinds of stylistic features from the text,
# eg. as many times it finds a punctuation it will add the word 'punc' to the 'featspace list'. Similarly, all the extractions are added to 'featspace'
# in form of signatures. To access a specific feature, call that specific method from the object of the class.
class psykey:
    def __init__(self, text, wordlistfolder):
        self.text = text
        self.tokens = nltk.word_tokenize(text)
        self.sentenses = nltk.sent_tokenize(text)
        self.tags = nltk.pos_tag(self.tokens)

        self.featspace = []

        self.psykfeatspace(self.featspace, wordlistfolder)
        self.bigrams(self.featspace)
        self.number_count(self.featspace)
        self.punc_count(self.featspace)
        self.big_word_count(self.featspace)
        self.words_per_sentence(self.featspace)
        self.sentence_count(self.featspace)
        self.countPOS(self.featspace, 'CC')
        self.countPOS(self.featspace, 'NP')
        self.countPOS(self.featspace, 'NNP')
        self.words(self.featspace)
        self.stem(self.featspace)

    # Counts a specific POS tags
    def countPOS(self, featspace, postag):
        tags = self.tags
        count = 0
        for word in tags:
            if word[1] == postag:
                count += 1
                featspace.append(postag)

    # Counts number of words
    def words(self, featspace):
        tokens = self.tokens
        featspace.extend(tokens)
        return len(tokens)

    # Count number of sentenses
    def sentence_count(self, featspace):
        sentences = self.sentenses
        count = len(sentences)
        for i in range(0, count):
            featspace.append('sentcount')
        return count

    # Counds the average number of words per sentence
    def words_per_sentence(self, featspace):
        token_length = len(self.tokens)
        sentences_length = len(self.sentenses)
        count = int(token_length / sentences_length)
        for i in range(0, count):
            featspace.append('wordspersentence')

    # Counts the number of big words: words bigger than 6 chars
    def big_word_count(self, featspace):
        count = 0
        tokens = self.tokens
        for word in tokens:
            if len(word) > 6:
                count += 1
                featspace.append('bigword')

    # Counts the total number of punctuations in the text
    def punc_count(self, featspace):
        count = 0
        tokens = self.tokens
        punctuations = string.punctuation.replace('.', '')
        for word in tokens:
            if word in punctuations:
                count += 1
                featspace.append('punc')

    # Counts teh number of numerical words in text
    def number_count(self, featspace):
        count = 0
        tokens = self.tokens
        for word in tokens:
            flag = 0
            for ch in word:
                if ch in '0123456789':
                    flag = 1
                    break
            if flag == 1:
                featspace.append('numbers')
                count += 1

    # Creates bigrams
    def bigrams(self, featspace):
        tokens = self.tokens
        for count in range(0, len(tokens) - 1):
            featspace.append(tokens[count] + tokens[count + 1])

    # Opens the folder with all the wordlists. matches the words in text with the words in each file. If match found, creates a feature/signature with the name of the
    # file.
    def psykfeatspace(self, featspace, wordlistfolder):
        tokens = self.tokens
        for filename in os.listdir(wordlistfolder):
            if '.txt' in filename:
                names = [line.strip() for line in open(os.path.join(wordlistfolder , filename), 'r')]
            else:
                continue
            for token in tokens:
                if token in names:
                    featspace.append(filename.replace('.txt', ''))

    # Creates stemmed words from the text
    def stem(self, featspace):
        tokens = self.tokens
        for token in tokens:
            featspace.append(stem(token))


def ClaculatePairwise(inputdir, outputcsv, wordlists):
    files = os.listdir(inputdir)
    calculated=set()
    with open(outputcsv, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["file1", "file2", "Similarity_score"])
        for file1 in files:
            for file2 in files:
                if '.txt' in file1 and '.txt' in file2 and file1+'\t'+file2 not in calculated and file1!=file2:
                    calculated.add(file1+'\t'+file2)
                    calculated.add(file2 + '\t' + file1)
                    text1 = open(os.path.join(inputdir,file1), 'rU')
                    text2 = open(os.path.join(inputdir,file2), 'rU')
                else:
                    continue
                raw1 = text1.read()
                raw2 = text2.read()
                psykey1 = psykey(raw1, wordlists)
                psykey2 = psykey(raw2, wordlists)

                score = feat.get_cosine_similarity(psykey1.featspace, psykey2.featspace)
                a.writerow([file1, file2, score])

                text1.close()
                text2.close()


if __name__ == '__main__':
    argParser = argparse.ArgumentParser('Cosine Similarity based on stylistic features')
    argParser.add_argument('--inputDir', required=True,help='path to directory for storing the output CSV File, containing pair-wise Cosine similarity on stylistic features')
    argParser.add_argument('--outCSV', required=True, help='path to output file')
    argParser.add_argument('--wordlists', required=True, help='wordlist folder with files containing lists of words (one per line)')
    args = argParser.parse_args()

    if args.inputDir and args.outCSV and args.wordlists:
        ClaculatePairwise(args.inputDir, args.outCSV, args.wordlists)