import re
import os
import numpy as np
import sklearn.datasets
from fuzzywuzzy import fuzz
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.model_selection import train_test_split
from unidecode import unidecode
import itertools
import random
import sys
import json
import tensorflow as tf
from pathlib import Path
from urllib.request import urlretrieve
from .tatabahasa import *
from .utils import *

# local cache locations for downloaded settings and frozen graphs
home = str(Path.home()) + '/Malaya'
stopwords_location = home + '/stop-word-kerulnet'
char_settings = home + '/char-settings.json'
char_frozen = home + '/char_frozen_model.pb'
concat_settings = home + '/concat-settings.json'
concat_frozen = home + '/concat_frozen_model.pb'
attention_settings = home + '/attention-settings.json'
attention_frozen = home + '/attention_pos_frozen_model.pb'

STOPWORDS = None
stopword_tatabahasa = list(set(tanya_list + perintah_list + pangkal_list + bantu_list + penguat_list +
                               penegas_list + nafi_list + pemeri_list + sendi_list + pembenar_list +
                               nombor_list + suku_bilangan_list + pisahan_list + keterangan_list +
                               arah_list + hubung_list + gantinama_list))
LOC = os.path.dirname(os.path.abspath(__file__))

try:
    if not os.path.exists(home):
        os.makedirs(home)
except:
    print('cannot make directory for cache, exiting.')
    sys.exit(1)

if not os.path.isfile(stopwords_location):
    print('downloading stopwords')
    download_file("https://raw.githubusercontent.com/DevconX/Malaya/master/data/stop-word-kerulnet", stopwords_location)
with open(stopwords_location, 'r') as fopen:
    STOPWORDS = list(filter(None, fopen.read().split('\n')))


class USER_BAYES:
    def __init__(self, multinomial, label, vectorize):
        self.multinomial = multinomial
        self.label = label
        self.vectorize = vectorize

    def predict(self, string):
        vectors = self.vectorize.transform([string])
        results = self.multinomial.predict_proba(vectors)[0]
        out = []
        for no, i in enumerate(self.label):
            out.append((i, results[no]))
        return out


class NORMALIZE:
    def __init__(self, user, corpus):
        self.user = user
        self.corpus = corpus

    def normalize(self, string):
        original_string = string
        string = string.lower()
        if string[0] == 'x':
            if len(string) == 1:
                return 'tak'
            result_string = 'tak '
            string = string[1:]
        else:
            result_string = ''
        results = []
        for i in range(len(self.user)):
            total = 0
            for k in self.user[i]:
                total += fuzz.ratio(string, k)
            results.append(total)
        if len(np.where(np.array(results) > 60)[0]) < 1:
            return original_string
        ids = np.argmax(results)
        return result_string + self.corpus[ids]


class DEEP_MODELS:
    def __init__(self, nodes, sess, predict):
        self.nodes = nodes
        self.sess = sess
        self.__predict = predict

    def predict(self, string):
        return self.__predict(string, self.sess, self.nodes)


VOWELS = "aeiou"
PHONES = ['sh', 'ch', 'ph', 'sz', 'cz', 'sch', 'rz', 'dz']


def isWord(word):
    # heuristic gibberish filter: reject words with too many consecutive
    # vowels or consonants, allowing known consonant phone clusters
    if word:
        consecutiveVowels = 0
        consecutiveConsonents = 0
        for idx, letter in enumerate(word.lower()):
            vowel = True if letter in VOWELS else False
            if idx:
                prev = word[idx - 1]
                prevVowel = True if prev in VOWELS else False
                if not vowel and letter == 'y' and not prevVowel:
                    vowel = True
                if prevVowel != vowel:
                    consecutiveVowels = 0
                    consecutiveConsonents = 0
            if vowel:
                consecutiveVowels += 1
            else:
                consecutiveConsonents += 1
            if consecutiveVowels >= 3 or consecutiveConsonents > 3:
                return False
            if consecutiveConsonents == 3:
                subStr = word[idx - 2:idx + 1]
                if any(phone in subStr for phone in PHONES):
                    consecutiveConsonents -= 1
                    continue
                return False
    return True
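# A rough usage sketch (hedged) of the filter above; inputs are illustrative only:
#   isWord('makan')   # True  - vowels and consonants alternate
#   isWord('xkcdqq')  # False - three consecutive consonants with no known phone cluster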
list_laughing = ['huhu', 'haha', 'gaga', 'hihi', 'wkawka', 'wkwk', 'kiki', 'keke', 'rt']


def textcleaning(string):
    # drop hashtags, mentions and links, strip accents, remove laughing tokens
    # and duplicated halves, then filter stopwords
    string = re.sub('http\S+|www.\S+', '', ' '.join([i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]))
    string = unidecode(string).replace('.', '. ')
    string = string.replace(',', ', ')
    string = re.sub('[^\'\"A-Za-z\- ]+', '', unidecode(string))
    string = [y.strip() for y in word_tokenize(string.lower()) if isWord(y.strip())]
    string = [y for y in string if all([y.find(k) < 0 for k in list_laughing]) and y[:len(y) // 2] != y[len(y) // 2:]]
    string = ' '.join(string).lower()
    string = (''.join(''.join(s)[:2] for _, s in itertools.groupby(string))).split()
    return ' '.join([y for y in string if y not in STOPWORDS])


def process_word(word, lower=True):
    if lower:
        word = word.lower()
    else:
        if word.isupper():
            word = word.title()
    word = re.sub('[^A-Za-z0-9\- ]+', '', word)
    if word.isdigit():
        word = 'NUM'
    return word


def clearstring(string):
    string = unidecode(string)
    string = re.sub('[^A-Za-z ]+', '', string)
    string = word_tokenize(string)
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string).lower()
    string = ''.join(''.join(s)[:2] for _, s in itertools.groupby(string))
    return ' '.join([i for i in string.split() if i not in STOPWORDS])


def str_idx(corpus, dic, UNK=3):
    # encode every sequence as right-aligned, zero-padded integer ids;
    # tokens missing from the dictionary fall back to UNK
    maxlen = max([len(i) for i in corpus])
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen][::-1]):
            try:
                X[i, -1 - no] = dic[k]
            except Exception as e:
                X[i, -1 - no] = UNK
    return X


def generate_char_seq(batch, idx2word, char2idx):
    # build a (batch, sequence, char) tensor of right-aligned character ids
    # for every word id in the batch
    x = [[len(idx2word[i]) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((batch.shape[0], batch.shape[1], maxlen), dtype=np.int32)
    for i in range(batch.shape[0]):
        for k in range(batch.shape[1]):
            for no, c in enumerate(idx2word[batch[i, k]]):
                temp[i, k, -1 - no] = char2idx[c]
    return temp


def get_entity_char(string, sess, model):
    batch_x = str_idx([process_word(w) for w in string.split()], model['char2idx'])
    logits, logits_pos = sess.run([tf.argmax(model['logits'], 1), tf.argmax(model['logits_pos'], 1)],
                                  feed_dict={model['X']: batch_x})
    results = []
    for no, i in enumerate(string.split()):
        results.append((i, model['idx2tag'][str(logits[no])], model['idx2pos'][str(logits_pos[no])]))
    return results


def get_entity_concat(string, sess, model):
    test_X = []
    for w in string.split():
        w = process_word(w)
        try:
            test_X.append(model['word2idx'][w])
        except:
            test_X.append(2)
    array_X = np.array([test_X])
    batch_x_char = generate_char_seq(array_X, model['idx2word'], model['char2idx'])
    Y_pred, Y_pos = sess.run([model['crf_decode'], model['crf_decode_pos']],
                             feed_dict={model['word_ids']: array_X, model['char_ids']: batch_x_char})
    results = []
    for no, i in enumerate(string.split()):
        results.append((i, model['idx2tag'][str(Y_pred[0, no])], model['idx2pos'][str(Y_pos[0, no])]))
    return results


def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph
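# A small sketch (hedged) of the id-encoding helper above, with a toy char2idx:
#   str_idx(['ab', 'c'], {'a': 1, 'b': 2, 'c': 3})
#   -> array([[1., 2.],
#             [0., 3.]])   # right-aligned, zero-padded to the longest sequence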
def deep_learning(model='attention'):
    # load one of the frozen entity/POS tagging graphs: 'char', 'concat' or 'attention';
    # settings and frozen models are downloaded into the cache on first use
    if model == 'char':
        if not os.path.isfile(char_settings):
            print('downloading char settings')
            download_file("https://raw.githubusercontent.com/DevconX/Malaya/master/data/char-settings.json", char_settings)
        with open(char_settings, 'r') as fopen:
            nodes = json.loads(fopen.read())
        if not os.path.isfile(char_frozen):
            print('downloading frozen char model')
            download_file("https://raw.githubusercontent.com/DevconX/Malaya/master/data/char_frozen_model.pb", char_frozen)
        g = load_graph(char_frozen)
        nodes['X'] = g.get_tensor_by_name('import/Placeholder:0')
        nodes['logits'] = g.get_tensor_by_name('import/logits:0')
        nodes['logits_pos'] = g.get_tensor_by_name('import/logits_pos:0')
        return DEEP_MODELS(nodes, tf.InteractiveSession(graph=g), get_entity_char)
    elif model == 'concat':
        if not os.path.isfile(concat_settings):
            print('downloading concat settings')
            download_file("https://raw.githubusercontent.com/DevconX/Malaya/master/data/concat-settings.json", concat_settings)
        with open(concat_settings, 'r') as fopen:
            nodes = json.loads(fopen.read())
        if not os.path.isfile(concat_frozen):
            print('downloading frozen concat model')
            download_file("https://raw.githubusercontent.com/DevconX/Malaya/master/data/concat_frozen_model.pb", concat_frozen)
        g = load_graph(concat_frozen)
        nodes['word_ids'] = g.get_tensor_by_name('import/Placeholder:0')
        nodes['char_ids'] = g.get_tensor_by_name('import/Placeholder_1:0')
        nodes['crf_decode'] = g.get_tensor_by_name('import/entity-logits/cond/Merge:0')
        nodes['crf_decode_pos'] = g.get_tensor_by_name('import/pos-logits/cond/Merge:0')
        nodes['idx2word'] = {int(k): v for k, v in nodes['idx2word'].items()}
        return DEEP_MODELS(nodes, tf.InteractiveSession(graph=g), get_entity_concat)
    elif model == 'attention':
        if not os.path.isfile(attention_settings):
            print('downloading attention settings')
            download_file("https://raw.githubusercontent.com/DevconX/Malaya/master/data/attention-settings.json", attention_settings)
        with open(attention_settings, 'r') as fopen:
            nodes = json.loads(fopen.read())
        if not os.path.isfile(attention_frozen):
            print('downloading frozen attention model')
            download_file("https://raw.githubusercontent.com/DevconX/Malaya/master/data/attention_frozen_model.pb", attention_frozen)
        g = load_graph(attention_frozen)
        nodes['word_ids'] = g.get_tensor_by_name('import/Placeholder:0')
        nodes['char_ids'] = g.get_tensor_by_name('import/Placeholder_1:0')
        nodes['crf_decode'] = g.get_tensor_by_name('import/entity-logits/cond/Merge:0')
        nodes['crf_decode_pos'] = g.get_tensor_by_name('import/pos-logits/cond/Merge:0')
        nodes['idx2word'] = {int(k): v for k, v in nodes['idx2word'].items()}
        return DEEP_MODELS(nodes, tf.InteractiveSession(graph=g), get_entity_concat)
    else:
        raise Exception('model not supported')


def tokenizer(string):
    return [word_tokenize(t) for t in sent_tokenize(string)]


def naive_POS(word):
    for key, vals in tatabahasa_dict.items():
        if word in vals:
            return (key, word)
    try:
        if len(re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung[:1])), word)[0]) > 1:
            return ('KJ', word)
    except:
        pass
    try:
        if len(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[:-4])), word)[0]) > 1:
            return ('KJ', word)
    except Exception as e:
        pass
    if len(word) > 2:
        return ('KN', word)
    else:
        return ('', word)


def naive_POS_string(string):
    string = string.lower()
    results = []
    for i in word_tokenize(string):
        results.append(naive_POS(i))
    return results


def stemming(word):
    try:
        word = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)[0][0]
        mula = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)[0][1]
        return word.replace(mula, '')
    except:
        return word


def variant(word):
    # generate single-edit variants: deletes, transposes, replaces and inserts
    word = word.lower()
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts = [a + c + b for a, b in splits for c in alphabet]
    return np.unique(deletes + transposes + replaces + inserts, return_counts=True)
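# Rough usage sketches (hedged); outputs depend on the word lists in tatabahasa
# and on the downloaded frozen graphs:
#   naive_POS_string('saya suka makan ayam')   # [(tag, word), ...]
#   stemming('makanan')                        # strips known prefixes/suffixes if matched
#   model = deep_learning('attention')         # downloads settings + frozen graph on first call
#   model.predict('kuala lumpur bandar raya')  # [(word, entity_tag, pos_tag), ...]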
def basic_normalize(string):
    result = []
    for i in string.lower().split():
        if i == 'x':
            result.append('tidak')
        elif i[-1] == '2':
            result.append(i[:-1] + '-' + i[:-1])
        else:
            result.append(i)
    return ' '.join(result)


def train_normalize(corpus):
    if not isinstance(corpus, list) and not isinstance(corpus, tuple):
        raise Exception('a list or a tuple of words is needed for the corpus')
    transform = []
    for i in corpus:
        i = i.lower()
        result = []
        # common short-form patterns: dropped vowels, 'a' -> 'e' / '-ak' endings,
        # 'ar' -> 'o', dropped leading 'h', and reduplication written with '2'
        result.append(''.join(char for char in i if char not in 'aeiou'))
        if i[-1] == 'a':
            result.append(i[:-1] + 'e')
            result.append(i + 'k')
        if i[-2:] == 'ar':
            result.append(i[:-2] + 'o')
        if i[:2] == 'ha':
            result.append(i[1:])
        splitted_double = i.split('-')
        if len(splitted_double) > 1 and splitted_double[0] == splitted_double[1]:
            result.append(splitted_double[0] + '2')
        transform.append(result)
    return NORMALIZE(transform, corpus)


def separate_dataset(trainset):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        data_ = list(filter(None, data_))
        datastring += data_
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget


def train_bayes(corpus, tokenizing=True, cleaning=True, normalizing=True, stem=True, vector='tfidf', split=0.2):
    multinomial, labels, vectorize = None, None, None
    if vector.lower().find('tfidf') < 0 and vector.lower().find('bow') < 0:
        raise Exception('Invalid vectorization technique')
    if isinstance(corpus, str):
        trainset = sklearn.datasets.load_files(container_path=corpus, encoding='UTF-8')
        trainset.data, trainset.target = separate_dataset(trainset)
        data, target = trainset.data, trainset.target
        labels = trainset.target_names
    if isinstance(corpus, list) or isinstance(corpus, tuple):
        corpus = np.array(corpus)
        data, target = corpus[:, 0].tolist(), corpus[:, 1].tolist()
        labels = np.unique(target).tolist()
    target = LabelEncoder().fit_transform(target)
    c = list(zip(data, target))
    random.shuffle(c)
    data, target = zip(*c)
    data, target = list(data), list(target)
    if stem:
        for i in range(len(data)):
            data[i] = ' '.join([stemming(k) for k in data[i].split()])
    if cleaning:
        for i in range(len(data)):
            data[i] = clearstring(data[i])
    if vector.lower().find('tfidf') >= 0:
        vectorize = TfidfVectorizer().fit(data)
        vectors = vectorize.transform(data)
    else:
        vectorize = CountVectorizer().fit(data)
        vectors = vectorize.transform(data)
    multinomial = MultinomialNB()
    if split:
        train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size=split)
        multinomial.partial_fit(train_X, train_Y, classes=np.unique(target))
        predicted = multinomial.predict(test_X)
        print(metrics.classification_report(test_Y, predicted, target_names=labels))
    else:
        multinomial.partial_fit(vectors, target, classes=np.unique(target))
        predicted = multinomial.predict(vectors)
        print(metrics.classification_report(target, predicted, target_names=labels))
    return USER_BAYES(multinomial, labels, vectorize)
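# End-to-end sketches (hedged); the corpora below are illustrative only:
#   normalizer = train_normalize(['tak', 'makan'])
#   normalizer.normalize('xmkn')   # -> 'tak makan' when fuzzy scores pass the 60 threshold
#
#   bayes = train_bayes([('suka sangat filem ni', 'positive'),
#                        ('benci betul dengan servis ni', 'negative')], split=0)
#   bayes.predict('suka')          # [(label, probability), ...]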