## This is an educational random forest implementation
##
## References:
##
## * A. Criminisi, J. Shotton, and E. Konukoglu, Decision Forests:
##   A Unified Framework for Classification, Regression, Density Estimation,
##   Manifold Learning and Semi-Supervised Learning. Foundations and Trends
##   in Computer Graphics and Computer Vision, NOW Publishers, Vol. 7,
##   No. 2-3, pp. 81-227, 2012.
##
## * Jamie Shotton, Toby Sharp, Pushmeet Kohli, Sebastian Nowozin, John Winn,
##   and Antonio Criminisi, Decision Jungles: Compact and Rich Models for
##   Classification, in Proc. NIPS, 2013.

import copy
import random
from collections import Counter

import numpy as np


def split_data(data, label=0, length=50):
    'Take a large text and divide it into labelled fixed-length chunks'
    strings = [data[i:i + length] for i in range(0, len(data) - length, length)]
    random.shuffle(strings)
    strings = [(s, label) for s in strings]
    ## Hold out 10% of the chunks as a test set
    cut = len(strings) * 10 // 100
    test = strings[:cut]
    training = strings[cut:]
    return test, training


def entropy(data):
    'Compute the Shannon entropy (in nats) of the labels of the data'
    v = Counter([b for _, b in data]).values()
    d = np.array(list(v), dtype=float) / sum(v)
    return -sum(d * np.log(d))


def split(train, feat):
    'Split data according to an information gain criterion'
    ## First compute the entropy of the unsplit data
    Hx = entropy(train)
    if Hx < 0.000001:
        raise ValueError("Entropy very low")
    L1 = []
    L2 = []
    for t in train:
        if feat in t[0]:
            L1 += [t]
        else:
            L2 += [t]
    E1 = entropy(L1)
    E2 = entropy(L2)
    L = float(len(train))
    ## Information gain: parent entropy minus the weighted entropy
    ## of the two children
    H = Hx - E1 * len(L1) / L - E2 * len(L2) / L
    return H, L1, L2, feat


## --------------------------
## - The random forest code -
## --------------------------

def build_tree(train, features, levels=5, numfeatures=100):
    'Train a decision tree based on labelled data and features'
    if levels == 0:
        C1 = Counter([b for _, b in train])
        Leaf = (None, C1)
        return Leaf
    else:
        try:
            X = (split(train, F) for F in random.sample(features, numfeatures))
            ## Keep the sampled feature with the highest information gain
            H, L1, L2, F = max(X, key=lambda x: x[0])
            M1 = build_tree(L1, features, levels - 1, numfeatures)
            M2 = build_tree(L2, features, levels - 1, numfeatures)
            Branch = (F, M1, M2)
            return Branch
        except ValueError:
            ## The data cannot be split any further: make a leaf
            return build_tree(train, features, levels=0)


def classify(tree, item):
    'Get a decision for an item using a tree'
    if len(tree) == 2:
        assert tree[0] is None
        return tree[1]
    else:
        fet, L1, L2 = tree
        if fet in item:
            return classify(L1, item)
        else:
            return classify(L2, item)
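
## A minimal sketch of how the tree code above fits together, assuming toy
## data in place of the Gutenberg texts used in __main__ below. The helper
## name _example_tree, the toy strings and the feature list are illustrative
## additions (not part of the original code), and the function is not called
## automatically.

def _example_tree():
    'Toy demonstration: separate "aaa..." strings from "bbb..." strings'
    train = [("aaa" * 5, 0)] * 20 + [("bbb" * 5, 1)] * 20
    features = ["aaa", "bbb", "abc", "xyz"]
    ## Consider all four features at each split; only "aaa" and "bbb"
    ## carry any information gain, so one of them ends up at the root
    tree = build_tree(train, features, levels=2, numfeatures=4)
    ## Leaves hold label counters for the training items that reached them
    print(classify(tree, "aaaaaaaaa"))   # Counter({0: 20})
    print(classify(tree, "bbbbbbbbb"))   # Counter({1: 20})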

## ----------------------------
## - The decision jungle code -
## ----------------------------

def build_jungle(train, features, levels=20, numfeatures=100):
    'Train a decision DAG of bounded width based on labelled data and features'
    DAG = {0: copy.copy(train)}
    Candidate_sets = [0]
    next_ID = 0
    M = 20  ## The maximum number of nodes kept per level
    for level in range(levels):
        result_sets = []
        for tdata_idx in Candidate_sets:
            tdata = DAG[tdata_idx]
            if entropy(tdata) == 0.0:
                ## A pure node: pass it through unsplit. The sentinel
                ## feature True routes every item to the single child.
                next_ID += 1
                idx1 = next_ID
                result_sets += [idx1]
                DAG[idx1] = tdata + []
                del DAG[tdata_idx][:]
                DAG[tdata_idx] += [True, idx1, idx1]
                continue

            X = (split(tdata, F) for F in random.sample(features, numfeatures))
            H, L1, L2, F = max(X, key=lambda x: x[0])

            next_ID += 1
            idx1 = next_ID
            DAG[idx1] = L1
            next_ID += 1
            idx2 = next_ID
            DAG[idx2] = L2
            result_sets += [idx1, idx2]
            del DAG[tdata_idx][:]
            DAG[tdata_idx] += [F, idx1, idx2]

        ## Now merge the result sets down to at most M nodes: fold each
        ## surplus node into the retained node that increases the total
        ## weighted entropy the least
        random.shuffle(result_sets)
        basic = result_sets[:M]
        for r in result_sets[M:]:
            maxv = None
            maxi = None
            for b in basic:
                L = float(len(DAG[r] + DAG[b]))
                e1 = len(DAG[r]) * entropy(DAG[r])
                e2 = len(DAG[b]) * entropy(DAG[b])
                newe = L * entropy(DAG[r] + DAG[b])
                score = abs(e1 + e2 - newe)
                if maxv is None:
                    maxv = score
                    maxi = b
                    continue
                if score < maxv:
                    maxv = score
                    maxi = b
            DAG[maxi] += DAG[r]
            ## Alias the merged node to the retained one, so that branches
            ## pointing at r share the retained node's eventual contents
            DAG[r] = DAG[maxi]
        Candidate_sets = basic

    ## Turn the nodes of the final level into leaves holding label counts
    for tdata_idx in Candidate_sets:
        tdata = DAG[tdata_idx]
        C1 = Counter([b for _, b in tdata])
        del DAG[tdata_idx][:]
        DAG[tdata_idx] += [None, C1]
    return DAG


def classify_jungle(DAG, item):
    'Get a decision for an item using a jungle'
    branch = DAG[0]
    while branch[0] is not None:
        fet, L1, L2 = branch
        ## A feature of True is the pass-through sentinel; both children
        ## point to the same node, so the left branch is always correct
        if fet is True or fet in item:
            branch = DAG[L1]
        else:
            branch = DAG[L2]
    return branch[1]


## -------------------------
## - Sample classification -
## -------------------------

if __name__ == "__main__":
    # dataEN = open("../data/pg23428.txt").read()
    # dataFR = open("../data/pg5711.txt").read()
    dataEN = open("../data/pg110.txt").read()
    dataFR = open("../data/pg42671.txt").read()

    length = 200
    testEN, trainEN = split_data(dataEN, label=0, length=length)
    testFR, trainFR = split_data(dataFR, label=1, length=length)
    print("training: EN=%s FR=%s" % (len(trainEN), len(trainFR)))

    train = trainEN + trainFR
    random.shuffle(train)
    test = testEN + testFR
    random.shuffle(test)

    ## Now make a bunch of features. A feature is a short substring that
    ## occurs in more than 1% but fewer than 99% of a sample of training
    ## chunks, so that splitting on it can be informative.
    sometrain = random.sample(train, 1000)
    features = set()
    while len(features) < 700:
        fragment, _ = random.choice(sometrain)
        l = int(round(random.expovariate(0.20)))
        b = random.randint(0, max(0, length - l))
        feat = fragment[b:b + l]

        ## Test how frequent the candidate feature is in the sample
        C = 0
        for st, _ in sometrain:
            if feat in st:
                C += 1
        f = float(C) / len(sometrain)
        if 0.01 < f < 0.99 and feat not in features:
            features.add(feat)

    features = list(features)

    manytrees = []
    jungle = []
    for i in range(10):
        print("Build tree %s" % i)
        size = len(train) // 3
        training_sample = random.sample(train, size)

        tree = build_jungle(training_sample, features, numfeatures=100)
        jungle += [tree]

        tree = build_tree(training_sample, features, numfeatures=100)
        manytrees += [tree]

    testdata = test
    results_tree = Counter()
    results_jungle = Counter()
    for item, cat in testdata:
        # Trees: sum the leaf counters across the forest and predict
        # the majority label
        c = Counter()
        for tree in manytrees:
            c += classify(tree, item)
        res = (max(c, key=lambda x: c[x]), cat)
        results_tree.update([res])

        # Jungle
        c = Counter()
        for tree in jungle:
            c += classify_jungle(tree, item)
        res = (max(c, key=lambda x: c[x]), cat)
        results_jungle.update([res])

    print()
    print("Results          Tree   Jungle")
    print("True positives:  %4d   %4d"
          % (results_tree[(1, 1)], results_jungle[(1, 1)]))
    print("True negatives:  %4d   %4d"
          % (results_tree[(0, 0)], results_jungle[(0, 0)]))
    print("False positives: %4d   %4d"
          % (results_tree[(1, 0)], results_jungle[(1, 0)]))
    print("False negatives: %4d   %4d"
          % (results_tree[(0, 1)], results_jungle[(0, 1)]))
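

## A minimal smoke test for the jungle code, assuming small synthetic
## vowel/consonant corpora in place of the Gutenberg texts. The helper
## name _example_jungle, the fixed seed and the toy alphabets are
## illustrative additions (not part of the original code), and the
## function is not called automatically.

def _example_jungle():
    'Toy demonstration: a jungle separating vowel-heavy from consonant-heavy text'
    random.seed(1)
    dataA = "".join(random.choice("aeiou ") for _ in range(5000))
    dataB = "".join(random.choice("bcdfg ") for _ in range(5000))
    testA, trainA = split_data(dataA, label=0, length=20)
    testB, trainB = split_data(dataB, label=1, length=20)
    train = trainA + trainB
    random.shuffle(train)
    ## Single characters are strongly separating features here
    features = list("aeioubcdfg")
    DAG = build_jungle(train, features, levels=4, numfeatures=4)
    correct = 0
    testdata = testA + testB
    for item, cat in testdata:
        c = classify_jungle(DAG, item)
        ## An empty counter can occur if an item reaches an empty leaf
        guess = max(c, key=lambda x: c[x]) if c else None
        if guess == cat:
            correct += 1
    print("jungle accuracy: %d / %d" % (correct, len(testdata)))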