/*******************************************************************************
 * Copyright (c) 2011 Dipanjan Das
 * Language Technologies Institute,
 * Carnegie Mellon University,
 * All Rights Reserved.
 *
 * LinDekNeighbors.java is part of SEMAFOR 2.0.
 *
 * SEMAFOR 2.0 is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * SEMAFOR 2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with SEMAFOR 2.0. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package edu.cmu.cs.lti.ark.fn.identification;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.StringTokenizer;

import edu.cmu.cs.lti.ark.fn.data.prep.ParsePreparation;
import edu.cmu.cs.lti.ark.fn.wordnet.WordNetRelations;
import edu.cmu.cs.lti.ark.util.ds.Pair;
import gnu.trove.THashSet;
import gnu.trove.TObjectIntHashMap;

/**
 * Converts similarity-list files ({@code simA.lsp}, {@code simN.lsp},
 * {@code simV.lsp}) into a single tab-separated neighbor file.
 *
 * <p>Each input entry is expected to look like
 * <pre>
 * (headword (desc ...
 * neighbor score
 * "multi word neighbor" score
 * ))
 * </pre>
 * and produces one output line per head word (two when the head word is
 * treated as both adjective and adverb):
 * {@code head.POS \t neighbor.POS \t score \t neighbor.POS \t score ...}.
 *
 * <p>The adjective/adverb pass creates (truncates) the output file; the noun
 * and verb passes append to it.
 */
public class LinDekNeighbors {
    /** Maximum number of neighbor lines consumed per head word. */
    public static final int MAX_NEIGHBORS = 20;

    private static final String LINDEK_DIRECTORY = "/home/dipanjan/work/fall2010/SSL/FNData";
    private static final String OUT_FILE = LINDEK_DIRECTORY + "/lindekneighbors.dat";
    private static final String STOP_FILE = "lrdata/stopwords.txt";
    private static final String WN_CONFIG_FILE = "file_properties.xml";

    /** Line that terminates one entry in the {@code .lsp} files. */
    private static final String ENTRY_END = "))";
    /** Marker that follows the head word on the first line of an entry. */
    private static final String DESC_MARKER = "(desc";

    /* Return codes of findWordPOS; they double as a bit mask (adjective = 1, adverb = 2). */
    private static final int POS_ADJECTIVE = 1;
    private static final int POS_ADVERB = 2;
    private static final int POS_BOTH = POS_ADJECTIVE | POS_ADVERB;

    /**
     * Loads the adjective/adverb count tables, prints their sizes, and runs
     * the verb pass. The other passes are kept as commented-out toggles;
     * note that the adjective/adverb pass overwrites the output file while
     * the noun and verb passes append to it.
     */
    public static void main(String[] args) {
        Pair<TObjectIntHashMap<String>, TObjectIntHashMap<String>> p = readAdjectivesAndAdverbs();
        TObjectIntHashMap<String> adjectives = p.getFirst();
        TObjectIntHashMap<String> adverbs = p.getSecond();
        System.out.println("Number of adjectives:" + adjectives.size());
        System.out.println("Number of adverbs:" + adverbs.size());
        try {
            //createNeighborsForAdverbsAndAdjectives(p);
            //createNeighborsForNouns();
            createNeighborsForVerbs();
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(-1);
        }
    }

    /**
     * Appends the verb neighbors read from {@code simV.lsp} to the output
     * file, tagging every unit with the {@code .v} suffix.
     *
     * @throws IOException if a file cannot be read/written or an entry is malformed
     */
    public static void createNeighborsForVerbs() throws IOException {
        // The original body started with a debug lemma lookup followed by an
        // unconditional System.exit(-1), which made the method a no-op.
        createLemmatizedNeighbors("simV.lsp", "V", ".v");
    }

    /**
     * Appends the noun neighbors read from {@code simN.lsp} to the output
     * file, tagging every unit with the {@code .n} suffix.
     *
     * @throws IOException if a file cannot be read/written or an entry is malformed
     */
    public static void createNeighborsForNouns() throws IOException {
        createLemmatizedNeighbors("simN.lsp", "N", ".n");
    }

    /**
     * Writes the adjective/adverb neighbors read from {@code simA.lsp} to the
     * output file, replacing any previous content of that file.
     *
     * <p>Quoted (multi-word) units are classified through the
     * {@code mult.word.j} / {@code mult.word.r} lists; all other units are
     * classified with {@link #findWordPOS}. A unit classified as both is
     * emitted twice, once with {@code .a} and once with {@code .adv}. A
     * multi-word unit found in neither list is not emitted at all.
     *
     * @param p pair of (adjective counts, adverb counts) keyed by word
     * @throws IOException if a file cannot be read/written or an entry is malformed
     */
    public static void createNeighborsForAdverbsAndAdjectives(
            Pair<TObjectIntHashMap<String>, TObjectIntHashMap<String>> p) throws IOException {
        THashSet<String> adjSet = readLowerCasedSet(LINDEK_DIRECTORY + "/mult.word.j");
        THashSet<String> advSet = readLowerCasedSet(LINDEK_DIRECTORY + "/mult.word.r");
        TObjectIntHashMap<String> dAdjectives = p.getFirst();
        TObjectIntHashMap<String> dAdverbs = p.getSecond();

        BufferedReader bReader = null;
        BufferedWriter bWriter = null;
        try {
            // Open the input first so that a missing input does not wipe the output file.
            bReader = new BufferedReader(new FileReader(LINDEK_DIRECTORY + "/simA.lsp"));
            bWriter = new BufferedWriter(new FileWriter(OUT_FILE));
            ArrayList<String> entry;
            while ((entry = readEntry(bReader)) != null) {
                StringBuilder outline = new StringBuilder();
                int knn = 0;
                for (int i = 1; i < entry.size() && knn < MAX_NEIGHBORS; i++) {
                    ArrayList<String> toks = tokenize(entry.get(i));
                    if (toks.isEmpty()) {
                        continue;
                    }
                    double value = neighborScore(toks);
                    String raw = neighborUnit(toks);
                    boolean multiWord = isQuoted(raw);
                    String unit = multiWord ? stripQuotes(raw) : raw.toLowerCase();
                    // The POS is decided afresh for every neighbor; it must not
                    // inherit the decision made for a previous neighbor.
                    int wpos = resolvePOS(multiWord, unit, adjSet, advSet, dAdjectives, dAdverbs);
                    if ((wpos & POS_ADJECTIVE) != 0) {
                        outline.append(unit).append(".a\t").append(value).append('\t');
                    }
                    if ((wpos & POS_ADVERB) != 0) {
                        outline.append(unit).append(".adv\t").append(value).append('\t');
                    }
                    knn++;
                }
                String neighbors = outline.toString().trim();

                String rawHead = parseHeadWord(entry);
                boolean multiWordHead = isQuoted(rawHead);
                String head = multiWordHead ? stripQuotes(rawHead) : rawHead.toLowerCase();
                int headPos = resolvePOS(multiWordHead, head, adjSet, advSet, dAdjectives, dAdverbs);
                if ((headPos & POS_ADJECTIVE) != 0) {
                    bWriter.write(head + ".a\t" + neighbors + "\n");
                }
                if ((headPos & POS_ADVERB) != 0) {
                    bWriter.write(head + ".adv\t" + neighbors + "\n");
                }
            }
        } finally {
            closeBoth(bReader, bWriter);
        }
    }

    /** Returns whether {@code word} is contained in the multi-word adjective set. */
    public static boolean isMultiWordAdjective(THashSet<String> mAdjectives, String word) {
        return mAdjectives.contains(word);
    }

    /** Returns whether {@code word} is contained in the multi-word adverb set. */
    public static boolean isMultiWordAdverb(THashSet<String> mAdverbs, String word) {
        return mAdverbs.contains(word);
    }

    /**
     * Decides whether a single word should be treated as adjective, adverb or both,
     * based on how often it was counted as each.
     *
     * <p>When both counts are zero (presumably: the word is in neither table --
     * relies on the map returning 0 for absent keys), a word ending in "ly" is
     * an adverb and anything else is both. Otherwise the relative frequencies
     * are compared: if they differ by less than 0.2 the word is both, else the
     * more frequent category wins.
     *
     * @param dAdjectives adjective counts keyed by word
     * @param dAdverbs    adverb counts keyed by word
     * @param word        word to classify
     * @return 1 for adjective, 2 for adverb, 3 for both
     */
    public static int findWordPOS(TObjectIntHashMap<String> dAdjectives,
            TObjectIntHashMap<String> dAdverbs, String word) {
        int adjCount = dAdjectives.get(word);
        int advCount = dAdverbs.get(word);
        if (adjCount == 0 && advCount == 0) {
            return word.endsWith("ly") ? POS_ADVERB : POS_BOTH;
        }
        int total = adjCount + advCount;
        double adjProb = (double) adjCount / (double) total;
        double advProb = (double) advCount / (double) total;
        if (Math.abs(adjProb - advProb) < 0.2) {
            return POS_BOTH;
        }
        return adjProb > advProb ? POS_ADJECTIVE : POS_ADVERB;
    }

    /**
     * Reads the adjective ({@code gw.a}) and adverb ({@code gw.adv}) count
     * files. Each row is expected to be {@code word \t count}; on a row that
     * cannot be parsed the row is printed and the JVM exits with status -1.
     *
     * @return pair of (adjective counts, adverb counts)
     */
    public static Pair<TObjectIntHashMap<String>, TObjectIntHashMap<String>> readAdjectivesAndAdverbs() {
        String adjFile = LINDEK_DIRECTORY + "/gw.a";
        String advFile = LINDEK_DIRECTORY + "/gw.adv";
        TObjectIntHashMap<String> adjMap = readCountMap(adjFile);
        TObjectIntHashMap<String> advMap = readCountMap(advFile);
        return new Pair<TObjectIntHashMap<String>, TObjectIntHashMap<String>>(adjMap, advMap);
    }

    /**
     * Counts adjective and adverb occurrences in the tagged corpus file.
     *
     * <p>Each row is read as {@code n \t w1..wn \t t1..tn ...}: the token
     * count, then the n words, then the n POS tags. Words whose tag starts
     * with "J" are counted as adjectives, those whose tag starts with "RB" as
     * adverbs; words are lower-cased before counting. Progress is printed
     * every 1000 rows. On an I/O error the stack trace is printed and the JVM
     * exits with status -1.
     *
     * @return pair of (adjective counts, adverb counts)
     */
    public static Pair<TObjectIntHashMap<String>, TObjectIntHashMap<String>> scanAdjectivesAndAdverbs() {
        String largeFile = LINDEK_DIRECTORY + "/AP_1m.all.lemma.tags";
        TObjectIntHashMap<String> adjectives = new TObjectIntHashMap<String>();
        TObjectIntHashMap<String> adverbs = new TObjectIntHashMap<String>();
        try {
            BufferedReader bReader = new BufferedReader(new FileReader(largeFile));
            try {
                String line = null;
                int count = 0;
                while ((line = bReader.readLine()) != null) {
                    String[] toks = line.trim().split("\t");
                    int numTokens = Integer.parseInt(toks[0]);
                    for (int i = 0; i < numTokens; i++) {
                        String pos = toks[1 + numTokens + i];
                        if (pos.startsWith("J")) {
                            String word = toks[1 + i].toLowerCase();
                            adjectives.put(word, adjectives.get(word) + 1);
                        } else if (pos.startsWith("RB")) {
                            String word = toks[1 + i].toLowerCase();
                            adverbs.put(word, adverbs.get(word) + 1);
                        }
                    }
                    count++;
                    if (count % 1000 == 0) {
                        System.out.print(". ");
                    }
                    if (count % 10000 == 0) {
                        System.out.println(count);
                    }
                }
            } finally {
                // Release the file handle even when a row fails to parse.
                bReader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(-1);
        }
        return new Pair<TObjectIntHashMap<String>, TObjectIntHashMap<String>>(adjectives, adverbs);
    }

    /**
     * Shared implementation of the noun and verb passes: reads
     * {@code simFile}, lemmatizes unquoted units with the given WordNet POS,
     * and appends the result to the output file.
     */
    private static void createLemmatizedNeighbors(String simFile, String wnPos, String suffix)
            throws IOException {
        WordNetRelations wnr = new WordNetRelations(STOP_FILE, WN_CONFIG_FILE);
        BufferedReader bReader = null;
        BufferedWriter bWriter = null;
        try {
            bReader = new BufferedReader(new FileReader(LINDEK_DIRECTORY + "/" + simFile));
            bWriter = new BufferedWriter(new FileWriter(OUT_FILE, true));
            ArrayList<String> entry;
            while ((entry = readEntry(bReader)) != null) {
                StringBuilder outline = new StringBuilder();
                int knn = 0;
                for (int i = 1; i < entry.size() && knn < MAX_NEIGHBORS; i++) {
                    ArrayList<String> toks = tokenize(entry.get(i));
                    if (toks.isEmpty()) {
                        continue;
                    }
                    double value = neighborScore(toks);
                    String unit = normalizeWithLemma(wnr, neighborUnit(toks), wnPos);
                    outline.append(unit).append(suffix).append('\t').append(value).append('\t');
                    knn++;
                }
                String head = normalizeWithLemma(wnr, parseHeadWord(entry), wnPos);
                bWriter.write(head + suffix + "\t" + outline.toString().trim() + "\n");
            }
        } finally {
            closeBoth(bReader, bWriter);
        }
    }

    /**
     * Reads the lines of the next entry, up to but excluding the closing
     * "))" line. Blank lines before the entry are skipped.
     *
     * @return the trimmed lines of the entry, or {@code null} at end of file
     * @throws IOException if the file ends before the entry is closed
     */
    private static ArrayList<String> readEntry(BufferedReader reader) throws IOException {
        String line = reader.readLine();
        while (line != null && line.trim().length() == 0) {
            line = reader.readLine();
        }
        if (line == null) {
            return null;
        }
        ArrayList<String> entry = new ArrayList<String>();
        String current = line.trim();
        while (!current.equals(ENTRY_END)) {
            entry.add(current);
            line = reader.readLine();
            if (line == null) {
                throw new IOException("Unexpected end of file inside entry: " + entry.get(0));
            }
            current = line.trim();
        }
        return entry;
    }

    /** Extracts the head word: the text between the first character and "(desc" on the entry's first line. */
    private static String parseHeadWord(ArrayList<String> entry) throws IOException {
        if (entry.isEmpty()) {
            throw new IOException("Malformed entry: no header line before \"" + ENTRY_END + "\"");
        }
        String header = entry.get(0);
        int descIndex = header.indexOf(DESC_MARKER, 1);
        if (descIndex < 0) {
            throw new IOException("Malformed entry header: " + header);
        }
        return header.substring(1, descIndex).trim();
    }

    /** Splits a neighbor line on spaces/tabs, keeping the delimiters as tokens so multi-word units can be rebuilt. */
    private static ArrayList<String> tokenize(String neighborLine) {
        StringTokenizer st = new StringTokenizer(neighborLine.trim(), " \t", true);
        ArrayList<String> toks = new ArrayList<String>();
        while (st.hasMoreTokens()) {
            toks.add(st.nextToken());
        }
        return toks;
    }

    /** Parses the similarity score, which is the last token of a neighbor line. */
    private static double neighborScore(ArrayList<String> toks) {
        return Double.parseDouble(toks.get(toks.size() - 1));
    }

    /** Rebuilds the unit text: every token except the score and the delimiter in front of it. */
    private static String neighborUnit(ArrayList<String> toks) {
        StringBuilder unit = new StringBuilder();
        for (int j = 0; j < toks.size() - 2; j++) {
            unit.append(toks.get(j));
        }
        return unit.toString();
    }

    /** Returns whether the unit starts with a double quote, i.e. is written as a quoted (multi-word) unit. */
    private static boolean isQuoted(String unit) {
        return unit.startsWith("\"");
    }

    /**
     * Removes the surrounding double quotes and lower-cases the unit. If the
     * closing quote is missing the problem is printed and the JVM exits with
     * status -1.
     */
    private static String stripQuotes(String unit) {
        if (unit.length() < 2 || !unit.endsWith("\"")) {
            System.out.println("Problem with unit:" + unit);
            System.exit(-1);
        }
        return unit.substring(1, unit.length() - 1).toLowerCase();
    }

    /** Quoted units are unquoted and lower-cased; all others are lower-cased and passed through the lemmatizer. */
    private static String normalizeWithLemma(WordNetRelations wnr, String raw, String wnPos) {
        if (isQuoted(raw)) {
            return stripQuotes(raw);
        }
        return wnr.getLemmaForWord(raw.toLowerCase(), wnPos);
    }

    /**
     * Classifies a normalized unit as a bit mask of POS_ADJECTIVE / POS_ADVERB.
     * Multi-word units are looked up in the multi-word sets (and may match
     * neither, giving 0); single words are classified by findWordPOS.
     */
    private static int resolvePOS(boolean multiWord, String unit,
            THashSet<String> adjSet, THashSet<String> advSet,
            TObjectIntHashMap<String> dAdjectives, TObjectIntHashMap<String> dAdverbs) {
        if (!multiWord) {
            return findWordPOS(dAdjectives, dAdverbs, unit);
        }
        int wpos = 0;
        if (isMultiWordAdjective(adjSet, unit)) {
            wpos |= POS_ADJECTIVE;
        }
        if (isMultiWordAdverb(advSet, unit)) {
            wpos |= POS_ADVERB;
        }
        return wpos;
    }

    /** Loads a word-list file into a set of lower-cased entries. */
    private static THashSet<String> readLowerCasedSet(String file) {
        ArrayList<String> rows = ParsePreparation.readSentencesFromFile(file);
        THashSet<String> set = new THashSet<String>();
        for (int i = 0; i < rows.size(); i++) {
            set.add(rows.get(i).toLowerCase());
        }
        return set;
    }

    /** Loads a "word \t count" file; prints the offending row and exits with -1 if a row cannot be parsed. */
    private static TObjectIntHashMap<String> readCountMap(String file) {
        ArrayList<String> rows = ParsePreparation.readSentencesFromFile(file);
        TObjectIntHashMap<String> counts = new TObjectIntHashMap<String>();
        for (String row : rows) {
            String[] toks = row.trim().split("\t");
            try {
                counts.put(toks[0], Integer.parseInt(toks[1]));
            } catch (RuntimeException e) {
                // Covers both a missing count column and a non-numeric count.
                System.out.println(row + "\n\n");
                e.printStackTrace();
                System.exit(-1);
            }
        }
        return counts;
    }

    /** Closes both streams (either may be null); the writer is closed even if closing the reader fails. */
    private static void closeBoth(BufferedReader reader, BufferedWriter writer) throws IOException {
        try {
            if (reader != null) {
                reader.close();
            }
        } finally {
            if (writer != null) {
                writer.close();
            }
        }
    }
}