package pitt.search.examples;

import java.io.IOException;
import java.io.StringReader;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import pitt.search.semanticvectors.FlagConfig;
import pitt.search.semanticvectors.VectorStoreDeterministic;
import pitt.search.semanticvectors.orthography.ProportionVectors;
import pitt.search.semanticvectors.vectors.Vector;
import pitt.search.semanticvectors.vectors.VectorFactory;

/**
 * Class for analyzing documents that have xml markup for tagged sections.
 *
 * <p>A document is reduced to a single vector that depends only on which element tags occur
 * and where they occur in document order; the text inside the elements is not consulted.
 *
 * <p>Each instance holds one {@link DocumentBuilder}, which JAXP does not guarantee to be
 * thread-safe, so an instance should not be shared between threads without external
 * synchronization.
 */
public class MarkedUpDocumentAnalyzer {

  /** Xerces/JAXP feature that makes the parser reject any DOCTYPE declaration. */
  private static final String DISALLOW_DOCTYPE_FEATURE =
      "http://apache.org/xml/features/disallow-doctype-decl";

  private final FlagConfig flagConfig;
  private final VectorStoreDeterministic elementalVectors;
  private final DocumentBuilder documentBuilder;
  private final ProportionVectors proportionVectors;

  /**
   * Creates an analyzer whose vector type and dimension are taken from {@code flagConfig}.
   *
   * @param flagConfig configuration used for the elemental vector store, the proportion
   *     vectors, and the document vectors produced by this analyzer
   * @throws IllegalStateException if the platform XML parser cannot be configured
   */
  public MarkedUpDocumentAnalyzer(FlagConfig flagConfig) {
    this.flagConfig = flagConfig;
    this.elementalVectors = new VectorStoreDeterministic(flagConfig);
    this.proportionVectors = new ProportionVectors(flagConfig);
    this.documentBuilder = newHardenedDocumentBuilder();
  }

  /**
   * Builds the XML parser used by this class, with DOCTYPE declarations disabled so that
   * caller-supplied XML cannot pull in external entities (XXE) or trigger entity-expansion
   * attacks.
   */
  private static DocumentBuilder newHardenedDocumentBuilder() {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    try {
      factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
      factory.setFeature(DISALLOW_DOCTYPE_FEATURE, true);
      return factory.newDocumentBuilder();
    } catch (ParserConfigurationException e) {
      // Fail fast: continuing with a null builder would only defer the failure to an
      // uninformative NullPointerException on the first parse.
      throw new IllegalStateException("Unable to configure the XML document builder", e);
    }
  }

  /**
   * Parses an XML string into a DOM document.
   *
   * @param source the XML markup to parse; DOCTYPE declarations are rejected
   * @return the parsed document
   * @throws IOException if reading the input fails
   * @throws SAXException if {@code source} is not well-formed XML or contains a DOCTYPE
   */
  public Document getXmlDocumentFromString(String source) throws IOException, SAXException {
    return documentBuilder.parse(new InputSource(new StringReader(source)));
  }

  /**
   * Encodes the element structure of {@code document} as a single normalized vector.
   *
   * <p>Every element (including the root) contributes the elemental vector for its tag name,
   * bound to a proportion vector for its relative position {@code i / numElements} in the
   * list of all elements. The bound vectors are superposed with equal weight and the sum is
   * normalized.
   *
   * @param document the parsed document to encode
   * @return the normalized structure vector
   */
  public Vector getVectorForTaggedDocument(Document document) {
    Vector documentVector =
        VectorFactory.createZeroVector(this.flagConfig.vectortype(), this.flagConfig.dimension());

    NodeList nodeList = document.getElementsByTagName("*");
    int numElements = nodeList.getLength();

    for (int i = 0; i < numElements; i++) {
      Node node = nodeList.item(i);
      // Copy before binding so the vector held by the elemental store is never mutated.
      Vector elementVector = this.elementalVectors.getVector(node.getNodeName()).copy();
      elementVector.bind(
          this.proportionVectors.getProportionVector(((double) i) / numElements));
      documentVector.superpose(elementVector, 1, null);
    }

    documentVector.normalize();
    return documentVector;
  }

  /**
   * Convenience method that parses {@code input} as XML and encodes its element structure.
   *
   * @param input the XML markup to analyze
   * @return the normalized structure vector for the parsed document
   * @throws IOException if reading the input fails
   * @throws SAXException if {@code input} is not well-formed XML or contains a DOCTYPE
   */
  public Vector getVectorForString(String input) throws IOException, SAXException {
    return this.getVectorForTaggedDocument(this.getXmlDocumentFromString(input));
  }

  // Sample documents for the demo in main(): the same three section tags in different orders.

  private static final String OTHELLO =
      "<document><setting>Othello is a hero.</setting>"
          + "<marriage>Othello and Desdemona get married.</marriage>"
          + "<confusion>Othello wrongly suspects and kills Desdemona.</confusion></document>";

  private static final String MIDSUMMER_NIGHTS_DREAM =
      "<document><setting>Everyone goes to the forest.</setting>"
          + "<confusion>They are tricked into falling in love with the wrong people.</confusion>"
          + "<marriage>Hermia marries Lysander. Helena marries Demetrius.</marriage></document>";

  private static final String TWELFTH_NIGHT =
      "<document><setting>Viola is shipwrecked but useful.</setting>"
          + "<confusion>She disguises herself and woos Olivia on behalf of Orsino.</confusion>"
          + "<marriage>Viola marries Orsino. Olivia marries Sebastian.</marriage></document>";

  /**
   * Demo entry point: encodes the three sample documents and prints their pairwise overlap.
   *
   * @param args command-line flags, passed to {@link FlagConfig#getFlagConfig(String[])}
   */
  public static void main(String[] args)
      throws ParserConfigurationException, IOException, SAXException {
    FlagConfig flagConfig = FlagConfig.getFlagConfig(args);
    MarkedUpDocumentAnalyzer analyzer = new MarkedUpDocumentAnalyzer(flagConfig);

    Vector othelloVector = analyzer.getVectorForString(OTHELLO);
    Vector midsummerVector = analyzer.getVectorForString(MIDSUMMER_NIGHTS_DREAM);
    Vector twelfthNightVector = analyzer.getVectorForString(TWELFTH_NIGHT);

    System.out.println("Structural similarity of Othello with A Midsummer Night's Dream:");
    System.out.println(othelloVector.measureOverlap(midsummerVector));
    System.out.println("Structural similarity of Othello with Twelfth Night:");
    System.out.println(twelfthNightVector.measureOverlap(othelloVector));
    System.out.println("Structural similarity of A Midsummer Night's Dream with Twelfth Night:");
    System.out.println(twelfthNightVector.measureOverlap(midsummerVector));
  }
}