/* * Copyright (c) 2015-2016 The University Of Sheffield. * * This file is part of gateplugin-LearningFramework * (see https://github.com/GateNLP/gateplugin-LearningFramework). * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 2.1 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this software. If not, see <http://www.gnu.org/licenses/>. */ package gate.plugin.learningframework.features; import gate.util.GateRuntimeException; import java.io.File; import java.io.StringReader; import java.net.URL; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.input.SAXBuilder; /** * Parse a feature specification and create an initial FeatureInfo object. 
*
* <p>The root element of the XML document is expected to contain only
* {@code ATTRIBUTE}, {@code ATTRIBUTELIST} and {@code NGRAM} child elements
* (the names of these top-level elements are matched case-insensitively);
* any other element name is rejected with a {@link GateRuntimeException}.
*
* @author Johann Petrak
*/
public class FeatureSpecification {

  /** The parsed XML configuration document. */
  private org.jdom.Document jdomDocConf = null;

  /** URL the configuration was read from; only set by the URL constructor. */
  private URL url;

  // For error checking we remember all mappings from embedding ids to
  // each of the things that can be specified about embeddings:
  // file, dims, minfreq, train
  private final Map<String, String> embeddingId2file = new HashMap<>();
  private final Map<String, Integer> embeddingId2dims = new HashMap<>();
  private final Map<String, Integer> embeddingId2minfreq = new HashMap<>();
  private final Map<String, String> embeddingId2train = new HashMap<>();

  /** Accumulates the feature specifications found while parsing. */
  private final FeatureInfo featureInfo = new FeatureInfo();

  /**
   * Constructor from URL
   *
   * @param configFileURL URL of feature config XML file
   * @throws GateRuntimeException if the document cannot be read or parsed,
   *     or if the specification it contains is invalid
   */
  public FeatureSpecification(URL configFileURL) {
    url = configFileURL;
    try {
      jdomDocConf = new SAXBuilder(false).build(configFileURL);
    } catch (JDOMException jde) {
      throw new GateRuntimeException("Error parsing config file URL " + url, jde);
    } catch (java.io.IOException ex) {
      throw new GateRuntimeException("Error parsing config file URL " + url, ex);
    }
    parseConfigXml();
  }

  /**
   * Constructor from String
   *
   * @param configString XML string
   * @throws GateRuntimeException if the string cannot be parsed as XML,
   *     or if the specification it contains is invalid
   */
  public FeatureSpecification(String configString) {
    try {
      jdomDocConf = new SAXBuilder(false).build(new StringReader(configString));
    } catch (JDOMException jde) {
      throw new GateRuntimeException("Error parsing config file String:\n" + configString, jde);
    } catch (java.io.IOException ex) {
      throw new GateRuntimeException("Error parsing config file String:\n" + configString, ex);
    }
    parseConfigXml();
  }

  /**
   * Constructor from File
   *
   * @param configFile File for feature config XML file
   * @throws GateRuntimeException if the file cannot be read or parsed,
   *     or if the specification it contains is invalid
   */
  public FeatureSpecification(File configFile) {
    try {
      jdomDocConf = new SAXBuilder(false).build(configFile);
    } catch (JDOMException jde) {
      throw new GateRuntimeException("Error parsing config file " + configFile, jde);
    } catch (java.io.IOException ex) {
      throw new GateRuntimeException("Error parsing config file " + configFile, ex);
    }
    parseConfigXml();
  }

  /**
   * Walk over all children of the root element, in document order, and add
   * one feature specification per child to {@link #featureInfo}. Afterwards
   * the embedding settings remembered per embedding id are copied into every
   * nominal specification that left them unset.
   *
   * @throws GateRuntimeException if a child element has an unknown name
   */
  private void parseConfigXml() {
    // TODO: process children in order, then dispatch how to parse based on type.
    // Then, parsing ATTRIBUTE and ATTRIBUTELIST is nearly identical except that
    // we parse the range in addition for ATTRIBUTELIST.
    // Make an else part where we document how we might add additional stuff...
    Element rootElement = jdomDocConf.getRootElement();
    @SuppressWarnings("unchecked")
    List<Element> elements = rootElement.getChildren();
    int n = 0;
    for (Element element : elements) {
      n++;
      // Locale.ROOT: with the default locale, e.g. Turkish, "ATTRIBUTE" would
      // be lower-cased to "attrıbute" and never match.
      String elementName = element.getName().toLowerCase(Locale.ROOT);
      switch (elementName) {
        case "attribute":
          featureInfo.add(parseSimpleAttribute(element, n));
          break;
        case "attributelist": {
          FeatureSpecSimpleAttribute att = parseSimpleAttribute(element, n);
          // a missing or non-numeric FROM/TO surfaces as NumberFormatException
          int from = Integer.parseInt(element.getChildTextTrim("FROM"));
          int to = Integer.parseInt(element.getChildTextTrim("TO"));
          String withinType = getChildTextOrElse(element, "WITHIN", null);
          featureInfo.add(new FeatureSpecAttributeList(att, withinType, from, to));
          break;
        }
        case "ngram":
          featureInfo.add(parseNgramAttribute(element, n));
          break;
        default:
          throw new GateRuntimeException(
                  "Not a recognized element name for the LearningFramework config file: "
                  + elementName);
      }
    }
    applyStoredEmbeddingSettings();
  } // parseConfigXml

  /**
   * Go through all the feature specifications and make sure the embedding
   * settings are set to whatever we have stored for the embedding id.
   * An embedding id may be configured in a later element than the one that
   * first uses it, which is why this runs after all elements were parsed.
   * If after this some values are still blank, it is the responsibility
   * of the backend code to find the appropriate default values since
   * different backends or algorithms could work better with different
   * defaults. The one exception is minfreq, which falls back to 1 here.
   */
  private void applyStoredEmbeddingSettings() {
    for (FeatureSpecAttribute fs : featureInfo.featureSpecs) {
      if (fs.datatype != Datatype.nominal) {
        continue;
      }
      if (fs.emb_file.isEmpty()) {
        String storedFile = embeddingId2file.get(fs.emb_id);
        if (storedFile != null) {
          fs.emb_file = storedFile;
        }
      }
      if (fs.emb_dims == 0) {
        Integer storedDims = embeddingId2dims.get(fs.emb_id);
        if (storedDims != null) {
          fs.emb_dims = storedDims;
        }
      }
      if (fs.emb_train.isEmpty()) {
        String storedTrain = embeddingId2train.get(fs.emb_id);
        if (storedTrain != null) {
          fs.emb_train = storedTrain;
        }
      }
      if (fs.emb_minfreq == 0) {
        Integer storedMinfreq = embeddingId2minfreq.get(fs.emb_id);
        if (storedMinfreq != null) {
          fs.emb_minfreq = storedMinfreq;
        } else {
          fs.emb_minfreq = 1;
        }
      }
    }
  }

  /**
   * Add the settings of a nested EMBEDDINGS element, if there is one, to the
   * given specification. The settings seen so far for each embedding id are
   * kept in the instance maps and used to detect contradictions between
   * elements which refer to the same embedding id.
   *
   * @param element the parent element which may contain an EMBEDDINGS child
   * @param i 1-based number of the element, only used in error messages
   * @param spec the specification to update
   * @return the same {@code spec} instance that was passed in
   * @throws GateRuntimeException if a setting contradicts one given earlier
   *     for the same embedding id, or if TRAIN has an unknown value
   * @throws NumberFormatException if DIMS or MINFREQ is not an integer
   */
  private FeatureSpecAttribute parseAndAddEmbeddingInfo(Element element, int i,
          FeatureSpecAttribute spec) {
    Element emb = getChildOrNull(element, "EMBEDDINGS");
    if (emb == null) {
      return spec; // nothing there, nothing to do
    }
    // get all the possible settings for the embedding
    String embId = getChildTextOrElse(emb, "ID", "");
    String embFile = getChildTextOrElse(emb, "FILE", "");
    String embDimsStr = getChildTextOrElse(emb, "DIMS", "");
    String embMinfreqStr = getChildTextOrElse(emb, "MINFREQ", "");
    String embTrain = getChildTextOrElse(emb, "TRAIN", "");

    // For each setting: a non-empty value is checked against / stored for the
    // id, an empty value is replaced by what was stored for the id earlier.
    // If neither is available the default in the specification is unchanged.

    // TODO: check the file exists already here!!
    String file = reconcileEmbeddingSetting(
            embeddingId2file, embId, embFile.isEmpty() ? null : embFile, "file", i);
    if (file != null) {
      spec.emb_file = file;
    }

    if (!embTrain.isEmpty()
            && !embTrain.equals("yes") && !embTrain.equals("no")
            && !embTrain.equals("mapping") && !embTrain.equals("onehot")) {
      throw new GateRuntimeException(
              "EMBEDDING TRAIN setting must be one of yes, no, onehot or mapping for attribute "
              + i);
    }
    String train = reconcileEmbeddingSetting(
            embeddingId2train, embId, embTrain.isEmpty() ? null : embTrain, "train", i);
    if (train != null) {
      spec.emb_train = train;
    }

    Integer dims = reconcileEmbeddingSetting(
            embeddingId2dims, embId,
            embDimsStr.isEmpty() ? null : Integer.valueOf(embDimsStr), "dims", i);
    if (dims != null) {
      spec.emb_dims = dims;
    }

    Integer minfreq = reconcileEmbeddingSetting(
            embeddingId2minfreq, embId,
            embMinfreqStr.isEmpty() ? null : Integer.valueOf(embMinfreqStr), "minfreq", i);
    if (minfreq != null) {
      spec.emb_minfreq = minfreq;
    }

    spec.emb_id = embId;
    return spec;
  }

  /**
   * Reconcile one embedding setting with what is already known for the
   * embedding id.
   *
   * @param known the values seen so far, by embedding id; updated when
   *     {@code given} is the first value for {@code embId}
   * @param embId the embedding id
   * @param given the value from the current element, or null if not specified
   * @param settingName name of the setting, only used in the error message
   * @param i 1-based number of the element, only used in the error message
   * @return the value to use for the current specification, or null if the
   *     setting was neither given now nor stored earlier
   * @throws GateRuntimeException if {@code given} differs from the stored value
   */
  private static <T> T reconcileEmbeddingSetting(Map<String, T> known, String embId,
          T given, String settingName, int i) {
    T have = known.get(embId);
    if (given == null) {
      return have;
    }
    if (have == null) {
      known.put(embId, given);
    } else if (!given.equals(have)) {
      throw new GateRuntimeException("EMBEDDING setting " + settingName + " to " + given
              + " for attribute " + i + " contradicts " + have + " set previously");
    }
    return given;
  }

  /**
   * Create the specification for an ATTRIBUTE element. This is also used for
   * the attribute part of an ATTRIBUTELIST element.
   *
   * @param attributeElement the element to parse
   * @param i 1-based number of the element, only used in error messages
   * @return the specification described by the element
   * @throws GateRuntimeException if the combination of settings is not allowed
   * @throws IllegalArgumentException if DATATYPE or MISSINGVALUETREATMENT is
   *     not the name of a constant of the corresponding enum
   */
  private FeatureSpecSimpleAttribute parseSimpleAttribute(Element attributeElement, int i) {
    String aname = getChildTextOrElse(attributeElement, "NAME", "").trim();
    String feat = getChildTextOrElse(attributeElement, "FEATURE", "").trim();
    String dtstr = getChildTextOrElse(attributeElement, "DATATYPE", null);
    if (feat.isEmpty()) {
      // without a feature only the presence of the annotation can be used,
      // so the datatype has to be boolean
      if (dtstr == null) {
        dtstr = "bool";
      } else if (!dtstr.equals("bool") && !dtstr.equals("boolean")) {
        throw new GateRuntimeException(
                "DATATYPE must be bool or not specified if no feature given in ATTRIBUTE " + i);
      }
    } else if (dtstr == null) {
      throw new GateRuntimeException("DATATYPE not specified for ATTRIBUTE " + i);
    }
    if (dtstr.equals("boolean")) {
      dtstr = "bool"; // allow both but internally we use bool to avoid keyword clash.
    }
    Datatype dt = Datatype.valueOf(dtstr);
    boolean isNominal = (dt == Datatype.nominal);

    // TODO: this should be named ANNOTATIONTYPE or ANNTYPE to avoid confusion
    // with the datatype
    // if empty we use the instance annotation type, whatever that is
    String atype = getChildTextOrElse(attributeElement, "TYPE", "");

    String codeasstr =
            getChildTextOrElse(attributeElement, "CODEAS", "").toLowerCase(Locale.ROOT);
    if (!codeasstr.isEmpty() && !codeasstr.equals("one_of_k") && !codeasstr.equals("number")) {
      throw new GateRuntimeException("CODEAS for ATTRIBUTE " + i
              + " specified but not one_of_k or number but " + codeasstr);
    }
    // codeas currently only makes sense and is used for nominal, so complain
    // if it is specified for other datatypes
    if (!codeasstr.isEmpty() && !isNominal) {
      throw new GateRuntimeException(
              "CODEAS can only be used with DATATYPE nominal for ATTRIBUTE " + i);
    }
    if (codeasstr.isEmpty()) {
      // for nominal the default when not specified is one_of_k,
      // for non-nominal we always really use number
      codeasstr = isNominal ? "one_of_k" : "number";
    }
    CodeAs codeas = CodeAs.valueOf(codeasstr);

    // The default for the missing value treatment depends on the datatype:
    // "keep" for one-of-k coded nominal values, "zero_value" for bool and
    // "special_value" for everything else (numeric, number-coded nominal).
    // FEATURENAME4VALUE is only read for one-of-k coded nominal values.
    String featureName4Value = "";
    String missingValueTreatmentStr;
    if (isNominal && codeas == CodeAs.one_of_k) {
      missingValueTreatmentStr =
              getChildTextOrElse(attributeElement, "MISSINGVALUETREATMENT", "keep");
      featureName4Value = getChildTextOrElse(attributeElement, "FEATURENAME4VALUE", "");
    } else if (dt == Datatype.bool) {
      missingValueTreatmentStr =
              getChildTextOrElse(attributeElement, "MISSINGVALUETREATMENT", "zero_value");
    } else {
      missingValueTreatmentStr =
              getChildTextOrElse(attributeElement, "MISSINGVALUETREATMENT", "special_value");
    }
    MissingValueTreatment mvt = MissingValueTreatment.valueOf(missingValueTreatmentStr);

    // Only if the datatype is nominal, we also allow the setting "listsep"
    // for automatic list splitting
    String listsep = getChildTextOrElse(attributeElement, "LISTSEP", "");
    if (!listsep.isEmpty() && !isNominal) {
      throw new GateRuntimeException("LISTSEP only allowed if datatype is nominal");
    }

    String withinType = getChildTextOrElse(attributeElement, "WITHIN", null);

    String defaultMissingValue = "";
    if (dt == Datatype.bool) {
      defaultMissingValue = "false";
    } else if (dt == Datatype.numeric) {
      defaultMissingValue = "0.0";
    }
    String missingValueValue =
            getChildTextOrElse(attributeElement, "MISSINGVALUE", defaultMissingValue);

    // TODO: not implemented yet, but we should add this!!
    String scalingMethod = "";
    String transformMethod = "";

    FeatureSpecSimpleAttribute att = new FeatureSpecSimpleAttribute(
            aname,
            atype,
            feat,
            dt,
            codeas,
            mvt,
            missingValueValue,
            scalingMethod,
            transformMethod,
            withinType,
            listsep,
            featureName4Value
    );

    // an embedding block is only allowed for, and only added to, nominal attributes
    if (isNominal) {
      att = (FeatureSpecSimpleAttribute) parseAndAddEmbeddingInfo(attributeElement, i, att);
    } else if (getChildOrNull(attributeElement, "EMBEDDINGS") != null) {
      throw new GateRuntimeException("EMBEDDINGS not allowed for non-NOMINAL ATTRIBUTE " + i);
    }
    return att;
  }

  /**
   * Create the specification for an NGRAM element.
   *
   * @param ngramElement the element to parse
   * @param i 1-based number of the element, only used in error messages
   * @return the specification described by the element
   * @throws GateRuntimeException if TYPE or FEATURE is missing or empty, or if
   *     SHORTEN has a value which is not accepted
   * @throws NumberFormatException if NUMBER or MAXLEN is not an integer
   */
  private FeatureSpecAttribute parseNgramAttribute(Element ngramElement, int i) {
    String aname = getChildTextOrElse(ngramElement, "NAME", "").trim();
    String annType = getChildTextOrElse(ngramElement, "TYPE", "").trim();
    if (annType.isEmpty()) {
      throw new GateRuntimeException("TYPE in NGRAM " + i + " must not be missing or empty");
    }
    String numberString = getChildTextOrElse(ngramElement, "NUMBER", "1").trim();
    String featureName4Value = getChildTextOrElse(ngramElement, "FEATURENAME4VALUE", "");
    String maxlen = getChildTextOrElse(ngramElement, "MAXLEN", "0");
    String shorten = getChildTextOrElse(ngramElement, "SHORTEN", "").toLowerCase(Locale.ROOT);
    String feature = getChildTextOrElse(ngramElement, "FEATURE", "").trim();
    if (feature.isEmpty()) {
      throw new GateRuntimeException("FEATURE in NGRAM " + i + " must not be missing or empty");
    }
    // NOTE(review): the message used to list "right" as a permitted value
    // although it has always been rejected here. The message now matches the
    // check; confirm whether "right" should be accepted instead.
    boolean shortenOk = shorten.isEmpty() || shorten.equals("left")
            || shorten.equals("both") || shorten.equals("middle");
    if (!shortenOk) {
      throw new GateRuntimeException(
              "SHORTEN must be missing, empty or one of left, middle, both");
    }
    FeatureSpecNgram ng = new FeatureSpecNgram(
            aname,
            Integer.parseInt(numberString),
            annType,
            feature,
            featureName4Value
    );
    ng.maxlen = Integer.parseInt(maxlen);
    ng.shorten = shorten;
    ng = (FeatureSpecNgram) parseAndAddEmbeddingInfo(ngramElement, i, ng);
    return ng;
  }

  /**
   * Return the FeatureInfo object for this specification.
   *
   * Every call returns a new FeatureInfo instance, created with the
   * FeatureInfo copy constructor from the information in this
   * FeatureSpecification.
   *
   * @return FeatureInfo instance
   */
  public FeatureInfo getFeatureInfo() {
    return new FeatureInfo(featureInfo);
  }

  //// HELPER METHODS

  /**
   * Return the trimmed text of a single child element or a default value.
   * The child is looked up by the name as given, then by its all-uppercase
   * and then by its all-lowercase variant; the first variant for which a
   * child exists is used. The case of the returned text is preserved.
   *
   * @param parent the element whose children are searched
   * @param name the name of the child element
   * @param elseVal the value to return if there is no such child, may be null
   * @return the trimmed text of the child, or {@code elseVal}
   * @throws GateRuntimeException if there is more than one child with the
   *     name variant that is used
   */
  private static String getChildTextOrElse(Element parent, String name, String elseVal) {
    // Locale.ROOT keeps the variants independent of the default locale
    // (in a Turkish locale "ID" would otherwise be lower-cased to "ıd").
    String[] variants = {
      name, name.toUpperCase(Locale.ROOT), name.toLowerCase(Locale.ROOT)
    };
    for (String variant : variants) {
      @SuppressWarnings("unchecked")
      List<Element> children = parent.getChildren(variant);
      if (children.size() > 1) {
        throw new GateRuntimeException("Element " + parent.getName()
                + " has more than one nested " + name + " element");
      }
      if (!children.isEmpty()) {
        return children.get(0).getTextTrim();
      }
    }
    return elseVal;
  }

  /**
   * Return the single child element with exactly the given name.
   *
   * @param parent the element whose children are searched
   * @param name the name of the child element, matched as given
   * @return the child element or null if there is none
   * @throws GateRuntimeException if there is more than one such child
   */
  private static Element getChildOrNull(Element parent, String name) {
    @SuppressWarnings("unchecked")
    List<Element> children = parent.getChildren(name);
    if (children.size() > 1) {
      throw new GateRuntimeException("Element " + parent.getName()
              + " has more than one nested " + name + " element");
    }
    return children.isEmpty() ? null : children.get(0);
  }

  @Override
  public String toString() {
    // The only difference between this and the feature info is that we also
    // know about the embedding mapping, so just print the feature info
    return featureInfo.toString();
  }
}