cc.mallet.types.Alphabet Java Examples

The following examples show how to use cc.mallet.types.Alphabet. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: EngineMB.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Refresh the {@code info} record from the current model and corpus representation.
 *
 * Stores the model class name (only when a model is present), the number of training
 * instances, the size of the data alphabet and the number of target values. When the pipe
 * has a target alphabet, the string form of each of its entries becomes the class label list;
 * without a target alphabet the number of target values is set to 0 and the class labels
 * are left untouched.
 */
protected void updateInfo() {
  if (model != null) {
    info.modelClass = model.getClass().getName();
  }
  info.nrTrainingInstances = corpusRepresentation.getRepresentationMallet().size();
  info.nrTrainingDimensions =
      corpusRepresentation.getRepresentationMallet().getDataAlphabet().size();

  Alphabet targetAlphabet = corpusRepresentation.getPipe().getTargetAlphabet();
  if (targetAlphabet == null) {
    // no target alphabet: nothing to enumerate
    info.nrTargetValues = 0;
    return;
  }

  info.nrTargetValues = targetAlphabet.size();
  ArrayList<String> classLabels = new ArrayList<>();
  for (Object entry : targetAlphabet.toArray()) {
    classLabels.add(entry.toString());
  }
  info.classLabels = classLabels;
}
 
Example #2
Source File: PipeScaleMinMaxAll.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Constructor from alphabet and feature stats.
 *
 * For every feature whose stats carry a non-null, false {@code binary} flag, the observed
 * minimum and maximum are recorded and the feature is flagged in {@code normalize}. All other
 * features (binary ones, or ones without a {@code binary} flag, i.e. no values at all) keep
 * the array defaults of 0.0 and are not flagged.
 *
 * @param alphabet alphabet
 * @param stats feature stats
 */
public PipeScaleMinMaxAll(Alphabet alphabet, FVStatsMeanVarAll stats) {
  super(alphabet, null);
  List<PerFeatureStats> pfss = stats.getStats();
  int n = pfss.size();
  min = new double[n];
  max = new double[n];
  normalize = new boolean[n];
  for(int i=0; i<n; i++) {
    PerFeatureStats pfs = pfss.get(i);
    // we do not normalize binary features and we do not normalize features with no
    // values at all
    if(pfs.binary != null && pfs.binary != true) {
      min[i] = pfs.min;
      max[i] = pfs.max;
      // The flag was previously never set, so the array stayed all-false and no feature
      // was ever marked for scaling (the mean/variance variant of this pipe sets it here).
      // NOTE(review): confirm the consumer of normalize[] copes with max[i] == min[i].
      normalize[i] = true;
    } else {
      normalize[i] = false;
    }
  }
  System.err.println("DEBUG: Creating PipeScaleMinMaxAll instance with mins="+Arrays.toString(min)+",maxs="+Arrays.toString(max));
}
 
Example #3
Source File: PipeScaleMeanVarAll.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Constructor from alphabet and stats.
 *
 * Features whose stats carry a non-null, false {@code binary} flag get their mean and
 * variance recorded and are flagged for normalization. Every other feature (binary, or
 * without any values) gets NaN for both statistics and is not flagged.
 *
 * @param alphabet alphabet
 * @param stats feature stats
 */
public PipeScaleMeanVarAll(Alphabet alphabet, FVStatsMeanVarAll stats) {
  super(alphabet, null);
  List<PerFeatureStats> perFeature = stats.getStats();
  int count = perFeature.size();
  means = new double[count];
  variances = new double[count];
  normalize = new boolean[count];
  int idx = 0;
  for (PerFeatureStats featureStats : perFeature) {
    boolean scalable = featureStats.binary != null && featureStats.binary != true;
    normalize[idx] = scalable;
    if (scalable) {
      means[idx] = featureStats.mean;
      variances[idx] = featureStats.var;
    } else {
      // binary features and features without values are never scaled
      means[idx] = Double.NaN;
      variances[idx] = Double.NaN;
    }
    idx++;
  }
}
 
Example #4
Source File: MaxEntClassifierTrainer.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Invert the {@code labelsAndFeatures} mapping into a map from feature index to the list of
 * label indices that feature is associated with.
 *
 * Indices are obtained through {@code lookupIndex} on the respective alphabet. A label index
 * is appended once per (label, feature) pair encountered, so duplicates are kept.
 *
 * @param dataAlphabet alphabet used to look up feature indices
 * @param targetAlphabet alphabet used to look up label indices
 * @return map from feature index to label indices
 */
private HashMap<Integer, ArrayList<Integer>> mapFeaturesToLabels(
    Alphabet dataAlphabet, Alphabet targetAlphabet) {

  HashMap<Integer, ArrayList<Integer>> labelsByFeature = new HashMap<>();

  labelsAndFeatures.forEach(
      (labelName, featureNames) -> {
        Integer labelIndex = targetAlphabet.lookupIndex(labelName);
        featureNames.forEach(
            featureName -> {
              Integer featureIndex = dataAlphabet.lookupIndex(featureName);
              labelsByFeature
                  .computeIfAbsent(featureIndex, unused -> new ArrayList<>())
                  .add(labelIndex);
            });
      });

  return labelsByFeature;
}
 
Example #5
Source File: EngineMBPythonNetworksBase.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Determine the learning "mode" and its accompanying count for a corpus representation.
 *
 * The returned entry's key is one of:
 * <ul>
 * <li>"regr": regression, used when the pipe has no target alphabet; value 0 (dummy)</li>
 * <li>"classcosts": targets are vectors of class costs; value is the length of the cost
 *     vector of the first instance's target</li>
 * <li>"classind": predict the index of a class; value is the target alphabet size, or -1
 *     when there are no instances to look at</li>
 * </ul>
 *
 * @param crm corpus representation to inspect
 * @return entry of (mode, number of classes)
 */
protected AbstractMap.SimpleEntry<String,Integer> findOutMode(CorpusRepresentationMalletTarget crm)  {
  InstanceList instances = crm.getRepresentationMallet();
  Alphabet targetAlphabet = crm.getPipe().getTargetAlphabet();

  // no target alphabet means a regression problem
  if (targetAlphabet == null) {
    return new AbstractMap.SimpleEntry<>("regr", 0);
  }

  // At training time there should be a first instance, but at application time there are no
  // instances yet. In that case dummy values are returned, since this information is not
  // needed at application time at the moment. Should it ever be needed, it has to be stored
  // in the pipe!
  if (instances == null || instances.isEmpty()) {
    return new AbstractMap.SimpleEntry<>("classind", -1);
  }

  Object firstTarget = instances.get(0).getTarget();
  if (firstTarget instanceof NominalTargetWithCosts) {
    int nrCosts = ((NominalTargetWithCosts) firstTarget).getCosts().length;
    return new AbstractMap.SimpleEntry<>("classcosts", nrCosts);
  }
  return new AbstractMap.SimpleEntry<>("classind", targetAlphabet.size());
}
 
Example #6
Source File: TopicModelPipe.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Construct topic model pipe with given stopwords and alphabets
 *
 * <p>The pipe is a fixed list of four steps handed to the superclass constructor, in this
 * order: {@code CharSequenceLowercase}, {@code CharSequence2TokenSequence} with the token
 * pattern below, {@code RemoveStopwords} with the given stopwords, and
 * {@code TokenSequence2FeatureSequence} with the given alphabet.
 *
 * @param stopwords to be removed
 * @param alphabet to use
 */
public TopicModelPipe(Collection<String> stopwords, Alphabet alphabet) {
  // @formatter:off
  super(
      ImmutableList.of(
          new CharSequenceLowercase(),
          // token pattern: a letter, then one or more letters or punctuation characters,
          // then a letter - so every match is at least three characters long and both
          // starts and ends with a letter
          new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
          new RemoveStopwords(stopwords),
          new TokenSequence2FeatureSequence(alphabet)));
  // @formatter:on
}
 
Example #7
Source File: CorpusExporterMRARFF.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Convert alphabet to ARFF declaration string.
 *
 * The result is the comma-separated list of all alphabet entries, each passed through
 * {@code escape4Arff}, enclosed in curly braces. An empty alphabet yields "{}".
 *
 * @param alph Mallet alphabet
 * @param mvt missing value treatment setting; may be null if this is used for a target.
 *     Currently not used by this method.
 * @return ARFF declaration
 */
public String alphabet2Arff(Alphabet alph, MissingValueTreatment mvt) {
  StringBuilder declaration = new StringBuilder("{");
  String separator = "";
  for (int idx = 0; idx < alph.size(); idx++) {
    declaration.append(separator);
    declaration.append(escape4Arff(alph.lookupObject(idx).toString()));
    separator = ",";
  }
  // TODO: we may need to add the definition for the missing value here,
  // but by default, we do not do that.
  return declaration.append("}").toString();
}
 
Example #8
Source File: CorpusRepresentationMallet.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Return the string form of every entry of the pipe's target alphabet.
 *
 * @return a new mutable list of labels; empty if the pipe has no target alphabet
 */
@Override
public List<String> getLabelList() {
  List<String> labels = new ArrayList<>();
  Alphabet targetAlphabet = pipe.getTargetAlphabet();
  if (targetAlphabet == null) {
    return labels;
  }
  for (Object entry : targetAlphabet.toArray()) {
    labels.add(entry.toString());
  }
  return labels;
}
 
Example #9
Source File: FeatureExtractionMalletSparse.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Extract the class for an instance for sequence tagging.
 *
 * The class label is derived from the class annotations that overlap the instance
 * annotation: the sequence encoder turns that set of overlapping annotations into a label
 * string. If nothing overlaps, or the encoder returns an empty label, the instance gets the
 * "outside" code {@code SeqEncoder.CODE_OUTSIDE}. The label is then set as the instance target
 * via the label alphabet.
 *
 * @param inst instance
 * @param alph the label alphabet to use, must be an instance of LabelAlphabet
 * @param classAS class annotation set
 * @param instanceAnnotation  the instance annotation, e.g. "Token".
 * @param seqEncoder sequence encoder instance
 * @throws GateRuntimeException if {@code alph} is not a LabelAlphabet
 */
public static void extractClassForSeqTagging(Instance inst, Alphabet alph, AnnotationSet classAS, Annotation instanceAnnotation, SeqEncoder seqEncoder) {
  Document doc = classAS.getDocument();
  if (!(alph instanceof LabelAlphabet)) {
    throw new GateRuntimeException("LF extractClassForSeqTagging: the alphabet must be of type LabelAlphabet"
            + " for instance annotation at offset " + gate.Utils.start(instanceAnnotation)
            + " in document " + doc.getName());
  }
  LabelAlphabet labelalph = (LabelAlphabet) alph;
  AnnotationSet overlaps = Utils.getOverlappingAnnotations(classAS, instanceAnnotation);

  // Design notes:
  // - Earlier versions allowed at most one class annotation. Now any number of class
  //   annotations of any number of types may overlap, and the label is generated from the
  //   complete list of what overlaps (e.g. beginning of T1, beginning of another T1,
  //   continuation of T2 and end of T3).
  // - Combined labels only come into existence when such an overlap actually occurs; nicely
  //   separated annotations never produce them.
  // - Labels are generated dynamically as pipe-separated type names with a flag appended
  //   (beginning=B, inside=I), or class "O" if outside of all types.
  // - The ordering of types in the class label name must be consistent: TODO!!
  // - This should be one of several selectable strategies (e.g. via the "algorithmParameter"
  //   settings), such as BIO, BMEWO, BMEWO+, see
  //   https://lingpipe-blog.com/2009/10/14/coding-chunkers-as-taggers-io-bio-bmewo-and-bmewo/
  //   or http://cs229.stanford.edu/proj2005/KrishnanGanapathy-NamedEntityRecognition.pdf
  //   Whichever strategy is chosen must be stored in the model info file and re-used at
  //   application time!
  // - Encoding as an instance method of a Seq2Class-like object would also allow strategies
  //   that need to remember something about the previously processed instance.
  // - NOTE: need to see if the label alphabet growing setting is handled correctly!

  // only ask the encoder when at least one class annotation overlaps
  String target = "";
  if (overlaps.size() > 0) {
    target = seqEncoder.seqAnns2ClassLabel(overlaps, instanceAnnotation, doc);
  }
  // no overlap, or the encoder produced nothing: the instance is an "outside"
  if (target.isEmpty()) {
    target = SeqEncoder.CODE_OUTSIDE;
  }

  // when debugging is enabled, expose the target class on the instance annotation
  if (DEBUG_SEQUENCE_CLASS) {
    instanceAnnotation.getFeatures().put("LF_sequenceClass", target);
  }

  // turn the label string into the actual label and make it the instance target;
  // the target alphabet for such an instance MUST be a LabelAlphabet
  synchronized(labelalph) {
    inst.setTarget(labelalph.lookupLabel(target));
  }
}
 
Example #10
Source File: CorpusRepresentationMalletLDA.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Get a Mallet FeatureSequence Instance for the tokens in the span.
 * The span is what is covered by the original instance annotation.
 *
 * Token strings come either from the document text of each token annotation (when no
 * feature name is given) or from the named feature of the annotation. Null and empty strings
 * are skipped. Only tokens for which the alphabet lookup yields a non-negative index end up
 * in the feature sequence.
 *
 * @param from start offset
 * @param to end offset
 * @param tokenAS  annotation set containing the token-like annotations
 * @param tokenFeatureName feature in the token-like annotations to use or empty (or null)
 *     for document text
 * @return  mallet instance containing a feature sequence; its name is the document name
 *     and the start offset, separated by a pipe character
 */
public Instance getInstanceFor(
        long from,
        long to,
        AnnotationSet tokenAS,
        String tokenFeatureName) {

  String featureName = (tokenFeatureName == null) ? "" : tokenFeatureName;
  boolean useDocumentText = featureName.isEmpty();

  Document doc = tokenAS.getDocument();
  List<Annotation> orderedTokens = tokenAS.get(from, to).inDocumentOrder();
  List<String> tokenStrings = new ArrayList<>();
  for (Annotation tokenAnn : orderedTokens) {
    String text;
    if (useDocumentText) {
      text = gate.Utils.cleanStringFor(doc, tokenAnn);
    } else {
      text = (String) tokenAnn.getFeatures().get(featureName);
    }
    if (text == null || text.isEmpty()) {
      continue;
    }
    tokenStrings.add(text);
  }
  TokenSequence tokenSeq = new TokenSequence(tokenStrings.toArray());

  // NOTE: tokenSeq.toFeatureSequence(alphabet) would create a feature sequence that contains
  // -1 entries for tokens which are not in the alphabet, if alphabet growth has been stopped.
  // Instead the FeatureSequence is built here and such tokens are left out.
  Alphabet alph = instances.getAlphabet();
  FeatureSequence featSeq = new FeatureSequence(alph, tokenSeq.size());
  int nrTokens = tokenSeq.size();
  for (int pos = 0; pos < nrTokens; pos++) {
    int featureIndex = alph.lookupIndex(tokenSeq.get(pos).getText());
    if (featureIndex >= 0) {
      featSeq.add(featureIndex);
    }
  }

  // append the start offset to the document name, using a pipe character
  String instanceName = doc.getName()+"|"+from;
  return new Instance(featSeq, null, instanceName, null);

}
 
Example #11
Source File: TestPipeSerialization.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Round-trip an LFPipe through Java serialization and check that the data/target alphabets,
 * the feature info, and the per-attribute value alphabet created during feature extraction
 * all survive.
 */
@Test
public void testPipeSerialization1() throws ResourceInstantiationException, IOException, ClassNotFoundException {
  String spec = "<ROOT>"+
          "<ATTRIBUTE><TYPE>theType</TYPE><FEATURE>feature1</FEATURE><DATATYPE>nominal</DATATYPE><CODEAS>number</CODEAS></ATTRIBUTE>"+
          "</ROOT>";
  FeatureInfo fi = new FeatureSpecification(spec).getFeatureInfo();

  // build a pipe that has both a data and a target alphabet
  Pipe alphabetHolder = new Noop(new LFAlphabet(),new LabelAlphabet());
  List<Pipe> pipeList = new ArrayList<>();
  pipeList.add(alphabetHolder);
  LFPipe pipe = new LFPipe(pipeList);
  pipe.setFeatureInfo(fi);

  // put one entry into the data alphabet
  pipe.getDataAlphabet().lookupIndex("feature1");

  // extracting from an instance should create/update the alphabet for the number
  // representation of the feature
  Document doc = newDocument();
  Annotation instAnn = addAnn(doc,"",0,0,"theType",gate.Utils.featureMap("feature1","val1"));
  Instance inst = newInstance();
  FeatureSpecAttribute attr = fi.getAttributes().get(0);
  // the attribute is expected to be a simple attribute
  assertEquals(FeatureSpecSimpleAttribute.class, attr.getClass());
  FeatureSpecSimpleAttribute sa = (FeatureSpecSimpleAttribute)attr;
  FeatureExtractionMalletSparse.extractFeature(inst, sa, doc.getAnnotations(), instAnn);

  // the attribute info must now carry an alphabet that knows the extracted value
  assertNotNull(sa.alphabet);
  System.err.println("DEBUG: the alphabet we have is "+sa.alphabet);
  assertTrue(sa.alphabet.contains("val1"));
  Alphabet valuealphabet = sa.alphabet;

  // Now serialize the lfpipe ...
  File tmpFile = File.createTempFile("LF_test",".pipe");
  tmpFile.deleteOnExit();
  try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(tmpFile))) {
    oos.writeObject(pipe);
  }
  // ... and read it back
  LFPipe pipe2;
  try (ObjectInputStream ois = new ObjectInputStream (new FileInputStream(tmpFile))) {
    pipe2 = (LFPipe) ois.readObject();
  }

  // data and target alphabets must match the original pipe
  assertTrue(pipe2.alphabetsMatch(pipe));

  // feature info and its attribute list must be present, with exactly one attribute
  FeatureInfo restoredInfo = pipe2.getFeatureInfo();
  assertNotNull(restoredInfo);
  List<FeatureSpecAttribute> restoredAttrs = restoredInfo.getAttributes();
  assertNotNull(restoredAttrs);
  assertEquals(1, restoredAttrs.size());

  // that attribute must have an alphabet equal to the one we had before serialization
  Alphabet restoredAlphabet = ((FeatureSpecSimpleAttribute)restoredAttrs.get(0)).alphabet;
  assertNotNull(restoredAlphabet);
  assertEquals(valuealphabet,restoredAlphabet);
}
 
Example #12
Source File: TestFeatureExtraction.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Extract 1-, 2- and 3-grams over five consecutive token annotations and check, for each
 * n-gram size, the alphabet size, the generated feature names, the number of locations in the
 * feature vector and that every n-gram feature has value 1.0. No explicit NAME is given in the
 * specification, so feature names start with type and feature name.
 */
@Test
public void extractNgram1() {
  String spec = "<ROOT>"+
          "<NGRAM><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>1</NUMBER></NGRAM>"+
          "<NGRAM><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>2</NUMBER></NGRAM>"+
          "<NGRAM><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>3</NUMBER></NGRAM>"+
          "</ROOT>";
  List<FeatureSpecAttribute> as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();

  Alphabet alphabet = new LFAlphabet();
  Instance inst = new Instance(new AugmentableFeatureVector(alphabet),null,null,null);

  // prepare the document: one instance annotation and five adjacent two-character tokens
  Annotation instAnn = addAnn(doc, "", 0, 20, "instanceType", gate.Utils.featureMap());
  for (int k = 0; k < 5; k++) {
    addAnn(doc,"",2*k,2*k+2,"theType",gate.Utils.featureMap("theFeature","tok"+(k+1)));
  }

  String[] unigrams = {
    "theType┆theFeature╬N1═tok1",
    "theType┆theFeature╬N1═tok2",
    "theType┆theFeature╬N1═tok3",
    "theType┆theFeature╬N1═tok4",
    "theType┆theFeature╬N1═tok5"
  };
  String[] bigrams = {
    "theType┆theFeature╬N2═tok1┋tok2",
    "theType┆theFeature╬N2═tok2┋tok3",
    "theType┆theFeature╬N2═tok3┋tok4",
    "theType┆theFeature╬N2═tok4┋tok5"
  };
  String[] trigrams = {
    "theType┆theFeature╬N3═tok1┋tok2┋tok3",
    "theType┆theFeature╬N3═tok2┋tok3┋tok4",
    "theType┆theFeature╬N3═tok3┋tok4┋tok5"
  };

  // the one-grams
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(0)+" (one-grams) FV="+inst.getData());
  assertEquals(5,inst.getAlphabet().size());
  System.err.println("Alphabet N1="+inst.getAlphabet());
  for (String name : unigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  assertEquals(5,((FeatureVector)inst.getData()).numLocations());
  for (String name : unigrams) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(name),EPS);
  }

  // now the bigrams
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(1), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(1)+" (bi-grams) FV="+inst.getData());
  System.err.println("Alphabet N2="+inst.getAlphabet());
  assertEquals(4,inst.getAlphabet().size());
  for (String name : bigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  assertEquals(4,((FeatureVector)inst.getData()).numLocations());
  for (String name : bigrams) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(name),EPS);
  }

  // and the 3-grams
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(2), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(2)+" (tri-grams) FV="+inst.getData());
  System.err.println("Alphabet N3="+inst.getAlphabet());
  assertEquals(3,inst.getAlphabet().size());
  for (String name : trigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  assertEquals(3,((FeatureVector)inst.getData()).numLocations());
  for (String name : trigrams) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(name),EPS);
  }
}
 
Example #13
Source File: TestFeatureExtraction.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Essentially the same as extractNgram1, but each NGRAM specification explicitly gives the
 * NAME to use as internal feature name, so the generated feature names start with that name
 * instead of type and feature name.
 */
@Test
public void extractNgram2() {
  String spec = "<ROOT>"+
          "<NGRAM><NAME>ng1</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>1</NUMBER></NGRAM>"+
          "<NGRAM><NAME>ngram2</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>2</NUMBER></NGRAM>"+
          "<NGRAM><NAME>someName</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>3</NUMBER></NGRAM>"+
          "</ROOT>";
  FeatureInfo fi = new FeatureSpecification(spec).getFeatureInfo();
  List<FeatureSpecAttribute> as = fi.getAttributes();
  System.err.println("NGRAMS with explicitly specified name!!");
  Alphabet a = new LFAlphabet();
  AugmentableFeatureVector afv = new AugmentableFeatureVector(a);
  Instance inst = new Instance(afv,null,null,null);

  // prepare the document: one instance annotation and five adjacent two-character tokens
  Annotation instAnn = addAnn(doc, "", 0, 20, "instanceType", gate.Utils.featureMap());
  addAnn(doc,"",0,2,"theType",gate.Utils.featureMap("theFeature","tok1"));
  addAnn(doc,"",2,4,"theType",gate.Utils.featureMap("theFeature","tok2"));
  addAnn(doc,"",4,6,"theType",gate.Utils.featureMap("theFeature","tok3"));
  addAnn(doc,"",6,8,"theType",gate.Utils.featureMap("theFeature","tok4"));
  addAnn(doc,"",8,10,"theType",gate.Utils.featureMap("theFeature","tok5"));

  String[] unigrams = {
    "ng1╬N1═tok1", "ng1╬N1═tok2", "ng1╬N1═tok3", "ng1╬N1═tok4", "ng1╬N1═tok5"
  };
  String[] bigrams = {
    "ngram2╬N2═tok1┋tok2", "ngram2╬N2═tok2┋tok3", "ngram2╬N2═tok3┋tok4", "ngram2╬N2═tok4┋tok5"
  };
  String[] trigrams = {
    "someName╬N3═tok1┋tok2┋tok3", "someName╬N3═tok2┋tok3┋tok4", "someName╬N3═tok3┋tok4┋tok5"
  };

  // the one-grams
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(0)+" (one-grams) FV="+inst.getData());
  assertEquals(5,inst.getAlphabet().size());
  System.err.println("Alphabet N1="+inst.getAlphabet());
  for (String name : unigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  assertEquals(5,((FeatureVector)inst.getData()).numLocations());
  for (String name : unigrams) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(name),EPS);
  }

  // now the bigrams
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(1), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(1)+" (bi-grams) FV="+inst.getData());
  System.err.println("Alphabet N2="+inst.getAlphabet());
  assertEquals(4,inst.getAlphabet().size());
  for (String name : bigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  assertEquals(4,((FeatureVector)inst.getData()).numLocations());
  for (String name : bigrams) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(name),EPS);
  }

  // and the 3-grams
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(2), doc.getAnnotations(), instAnn);
  // log label corrected: this stage extracts tri-grams, not bi-grams
  System.err.println("After "+as.get(2)+" (tri-grams) FV="+inst.getData());
  System.err.println("Alphabet N3="+inst.getAlphabet());
  assertEquals(3,inst.getAlphabet().size());
  for (String name : trigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  assertEquals(3,((FeatureVector)inst.getData()).numLocations());
  for (String name : trigrams) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(name),EPS);
  }
}
 
Example #14
Source File: TestFeatureExtraction.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Same as extractNgram2, but the specifications also use FEATURENAME4VALUE ("val") and the
 * second token annotation has no "val" feature. This tests the filtering when the value for
 * the second token is null: the expectations below contain no n-gram that involves tok2.
 */
@Test
public void extractNgram3() {
  String spec = "<ROOT>"+
          "<NGRAM><NAME>ng1</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>1</NUMBER><FEATURENAME4VALUE>val</FEATURENAME4VALUE></NGRAM>"+
          "<NGRAM><NAME>ngram2</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>2</NUMBER><FEATURENAME4VALUE>val</FEATURENAME4VALUE></NGRAM>"+
          "<NGRAM><NAME>someName</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>3</NUMBER><FEATURENAME4VALUE>val</FEATURENAME4VALUE></NGRAM>"+
          "</ROOT>";
  FeatureInfo fi = new FeatureSpecification(spec).getFeatureInfo();
  List<FeatureSpecAttribute> as = fi.getAttributes();
  System.err.println("NGRAMS with explicitly specified name, filtered by featurename4value!!");
  Alphabet a = new LFAlphabet();
  AugmentableFeatureVector afv = new AugmentableFeatureVector(a);
  Instance inst = new Instance(afv,null,null,null);

  // prepare the document; note that the second token deliberately has no "val" feature
  Annotation instAnn = addAnn(doc, "", 0, 20, "instanceType", gate.Utils.featureMap());
  addAnn(doc,"",0,2,"theType",gate.Utils.featureMap("theFeature","tok1","val",1.0));
  addAnn(doc,"",2,4,"theType",gate.Utils.featureMap("theFeature","tok2"));
  addAnn(doc,"",4,6,"theType",gate.Utils.featureMap("theFeature","tok3","val",1.0));
  addAnn(doc,"",6,8,"theType",gate.Utils.featureMap("theFeature","tok4","val",1.0));
  addAnn(doc,"",8,10,"theType",gate.Utils.featureMap("theFeature","tok5","val",1.0));

  String[] unigrams = {
    "ng1╬N1═tok1", "ng1╬N1═tok3", "ng1╬N1═tok4", "ng1╬N1═tok5"
  };
  String[] bigrams = {
    "ngram2╬N2═tok3┋tok4", "ngram2╬N2═tok4┋tok5"
  };

  // the one-grams
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn);
  System.err.println("Ngram3: After N1 extract "+as.get(0)+" (one-grams) FV="+inst.getData());
  assertEquals(4,inst.getAlphabet().size());
  System.err.println("Ngram3: Alphabet N1="+inst.getAlphabet());
  for (String name : unigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  assertEquals(4,((FeatureVector)inst.getData()).numLocations());
  for (String name : unigrams) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(name),EPS);
  }

  // now the bigrams
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(1), doc.getAnnotations(), instAnn);
  System.err.println("Ngram3: After N2 extract "+as.get(1)+" (bi-grams) FV="+inst.getData());
  System.err.println("Alphabet N2="+inst.getAlphabet());
  assertEquals(2,inst.getAlphabet().size());
  for (String name : bigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  assertEquals(2,((FeatureVector)inst.getData()).numLocations());
  for (String name : bigrams) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(name),EPS);
  }

  // and the 3-grams
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(2), doc.getAnnotations(), instAnn);
  // log label corrected: this stage extracts tri-grams, not bi-grams
  System.err.println("Ngram3: After N3 extract "+as.get(2)+" (tri-grams) FV="+inst.getData());
  System.err.println("Alphabet N3="+inst.getAlphabet());
  assertEquals(1,inst.getAlphabet().size());
  assertEquals(1,((FeatureVector)inst.getData()).numLocations());
  assertEquals(1.0,((FeatureVector)inst.getData()).value("someName╬N3═tok3┋tok4┋tok5"),EPS);
}
 
Example #15
Source File: Utils.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Create a new instance whose data is a fresh AugmentableFeatureVector over the given
 * alphabet; target, name and source are all null.
 *
 * @param alph alphabet for the feature vector
 * @return the new instance
 */
public static Instance newInstance(Alphabet alph) {
  AugmentableFeatureVector emptyVector = new AugmentableFeatureVector(alph);
  return new Instance(emptyVector, null, null, null);
}
 
Example #16
Source File: LDAModelEstimator.java    From RankSys with Mozilla Public License 2.0 4 votes vote down vote up
/**
 * Returns the {@code alphabet} held by this object as the data alphabet.
 *
 * @return the value of {@code alphabet}, returned as-is (no copy is made)
 */
@Override
public Alphabet getDataAlphabet() {
    return alphabet;
}
 
Example #17
Source File: MaxEntClassifierTrainer.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Train a MaxEnt classifier with generalised-expectation constraints derived from
 * {@code labelsAndFeatures}, classify the loaded documents with it, write the classifications
 * to Mongo and write the classifier to {@code modelFile}.
 *
 * @param settings job settings (not read by this method)
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
protected void execute(JobSettings settings) throws AnalysisEngineProcessException {

  Pipe labelPipe = new MaxEntClassifierPipe(labelsAndFeatures.keySet(), stopwords);

  InstanceList instances = new InstanceList(labelPipe);
  instances.addThruPipe(getDocumentsFromMongoWithRandonLabelAssignement());

  // map each labelled feature to the labels it indicates
  Alphabet targetAlphabet = instances.getTargetAlphabet();
  Alphabet dataAlphabet = instances.getDataAlphabet();
  HashMap<Integer, ArrayList<Integer>> labelsByFeature =
      mapFeaturesToLabels(dataAlphabet, targetAlphabet);

  // alphabet sizes are read only after the mapping above has done its lookups
  int numLabels = targetAlphabet.size();
  HashMap<Integer, double[]> targetsByFeature =
      FeatureConstraintUtil.setTargetsUsingHeuristic(labelsByFeature, numLabels, 0.9);

  MaxEntKLFLGEConstraints geConstraints =
      new MaxEntKLFLGEConstraints(dataAlphabet.size(), numLabels, false);
  targetsByFeature.forEach(
      (featureIndex, targets) -> geConstraints.addConstraint(featureIndex, targets, 1));
  ArrayList<MaxEntGEConstraint> constraints = new ArrayList<>();
  constraints.add(geConstraints);

  // Create a classifier trainer, and use it to create a classifier
  MaxEntGETrainer trainer = new MaxEntGETrainer(constraints);
  trainer.setMaxIterations(numIterations);
  trainer.setGaussianPriorVariance(variance);

  // clear the target of every instance before training (instances must be unlocked for that)
  instances.forEach(
      instance -> {
        instance.unLock();
        instance.setTarget(null);
        instance.lock();
      });

  Classifier classifier = trainer.train(instances);

  List<Classification> classifications = classifier.classify(instances);
  writeClassificationToMongo(classifications);

  new ObjectFile(classifier, modelFile).write();
}
 
Example #18
Source File: Attributes.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Build the attributes object from the alphabets of a Mallet pipe.
 * <p>
 * One attribute is created per entry of the pipe's data alphabet, initially
 * as a numeric attribute. If the pipe is an LFPipe, its feature info is used
 * to look up the feature specification for each feature name and to override
 * coding, missing value treatment, datatype and alphabet accordingly. For an
 * ordinary pipe the defaults are kept.
 * <p>
 * Finally the target attribute is created with the index following the last
 * independent attribute; it becomes nominal if the pipe has a target alphabet.
 * 
 * @param pipe  mallet pipe
 * @param instanceType instance type
 */
public Attributes(Pipe pipe, String instanceType) {
  Alphabet dataAlphabet = pipe.getDataAlphabet();
  // feature info is only available if the pipe is a LFPipe
  FeatureInfo featureInfo = null;
  if(pipe instanceof LFPipe) {
    featureInfo = ((LFPipe)pipe).getFeatureInfo();
  }
  // shared alphabet for all boolean attributes
  LFAlphabet booleanAlph = new LFAlphabet();
  booleanAlph.lookupIndex("false");
  booleanAlph.lookupIndex("true");
  for(int idx = 0; idx < dataAlphabet.size(); idx++) {
    String featureName = (String) dataAlphabet.lookupObject(idx);
    // Start out with the defaults (numeric, no coding, no alphabet). Numeric
    // is the default for features indicating the presence of strings, ngrams
    // etc. so that counts, tf/idf or other scores can be used instead of 0/1.
    // Only an explicit declaration of a different type changes this below.
    Attribute attr = new Attribute(featureName, idx, Datatype.numeric, null, null, null);
    attributes.add(attr);
    name2index.put(featureName, idx);
    if(featureInfo == null) {
      // ordinary pipe: nothing more is known about this feature
      continue;
    }
    FeatureSpecAttribute spec =
            FeatureExtractionMalletSparse.lookupAttributeForFeatureName(
              featureInfo.getAttributes(),
              featureName,
              instanceType);
    if(spec == null) {
      // START/STOP features are created by us and have no specification,
      // for those the defaults are kept. Anything else is an error.
      boolean isBoundaryFeature =
              featureName.endsWith(FeatureExtractionMalletSparse.START_SYMBOL) ||
              featureName.endsWith(FeatureExtractionMalletSparse.STOP_SYMBOL);
      if(!isBoundaryFeature) {
        throw new RuntimeException("FeatureSpecification is null for feature "+
              idx+", name="+featureName+ 
              "\nFeatureSpecification is "+featureInfo);
      }
    } else if(spec instanceof FeatureSpecAttributeList) {
      FeatureSpecAttributeList listSpec = (FeatureSpecAttributeList)spec;
      attr.codeAs = listSpec.codeas;
      attr.mvTreatment = listSpec.missingValueTreatment;
      attr.datatype = listSpec.datatype;
      if(listSpec.datatype == Datatype.bool) {
        attr.alphabet = booleanAlph;
      } else if(listSpec.datatype == Datatype.nominal && listSpec.codeas == CodeAs.number) {
        attr.alphabet = listSpec.alphabet;
      }
    } else if(spec instanceof FeatureSpecSimpleAttribute) {
      FeatureSpecSimpleAttribute simpleSpec = (FeatureSpecSimpleAttribute)spec;
      attr.codeAs = simpleSpec.codeas;
      attr.mvTreatment = simpleSpec.missingValueTreatment;
      attr.datatype = simpleSpec.datatype;
      if(simpleSpec.datatype == Datatype.bool) {
        attr.alphabet = booleanAlph;
      } else if(simpleSpec.datatype == Datatype.nominal && simpleSpec.codeas == CodeAs.number) {
        attr.alphabet = simpleSpec.alphabet;
      }
    } else if(!(spec instanceof FeatureSpecNgram)) {
      // ngram specifications need no adjustment, every other type is unexpected
      throw new RuntimeException(
              "Impossible: found odd FeatureSpecAttribute type "+spec.getClass());
    }
  }
  @SuppressWarnings("unchecked")
  LabelAlphabet targetAlphabet = (LabelAlphabet)pipe.getTargetAlphabet();
  // The target gets the index right after the last independent attribute,
  // which is convenient for Weka. It is numeric unless a target alphabet
  // exists, in which case a nominal target is assumed.
  targetAttribute = new Attribute("target", attributes.size(), Datatype.numeric, null, null, null);
  if(targetAlphabet != null) {
    targetAttribute.alphabet = targetAlphabet;
    targetAttribute.datatype = Datatype.nominal;
  }
}
 
Example #19
Source File: TopicModelPipe.java    From baleen with Apache License 2.0 2 votes vote down vote up
/**
 * Construct topic model pipe with given stopwords.
 *
 * <p>Delegates to the two-argument constructor, passing the stopwords together with a newly
 * created {@link Alphabet}.
 *
 * @param stopwords to be removed
 */
public TopicModelPipe(Collection<String> stopwords) {
  this(stopwords, new Alphabet());
}