cc.mallet.pipe.Pipe Java Examples

The following examples show how to use cc.mallet.pipe.Pipe. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: CorpusRepresentationMalletTarget.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

6 votes

/**
 * Build the Mallet instance holding the independent features of one instance annotation.
 *
 * A new, empty feature vector over the data alphabet of the pipe is created and every
 * attribute listed in the featureInfo object is extracted into it. The information in
 * the featureInfo instance gets updated by this.
 *
 * NOTE: this method is static so that it can be used in the CorpusRepresentationMalletSeq class too.
 *
 * @param instanceAnnotation instance annotation
 * @param inputAS input annotation set
 * @param featureInfo feature info instance
 * @param pipe mallet pipe
 * @return Instance with the extracted features as data; target, name and source are null
 */
static Instance extractIndependentFeaturesHelper(
        Annotation instanceAnnotation,
        AnnotationSet inputAS,
        FeatureInfo featureInfo,
        Pipe pipe) {

  // Instance constructor arguments are, in order: data, target, name, source
  Instance instance = new Instance(
          new AugmentableFeatureVector(pipe.getDataAlphabet()), null, null, null);
  for (FeatureSpecAttribute featureSpec : featureInfo.getAttributes()) {
    FeatureExtractionMalletSparse.extractFeature(
            instance, featureSpec, inputAS, instanceAnnotation);
  }
  // TODO: the AugmentableFeatureVector is destructively replaced by a plain FeatureVector
  // here; it is not clear whether this is beneficial - the assumption is that it is.
  AugmentableFeatureVector augmented = (AugmentableFeatureVector) instance.getData();
  instance.setData(augmented.toFeatureVector());
  return instance;
}

Example #2

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

6 votes

/**
 * Appends the brain-region lexicon pipes to {@code pipes} and records the group name
 * "BrainRegions" in {@code usedPipeNames}.
 *
 * One trie lexicon pipe is added per lexicon file below LEXICON_HOME, followed by the
 * n-gram pipes built from the "AllRegions" lexicon.
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes         list the new pipes are appended to
 * @param ignoreCase    passed on to every lexicon pipe
 */
public static void addBrainRegionLexicons(List<String> usedPipeNames,
        List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException,
        Exception {
    usedPipeNames.add("BrainRegions");

    // { feature name, lexicon file name } in registration order
    final String[][] lexicons = {
            { "NNHu", "NN2002Human.txt" },
            { "NNMouseRat", "NN2007RatMouse.txt" },
            { "Allen", "Allen.txt" },
            { "BAMS", "BAMS.txt" },
            { "AllRegions", "AllRegions.txt" },
    };
    for (String[] lexicon : lexicons) {
        File lexiconFile = new File(LEXICON_HOME + lexicon[1]);
        pipes.add(new TrieLexiconMembership(lexicon[0], lexiconFile, ignoreCase));
    }

    File allRegions = new File(LEXICON_HOME + "AllRegions.txt");
    pipes.addAll(NGramPipeFactory.getAllGramsPipes("AllRegions", allRegions,
            ignoreCase));
}

Example #3

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

6 votes

/**
 * Appends the TextPresso lexicon pipes to {@code pipes} and records the group name
 * "TextPresso" in {@code usedPipeNames}.
 *
 * The TextPresso files are split by how many tokens an entry has (1 to 7); one
 * additional file holds all entries and is also used to build n-gram pipes.
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes         list the new pipes are appended to
 * @param ignoreCase    passed on to every lexicon pipe
 */
public static void addTextPressoPipes(List<String> usedPipeNames,
        List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException,
        Exception {
    usedPipeNames.add("TextPresso");

    // one lexicon per entry length, 1..7 tokens
    for (int wordLength = 1; wordLength <= 7; wordLength++) {
        File perLength = new File(LEXICON_HOME + "TextPresso-wordLength-"
                + wordLength + ".txt");
        pipes.add(new TrieLexiconMembership("textPresso" + wordLength,
                perLength, ignoreCase));
    }

    // the combined lexicon, once as a whole and once split into n-grams
    File combined = new File(LEXICON_HOME + "TextPresso-all.txt");
    pipes.add(new TrieLexiconMembership("textPressoAll", combined, ignoreCase));
    pipes.addAll(NGramPipeFactory.getAllGramsPipes("textPressoAll", combined,
            ignoreCase));
}

Example #4

Source File: MaxEntClassifierTrainerTest.java From baleen with Apache License 2.0

6 votes

/**
 * Checks that the model file exists, that the classifier read from it knows the labels
 * "pos" and "neg", and that it labels one positive and one negative sentence accordingly.
 */
@Test
public void testTaskProducesValidModelFile() throws Exception {

  File modelFile = modelPath.toFile();
  assertTrue(modelFile.exists());

  Classifier classifier = new FileObject<Classifier>(modelFile.getPath()).object();
  for (String label : new String[] {"pos", "neg"}) {
    assertTrue(classifier.getLabelAlphabet().contains(label));
  }

  // sentences and the label expected for each, index-aligned
  String[] sentences = {
    "I love this amazing awesome classifier.", "I can't stand this horrible test."
  };
  String[] expectedLabels = {"pos", "neg"};

  InstanceList instanceList = new InstanceList(classifier.getInstancePipe());
  for (String sentence : sentences) {
    instanceList.addThruPipe(new Instance(sentence, null, null, null));
  }

  for (int i = 0; i < expectedLabels.length; i++) {
    String predicted =
        classifier.classify(instanceList.get(i)).getLabeling().getBestLabel().toString();
    assertEquals(expectedLabels[i], predicted);
  }
}

Example #5

Source File: MalletClassifierTrainerTest.java From baleen with Apache License 2.0

6 votes

/**
 * Checks that the model file exists, that the classifier read from it knows the labels
 * "pos" and "neg", and that each of two sample sentences is classified as one of them.
 */
private void validateModel() {
  File modelFile = modelPath.toFile();
  assertTrue(modelFile.exists());

  Classifier classifier = new FileObject<Classifier>(modelFile.getPath()).object();
  for (String label : new String[] {"pos", "neg"}) {
    assertTrue(classifier.getLabelAlphabet().contains(label));
  }

  InstanceList instanceList = new InstanceList(classifier.getInstancePipe());
  String[] sentences = {
    "I love this amazing awesome classifier.", "I can't stand this horrible test."
  };
  for (String sentence : sentences) {
    // the target is given as an empty string
    instanceList.addThruPipe(new Instance(sentence, "", null, null));
  }

  // only membership in the label set is checked, not which label was chosen
  ImmutableSet<String> labels = ImmutableSet.of("pos", "neg");
  for (int i = 0; i < sentences.length; i++) {
    String predicted =
        classifier.classify(instanceList.get(i)).getLabeling().getBestLabel().toString();
    assertTrue(labels.contains(predicted));
  }
}

Example #6

Source File: MalletClassifierTrainer.java From baleen with Apache License 2.0

6 votes

/**
 * Runs all documents through a ClassifierPipe and hands the resulting instances to
 * the trainer definitions.
 *
 * If {@code forTesting} is greater than zero the instances are split into a training
 * part (fraction {@code 1 - forTesting}) and a testing part (fraction {@code forTesting});
 * otherwise all instances are used for training and the testing list is null.
 */
@Override
protected void execute(JobSettings settings) throws AnalysisEngineProcessException {

  InstanceList instances = new InstanceList(new ClassifierPipe(stopwords));
  instances.addThruPipe(getDocumentsFromMongo());

  // default: train on everything, no held-out data
  InstanceList training = instances;
  InstanceList testing = null;
  if (forTesting > 0.0) {
    InstanceList[] parts = instances.split(new double[] {1 - forTesting, forTesting});
    training = parts[0];
    testing = parts[1];
  }

  processTrainerDefinitions(training, testing);
}

Example #7

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends one case-insensitive substring regex pipe per word stem to {@code pipes}
 * and records the group name "Substring regexes" in {@code usedPipeNames}.
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes         list the new pipes are appended to
 */
public static void addSubstringRegexPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws Exception {
    usedPipeNames.add("Substring regexes");

    // "thalamic" and nuclei are probably in the 1-grams
    final String[] stems = { "cortic", "cerebel" };
    for (int i = 0; i < stems.length; i++) {
        String featureName = stems[i] + "Regex";
        String anywhere = ".*" + stems[i] + ".*";
        pipes.add(new RegexMatches(featureName, compile(anywhere,
                CASE_INSENSITIVE)));
    }
}

Example #8

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Assembles the complete list of pipes, in order: the two input/target conversion
 * pipes, everything added by {@code addAllGoodPipes}, a feature window and the final
 * conversion to a feature vector sequence.
 *
 * @return the newly created list of pipes
 */
public static List<Pipe> getPipes() throws Exception {
    List<Pipe> pipeline = newArrayList();

    // input and target conversion come first
    pipeline.add(new Jcas2TokenSequence());
    pipeline.add(new Target2LabelSequence());

    // all feature pipes
    addAllGoodPipes(pipeline);

    // the same window value is used for both constructor arguments
    FeatureWindow contextWindow = new FeatureWindow(window, window);
    pipeline.add(contextWindow);
    // for debugging: pipeline.add(new PrintInputAndTarget());
    pipeline.add(new TokenSequence2FeatureVectorSequence());
    return pipeline;
}

Example #9

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends all feature pipe groups to {@code pipes}, in a fixed order.
 *
 * Each helper receives a list of group names; that list is local to this method and
 * is not returned. The commented-out {@code StaticOption} conditions and the
 * {@code ren/} lines are disabled code left in place by the original author.
 *
 * @param pipes list the new pipes are appended to
 */
public static void addAllGoodPipes(List<Pipe> pipes) throws Exception {

        // collects the group names reported by the helpers below
        List<String> usedPipeNames = new LinkedList<String>();

        // / if (StaticOption.getBoolean("TextPipe"))
        addTextPipe(usedPipeNames, pipes);

        // ren/ addOriginalMarkupPipes();
        addAreaRegexPipes(usedPipeNames, pipes);
        // this catches tracts, change?
        // /if (StaticOption.getBoolean("SubstringRegexPipes"))
        addSubstringRegexPipes(usedPipeNames, pipes);
        addSpineRegexPipes(usedPipeNames, pipes);

        // /if (StaticOption
        // .getBoolean("SmallLexicons_TextPressoPipes_BrainRegionLexicons_AbbreviationLexiconPipes_AreaLexicons"))
        addSmallLexicons(usedPipeNames, pipes, ignoreCase);
        addTextPressoPipes(usedPipeNames, pipes, ignoreCase);
        addBrainRegionLexicons(usedPipeNames, pipes, ignoreCase);
        // ren/ addPigeonLexicon(usedPipeNames, pipes, ignoreCase);
        addAbbreviationLexiconPipes(usedPipeNames, pipes);
        addAreaLexicons(usedPipeNames, pipes, ignoreCase);

        addLengthPipes(usedPipeNames, pipes);

        // the full-text pipes are only added when the NEW_FEATURES flag is set
        if (Jcas2TokenSequence.NEW_FEATURES)
            addFullTextPipes(usedPipeNames, pipes);

        // / if (StaticOption.getBoolean("HandMadeRegexPipes_MalletNEPipes")) {
        addHandMadeRegexPipes(usedPipeNames, pipes);
        addMalletNEPipes(usedPipeNames, pipes);
    }

Example #10

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Pipes added based on experience with full text.
 *
 * Note that, unlike the other pipe groups, nothing is added to {@code usedPipeNames}.
 *
 * @param usedPipeNames not modified by this method
 * @param pipes         list the new pipes are appended to
 */
private static void addFullTextPipes(List<String> usedPipeNames,
        List<Pipe> pipes) {

    // e.g. "blabla 24 24"
    Pattern digitOtherDigit = Pattern.compile("\\d+[^\\d]+\\d+");
    pipes.add(new LongRegexSpaced("digit_then_other_then_digit",
            digitOtherDigit, 2, 4));

    // e.g. "30 mM K SO , 5 mM MgCl 6H O, 10 mM 24 24 22 HEPES"
    Pattern threeNumbers = Pattern
            .compile(".*\\d+[^\\d\\n]+\\d+[^\\d\\n]+\\d+.*");
    pipes.add(new LongRegexSpaced(
            "digit_then_other_then_digit_then_other_then_digit",
            threeNumbers, 4, 9));

    // sample sizes such as "n 19" or "n 5", with and without parentheses
    Pattern bare = Pattern.compile("n \\d+");
    pipes.add(new LongRegexSpaced("n_space_digit", bare, 2, 2));
    Pattern bothParens = Pattern.compile("\\( n \\d+ \\)");
    pipes.add(new LongRegexSpaced("parenthesis_n_space_digit_parenthesis",
            bothParens, 3, 4));
    Pattern closingParen = Pattern.compile("n \\d+ \\)");
    pipes.add(new LongRegexSpaced("n_space_digit_parenthesis",
            closingParen, 3, 4));
    Pattern openingParen = Pattern.compile("\\( n \\d+");
    pipes.add(new LongRegexSpaced("parenthesis_n_space_digit",
            openingParen, 3, 4));

    // "Fig" is never found in any lexicon
    Pattern figure = Pattern.compile(".*Fig.*");
    pipes.add(new RegexMatches("Figure", figure));
}

Example #11

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends the abbreviation lexicon pipes to {@code pipes} and records the group name
 * "AbbrevLex" in {@code usedPipeNames}.
 *
 * For each of the two abbreviation files (human, rat/mouse) a trie lexicon pipe is
 * added, followed by the prefix regex pipes built from the same file. Case is always
 * ignored for the lexicon pipes.
 *
 * NOTE: the prefix feature names used to be swapped (the rat/mouse file was registered
 * as "NNHumanAbbrevPrefix" and vice versa). They now match the file they are built
 * from; models trained with the swapped names need to be retrained.
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes         list the new pipes are appended to
 * @throws IOException if a lexicon file cannot be read
 */
private static void addAbbreviationLexiconPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws IOException {

    usedPipeNames.add("AbbrevLex");
    File ratMouse = new File(LEXICON_HOME + "NN2007RatMouseAbbrev.txt");
    File human = new File(LEXICON_HOME + "NN2002HumanAbbrev.txt");
    boolean ignoreCase = true;
    // should be one word only but may not..
    pipes.add(new TrieLexiconMembership("NNHumanAbbrev", human, ignoreCase));
    pipes.add(new TrieLexiconMembership("NNRatMouseAbbrev", ratMouse,
            ignoreCase));

    // file order is unchanged (rat/mouse first); only the names were corrected
    addPrefixPipes(pipes, ratMouse, "NNRatMouseAbbrevPrefix");
    addPrefixPipes(pipes, human, "NNHumanAbbrevPrefix");
}

Example #12

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends one case-insensitive regex pipe per non-blank line of {@code file}. Each
 * pattern is the (trimmed) line followed by one to three arbitrary characters.
 *
 * The line is treated as literal text: it is quoted before being put into the
 * pattern, so regex metacharacters in a lexicon entry (".", "(", "+", ...) neither
 * change the meaning of the pattern nor cause a PatternSyntaxException. Blank lines
 * are skipped because they would otherwise produce a pattern that matches any token
 * of one to three characters.
 *
 * @param pipes list the new pipes are appended to
 * @param file  lexicon file with one entry per line
 * @param name  feature name shared by all pipes created from this file
 * @throws IOException if the file cannot be read
 */
public static void addPrefixPipes(List<Pipe> pipes, File file, String name)
        throws IOException {
    for (String line : linesFrom(file.getAbsolutePath())) {
        String entry = line.trim();
        if (entry.isEmpty()) {
            continue; // nothing to use as a prefix
        }
        pipes.add(new RegexMatches(name, compile("(" + Pattern.quote(entry)
                + ".{1,3})", CASE_INSENSITIVE)));
    }
}

Example #13

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends the "area" enumeration regex pipes to {@code pipes} and records the group
 * name "Area regexes" in {@code usedPipeNames}.
 *
 * Both patterns describe "area(s) X", optionally continued as a list such as
 * "areas X, Y and Z". The items in the middle of the list used to be limited to a
 * single character while the first and last item allowed several, so a list like
 * "areas 17, 18 and 19" could not be matched completely. All items now accept one
 * or more characters.
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes         list the new pipes are appended to
 */
public static void addAreaRegexPipes(List<String> usedPipeNames,
        List<Pipe> pipes) {
    usedPipeNames.add("Area regexes");

    // numbered areas only
    pipes.add(new LongRegexSpaced("Brodmann", Pattern
            .compile("areas? \\d+((, ?\\d+)*,? (or|and) \\d+)?"), 2, 9));

    // a looser version that also allows upper-case letters
    pipes.add(new LongRegexSpaced(
            "Areas",
            compile("areas? (\\p{Upper}|\\d)+((, ?(\\p{Upper}|\\d)+)*,? (or|and) (\\p{Upper}|\\d)+)?"),
            2, 9));

}

Example #14

Source File: NGramPipeFactory.java From bluima with Apache License 2.0

5 votes

/**
 * Creates one n-gram pipe for every gram size from {@code startGramSize} up to and
 * including 6.
 *
 * @param name          base name handed to each n-gram pipe
 * @param inputFile     lexicon file the n-grams are taken from
 * @param ignoreCase    passed on to each n-gram pipe
 * @param startGramSize smallest gram size to create
 * @return the pipes in ascending gram size; empty if {@code startGramSize} is 7 or more
 */
public static List<Pipe> getAllGramsPipes(String name, File inputFile,
        boolean ignoreCase, int startGramSize) throws Exception {
    List<Pipe> gramPipes = new LinkedList<Pipe>();
    int gramSize = startGramSize;
    // the upper bound is exclusive: 7-grams are not created
    while (gramSize < 7) {
        gramPipes.add(getNGramPipe(name, inputFile, ignoreCase, gramSize));
        gramSize++;
    }
    return gramPipes;
}

Example #15

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends the regex pipe for spinal segment names to {@code pipes} and records the
 * group name "SpineRegex" in {@code usedPipeNames}.
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes         list the new pipe is appended to
 */
public static void addSpineRegexPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws Exception {
    usedPipeNames.add("SpineRegex");

    // covered segments: L1-L5 and S1-S5, T1-T12, C1-C8
    Pattern spinalSegment = Pattern
            .compile("([LS][1-5])|T((1[0-2]?)|[2-9])|(C[1-8])");
    pipes.add(new LongRegexMatches("SpinalParts", spinalSegment, 1, 2));
}

Example #16

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends the small word-list lexicon pipes to {@code pipes} and records the group
 * name "SmallLex" in {@code usedPipeNames}.
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes         list the new pipes are appended to
 * @param ignoreCase    passed on to every lexicon pipe
 * @throws FileNotFoundException declared by the lexicon pipe constructor
 */
public static void addSmallLexicons(List<String> usedPipeNames,
        List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException {
    usedPipeNames.add("SmallLex");

    // { feature name, lexicon file name } in registration order
    final String[][] lexicons = {
            { "chudlerListWord", "chudler.txt" },
            { "directionWord", "directions.txt" },
            { "extendedDirectionWord", "extendedDirections.txt" },
            { "stopWord", "stop.txt" },
    };
    for (String[] lexicon : lexicons) {
        File wordList = new File(LEXICON_HOME + lexicon[1]);
        pipes.add(new LexiconMembership(lexicon[0], wordList, ignoreCase));
    }
}

Example #17

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends the hand-made multi-token regex pipes to {@code pipes} and records the
 * group name "Handmade regexes" in {@code usedPipeNames}.
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes         list the new pipes are appended to
 */
public static void addHandMadeRegexPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws Exception {
    usedPipeNames.add("Handmade regexes");

    // the three arrays are index-aligned: feature name, regex, last numeric argument
    final String[] names = { "of_The", "part_Of", "neurnEnd", "nucleiEnd",
            "nclusEnd", "fieldEnd", "cortexEnd", "areaEnd", "territoryEnd" };
    final String[] regexes = { "of the", "part of", "(.* neurons)",
            "(.* nuclei)", "(.* nucleus)", "(.* field)", "(.* cortex)",
            "(.* area)", "(.* territory)|(.* territories)" };
    final int[] upperBounds = { 2, 2, 3, 3, 5, 4, 3, 4, 4 };

    for (int i = 0; i < names.length; i++) {
        // the first numeric argument is 2 for every pipe
        pipes.add(new LongRegexSpaced(names[i], compile(regexes[i]), 2,
                upperBounds[i]));
    }
}

Example #18

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends the two token-length pipes to {@code pipes} and records the group name
 * "Length" in {@code usedPipeNames}.
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes         list the new pipes are appended to
 */
public static void addLengthPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws Exception {
    usedPipeNames.add("Length");

    // length feature - binary bins
    final int[] binBoundaries = { 1, 2, 3, 5, 8, 11, 14, 18, 22 };
    pipes.add(new LengthBins("Length", binBoundaries));

    // from some calcs the average brain token is 6.92 while the outside is
    // 4.64 (~3.55 stdev), hence a single boundary at 6
    final int[] threshold = { 6 };
    pipes.add(new LengthBins("LengthThreshold", threshold));
}

Example #19

Source File: LDA.java From topic-detection with Apache License 2.0

5 votes

/**
 * Creates a list of Mallet instances from a list of documents.
 *
 * Every document is run through the same serial pipe: tokenization, lowercasing,
 * stopword removal and conversion to a feature sequence.
 *
 * @param texts a list of documents
 * @return a list of Mallet instances
 * @throws IOException
 */
private InstanceList createInstanceList(List<String> texts) throws IOException
{
	// the order of the steps matters: each one consumes the output of the previous one
	List<Pipe> steps = new ArrayList<Pipe>();
	steps.add(new CharSequence2TokenSequence());
	steps.add(new TokenSequenceLowercase());
	steps.add(new TokenSequenceRemoveStopwords());
	steps.add(new TokenSequence2FeatureSequence());

	InstanceList result = new InstanceList(new SerialPipes(steps));
	result.addThruPipe(new ArrayIterator(texts));
	return result;
}

Example #20

Source File: NGramPipeFactory.java From bluima with Apache License 2.0

5 votes

/**
 * Builds a trie lexicon pipe from the {@code gram}-grams of every non-blank line of
 * {@code inputFile}.
 *
 * The grams are written, one per line, to a temporary file that is deleted when the
 * JVM exits; the returned pipe is created from that file and is named
 * {@code name + "-" + gram + "-gram"}.
 *
 * Both streams are closed even when reading, writing or gram extraction fails, and
 * the input is read until {@code readLine()} returns null instead of relying on
 * {@code ready()}, which is not an end-of-file test.
 *
 * @param name       base name of the resulting pipe
 * @param inputFile  lexicon file with one entry per line
 * @param ignoreCase passed on to the lexicon pipe
 * @param gram       gram size handed to {@code getGrams}
 * @return the lexicon pipe over all extracted grams
 * @throws Exception if the input cannot be read or the temporary file cannot be written
 */
public static Pipe getNGramPipe(String name, File inputFile,
        boolean ignoreCase, int gram) throws Exception {
    File tempFile = File.createTempFile("ngram", ".txt");
    // Arrange for it to be deleted at exit.
    tempFile.deleteOnExit();

    BufferedReader br = new BufferedReader(new FileReader(inputFile));
    try {
        BufferedWriter bw = new BufferedWriter(new FileWriter(tempFile));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                if (line.equals(""))
                    continue; // ignore blank lines
                // getGrams may return null, in which case nothing is written
                String[] gramStrings = getGrams(line, gram);
                if (gramStrings != null) {
                    for (String gramString : gramStrings) {
                        bw.write(gramString);
                        bw.newLine();
                    }
                }
            }
        } finally {
            // closing also flushes, so the file is complete before it is read below
            bw.close();
        }
    } finally {
        br.close();
    }
    return new TrieLexiconMembership(name + "-" + gram + "-gram", tempFile,
            ignoreCase);
}

Example #21

Source File: ReferencesClassifierTrainer.java From bluima with Apache License 2.0

5 votes

/**
 * Assembles the list of pipes, in order: target to label, input to regex tokens,
 * token sequence to feature sequence, feature sequence to feature vector.
 *
 * @return the newly created list of pipes
 */
static List<Pipe> getPipes() {
    List<Pipe> pipeline = newArrayList();

    pipeline.add(new Target2Label());
    pipeline.add(new MyInput2RegexTokens());
    // for debugging: pipeline.add(new PrintInputAndTarget());

    // tokens -> feature sequence -> feature vector
    pipeline.add(new TokenSequence2FeatureSequence());
    pipeline.add(new FeatureSequence2FeatureVector());
    return pipeline;
}

Example #22

Source File: CorpusRepresentationMalletTarget.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Create a new, empty CorpusRepresentation from a FeatureInfo.
 *
 * The pipe consists of a single Noop pipe over a fresh data alphabet. A target
 * alphabet is only created for a nominal target; otherwise it is null.
 *
 * @param fi FeatureInfo instance
 * @param targetType type of target
 */
public CorpusRepresentationMalletTarget(FeatureInfo fi, TargetType targetType) {
  this.featureInfo = fi;
  this.scalingMethod = fi.getGlobalScalingMethod();
  this.targetType = targetType;

  LabelAlphabet targetAlphabet = null;
  if (targetType == TargetType.NOMINAL) {
    targetAlphabet = new LabelAlphabet();
  }

  List<Pipe> pipes = new ArrayList<>();
  pipes.add(new Noop(new LFAlphabet(), targetAlphabet));
  pipe = new LFPipe(pipes);
  pipe.setFeatureInfo(fi);
  instances = new LFInstanceList(pipe);
}

Example #23

Source File: CorpusRepresentationMalletLDA.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Create a new, empty CorpusRepresentation for LDA.
 *
 * The scaling method is fixed to NONE and the target type to NONE; the pipe consists
 * of a single Noop pipe over a fresh data alphabet and no target alphabet.
 *
 * @param fi FeatureInfo instance; according to the original author always null
 */
public CorpusRepresentationMalletLDA(FeatureInfo fi) {
  this.featureInfo = fi;  // always null
  // since a null feature info is always passed, the scaling method is hard-wired to NONE
  this.scalingMethod = ScalingMethod.NONE;

  // TODO: none of the following is really needed, figure out if it can be simplified;
  // keeping it should not do any harm though!
  List<Pipe> pipes = new ArrayList<>();
  pipes.add(new Noop(new LFAlphabet(), null));
  pipe = new LFPipe(pipes);
  pipe.setFeatureInfo(fi);
  instances = new LFInstanceList(pipe);
  targetType = TargetType.NONE;
}

Example #24

Source File: CorpusRepresentationMalletSeq.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Create a new, empty sequence CorpusRepresentation from a FeatureInfo.
 *
 * The pipe consists of a single Noop pipe over a fresh data alphabet and a fresh
 * label alphabet; the target type is always NOMINAL.
 *
 * @param fi FeatureInfo instance
 */
public CorpusRepresentationMalletSeq(FeatureInfo fi) {
  this.featureInfo = fi;
  this.scalingMethod = fi.getGlobalScalingMethod();

  List<Pipe> pipes = new ArrayList<>();
  pipes.add(new Noop(new LFAlphabet(), new LabelAlphabet()));
  pipe = new LFPipe(pipes);
  pipe.setFeatureInfo(fi);
  instances = new LFInstanceList(pipe);
  targetType = TargetType.NOMINAL;
}

Example #25

Source File: NGramPipeFactory.java From bluima with Apache License 2.0

4 votes

/**
 * Creates the n-gram pipes for all gram sizes, starting with 1-grams.
 *
 * @param name       base name handed to each n-gram pipe
 * @param inputFile  lexicon file the n-grams are taken from
 * @param ignoreCase passed on to each n-gram pipe
 * @return the pipes created by the four-argument overload with a start size of 1
 */
public static List<Pipe> getAllGramsPipes(String name, File inputFile,
        boolean ignoreCase) throws Exception {
    final int smallestGramSize = 1;
    List<Pipe> allGrams = getAllGramsPipes(name, inputFile, ignoreCase,
            smallestGramSize);
    return allGrams;
}

Example #26

Source File: NEPipes.java From bluima with Apache License 2.0

4 votes

/**
 * Creates the fixed array of general named-entity regex pipes and passes it to the
 * superclass constructor.
 *
 * {@code RegexMatches} pipes take a feature name and a pattern; {@code LongRegexMatches}
 * pipes additionally take two integer bounds. The regex strings themselves are
 * constants defined elsewhere in the class.
 *
 * NOTE: the "P5" feature used to be compiled from the P10 pattern (copy-paste error),
 * which made it a duplicate of "P10" with different bounds. It now uses P5.
 */
public NEPipes() {
    super(
            new Pipe[] {
                    //new TokenText( "text=" ),

                    // --- character class / casing / punctuation ---
                    new RegexMatches( "SingleLetter", Pattern.compile( "[A-Za-z]" ) ),
                    new RegexMatches( "AllCaps", Pattern.compile( ALLCAPS ) ),
                    new RegexMatches( "AllLower", Pattern.compile( ALLLOWER ) ),
                    new RegexMatches( "InitCaps", Pattern.compile( INITCAPS ) ),
                    new RegexMatches( "MixedCase", Pattern.compile( MIXEDCASE ) ),
                    new RegexMatches( "MixedNum", Pattern.compile( MIXEDNUM ) ),
                    new RegexMatches( "EndSentPunc", Pattern.compile( ENDSENTENCE ) ),
                    new RegexMatches( "Punc", Pattern.compile( PUNCTUATION ) ),
                    new RegexMatches( "Bracket", Pattern.compile( BRACKET ) ),
                    new RegexMatches( "Ordinal", Pattern.compile( ORDINAL, Pattern.CASE_INSENSITIVE ) ),

                    new LongRegexMatches( "Quoted", Pattern.compile( QUOTED ), 1, 4 ),
                    new LongRegexMatches( "Bracketed", Pattern.compile( BRACKETED ), 1, 4 ),
                    new LongRegexMatches( "Initial", Pattern.compile( INITIAL ), 2, 2 ),
                    new LongRegexMatches( "Ellipse", Pattern.compile( DOTS ), 1, 2 ),
                    new LongRegexMatches( "Dashes", Pattern.compile( DASHES ), 2, 2 ),
                    new LongRegexMatches( "Fraction", Pattern.compile( FRACTION ), 1, 3 ),
                    new LongRegexMatches( "DotDecimal", Pattern.compile( DOTDECIMAL ), 1, 3 ),

                    // --- numbers and money ---
                    new LongRegexMatches( "Percent", Pattern.compile( "(" + RANGE + "|" + DECIMAL + ")%" ), 2, 4 ),
                    new RegexMatches( "10^3n", Pattern.compile( ILLION, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "Numeric", Pattern.compile( DECIMAL ), 1, 3 ),
                    new LongRegexMatches( "BigNumber", Pattern.compile( COMMA_DECIMAL ), 1, 7 ),
                    new LongRegexMatches( "kmbNumber",
                            Pattern.compile( DECIMAL + ILLION, Pattern.CASE_INSENSITIVE ), 1, 4 ),
                    new RegexMatches( "kmbMixed", Pattern.compile( MIXED_ILLION, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "Dollars", Pattern.compile( "[$](" + RANGE + "|" + DECIMAL + "|"
                            + COMMA_DECIMAL + "|" + DECIMAL + ILLION + "|" + MIXED_ILLION + ")",
                            Pattern.CASE_INSENSITIVE ), 2, 8 ),

                    new RegexMatches( "NumberWord", Pattern.compile( NUMBER_WORD, Pattern.CASE_INSENSITIVE ) ),
                   //FIXME useful beyond this?
                    new RegexMatches( "Currency", Pattern.compile( CURRENCY, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "MoneyWords", Pattern.compile( MONEYWORDS, Pattern.CASE_INSENSITIVE ), 2,
                            4 ),

                    // --- times ---
                    new LongRegexMatches( "AmPm", Pattern.compile( AMPM, Pattern.CASE_INSENSITIVE ), 1, 4 ),
                    new RegexMatches( "MixedAmPm", Pattern.compile( MIXED_AMPM, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "TimeNum", Pattern.compile( TIMENUM ), 3, 5 ),
                    new RegexMatches( "TimeZone", Pattern.compile( TIMEZONES, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "Time", Pattern.compile( TIME, Pattern.CASE_INSENSITIVE ), 1, 9 ),
                    new LongRegexMatches( "TimeRange", Pattern.compile( TIMERANGE, Pattern.CASE_INSENSITIVE ), 3,
                            19 ),

                    // --- phone numbers ---
                    new LongRegexMatches( "P10", Pattern.compile( P10 ), 3, 7 ),
                    // fixed: this pipe was compiled from P10 although it is named P5
                    new LongRegexMatches( "P5", Pattern.compile( P5 ), 3, 3 ),
                    new LongRegexMatches( "Phone", Pattern.compile( P10 + "|" + P5 ), 3, 7 ),

                    // --- months and weekdays ---
                    new RegexMatches( "UncasedMonthName", Pattern.compile( MONTHNAME, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "UncasedMonthAbbr",
                            Pattern.compile( MONTHABBR, Pattern.CASE_INSENSITIVE ), 1, 2 ),
                    new LongRegexMatches( "CasedMonth", Pattern.compile( MONTH ), 1, 2 ),
                    new LongRegexMatches( "UncasedMonth", Pattern.compile( MONTH, Pattern.CASE_INSENSITIVE ), 1, 2 ),

                    new RegexMatches( "UncasedWeekdayName", Pattern.compile( WEEKDAYNAME, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "UncasedWeekdayAbbr", Pattern.compile( WEEKDAYABBR,
                            Pattern.CASE_INSENSITIVE ), 1, 2 ),
                    new LongRegexMatches( "CasedWeekday", Pattern.compile( WEEKDAY ), 1, 2 ),
                    new LongRegexMatches( "UncasedWeekday", Pattern.compile( WEEKDAY, Pattern.CASE_INSENSITIVE ),
                            1, 2 ),

                    // --- dates ---
                    new LongRegexMatches( "MonthDay", Pattern.compile( MONTHDAY, Pattern.CASE_INSENSITIVE ), 2, 3 ),
                    new LongRegexMatches( "DayMonthDay", Pattern.compile( DAYMONTHDAY, Pattern.CASE_INSENSITIVE ),
                            3, 6 ),
                    new LongRegexMatches( "MonthYear", Pattern.compile( MONTHYEAR, Pattern.CASE_INSENSITIVE ), 2, 4 ),
                    new LongRegexMatches( "MonthDayYear",
                            Pattern.compile( MONTHDAYYEAR, Pattern.CASE_INSENSITIVE ), 3, 5 ),
                    new LongRegexMatches( "DayMonthDayYear", Pattern.compile( DAYMONTHDAYYEAR,
                            Pattern.CASE_INSENSITIVE ), 4, 8 ),

                    new LongRegexMatches( "SeparatorDate", Pattern.compile( SEPDATE ), 3, 5 ),
                    new LongRegexMatches( "FullSeparatorDate", Pattern.compile( FULLSEPDATE ), 5, 5 ),
            } );
}

Example #27

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

4 votes

/**
 * Appends all pipes of a new {@code NEPipes} instance to {@code pipes} and records
 * the group name "Mallet NE" in {@code usedPipeNames}.
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes         list the new pipes are appended to
 */
public static void addMalletNEPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws Exception {
    usedPipeNames.add("Mallet NE");

    // random pipes from general NER
    NEPipes generalNerPipes = new NEPipes();
    pipes.addAll(generalNerPipes.pipes());
}

Example #28

Source File: Attributes.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Generate the attributes object from the information in the pipe.
 *
 * One attribute is created per entry of the pipe's data alphabet, plus one target
 * attribute whose index follows the last independent attribute. The pipe should be
 * a LFPipe; for an ordinary pipe no feature specification is available and every
 * attribute keeps its default settings (numeric, no alphabet).
 *
 * @param pipe  mallet pipe
 * @param instanceType instance type
 * @throws RuntimeException if a feature has no specification and is not a START/STOP
 *         feature, or if the specification has an unexpected type
 */
public Attributes(Pipe pipe, String instanceType) {
  // the independent variables come from the data alphabet
  Alphabet dataAlphabet = pipe.getDataAlphabet();

  // only a LFPipe carries the feature specification
  FeatureInfo featureInfo = null;
  if (pipe instanceof LFPipe) {
    featureInfo = ((LFPipe) pipe).getFeatureInfo();
  }

  // alphabet shared by all boolean attributes; "false" is entered before "true"
  LFAlphabet booleanAlph = new LFAlphabet();
  booleanAlph.lookupIndex("false");
  booleanAlph.lookupIndex("true");

  for (int index = 0; index < dataAlphabet.size(); index++) {
    String featureName = (String) dataAlphabet.lookupObject(index);

    // Start out with the defaults for datatype, code and alphabet. The default
    // type for features that indicate the presence of strings, ngrams etc. is
    // numeric, so that instead of 0/1 there can be counts, tf/idf or other scores.
    // The defaults are only overridden below if there is an explicit declaration.
    Attribute attr = new Attribute(
            featureName, index, Datatype.numeric, null, null, null);
    attributes.add(attr);
    name2index.put(featureName, index);

    if (featureInfo == null) {
      continue;
    }

    FeatureSpecAttribute spec =
            FeatureExtractionMalletSparse.lookupAttributeForFeatureName(
              featureInfo.getAttributes(),
              featureName,
              instanceType);

    if (spec == null) {
      // START/STOP features are created internally and have no specification;
      // they keep the default attribute. Anything else is an error.
      boolean isBoundaryFeature =
              featureName.endsWith(FeatureExtractionMalletSparse.START_SYMBOL) ||
              featureName.endsWith(FeatureExtractionMalletSparse.STOP_SYMBOL);
      if (!isBoundaryFeature) {
        throw new RuntimeException("FeatureSpecification is null for feature "+
              index+", name="+featureName+
              "\nFeatureSpecification is "+featureInfo);
      }
    } else if (spec instanceof FeatureSpecAttributeList) {
      FeatureSpecAttributeList listSpec = (FeatureSpecAttributeList) spec;
      attr.codeAs = listSpec.codeas;
      attr.mvTreatment = listSpec.missingValueTreatment;
      attr.datatype = listSpec.datatype;
      if (listSpec.datatype == Datatype.bool) {
        attr.alphabet = booleanAlph;
      } else if (listSpec.datatype == Datatype.nominal
              && listSpec.codeas == CodeAs.number) {
        attr.alphabet = listSpec.alphabet;
      }
    } else if (spec instanceof FeatureSpecSimpleAttribute) {
      FeatureSpecSimpleAttribute simpleSpec = (FeatureSpecSimpleAttribute) spec;
      attr.codeAs = simpleSpec.codeas;
      attr.mvTreatment = simpleSpec.missingValueTreatment;
      attr.datatype = simpleSpec.datatype;
      if (simpleSpec.datatype == Datatype.bool) {
        attr.alphabet = booleanAlph;
      } else if (simpleSpec.datatype == Datatype.nominal
              && simpleSpec.codeas == CodeAs.number) {
        attr.alphabet = simpleSpec.alphabet;
      }
    } else if (!(spec instanceof FeatureSpecNgram)) {
      // for an ngram specification there is nothing to override
      throw new RuntimeException(
              "Impossible: found odd FeatureSpecAttribute type "+spec.getClass());
    }
  }

  @SuppressWarnings("unchecked")
  LabelAlphabet targetAlphabet = (LabelAlphabet)pipe.getTargetAlphabet();
  // The target index is the next index after the last independent attribute
  // index. This is convenient for Weka.
  targetAttribute = new Attribute("target", attributes.size(), Datatype.numeric, null, null, null);
  // if the target alphabet exists, a nominal target is assumed
  if (targetAlphabet != null) {
    targetAttribute.alphabet = targetAlphabet;
    targetAttribute.datatype = Datatype.nominal;
  }
}

Example #29

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

4 votes

/**
 * Appends the "areawords" lexicon pipe to {@code pipes} and records the group name
 * "Areawords" in {@code usedPipeNames}.
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes         list the new pipe is appended to
 * @param ignoreCase    passed on to the lexicon pipe
 * @throws FileNotFoundException declared by the lexicon pipe constructor
 */
public static void addAreaLexicons(List<String> usedPipeNames,
        List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException {
    usedPipeNames.add("Areawords");

    File areaWords = new File(LEXICON_HOME + "areawords.txt");
    pipes.add(new LexiconMembership("areawords", areaWords, ignoreCase));
}

Example #30

Source File: ITEngineMalletClass.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * End-to-end test for a Mallet classification engine (algorithm MalletC45_CL_MR)
 * on the ionosphere test data.
 *
 * Steps: create the engine from the feature specification, add the "Mention"
 * annotations of the test document as training instances, train and save the
 * engine to the current directory, load it again, check that the restored corpus
 * representation and its LFPipe/FeatureInfo are present, apply the restored model to
 * the same document and compare the predicted "target" feature with the original
 * "class" feature. The accuracy is expected to be 0.9630 with a tolerance of 0.01.
 */
@Test
public void testEngineMalletClass1() throws MalformedURLException, ResourceInstantiationException {
  File configFile = new File("tests/cl-ionosphere/feats.xml");
  FeatureSpecification spec = new FeatureSpecification(configFile);
  FeatureInfo featureInfo = spec.getFeatureInfo();
  Engine engine = Engine.create(AlgorithmClassification.MalletC45_CL_MR, "", featureInfo, TargetType.NOMINAL, null);
  CorpusRepresentationMalletTarget crm = (CorpusRepresentationMalletTarget)engine.getCorpusRepresentation();
  System.err.println("TESTS: have engine "+engine);
  
  // load a document and train the model
  Document doc = loadDocument(new File("tests/cl-ionosphere/ionosphere_gate.xml"));
  System.err.println("TESTS: have document");
  
  // instances are the "Mention" annotations; no sequence or class annotation sets are used
  AnnotationSet instanceAS = doc.getAnnotations().get("Mention");
  AnnotationSet sequenceAS = null;
  AnnotationSet inputAS = doc.getAnnotations();
  AnnotationSet classAS = null;
  String targetFeature = "class";
  String nameFeature = null;
  crm.add(instanceAS, sequenceAS, inputAS, classAS, targetFeature, TargetType.NOMINAL, "", nameFeature, null);
  System.err.println("TESTS: added instances, number of instances now: "+crm.getRepresentationMallet().size());
  engine.trainModel(null,"","");
  System.err.println("TESTS: model trained");
  System.err.println("TESTS: engine before saving: "+engine);
  // the engine is written to the current working directory
  engine.saveEngine(new File("."));
  
  // Now check if we can restore the engine and thus the corpus representation
  Engine engine2 = Engine.load(new File(".").toURI().toURL(), "");
  System.err.println("RESTORED engine is "+engine2);
  
  // check if the corpusRepresentation has been restored correctly
  CorpusRepresentation cr2 = engine2.getCorpusRepresentation();
  assertNotNull(cr2);
  assertTrue(cr2 instanceof CorpusRepresentationMalletTarget);
  CorpusRepresentationMalletTarget crmc2 = (CorpusRepresentationMalletTarget)cr2;
  Pipe pipe = crmc2.getPipe();
  assertNotNull(pipe);
  assertTrue(pipe instanceof LFPipe);
  LFPipe lfpipe = (LFPipe)pipe;
  FeatureInfo fi = lfpipe.getFeatureInfo();
  assertNotNull(fi);
  
  // apply the restored model; the classifications are written to the "LF" annotation set
  AnnotationSet lfAS = doc.getAnnotations("LF");
  String parms = "";
  List<ModelApplication> gcs = engine2.applyModel(instanceAS, inputAS, sequenceAS, parms);
  System.err.println("Number of classifications: "+gcs.size());
  ModelApplication.applyClassification(doc, gcs, "target", lfAS, null);
  
  System.err.println("Original instances: "+instanceAS.size()+", classification: "+lfAS.size());
  
  // quick and dirty evaluation: go through all the original annotations, get the 
  // co-extensive annotations from LF, and compare the values from the "class" feature
  int total = 0;
  int correct = 0;
  for(Annotation orig : instanceAS) {
    total++;
    Annotation lf = gate.Utils.getOnlyAnn(gate.Utils.getCoextensiveAnnotations(lfAS, orig));
    //System.err.println("ORIG="+orig+", lf="+lf);
    if(orig.getFeatures().get("class").equals(lf.getFeatures().get("target"))) {
      correct++;
    }
  }
  
  // accuracy on the training document itself
  double acc = (double)correct / (double)total;
  System.err.println("Got total="+total+", correct="+correct+", acc="+acc);
  assertEquals(0.9630, acc, 0.01);
}