/*
 * Copyright 2017
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.experiments.pipeline.uima;

import cmu.arktweetnlp.Twokenize;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.component.CasAnnotator_ImplBase;

import java.util.List;

/**
 * Uses ArkTweetTokenizer but writes no sentence boundaries.
 * <p>
 * Each token returned by {@link Twokenize#tokenize(String)} is located in the document text
 * and added to the CAS as an annotation of the {@link Token} type.
 *
 * @author Ivan Habernal
 */
public class ArkTweetTokenizerFixed
        extends CasAnnotator_ImplBase
{
    /** CAS type used for the created token annotations; resolved in {@link #typeSystemInit}. */
    private Type tokenType;

    /**
     * Looks up the CAS type named after {@link Token} in the given type system.
     *
     * @param aTypeSystem the type system the CAS instances will use
     * @throws AnalysisEngineProcessException if the superclass initialization fails
     */
    @Override
    public void typeSystemInit(TypeSystem aTypeSystem)
            throws AnalysisEngineProcessException
    {
        super.typeSystemInit(aTypeSystem);
        tokenType = aTypeSystem.getType(Token.class.getName());
    }

    /**
     * Tokenizes the document text and adds one token annotation per token that can be located
     * in the text.
     * <p>
     * Tokens are aligned left to right: each one is searched for starting at the end of the
     * previously aligned token. A CAS without document text is left untouched. A token that
     * cannot be found verbatim in the remaining text is skipped, so that no annotation with a
     * negative offset is created and the search position for the following tokens stays valid.
     *
     * @param cas the CAS to annotate
     * @throws AnalysisEngineProcessException declared by the overridden method
     */
    @Override
    public void process(CAS cas)
            throws AnalysisEngineProcessException
    {
        String text = cas.getDocumentText();
        if (text == null) {
            // nothing to tokenize; the tokenizer itself would fail on null input
            return;
        }

        // NOTE: Twokenize provides an API call that performs a normalization first - this would
        // require a mapping to the text how it is present in the CAS object. Due to HTML escaping
        // that would become really messy, we use the call which does not perform any normalization
        List<String> tokens = Twokenize.tokenize(text);

        int offset = 0;
        for (String token : tokens) {
            int start = text.indexOf(token, offset);
            if (start < 0) {
                // The token is not a literal substring of the remaining text, so it cannot be
                // aligned. Skip it and keep the offset where it is.
                continue;
            }

            int end = start + token.length();
            createTokenAnnotation(cas, start, end);
            offset = end;
        }
    }

    /**
     * Creates an annotation of the token type over the given span and adds it to the CAS indexes.
     *
     * @param cas   the CAS to add the annotation to
     * @param start begin offset of the token in the document text (inclusive)
     * @param end   end offset of the token in the document text (exclusive)
     */
    private void createTokenAnnotation(CAS cas, int start, int end)
    {
        AnnotationFS tokenAnno = cas.createAnnotation(tokenType, start, end);
        cas.addFsToIndexes(tokenAnno);
    }
}