// Java source code of AnnotatedText

/*******************************************************************************
 * Copyright 2014 A3 lab (Dipartimento di Informatica, Università di Pisa)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package it.acubelab.tagme;

import it.unimi.dsi.lang.MutableString;

import java.io.IOException;
import java.io.Reader;
import java.util.List;

/**
 * A text to be annotated, kept both in its original form and in a cleaned form,
 * together with the mapping between the two and the annotations found so far.
 * <p>
 * Only {@link #original} is set by the constructors of this class; the other
 * fields are package-visible and are expected to be filled in elsewhere.
 */
public class AnnotatedText {
	
	/**
	 * Buffer size used by {@link #AnnotatedText(Reader)} when no explicit length is given.
	 */
	public static final int INIT_BUFFER_CAPACITY = 1024;

	/**
	 * The original text
	 */
	MutableString original;
	/**
	 * The cleaned text. The array could be greater than the actual text. See text_len
	 */
	char[] text;
	/**
	 * The length of the cleaned text
	 */
	int text_len;
	/**
	 * The offset of each cleaned char wrt the original text.
	 * Means that text[i] was originally at the position offsets[i] in the original text
	 */
	int[] offsets;
	/**
	 * Specify if the char at position i (of the cleaned text) was a breaking char
	 * See TagmeParser.breakingCharSet
	 */
	boolean[] breaking_pos;
	
	/**
	 * Positions of tags already defined for this text. Used only for testing.
	 * If null (initialization value), the text is not pre-tagged
	 */
	int[] pre_tagged_pos = null;

	/**
	 * @return the pre-tagged positions, or null if the text is not pre-tagged.
	 */
	public int[] getPreTaggedPos() {
		return pre_tagged_pos;
	}

	/**
	 * @param pre_tagged_pos the pre-tagged positions (stored as is, not copied).
	 */
	public void setPreTaggedPos(int[] pre_tagged_pos) {
		this.pre_tagged_pos = pre_tagged_pos;
	}

	/**
	 * List of all annotations, pruned, ignored, disambiguated...
	 * The list is sorted based on the position of the anchor in the text
	 */
	List<Annotation> annotations;
	
	/**
	 * Length of the window size (in terms of tokens) for anchor parsing
	 */
	int parsingWindow = TagmeParser.DEFAULT_WINDOW;
	/**
	 * Minimum link probability used for parsing
	 */
	float minLP = Anchor.MIN_LP;
	
	
	// LONG TEXT MANAGEMENT
	/**
	 * Whether this text is handled as a "long" text, i.e. through per-annotation
	 * disambiguation windows (see {@link #windows} and {@link #windowPruning}).
	 */
	boolean isLong;
	/** Index of the window start inside each row of {@link #windows}. */
	public static final int WIN_START = 0;
	/** Index of the window end (inclusive) inside each row of {@link #windows}. */
	public static final int WIN_END = 1;
	/**
	 * Disambiguation windows for all anchors in the text
	 * For each annotation at idx i of annotations,
	 * windows[i] contains the start and the end of the surrounding window for the annotation
	 * The window is expressed in terms of indexes of other annotation in the list
	 * The array has a dimension of [N][2], where N is the size of annotations list.
	 * A negative start marks an annotation that has no window (see {@link #isPruned(Annotation)}).
	 */
	int[][] windows;
	/**
	 * Information about pruning (substring pruning algorithm) for a certain window
	 * It could be that an anchor A could be pruned if you consider it in a certain window for anchor B
	 * (eg. because window for anchor B contains an anchor C that is super-string of A)
	 * But if you consider A in another window that doesn't contain C, A has not to be pruned.
	 * So windowPruning[B][A] specifies whether the anchor A in the window for anchor B has to be ignored for disambiguation
	 * The matrix has a dimension of [N][N], where N is the size of annotations list
	 */
	boolean [][] windowPruning;
	
	// NOTE: stripping of leading chars (a former "removedLeadingChars" counter) is disabled:
	// the original text is stored untouched, so no shift has to be applied to the offsets.
	
	/**Creates an annotated text object reading data from a reader, using a buffer of default capacity.
	 * @param reader where to read the data from.
	 * @throws IOException if an error occurred while reading from the reader.
	 */
	public AnnotatedText(Reader reader) throws IOException
	{
		this(reader, INIT_BUFFER_CAPACITY);
	}
	
	/**Creates an annotated text object reading data from a reader.
	 * The reader is consumed until its end, whatever the value of <code>length</code>,
	 * and it is not closed by this constructor.
	 * @param reader where to read the data from.
	 * @param length the expected length of the text; it sizes the read buffer and the
	 * initial capacity of the original text, it is not a limit on the amount of data read.
	 * @throws IllegalArgumentException if length is negative or equal to Integer.MAX_VALUE.
	 * @throws IOException if an error occurred while reading from the reader.
	 */
	public AnnotatedText(Reader reader, int length) throws IOException
	{
		// A negative length would produce a negative-sized (or, for -1, an empty) buffer:
		// reading into an empty buffer returns 0 forever and the loop below would never end.
		// Integer.MAX_VALUE is rejected too, because length+1 would overflow.
		if (length < 0 || length == Integer.MAX_VALUE)
			throw new IllegalArgumentException("Invalid text length: " + length);
		
		final int capacity = length + 1;
		original = new MutableString(capacity);
		final char[] buffer = new char[capacity];
		int read;
		// read() returns -1 at the end of the stream
		while ((read = reader.read(buffer, 0, capacity)) >= 0)
			original.append(buffer, 0, read);
	}
	
	/** Creates an annotated text from a given string. The string is stored as is:
	 * leading chars that are not letters nor digits are no longer skipped.
	 * @param text the text to annotate.
	 */
	public AnnotatedText(String text){
		this.original = new MutableString(text);
	}
	
	
	/**
	 * @param s an annotation whose start is an index in the cleaned text.
	 * @return the position in the original text of the first char of the annotation.
	 */
	public int getOriginalTextStart(Annotation s){
		// No leading-chars shift is needed: the original text is stored unmodified.
		return offsets[s.start];
	}
	
	/**
	 * @param s an annotation whose end is an (exclusive) index in the cleaned text.
	 * @return the (exclusive) end position of the annotation in the original text.
	 */
	public int getOriginalTextEnd(Annotation s){
		/*
		 * We need to find the position of the last character of the clean text in the original text
		 * and then add 1 (interval end is exclusive).
		 * The former behavior, no longer needed, mapped the end index directly through the offsets
		 * and had to special-case an end falling outside the offsets array (end of the real text)
		 * or a negative offset (end of the cleaned text, with trailing chars stripped).
		 */
		return offsets[s.end-1]+1;
	}
	
	/**
	 * @param a an annotation whose start/end are indexes in the cleaned text.
	 * @return the portion of the original text covered by the annotation.
	 */
	public String getOriginalText(Annotation a){
		// No leading-chars shift is needed here: offsets already refer to the stored original text.
		return original.subSequence(offsets[a.start], offsets[a.end-1]+1).toString();
	}

	/**
	 * @param a an annotation whose start/end are indexes in the cleaned text.
	 * @return the portion of the cleaned text covered by the annotation.
	 */
	public String getText(Annotation a){
		return new String(text, a.start, a.end-a.start);
	}

	/**
	 * @return the whole cleaned text (the first text_len chars of the text array).
	 */
	public String getText(){
		return new String(text, 0, text_len);
	}
	
	/**
	 * @return the list of annotations of this text (the internal list, not a copy).
	 */
	public List<Annotation> getAnnotations(){
		return this.annotations;
	}
	
	/**
	 * @return a new iterator over the disambiguation windows of this text.
	 */
	public AnnotationWindow getWindowIterator(){
		return new AnnotationWindow();
	}

	/**
	 * @return true if this text is handled through per-annotation windows.
	 */
	public boolean isLong(){
		return isLong;
	}

	/**
	 * @return the original text (the internal mutable object, not a copy).
	 */
	public MutableString getOriginal(){
		return original;
	}
	
	/**
	 * Tells whether an annotation is excluded from disambiguation.
	 * For short texts this is given by the ignored/pruned flags of the annotation;
	 * for long texts by a negative window start for the annotation.
	 * @param a the annotation to check.
	 * @return true if the annotation is pruned.
	 * @throws IllegalArgumentException if the text is long and the annotation is not in the annotations list.
	 */
	public boolean isPruned(Annotation a)
	{
		if (!isLong) return a.ignored || a.pruned;
		
		int id = annotations.indexOf(a);
		if (id < 0)
			throw new IllegalArgumentException("The annotation does not belong to this text");
		return windows[id][WIN_START] < 0;
	}
	
	/**
	 * Used to iterate through the list of disambiguation windows.
	 * Usage:
	 * <ol>
	 * <li>call setStartAnnotation to initialize the iterator to a given anchor A</li>
	 * <li>call next() to move the iterator</li>
	 * <li>if next() has returned true, use current() to retrieve the anchor in the window</li>
	 * <li>iterate until next() returns false</li>
	 * </ol>
	 * To iterate the window of another anchor, re-call setStartAnnotation
	 */
	public final class AnnotationWindow
	{
		/** Index (in the annotations list) of the annotation whose window is being iterated. */
		int annot;
		/** Index (in the annotations list) of the annotation currently pointed by the iterator. */
		int cursor;
		/** True if the window has nothing to iterate, see setStartAnnotation. */
		boolean empty;
		/** If true, pruned/ignored annotations are returned as well. */
		boolean all = false;

		/**
		 * Initializes the iterator on the window of the given annotation, skipping pruned annotations.
		 * @param a index of the annotation in the annotations list.
		 */
		public void setStartAnnotation(int a){
			setStartAnnotation(a, false);
		}

		/**
		 * Initializes the iterator on the window of the given annotation.
		 * The window is empty if the annotation is pruned or ignored or, for long texts,
		 * if its window start is negative.
		 * @param a index of the annotation in the annotations list.
		 * @param all if true, pruned/ignored annotations in the window are iterated too.
		 */
		public void setStartAnnotation(int a, boolean all){
			annot = a;
			// the cursor is placed one step before the first element, since next() increments it first
			cursor = isLong? windows[a][WIN_START]-1 : -1;
			Annotation an = annotations.get(a);
			empty = an.pruned || an.ignored || (isLong && windows[a][WIN_START]<0);
			this.all = all;
		}

		/**
		 * @return true if the current window has no annotations to iterate.
		 */
		public boolean empty(){
			return empty;
		}

		/**
		 * Moves the iterator to the next annotation of the window, always skipping the start
		 * annotation itself and, unless all annotations were requested, the pruned ones.
		 * @return true if the iterator points to a valid annotation, false if the window is over.
		 */
		public boolean next(){
			if (empty) return false;
			cursor++;
			if (isLong){
				// long text: the window is the inclusive range windows[annot][WIN_START..WIN_END]
				while (cursor <= windows[annot][WIN_END] &&
						(!all && windowPruning != null && windowPruning[annot][cursor] || cursor==annot)
					)
					cursor++;
				
				return cursor <= windows[annot][WIN_END];
				
			} else {
				// short text: the window is the whole annotations list
				while (cursor < annotations.size() &&
						(!all && (annotations.get(cursor).ignored || annotations.get(cursor).pruned) || cursor==annot)
					)
					cursor++;
				return cursor < annotations.size();
			}
		}

		/**
		 * @return the annotation currently pointed by the iterator; valid only after next() returned true.
		 */
		public Annotation current(){
			return annotations.get(cursor);
		}

		/**
		 * @return true if the current annotation is pruned with respect to the window being iterated
		 * (long texts) or is flagged as ignored/pruned (short texts).
		 */
		public boolean currentIsPruned(){
			return isLong?
					windowPruning != null && windowPruning[annot][cursor] :
					(annotations.get(cursor).ignored || annotations.get(cursor).pruned);
		}
		
	}
}