Java Code Examples for it.unimi.dsi.lang.MutableString#array()

The following examples show how to use it.unimi.dsi.lang.MutableString#array(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: tagme   File: Chars.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Converts a string from UTF-16 to ASCII, in place.
 *
 * <p>Each of the first {@code input.length()} characters of the array returned by
 * {@code input.array()} is overwritten with the entry of {@code UTF16toASCII.MAP}
 * found at the index given by that character's numeric value. No new string is created.
 *
 * @param input the string to convert; the array it exposes is modified directly.
 */
public static void normalizeASCII(MutableString input)
{
	final char[] buffer = input.array();
	final int end = input.length();

	int pos = 0;
	while (pos < end) {
		// The char is widened to int when used as an index into the mapping table.
		buffer[pos] = UTF16toASCII.MAP[buffer[pos]];
		pos++;
	}
}
 
Example 2
Source Project: tagme   File: WikipediaAnchorParser.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Removes punctuation from an anchor, keeping only letters, digits and the characters
 * that the caller explicitly asks to preserve.
 *
 * <p>The input is scanned as alternating runs of kept and dropped characters. Every
 * non-empty run of kept characters is appended to the result; a single space is inserted
 * before a run whenever the result is not empty. The input string is only read.
 *
 * @param input the anchor text to clean.
 * @param ignoreChars characters (neither digits nor letters) that are kept instead of being
 *        removed; may be {@code null}.
 * @param ignoreSequences if true, a non-whitespace character that equals one of its immediate
 *        neighbours (e.g. '!!!') is kept.
 * @return A new MutableString
 */
public static MutableString removePunctuations(MutableString input, String ignoreChars, boolean ignoreSequences)
{
	final int length = input.length();
	final char[] chars = input.array();
	final MutableString cleaned = new MutableString(length);

	int pos = 0;
	int runStart = 0;
	while (pos < length)
	{
		// Advance over a run of characters that must be kept.
		while (pos < length && isKeptAnchorChar(chars, length, pos, ignoreChars, ignoreSequences)) {
			pos++;
		}

		if (pos > runStart) {
			if (cleaned.length() > 0) {
				cleaned.append(' ');
			}
			cleaned.append(chars, runStart, pos - runStart);
		}

		// Skip the dropped characters; the next kept run starts right after the last one skipped.
		while (pos < length && !isKeptAnchorChar(chars, length, pos, ignoreChars, ignoreSequences)) {
			pos++;
			runStart = pos;
		}
	}
	return cleaned;
}

/**
 * Tells whether the character at {@code pos} survives punctuation removal.
 *
 * @param chars the characters being scanned.
 * @param length the number of valid characters in {@code chars}.
 * @param pos the index of the character to test; must be in {@code [0, length)}.
 * @param ignoreChars extra characters to keep; may be {@code null}.
 * @param ignoreSequences whether repeated non-whitespace characters are kept.
 * @return {@code true} if the character is a letter, a digit, one of {@code ignoreChars},
 *         or (when {@code ignoreSequences} is set) a non-whitespace character equal to the
 *         character immediately before or after it.
 */
private static boolean isKeptAnchorChar(char[] chars, int length, int pos, String ignoreChars, boolean ignoreSequences)
{
	final char c = chars[pos];
	if (Character.isLetter(c) || Character.isDigit(c)) {
		return true;
	}
	if (ignoreChars != null && ignoreChars.indexOf(c) >= 0) {
		return true;
	}
	if (!ignoreSequences || Character.isWhitespace(c)) {
		return false;
	}
	final boolean sameAsPrevious = pos > 0 && chars[pos - 1] == c;
	final boolean sameAsNext = pos < length - 1 && chars[pos + 1] == c;
	return sameAsPrevious || sameAsNext;
}
 
Example 3
Source Project: tagme   File: WikipediaAnchorParser.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Manages dots, removing them if they are part of an abbreviation, or replacing them with
 * whitespaces if they are the last char of a word.
 *
 * <p>Dots and whitespace characters are never copied to the result. Runs of any other
 * characters are copied in order. A single space is placed before a run only when the
 * result is not empty and the separator run just before it contained at least one of:
 * <ul>
 * <li>a whitespace character;</li>
 * <li>a dot at index {@code i < len-2} whose character at {@code i+2} is neither a dot
 *     nor whitespace;</li>
 * <li>a dot at index {@code i == len-2}, with {@code i > 1}, whose character at {@code i-2}
 *     is neither a dot nor whitespace.</li>
 * </ul>
 * Otherwise adjacent runs are concatenated without any separator.
 *
 * @param input the text to process; it is only read.
 * @return a new MutableString holding the processed text.
 */
public static MutableString removeDots(MutableString input)
{
	final int length = input.length();
	final char[] chars = input.array();
	final MutableString out = new MutableString(length);

	boolean needSpace = false;
	int pos = 0;
	int runStart = 0;
	while (pos < length)
	{
		// Advance over a run of characters that are neither dots nor whitespace.
		while (pos < length && !(chars[pos] == '.' || Character.isWhitespace(chars[pos]))) {
			pos++;
		}

		if (pos > runStart) {
			if (needSpace && out.length() > 0) {
				out.append(' ');
			}
			out.append(chars, runStart, pos - runStart);
		}

		// Consume the separator run, working out whether it acts as a word boundary.
		needSpace = false;
		while (pos < length && (chars[pos] == '.' || Character.isWhitespace(chars[pos])))
		{
			boolean boundary = Character.isWhitespace(chars[pos]);
			if (!boundary && pos < length - 2) {
				// Look two characters ahead of the dot.
				final char ahead = chars[pos + 2];
				boundary = ahead != '.' && !Character.isWhitespace(ahead);
			}
			if (!boundary && pos == length - 2 && pos > 1) {
				// Nothing two ahead: look two characters behind the dot instead.
				final char behind = chars[pos - 2];
				boundary = behind != '.' && !Character.isWhitespace(behind);
			}
			if (boundary) {
				needSpace = true;
			}
			pos++;
			runStart = pos;
		}
	}
	return out;
}
 
Example 4
/**
 * Replaces entities with the corresponding characters.
 *
 * <P>This method modifies the mutable string <code>s</code>: for every <code>&amp;</code>
 * at which {@link #scanEntity(char[], int, int, boolean, MutableString)} returns a value
 * other than -1, the range starting at the ampersand and ending at the returned value is
 * replaced with <code>lastEntity</code> (presumably set by the scan; confirm against
 * <code>scanEntity</code>).
 *
 * @param s a mutable string whose entities will be replaced by the corresponding characters.
 * @param entity a support mutable string used by {@link #scanEntity(char[], int, int, boolean, MutableString)}.
 * @param loose a parameter that will be passed to {@link #scanEntity(char[], int, int, boolean, MutableString)}.
 */
protected void replaceEntities( final MutableString s, final MutableString entity, final boolean loose ) {

	final char[] chars = s.array();
	int currentLength = s.length();

	/* The string is walked from its last character to its first, so that the
	 * position being examined is always a valid index even as the string changes. */
	for ( int pos = currentLength - 1; pos >= 0; pos-- ) {
		if ( chars[ pos ] != '&' ) continue;

		final int entityEnd = scanEntity( chars, pos, currentLength - pos, loose, entity );
		if ( entityEnd != -1 ) {
			// Track the new length so that later scans are bounded correctly.
			currentLength = s.replace( pos, entityEnd, lastEntity ).length();
		}
	}
}
 
Example 5
Source Project: tagme   File: WikiTextExtractor.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Flattens line-structured markup (presumably wiki text; confirm against callers) into a
 * single run of sentences.
 *
 * <p>The input is read line by line. Each line is trimmed, empty lines are skipped, and
 * the first character selects the treatment:
 * <ul>
 * <li>{@code '='}: leading and trailing spaces and {@code '='} are stripped; the remainder
 *     is appended followed by {@code ". "}.</li>
 * <li>{@code '*'}, {@code '#'}, {@code ':'}, {@code ';'}: leading occurrences of those
 *     characters and of spaces are stripped; the remainder is appended as a sentence.</li>
 * <li>{@code '{'}, {@code '|'}: the line is ignored.</li>
 * <li>{@code '.'}, {@code '-'}: {@code delete(char[])} is invoked on the line with those two
 *     characters; whatever is left, if not empty, is appended as a sentence.</li>
 * <li>anything else: the line is appended as a sentence unless its last character is
 *     {@code '}'}.</li>
 * </ul>
 * "Appended as a sentence" means that a {@code '.'} is added unless the text already ends
 * with one of {@code ":.;,-"}, and then a space is added. The final trailing character of
 * the result is removed before returning.
 *
 * @param input the text to process.
 * @param onlyAbstract if true, processing stops as soon as, at the start of a line, the
 *        collected text is longer than {@code MIN_ABSTRACT_CHARS}.
 * @return the collected text.
 */
public MutableString removeStructure(MutableString input, boolean onlyAbstract)
{
	final MutableString lineBuffer = new MutableString(1024);
	final FastBufferedReader reader = new FastBufferedReader(input);

	final MutableString text = new MutableString(2048);
	final String punts = ":.;,-";

	try {
		while (reader.readLine(lineBuffer) != null)
		{
			if (text.length() > MIN_ABSTRACT_CHARS && onlyAbstract) {
				// Drop the trailing character before handing back the abstract.
				text.deleteCharAt(text.length() - 1);
				return text;
			}

			final MutableString trimmed = lineBuffer.trim();
			if (trimmed.length() == 0) {
				continue;
			}

			final char[] lineChars = trimmed.array();
			final int lineLength = trimmed.length();

			switch (trimmed.charAt(0))
			{
			case '=': {
				final String skipped = " =";
				int from = 0;
				while (from < lineLength && skipped.indexOf(lineChars[from]) >= 0) {
					from++;
				}
				int to = lineLength - 1;
				while (to >= 0 && skipped.indexOf(lineChars[to]) >= 0) {
					to--;
				}

				// NOTE(review): a single remaining character (from == to) is skipped; confirm this is intended.
				if (from < to) {
					text.append(trimmed.subSequence(from, to + 1));
					text.append(". ");
				}
				break;
			}

			case '*':
			case '#':
			case ':':
			case ';': {
				final String skipped = "*#:; ";
				int from = 0;
				while (from < lineLength && skipped.indexOf(lineChars[from]) >= 0) {
					from++;
				}

				// NOTE(review): a single remaining character is skipped here as well; confirm this is intended.
				if (from < lineLength - 1) {
					text.append(trimmed.subSequence(from, lineLength));
					endSentence(text, punts);
				}
				break;
			}

			case '{':
			case '|':
				break;

			case '.':
			case '-': {
				trimmed.delete(new char[]{'.', '-'});
				if (trimmed.length() > 0) {
					text.append(trimmed);
					endSentence(text, punts);
				}
				break;
			}

			default: {
				if (trimmed.lastChar() != '}') {
					text.append(trimmed);
					endSentence(text, punts);
				}
			}
			}
		}
	} catch (IOException ignored) {
		// Best effort: whatever was collected before the read failure is still returned.
	}

	if (text.length() > 0) {
		text.deleteCharAt(text.length() - 1);
	}
	return text;
}

/**
 * Terminates the sentence at the end of {@code text}: appends a {@code '.'} unless the
 * last character already is one of {@code punctuation}, then appends a space.
 *
 * @param text the text being built; must not be empty.
 * @param punctuation the characters that already count as a sentence terminator.
 */
private void endSentence(MutableString text, String punctuation)
{
	if (punctuation.indexOf(text.lastChar()) < 0) {
		text.append('.');
	}
	text.append(' ');
}
 
Example 6
/**
 * Creates a bit vector over the characters of a mutable string, counting
 * {@link Character#SIZE} bits per character.
 *
 * <p>The array returned by {@code s.array()} is stored as is. {@code actualEnd} is the
 * number of bits covered by the string; {@code length} is the same value, extended by one
 * extra character's worth of bits when {@code prefixFree} is set.
 *
 * @param s the mutable string to expose as bits.
 * @param prefixFree whether the vector length includes one additional character of bits.
 */
public Utf16MutableStringBitVector( final MutableString s, final boolean prefixFree ) {
	this.a = s.array();
	this.actualEnd = s.length() * Character.SIZE;
	this.length = prefixFree ? actualEnd + Character.SIZE : actualEnd;
}
 
Example 7
/**
 * Creates a bit vector over the characters of a mutable string, counting
 * {@link Byte#SIZE} bits per character.
 *
 * <p>The array returned by {@code s.array()} is stored as is. {@code actualEnd} is the
 * number of bits covered by the string; {@code length} is the same value, extended by one
 * extra byte's worth of bits when {@code prefixFree} is set.
 *
 * @param s the mutable string to expose as bits.
 * @param prefixFree whether the vector length includes one additional byte of bits.
 */
public ISOMutableStringBitVector( final MutableString s, final boolean prefixFree ) {
	this.a = s.array();
	this.actualEnd = s.length() * Byte.SIZE;
	this.length = prefixFree ? actualEnd + Byte.SIZE : actualEnd;
}
 
Example 8
/** Creates a new fast buffered reader by wrapping a given mutable string and using a set of additional word constituents.
 *
 * <p>This constructor delegates to the array-based constructor, passing the array returned by
 * {@code s.array()}, offset 0 and {@code s.length()} as the number of characters; no copy of
 * the characters is made here.
 *
 * <p>The effect of {@link #setReader(Reader)} on a buffer created with
 * this constructor is undefined.
 *
 * @param s the mutable string that will be wrapped by the reader.
 * @param wordConstituents a set of characters that will be considered word constituents.
 */
public FastBufferedReader( final MutableString s, final CharSet wordConstituents ) {
	this( s.array(), 0, s.length(), wordConstituents );
}
 
Example 9
/** Creates a new fast buffered reader by wrapping a given mutable string.
 *
 * <p>This constructor delegates to the array-based constructor, passing the array returned by
 * {@code s.array()}, offset 0 and {@code s.length()} as the number of characters; no copy of
 * the characters is made here.
 *
 * <p>The effect of {@link #setReader(Reader)} on a buffer created with
 * this constructor is undefined.
 *
 * @param s the mutable string that will be wrapped by the reader.
 */
public FastBufferedReader( final MutableString s ) {
	this( s.array(), 0, s.length() );
}