Java Code Examples for it.unimi.dsi.lang.MutableString#length()

The following examples show how to use it.unimi.dsi.lang.MutableString#length(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source Project: tagme   File: Chars.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Cuts {@code input} into pieces at every occurrence of the delimiter {@code c}.
 * The delimiter itself never appears in a piece.
 *
 * <p>When the delimiter does not occur at all, the result is a one-element array holding
 * {@code input} itself. A delimiter at position 0, or one that directly follows another
 * delimiter, contributes an empty string; a delimiter that is the very last character does
 * not contribute a trailing empty string.
 *
 * @param input the string to split.
 * @param c the delimiter character.
 * @return the pieces, in order of appearance.
 */
public static CharSequence[] split (MutableString input, char c){
	// Fast path: nothing to split on.
	if (input.indexOf(c) < 0) {
		return new CharSequence[]{ input };
	}

	final ObjectArrayList<CharSequence> pieces = new ObjectArrayList<CharSequence>();
	int from = 0;
	for (int hit = input.indexOf(c, from); hit >= 0; hit = input.indexOf(c, from)) {
		if (from < hit) {
			pieces.add(input.subSequence(from, hit));
		} else {
			// Two adjacent delimiters (or a leading one) delimit an empty piece.
			pieces.add(new MutableString(""));
		}
		from = hit + 1;
	}

	// Whatever follows the last delimiter, if anything.
	if (from < input.length()) {
		pieces.add(input.subSequence(from, input.length()));
	}
	return pieces.toArray(Chars.EMPTY_STRINGS);
}
 
Example 2
/** Copies the element at the given position of this front-coded list into a mutable string.
 *
 * <p>In UTF-8 mode the element is fetched as a byte array and decoded into the backing array of
 * <code>s</code>. Otherwise the characters are written straight into the backing array of
 * <code>s</code>; a negative result from the underlying list is treated as "array too short by
 * that many characters" (presumably the contract of the char front-coded list &ndash; confirm),
 * in which case <code>s</code> is enlarged and the element is fetched a second time.
 *
 * @param index an index in the list.
 * @param s a mutable string that will contain the string at the specified position.
 */
public void get( final int index, MutableString s ) { 
	if ( ! utf8 ) {
		final int stored = charFrontCodedList.get( index, s.array() );
		if ( stored >= 0 ) {
			s.length( stored );
			return;
		}
		// Not enough room: grow the string by the missing amount and decode again.
		s.length( s.array().length - stored );
		charFrontCodedList.get( index, s.array() );
		return;
	}

	final byte[] encoded = byteFrontCodedList.getArray( index );
	s.length( countUTF8Chars( encoded ) );
	byte2Char( encoded, s.array() );
}
 
Example 3
Source Project: tagme   File: Chars.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Rewrites, in place, every character of {@code input} with the entry that
 * {@code UTF16toASCII.MAP} holds for that character's code unit.
 *
 * @param input the string to convert; its backing array is modified directly.
 */
public static void normalizeASCII(MutableString input)
{
	final char[] backing = input.array();
	final int limit = input.length();
	int at = 0;
	while (at < limit) {
		// The char value is used directly as the index into the lookup table.
		backing[at] = UTF16toASCII.MAP[backing[at]];
		at++;
	}
}
 
Example 4
Source Project: tagme   File: WikipediaAnchorParser.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Strips punctuation from an anchor: only runs of "kept" characters survive, and consecutive
 * runs are joined by a single blank in the result.
 *
 * <p>A character is kept when it is a letter, a digit, one of {@code ignoreChars}, or, if
 * {@code ignoreSequences} is set, a non-whitespace character equal to one of its neighbours.
 *
 * @param input the text to clean; it is only read, never modified.
 * @param ignoreChars characters (other than letters and digits) that must be preserved; may be null.
 * @param ignoreSequences if true, repeated occurrences of the same character (e.g. '!!!') are preserved.
 * @return A new MutableString
 */
public static MutableString removePunctuations(MutableString input, String ignoreChars, boolean ignoreSequences)
{
	final int len = input.length();
	final char[] array = input.array();
	final MutableString norm = new MutableString(len);

	int runStart = 0;
	int i = 0;
	while (i < len)
	{
		// Walk to the end of the current run of kept characters.
		while (i < len && isKeptChar(array, i, len, ignoreChars, ignoreSequences)) {
			i++;
		}

		if (i > runStart) {
			if (norm.length() > 0) {
				norm.append(' ');
			}
			norm.append(array, runStart, i - runStart);
		}

		// Skip the characters to drop; the next run starts right after them.
		while (i < len && !isKeptChar(array, i, len, ignoreChars, ignoreSequences)) {
			i++;
			runStart = i;
		}
	}
	return norm;
}

/** Tells whether the character at position {@code i} must be preserved by {@code removePunctuations}. */
private static boolean isKeptChar(char[] array, int i, int len, String ignoreChars, boolean ignoreSequences)
{
	final char ch = array[i];
	if (Character.isLetter(ch) || Character.isDigit(ch)) {
		return true;
	}
	if (ignoreChars != null && ignoreChars.indexOf(ch) >= 0) {
		return true;
	}
	if (!ignoreSequences || Character.isWhitespace(ch)) {
		return false;
	}
	final boolean sameAsPrevious = i > 0 && array[i-1] == ch;
	final boolean sameAsNext = i < len-1 && array[i+1] == ch;
	return sameAsPrevious || sameAsNext;
}
 
Example 5
Source Project: tagme   File: WikipediaAnchorParser.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Manage dots, removing them if they are part of an abbreviation, or replacing them with
 * whitespaces if they are the last char of a word.
 * <p>
 * The input is scanned as an alternation of words (runs of chars that are neither '.' nor
 * whitespace) and separators (runs of '.' and whitespace). Words are copied to the result;
 * separators are dropped, and a single blank is inserted before the next word only when the
 * separator was recognised as a word boundary (see the inline comments). The input itself is
 * only read.
 * @param input the string to process
 * @return a new MutableString holding the processed text
 */
public static MutableString removeDots(MutableString input)
{
	int len = input.length();
	char[] array = input.array();
	MutableString res = new MutableString(len);

	// True when the separator run just scanned must be rendered as a blank before the next word.
	boolean isLastDot = false;
	// i is the scan position; last is where the word currently being scanned starts.
	int i=0, last=0;
	while(i<len)
	{
		// Advance over a word.
		while(i<len && array[i]!='.' && !Character.isWhitespace(array[i]))
			i++;

		// Copy the word, preceded by a blank if the previous separator was a word boundary
		// (never at the very beginning of the result).
		if (i>last) {
			if (isLastDot && res.length() > 0) res.append(' ');
			res.append(array, last, i-last);
		}

		// Scan the separator run that follows, deciding whether it is a word boundary.
		isLastDot = false;
		while(i<len && (array[i]=='.' || Character.isWhitespace(array[i])))
		{
			// The separator char at i marks a boundary when it is whitespace, or when the char two
			// positions ahead is neither a dot nor whitespace, or (for the second-to-last char only)
			// when the char two positions back is neither a dot nor whitespace.
			// Presumably this is what tells "u.s.a" (dots dropped) from "mr. smith" (dot becomes a blank).
			if (Character.isWhitespace(array[i]) ||
					(i<len-2 && array[i+2]!='.' && !Character.isWhitespace(array[i+2])) ||
					(i==len-2 && i>1 && array[i-2]!='.' && !Character.isWhitespace(array[i-2]))
					)
				isLastDot = true;
			i++;
			last=i;
		}
	}
	return res;


}
 
Example 6
Source Project: tagme   File: WikiTextExtractor.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the links listed in a disambiguation page.
 * <p>
 * The text is read line by line. Only lines that, once trimmed, start with one or more
 * <code>'*'</code> bullet markers are considered: the markers are stripped, the remainder is
 * trimmed and passed to <code>extractLinkFromCleanedLine</code>, and the first link found on the
 * line (if any) is added to the result.
 *
 * @param cleanText the cleaned text of the page.
 * @return the first link of every bulleted line, in order of appearance; possibly empty.
 */
public List<WikiLink> extractDisambiguationLinks(MutableString cleanText)
	{
		FastBufferedReader tokenizer = new FastBufferedReader(cleanText);
		MutableString buffer = new MutableString(1024);
		List<WikiLink> links = new ArrayList<WikiLink>();
		
		try {
			while(tokenizer.readLine(buffer) != null)
			{
				buffer.trim();
				if (buffer.length() == 0) continue;
				if (buffer.charAt(0) != '*') continue;
				
				// Find the end of the run of leading bullet markers.
				int start = 1;
				while (start < buffer.length() && buffer.charAt(start) == '*') start++;
				
				// Drop the markers and keep the text that follows them. The range must be
				// [0, start): deleting [start, length) would keep only the asterisks.
				buffer.delete(0, start).trim();
				if (buffer.length() == 0) continue;
				
				List<WikiLink> lineLinks = extractLinkFromCleanedLine(buffer);
				if (!lineLinks.isEmpty()) links.add(lineLinks.get(0));
			}
		} catch (IOException ignored) {
			// Best effort, as before: whatever was collected so far is returned.
		}
		
		return links;
		
	}
 
Example 7
Source Project: BUbiNG   File: RuntimeConfiguration.java    License: Apache License 2.0 5 votes vote down vote up
/** Adds a (or a set of) new IPv4 to the black list; the IPv4 can be specified directly or it can be a file (prefixed by
 *  <code>file:</code>) containing one address per line. Empty specifications and empty lines are skipped.
 *
 * <p>When a file is given, it is read as ISO-8859-1 and closed before this method returns,
 * whether it completes normally or with an exception.
 *
 * @param spec the specification (an IP address, or a file prefixed by <code>file:</code>).
 * @throws ConfigurationException
 * @throws FileNotFoundException if the file named by a <code>file:</code> specification cannot be opened.
 */
public void addBlackListedIPv4(final String spec) throws ConfigurationException, FileNotFoundException {
		if (spec.length() == 0) return; // Skip empty specs
		if (!spec.startsWith("file:")) {
			blackListedIPv4Addresses.add(handleIPv4(spec));
			return;
		}

		// Opened outside the try block so that a FileNotFoundException propagates unchanged.
		final FileInputStream in = new FileInputStream(spec.substring(5));
		try {
			final LineIterator lineIterator = new LineIterator(new FastBufferedReader(new InputStreamReader(in, Charsets.ISO_8859_1)));
			while (lineIterator.hasNext()) {
				final MutableString line = lineIterator.next();
				if (line.length() > 0) blackListedIPv4Addresses.add(handleIPv4(line.toString()));
			}
		}
		finally {
			// Release the file descriptor; previously the stream was never closed.
			try {
				in.close();
			}
			catch (java.io.IOException ignored) {
				// Nothing useful can be done if closing a stream opened for reading fails.
			}
		}
}
 
Example 8
/**
 * Expands entities in place.
 * 
 * <P>The mutable string <code>s</code> is scanned from its last character to its first; each
 * <code>&amp;</code> at which {@link #scanEntity(char[], int, int, boolean, MutableString)}
 * recognises an entity is replaced, together with the rest of the entity, by <code>lastEntity</code>.
 * 
 * @param s a mutable string whose entities will be replaced by the corresponding characters.
 * @param entity a support mutable string used by {@link #scanEntity(char[], int, int, boolean, MutableString)}.
 * @param loose a parameter that will be passed to {@link #scanEntity(char[], int, int, boolean, MutableString)}.
 */
protected void replaceEntities( final MutableString s, final MutableString entity, final boolean loose ) {

	final char[] chars = s.array();
	int remaining = s.length();

	/* Going right to left keeps every position still to be visited valid
	 * even though replacements shorten the string. */
	for ( int pos = remaining - 1; pos >= 0; pos-- ) {
		if ( chars[ pos ] != '&' ) continue;
		final int end = scanEntity( chars, pos, remaining - pos, loose, entity );
		if ( end == -1 ) continue;
		remaining = s.replace( pos, end, lastEntity ).length();
	}
}
 
Example 9
/** Decodes the term of given index into a mutable string.
 *
 * <p>The block containing <code>index</code> is located by binary search on <code>blockStart</code>;
 * the dump stream is then positioned at the start of that block and terms are decoded one after
 * the other (each as a prefix length, a suffix length and the suffix symbols) until the
 * requested one has been rebuilt in <code>s</code>.
 *
 * @param index the index of the term to retrieve.
 * @param s a mutable string that will be overwritten with the term.
 * @return <code>s</code>.
 */
protected MutableString getTerm( final int index, final MutableString s ) {
	ensureStream();
	// A negative search result encodes the insertion point; the block we want is the one just before it.
	final int found = Arrays.binarySearch( blockStart, index );
	final int block = found >= 0 ? found : - found - 2;

	try {
		dumpStream.position( blockOffset[ block ] * blockSize );
		dumpStream.readBits( 0 );
		iteratorIsUsable = false;

		// The first term of a block has no stored prefix length: -1 marks "first term not read yet".
		int prefixLength = -1;
		int toDecode = index - blockStart[ block ] + 1;
		while( toDecode-- != 0 ) {
			prefixLength = prefixLength < 0 ? 0 : dumpStream.readUnary();
			final int suffixLength = dumpStream.readUnary();
			final int termLength = prefixLength + suffixLength;
			s.delete( prefixLength, s.length() );
			s.length( termLength );
			for( int k = prefixLength; k < termLength; k++ ) s.charAt( k, symbol2char[ decoder.decode( dumpStream ) ] );
		}

		return s;
	}
	catch( IOException rethrow ) {
		throw new RuntimeException( rethrow );
	}
}
 
Example 10
/** Looks up the index of a term by scanning a single block of the dump stream.
 *
 * @param o the term, which must be a {@link CharSequence}.
 * @return the index of the term, or -1 if the term is not encodable, if its approximated
 * interval is empty, or if it does not appear in the block scanned.
 */
private long getIndex( final Object o ) {
	final CharSequence term = (CharSequence)o;
	ensureStream();
	// A term containing a character unknown to the prefix coder cannot be in the map.
	if ( ! isEncodable( term ) ) return -1;

	/* Every string extending term follows term, so term can only be found
	 * in the left block of the approximated interval computed for term itself. */
	final Interval interval = intervalApproximator.getApproximatedInterval( term );
	if ( interval == Intervals.EMPTY_INTERVAL ) return -1;

	try {
		dumpStream.position( blockOffset[ interval.left ] * blockSize );
		dumpStream.readBits( 0 );
		iteratorIsUsable = false;

		final MutableString current = new MutableString();
		// -1 marks "first term of the block not read yet": that term has no stored prefix length.
		int prefixLength = -1;
		int position = blockStart[ interval.left ];
		final int blockEnd = blockStart[ interval.left + 1 ];

		/* Decode the block term by term until a match is found or the block is exhausted. */
		for( ; position < blockEnd; position++ ) {
			prefixLength = prefixLength < 0 ? 0 : dumpStream.readUnary();
			final int suffixLength = dumpStream.readUnary();
			final int termLength = prefixLength + suffixLength;
			current.delete( prefixLength, current.length() );
			current.length( termLength );
			for( int k = prefixLength; k < termLength; k++ ) current.charAt( k, symbol2char[ decoder.decode( dumpStream ) ] );
			if ( current.equals( term ) ) return position;
		}

		return -1;
	}
	catch (IOException rethrow ) {
		throw new RuntimeException( rethrow );
	}
}
 
Example 11
/** Extracts the next word and the non-word that follows it.
 *
 * <p>Both strings are emptied first. The maximal run of word constituents found at the current
 * position is appended to <code>word</code>, then the maximal run of other characters is
 * appended to <code>nonWord</code>. A run is continued as long as the available characters are
 * used up and <code>noMoreCharacters()</code> reports that more input exists.
 *
 * @param word a mutable string that will receive the next word (possibly empty).
 * @param nonWord a mutable string that will receive the following non-word (possibly empty).
 * @return false if no characters were available on entry (in which case the arguments are left
 * untouched), true otherwise.
 */
public boolean next( final MutableString word, final MutableString nonWord ) throws IOException {
	final char buffer[] = this.buffer;

	if ( noMoreCharacters() ) return false;

	word.length( 0 );
	nonWord.length( 0 );

	int run;

	// Word part: keep going while the buffer is exhausted but more input is available.
	do {
		run = 0;
		while( run < avail && isWordConstituent( buffer[ pos + run ] ) ) run++;

		word.append( buffer, pos, run );
		pos += run;
		avail -= run;
	} while( avail <= 0 && ! noMoreCharacters() );

	if ( noMoreCharacters() ) return true;

	// Non-word part, same scheme with the predicate inverted.
	do {
		run = 0;
		while( run < avail && ! isWordConstituent( buffer[ pos + run ] ) ) run++;

		nonWord.append( buffer, pos, run );
		pos += run;
		avail -= run;
	} while( avail <= 0 && ! noMoreCharacters() );

	return true;
}
 
Example 12
Source Project: tagme   File: WikipediaAnchorParser.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Normalizes an anchor text. The steps, in order, are:
 * 1. ASCII normalization, space squeezing and trimming
 * 2. lower-casing
 * 3. removal of the first match of the final-brackets pattern (i.e. brackets at the end of titles)
 * 4. removal of the first match of anchorStart, when a pattern is given
 * 5. dot handling, see {@link WikipediaAnchorParser#removeDots(MutableString)}
 * 6. removal of all punctuation except {@link WikipediaAnchorParser#SPECIAL_PUNCTS}
 * 7. if any special punctuation is left, two variants are built: one with those characters
 *    mapped through the special-punctuation character map, one with them deleted.
 * After almost every step the anchor is checked against the minimum length; as soon as it
 * becomes too short (or is found to contain no text) an empty array is returned.
 * @param original the raw anchor text.
 * @param anchorStart A pattern that identifies some common articles or preposition to be deleted if they occur at the beginning of the anchor; may be null.
 * @return an empty array if no valid anchor could be produced; otherwise one anchor, or two
 * when special punctuation was present and the variant without it is valid as well.
 */
public static CharSequence[] parseAnchor(CharSequence original, Pattern anchorStart)
{
	MutableString anchor = Chars.toNormalizedASCII(original);
	anchor.squeezeSpace();
	anchor.trim();

	if (anchor.length() < MIN_ANCHOR_LEN || !contaisText(anchor)) {
		return Chars.EMPTY_STRINGS;
	}

	anchor.loose();
	anchor.toLowerCase();

	// Brackets closing the anchor.
	final Matcher trailingBrackets = P_FINAL_BRACKETS.matcher(anchor);
	if (trailingBrackets.find()) {
		anchor.delete(trailingBrackets.start(), trailingBrackets.end());
	}
	anchor.trim();
	if (anchor.length() < MIN_ANCHOR_LEN) {
		return Chars.EMPTY_STRINGS;
	}

	// Optional leading article/preposition.
	if (anchorStart != null) {
		final Matcher leadingWords = anchorStart.matcher(anchor);
		if (leadingWords.find()) {
			anchor.delete(leadingWords.start(), leadingWords.end());
		}
		anchor.trim();
		if (anchor.length() < MIN_ANCHOR_LEN) {
			return Chars.EMPTY_STRINGS;
		}
	}

	anchor = removeDots(anchor);
	if (anchor.length() < MIN_ANCHOR_LEN) {
		return Chars.EMPTY_STRINGS;
	}

	anchor = removePunctuations(anchor, SPECIAL_PUNCTS, false);
	if (anchor.length() < MIN_ANCHOR_LEN) {
		return Chars.EMPTY_STRINGS;
	}

	// Without special punctuation there is a single candidate.
	final boolean hasSpecialPuncts = anchor.indexOfAnyOf(SPECIAL_PUNCTS_CHARS, 0) >= 0;
	if (!hasSpecialPuncts) {
		if (contaisText(anchor)) {
			return new CharSequence[]{anchor};
		}
		return Chars.EMPTY_STRINGS;
	}

	// Snapshot taken before the special punctuation is mapped away in the main variant.
	final MutableString anchorNoPuncts = new MutableString(anchor);

	anchor.replace(SPECIAL_PUNCTS_CHARS, SPECIAL_PUNCTS_CHAR_MAP);
	anchor.squeezeSpace().trim();
	if (anchor.length() < MIN_ANCHOR_LEN || !contaisText(anchor)) {
		return Chars.EMPTY_STRINGS;
	}

	anchorNoPuncts.delete(SPECIAL_PUNCTS_CHARS);
	anchorNoPuncts.squeezeSpace().trim();
	if (anchorNoPuncts.length() >= MIN_ANCHOR_LEN && contaisText(anchorNoPuncts)) {
		return new CharSequence[]{anchor, anchorNoPuncts};
	}
	return new CharSequence[]{anchor};
}
 
Example 13
Source Project: tagme   File: WikiTextExtractor.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Flattens wiki text into a sequence of sentence-like chunks, handling one line at a time.
 * <p>
 * Every line is trimmed and empty lines are skipped. What happens next depends on the first char:
 * <ul>
 * <li><code>'='</code>: leading and trailing <code>'='</code> and blanks are stripped and the rest is appended followed by <code>". "</code>;</li>
 * <li><code>'*'</code>, <code>'#'</code>, <code>':'</code>, <code>';'</code>: the leading markers and blanks are stripped and the rest is appended;</li>
 * <li><code>'{'</code>, <code>'|'</code>: the line is dropped;</li>
 * <li><code>'.'</code>, <code>'-'</code>: those two characters are deleted from the line and what is left, if anything, is appended;</li>
 * <li>anything else: the line is appended, unless its last char is <code>'}'</code>, in which case it is dropped.</li>
 * </ul>
 * Except for the <code>'='</code> case, an appended chunk is followed by a <code>'.'</code> (unless the text
 * already ends with one of <code>":.;,-"</code>) and by a blank. The last char of the result is removed
 * before returning. An IOException raised while reading is silently ignored.
 *
 * @param input the wiki text to process.
 * @param onlyAbstract if true, processing stops (checked before each line is handled) as soon as the
 * accumulated text is longer than MIN_ABSTRACT_CHARS.
 * @return the accumulated text.
 */
public MutableString removeStructure(MutableString input, boolean onlyAbstract)
	{
		
		MutableString buffer = new MutableString(1024);
		FastBufferedReader tokenizer = new FastBufferedReader(input);
		
		MutableString text = new MutableString(2048);
		// Characters that already terminate a chunk: no '.' is added after them.
		String punts = ":.;,-";
		
		try {
			while(tokenizer.readLine(buffer) != null)
			{
				// Early exit for abstracts; the last char (the blank or ". " tail appended after a chunk) is dropped.
				if (text.length() > MIN_ABSTRACT_CHARS && onlyAbstract){
					text.deleteCharAt(text.length()-1);
					return text;					
				}
				
//				MutableString linestr = new MutableString(buffer.trim());
				// NOTE(review): presumably trim() returns buffer itself, so linestr aliases the read buffer - confirm.
				MutableString linestr = buffer.trim();
				if (linestr.length() == 0) continue;
				
				int start;
				int end;
				String chars;
				char[] line = linestr.array();
				int line_len = linestr.length();
				
				char first = linestr.charAt(0);
				switch (first)
				{
				case '=':{
					// Strip '=' and blanks from both ends.
					chars = " =";
					for(start=0; start <line_len && chars.indexOf(line[start])>=0; start++);
					for(end=line_len-1; end >= 0  && chars.indexOf(line[end])>=0; end--);
					
					// NOTE(review): start == end (a single remaining char) is skipped too - confirm this is intended.
					if (start < end){
						text.append(linestr.subSequence(start, end+1));
						text.append(". ");
					}
					break;
				}
					
				case '*':
				case '#':
				case ':':
				case ';':{
					
					// Strip the leading markers and blanks.
					chars = "*#:; ";
					for(start=0; start<line_len && chars.indexOf(line[start])>=0 ; start++);
					
					// NOTE(review): requires at least two remaining chars - confirm this is intended.
					if (start < line_len-1){
						text.append(linestr.subSequence(start, linestr.length()));
						if (punts.indexOf(text.lastChar())<0)
							text.append('.');
						text.append(' ');
					}
					
					break;
				}
				case '{':
				case '|':
					// Lines starting with these characters are dropped.
					break;
				case '.':
				case '-':{
					// NOTE(review): presumably delete(char[]) removes every occurrence of the given chars, not only leading ones - confirm.
					linestr.delete(new char[]{'.','-'});
					if (linestr.length() > 0){
						text.append(linestr);
						if (punts.indexOf(text.lastChar())<0)
							text.append('.');
						text.append(' ');
					}
					break;
				}
				default:{
					// Lines ending with '}' are dropped.
					if (linestr.lastChar() == '}')
						break;
					text.append(linestr);
					if (punts.indexOf(text.lastChar())<0)
						text.append('.');
					text.append(' ');
				}
				}
			}
		} catch (IOException e) {}
		// Drop the last char (the blank appended after the last chunk).
		if (text.length()>0) text.deleteCharAt(text.length()-1);
		return text;
	}
 
Example 14
public Utf16MutableStringBitVector( final MutableString s, final boolean prefixFree ) {
	this.a = s.array();
	actualEnd = s.length() * Character.SIZE;
	length = actualEnd + ( prefixFree ? Character.SIZE : 0 );
}
 
Example 15
public ISOMutableStringBitVector( final MutableString s, final boolean prefixFree ) {
	this.a = s.array();
	actualEnd = s.length() * Byte.SIZE;
	length = actualEnd + ( prefixFree ? Byte.SIZE : 0 );
}
 
Example 16
/** Computes the interval of term indices associated with a given prefix.
 *
 * <p>The approximated interval of blocks for <code>prefix</code> is obtained first. Its left block is
 * decoded term by term looking for the first term starting with <code>prefix</code>; then the
 * scan proceeds (restarting from the right block when the approximated interval spans more than
 * one block) until a term not starting with <code>prefix</code> is met or the block ends.
 *
 * @param prefix a prefix.
 * @return {@link Intervals#EMPTY_INTERVAL} if <code>prefix</code> is not encodable, if the approximated
 * interval is empty, or if the only candidate block contains no term starting with <code>prefix</code>;
 * otherwise the interval from the first to the last index found by the scan.
 */
public Interval getInterval( final CharSequence prefix ) {
	ensureStream();
	// If prefix contains any character not coded by the prefix coder, we can return the empty interval.
	if ( ! isEncodable( prefix ) ) return Intervals.EMPTY_INTERVAL;

	// We recover the left extremes of the intervals where extensions of prefix could possibly lie.
	Interval interval = intervalApproximator.getApproximatedInterval( prefix );
	// System.err.println( "Approximate interval: " + interval + " , terms: [" + blockStart[ interval.left ] + ", " + blockStart[ interval.right ] + "]" );

	if ( interval == Intervals.EMPTY_INTERVAL ) return interval;
	try {
		// Position the stream at the beginning of the left block.
		dumpStream.position( blockOffset[ interval.left ] * blockSize );
		dumpStream.readBits( 0 );
		iteratorIsUsable = false;
		MutableString s = new MutableString();
		// prefixLength == -1 marks "first term of the block not read yet": that term has no stored prefix length.
		int suffixLength, prefixLength = -1, count = blockStart[ interval.left ], blockEnd = blockStart[ interval.left + 1 ], start = -1, end = -1;

		/* We scan the dump file, stopping if we exhaust the block */
		while( count < blockEnd ) {
			if ( prefixLength < 0 ) prefixLength = 0;
			else prefixLength = dumpStream.readUnary();
			suffixLength = dumpStream.readUnary();
			// Keep the prefix shared with the previous term, then decode the suffix symbols after it.
			s.delete( prefixLength, s.length() );
			s.length( prefixLength + suffixLength );
			for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );
			if ( s.startsWith( prefix ) ) {
				start = count;
				break; 
			}
			count++;
		}
		
		/* If we did not find our string, there are two possibilities: if the
		 * interval contains one point, there is no string extending prefix. But
		 * if  the interval  is larger, the first string of the second block in the
		 * interval must be an extension of prefix. */
		if ( start < 0 && interval.length() == 1 ) return Intervals.EMPTY_INTERVAL;
		// Either count is where the match was found, or (no match) it equals blockEnd, the first index of the next block.
		else start = count;
		
		end = start + 1;
		//assert dumpStream.readBits() <= blockSize;

		/* If the interval contains more than one point, the last string with
		 * given prefix is necessarily contained in the last block, and we
		 * must restart the search process. */
		if ( interval.length() > 1  ) {
			dumpStream.position( blockOffset[ interval.right ] * blockSize );
			dumpStream.readBits( 0 );
			s.length( 0 );
			end = blockStart[ interval.right ];
			blockEnd = blockStart[ interval.right + 1 ];
			prefixLength = -1;
		}
		
		
		// Advance end past every consecutive term that still starts with prefix.
		while( end < blockEnd ) {
			if ( prefixLength < 0 ) prefixLength = 0;
			else prefixLength = dumpStream.readUnary();
			suffixLength = dumpStream.readUnary();
			s.delete( prefixLength, s.length() );
			s.length( prefixLength + suffixLength );
			for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );
			if ( ! s.startsWith( prefix ) ) break;
			end++;
		}
		
		// end is one past the last matching index.
		return Interval.valueOf( start, end - 1 );
	} catch (IOException rethrow ) {
		throw new RuntimeException( rethrow );
	}
	
}
 
Example 17
/** Reads the next line as a word.
 *
 * @param word a mutable string that will receive the next line.
 * @param nonWord a mutable string that is always emptied.
 * @return true if a line was read, false if the underlying reader returned <code>null</code>.
 */
public boolean next( final MutableString word, final MutableString nonWord ) throws IOException {
	// A whole line is returned as the word, so the non-word part is always empty.
	nonWord.length( 0 );
	final boolean lineRead = fastBufferedReader.readLine( word ) != null;
	return lineRead;
}
 
Example 18
/** Reads a line into the given mutable string.
 *
 * <P>The next line of input (defined as in {@link java.io.BufferedReader#readLine()})
 * will be stored into <code>s</code>. Note that if <code>s</code> is 
 * not {@linkplain it.unimi.dsi.lang.MutableString loose}
 * this method will be quite inefficient.
 *
 * <P>A line is terminated by LF, CR, or CR immediately followed by LF; the terminator is
 * consumed but not stored.
 *
 * @param s a mutable string that will be used to store the next line (which could be empty).
 * @return <code>s</code>, or <code>null</code> if the end of file was found, in which
 * case <code>s</code> is unchanged.
 */

public MutableString readLine( final MutableString s ) throws IOException {
	char c = 0;
	int i;

	// NOTE(review): noMoreCharacters() is defined elsewhere; presumably it refills the buffer when it is empty - confirm.
	if ( noMoreCharacters() ) return null;

	s.length( 0 );

	for(;;) {
		// Find the first line terminator among the available characters (c remembers which one stopped the scan).
		for( i = 0; i < avail && ( c = buffer[ pos + i ] ) != '\n' && c != '\r' ; i++ );

		// Everything before the terminator belongs to the line.
		s.append( buffer, pos, i  );
		pos += i; 
		avail -= i;

		// If characters are left, the scan stopped on a terminator.
		if ( avail > 0 ) {
			if ( c == '\n' ) { // LF only.
				pos++;
				avail--;
			}
			else { // c == '\r'
				pos++;
				avail--;
				if ( avail > 0 ) {
					if ( buffer[ pos ] == '\n' ) { // CR/LF with LF already in the buffer.
						pos ++;
						avail--;
					}
				}
				else { // We must search for the LF.
					if ( noMoreCharacters() ) return s;
					// NOTE(review): reads buffer[ 0 ] rather than buffer[ pos ]; presumably pos is 0 right after a refill - confirm.
					if ( buffer[ 0 ] == '\n' ) {
						pos++;
						avail--;
					}
				}
			}
			return s;
		}
		// Available characters used up without meeting a terminator: return what we have if no more input exists, otherwise keep scanning.
		else if ( noMoreCharacters() ) return s;
	}
}
 
Example 19
/** Creates a new fast buffered reader by wrapping a given mutable string and using a set of additional word constituents.
 * 
 * <p>The effect of {@link #setReader(Reader)} on a buffer created with
 * this constructor is undefined.
 *
 * @param s the mutable string that will be wrapped by the reader.
 * @param wordConstituents a set of characters that will be considered word constituents.
 */
public FastBufferedReader( final MutableString s, final CharSet wordConstituents ) {
	// Wrap the backing array directly (no copy), limited to the first s.length() characters.
	this(
		s.array(),
		0,
		s.length(),
		wordConstituents
	);
}
 
Example 20
/** Creates a new fast buffered reader by wrapping a given mutable string.
 * <p>The effect of {@link #setReader(Reader)} on a buffer created with
 * this constructor is undefined.
 *
 * @param s the mutable string that will be wrapped by the reader.
 */
public FastBufferedReader( final MutableString s ) {
	// Wrap the backing array directly (no copy), limited to the first s.length() characters.
	this(
		s.array(),
		0,
		s.length()
	);
}