Java Code Examples for it.unimi.dsi.lang.MutableString#delete()

The following examples show how to use it.unimi.dsi.lang.MutableString#delete(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ImmutableBinaryTrie.java    From database with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Recursively appends to {@code result} one line for every node of the subtrie rooted at {@code n}.
 *
 * <p>A line is made of the current print prefix, the level in parentheses, the
 * compacted path of the node (when it has one) and, for nodes whose {@code word}
 * is non-negative, the word followed by the bits accumulated in {@code path}.
 *
 * <p>{@code printPrefix} and {@code path} are scratch buffers shared by the whole
 * recursion: the six characters appended to the prefix and the branch bit appended
 * to the path are removed again before returning, and the path is finally shortened
 * by {@code n.pathLength} characters.
 *
 * @param n the node to print; nothing happens if it is {@code null}.
 * @param printPrefix the text placed in front of the line of this node.
 * @param result the buffer that receives the output.
 * @param path the bits accumulated while descending to {@code n}.
 * @param level the depth reported for this node.
 */
private void recToString( final Node n, final MutableString printPrefix, final MutableString result, final MutableString path, final int level ) {
	if ( n == null ) return;

	// Line header: prefix, then the level between parentheses.
	result.append( printPrefix ).append( '(' ).append( level ).append( ')' );

	if ( n.path != null ) {
		final LongArrayBitVector compactedPath = LongArrayBitVector.wrap( n.path, n.pathLength );
		path.append( compactedPath );
		result.append( " path:" ).append( compactedPath );
	}

	if ( n.word >= 0 ) {
		result.append( " word: " ).append( n.word ).append( " (" ).append( path ).append( ')' );
	}

	result.append( '\n' );

	// Left child: the path gains a 0 bit, the prefix gains a tab plus "0 => " (six characters overall).
	path.append( '0' );
	printPrefix.append( '\t' ).append( "0 => " );
	recToString( n.left, printPrefix, result, path, level + 1 );

	// Right child: flip the branch bit and overwrite the five-character "0 => " marker in place.
	path.charAt( path.length() - 1, '1' );
	final int prefixEnd = printPrefix.length();
	printPrefix.replace( prefixEnd - 5, prefixEnd, "1 => " );
	recToString( n.right, printPrefix, result, path, level + 1 );

	// Undo the branch bit and the six prefix characters added above.
	final int pathEnd = path.length();
	path.delete( pathEnd - 1, pathEnd );
	printPrefix.delete( printPrefix.length() - 6, printPrefix.length() );

	// Drop n.pathLength trailing characters from the path.
	// NOTE(review): this mirrors the append above only if the wrapped bit vector prints
	// exactly n.pathLength characters and n.pathLength is 0 when n.path is null; confirm.
	path.delete( path.length() - n.pathLength, path.length() );
}
 
Example 2
Source File: WikiTextExtractor.java    From tagme with Apache License 2.0 5 votes vote down vote up
/**
 * Rewrites the span {@code [start, end)} of {@code input} occupied by a link.
 *
 * <p>For {@link WikiLink#EMPTY} the span is deleted; for any other link it is
 * replaced by {@code link.anchor}.
 *
 * @param input the text being rewritten in place.
 * @param start the first index of the link span.
 * @param end the index just after the link span.
 * @param link the link found in the span.
 * @return {@code start} if the span was deleted, otherwise {@code start} plus the length of the anchor.
 */
@Override
		public int handleLink(MutableString input, int start, int end, WikiLink link) {
			if (link != WikiLink.EMPTY) {
				// Substitute the markup with its anchor text and report the index just after it.
				input.replace(start, end, link.anchor);
				return start + link.anchor.length();
			}

			// Empty link: drop the whole span.
			input.delete(start, end);
			return start;
		}
 
Example 3
Source File: WikiTextExtractor.java    From tagme with Apache License 2.0 5 votes vote down vote up
/**
 * Handles the span {@code [start, end)} of {@code input} occupied by a link.
 *
 * <p>The span is deleted only for {@link WikiLink#EMPTY}; any other link is left untouched.
 *
 * @param input the text being rewritten in place.
 * @param start the first index of the link span.
 * @param end the index just after the link span.
 * @param link the link found in the span.
 * @return {@code start} if the span was deleted, {@code end} otherwise.
 */
@Override
public int handleLink(MutableString input, int start, int end, WikiLink link) {
	if (link != WikiLink.EMPTY) {
		// Non-empty links are kept as they are.
		return end;
	}

	input.delete(start, end);
	return start;
}
 
Example 4
Source File: ImmutableExternalPrefixMap.java    From database with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Decodes from the dump stream the term of given index, storing it into {@code s}.
 *
 * <p>The block containing the term is located by a binary search on {@code blockStart};
 * the block is then decoded from its first term up to the requested one. For every term
 * but the first of a block the stream provides, in unary, the number of characters shared
 * with the previous term; then, for all terms, the number of new characters in unary,
 * followed by the encoded new characters.
 *
 * <p>As a side effect, {@code iteratorIsUsable} is set to {@code false}.
 *
 * @param index the index of the term to retrieve.
 * @param s the mutable string that will be overwritten with the term.
 * @return {@code s}.
 * @throws RuntimeException wrapping any {@link IOException} raised while reading the dump stream.
 */
protected MutableString getTerm( final int index, final MutableString s ) {
	ensureStream();

	// A negative search result encodes the insertion point: the term lives in the block just before it.
	final int searchResult = Arrays.binarySearch( blockStart, index );
	final int block = searchResult < 0 ? - searchResult - 2 : searchResult;

	try {
		dumpStream.position( blockOffset[ block ] * blockSize );
		dumpStream.readBits( 0 );
		iteratorIsUsable = false;

		// A negative shared length marks the first term of the block, which has no stored prefix length.
		int sharedLength = -1;
		int termsToDecode = index - blockStart[ block ] + 1;

		while( termsToDecode-- != 0 ) {
			sharedLength = sharedLength < 0 ? 0 : dumpStream.readUnary();
			final int newLength = dumpStream.readUnary();

			// Keep the shared prefix, make room for the new characters, then decode them in place.
			s.delete( sharedLength, s.length() );
			s.length( sharedLength + newLength );
			for( int k = 0; k < newLength; k++ ) {
				s.charAt( sharedLength + k, symbol2char[ decoder.decode( dumpStream ) ] );
			}
		}

		return s;
	}
	catch( IOException rethrow ) {
		throw new RuntimeException( rethrow );
	}
}
 
Example 5
Source File: ImmutableExternalPrefixMap.java    From database with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Returns the index of the given term, or -1 if the term is not found.
 *
 * <p>The term is looked up by decoding sequentially the block starting at the left
 * extreme of the approximated interval returned by {@code intervalApproximator} and
 * comparing each decoded term with the argument.
 *
 * <p>As a side effect, {@code iteratorIsUsable} is set to {@code false} whenever the dump stream is scanned.
 *
 * @param o the term to search for; it is cast to {@link CharSequence}.
 * @return the index of the term, or -1 if it cannot be encoded, its approximated
 * interval is empty, or no term of the scanned block is equal to it.
 * @throws RuntimeException wrapping any {@link IOException} raised while reading the dump stream.
 */
private long getIndex( final Object o ) {
	final CharSequence term = (CharSequence)o;
	ensureStream();

	// A term containing characters unknown to the prefix coder cannot be in the map.
	if ( ! isEncodable( term ) ) return -1;

	/* Every string extending term must follow term: hence term, if present,
	 * lies in the left block of its own approximated interval. */
	final Interval interval = intervalApproximator.getApproximatedInterval( term );
	if ( interval == Intervals.EMPTY_INTERVAL ) return -1;

	try {
		dumpStream.position( blockOffset[ interval.left ] * blockSize );
		dumpStream.readBits( 0 );
		iteratorIsUsable = false;

		final MutableString current = new MutableString();
		// A negative shared length marks the first term of the block, which has no stored prefix length.
		int sharedLength = -1;
		int position = blockStart[ interval.left ];
		final int blockEnd = blockStart[ interval.left + 1 ];

		/* Sequential scan of the dump file, limited to the current block. */
		for( ; position < blockEnd; position++ ) {
			sharedLength = sharedLength < 0 ? 0 : dumpStream.readUnary();
			final int newLength = dumpStream.readUnary();

			// Keep the shared prefix, make room for the new characters, then decode them in place.
			current.delete( sharedLength, current.length() );
			current.length( sharedLength + newLength );
			for( int k = 0; k < newLength; k++ ) {
				current.charAt( sharedLength + k, symbol2char[ decoder.decode( dumpStream ) ] );
			}

			if ( current.equals( term ) ) return position;
		}

		return -1;
	}
	catch ( IOException rethrow ) {
		throw new RuntimeException( rethrow );
	}
}
 
Example 6
Source File: WikipediaAnchorParser.java    From tagme with Apache License 2.0 4 votes vote down vote up
/**
 * Normalizes an anchor text. The steps are:
 * <ol>
 * <li>ASCII normalization, space squeezing, trimming and lower-casing;
 * <li>deletion of brackets at the end (i.e. in titles);
 * <li>deletion of a pattern at the beginning of the text, see {@code anchorStart};
 * <li>deletion of all dots '.';
 * <li>replacement of all punctuations with whitespaces, except for {@link WikipediaAnchorParser#SPECIAL_PUNCTS};
 * <li>if the text still contains any of {@link WikipediaAnchorParser#SPECIAL_PUNCTS}, two anchors
 *     are produced (one with those puncts replaced by whitespace, one with those puncts deleted);
 *     otherwise the result of the previous step is the only anchor.
 * </ol>
 * Whenever the text becomes shorter than {@code MIN_ANCHOR_LEN}, or fails the
 * {@code contaisText} check, {@code Chars.EMPTY_STRINGS} is returned.
 *
 * @param original the anchor text to normalize.
 * @param anchorStart a pattern that identifies some common articles or prepositions to be deleted
 *        if they occur at the beginning of the anchor; may be {@code null} to skip this step.
 * @return the normalized anchors: {@code Chars.EMPTY_STRINGS}, a one-element array, or a two-element
 *         array holding the whitespace-replaced variant followed by the punctuation-deleted one.
 */
public static CharSequence[] parseAnchor(CharSequence original, Pattern anchorStart)
{
	MutableString anchor = Chars.toNormalizedASCII(original);
	anchor.squeezeSpace();
	anchor.trim();

	if (anchor.length() < MIN_ANCHOR_LEN || !contaisText(anchor)) {
		return Chars.EMPTY_STRINGS;
	}

	anchor.loose();
	anchor.toLowerCase();

	// Step 2: strip the first match of the final-brackets pattern, if any.
	Matcher bracketMatcher = P_FINAL_BRACKETS.matcher(anchor);
	if (bracketMatcher.find()) {
		anchor.delete(bracketMatcher.start(), bracketMatcher.end());
	}
	anchor.trim();
	if (anchor.length() < MIN_ANCHOR_LEN) {
		return Chars.EMPTY_STRINGS;
	}

	// Step 3: strip the first match of the caller-supplied leading pattern, if one was given.
	if (anchorStart != null) {
		Matcher startMatcher = anchorStart.matcher(anchor);
		if (startMatcher.find()) {
			anchor.delete(startMatcher.start(), startMatcher.end());
		}
		anchor.trim();
		if (anchor.length() < MIN_ANCHOR_LEN) {
			return Chars.EMPTY_STRINGS;
		}
	}

	// Step 4: dots.
	anchor = removeDots(anchor);
	if (anchor.length() < MIN_ANCHOR_LEN) {
		return Chars.EMPTY_STRINGS;
	}

	// Step 5: punctuation other than the special ones.
	anchor = removePunctuations(anchor, SPECIAL_PUNCTS, false);
	if (anchor.length() < MIN_ANCHOR_LEN) {
		return Chars.EMPTY_STRINGS;
	}

	// No special punctuation left: a single anchor, provided it still contains text.
	if (anchor.indexOfAnyOf(SPECIAL_PUNCTS_CHARS, 0) < 0) {
		if (contaisText(anchor)) {
			return new CharSequence[]{anchor};
		}
		return Chars.EMPTY_STRINGS;
	}

	// Step 6: the copy must be taken before the special puncts are replaced in the original.
	MutableString withoutPuncts = new MutableString(anchor);

	anchor.replace(SPECIAL_PUNCTS_CHARS, SPECIAL_PUNCTS_CHAR_MAP);
	anchor.squeezeSpace().trim();
	if (anchor.length() < MIN_ANCHOR_LEN || !contaisText(anchor)) {
		return Chars.EMPTY_STRINGS;
	}

	withoutPuncts.delete(SPECIAL_PUNCTS_CHARS);
	withoutPuncts.squeezeSpace().trim();

	// The punctuation-deleted variant is returned only if it is a valid anchor on its own.
	if (withoutPuncts.length() >= MIN_ANCHOR_LEN && contaisText(withoutPuncts)) {
		return new CharSequence[]{anchor, withoutPuncts};
	}
	return new CharSequence[]{anchor};
}
 
Example 7
Source File: WikiTextExtractor.java    From tagme with Apache License 2.0 4 votes vote down vote up
/**
 * Flattens line-oriented markup into a sequence of sentences.
 *
 * <p>The input is read line by line; each line is trimmed, empty lines are
 * skipped, and the remaining ones are handled according to their first character:
 * <ul>
 * <li>{@code '='}: leading and trailing {@code '='} and spaces are stripped and the rest is appended followed by {@code ". "};
 * <li>{@code '*'}, {@code '#'}, {@code ':'}, {@code ';'}: leading markers and spaces are stripped and the rest is appended;
 * <li>{@code '{'}, {@code '|'}: the line is dropped;
 * <li>{@code '.'}, {@code '-'}: the line is appended after {@code delete(char[])} has been applied with those two characters;
 * <li>anything else: the line is appended unless it ends with {@code '}'}.
 * </ul>
 * Except for the {@code '='} case, an appended line is followed by a {@code '.'}
 * (unless the text already ends with one of {@code ":.;,-"}) and by a space.
 * The last character of the accumulated text is removed before returning.
 *
 * @param input the text to process.
 * @param onlyAbstract if true, processing stops as soon as more than
 *        {@code MIN_ABSTRACT_CHARS} characters have been accumulated (checked at the beginning of each line).
 * @return the accumulated text.
 */
public MutableString removeStructure(MutableString input, boolean onlyAbstract)
	{
		
		MutableString buffer = new MutableString(1024);
		FastBufferedReader tokenizer = new FastBufferedReader(input);
		
		MutableString text = new MutableString(2048);
		// Characters after which no closing '.' is added.
		String punts = ":.;,-";
		
		try {
			while(tokenizer.readLine(buffer) != null)
			{
				// Early exit for abstracts: enough text collected, drop the last character and stop.
				if (text.length() > MIN_ABSTRACT_CHARS && onlyAbstract){
					text.deleteCharAt(text.length()-1);
					return text;					
				}
				
//				MutableString linestr = new MutableString(buffer.trim());
				// NOTE(review): trim() presumably returns the buffer itself (no copy), so linestr
				// would alias buffer and be overwritten by the next readLine() -- confirm.
				MutableString linestr = buffer.trim();
				if (linestr.length() == 0) continue;
				
				int start;
				int end;
				String chars;
				// NOTE(review): array() presumably exposes the backing array, which may be longer
				// than the string; only indices below line_len are read here.
				char[] line = linestr.array();
				int line_len = linestr.length();
				
				char first = linestr.charAt(0);
				switch (first)
				{
				case '=':{
					// Skip '=' and spaces at both ends; start/end delimit what is left.
					chars = " =";
					for(start=0; start <line_len && chars.indexOf(line[start])>=0; start++);
					for(end=line_len-1; end >= 0  && chars.indexOf(line[end])>=0; end--);
					
					// Strict inequality: at least two characters must remain for the line to be kept.
					if (start < end){
						text.append(linestr.subSequence(start, end+1));
						text.append(". ");
					}
					break;
				}
					
				case '*':
				case '#':
				case ':':
				case ';':{
					
					// Skip the leading markers and spaces.
					chars = "*#:; ";
					for(start=0; start<line_len && chars.indexOf(line[start])>=0 ; start++);
					
					// At least two characters must remain after the markers for the line to be kept.
					if (start < line_len-1){
						text.append(linestr.subSequence(start, linestr.length()));
						// Close the sentence unless the text already ends with punctuation.
						if (punts.indexOf(text.lastChar())<0)
							text.append('.');
						text.append(' ');
					}
					
					break;
				}
				case '{':
				case '|':
					// Lines starting with '{' or '|' contribute nothing.
					break;
				case '.':
				case '-':{
					// NOTE(review): delete(char[]) presumably removes every '.' and '-' in the line,
					// not only the leading ones -- confirm against the MutableString API.
					linestr.delete(new char[]{'.','-'});
					if (linestr.length() > 0){
						text.append(linestr);
						if (punts.indexOf(text.lastChar())<0)
							text.append('.');
						text.append(' ');
					}
					break;
				}
				default:{
					// Lines ending with '}' are dropped.
					if (linestr.lastChar() == '}')
						break;
					text.append(linestr);
					if (punts.indexOf(text.lastChar())<0)
						text.append('.');
					text.append(' ');
				}
				}
			}
		// An IOException is silently ignored: whatever was accumulated so far is returned.
		} catch (IOException e) {}
		// Remove the last character of the accumulated text (a space after every append above).
		if (text.length()>0) text.deleteCharAt(text.length()-1);
		return text;
	}
 
Example 8
Source File: ImmutableExternalPrefixMap.java    From database with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Returns the interval of term indices whose terms start with the given prefix.
 *
 * <p>The approximated interval provided by {@code intervalApproximator} is refined by
 * decoding the dump stream: a forward scan of the leftmost block locates the first term
 * extending {@code prefix}, and a second scan locates the first term that no longer
 * extends it. The second scan continues from the current position when the approximated
 * interval is a single block, and restarts from the rightmost block otherwise.
 *
 * <p>As a side effect, {@code iteratorIsUsable} is set to {@code false} whenever the dump stream is scanned.
 *
 * @param prefix the prefix to look for.
 * @return the interval of indices of the terms extending {@code prefix}, or
 * {@code Intervals.EMPTY_INTERVAL} if the prefix cannot be encoded, its approximated
 * interval is empty, or a single-block scan finds no extension.
 * @throws RuntimeException wrapping any {@link IOException} raised while reading the dump stream.
 */
public Interval getInterval( final CharSequence prefix ) {
	ensureStream();

	// A prefix containing characters unknown to the prefix coder has no extensions.
	if ( ! isEncodable( prefix ) ) return Intervals.EMPTY_INTERVAL;

	// Left extremes of the blocks where extensions of prefix could possibly lie.
	final Interval interval = intervalApproximator.getApproximatedInterval( prefix );
	if ( interval == Intervals.EMPTY_INTERVAL ) return interval;

	try {
		dumpStream.position( blockOffset[ interval.left ] * blockSize );
		dumpStream.readBits( 0 );
		iteratorIsUsable = false;

		final MutableString current = new MutableString();
		// A negative shared length marks the first term of a block, which has no stored prefix length.
		int sharedLength = -1;
		int position = blockStart[ interval.left ];
		int blockEnd = blockStart[ interval.left + 1 ];
		// Stays negative until an extension of prefix is met.
		int first = -1;

		/* First scan: look for the first extension of prefix in the leftmost block. */
		while( position < blockEnd ) {
			sharedLength = sharedLength < 0 ? 0 : dumpStream.readUnary();
			final int newLength = dumpStream.readUnary();

			current.delete( sharedLength, current.length() );
			current.length( sharedLength + newLength );
			for( int k = 0; k < newLength; k++ ) {
				current.charAt( sharedLength + k, symbol2char[ decoder.decode( dumpStream ) ] );
			}

			if ( current.startsWith( prefix ) ) {
				first = position;
				break;
			}
			position++;
		}

		/* Nothing found: with a one-point interval no string extends prefix; with a
		 * larger interval the first string of the second block must be an extension,
		 * and position now points exactly at it. */
		if ( first < 0 && interval.length() == 1 ) return Intervals.EMPTY_INTERVAL;
		first = position;

		// Index of the first term known not to have been accepted yet.
		int pastLast = first + 1;

		/* With more than one point, the last extension necessarily lies in the
		 * rightmost block, so decoding restarts from there. */
		if ( interval.length() > 1 ) {
			dumpStream.position( blockOffset[ interval.right ] * blockSize );
			dumpStream.readBits( 0 );
			current.length( 0 );
			pastLast = blockStart[ interval.right ];
			blockEnd = blockStart[ interval.right + 1 ];
			sharedLength = -1;
		}

		/* Second scan: advance until a term not extending prefix, or the end of the block. */
		while( pastLast < blockEnd ) {
			sharedLength = sharedLength < 0 ? 0 : dumpStream.readUnary();
			final int newLength = dumpStream.readUnary();

			current.delete( sharedLength, current.length() );
			current.length( sharedLength + newLength );
			for( int k = 0; k < newLength; k++ ) {
				current.charAt( sharedLength + k, symbol2char[ decoder.decode( dumpStream ) ] );
			}

			if ( ! current.startsWith( prefix ) ) break;
			pastLast++;
		}

		return Interval.valueOf( first, pastLast - 1 );
	}
	catch ( IOException rethrow ) {
		throw new RuntimeException( rethrow );
	}

}