Java Code Examples for it.unimi.dsi.lang.MutableString#trim()

The following examples show how to use it.unimi.dsi.lang.MutableString#trim(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: tagme   File: WikiTextExtractor.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Scans the given text line by line and, for every bulleted line (a line whose
 * first non-blank character is '*'), collects the first link reported by
 * {@code extractLinkFromCleanedLine} for the bullet's content.
 *
 * @param cleanText the text to scan; it is read through a {@link FastBufferedReader}
 * @return the first link of each non-empty bulleted line, in reading order;
 *         empty if no bulleted line yields a link
 */
public List<WikiLink> extractDisambiguationLinks(MutableString cleanText)
	{
		FastBufferedReader tokenizer = new FastBufferedReader(cleanText);
		MutableString buffer = new MutableString(1024);
		List<WikiLink> links = new ArrayList<WikiLink>();

		try {
			while (tokenizer.readLine(buffer) != null)
			{
				buffer.trim();
				if (buffer.length() == 0) continue;
				if (buffer.charAt(0) != '*') continue;

				// Count the run of leading asterisks (nested bullets use several).
				int start = 1;
				while (start < buffer.length() && buffer.charAt(start) == '*') start++;

				// Drop the bullet markers and keep the content that follows them.
				// (Deleting [start, length) instead would discard the content and
				// leave only the asterisks, so no link could ever be extracted.)
				buffer.delete(0, start).trim();

				if (buffer.length() == 0) continue;

				List<WikiLink> lineLinks = extractLinkFromCleanedLine(buffer);
				if (!lineLinks.isEmpty()) links.add(lineLinks.get(0));
			}
		} catch (IOException ignored) {
			// Best effort: the reader wraps an in-memory string, so a failure here is
			// not expected; whatever was collected so far is still returned.
		}

		return links;

	}
 
Example 2
Source Project: tagme   File: WikipediaAnchorParser.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Normalizes an anchor text and returns the resulting anchor variant(s).
 * <p>
 * Processing order:
 * <ol>
 * <li>ASCII normalization, whitespace squeezing, trimming and lower-casing;</li>
 * <li>removal of the first match of {@code P_FINAL_BRACKETS} (brackets at the end, e.g. in titles);</li>
 * <li>if {@code anchorStart} is given, removal of its first match;</li>
 * <li>removal of dots via {@code removeDots};</li>
 * <li>punctuation handling via {@code removePunctuations}, which is passed
 *     {@link WikipediaAnchorParser#SPECIAL_PUNCTS};</li>
 * <li>if any of the special punctuation characters is still present, two variants are
 *     built: one with those characters replaced through {@code SPECIAL_PUNCTS_CHAR_MAP},
 *     and one with them deleted.</li>
 * </ol>
 * After each step the anchor is discarded if it is shorter than {@code MIN_ANCHOR_LEN}.
 *
 * @param original the raw anchor text
 * @param anchorStart a pattern identifying common articles or prepositions to delete when
 *        they occur at the beginning of the anchor; may be {@code null} to skip this step
 * @return an empty array if no valid anchor could be produced; a one-element array with
 *         the normalized anchor; or a two-element array (replaced variant, deleted variant)
 *         when special punctuation was present and both variants are valid
 */
public static CharSequence[] parseAnchor(CharSequence original, Pattern anchorStart)
{
	MutableString text = Chars.toNormalizedASCII(original);
	text.squeezeSpace().trim();

	if (text.length() < MIN_ANCHOR_LEN || !contaisText(text))
	{
		return Chars.EMPTY_STRINGS;
	}

	text.loose();
	text.toLowerCase();

	// Strip the trailing bracketed part, if any.
	Matcher brackets = P_FINAL_BRACKETS.matcher(text);
	if (brackets.find())
	{
		text.delete(brackets.start(), brackets.end());
	}
	text.trim();
	if (text.length() < MIN_ANCHOR_LEN)
	{
		return Chars.EMPTY_STRINGS;
	}

	// Strip the optional leading pattern.
	if (anchorStart != null)
	{
		Matcher leading = anchorStart.matcher(text);
		if (leading.find())
		{
			text.delete(leading.start(), leading.end());
		}
		text.trim();
		if (text.length() < MIN_ANCHOR_LEN)
		{
			return Chars.EMPTY_STRINGS;
		}
	}

	text = removeDots(text);
	if (text.length() < MIN_ANCHOR_LEN)
	{
		return Chars.EMPTY_STRINGS;
	}

	text = removePunctuations(text, SPECIAL_PUNCTS, false);
	if (text.length() < MIN_ANCHOR_LEN)
	{
		return Chars.EMPTY_STRINGS;
	}

	boolean hasSpecialPuncts = text.indexOfAnyOf(SPECIAL_PUNCTS_CHARS, 0) >= 0;
	if (!hasSpecialPuncts)
	{
		if (contaisText(text))
		{
			return new CharSequence[]{text};
		}
		return Chars.EMPTY_STRINGS;
	}

	// Snapshot taken before the replacement below, so that the second variant
	// starts from the text that still contains the special characters.
	MutableString withoutPuncts = new MutableString(text);

	// Variant 1: special punctuation mapped through SPECIAL_PUNCTS_CHAR_MAP.
	text.replace(SPECIAL_PUNCTS_CHARS, SPECIAL_PUNCTS_CHAR_MAP);
	text.squeezeSpace().trim();
	if (text.length() < MIN_ANCHOR_LEN || !contaisText(text))
	{
		return Chars.EMPTY_STRINGS;
	}

	// Variant 2: special punctuation deleted.
	withoutPuncts.delete(SPECIAL_PUNCTS_CHARS);
	withoutPuncts.squeezeSpace().trim();
	boolean secondVariantValid = withoutPuncts.length() >= MIN_ANCHOR_LEN && contaisText(withoutPuncts);
	if (secondVariantValid)
	{
		return new CharSequence[]{text, withoutPuncts};
	}
	return new CharSequence[]{text};
}
 
Example 3
Source Project: tagme   File: WikipediaIndexer.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Reads the DBpedia categories source file for the given language and builds a map
 * from article title to the list of its categories.
 * <p>
 * Each line is trimmed; lines starting with '#' are skipped. The title is the text
 * captured after the first {@code /resource/} up to the next {@code >}; the category
 * is searched after the title match and is the text following the first ':' after a
 * later {@code /resource/}, up to the next {@code >}. Underscores in both are replaced
 * by spaces. Lines where either pattern fails to match are counted as errors and skipped.
 * <p>
 * Progress counters: index 0 is updated for every line read, index 1 for every new
 * title, index 2 for every line that fails to match.
 *
 * @param lang the language passed to {@code WikipediaFiles.DBPEDIA_CAT.getSourceFile}
 * @return a map from title to its categories, in the order they appear in the file
 * @throws IOException if the file cannot be opened or read
 */
public static HashMap<String, List<String>> parseDBPediaCategories(String lang) throws IOException
{
	PLogger plog = new PLogger(log, Step.TEN_MINUTES, "Lines", "Articles", "Errors");
	plog.start("Parsing DBPEDIA categories");
	
	HashMap<String, List<String>> cats = new HashMap<String, List<String>>(1600000);

	Pattern patTitle = Pattern.compile("/resource/([^>]*)>");
	Pattern patCat = Pattern.compile("/resource/[^:</]*:([^>]*)>");

	
	File dbpedia_cat = WikipediaFiles.DBPEDIA_CAT.getSourceFile(lang);
	FastBufferedReader fbr = new FastBufferedReader(new InputStreamReader(new FileInputStream(dbpedia_cat), Charset.forName("UTF-8")));
	try
	{
		MutableString line = new MutableString(1024);
		while(fbr.readLine(line) != null)
		{
			plog.update(0);
			line.trim();
			if (line.startsWith("#")) continue;
			
			Matcher m = patTitle.matcher(line);
			if (!m.find())
			{
				plog.update(2);
				continue;
			}
			String title = m.group(1).replace('_', ' ');
			
			// The category is looked for only after the end of the title match.
			int lastCharTitle = m.end();
			m = patCat.matcher(line);
			if (!m.find(lastCharTitle))
			{
				plog.update(2);
				continue;
			}
			String cat = m.group(1).replace('_', ' ');
			
			// Single lookup; values stored in the map are never null.
			List<String> titleCats = cats.get(title);
			if (titleCats == null)
			{
				plog.update(1);
				titleCats = new ArrayList<String>();
				cats.put(title, titleCats);
			}
			titleCats.add(cat);
		}
		plog.stop();
	}
	finally
	{
		// Release the file handle even when reading fails part-way through.
		fbr.close();
	}
	return cats;
}
 
Example 4
Source Project: tagme   File: WikiTextExtractor.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Flattens line-oriented markup into plain running text.
 * <p>
 * Every line is trimmed and handled according to its first character:
 * <ul>
 * <li>'=': leading and trailing '=' and spaces are stripped and the remainder is
 *     appended followed by ". ";</li>
 * <li>'*', '#', ':' or ';': leading markers and spaces are stripped and the remainder
 *     is appended as a sentence;</li>
 * <li>'{' or '|': the line is skipped;</li>
 * <li>'.' or '-': every '.' and '-' is deleted from the line and what is left is
 *     appended as a sentence;</li>
 * <li>anything else: the line is appended as a sentence unless it ends with '}'.</li>
 * </ul>
 * "Appended as a sentence" means that a '.' is added when the text does not already end
 * with one of {@code : . ; , -}, and then a single space.
 *
 * @param input the text to process
 * @param onlyAbstract if true, processing stops as soon as more than
 *        {@code MIN_ABSTRACT_CHARS} characters have been accumulated
 * @return the accumulated text without its trailing space
 */
public MutableString removeStructure(MutableString input, boolean onlyAbstract)
	{
		final String sentenceEnders = ":.;,-";
		final MutableString plain = new MutableString(2048);
		final MutableString lineBuffer = new MutableString(1024);
		final FastBufferedReader reader = new FastBufferedReader(input);

		try {
			while (reader.readLine(lineBuffer) != null)
			{
				// Checked before processing the line just read, so that line is dropped.
				if (onlyAbstract && plain.length() > MIN_ABSTRACT_CHARS)
				{
					plain.deleteCharAt(plain.length() - 1);
					return plain;
				}

				MutableString current = lineBuffer.trim();
				final int len = current.length();
				if (len == 0) continue;

				final char lead = current.charAt(0);

				if (lead == '=')
				{
					final String markers = " =";
					int from = 0;
					while (from < len && markers.indexOf(current.charAt(from)) >= 0) from++;
					int to = len - 1;
					while (to >= 0 && markers.indexOf(current.charAt(to)) >= 0) to--;

					if (from < to)
					{
						plain.append(current.subSequence(from, to + 1));
						plain.append(". ");
					}
				}
				else if (lead == '*' || lead == '#' || lead == ':' || lead == ';')
				{
					final String markers = "*#:; ";
					int from = 0;
					while (from < len && markers.indexOf(current.charAt(from)) >= 0) from++;

					if (from < len - 1)
					{
						plain.append(current.subSequence(from, len));
						if (sentenceEnders.indexOf(plain.lastChar()) < 0)
						{
							plain.append('.');
						}
						plain.append(' ');
					}
				}
				else if (lead == '{' || lead == '|')
				{
					// Lines starting with '{' or '|' contribute nothing.
				}
				else if (lead == '.' || lead == '-')
				{
					current.delete(new char[]{'.','-'});
					if (current.length() > 0)
					{
						plain.append(current);
						if (sentenceEnders.indexOf(plain.lastChar()) < 0)
						{
							plain.append('.');
						}
						plain.append(' ');
					}
				}
				else if (current.lastChar() != '}')
				{
					plain.append(current);
					if (sentenceEnders.indexOf(plain.lastChar()) < 0)
					{
						plain.append('.');
					}
					plain.append(' ');
				}
			}
		} catch (IOException e) {}

		// Every appended fragment ends with a space; remove the last one.
		if (plain.length() > 0)
		{
			plain.deleteCharAt(plain.length() - 1);
		}
		return plain;
	}