Java Code Examples for it.unimi.dsi.fastutil.objects.Object2IntMap#getInt()

The following examples show how to use it.unimi.dsi.fastutil.objects.Object2IntMap#getInt(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: BestAnchors.java    From tagme with Apache License 2.0 5 votes vote down vote up
/**
 * Finds the best anchor text for a Wikipedia article: the most frequent anchor
 * (with at least MIN_ANCHORS occurrences) that contains at least one term not
 * already present in the article title.
 *
 * @param wid     the Wikipedia article ID to look up
 * @param anchors map from anchor text to its occurrence count
 * @return the best anchor, or {@code null} if the article cannot be found in
 *         the index or no anchor qualifies
 * @throws IOException if the index lookup fails
 */
String findBest(int wid, final Object2IntMap<String> anchors) throws IOException
{
	Query q = new TermQuery(new Term(WikipediaIndexer.FIELD_WID, ""+wid));
	TopDocs td = articles.search(q, 1);
	if (td.totalHits == 0) return null;//throw new IOException("Unable to find title for WID:"+wid);
	String title = articles.doc(td.scoreDocs[0].doc).get(WikipediaIndexer.FIELD_TITLE);
	title = title.replaceAll("\\&quot;", "\"");

	Set<String> titleTerms = terms(title).keySet();

	List<String> bests = new ArrayList<String>(anchors.size());
	bests.addAll(anchors.keySet());
	// Sort anchors by descending frequency. Integer.compare avoids the
	// overflow that plain int subtraction can suffer for large counts.
	Collections.sort(bests, new Comparator<String>() {
		@Override
		public int compare(String o1, String o2) {
			return Integer.compare(anchors.getInt(o2), anchors.getInt(o1));
		}
	});


	for (String a : bests)
	{
		if (anchors.getInt(a) < MIN_ANCHORS) continue; // too rare to be reliable
		Set<String> anchorTerms = terms(a).keySet();
		// Accept the first anchor that adds at least one term beyond the title.
		for(String aw : anchorTerms)
			if (!titleTerms.contains(aw))
				return a;
	}
	return null;
}
 
Example 2
Source File: RedirectMap.java    From tagme with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the Wikipedia redirect dump and builds a map from redirect page WID
 * to target page WID. Redirects whose target title cannot be resolved are
 * counted (via updateItem) but skipped.
 *
 * @return a trimmed Int2IntMap (default return value -1) mapping source WID to target WID
 * @throws IOException if the dump file cannot be read or parsed
 */
@Override
protected Int2IntMap parseSet() throws IOException
{
	final Object2IntMap<String> titles = new TitlesToWIDMap(lang).getDataset();
	final Int2IntOpenHashMap map = new Int2IntOpenHashMap(3000000);
	SQLWikiParser parser = new SQLWikiParser(log, "Titles NF") {
		@Override
		public boolean compute(ArrayList<String> values) throws IOException
		{
			int ns = Integer.parseInt(values.get(SQLWikiParser.REDIRECT_NS));
			if (ns == SQLWikiParser.NS_ARTICLE)
			{
				int idFrom = Integer.parseInt(values.get(SQLWikiParser.REDIRECT_ID_FROM));
				// NOTE(review): assumes titles.getInt returns a negative default for
				// unknown titles — confirm TitlesToWIDMap sets defaultReturnValue < 0.
				int idTo = titles.getInt(cleanPageName(values.get(SQLWikiParser.REDIRECT_TITLE_TO)));
				if (idTo >= 0)
					map.put(idFrom, idTo);
				else this.updateItem(0); // unresolved target: count it, skip the entry

				return true;
			} else return false;
		}
	};

	File input = WikipediaFiles.REDIRECTS.getSourceFile(lang);
	// try-with-resources: the original code leaked the reader if compute() threw.
	try (InputStreamReader in = new InputStreamReader(new FileInputStream(input), Charset.forName("UTF-8")))
	{
		parser.compute(in);
	}

	map.defaultReturnValue(-1);
	map.trim();

	return map;

}
 
Example 3
Source File: TweetCentroid.java    From AffectiveTweets with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Adds a new document to the word representation: increments the document
 * counter and accumulates the document's term frequencies into the word space.
 *
 * @param docVector a document vector mapping each word to its frequency
 */
public void addDoc(Object2IntMap<String> docVector){
	this.numDoc++;
	// Iterate the entry set directly: avoids the second hash lookup per word
	// that keySet() + getInt(key) would perform.
	for(Object2IntMap.Entry<String> entry : docVector.object2IntEntrySet()){
		String vecWord = entry.getKey();
		// if the word was seen before we add the current frequency
		this.wordSpace.put(vecWord, entry.getIntValue() + this.wordSpace.getInt(vecWord));
	}

}
 
Example 4
Source File: NoDictionarySingleColumnGroupKeyGenerator.java    From incubator-pinot with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the group id for the given string value, minting a fresh id when the
 * value has not been seen before and the global upper bound still allows it.
 *
 * @param value the raw column value
 * @return the group id, or INVALID_ID if the value is new but no more ids may be assigned
 */
@SuppressWarnings("unchecked")
private int getKeyForValue(String value) {
  Object2IntMap<String> valueToGroupId = (Object2IntMap<String>) _groupKeyMap;
  int groupId = valueToGroupId.getInt(value);
  if (groupId != INVALID_ID) {
    return groupId;
  }
  // Unseen value: assign a new id only while we are under the global bound;
  // otherwise leave groupId as INVALID_ID.
  if (_numGroups < _globalGroupIdUpperBound) {
    groupId = _numGroups;
    valueToGroupId.put(value, _numGroups++);
  }
  return groupId;
}
 
Example 5
Source File: LanguageDetector.java    From jstarcraft-nlp with Apache License 2.0 4 votes vote down vote up
/**
 * Detects the languages of the given text.
 *
 * @param text    the text to classify
 * @param options per-language flags: {@code true} marks a language as allowed
 *                (whitelist), {@code false} marks it as excluded (blacklist)
 * @return detected languages ordered by the set's comparator; empty when the
 *         text is shorter than the minimum or no script can be matched
 */
public SortedSet<DetectionLanguage> detectLanguages(String text, Object2BooleanMap<String> options) {
    SortedSet<DetectionLanguage> locales = new TreeSet<>();

    // Minimum-length limit: texts shorter than `minimum` cannot be classified.
    int size = text.length();
    if (size < minimum) {
        return locales;
    }
    // Maximum-length limit: truncate overly long texts to `maximum` characters.
    if (size > maximum) {
        text = text.substring(0, maximum);
        size = maximum;
    }

    // Split the options into a whitelist ("writes") and a blacklist ("blacks").
    // When options is empty both share Collections.EMPTY_SET, which is safe
    // here because nothing is added to either set in that case.
    Set<String> writes = options.size() == 0 ? Collections.EMPTY_SET : new HashSet<>();
    Set<String> blacks = options.size() == 0 ? Collections.EMPTY_SET : new HashSet<>();
    for (Object2BooleanMap.Entry<String> option : options.object2BooleanEntrySet()) {
        if (option.getBooleanValue()) {
            writes.add(option.getKey());
        } else {
            blacks.add(option.getKey());
        }
    }

    /*
     * Get the script which characters occur the most in `value`.
     */
    int count = -1;
    String script = null;
    for (DetectionPattern regulation : patterns.values()) {
        Pattern pattern = regulation.getPattern();
        Matcher matcher = pattern.matcher(text);
        int match = 0;
        while (matcher.find()) {
            match++;
        }
        // Keep the script with the highest number of character matches.
        if (match > count) {
            count = match;
            script = regulation.getName();
        }
    }
    if (script == null || count <= 0) {
        return locales;
    }

    /* One languages exists for the most-used script. */
    Set<DetectionTrie> dictionaries = tires.get(script);
    if (dictionaries == null) {
        /*
         * If no matches occured, such as a digit only string, or because the language is ignored, exit with `und`.
         */
        if (!checkLanguage(script, writes, blacks)) {
            return locales;
        }
        // Script maps one-to-one to a language: report it with full confidence.
        locales.add(new DetectionLanguage(Locale.forLanguageTag(script), 1D));
        return locales;
    }

    /*
     * Get all distances for a given script, and normalize the distance values.
     */
    // Pad with spaces on both sides for N-Gram processing.
    text = StringUtility.SPACE + REPLACE.matcher(text).replaceAll(StringUtility.SPACE).toLowerCase() + StringUtility.SPACE;
    CharacterNgram ngram = new CharacterNgram(3, text);
    // Count trigram occurrences; getInt returns the map's default for unseen
    // keys (presumably 0 for a fresh Object2IntOpenHashMap).
    Object2IntMap<CharSequence> tuples = new Object2IntOpenHashMap<>();
    for (CharSequence character : ngram) {
        count = tuples.getInt(character);
        tuples.put(character, count + 1);
    }
    // Score every candidate language of this script against the trigram counts.
    for (DetectionTrie dictionary : dictionaries) {
        String language = dictionary.getName();
        if (checkLanguage(language, writes, blacks)) {
            double score = getScore(tuples, dictionary.getTrie());
            DetectionLanguage locale = new DetectionLanguage(Locale.forLanguageTag(language), score);
            locales.add(locale);
        }
    }
    if (!locales.isEmpty()) {
        normalizeScores(text, locales);
    }
    return locales;
}
 
Example 6
Source File: WikipediaEdges.java    From tagme with Apache License 2.0 4 votes vote down vote up
/**
 * Parses the Wikipedia pagelinks dump and writes the article-to-article edge
 * list to the given file, resolving redirects and filtering out
 * disambiguation, list, and ignored pages. The edges are externally sorted
 * (unique, numeric, by both columns) into the output file.
 */
@Override
protected void parseFile(File file) throws IOException
{

	final Int2IntMap redirects = DatasetLoader.get(new RedirectMap(lang));
	final IntSet disambiguations = DatasetLoader.get(new DisambiguationWIDs(lang));
	final IntSet listpages = DatasetLoader.get(new ListPageWIDs(lang));
	final IntSet ignores = DatasetLoader.get(new IgnoreWIDs(lang));
	final IntSet valids = new AllWIDs(lang).getDataset();//DatasetLoader.get(new AllWIDs(lang));
	valids.removeAll(redirects.keySet());
	//valids.removeAll(disambiguations);
	//valids.removeAll(listpages);
	valids.removeAll(ignores);
	final Object2IntMap<String> titles = DatasetLoader.get(new TitlesToWIDMap(lang));


	File tmp = Dataset.createTmpFile();
	final BufferedWriter out = new BufferedWriter(new FileWriter(tmp));
	SQLWikiParser parser = new 	SQLWikiParser(log) {
		@Override
		public boolean compute(ArrayList<String> values) throws IOException
		{
			// Source page: follow one level of redirect indirection.
			int idFrom = Integer.parseInt(values.get(SQLWikiParser.PAGELINKS_ID_FROM));
			if (redirects.containsKey(idFrom)) idFrom = redirects.get(idFrom);
			
			int ns = Integer.parseInt(values.get(SQLWikiParser.PAGELINKS_NS));

			
			if (ns == SQLWikiParser.NS_ARTICLE && !redirects.containsKey(idFrom) && !ignores.contains(idFrom) &&
					// This is necessary because some pages that are actually lists end up,
					// in English, among the disambiguation pages (because of the
					// All_set_index_articles category).
					(listpages.contains(idFrom) || !disambiguations.contains(idFrom))
					//!listpages.contains(idFrom) && !disambiguations.contains(idFrom)
					&& valids.contains(idFrom)
			
			/**/ )
			{

				String titleTo = Dataset.cleanPageName(values.get(SQLWikiParser.PAGELINKS_TITLE_TO));

				// Target page: resolve title to WID, then follow redirects.
				// Negative idTo means the title could not be resolved.
				int idTo = titles.getInt(titleTo);
				
				if (redirects.containsKey(idTo)) idTo = redirects.get(idTo);
				if (idTo >= 0 && !ignores.contains(idTo) && (listpages.contains(idFrom) || !disambiguations.contains(idFrom)) && valids.contains(idTo))
				{
					// Emit one "from<SEP>to" edge per accepted link.
					out.append(Integer.toString(idFrom));
					out.append(SEP_CHAR);
					out.append(Integer.toString(idTo));
					out.append('\n');
					return true;
				}
			}
			return false;
		}
	};

	File input = WikipediaFiles.PAGE_LINKS.getSourceFile(lang);
	parser.compute(input);
	out.close();

	log.info("Now sorting edges...");

	// External sort: unique numeric edges ordered by (from, to).
	ExternalSort sorter = new ExternalSort();
	sorter.setUniq(true);
	sorter.setNumeric(true);
	sorter.setColumns(new int[]{0,1});
	sorter.setInFile(tmp.getAbsolutePath());
	sorter.setOutFile(file.getAbsolutePath());
	sorter.run();

	tmp.delete();

	log.info("Sorted. Done.");

}
 
Example 7
Source File: PTCM.java    From AffectiveTweets with GNU General Public License v3.0 2 votes vote down vote up
/**
 * Processes a batch of instances. The first batch builds labelled training
 * instances by partitioning each valid word's tweet vectors and averaging the
 * vectors within each partition; subsequent batches are mapped into the
 * original feature space via mapTargetInstance().
 *
 * @param instances the input instances
 * @return the processed instances in the output format
 * @throws Exception if processing fails
 */
@Override
protected Instances process(Instances instances) throws Exception {



	Instances result;


	// The first batch creates the labelled data		
	if(!this.isFirstBatchDone()){
		result = getOutputFormat();

		for(String word:this.wordInfo.keySet()){
			// get the word vector
			WordRep wordRep=this.wordInfo.get(word);

			// We just consider valid words (seen in at least minInstDocs documents)
			if(wordRep.numDoc>=this.minInstDocs){

				// a list of lists of tweet vectors
				ObjectList<ObjectList<Object2IntMap<String>>> partitions=wordRep.partition(this.getPartNumber());

				// traverse the partitions
				for(ObjectList<Object2IntMap<String>> tweetPartition:partitions){
					// create one instance per partition	
					double[] values = new double[result.numAttributes()];

					// average the vectors of the tweets in the partition
					// traverse each feature space in the partition
					for(Object2IntMap<String> wordSpace:tweetPartition){

						for(String innerWord:wordSpace.keySet()){
							// only include valid words
							if(this.m_Dictionary.containsKey(innerWord)){
								int attIndex=this.m_Dictionary.getInt(innerWord);
								// we normalize the value by the number of documents
								values[attIndex]+=((double)wordSpace.getInt(innerWord))/tweetPartition.size();					
							}
						}
					}


					// The class value is the word's lexicon polarity:
					// 0 = negative, 1 = positive, missing = neutral/unknown.
					String wordPol=this.lex.getNomDict().get(word).get(this.polarityAttName);
					if(wordPol.equals(this.polarityAttNegValName))
						values[result.numAttributes()-1]=0;
					else if(wordPol.equals(this.polarityAttPosValName))
						values[result.numAttributes()-1]=1;
					else
						values[result.numAttributes()-1]= Utils.missingValue();					



					Instance inst=new SparseInstance(1, values);


					inst.setDataset(result);

					result.add(inst);



				}
			}
		}
	}

	// Second batch maps tweets into the original feature space
	else{
		result=this.mapTargetInstance(instances);

	}

	return result;

}
 
Example 8
Source File: DistantSupervisionSyntheticFilter.java    From AffectiveTweets with GNU General Public License v3.0 2 votes vote down vote up
/**
 * Maps tweets from the second batch into instances that are compatible with
 * the ones generated in the first batch.
 *
 * @param inp input Instances
 * @return converted Instances in the output feature space
 */
public Instances mapTargetInstance(Instances inp){

	// Creates instances with the same format
	Instances result=getOutputFormat();


	Attribute contentAtt=inp.attribute(this.m_textIndex.getIndex());


	for(Instance inst:inp){
		String content=inst.stringValue(contentAtt);

		// tokenizes the content 
		List<String> tokens = affective.core.Utils.tokenize(content, this.toLowerCase, this.standarizeUrlsUsers, this.reduceRepeatedLetters, this.m_tokenizer,this.m_stemmer,this.m_stopwordsHandler);

		// (removed an unused ObjectOpenHashSet of distinct terms that was
		// built here but never read)

		Object2IntMap<String> docVec=this.calculateDocVec(tokens);

		double[] values = new double[result.numAttributes()];

		// carry the class label over from the input instance
		values[result.classIndex()]= inst.classValue();

		for(String att:docVec.keySet()){

			if(this.m_Dictionary.containsKey(att)){
				int attIndex=this.m_Dictionary.getInt(att);
				// we normalise the value by the number of documents
				values[attIndex]=docVec.getInt(att);					
			}

		}


		Instance outInst=new SparseInstance(1, values);

		// BUG FIX: attach the dataset to the newly created instance (outInst),
		// not to the input instance, before adding it to the result.
		outInst.setDataset(result);

		result.add(outInst);

	}

	return result;

}
 
Example 9
Source File: TweetToSparseFeatureVector.java    From AffectiveTweets with GNU General Public License v3.0 2 votes vote down vote up
/**
 * Processes a batch of instances: for each tweet vector in procTweets, copies
 * the original attribute values and appends the word frequencies as sparse
 * attribute values.
 *
 * @param instances the input instances
 * @return the instances extended with sparse word-frequency features
 * @throws Exception if processing fails
 */
@Override
protected Instances process(Instances instances) throws Exception {



	Instances result = getOutputFormat();

	// if we are in the testing data we calculate the word vectors again
	if (this.isFirstBatchDone()) {
		this.tweetsToVectors(instances);
	}


	// procTweets is assumed to be aligned one-to-one with `instances`
	// (i is the shared index).
	int i = 0;
	for (Object2IntMap<String> vec : this.procTweets) {
		double[] values = new double[result.numAttributes()];

		// copy previous attributes values
		for (int n = 0; n < instances.numAttributes(); n++)
			values[n] = instances.instance(i).value(n);

		// add words using the frequency as attribute value
		for (String innerAtt : vec.keySet()) {
			// we only add the value of valid attributes
			if (result.attribute(innerAtt) != null){
				int attIndex=result.attribute(innerAtt).index();					
				values[attIndex]=(double)vec.getInt(innerAtt);

			}


		}


		Instance inst=new SparseInstance(1, values);


		inst.setDataset(result);
		// copy possible strings, relational values...
		copyValues(inst, false, instances, result);

		result.add(inst);
		i++;

	}

	return result;
}
 
Example 10
Source File: AlleleLikelihoods.java    From gatk with BSD 3-Clause "New" or "Revised" License 2 votes vote down vote up
/**
 * Looks up the position of a piece of evidence inside the per-sample
 * evidence-likelihood sub collection.
 *
 * @param sampleIndex the sample index.
 * @param evidence the query evidence.
 * @return -1 if there is no such evidence in that sample, 0 or greater otherwise.
 */
@VisibleForTesting
int evidenceIndex(final int sampleIndex, final EVIDENCE evidence) {
    return evidenceIndexBySampleIndex(sampleIndex).getInt(evidence);
}