Java Code Examples for weka.core.Instance#replaceMissingValues()

The following examples show how to use weka.core.Instance#replaceMissingValues(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Model.java    From AIDR with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Builds a sparse instance for the given word set, bound to
 * {@code attributeSpecification}.
 *
 * <p>Every word for which {@code attributeSpecification} has an attribute of
 * the same name is set to 1; words without a matching attribute are skipped.
 * Finally {@code replaceMissingValues(missingVal)} is applied to the instance.
 *
 * @param words the words to encode
 * @return the newly created instance
 */
Instance wordsToInstance(WordSet words) {
    final int attributeCount = attributeSpecification.numAttributes();
    Instance result = new SparseInstance(attributeCount);
    result.setDataset(attributeSpecification);

    // Flag every word that has an attribute of the same name.
    for (String token : words.getWords()) {
        Attribute match = attributeSpecification.attribute(token);
        if (match == null) {
            // No attribute with this name: nothing to set.
            continue;
        }
        result.setValue(match, 1);
    }

    result.replaceMissingValues(missingVal);
    return result;
}
 
Example 2
Source File: NNge.java    From tsml with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Updates the classifier with one new instance.
 *
 * <p>An instance whose class is missing is ignored. Otherwise its missing
 * values are replaced from {@code m_MissingVector}, it is added to
 * {@code m_Train}, the min/max and mutual-information statistics are
 * refreshed, and the nearest exemplar is either adjusted and generalised or,
 * if there is none, a new exemplar is created from the instance.
 *
 * @param instance the new instance
 * @throws Exception if the update fails
 */
private void update(Instance instance) throws Exception {

  /* Skip instances whose class value is missing */
  if (instance.classIsMissing()) {
    return;
  }

  instance.replaceMissingValues(m_MissingVector);
  m_Train.add(instance);

  /* Refresh the minimum and maximum of all the attributes */
  updateMinMax(instance);

  /* Refresh the mutual information data */
  updateMI(instance);

  /* Look up the nearest exemplar */
  Exemplar closest = nearestExemplar(instance);

  if (closest != null) {
    /* Adjust */
    adjust(instance, closest);

    /* Generalise */
    generalise(instance);
    return;
  }

  /* No nearest exemplar: start a new one from this instance */
  Exemplar seed = new Exemplar(this, m_Train, 10, instance.classValue());
  seed.generalise(instance);
  initWeight(seed);
  addExemplar(seed);
}
 
Example 3
Source File: NutchOnlineClassifier.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Converts an {@link AnthURL} into an {@link Instance} which can be handled
 * by the {@link Classifier}.
 *
 * <p>The instance is created with {@code dimension} attributes, its missing
 * values are replaced from {@code replaceMissingValues}, and it is attached
 * to {@code instances}. The class, parent/sibling flags and domain are then
 * set, followed by one hashed attribute (value 1) per distinct token of the
 * URI path, query and fragment.
 *
 * @param url
 *            the {@link AnthURL} which should be transformed/converted.
 * @return the resulting {@link Instance}, or {@code null} if {@code url} is
 *         {@code null}.
 */
private static Instance convert(AnthURL url) {
	if (url == null) {
		System.out.println("Input AnthURL for convertion into instance was null.");
		return null;
	}

	Instance inst = new SparseInstance(dimension);
	inst.replaceMissingValues(replaceMissingValues);

	inst.setDataset(instances);
	inst.setValue(attributesIndex.get("class"), (url.sem ? "sem" : "nonsem"));
	inst.setValue(attributesIndex.get("sempar"), (url.semFather ? 1 : 0));
	inst.setValue(attributesIndex.get("nonsempar"), (url.nonSemFather ? 1 : 0));
	inst.setValue(attributesIndex.get("semsib"), (url.semSibling ? 1 : 0));
	// NOTE(review): "nonsempar" used to be assigned a second time here with
	// the same value; that redundant call was dropped. Possibly a separate
	// non-semantic-sibling attribute was intended instead -- confirm against
	// the attribute definitions.
	inst.setValue(attributesIndex.get("domain"), url.uri.getHost());

	// A set, so that a token occurring in several URI parts is applied once.
	Set<String> tokens = new HashSet<String>();
	tokens.addAll(tokenizer(url.uri.getPath()));
	tokens.addAll(tokenizer(url.uri.getQuery()));
	tokens.addAll(tokenizer(url.uri.getFragment()));
	for (String tok : tokens) {
		// Map the token to its hashed attribute and flag it.
		inst.setValue(attributesIndex.get(getAttributeNameOfHash(getHash(tok, hashTrickSize))), 1);
	}
	return inst;
}
 
Example 4
Source File: NutchOnlineClassifier.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Converts an {@link AnthURL} into an {@link Instance} which can be handled
 * by the {@link Classifier}.
 *
 * <p>The instance is created with {@code dimension} attributes, its missing
 * values are replaced from {@code replaceMissingValues}, and it is attached
 * to {@code instances}. The class, parent/sibling flags and domain are then
 * set, followed by one hashed attribute (value 1) per distinct token of the
 * URI path, query and fragment.
 *
 * @param url
 *            the {@link AnthURL} which should be transformed/converted.
 * @return the resulting {@link Instance}, or {@code null} if {@code url} is
 *         {@code null}.
 */
private static Instance convert(AnthURL url) {
	if (url == null) {
		System.out.println("Input AnthURL for convertion into instance was null.");
		return null;
	}

	Instance inst = new SparseInstance(dimension);
	inst.replaceMissingValues(replaceMissingValues);

	inst.setDataset(instances);
	inst.setValue(attributesIndex.get("class"), (url.sem ? "sem" : "nonsem"));
	inst.setValue(attributesIndex.get("sempar"), (url.semFather ? 1 : 0));
	inst.setValue(attributesIndex.get("nonsempar"), (url.nonSemFather ? 1 : 0));
	inst.setValue(attributesIndex.get("semsib"), (url.semSibling ? 1 : 0));
	// NOTE(review): "nonsempar" used to be assigned a second time here with
	// the same value; that redundant call was dropped. Possibly a separate
	// non-semantic-sibling attribute was intended instead -- confirm against
	// the attribute definitions.
	inst.setValue(attributesIndex.get("domain"), url.uri.getHost());

	// A set, so that a token occurring in several URI parts is applied once.
	Set<String> tokens = new HashSet<String>();
	tokens.addAll(tokenizer(url.uri.getPath()));
	tokens.addAll(tokenizer(url.uri.getQuery()));
	tokens.addAll(tokenizer(url.uri.getFragment()));
	for (String tok : tokens) {
		// Map the token to its hashed attribute and flag it.
		inst.setValue(attributesIndex.get(getAttributeNameOfHash(getHash(tok, hashTrickSize))), 1);
	}
	return inst;
}
 
Example 5
Source File: ReduceDimensionFilter.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the next instance based on the configuration of this class.
 *
 * <p>Reads one instance from the input stream and re-encodes it into a new
 * sparse instance of {@code hashSize + notHashableAttributes.size()}
 * attributes: the class value and the not-hashable attributes are copied by
 * name, every other attribute with a value greater than 0 sets its hashed
 * attribute to 1. Note that the instance read from the stream is modified in
 * place for ignored and binarised attributes.
 *
 * @return the re-encoded instance
 */
public Instance nextInstance() {
	Instance source = this.inputStream.nextInstance();

	int targetWidth = hashSize + notHashableAttributes.size();
	Instance target = new SparseInstance(targetWidth);
	target.setDataset(newInstances);
	target.replaceMissingValues(replacementArray);
	if (newInstances.size() > 0) {
		newInstances.remove(0);
	}

	final int attributeCount = source.numAttributes();
	for (int idx = 0; idx < attributeCount; idx++) {
		if (idx == source.classIndex()) {
			// The class attribute is carried over under its own name.
			target.setValue(
					attributesIndex.get(source.classAttribute().name()),
					source.classValue());
			continue;
		}

		// Apply the configured manipulations to the source value first.
		if (ignoreAttributes.contains(idx)) {
			source.setValue(idx, 0);
		}
		if (makeBinaryAttributes.contains(idx) && source.value(idx) > 0) {
			source.setValue(idx, 1);
		}

		// Then decide how the (possibly manipulated) value is transferred.
		if (notHashableAttributes.contains(idx)) {
			target.setValue(
					attributesIndex.get(source.attribute(idx).name()),
					source.value(idx));
		} else if (source.value(idx) > 0) {
			// Hash the attribute name and set the matching attribute of
			// the new vector to 1.
			target.setValue(
					attributesIndex.get(getAttributeNameOfHash(
							getHash(source.attribute(idx).name(), hashSize))),
					1);
		}
	}
	return target;
}
 
Example 6
Source File: AnthOnlineClassifier.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Converts an {@link AnthURL} into an {@link Instance} which can be handled
 * by the {@link Classifier}.
 *
 * <p>The instance is created with {@code dimension} attributes, its missing
 * values are replaced from {@code replaceMissingValues}, and it is attached
 * to {@code instances}. The class, parent/sibling flags and domain are then
 * set, followed by one hashed attribute (value 1) per distinct token of the
 * URI path, query and fragment.
 *
 * @param url
 *            the {@link AnthURL} which should be transformed/converted.
 * @return the resulting {@link Instance}, or {@code null} if {@code url} is
 *         {@code null} or a {@link NullPointerException} occurred during
 *         the conversion.
 */
private Instance convert(AnthURL url) {
	if (url == null) {
		System.out
				.println("Input AnthURL for convertion into instance was null.");
		return null;
	}

	try {
		Instance inst = new SparseInstance(dimension);
		inst.replaceMissingValues(replaceMissingValues);

		inst.setDataset(instances);
		inst.setValue(attributesIndex.get("class"), (url.sem ? "sem"
				: "nonsem"));
		inst.setValue(attributesIndex.get("sempar"), (url.semFather ? 1
				: 0));
		inst.setValue(attributesIndex.get("nonsempar"),
				(url.nonSemFather ? 1 : 0));
		inst.setValue(attributesIndex.get("semsib"),
				(url.semSibling ? 1 : 0));
		// NOTE(review): "nonsempar" used to be assigned a second time here
		// with the same value; that redundant call was dropped. Possibly a
		// separate non-semantic-sibling attribute was intended instead --
		// confirm against the attribute definitions.
		inst.setValue(attributesIndex.get("domain"), url.uri.getHost());

		// A set, so that a token occurring in several URI parts is applied
		// once.
		Set<String> tokens = new HashSet<String>();
		tokens.addAll(tokenizer(url.uri.getPath()));
		tokens.addAll(tokenizer(url.uri.getQuery()));
		tokens.addAll(tokenizer(url.uri.getFragment()));
		for (String tok : tokens) {
			// Map the token to its hashed attribute and flag it.
			inst.setValue(attributesIndex
					.get(getAttributeNameOfHash(getHash(tok,
							hashTrickSize))), 1);
		}
		return inst;
	} catch (NullPointerException npe) {
		// Kept as best-effort handling so callers still receive null
		// instead of an exception. url itself cannot be null here (checked
		// above), so only url.uri needs guarding for the message.
		System.out
				.println("Could not convert AnthURL into Instance for classification of URL: "
						+ (url.uri != null ? url.uri.toString()
								: "URI null"));
		return null;
	}
}
 
Example 7
Source File: DataStore.java    From AIDR with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Creates a dataset with the structure of {@code headerSet} from the given
 * documents and their labels.
 *
 * <p>Each document becomes one dense instance: every word that has an
 * attribute of the same name in the dataset is set to 1, the class attribute
 * (the last attribute of the header) is set to the document's label, and
 * remaining missing values are replaced by 0. Documents whose label is not
 * a value of the class attribute are logged and skipped.
 *
 * @param headerSet   the header describing attributes and class values
 * @param wordVectors the words of each document
 * @param labels      the class label of each document, parallel to
 *                    {@code wordVectors}
 * @return the populated dataset
 * @throws IllegalArgumentException if {@code wordVectors} and
 *                                  {@code labels} differ in size
 * @throws Exception declared for backward compatibility with existing
 *                   callers
 */
static Instances createFormattedInstances(Instances headerSet,
		ArrayList<String[]> wordVectors, ArrayList<String> labels)
				throws Exception {

	if (wordVectors.size() != labels.size()) {
		throw new IllegalArgumentException("Number of word vectors ("
				+ wordVectors.size() + ") does not match number of labels ("
				+ labels.size() + ")");
	}

	// Create the dataset
	Instances instances = new Instances(headerSet, wordVectors.size());
	// Zero-initialised by Java, so every missing value is replaced by 0.
	double[] missingVal = new double[headerSet.numAttributes()];

	// Set class index
	instances.setClassIndex(headerSet.numAttributes() - 1);
	Attribute classAttribute = instances.classAttribute();

	// Get valid class labels
	HashSet<String> classValues = new HashSet<String>();
	Enumeration<?> classEnum = classAttribute.enumerateValues();
	while (classEnum.hasMoreElements()) {
		classValues.add((String) classEnum.nextElement());
	}

	// Add each document as an instance
	for (int i = 0; i < wordVectors.size(); i++) {

		if (!classValues.contains(labels.get(i))) {
			/*
			 * TODO: Handle unseen labels in a better way, as this will
			 * over-estimate classification performance. Adding new values
			 * to class attributes requires recreation of the header and
			 * copying of all data to a new Instances. See:
			 * http://comments.gmane.org/gmane.comp.ai.weka/7806
			 */
			logger.error("New class label found in evaluation set. Discarding value.");
			continue;
		}

		Instance item = new DenseInstance(instances.numAttributes());
		item.setDataset(instances);
		// Words: only those that have an attribute of the same name count.
		for (String word : wordVectors.get(i)) {
			Attribute attribute = instances.attribute(word);
			if (attribute != null) {
				item.setValue(attribute, 1);
			}
		}

		item.setValue(classAttribute, labels.get(i));
		item.replaceMissingValues(missingVal);
		instances.add(item);
	}

	return instances;
}