Java Code Examples for org.nd4j.linalg.ops.transforms.Transforms#cosineSim()

The following examples show how to use org.nd4j.linalg.ops.transforms.Transforms#cosineSim(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: ParagraphVectors.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * This method returns similarity of the document to specific label, based on mean value.
 *
 * @param document list of words making up the document; must be non-empty
 * @param label label whose vector is compared against the inferred document vector
 * @return cosine similarity between the inferred document vector and the label vector
 * @throws IllegalStateException if the document contains no words
 */
public double similarityToLabel(List<VocabWord> document, String label) {
    if (document.isEmpty())
        throw new IllegalStateException("Document has no words inside");

    // Infer a single vector for the whole document rather than averaging word vectors
    INDArray docMean = inferVector(document);

    // NOTE(review): getWordVectorMatrix may return null for an unknown label, which would
    // make cosineSim throw NPE — confirm callers only pass known labels
    INDArray otherVec = getWordVectorMatrix(label);
    return Transforms.cosineSim(docMean, otherVec);
}
 
Example 2
Source File: ParagraphVectors.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Predict several labels based on the document.
 * Computes a similarity wrt the mean of the
 * representation of words in the document.
 *
 * @param document the document; must be non-empty
 * @param limit maximum number of labels to return
 * @return possible labels in descending order
 * @throws IllegalStateException if the document contains no words
 */
public Collection<String> predictSeveral(List<VocabWord> document, int limit) {
    // NOTE: transferred from the original ParagraphVectors DL4j implementation, and yet to be tested
    if (document.isEmpty())
        throw new IllegalStateException("Document has no words inside");

    // Infer a single vector for the whole document rather than averaging word vectors
    INDArray docMean = inferVector(document);
    Counter<String> distances = new Counter<>();

    for (String s : labelsSource.getLabels()) {
        INDArray otherVec = getWordVectorMatrix(s);
        double sim = Transforms.cosineSim(docMean, otherVec);
        // Parameterized logging avoids string concatenation when debug level is disabled
        log.debug("Similarity inside: [{}] -> {}", s, sim);
        distances.incrementCount(s, (float) sim);
    }

    // keySetSorted returns labels ordered by similarity; trim to the requested count
    val keys = distances.keySetSorted();
    return keys.subList(0, Math.min(limit, keys.size()));
}
 
Example 3
Source File: ParagraphVectors.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * This method predicts label of the document.
 * Computes a similarity wrt the mean of the
 * representation of words in the document.
 *
 * @param document the document; must be non-empty
 * @return the label with the highest cosine similarity to the document
 * @throws IllegalStateException if the document contains no words
 */
public String predict(List<VocabWord> document) {
    // NOTE: transferred from the original ParagraphVectors DL4j implementation, and yet to be tested
    if (document.isEmpty())
        throw new IllegalStateException("Document has no words inside");

    // Infer a single vector for the whole document rather than averaging word vectors
    INDArray docMean = inferVector(document);
    Counter<String> distances = new Counter<>();

    for (String s : labelsSource.getLabels()) {
        INDArray otherVec = getWordVectorMatrix(s);
        double sim = Transforms.cosineSim(docMean, otherVec);
        distances.incrementCount(s, (float) sim);
    }

    // Label with the maximal similarity wins
    return distances.argMax();
}
 
Example 4
Source File: FlatModelUtils.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * This method does full scan against whole vocabulary, building descending list of similar words.
 *
 * @param words mean vector to compare against every word in the vocabulary
 * @param top number of nearest words to return
 * @return the words nearest the mean of the words
 */
@Override
public Collection<String> wordsNearest(INDArray words, int top) {
    Counter<String> distances = new Counter<>();

    words = adjustRank(words);

    // Hoisted out of the loop: the normalized query vector is loop-invariant
    // (the original recomputed unitVec(words.dup()) for every vocabulary word)
    INDArray normalizedWords = Transforms.unitVec(words.dup());

    for (String s : vocabCache.words()) {
        INDArray otherVec = lookupTable.vector(s);
        // dup() so unitVec does not mutate the stored vocabulary vector
        double sim = Transforms.cosineSim(normalizedWords, Transforms.unitVec(otherVec.dup()));
        distances.incrementCount(s, (float) sim);
    }

    distances.keepTopNElements(top);
    return distances.keySetSorted();
}
 
Example 5
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Adding test for cosine similarity, to track changes in Transforms.cosineSim()
 */
@Test
public void testCosineSim() {
    final double[] first = {1.01, 0.91, 0.81, 0.71};
    final double[] second = {1.01, 0.91, 0.81, 0.71};
    final double[] third = {1.0, 0.9, 0.8, 0.7};

    // Identical arrays must yield similarity of exactly 1.0
    double simFirstSecond = Transforms.cosineSim(Nd4j.create(first), Nd4j.create(second));
    // Nearly-parallel arrays should land very close to 1.0
    double simSecondThird = Transforms.cosineSim(Nd4j.create(second), Nd4j.create(third));

    log.info("Arrays 1/2 cosineSim: " + simFirstSecond);
    log.info("Arrays 2/3 cosineSim: " + simSecondThird);
    log.info("Arrays 1/2 dot: " + Nd4j.getBlasWrapper().dot(Nd4j.create(first), Nd4j.create(second)));
    log.info("Arrays 2/3 dot: " + Nd4j.getBlasWrapper().dot(Nd4j.create(second), Nd4j.create(third)));

    assertEquals(1.0d, simFirstSecond, 0.01d);
    assertEquals(0.99d, simSecondThird, 0.01d);
}
 
Example 6
Source File: OpExecutionerTests.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testCosineSimilarity() {
    // Cosine similarity of a vector with an identical copy must be 1
    float[] values = {1, 2, 3, 4, 5};
    INDArray first = Nd4j.create(values.clone());
    INDArray second = Nd4j.create(values.clone());
    assertEquals(getFailureMessage(), 1, Transforms.cosineSim(first, second), 1e-1);
}
 
Example 7
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Deprecated
private double arraysSimilarity(@NonNull INDArray array1, @NonNull INDArray array2) {
    // Equal arrays are trivially a perfect match
    if (array1.equals(array2))
        return 1.0;

    // Normalize both vectors so their dot product equals cosine similarity
    INDArray unit1 = Transforms.unitVec(array1);
    INDArray unit2 = Transforms.unitVec(array2);

    // Defensive: bail out with -1 if normalization produced no result
    if (unit1 == null || unit2 == null)
        return -1;

    return Transforms.cosineSim(unit1, unit2);
}
 
Example 8
Source File: ParagraphVectors.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * This method returns top N labels nearest to specified features vector.
 *
 * @param labelVector features vector to compare labels against
 * @param topN number of nearest labels to return
 * @return up to topN label names, ordered by descending similarity
 */
public Collection<String> nearestLabels(INDArray labelVector, int topN) {
    // Lazily build labels matrix/list on first use
    if (labelsMatrix == null || labelsList == null || labelsList.isEmpty())
        extractLabels();

    List<BasicModelUtils.WordSimilarity> result = new ArrayList<>();

    // if list still empty - return empty collection
    if (labelsMatrix == null || labelsList == null || labelsList.isEmpty()) {
        log.warn("Labels list is empty!");
        return new ArrayList<>();
    }

    // One-time normalization of the labels matrix.
    // NOTE(review): double-checked locking on normalizedLabels is only safe if the field
    // is declared volatile — confirm its declaration elsewhere in the class.
    // NOTE(review): rows are divided by norm1 (L1 norm) while cosine similarity assumes
    // L2 normalization — confirm this is intentional.
    if (!normalizedLabels) {
        synchronized (this) {
            if (!normalizedLabels) {
                labelsMatrix.diviColumnVector(labelsMatrix.norm1(1));
                normalizedLabels = true;
            }
        }
    }

    // Coarse ranking: dot product of the unit query vector against every label row
    INDArray similarity = Transforms.unitVec(labelVector).mmul(labelsMatrix.transpose());
    // Over-fetch (topN + 20) so entries filtered out below still leave enough candidates
    List<Double> highToLowSimList = getTopN(similarity, topN + 20);

    for (int i = 0; i < highToLowSimList.size(); i++) {
        // Values in highToLowSimList are used as row indices into labelsList
        String word = labelsList.get(highToLowSimList.get(i).intValue()).getLabel();
        // Skip service tokens before computing the exact cosine similarity
        if (word != null && !word.equals("UNK") && !word.equals("STOP")) {
            INDArray otherVec = lookupTable.vector(word);
            double sim = Transforms.cosineSim(labelVector, otherVec);

            result.add(new BasicModelUtils.WordSimilarity(word, sim));
        }
    }

    // Re-rank by exact similarity, then trim to the requested count
    Collections.sort(result, new BasicModelUtils.SimilarityComparator());

    return BasicModelUtils.getLabels(result, topN);
}
 
Example 9
Source File: BasicModelUtils.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the similarity of 2 words. Result value will be in range [-1,1], where -1.0 is exact opposite
 * similarity and 1.0 is total match of two word vectors. However, most of the time you'll see values in
 * range [0,1] — that depends on the training corpus.
 *
 * Returns NaN if any of labels not exists in vocab, or any label is null.
 *
 * @param label1 the first word
 * @param label2 the second word
 * @return a normalized similarity (cosine similarity)
 */
@Override
public double similarity(@NonNull String label1, @NonNull String label2) {
    if (label1 == null || label2 == null) {
        log.debug("LABELS: " + label1 + ": " + (label1 == null ? "null" : EXISTS) + ";" + label2 + " vec2:"
                        + (label2 == null ? "null" : EXISTS));
        return Double.NaN;
    }

    if (!vocabCache.hasToken(label1)) {
        log.debug("Unknown token 1 requested: [{}]", label1);
        return Double.NaN;
    }

    if (!vocabCache.hasToken(label2)) {
        log.debug("Unknown token 2 requested: [{}]", label2);
        return Double.NaN;
    }

    // Equal labels are a perfect match; skip the vector math entirely
    if (label1.equals(label2))
        return 1.0;

    // Fetch the vectors BEFORE calling dup(): in the original, dup() on a null lookup
    // result threw NPE and made the null check below unreachable.
    INDArray vec1 = lookupTable.vector(label1);
    INDArray vec2 = lookupTable.vector(label2);

    if (vec1 == null || vec2 == null) {
        log.debug(label1 + ": " + (vec1 == null ? "null" : EXISTS) + ";" + label2 + " vec2:"
                        + (vec2 == null ? "null" : EXISTS));
        return Double.NaN;
    }

    // dup() so cosineSim cannot mutate the stored vocabulary vectors
    return Transforms.cosineSim(vec1.dup(), vec2.dup());
}
 
Example 10
Source File: NDArrayTestsFortran.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testCosineSim() {
    // Identical vectors: similarity must be exactly 1
    double[] same = {1, 2, 3, 4};
    INDArray left = Nd4j.create(same.clone());
    INDArray right = Nd4j.create(same.clone());
    assertEquals(getFailureMessage(), 1, Transforms.cosineSim(left, right), 1e-1);

    // Distinct but strongly correlated vectors: similarity close to 0.98
    INDArray a = Nd4j.create(new float[] {0.2f, 0.3f, 0.4f, 0.5f});
    INDArray b = Nd4j.create(new float[] {0.6f, 0.7f, 0.8f, 0.9f});
    assertEquals(getFailureMessage(), 0.98, Transforms.cosineSim(a, b), 1e-1);
}
 
Example 11
Source File: NDArrayDistanceTransform.java    From DataVec with Apache License 2.0 5 votes vote down vote up
@Override
public List<Writable> map(List<Writable> writables) {
    // Resolve the two NDArray columns by their configured names
    INDArray first = ((NDArrayWritable) writables.get(inputSchema.getIndexOfColumn(firstCol))).get();
    INDArray second = ((NDArrayWritable) writables.get(inputSchema.getIndexOfColumn(secondCol))).get();

    final double value;
    switch (distance) {
        case COSINE:
            value = Transforms.cosineSim(first, second);
            break;
        case EUCLIDEAN:
            value = Transforms.euclideanDistance(first, second);
            break;
        case MANHATTAN:
            value = Transforms.manhattanDistance(first, second);
            break;
        default:
            throw new UnsupportedOperationException("Unknown or not supported distance metric: " + distance);
    }

    // Output row = input row with the computed distance appended as a new column
    List<Writable> result = new ArrayList<>(writables.size() + 1);
    result.addAll(writables);
    result.add(new DoubleWritable(value));
    return result;
}
 
Example 12
Source File: OpExecutionerTestsC.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testCosineSimilarity() {
    // A vector compared with an identical copy must have similarity 1
    float[] data = {1, 2, 3, 4, 5};
    INDArray a = Nd4j.create(data.clone());
    INDArray b = Nd4j.create(data.clone());
    assertEquals(getFailureMessage(), 1, Transforms.cosineSim(a, b), 1e-1);
}
 
Example 13
Source File: NDArrayDistanceTransform.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Override
public List<Writable> map(List<Writable> writables) {
    // Look up the positions of the two input NDArray columns
    int firstIdx = inputSchema.getIndexOfColumn(firstCol);
    int secondIdx = inputSchema.getIndexOfColumn(secondCol);

    INDArray left = ((NDArrayWritable) writables.get(firstIdx)).get();
    INDArray right = ((NDArrayWritable) writables.get(secondIdx)).get();

    double computed;
    switch (distance) {
        case COSINE:
            computed = Transforms.cosineSim(left, right);
            break;
        case EUCLIDEAN:
            computed = Transforms.euclideanDistance(left, right);
            break;
        case MANHATTAN:
            computed = Transforms.manhattanDistance(left, right);
            break;
        default:
            throw new UnsupportedOperationException("Unknown or not supported distance metric: " + distance);
    }

    // Append the computed distance to a copy of the input row
    List<Writable> out = new ArrayList<>(writables.size() + 1);
    out.addAll(writables);
    out.add(new DoubleWritable(computed));
    return out;
}
 
Example 14
Source File: NDArrayTestsFortran.java    From nd4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testCosineSim() {
    // Self-similarity of a vector must be exactly 1
    double[] base = {1, 2, 3, 4};
    double sim = Transforms.cosineSim(Nd4j.create(base.clone()), Nd4j.create(base.clone()));
    assertEquals(getFailureMessage(), 1, sim, 1e-1);

    // Two different but correlated vectors: expect roughly 0.98
    INDArray small = Nd4j.create(new float[] {0.2f, 0.3f, 0.4f, 0.5f});
    INDArray large = Nd4j.create(new float[] {0.6f, 0.7f, 0.8f, 0.9f});
    assertEquals(getFailureMessage(), 0.98, Transforms.cosineSim(small, large), 1e-1);
}
 
Example 15
Source File: OpExecutionerTestsC.java    From nd4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testCosineSimilarity() {
    // Identical vectors have cosine similarity 1
    float[] content = {1, 2, 3, 4, 5};
    double sim = Transforms.cosineSim(Nd4j.create(content.clone()), Nd4j.create(content.clone()));
    assertEquals(getFailureMessage(), 1, sim, 1e-1);
}
 
Example 16
Source File: CudaReduce3Tests.java    From nd4j with Apache License 2.0 5 votes vote down vote up
/**
 * Norm2 + cuBlas dot call
 *
 * @throws Exception
 */
@Test
public void testPinnedCosineSim() throws Exception {
    // simple way to stop test if we're not on CUDA backend here

    // Build the same fixtures programmatically: first two elements 2.01, rest 1.01,
    // compared against an all-ones vector of the same length
    float[] data1 = new float[15];
    float[] data2 = new float[15];
    for (int i = 0; i < data1.length; i++) {
        data1[i] = i < 2 ? 2.01f : 1.01f;
        data2[i] = 1.00f;
    }

    double similarity = Transforms.cosineSim(Nd4j.create(data1), Nd4j.create(data2));

    System.out.println("Cosine similarity: " + similarity);
    assertEquals(0.95f, similarity, 0.01f);
}
 
Example 17
Source File: HalfOpsTests.java    From nd4j with Apache License 2.0 5 votes vote down vote up
@Ignore
@Test
public void testReduce3_2() throws Exception {
    // Fixture: [2.01, 2.01, 1.01 x 13] vs an all-ones vector of the same length
    float[] left = new float[15];
    float[] right = new float[15];
    for (int i = 0; i < left.length; i++) {
        left[i] = i < 2 ? 2.01f : 1.01f;
        right[i] = 1.00f;
    }

    double similarity = Transforms.cosineSim(Nd4j.create(left), Nd4j.create(right));

    System.out.println("Cosine similarity: " + similarity);
    assertEquals(0.95f, similarity, 0.01f);
}
 
Example 18
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
// Integration test: trains DBOW paragraph vectors on a large corpus, then checks
// vocabulary counts, document similarities, and DM-style inference consistency.
@Test(timeout = 300000)
public void testParagraphVectorsDBOW() throws Exception {
    skipUnlessIntegrationTests();

    // Corpus: one sentence per line
    File file = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(file);

    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // Each document gets a generated label DOC_<n>
    LabelsSource source = new LabelsSource("DOC_");

    ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(5).seed(119).epochs(1)
                    .layerSize(100).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter)
                    .trainWordVectors(true).vocabCache(cache).tokenizerFactory(t).negativeSample(0)
                    .allowParallelTokenization(true).useHierarchicSoftmax(true).sampling(0).workers(4)
                    .usePreciseWeightInit(true).sequenceLearningAlgorithm(new DBOW<VocabWord>()).build();

    vec.fit();

    // Weight matrices must live in regular (detached) memory after training
    assertFalse(((InMemoryLookupTable<VocabWord>)vec.getLookupTable()).getSyn0().isAttached());
    assertFalse(((InMemoryLookupTable<VocabWord>)vec.getLookupTable()).getSyn1().isAttached());

    // Sanity-check word frequencies: common words should appear more than once
    // and with distinct counts
    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");

    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);

    double simDN = vec.similarity("day", "night");
    log.info("day/night similariry: {}", simDN);

    // Document-to-document similarities are logged; most hard assertions were
    // disabled (commented out) apart from the dissimilar-pair bound below
    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + similarity1);
    //        assertTrue(similarity1 > 0.2d);

    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + similarity2);
    //      assertTrue(similarity2 > 0.2d);

    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + similarity3);
    //        assertTrue(similarity3 > 0.6d);

    // Unrelated documents should not appear too similar
    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + similarityX);
    assertTrue(similarityX < 0.5d);


    // testing DM inference now

    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray inferredA1 = vec.inferVector("This is my work");
    INDArray inferredB1 = vec.inferVector("This is my work .");
    INDArray inferredC1 = vec.inferVector("This is my day");
    INDArray inferredD1 = vec.inferVector("This is my night");

    log.info("A: {}", Arrays.toString(inferredA1.data().asFloat()));
    log.info("C: {}", Arrays.toString(inferredC1.data().asFloat()));

    // Different inputs must not infer identical vectors
    assertNotEquals(inferredA1, inferredC1);

    // dup() everywhere so cosineSim cannot mutate the inferred vectors
    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
    double cosAC1 = Transforms.cosineSim(inferredA1.dup(), inferredC1.dup());
    double cosCD1 = Transforms.cosineSim(inferredD1.dup(), inferredC1.dup());

    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
    log.info("Cos A/C: {}", cosAC1);
    log.info("Cos C/D: {}", cosCD1);

}
 
Example 19
Source File: TestNDArrayWritableTransforms.java    From deeplearning4j with Apache License 2.0 3 votes vote down vote up
@Test
public void testNDArrayDistanceTransform() {

    // Schema: one double column followed by two 1x10 NDArray columns
    Schema schema = new Schema.Builder()
                    .addColumnDouble("col0")
                    .addColumnNDArray("col1", new long[] {1, 10})
                    .addColumnNDArray("col2", new long[] {1, 10})
                    .build();

    // The transform appends the cosine distance between col1 and col2 as "dist"
    TransformProcess tp = new TransformProcess.Builder(schema)
                    .ndArrayDistanceTransform("dist", Distance.COSINE, "col1", "col2").build();

    // Final schema must carry the new column at the end
    assertEquals(Arrays.asList("col0", "col1", "col2", "dist"), tp.getFinalSchema().getColumnNames());

    Nd4j.getRandom().setSeed(12345);
    INDArray first = Nd4j.rand(1, 10);
    INDArray second = Nd4j.rand(1, 10);
    double expectedCosine = Transforms.cosineSim(first, second);

    // Execute on dup()s so the expected row can reuse the original arrays
    List<Writable> input = Arrays.<Writable>asList(new DoubleWritable(0), new NDArrayWritable(first.dup()),
                    new NDArrayWritable(second.dup()));
    List<Writable> actual = tp.execute(input);

    List<Writable> expected = Arrays.<Writable>asList(new DoubleWritable(0), new NDArrayWritable(first),
                    new NDArrayWritable(second), new DoubleWritable(expectedCosine));

    assertEquals(expected, actual);
}
 
Example 20
Source File: TestNDArrayWritableTransforms.java    From DataVec with Apache License 2.0 3 votes vote down vote up
@Test
public void testNDArrayDistanceTransform() {

    // Input schema: a double column plus two 1x10 NDArray columns
    Schema inputSchema = new Schema.Builder()
                    .addColumnDouble("col0")
                    .addColumnNDArray("col1", new long[] {1, 10})
                    .addColumnNDArray("col2", new long[] {1, 10})
                    .build();

    // Append cosine distance between the two array columns as a new "dist" column
    TransformProcess process = new TransformProcess.Builder(inputSchema)
                    .ndArrayDistanceTransform("dist", Distance.COSINE, "col1", "col2").build();

    List<String> expectedColumns = Arrays.asList("col0", "col1", "col2", "dist");
    assertEquals(expectedColumns, process.getFinalSchema().getColumnNames());

    // Deterministic random fixtures
    Nd4j.getRandom().setSeed(12345);
    INDArray left = Nd4j.rand(1, 10);
    INDArray right = Nd4j.rand(1, 10);
    double cosine = Transforms.cosineSim(left, right);

    List<Writable> row = Arrays.<Writable>asList(new DoubleWritable(0), new NDArrayWritable(left.dup()),
                    new NDArrayWritable(right.dup()));
    List<Writable> transformed = process.execute(row);

    // Transformed row keeps the originals and appends the distance value
    List<Writable> expected = Arrays.<Writable>asList(new DoubleWritable(0), new NDArrayWritable(left),
                    new NDArrayWritable(right), new DoubleWritable(cosine));

    assertEquals(expected, transformed);
}