// Java source code of ConvertVectors

/*
 * Copyright 2014 Radialpoint SafeCare Inc. All Rights Reserved.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package com.radialpoint.word2vec;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * This program takes the vectors produced by the C program word2vec (binary output) and transforms them into a Java
 * binary file to be read by the Vectors class.
 * <p>
 * Expected input layout: one text header line {@code "<words> <size>\n"}, followed, for every word, by the word's
 * bytes terminated by a whitespace character and then {@code size} 4-byte little-endian floats. The C tool also
 * writes a newline after each vector; such newlines are skipped when the next word is read, so files with or without
 * them are accepted.
 */
public class ConvertVectors {

    /**
     * Reads the C vectors file, scales every vector to unit length and writes the result in the Java format.
     *
     * @param args
     *            the input C vectors file, output Java vectors file
     * @throws VectorsException
     *             if an argument is missing, the input file does not exist, or the input file is malformed or
     *             truncated in its textual parts
     * @throws IOException
     *             if reading or writing fails (including an end of file in the middle of a vector)
     */
    public static void main(String[] args) throws VectorsException, IOException {
        if (args.length < 2) {
            throw new VectorsException("Usage: ConvertVectors <input C vectors file> <output Java vectors file>");
        }

        File vectorFile = new File(args[0]);
        File outputFile = new File(args[1]);

        if (!vectorFile.exists()) {
            throw new VectorsException("Vectors file not found");
        }

        float[][] vectors;
        String[] vocabVects;

        // Buffered: the parser below pulls the file one byte / one int at a time.
        DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(vectorFile)));
        try {
            String[] parts = readHeaderLine(in).trim().split("\\s+");
            if (parts.length < 2) {
                throw new VectorsException("Malformed header, expected \"<words> <size>\"");
            }
            int words = parseCount(parts[0], "word count");
            int size = parseCount(parts[1], "vector size");
            vectors = new float[words][];
            vocabVects = new String[words];

            System.out.println("" + words + " words with size " + size + " per vector.");

            // Report progress roughly every 10%; never 0, so the modulo below cannot divide by zero
            // when there are fewer than 10 words.
            int progressStep = Math.max(1, words / 10);
            for (int w = 0; w < words; w++) {
                if (w % progressStep == 0) {
                    System.out.println("Read " + w + " words");
                }
                vocabVects[w] = readWord(in);
                vectors[w] = readUnitVector(in, size);
            }
        } finally {
            in.close();
        }

        Vectors instance = new Vectors(vectors, vocabVects);
        FileOutputStream fos = new FileOutputStream(outputFile);
        try {
            instance.writeTo(fos);
        } finally {
            // Closing is harmless if writeTo already closed the stream.
            fos.close();
        }
    }

    /** Reads bytes up to (and consuming) the first newline and returns them as text, without the newline. */
    private static String readHeaderLine(DataInputStream in) throws VectorsException, IOException {
        StringBuilder sb = new StringBuilder();
        int b = in.read();
        while (b != '\n') {
            if (b < 0) {
                throw new VectorsException("Unexpected end of file while reading the header line");
            }
            sb.append((char) b);
            b = in.read();
        }
        return sb.toString();
    }

    /** Parses a non-negative header number that must fit in an int (it is used as an array length). */
    private static int parseCount(String text, String what) throws VectorsException {
        long value;
        try {
            value = Long.parseLong(text);
        } catch (NumberFormatException e) {
            // Only the String constructor of VectorsException is known here, so the offending text goes in the message.
            throw new VectorsException("Invalid " + what + " in header: " + text);
        }
        if (value < 0 || value > Integer.MAX_VALUE) {
            throw new VectorsException("Unsupported " + what + " in header: " + text);
        }
        return (int) value;
    }

    /**
     * Reads one vocabulary word and consumes the single whitespace byte that terminates it, leaving the stream
     * positioned on the first byte of the word's vector.
     */
    private static String readWord(DataInputStream in) throws VectorsException, IOException {
        int b = in.read();
        // Skip the newline that the C tool writes after the previous word's vector.
        while (b == '\n') {
            b = in.read();
        }
        StringBuilder sb = new StringBuilder();
        // Each byte becomes one char, as before. NOTE(review): multi-byte UTF-8 words are therefore not decoded.
        while (b >= 0 && !Character.isWhitespace((char) b)) {
            sb.append((char) b);
            b = in.read();
        }
        if (b < 0) {
            throw new VectorsException("Unexpected end of file while reading a word");
        }
        return sb.toString();
    }

    /**
     * Reads {@code size} little-endian floats and returns them scaled to unit length. A vector whose length is zero
     * is returned unchanged instead of being turned into NaNs.
     */
    private static float[] readUnitVector(DataInputStream in, int size) throws IOException {
        float[] m = new float[size];
        for (int i = 0; i < size; i++) {
            // readInt() assembles the 4 bytes big-endian (and throws EOFException on a truncated file);
            // the file stores them little-endian, so reverse all four bytes before reinterpreting as a float.
            // NB: this assumes the file was written on a little-endian machine (e.g. amd64).
            m[i] = Float.intBitsToFloat(Integer.reverseBytes(in.readInt()));
        }
        double len = 0;
        for (int i = 0; i < size; i++) {
            len += m[i] * m[i];
        }
        len = (float) Math.sqrt(len);
        if (len > 0) {
            for (int i = 0; i < size; i++) {
                m[i] /= len;
            }
        }
        return m;
    }
}