Java Code Examples for org.apache.lucene.store.FSDirectory#createOutput()

The following examples show how to use org.apache.lucene.store.FSDirectory#createOutput(). You can go to the original project or source file by following the links above each example.
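
Before the project examples below, here is a minimal sketch of the call itself. It assumes a recent Lucene release, where FSDirectory.open takes a java.nio.file.Path and createOutput takes a file name plus an IOContext; the directory and file names are purely illustrative.

import java.nio.file.Paths;

import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;

public class CreateOutputSketch {
    public static void main(String[] args) throws Exception {
        // Open (or create) a directory on disk; "index" is an illustrative path.
        try (FSDirectory dir = FSDirectory.open(Paths.get("index"))) {
            // createOutput creates a new file in the directory; it fails if the file already exists.
            try (IndexOutput out = dir.createOutput("example.bin", IOContext.DEFAULT)) {
                out.writeString("header");
                out.writeInt(42);
            }
        }
    }
}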
Example 1
Source File: Text2Bin.java    From lesk-wsd-dsm with GNU General Public License v3.0
/**
 * Converts a WordSpace text matrix to a binary WordSpace file.
 * Text matrix format:
 * - the first line contains the vector dimension N
 * - each subsequent line contains one word vector: word d1 d2 ... dN (tab-separated)
 * Usage: Text2Bin text_matrix_file bin_matrix_file
 * @param args the command line arguments
 */
public static void main(String[] args) {
    try {
        BufferedReader in = new BufferedReader(new FileReader(args[0]));
        File file = new File(args[1]);
        // Open the parent directory and create the output file inside it
        // (older Lucene API: no IOContext argument on createOutput).
        FSDirectory fs = FSDirectory.open(file.getParentFile());
        IndexOutput output = fs.createOutput(file.getName());
        // Header: the "-dimensions" marker followed by the vector dimension N.
        String header = in.readLine();
        output.writeString("-dimensions");
        output.writeInt(Integer.parseInt(header));
        // Each remaining line holds a word followed by N tab-separated floats,
        // each stored as the raw int bits of the float.
        while (in.ready()) {
            String line = in.readLine();
            String[] split = line.split("\t");
            output.writeString(split[0]);
            for (int i = 1; i < split.length; i++) {
                output.writeInt(Float.floatToIntBits(Float.parseFloat(split[i])));
            }
        }
        in.close();
        output.close();
    } catch (IOException ex) {
        Logger.getLogger(Text2Bin.class.getName()).log(Level.SEVERE, null, ex);
    }
}
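
For completeness, a short sketch of how such a file could be read back with the matching IndexInput API. This is not part of the original project; it assumes a recent Lucene (Path-based FSDirectory.open, IOContext on openInput), and the directory and file names are illustrative. It simply reverses the writes above: one header string, the dimension as an int, then for each word a string followed by N ints holding the float bits.

import java.nio.file.Paths;

import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;

public class Bin2TextSketch {
    public static void main(String[] args) throws Exception {
        // "vectors" and "matrix.bin" are illustrative; point them at a file written by Text2Bin.
        try (FSDirectory dir = FSDirectory.open(Paths.get("vectors"));
             IndexInput in = dir.openInput("matrix.bin", IOContext.DEFAULT)) {
            String marker = in.readString();   // the "-dimensions" marker
            int dimensions = in.readInt();     // N
            while (in.getFilePointer() < in.length()) {
                StringBuilder line = new StringBuilder(in.readString());
                for (int i = 0; i < dimensions; i++) {
                    line.append(' ').append(Float.intBitsToFloat(in.readInt()));
                }
                System.out.println(line);
            }
        }
    }
}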
 
Example 2
Source File: VectorStoreWriter.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Outputs a vector store in Lucene binary format.
 *
 * @param vectorFileName The name of the file to write to
 * @param flagConfig Flag configuration, used here for the vector dimension
 * @param objectVectors The vector store to be written to disk
 */
public static void writeVectorsInLuceneFormat(String vectorFileName, FlagConfig flagConfig, VectorStore objectVectors)
    throws IOException {
  VerbatimLogger.info("About to write " + objectVectors.getNumVectors() + " vectors of dimension "
      + flagConfig.dimension() + " to Lucene format file: " + vectorFileName + " ... ");
  File vectorFile = new File(vectorFileName);
  java.nio.file.Files.deleteIfExists(vectorFile.toPath());
  String parentPath = vectorFile.getParent();
  if (parentPath == null) parentPath = "";
  FSDirectory fsDirectory = FSDirectory.open(FileSystems.getDefault().getPath(parentPath));
  IndexOutput outputStream = fsDirectory.createOutput(vectorFile.getName(), IOContext.DEFAULT);
  writeToIndexOutput(objectVectors, flagConfig, outputStream);
  outputStream.close();
  fsDirectory.close();
}
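
A possible call site, sketched from the APIs that appear elsewhere on this page (FlagConfig.getFlagConfig and VectorStoreRAM.initFromFile are used the same way in Example 4). The file names are illustrative, and the imports assume the usual pitt.search.semanticvectors package.

import java.io.IOException;

import pitt.search.semanticvectors.FlagConfig;
import pitt.search.semanticvectors.VectorStoreRAM;
import pitt.search.semanticvectors.VectorStoreWriter;

public class WriteVectorsSketch {
    public static void main(String[] args) throws IOException {
        FlagConfig flagConfig = FlagConfig.getFlagConfig(args);
        // Load an existing store into memory, then write it back out in Lucene binary format.
        VectorStoreRAM objectVectors = new VectorStoreRAM(flagConfig);
        objectVectors.initFromFile("termvectors.bin");   // illustrative input file
        VectorStoreWriter.writeVectorsInLuceneFormat("termvectors_copy.bin", flagConfig, objectVectors);
    }
}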
 
Example 3
Source File: IncrementalDocVectors.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License
private void trainIncrementalDocVectors() throws IOException {
  int numdocs = luceneUtils.getNumDocs();

  // Open file and write headers.
  File vectorFile = new File(
      VectorStoreUtils.getStoreFileName(flagConfig.docvectorsfile(), flagConfig));
  String parentPath = vectorFile.getParent();
  if (parentPath == null) parentPath = "";
  FSDirectory fsDirectory = FSDirectory.open(FileSystems.getDefault().getPath(parentPath));

  java.nio.file.Files.deleteIfExists(vectorFile.toPath());
  
  IndexOutput outputStream = fsDirectory.createOutput(vectorFile.getName(), IOContext.DEFAULT);

  VerbatimLogger.info("Writing vectors incrementally to file " + vectorFile + " ... ");

  // Write header giving the number of dimensions for all vectors.
  outputStream.writeString(VectorStoreWriter.generateHeaderString(flagConfig));

  // Iterate through documents.
  for (int dc = 0; dc < numdocs; dc++) {
    // Output progress counter.
    if ((dc > 0) && ((dc % 10000 == 0) || (dc < 10000 && dc % 1000 == 0))) {
      VerbatimLogger.info("Processed " + dc + " documents ... ");
    }

    // Get filename and path to be used as document vector ID, defaulting to doc number only if
    // docidfield is not populated.
    String docID = luceneUtils.getExternalDocId(dc);

    Vector docVector = VectorFactory.createZeroVector(flagConfig.vectortype(), flagConfig.dimension());

    for (String fieldName : flagConfig.contentsfields()) {
      Terms terms = luceneUtils.getTermVector(dc, fieldName);

      if (terms == null) {
        VerbatimLogger.fine(
            String.format(
                "When building document vectors, no term vector for field: '%s' in document '%s'.",
                fieldName, docID));
        continue;
      }

      TermsEnum termsEnum = terms.iterator();
      BytesRef bytes;
      while ((bytes = termsEnum.next()) != null) {
        Term term = new Term(fieldName, bytes);
        String termString = term.text();
        PostingsEnum docs = termsEnum.postings(null);
        docs.nextDoc();
        int freq = docs.freq();

        try {
          Vector termVector = termVectorData.getVector(termString);
          if (termVector != null && termVector.getDimension() > 0) {
            float localweight = luceneUtils.getLocalTermWeight(freq);
            float globalweight = luceneUtils.getGlobalTermWeight(new Term(fieldName, termString));
            float fieldweight = 1;

            if (flagConfig.fieldweight()) {
              //field weight: 1/sqrt(number of terms in field)
              fieldweight = (float) (1 / Math.sqrt(terms.size()));
            }

            // Add contribution from this term, excluding terms that
            // are not represented in termVectorData.
            docVector.superpose(termVector, localweight * globalweight * fieldweight, null);
          }
        } catch (NullPointerException npe) {
          // Don't normally print anything - too much data!
          logger.finest("term " + termString + " not represented");
        }
      }
    }

    if (docVector.isZeroVector()) {
      logger.warning(String.format(
          "Outputting zero vector for document '%s'. This probably means that none of " +
              "the -contentsfields were populated, or all terms failed the LuceneUtils termsfilter." +
              " You may want to investigate.",
          docID));
    }

    // All fields in document have been processed. Write out documentID and normalized vector.
    docVector.normalize();
    outputStream.writeString(docID);
    docVector.writeToLuceneStream(outputStream);
  } // Finish iterating through documents.

  VerbatimLogger.info("Finished writing vectors.\n");
  outputStream.close();
  fsDirectory.close();
}
 
Example 4
Source File: VectorStoreTruncater.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * @param args
 */
public static void main(String[] args) {
  try {
    FlagConfig flagConfig = FlagConfig.getFlagConfig(args);
    VectorStoreRAM objectVectors = new VectorStoreRAM(flagConfig);
    String[] argsRemaining = flagConfig.remainingArgs;
    String incomingVecs = argsRemaining[0];
    int newDimension = Integer.parseInt(argsRemaining[1]);
    objectVectors.initFromFile(incomingVecs);

    if (newDimension > flagConfig.dimension()) {
      System.out.println("Incoming file has dimensionality of " + flagConfig.dimension());
      System.out.println("New dimensionality must not exceed the incoming vector length, quitting");
      System.exit(0);
    }

    // Name the output file after the incoming file, e.g. vectors.bin -> vectors_128.bin.
    String vectorFileName = incomingVecs.replaceAll("\\.bin", "") + "_" + newDimension + ".bin";
    File vectorFile = new File(vectorFileName);
    String parentPath = vectorFile.getParent();
    if (parentPath == null) parentPath = "";
    FSDirectory fsDirectory = FSDirectory.open(FileSystems.getDefault().getPath(parentPath));
    IndexOutput outputStream = fsDirectory.createOutput(vectorFile.getName(), IOContext.DEFAULT);

    // Write the header with the new (truncated) dimension.
    flagConfig.setDimension(newDimension);
    outputStream.writeString(VectorStoreWriter.generateHeaderString(flagConfig));
    Enumeration<ObjectVector> vecEnum = objectVectors.getAllVectors();

    // Write each vector, truncated to the new dimension.
    while (vecEnum.hasMoreElements()) {
      ObjectVector objectVector = vecEnum.nextElement();
      outputStream.writeString(objectVector.getObject().toString());
      objectVector.getVector().writeToLuceneStream(outputStream, flagConfig.dimension());
    }

    outputStream.close();
    fsDirectory.close();

    VerbatimLogger.info("Wrote " + objectVectors.getNumVectors() + " vectors to file " + vectorFileName);
    VerbatimLogger.info("Finished writing vectors.\n");

  } catch (IOException e) {
    e.printStackTrace();
    System.out.println("Usage: VectorStoreTruncater incomingFile.bin newDimensionality");
  }
}