#!/usr/bin/env python
"""Generate word embeddings for use when initialising the base model."""
import inspect
import logging
from pathlib import Path

import plac
from gensim.models import Phrases, Word2Vec
from gensim.models.word2vec import LineSentence

# gensim reports training progress through the logging module; INFO makes
# that progress visible on the console.
logging.basicConfig(
    format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
)


def _dimension_kwarg() -> str:
    """Return the Word2Vec keyword that sets the embedding dimensionality.

    gensim 4.0 renamed ``size`` to ``vector_size`` and rejects the old name,
    so the keyword is chosen from the installed constructor's signature
    instead of being hard-coded for one major version.
    """
    parameters = inspect.signature(Word2Vec.__init__).parameters
    return "vector_size" if "vector_size" in parameters else "size"


def compute_vectors(
    input_path: Path,
    output_path: Path,
    *,
    vector_size: int = 150,
    window: int = 5,
    min_count: int = 5,
    workers: int = 4,
) -> None:
    """Build word embeddings using gensim Word2Vec.

    Reads a file containing a single sentence per line, learns bigram
    phrases from it, trains Word2Vec on the phrase-merged sentences and
    writes the computed vectors in text (non-binary) word2vec format.

    Args:
        input_path: Corpus file with one sentence per line.
        output_path: File the vectors are written to.
        vector_size: Dimensionality of the trained embeddings.
        window: Word2Vec ``window`` setting.
        min_count: Word2Vec ``min_count`` setting.
        workers: Word2Vec ``workers`` setting.
    """
    print(f"Processing {input_path}")

    # gensim documents filename arguments as strings, so hand it plain
    # strings rather than relying on Path support in its file opener.
    sentences = LineSentence(str(input_path))

    # First pass over the corpus: collect the statistics used to merge
    # frequently co-occurring word pairs into single tokens.
    bigram_transformer = Phrases(sentences)

    model = Word2Vec(
        bigram_transformer[sentences],
        window=window,
        min_count=min_count,
        workers=workers,
        **{_dimension_kwarg(): vector_size},
    )

    print(f"Saving vectors to {output_path}")
    model.wv.save_word2vec_format(str(output_path), binary=False)


@plac.annotations(
    sent_loc=("Location of input sentences file", "positional", None, Path),
    output_dir=("Location of output vector file", "positional", None, Path),
)
def main(sent_loc: Path, output_dir: Path):
    """Train word vectors on a sentences file and save them as text.

    Despite its name, ``output_dir`` is the path of the output vector file;
    the name is kept so the command-line interface stays unchanged.
    """
    compute_vectors(sent_loc, output_dir)


if __name__ == "__main__":
    plac.call(main)