Python torchtext.vocab.Vectors() Examples
The following are 30 code examples of torchtext.vocab.Vectors().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module torchtext.vocab, or try the search function.
Example #1
Source File: semantic_similar_data.py From glyce with Apache License 2.0 | 6 votes |
def __init__(self, args):
    """Build fields, datasets, vocabularies, and BucketIterators for the BQ corpus.

    :param args: namespace providing ``data`` (vector cache dir) and ``batch_size``
    """
    self.RAW = data.RawField()
    self.RAW.is_target = False
    # Character-level tokenization (each character becomes a token).
    tokenize = lambda x: list(x)
    self.TEXT = data.Field(batch_first=True, tokenize=tokenize)
    self.LABEL = data.Field(sequential=False, unk_token=None)
    self.train, self.dev, self.test = data.TabularDataset.splits(
        path='/data/nfsdata/nlp/datasets/sentence_pair/bq_corpus_torch10',
        train='BQ_train.json',
        validation='BQ_dev.json',
        test='BQ_test.json',
        format='json',
        fields={"gold_label": ("label", self.LABEL),
                "sentence1": ("q1", self.TEXT),
                "sentence2": ("q2", self.TEXT),
                "ID": ("id", self.RAW)})
    # Vocabulary spans all splits so dev/test tokens get embedding rows too.
    self.TEXT.build_vocab(self.train, self.dev, self.test,
                          vectors=Vectors("BQ300", args.data))
    self.LABEL.build_vocab(self.train)
    # Group examples whose two questions have similar combined lengths.
    sort_key = lambda x: data.interleave_keys(len(x.q1), len(x.q2))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.train_iter = data.BucketIterator(self.train, batch_size=args.batch_size,
                                          device=device, sort_key=sort_key, sort=True)
    self.dev_iter = data.BucketIterator(self.dev, batch_size=args.batch_size,
                                        device=device, sort_key=sort_key, sort=True)
    self.test_iter = data.BucketIterator(self.test, batch_size=args.batch_size,
                                         device=device, sort_key=sort_key, sort=True)
Example #2
Source File: yelp2014.py From hedwig with Apache License 2.0 | 6 votes |
def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/val/test BucketIterators over the dataset splits.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_cache: directory containing the word-vectors file
    :param batch_size: batch size
    :param device: GPU device
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, val, test) BucketIterators
    """
    # Load vectors from file only when the caller did not supply any.
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    train, val, test = cls.splits(path)
    # Build vocab over all splits so every token has an embedding row.
    cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
    return BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #3
Source File: robust04.py From hedwig with Apache License 2.0 | 6 votes |
def iters(cls, path, vectors_name, vectors_cache, topic, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/val/test BucketIterators for a single Robust04 topic.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_cache: directory containing the word-vectors file
    :param topic: topic from which articles should be fetched
    :param batch_size: batch size
    :param device: GPU device
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, val, test) BucketIterators
    """
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    # Per-topic split files; test comes from the core17 collection.
    train_path = os.path.join('TREC', 'robust04_train_%s.tsv' % topic)
    dev_path = os.path.join('TREC', 'robust04_dev_%s.tsv' % topic)
    test_path = os.path.join('TREC', 'core17_10k_%s.tsv' % topic)
    train, val, test = cls.splits(path, train=train_path, validation=dev_path, test=test_path)
    cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
    return BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #4
Source File: word_embedding.py From lightNLP with Apache License 2.0 | 6 votes |
def __init__(self, vocabulary_size, word_embedding_dim, hidden_dim, num_layers, dropout, vector_path=None, non_static=False):
    """Bidirectional LSTM word-embedding lookup layer.

    :param vocabulary_size: number of rows in the embedding table
    :param word_embedding_dim: embedding dimensionality
    :param hidden_dim: total LSTM hidden size (split across the two directions)
    :param num_layers: number of stacked LSTM layers
    :param dropout: inter-layer dropout probability
    :param vector_path: optional path to pre-trained vectors file
    :param non_static: when True, pre-trained embeddings stay trainable
    """
    super(BiLSTMWordEmbeddingLookup, self).__init__()
    self.vocabulary_size = vocabulary_size
    self.num_layers = num_layers
    self.word_embedding_dim = word_embedding_dim
    self.hidden_dim = hidden_dim
    self.output_dim = hidden_dim
    self.word_embeddings = nn.Embedding(self.vocabulary_size, self.word_embedding_dim).to(DEVICE)
    if vector_path:
        logger.info('logging word vectors from {}'.format(vector_path))
        pretrained = Vectors(vector_path).vectors
        # Freeze weights unless fine-tuning was requested via non_static.
        self.word_embeddings = self.word_embeddings.from_pretrained(
            pretrained, freeze=not non_static).to(DEVICE)
    # Each direction gets half of hidden_dim so the concatenated output is hidden_dim.
    self.lstm = nn.LSTM(self.word_embedding_dim, self.hidden_dim // 2,
                        bidirectional=True, num_layers=num_layers,
                        dropout=dropout).to(DEVICE)
    self.hidden = self.init_hidden()
Example #5
Source File: sst.py From hedwig with Apache License 2.0 | 6 votes |
def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/val/test BucketIterators over the dataset splits.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_cache: directory containing the word-vectors file
    :param batch_size: batch size
    :param device: GPU device
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, val, test) BucketIterators
    """
    # Load vectors from file only when the caller did not supply any.
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    train, val, test = cls.splits(path)
    # Build vocab over all splits so every token has an embedding row.
    cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
    return BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #6
Source File: imdb.py From hedwig with Apache License 2.0 | 6 votes |
def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/val/test BucketIterators over the dataset splits.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_cache: directory containing the word-vectors file
    :param batch_size: batch size
    :param device: GPU device
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, val, test) BucketIterators
    """
    # Load vectors from file only when the caller did not supply any.
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    train, val, test = cls.splits(path)
    # Build vocab over all splits so every token has an embedding row.
    cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
    return BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #7
Source File: reuters.py From hedwig with Apache License 2.0 | 6 votes |
def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/val/test BucketIterators over the dataset splits.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_cache: directory containing the word-vectors file
    :param batch_size: batch size
    :param device: GPU device
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, val, test) BucketIterators
    """
    # Load vectors from file only when the caller did not supply any.
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    train, val, test = cls.splits(path)
    # Build vocab over all splits so every token has an embedding row.
    cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
    return BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #8
Source File: aapd.py From hedwig with Apache License 2.0 | 6 votes |
def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/val/test BucketIterators over the dataset splits.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_cache: directory containing the word-vectors file
    :param batch_size: batch size
    :param device: GPU device
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, val, test) BucketIterators
    """
    # Load vectors from file only when the caller did not supply any.
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    train, val, test = cls.splits(path)
    # Build vocab over all splits so every token has an embedding row.
    cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
    return BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #9
Source File: model.py From lightNLP with Apache License 2.0 | 6 votes |
def __init__(self, args):
    """TextCNN classifier: parallel Conv2d filters over (multi-channel) embeddings.

    :param args: namespace providing class_num, filter_num, filter_sizes,
        vocabulary_size, embedding_dim, static, non_static, multichannel,
        vector_path, vectors, and dropout
    """
    super(TextCNN, self).__init__(args)
    self.class_num = args.class_num
    # NOTE(review): original attribute spelling 'chanel_num' kept for compatibility.
    self.chanel_num = 1
    self.filter_num = args.filter_num
    self.filter_sizes = args.filter_sizes
    self.vocabulary_size = args.vocabulary_size
    self.embedding_dimension = args.embedding_dim
    self.embedding = nn.Embedding(self.vocabulary_size, self.embedding_dimension).to(DEVICE)
    if args.static:
        logger.info('logging word vectors from {}'.format(args.vector_path))
        pretrained = Vectors(args.vector_path).vectors
        # Freeze weights unless fine-tuning was requested via non_static.
        self.embedding = self.embedding.from_pretrained(
            pretrained, freeze=not args.non_static).to(DEVICE)
    if args.multichannel:
        # Second embedding channel initialized from args.vectors.
        self.embedding2 = nn.Embedding(self.vocabulary_size, self.embedding_dimension).from_pretrained(args.vectors).to(DEVICE)
        self.chanel_num += 1
    else:
        self.embedding2 = None
    # One Conv2d per filter size; each filter spans the full embedding width.
    self.convs = nn.ModuleList(
        [nn.Conv2d(self.chanel_num, self.filter_num, (size, self.embedding_dimension))
         for size in self.filter_sizes]).to(DEVICE)
    self.dropout = nn.Dropout(args.dropout).to(DEVICE)
    self.fc = nn.Linear(len(self.filter_sizes) * self.filter_num, self.class_num).to(DEVICE)
Example #10
Source File: robust45.py From hedwig with Apache License 2.0 | 6 votes |
def iters(cls, path, vectors_name, vectors_cache, topic, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/val/test BucketIterators for a single Robust45 topic.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_cache: directory containing the word-vectors file
    :param topic: topic from which articles should be fetched
    :param batch_size: batch size
    :param device: GPU device
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, val, test) BucketIterators
    """
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    # Per-topic split files (augmented train); test comes from the core17 collection.
    train_path = os.path.join('TREC', 'robust45_aug_train_%s.tsv' % topic)
    dev_path = os.path.join('TREC', 'robust45_dev_%s.tsv' % topic)
    test_path = os.path.join('TREC', 'core17_10k_%s.tsv' % topic)
    train, val, test = cls.splits(path, train=train_path, validation=dev_path, test=test_path)
    cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
    return BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #11
Source File: test_field.py From deepmatcher with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_extend_vectors_1(self):
    """extend_vectors should append OOV tokens with zero-initialized rows."""
    vectors_cache_dir = '.cache'
    # Start from a clean cache so Vectors re-downloads from the file:// URL.
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
    pathdir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
    filename = 'fasttext_sample.vec'
    file = os.path.join(pathdir, filename)
    url_base = urljoin('file:', pathname2url(file))
    vecs = Vectors(name=filename, cache=vectors_cache_dir, url=url_base)
    self.assertIsInstance(vecs, Vectors)
    vec_data = MatchingField._get_vector_data(vecs, vectors_cache_dir)
    v = MatchingVocab(Counter())
    v.vectors = torch.Tensor(1, vec_data[0].dim)
    v.unk_init = torch.Tensor.zero_
    tokens = {'hello', 'world'}
    v.extend_vectors(tokens, vec_data)
    # Two specials plus the two new tokens.
    self.assertEqual(len(v.itos), 4)
    self.assertEqual(v.vectors.size(), torch.Size([4, 300]))
    # New rows were initialized by unk_init (zeros).
    self.assertEqual(list(v.vectors[2][0:10]), [0.0] * 10)
    self.assertEqual(list(v.vectors[3][0:10]), [0.0] * 10)
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
Example #12
Source File: test_field.py From deepmatcher with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_extend_vocab_1(self):
    """extend_vocab should grow the field vocabulary using dataset tokens and vectors."""
    vectors_cache_dir = '.cache'
    # Start from a clean cache so Vectors re-downloads from the file:// URL.
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
    mf = MatchingField()
    lf = MatchingField(id=True, sequential=False)
    fields = [('id', lf), ('left_a', mf), ('right_a', mf), ('label', lf)]
    col_naming = {'id': 'id', 'label': 'label', 'left': 'left_', 'right': 'right_'}
    pathdir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
    filename = 'fasttext_sample.vec'
    file = os.path.join(pathdir, filename)
    url_base = urljoin('file:', pathname2url(file))
    vecs = Vectors(name=filename, cache=vectors_cache_dir, url=url_base)
    data_path = os.path.join(test_dir_path, 'test_datasets', 'sample_table_small.csv')
    md = MatchingDataset(fields, col_naming, path=data_path)
    mf.build_vocab()
    mf.vocab.vectors = torch.Tensor(len(mf.vocab.itos), 300)
    mf.extend_vocab(md, vectors=vecs)
    self.assertEqual(len(mf.vocab.itos), 6)
    self.assertEqual(mf.vocab.vectors.size(), torch.Size([6, 300]))
Example #13
Source File: sts2014.py From castor with Apache License 2.0 | 6 votes |
def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/val/test BucketIterators over the dataset splits.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_cache: path to the word-vectors cache
    :param batch_size: batch size
    :param device: GPU device
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, val, test) BucketIterators
    """
    # Load vectors from file only when the caller did not supply any.
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    train, val, test = cls.splits(path)
    # Build vocab over all splits so every token has an embedding row.
    cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
    return BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #14
Source File: test_field.py From deepmatcher with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_get_vector_data(self):
    """_get_vector_data should wrap a single Vectors object into a one-element list."""
    vectors_cache_dir = '.cache'
    # Start from a clean cache so Vectors re-downloads from the file:// URL.
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
    pathdir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
    filename = 'fasttext_sample.vec'
    file = os.path.join(pathdir, filename)
    url_base = urljoin('file:', pathname2url(file))
    vecs = Vectors(name=filename, cache=vectors_cache_dir, url=url_base)
    self.assertIsInstance(vecs, Vectors)
    vec_data = MatchingField._get_vector_data(vecs, vectors_cache_dir)
    self.assertEqual(len(vec_data), 1)
    # The sample file holds 100 tokens of 300-dimensional vectors.
    self.assertEqual(vec_data[0].vectors.size(), torch.Size([100, 300]))
    self.assertEqual(vec_data[0].dim, 300)
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
Example #15
Source File: quora.py From castor with Apache License 2.0 | 6 votes |
def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/validation/test BucketIterators for the Quora dataset.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_cache: directory containing the word-vectors file
    :param batch_size: batch size
    :param device: GPU device
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, validation, test) BucketIterators
    """
    # Load vectors from file only when the caller did not supply any.
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    train, validation, test = cls.splits(path)
    # Label vocabulary has no pre-trained vectors attached.
    cls.LABEL_FIELD.build_vocab(train, validation, test)
    cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)
    return BucketIterator.splits(
        (train, validation, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #16
Source File: sick.py From castor with Apache License 2.0 | 6 votes |
def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/val/test BucketIterators over the dataset splits.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_cache: path to the word-vectors cache
    :param batch_size: batch size
    :param device: GPU device
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, val, test) BucketIterators
    """
    # Load vectors from file only when the caller did not supply any.
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    train, val, test = cls.splits(path)
    # Build vocab over all splits so every token has an embedding row.
    cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
    return BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #17
Source File: msrvid.py From castor with Apache License 2.0 | 6 votes |
def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/test BucketIterators (MSRVID has no dev split).

    :param path: directory containing train and test files
    :param vectors_name: name of the word-vectors file
    :param vectors_cache: path to the word-vectors cache
    :param batch_size: batch size
    :param device: GPU device
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, test) BucketIterators
    """
    # Load vectors from file only when the caller did not supply any.
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    train, test = cls.splits(path)
    cls.TEXT_FIELD.build_vocab(train, test, vectors=vectors)
    return BucketIterator.splits(
        (train, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        device=device,
    )
Example #18
Source File: sst.py From castor with Apache License 2.0 | 6 votes |
def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/val/test BucketIterators over the dataset splits.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_cache: directory containing the word-vectors file
    :param batch_size: batch size
    :param device: GPU device
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, val, test) BucketIterators
    """
    # Load vectors from file only when the caller did not supply any.
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    train, val, test = cls.splits(path)
    # Build vocab over all splits so every token has an embedding row.
    cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
    return BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #19
Source File: sst.py From castor with Apache License 2.0 | 6 votes |
def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/val/test BucketIterators over the dataset splits.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_cache: directory containing the word-vectors file
    :param batch_size: batch size
    :param device: GPU device
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, val, test) BucketIterators
    """
    # Load vectors from file only when the caller did not supply any.
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    train, val, test = cls.splits(path)
    # Build vocab over all splits so every token has an embedding row.
    cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
    return BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #20
Source File: tool.py From lightKG with Apache License 2.0 | 5 votes |
def get_vectors(self, path: str):
    """Load pre-trained word vectors from *path*.

    :param path: path to the word-vectors file
    :return: a torchtext ``Vectors`` instance
    """
    logger.info('loading vectors from {}'.format(path))
    vectors = Vectors(path)
    # Fixed typo in original log message ('successed loading vectors').
    logger.info('successfully loaded vectors')
    return vectors
Example #21
Source File: iterator.py From TD-DMN with MIT License | 5 votes |
def get_iters(self, train_batch_size, fold_num, vec_name, vec_cache):
    """Build train/test iterators for one cross-validation fold.

    :param train_batch_size: batch size for the training iterator
    :param fold_num: fold index used to locate ./data/fold_<n>/
    :param vec_name: name of the word-vectors file
    :param vec_cache: directory containing the word-vectors file
    :return: (train_iter, test_iter)
    """
    # Load the per-fold TSV splits.
    train, test = data.TabularDataset.splits(
        path="./data/fold_{}".format(fold_num),
        train="train.tsv",
        test="test.tsv",
        format="tsv",
        fields=[("TEXT", self.text_doc),
                ("ENTITY", self.entity_doc),
                ("LABEL", self.label_doc),
                ("OFFSET", self.offset_doc),
                ("LENGTH", self.length_doc),
                ("WORD_ATTN", self.word_attn_doc),
                ("SENT_ATTN", self.sent_attn_doc),
                ("DOC_ID", self.doc_id)])
    # Load pre-trained vectors, then build vocabularies over both splits.
    vector = Vectors(name=vec_name, cache=vec_cache)
    self.text_doc.build_vocab(train, test, vectors=vector)
    self.entity_doc.build_vocab(train, test)
    self.label_doc.build_vocab(train, test)
    # Test iterator uses a fixed tiny batch size of 2.
    train_iter, test_iter = data.BucketIterator.splits(
        (train, test), sort=False, batch_sizes=(train_batch_size, 2), repeat=True)
    train_iter.shuffle = True
    return train_iter, test_iter
Example #22
Source File: wikiqa.py From castor with Apache License 2.0 | 5 votes |
def iters(cls, path, vectors_name, vectors_dir, batch_size=64, shuffle=True, device=0, pt_file=False, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/validation/test BucketIterators for WikiQA.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_dir: directory containing the word-vectors file
    :param batch_size: batch size
    :param device: GPU device
    :param pt_file: when True, load a cached embedding file from disk
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, validation, test) BucketIterators
    """
    train, validation, test = cls.splits(path)
    if not pt_file:
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_dir, unk_init=unk_init)
        cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)
    else:
        # Build vocab first, then attach vectors from a serialized file on disk.
        cls.TEXT_FIELD.build_vocab(train, validation, test)
        cls.TEXT_FIELD = cls.set_vectors(cls.TEXT_FIELD, os.path.join(vectors_dir, vectors_name))
    cls.LABEL_FIELD.build_vocab(train, validation, test)
    cls.VOCAB_SIZE = len(cls.TEXT_FIELD.vocab)
    return BucketIterator.splits(
        (train, validation, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #23
Source File: pit2015.py From castor with Apache License 2.0 | 5 votes |
def iters(cls, path, vectors_name, vectors_dir, batch_size=64, shuffle=True, device=0, pt_file=False, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/validation/test BucketIterators for PIT2015.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_dir: directory containing the word-vectors file
    :param batch_size: batch size
    :param device: GPU device
    :param pt_file: when True, load a cached embedding file from disk
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, validation, test) BucketIterators
    """
    train, validation, test = cls.splits(path)
    if not pt_file:
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_dir, unk_init=unk_init)
        cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)
    else:
        # Build vocab first, then attach vectors from a serialized file on disk.
        cls.TEXT_FIELD.build_vocab(train, validation, test)
        cls.TEXT_FIELD = cls.set_vectors(cls.TEXT_FIELD, os.path.join(vectors_dir, vectors_name))
    cls.LABEL_FIELD.build_vocab(train, validation, test)
    cls.VOCAB_SIZE = len(cls.TEXT_FIELD.vocab)
    return BucketIterator.splits(
        (train, validation, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #24
Source File: trecqa.py From castor with Apache License 2.0 | 5 votes |
def iters(cls, path, vectors_name, vectors_dir, batch_size=64, shuffle=True, device=0, pt_file=False, vectors=None, unk_init=torch.Tensor.zero_):
    """Create train/validation/test BucketIterators for TrecQA.

    :param path: directory containing train, test, dev files
    :param vectors_name: name of the word-vectors file
    :param vectors_dir: directory containing the word-vectors file
    :param batch_size: batch size
    :param device: GPU device
    :param pt_file: when True, load a cached embedding file from disk
    :param vectors: custom vectors (predefined torchtext vectors or a Vectors instance)
    :param unk_init: function used to initialize vectors for OOV words
    :return: (train, validation, test) BucketIterators
    """
    train, validation, test = cls.splits(path)
    if not pt_file:
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_dir, unk_init=unk_init)
        cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)
    else:
        # Build vocab first, then attach vectors from a serialized file on disk.
        cls.TEXT_FIELD.build_vocab(train, validation, test)
        cls.TEXT_FIELD = cls.set_vectors(cls.TEXT_FIELD, os.path.join(vectors_dir, vectors_name))
    cls.LABEL_FIELD.build_vocab(train, validation, test)
    cls.VOCAB_SIZE = len(cls.TEXT_FIELD.vocab)
    return BucketIterator.splits(
        (train, validation, test),
        batch_size=batch_size,
        repeat=False,
        shuffle=shuffle,
        sort_within_batch=True,
        device=device,
    )
Example #25
Source File: dataset.py From pytorch-sentiment-analysis-classification with MIT License | 5 votes |
def __init__(self, root_dir='data', batch_size=64, use_vector=True):
    """Load train/dev/test TSV splits and build per-split vocabularies and iterators.

    :param root_dir: directory containing train.tsv / dev.tsv / test.tsv
    :param batch_size: batch size for every iterator
    :param use_vector: when True, attach pre-trained vectors to the text vocab
    """
    self.TEXT = Field(sequential=True, use_vocab=True, tokenize='spacy',
                      lower=True, batch_first=True)
    self.LABEL = LabelField(tensor_type=torch.FloatTensor)
    # Vectors file is expected in the current working directory.
    vectors = Vectors(name='mr_vocab.txt', cache='./')
    dataset_path = os.path.join(root_dir, '{}.tsv')
    self.dataset = {}
    self.dataloader = {}
    for target in ['train', 'dev', 'test']:
        self.dataset[target] = TabularDataset(
            path=dataset_path.format(target),
            format='tsv',
            fields=[('text', self.TEXT), ('label', self.LABEL)])
        # NOTE(review): vocab is rebuilt per split, so the final vocab
        # reflects the last split processed — preserved from the original.
        if use_vector:
            self.TEXT.build_vocab(self.dataset[target], max_size=25000, vectors=vectors)
        else:
            self.TEXT.build_vocab(self.dataset[target], max_size=25000)
        self.LABEL.build_vocab(self.dataset[target])
        self.dataloader[target] = Iterator(self.dataset[target],
                                           batch_size=batch_size,
                                           device=None,
                                           repeat=False,
                                           sort_key=lambda x: len(x.text),
                                           shuffle=True)
Example #26
Source File: tool.py From lightKG with Apache License 2.0 | 5 votes |
def get_vectors(self, path: str):
    """Load pre-trained word vectors from *path*.

    :param path: path to the word-vectors file
    :return: a torchtext ``Vectors`` instance
    """
    logger.info('loading vectors from {}'.format(path))
    vectors = Vectors(path)
    # Fixed typo in original log message ('successed loading vectors').
    logger.info('successfully loaded vectors')
    return vectors
Example #27
Source File: field.py From deepmatcher with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _get_vector_data(cls, vecs, cache):
    """Resolve vector specifier(s) into a list of Vectors objects.

    Accepts either Vectors instances (passed through unchanged) or string
    aliases such as ``fasttext.en.bin``; resolved aliases are memoized in
    ``cls._cached_vec_data``.

    :param vecs: a Vectors instance, alias string, or a list of either
    :param cache: directory where downloaded vectors are cached
    :return: list of Vectors objects
    """
    if not isinstance(vecs, list):
        vecs = [vecs]
    resolved = []
    for vec in vecs:
        # Already-constructed Vectors objects need no resolution.
        if isinstance(vec, vocab.Vectors):
            resolved.append(vec)
            continue
        vec_name = vec
        vec_data = cls._cached_vec_data.get(vec_name)
        if vec_data is None:
            # Alias format: fasttext.<language>.<bin|vec>
            parts = vec_name.split('.')
            if parts[0] == 'fasttext':
                if parts[2] == 'bin':
                    vec_data = FastTextBinary(language=parts[1], cache=cache)
                elif parts[2] == 'vec' and parts[1] == 'wiki':
                    vec_data = FastText(suffix='wiki-news-300d-1M.vec.zip', cache=cache)
                elif parts[2] == 'vec' and parts[1] == 'crawl':
                    vec_data = FastText(suffix='crawl-300d-2M.vec.zip', cache=cache)
            if vec_data is None:
                # Fall back to torchtext's predefined aliases.
                vec_data = vocab.pretrained_aliases[vec_name](cache=cache)
            cls._cached_vec_data[vec_name] = vec_data
        resolved.append(vec_data)
    return resolved
Example #28
Source File: build.py From ParlAI with MIT License | 5 votes |
def download(datapath):
    """Fetch fastText ``wiki.en.vec`` embeddings into the local model cache.

    :param datapath: root data directory; vectors go under models/fasttext_vectors
    :return: a torchtext ``Vectors`` instance backed by the cached file
    """
    # torchtext downloads from URL on first use and caches thereafter.
    cache_dir = os.path.join(datapath, 'models', 'fasttext_vectors')
    return vocab.Vectors(name='wiki.en.vec', url=URL, cache=cache_dir)
Example #29
Source File: build.py From neural_chat with MIT License | 5 votes |
def download(datapath):
    """Fetch fastText ``crawl-300d-2M.vec`` embeddings into the model zoo cache.

    :param datapath: root data directory resolved via ``modelzoo_path``
    :return: a torchtext ``Vectors`` instance backed by the cached file
    """
    # torchtext downloads from URL on first use and caches thereafter.
    cache_dir = modelzoo_path(datapath, 'models:fasttext_cc_vectors')
    return vocab.Vectors(name='crawl-300d-2M.vec', url=URL, cache=cache_dir)
Example #30
Source File: build.py From neural_chat with MIT License | 5 votes |
def download(datapath):
    """Fetch fastText ``wiki.en.vec`` embeddings into the model zoo cache.

    :param datapath: root data directory resolved via ``modelzoo_path``
    :return: a torchtext ``Vectors`` instance backed by the cached file
    """
    # torchtext downloads from URL on first use and caches thereafter.
    cache_dir = modelzoo_path(datapath, 'models:fasttext_vectors')
    return vocab.Vectors(name='wiki.en.vec', url=URL, cache=cache_dir)