Python tensorflow.python.platform.gfile.GFile() Examples
The following are 30 code examples of tensorflow.python.platform.gfile.GFile().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module tensorflow.python.platform.gfile, or try the search function.
Example #1
Source File: export_inference_graph.py From Hands-On-Machine-Learning-with-OpenCV-4 with MIT License | 6 votes |
def main(_):
    """Export the selected model's inference graph to ``FLAGS.output_file``.

    Builds the network chosen by ``FLAGS.model_name`` on a float32
    placeholder named ``input`` inside a fresh graph, then writes the
    serialized graph definition in binary mode.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--output_file`` was not supplied.
    """
    if not FLAGS.output_file:
        raise ValueError('You must supply the path to save to with --output_file')
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as graph:
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
        # The class count is reduced by the labels-offset flag.
        class_count = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=class_count,
            is_training=FLAGS.is_training)

        # A falsy FLAGS.image_size falls back to the network's default size.
        side = FLAGS.image_size or network_fn.default_image_size
        input_shape = [FLAGS.batch_size, side, side, 3]
        images = tf.placeholder(name='input', dtype=tf.float32,
                                shape=input_shape)
        network_fn(images)

        graph_def = graph.as_graph_def()
        with gfile.GFile(FLAGS.output_file, 'wb') as out_file:
            out_file.write(graph_def.SerializeToString())
Example #2
Source File: parse_to_conll.py From Gun-Detector with Apache License 2.0 | 6 votes |
def print_output(output_file, use_text_format, use_gold_segmentation, output):
    """Write parsed sentences to a file in CoNLL format.

    Two header lines record the ``use_text_format`` and
    ``use_gold_segmentation`` settings. Each sentence then gets a
    ``# text = ...`` line, one tab-separated row per token, and a
    terminating blank line.

    Args:
        output_file: The file to write to.
        use_text_format: Whether this computation used text-format input.
        use_gold_segmentation: Whether this computation used gold segmentation.
        output: A list of serialized sentences to write to the output file.
    """
    token_row = '%s\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_\n'
    with gfile.GFile(output_file, 'w') as conll_file:
        conll_file.write('## tf:{}\n'.format(use_text_format))
        conll_file.write('## gs:{}\n'.format(use_gold_segmentation))
        for serialized_sentence in output:
            sentence = sentence_pb2.Sentence()
            sentence.ParseFromString(serialized_sentence)
            conll_file.write(
                '# text = {}\n'.format(sentence.text.encode('utf-8')))
            # Token ids and head indices are both written 1-based.
            for position, token in enumerate(sentence.token, start=1):
                fields = (position,
                          token.word.encode('utf-8'),
                          token.head + 1,
                          token.label.encode('utf-8'))
                conll_file.write(token_row % fields)
            conll_file.write('\n')
Example #3
Source File: export_inference_graph.py From garbage-object-detection-tensorflow with MIT License | 6 votes |
def main(_):
    """Export the selected model's inference graph to ``FLAGS.output_file``.

    Builds the network chosen by ``FLAGS.model_name`` on a float32
    placeholder named ``input`` inside a fresh graph, then writes the
    serialized graph definition in binary mode.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--output_file`` was not supplied.
    """
    if not FLAGS.output_file:
        raise ValueError('You must supply the path to save to with --output_file')
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as graph:
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
        # The class count is reduced by the labels-offset flag.
        class_count = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=class_count,
            is_training=FLAGS.is_training)

        # A falsy FLAGS.image_size falls back to the network's default size.
        side = FLAGS.image_size or network_fn.default_image_size
        input_shape = [FLAGS.batch_size, side, side, 3]
        images = tf.placeholder(name='input', dtype=tf.float32,
                                shape=input_shape)
        network_fn(images)

        graph_def = graph.as_graph_def()
        with gfile.GFile(FLAGS.output_file, 'wb') as out_file:
            out_file.write(graph_def.SerializeToString())
Example #4
Source File: export_inference_graph.py From CVTron with Apache License 2.0 | 6 votes |
def main(_):
    """Export the selected model's inference graph to ``FLAGS.output_file``.

    Builds the network chosen by ``FLAGS.model_name`` on a float32
    placeholder named ``input`` inside a fresh graph, then writes the
    serialized graph definition in binary mode.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--output_file`` was not supplied.
    """
    if not FLAGS.output_file:
        raise ValueError('You must supply the path to save to with --output_file')
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as graph:
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
        # The class count is reduced by the labels-offset flag.
        class_count = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=class_count,
            is_training=FLAGS.is_training)

        # A falsy FLAGS.image_size falls back to the network's default size.
        side = FLAGS.image_size or network_fn.default_image_size
        input_shape = [FLAGS.batch_size, side, side, 3]
        images = tf.placeholder(name='input', dtype=tf.float32,
                                shape=input_shape)
        network_fn(images)

        graph_def = graph.as_graph_def()
        with gfile.GFile(FLAGS.output_file, 'wb') as out_file:
            out_file.write(graph_def.SerializeToString())
Example #5
Source File: export_inference_graph.py From Gun-Detector with Apache License 2.0 | 6 votes |
def main(_):
    """Export the selected model's inference graph to ``FLAGS.output_file``.

    Builds the network chosen by ``FLAGS.model_name`` on a float32
    placeholder named ``input`` inside a fresh graph, then writes the
    serialized graph definition in binary mode.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--output_file`` was not supplied.
    """
    if not FLAGS.output_file:
        raise ValueError('You must supply the path to save to with --output_file')
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as graph:
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
        # The class count is reduced by the labels-offset flag.
        class_count = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=class_count,
            is_training=FLAGS.is_training)

        # A falsy FLAGS.image_size falls back to the network's default size.
        side = FLAGS.image_size or network_fn.default_image_size
        input_shape = [FLAGS.batch_size, side, side, 3]
        images = tf.placeholder(name='input', dtype=tf.float32,
                                shape=input_shape)
        network_fn(images)

        graph_def = graph.as_graph_def()
        with gfile.GFile(FLAGS.output_file, 'wb') as out_file:
            out_file.write(graph_def.SerializeToString())
Example #6
Source File: task.py From solutions-vision-search with Apache License 2.0 | 6 votes |
def maybe_download_and_extract(filename, data_dir, source_url):
    """Download a file into ``data_dir`` if it is missing and unzip archives.

    Args:
        filename: Name of the file inside ``data_dir``.
        data_dir: Directory that holds the file; created when absent.
        source_url: URL to fetch the file from when it is not present yet.
    """
    if not gfile.Exists(data_dir):
        gfile.MakeDirs(data_dir)
    filepath = os.path.join(data_dir, filename)
    if not gfile.Exists(filepath):
        print('Downloading from {}'.format(source_url))
        temp_file_name, _ = urllib.request.urlretrieve(source_url)
        gfile.Copy(temp_file_name, filepath)
        with gfile.GFile(filepath) as f:
            size = f.size()
        print('Successfully downloaded \'{}\' of {} bytes'.format(filename, size))
        # NOTE(review): extraction is assumed to belong to the fresh-download
        # branch; the flattened original does not show its nesting - confirm.
        if filename.endswith('.zip'):
            print('Extracting {}'.format(filename))
            # Use a context manager so the archive handle is always closed,
            # even when extraction fails part-way through.
            with zipfile.ZipFile(file=filepath, mode='r') as archive:
                archive.extractall(data_dir)
Example #7
Source File: export_inference_graph.py From yolo_v2 with Apache License 2.0 | 6 votes |
def main(_):
    """Export the selected model's inference graph to ``FLAGS.output_file``.

    Builds the network chosen by ``FLAGS.model_name`` on a float32
    placeholder named ``input`` inside a fresh graph, then writes the
    serialized graph definition in binary mode.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--output_file`` was not supplied.
    """
    if not FLAGS.output_file:
        raise ValueError('You must supply the path to save to with --output_file')
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as graph:
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
        # The class count is reduced by the labels-offset flag.
        class_count = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=class_count,
            is_training=FLAGS.is_training)

        # A falsy FLAGS.image_size falls back to the network's default size.
        side = FLAGS.image_size or network_fn.default_image_size
        input_shape = [FLAGS.batch_size, side, side, 3]
        images = tf.placeholder(name='input', dtype=tf.float32,
                                shape=input_shape)
        network_fn(images)

        graph_def = graph.as_graph_def()
        with gfile.GFile(FLAGS.output_file, 'wb') as out_file:
            out_file.write(graph_def.SerializeToString())
Example #8
Source File: export_inference_graph.py From edafa with MIT License | 6 votes |
def main(_):
    """Export the selected model's inference graph to ``FLAGS.output_file``.

    Builds the network chosen by ``FLAGS.model_name`` on a float32
    placeholder named ``input`` inside a fresh graph. When ``FLAGS.quantize``
    is set, ``tf.contrib.quantize.create_eval_graph()`` is called first.
    The serialized graph definition is then written in binary mode.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--output_file`` was not supplied.
    """
    if not FLAGS.output_file:
        raise ValueError('You must supply the path to save to with --output_file')
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as graph:
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
        # The class count is reduced by the labels-offset flag.
        class_count = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=class_count,
            is_training=FLAGS.is_training)

        # A falsy FLAGS.image_size falls back to the network's default size.
        side = FLAGS.image_size or network_fn.default_image_size
        input_shape = [FLAGS.batch_size, side, side, 3]
        images = tf.placeholder(name='input', dtype=tf.float32,
                                shape=input_shape)
        network_fn(images)

        if FLAGS.quantize:
            tf.contrib.quantize.create_eval_graph()

        graph_def = graph.as_graph_def()
        with gfile.GFile(FLAGS.output_file, 'wb') as out_file:
            out_file.write(graph_def.SerializeToString())
Example #9
Source File: base.py From auto-alt-text-lambda-api with MIT License | 6 votes |
def maybe_download(filename, work_directory, source_url):
    """Fetch ``filename`` into ``work_directory`` unless it already exists.

    Args:
        filename: string, name of the file in the directory.
        work_directory: string, path to working directory; created if absent.
        source_url: url to download from if file doesn't exist.

    Returns:
        Path to resulting file.
    """
    if not gfile.Exists(work_directory):
        gfile.MakeDirs(work_directory)
    target = os.path.join(work_directory, filename)

    # Nothing to do when the file is already present.
    if gfile.Exists(target):
        return target

    downloaded, _ = urlretrieve_with_retry(source_url)
    gfile.Copy(downloaded, target)
    with gfile.GFile(target) as handle:
        num_bytes = handle.size()
    print('Successfully downloaded', filename, num_bytes, 'bytes.')
    return target
Example #10
Source File: parse_to_conll.py From yolo_v2 with Apache License 2.0 | 6 votes |
def print_output(output_file, use_text_format, use_gold_segmentation, output):
    """Write parsed sentences to a file in CoNLL format.

    Two header lines record the ``use_text_format`` and
    ``use_gold_segmentation`` settings. Each sentence then gets a
    ``# text = ...`` line, one tab-separated row per token, and a
    terminating blank line.

    Args:
        output_file: The file to write to.
        use_text_format: Whether this computation used text-format input.
        use_gold_segmentation: Whether this computation used gold segmentation.
        output: A list of serialized sentences to write to the output file.
    """
    token_row = '%s\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_\n'
    with gfile.GFile(output_file, 'w') as conll_file:
        conll_file.write('## tf:{}\n'.format(use_text_format))
        conll_file.write('## gs:{}\n'.format(use_gold_segmentation))
        for serialized_sentence in output:
            sentence = sentence_pb2.Sentence()
            sentence.ParseFromString(serialized_sentence)
            conll_file.write(
                '# text = {}\n'.format(sentence.text.encode('utf-8')))
            # Token ids and head indices are both written 1-based.
            for position, token in enumerate(sentence.token, start=1):
                fields = (position,
                          token.word.encode('utf-8'),
                          token.head + 1,
                          token.label.encode('utf-8'))
                conll_file.write(token_row % fields)
            conll_file.write('\n')
Example #11
Source File: export_inference_graph.py From Creative-Adversarial-Networks with MIT License | 6 votes |
def main(_):
    """Export the selected model's inference graph to ``FLAGS.output_file``.

    Builds the network chosen by ``FLAGS.model_name`` on a float32
    placeholder named ``input`` inside a fresh graph, then writes the
    serialized graph definition in binary mode.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--output_file`` was not supplied.
    """
    if not FLAGS.output_file:
        raise ValueError('You must supply the path to save to with --output_file')
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as graph:
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
        # The class count is reduced by the labels-offset flag.
        class_count = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=class_count,
            is_training=FLAGS.is_training)

        # A falsy FLAGS.image_size falls back to the network's default size.
        side = FLAGS.image_size or network_fn.default_image_size
        input_shape = [FLAGS.batch_size, side, side, 3]
        images = tf.placeholder(name='input', dtype=tf.float32,
                                shape=input_shape)
        network_fn(images)

        graph_def = graph.as_graph_def()
        with gfile.GFile(FLAGS.output_file, 'wb') as out_file:
            out_file.write(graph_def.SerializeToString())
Example #12
Source File: base.py From lambda-packs with MIT License | 6 votes |
def maybe_download(filename, work_directory, source_url):
    """Fetch ``filename`` into ``work_directory`` unless it already exists.

    Args:
        filename: string, name of the file in the directory.
        work_directory: string, path to working directory; created if absent.
        source_url: url to download from if file doesn't exist.

    Returns:
        Path to resulting file.
    """
    if not gfile.Exists(work_directory):
        gfile.MakeDirs(work_directory)
    target = os.path.join(work_directory, filename)

    # Nothing to do when the file is already present.
    if gfile.Exists(target):
        return target

    downloaded, _ = urlretrieve_with_retry(source_url)
    gfile.Copy(downloaded, target)
    with gfile.GFile(target) as handle:
        num_bytes = handle.size()
    print('Successfully downloaded', filename, num_bytes, 'bytes.')
    return target
Example #13
Source File: export_inference_graph.py From BMW-TensorFlow-Training-GUI with Apache License 2.0 | 6 votes |
def main(_):
    """Export the selected model's inference graph to ``FLAGS.output_file``.

    Builds the network chosen by ``FLAGS.model_name`` on a float32
    placeholder named ``input`` inside a fresh graph, then writes the
    serialized graph definition in binary mode.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--output_file`` was not supplied.
    """
    if not FLAGS.output_file:
        raise ValueError('You must supply the path to save to with --output_file')
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as graph:
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
        # The class count is reduced by the labels-offset flag.
        class_count = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=class_count,
            is_training=FLAGS.is_training)

        # A falsy FLAGS.image_size falls back to the network's default size.
        side = FLAGS.image_size or network_fn.default_image_size
        input_shape = [FLAGS.batch_size, side, side, 3]
        images = tf.placeholder(name='input', dtype=tf.float32,
                                shape=input_shape)
        network_fn(images)

        graph_def = graph.as_graph_def()
        with gfile.GFile(FLAGS.output_file, 'wb') as out_file:
            out_file.write(graph_def.SerializeToString())
Example #14
Source File: util_data.py From kaggle_speech_recognition with MIT License | 6 votes |
def map_chars(file_chars, chars=None):
    """Create or load the character-index mapping.

    The mapping needs to be constant for training and inference, so it is
    persisted in ``file_chars``. When that file exists it is read back;
    otherwise the mapping is built from ``chars`` and written out.

    Args:
        file_chars: Path of the ``index,char`` file.
        chars: Iterable of characters used to build a new mapping. Required
            only when ``file_chars`` does not exist yet.

    Returns:
        A pair ``(idx_to_char, char_to_idx)`` of dictionaries.

    Raises:
        ValueError: If ``file_chars`` is missing and ``chars`` was not given.
    """
    # NOTE(review): existence is checked with os.path while the file itself
    # is opened through gfile - confirm only local paths are expected here.
    if not os.path.exists(file_chars):
        if chars is None:
            # Fail with a clear message instead of a TypeError from
            # enumerate(None) further down.
            raise ValueError(
                'Cannot regenerate %s: no chars were supplied.' % file_chars)
        tf.logging.info('WARNING!!!! regenerating %s', file_chars)
        # 0 is not used, dense to sparse array
        idx_to_char = {i + 1: c for i, c in enumerate(chars)}
        idx_to_char[0] = ''  # null label
        idx_to_char[len(idx_to_char)] = '_'
        with gfile.GFile(file_chars, 'w') as fp:
            for i, c in idx_to_char.items():
                fp.write('%d,%s\n' % (i, c))
    else:
        with gfile.GFile(file_chars, 'r') as fp:
            reader = csv.reader(fp, delimiter=',')
            idx_to_char = {int(i): c for i, c in reader}
    char_to_idx = {c: i for i, c in idx_to_char.items()}
    return idx_to_char, char_to_idx
Example #15
Source File: export_inference_graph.py From ctw-baseline with MIT License | 6 votes |
def main(_):
    """Export the selected model's inference graph to ``FLAGS.output_file``.

    Builds the network chosen by ``FLAGS.model_name`` on a float32
    placeholder named ``input`` inside a fresh graph, then writes the
    serialized graph definition in binary mode.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--output_file`` was not supplied.
    """
    if not FLAGS.output_file:
        raise ValueError('You must supply the path to save to with --output_file')
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as graph:
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
        # The class count is reduced by the labels-offset flag.
        class_count = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=class_count,
            is_training=FLAGS.is_training)

        # A falsy FLAGS.image_size falls back to the network's default size.
        side = FLAGS.image_size or network_fn.default_image_size
        input_shape = [FLAGS.batch_size, side, side, 3]
        images = tf.placeholder(name='input', dtype=tf.float32,
                                shape=input_shape)
        network_fn(images)

        graph_def = graph.as_graph_def()
        with gfile.GFile(FLAGS.output_file, 'wb') as out_file:
            out_file.write(graph_def.SerializeToString())
Example #16
Source File: io_utils.py From DeepChatModels with MIT License | 6 votes |
def get_word_freqs(path, counter, norm_digits=True):
    """Accumulate word frequencies from the file at ``path`` into ``counter``.

    Args:
        path: data file of words we wish to extract vocab counts from.
        counter: collections.Counter object for mapping word -> frequency.
        norm_digits: Boolean; if true, all digits are replaced by 0s.

    Returns:
        The counter (dict), updated with mappings from word -> frequency.
    """
    print("Creating vocabulary for data", path)
    with gfile.GFile(path, mode="rb") as data_file:
        for line_number, raw_line in enumerate(data_file, start=1):
            # Progress report every 100000 lines.
            if line_number % 100000 == 0:
                print("\tProcessing line", line_number)
            tokens = basic_tokenizer(tf.compat.as_bytes(raw_line))
            for token in tokens:
                if norm_digits:
                    token = _DIGIT_RE.sub(b"0", token)
                # Update word frequency counts in vocab counter dict.
                counter[token] += 1
    return counter
Example #17
Source File: export_inference_graph.py From DOTA_models with Apache License 2.0 | 6 votes |
def main(_):
    """Export the selected model's inference graph to ``FLAGS.output_file``.

    Builds the network chosen by ``FLAGS.model_name`` on a float32
    placeholder named ``input`` with a batch dimension of 1, then writes the
    serialized graph definition in binary mode.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--output_file`` was not supplied.
    """
    if not FLAGS.output_file:
        raise ValueError('You must supply the path to save to with --output_file')
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as graph:
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
        # The class count is reduced by the labels-offset flag.
        class_count = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=class_count,
            is_training=FLAGS.is_training)

        # Prefer the size advertised by the network; the flag is read only
        # when the network exposes no default.
        side = (network_fn.default_image_size
                if hasattr(network_fn, 'default_image_size')
                else FLAGS.default_image_size)
        images = tf.placeholder(name='input', dtype=tf.float32,
                                shape=[1, side, side, 3])
        network_fn(images)

        graph_def = graph.as_graph_def()
        with gfile.GFile(FLAGS.output_file, 'wb') as out_file:
            out_file.write(graph_def.SerializeToString())
Example #18
Source File: io_utils.py From DeepChatModels with MIT License | 6 votes |
def get_vocab_dicts(vocabulary_path):
    """Returns word_to_idx, idx_to_word dictionaries given vocabulary.

    Every line of the file is stripped and converted to bytes; a word's index
    is its line position in the file.

    Args:
        vocabulary_path: path to the file containing the vocabulary.

    Returns:
        a pair: the vocabulary (a dictionary mapping words to integers), and
        the reversed vocabulary (a list, which reverses the vocabulary mapping).

    Raises:
        ValueError: if the provided vocabulary_path does not exist.
    """
    if not gfile.Exists(vocabulary_path):
        # Interpolate the path; passing it as a second exception argument
        # would leave the "%s" placeholder unformatted in the message.
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)

    with gfile.GFile(vocabulary_path, mode="rb") as f:
        rev_vocab = [tf.compat.as_bytes(line.strip()) for line in f.readlines()]
    # On duplicate words the later line wins, as with the former dict(...).
    vocab = {word: index for index, word in enumerate(rev_vocab)}
    return vocab, rev_vocab
Example #19
Source File: io_utils.py From DeepChatModels with MIT License | 6 votes |
def data_to_token_ids(data_path, target_path, vocabulary_path, normalize_digits=True):
    """Convert a text file into a file of space-separated token ids.

    Reads ``data_path`` line by line, maps each line to token ids with
    ``sentence_to_token_ids`` and writes one line of ids per input line to
    ``target_path``. Does nothing when ``target_path`` already exists.

    Args:
        data_path: path to the data file in one-sentence-per-line format.
        target_path: path where the file with token-ids will be created.
        vocabulary_path: path to the vocabulary file.
        normalize_digits: Boolean; if true, all digits are replaced by 0s.
    """
    if gfile.Exists(target_path):
        return

    print("Tokenizing data in %s" % data_path)
    vocab, _ = get_vocab_dicts(vocabulary_path=vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file, \
            gfile.GFile(target_path, mode="w") as tokens_file:
        for line_number, line in enumerate(data_file, start=1):
            # Progress report every 100000 lines.
            if line_number % 100000 == 0:
                print(" tokenizing line %d" % line_number)
            token_ids = sentence_to_token_ids(
                tf.compat.as_bytes(line), vocab, normalize_digits)
            id_strings = [str(tok) for tok in token_ids]
            tokens_file.write(" ".join(id_strings) + "\n")
Example #20
Source File: data_utils.py From ecm with Apache License 2.0 | 5 votes |
def initialize_vocabulary(vocabulary_path):
    """Load a one-item-per-line vocabulary file.

    Each line is stripped and decoded as UTF-8; a word's index is its line
    position in the file.

    Args:
        vocabulary_path: path to the file containing the vocabulary.

    Returns:
        A pair ``(vocab, rev_vocab)``: a dictionary mapping each word to its
        index, and the list of words in file order.

    Raises:
        ValueError: if the provided vocabulary_path does not exist.
    """
    if not gfile.Exists(vocabulary_path):
        # Interpolate the path; passing it as a second exception argument
        # would leave the "%s" placeholder unformatted in the message.
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)

    with gfile.GFile(vocabulary_path, mode="rb") as f:
        rev_vocab = [line.strip().decode('utf8') for line in f.readlines()]
    # On duplicate words the later line wins, as with the former dict(...).
    vocab = {word: index for index, word in enumerate(rev_vocab)}
    return vocab, rev_vocab
Example #21
Source File: model_serialization.py From RLs with Apache License 2.0 | 5 votes |
def export_policy_model(
    settings: SerializationSettings, graph: tf.Graph, sess: tf.Session
) -> None:
    """
    Freezes the graph and writes it as frozen_graph_def.pb under the model
    path, then optionally converts it to Barracuda (.nn) and ONNX (.onnx)
    according to the flags on `settings`.
    """
    frozen_graph_def = _make_frozen_graph(settings, graph, sess)

    # Save frozen graph
    frozen_graph_def_path = settings.model_path + "/frozen_graph_def.pb"
    with gfile.GFile(frozen_graph_def_path, "wb") as pb_file:
        pb_file.write(frozen_graph_def.SerializeToString())

    # Convert to barracuda
    if settings.convert_to_barracuda:
        tf2bc.convert(frozen_graph_def_path, settings.model_path + ".nn")
        logger.info(f"Exported {settings.model_path}.nn file")

    # ONNX export is only possible when its dependencies were importable.
    if not ONNX_EXPORT_ENABLED:
        if _enforce_onnx_conversion():
            raise RuntimeError(
                "ONNX conversion enforced, but couldn't import dependencies."
            )
        return

    if not settings.convert_to_onnx:
        return

    try:
        onnx_graph = convert_frozen_to_onnx(settings, frozen_graph_def)
        onnx_output_path = settings.model_path + ".onnx"
        with open(onnx_output_path, "wb") as onnx_file:
            onnx_file.write(onnx_graph.SerializeToString())
        logger.info(f"Converting to {onnx_output_path}")
    except Exception:
        # Make conversion errors fatal depending on environment variables (only done during CI)
        if _enforce_onnx_conversion():
            raise
        logger.exception(
            "Exception trying to save ONNX graph. Please report this error on "
            "https://github.com/Unity-Technologies/ml-agents/issues and "
            "attach a copy of frozen_graph_def.pb"
        )
Example #22
Source File: util_data.py From kaggle_speech_recognition with MIT License | 5 votes |
def __init__(self, file_words, file_chars, num_key_words):
    """Load the word table and derive the word and character lookup maps.

    Args:
        file_words: Path to a comma-separated file. Column 0 holds the
            original word and column 1 the word used for training. Rows whose
            first field starts with '#' and blank rows are skipped.
        file_chars: Path handed to ``map_chars`` for the character mapping.
        num_key_words: Number of leading entries of the word list that are
            treated as key words.
    """
    self._word_to_modi = {}
    self._modi_to_word = {}
    self._word_to_modi[SILENCE_CLASS] = SILENCE_CLASS
    self._modi_to_word[SILENCE_CLASS] = SILENCE_WORD
    self._all_words = []
    with gfile.GFile(file_words, 'r') as fp:
        for row in csv.reader(fp, delimiter=','):
            # csv yields [] for blank lines; indexing those would raise.
            if not row or row[0].startswith('#'):
                continue
            org = row[0]
            train_w = row[1]
            self._all_words.append(org)
            self._word_to_modi[org] = train_w
            self._modi_to_word[train_w] = org
    self._all_words += [UNKNOWN_WORD, SILENCE_WORD]

    train_modis = list(self._word_to_modi.values())
    self._num_word_classes = len(train_modis)
    self._max_label_length = max(len(w) for w in train_modis)

    chars = list(set(''.join(train_modis)))
    self._idx_to_char, self._char_to_idx = map_chars(file_chars, chars=chars)
    self._num_char_classes = len(self._idx_to_char)

    self._key_words = self._all_words[:num_key_words]
    # Build the membership set once instead of concatenating a new list on
    # every loop iteration below.
    target_words = set(self._key_words)
    target_words.add(SILENCE_WORD)

    # modi_to_target dictionary
    self._modi_to_target = {
        modi: word
        for modi, word in self._modi_to_word.items()
        if word in target_words
    }

    # word_to_target dictionary: anything that is not a target word maps to
    # UNKNOWN_WORD.
    self._word_to_target = {
        word: (word if word in target_words else UNKNOWN_WORD)
        for word in self._all_words
    }
Example #23
Source File: exporter.py From Gun-Detector with Apache License 2.0 | 5 votes |
def write_frozen_graph(frozen_graph_path, frozen_graph_def):
    """Serialize a frozen graph to disk and log its op count.

    Args:
        frozen_graph_path: Path to write inference graph.
        frozen_graph_def: tf.GraphDef holding frozen graph.
    """
    with gfile.GFile(frozen_graph_path, 'wb') as graph_file:
        graph_file.write(frozen_graph_def.SerializeToString())
        num_ops = len(frozen_graph_def.node)
        logging.info('%d ops in the final graph.', num_ops)
Example #24
Source File: data_utils.py From deep_image_model with Apache License 2.0 | 5 votes |
def initialize_vocabulary(vocabulary_path):
    """Initialize vocabulary from file.

    We assume the vocabulary is stored one-item-per-line, so a file:
      dog
      cat
    will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
    also return the reversed-vocabulary ["dog", "cat"].

    Args:
        vocabulary_path: path to the file containing the vocabulary.

    Returns:
        a pair: the vocabulary (a dictionary mapping string to integers), and
        the reversed vocabulary (a list, which reverses the vocabulary mapping).

    Raises:
        ValueError: if the provided vocabulary_path does not exist.
    """
    if not gfile.Exists(vocabulary_path):
        # Interpolate the path; passing it as a second exception argument
        # would leave the "%s" placeholder unformatted in the message.
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)

    with gfile.GFile(vocabulary_path, mode="rb") as f:
        rev_vocab = [line.strip() for line in f.readlines()]
    # On duplicate words the later line wins, as with the former dict(...).
    vocab = {word: index for index, word in enumerate(rev_vocab)}
    return vocab, rev_vocab
Example #25
Source File: file_utils.py From Gun-Detector with Apache License 2.0 | 5 votes |
def write_image(image_path, rgb):
    """Encode an image array and write it to ``image_path``.

    The output format is chosen by ``cv2.imencode`` from the extension of
    ``image_path``.

    Args:
        image_path: Destination path; its extension selects the encoder.
        rgb: Image array indexed as ``[row, column, channel]``. The channel
            axis is reversed before encoding (presumably RGB to BGR for
            OpenCV - confirm against callers).
    """
    ext = os.path.splitext(image_path)[1]
    # Encode before opening the file so a failed encode does not leave an
    # empty file behind.
    encoded = cv2.imencode(ext, rgb[:, :, ::-1])[1]
    # The payload is binary: open in 'wb' and use tobytes(), the replacement
    # for the deprecated ndarray.tostring().
    with gfile.GFile(image_path, 'wb') as f:
        f.write(encoded.tobytes())
Example #26
Source File: exporter.py From tensorflow with BSD 2-Clause "Simplified" License | 5 votes |
def _write_frozen_graph(frozen_graph_path, frozen_graph_def):
    """Serialize a frozen graph to disk and log its op count.

    Args:
        frozen_graph_path: Path to write inference graph.
        frozen_graph_def: tf.GraphDef holding frozen graph.
    """
    with gfile.GFile(frozen_graph_path, 'wb') as graph_file:
        graph_file.write(frozen_graph_def.SerializeToString())
        num_ops = len(frozen_graph_def.node)
        logging.info('%d ops in the final graph.', num_ops)
Example #27
Source File: exporter.py From Hands-On-Machine-Learning-with-OpenCV-4 with MIT License | 5 votes |
def _write_frozen_graph(frozen_graph_path, frozen_graph_def):
    """Serialize a frozen graph to disk and log its op count.

    Args:
        frozen_graph_path: Path to write inference graph.
        frozen_graph_def: tf.GraphDef holding frozen graph.
    """
    with gfile.GFile(frozen_graph_path, 'wb') as graph_file:
        graph_file.write(frozen_graph_def.SerializeToString())
        num_ops = len(frozen_graph_def.node)
        logging.info('%d ops in the final graph.', num_ops)
Example #28
Source File: nlp.py From LapSRN-tensorflow with Apache License 2.0 | 5 votes |
def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer=None, normalize_digits=True, UNK_ID=3, _DIGIT_RE=re.compile(br"\d")):
    """Convert a text file into a file of space-separated token ids.

    Reads ``data_path`` line by line, maps each line to token ids with
    ``sentence_to_token_ids`` and writes one line of ids per input line to
    ``target_path``. When ``target_path`` already exists only a message is
    printed.

    Parameters
    -----------
    data_path : path to the data file in one-sentence-per-line format.
    target_path : path where the file with token-ids will be created.
    vocabulary_path : path to the vocabulary file.
    tokenizer : a function to use to tokenize each sentence;
        if None, basic_tokenizer will be used.
    normalize_digits : Boolean; if true, all digits are replaced by 0s.
    UNK_ID : forwarded to ``sentence_to_token_ids``.
    _DIGIT_RE : forwarded to ``sentence_to_token_ids``.

    References
    ----------
    - Code from ``/tensorflow/models/rnn/translation/data_utils.py``
    """
    if gfile.Exists(target_path):
        print("Target path %s exists" % target_path)
        return

    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file, \
            gfile.GFile(target_path, mode="w") as tokens_file:
        for line_number, line in enumerate(data_file, start=1):
            # Progress report every 100000 lines.
            if line_number % 100000 == 0:
                print(" tokenizing line %d" % line_number)
            token_ids = sentence_to_token_ids(
                line, vocab, tokenizer, normalize_digits,
                UNK_ID=UNK_ID, _DIGIT_RE=_DIGIT_RE)
            id_strings = [str(tok) for tok in token_ids]
            tokens_file.write(" ".join(id_strings) + "\n")
Example #29
Source File: nlp.py From LapSRN-tensorflow with Apache License 2.0 | 5 votes |
def basic_tokenizer(sentence, _WORD_SPLIT=re.compile(b"([.,!?\"':;)(])")):
    """Split a sentence into a list of byte-string tokens.

    The sentence is converted to bytes, stripped and split on whitespace.
    Each fragment is then split again with ``_WORD_SPLIT``, which keeps the
    matched punctuation as separate tokens. Empty pieces are dropped.

    Parameters
    -----------
    sentence : the text to tokenize, e.g. one line read from a file.
    _WORD_SPLIT : regular expression for word splitting.

    Examples
    --------
    >>> from tensorflow.python.platform import gfile
    >>> train_path = "wmt/giga-fren.release2"
    >>> with gfile.GFile(train_path + ".en", mode="rb") as f:
    >>>     for line in f:
    >>>         tokens = tl.nlp.basic_tokenizer(line)
    >>>         print(tokens)
    >>>         exit()
    ... [b'Changing', b'Lives', b'|', b'Changing', b'Society', b'|', b'How',
    ...   b'It', b'Works', b'|', b'Technology', b'Drives', b'Change', b'Home',
    ...   b'|', b'Concepts', b'|', b'Teachers', b'|', b'Search', b'|', b'Overview',
    ...   b'|', b'Credits', b'|', b'HHCC', b'Web', b'|', b'Reference', b'|',
    ...   b'Feedback', b'Virtual', b'Museum', b'of', b'Canada', b'Home', b'Page']

    References
    ----------
    - Code from ``/tensorflow/models/rnn/translation/data_utils.py``
    """
    as_bytes = tf.compat.as_bytes(sentence)
    tokens = []
    for fragment in as_bytes.strip().split():
        pieces = re.split(_WORD_SPLIT, fragment)
        # re.split yields empty strings around leading/trailing matches.
        tokens.extend(piece for piece in pieces if piece)
    return tokens
Example #30
Source File: nlp.py From LapSRN-tensorflow with Apache License 2.0 | 5 votes |
def read_words(filename="nietzsche.txt", replace=['\n', '<eos>']):
    """Read a text file into a list of whitespace-separated words.

    Note that this function does not handle punctuation. For a customized
    read_words method, see ``tutorial_generate_text.py``.

    Parameters
    ----------
    filename : a string
        A file path (like .txt file).
    replace : a list
        [original string, target string]; to disable replace use ['', ''].
        The list is only read, never modified.

    Returns
    --------
    The context in a list, split by whitespace, with the ``replace`` pair
    applied first (by default newlines become ``'<eos>'``), e.g.
    ``[... 'how', 'useful', 'it', "'s" ... ]``.

    Code References
    ---------------
    - `tensorflow.models.rnn.ptb.reader <https://github.com/tensorflow/tensorflow/tree/master/tensorflow/models/rnn/ptb>`_
    """
    with tf.gfile.GFile(filename, "r") as f:
        try:
            # Works when read() returns text.
            context_list = f.read().replace(*replace).split()
        except TypeError:
            # read() returned bytes, so the replace pair must be bytes too.
            # Only TypeError is handled: a bare except would also swallow
            # KeyboardInterrupt and hide unrelated failures.
            f.seek(0)
            replace = [x.encode('utf-8') for x in replace]
            context_list = f.read().replace(*replace).split()
    return context_list