Python transformers.AutoTokenizer.from_pretrained() Examples

The following are 26 code examples of transformers.AutoTokenizer.from_pretrained(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the transformers.AutoTokenizer module, or try the search function.
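Before looking at the project examples, here is a minimal, self-contained sketch of the basic call. The checkpoint name and sample text are arbitrary illustrations and are not taken from any of the examples below.

from transformers import AutoTokenizer

# Download (or load from the local cache) the tokenizer matching the checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokens = tokenizer.tokenize("Hello, world!")                             # subword tokens
input_ids = tokenizer.encode("Hello, world!", add_special_tokens=True)   # vocabulary ids

print(tokens)      # ['hello', ',', 'world', '!']
print(input_ids)   # ids framed by the [CLS] and [SEP] special tokens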
Example #1
Source File: field.py    From flambe with MIT License 7 votes
def __init__(self,
                 alias: str,
                 cache_dir: Optional[str] = None,
                 max_len_truncate: int = 500,
                 add_special_tokens: bool = True, **kwargs) -> None:
        """Initialize a pretrained tokenizer.

        Parameters
        ----------
        alias: str
            Alias of a pretrained tokenizer.
        cache_dir: str, optional
            Directory in which to cache the downloaded vocabulary files.
        max_len_truncate: int, default = 500
            Maximum length at which the tokenized sequence is truncated.
            Several pretrained models crash on sequences longer than
            500 tokens, hence the default of 500.
        add_special_tokens: bool, optional
            Add the special tokens to the inputs. Default ``True``.

        """
        self._tokenizer = AutoTokenizer.from_pretrained(alias, cache_dir=cache_dir, **kwargs)
        self.max_len_truncate = max_len_truncate
        self.add_special_tokens = add_special_tokens 
Example #2
Source File: transformers.py    From nboost with Apache License 2.0 6 votes
def __init__(self,
                 model_dir: str = 'nboost/pt-tinybert-msmarco',
                 verbose: bool = defaults.verbose,
                 max_seq_len: int = defaults.max_seq_len,
                 **kwargs):
        super().__init__(**kwargs)
        self.logger = set_logger(model_dir, verbose=verbose)
        self.max_seq_len = max_seq_len

        self.logger.info('Loading from checkpoint %s' % model_dir)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if self.device == torch.device("cpu"):
            self.logger.info("RUNNING ON CPU")
        else:
            self.logger.info("RUNNING ON CUDA")
            torch.cuda.synchronize(self.device)

        self.rerank_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)

        self.rerank_model.to(self.device, non_blocking=True) 
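For context, the model and tokenizer loaded above are typically used together as a cross-encoder reranker. The following is a rough, illustrative sketch of that scoring step using the standard transformers API (recent versions); it is not nboost code, and the variable names simply mirror the attributes created in __init__.

query = "what is the capital of france"
passage = "Paris is the capital and most populous city of France."

# Encode the (query, passage) pair and score it with the sequence classifier.
inputs = tokenizer(query, passage, return_tensors="pt",
                   truncation=True, max_length=max_seq_len)
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

with torch.no_grad():
    logits = rerank_model(**inputs).logits          # shape: (1, num_labels)
relevance = logits.softmax(dim=-1)[0, -1].item()    # higher means more relevant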
Example #3
Source File: onnxbert.py    From nboost with Apache License 2.0 6 votes
def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        sess_options = rt.SessionOptions()

        self.model_dir = glob.glob(os.path.join(self.model_dir, '*.onnx'))[0]

        # Set graph optimization level to ORT_ENABLE_EXTENDED to enable bert optimization.
        sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_EXTENDED

        # To enable model serialization and store the optimized graph to desired location.
        sess_options.optimized_model_filepath = self.model_dir
        self.session = rt.InferenceSession(self.model_dir, sess_options)
        if 'albert' in self.model_dir:
            # 'albert-base-uncased' is not a published checkpoint; 'albert-base-v2' is the official ALBERT base model.
            self.tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')
        else:
            self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') 
Example #4
Source File: bert_models.py    From danlp with BSD 3-Clause "New" or "Revised" License 6 votes
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
        from transformers import BertTokenizer, BertForSequenceClassification

        # download the model or load the model path
        path_emotion = download_model('bert.emotion', cache_dir,
                                       process_func=_unzip_process_func,
                                       verbose=verbose)
        path_emotion = os.path.join(path_emotion,'bert.emotion')
        path_reject = download_model('bert.noemotion', cache_dir,
                                       process_func=_unzip_process_func,
                                       verbose=verbose)
        path_reject = os.path.join(path_reject,'bert.noemotion')
        # load the models
        self.tokenizer_rejct = BertTokenizer.from_pretrained(path_reject)
        self.model_reject = BertForSequenceClassification.from_pretrained(path_reject)
        
        self.tokenizer = BertTokenizer.from_pretrained(path_emotion)
        self.model = BertForSequenceClassification.from_pretrained(path_emotion)
        
        # load the class names mapping
        self.catagories = {5: 'Foragt/Modvilje', 2: 'Forventning/Interrese',
                           0: 'Glæde/Sindsro', 3: 'Overasket/Målløs',
                           1: 'Tillid/Accept',
                           4: 'Vrede/Irritation', 6: 'Sorg/trist',
                           7: 'Frygt/Bekymret'} 
Example #5
Source File: test_perplexity_callback.py    From catalyst with Apache License 2.0 6 votes
def test_is_running():
    """Test if perplexity is running normal"""
    tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModelWithLMHead.from_pretrained("distilbert-base-uncased")
    dataset = LanguageModelingDataset(texts, tok)
    collate_fn = DataCollatorForLanguageModeling(tok).collate_batch
    dataloader = torch.utils.data.DataLoader(dataset, collate_fn=collate_fn)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

    runner = HuggingFaceRunner()
    runner.train(
        model=model,
        optimizer=optimizer,
        loaders={"train": dataloader},
        callbacks={
            "optimizer": dl.OptimizerCallback(),
            "perplexity": PerplexityMetricCallback(),
        },
        check=True,
    ) 
Example #6
Source File: prediction.py    From fast-bert with Apache License 2.0 6 votes
def __init__(
        self,
        model_path,
        label_path,
        multi_label=False,
        model_type="bert",
        use_fast_tokenizer=True,
        do_lower_case=True,
    ):
        self.model_path = model_path
        self.label_path = label_path
        self.multi_label = multi_label
        self.model_type = model_type
        self.do_lower_case = do_lower_case

        # Use auto-tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_path, use_fast=use_fast_tokenizer
        )

        self.learner = self.get_learner() 
Example #7
Source File: pipelines.py    From exbert with Apache License 2.0 6 votes
def get_defaults(self, model, tokenizer, framework):
        task_defaults = SUPPORTED_TASKS[self.task]
        if model is None:
            if framework == "tf":
                model = task_defaults["tf"].from_pretrained(task_defaults["default"]["model"]["tf"])
            elif framework == "pt":
                model = task_defaults["pt"].from_pretrained(task_defaults["default"]["model"]["pt"])
            else:
                raise ValueError("Provided framework should be either 'tf' for TensorFlow or 'pt' for PyTorch.")

        if tokenizer is None:
            default_tokenizer = task_defaults["default"]["tokenizer"]
            if isinstance(default_tokenizer, tuple):
                # For tuple we have (tokenizer name, {kwargs})
                tokenizer = AutoTokenizer.from_pretrained(default_tokenizer[0], **default_tokenizer[1])
            else:
                tokenizer = AutoTokenizer.from_pretrained(default_tokenizer)

        return model, tokenizer 
Example #8
Source File: question_answering.py    From nlp-recipes with MIT License 6 votes
def __init__(
        self,
        model_name="bert-base-cased",
        to_lower=False,
        custom_tokenize=None,
        cache_dir=".",
    ):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=to_lower,
            cache_dir=cache_dir,
            output_loading_info=False,
        )
        self.do_lower_case = to_lower
        self.custom_tokenize = custom_tokenize 
Example #9
Source File: preprocess.py    From unilm with MIT License 6 votes
def seg(args):
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path, do_lower_case=True
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + ".txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_box.txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_image.txt.tmp"),
        tokenizer,
        args.max_len,
    ) 
Example #10
Source File: transfo_experiment.py    From axcell with Apache License 2.0 5 votes
def tokenizer(self):
        if self._tokenizer is None:
            self._tokenizer = AutoTokenizer.from_pretrained(self.pretrained_name)
        return self._tokenizer 
Example #11
Source File: transfo_experiment.py    From axcell with Apache License 2.0 5 votes
def train_model(self, data: TransfoDatabunch):
        self.set_seed("class")
        self.train_started = time.time()
        num_labels = data.num_labels
        config = AutoConfig.from_pretrained(self.pretrained_name, num_labels=num_labels) #, finetuning_task=args.task_name
        model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_name, config=config)
        train(self, data.train_ds, data.valid_ds, model.to(self.device), self._tokenizer)
        model.to("cpu")
        return model 
Example #12
Source File: download.py    From exbert with Apache License 2.0 5 votes
def run(self):
        from transformers import AutoModel, AutoTokenizer

        AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
        AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 
Example #13
Source File: question_answering.py    From nlp-recipes with MIT License 5 votes
def __init__(
        self, model_name="bert-base-cased", cache_dir=".", load_model_from_dir=None
    ):
        model = MODEL_CLASS[model_name].from_pretrained(
            model_name if load_model_from_dir is None else load_model_from_dir,
            cache_dir=cache_dir,
            output_loading_info=False,
        )
        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir) 
Example #14
Source File: test_language_modeling_dataset.py    From catalyst with Apache License 2.0 5 votes
def test_tokenizer_tokenizer():
    """Test initialization with tokenizer"""
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    dataset = LanguageModelingDataset(texts, tok)
    assert dataset[0] is not None
    assert len(dataset) == 2 
Example #15
Source File: test_language_modeling_dataset.py    From catalyst with Apache License 2.0 5 votes
def test_exception_with_sort():
    """Test lazy=True sort=True case"""
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    dataset = LanguageModelingDataset(  # noqa: F841
        texts, tok, lazy=True, sort=True
    ) 
Example #16
Source File: bert_models.py    From danlp with BSD 3-Clause "New" or "Revised" License 5 votes
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
        from transformers import AutoModelForTokenClassification
        from transformers import AutoTokenizer

        # download the model or load the model path
        weights_path = download_model('bert.ner', cache_dir,
                                      process_func=_unzip_process_func,
                                      verbose=verbose)

        self.label_list = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
                           "I-ORG", "B-LOC", "I-LOC"]

        self.model = AutoModelForTokenClassification.from_pretrained(weights_path)
        self.tokenizer = AutoTokenizer.from_pretrained(weights_path) 
Example #17
Source File: bert_models.py    From danlp with BSD 3-Clause "New" or "Revised" License 5 votes
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
        from transformers import BertTokenizer, BertForSequenceClassification
         
        
        # download the model or load the model path
        path_sub = download_model('bert.subjective', cache_dir, process_func=_unzip_process_func,verbose=verbose)
        path_sub = os.path.join(path_sub,'bert.sub.v0.0.1')
        path_pol = download_model('bert.polarity', cache_dir, process_func=_unzip_process_func,verbose=verbose)
        path_pol = os.path.join(path_pol,'bert.pol.v0.0.1')
        
        self.tokenizer_sub = BertTokenizer.from_pretrained(path_sub)
        self.model_sub = BertForSequenceClassification.from_pretrained(path_sub)
        self.tokenizer_pol = BertTokenizer.from_pretrained(path_pol)
        self.model_pol = BertForSequenceClassification.from_pretrained(path_pol) 
Example #18
Source File: Transformer.py    From sentence-transformers with Apache License 2.0 5 votes
def __init__(self, model_name_or_path: str, max_seq_length: int = 128, model_args: Dict = {}, cache_dir: Optional[str] = None ):
        super(Transformer, self).__init__()
        self.config_keys = ['max_seq_length']
        self.max_seq_length = max_seq_length

        config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
        self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir) 
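The module above only loads the transformer and its tokenizer; the sentence embedding itself comes from a pooling module later in the sentence-transformers pipeline. Below is a rough sketch of the common mean-pooling approach, not the library's implementation; tokenizer, auto_model and max_seq_length mirror the attributes created above, and the sentences are arbitrary.

import torch

sentences = ["This is an example sentence.", "Each sentence becomes one vector."]
features = tokenizer(sentences, padding=True, truncation=True,
                     max_length=max_seq_length, return_tensors="pt")

with torch.no_grad():
    token_embeddings = auto_model(**features)[0]    # (batch, seq_len, hidden_size)

# Average only over real tokens, using the attention mask as weights.
mask = features["attention_mask"].unsqueeze(-1).float()
sentence_embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)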
Example #19
Source File: abstractive_summarization_bertsum.py    From nlp-recipes with MIT License 4 votes
def __init__(
        self,
        processor,
        model_name="bert-base-uncased",
        finetune_bert=True,
        cache_dir=".",
        label_smoothing=0.1,
        test=False,
        max_pos_length=768,
    ):
        """Initialize an object of BertSumAbs.

        Args:
            processor (BertSumAbsProcessor): A processor with symbols, tokenizers
                and collate functions that are used in finetuning and prediction.
            model_name (str, optional): Name of the pretrained model which is used
                to initialize the encoder of the BertSumAbs model.
                Check MODEL_CLASS for supported models. Defaults to "bert-base-uncased".
            finetune_bert (bool, optional): Whether the BERT model in the encoder is
                finetuned. Defaults to True.
            cache_dir (str, optional): Directory to cache the tokenizer.
                Defaults to ".".
            label_smoothing (float, optional): The amount of label smoothing.
                Value range is [0, 1]. Defaults to 0.1.
            test (bool, optional): Whether the class is initialized for testing.
                It must be True if the class object is only initialized to load a
                checkpoint for testing/inference. Defaults to False.
            max_pos_length (int, optional): Maximum positional embedding length for the
                input. Defaults to 768.
        """
        model = MODEL_CLASS[model_name].from_pretrained(
            model_name, cache_dir=cache_dir, num_labels=0, output_loading_info=False
        )
        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)

        if model_name not in self.list_supported_models():
            raise ValueError(
                "Model name {} is not supported by BertSumAbs. "
                "Call 'BertSumAbs.list_supported_models()' to get all supported model "
                "names.".format(value)
            )

        self.model_class = MODEL_CLASS[model_name]
        self.cache_dir = cache_dir
        self.max_pos_length = max_pos_length

        self.model = AbsSummarizer(
            temp_dir=cache_dir,
            finetune_bert=finetune_bert,
            checkpoint=None,
            label_smoothing=label_smoothing,
            symbols=processor.symbols,
            test=test,
            max_pos=self.max_pos_length,
        )
        self.processor = processor
        self.optim_bert = None
        self.optim_dec = None 
Example #20
Source File: abstractive_summarization_bertsum.py    From nlp-recipes with MIT License 4 votes
def __init__(
        self,
        model_name="bert-base-uncased",
        to_lower=True,
        cache_dir=".",
        max_src_len=640,
        max_tgt_len=140,
    ):
        """ Initialize the preprocessor.

        Args:
            model_name (str, optional): Transformer model name used in preprocessing.
                Check MODEL_CLASS for supported models. Defaults to "bert-base-uncased".
            to_lower (bool, optional): Whether to convert all letters to lower case
                during tokenization. This is determined by whether a cased model is used.
                Defaults to True, which corresponds to an uncased model.
            cache_dir (str, optional): Directory to cache the tokenizer.
                Defaults to ".".
            max_src_len (int, optional): Max number of tokens that can be used
                as input. Defaults to 640.
            max_tgt_len (int, optional): Max number of tokens that can be used
                in the target. Defaults to 140.

        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=to_lower,
            cache_dir=cache_dir,
            output_loading_info=False,
        )

        self.symbols = {
            "BOS": self.tokenizer.vocab["[unused0]"],
            "EOS": self.tokenizer.vocab["[unused1]"],
            "PAD": self.tokenizer.vocab["[PAD]"],
            "EOQ": self.tokenizer.vocab["[unused2]"],
        }

        self.sep_token = "[SEP]"
        self.cls_token = "[CLS]"
        self.pad_token = "[PAD]"
        self.tgt_bos = self.symbols["BOS"]
        self.tgt_eos = self.symbols["EOS"]

        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len 
Example #21
Source File: extractive_summarization.py    From nlp-recipes with MIT License 4 votes
def __init__(
        self,
        model_name="distilbert-base-uncased",
        to_lower=False,
        cache_dir=".",
        max_nsents=200,
        max_src_ntokens=2000,
        min_nsents=3,
        min_src_ntokens=5,
    ):
        """ Initialize the preprocessor.

        Args:
            model_name (str, optional): Transformer model name used in preprocessing.
                Check MODEL_CLASS for supported models. Defaults to "distilbert-base-uncased".
            to_lower (bool, optional): Whether to convert all letters to lower case
                during tokenization. This is determined by whether a cased model is used.
                Defaults to False, which corresponds to a cased model.
            cache_dir (str, optional): Directory to cache the tokenizer.
                Defaults to ".".
            max_nsents (int, optional): Max number of sentences that can be used
                as input. Defaults to 200.
            max_src_ntokens (int, optional): Max number of tokens that can be used
                as input. Defaults to 2000.
            min_nsents (int, optional): Minimum number of sentences that are required
                as input. If the input has fewer sentences than this value,
                it's skipped and cannot be used as a valid input. Defaults to 3.
            min_src_ntokens (int, optional): Minimum number of tokens that are required
                in an input sentence. If the input sentence has fewer tokens
                than this value, it's skipped and cannot be used as a valid sentence.
                Defaults to 5.

        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=to_lower,
            cache_dir=cache_dir,
            output_loading_info=False,
        )
        self.sep_vid = self.tokenizer.vocab["[SEP]"]
        self.cls_vid = self.tokenizer.vocab["[CLS]"]
        self.pad_vid = self.tokenizer.vocab["[PAD]"]

        self.max_nsents = max_nsents
        self.max_src_ntokens = max_src_ntokens
        self.min_nsents = min_nsents
        self.min_src_ntokens = min_src_ntokens 
Example #22
Source File: language_modeling.py    From catalyst with Apache License 2.0 4 votes
def __init__(
        self,
        texts: Iterable[str],
        tokenizer: Union[str, PreTrainedTokenizer],
        max_seq_length: int = None,
        sort: bool = True,
        lazy: bool = False,
    ):
        """
        Args:
            texts (Iterable): Iterable object with texts
            tokenizer (str or tokenizer): pretrained
                HuggingFace tokenizer or model name
            max_seq_length (int): max sequence length to tokenize
            sort (bool): If True then sort all sequences by length
                for efficient padding
            lazy (bool): If True then tokenize and encode each sequence
                in the __getitem__ method, otherwise tokenize everything
                in __init__. If set to True, sorting is unavailable.
        """
        if sort and lazy:
            raise Exception(
                "lazy is set to True so we can't sort"
                " sequences by length.\n"
                "You should set sort=False and lazy=True"
                " if you want to encode text in __get_item__ function"
            )
        if isinstance(tokenizer, str):
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        elif isinstance(
            tokenizer, transformers.tokenization_utils.PreTrainedTokenizer
        ):
            self.tokenizer = tokenizer
        else:
            raise TypeError(
                "tokenizer argument should be a model name"
                + " or huggingface PreTrainedTokenizer"
            )

        self.max_seq_length = max_seq_length

        self.lazy = lazy

        if lazy:
            self.texts = texts

        if not lazy:
            pbar = tqdm(texts, desc="tokenizing texts")
            self.encoded = [
                self.tokenizer.encode(text, max_length=max_seq_length)
                for text in pbar
            ]
            if sort:
                self.encoded.sort(key=len)

        self.length = len(texts)

        self._getitem_fn = (
            self._getitem_lazy if lazy else self._getitem_encoded
        ) 
Example #23
Source File: text_classification.py    From catalyst with Apache License 2.0 4 votes
def __init__(
        self,
        texts: List[str],
        labels: List[str] = None,
        label_dict: Mapping[str, int] = None,
        max_seq_length: int = 512,
        model_name: str = "distilbert-base-uncased",
    ):
        """
        Args:
            texts (List[str]): a list with texts to classify or to train the
                classifier on
            labels List[str]: a list with classification labels (optional)
            label_dict (dict): a dictionary mapping class names to class ids,
                to be passed to the validation data (optional)
            max_seq_length (int): maximal sequence length in tokens,
                texts will be stripped to this length
            model_name (str): transformer model name, needed to perform
                appropriate tokenization
        """
        self.texts = texts
        self.labels = labels
        self.label_dict = label_dict
        self.max_seq_length = max_seq_length

        if self.label_dict is None and labels is not None:
            # {'class1': 0, 'class2': 1, 'class3': 2, ...}
            # using this instead of `sklearn.preprocessing.LabelEncoder`
            # to easily handle unknown target values
            self.label_dict = dict(
                zip(sorted(set(labels)), range(len(set(labels))))
            )

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # suppresses tokenizer warnings
        logging.getLogger("transformers.tokenization_utils").setLevel(
            logging.FATAL
        )

        # special tokens for transformers
        # in the simplest case a [CLS] token is added in the beginning
        # and [SEP] token is added in the end of a piece of text
        # [CLS] <indexed text tokens> [SEP] ... <[PAD]>
        self.sep_vid = self.tokenizer.vocab["[SEP]"]
        self.cls_vid = self.tokenizer.vocab["[CLS]"]
        self.pad_vid = self.tokenizer.vocab["[PAD]"] 
Example #24
Source File: benchmarks.py    From exbert with Apache License 2.0 4 votes
def _compute_tensorflow(model_names, dictionary, average_over, amp):
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name)
        model = TFAutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)

        max_input_size = tokenizer.max_model_input_sizes[model_name]
        batch_sizes = [1, 2, 4, 8]
        slice_sizes = [8, 64, 128, 256, 512, 1024]

        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}

        print("Using model", model)

        @tf.function
        def inference(inputs):
            return model(inputs)

        for batch_size in batch_sizes:
            for slice_size in slice_sizes:
                if max_input_size is not None and slice_size > max_input_size:
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                else:
                    sequence = tf.stack(
                        [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size
                    )

                    try:
                        print("Going through model with sequence of shape", sequence.shape)
                        # To make sure that the model is traced + that the tensors are on the appropriate device
                        inference(sequence)

                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
                    except tf.errors.ResourceExhaustedError as e:
                        print("Doesn't fit on GPU.", e)
                        torch.cuda.empty_cache()
                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
    return dictionary 
Example #25
Source File: benchmarks.py    From exbert with Apache License 2.0 4 votes
def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
        model = AutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)

        max_input_size = tokenizer.max_model_input_sizes[model_name]
        batch_sizes = [1, 2, 4, 8]
        slice_sizes = [8, 64, 128, 256, 512, 1024]

        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:
            if fp16:
                model.half()
            model.to(device)
            model.eval()
            for slice_size in slice_sizes:
                if max_input_size is not None and slice_size > max_input_size:
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                else:
                    sequence = torch.tensor(tokenized_sequence[:slice_size], device=device).repeat(batch_size, 1)
                    try:
                        if torchscript:
                            print("Tracing model with sequence size", sequence.shape)
                            inference = torch.jit.trace(model, sequence)
                            inference(sequence)
                        else:
                            inference = model
                            inference(sequence)

                        print("Going through model with sequence of shape", sequence.shape)
                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
                    except RuntimeError as e:
                        print("Doesn't fit on GPU.", e)
                        torch.cuda.empty_cache()
                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
    return dictionary 
Example #26
Source File: wordpiece_indexer.py    From NLP_Toolkit with Apache License 2.0 4 votes
def __init__(self,
                 pretrained_model: str,
                 use_starting_offsets: bool = False,
                 do_lowercase: bool = True,
                 never_lowercase: List[str] = None,
                 max_pieces: int = 512,
                 max_pieces_per_token=5,
                 is_test=False,
                 truncate_long_sequences: bool = True,
                 special_tokens_fix: int = 0) -> None:
        if pretrained_model.endswith("-cased") and do_lowercase:
            logger.warning("Your BERT model appears to be cased, "
                           "but your indexer is lowercasing tokens.")
        elif pretrained_model.endswith("-uncased") and not do_lowercase:
            logger.warning("Your BERT model appears to be uncased, "
                           "but your indexer is not lowercasing tokens.")

        bert_tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model, do_lower_case=do_lowercase, do_basic_tokenize=False)

        # to adjust all tokenizers
        if hasattr(bert_tokenizer, 'encoder'):
            bert_tokenizer.vocab = bert_tokenizer.encoder
        if hasattr(bert_tokenizer, 'sp_model'):
            bert_tokenizer.vocab = defaultdict(lambda: 1)
            for i in range(bert_tokenizer.sp_model.get_piece_size()):
                bert_tokenizer.vocab[bert_tokenizer.sp_model.id_to_piece(i)] = i

        if special_tokens_fix:
            bert_tokenizer.add_tokens([START_TOKEN])
            bert_tokenizer.vocab[START_TOKEN] = len(bert_tokenizer) - 1

        if "roberta" in pretrained_model:
            bpe_ranks = bert_tokenizer.bpe_ranks
            byte_encoder = bert_tokenizer.byte_encoder
        else:
            bpe_ranks = {}
            byte_encoder = None

        super().__init__(vocab=bert_tokenizer.vocab,
                         bpe_ranks=bpe_ranks,
                         byte_encoder=byte_encoder,
                         wordpiece_tokenizer=bert_tokenizer.tokenize,
                         namespace="bert",
                         use_starting_offsets=use_starting_offsets,
                         max_pieces=max_pieces,
                         max_pieces_per_token=max_pieces_per_token,
                         is_test=is_test,
                         do_lowercase=do_lowercase,
                         never_lowercase=never_lowercase,
                         start_tokens=["[CLS]"] if not special_tokens_fix else [],
                         end_tokens=["[SEP]"] if not special_tokens_fix else [],
                         truncate_long_sequences=truncate_long_sequences)