# coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Classification fine-tuning: utilities to work with various datasets """ from __future__ import absolute_import, division, print_function import csv import logging import os import sys from io import open import enum import numpy as np from scipy.stats import pearsonr, spearmanr from sklearn.metrics import matthews_corrcoef, f1_score import torch from torch.utils.data import (DataLoader, SequentialSampler, TensorDataset) from scipy.special import softmax logger = logging.getLogger(__name__) class InputExample(object): """A single training/test example for simple sequence classification.""" def __init__(self, guid, text_a, text_b=None, label=None): """Constructs a InputExample. Args: guid: Unique id for the example. text_a: string. The untokenized text of the first sequence. For single sequence tasks, only this sequence must be specified. text_b: (Optional) string. The untokenized text of the second sequence. Only must be specified for sequence pair tasks. label: (Optional) string. The label of the example. This should be specified for train and dev examples, but not for test examples. """ self.guid = guid self.text_a = text_a self.text_b = text_b self.label = label class InputFeatures(object): """A single set of features of data.""" def __init__(self, input_ids, input_mask, segment_ids, label_id): self.input_ids = input_ids self.input_mask = input_mask self.segment_ids = segment_ids self.label_id = label_id class DataProcessor(object): """Base class for data converters for sequence classification data sets.""" def get_train_examples(self, data_dir): """Gets a collection of `InputExample`s for the train set.""" raise NotImplementedError() def get_dev_examples(self, data_dir): """Gets a collection of `InputExample`s for the dev set.""" raise NotImplementedError() def get_labels(self): """Gets the list of labels for this data set.""" raise NotImplementedError() @classmethod def _read_tsv(cls, input_file, quotechar=None): """Reads a tab separated value file.""" with open(input_file, "r", encoding="utf-8-sig") as f: reader = csv.reader(f, delimiter="\t", quotechar=quotechar) lines = [] for line in reader: if sys.version_info[0] == 2: line = list(unicode(cell, 'utf-8') for cell in line) lines.append(line) return lines class ImdbProcessor(DataProcessor): """Processor for the IMDB Movie Review data set .""" def get_train_examples(self, data_dir): """See base class.""" """logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))""" folder1 = os.path.join(data_dir, "train", "pos") folder2 = os.path.join(data_dir, "train", "neg") return self._create_examples_from_folder([folder1, folder2], "train") def get_dev_examples(self, data_dir): """See base class.""" folder1 = os.path.join(data_dir, "test", "pos") folder2 = os.path.join(data_dir, "test", "neg") return self._create_examples_from_folder([folder1, folder2], "dev") def get_labels(self): """See base class.""" return ["0", "1"] """def get_label_from_name(self, name): return label_dict[id]""" def _create_examples_from_folder(self, folder_list, set_type): """Creates examples for the training and dev sets from labelled folder.""" examples = [] i = 0 for folder in folder_list: label_dir = os.path.basename(folder) for input_file in os.listdir(folder): if input_file.endswith(".txt"): with open(os.path.join(folder, input_file), "r") as f: text_a=f.readlines()[0] guid = "%s-%d" % (set_type, i); i+=1 text_b = None label = task_labels[label_dir] """logger.info("guid: %s, file: <%s>, text_a= <%s>, label=%s" % (guid, input_file, text_a, label))""" examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, output_mode, cls_token_at_end=False, pad_on_left=False, cls_token='[CLS]', sep_token='[SEP]', pad_token=0, sequence_a_segment_id=0, sequence_b_segment_id=1, cls_token_segment_id=1, pad_token_segment_id=0, mask_padding_with_zero=True): """ Loads a data file into a list of `InputBatch`s `cls_token_at_end` define the location of the CLS token: - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) """ label_map = {label : i for i, label in enumerate(label_list)} features = [] for (ex_index, example) in enumerate(examples): tokens_a = tokenizer.tokenize(example.text_a) tokens_b = None if example.text_b: tokens_b = tokenizer.tokenize(example.text_b) # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: tokens_a = tokens_a[:(max_seq_length - 2)] # The convention in BERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = tokens_a + [sep_token] segment_ids = [sequence_a_segment_id] * len(tokens) if tokens_b: tokens += tokens_b + [sep_token] segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) if cls_token_at_end: tokens = tokens + [cls_token] segment_ids = segment_ids + [cls_token_segment_id] else: tokens = [cls_token] + tokens segment_ids = [cls_token_segment_id] + segment_ids input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) # Zero-pad up to the sequence length. padding_length = max_seq_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids else: input_ids = input_ids + ([pad_token] * padding_length) input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length if output_mode == "classification": label_id = label_map[example.label] elif output_mode == "regression": label_id = float(example.label) else: raise KeyError(output_mode) if ex_index < 0: logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) logger.info("tokens: %s" % " ".join( [str(x) for x in tokens])) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) logger.info("label: %s (id = %d)" % (example.label, label_id)) features.append( InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id)) return features def _truncate_seq_pair(tokens_a, tokens_b, max_length): """Truncates a sequence pair in place to the maximum length.""" # This is a simple heuristic which will always truncate the longer sequence # one token at a time. This makes more sense than truncating an equal percent # of tokens from each, since if one sequence is very short then each token # that's truncated likely contains more information than a longer sequence. while True: total_length = len(tokens_a) + len(tokens_b) if total_length <= max_length: break if len(tokens_a) > len(tokens_b): tokens_a.pop() else: tokens_b.pop() def simple_accuracy(preds, labels): return (preds == labels).mean() def acc_and_f1(preds, labels): acc = simple_accuracy(preds, labels) f1 = f1_score(y_true=labels, y_pred=preds) return { "acc": acc, "f1": f1, "acc_and_f1": (acc + f1) / 2, } def pearson_and_spearman(preds, labels): pearson_corr = pearsonr(preds, labels)[0] spearman_corr = spearmanr(preds, labels)[0] return { "pearson": pearson_corr, "spearmanr": spearman_corr, "corr": (pearson_corr + spearman_corr) / 2, } def compute_metrics(task_name, preds, labels): assert len(preds) == len(labels) if task_name == "imdb": return acc_and_f1(preds, labels) elif task_name == "quora": return acc_and_f1(preds, labels) else: raise KeyError(task_name) processors = { "imdb": ImdbProcessor, } output_modes = { "imdb": "classification", "quora": "classification", } task_labels = { "neg": "0", "pos": "1", } def do_inference(args, model, tokenizer): inf_task = args.task_name inf_dataset = load_example(args, inf_task, tokenizer) inf_sampler = SequentialSampler(inf_dataset) inf_dataloader = DataLoader(inf_dataset, sampler=inf_sampler, batch_size=1) # Inference! logger.info("***** Running inference *****") preds = None out_label_ids = None for batch in inf_dataloader: model.eval() batch = tuple(t for t in batch) with torch.no_grad(): inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids 'labels': batch[3]} outputs = model(**inputs) inf_loss, logits = outputs[:2] pred_arr = logits.detach().cpu().numpy() out_label_ids = inputs['labels'].detach().cpu().numpy() pred_prob = np.squeeze(softmax(pred_arr, axis=1)) logger.info("[0]: %s, [1]: %s", pred_prob[0], pred_prob[1]) if args.output_mode == "classification": pred = np.argmax(pred_arr, axis=1) elif args.output_mode == "regression": pred = np.squeeze(pred_arr) confidence = 0 if pred == 0: confidence = pred_prob[0]*100 logger.info("Text is negative with confidence: %d ", confidence) else: confidence = pred_prob[1]*100 logger.info("Text is positive with confidence: %d ", confidence) return (pred, confidence) def load_example(args, task, tokenizer): processor = processors[task]() output_mode = output_modes[task] label_list = processor.get_labels() examples = [InputExample(guid=0, text_a=args.text, text_b=None, label='1')] features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode, cls_token_at_end=bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, sep_token=tokenizer.sep_token, cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1, pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0) # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) return dataset