I am currently trying to replicate the article

https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f

to get an introduction to PyTorch and BERT.

I used my own sample corpus and corresponding targets as practice, but the code throws the following error:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-4-8577755f37de> in <module>()
    201 LR = 1e-6
    202 
--> 203 trainer(model, df_train, df_val, LR, EPOCHS)

3 frames
<ipython-input-4-8577755f37de> in trainer(model, train_data, val_data, learning_rate, epochs)
    162                 output = model(input_id, mask)
    163 
--> 164                 batch_loss = criterion(output, torch.max(train_label,1)[1])
    165                 total_loss_train += batch_loss.item()
    166 

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1100         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102             return forward_call(*input, **kwargs)
   1103         # Do not call functions when jit is used
   1104         full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
   1150         return F.cross_entropy(input, target, weight=self.weight,
   1151                                ignore_index=self.ignore_index, reduction=self.reduction,
-> 1152                                label_smoothing=self.label_smoothing)
   1153 
   1154 

/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
   2844     if size_average is not None or reduce is not None:
   2845         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2846     return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
   2847 
   2848 

IndexError: Target 32 is out of bounds.

The code is mostly identical to the one in the article, except of course for the more extensive label dict.

Original:

labels = {'business':0,
          'entertainment':1,
          'sport':2,
          'tech':3,
          'politics':4
          }

Mine:

labels = {'Macroeconomics': 0,
 'Microeconomics': 1,
 'Labor Economics': 2,
 'Subnational Fiscal Issues': 3,
 'Econometrics': 4,
 'International Economics': 5,
 'Financial Economics': 6,
 'Health, Education, and Welfare': 7,
 'Public Economics': 8,
 'Development and Growth': 9,
 'Industrial Organization': 10,
 'Other': 11,
 'Environmental and Resource Economics': 12,
 'History': 13,
 'Regional and Urban Economics': 14,
 'Development Economics': 15,
 'Corporate Finance': 16,
 'Children': 17,
 'Labor Studies': 18,
 'Economic Fluctuations and Growth': 19,
 'Economics of Aging': 20,
 'Economics of Education': 21,
 'International Trade and Investment': 22,
 'Asset Pricing': 23,
 'Health Economics': 24,
 'Law and Economics': 25,
 'International Finance and Macroeconomics': 26,
 'Monetary Economics': 27,
 'Technical Working Papers': 28,
 'Political Economy': 29,
 'Development of the American Economy': 30,
 'Health Care': 31,
 'Productivity, Innovation, and Entrepreneurship': 32}

Code:

import numpy as np
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = torch.LongTensor([labels[label] for label in df["category"]])
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = np.array(range(0,len(labels)))

        return batch_texts, batch_y
    
# Splitting the sample into training, validation and test sets (80/10/10)
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))


from torch import nn

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer
    
from torch.optim import Adam
from tqdm import tqdm

def trainer(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)
    
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, torch.max(train_label,1)[1])
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
trainer(model, df_train, df_val, LR, EPOCHS)

Solution 1:

You're building the same array, np.array(range(0, len(labels))), on every __getitem__ call, regardless of idx, so torch.max(train_label, 1)[1] always picks 32 as the target class. That value is out of bounds for the model's output layer, which only produces 5 logits, hence the IndexError. You're supposed to fetch the label associated with the text found at idx.
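For illustration, the error can be reproduced in isolation (a small sketch, assuming the 5-logit output and the target value 32 seen above): cross-entropy requires every target index to be smaller than the number of classes in the input.

import torch
from torch import nn

criterion = nn.CrossEntropyLoss()
logits = torch.randn(2, 5)          # the model's head emits 5 logits (nn.Linear(768, 5))
targets = torch.tensor([32, 32])    # torch.max over range(0, 33) always yields 32
criterion(logits, targets)          # raises IndexError: Target 32 is out of bounds.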

If you replace batch_y = np.array(range(...)) with batch_y = np.array(self.labels[idx]), __getitem__ returns the label that actually belongs to the sample at idx. This lookup is in fact already implemented in your get_batch_labels method.
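
A minimal sketch of the corrected method, keeping the names from the question:

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        # Fetch the label belonging to this sample instead of rebuilding
        # the full range of class indices on every call
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

Two related points follow from the same mismatch and are worth checking: since self.labels already holds plain integer class indices, the loss can be computed as criterion(output, train_label) without the torch.max(train_label, 1)[1] step, and the classifier head needs one logit per class (nn.Linear(768, len(labels)) instead of nn.Linear(768, 5)), otherwise any target of 5 or above will still be out of bounds.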