# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Also available under a BSD-style license. See LICENSE.
"""
Trains a BERT model using Lazy Tensor Core (LTC) with the example
Torch-MLIR backend.

Most of the code in this example was copied from the wonderful tutorial
    https://huggingface.co/transformers/training.html#fine-tuning-in-native-pytorch

Based on LTC code samples by ramiro050
    https://github.com/ramiro050/lazy-tensor-samples
"""

import argparse
import sys
from typing import List

import torch
import torch._C
import torch._lazy
from datasets import load_dataset
from datasets.dataset_dict import DatasetDict
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification, \
    BertConfig, BertTokenizer, AdamW, get_scheduler


def tokenize_dataset(dataset: DatasetDict) -> DatasetDict:
    """
    Tokenize the ``text`` column of a dataset with the bert-base-cased tokenizer.

    The raw ``text`` column is dropped afterwards, ``label`` is renamed to
    ``labels``, and the output format is switched to ``'torch'``.

    :param dataset: dataset containing ``text`` and ``label`` columns
    :return: the tokenized dataset
    """
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    def _encode_batch(examples):
        # Pad to the tokenizer's maximum length and truncate longer inputs.
        return bert_tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
        )

    encoded = (
        dataset.map(_encode_batch, batched=True)
        .remove_columns(['text'])
        .rename_column('label', 'labels')
    )
    encoded.set_format('torch')

    return encoded


def train(model: BertForSequenceClassification,
          num_epochs: int,
          num_training_steps: int,
          train_dataloader: DataLoader,
          device: torch.device) -> List[torch.Tensor]:
    """
    Run a training loop over the dataloader and collect the per-step losses.

    Uses AdamW (lr=5e-5) with a linear schedule and no warmup. After every
    optimizer step, if the string form of ``model.device`` contains ``'lazy'``,
    ``torch._lazy.mark_step()`` is called.

    :param model: model to train; it is switched to training mode
    :param num_epochs: number of passes over ``train_dataloader``
    :param num_training_steps: total step count handed to the LR scheduler
    :param train_dataloader: source of batches (mappings of name -> tensor)
    :param device: device every batch tensor is moved to before the forward pass
    :return: list with the loss tensor of each training step, in order
    """
    optimizer = AdamW(model.parameters(), lr=5e-5)
    lr_scheduler = get_scheduler(
        'linear',
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    model.train()
    step_losses = []
    for _epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            # Keep the full output object bound for the whole step, as the
            # original loop did.
            outputs = model(**batch)
            step_loss = outputs.loss
            step_loss.backward()
            step_losses.append(step_loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            if 'lazy' not in str(model.device):
                continue
            print("Calling Mark Step")
            torch._lazy.mark_step()

    return step_losses


def main(device='lazy', full_size=False):
    """
    Train a BERT sequence classifier on a two-example IMDB subset.

    Load model to specified device. Ensure that any backends have been
    initialized by this point.

    :param device: name of device to load tensors to
    :param full_size: if true, use a full pretrained bert-base-cased model instead of a smaller variant
    :return: tuple of (trained model, list of per-step loss tensors)
    """
    # Fixed seed so repeated runs start from the same RNG state.
    torch.manual_seed(0)

    tokenized_datasets = tokenize_dataset(load_dataset('imdb'))
    small_train_dataset = tokenized_datasets['train'].shuffle(seed=42) \
        .select(range(2))

    train_dataloader = DataLoader(small_train_dataset, shuffle=True,
                                  batch_size=8)
    if full_size:
        model = BertForSequenceClassification.from_pretrained('bert-base-cased',
                                                              num_labels=2)
    else:
        # Small, randomly initialised configuration.
        configuration = BertConfig(
            vocab_size=28996,
            hidden_size=32,
            num_hidden_layers=1,
            num_attention_heads=2,
            intermediate_size=32,
            hidden_act='gelu',
            hidden_dropout_prob=0.0,
            attention_probs_dropout_prob=0.0,
            max_position_embeddings=512,
            layer_norm_eps=1.0e-05,
        )
        model = BertForSequenceClassification(configuration)

    model.to(device)

    num_epochs = 3
    num_training_steps = num_epochs * len(train_dataloader)
    losses = train(model, num_epochs, num_training_steps, train_dataloader, device)

    # Get debug information from LTC.
    # Take the backend module from sys.modules rather than relying on a
    # `lazy_backend` global: that global only exists when this file is run as
    # a script, so callers importing main() would otherwise hit a NameError.
    lazy_backend = sys.modules.get('torch_mlir._mlir_libs._REFERENCE_LAZY_BACKEND')
    if lazy_backend is not None:
        computation = lazy_backend.get_latest_computation()
        if computation:
            print(computation.debug_string())

    print('Loss: ', losses)

    return model, losses


if __name__ == "__main__":
    # Command-line entry point: choose a backend, initialise it, then train.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-d", "--device",
        type=str.upper,  # upper-cases the value before it is checked against choices
        choices=["CPU", "TS", "MLIR_EXAMPLE"],
        default="MLIR_EXAMPLE",
        help="The device type",
    )
    arg_parser.add_argument(
        "-f", "--full_size",
        action='store_true',
        default=False,
        help="Use full sized BERT model instead of one with smaller parameterization",
    )
    args = arg_parser.parse_args()

    # Step 1: initialise the requested lazy backend, if any.
    if args.device == "TS":
        import torch._lazy.ts_backend

        torch._lazy.ts_backend.init()
    elif args.device == "MLIR_EXAMPLE":
        # Bound at module level under the name `lazy_backend` on purpose.
        import torch_mlir._mlir_libs._REFERENCE_LAZY_BACKEND as lazy_backend

        lazy_backend._initialize()

    # Step 2: map the CLI choice onto the torch device name.
    if args.device not in ("TS", "MLIR_EXAMPLE"):
        device = args.device.lower()
    else:
        device = "lazy"
        print("Initialized backend")

    main(device, args.full_size)