Sunday, June 23, 2024

Fine-Tune Pretrained Large Language Models for Local Use

Fine-Tuning Pretrained LLMs

Fine-tuning is a technique used to improve the performance of pre-trained Large Language Models (LLMs) on a specific task or domain. During fine-tuning, the model learns the vocabulary, style, and nuances of the target domain, improving its performance on that particular task.

Benefits of Fine-Tuning:

  • Increased Accuracy and Relevance: The LLM becomes more accurate and relevant to your specific needs by specializing in a particular domain.
  • Efficiency: Fine-tuning a pre-trained model is often faster and requires less computational power than training a new LLM from scratch (a parameter-efficient variant is sketched after this list).
  • Leveraging Existing Knowledge: The LLM retains its general language understanding from pre-training, which serves as a foundation for domain-specific learning.
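
Full fine-tuning of an 8-billion-parameter model can easily exceed the memory of a single local GPU. A common alternative is parameter-efficient fine-tuning with LoRA, which freezes the base model and trains only small adapter matrices. The snippet below is an illustrative sketch, not part of the training script later in this post (which does full fine-tuning); it assumes the peft library is installed and reuses the same local model path used further down.

# Illustrative LoRA sketch (assumes `pip install peft`); the training script below does full fine-tuning instead.
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, TaskType, get_peft_model

model_name = "E:\\Niraj_Work\\LLM_Models\\Meta-Llama-3-8B-Instruct"  # same local path as in the training script
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # sequence-classification head
    r=8,                         # rank of the low-rank adapter matrices
    lora_alpha=16,
    lora_dropout=0.05,
)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()  # typically well under 1% of all parameters are trainable
# `model` can then be passed to the same Trainer setup shown in the training script below.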

Video Tutorial on Fine-Tuning Pretrained Large Language Models.


Training: Fine-Tuning Pretrained Large Language Models

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch

# Sample data creation
data = {
    "text": [
        "I loved this movie! The acting was great and the story was gripping.",
        "This was a terrible movie. The plot was predictable and the characters were uninteresting.",
        "The film was fantastic! I highly recommend it.",
        "I did not enjoy this movie at all. It was too slow and boring.",
        "The special effects were amazing, but the storyline was weak.",
        "An excellent film with a touching story.",
        "The movie was a masterpiece with brilliant performances.",
        "I found the movie to be dull and uninspiring.",
        "The direction and cinematography were outstanding.",
        "The movie was overly long and felt drawn out.",
        "An absolutely thrilling and engaging film.",
        "The characters were flat and the dialogue was poor.",
        "A wonderful film experience with a powerful message.",
        "The plot was confusing and hard to follow.",
        "One of the best movies I have seen this year!",
        "The acting was subpar and the story lacked depth.",
        "A heartwarming tale with excellent performances.",
        "The movie was full of clichés and very predictable.",
        "An emotional rollercoaster with a satisfying ending.",
        "I couldn't connect with the characters or the story.",
    ],
    "sentiment": [
        "positive", "negative", "positive", "negative", "negative",
        "positive", "positive", "negative", "positive", "negative",
        "positive", "negative", "positive", "negative", "positive",
        "negative", "positive", "negative", "positive", "negative"
    ]
}

df = pd.DataFrame(data)
train_df, val_df = df[:int(len(df)*0.8)], df[int(len(df)*0.8):]

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

model_name = "E:\\Niraj_Work\\LLM_Models\\Meta-Llama-3-8B-Instruct"  # Local path to the Meta-Llama-3-8B-Instruct checkpoint; replace with your own model path
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure a padding token is set (LLaMA tokenizers do not define one by default)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

# Mapping sentiment to numerical labels
def map_labels(example):
    example['label'] = 1 if example['sentiment'] == 'positive' else 0
    return example

tokenized_train = tokenized_train.map(map_labels)
tokenized_val = tokenized_val.map(map_labels)

# Remove the original 'sentiment' column
tokenized_train = tokenized_train.remove_columns(['sentiment'])
tokenized_val = tokenized_val.remove_columns(['sentiment'])

# Load the base model with a 2-label classification head, resize the embeddings for the added
# padding token, and register the padding token id in the model config
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# Define Data Collator with Padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir='E:\\Niraj_Work\\LLM_Models\\Meta-Llama-3-8B-Instruct_updated',
    evaluation_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer (uncomment and point the test script at this directory)
# model.save_pretrained('./fine-tuned-llama')
# tokenizer.save_pretrained('./fine-tuned-llama')
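
The Trainer above evaluates on the validation split at the end of every epoch but reports only the evaluation loss. If you also want accuracy, a compute_metrics function can be passed when constructing the Trainer. A minimal sketch, assuming NumPy is available and reusing the Trainer from the script above:

import numpy as np

def compute_metrics(eval_pred):
    # The Trainer passes a (logits, labels) tuple for the evaluation set
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}

# Passed alongside the other Trainer arguments:
# trainer = Trainer(..., compute_metrics=compute_metrics)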

Test: Fine-Tuning Pretrained Large Language Models

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd

# Load the fine-tuned model and tokenizer
model_name = 'E:\\Niraj_Work\\LLM_Models\\Meta-Llama-3-8B-Instruct_updated' # Directory where the model and tokenizer are saved
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Sample test data
test_data = {
    "text": [
        "This movie was fantastic! I really enjoyed it.",
        "I didn't like this movie at all. It was very boring.",
        # Add more test examples as needed
    ]
}

# Create a DataFrame for test data
test_df = pd.DataFrame(test_data)

# Ensure padding token is set (reuse the EOS token if none was saved with the tokenizer)
if tokenizer.pad_token is None:
    # tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the test data
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128, return_tensors='pt')

tokenized_test = test_df['text'].apply(lambda x: tokenize_function({'text': [x]}))  # not used below; predict() tokenizes each text on the fly

# Predict function
def predict(text_list):
    model.eval()
    predictions = []
    with torch.no_grad():
        for text in text_list:
            inputs = tokenize_function({'text': [text]})
            input_ids = inputs['input_ids'].to(model.device)
            attention_mask = inputs['attention_mask'].to(model.device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_class_id = torch.argmax(logits, dim=1).item()
            predictions.append(predicted_class_id)
    return predictions

# Convert the predictions to sentiment labels
sentiment_map = {0: 'negative', 1: 'positive'}
predicted_labels = predict(test_data['text'])

# Map numerical labels to sentiment labels
predicted_sentiments = [sentiment_map[label] for label in predicted_labels]

# Display the results
for text, sentiment in zip(test_data['text'], predicted_sentiments):
    print(f'Text: {text}\nPredicted Sentiment: {sentiment}\n')

Test Output:
C:\Users\admin\AppData\Local\Programs\Python\Python310\python.exe E:/Niraj_Work/DL_Projects/llm_projects/llm_advance_1/test_abcd.py
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [18:59<00:00, 284.85s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at E:\Niraj_Work\LLM_Models\Meta-Llama-3-8B-Instruct_updated and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Text: This movie was fantastic! I really enjoyed it.
Predicted Sentiment: positive

Text: I didn't like this movie at all. It was very boring.
Predicted Sentiment: negative


Process finished with exit code 0
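
For a quick sanity check, the same fine-tuned directory can also be loaded through the transformers pipeline API instead of the custom predict function above. A minimal sketch, assuming the output directory from the training script; without an id2label mapping in the saved config, the pipeline reports the generic names LABEL_0 and LABEL_1 (LABEL_1 corresponds to 'positive' here):

from transformers import pipeline

model_dir = 'E:\\Niraj_Work\\LLM_Models\\Meta-Llama-3-8B-Instruct_updated'
classifier = pipeline('text-classification', model=model_dir, tokenizer=model_dir)

print(classifier("This movie was fantastic! I really enjoyed it."))
# Prints a list like [{'label': 'LABEL_1', 'score': <confidence>}]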
