Uploaded model
- Developed by: vakodiya
- License: apache-2.0
- Finetuned from model: unsloth/Meta-Llama-3.1-8B-bnb-4bit

This Llama model was trained 2x faster with Unsloth and Hugging Face's TRL library.

Code to train the model on Google Colab:
Installing the required packages:

```python
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Pick the xformers build that matches the installed torch version
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton
```
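As an optional sanity check (not part of the original notebook), you can confirm which torch and xformers versions the install resolved to:

```python
# Optional sanity check: confirm the torch / xformers versions resolved by the install
import torch
import xformers

print("torch:", torch.__version__)
print("xformers:", xformers.__version__)
```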
Importing the required modules:

```python
import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported
```
Logging in to Hugging Face with an access token stored in Colab secrets:

```python
from huggingface_hub import login
from google.colab import userdata

hf_token = userdata.get('HF_API_KEY')
login(token=hf_token)
```
Checking if a GPU is available:

```python
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and being used.")
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")
```
Loading the model from Hugging Face and adding LoRA adapters:

```python
max_seq_length = 1024

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
)
```
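get_peft_model wraps the base model with LoRA adapters, so only a small fraction of the weights is trainable. Assuming the wrapped model exposes the standard PEFT helper, you can verify this directly:

```python
# Optional: show how many parameters the LoRA adapters leave trainable (standard PEFT helper)
model.print_trainable_parameters()
```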
Loading and formatting the dataset:

```python
raw_dataset = load_dataset("viber1/indian-law-dataset", split="train[:1000]")

# Define a simple prompt template using only Instruction and Response
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{}
### Response:
{}"""

# EOS token for marking the end of each example
EOS_TOKEN = tokenizer.eos_token

# Function to format prompts with only Instruction and Response
def formatting_prompts_func(examples):
    instructions = examples["Instruction"]
    responses = examples["Response"]
    # Create a formatted text for each example
    texts = []
    for instruction, response in zip(instructions, responses):
        # Format the text with the prompt template and add the EOS token
        text = alpaca_prompt.format(instruction, response) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Apply the formatting function to the dataset
dataset = raw_dataset.map(formatting_prompts_func, batched=True)
```
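Before training, it is worth printing one formatted record to confirm that the template and the EOS token landed where expected:

```python
# Inspect the first formatted training example
print(dataset[0]["text"])
```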
Using the trainer with a small batch size, gradient checkpointing, LoRA, and quantization:

```python
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True,
    args=TrainingArguments(
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        num_train_epochs=1,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        warmup_steps=10,
        output_dir="output",
        seed=0,
    ),
)
```
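With per_device_train_batch_size=1 and gradient_accumulation_steps=1 the effective batch size is 1; if you raise either value, their product is what matters for the learning-rate choice. A quick check:

```python
# Effective batch size = per-device batch size * gradient accumulation steps
args = trainer.args
print("Effective batch size:", args.per_device_train_batch_size * args.gradient_accumulation_steps)
```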
Showing current memory stats:

```python
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
```
Starting training:

```python
trainer_stats = trainer.train()
```
Showing final memory and time stats:

```python
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
```
Finally, saving the trained model merged to 16-bit and pushing it to Hugging Face:

```python
# Merge the LoRA adapters into the base model and save in 16-bit
model.save_pretrained_merged("Indian-Law-Llama-3.1-8B", tokenizer, save_method="merged_16bit")
model.push_to_hub_merged("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B", tokenizer, save_method="merged_16bit", token=hf_token)
```
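Merging to 16-bit writes out full model weights. If you only want the lightweight LoRA adapters, a hedged alternative (not the original notebook's save path; the "-lora" repo name below is hypothetical) is to use the standard PEFT and Hub methods:

```python
# Alternative (not in the original notebook): save and push only the LoRA adapters
model.save_pretrained("Indian-Law-Llama-3.1-8B-lora")
tokenizer.save_pretrained("Indian-Law-Llama-3.1-8B-lora")
model.push_to_hub("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B-lora", token=hf_token)  # hypothetical repo name
tokenizer.push_to_hub("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B-lora", token=hf_token)
```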
Model usage with a streaming response:

```python
# alpaca_prompt = copied from above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

inputs = tokenizer(
    [
        alpaca_prompt.format(
            "What is the difference between a petition and a plaint in Indian law?",
            "",  # leave the response empty so the model generates it
        )
    ],
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
```
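If you want the full answer as a string rather than streaming it to stdout, a minimal non-streaming variant with the same inputs would be:

```python
# Non-streaming variant: generate, then decode the full output
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```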
Model tree for vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B
- Base model: meta-llama/Llama-3.1-8B
- Quantized: unsloth/Meta-Llama-3.1-8B-bnb-4bit