Creating and Uploading a Dataset with Unsloth: An Adventure in Wonderland
Comprehensive Guide: From Dataset Creation to Fine-Tuning and Using Your Model
Introduction
In this guide, we'll walk through the process of creating a dataset by scraping content from GitHub repositories and documentation sites, uploading the dataset to Hugging Face, fine-tuning a language model using the dataset, and finally, using the fine-tuned model. To make things engaging, we'll use analogies from "Alice in Wonderland" to explain each step.
Step 1: Setting Up the Environment
Before we can start, we need to set up our environment. Think of this step as Alice preparing for her adventure into Wonderland.
Install Necessary Libraries:
pip install beautifulsoup4 gitpython huggingface_hub datasets requests
Clone or Pull the Repository: We will clone the repository if it doesn’t exist or pull the latest changes if it does.
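Throughout Steps 2 through 5, the code snippets assume the following imports and a small verbose_print logging helper (both repeated in the complete notebook at the end of this guide):
import os
import json
import requests
from bs4 import BeautifulSoup
from git import Repo
from huggingface_hub import HfApi
from datasets import Dataset, DatasetDict

def verbose_print(message):
    print(f"[INFO] {message}")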
Step 2: Cloning and Pulling the Repository
Just like the Mad Hatter’s tea party, this step involves organizing the chaos of data.
Clone or Pull Repository Function:
def clone_or_pull_repo(repo_url, repo_name):
    if os.path.exists(repo_name):
        verbose_print(f"Repository {repo_name} already exists. Pulling latest changes.")
        repo = Repo(repo_name)
        repo.remotes.origin.pull()
    else:
        verbose_print(f"Cloning repository from {repo_url}")
        Repo.clone_from(repo_url, repo_name)
Extract Markdown Files: We extract all Markdown files from the repository to scrape content.
def extract_markdown_files(repo_path):
    verbose_print(f"Extracting Markdown files from {repo_path}")
    markdown_files = []
    for root, dirs, files in os.walk(repo_path):
        for file in files:
            if file.endswith(".md"):
                markdown_files.append(os.path.join(root, file))
    return markdown_files
Step 3: Parsing and Scraping Content
This step is akin to the Cheshire Cat appearing and disappearing: content surfaces on each page, and we capture it before it vanishes.
Parse Markdown Files:
def parse_markdown(file_path):
    verbose_print(f"Parsing Markdown file {file_path}")
    with open(file_path, 'r') as file:
        content = file.read()
    sections = content.split('\n## ')
    parsed_sections = [section.replace('\n', ' ') for section in sections]
    return parsed_sections
Scrape Documentation Pages:
def get_page_links(base_url, link_selector):
    verbose_print(f"Getting page links from {base_url}")
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    page_links = []
    for link in soup.select(link_selector):
        href = link['href']
        if not href.startswith('http') and href != '#':
            href = base_url.rstrip('/') + '/' + href.lstrip('/')
        page_links.append(href)
    return page_links

def scrape_page(url, content_selector):
    verbose_print(f"Scraping content from {url}")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    page_content = []
    main_content = soup.select_one(content_selector)
    if main_content:
        sections = main_content.find_all(['h1', 'h2', 'h3', 'p', 'pre'])
        for section in sections:
            page_content.append(section.text)
    return page_content
Step 4: Creating and Saving the Dataset
We’re now ready to create and save our dataset, much like Alice collecting her memories.
Create Dataset:
def create_dataset(repo_url, doc_urls):
    dataset = []
    # Scrape GitHub repository
    repo_name = repo_url.split('/')[-1].replace('.git', '')
    clone_or_pull_repo(repo_url, repo_name)
    markdown_files = extract_markdown_files(repo_name)
    for md_file in markdown_files:
        sections = parse_markdown(md_file)
        for section in sections:
            dataset.append({
                'source': 'GitHub',
                'repository': repo_name,
                'file': md_file,
                'label': 'autogen',
                'content': section
            })
    # Scrape documentation site
    for doc_url, link_selector, content_selector in doc_urls:
        page_links = get_page_links(doc_url, link_selector)
        for page_url in page_links:
            page_content = scrape_page(page_url, content_selector)
            for section in page_content:
                dataset.append({
                    'source': 'Documentation',
                    'url': page_url,
                    'label': 'autogen',
                    'content': section
                })
    return dataset
Save and Load Dataset Locally:
def load_dataset_locally(file_path):
    if os.path.exists(file_path):
        verbose_print(f"Loading existing dataset from {file_path}")
        with open(file_path, 'r') as file:
            return json.load(file)
    verbose_print(f"No existing dataset found at {file_path}")
    return []

def save_dataset_locally(dataset, output_file):
    verbose_print(f"Saving dataset to {output_file}")
    with open(output_file, 'w') as file:
        json.dump(dataset, file, indent=4)
    verbose_print("Dataset saved successfully")
Step 5: Uploading to Hugging Face
Finally, just like sharing stories from Wonderland, we upload our dataset to Hugging Face.
Upload to Hugging Face:
def upload_to_huggingface(dataset, repo_id):
    token = os.getenv("HF_TOKEN")
    verbose_print(f"Uploading dataset to Hugging Face with repository ID {repo_id}")
    hf_api = HfApi()
    # exist_ok=True so re-running the script doesn't fail if the repo already exists
    hf_api.create_repo(repo_id, token=token, repo_type="dataset", private=False, exist_ok=True)
    # Create a DatasetDict and push to hub
    dataset_dict = DatasetDict({"train": Dataset.from_list(dataset)})
    dataset_dict.push_to_hub(repo_id, token=token)
    verbose_print(f"Dataset uploaded to Hugging Face with repository ID {repo_id}")
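The function reads your Hugging Face access token from the HF_TOKEN environment variable, which this guide never sets explicitly. A minimal sketch for providing it, assuming you have a write-scoped token from your Hugging Face account settings (the value below is a placeholder, not a real token); running huggingface-cli login is an alternative:
import os

# Placeholder token -- replace with your own write-scoped token.
os.environ["HF_TOKEN"] = "hf_your_token_here"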
Example Usage
Define Repository and Documentation URLs:
repo_url = 'https://github.com/microsoft/autogen.git'
doc_urls = [
    ('https://microsoft.github.io/autogen/docs/', 'a[href]', 'div.md-content'),
    ('https://microsoft.github.io/autogen/docs/Examples', 'a[href]', 'div.md-content'),
    ('https://microsoft.github.io/autogen/docs/notebooks', 'a[href]', 'div.md-content'),
    ('https://microsoft.github.io/autogen/blog', 'a[href]', 'div.blog-content')
]
Create, Save, and Upload Dataset:
output_file = 'autogen_python_dataset.json'
repo_id = 'dimentox/autogen-python'
verbose_print("Starting dataset creation process")
existing_dataset = load_dataset_locally(output_file)
new_dataset = create_dataset(repo_url, doc_urls)
combined_dataset = existing_dataset + new_dataset
save_dataset_locally(combined_dataset, output_file)
upload_to_huggingface(combined_dataset, repo_id)
verbose_print("Dataset creation and upload process completed")
Step 6: Fine-Tuning the Model
Now, let's move on to the fine-tuning part. This is like refining the Queen of Hearts' garden to perfection.
Fine-Tuning Script:
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from datasets import load_dataset
import torch
class AdaptiveTrainer(SFTTrainer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.prev_eval_loss = float('inf')
    def evaluate(self, *args, **kwargs):
        # Trainer.evaluate returns a metrics dict that includes "eval_loss".
        metrics = super().evaluate(*args, **kwargs)
        current_eval_loss = metrics.get("eval_loss", self.prev_eval_loss)
        # Adaptive learning rate adjustment (note: this updates the stored
        # TrainingArguments value; an already-built scheduler still drives the
        # optimizer's actual learning rate).
        if current_eval_loss > self.prev_eval_loss:
            self.args.learning_rate *= 0.9  # Reduce learning rate if loss increased
            print(f"Decreased learning rate to: {self.args.learning_rate}")
        else:
            self.args.learning_rate *= 1.05  # Slightly increase if loss decreased
            print(f"Increased learning rate to: {self.args.learning_rate}")
        self.prev_eval_loss = current_eval_loss
        return metrics
def training_step(self, *args, **kwargs):
# Adjust gradient clipping based on gradient norms
if self.state.global_step > 0 and self.state.global_step % self.args.eval_steps == 0:
current_grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
print(f"Adjusted gradient clipping to: {current_grad_norm}")
return super().training_step(*args, **kwargs)
def print_memory_stats(stage):
gpu_stats = torch.cuda.get_device_properties(0)
used_memory = round(torch.cuda.memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"[{stage}] GPU: {gpu_stats.name}, Memory Reserved: {used_memory} GB / {max_memory} GB")
max_seq_length = 2048
dtype = None
load_in_4bit = True
print("Loading model")
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
token="token"
)
print("Loading Laura")
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
lora_alpha=16,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=False,
loftq_config=None,
)
print("Loading dataset")
dataset_path = "autogen_python_dataset.json"
dataset = load_dataset("json", data_files=dataset_path, split="train")
custom_prompt = """Source: {}
Repository: {}
File: {}
Label: {}
Content: {}
"""
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
sources = examples["source"]
repositories = examples["repository"]
files = examples["file"]
labels = examples["label"]
contents = examples["content"]
texts = []
for source, repository, file, label, content in zip(sources, repositories, files, labels, contents):
text = custom_prompt.format(source, repository, file, label, content) + EOS_TOKEN
texts.append(text)
return {"text": texts}
dataset = dataset.map(formatting_prompts_func, batched=True)
trainer = AdaptiveTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=max_seq_length,
dataset_num_proc=2,
packing=False,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_steps=5,
num_train_epochs=1,
learning_rate=2e-4,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=1,
optim="adamw_8bit",
weight_decay=0.01,
lr_scheduler_type="linear",
seed=3407,
output_dir="outputs",
save_strategy="steps",
save_steps=50,
eval_steps=1,
),
)
print_memory_stats("Before Training")
# Note: resume_from_checkpoint=True requires an existing checkpoint under
# output_dir; on a first run, call trainer.train() without the argument.
trainer_stats = trainer.train(resume_from_checkpoint=True)
print_memory_stats("After Training")
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")
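One caveat: the AdaptiveTrainer's evaluation hook only fires when an evaluation dataset and an evaluation strategy are configured, and the script above sets eval_steps without providing either. A hedged sketch of how you might wire that in, holding out 10% of the formatted dataset (depending on your transformers version the argument is eval_strategy or evaluation_strategy):
# Hold out 10% of the formatted dataset for evaluation (a sketch, not part of
# the original script).
split = dataset.train_test_split(test_size=0.1, seed=3407)

trainer = AdaptiveTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        output_dir="outputs",
        eval_strategy="steps",  # "evaluation_strategy" on older transformers
        eval_steps=50,
    ),
)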
Step 7: Using the Fine-Tuned Model
Now that our model is fine-tuned, it's time to use it! This is like Alice finally understanding the Wonderland rules and using them to her advantage.
Using the Fine-Tuned Model:
from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True
print("Loading fine-tuned model")
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="lora_model",
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
token="token"
)
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
"""
<s>
Q: What is the capital of France?
A:
"""
],
return_tensors="pt"
).to("cuda")
text_streamer = TextStreamer(tokenizer)
outputs = model.generate(**inputs, streamer=text_streamer, max_new_tokens=64)
print(tokenizer.batch_decode(outputs))
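Because the LoRA adapters were trained on the custom_prompt layout from Step 6 rather than a chat-style Q/A format, prompting with that same template will generally track the training distribution more closely. A hedged sketch (the field values below are made-up examples, and custom_prompt is redeclared so the snippet stands alone):
# Same template the model was fine-tuned on in Step 6.
custom_prompt = """Source: {}
Repository: {}
File: {}
Label: {}
Content: {}
"""

prompt = custom_prompt.format(
    "Documentation",                             # source
    "autogen",                                   # repository
    "docs/Examples",                             # file
    "autogen",                                   # label
    "How do I set up a two-agent conversation?"  # content (your query)
)

inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
print(tokenizer.batch_decode(outputs))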
Conclusion
In this comprehensive guide, we've walked through creating a dataset, fine-tuning a model, and using the fine-tuned model with the help of Unsloth. By following this guide, you can navigate through the intricacies of data scraping, dataset creation, and model fine-tuning with ease, much like Alice's adventure in Wonderland. Happy exploring!
Complete Code Notebook
Here’s the complete code notebook for your reference:
# Step 1: Setting Up the Environment
!pip install beautifulsoup4 gitpython huggingface_hub datasets requests
# Step 2: Cloning and Pulling the Repository
import os
import json
import requests
from bs4 import BeautifulSoup
from git import Repo
from huggingface_hub import HfApi
from datasets import Dataset, DatasetDict
def verbose_print(message):
print(f"[INFO] {message}")
def clone_or_pull_repo(repo_url, repo_name):
if os.path.exists(repo_name):
verbose_print(f"Repository {repo_name} already exists. Pulling latest changes.")
repo = Repo(repo_name)
repo.remotes.origin.pull()
else:
verbose_print(f"Cloning repository from {repo_url}")
Repo.clone_from(repo_url, repo_name)
def extract_markdown_files(repo_path):
verbose_print(f"Extracting Markdown files from {repo_path}")
markdown_files = []
for root, dirs, files in os.walk(repo_path):
for file in files:
if file.endswith(".md"):
markdown_files.append(os.path.join(root, file))
return markdown_files
# Step 3: Parsing and Scraping Content
def parse_markdown(file_path):
verbose_print(f"Parsing Markdown file {file_path}")
with open(file_path, 'r') as file:
content = file.read()
sections = content.split('\n## ')
parsed_sections = [section.replace('\n', ' ') for section in sections]
return parsed_sections
def get_page_links(base_url, link_selector):
verbose_print(f"Getting page links from {base_url}")
response = requests.get(base_url)
soup = BeautifulSoup(response.content, 'html.parser')
page_links = []
for link in soup.select(link_selector):
href = link['href']
if not href.startswith('http') and href != '#':
href = base_url.rstrip('/') + '/' + href.lstrip('/')
page_links.append(href)
return page_links
def scrape_page(url, content_selector):
verbose_print(f"Scraping content from {url}")
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
page_content = []
main_content = soup.select_one(content_selector)
if main_content:
sections = main_content.find_all(['h1', 'h2', 'h3', 'p', 'pre'])
for section in sections:
page_content.append(section.text)
return page_content
# Step 4: Creating and Saving the Dataset
def create_dataset(repo_url, doc_urls):
dataset = []
# Scrape GitHub repository
repo_name = repo_url.split('/')[-1].replace('.git', '')
clone_or_pull_repo(repo_url, repo_name)
markdown_files = extract_markdown_files(repo_name)
for md_file in markdown_files:
sections = parse_markdown(md_file)
for section in sections:
dataset.append({
'source': 'GitHub',
'repository': repo_name,
'file': md_file,
'label': 'autogen',
'content': section
})
# Scrape documentation site
for doc_url, link_selector, content_selector in doc_urls:
page_links = get_page_links(doc_url, link_selector)
for page_url in page_links:
page_content = scrape_page(page_url, content_selector)
for section in page_content:
dataset.append({
'source': 'Documentation',
'url': page_url,
'label': 'autogen',
'content': section
})
return dataset
def load_dataset_locally(file_path):
if os.path.exists(file_path):
verbose_print(f"Loading existing dataset from {file_path}")
with open(file_path, 'r') as file:
return json.load(file)
verbose_print(f"No existing dataset found at {file_path}")
return []
def save_dataset_locally(dataset, output_file):
verbose_print(f"Saving dataset to {output_file}")
with open(output_file, 'w') as file:
json.dump(dataset, file, indent=4)
verbose_print("Dataset saved successfully")
# Step 5: Uploading to Hugging Face
def upload_to_huggingface(dataset, repo_id):
token = os.getenv("HF_TOKEN")
verbose_print(f"Uploading dataset to Hugging Face with repository ID {repo_id}")
hf_api = HfApi()
    # exist_ok=True so re-running the script doesn't fail if the repo already exists
    hf_api.create_repo(repo_id, token=token, repo_type="dataset", private=False, exist_ok=True)
    # Create a DatasetDict and push to hub
    dataset_dict = DatasetDict({"train": Dataset.from_list(dataset)})
dataset_dict.push_to_hub(repo_id, token=token)
verbose_print(f"Dataset uploaded to Hugging Face with repository ID {repo_id}")
# Example Usage
repo_url = 'https://github.com/microsoft/autogen.git'
doc_urls = [
('https://microsoft.github.io/autogen/docs/', 'a[href]', 'div.md-content'),
('https://microsoft.github.io/autogen/docs/Examples', 'a[href]', 'div.md-content'),
('https://microsoft.github.io/autogen/docs/notebooks', 'a[href]', 'div.md-content'),
('https://microsoft.github.io/autogen/blog', 'a[href]', 'div.blog-content')
]
output_file = 'autogen_python_dataset.json'
repo_id = 'dimentox/autogen-python'
verbose_print("Starting dataset creation process")
existing_dataset = load_dataset_locally(output_file)
new_dataset = create_dataset(repo_url, doc_urls)
combined_dataset = existing_dataset + new_dataset
save_dataset_locally(combined_dataset, output_file)
upload_to_huggingface(combined_dataset, repo_id)
verbose_print("Dataset creation and upload process completed")
# Step 6: Fine-Tuning the Model
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from datasets import load_dataset
import torch
class AdaptiveTrainer(SFTTrainer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.prev_eval_loss = float('inf')
    def evaluate(self, *args, **kwargs):
        # Trainer.evaluate returns a metrics dict that includes "eval_loss".
        metrics = super().evaluate(*args, **kwargs)
        current_eval_loss = metrics.get("eval_loss", self.prev_eval_loss)
        # Adaptive learning rate adjustment (note: this updates the stored
        # TrainingArguments value; an already-built scheduler still drives the
        # optimizer's actual learning rate).
        if current_eval_loss > self.prev_eval_loss:
            self.args.learning_rate *= 0.9  # Reduce learning rate if loss increased
            print(f"Decreased learning rate to: {self.args.learning_rate}")
        else:
            self.args.learning_rate *= 1.05  # Slightly increase if loss decreased
            print(f"Increased learning rate to: {self.args.learning_rate}")
        self.prev_eval_loss = current_eval_loss
        return metrics
def training_step(self, *args, **kwargs):
# Adjust gradient clipping based on gradient norms
if self.state.global_step > 0 and self.state.global_step % self.args.eval_steps == 0:
current_grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
print(f"Adjusted gradient clipping to: {current_grad_norm}")
return super().training_step(*args, **kwargs)
def print_memory_stats(stage):
gpu_stats = torch.cuda.get_device_properties(0)
used_memory = round(torch.cuda.memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"[{stage}] GPU: {gpu_stats.name}, Memory Reserved: {used_memory} GB / {max_memory} GB")
max_seq_length = 2048
dtype = None
load_in_4bit = True
print("Loading model")
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
token="token"
)
print("Loading Laura")
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
lora_alpha=16,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=False,
loftq_config=None,
)
print("Loading dataset")
dataset_path = "autogen_python_dataset.json"
dataset = load_dataset("json", data_files=dataset_path, split="train")
custom_prompt = """Source: {}
Repository: {}
File: {}
Label: {}
Content: {}
"""
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
sources = examples["source"]
repositories = examples["repository"]
files = examples["file"]
labels = examples["label"]
contents = examples["content"]
texts = []
for source, repository, file, label, content in zip(sources, repositories, files, labels, contents):
text = custom_prompt.format(source, repository, file, label, content) + EOS_TOKEN
texts.append(text)
return {"text": texts}
dataset = dataset.map(formatting_prompts_func, batched=True)
trainer = AdaptiveTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=max_seq_length,
dataset_num_proc=2,
packing=False,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_steps=5,
num_train_epochs=1,
learning_rate=2e-4,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=1,
optim="adamw_8bit",
weight_decay=0.01,
lr_scheduler_type="linear",
seed=3407,
output_dir="outputs",
save_strategy="steps",
save_steps=50,
eval_steps=1,
),
)
print_memory_stats("Before Training")
# Note: resume_from_checkpoint=True requires an existing checkpoint under
# output_dir; on a first run, call trainer.train() without the argument.
trainer_stats = trainer.train(resume_from_checkpoint=True)
print_memory_stats("After Training")
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")
# Step 7: Using the Fine-Tuned Model
from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True
print("Loading fine-tuned model")
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="lora_model",
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
token="TOKEN"
)
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
"""
<s>
Q: What is the capital of France?
A:
"""
],
return_tensors="pt"
).to("cuda")
text_streamer = TextStreamer(tokenizer)
outputs = model.generate(**inputs, streamer=text_streamer, max_new_tokens=64)
print(tokenizer.batch_decode(outputs))