Fine-tuned Llama 2 7B on a processed OpenOrca dataset (Ayush2312/deduplicated_orca_post_processed).
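The training script itself isn't shown here, so below is only a minimal sketch of the fine-tuning call, assuming trl's `SFTTrainer` (the ~0.7-era API, where `dataset_text_field` and `max_seq_length` are passed directly) and the final dataset produced by the processing steps that follow. The base model id, sequence length, and hyperparameters are illustrative, not the values actually used.

```
# Sketch of the fine-tuning step (assumed trl ~0.7 API; hyperparameters illustrative)
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer

train_dataset = load_dataset("Ayush2312/deduplicated_orca_post_processed", split="train")

trainer = SFTTrainer(
    model="meta-llama/Llama-2-7b-hf",        # base model (gated on the Hub)
    train_dataset=train_dataset,
    dataset_text_field="formatted_example",  # prompt column built in step 2 below
    max_seq_length=2048,                     # illustrative value
    args=TrainingArguments(
        output_dir="llama2-7b-orca",
        per_device_train_batch_size=4,
        num_train_epochs=1,
    ),
)
trainer.train()
```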

Data processing:

1. Remove examples whose response is shorter than 100 tokens.
2. Deduplicate examples whose TF-IDF cosine similarity exceeds a 0.95 threshold (see the short illustration after this list).
3. Format each example into an instruction/input/response prompt, filling empty system prompts with a default instruction.
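
To make the 0.95 threshold concrete, here is a small illustration of the dedup criterion on two made-up near-duplicate prompts; with sklearn's default TF-IDF settings the pair scores roughly 0.98 and would therefore be dropped:

```
# Illustration of the dedup criterion on two made-up near-duplicates
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

a = "Translate the sentence to French: The cat sat on the mat."
b = "Translate the sentence to French: The cat sat on a mat."
m = TfidfVectorizer().fit_transform([a, b])
# Prints roughly 0.98: above the 0.95 threshold, so the later copy is dropped
print(cosine_similarity(m[0], m[1])[0, 0])
```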

Python code for data processing:

Step 1: filter short responses and deduplicate.

```
from datasets import load_dataset, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the source dataset from Hugging Face
dataset = load_dataset("Ayush2312/orca-1m-gpt4", split='train[:7000]')

# Filter out examples with fewer than 100 tokens in the response
# (tokens approximated by whitespace splitting)
filtered_texts = []
for i, response in enumerate(dataset['response']):
    if len(response.split()) >= 100:
        filtered_texts.append({'system_prompt': dataset['system_prompt'][i],
                               'question': dataset['question'][i],
                               'response': response})

# TF-IDF vectorization over the concatenated fields, for deduplication
texts = [text['system_prompt'] + ' ' + text['question'] + ' ' + text['response'] for text in filtered_texts]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)

# Pairwise cosine similarity between all examples
cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Mark every example that is > 0.95 similar to an earlier, kept example
deduplicated_indices = set()
for i in range(len(cos_sim_matrix)):
    if i not in deduplicated_indices:
        for j in range(i + 1, len(cos_sim_matrix)):
            if cos_sim_matrix[i, j] > 0.95:
                deduplicated_indices.add(j)

# Build a new dataset from the surviving examples
deduplicated_texts = [filtered_texts[i] for i in range(len(filtered_texts)) if i not in deduplicated_indices]
deduplicated_texts_dict = {key: [item[key] for item in deduplicated_texts] for key in filtered_texts[0].keys()}
deduplicated_dataset = Dataset.from_dict(deduplicated_texts_dict)

# Publish the deduplicated dataset on Hugging Face
deduplicated_dataset.push_to_hub("deduplicated_orca_processed")
```
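
One design note: `cosine_similarity(tfidf_matrix, tfidf_matrix)` materializes the full 7,000 × 7,000 dense matrix (~0.4 GB of float64), which is fine for this slice but won't scale to the full dataset. A chunked variant of the same marking loop, sketched below, keeps memory bounded while producing the same `deduplicated_indices`; the chunk size of 500 is arbitrary.

```
# Chunked alternative to the full similarity matrix (same dedup semantics);
# `chunk` is an arbitrary block size chosen for illustration
import numpy as np

deduplicated_indices = set()
chunk = 500
for start in range(0, tfidf_matrix.shape[0], chunk):
    # Similarities between this block of rows and all examples
    block = cosine_similarity(tfidf_matrix[start:start + chunk], tfidf_matrix)
    for local_i, row in enumerate(block):
        i = start + local_i
        if i in deduplicated_indices:
            continue  # already marked as a duplicate of an earlier example
        for j in np.where(row > 0.95)[0]:
            if j > i:
                deduplicated_indices.add(int(j))
```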

Step 2: format each example into an instruction/input/response prompt.

```
from datasets import Dataset, load_dataset

# Load the first 1,000 examples of the deduplicated dataset;
# slicing a split returns a plain dict of columns
dataset = load_dataset("Ayush2312/deduplicated_orca_processed")['train'][:1000]

# Default instruction used when the system prompt is empty
default_instruction = "### Instruction: Below is a conversation between a human and an AI agent. Write a summary of the conversation."

# Format one example into instruction / input / response sections
def format_example(example):
    input_text = "### Input:\n " + example["question"]
    response_text = example["response"] if "response" in example else ""
    instruction = "### Instruction: " + example["system_prompt"]
    if not example["system_prompt"].strip():
        instruction = default_instruction  # fall back to the default instruction
    return {
        "formatted_example": f"{instruction}\n\n{input_text}\n\n### Response:\n{response_text}"
    }

# Convert the column dict back to a Dataset object
dataset = Dataset.from_dict(dataset)

# Apply the formatting function to every example
formatted_dataset = dataset.map(format_example)

# Upload the new dataset to Hugging Face
formatted_dataset.push_to_hub("deduplicated_orca_post_processed")
```
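
As a quick sanity check, printing one record shows the template the trainer will see (placeholders in angle brackets stand for the actual field values):

```
# Inspect one formatted record
print(formatted_dataset[0]["formatted_example"])
# ### Instruction: <system_prompt, or the default summarization instruction>
#
# ### Input:
#  <question>
#
# ### Response:
# <response>
```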