Ayush2312's picture
Update README.md
acc7224 verified
Llama 2 7B fine-tuned on a processed OpenOrca dataset (Ayush2312/deduplicated_orca_post_processed).
data processing:
1. Remove examples whose response has fewer than 100 tokens (counted as whitespace-separated words).
2. Deduplicate the remaining examples with TF-IDF cosine similarity: an example is dropped when its similarity to an earlier kept example is above 0.95.
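3. Format each example into an `### Instruction` / `### Input` / `### Response` prompt (step 2 below).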
Python code for data processing:
step 1:
```
from datasets import load_dataset, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Pipeline configuration.
SOURCE_REPO = "Ayush2312/orca-1m-gpt4"
SOURCE_SPLIT = "train[:7000]"
TARGET_REPO = "deduplicated_orca_processed"
COLUMNS = ("system_prompt", "question", "response")
MIN_RESPONSE_TOKENS = 100
SIMILARITY_THRESHOLD = 0.95


def filter_long_responses(rows, min_tokens=MIN_RESPONSE_TOKENS):
    """Keep the rows whose response has at least *min_tokens* words.

    "Tokens" here are whitespace-separated words (``str.split()``), not
    model tokens.

    Args:
        rows: Iterable of mappings that provide every key in ``COLUMNS``.
        min_tokens: Minimum number of words a response needs to be kept.

    Returns:
        A list of new dicts restricted to ``COLUMNS``, in input order.
    """
    # Iterating row by row avoids re-reading a whole column for every index.
    return [
        {key: row[key] for key in COLUMNS}
        for row in rows
        if len(row["response"].split()) >= min_tokens
    ]


def select_unique_indices(similarity, threshold=SIMILARITY_THRESHOLD):
    """Greedily choose which rows of a pairwise similarity matrix to keep.

    Rows are visited in order. Every row that is still kept discards each
    later row whose similarity to it is strictly greater than *threshold*,
    so the earliest member of a group of near-duplicates survives. A row
    that was already discarded does not discard anything itself.

    Args:
        similarity: Square matrix of pairwise similarities, indexable as
            ``similarity[i][j]`` (nested sequences or a 2-D array).
        threshold: Similarity above which a later row counts as a duplicate.

    Returns:
        The indices to keep, in ascending order.
    """
    size = len(similarity)
    dropped = set()
    for i in range(size):
        if i in dropped:
            continue
        row = similarity[i]
        dropped.update(j for j in range(i + 1, size) if row[j] > threshold)
    return [i for i in range(size) if i not in dropped]


def deduplicate(examples, threshold=SIMILARITY_THRESHOLD):
    """Drop near-duplicate examples using TF-IDF cosine similarity.

    Args:
        examples: List of dicts that provide every key in ``COLUMNS``.
        threshold: Cosine similarity above which an example is treated as a
            duplicate of an earlier one.

    Returns:
        A new list with the surviving examples, in their original order.
    """
    if len(examples) < 2:
        # Nothing to compare, and TF-IDF cannot be fitted on an empty corpus.
        return list(examples)
    texts = [" ".join(example[key] for key in COLUMNS) for example in examples]
    tfidf_matrix = TfidfVectorizer().fit_transform(texts)
    # NOTE: this is a full n x n comparison, so it only suits modest slices.
    similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return [examples[i] for i in select_unique_indices(similarity, threshold)]


def main():
    """Load the source slice, filter and deduplicate it, then publish it."""
    dataset = load_dataset(SOURCE_REPO, split=SOURCE_SPLIT)
    filtered_texts = filter_long_responses(dataset)
    if not filtered_texts:
        raise SystemExit("No examples passed the length filter; nothing to publish.")
    deduplicated_texts = deduplicate(filtered_texts)
    # Turn the list of row dicts into the column-oriented dict that
    # Dataset.from_dict expects.
    deduplicated_texts_dict = {
        key: [item[key] for item in deduplicated_texts] for key in COLUMNS
    }
    deduplicated_dataset = Dataset.from_dict(deduplicated_texts_dict)
    # Publish the dataset on Hugging Face
    deduplicated_dataset.push_to_hub(TARGET_REPO)


if __name__ == "__main__":
    main()
```
step 2:
```
from datasets import Dataset, load_dataset
# Fetch the published dataset and keep the first 1000 training rows.
# Slicing with [:1000] produces a plain dictionary of column lists, not a
# Dataset object.
train_split = load_dataset("Ayush2312/deduplicated_orca_processed")["train"]
dataset = train_split[:1000]
# Instruction used when an example has no usable system prompt.
default_instruction = "### Instruction: Below is a conversation between a human and an AI agent. Write a summary of the conversation."


def format_example(example):
    """Render one record as an instruction / input / response prompt.

    Args:
        example: Mapping with a "question" entry and optional
            "system_prompt" and "response" entries.

    Returns:
        A dict with the single key "formatted_example" holding the prompt
        text. When "system_prompt" is missing, None, or only whitespace,
        ``default_instruction`` is used instead. A missing or None
        "response" is rendered as an empty string.

    Raises:
        KeyError: If the example has no "question" entry.
    """
    system_prompt = example.get("system_prompt") or ""
    if system_prompt.strip():
        # The prompt is inserted as given; strip() is only the emptiness test.
        instruction = "### Instruction: " + system_prompt
    else:
        instruction = default_instruction
    input_text = f"### Input:\n {example['question']}"
    response_text = example.get("response") or ""
    return {
        "formatted_example": f"{instruction}\n\n{input_text}\n\n### Response:\n{response_text}"
    }
# Wrap the column dictionary back into a Dataset so that map() is available.
dataset = Dataset.from_dict(dataset)
# Run format_example on every row to produce the "formatted_example" text.
formatted_dataset = dataset.map(function=format_example)
# Publish the formatted dataset to the Hugging Face Hub.
formatted_dataset.push_to_hub(repo_id="deduplicated_orca_post_processed")
```