Ayush2312's picture
Update README.md
acc7224 verified

Llama 2 7B fine-tuned on a processed OpenOrca dataset (Ayush2312/deduplicated_orca_post_processed).

data processing:

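  1. Remove examples whose response has fewer than 100 tokens (counted as whitespace-separated words)
  2. Deduplicate examples using TF-IDF cosine similarity, dropping any example whose similarity to an earlier kept example exceeds 0.95
  3. Python code for the data processing: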

step 1:

from datasets import load_dataset, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Tunable settings for the cleaning pass
MIN_RESPONSE_WORDS = 100      # responses with fewer words than this are dropped
SIMILARITY_THRESHOLD = 0.95   # cosine similarity above this marks a near-duplicate

# Load the first 7,000 training rows of the source dataset from Hugging Face
dataset = load_dataset("Ayush2312/orca-1m-gpt4", split='train[:7000]')

# Keep only rows whose response has at least MIN_RESPONSE_WORDS
# whitespace-separated words (a cheap stand-in for a real token count).
# Each column is read once and the rows are walked in lockstep instead of
# re-indexing dataset[...] for every row.
filtered_texts = [
    {'system_prompt': system_prompt, 'question': question, 'response': response}
    for system_prompt, question, response in zip(
        dataset['system_prompt'], dataset['question'], dataset['response']
    )
    if len(response.split()) >= MIN_RESPONSE_WORDS
]
if not filtered_texts:
    # Fail early with a clear message rather than deep inside the vectorizer
    raise ValueError("No examples left after filtering short responses; nothing to deduplicate.")

# TF-IDF vectorization: one document per example (prompt + question + response)
texts = [
    ' '.join((row['system_prompt'], row['question'], row['response']))
    for row in filtered_texts
]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)

# Pairwise cosine similarity between every pair of remaining examples
cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Greedy near-duplicate removal: walk the examples in order; every example
# that is still kept flags all later examples that are too similar to it.
# Examples already flagged do not flag others.
duplicate_indices = set()
num_examples = len(filtered_texts)
for i in range(num_examples):
    if i in duplicate_indices:
        continue
    for j in range(i + 1, num_examples):
        if cos_sim_matrix[i, j] > SIMILARITY_THRESHOLD:
            duplicate_indices.add(j)

# Rebuild a column-oriented dataset from the surviving examples
deduplicated_texts = [
    row for i, row in enumerate(filtered_texts) if i not in duplicate_indices
]
deduplicated_texts_dict = {
    key: [row[key] for row in deduplicated_texts]
    for key in ('system_prompt', 'question', 'response')
}
deduplicated_dataset = Dataset.from_dict(deduplicated_texts_dict)

# Publish the dataset on Hugging Face
deduplicated_dataset.push_to_hub("deduplicated_orca_processed")

step 2:

from datasets import Dataset, load_dataset

# Fetch the deduplicated dataset from the Hub and keep the first 1,000
# training rows; the slice is a plain dict of column lists, not a Dataset
hub_splits = load_dataset("Ayush2312/deduplicated_orca_processed")
dataset = hub_splits['train'][:1000]

# Instruction line substituted when an example has no system prompt of its own
default_instruction = (
    "### Instruction: Below is a conversation between a human and an AI agent. "
    "Write a summary of the conversation."
)

# Define the function to format each example
def format_example(example, fallback_instruction=None):
    """Render one example as an instruction / input / response prompt.

    Args:
        example: Mapping with a "question" entry and, optionally,
            "system_prompt" and "response" entries.
        fallback_instruction: Complete instruction line to use when the
            example has no usable system prompt. When left as ``None`` the
            module-level ``default_instruction`` is used.

    Returns:
        A dict with the single key "formatted_example" holding the prompt text.
    """
    # A missing key or a None value is treated the same as an empty prompt,
    # so such rows fall through to the fallback instead of raising.
    system_prompt = example.get("system_prompt") or ""
    # A missing response renders as an empty response section
    response_text = example.get("response", "")

    # The question is indented by one space under the input header
    input_text = f"### Input:\n {example['question']}"

    if system_prompt.strip():
        instruction = "### Instruction: " + system_prompt
    elif fallback_instruction is not None:
        instruction = fallback_instruction
    else:
        # Fill empty or missing instruction with the script-wide default
        instruction = default_instruction

    return {
        "formatted_example": f"{instruction}\n\n{input_text}\n\n### Response:\n{response_text}"
    }
# The sliced rows are a plain dict of columns; wrap them in a Dataset again
dataset = Dataset.from_dict(dataset)

# Run format_example over every row to produce the "formatted_example" column
formatted_dataset = dataset.map(function=format_example)

# Publish the formatted dataset to the Hugging Face Hub
formatted_dataset.push_to_hub(repo_id="deduplicated_orca_post_processed")