Fine-tuned Llama 2 7B on a processed OpenOrca dataset (`Ayush2312/deduplicated_orca_post_processed`).

Data processing:
1. Remove examples whose response is shorter than 100 tokens.
2. Deduplicate near-identical examples using TF-IDF cosine similarity with a 0.95 threshold.

Python code for data processing (a sketch of the fine-tuning setup follows the two steps):

Step 1:
```python
from datasets import load_dataset, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the source dataset from Hugging Face
dataset = load_dataset("Ayush2312/orca-1m-gpt4", split="train[:7000]")

# Filter out examples whose response has fewer than 100 tokens
# (whitespace-separated words are used as a proxy for tokens)
filtered_texts = []
for i, response in enumerate(dataset["response"]):
    if len(response.split()) >= 100:
        filtered_texts.append({
            "system_prompt": dataset["system_prompt"][i],
            "question": dataset["question"][i],
            "response": response,
        })

# TF-IDF vectorization over the concatenated fields for deduplication
texts = [
    text["system_prompt"] + " " + text["question"] + " " + text["response"]
    for text in filtered_texts
]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)

# Pairwise cosine similarity between all examples
cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Mark as duplicates any later examples whose similarity to an earlier,
# kept example exceeds 0.95
duplicate_indices = set()
for i in range(len(cos_sim_matrix)):
    if i not in duplicate_indices:
        for j in range(i + 1, len(cos_sim_matrix)):
            if cos_sim_matrix[i, j] > 0.95:
                duplicate_indices.add(j)

# Create a new dataset with the deduplicated data
deduplicated_texts = [
    filtered_texts[i] for i in range(len(filtered_texts)) if i not in duplicate_indices
]
deduplicated_texts_dict = {
    key: [item[key] for item in deduplicated_texts] for key in filtered_texts[0].keys()
}
deduplicated_dataset = Dataset.from_dict(deduplicated_texts_dict)

# Publish the dataset on Hugging Face
deduplicated_dataset.push_to_hub("deduplicated_orca_processed")
```

Step 2:
```python
from datasets import Dataset, load_dataset

# Load the deduplicated dataset; slicing a Dataset returns a plain dict of
# columns, so it is converted back to a Dataset below
dataset = load_dataset("Ayush2312/deduplicated_orca_processed")["train"][:1000]

# Default instruction used when an example has no system prompt
default_instruction = (
    "### Instruction: Below is a conversation between a human and an AI agent. "
    "Write a summary of the conversation."
)

# Format each example into an instruction/input/response string
def format_example(example):
    input_text = "### Input:\n" + f" {example['question']}"
    response_text = example["response"] if "response" in example else ""
    instruction = "### Instruction: " + example["system_prompt"]
    if not example["system_prompt"].strip():
        # Fill an empty or missing instruction with the default
        instruction = default_instruction
    return {
        "formatted_example": f"{instruction}\n\n{input_text}\n\n### Response:\n{response_text}"
    }

# Convert the column dict back to a Dataset object
dataset = Dataset.from_dict(dataset)

# Apply the formatting function to every example
formatted_dataset = dataset.map(format_example)

# Upload the new dataset to Hugging Face
formatted_dataset.push_to_hub("deduplicated_orca_post_processed")
```
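The fine-tuning run itself is not shown above. As a minimal sketch, assuming the standard `transformers` Trainer API, the `meta-llama/Llama-2-7b-hf` base checkpoint, and causal-LM training on the `formatted_example` column (all hyperparameters below are illustrative assumptions, not the settings actually used):

```python
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Assumed base checkpoint; the exact variant used is not stated above
model_id = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # Llama 2 ships without a pad token
model = AutoModelForCausalLM.from_pretrained(model_id)

dataset = load_dataset("Ayush2312/deduplicated_orca_post_processed", split="train")

def tokenize(example):
    # Train on the full formatted string produced in step 2
    return tokenizer(example["formatted_example"], truncation=True, max_length=1024)

tokenized = dataset.map(tokenize, remove_columns=dataset.column_names)

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="llama2-7b-orca-sft",
        per_device_train_batch_size=4,   # illustrative
        gradient_accumulation_steps=8,   # illustrative
        num_train_epochs=1,              # illustrative
        learning_rate=2e-5,              # illustrative
        bf16=True,
        logging_steps=10,
    ),
    train_dataset=tokenized,
    # mlm=False gives the causal-LM objective; labels are shifted inside the model
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
```

In practice a 7B model usually needs parameter-efficient methods (e.g. LoRA via `peft`) or a multi-GPU setup to fit in memory; the sketch keeps to the stable core API for clarity.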