"""Resume category classifier: TF-IDF features fed to an SVM.

On import this module either loads a previously saved model, vectorizer and
label encoder from disk, or trains them on the ``ahmedheakl/resume-atlas``
dataset and saves them. It then exposes two prediction helpers that use
those module-level objects.
"""

import os

import joblib
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

# Define paths for the model, vectorizer, and label encoder
svm_model_path = "svm_resume_model.pkl"
vectorizer_path = "tfidf_vectorizer.pkl"
label_encoder_path = "label_encoder.pkl"

# NOTE(review): this cache location is specific to one Windows machine;
# confirm whether it should be configurable before running elsewhere.
dataset_cache_dir = "C:/Users/dell/.cache/huggingface/datasets"

# Check if models exist and load them; otherwise, train and save
if all(
    os.path.exists(path)
    for path in (svm_model_path, vectorizer_path, label_encoder_path)
):
    # Load the models if they already exist.
    # NOTE: joblib.load unpickles arbitrary objects; only load artifact files
    # that come from a trusted source (e.g. written by this script).
    svm_model = joblib.load(svm_model_path)
    vectorizer = joblib.load(vectorizer_path)
    le = joblib.load(label_encoder_path)
    print("Models loaded from disk.")
else:
    # Load the dataset
    ds = load_dataset('ahmedheakl/resume-atlas', cache_dir=dataset_cache_dir)

    # Create a DataFrame from the 'train' split
    df_train = pd.DataFrame(ds['train'])

    # Initialize the Label Encoder and encode the 'Category' labels
    le = LabelEncoder()
    df_train['Category_encoded'] = le.fit_transform(df_train['Category'])

    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        df_train['Text'],
        df_train['Category_encoded'],
        test_size=0.2,
        random_state=42,
    )

    # Initialize TF-IDF Vectorizer and transform the text data.
    # The vectorizer is fitted on the training texts only; the held-out
    # features are computed but not used further in this section.
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Initialize and train the SVM model (probability=True is required for
    # predict_proba in classify_text_svm_multi)
    svm_model = SVC(probability=True, random_state=42)
    svm_model.fit(X_train_tfidf, y_train)

    # Save the SVM model, TF-IDF vectorizer, and label encoder
    joblib.dump(svm_model, svm_model_path)
    joblib.dump(vectorizer, vectorizer_path)
    joblib.dump(le, label_encoder_path)
    print("Models trained and saved to disk.")


# Single-label classification function
def classify_text_svm(text: str) -> str:
    """Return the single predicted category name for *text*.

    Args:
        text: Raw text to classify.

    Returns:
        The category label, decoded back to its original name by the
        module-level label encoder.
    """
    text_tfidf = vectorizer.transform([text])
    # predict() returns the encoded class label, which the encoder can
    # decode directly.
    predicted_label = svm_model.predict(text_tfidf)[0]
    return le.inverse_transform([predicted_label])[0]


# Multi-label classification function (returning top N predictions based on probabilities)
def classify_text_svm_multi(text: str, top_n: int = 3) -> np.ndarray:
    """Return the *top_n* most probable category names for *text*.

    Args:
        text: Raw text to classify.
        top_n: Maximum number of categories to return. If it exceeds the
            number of classes the model knows, all classes are returned.

    Returns:
        Array of category names ordered from most to least probable.
    """
    text_tfidf = vectorizer.transform([text])
    probabilities = svm_model.predict_proba(text_tfidf)[0]
    # Column positions of the top N probabilities, highest first.
    top_n_columns = np.argsort(probabilities)[::-1][:top_n]
    # predict_proba columns are ordered like svm_model.classes_, so translate
    # column positions to encoded labels before decoding. Using the positions
    # directly is only correct when every encoded label occurred in the
    # training split.
    top_n_labels = svm_model.classes_[top_n_columns]
    return le.inverse_transform(top_n_labels)