Spaces:

manasagangotri
/

classify

Running

File size: 2,811 Bytes

e062e72

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import numpy as np
from datasets import load_dataset
import joblib
import os

# Paths of the persisted artifacts: the trained SVM, the fitted TF-IDF
# vectorizer, and the fitted label encoder. They are relative paths, so they
# resolve against the current working directory.
svm_model_path = "svm_resume_model.pkl"
vectorizer_path = "tfidf_vectorizer.pkl"
label_encoder_path = "label_encoder.pkl"

# Reuse the persisted artifacts only when all three are present. If any one is
# missing, everything is retrained so the three objects come from the same fit.
if all(
    os.path.exists(artifact_path)
    for artifact_path in (svm_model_path, vectorizer_path, label_encoder_path)
):
    # NOTE: joblib.load unpickles its input, which can execute arbitrary code.
    # Only load artifacts written by this script, never untrusted files.
    svm_model = joblib.load(svm_model_path)
    vectorizer = joblib.load(vectorizer_path)
    le = joblib.load(label_encoder_path)
    print("Models loaded from disk.")
else:
    # Load the dataset. No cache_dir is passed so the datasets library uses its
    # default cache location; a hard-coded, user-specific Windows path would
    # not exist on other machines.
    ds = load_dataset('ahmedheakl/resume-atlas')

    # Create a DataFrame from the 'train' split
    df_train = pd.DataFrame(ds['train'])

    # Fit the label encoder on the 'Category' column and store the encoded
    # labels next to the original ones.
    le = LabelEncoder()
    df_train['Category_encoded'] = le.fit_transform(df_train['Category'])

    # Hold out 20% of the rows. The held-out part is not used below, but the
    # split is kept (with the same seed) so the model is trained on the same
    # subset as before.
    X_train, X_test, y_train, y_test = train_test_split(
        df_train['Text'], df_train['Category_encoded'], test_size=0.2, random_state=42)

    # Fit the TF-IDF vocabulary (capped at 1000 features) on the training text
    # only, and vectorize it.
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train_tfidf = vectorizer.fit_transform(X_train)

    # probability=True is required for svm_model.predict_proba, which the
    # top-N classification function relies on.
    svm_model = SVC(probability=True, random_state=42)
    svm_model.fit(X_train_tfidf, y_train)

    # Save the SVM model, TF-IDF vectorizer, and label encoder
    joblib.dump(svm_model, svm_model_path)
    joblib.dump(vectorizer, vectorizer_path)
    joblib.dump(le, label_encoder_path)
    print("Models trained and saved to disk.")

# Single-label classification function
def classify_text_svm(text):
    """Return the single predicted category for ``text``.

    The text is vectorized with the module-level TF-IDF ``vectorizer`` and
    classified by ``svm_model``; the predicted encoded label is then mapped
    back to its original category value through the label encoder ``le``.
    """
    # The vectorizer and the model work on batches, so wrap the text in a
    # one-element list and unwrap the single result afterwards.
    features = vectorizer.transform([text])
    encoded_label = svm_model.predict(features)[0]
    return le.inverse_transform([encoded_label])[0]

# Multi-label classification function (returning top N predictions based on probabilities)
def classify_text_svm_multi(text, top_n=3):
    """Return the ``top_n`` most probable categories for ``text``.

    Args:
        text: Raw text to classify. It is vectorized with the module-level
            TF-IDF ``vectorizer`` before being passed to ``svm_model``.
        top_n: Maximum number of categories to return. When it exceeds the
            number of classes the model knows, every class is returned.
            ``None`` also returns every class.

    Returns:
        Array of category labels ordered from most to least probable.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n is not None and top_n < 0:
        # A negative slice bound would silently drop entries from the tail of
        # the ranking instead of limiting its length.
        raise ValueError(f"top_n must be non-negative, got {top_n!r}")

    text_tfidf = vectorizer.transform([text])
    probabilities = svm_model.predict_proba(text_tfidf)[0]

    # Column positions of the highest probabilities, best first.
    top_n_columns = np.argsort(probabilities)[::-1][:top_n]

    # predict_proba columns follow svm_model.classes_, so translate column
    # positions into encoded labels before decoding them. Positions and labels
    # only coincide when every encoded label was present in the training data.
    top_n_labels = svm_model.classes_[top_n_columns]
    top_n_categories = le.inverse_transform(top_n_labels)
    return top_n_categories