import pandas as pd from io import StringIO import pandas as pd import numpy as np import xgboost as xgb from math import sqrt from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split import plotly.express as px import logging from datetime import datetime import plotly.graph_objects as go import numpy as np import matplotlib.pyplot as plt from matplotlib import pyplot import whisper from openai import AzureOpenAI import json import re import gradio as gr # Configure logging logging.basicConfig( filename='demand_forecasting.log', # You can adjust the log file name here filemode='a', format='[%(asctime)s] [%(levelname)s] [%(filename)s] [%(lineno)s:%(funcName)s()] %(message)s', datefmt='%Y-%b-%d %H:%M:%S' ) LOGGER = logging.getLogger(__name__) log_level_env = 'INFO' # You can adjust the log level here log_level_dict = { 'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR, 'CRITICAL': logging.CRITICAL } if log_level_env in log_level_dict: log_level = log_level_dict[log_level_env] else: log_level = log_level_dict['INFO'] LOGGER.setLevel(log_level) class DemandForecasting: def __init__(self): self.client = AzureOpenAI() self.whisper_model = whisper.load_model("medium.en") def get_column(self,train_csv_path: str): # Load the training data from the specified CSV file train_df = pd.read_csv(train_csv_path) column_names = train_df.columns.tolist() return column_names def load_data(self, train_csv_path: str) -> pd.DataFrame: """ Load training data from a CSV file. Args: train_csv_path (str): Path to the training CSV file. Returns: pd.DataFrame: DataFrame containing the training data. """ try: # Load the training data from the specified CSV file train_df = pd.read_csv(train_csv_path) # Return a tuple containing the training DataFrame return train_df except Exception as e: # Log an error message if an exception occurs during data loading LOGGER.error(f"Error loading data: {e}") # Return None return None def find_date_column(self, df_data: pd.DataFrame, list_columns: list) -> str: """ Find the column containing date information from the list of columns. Args: - df_data (pd.DataFrame): Input DataFrame. - list_columns (list): List of column names to search for date information. Returns: - str: Name of the column containing date information. """ for column in list_columns: # Check if the column contains date-like values try: pd.to_datetime(df_data[column]) return column except ValueError: pass # Return None if no date column is found return None def preprocess_data(self, df_data: pd.DataFrame, list_columns) -> pd.DataFrame: """ Preprocess the input DataFrame. Args: - df_data (pd.DataFrame): Input DataFrame to preprocess. Returns: - pd.DataFrame: Preprocessed DataFrame. """ try: print(type(list_columns)) # Make a copy of the input DataFrame to avoid modifying the original data df_data = df_data.copy() list_columns.append(target_column) # Drop columns not in list_columns columns_to_drop = [col for col in df_data.columns if col not in list_columns] df_data.drop(columns=columns_to_drop, inplace=True) # Find the date column date_column = self.find_date_column(df_data, list_columns) if date_column is None: raise ValueError("No date column found in the provided list of columns.") # Parse date information df_data[date_column] = pd.to_datetime(df_data[date_column]) # Convert 'date' column to datetime format df_data['day'] = df_data[date_column].dt.day # Extract day of the month df_data['month'] = df_data[date_column].dt.month # Extract month df_data['year'] = df_data[date_column].dt.year # Extract year # Cyclical Encoding for Months df_data['month_sin'] = np.sin(2 * np.pi * df_data['month'] / 12) # Cyclical sine encoding for month df_data['month_cos'] = np.cos(2 * np.pi * df_data['month'] / 12) # Cyclical cosine encoding for month # Day of the Week df_data['day_of_week'] = df_data[date_column].dt.weekday # Extract day of the week (0 = Monday, 6 = Sunday) # Week of the Year df_data['week_of_year'] = df_data[date_column].dt.isocalendar().week.astype(int) # Extract week of the year as integer df_data.drop(columns=[date_column], inplace=True) print("df_data", df_data) return df_data except Exception as e: # Log an error message if an exception occurs during data preprocessing LOGGER.error(f"Error preprocessing data: {e}") # Return None in case of an error return None def train_model(self, train: pd.DataFrame, target_column, list_columns) -> tuple: """ Train an XGBoost model using the provided training data. Args: - train (pd.DataFrame): DataFrame containing training data. Returns: - tuple: A tuple containing the trained model, true validation labels, and predicted validation labels. """ try: # Extract features and target variable X = train.drop(columns=[target_column]) y = train[target_column] # Cannot use cross validation because it will use future data X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=333) # Convert data into DMatrix format for XGBoost dtrain = xgb.DMatrix(X_train, label=y_train) dval = xgb.DMatrix(X_val, label=y_val) # Parameters for XGBoost param = { 'max_depth': 9, 'eta': 0.3, 'objective': 'reg:squarederror' } num_round = 60 # Train the model model_xgb = xgb.train(param, dtrain, num_round) # Validate the model y_val_pred = model_xgb.predict(dval) # Predict validation set labels # Calculate mean squared error mse = mean_squared_error(y_val, y_val_pred) # Print validation RMSE validation = f"Validation RMSE: {np.sqrt(mse)}" # Return trained model, true validation labels, and predicted validation labels return model_xgb, y_val, y_val_pred, validation except Exception as e: # Log an error message if an exception occurs during model training LOGGER.error(f"Error training model: {e}") # Return None for all outputs in case of an error return None, None, None def plot_evaluation_interactive(self, y_true: np.ndarray, y_pred: np.ndarray, title: str) -> None: """ Plot interactive evaluation using Plotly. Args: - y_true (np.ndarray): True values. - y_pred (np.ndarray): Predicted values. - title (str): Title of the plot. """ try: # Create a scatter plot using Plotly fig = px.scatter(x=y_true, y=y_pred, labels={'x': 'True Values', 'y': 'Predictions'}, title=title, color_discrete_map={'': 'purple'}) fig.show() return fig except Exception as e: # Log an error message if an exception occurs during plot generation LOGGER.error(f"Error plotting evaluation: {e}") def predict_sales_for_date(self, input_data, model: xgb.Booster) -> float: """ Predict the sales for a specific date using the trained model. Args: - date_input (str): Date for which sales prediction is needed (in 'YYYY-MM-DD' format). - model (xgb.Booster): Trained XGBoost model. - features (pd.DataFrame): DataFrame containing features for the date. Returns: - float: Predicted sales value. """ try: input_features = pd.DataFrame([input_data]) # Regular expression pattern for date in the format 'dd-mm-yyyy' for key, value in input_data.items(): if isinstance(value, str) and re.match(r'\d{2}-\d{2}-\d{4}', value): date_column = key if date_column: # # Assuming date_input is a datetime object date_input = pd.to_datetime(input_features[date_column]) # Extract day of the month input_features['day'] = date_input.dt.day # Extract month input_features['month'] = date_input.dt.month # Extract year input_features['year'] = date_input.dt.year # Cyclical sine encoding for month input_features['month_sin'] = np.sin(2 * np.pi * input_features['month'] / 12) # Cyclical cosine encoding for month input_features['month_cos'] = np.cos(2 * np.pi * input_features['month'] / 12) # Extract day of the week (0 = Monday, 6 = Sunday) input_features['day_of_week'] = date_input.dt.weekday # Extract week of the year as integer input_features['week_of_year'] = date_input.dt.isocalendar().week input_features.drop(columns=[date_column], inplace=True) # Convert input features to DMatrix format dinput = xgb.DMatrix(input_features) # Make predictions using the trained model predicted_sales = model.predict(dinput)[0] # Print the predicted sales value predicted_result = f"""{input_data[str(date_column)]}Predicted Value Is {predicted_sales}""" # Return the predicted sales value return predicted_result except Exception as e: # Log an error message if an exception occurs during sales prediction LOGGER.error(f"Error predicting sales: {e}") # Return None in case of an error return None def audio_to_text(self, audio_path): """ transcribe the audio to text. """ result = self.whisper_model.transcribe(audio_path) print("audio_to_text",result["text"]) return result["text"] def parse_text(self, text, column_list): # Define the prompt or input for the model conversation =[{"role": "system", "content": ""}, {"role": "user", "content":f""" extract the {column_list}. al l values should be intiger data type. if date in there the format is dd-mm-YYYY. text```{text}``` return result should be in JSON format: """ }] # Generate a response from the GPT-3 model chat_completion = self.client.chat.completions.create( model = "GPT-3", messages = conversation, max_tokens=500, temperature=0, n=1, stop=None, ) # Extract the generated text from the API response generated_text = chat_completion.choices[0].message.content # Assuming jsonString is your JSON string json_data = json.loads(generated_text) print("parse_text",json_data) return json_data def main(self, train_csv_path: str, audio_path, target_column, column_list) -> None: """ Main function to execute the demand forecasting pipeline. Args: - train_csv_path (str): Path to the training CSV file. - date (str): Date for which sales prediction is needed (in 'YYYY-MM-DD' format). """ try: # Split the string by comma and convert it into a list column_list = column_list.split(", ") print("train_csv_path", train_csv_path) print("audio_path", audio_path) print("column_list", column_list) print("target_column", target_column) text = self.audio_to_text(audio_path) input_data = self.parse_text(text, column_list) #load data train_data = self.load_data(train_csv_path) #preprocess the train data train_df = self.preprocess_data(train_data, column_list) # Train model and get validation predictions trained_model, y_val, y_val_pred, validation = self.train_model(train_df, target_column, column_list) # Plot interactive evaluation for training plot = self.plot_evaluation_interactive(y_val, y_val_pred, title='Validation Set Evaluation') # Predict sales for the specified date using the trained model predicted_value = self.predict_sales_for_date(input_data, trained_model) return plot, predicted_value, validation except Exception as e: # Log an error message if an exception occurs in the main function LOGGER.error(f"Error in main function: {e}") def gradio_interface(self): with gr.Blocks(css="style.css", theme="freddyaboulton/test-blue") as demo: gr.HTML("""