demand-forecasting

Running

App Files Files Community

robertselvam committed on May 1

Commit

0e3fc88

•

1 Parent(s): a5cd68d

Create app.py

Browse files

Files changed (1) hide show

app.py +423 -0

app.py ADDED Viewed

	@@ -0,0 +1,423 @@

+import pandas as pd
+from io import StringIO
+import pandas as pd
+import numpy as np
+import xgboost as xgb
+from math import sqrt
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import train_test_split
+import plotly.express as px
+import logging
+from datetime import datetime
+import plotly.graph_objects as go
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import pyplot
+import whisper
+from openai import AzureOpenAI
+import json
+import re
+import gradio as gr
+# Configure logging
+logging.basicConfig(
+    filename='demand_forecasting.log',  # You can adjust the log file name here
+    filemode='a',
+    format='[%(asctime)s] [%(levelname)s] [%(filename)s] [%(lineno)s:%(funcName)s()] %(message)s',
+    datefmt='%Y-%b-%d %H:%M:%S'
+)
+LOGGER = logging.getLogger(__name__)
+log_level_env = 'INFO'  # You can adjust the log level here
+log_level_dict = {
+    'DEBUG': logging.DEBUG,
+    'INFO': logging.INFO,
+    'WARNING': logging.WARNING,
+    'ERROR': logging.ERROR,
+    'CRITICAL': logging.CRITICAL
+}
+if log_level_env in log_level_dict:
+    log_level = log_level_dict[log_level_env]
+else:
+    log_level = log_level_dict['INFO']
+LOGGER.setLevel(log_level)
+class DemandForecasting:
+    def __init__(self):
+        self.client = AzureOpenAI()
+        self.whisper_model = whisper.load_model("medium.en")
+    def get_column(self,train_csv_path: str):
+        # Load the training data from the specified CSV file
+        train_df = pd.read_csv(train_csv_path)
+        column_names = train_df.columns.tolist()
+        return column_names
+    def load_data(self, train_csv_path: str) -> pd.DataFrame:
+        """
+        Load training data from a CSV file.
+        Args:
+            train_csv_path (str): Path to the training CSV file.
+        Returns:
+            pd.DataFrame: DataFrame containing the training data.
+        """
+        try:
+            # Load the training data from the specified CSV file
+            train_df = pd.read_csv(train_csv_path)
+            # Return a tuple containing the training DataFrame
+            return train_df
+        except Exception as e:
+            # Log an error message if an exception occurs during data loading
+            LOGGER.error(f"Error loading data: {e}")
+            # Return None
+            return None
+    def find_date_column(self, df_data: pd.DataFrame, list_columns: list) -> str:
+        """
+        Find the column containing date information from the list of columns.
+        Args:
+        - df_data (pd.DataFrame): Input DataFrame.
+        - list_columns (list): List of column names to search for date information.
+        Returns:
+        - str: Name of the column containing date information.
+        """
+        for column in list_columns:
+            # Check if the column contains date-like values
+            try:
+                pd.to_datetime(df_data[column])
+                return column
+            except ValueError:
+                pass
+        # Return None if no date column is found
+        return None
+    def preprocess_data(self, df_data: pd.DataFrame, list_columns) -> pd.DataFrame:
+        """
+        Preprocess the input DataFrame.
+        Args:
+        - df_data (pd.DataFrame): Input DataFrame to preprocess.
+        Returns:
+        - pd.DataFrame: Preprocessed DataFrame.
+        """
+        try:
+            print(type(list_columns))
+            # Make a copy of the input DataFrame to avoid modifying the original data
+            df_data = df_data.copy()
+            list_columns.append(target_column)
+            # Drop columns not in list_columns
+            columns_to_drop = [col for col in df_data.columns if col not in list_columns]
+            df_data.drop(columns=columns_to_drop, inplace=True)
+            # Find the date column
+            date_column = self.find_date_column(df_data, list_columns)
+            if date_column is None:
+                raise ValueError("No date column found in the provided list of columns.")
+            # Parse date information
+            df_data[date_column] = pd.to_datetime(df_data[date_column])     # Convert 'date' column to datetime format
+            df_data['day'] = df_data[date_column].dt.day        # Extract day of the month
+            df_data['month'] = df_data[date_column].dt.month         # Extract month
+            df_data['year'] = df_data[date_column].dt.year        # Extract year
+            # Cyclical Encoding for Months
+            df_data['month_sin'] = np.sin(2 * np.pi * df_data['month'] / 12)   # Cyclical sine encoding for month
+            df_data['month_cos'] = np.cos(2 * np.pi * df_data['month'] / 12)   # Cyclical cosine encoding for month
+            # Day of the Week
+            df_data['day_of_week'] = df_data[date_column].dt.weekday      # Extract day of the week (0 = Monday, 6 = Sunday)
+            # Week of the Year
+            df_data['week_of_year'] = df_data[date_column].dt.isocalendar().week.astype(int)   # Extract week of the year as integer
+            df_data.drop(columns=[date_column], inplace=True)
+            print("df_data", df_data)
+            return df_data
+        except Exception as e:
+            # Log an error message if an exception occurs during data preprocessing
+            LOGGER.error(f"Error preprocessing data: {e}")
+            # Return None in case of an error
+            return None
+    def train_model(self, train: pd.DataFrame, target_column, list_columns) -> tuple:
+        """
+        Train an XGBoost model using the provided training data.
+        Args:
+        - train (pd.DataFrame): DataFrame containing training data.
+        Returns:
+        - tuple: A tuple containing the trained model, true validation labels, and predicted validation labels.
+        """
+        try:
+            # Extract features and target variable
+            X = train.drop(columns=[target_column])
+            y = train[target_column]
+            # Cannot use cross validation because it will use future data
+            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=333)
+            # Convert data into DMatrix format for XGBoost
+            dtrain = xgb.DMatrix(X_train, label=y_train)
+            dval = xgb.DMatrix(X_val, label=y_val)
+            # Parameters for XGBoost
+            param = {
+                'max_depth': 9,
+                'eta': 0.3,
+                'objective': 'reg:squarederror'
+            }
+            num_round = 60
+            # Train the model
+            model_xgb = xgb.train(param, dtrain, num_round)
+            # Validate the model
+            y_val_pred = model_xgb.predict(dval)     # Predict validation set labels
+            # Calculate mean squared error
+            mse = mean_squared_error(y_val, y_val_pred)
+            # Print validation RMSE
+            validation = f"Validation RMSE: {np.sqrt(mse)}"
+            # Return trained model, true validation labels, and predicted validation labels
+            return model_xgb, y_val, y_val_pred, validation
+        except Exception as e:
+            # Log an error message if an exception occurs during model training
+            LOGGER.error(f"Error training model: {e}")
+            # Return None for all outputs in case of an error
+            return None, None, None
+    def plot_evaluation_interactive(self, y_true: np.ndarray, y_pred: np.ndarray, title: str) -> None:
+        """
+        Plot interactive evaluation using Plotly.
+        Args:
+        - y_true (np.ndarray): True values.
+        - y_pred (np.ndarray): Predicted values.
+        - title (str): Title of the plot.
+        """
+        try:
+            # Create a scatter plot using Plotly
+            fig = px.scatter(x=y_true, y=y_pred, labels={'x': 'True Values', 'y': 'Predictions'}, title=title, color_discrete_map={'': 'purple'})
+            fig.show()
+            return fig
+        except Exception as e:
+            # Log an error message if an exception occurs during plot generation
+            LOGGER.error(f"Error plotting evaluation: {e}")
+    def predict_sales_for_date(self, input_data, model: xgb.Booster) -> float:
+        """
+        Predict the sales for a specific date using the trained model.
+        Args:
+        - date_input (str): Date for which sales prediction is needed (in 'YYYY-MM-DD' format).
+        - model (xgb.Booster): Trained XGBoost model.
+        - features (pd.DataFrame): DataFrame containing features for the date.
+        Returns:
+        - float: Predicted sales value.
+        """
+        try:
+            input_features = pd.DataFrame([input_data])
+            # Regular expression pattern for date in the format 'dd-mm-yyyy'
+            for key, value in input_data.items():
+                if isinstance(value, str) and re.match(r'\d{2}-\d{2}-\d{4}', value):
+                  date_column = key
+            if date_column:
+                # # Assuming date_input is a datetime object
+                date_input = pd.to_datetime(input_features[date_column])
+                # Extract day of the month
+                input_features['day'] = date_input.dt.day
+                # Extract month
+                input_features['month'] = date_input.dt.month
+                # Extract year
+                input_features['year'] = date_input.dt.year
+                # Cyclical sine encoding for month
+                input_features['month_sin'] = np.sin(2 * np.pi * input_features['month'] / 12)
+                # Cyclical cosine encoding for month
+                input_features['month_cos'] = np.cos(2 * np.pi * input_features['month'] / 12)
+                # Extract day of the week (0 = Monday, 6 = Sunday)
+                input_features['day_of_week'] = date_input.dt.weekday
+                # Extract week of the year as integer
+                input_features['week_of_year'] = date_input.dt.isocalendar().week
+            input_features.drop(columns=[date_column], inplace=True)
+            # Convert input features to DMatrix format
+            dinput = xgb.DMatrix(input_features)
+            # Make predictions using the trained model
+            predicted_sales = model.predict(dinput)[0]
+            # Print the predicted sales value
+            predicted_result = f"""{input_data[str(date_column)]}Predicted Value Is {predicted_sales}"""
+            # Return the predicted sales value
+            return predicted_result
+        except Exception as e:
+            # Log an error message if an exception occurs during sales prediction
+            LOGGER.error(f"Error predicting sales: {e}")
+            # Return None in case of an error
+            return None
+    def audio_to_text(self, audio_path):
+        """
+        transcribe the audio to text.
+        """
+        result = self.whisper_model.transcribe(audio_path)
+        print("audio_to_text",result["text"])
+        return result["text"]
+    def parse_text(self, text, column_list):
+        # Define the prompt or input for the model
+        conversation =[{"role": "system", "content": ""},
+        {"role": "user", "content":f""" extract the {column_list}. al
+        l values should be intiger data type. if date in there the format is dd-mm-YYYY.
+        text```{text}```
+        return result should be in JSON format:
+        """
+        }]
+        # Generate a response from the GPT-3 model
+        chat_completion = self.client.chat.completions.create(
+            model = "GPT-3",
+            messages = conversation,
+            max_tokens=500,
+            temperature=0,
+            n=1,
+            stop=None,
+        )
+        # Extract the generated text from the API response
+        generated_text = chat_completion.choices[0].message.content
+        # Assuming jsonString is your JSON string
+        json_data = json.loads(generated_text)
+        print("parse_text",json_data)
+        return json_data
+    def main(self, train_csv_path: str, audio_path, target_column, column_list) -> None:
+        """
+        Main function to execute the demand forecasting pipeline.
+        Args:
+        - train_csv_path (str): Path to the training CSV file.
+        - date (str): Date for which sales prediction is needed (in 'YYYY-MM-DD' format).
+        """
+        try:
+            # Split the string by comma and convert it into a list
+            column_list = column_list.split(", ")
+            print("train_csv_path", train_csv_path)
+            print("audio_path", audio_path)
+            print("column_list", column_list)
+            print("target_column", target_column)
+            text = self.audio_to_text(audio_path)
+            input_data = self.parse_text(text, column_list)
+            #load data
+            train_data = self.load_data(train_csv_path)
+            #preprocess the train data
+            train_df = self.preprocess_data(train_data, column_list)
+            # Train model and get validation predictions
+            trained_model, y_val, y_val_pred, validation = self.train_model(train_df, target_column, column_list)
+            # Plot interactive evaluation for training
+            plot = self.plot_evaluation_interactive(y_val, y_val_pred, title='Validation Set Evaluation')
+            # Predict sales for the specified date using the trained model
+            predicted_value = self.predict_sales_for_date(input_data, trained_model)
+            return plot, predicted_value, validation
+        except Exception as e:
+            # Log an error message if an exception occurs in the main function
+            LOGGER.error(f"Error in main function: {e}")
+    def gradio_interface(self):
+        with gr.Blocks(css="style.css", theme="freddyaboulton/test-blue") as demo:
+            gr.HTML("""<center><h1 style="color:#fff">Demand Forecasting</h1></center>""")
+            with gr.Row():
+                with gr.Column(scale=0.50):
+                    train_csv = gr.File(elem_classes="uploadbutton")
+                with gr.Column(scale=0.50):
+                    column_list = gr.Textbox(label="Column List")
+            with gr.Row():
+                with gr.Column(scale=0.50):
+                    audio_path = gr.Audio(sources=["microphone"], type="filepath")
+            with gr.Row():
+                with gr.Column(scale=0.50):
+                    selected_column = gr.Textbox(label="Select column")
+                with gr.Column(scale=0.50):
+                    target_column = gr.Textbox(label="target column")
+            with gr.Row():
+                  validation = gr.Textbox(label="Validation")
+                  predicted_result = gr.Textbox(label="Predicted Result")
+                  plot = gr.Plot()
+            train_csv.upload(self.get_column, train_csv, column_list)
+            audio_path.stop_recording(self.main, [train_csv, audio_path, target_column, selected_column], [plot, predicted_result, validation])
+        demo.launch(debug=True)
+if __name__ == "__main__":
+    demand = DemandForecasting()
+    demand.gradio_interface()