assignment / app.py
khalo9747's picture
Update app.py
e5e29ce verified
raw
history blame
11.1 kB
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset from the GitHub link
@st.cache_data
def _fetch_data():
    """Download the cleaned dataset CSV and return it with trimmed column names.

    Download and parsing errors are allowed to propagate. Only successful
    return values end up in Streamlit's data cache, so a failed attempt is
    retried on the next call instead of being remembered.
    """
    url = 'https://raw.githubusercontent.com/khalo9747/data-storage/main/data_cleaned.csv'
    df = pd.read_csv(url)
    # Normalize column names by stripping leading/trailing spaces
    df.columns = df.columns.str.strip()
    return df


def load_data():
    """Return the dashboard dataset, or ``None`` if it could not be loaded.

    The download itself is cached by ``_fetch_data``. The error handling lives
    outside the cached function on purpose: returning ``None`` from inside a
    cached function would store that ``None`` in the cache, so one transient
    network failure would keep the app broken on every later rerun.

    Returns:
        The loaded ``DataFrame``, or ``None`` after reporting the failure to
        the page with ``st.error``.
    """
    try:
        return _fetch_data()
    except Exception as e:
        # Top-level UI boundary: show the problem to the user rather than crash.
        st.error(f"Error loading data: {e}")
        return None
# Function to categorize age into groups
def categorize_age(age):
    """Map an age to its labelled age bracket.

    Args:
        age: Age in years. May be missing (``None`` / NaN).

    Returns:
        One of 'Teens (15-19)', 'Young Adults (20-35)', 'Middle-Aged (36-55)',
        'Seniors (56-69)' or 'Elderly (70+)'; ``None`` when the age is missing
        or below 15.
    """
    # NaN compares False against every threshold below, so without this guard
    # a missing age would fall through to the final branch and be counted as
    # 'Elderly (70+)'. None would raise a TypeError on the first comparison.
    if pd.isna(age):
        return None
    if age < 15:
        return None
    if age <= 19:
        return 'Teens (15-19)'
    if age <= 35:
        return 'Young Adults (20-35)'
    if age <= 55:
        return 'Middle-Aged (36-55)'
    if age <= 69:
        return 'Seniors (56-69)'
    return 'Elderly (70+)'
# Main function to run the Streamlit app
# Code-to-label mapping for the 'Gender' column, shared by every gender chart so
# the pie chart and the account-ownership chart cannot disagree.
# NOTE(review): 1 -> Male / 2 -> Female is the coding the account-ownership chart
# has always applied; confirm it against the dataset's codebook.
_GENDER_LABELS = {1: 'Male', 2: 'Female'}

# Pie-slice colour per gender label; anything unmapped falls back to grey.
_GENDER_PIE_COLORS = {'Female': '#FF9999', 'Male': '#66B3FF'}
_UNKNOWN_PIE_COLOR = '#CCCCCC'

# Age brackets in chronological order (an alphabetical sort would start with 'Elderly').
_AGE_GROUP_ORDER = [
    'Teens (15-19)',
    'Young Adults (20-35)',
    'Middle-Aged (36-55)',
    'Seniors (56-69)',
    'Elderly (70+)',
]


def _has_columns(df, columns):
    """Return True when every name in *columns* is a column of *df*."""
    return all(column in df.columns for column in columns)


def _show_figure(fig):
    """Render *fig* on the page, then close it.

    Figures created through pyplot stay registered until they are closed, and
    Streamlit re-executes the script on every interaction, so unclosed figures
    would pile up in memory.
    """
    st.pyplot(fig)
    plt.close(fig)


def _render_distributions(df):
    """Draw the respondent distribution chosen in the 'Distributions' tab."""
    # Dropdown for selecting distribution type
    st.header('Select Distribution Type')
    distribution_type = st.selectbox(
        'Choose a distribution to display:',
        ('Distribution by Region', 'Distribution by Age Group', 'Distribution by Gender')
    )

    if distribution_type == 'Distribution by Region':
        st.subheader('Distribution of Respondents by Region')
        if 'Region' not in df.columns:
            st.write("'Region' column not found in the dataset.")
            return
        fig, ax = plt.subplots(figsize=(10, 6))
        # Bars are ordered from the most to the least frequent region.
        sns.countplot(data=df, x='Region', order=df['Region'].value_counts().index, ax=ax)
        ax.tick_params(axis='x', labelrotation=45)
        ax.set_title('Distribution of Respondents by Region')
        ax.set_xlabel('Region')
        ax.set_ylabel('Number of Respondents')
        _show_figure(fig)

    elif distribution_type == 'Distribution by Age Group':
        st.subheader('Distribution by Age Group')
        if 'Age' not in df.columns:
            st.write("'Age' column not found in the dataset.")
            return
        # Computed as a standalone Series so the loaded frame is not modified.
        counts = df['Age'].apply(categorize_age).value_counts()
        # Chronological bracket order; any label outside the known list is kept at the end.
        ordered = [group for group in _AGE_GROUP_ORDER if group in counts.index]
        ordered += [group for group in counts.index if group not in ordered]
        age_counts = counts.reindex(ordered)
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(x=age_counts.index, y=age_counts.values, ax=ax)
        ax.set_title('Distribution by Age Group')
        ax.set_xlabel('Age Group')
        ax.set_ylabel('Number of Respondents')
        _show_figure(fig)

    elif distribution_type == 'Distribution by Gender':
        st.subheader('Distribution by Gender')
        if 'Gender' not in df.columns:
            st.write("'Gender' column not found in the dataset.")
            return
        gender_counts = df['Gender'].value_counts()
        # value_counts() sorts by frequency, so each slice is labelled from its
        # own value; positional labels could be attached to the wrong slice.
        gender_labels = [_GENDER_LABELS.get(code, str(code)) for code in gender_counts.index]
        slice_colors = [_GENDER_PIE_COLORS.get(label, _UNKNOWN_PIE_COLOR) for label in gender_labels]
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.pie(gender_counts.values, labels=gender_labels, autopct='%1.1f%%', startangle=140, colors=slice_colors)
        ax.set_title('Distribution by Gender')
        _show_figure(fig)


def _render_customer_segmentation(df):
    """Draw saving and borrowing heatmaps by education level and income quantile."""
    st.subheader("Customer Segmentation")
    required = (
        'Education level',
        'Within-economy household income',
        'Saved in the past year',
        'Borrowed in the past year',
    )
    if not _has_columns(df, required):
        st.write("Required columns for customer segmentation ('Education level', 'Within-economy household income', 'Saved in the past year', 'Borrowed in the past year') are not found in the dataset.")
        return

    # One figure with the two heatmaps stacked vertically.
    fig, (ax_saving, ax_borrowing) = plt.subplots(2, 1, figsize=(12, 12))

    # Mean of the saving indicator per (education level, income quantile) cell.
    saving_by_educ_income = pd.crosstab(df['Education level'], df['Within-economy household income'], values=df['Saved in the past year'], aggfunc='mean')
    sns.heatmap(saving_by_educ_income, annot=True, cmap="Blues", fmt=".2f", linewidths=.5, ax=ax_saving)
    ax_saving.set_title('Proportion of People Who Save by Education Level and Income Quantile')
    ax_saving.set_xlabel('Income Quantile')
    ax_saving.set_ylabel('Education Level')

    # Mean of the borrowing indicator per (education level, income quantile) cell.
    borrowing_by_educ_income = pd.crosstab(df['Education level'], df['Within-economy household income'], values=df['Borrowed in the past year'], aggfunc='mean')
    sns.heatmap(borrowing_by_educ_income, annot=True, cmap="Reds", fmt=".2f", linewidths=.5, ax=ax_borrowing)
    ax_borrowing.set_title('Proportion of People Who Borrow by Education Level and Income Quantile')
    ax_borrowing.set_xlabel('Income Quantile')
    ax_borrowing.set_ylabel('Education Level')

    fig.tight_layout()
    _show_figure(fig)

    st.write("Higher-income individuals tend to save more across all education levels. Those with tertiary education save the most, with savings increasing significantly from low to high income, ranging from 0.58 to 0.77. In contrast, people with primary education save less overall, even as their income rises.")
    st.write("When it comes to borrowing, higher-income individuals also borrow more, particularly those with tertiary education, where borrowing rates reach up to 0.62 for the wealthiest. Borrowing behavior is less predictable, showing that wealthier individuals still use loans, even though they have more savings. Interestingly, lower-educated, higher-income individuals borrow less as income rises, which could suggest limited access to credit in these groups.")


def _render_digital_payments(df):
    """Draw per-region mobile ownership next to per-region digital payment usage."""
    st.subheader("Optimizing Digital Payment Services in Underserved Regions")
    required = ('Region', 'Owns a mobile phone', 'Made or received a digital payment')
    if not _has_columns(df, required):
        st.write("Required columns for optimizing digital payment services ('Region', 'Owns a mobile phone', 'Made or received a digital payment') are not found in the dataset.")
        return

    # Per-region mean of each indicator column.
    mobile_ownership = df.groupby('Region')['Owns a mobile phone'].mean()
    digital_payment = df.groupby('Region')['Made or received a digital payment'].mean()

    # Two bar plots side by side, both on a 0-1 y-axis so they compare directly.
    fig, (ax_mobile, ax_payment) = plt.subplots(1, 2, figsize=(14, 8))

    sns.barplot(x=mobile_ownership.index, y=mobile_ownership.values, palette="Blues_d", ax=ax_mobile)
    ax_mobile.set_title('Proportion of Mobile Ownership by Region', fontsize=14)
    ax_mobile.set_xlabel('Region', fontsize=12)
    ax_mobile.set_ylabel('Proportion with Mobile', fontsize=12)
    plt.setp(ax_mobile.get_xticklabels(), rotation=45, ha='right', fontsize=10)
    ax_mobile.set_ylim(0, 1)

    sns.barplot(x=digital_payment.index, y=digital_payment.values, palette="Greens_d", ax=ax_payment)
    ax_payment.set_title('Proportion of Digital Payment Usage by Region', fontsize=14)
    ax_payment.set_xlabel('Region', fontsize=12)
    ax_payment.set_ylabel('Proportion with Digital Payments', fontsize=12)
    plt.setp(ax_payment.get_xticklabels(), rotation=45, ha='right', fontsize=10)
    ax_payment.set_ylim(0, 1)

    fig.tight_layout()
    _show_figure(fig)

    st.write("The results show that the majority of people across most regions have access to mobile phones, indicating that mobile infrastructure is generally well-established. However, the adoption of digital payments lags behind significantly in several regions, particularly in Sub-Saharan Africa, South Asia, and parts of the Middle East & North Africa.")
    st.write("This gap between mobile ownership and digital payment usage highlights significant opportunities for expanding digital financial services. Many people have access to mobile phones, which could serve as a platform for digital payments, but the infrastructure or accessibility of these services seems to be underdeveloped in certain regions.")


def _render_financial_inclusion(df):
    """Draw account ownership by education level, split by gender."""
    st.subheader("Financial Inclusion Strategies")
    required = ('Gender', 'Education level', 'Has an account at a financial institution')
    if not _has_columns(df, required):
        st.write("Required columns for financial inclusion strategies ('Gender', 'Education level', 'Has an account at a financial institution') are not found in the dataset.")
        return

    # Label the gender codes on a derived frame so the loaded frame is left untouched.
    labelled = df.assign(Gender=df['Gender'].map(_GENDER_LABELS))
    account_data = labelled.groupby(['Gender', 'Education level'])['Has an account at a financial institution'].mean().reset_index()

    fig, ax = plt.subplots(figsize=(8, 6))
    # Side-by-side bars per education level, one bar per gender.
    sns.barplot(x='Education level', y='Has an account at a financial institution', hue='Gender', data=account_data, palette={'Female': 'lightpink', 'Male': 'lightblue'}, ax=ax)
    ax.set_title('Proportion of Financial Account Ownership by Education Level', fontsize=16)
    ax.set_ylabel('Proportion with Account', fontsize=12)
    ax.set_xlabel('Education Level', fontsize=12)
    ax.legend(title='Gender')
    fig.tight_layout()
    _show_figure(fig)

    st.write("For financial account ownership, the gap between men and women diminishes as education level increases. At the primary school level, women are less likely to have accounts compared to men, but this gap closes significantly for those with tertiary education or more, where both genders are almost equally likely to have financial accounts.")


def main():
    """Run the Streamlit EDA dashboard.

    Loads the dataset, stops with an error message if loading failed, and
    otherwise renders the page title plus four tabs, each drawn by its own
    helper.
    """
    df = load_data()
    if df is None:
        st.error("Failed to load data. Please check the data source.")
        return

    st.title('EDA Dashboard')
    st.write("This dashboard presents key insights from the dataset and allows for some exploration.")

    tab1, tab2, tab3, tab4 = st.tabs(["Distributions", "Customer Segmentation", "Optimizing Digital Payment Services", "Financial Inclusion Strategies"])
    with tab1:
        _render_distributions(df)
    with tab2:
        _render_customer_segmentation(df)
    with tab3:
        _render_digital_payments(df)
    with tab4:
        _render_financial_inclusion(df)
# Start the dashboard only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()