# Page-header residue from the hosting site ("Spaces: Sleeping Sleeping"); not part of the app.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import streamlit as st
# Load the dataset from the GitHub link
def load_data(url='https://raw.githubusercontent.com/khalo9747/data-storage/main/data_cleaned.csv'):
    """Download the CSV at *url* and return it as a DataFrame.

    Column names are stripped of leading/trailing whitespace so later
    lookups by name are not defeated by stray spaces in the header row.

    Args:
        url: Location of the CSV file. Defaults to the project's dataset.

    Returns:
        The loaded DataFrame, or ``None`` if reading failed (an error
        message is shown in the Streamlit app in that case).
    """
    try:
        df = pd.read_csv(url)
    except Exception as e:
        # App-level boundary: report any failure to the user instead of
        # crashing the page; the caller checks for None.
        st.error(f"Error loading data: {e}")
        return None
    # Normalize column names by stripping leading/trailing spaces
    df.columns = df.columns.str.strip()
    return df
# Function to categorize age into groups
def categorize_age(age):
    """Map a numeric age to its age-group label.

    Args:
        age: Age in years. May be missing (``None`` / NaN).

    Returns:
        One of ``'Teens (15-19)'``, ``'Young Adults (20-35)'``,
        ``'Middle-Aged (36-55)'``, ``'Seniors (56-69)'``, ``'Elderly (70+)'``,
        or ``None`` when the age is missing or below 15.
    """
    # A missing age must not be classified: NaN fails every comparison and
    # would otherwise fall through to the last bucket ('Elderly (70+)').
    if pd.isna(age):
        return None
    if age < 15:
        return None
    if age <= 19:
        return 'Teens (15-19)'
    if age <= 35:
        return 'Young Adults (20-35)'
    if age <= 55:
        return 'Middle-Aged (36-55)'
    if age <= 69:
        return 'Seniors (56-69)'
    return 'Elderly (70+)'
# Main function to run the Streamlit app

# Numeric gender codes -> display labels, shared by the pie chart and the
# account-ownership chart so both label the data the same way.
# NOTE(review): this is the mapping the original account-ownership code used;
# confirm it against the dataset's codebook.
_GENDER_LABELS = {1: 'Male', 2: 'Female'}
_GENDER_COLORS = {'Female': '#FF9999', 'Male': '#66B3FF'}

# Age-group labels in ascending age order (alphabetical order would scramble them).
_AGE_GROUP_ORDER = (
    'Teens (15-19)',
    'Young Adults (20-35)',
    'Middle-Aged (36-55)',
    'Seniors (56-69)',
    'Elderly (70+)',
)


def _has_columns(df, columns):
    """Return True when every name in *columns* is a column of *df*."""
    return all(column in df.columns for column in columns)


def _show_figure(fig):
    """Render *fig* in the app, then close it so figures don't pile up across reruns."""
    st.pyplot(fig)
    plt.close(fig)


def _render_distributions(df):
    """Draw the respondent distribution (region, age group or gender) chosen in a dropdown."""
    # Dropdown for selecting distribution type
    st.header('Select Distribution Type')
    distribution_type = st.selectbox(
        'Choose a distribution to display:',
        ('Distribution by Region', 'Distribution by Age Group', 'Distribution by Gender'),
    )

    if distribution_type == 'Distribution by Region':
        st.subheader('Distribution of Respondents by Region')
        if 'Region' not in df.columns:
            st.write("'Region' column not found in the dataset.")
            return
        fig, ax = plt.subplots(figsize=(10, 6))
        # Bars ordered from most to least frequent region.
        sns.countplot(data=df, x='Region', order=df['Region'].value_counts().index, ax=ax)
        ax.tick_params(axis='x', labelrotation=45)
        ax.set_title('Distribution of Respondents by Region')
        ax.set_xlabel('Region')
        ax.set_ylabel('Number of Respondents')
        _show_figure(fig)

    elif distribution_type == 'Distribution by Age Group':
        st.subheader('Distribution by Age Group')
        if 'Age' not in df.columns:
            st.write("'Age' column not found in the dataset.")
            return
        # Computed on a local Series so the shared DataFrame is not modified.
        counts = df['Age'].apply(categorize_age).value_counts()
        # Known groups in age order first, then any label not in the known list.
        ordered = [group for group in _AGE_GROUP_ORDER if group in counts.index]
        ordered += [group for group in counts.index if group not in _AGE_GROUP_ORDER]
        age_counts = counts.reindex(ordered)
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(x=age_counts.index, y=age_counts.values, ax=ax)
        ax.set_title('Distribution by Age Group')
        ax.set_xlabel('Age Group')
        ax.set_ylabel('Number of Respondents')
        _show_figure(fig)

    elif distribution_type == 'Distribution by Gender':
        st.subheader('Distribution by Gender')
        if 'Gender' not in df.columns:
            st.write("'Gender' column not found in the dataset.")
            return
        # Label each slice from the data itself: value_counts() orders by
        # frequency, so a fixed positional label list can mislabel slices.
        gender_counts = df['Gender'].map(_GENDER_LABELS).value_counts()
        if gender_counts.empty:
            st.write("No recognizable values found in the 'Gender' column.")
            return
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.pie(
            gender_counts.values,
            labels=list(gender_counts.index),
            autopct='%1.1f%%',
            startangle=140,
            colors=[_GENDER_COLORS[label] for label in gender_counts.index],
        )
        ax.set_title('Distribution by Gender')
        _show_figure(fig)


def _render_customer_segmentation(df):
    """Draw saving and borrowing heatmaps by education level and household income."""
    st.subheader("Customer Segmentation")
    required = (
        'Education level',
        'Within-economy household income',
        'Saved in the past year',
        'Borrowed in the past year',
    )
    # Check if necessary columns exist
    if not _has_columns(df, required):
        st.write("Required columns for customer segmentation ('Education level', 'Within-economy household income', 'Saved in the past year', 'Borrowed in the past year') are not found in the dataset.")
        return

    # One heatmap per behavior, stacked vertically in a single figure.
    panels = (
        ('Saved in the past year', "Blues", 'Proportion of People Who Save by Education Level and Income Quantile'),
        ('Borrowed in the past year', "Reds", 'Proportion of People Who Borrow by Education Level and Income Quantile'),
    )
    fig, axes = plt.subplots(2, 1, figsize=(12, 12))
    for ax, (column, cmap, title) in zip(axes, panels):
        # Mean of the behavior column per (education level, income) cell.
        proportions = pd.crosstab(
            df['Education level'],
            df['Within-economy household income'],
            values=df[column],
            aggfunc='mean',
        )
        sns.heatmap(proportions, annot=True, cmap=cmap, fmt=".2f", linewidths=.5, ax=ax)
        ax.set_title(title)
        ax.set_xlabel('Income Quantile')
        ax.set_ylabel('Education Level')
    fig.tight_layout()
    # Display both plots
    _show_figure(fig)

    st.write("Higher-income individuals tend to save more across all education levels. Those with tertiary education save the most, with savings increasing significantly from low to high income, ranging from 0.58 to 0.77. In contrast, people with primary education save less overall, even as their income rises.")
    st.write("When it comes to borrowing, higher-income individuals also borrow more, particularly those with tertiary education, where borrowing rates reach up to 0.62 for the wealthiest. Borrowing behavior is less predictable, showing that wealthier individuals still use loans, even though they have more savings. Interestingly, lower-educated, higher-income individuals borrow less as income rises, which could suggest limited access to credit in these groups.")


def _render_digital_payments(df):
    """Draw side-by-side bar charts of mobile ownership and digital payment use per region."""
    st.subheader("Optimizing Digital Payment Services in Underserved Regions")
    required = ('Region', 'Owns a mobile phone', 'Made or received a digital payment')
    # Check if necessary columns exist
    if not _has_columns(df, required):
        st.write("Required columns for optimizing digital payment services ('Region', 'Owns a mobile phone', 'Made or received a digital payment') are not found in the dataset.")
        return

    panels = (
        ('Owns a mobile phone', "Blues_d", 'Proportion of Mobile Ownership by Region', 'Proportion with Mobile'),
        ('Made or received a digital payment', "Greens_d", 'Proportion of Digital Payment Usage by Region', 'Proportion with Digital Payments'),
    )
    fig, axes = plt.subplots(1, 2, figsize=(14, 8))
    for ax, (column, palette, title, ylabel) in zip(axes, panels):
        # Mean of the column per region.
        by_region = df.groupby('Region')[column].mean()
        sns.barplot(x=by_region.index, y=by_region.values, palette=palette, ax=ax)
        ax.set_title(title, fontsize=14)
        ax.set_xlabel('Region', fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=10)
        ax.set_ylim(0, 1)  # Same scale on both panels so they can be compared
    # Add a tight layout to improve spacing
    fig.tight_layout()
    # Display both plots
    _show_figure(fig)

    st.write("The results show that the majority of people across most regions have access to mobile phones, indicating that mobile infrastructure is generally well-established. However, the adoption of digital payments lags behind significantly in several regions, particularly in Sub-Saharan Africa, South Asia, and parts of the Middle East & North Africa.")
    st.write("This gap between mobile ownership and digital payment usage highlights significant opportunities for expanding digital financial services. Many people have access to mobile phones, which could serve as a platform for digital payments, but the infrastructure or accessibility of these services seems to be underdeveloped in certain regions.")


def _render_financial_inclusion(df):
    """Draw account ownership by education level, split by gender."""
    st.subheader("Financial Inclusion Strategies")
    account_column = 'Has an account at a financial institution'
    # Check if necessary columns exist
    if not _has_columns(df, ('Gender', 'Education level', account_column)):
        st.write("Required columns for financial inclusion strategies ('Gender', 'Education level', 'Has an account at a financial institution') are not found in the dataset.")
        return

    # Map numeric gender values to string labels on a copy, so the shared
    # DataFrame keeps its original codes for the other tabs.
    labelled = df.assign(Gender=df['Gender'].map(_GENDER_LABELS))
    # Prepare data for plotting
    account_data = labelled.groupby(['Gender', 'Education level'])[account_column].mean().reset_index()

    fig, ax = plt.subplots(figsize=(8, 6))
    # Bar plot with side-by-side comparison of men and women
    sns.barplot(
        x='Education level',
        y=account_column,
        hue='Gender',
        data=account_data,
        palette={'Female': 'lightpink', 'Male': 'lightblue'},
        ax=ax,
    )
    ax.set_title('Proportion of Financial Account Ownership by Education Level', fontsize=16)
    ax.set_ylabel('Proportion with Account', fontsize=12)
    ax.set_xlabel('Education Level', fontsize=12)
    ax.legend(title='Gender')
    fig.tight_layout()
    _show_figure(fig)

    st.write("For financial account ownership, the gap between men and women diminishes as education level increases. At the primary school level, women are less likely to have accounts compared to men, but this gap closes significantly for those with tertiary education or more, where both genders are almost equally likely to have financial accounts.")


def main():
    """Load the dataset and render the four-tab EDA dashboard."""
    # Load the data
    df = load_data()
    # Check if the data is loaded correctly
    if df is None:
        st.error("Failed to load data. Please check the data source.")
        return

    # Set up the Streamlit page
    st.title('EDA Dashboard')
    st.write("This dashboard presents key insights from the dataset and allows for some exploration.")

    # Tabs for different sections
    tab1, tab2, tab3, tab4 = st.tabs(["Distributions", "Customer Segmentation", "Optimizing Digital Payment Services", "Financial Inclusion Strategies"])
    with tab1:
        _render_distributions(df)
    with tab2:
        _render_customer_segmentation(df)
    with tab3:
        _render_digital_payments(df)
    with tab4:
        _render_financial_inclusion(df)
# Run the dashboard only when this file is executed as a script.
if __name__ == "__main__":
    main()