Spaces:

khalo9747
/

assignment

Sleeping

App Files Files Community

assignment / app.py

khalo9747

Update app.py

e5e29ce verified about 2 months ago

raw

history blame

11.1 kB

	import streamlit as st
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns

	# Load the dataset from the GitHub link
	@st.cache_data
	def _read_dataset(url):
	    """Download the CSV at *url* and return it as a DataFrame.

	    Column names are stripped of leading/trailing whitespace. Any exception
	    raised by the download or the parse propagates to the caller; only a
	    successful return value is meant to end up in Streamlit's data cache.
	    """
	    df = pd.read_csv(url)

	    # Normalize column names by stripping leading/trailing spaces
	    df.columns = df.columns.str.strip()
	    return df


	def load_data():
	    """Return the cleaned survey dataset, or None if it could not be loaded.

	    The actual download is delegated to the cached ``_read_dataset`` helper.
	    Error handling deliberately lives outside the cached function: if the
	    ``None`` fallback were returned from inside it, that ``None`` would be
	    memoised and one transient network failure would keep being served on
	    every later rerun instead of the download being retried.

	    Returns:
	        pandas.DataFrame with whitespace-stripped column names on success,
	        otherwise None (after reporting the error with ``st.error``).
	    """
	    url = 'https://raw.githubusercontent.com/khalo9747/data-storage/main/data_cleaned.csv'
	    try:
	        return _read_dataset(url)
	    except Exception as e:
	        # Top-level boundary: pd.read_csv can fail with network, HTTP or
	        # parser errors, and all of them should surface in the UI.
	        st.error(f"Error loading data: {e}")
	        return None

	# Function to categorize age into groups
	def categorize_age(age):
	    """Map a numeric age to its labelled age bracket.

	    Args:
	        age: Age as a number; may also be a missing value (None / NaN).

	    Returns:
	        'Teens (15-19)', 'Young Adults (20-35)', 'Middle-Aged (36-55)',
	        'Seniors (56-69)' or 'Elderly (70+)'. Returns None when the age is
	        missing or below 15.
	    """
	    # NaN compares False against every bound, so without this guard a
	    # missing age would fall through to the last bracket and be counted as
	    # 'Elderly (70+)'; None would raise a TypeError on the first comparison.
	    if pd.isna(age):
	        return None
	    if age < 15:
	        return None

	    # (inclusive upper bound, label), checked in ascending order.
	    brackets = (
	        (19, 'Teens (15-19)'),
	        (35, 'Young Adults (20-35)'),
	        (55, 'Middle-Aged (36-55)'),
	        (69, 'Seniors (56-69)'),
	    )
	    for upper_bound, label in brackets:
	        if age <= upper_bound:
	            return label
	    return 'Elderly (70+)'

	# Main function to run the Streamlit app
	def main():
	    """Run the Streamlit EDA dashboard.

	    Loads the dataset with ``load_data``; when that returns None an error is
	    shown and nothing else is rendered. Otherwise the page title, an intro
	    line and four tabs are drawn, each tab delegated to its own private
	    helper below.
	    """
	    df = load_data()

	    # Check if the data is loaded correctly
	    if df is None:
	        st.error("Failed to load data. Please check the data source.")
	        return

	    # Set up the Streamlit page
	    st.title('EDA Dashboard')
	    st.write("This dashboard presents key insights from the dataset and allows for some exploration.")

	    # Tabs for different sections
	    tab1, tab2, tab3, tab4 = st.tabs(["Distributions", "Customer Segmentation", "Optimizing Digital Payment Services", "Financial Inclusion Strategies"])

	    with tab1:
	        _render_distributions(df)
	    with tab2:
	        _render_customer_segmentation(df)
	    with tab3:
	        _render_digital_payments(df)
	    with tab4:
	        _render_financial_inclusion(df)


	# Numeric coding of the 'Gender' column and the display label for each code.
	_GENDER_NAMES = {1: 'Male', 2: 'Female'}
	# Pie-slice colour per gender label.
	_GENDER_PIE_COLORS = {'Female': '#FF9999', 'Male': '#66B3FF'}
	# Age brackets in chronological order (alphabetical order would put
	# 'Elderly (70+)' first).
	_AGE_GROUP_ORDER = (
	    'Teens (15-19)',
	    'Young Adults (20-35)',
	    'Middle-Aged (36-55)',
	    'Seniors (56-69)',
	    'Elderly (70+)',
	)


	def _show_figure(fig):
	    """Render *fig* in the app, then close it.

	    An explicit Figure is passed instead of the ``pyplot`` module, and it is
	    closed afterwards so figures do not accumulate in matplotlib's global
	    state across script reruns.
	    """
	    st.pyplot(fig)
	    plt.close(fig)


	def _has_columns(df, columns):
	    """Return True when every name in *columns* is a column of *df*."""
	    return set(columns).issubset(df.columns)


	def _render_distributions(df):
	    """Draw the 'Distributions' tab: a selector plus the chosen chart."""
	    st.header('Select Distribution Type')
	    distribution_type = st.selectbox(
	        'Choose a distribution to display:',
	        ('Distribution by Region', 'Distribution by Age Group', 'Distribution by Gender')
	    )

	    if distribution_type == 'Distribution by Region':
	        _plot_region_distribution(df)
	    elif distribution_type == 'Distribution by Age Group':
	        _plot_age_distribution(df)
	    elif distribution_type == 'Distribution by Gender':
	        _plot_gender_distribution(df)


	def _plot_region_distribution(df):
	    """Count plot of respondents per region, most frequent region first."""
	    st.subheader('Distribution of Respondents by Region')
	    if 'Region' not in df.columns:
	        st.write("'Region' column not found in the dataset.")
	        return

	    fig, ax = plt.subplots(figsize=(10, 6))
	    sns.countplot(data=df, x='Region', order=df['Region'].value_counts().index, ax=ax)
	    ax.tick_params(axis='x', labelrotation=45)
	    ax.set_title('Distribution of Respondents by Region')
	    ax.set_xlabel('Region')
	    ax.set_ylabel('Number of Respondents')
	    _show_figure(fig)


	def _plot_age_distribution(df):
	    """Bar chart of respondents per age bracket, youngest bracket first."""
	    st.subheader('Distribution by Age Group')
	    if 'Age' not in df.columns:
	        st.write("'Age' column not found in the dataset.")
	        return

	    # Derive the brackets into a separate Series rather than adding a
	    # column to the shared DataFrame.
	    age_counts = df['Age'].apply(categorize_age).value_counts()
	    # Keep only brackets that actually occur, in chronological order.
	    age_counts = age_counts[[group for group in _AGE_GROUP_ORDER if group in age_counts.index]]

	    fig, ax = plt.subplots(figsize=(10, 6))
	    sns.barplot(x=age_counts.index, y=age_counts.values, ax=ax)
	    ax.set_title('Distribution by Age Group')
	    ax.set_xlabel('Age Group')
	    ax.set_ylabel('Number of Respondents')
	    _show_figure(fig)


	def _plot_gender_distribution(df):
	    """Pie chart of respondents per gender."""
	    st.subheader('Distribution by Gender')
	    if 'Gender' not in df.columns:
	        st.write("'Gender' column not found in the dataset.")
	        return

	    gender_counts = df['Gender'].value_counts()
	    # value_counts() orders by frequency, so the labels must be derived from
	    # the counted values; a fixed ['Female', 'Male'] list mislabels the
	    # slices whenever the first-listed gender is not the most frequent.
	    gender_labels = [_GENDER_NAMES.get(code, str(code)) for code in gender_counts.index]
	    slice_colors = [_GENDER_PIE_COLORS.get(label, '#CCCCCC') for label in gender_labels]

	    fig, ax = plt.subplots(figsize=(8, 8))
	    ax.pie(gender_counts, labels=gender_labels, autopct='%1.1f%%', startangle=140, colors=slice_colors)
	    ax.set_title('Distribution by Gender')
	    _show_figure(fig)


	def _render_customer_segmentation(df):
	    """Draw the 'Customer Segmentation' tab: saving and borrowing heatmaps."""
	    st.subheader("Customer Segmentation")

	    # Check if necessary columns exist
	    required = ('Education level', 'Within-economy household income', 'Saved in the past year', 'Borrowed in the past year')
	    if not _has_columns(df, required):
	        st.write("Required columns for customer segmentation ('Education level', 'Within-economy household income', 'Saved in the past year', 'Borrowed in the past year') are not found in the dataset.")
	        return

	    # Two stacked heatmaps sharing one figure.
	    fig, (ax_saving, ax_borrowing) = plt.subplots(2, 1, figsize=(12, 12))
	    panels = (
	        (ax_saving, 'Saved in the past year', "Blues", 'Proportion of People Who Save by Education Level and Income Quantile'),
	        (ax_borrowing, 'Borrowed in the past year', "Reds", 'Proportion of People Who Borrow by Education Level and Income Quantile'),
	    )
	    for ax, value_column, cmap, title in panels:
	        # Mean of the value column per (education level, income group) cell.
	        rate_table = pd.crosstab(df['Education level'], df['Within-economy household income'], values=df[value_column], aggfunc='mean')
	        sns.heatmap(rate_table, annot=True, cmap=cmap, fmt=".2f", linewidths=.5, ax=ax)
	        ax.set_title(title)
	        ax.set_xlabel('Income Quantile')
	        ax.set_ylabel('Education Level')
	    fig.tight_layout()

	    # Display both plots
	    _show_figure(fig)

	    st.write("Higher-income individuals tend to save more across all education levels. Those with tertiary education save the most, with savings increasing significantly from low to high income, ranging from 0.58 to 0.77. In contrast, people with primary education save less overall, even as their income rises.")
	    st.write("When it comes to borrowing, higher-income individuals also borrow more, particularly those with tertiary education, where borrowing rates reach up to 0.62 for the wealthiest. Borrowing behavior is less predictable, showing that wealthier individuals still use loans, even though they have more savings. Interestingly, lower-educated, higher-income individuals borrow less as income rises, which could suggest limited access to credit in these groups.")


	def _render_digital_payments(df):
	    """Draw the digital-payments tab: per-region mobile and payment rates."""
	    st.subheader("Optimizing Digital Payment Services in Underserved Regions")

	    # Check if necessary columns exist
	    required = ('Region', 'Owns a mobile phone', 'Made or received a digital payment')
	    if not _has_columns(df, required):
	        st.write("Required columns for optimizing digital payment services ('Region', 'Owns a mobile phone', 'Made or received a digital payment') are not found in the dataset.")
	        return

	    # Two bar plots side-by-side, both on a fixed 0-1 y-axis so they can be
	    # compared directly.
	    fig, (ax_mobile, ax_payment) = plt.subplots(1, 2, figsize=(14, 8))
	    panels = (
	        (ax_mobile, 'Owns a mobile phone', "Blues_d", 'Proportion of Mobile Ownership by Region', 'Proportion with Mobile'),
	        (ax_payment, 'Made or received a digital payment', "Greens_d", 'Proportion of Digital Payment Usage by Region', 'Proportion with Digital Payments'),
	    )
	    for ax, value_column, palette, title, ylabel in panels:
	        # Mean of the value column per region.
	        regional_rate = df.groupby('Region')[value_column].mean()
	        sns.barplot(x=regional_rate.index, y=regional_rate.values, palette=palette, ax=ax)
	        ax.set_title(title, fontsize=14)
	        ax.set_xlabel('Region', fontsize=12)
	        ax.set_ylabel(ylabel, fontsize=12)
	        plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=10)
	        ax.set_ylim(0, 1)

	    # Add a tight layout to improve spacing
	    fig.tight_layout()

	    # Display both plots
	    _show_figure(fig)
	    st.write("The results show that the majority of people across most regions have access to mobile phones, indicating that mobile infrastructure is generally well-established. However, the adoption of digital payments lags behind significantly in several regions, particularly in Sub-Saharan Africa, South Asia, and parts of the Middle East & North Africa.")
	    st.write("This gap between mobile ownership and digital payment usage highlights significant opportunities for expanding digital financial services. Many people have access to mobile phones, which could serve as a platform for digital payments, but the infrastructure or accessibility of these services seems to be underdeveloped in certain regions.")


	def _render_financial_inclusion(df):
	    """Draw the financial-inclusion tab: account ownership by education and gender."""
	    st.subheader("Financial Inclusion Strategies")

	    # Check if necessary columns exist
	    required = ('Gender', 'Education level', 'Has an account at a financial institution')
	    if not _has_columns(df, required):
	        st.write("Required columns for financial inclusion strategies ('Gender', 'Education level', 'Has an account at a financial institution') are not found in the dataset.")
	        return

	    # Map numeric gender values to string labels on a derived frame, leaving
	    # the shared DataFrame's 'Gender' column untouched.
	    labelled = df.assign(Gender=df['Gender'].map(_GENDER_NAMES))

	    # Prepare data for plotting
	    account_data = labelled.groupby(['Gender', 'Education level'])['Has an account at a financial institution'].mean().reset_index()

	    fig, ax = plt.subplots(figsize=(8, 6))

	    # Bar plot with side-by-side comparison of men and women
	    sns.barplot(x='Education level', y='Has an account at a financial institution', hue='Gender', data=account_data, palette={'Female': 'lightpink', 'Male': 'lightblue'}, ax=ax)

	    # Add labels and title
	    ax.set_title('Proportion of Financial Account Ownership by Education Level', fontsize=16)
	    ax.set_ylabel('Proportion with Account', fontsize=12)
	    ax.set_xlabel('Education Level', fontsize=12)

	    # Show the legend
	    ax.legend(title='Gender')

	    # Display the plot with tight layout
	    fig.tight_layout()
	    _show_figure(fig)
	    st.write("For financial account ownership, the gap between men and women diminishes as education level increases. At the primary school level, women are less likely to have accounts compared to men, but this gap closes significantly for those with tertiary education or more, where both genders are almost equally likely to have financial accounts.")

	# Script entry point: build the dashboard only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()