Analysis of CSR Expenditure and Demographics in India#

This notebook presents an in-depth analysis of state expenditure and demographic factors in India. The dataset used contains information about the amount spent by various Indian states during the fiscal years 2019-2020, 2020-2021, and 2021-2022, along with population and poverty rate data. The goal of this analysis is to gain valuable insights into the CSR spending patterns of different states, explore correlations between spending and demographic factors, and identify trends and growth patterns. By examining this data, we aim to provide a comprehensive understanding of how fiscal decisions and demographics interplay in India’s diverse regions.

Imports#

%%capture
pip install pandas numpy matplotlib seaborn ipywidgets scikit-learn openpyxl geopandas mplleaflet
import pandas as pd
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interactive
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Set Seaborn style
sns.set(style="whitegrid")

Load Data#

excel_file_path = 'data/StateWiseView.xlsx'
data = pd.read_excel(excel_file_path, sheet_name=0)
data.rename(columns={'population':"Population"}, inplace=True)
data.head()
State Amount Spent FY 2019-2020 (INR Cr.) Amount Spent FY 2020-2021 (INR Cr.) Amount Spent FY 2021-2022 (INR Cr.) Population Poverty rate
0 Bihar 110.48 89.89 165.66 128500364 33.76
1 Jharkhand 155.21 226.54 192.41 40100376 28.81
2 Meghalaya 17.65 17.63 19.30 3772103 27.79
3 Uttar Pradesh 577.98 907.32 1321.36 231502578 22.93
4 Madhya Pradesh 220.46 375.51 420.04 85002417 20.63
hdi_df = pd.read_excel('data/state_HDI.xlsx', header=1)

# Check if 'Unnamed: 0' column exists before attempting to drop it
if 'Unnamed: 0' in hdi_df.columns:
    hdi_df = hdi_df.drop('Unnamed: 0', axis=1)
hdi_df.head()

data = data.merge(hdi_df, left_on='State', right_on="State/Union Territory", how='inner')
data.head()
State Amount Spent FY 2019-2020 (INR Cr.) Amount Spent FY 2020-2021 (INR Cr.) Amount Spent FY 2021-2022 (INR Cr.) Population Poverty rate Rank State/Union Territory HDI (2021) Country comparison
0 Bihar 110.48 89.89 165.66 128500364 33.76 34 Bihar 0.571 Republic of the Congo
1 Jharkhand 155.21 226.54 192.41 40100376 28.81 33 Jharkhand 0.589 Angola
2 Meghalaya 17.65 17.63 19.30 3772103 27.79 21 Meghalaya 0.643 Tuvalu
3 Uttar Pradesh 577.98 907.32 1321.36 231502578 22.93 32 Uttar Pradesh 0.592 Zimbabwe
4 Madhya Pradesh 220.46 375.51 420.04 85002417 20.63 31 Madhya Pradesh 0.596 Equatorial Guinea
# Descriptive Statistics
# Calculate basic statistics for numeric columns
desc_stats = data.describe()
desc_stats
Amount Spent FY 2019-2020 (INR Cr.) Amount Spent FY 2020-2021 (INR Cr.) Amount Spent FY 2021-2022 (INR Cr.) Population Poverty rate HDI (2021)
count 32.000000 32.000000 32.000000 3.200000e+01 32.000000 32.000000
mean 428.564062 458.734688 588.124688 4.416897e+07 10.897813 0.663062
std 653.044228 683.359113 982.482441 5.187731e+07 8.807686 0.051314
min 0.000000 0.010000 0.450000 6.600100e+04 0.550000 0.571000
25% 17.132500 16.310000 39.002500 3.095980e+06 4.442500 0.627750
50% 204.950000 203.385000 229.165000 3.135037e+07 7.955000 0.668500
75% 611.042500 638.282500 658.675000 7.267573e+07 15.492500 0.696000
max 3353.240000 3464.810000 5229.310000 2.315026e+08 33.760000 0.752000

Exploratory Data Analysis#

# Comparative Analysis - Spending patterns over the years
# Create a line plot to visualize spending trends for each state over time
plt.figure(figsize=(18, 6))
for col in data.columns[1:4]:  # Columns with spending data
    sns.lineplot(data=data, x='State', y=col, label=col)
plt.title('Spending Trends Over Time')
plt.xlabel('State')
plt.ylabel('Amount Spent (INR Cr.)')
plt.xticks(rotation=90)
plt.legend()
plt.show()
../_images/37e569ed9bca2a55d6ee33778a6f19006c6b50ed7769714240ca7f414c183b70.png
# Sort the data by poverty rate in descending order
sorted_data = data.sort_values(by='Poverty rate', ascending=False)

# Get the top 5 states with the highest poverty rates
top_5_states = sorted_data.head(5)

# Extract the columns for expenditures per year
expenditure_columns = ['Amount Spent FY 2019-2020 (INR Cr.)', 'Amount Spent FY 2020-2021 (INR Cr.)', 'Amount Spent FY 2021-2022 (INR Cr.)']

# Create a DataFrame with the top 5 states and their corresponding expenditures for each year
result_df = top_5_states[['State', 'Poverty rate', 'Population'] + expenditure_columns]

# Display the DataFrame
print("5 states with the highest poverty rate and their expenditure")
result_df
5 states with the highest poverty rate and their expenditure
State Poverty rate Population Amount Spent FY 2019-2020 (INR Cr.) Amount Spent FY 2020-2021 (INR Cr.) Amount Spent FY 2021-2022 (INR Cr.)
0 Bihar 33.76 128500364 110.48 89.89 165.66
1 Jharkhand 28.81 40100376 155.21 226.54 192.41
2 Meghalaya 27.79 3772103 17.65 17.63 19.30
3 Uttar Pradesh 22.93 231502578 577.98 907.32 1321.36
4 Madhya Pradesh 20.63 85002417 220.46 375.51 420.04
def calculate_correlation(data, field1, field2):
    # Calculate the correlation between field1 and field2
    correlation = data[field1].corr(data[field2])
    return correlation

# Example of how to use the function
population_spending_corr = calculate_correlation(data, 'Population', 'Amount Spent FY 2021-2022 (INR Cr.)')
poverty_rate_spending_corr = calculate_correlation(data, 'Poverty rate', 'Amount Spent FY 2021-2022 (INR Cr.)')
hdi_spending_corr = calculate_correlation(data, 'HDI (2021)', 'Amount Spent FY 2021-2022 (INR Cr.)')

print(f"Correlation between Population and Spending (FY 2021-2022): {population_spending_corr:.2f}")
print(f"Correlation between Poverty Rate and Spending (FY 2021-2022): {poverty_rate_spending_corr:.2f}")
print(f"Correlation between HDI (2021) and Spending (FY 2021-2022): {hdi_spending_corr:.2f}")
Correlation between Population and Spending (FY 2021-2022): 0.54
Correlation between Poverty Rate and Spending (FY 2021-2022): -0.06
Correlation between HDI (2021) and Spending (FY 2021-2022): -0.03
# Create a widget to select the year
year_selector = widgets.Dropdown(
    options=['2019-2020', '2020-2021', '2021-2022'],
    value='2021-2022',
    description='Select Year:',
)

# Function to update and display the linear regression chart
def update_chart(selected_year, new_field):
    # Extract CSR spending and the selected field for the selected year
    csr_spending = data[f'Amount Spent FY {selected_year} (INR Cr.)']
    selected_field = data[new_field]

    # Reshape the data for Linear Regression
    csr_spending = csr_spending.values.reshape(-1, 1)
    selected_field = selected_field.values.reshape(-1, 1)

    # Create a Linear Regression model
    model = LinearRegression()
    model.fit(selected_field, csr_spending)

    # Make predictions using the model
    predictions = model.predict(selected_field)

    # Create a scatter plot of the data points
    plt.figure(figsize=(20, 6))
    plt.scatter(selected_field, csr_spending, color='blue', label='Data Points')

    # Plot the regression line
    plt.plot(selected_field, predictions, color='red', linewidth=2, label='Linear Regression')

    # Customize the plot
    plt.title(f'Linear Regression: {new_field} vs. CSR Spending ({selected_year})', fontsize=16)
    plt.xlabel(f'{new_field}')
    plt.ylabel(f'CSR Spending {selected_year} (INR Cr.)')
    plt.legend()
    plt.grid(True)

    # Show the plot
    plt.show()
    


# Create an interactive widget that updates the chart
interactive_plot = interactive(update_chart, selected_year=year_selector, new_field=['Poverty rate', 'Population', 'HDI (2021)'])

# Display the interactive widget
display(interactive_plot)

Parliamentary Constituencies Maps are provided by Data{Meet} Community Maps Project. Its made available under the Creative Commons Attribution 2.5 India.

# Load the shapefile containing state boundaries
shapefile = 'States/Admin2.shp'
gdf = gpd.read_file(shapefile)
gdf = gdf.rename(columns={"ST_NM": "State"})

# Create a function to update the choropleth map
def update_map(year):
    # Merge the shapefile data with your DataFrame (assuming a common column, e.g., 'State')
    merged = gdf.set_index('State').join(data.set_index('State'))

    # Create a choropleth map based on the selected year
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    merged.plot(column=f'Amount Spent FY {year} (INR Cr.)', cmap='OrRd', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)

    # Customize the plot
    ax.set_title(f'Expenditure by State ({year})')
    ax.axis('off')

    # Show the map
    plt.show()

# Create a widget to select the year
year_selector = widgets.Dropdown(
    options=['2019-2020', '2020-2021', '2021-2022'],
    value='2021-2022',
    description='Select Year:'
)

# Create an interactive widget to update the map
interactive_map = widgets.interactive(update_map, year=year_selector)

# Display the interactive widget
display(interactive_map)