Analysis of CSR Expenditure and Demographics in India

Analysis of CSR Expenditure and Demographics in India#

This notebook presents an in-depth analysis of state expenditure and demographic factors in India. The dataset used contains information about the amount spent by various Indian states during the fiscal years 2019-2020, 2020-2021, and 2021-2022, along with population and poverty rate data. The goal of this analysis is to gain valuable insights into the CSR spending patterns of different states, explore correlations between spending and demographic factors, and identify trends and growth patterns. By examining this data, we aim to provide a comprehensive understanding of how fiscal decisions and demographics interplay in India’s diverse regions.

Imports#

%%capture
pip install pandas numpy matplotlib seaborn ipywidgets scikit-learn openpyxl geopandas mplleaflet

import pandas as pd
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interactive
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Set Seaborn style
sns.set(style="whitegrid")

Load Data#

excel_file_path = 'data/StateWiseView.xlsx'
data = pd.read_excel(excel_file_path, sheet_name=0)
data.rename(columns={'population':"Population"}, inplace=True)
data.head()

	State	Amount Spent FY 2019-2020 (INR Cr.)	Amount Spent FY 2020-2021 (INR Cr.)	Amount Spent FY 2021-2022 (INR Cr.)	Population	Poverty rate
0	Bihar	110.48	89.89	165.66	128500364	33.76
1	Jharkhand	155.21	226.54	192.41	40100376	28.81
2	Meghalaya	17.65	17.63	19.30	3772103	27.79
3	Uttar Pradesh	577.98	907.32	1321.36	231502578	22.93
4	Madhya Pradesh	220.46	375.51	420.04	85002417	20.63

hdi_df = pd.read_excel('data/state_HDI.xlsx', header=1)

# Check if 'Unnamed: 0' column exists before attempting to drop it
if 'Unnamed: 0' in hdi_df.columns:
    hdi_df = hdi_df.drop('Unnamed: 0', axis=1)
hdi_df.head()

data = data.merge(hdi_df, left_on='State', right_on="State/Union Territory", how='inner')
data.head()

	State	Amount Spent FY 2019-2020 (INR Cr.)	Amount Spent FY 2020-2021 (INR Cr.)	Amount Spent FY 2021-2022 (INR Cr.)	Population	Poverty rate	Rank	State/Union Territory	HDI (2021)	Country comparison
0	Bihar	110.48	89.89	165.66	128500364	33.76	34	Bihar	0.571	Republic of the Congo
1	Jharkhand	155.21	226.54	192.41	40100376	28.81	33	Jharkhand	0.589	Angola
2	Meghalaya	17.65	17.63	19.30	3772103	27.79	21	Meghalaya	0.643	Tuvalu
3	Uttar Pradesh	577.98	907.32	1321.36	231502578	22.93	32	Uttar Pradesh	0.592	Zimbabwe
4	Madhya Pradesh	220.46	375.51	420.04	85002417	20.63	31	Madhya Pradesh	0.596	Equatorial Guinea

# Descriptive Statistics
# Calculate basic statistics for numeric columns
desc_stats = data.describe()
desc_stats

	Amount Spent FY 2019-2020 (INR Cr.)	Amount Spent FY 2020-2021 (INR Cr.)	Amount Spent FY 2021-2022 (INR Cr.)	Population	Poverty rate	HDI (2021)
count	32.000000	32.000000	32.000000	3.200000e+01	32.000000	32.000000
mean	428.564062	458.734688	588.124688	4.416897e+07	10.897813	0.663062
std	653.044228	683.359113	982.482441	5.187731e+07	8.807686	0.051314
min	0.000000	0.010000	0.450000	6.600100e+04	0.550000	0.571000
25%	17.132500	16.310000	39.002500	3.095980e+06	4.442500	0.627750
50%	204.950000	203.385000	229.165000	3.135037e+07	7.955000	0.668500
75%	611.042500	638.282500	658.675000	7.267573e+07	15.492500	0.696000
max	3353.240000	3464.810000	5229.310000	2.315026e+08	33.760000	0.752000

Exploratory Data Analysis#

# Comparative Analysis - Spending patterns over the years
# Create a line plot to visualize spending trends for each state over time
plt.figure(figsize=(18, 6))
for col in data.columns[1:4]:  # Columns with spending data
    sns.lineplot(data=data, x='State', y=col, label=col)
plt.title('Spending Trends Over Time')
plt.xlabel('State')
plt.ylabel('Amount Spent (INR Cr.)')
plt.xticks(rotation=90)
plt.legend()
plt.show()

../_images/37e569ed9bca2a55d6ee33778a6f19006c6b50ed7769714240ca7f414c183b70.png

# Sort the data by poverty rate in descending order
sorted_data = data.sort_values(by='Poverty rate', ascending=False)

# Get the top 5 states with the highest poverty rates
top_5_states = sorted_data.head(5)

# Extract the columns for expenditures per year
expenditure_columns = ['Amount Spent FY 2019-2020 (INR Cr.)', 'Amount Spent FY 2020-2021 (INR Cr.)', 'Amount Spent FY 2021-2022 (INR Cr.)']

# Create a DataFrame with the top 5 states and their corresponding expenditures for each year
result_df = top_5_states[['State', 'Poverty rate', 'Population'] + expenditure_columns]

# Display the DataFrame
print("5 states with the highest poverty rate and their expenditure")
result_df

5 states with the highest poverty rate and their expenditure

	State	Poverty rate	Population	Amount Spent FY 2019-2020 (INR Cr.)	Amount Spent FY 2020-2021 (INR Cr.)	Amount Spent FY 2021-2022 (INR Cr.)
0	Bihar	33.76	128500364	110.48	89.89	165.66
1	Jharkhand	28.81	40100376	155.21	226.54	192.41
2	Meghalaya	27.79	3772103	17.65	17.63	19.30
3	Uttar Pradesh	22.93	231502578	577.98	907.32	1321.36
4	Madhya Pradesh	20.63	85002417	220.46	375.51	420.04

def calculate_correlation(data, field1, field2):
    # Calculate the correlation between field1 and field2
    correlation = data[field1].corr(data[field2])
    return correlation

# Example of how to use the function
population_spending_corr = calculate_correlation(data, 'Population', 'Amount Spent FY 2021-2022 (INR Cr.)')
poverty_rate_spending_corr = calculate_correlation(data, 'Poverty rate', 'Amount Spent FY 2021-2022 (INR Cr.)')
hdi_spending_corr = calculate_correlation(data, 'HDI (2021)', 'Amount Spent FY 2021-2022 (INR Cr.)')

print(f"Correlation between Population and Spending (FY 2021-2022): {population_spending_corr:.2f}")
print(f"Correlation between Poverty Rate and Spending (FY 2021-2022): {poverty_rate_spending_corr:.2f}")
print(f"Correlation between HDI (2021) and Spending (FY 2021-2022): {hdi_spending_corr:.2f}")

Correlation between Population and Spending (FY 2021-2022): 0.54
Correlation between Poverty Rate and Spending (FY 2021-2022): -0.06
Correlation between HDI (2021) and Spending (FY 2021-2022): -0.03

# Create a widget to select the year
year_selector = widgets.Dropdown(
    options=['2019-2020', '2020-2021', '2021-2022'],
    value='2021-2022',
    description='Select Year:',
)

# Function to update and display the linear regression chart
def update_chart(selected_year, new_field):
    # Extract CSR spending and the selected field for the selected year
    csr_spending = data[f'Amount Spent FY {selected_year} (INR Cr.)']
    selected_field = data[new_field]

    # Reshape the data for Linear Regression
    csr_spending = csr_spending.values.reshape(-1, 1)
    selected_field = selected_field.values.reshape(-1, 1)

    # Create a Linear Regression model
    model = LinearRegression()
    model.fit(selected_field, csr_spending)

    # Make predictions using the model
    predictions = model.predict(selected_field)

    # Create a scatter plot of the data points
    plt.figure(figsize=(20, 6))
    plt.scatter(selected_field, csr_spending, color='blue', label='Data Points')

    # Plot the regression line
    plt.plot(selected_field, predictions, color='red', linewidth=2, label='Linear Regression')

    # Customize the plot
    plt.title(f'Linear Regression: {new_field} vs. CSR Spending ({selected_year})', fontsize=16)
    plt.xlabel(f'{new_field}')
    plt.ylabel(f'CSR Spending {selected_year} (INR Cr.)')
    plt.legend()
    plt.grid(True)

    # Show the plot
    plt.show()
    


# Create an interactive widget that updates the chart
interactive_plot = interactive(update_chart, selected_year=year_selector, new_field=['Poverty rate', 'Population', 'HDI (2021)'])

# Display the interactive widget
display(interactive_plot)

Parliamentary Constituencies Maps are provided by Data{Meet} Community Maps Project. Its made available under the Creative Commons Attribution 2.5 India.

# Load the shapefile containing state boundaries
shapefile = 'States/Admin2.shp'
gdf = gpd.read_file(shapefile)
gdf = gdf.rename(columns={"ST_NM": "State"})

# Create a function to update the choropleth map
def update_map(year):
    # Merge the shapefile data with your DataFrame (assuming a common column, e.g., 'State')
    merged = gdf.set_index('State').join(data.set_index('State'))

    # Create a choropleth map based on the selected year
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    merged.plot(column=f'Amount Spent FY {year} (INR Cr.)', cmap='OrRd', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)

    # Customize the plot
    ax.set_title(f'Expenditure by State ({year})')
    ax.axis('off')

    # Show the map
    plt.show()

# Create a widget to select the year
year_selector = widgets.Dropdown(
    options=['2019-2020', '2020-2021', '2021-2022'],
    value='2021-2022',
    description='Select Year:'
)

# Create an interactive widget to update the map
interactive_map = widgets.interactive(update_map, year=year_selector)

# Display the interactive widget
display(interactive_map)

Analysis of CSR Expenditure and Demographics in India

Contents

Analysis of CSR Expenditure and Demographics in India#

Imports#

Load Data#

Exploratory Data Analysis#