Based on the data analyzed, we have observed:
# Dependencies and Setup
import pandas as pd
# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"
# Read School and Student Data File and store into Pandas Data Frames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)
# Combine the data into a single dataset
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])
Calculate the total number of schools
Calculate the total number of students
Calculate the total budget
Calculate the average math score
Calculate the average reading score
Calculate the overall passing rate (overall average score), i.e. (avg. math score + avg. reading score)/2
Calculate the percentage of students with a passing math score (70 or greater)
Calculate the percentage of students with a passing reading score (70 or greater)
Create a dataframe to hold the above results
Optional: give the displayed data cleaner formatting
# Total number of schools
total_schools = len(school_data_complete.school_name.unique())
# Total number of students
total_students = school_data_complete.student_name.count()
# Total budget
total_budget = sum(school_data_complete.budget.unique())
# Average math score
average_math_score = school_data_complete.math_score.mean()
# Average reading score
average_reading_score = school_data_complete.reading_score.mean()
# Percentage of students with passing math score
passing_math_pct = (school_data_complete[school_data_complete['math_score'] >= 70].math_score.count()) / total_students
# Percentage of students with passing reading score
passing_reading_pct = (school_data_complete[school_data_complete['reading_score'] >= 70].reading_score.count()) / total_students
# Overall passing rate
overall_passing_rate = (passing_math_pct + passing_reading_pct)/2
# Store all calculations into dataframe
district_summary = pd.DataFrame({"Total Schools":[total_schools],
"Total Students":[total_students],
"Total Budget":[total_budget],
"Average Math Score":[average_math_score],
"Average Reading Score":[average_reading_score],
"% Passing Math":[passing_math_pct*100],
"% Passing Reading":[passing_reading_pct*100],
"% Overall Passing Rate":[overall_passing_rate*100]})
# Reformat Total Students and Total Budget
district_summary['Total Students'] = district_summary.apply(lambda x: "{:,.0f}".format(x['Total Students']), axis=1)
district_summary['Total Budget'] = district_summary.apply(lambda x: "${:,.2f}".format(x['Total Budget']), axis=1)
# Print Summary Dataframe
district_summary
Create an overview table that summarizes key metrics about each school, including:
Create a dataframe to hold the above results
school_summary = school_data_complete.groupby(['school_name'])
school_names = school_data_complete.school_name.sort_values().unique()
school_types = school_data.sort_values(by="school_name").type
school_total_students = list(school_summary.student_name.count())
school_budget = list(school_summary.budget.mean())
school_per_student_budget = [i/j for i,j in zip(school_budget,school_total_students)]
school_avg_math_score = list(school_summary.math_score.mean())
school_avg_reading_score = list(school_summary.reading_score.mean())
# Calculating passing percentages
school_summary = school_data_complete[school_data_complete['math_score'] >= 70].groupby(['school_name'])
school_pct_passing_math = [(i/j)*100 for i,j in zip(school_summary.math_score.count(),school_total_students)]
school_summary = school_data_complete[school_data_complete['reading_score'] >= 70].groupby(['school_name'])
school_pct_passing_reading = [(i/j)*100 for i,j in zip(school_summary.reading_score.count(),school_total_students)]
school_overall_passing = [(i+j)/2 for i,j in zip(school_pct_passing_math,school_pct_passing_reading)]
# Compile all calculations into dataframe
school_summary_df = pd.DataFrame({"School Names":school_names,
"School Type":school_types,
"Total Students":school_total_students,
"Total School Budget":school_budget,
"Per Student Budget":school_per_student_budget,
"Average Math Score":school_avg_math_score,
"Average Reading Score":school_avg_reading_score,
"% Passing Math":school_pct_passing_math,
"% Passing Reading":school_pct_passing_reading,
"Overall Passing Rate":school_overall_passing})
school_summary_df = school_summary_df.reset_index(drop=True)
school_summary_df
top_5 = school_summary_df.sort_values(by='Overall Passing Rate', ascending=False).head(5).reset_index(drop=True)
top_5
bot_5 = school_summary_df.sort_values(by='Overall Passing Rate', ascending=True).head(5).reset_index(drop=True)
bot_5
Create a table that lists the average Reading Score for students of each grade level (9th, 10th, 11th, 12th) at each school.
Create a pandas series for each grade. Hint: use a conditional statement.
Group each series by school
Combine the series into a dataframe
Optional: give the displayed data cleaner formatting
# Function to create series for average math score by grade
def average_math_by_grade(grade):
school_summary = school_data_complete.loc[school_data_complete.grade == grade].groupby(['school_name'])
school_names = school_data_complete.school_name.sort_values().unique()
school_avg_math_score = list(school_summary.math_score.mean())
# Compile all calculations into dataframe
average_math_df = pd.DataFrame({"School Names":school_names,
f"{grade} Avg Math Score":school_avg_math_score})
average_math_df = average_math_df.reset_index(drop=True)
return average_math_df
# Function to create series for average reading score by grade
def average_reading_by_grade(grade):
school_summary = school_data_complete.loc[school_data_complete.grade == grade].groupby(['school_name'])
school_names = school_data_complete.school_name.sort_values().unique()
school_avg_reading_score = list(school_summary.reading_score.mean())
# Compile all calculations into dataframe
average_reading_df = pd.DataFrame({"School Names":school_names,
f"{grade} Avg Reading Score":school_avg_reading_score})
average_reading_df = average_reading_df.reset_index(drop=True)
return average_reading_df
# Create series for each grade and group by school
grade_9 = average_math_by_grade('9th')
grade_10 = average_math_by_grade('10th')
grade_11 = average_math_by_grade('11th')
grade_12 = average_math_by_grade('12th')
avg_math_score_by_grade = pd.merge(grade_9,grade_10,how='inner',suffixes=('',''))
avg_math_score_by_grade = pd.merge(avg_math_score_by_grade,grade_11,how='inner',suffixes=('',''))
avg_math_score_by_grade = pd.merge(avg_math_score_by_grade,grade_12,how='inner',suffixes=('',''))
avg_math_score_by_grade
# Create series for each grade and group by school
grade_9 = average_reading_by_grade('9th')
grade_10 = average_reading_by_grade('10th')
grade_11 = average_reading_by_grade('11th')
grade_12 = average_reading_by_grade('12th')
avg_reading_score_by_grade = pd.merge(grade_9,grade_10,how='inner',suffixes=('',''))
avg_reading_score_by_grade = pd.merge(avg_reading_score_by_grade,grade_11,how='inner',suffixes=('',''))
avg_reading_score_by_grade = pd.merge(avg_reading_score_by_grade,grade_12,how='inner',suffixes=('',''))
avg_reading_score_by_grade
# Sample bins. Feel free to create your own bins.
spending_bins = [0, 585, 615, 645, 675]
group_names = ["<$585", "$585-615", "$615-645", "$645-675"]
scores_by_spending = school_summary_df[["School Names",
"Average Math Score",
"Average Reading Score",
"% Passing Math",
"% Passing Reading",
"Overall Passing Rate"]]
scores_by_spending["Spending Summary"] = pd.cut(school_summary_df["Per Student Budget"], spending_bins, labels=group_names)
scores_by_spending = scores_by_spending.groupby(["Spending Summary"])
scores_by_spending.mean()
# Sample bins. Feel free to create your own bins.
size_bins = [0, 1000, 2000, 5000]
group_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]
scores_by_school_size = school_summary_df[["School Names",
"Average Math Score",
"Average Reading Score",
"% Passing Math",
"% Passing Reading",
"Overall Passing Rate"]]
scores_by_school_size["School Size Summary"] = pd.cut(school_summary_df["Total Students"], size_bins, labels=group_names)
scores_by_school_size = scores_by_school_size.groupby(["School Size Summary"])
scores_by_school_size.mean()
scores_by_type = school_summary_df[["School Names",
"School Type",
"Average Math Score",
"Average Reading Score",
"% Passing Math",
"% Passing Reading",
"Overall Passing Rate"]]
scores_by_type = scores_by_type.groupby("School Type")
scores_by_type.mean()