# Baseline Comparison Code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Building baseline data on race
# 1950-1990 US Census population breakdown by race
# Load the US Census race table; only the first 28 rows of the CSV are used.
popdiv = pd.read_csv('./inst/us-pop-data/popdiv-census.csv').iloc[0:28]
# Cells that are exactly '(NA)' or '-' are blanked so they turn into NaN below.
popdiv = popdiv.replace('(NA)','').replace('-','')

# The count columns are read as text with spaces inside the numbers
# (presumably thousands separators -- confirm against the CSV). Strip the
# spaces, turn empty cells into NaN, then convert everything to float.
census_count_cols = ['Total population', 'White', 'Black', 'American Indian, Eskimo,\nand Aleut',
                     'Asian and Pacific\nIslander', 'Other race', 'Hispanic\norigin (of any race)',
                     'White, not of Hispanic\norigin']
popdiv[census_count_cols] = (
    popdiv[census_count_cols]
    .apply(lambda col: col.str.replace(' ', ''))
    .replace(r'^\s*$', np.nan, regex=True)
    .astype(float)
)

# Keep the 1960, 1970 and 1980 censuses. The explicit .copy() makes racedata
# an independent frame, so the column assignment below no longer triggers
# pandas' SettingWithCopyWarning on a filtered slice of popdiv.
racedata = popdiv[popdiv.Year.isin(['1960', '1970', '1980'])].copy()

# Hand-entered "White, not of Hispanic origin" counts, keyed by census year so
# they land on the right row regardless of row order in the CSV.
# NOTE(review): the original assigned a positional list in the order
# [1980, 1970, 1960]; this mapping assumes that was the file's row order -- confirm.
white_not_hispanic_by_year = {
    '1980': 180256366.0,
    '1970': 177748975.0,
    '1960': 158831732.0,
}
racedata['White, not of Hispanic\norigin'] = racedata['Year'].map(white_not_hispanic_by_year)
# 1980-2021 USA Facts population breakdown by race and ethnicity
# Columns kept from the USAFacts export, in the order they are selected.
# Most names carry a leading space, exactly as they appear in the CSV header.
usafacts_columns = [
    'Years', 'Population (People)',
    ' Gender',
    ' Male (People)',
    ' Female (People)',
    ' Race',
    ' White (People)',
    ' Black (People)',
    ' Asian or Pacific Islander (People)',
    ' By group',
    ' Asian (People)',
    ' Native Hawaiian or other Pacific Islander (People)',
    ' American Indian or Alaska Native (People)',
    ' Two or more races (People)',
    ' Ethnicity',
    ' Hispanic (People)',
    ' Not Hispanic (People)',
    ' Race and ethnicity',
    ' Hispanic (People).1',
    ' White, non-Hispanic (People)',
    ' Black, non-Hispanic (People)',
    ' Asian or Pacific Islander, non-Hispanic (People)',
    ' By group.1',
    ' Asian, non-Hispanic (People)',
    ' Native Hawaiian or other Pacific Islander, non-Hispanic (People)',
    ' American Indian and Alaska Native, non-Hispanic (People)',
    ' Two or more races, non-Hispanic (People)',
]

# Read the export and narrow it to the columns listed above.
pop_usa = pd.read_csv('./inst/us-pop-data/population_usafacts.csv')[usafacts_columns]

# Skip the first row, then keep only rows that have at least one non-null
# value in the breakdown columns (everything after the first two columns).
rows = pop_usa.iloc[1:, 2:].dropna(how='all')
pop_usa = pop_usa.loc[rows.index]
# Merging
# Map each source's column names onto one shared set of short group names so
# the two tables can be stacked into a single frame.
usafacts_to_group = {
    'Years': 'year',
    'Population (People)': 'total',
    ' White, non-Hispanic (People)': 'white',
    ' Black, non-Hispanic (People)': 'black',
    ' Asian or Pacific Islander (People)': 'asian/pacific',
    ' American Indian or Alaska Native (People)': 'native',
    ' Two or more races (People)': 'other',
    ' Hispanic (People)': 'hispanic',
}
census_to_group = {
    'Year': 'year',
    'Total population': 'total',
    'White, not of Hispanic\norigin': 'white',
    'Black': 'black',
    'American Indian, Eskimo,\nand Aleut': 'native',
    'Asian and Pacific\nIslander': 'asian/pacific',
    'Other race': 'other',
    'Hispanic\norigin (of any race)': 'hispanic',
}
a = pop_usa[list(usafacts_to_group)].rename(columns=usafacts_to_group)
b = racedata[list(census_to_group)].rename(columns=census_to_group)

# Stack both sources and order the rows chronologically.
# NOTE(review): racedata includes 1980; if pop_usa also has a 1980 row, that
# year appears twice in `race` -- confirm this is intended.
race = pd.concat([a, b])
race['year'] = race['year'].astype(int)
race = race.sort_values('year')

# Each group's share of the total population; 'nonwhite%' is the complement
# of the white share.
race['white%'] = race['white'] / race['total']
race['nonwhite%'] = 1 - race['white%']
for group in ('black', 'asian/pacific', 'native', 'hispanic', 'other'):
    race[group + '%'] = race[group] / race['total']
# Visualization
# Line chart of every group's share of the US population over time.
plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.1)
# One line per group; the share columns in `race` are named '<group>%'.
for group in ('white', 'black', 'asian/pacific', 'hispanic', 'native', 'other'):
    plot = sns.lineplot(data=race, x='year', y=group + '%', marker='o', label=group)
plot.set(title='1960-2021 US population by race', xlim=(1960,2022),
         ylabel='proportion')
# Ticks now start at 1960 instead of 1958: matplotlib widens the view limits
# to include every tick, so a tick below 1960 silently overrode the xlim above.
plot.set_xticks(range(1960,2022,2))
plt.legend(title='group')
plt.xticks(rotation=45);
# Line chart of the white share of the US population against its complement.
plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.1)
for group in ('white', 'nonwhite'):
    plot = sns.lineplot(data=race, x='year', y=group + '%', marker='o', label=group)
plot.set(title='1960-2021 US population, % white vs. % nonwhite', xlim=(1960,2022),
         ylabel='proportion')
# Ticks now start at 1960 instead of 1958: matplotlib widens the view limits
# to include every tick, so a tick below 1960 silently overrode the xlim above.
plot.set_xticks(range(1960,2022,2))
plt.legend(title='group')
plt.xticks(rotation=45);
# COMPARISON
# Long-format statistics table; the first CSV column is the index and 'year'
# is read as an integer.
stats = pd.read_csv('./out/masterstats_long.csv', index_col=0, dtype={'year':'int'})

# Rows of the 'white_8_groups' category. The explicit .copy() makes this an
# independent frame, so adding a column below does not raise pandas'
# SettingWithCopyWarning on a filtered slice of `stats`.
stats_nonwhite = stats[stats.category=='white_8_groups'].copy()
# Complement of the 'percent' column for that category.
stats_nonwhite['nonwhite percent'] = 1 - stats_nonwhite.percent

# Nonwhite share among officers (State Dept) against the US population.
plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.25)
plot = sns.lineplot(data=stats_nonwhite, x='year', y='nonwhite percent', marker='o',label='State Dept', color='#007793')
plot = sns.lineplot(data=race, x='year', y='nonwhite%', marker='o',label='USpop', color='#373938')
plot.set(title='1965-2022 % of nonwhite officers vs. US nonwhite population', ylim=(0,.45), xlim=(1964,2023),
         ylabel='proportion')
plt.legend(loc='lower right')
plot.set_xticks(range(1965,2022,2))
plt.xticks(rotation=45);
# Building baseline data on gender
# 1960-2021 World Bank US population gender breakdown
# World Bank percent-female table; the header row is the third line of the CSV.
wb = pd.read_csv('./inst/us-pop-data/worldbank-percentfemale.csv', header=2)

# Take the USA row, drop the first four columns, and turn the remaining
# columns into rows. Only the first 62 of those rows are kept.
usa_row = wb.loc[wb['Country Code'] == 'USA']
female = usa_row.iloc[:, 4:].T.reset_index().iloc[0:62]
female.columns = ['year', '%female']

# Convert the percentage to a 0-1 fraction and derive the male share as its
# complement.
female['%female'] = female['%female'] / 100
female['%male'] = 1 - female['%female']
female['year'] = female['year'].astype(int)

# Long-format version: one row per (year, sex) with the share in 'percent'.
f = female[['year', '%female']].rename(columns={'%female': 'percent'}).assign(cat='female')
m = female[['year', '%male']].rename(columns={'%male': 'percent'}).assign(cat='male')
female_long = pd.concat([f, m], axis=0)
# Female and male shares of the US population, drawn on a fixed 0-1 y-axis.
plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.2)
plot = sns.lineplot(data=female, x='year', y='%female',marker='o', label='female')
sns.lineplot(data=female, x='year', y='%male',marker='o', label='male')
# The plotted columns are 0-1 fractions (the percentage was divided by 100
# when `female` was built), so the axis is labelled 'proportion' like the
# other figures rather than 'percent'.
plot.set(title='1960-2021 US Population by Biological Sex', ylim=(0,1),
         ylabel='proportion')
plt.legend(title='Gender', loc='upper right')
plt.xticks(rotation=60);
# Female and male shares of the US population with the y-axis left to
# autoscale (no ylim is set here).
plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.2)
plot = sns.lineplot(data=female, x='year', y='%female',marker='o', label='female')
sns.lineplot(data=female, x='year', y='%male',marker='o', label='male')
# The plotted columns are 0-1 fractions (the percentage was divided by 100
# when `female` was built), so the axis is labelled 'proportion' like the
# other figures rather than 'percent'.
plot.set(title='1960-2021 US Population by Biological Sex',
         ylabel='proportion')
plt.legend(title='Gender', loc='upper right')
plt.xticks(rotation=60);
# Long-format statistics table; the first CSV column is the index and 'year'
# is read as an integer.
stats = pd.read_csv('./out/masterstats_long.csv', index_col=0, dtype={'year':'int'})

# Attach the display label to the data itself. The previous version relabelled
# the legend positionally with plt.legend(labels=['female', 'male']), which
# only matches the lines if 'known_female' happens to be drawn first and no
# other artist (e.g. an error band) sits between the two lines.
gender_labels = {'known_female': 'female', 'known_male': 'male'}
officer_gender = stats[stats['category'].isin(list(gender_labels))].copy()
officer_gender['gender'] = officer_gender['category'].map(gender_labels)

plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.25)
# hue_order pins which series is drawn (and coloured) first.
plot = sns.lineplot(data=officer_gender, x='year', y='percent',
                    hue='gender', hue_order=['female', 'male'], marker='o')
plot.set(title='1965-2022 Officer Name-Gender Classification, Excluding Unknowns', ylim=(0,1), xlim=(1960,2022),
         ylabel='proportion')
# Reposition and retitle the legend seaborn already built from the hue labels.
sns.move_legend(plot, 'upper right', title='Gender')
plot.set_xticks(range(1960,2022,2))
plt.xticks(rotation=45);
# COMPARISON
# Female share among officers (State Dept) against the female share of the US
# population, for years after 1964.
plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.2)
plot = sns.lineplot(data=stats[(stats['category']=='known_female')],
                    x='year', y='percent', marker='o',label='State Dept', color='#007793')
sns.lineplot(data=female[female.year>1964], x='year', y='%female',marker='o', label='USpop', color='#373938')
plot.set(title='1965-2022 % female of State Dept officers vs. US population', ylim=(0,.6), xlim=(1964,2024),
         ylabel='proportion')
# The legend entries are the two populations being compared, not genders, so
# the title matches the other comparison figure ('population').
plt.legend(title='population', loc='lower right')
plot.set_xticks(range(1965,2024,2))
plt.xticks(rotation=45);
# WITH REGRESSION
# Same comparison as the previous figure, with a linear fit per series
# extended to the right-hand edge of the axes.
plt.figure(figsize=(16,5))
sns.set(style='whitegrid', font_scale=1.2)
# Create the axes and fix the x-range first: with truncate=False the fitted
# lines are drawn out to the axes limits rather than the data limits.
plot = plt.gca()
plot.set(xlim=(1960,2044))
# ci=None is the documented way to switch the confidence band off; ci=False
# is not None, so seaborn still ran the bootstrap and drew a zero-width band.
plot = sns.regplot(data=female[female.year>1964], x='year', y='%female',
                   label='USpop', color='#373938',truncate=False, ci=None,
                   line_kws={'linewidth':1.25},
                   scatter_kws={'s':10})
plot = sns.regplot(data=stats[(stats['category']=='known_female')],
                   x='year', y='percent', marker='o', color='#007793',
                   truncate=False, ci=None, line_kws={'linewidth':1.25})
# Connect the officer points with a line; this call carries the legend label
# for the officer series.
plot = sns.lineplot(data=stats[(stats['category']=='known_female')],
                    x='year', y='percent', marker='o', color='#007793', label='State Dept')
plot.set(title='1965-2022 % female of State Dept officers vs. US population, with prediction curve', ylim=(0,.6),
         ylabel='proportion')
# Vertical marker at 2040.
plt.axvline(2040, color='#B47D54', linewidth=4)
# Place the legend outside the axes, anchored to the top-right corner.
sns.move_legend(plot, "upper left", bbox_to_anchor=(1, 1))
plot.set_xticks(range(1960,2044,2))
plt.xticks(rotation=45);
# Building baseline data on educational attainment
# Census educational attainment - Percent 25+ college graduates, by race and sex
# percent of a group that has reached college degree
# Educational attainment table with a two-row header. The first four columns
# get fixed names; the rest are flattened to '<level0>-<level1>'.
edu = pd.read_csv('./inst/us-pop-data/25+ high school or more 1940-2021.csv', header=[0,1])
edu.columns = ['year','schooling','age','combination'] + list(edu.columns.map('{0[0]}-{0[1]}'.format)[4:])

# World Bank total US population: USA row, first four columns dropped,
# transposed to one row per column label, last row discarded.
pop = pd.read_csv('./inst/us-pop-data/worldbank-uspopulation.csv', header=2)
pop = pop[pop['Country Code'] == 'USA'].iloc[:,4:].T
pop = pop.iloc[:-1,:]
pop = pop.reset_index()
pop.columns = ['year','total']
pop['year'] = pop['year'].astype(int)

# Attach the sex shares by matching on year. The previous version copied the
# columns across by positional index, which is only correct while both World
# Bank tables list exactly the same years in the same order.
pop = pop.merge(female[['year','%female','%male']], on='year', how='left')
pop['female'] = pop['%female'] * pop['total']
pop['male'] = pop['%male'] * pop['total']

# Bring the population totals (overall, by sex, by race) onto the attainment rows.
edu = edu.merge(pop[['year','total','female','male']], on='year', how='left', suffixes=('','pop'))
edu = edu.merge(race[['year','white','black','asian/pacific','native','hispanic']], how='left', on='year', suffixes=('','pop'))

# Attainment rate times population for each group, then each group's share of
# the overall product.
edu['totalgrads'] = edu['total'] * edu['Total-Total']
edu['femalegrads'] = edu['female'] * edu['Total-Female']
edu['malegrads'] = edu['male'] * edu['Total-Male']
edu['%femalegrads'] = edu['femalegrads'] / edu['totalgrads']
edu['%malegrads'] = edu['malegrads'] / edu['totalgrads']

# NOTE(review): 'N' cells are turned into 0 here, which yields a 0% white
# share wherever they occur -- confirm that NaN is not the better choice.
edu['White-Total'] = edu['White-Total'].replace('N',0).astype(float)
edu['whitegrads'] = edu['white'] * edu['White-Total']
edu['%whitegrads'] = edu['whitegrads'] / edu['totalgrads']
edu['%nonwhitegrads'] = 1 - edu['%whitegrads']

# Rows for schooling 'College', age '25+', combination 0. Columns are chosen
# by name (year plus the eight derived columns) instead of by position, so
# adding another column to `edu` cannot silently shift the selection.
college_25plus = (edu['combination'] == 0) & (edu['age'] == '25+') & (edu['schooling'] == 'College')
educollege = edu.loc[college_25plus, ['year', 'totalgrads', 'femalegrads', 'malegrads',
                                      '%femalegrads', '%malegrads',
                                      'whitegrads', '%whitegrads', '%nonwhitegrads']]
# By sex
# Female and male shares of US college graduates, one line each.
plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.25)
# (column in educollege, line colour, legend label)
sex_series = (
    ('%femalegrads', '#C85858', 'female'),
    ('%malegrads', '#007793', 'male'),
)
for column, colour, name in sex_series:
    plot = sns.lineplot(data=educollege, x='year', y=column, marker='o',
                        ci=None, color=colour, label=name)
plot.set_title('US population percentage of college graduates by sex')
plot.set_xlim(1960, 2022)
plot.set_ylabel('proportion')
plot.set_xticks(range(1960,2022,3))
plt.xticks(rotation=45);
# Reload the long-format statistics table; the first CSV column is the index
# and 'year' is read as an integer.
stats = pd.read_csv('./out/masterstats_long.csv', index_col=0, dtype={'year':'int'})

# Comparison: female share among officers (State Dept) against the female
# share of US college graduates.
plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.2)
known_female = stats[stats['category'] == 'known_female']
plot = sns.lineplot(data=known_female, x='year', y='percent',
                    marker='o', label='State Dept', color='#007793')
plot = sns.lineplot(data=educollege, x='year', y='%femalegrads',
                    marker='o', label='US grads', ci=None, color='#373938')
plot.set_title('1965-2022 % of female State Dept officers vs. US college graduates')
plot.set_xlim(1964, 2023)
plot.set_ylim(0, .6)
plot.set_ylabel('proportion')
plot.set_xticks(range(1965,2022,2))
plt.xticks(rotation=45);
# By race
# White and nonwhite shares of US college graduates, one line each.
plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.25)
# (column in educollege, line colour, legend label)
race_series = (
    ('%whitegrads', '#B47D54', 'white'),
    ('%nonwhitegrads', '#007793', 'nonwhite'),
)
for column, colour, name in race_series:
    plot = sns.lineplot(data=educollege, x='year', y=column, marker='o',
                        ci=None, color=colour, label=name)
plot.set_title('US population percentage of white vs. nonwhite college graduates')
plot.set_xlim(1968, 2022)
plot.set_ylabel('proportion')
plot.set_xticks(range(1968,2022,3))
plt.xticks(rotation=45);
# COMPARISON
# Nonwhite share among officers (State Dept) against the nonwhite share of US
# college graduates.
plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.25)
plot = sns.lineplot(data=stats_nonwhite, x='year', y='nonwhite percent',
                    marker='o', label='State Dept', color='#007793')
plot = sns.lineplot(data=educollege, x='year', y='%nonwhitegrads',
                    marker='o', ci=None, color='#373938', label='US grads')
plot.set_title('1965-2022 % of nonwhite State Dept officers vs. US college graduates')
plot.set_ylim(0, .45)
plot.set_xlim(1964, 2023)
plot.set_ylabel('proportion')
plt.legend(title='population', loc='lower right')
plot.set_xticks(range(1964,2024,2))
plt.xticks(rotation=45);