Baseline Comparison Code

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Building baseline data on race

1950-1990 US Census population breakdown by race

# Load the Census table, keep its first 28 rows, and blank out the '(NA)' and
# '-' placeholder cells so they turn into NaN in the numeric conversion below.
popdiv = pd.read_csv('./inst/us-pop-data/popdiv-census.csv').iloc[0:28]
popdiv = popdiv.replace('(NA)','').replace('-','')

# Count columns that arrive as text; the embedded newlines are part of the header names.
count_cols = ['Total  population', 'White', 'Black', 'American Indian, Eskimo,\nand Aleut',
              'Asian and Pacific\nIslander', 'Other race', 'Hispanic\norigin (of any race)',
              'White, not of Hispanic\norigin']

# Remove the spaces inside each value, map empty/whitespace-only cells to NaN, cast to float.
popdiv[count_cols] = (
    popdiv[count_cols]
    .apply(lambda x: x.str.replace(' ', ''))
    .replace(r'^\s*$', np.nan, regex=True)
    .astype(float)
)

# .copy() makes racedata an independent frame, so the assignment below writes to
# racedata itself rather than to a slice of popdiv (avoids SettingWithCopyWarning).
racedata = popdiv[(popdiv.Year=='1980') | (popdiv.Year=='1970') | (popdiv.Year=='1960')].copy()
# Hard-coded values for the non-Hispanic white column, assigned by row position.
# NOTE(review): this requires exactly three matching rows and relies on the order
# in which the CSV lists them — confirm which value belongs to which year.
racedata['White, not of Hispanic\norigin'] = [180256366.0,177748975.0,158831732.0]

1980-2021 USA Facts population breakdown by race and ethnicity

# Columns kept from the USA Facts export. The leading spaces belong to the
# header labels themselves, so they have to be reproduced exactly.
usafacts_cols = [
    'Years', 'Population (People)',
    '    Gender',
    '        Male (People)',
    '        Female (People)',
    '    Race',
    '        White (People)',
    '        Black (People)',
    '        Asian or Pacific Islander (People)',
    '            By group',
    '                Asian (People)',
    '                Native Hawaiian or other Pacific Islander (People)',
    '        American Indian or Alaska Native (People)',
    '        Two or more races (People)',
    '    Ethnicity',
    '        Hispanic (People)',
    '        Not Hispanic (People)',
    '    Race and ethnicity',
    '        Hispanic (People).1',
    '        White, non-Hispanic (People)',
    '        Black, non-Hispanic (People)',
    '        Asian or Pacific Islander, non-Hispanic (People)',
    '            By group.1',
    '                Asian, non-Hispanic (People)',
    '                Native Hawaiian or other Pacific Islander, non-Hispanic (People)',
    '        American Indian and Alaska Native, non-Hispanic (People)',
    '        Two or more races, non-Hispanic (People)',
]
pop_usa = pd.read_csv('./inst/us-pop-data/population_usafacts.csv')
pop_usa = pop_usa[usafacts_cols]

# Skipping the first row and the first two columns, find the rows that hold at
# least one non-missing value, then keep exactly those rows of the full table.
rows = pop_usa.iloc[1:, 2:].dropna(how='all', axis=0)
pop_usa = pop_usa.loc[rows.index]

Merging

# Source column -> common column name, for the USA Facts table.
usafacts_to_race = {
    'Years': 'year',
    'Population (People)': 'total',
    '        White, non-Hispanic (People)': 'white',
    '        Black, non-Hispanic (People)': 'black',
    '        Asian or Pacific Islander (People)': 'asian/pacific',
    '        American Indian or Alaska Native (People)': 'native',
    '        Two or more races (People)': 'other',
    '        Hispanic (People)': 'hispanic',
}
a = pop_usa[list(usafacts_to_race)]
a.columns = list(usafacts_to_race.values())

# Source column -> common column name, for the Census table.
census_to_race = {
    'Year': 'year',
    'Total  population': 'total',
    'White, not of Hispanic\norigin': 'white',
    'Black': 'black',
    'American Indian, Eskimo,\nand Aleut': 'native',
    'Asian and Pacific\nIslander': 'asian/pacific',
    'Other race': 'other',
    'Hispanic\norigin (of any race)': 'hispanic',
}
b = racedata[list(census_to_race)]
b.columns = list(census_to_race.values())

# Stack both sources into one table ordered by integer year.
race = pd.concat([a, b])
race['year'] = race['year'].astype(int)
race = race.sort_values('year')

# Express every group as a share of the total; nonwhite is the complement of white.
race['white%'] = race['white'] / race['total']
race['nonwhite%'] = 1 - race['white%']
for group in ['black', 'asian/pacific', 'native', 'hispanic', 'other']:
    race[group + '%'] = race[group] / race['total']

Visualization

# One line per group on shared axes, each drawn from its '<group>%' share column.
plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.1)
for group in ['white', 'black', 'asian/pacific', 'hispanic', 'native', 'other']:
    plot = sns.lineplot(data=race, x='year', y=group + '%', marker='o', label=group)
plot.set(
    title='1960-2021 US population by race',
    xlim=(1960,2022),
    ylabel='proportion',
)
plot.set_xticks(range(1958,2022,2))
plt.legend(title='group')
plt.xticks(rotation=45);

png

# White share against its complement (nonwhite) on the same axes.
plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.1)
for group in ['white', 'nonwhite']:
    plot = sns.lineplot(data=race, x='year', y=group + '%', marker='o', label=group)
plot.set(
    title='1960-2021 US population, % white vs. % nonwhite',
    xlim=(1960,2022),
    ylabel='proportion',
)
plot.set_xticks(range(1958,2022,2))
plt.legend(title='group')
plt.xticks(rotation=45);

png

# COMPARISON
# Officer statistics in long format; only the 'white_8_groups' rows are used here.
stats = pd.read_csv('./out/masterstats_long.csv', index_col=0, dtype={'year':'int'})
# .copy() detaches the filtered rows from `stats`, so adding a column below
# modifies stats_nonwhite itself and does not raise SettingWithCopyWarning.
stats_nonwhite = stats[stats.category=='white_8_groups'].copy()
# Complement of `percent`.
# NOTE(review): presumably `percent` is the white share as a 0-1 proportion — confirm.
stats_nonwhite['nonwhite percent'] = 1 - stats_nonwhite.percent

# Officer series and US population series on the same axes.
plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.25)
plot = sns.lineplot(data=stats_nonwhite, x='year', y='nonwhite percent', marker='o',label='State Dept', color='#007793')
plot = sns.lineplot(data=race, x='year', y='nonwhite%', marker='o',label='USpop', color='#373938')
plot.set(title='1965-2022 % of nonwhite officers vs. US nonwhite population', ylim=(0,.45), xlim=(1964,2023),
         ylabel='proportion')
plt.legend(loc='lower right')
plot.set_xticks(range(1965,2022,2))
plt.xticks(rotation=45);

png

Building baseline data on gender

1960-2021 World Bank US population gender breakdown

# The header row of this World Bank export is the third line of the file.
wb = pd.read_csv('./inst/us-pop-data/worldbank-percentfemale.csv', header=2)

# Take the USA row, drop the first four columns, and turn the remaining
# columns into rows; only the first 62 of those rows are kept.
usa_only = wb[wb['Country Code'] == 'USA']
female = usa_only.iloc[:, 4:].T
female = female.reset_index().iloc[0:62]
female.columns = ['year','%female']

# Rescale from percent to a 0-1 proportion and derive the complementary share.
female['%female'] = female['%female'] / 100
female['%male'] = 1 - female['%female']
female['year'] = female['year'].astype(int)

# Long format: one row per (year, cat) with the share stored in 'percent'.
f = female[['year','%female']].rename(columns={'%female':'percent'})
f['cat'] = 'female'
m = female[['year','%male']].rename(columns={'%male':'percent'})
m['cat'] = 'male'
female_long = pd.concat([f, m], axis=0)

# Female and male shares on a fixed 0-1 y axis.
plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.2)
for sex in ['female', 'male']:
    plot = sns.lineplot(data=female, x='year', y='%' + sex, marker='o', label=sex)
plot.set(
    title='1960-2021 US Population by Biological Sex',
    ylim=(0,1),
    ylabel='percent',
)
plt.legend(title='Gender', loc='upper right')
plt.xticks(rotation=60);

png

# Same two series again, this time leaving the y-axis range to matplotlib.
plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.2)
for sex in ['female', 'male']:
    plot = sns.lineplot(data=female, x='year', y='%' + sex, marker='o', label=sex)
plot.set(
    title='1960-2021 US Population by Biological Sex',
    ylabel='percent',
)
plt.legend(title='Gender', loc='upper right')
plt.xticks(rotation=60);

png

# Officer statistics, restricted below to the two name-gender categories.
stats = pd.read_csv('./out/masterstats_long.csv', index_col=0, dtype={'year':'int'})
plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.25)
# hue_order pins the series order to female, male so it always matches the
# hard-coded legend labels below, whatever order the categories have in the CSV.
plot = sns.lineplot(data=stats[(stats['category']=='known_male')|(stats['category']=='known_female')], 
                    x='year', y='percent', hue='category',
                    hue_order=['known_female','known_male'], marker='o')
plot.set(title='1965-2022 Officer Name-Gender Classification, Excluding Unknowns', ylim=(0,1), xlim=(1960,2022),
         ylabel='proportion')
plt.legend(title='Gender', loc='upper right',labels=['female','male'])
plot.set_xticks(range(1960,2022,2))
plt.xticks(rotation=45);

png

# COMPARISON
# The two series being compared: officers in the 'known_female' category and
# the US female share for years after 1964.
officers_female = stats[stats['category']=='known_female']
uspop_after_1964 = female[female.year>1964]

plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.2)
plot = sns.lineplot(data=officers_female, x='year', y='percent',
                    marker='o', label='State Dept', color='#007793')
sns.lineplot(data=uspop_after_1964, x='year', y='%female',
             marker='o', label='USpop', color='#373938')
plot.set(
    title='1965-2022 % female of State Dept officers vs. US population',
    ylim=(0,.6),
    xlim=(1964,2024),
    ylabel='proportion',
)
plt.legend(title='Gender', loc='lower right')
plot.set_xticks(range(1965,2024,2))
plt.xticks(rotation=45);

png

# WITH REGRESSION
plt.figure(figsize=(16,5))
sns.set(style='whitegrid', font_scale=1.2)
# Create empty axes and fix the x range before fitting: with truncate=False the
# regression lines are drawn out to the axis limits (per the seaborn regplot
# docs), i.e. beyond the observed years.
plot = sns.scatterplot()
plot.set(xlim=(1960,2044))

# ci=None is the documented way to switch off the confidence band. The previous
# ci=False is not None, so seaborn still ran the bootstrap and drew a band.
plot = sns.regplot(data=female[female.year>1964], x='year', y='%female',
                    label='USpop', color='#373938',truncate=False, ci=None, 
                    line_kws={'linewidth':1.25},
                    scatter_kws={'s':10})

plot = sns.regplot(data=stats[(stats['category']=='known_female')], 
                    x='year', y='percent', marker='o', color='#007793',
                   truncate=False, ci=None, line_kws={'linewidth':1.25})
# Connect the officer observations and give that series its legend entry.
plot = sns.lineplot(data=stats[(stats['category']=='known_female')], 
                    x='year', y='percent', marker='o', color='#007793', label='State Dept')
plot.set(title='1965-2022 % female of State Dept officers vs. US population, with prediction curve', ylim=(0,.6),
         ylabel='proportion')
# Vertical reference line at 2040.
plt.axvline(2040, color='#B47D54', linewidth=4)
# plt.legend(loc='lower right')
# Place the legend outside the axes, to the right of the plot area.
sns.move_legend(plot, "upper left", bbox_to_anchor=(1, 1))
plot.set_xticks(range(1960,2044,2))
plt.xticks(rotation=45);

png

Building baseline data on educational attainment

Census educational attainment: percent of people aged 25 and over who are college graduates, by race and sex

# Attainment table with a two-row header. The first four columns get fixed
# names; every remaining column is named '<level 0>-<level 1>' (e.g. 'Total-Female').
# NOTE(review): the file name says "high school or more" while only rows with
# schooling == 'College' are kept at the end — confirm the file holds both.
edu = pd.read_csv('./inst/us-pop-data/25+ high school or more 1940-2021.csv', header=[0,1])
edu.columns = ['year','schooling','age','combination'] + list(edu.columns.map('{0[0]}-{0[1]}'.format)[4:])

# Total US population per year: USA row, first four columns dropped,
# transposed to one row per year, last row discarded.
pop = pd.read_csv('./inst/us-pop-data/worldbank-uspopulation.csv',  header=2)
pop = pop[pop['Country Code'] == 'USA'].iloc[:,4:].T
pop = pop.iloc[:-1,:]
pop = pop.reset_index()
pop.columns = ['year','total']
pop.year = pop.year.astype(int)
# Attach the sex shares by matching on year rather than by row position, so the
# values stay correct even if the two World Bank tables cover different years.
pop = pop.merge(female[['year','%female','%male']], on='year', how='left')
pop['female'] = pop['%female'] * pop.total
pop['male'] = pop['%male'] * pop.total

# Bring population counts (overall, by sex, by race) alongside the attainment rates.
edu = edu.merge(pop[['year','total','female','male']], on='year',how='left',suffixes=['','pop'])
edu = edu.merge(race[['year','white','black','asian/pacific','native','hispanic']],how='left', on='year',suffixes=['','pop'])

# Rate times population gives a graduate count per group.
edu['totalgrads'] = edu.total * edu['Total-Total']
edu['femalegrads'] = edu.female * edu['Total-Female']
edu['malegrads'] = edu.male * edu['Total-Male']

# Each sex's share of all graduates.
edu['%femalegrads'] = edu['femalegrads'] / edu['totalgrads']
edu['%malegrads'] = edu['malegrads'] / edu['totalgrads']

# edu['white'] = edu['white'].fillna(0)
# NOTE(review): 'N' cells are turned into 0, which makes %whitegrads 0 and
# %nonwhitegrads 1 for those rows — confirm 'N' should not be NaN instead.
edu['White-Total'] = edu['White-Total'].replace('N',0).astype(float)
edu['whitegrads'] = edu.white * edu['White-Total']
edu['%whitegrads'] = edu['whitegrads'] / edu['totalgrads']
edu['%nonwhitegrads'] = 1 - edu['%whitegrads']

# Keep the college rows for ages 25+ with combination == 0, and only the year
# plus the eight derived columns (named explicitly instead of by position).
educollege = edu.loc[
    (edu.combination==0)&(edu.age=='25+')&(edu.schooling=='College'),
    ['year','totalgrads','femalegrads','malegrads','%femalegrads','%malegrads',
     'whitegrads','%whitegrads','%nonwhitegrads'],
]

by sex

# Share of college graduates by sex: (column, line colour, legend label) per series.
plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.25)
for grad_col, line_color, sex in [('%femalegrads', '#C85858', 'female'),
                                  ('%malegrads', '#007793', 'male')]:
    plot = sns.lineplot(data=educollege, x='year', y=grad_col, marker='o',
                        ci=None, color=line_color, label=sex)
plot.set(
    title='US population percentage of college graduates by sex',
    xlim=(1960,2022),
    ylabel='proportion',
)
plot.set_xticks(range(1960,2022,3))
plt.xticks(rotation=45);

png

stats = pd.read_csv('./out/masterstats_long.csv', index_col=0, dtype={'year':'int'})
# COMPARISON
# Officers in the 'known_female' category against the female share of college graduates.
officers_female = stats[stats['category']=='known_female']

plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.2)
plot = sns.lineplot(data=officers_female, x='year', y='percent',
                    marker='o', label='State Dept', color='#007793')
plot = sns.lineplot(data=educollege, x='year', y='%femalegrads',
                    marker='o', label='US grads', ci=None, color='#373938')
plot.set(
    title='1965-2022 % of female State Dept officers vs. US college graduates',
    xlim=(1964,2023),
    ylim=(0,.6),
    ylabel='proportion',
)
plot.set_xticks(range(1965,2022,2))
plt.xticks(rotation=45);

png

by race

# White vs. nonwhite share of college graduates: (column, line colour, legend label) per series.
plt.figure(figsize=(16,4))
sns.set(style='whitegrid', font_scale=1.25)
for grad_col, line_color, group in [('%whitegrads', '#B47D54', 'white'),
                                    ('%nonwhitegrads', '#007793', 'nonwhite')]:
    plot = sns.lineplot(data=educollege, x='year', y=grad_col, marker='o',
                        ci=None, color=line_color, label=group)
plot.set(
    title='US population percentage of white vs. nonwhite college graduates',
    xlim=(1968,2022),
    ylabel='proportion',
)
plot.set_xticks(range(1968,2022,3))
plt.xticks(rotation=45);

png

# COMPARISON
# Nonwhite officer series against the nonwhite share of college graduates.
plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.25)
plot = sns.lineplot(data=stats_nonwhite, x='year', y='nonwhite percent',
                    marker='o', label='State Dept', color='#007793')
plot = sns.lineplot(data=educollege, x='year', y='%nonwhitegrads',
                    marker='o', ci=None, color='#373938', label='US grads')
plot.set(
    title='1965-2022 % of nonwhite State Dept officers vs. US college graduates',
    ylim=(0,.45),
    xlim=(1964,2023),
    ylabel='proportion',
)
plt.legend(title='population', loc='lower right')
plot.set_xticks(range(1964,2024,2))
plt.xticks(rotation=45);

png