Skip to content

Visualization Code

import pandas as pd
import nameparser
from nameparser import HumanName
import re
import gender_guesser
import numpy as np
import math
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
# Load the classified officer roster; `year` is kept as a string column.
data = pd.read_csv(
    './FSODiversity/out/master_classified.csv',
    dtype={'year': 'str'},
)

Preview of master classified dataset:

year mo rank name first_clean middle last suffix firstlast gender_guess eth_guess 8_nationality_groups eth_prob 8_nationality_groups ethnicseer
1965 Feb AGRICULTURAL ATTACHE Marshal D. Fox Marshal D. Fox nan Marshal Fox unknown southAsian 32.525 eng
1965 Feb DEPUTY CHIEF OF MISSION C. Robert Moore Robert Robert Moore nan Robert Moore male celtic 77.223 eng
1965 Feb ECONOMIC SECTION James H. Cheatham James H. Cheatham nan James Cheatham male celtic 87.306 eng
1965 Feb CONSUL GENERAL W. Garland Richardson Garland Garland Richardson nan Garland Richardson male celtic 56.789 eng
1965 Feb CONSULATE GENERAL Pamrmaribo Pamrmaribo nan nan nan Pamrmaribo unknown african 64.201 ita

Grouping data

Gender

# Build `gender`: a long table indexed by year with columns
# category / count / percent, covering every gender_guess value plus the
# derived categories 'total', 'known', 'known_male' and 'known_female'.

# Fold the "mostly_*" guesses into their base category before counting.
data['gender_guess'] = data['gender_guess'].replace(
    {'mostly_male': 'male', 'mostly_female': 'female'}
)

# Names per (year, gender_guess), indexed by year.
counts = (
    data.groupby(['year', 'gender_guess'])
    .count()[['name']]
    .reset_index()
    .set_index('year')
)

# Names per year, tagged as the 'total' category.
totals = data.groupby(['year']).count()[['name']]
totals['gender_guess'] = 'total'

# 'known' = total minus 'unknown'. Reindex the unknown counts onto every
# year first: a plain subtraction aligns on the index and would yield NaN
# (instead of the full total) for any year that has no 'unknown' rows.
unknown = (
    counts.loc[counts['gender_guess'] == 'unknown', 'name']
    .reindex(totals.index, fill_value=0)
)
known = (totals['name'] - unknown).to_frame()
known['gender_guess'] = 'known'

gender = pd.concat([counts, totals, known]).sort_index()
# Rename by label rather than by position so the result does not depend on
# the column order produced by concat.
gender = gender.rename(columns={'gender_guess': 'category', 'name': 'count'})
# Share of all names in that year.
gender['percent'] = round(gender['count'] / gender.join(totals[['name']])['name'], 3)

# Share of male / female among names whose gender is known.
known_percent = gender.join(
    gender.loc[gender['category'] == 'known', ['count']], rsuffix='.known'
)
known_percent['percent'] = round(
    known_percent['count'] / known_percent['count.known'], 3
)
# .copy() so the relabelling below writes to an independent frame rather
# than a slice of the joined table.
known_percent = known_percent[known_percent['category'].isin(['male', 'female'])].copy()
known_percent['category'] = known_percent['category'].map(
    {'male': 'known_male', 'female': 'known_female'}
)
known_percent = known_percent[['category', 'count', 'percent']]

gender = pd.concat([gender, known_percent]).sort_index()

Race (name-to-ethnicity)

# Keep only rows whose name-to-ethnicity probability exceeds 50.
confident_rows = data['eth_prob 8_nationality_groups'] > 50
sub = data[confident_rows]

# Names per (year, ethnicity guess), indexed by year.
group_eth = (
    sub.groupby(['year', 'eth_guess 8_nationality_groups'])
    .count()[['name']]
    .reset_index()
    .set_index('year')
)

# Names per year within the filtered subset, tagged as its own category.
totals_eth = sub.groupby(['year']).count()[['name']]
totals_eth['eth_guess 8_nationality_groups'] = 'total_eth>50'

eth = (
    pd.concat([group_eth, totals_eth])
    .sort_index()
    .rename(columns={"eth_guess 8_nationality_groups": 'category', 'name': 'count'})
)
# Each count as a share of that year's filtered total.
eth_year_totals = eth.join(totals_eth[['name']])['name']
eth['percent'] = (eth['count'] / eth_year_totals).round(3)

Race (ethnicseer)

# Names per (year, ethnicseer code), indexed by year.
group_ethnicseer = (
    data.groupby(['year', 'ethnicseer'])
    .count()[['name']]
    .reset_index()
    .set_index('year')
)

# Names per year across the whole dataset (the denominator for percent).
totals_ethnicseer = data.groupby(['year']).count()[['name']]

group_ethnicseer = group_ethnicseer.rename(
    columns={"ethnicseer": 'category', 'name': 'count'}
)
ethnicseer_year_totals = group_ethnicseer.join(totals_ethnicseer[['name']])['name']
group_ethnicseer['percent'] = (
    group_ethnicseer['count'] / ethnicseer_year_totals
).round(3)

Concatenate grouped data

# Combine the gender, name-to-ethnicity and ethnicseer tables into one long
# table indexed by year (columns: category, count, percent).
stats = pd.concat([gender, eth, group_ethnicseer]).sort_index()

# Reshape to wide form: one column per category, rows keyed by
# (metric, year) where metric is 'count' or 'percent'. Year/category
# combinations that do not occur are filled with 0.
transpose = stats.reset_index().set_index(['year','category']).T.stack(0).sort_index(level=1).fillna(0)
# Derive aggregate groups by summing category columns. The sums apply to
# both the 'count' and the 'percent' rows, so an aggregate percent is the
# sum of its members' already-rounded percents.
# NOTE(review): each referenced column must exist in the data, otherwise the
# lookup raises KeyError — confirm all codes are always present.
transpose['white_ethnicseer'] = (transpose['eng']+transpose['ger']+
                                 transpose['frn']+transpose['ita']+transpose['rus'])
transpose['asian_ethnicseer'] = (transpose['chi']+transpose['jap']+
                                 transpose['kor']+transpose['ind']+transpose['vie'])
transpose['white_8_groups'] = (transpose['celtic']+transpose['european']+
                               transpose['nordic'])
transpose['asian_8_groups'] = (transpose['southAsian']+transpose['eastAsian'])

# Back to long form: one row per (category, year) with count and percent
# columns, ordered by year.
stats = transpose.T.stack(1).reset_index().sort_values('year').reset_index(drop=True)
stats = stats[['year','category','count','percent']]
# Bare expression: presumably displays the table when run as a notebook cell.
stats
# Persist the long-format statistics (the default integer index is written too).
stats.to_csv('./FSODiversity/out/masterstats_long.csv')

Preview of master statistics dataset:

year category count percent
1965 african 55 0.089
1965 known_female 23 0.035
1965 male 627 0.821
1965 white_8_groups 519 0.839
1965 celtic 395 0.639
1965 known 651 0.852
1965 mea 8 0.01
1965 asian_ethnicseer 24 0.032
1965 chi 2 0.003
1965 jap 5 0.007

Visualization

# Reload the saved long-format statistics; the first CSV column is the
# index and `year` is read as a string.
stats = pd.read_csv(
    './FSODiversity/out/masterstats_long.csv',
    index_col=0,
    dtype={'year': 'str'},
)

Gender

# visualize gender
# Line chart of the yearly male / female share among officers whose
# name-based gender guess is known.
gender_labels = {'known_female': 'female', 'known_male': 'male'}
gender_plot_data = stats[stats['category'].isin(list(gender_labels))].copy()
# Relabel in the data instead of overriding the legend text positionally,
# so a line's label and colour cannot be swapped by the row order of `stats`.
gender_plot_data['category'] = gender_plot_data['category'].map(gender_labels)
# `stats` is loaded with year as str; the tick positions below are numeric
# years, so plot against an integer year axis.
gender_plot_data['year'] = gender_plot_data['year'].astype(int)

plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.25)
plot = sns.lineplot(data=gender_plot_data,
                    x='year', y='percent', hue='category',
                    hue_order=['female', 'male'],  # pins palette order: female, male
                    marker='o', palette=['#C85858','#007793'])
plot.set(title='1965-2022 Officer Name-Gender Classification, Excluding Unknowns', ylim=(0,1),
         ylabel='proportion')
plt.legend(title='Gender', loc='upper right')
plot.set_xticks(range(1964,2024,2))
plt.xticks(rotation=45);

png

Race (name-to-ethnicity)

# name to ethnicity visualize
# Line chart of the yearly share of each ethnicity group from the
# name-to-ethnicity classifier (rows computed on the >50 probability subset).
eth_labels = {'african': 'african', 'white_8_groups': 'white', 'muslim': 'muslim',
              'hispanic': 'hispanic', 'asian_8_groups': 'asian'}
eth_plot_data = stats[stats['category'].isin(list(eth_labels))].copy()
# Relabel in the data instead of overriding the legend text positionally,
# so each line's label cannot be mismatched by the row order of `stats`.
eth_plot_data['category'] = eth_plot_data['category'].map(eth_labels)
# `stats` is loaded with year as str; the tick positions below are numeric
# years, so plot against an integer year axis.
eth_plot_data['year'] = eth_plot_data['year'].astype(int)

plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.25)
plot2 = sns.lineplot(data=eth_plot_data,
                     x='year', y='percent', hue='category',
                     # Same order as the legend labels used previously.
                     hue_order=['african', 'white', 'muslim', 'hispanic', 'asian'],
                     marker='o')
plot2.set(title='1965-2022 Officer Name-Ethnicity Classification, name-to-ethnicity classifier, >50% Certainty', ylim=(0,1),
         ylabel='proportion')
plt.legend(title='Ethnicity group')
plot2.set_xticks(range(1964,2024,2))
plt.xticks(rotation=45);

png

Race (ethnicseer)

# visualize ethnicseer
# Line chart of the yearly share of each aggregated ethnicseer group.
ethnicseer_labels = {'mea': 'middle east', 'asian_ethnicseer': 'asian',
                     'white_ethnicseer': 'white', 'spa': 'spanish/hispanic'}
ethnicseer_plot_data = stats[stats['category'].isin(list(ethnicseer_labels))].copy()
# Relabel in the data instead of overriding the legend text positionally,
# so each line's label cannot be mismatched by the row order of `stats`.
ethnicseer_plot_data['category'] = ethnicseer_plot_data['category'].map(ethnicseer_labels)
# `stats` is loaded with year as str; the tick positions below are numeric
# years, so plot against an integer year axis.
ethnicseer_plot_data['year'] = ethnicseer_plot_data['year'].astype(int)

plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.25)
plot2 = sns.lineplot(data=ethnicseer_plot_data,
                     x='year', y='percent', hue='category',
                     # Same order as the legend labels used previously.
                     hue_order=['middle east', 'asian', 'white', 'spanish/hispanic'],
                     marker='o')
plot2.set(title='1965-2022 Officer Name-Ethnicity Classification, ethnicseer classifier', ylim=(0,1),
         ylabel='proportion')
plt.legend(title='Ethnicity group')
plot2.set_xticks(range(1964,2024,2))
plt.xticks(rotation=45);

png

White / nonwhite

# Reload the statistics with integer years for the white / nonwhite chart.
# NOTE(review): this path differs from the one the statistics were written
# to earlier ('./FSODiversity/out/masterstats_long.csv') — confirm it is the
# intended file.
stats = pd.read_csv('./FOfficerSODiversity/Ellice/out/masterstats_long.csv', index_col=0, dtype={'year':'int'})
# Despite the name, these are the 'white_8_groups' rows; the nonwhite share
# is derived from them as the complement. .copy() makes the filtered frame
# independent so adding a column does not raise SettingWithCopyWarning.
stats_nonwhite = stats[stats.category=='white_8_groups'].copy()
stats_nonwhite['nonwhite percent'] = 1 - stats_nonwhite.percent
/var/folders/js/7x9j4wbs6nj983t_k4_l9x5r0000gn/T/ipykernel_16229/3920309226.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stats_nonwhite['nonwhite percent'] = 1 - stats_nonwhite.percent
# COMPARISON of white/nonwhite (name to ethnicity classifier)
# Draw both series on one figure: the nonwhite share first, then the white
# share, each with its own label and colour.
plt.figure(figsize=(16, 6))
sns.set(style='whitegrid', font_scale=1.25)

series_specs = (
    ('nonwhite percent', 'nonwhite', '#007793'),
    ('percent', 'white', '#B47D54'),
)
for y_column, series_label, series_color in series_specs:
    plot = sns.lineplot(data=stats_nonwhite, x='year', y=y_column,
                        marker='o', label=series_label, color=series_color)

plot.set(
    title='1965-2022 % of white vs. nonwhite officers',
    ylim=(0, 1),
    xlim=(1964, 2023),
    ylabel='proportion',
)
plt.legend(title='race', loc='lower right')
# Tick every second year, then tilt the labels.
plot.set_xticks(range(1965, 2022, 2))
plt.xticks(rotation=45);

png