Visualization Code
import pandas as pd
import nameparser
from nameparser import HumanName
import re
import gender_guesser
import numpy as np
import math
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
# Load the classified officer roster; `year` is kept as text so it is not
# parsed into a numeric dtype.
data = pd.read_csv(
    './FSODiversity/out/master_classified.csv',
    dtype={'year': 'str'},
)
Preview of master classified dataset:
year | mo | rank | name | first_clean | middle | last | suffix | firstlast | gender_guess | eth_guess 8_nationality_groups | eth_prob 8_nationality_groups | ethnicseer |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1965 | Feb | AGRICULTURAL ATTACHE | Marshal D. Fox | Marshal | D. | Fox | nan | Marshal Fox | unknown | southAsian | 32.525 | eng |
1965 | Feb | DEPUTY CHIEF OF MISSION | C. Robert Moore | Robert | Robert | Moore | nan | Robert Moore | male | celtic | 77.223 | eng |
1965 | Feb | ECONOMIC SECTION | James H. Cheatham | James | H. | Cheatham | nan | James Cheatham | male | celtic | 87.306 | eng |
1965 | Feb | CONSUL GENERAL | W. Garland Richardson | Garland | Garland | Richardson | nan | Garland Richardson | male | celtic | 56.789 | eng |
1965 | Feb | CONSULATE GENERAL | Pamrmaribo | Pamrmaribo | nan | nan | nan | Pamrmaribo | unknown | african | 64.201 | ita |
Grouping data
Gender
# Build the per-year gender summary `gender` (columns: category, count, percent).
# Categories are the raw gender guesses plus three derived ones:
#   total                     - all rows for the year
#   known                     - total minus the 'unknown' guesses
#   known_male / known_female - male/female as a share of `known`

# Fold the "mostly_*" guesses into the plain male/female categories.
data.loc[data['gender_guess']=='mostly_male','gender_guess']='male'
data.loc[data['gender_guess']=='mostly_female','gender_guess']='female'

# Rows per (year, gender guess) and rows per year, both indexed by year.
counts = data.groupby(['year', 'gender_guess']).count()[['name']].reset_index().set_index('year')
totals = data.groupby(['year']).count()[['name']]
totals['gender_guess'] = 'total'

# A year without any 'unknown' guess has no row to subtract; align the unknown
# counts to every year and treat the missing ones as 0 so `known` is never NaN.
unknown = counts.loc[counts['gender_guess']=='unknown', 'name'].reindex(totals.index, fill_value=0)
known = (totals['name'] - unknown).to_frame()
known['gender_guess'] = 'known'

gender = pd.concat([counts, totals, known]).sort_index()
# Rename by name rather than by position so the result does not depend on the
# column order produced by concat.
gender = gender.rename(columns={'gender_guess': 'category', 'name': 'count'})
# Share of all officers listed for the year.
gender['percent'] = round(gender['count'] / gender.join(totals[['name']])['name'],3)

# Re-express male/female as a share of the officers whose gender is known.
known_counts = gender.loc[gender['category']=='known',['count']]
known_percent = gender.join(known_counts, rsuffix='.known')
known_percent['percent'] = round(known_percent['count'] / known_percent['count.known'],3)
# Work on an explicit copy: assigning into a filtered slice can trigger
# SettingWithCopyWarning.
known_percent = known_percent[known_percent['category'].isin(['male', 'female'])].copy()
known_percent['category'] = 'known_' + known_percent['category']
known_percent = known_percent[['category','count','percent']]
gender = pd.concat([gender, known_percent]).sort_index()
Race (name-to-ethnicity)
# Per-year summary of the name-to-ethnicity guesses, restricted to rows whose
# guess probability is above 50. Result `eth` has columns category/count/percent,
# where percent is relative to the retained rows of that year.
sub = data.loc[data['eth_prob 8_nationality_groups'].gt(50)]

# Rows per (year, ethnicity guess), indexed by year.
group_eth = (
    sub.groupby(['year', 'eth_guess 8_nationality_groups'])[['name']]
    .count()
    .reset_index()
    .set_index('year')
)

# Rows per year among the retained rows, tagged as their own category.
totals_eth = sub.groupby(['year'])[['name']].count()
totals_eth['eth_guess 8_nationality_groups'] = 'total_eth>50'

eth = (
    pd.concat([group_eth, totals_eth])
    .sort_index()
    .rename(columns={"eth_guess 8_nationality_groups": 'category', 'name': 'count'})
)

# Attach each row's yearly total and divide.
eth_year_total = eth.join(totals_eth[['name']])['name']
eth['percent'] = (eth['count'] / eth_year_total).round(3)
Race (ethnicseer)
# Per-year summary of the ethnicseer guesses over all rows. Result
# `group_ethnicseer` has columns category/count/percent, where percent is
# relative to all rows of that year.
group_ethnicseer = (
    data.groupby(['year', 'ethnicseer'])[['name']]
    .count()
    .reset_index()
    .set_index('year')
)
totals_ethnicseer = data.groupby(['year'])[['name']].count()
group_ethnicseer = group_ethnicseer.rename(columns={"ethnicseer": 'category', 'name': 'count'})

# Attach each row's yearly total and divide.
ethnicseer_year_total = group_ethnicseer.join(totals_ethnicseer[['name']])['name']
group_ethnicseer['percent'] = (group_ethnicseer['count'] / ethnicseer_year_total).round(3)
Concatenate grouped data
# Combine the three per-year summaries (gender, name-to-ethnicity, ethnicseer)
# into one long table indexed by year.
stats = pd.concat([gender, eth, group_ethnicseer]).sort_index()
# Pivot to a wide layout: rows become (metric, year) with metric in
# {count, percent}, columns become the categories. (year, category) pairs that
# do not occur are filled with 0 so the sums below never see NaN.
transpose = stats.reset_index().set_index(['year','category']).T.stack(0).sort_index(level=1).fillna(0)
# Aggregate groups, built by adding the component category columns (both the
# count rows and the percent rows are summed). Every category referenced here
# must exist as a column, otherwise the lookup raises KeyError.
transpose['white_ethnicseer'] = (transpose['eng']+transpose['ger']+
transpose['frn']+transpose['ita']+transpose['rus'])
transpose['asian_ethnicseer'] = (transpose['chi']+transpose['jap']+
transpose['kor']+transpose['ind']+transpose['vie'])
transpose['white_8_groups'] = (transpose['celtic']+transpose['european']+
transpose['nordic'])
transpose['asian_8_groups'] = (transpose['southAsian']+transpose['eastAsian'])
# Back to long form: one row per (category, year) with count and percent
# columns, ordered by year and renumbered from 0.
stats = transpose.T.stack(1).reset_index().sort_values('year').reset_index(drop=True)
stats = stats[['year','category','count','percent']]
# Bare expression: shows the frame as the notebook cell's output.
stats
# Persist the long table; the row index is written as the first CSV column.
stats.to_csv('./FSODiversity/out/masterstats_long.csv')
Preview of master statistics dataset:
year | category | count | percent |
---|---|---|---|
1965 | african | 55 | 0.089 |
1965 | known_female | 23 | 0.035 |
1965 | male | 627 | 0.821 |
1965 | white_8_groups | 519 | 0.839 |
1965 | celtic | 395 | 0.639 |
1965 | known | 651 | 0.852 |
1965 | mea | 8 | 0.01 |
1965 | asian_ethnicseer | 24 | 0.032 |
1965 | chi | 2 | 0.003 |
1965 | jap | 5 | 0.007 |
Visualization
# Reload the long statistics table (first CSV column is the saved row index).
# `year` is read as an integer so the plots get a numeric x-axis: with text
# years the axis is categorical (positions 0..n-1) and the numeric
# set_xticks(range(1964, 2024, 2)) calls below would not line up with the data.
stats = pd.read_csv('./FSODiversity/out/masterstats_long.csv', index_col=0, dtype={'year':'int'})
Gender
# visualize gender
# Line chart of the yearly female/male share among officers whose name-gender
# is known (categories known_female / known_male in `stats`).

# Map the stored category names to the legend text up front, so each line is
# labelled by its own data rather than by the order rows happen to appear in.
gender_labels = {'known_female': 'female', 'known_male': 'male'}
gender_plot_data = stats[stats['category'].isin(list(gender_labels))].copy()
# Numeric years are required for the numeric tick positions set below.
gender_plot_data['year'] = gender_plot_data['year'].astype(int)
gender_plot_data['category'] = gender_plot_data['category'].map(gender_labels)

plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.25)
plot = sns.lineplot(data=gender_plot_data,
                    x='year', y='percent', hue='category', marker='o',
                    hue_order=['female', 'male'],
                    palette={'female': '#C85858', 'male': '#007793'})
plot.set(title='1965-2022 Officer Name-Gender Classification, Excluding Unknowns', ylim=(0,1),
         ylabel='proportion')
# Labels come from the hue levels; only the title and position are overridden.
plt.legend(title='Gender', loc='upper right')
plot.set_xticks(range(1964,2024,2))
plt.xticks(rotation=45);
Race (name-to-ethnicity)
# name to ethnicity visualize
# Line chart of the yearly share of each name-to-ethnicity group (rows with
# classifier probability > 50), using the aggregated white/asian groups.

# Map stored category names to legend text up front, so each line is labelled
# by its own data rather than by the order rows happen to appear in. The dict
# order also fixes the hue (colour) order.
eth_labels = {'african': 'african', 'white_8_groups': 'white', 'muslim': 'muslim',
              'hispanic': 'hispanic', 'asian_8_groups': 'asian'}
eth_plot_data = stats[stats['category'].isin(list(eth_labels))].copy()
# Numeric years are required for the numeric tick positions set below.
eth_plot_data['year'] = eth_plot_data['year'].astype(int)
eth_plot_data['category'] = eth_plot_data['category'].map(eth_labels)

plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.25)
plot2 = sns.lineplot(data=eth_plot_data,
                     x='year', y='percent', hue='category', marker='o',
                     hue_order=list(eth_labels.values()))
plot2.set(title='1965-2022 Officer Name-Ethnicity Classification, name-to-ethnicity classifier, >50% Certainty', ylim=(0,1),
          ylabel='proportion')
# Labels come from the hue levels; only the title is overridden.
plt.legend(title='Ethnicity group')
plot2.set_xticks(range(1964,2024,2))
plt.xticks(rotation=45);
Race (ethnicseer)
# visualize ethnicseer
# Line chart of the yearly share of each ethnicseer group, using the
# aggregated white/asian groups plus the 'mea' and 'spa' categories.

# Map stored category names to legend text up front, so each line is labelled
# by its own data rather than by the order rows happen to appear in. The dict
# order also fixes the hue (colour) order.
ethnicseer_labels = {'mea': 'middle east', 'asian_ethnicseer': 'asian',
                     'white_ethnicseer': 'white', 'spa': 'spanish/hispanic'}
ethnicseer_plot_data = stats[stats['category'].isin(list(ethnicseer_labels))].copy()
# Numeric years are required for the numeric tick positions set below.
ethnicseer_plot_data['year'] = ethnicseer_plot_data['year'].astype(int)
ethnicseer_plot_data['category'] = ethnicseer_plot_data['category'].map(ethnicseer_labels)

plt.figure(figsize=(16,6))
sns.set(style='whitegrid', font_scale=1.25)
plot2 = sns.lineplot(data=ethnicseer_plot_data,
                     x='year', y='percent', hue='category', marker='o',
                     hue_order=list(ethnicseer_labels.values()))
plot2.set(title='1965-2022 Officer Name-Ethnicity Classification, ethnicseer classifier', ylim=(0,1),
          ylabel='proportion')
# Labels come from the hue levels; only the title is overridden.
plt.legend(title='Ethnicity group')
plot2.set_xticks(range(1964,2024,2))
plt.xticks(rotation=45);
White / nonwhite
# Reload the long statistics table with numeric years and derive the nonwhite
# share as the complement of the 'white_8_groups' share.
# NOTE(review): this path differs from './FSODiversity/out/masterstats_long.csv'
# used elsewhere in this file - confirm it points at the intended copy.
stats = pd.read_csv('./FOfficerSODiversity/Ellice/out/masterstats_long.csv', index_col=0, dtype={'year':'int'})
# Take an explicit copy: adding a column to a filtered slice raises
# SettingWithCopyWarning.
stats_nonwhite = stats[stats.category=='white_8_groups'].copy()
stats_nonwhite['nonwhite percent'] = 1 - stats_nonwhite.percent
/var/folders/js/7x9j4wbs6nj983t_k4_l9x5r0000gn/T/ipykernel_16229/3920309226.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
stats_nonwhite['nonwhite percent'] = 1 - stats_nonwhite.percent
# COMPARISON of white/nonwhite (name to ethnicity classifier)
# Two lines on one axes: the 'white_8_groups' share and its complement.
plt.figure(figsize=(16, 6))
sns.set(style='whitegrid', font_scale=1.25)

# Draw nonwhite first, then white, each with its own label and colour.
for _column, _label, _color in (
    ('nonwhite percent', 'nonwhite', '#007793'),
    ('percent', 'white', '#B47D54'),
):
    plot = sns.lineplot(
        data=stats_nonwhite,
        x='year',
        y=_column,
        marker='o',
        label=_label,
        color=_color,
    )

plot.set(
    title='1965-2022 % of white vs. nonwhite officers',
    ylim=(0, 1),
    xlim=(1964, 2023),
    ylabel='proportion',
)
plt.legend(title='race', loc='lower right')
# Tick every second year, labels rotated for legibility.
plot.set_xticks(range(1965, 2022, 2))
plt.xticks(rotation=45);