import pandas as pd
import nameparser
import re
import gender_guesser
import numpy as np
import math
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
from nameparser import HumanName
from nameparser.config import Constants
import pdftotext
import fitz
Parsing rank abbreviations
Using 2022 key
# Parse the abbreviations PDF into a two-column lookup table
# (abbrev -> full_form) and write it to key.csv.

# Concatenate the text of every page, then split it into individual lines.
with fitz.open('./fp21/data/inst/Abbreviations_Symbols.pdf') as doc:
    text = ''.join(page.get_text() for page in doc)
text = text.split('\n')

# The first 8 extracted lines are skipped
# (presumably the document header -- TODO confirm against the PDF).
key = pd.DataFrame({'text': text[8:]})

# Drop lines that contain a digit or one of the boilerplate markers.
# NOTE(review): 'keyiations' looks like a find-and-replace artifact of
# 'abbreviations'; confirm against the PDF before changing the pattern.
key = key[~key.text.str.contains(r'keyiations|UNCLASSIFIED|\d+', regex=True)].reset_index(drop=True)
# Drop empty lines and lines that start with whitespace.
key = key[~key.text.str.match(r'\s|^$')].reset_index(drop=True)

# A line ending in two or more capitals/spaces is treated as an abbreviation
# whose full form was wrapped onto the next line; glue the two together.
key['join'] = key.text.str.contains('[A-Z ]{2,}$', regex=True)
lines = key.text.tolist()
needs_join = key['join'].tolist()
last_position = len(lines) - 1
joinedlist = [
    line + ' ' + lines[position + 1] if flag and position != last_position else line
    for position, (line, flag) in enumerate(zip(lines, needs_join))
]
key['text_joined'] = joinedlist

# Keep only lines shaped like "<CAPS ABBREVIATION> <full form>".
key['text_clean'] = key.text_joined.str.contains(r"^[A-Z/ ]{2,}\s[A-Za-z()/,' ]*$", regex=True)
key = key[key['text_clean']][['text_joined']].reset_index(drop=True)

# Split each kept line into the abbreviation and the remaining text.
extracted = key['text_joined'].str.extract(r'([A-Z/]{2,})[\s\.:;](.*)', expand=True)
key['abbrev'] = extracted[0]
key['full_form'] = extracted[1]
key = key.dropna().reset_index(drop=True)
key = key[key.abbrev != 'US'].reset_index(drop=True)
key['abbrev'] = key['abbrev'].str.upper()
key.to_csv('key.csv')
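# MANUAL STEP: open key.csv in Excel and add the rank positions (presumably the FS_cone, department and usecretary columns that are read back from ./fp21/data/out/key.csv below).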
Key:
text_joined | abbrev | full_form | FS_cone | department | usecretary | usecretary_long | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | Unnamed: 10 | Unnamed: 11 | Unnamed: 12 | Unnamed: 13 | Unnamed: 14 | Unnamed: 15 | Unnamed: 16 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
nan | ACM | Assistant Chief of Mission | principal officer | state | M | Under Secretary for Management (M) | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
nan | ACS | American Citizens Services | consular | state | M | Under Secretary for Management (M) | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
ADB Asian Development Bank | ADB | Asian Development Bank | nonFSO | international | S | Secretary of State (S) | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
nan | A DM | Administrative Section | management | state | M | Under Secretary for Management (M) | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
nan | ADM | Administrative Section | management | state | M | Under Secretary for Management (M) | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
Match key and classified data
# Attach abbreviation-key attributes (full form, FS cone, department,
# under-secretary) to every classified record, looking up each of the first
# three tokens of the rank string separately.
filename = '1985_Jan'

key = pd.read_csv('./fp21/data/out/key.csv')
# Upper-case before de-duplicating so case variants of one abbreviation
# collapse to a single row; duplicate lookup keys would duplicate records in
# the joins below.
key['abbrev'] = key['abbrev'].str.upper()
# A missing abbreviation can never be a valid lookup key, and NaN keys match
# NaN rank tokens in a join, so drop them.
key = key.dropna(subset=['abbrev']).drop_duplicates('abbrev')
key = key.iloc[:, 1:7]  # keep the columns at positions 1-6

classified = pd.read_csv('./fp21/data/out/master_classified.csv')

# Drop placeholder rows: rank OR name contains VACANT / UNCLASSIFIED.
# The whole disjunction must be negated; negating only the rank test would
# keep rows whose name is a placeholder.
placeholder_pattern = r'VACANT|UNCLASSIFIED'
is_placeholder = (
    classified['rank'].str.contains(placeholder_pattern, regex=True, na=False)
    | classified['name'].str.contains(placeholder_pattern, regex=True, na=False)
)
classified = classified[~is_placeholder]
# delete rows where name is all caps (corrupted item)
classified = classified[~classified['name'].str.fullmatch(r'[A-Z\s]+', na=False)].copy()

# Split the rank on whitespace or '/' once and keep the first three tokens.
rank_tokens = classified['rank'].str.split(r'\s|/')
for position in range(3):
    classified['rank_separated' + str(position + 1)] = rank_tokens.str[position]

# Look up each token in the key. The first join adds the unsuffixed columns
# (nothing overlaps yet); the second and third add the same columns with
# suffix '2' / '3'.
lookup = key.set_index('abbrev')[['full_form', 'FS_cone', 'department', 'usecretary']]
classified_ranked = classified
for suffix in ('1', '2', '3'):
    token_column = 'rank_separated' + suffix
    classified_ranked = (
        classified_ranked.set_index(token_column)
        .join(lookup, rsuffix=suffix, sort=True)
        .reset_index()
        # The joined index can lose its name; restore the column name then.
        .rename(columns={'index': token_column})
    )
classified_ranked = classified_ranked.fillna('')
def zip_join_3(zipped_list):
    """Collapse each tuple of strings into one space-separated string.

    Within a tuple, empty values are skipped and repeated values are kept
    only once, in order of first appearance. Skipping empties avoids the
    leading, trailing and doubled spaces that plain concatenation produces
    when one of the values is ''.

    Args:
        zipped_list: iterable of tuples of strings, e.g. the result of
            ``zip(col_a, col_b, col_c)``.

    Returns:
        A list with one joined string per input tuple ('' when every value
        in the tuple is empty).
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return [
        ' '.join(dict.fromkeys(value for value in values if value))
        for values in zipped_list
    ]
# Merge the three per-token lookup results of each attribute into a single
# whitespace-trimmed column, then keep only the output columns and save them.
joined_column_sources = {
    'full_form_joined': ('full_form', 'full_form2', 'full_form3'),
    'dept_joined': ('department', 'department2', 'department3'),
    'usec_joined': ('usecretary', 'usecretary2', 'usecretary3'),
    'FS_cone_joined': ('FS_cone', 'FS_cone2', 'FS_cone3'),
}
for target_column, source_columns in joined_column_sources.items():
    per_token_values = zip(*(classified_ranked[source] for source in source_columns))
    classified_ranked[target_column] = zip_join_3(per_token_values)
    classified_ranked[target_column] = classified_ranked[target_column].str.strip()

output_columns = [
    'year', 'rank', 'name', 'first_clean', 'middle', 'last', 'firstlast',
    'gender_guess', 'eth_guess 8_nationality_groups', 'eth_prob 8_nationality_groups', 'ethnicseer',
    'rank_separated1', 'rank_separated2', 'rank_separated3',
    'full_form_joined', 'dept_joined', 'usec_joined', 'FS_cone_joined',
]
classified_ranked = classified_ranked[output_columns]
classified_ranked
classified_ranked.to_csv('./fp21/data/out/classified_rank.csv')
Preview of classified dataset with ranks:
year | rank | name | first_clean | middle | last | firstlast | gender_guess | eth_guess 8_nationality_groups | eth_prob 8_nationality_groups | ethnicseer | rank_separated1 | rank_separated2 | rank_separated3 | full_form_joined | dept_joined | usec_joined | FS_cone_joined |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1978 | PUB AFF ADV | Dean O. Claussen | Dean | O. | Claussen | Dean Claussen | male | african | 67.797 | eng | PUB | AFF | ADV | Public Affairs Adviser | nan | nan | public diplomacy |
1979 | PUB AFF ADV | Dean O. Claussen | Dean | O. | Claussen | Dean Claussen | male | african | 67.797 | eng | PUB | AFF | ADV | Public Affairs Adviser | nan | nan | public diplomacy |
1981 | PUB AFF ADV | Robert J. Wozniak | Robert | J. | Wozniak | Robert Wozniak | male | european | 85.945 | ger | PUB | AFF | ADV | Public Affairs Adviser | nan | nan | public diplomacy |
1982 | PUB AFF ADV | Robert J. Wozniak | Robert | J. | Wozniak | Robert Wozniak | male | european | 85.945 | ger | PUB | AFF | ADV | Public Affairs Adviser | nan | nan | public diplomacy |
1983 | PUB AFF ADV | Robert J. Wozniak | Robert | J. | Wozniak | Robert Wozniak | male | european | 85.945 | ger | PUB | AFF | ADV | Public Affairs Adviser | nan | nan | public diplomacy |
1985 | PUB AFF ADV | Victor Olason | Victor | nan | Olason | Victor Olason | male | nordic | 71.15 | spa | PUB | AFF | ADV | Public Affairs Adviser | nan | nan | public diplomacy |
1986 | PUB AFF ADV | Victor B. Olason | Victor | B. | Olason | Victor Olason | male | nordic | 71.15 | spa | PUB | AFF | ADV | Public Affairs Adviser | nan | nan | public diplomacy |
1987 | PUB AFF ADV | Victor B. Olason | Victor | B. | Olason | Victor Olason | male | nordic | 71.15 | spa | PUB | AFF | ADV | Public Affairs Adviser | nan | nan | public diplomacy |
1988 | PUB AFF ADV | Barry Fulton | Barry | nan | Fulton | Barry Fulton | male | celtic | 85.191 | eng | PUB | AFF | ADV | Public Affairs Adviser | nan | nan | public diplomacy |
1989 | PUB AFF ADV | Barry Fulton | Barry | nan | Fulton | Barry Fulton | male | celtic | 85.191 | eng | PUB | AFF | ADV | Public Affairs Adviser | nan | nan | public diplomacy |
Gender by job function
Compare gender ratio (male:female)
# Recode gender guesses and composite cone labels, then build a per-year,
# per-cone table of male/female counts, shares and the male:female ratio.
gender_recodes = {
    'mostly_male': 'male',
    'mostly_female': 'female',
}
for old_label, new_label in gender_recodes.items():
    classified_ranked.loc[classified_ranked['gender_guess'] == old_label, 'gender_guess'] = new_label

cone_recodes = {
    'consular leadership': 'consular',
    'consular management': 'consular',
    'economic leadership': 'economic',
    'principal officer leadership': 'principal officer',
    'principal officer management': 'principal officer',
    'security nonFSO': 'security',
    'political leadership': 'political',
    'leadership': 'principal officer',
}
for old_label, new_label in cone_recodes.items():
    classified_ranked.loc[classified_ranked['FS_cone_joined'] == old_label, 'FS_cone_joined'] = new_label


def _count_by_year_and_cone(row_mask):
    """Count the selected rows per (year, FS_cone_joined) group."""
    selected = classified_ranked.loc[row_mask, ['year', 'rank', 'FS_cone_joined']]
    return selected.groupby(['year', 'FS_cone_joined']).count()


gender_column = classified_ranked['gender_guess']
known = _count_by_year_and_cone(gender_column != 'unknown')
known.columns = ['count']
m = _count_by_year_and_cone(gender_column == 'male')
f = _count_by_year_and_cone(gender_column == 'female')
total = _count_by_year_and_cone(gender_column.isin(['female', 'male']))

share_male = (m / total).round(2)
share_female = (f / total).round(2)
male_to_female = (m / f).round(2)
ratio = pd.concat([total, share_male, share_female, male_to_female], axis=1)
ratio.columns = ['total', '%male', '%female', 'ratio']
ratio = ratio.reset_index()

# Keep only groups with more than 10 officers and a non-empty cone label.
has_enough_rows = ratio['total'] > 10
has_cone_label = ratio['FS_cone_joined'] != ''
ratio = ratio[has_enough_rows & has_cone_label]
ratio
# Line chart of the female share per year, one line per cone, with a dashed
# reference line at 0.5.
plt.figure(figsize=(16, 8))
sns.set(style='whitegrid', font_scale=1.25)
cones_hidden_in_lineplot = ['political/economic', 'nonFSO', 'security']
lineplot_rows = ratio[~ratio['FS_cone_joined'].isin(cones_hidden_in_lineplot)]
plot = sns.lineplot(
    data=lineplot_rows,
    x='year',
    y='%female',
    hue='FS_cone_joined',
    linewidth=2,
)
plot.set(
    title='1965-2022 % female officers, by foreign service cone',
    ylabel='proportion',
)
plot.axhline(0.5, ls='dashed', color='darkgray')
plt.legend(title='cone')
plt.xticks(rotation=45);
# Linear trend of the female share over the years, one regression line per
# cone, drawn without confidence bands.
sns.set(style='whitegrid', font_scale=1.25)
cones_hidden_in_trend = ['political/economic', 'nonFSO', 'security']
trend_rows = ratio[~ratio['FS_cone_joined'].isin(cones_hidden_in_trend)]
plot = sns.lmplot(
    data=trend_rows,
    x='year',
    y='%female',
    hue='FS_cone_joined',
    ci=None,
    truncate=False,
    height=7,
    aspect=1.8,
);
plot.set(
    title='1965-2022 % female officers, by foreign service cone',
    ylabel='proportion',
    ylim=(-.01, 0.65),
)
plt.xticks(rotation=45);
# One small line chart per cone (stacked as rows), all sharing the same
# y-axis limits.
cones_hidden_in_facets = ['political/economic', 'nonFSO']
facet_rows = ratio[~ratio['FS_cone_joined'].isin(cones_hidden_in_facets)]
g = sns.FacetGrid(data=facet_rows, row='FS_cone_joined', height=1.5, aspect=10)
g = g.map(sns.lineplot, 'year', '%female', ci=None)
g = g.add_legend()
g = g.set(ylim=(0, 0.65))
# Reshape to one row per year and one column per cone, then draw the female
# shares as a stacked bar chart.
stacked_cones = [
    'consular',
    'economic',
    'management',
    'political',
    'principal officer',
    'public diplomacy',
    'security',
]
female_share_by_cone = ratio.pivot(index='year', columns='FS_cone_joined', values='%female')
pivot = female_share_by_cone[stacked_cones]
pivot.plot.bar(
    stacked=True,
    figsize=(22, 10),
    ylabel='%',
    xlabel='year',
    title='% of female officers by FS cone',
);
plt.legend(title='Cone', bbox_to_anchor=(1.05, 1), loc='upper left');