Skip to content
import pandas as pd
import nameparser
import re
import gender_guesser
import numpy as np
import math
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
from nameparser import HumanName
from nameparser.config import Constants
import pdftotext
import fitz

Parsing rank abbreviations

Using the 2022 abbreviations key (Abbreviations_Symbols.pdf)

# Build the abbreviation key: pull every text line out of the abbreviations PDF,
# drop header/noise lines, re-attach entries that wrapped onto a second line,
# and split each surviving entry into an abbreviation and its full form.
with fitz.open('./fp21/data/inst/Abbreviations_Symbols.pdf') as doc:
    text = ''.join(page.get_text() for page in doc).split('\n')

# The first 8 extracted lines are skipped before any filtering.
key = pd.DataFrame({'text': text[8:]})

# Boolean masks are inverted with `~`; unary minus is not defined for booleans.
# NOTE(review): 'keyiations' looks like a find/replace artifact of
# 'Abbreviations' -- confirm the intended header pattern before changing it.
key = key[~key.text.str.contains(r'keyiations|UNCLASSIFIED|\d+', regex=True)].reset_index(drop=True)
key = key[~key.text.str.match(r'\s|^$')].reset_index(drop=True)

# A line ending in a run of capitals/spaces is treated as an entry whose
# remainder sits on the following line.
key['join'] = key.text.str.contains(r'[A-Z ]{2,}$', regex=True)

# Glue each flagged line to its successor (the last line has no successor and
# is left as-is). The successor row itself is kept; the `text_clean` filter
# below decides which rows survive.
next_text = key.text.shift(-1)
keep_as_is = ~key['join'] | next_text.isna()
key['text_joined'] = key.text.where(keep_as_is, key.text + ' ' + next_text)

# Keep only rows shaped like "<CAPS ABBREVIATION> <description>".
key['text_clean'] = key.text_joined.str.contains(r"^[A-Z/ ]{2,}\s[A-Za-z()/,' ]*$", regex=True)
key = key[key['text_clean']][['text_joined']].reset_index(drop=True)

# Group 0 is the abbreviation, group 1 everything after the first separator.
parts = key['text_joined'].str.extract(r'([A-Z/]{2,})[\s\.:;](.*)', expand=True)
key['abbrev'] = parts[0]
key['full_form'] = parts[1]

key = key.dropna().reset_index(drop=True)
key = key[key.abbrev != 'US'].reset_index(drop=True)

key['abbrev'] = key['abbrev'].str.upper()
key.to_csv('key.csv')
# OPEN IN EXCEL AND ADD RANK POSITIONS
# NOTE(review): this writes 'key.csv' to the working directory, while the
# matching step reads './fp21/data/out/key.csv' -- presumably the hand-edited
# file is saved there manually; confirm.

Preview of the key after the manual Excel annotation step:

text_joined abbrev full_form FS_cone department usecretary usecretary_long Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 Unnamed: 12 Unnamed: 13 Unnamed: 14 Unnamed: 15 Unnamed: 16
nan ACM Assistant Chief of Mission principal officer state M Under Secretary for Management (M) nan nan nan nan nan nan nan nan nan nan
nan ACS American Citizens Services consular state M Under Secretary for Management (M) nan nan nan nan nan nan nan nan nan nan
ADB Asian Development Bank ADB Asian Development Bank nonFSO international S Secretary of State (S) nan nan nan nan nan nan nan nan nan nan
nan A DM Administrative Section management state M Under Secretary for Management (M) nan nan nan nan nan nan nan nan nan nan
nan ADM Administrative Section management state M Under Secretary for Management (M) nan nan nan nan nan nan nan nan nan nan

Match key and classified data

# Attach key metadata (full form, cone, department, under secretary) to every
# classified record by looking up each of the first three tokens of its rank.
filename = '1985_Jan'
key = pd.read_csv('./fp21/data/out/key.csv')
# Normalise case BEFORE de-duplicating so that case variants of one
# abbreviation cannot both survive and multiply rows in the joins below.
key['abbrev'] = key['abbrev'].str.upper()
key = key.drop_duplicates('abbrev')
key = key.iloc[:, 1:7]

classified = pd.read_csv('./fp21/data/out/master_classified.csv')

# Drop placeholder rows: a row is a placeholder when EITHER its rank or its
# name says VACANT/UNCLASSIFIED. The whole disjunction is negated; negating
# only the rank test would keep rows whose name is a placeholder.
# na=False treats missing rank/name as "not a placeholder", so such rows are kept.
placeholder = r'VACANT|UNCLASSIFIED'
is_placeholder = (
    classified['rank'].str.contains(placeholder, regex=True, na=False)
    | classified['name'].str.contains(placeholder, regex=True, na=False)
)
classified = classified[~is_placeholder]
# delete rows where name is all caps (corrupted item); .copy() so the column
# assignments below write to an independent frame rather than a slice.
classified = classified[~classified['name'].str.fullmatch(r'[A-Z\s]+', na=False)].copy()

# Split the rank once on whitespace or '/', then expose the first three tokens.
rank_tokens = classified['rank'].str.split(r'\s|/')
for position in (1, 2, 3):
    classified[f'rank_separated{position}'] = rank_tokens.str[position - 1]

# Left-join the key once per token. The first join adds un-suffixed columns;
# the second and third collide with them and receive the suffixes '2' and '3'.
key_lookup = key.set_index('abbrev')[['full_form', 'FS_cone', 'department', 'usecretary']]
classified_ranked = classified
for suffix in ('1', '2', '3'):
    token_col = 'rank_separated' + suffix
    classified_ranked = (
        classified_ranked.set_index(token_col)
        .join(key_lookup, rsuffix=suffix, sort=True)
        .reset_index()
        .rename(columns={'index': token_col})
    )
# Unmatched lookups become empty strings so they can be concatenated later.
classified_ranked = classified_ranked.fillna('')

def zip_join_3(zipped_list):
    """Collapse each (i, j, k) string triple into one space-separated label.

    Repeated values are written once, in first-seen order, and empty strings
    are skipped so that a missing middle value cannot leave a double space
    (which would defeat later exact-match comparisons on the joined label).

    Args:
        zipped_list: iterable of 3-tuples of strings, e.g. ``zip(a, b, c)``.

    Returns:
        A list with one joined string per input triple; a triple with no
        non-empty value yields ``''``.
    """
    joined = []
    for i, j, k in zipped_list:
        parts = [i]
        if j != i:
            parts.append(j)
        if k != i and k != j:
            parts.append(k)
        joined.append(' '.join(part for part in parts if part != ''))
    return joined

# Merge the three per-token lookups of each key attribute into one label per
# record, trimming the padding left behind by empty lookups.
joined_columns = {
    'full_form_joined': 'full_form',
    'dept_joined': 'department',
    'usec_joined': 'usecretary',
    'FS_cone_joined': 'FS_cone',
}
for joined_name, base in joined_columns.items():
    triples = zip(classified_ranked[base],
                  classified_ranked[base + '2'],
                  classified_ranked[base + '3'])
    classified_ranked[joined_name] = zip_join_3(triples)
    classified_ranked[joined_name] = classified_ranked[joined_name].str.strip()

# Keep only the reporting columns. .copy() makes the result an independent
# frame, so later in-place recoding does not trigger SettingWithCopyWarning.
classified_ranked = classified_ranked[['year','rank', 'name', 'first_clean','middle', 'last', 'firstlast', 
                               'gender_guess','eth_guess 8_nationality_groups', 'eth_prob 8_nationality_groups','ethnicseer', 
                               'rank_separated1', 'rank_separated2','rank_separated3',
                               'full_form_joined', 'dept_joined', 'usec_joined','FS_cone_joined'
                              ]].copy()
classified_ranked
classified_ranked.to_csv('./fp21/data/out/classified_rank.csv')

Preview of classified dataset with ranks:

year rank name first_clean middle last firstlast gender_guess eth_guess 8_nationality_groups eth_prob 8_nationality_groups ethnicseer rank_separated1 rank_separated2 rank_separated3 full_form_joined dept_joined usec_joined FS_cone_joined
1978 PUB AFF ADV Dean O. Claussen Dean O. Claussen Dean Claussen male african 67.797 eng PUB AFF ADV Public Affairs Adviser nan nan public diplomacy
1979 PUB AFF ADV Dean O. Claussen Dean O. Claussen Dean Claussen male african 67.797 eng PUB AFF ADV Public Affairs Adviser nan nan public diplomacy
1981 PUB AFF ADV Robert J. Wozniak Robert J. Wozniak Robert Wozniak male european 85.945 ger PUB AFF ADV Public Affairs Adviser nan nan public diplomacy
1982 PUB AFF ADV Robert J. Wozniak Robert J. Wozniak Robert Wozniak male european 85.945 ger PUB AFF ADV Public Affairs Adviser nan nan public diplomacy
1983 PUB AFF ADV Robert J. Wozniak Robert J. Wozniak Robert Wozniak male european 85.945 ger PUB AFF ADV Public Affairs Adviser nan nan public diplomacy
1985 PUB AFF ADV Victor Olason Victor nan Olason Victor Olason male nordic 71.15 spa PUB AFF ADV Public Affairs Adviser nan nan public diplomacy
1986 PUB AFF ADV Victor B. Olason Victor B. Olason Victor Olason male nordic 71.15 spa PUB AFF ADV Public Affairs Adviser nan nan public diplomacy
1987 PUB AFF ADV Victor B. Olason Victor B. Olason Victor Olason male nordic 71.15 spa PUB AFF ADV Public Affairs Adviser nan nan public diplomacy
1988 PUB AFF ADV Barry Fulton Barry nan Fulton Barry Fulton male celtic 85.191 eng PUB AFF ADV Public Affairs Adviser nan nan public diplomacy
1989 PUB AFF ADV Barry Fulton Barry nan Fulton Barry Fulton male celtic 85.191 eng PUB AFF ADV Public Affairs Adviser nan nan public diplomacy

Gender by job function

Compare gender ratio (male:female)

# Per (year, cone): head-count of records guessed male/female, the share of
# each, and the male:female ratio.

# Collapse the "mostly_*" guesses into the two categories being compared.
gender_labels = {'mostly_male': 'male', 'mostly_female': 'female'}
# Fold compound cone labels into a single cone.
cone_labels = {
    'consular leadership': 'consular',
    'consular management': 'consular',
    'economic leadership': 'economic',
    'principal officer leadership': 'principal officer',
    'principal officer management': 'principal officer',
    'security nonFSO': 'security',
    'political leadership': 'political',
    'leadership': 'principal officer',
}
classified_ranked = classified_ranked.assign(
    gender_guess=classified_ranked['gender_guess'].replace(gender_labels),
    FS_cone_joined=classified_ranked['FS_cone_joined'].replace(cone_labels),
)

group_cols = ['year', 'FS_cone_joined']
count_cols = ['year', 'rank', 'FS_cone_joined']
gender = classified_ranked['gender_guess']

known = classified_ranked.loc[gender != 'unknown', count_cols].groupby(group_cols).count()
known.columns = ['count']

total = classified_ranked.loc[gender.isin(['male', 'female']), count_cols].groupby(group_cols).count()

# A (year, cone) group with no men (or no women) is absent from the per-gender
# counts; reindexing onto `total` records it as 0 so its share is 0 rather
# than NaN and the group stays visible in the plots.
m = (classified_ranked.loc[gender == 'male', count_cols]
     .groupby(group_cols).count()
     .reindex(total.index, fill_value=0))

f = (classified_ranked.loc[gender == 'female', count_cols]
     .groupby(group_cols).count()
     .reindex(total.index, fill_value=0))

ratio = pd.concat([total,
                   (m / total).round(2),
                   (f / total).round(2),
                   # male:female ratio is undefined (NaN) when there are no women
                   (m / f.where(f > 0)).round(2)], axis=1)
ratio.columns = ['total','%male','%female','ratio']
ratio = ratio.reset_index()
# Ignore thin groups and records whose cone could not be resolved.
ratio = ratio[(ratio['total']>10)&(ratio['FS_cone_joined']!='')]
ratio
# Line chart of the female share per year, one line per cone, with a dashed
# reference line at 0.5.
plt.figure(figsize=(16, 8))
sns.set(style='whitegrid', font_scale=1.25)

# Cones left out of this chart.
excluded_cones = ['political/economic', 'nonFSO', 'security']
line_data = ratio[~ratio['FS_cone_joined'].isin(excluded_cones)]

plot = sns.lineplot(
    data=line_data,
    x='year',
    y='%female',
    hue='FS_cone_joined',
    linewidth=2,
)
plot.set(
    title='1965-2022 % female officers, by foreign service cone',
    ylabel='proportion',
)
plot.axhline(0.5, ls='dashed', color='darkgray')
plt.legend(title='cone')
plt.xticks(rotation=45)

png

# Regression plot (one fitted line per cone, no confidence band) of the female
# share against year.
sns.set(style='whitegrid', font_scale=1.25)

trend_excluded = ['political/economic', 'nonFSO', 'security']
trend_data = ratio[~ratio['FS_cone_joined'].isin(trend_excluded)]

plot = sns.lmplot(
    data=trend_data,
    x='year',
    y='%female',
    hue='FS_cone_joined',
    ci=None,
    truncate=False,
    height=7,
    aspect=1.8,
)
plot.set(
    title='1965-2022 % female officers, by foreign service cone',
    ylabel='proportion',
    ylim=(-.01, 0.65),
)
plt.xticks(rotation=45)

png

# Small multiples: one wide, short panel per cone showing the female share by
# year on a shared 0-0.65 y-range.
facet_excluded = ['political/economic', 'nonFSO']
facet_data = ratio[~ratio['FS_cone_joined'].isin(facet_excluded)]

g = sns.FacetGrid(data=facet_data, row='FS_cone_joined', height=1.5, aspect=10)
g = g.map(sns.lineplot, 'year', '%female', ci=None)
g = g.add_legend()
g = g.set(ylim=(0, 0.65))

png

# Stacked bar chart: one bar per year, one segment per cone, segment height
# being that cone's female share.
cone_order = [
    'consular',
    'economic',
    'management',
    'political',
    'principal officer',
    'public diplomacy',
    'security',
]
# Wide table: rows are years, columns are cones in the fixed order above.
pivot = ratio.pivot(index='year', columns='FS_cone_joined', values='%female')
pivot = pivot[cone_order]
pivot.plot.bar(
    stacked=True,
    figsize=(22, 10),
    ylabel='%',
    xlabel='year',
    title='% of female officers by FS cone',
)
plt.legend(title='Cone', bbox_to_anchor=(1.05, 1), loc='upper left')

png