Data Extraction and Cleaning
Using various text analysis and name processing Python packages including pandas, regex, and PyMuPDF (fitz), we wrote a script that systematically scrapes and cleans the Key Officers listings, transforming and compiling the source text files and PDFs into a dataset of approximately 112,456 officer names and their positions.
The script's processing steps include removing address lines and embassy names; parsing officer positions and names into our desired format (‘RANK: First M. Last’); and using regular expressions to separate names from positions.
Code
import pandas as pd
import nameparser
import re
import gender_guesser
import numpy as np
import math
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
from nameparser import HumanName
from nameparser.config import Constants
import pdftotext
import fitz
Functions
import iso3166
from iso3166 import countries

# Build a regex alternation matching any ISO 3166 country name, in either
# Title Case ('France') or the original ALL-CAPS form ('FRANCE').
countries_title = pd.Series(iso3166.countries_by_name.keys()).str.title()
countries_caps = list(iso3166.countries_by_name.keys())
countries = list(countries_title) + countries_caps
# re.escape each name: several official names contain regex metacharacters
# (periods, commas, apostrophes) that would otherwise corrupt the pattern.
countries_regex = "(" + ")|(".join(re.escape(c) for c in countries) + ")"
# Import TXT
def import_txt(filename):
    """Read ./data/inst/<filename>.txt into a DataFrame of non-blank lines.

    Returns a frame with one row per non-empty line: 'text' is the raw
    line, 'text_squish' the whitespace-stripped copy, and the remaining
    columns are zero-initialised flags filled in by remove_addresses().
    """
    # 'with' guarantees the handle is closed (the original leaked it).
    with open("./data/inst/" + filename + ".txt", "r") as fh:
        lines = fh.read().split('\n')
    df = pd.DataFrame({'text': lines,
                       'text_squish': 0,
                       'address': 0,
                       'country': 0,
                       'embassy': 0,
                       'consulate': 0,
                       'consgen': 0,
                       'pagenum': 0,
                       'page': 0})
    df.text_squish = df.text.str.strip()
    df = df.dropna()
    # drop blank lines and renumber 0..n-1
    df = df[df.text_squish != ''].reset_index(drop=True)
    return df
# Import PDF
def import_pdf(filename):
    """Extract the text of ./data/inst/<filename>.pdf into a DataFrame of
    non-blank lines, with zero-initialised flag columns (plus 'url') for
    the later cleaning steps."""
    pdf_path = './data/inst/' + filename + '.pdf'
    with fitz.open(pdf_path) as doc:
        # concatenate every page's text, then cut it into lines
        raw_lines = ''.join(page.get_text() for page in doc).split('\n')
    flag_cols = ['text_squish', 'address', 'country', 'embassy',
                 'consulate', 'consgen', 'pagenum', 'page', 'url']
    frame = {'text': raw_lines}
    frame.update({col: 0 for col in flag_cols})
    df = pd.DataFrame(frame)
    df.text_squish = df.text.str.strip()
    df = df.dropna()
    df = df[df.text_squish != ''].reset_index(drop=True)
    return df
# Find where the officer listing starts and ends within the raw document
def data_filter(df):
    """Trim df to the country-listing section.

    The listing begins at the first country heading ('AFGHANISTAN' in most
    editions, 'ALGERIA' in the oldest) and ends at the first back-matter
    heading ('ABBREVIATIONS AND SYMBOLS' or 'GEOGRAPHICAL INDEX').
    Prints the chosen start/end indices as progress output.
    """
    def _first_match(label, default):
        # index of the first line exactly equal to `label`, else `default`
        hits = df.loc[df.text.str.fullmatch(label)].index
        return hits[0] if len(hits) > 0 else default

    # start: earliest country heading actually found. (The original
    # 'afgh if afgh!=0 and afgh<algeria else algeria' collapsed to 0
    # whenever ALGERIA was missing, even with AFGHANISTAN present.)
    found = [i for i in (_first_match('AFGHANISTAN', 0),
                         _first_match('ALGERIA', 0)) if i != 0]
    start = min(found) if found else 0
    print(start)

    # end: earliest back-matter heading; fall back to the frame's end
    abbrev = _first_match('ABBREVIATIONS AND SYMBOLS', df.shape[0])
    geog = _first_match('GEOGRAPHICAL INDEX', df.shape[0])
    end = min(abbrev, geog)
    end = df.shape[0] if end == 0 or end < start else end
    print(end)

    return df.iloc[start:end, :]
# Remove all addresses, embassies, etc.
def remove_addresses(df, countries_pattern=None):
    """Flag and drop non-name lines, returning candidate officer lines.

    Dummy-codes each unwanted category (addresses, country headings,
    embassy/consulate markers, page-number artifacts) into 0/1 columns,
    then keeps only lines flagged by none of them.

    Parameters
    ----------
    df : DataFrame with 'text' and 'text_squish' columns.
    countries_pattern : optional regex for country headings; defaults to
        the module-level `countries_regex` built from iso3166.

    Returns
    -------
    DataFrame with only the surviving 'text'/'text_squish' columns.
    """
    if countries_pattern is None:
        countries_pattern = countries_regex
    # raw strings so \d and \( are regex escapes, not Python string escapes
    df['address'] = df.text_squish.str.contains(
        r'B.P. Box|P.O. Box|Tel|Workweek|Telex|\d{5}|FAX', regex=True).astype(int)
    df['country'] = df.text_squish.str.contains(countries_pattern, regex=True).astype(int)
    df['embassy'] = df.text_squish.str.contains(r'\(E\)|EMBASSY', regex=True).astype(int)
    df['consulate'] = df.text_squish.str.contains(r'\(C\)', regex=True).astype(int)
    df['consgen'] = df.text_squish.str.contains(r'\(CG\)', regex=True).astype(int)
    # runs of '#' mark page-number placeholders; bare digit lines are pages
    df['pagenum'] = df.text_squish.str.contains(r'#{10,}', regex=True).astype(int)
    df['page'] = df.text_squish.str.fullmatch(r'\d*').astype(int)
    names = df.loc[(df.address == 0) & (df.country == 0) &
                   (df.embassy == 0) & (df.consulate == 0) &
                   (df.consgen == 0) & (df.pagenum == 0) &
                   (df.page == 0), ['text', 'text_squish']].reset_index(drop=True)
    return names
# If there is a rank label with ':' (or similar) at the end of the line and
# no following name, join it with the following line.
# Then delete rows without the format 'RANK: First M. Last'.
def find_format(df):
    """Merge dangling rank labels with the next line, then keep only rows
    shaped like 'RANK: First M. Last' (editions 1965-1990, 2000-2003).

    Returns a frame with 'text', 'text_squish' and the merged 'text_joined'.
    """
    # NOTE(review): '[\s:|;|.]' is a character class, so '|' is a literal
    # member (a trailing '|' also triggers a join); kept to preserve behavior.
    df['join'] = df.text_squish.str.contains(r'[A-Z ]+[\s:|;|.]$', regex=True)
    # hoist the list conversions: the original rebuilt list(df.text_squish)
    # inside the loop on every iteration (quadratic).
    texts = list(df.text_squish)
    flags = list(df['join'])
    last = len(texts) - 1
    joinedlist = []
    for i in range(len(texts)):
        if flags[i] and i != last:
            # dangling label: append the following line's text
            joinedlist.append(texts[i] + ' ' + texts[i + 1])
        else:
            joinedlist.append(texts[i])
    df['text_joined'] = joinedlist
    # delete rows without the format 'RANK: First M. Last'
    df['text_clean'] = df.text_joined.str.contains(r'^[A-Z ]{2,}[:;,.]\s[A-Za-z\. ]*', regex=True)
    df = df[df['text_clean']][['text', 'text_squish', 'text_joined']].reset_index(drop=True)
    return df
# Separate rank and name
def separate_name(df):
    """Split 'RANK: First M. Last' lines into 'rank' and 'name' columns.

    Extracts the first run of 2+ capitals/slashes/spaces terminated by
    ':' or ';' as the rank, and the remainder of the line as the name;
    rows with no match become NaN and are dropped.

    Fixes two defects in the original:
    - it returned the undefined name `clean` (i.e. whatever the notebook
      global `clean` held from a previous iteration) instead of `df`;
    - when a line held two RANK:name pairs, the doubled lists could not be
      assigned back to the frame (length mismatch -> ValueError).
    """
    parts = df['text_joined'].str.extract(r'([A-Z/ ]{2,})[:;](.*)', expand=True)
    df['rank'] = parts[0]
    df['name'] = parts[1]
    df = df.dropna().reset_index(drop=True)
    return df
# Identical to 'find_format', except using format 'RANK First M. Last'
def find_format2(df):
    """Merge dangling all-caps rank labels with the next line, then keep
    only rows shaped like 'RANK First M. Last' (editions 2004-2022).

    Returns a frame with 'text', 'text_squish' and the merged 'text_joined'.
    """
    df['join'] = df.text_squish.str.contains(r'[A-Z ]{2,}$', regex=True)
    # hoist the list conversions: the original rebuilt list(df.text_squish)
    # inside the loop on every iteration (quadratic).
    texts = list(df.text_squish)
    flags = list(df['join'])
    last = len(texts) - 1
    joinedlist = []
    for i in range(len(texts)):
        if flags[i] and i != last:
            # dangling label: append the following line's text
            joinedlist.append(texts[i] + ' ' + texts[i + 1])
        else:
            joinedlist.append(texts[i])
    df['text_joined'] = joinedlist
    # delete rows without the format 'RANK First M. Last'
    df['text_clean'] = df.text_joined.str.contains(r'^[A-Z ]{2,}\s[A-Za-z\. ]*$', regex=True)
    df = df[df['text_clean']][['text', 'text_squish', 'text_joined']].reset_index(drop=True)
    return df
# Identical to 'separate_name', except using format 'RANK First M. Last'
def separate_name2(df):
    """Split 'RANK First M. Last' lines into 'rank' and 'name' columns,
    dropping any row where no rank/name pair could be extracted."""
    extracted = df['text_joined'].str.extract('([A-Z/\s]{2,})[\s\.:;](.*)', expand=True)
    df['rank'], df['name'] = extracted[0], extracted[1]
    return df.dropna().reset_index(drop=True)
# Parse and split full name to first/last name
def split_firstlast(df):
    """Parse each full name into title/first/middle/last/suffix columns
    and build a cleaned 'firstlast' column.

    Cleaning rules:
    - if the first name is a bare initial ('J.') or the stray title 'Amb',
      fall back to the middle name;
    - if the cleaned first name still has two tokens ('M. First' or
      'First M.'), keep the longer token;
    - finally drop blank names, rank/name duplicates, and rows with an
      empty parsed first name.
    """
    # teach nameparser about patterns specific to this data set
    constants = Constants()
    constants.titles.add('amb', 'amb.')
    constants.suffix_not_acronyms.add('attache')

    # separate names into title, first, mid, last, suff
    parsed = [HumanName(name) for name in df.name]
    df['title'] = [p.title for p in parsed]
    df['first'] = [p.first for p in parsed]
    df['middle'] = [p.middle for p in parsed]
    df['last'] = [p.last for p in parsed]
    df['suffix'] = [p.suffix for p in parsed]

    # if first name is an initial (or 'Amb') -> use middle name.
    # df.at replaces the original chained assignment df['col'][i] = ...,
    # which raises SettingWithCopyWarning and is a silent no-op under
    # pandas copy-on-write.
    df['first_clean'] = ''
    for index in range(len(df['first'])):
        name = df['first'][index]
        if re.match(r'[A-Z]\.', name) is not None:
            df.at[index, 'first_clean'] = df['middle'][index]
        elif (name == 'Amb') | (name == 'Amb.'):
            df.at[index, 'first_clean'] = df['middle'][index]
        else:
            df.at[index, 'first_clean'] = name

    # 'M. First' -> 'First' and 'First M.' -> 'First':
    # keep the longer of the first two tokens (ties keep the second,
    # matching the original comparison exactly)
    for index in range(len(df['first'])):
        name = df['first_clean'][index]
        if name is not None:
            tokens = name.split()
            if len(tokens) > 1:
                if len(tokens[0]) > len(tokens[1]):
                    df.at[index, 'first_clean'] = tokens[0]
                else:
                    df.at[index, 'first_clean'] = tokens[1]

    # finally, merge first and last to new column firstlast
    df['firstlast'] = df['first_clean'] + ' ' + df['last']
    df = df.loc[(df['firstlast'] != ' ')]
    # and drop duplicates and missing full names
    df = df.drop_duplicates(['rank', 'name']).reset_index(drop=True)
    df = df.loc[(df['first'] != '')]
    return df
Cleaning
To preserve rank/name formatting:
- Format [RANK: First M. Last]: 1965-1990, 2000-2003
- Format [RANK First M. Last]: 2004-2022
- PDF files: 2007-2022
- possibly corrupted: 1991-1998 -> only grab line items with RANK: F M. L
Get list of all .pdf and .text files to parse:
from os import listdir
from os.path import isfile, join

# Collect every directory file under ./data/inst to parse
mypath = './data/inst'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# keep files whose name mentions 'txt' or 'pdf' (replaces the original
# Series * boolean hack, which mapped non-matches to '' and filtered them)
onlyfiles = [f for f in onlyfiles if 'txt' in f or 'pdf' in f]
print((onlyfiles),)
['1994_Spring.txt', '2007_Dec.pdf', '2010_Nov.pdf', '1997_Jun.txt', '2002_Jun.txt', '1965_Feb.txt', '1986_Jan.txt', '1974_Mar.txt', '1987_Jan.txt', '1975_Mar.txt', '2016_Feb.pdf', '1985_Jan.txt', '1966_Feb.txt', '2002_Jun.pdf', '2008_Aug.pdf', '2014_Dec.pdf', '2001_Aug.pdf', '2012_Nov.pdf', '1984_Jan.txt', '1993_Spring.txt', '2009_Dec.pdf', '1991_Jan.txt', '1972_Feb.txt', '1981_Jan.txt', '1982_Jan.pdf', '2021_Jan.pdf', '2022_Oct.pdf', '1988_Jan.txt', '2011_Dec.pdf', '1998_Jun.txt', '1989_Jan.txt', '1996_Mar.txt', '2004_Jun.txt', '2003_Mar.txt', '1990_Jan.txt', '2005_Nov.txt', '2013_Dec.pdf', '1979_Mar.txt', '1968_Feb.txt', '1992_Spring.txt', '1982_Jan.txt', '2004_Jun.pdf', '2005_Nov.pdf', '1983_Jan.txt', '1970_Feb.txt', '1978_Mar.txt', '2018_Apr.pdf', 'Abbreviations_Symbols.pdf', '1969_Feb.txt']
Clean file by file, based on the year and structure:
# Clean file by file, based on the year and structure:
#   <=2000    : txt, 'RANK: First M. Last'
#   2001-2002 : pdf, 'RANK: First M. Last'
#   2003      : txt, 'RANK: First M. Last'  (fell through every branch of
#               the original, so 2003_Mar.txt was silently never cleaned)
#   2004-2006 : txt, 'RANK First M. Last'
#   >=2007    : pdf, 'RANK First M. Last'
def _clean_one(filename, importer, find_fmt, sep_name):
    # shared pipeline body that the four per-era branches duplicated
    df = data_filter(importer(filename))
    names = find_fmt(remove_addresses(df))
    clean = split_firstlast(sep_name(names))
    display(clean)
    clean.to_csv('./data/rank-name/clean_' + filename + '.csv', index=False)

for file in onlyfiles:
    filename = re.split(r'\.txt|\.pdf', file)[0]
    year_str = filename.split('_')[0]
    if not year_str.isdigit():
        # e.g. 'Abbreviations_Symbols.pdf': int('Abbreviations') crashed
        # the original loop with a ValueError
        continue
    year = int(year_str)
    print(year)
    if year <= 2000 or year == 2003:  # txt file, format R: F M. L
        _clean_one(filename, import_txt, find_format, separate_name)
    elif year in (2001, 2002):        # pdf file, format R: F M. L
        _clean_one(filename, import_pdf, find_format, separate_name)
    elif 2004 <= year < 2007:         # txt file, format R F M. L
        _clean_one(filename, import_txt, find_format2, separate_name2)
    elif year >= 2007:                # pdf file, format R F M. L
        _clean_one(filename, import_pdf, find_format2, separate_name2)
Compile into master cleaned dataset
Get list of all cleaned .csv files:
from os import listdir
from os.path import isfile, join

# List all per-year cleaned csvs.
# NOTE: the original listed './fp21/data/rank-name' but then read the files
# from './data/rank-name' — and the cleaning step above wrote to
# './data/rank-name' — so the same directory is used for both here.
mypath = './data/rank-name'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# keep only the cleaned outputs (replaces the Series * boolean hack)
onlyfiles = [f for f in onlyfiles if 'clean' in f]
print((onlyfiles),)
['clean_1997_Jun.csv', 'clean_1986_Jan.csv', 'clean_1965_Feb.csv', 'clean_2002_Jun.csv', 'clean_2008_Aug.csv', 'clean_1974_Mar.csv', 'clean_2014_Dec.csv', 'clean_1992_Spring.csv', 'clean_2001_Aug.csv', 'clean_2012_Nov.csv', 'clean_1987_Jan.csv', 'clean_1975_Mar.csv', 'clean_2007_Dec.csv', 'clean_1966_Feb.csv', 'clean_1985_Jan.csv', 'clean_2010_Nov.csv', 'clean_1984_Jan.csv', 'clean_2016_Feb.csv', 'clean_2013_Dec.csv', 'clean_1988_Jan.csv', 'clean_1981_Jan.csv', 'clean_1972_Feb.csv', 'clean_1991_Jan.csv', 'clean_1996_Mar.csv', 'clean_1990_Jan.csv', 'clean_2004_Jun.csv', 'clean_2005_Nov.csv', 'clean_2018_Apr.csv', 'clean_1993_Spring.csv', 'clean_1998_Jun.csv', 'clean_1989_Jan.csv', 'clean_1994_Spring.csv', 'clean_2009_Dec.csv', 'clean_1982_Jan.csv', 'clean_2021_Jan.csv', 'clean_1979_Mar.csv', 'clean_2022_Oct.csv', 'clean_1968_Feb.csv', 'clean_2011_Dec.csv', 'clean_1978_Mar.csv', 'clean_1969_Feb.csv', 'clean_1970_Feb.csv', 'clean_1983_Jan.csv']
Compile all .csv files into a master cleaned dataframe:
# Compile all per-year csvs into a master cleaned dataframe,
# tagging each row with the year and month from its filename.
cols = ['year', 'mo', 'rank', 'name', 'first_clean', 'middle', 'last', 'suffix', 'firstlast']
frames = []
for file in onlyfiles:
    clean = pd.read_csv('./data/rank-name/' + file)
    # 'clean_<YYYY>_<Mon>.csv' -> year / month columns
    filename = file.split('.csv')[0].split('clean_')[1]
    clean['year'] = filename.split('_')[0]
    clean['mo'] = filename.split('_')[1]
    frames.append(clean[cols])
# single concat instead of the original's quadratic concat-per-file
master_cleaned = pd.concat(frames) if frames else pd.DataFrame(columns=cols)
master_cleaned = master_cleaned.sort_values('year')
master_cleaned.to_csv('./fp21/data/out/master_cleaned.csv', index=False)
master_cleaned
Preview of master cleaned dataset:
year | mo | rank | name | first_clean | middle | last | suffix | firstlast |
---|---|---|---|---|---|---|---|---|
1965 | Feb | AGRICULTURAL ATTACHE | Marshal D. Fox | Marshal | D. | Fox | nan | Marshal Fox |
1965 | Feb | DEPUTY CHIEF OF MISSION | C. Robert Moore | Robert | Robert | Moore | nan | Robert Moore |
1965 | Feb | ECONOMIC SECTION | James H. Cheatham | James | H. | Cheatham | nan | James Cheatham |
1965 | Feb | CONSUL GENERAL | W. Garland Richardson | Garland | Garland | Richardson | nan | Garland Richardson |
1965 | Feb | CONSULATE GENERAL | Pamrmaribo | Pamrmaribo | nan | nan | nan | Pamrmaribo |
1965 | Feb | AID MISSION DIRECTOR | William 0. Cooper | William | 0. | Cooper | nan | William Cooper |
1965 | Feb | AGRICULTURAL ATTACHE | James A. Hutchins, Jr. | James | A. | Hutchins | Jr. | James Hutchins |
1965 | Feb | ECONOMIC SECTION | James H. Ashida | James | H. | Ashida | nan | James Ashida |
1965 | Feb | AMBASSADOR | J. Graham Parsons | Graham | Graham | Parsons | nan | Graham Parsons |
1965 | Feb | DEPUTY CHIEF OF MISSION | Leo G. Cyr | Leo | G. | Cyr | nan | Leo Cyr |