Data Extraction and Cleaning
Using various text analysis and name processing Python packages including pandas, regex, and PyMuPDF (fitz), we wrote a script that systematically scrapes and cleans the Key Officers listings, transforming and compiling the source text files and PDFs into a dataset of approximately 112,456 officer names and their positions.
The script's processing steps include removing address lines and embassy names; parsing officer positions and names into our desired format (‘RANK: First M. Last’); and using regular expressions to separate names from positions.
Code
import pandas as pd
import nameparser
import re
import gender_guesser
import numpy as np
import math
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
from nameparser import HumanName
from nameparser.config import Constants
import pdftotext
import fitz
Functions
import iso3166
from iso3166 import countries

# Build a regex alternation matching any ISO 3166 country name, in either
# Title Case ('France') or the original ALL-CAPS form ('FRANCE').
countries_title = pd.Series(iso3166.countries_by_name.keys()).str.title()
countries_caps = list(iso3166.countries_by_name.keys())
countries = list(countries_title) + countries_caps
# re.escape each name: several official names contain regex metacharacters
# (periods, commas, apostrophes) that would otherwise corrupt the pattern.
countries_regex = "(" + ")|(".join(re.escape(c) for c in countries) + ")"
# Import TXT
def import_txt(filename):
    """Read ./data/inst/<filename>.txt into a DataFrame of non-blank lines.

    Returns a frame with one row per non-empty line: 'text' is the raw
    line, 'text_squish' the whitespace-stripped copy, and the remaining
    columns are zero-initialised flags filled in by remove_addresses().
    """
    # 'with' guarantees the handle is closed (the original leaked it).
    with open("./data/inst/" + filename + ".txt", "r") as fh:
        lines = fh.read().split('\n')
    df = pd.DataFrame({'text': lines,
                       'text_squish': 0,
                       'address': 0,
                       'country': 0,
                       'embassy': 0,
                       'consulate': 0,
                       'consgen': 0,
                       'pagenum': 0,
                       'page': 0})
    df.text_squish = df.text.str.strip()
    df = df.dropna()
    # drop blank lines and renumber 0..n-1
    df = df[df.text_squish != ''].reset_index(drop=True)
    return df
# Import PDF
def import_pdf(filename):
    """Extract the text of ./data/inst/<filename>.pdf into a DataFrame of
    non-blank lines, with zero-initialised flag columns (plus 'url') for
    the later cleaning steps."""
    pdf_path = './data/inst/' + filename + '.pdf'
    with fitz.open(pdf_path) as doc:
        # concatenate every page's text, then cut it into lines
        raw_lines = ''.join(page.get_text() for page in doc).split('\n')
    flag_cols = ['text_squish', 'address', 'country', 'embassy',
                 'consulate', 'consgen', 'pagenum', 'page', 'url']
    frame = {'text': raw_lines}
    frame.update({col: 0 for col in flag_cols})
    df = pd.DataFrame(frame)
    df.text_squish = df.text.str.strip()
    df = df.dropna()
    df = df[df.text_squish != ''].reset_index(drop=True)
    return df
# Find where the officer listing starts and ends within the raw document
def data_filter(df):
    """Trim df to the country-listing section.

    The listing begins at the first country heading ('AFGHANISTAN' in most
    editions, 'ALGERIA' in the oldest) and ends at the first back-matter
    heading ('ABBREVIATIONS AND SYMBOLS' or 'GEOGRAPHICAL INDEX').
    Prints the chosen start/end indices as progress output.
    """
    def _first_match(label, default):
        # index of the first line exactly equal to `label`, else `default`
        hits = df.loc[df.text.str.fullmatch(label)].index
        return hits[0] if len(hits) > 0 else default

    # start: earliest country heading actually found. (The original
    # 'afgh if afgh!=0 and afgh<algeria else algeria' collapsed to 0
    # whenever ALGERIA was missing, even with AFGHANISTAN present.)
    found = [i for i in (_first_match('AFGHANISTAN', 0),
                         _first_match('ALGERIA', 0)) if i != 0]
    start = min(found) if found else 0
    print(start)

    # end: earliest back-matter heading; fall back to the frame's end
    abbrev = _first_match('ABBREVIATIONS AND SYMBOLS', df.shape[0])
    geog = _first_match('GEOGRAPHICAL INDEX', df.shape[0])
    end = min(abbrev, geog)
    end = df.shape[0] if end == 0 or end < start else end
    print(end)

    return df.iloc[start:end, :]
# Remove all addresses, embassies, etc.
def remove_addresses(df, countries_pattern=None):
    """Flag and drop non-name lines, returning candidate officer lines.

    Dummy-codes each unwanted category (addresses, country headings,
    embassy/consulate markers, page-number artifacts) into 0/1 columns,
    then keeps only lines flagged by none of them.

    Parameters
    ----------
    df : DataFrame with 'text' and 'text_squish' columns.
    countries_pattern : optional regex for country headings; defaults to
        the module-level `countries_regex` built from iso3166.

    Returns
    -------
    DataFrame with only the surviving 'text'/'text_squish' columns.
    """
    if countries_pattern is None:
        countries_pattern = countries_regex
    # raw strings so \d and \( are regex escapes, not Python string escapes
    df['address'] = df.text_squish.str.contains(
        r'B.P. Box|P.O. Box|Tel|Workweek|Telex|\d{5}|FAX', regex=True).astype(int)
    df['country'] = df.text_squish.str.contains(countries_pattern, regex=True).astype(int)
    df['embassy'] = df.text_squish.str.contains(r'\(E\)|EMBASSY', regex=True).astype(int)
    df['consulate'] = df.text_squish.str.contains(r'\(C\)', regex=True).astype(int)
    df['consgen'] = df.text_squish.str.contains(r'\(CG\)', regex=True).astype(int)
    # runs of '#' mark page-number placeholders; bare digit lines are pages
    df['pagenum'] = df.text_squish.str.contains(r'#{10,}', regex=True).astype(int)
    df['page'] = df.text_squish.str.fullmatch(r'\d*').astype(int)
    names = df.loc[(df.address == 0) & (df.country == 0) &
                   (df.embassy == 0) & (df.consulate == 0) &
                   (df.consgen == 0) & (df.pagenum == 0) &
                   (df.page == 0), ['text', 'text_squish']].reset_index(drop=True)
    return names
# If there is a rank label with ':' (or similar) at the end of the line and
# no following name, join it with the following line.
# Then delete rows without the format 'RANK: First M. Last'.
def find_format(df):
    """Merge dangling rank labels with the next line, then keep only rows
    shaped like 'RANK: First M. Last' (editions 1965-1990, 2000-2003).

    Returns a frame with 'text', 'text_squish' and the merged 'text_joined'.
    """
    # NOTE(review): '[\s:|;|.]' is a character class, so '|' is a literal
    # member (a trailing '|' also triggers a join); kept to preserve behavior.
    df['join'] = df.text_squish.str.contains(r'[A-Z ]+[\s:|;|.]$', regex=True)
    # hoist the list conversions: the original rebuilt list(df.text_squish)
    # inside the loop on every iteration (quadratic).
    texts = list(df.text_squish)
    flags = list(df['join'])
    last = len(texts) - 1
    joinedlist = []
    for i in range(len(texts)):
        if flags[i] and i != last:
            # dangling label: append the following line's text
            joinedlist.append(texts[i] + ' ' + texts[i + 1])
        else:
            joinedlist.append(texts[i])
    df['text_joined'] = joinedlist
    # delete rows without the format 'RANK: First M. Last'
    df['text_clean'] = df.text_joined.str.contains(r'^[A-Z ]{2,}[:;,.]\s[A-Za-z\. ]*', regex=True)
    df = df[df['text_clean']][['text', 'text_squish', 'text_joined']].reset_index(drop=True)
    return df
# Separate rank and name
def separate_name(df):
    """Split 'RANK: First M. Last' lines into 'rank' and 'name' columns.

    Extracts the first run of 2+ capitals/slashes/spaces terminated by
    ':' or ';' as the rank, and the remainder of the line as the name;
    rows with no match become NaN and are dropped.

    Fixes two defects in the original:
    - it returned the undefined name `clean` (i.e. whatever the notebook
      global `clean` held from a previous iteration) instead of `df`;
    - when a line held two RANK:name pairs, the doubled lists could not be
      assigned back to the frame (length mismatch -> ValueError).
    """
    parts = df['text_joined'].str.extract(r'([A-Z/ ]{2,})[:;](.*)', expand=True)
    df['rank'] = parts[0]
    df['name'] = parts[1]
    df = df.dropna().reset_index(drop=True)
    return df
# Identical to 'find_format', except using format 'RANK First M. Last'
def find_format2(df):
    """Merge dangling all-caps rank labels with the next line, then keep
    only rows shaped like 'RANK First M. Last' (editions 2004-2022).

    Returns a frame with 'text', 'text_squish' and the merged 'text_joined'.
    """
    df['join'] = df.text_squish.str.contains(r'[A-Z ]{2,}$', regex=True)
    # hoist the list conversions: the original rebuilt list(df.text_squish)
    # inside the loop on every iteration (quadratic).
    texts = list(df.text_squish)
    flags = list(df['join'])
    last = len(texts) - 1
    joinedlist = []
    for i in range(len(texts)):
        if flags[i] and i != last:
            # dangling label: append the following line's text
            joinedlist.append(texts[i] + ' ' + texts[i + 1])
        else:
            joinedlist.append(texts[i])
    df['text_joined'] = joinedlist
    # delete rows without the format 'RANK First M. Last'
    df['text_clean'] = df.text_joined.str.contains(r'^[A-Z ]{2,}\s[A-Za-z\. ]*$', regex=True)
    df = df[df['text_clean']][['text', 'text_squish', 'text_joined']].reset_index(drop=True)
    return df
# Identical to 'separate_name', except using format 'RANK First M. Last'
def separate_name2(df):
    """Split 'RANK First M. Last' lines into 'rank' and 'name' columns,
    dropping any row where no rank/name pair could be extracted."""
    extracted = df['text_joined'].str.extract('([A-Z/\s]{2,})[\s\.:;](.*)', expand=True)
    df['rank'], df['name'] = extracted[0], extracted[1]
    return df.dropna().reset_index(drop=True)
# Parse and split full name to first/last name
def split_firstlast(df):
    """Parse each full name into title/first/middle/last/suffix columns
    and build a cleaned 'firstlast' column.

    Cleaning rules:
    - if the first name is a bare initial ('J.') or the stray title 'Amb',
      fall back to the middle name;
    - if the cleaned first name still has two tokens ('M. First' or
      'First M.'), keep the longer token;
    - finally drop blank names, rank/name duplicates, and rows with an
      empty parsed first name.
    """
    # teach nameparser about patterns specific to this data set
    constants = Constants()
    constants.titles.add('amb', 'amb.')
    constants.suffix_not_acronyms.add('attache')

    # separate names into title, first, mid, last, suff
    parsed = [HumanName(name) for name in df.name]
    df['title'] = [p.title for p in parsed]
    df['first'] = [p.first for p in parsed]
    df['middle'] = [p.middle for p in parsed]
    df['last'] = [p.last for p in parsed]
    df['suffix'] = [p.suffix for p in parsed]

    # if first name is an initial (or 'Amb') -> use middle name.
    # df.at replaces the original chained assignment df['col'][i] = ...,
    # which raises SettingWithCopyWarning and is a silent no-op under
    # pandas copy-on-write.
    df['first_clean'] = ''
    for index in range(len(df['first'])):
        name = df['first'][index]
        if re.match(r'[A-Z]\.', name) is not None:
            df.at[index, 'first_clean'] = df['middle'][index]
        elif (name == 'Amb') | (name == 'Amb.'):
            df.at[index, 'first_clean'] = df['middle'][index]
        else:
            df.at[index, 'first_clean'] = name

    # 'M. First' -> 'First' and 'First M.' -> 'First':
    # keep the longer of the first two tokens (ties keep the second,
    # matching the original comparison exactly)
    for index in range(len(df['first'])):
        name = df['first_clean'][index]
        if name is not None:
            tokens = name.split()
            if len(tokens) > 1:
                if len(tokens[0]) > len(tokens[1]):
                    df.at[index, 'first_clean'] = tokens[0]
                else:
                    df.at[index, 'first_clean'] = tokens[1]

    # finally, merge first and last to new column firstlast
    df['firstlast'] = df['first_clean'] + ' ' + df['last']
    df = df.loc[(df['firstlast'] != ' ')]
    # and drop duplicates and missing full names
    df = df.drop_duplicates(['rank', 'name']).reset_index(drop=True)
    df = df.loc[(df['first'] != '')]
    return df
Cleaning
To preserve rank/name formatting:
- Format [RANK: First M. Last]: 1965-1990, 2000-2003
- Format [RANK First M. Last]: 2004-2022
- PDF files: 2007-2022
- possibly corrupted: 1991-1998 -> only grab line items with RANK: F M. L
Get list of all .pdf and .text files to parse:
from os import listdir
from os.path import isfile, join

# Collect every directory file under ./data/inst to parse
mypath = './data/inst'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# keep files whose name mentions 'txt' or 'pdf' (replaces the original
# Series * boolean hack, which mapped non-matches to '' and filtered them)
onlyfiles = [f for f in onlyfiles if 'txt' in f or 'pdf' in f]
print((onlyfiles),)
['1994_Spring.txt', '2007_Dec.pdf', '2010_Nov.pdf', '1997_Jun.txt', '2002_Jun.txt', '1965_Feb.txt', '1986_Jan.txt', '1974_Mar.txt', '1987_Jan.txt', '1975_Mar.txt', '2016_Feb.pdf', '1985_Jan.txt', '1966_Feb.txt', '2002_Jun.pdf', '2008_Aug.pdf', '2014_Dec.pdf', '2001_Aug.pdf', '2012_Nov.pdf', '1984_Jan.txt', '1993_Spring.txt', '2009_Dec.pdf', '1991_Jan.txt', '1972_Feb.txt', '1981_Jan.txt', '1982_Jan.pdf', '2021_Jan.pdf', '2022_Oct.pdf', '1988_Jan.txt', '2011_Dec.pdf', '1998_Jun.txt', '1989_Jan.txt', '1996_Mar.txt', '2004_Jun.txt', '2003_Mar.txt', '1990_Jan.txt', '2005_Nov.txt', '2013_Dec.pdf', '1979_Mar.txt', '1968_Feb.txt', '1992_Spring.txt', '1982_Jan.txt', '2004_Jun.pdf', '2005_Nov.pdf', '1983_Jan.txt', '1970_Feb.txt', '1978_Mar.txt', '2018_Apr.pdf', 'Abbreviations_Symbols.pdf', '1969_Feb.txt']
Clean file by file, based on the year and structure:
# Clean file by file, based on the year and structure:
#   <=2000    : txt, 'RANK: First M. Last'
#   2001-2002 : pdf, 'RANK: First M. Last'
#   2003      : txt, 'RANK: First M. Last'  (fell through every branch of
#               the original, so 2003_Mar.txt was silently never cleaned)
#   2004-2006 : txt, 'RANK First M. Last'
#   >=2007    : pdf, 'RANK First M. Last'
def _clean_one(filename, importer, find_fmt, sep_name):
    # shared pipeline body that the four per-era branches duplicated
    df = data_filter(importer(filename))
    names = find_fmt(remove_addresses(df))
    clean = split_firstlast(sep_name(names))
    display(clean)
    clean.to_csv('./data/rank-name/clean_' + filename + '.csv', index=False)

for file in onlyfiles:
    filename = re.split(r'\.txt|\.pdf', file)[0]
    year_str = filename.split('_')[0]
    if not year_str.isdigit():
        # e.g. 'Abbreviations_Symbols.pdf': int('Abbreviations') crashed
        # the original loop with a ValueError
        continue
    year = int(year_str)
    print(year)
    if year <= 2000 or year == 2003:  # txt file, format R: F M. L
        _clean_one(filename, import_txt, find_format, separate_name)
    elif year in (2001, 2002):        # pdf file, format R: F M. L
        _clean_one(filename, import_pdf, find_format, separate_name)
    elif 2004 <= year < 2007:         # txt file, format R F M. L
        _clean_one(filename, import_txt, find_format2, separate_name2)
    elif year >= 2007:                # pdf file, format R F M. L
        _clean_one(filename, import_pdf, find_format2, separate_name2)
Compile into master cleaned dataset
Get list of all cleaned .csv files:
from os import listdir
from os.path import isfile, join

# List all per-year cleaned csvs.
# NOTE: the original listed './fp21/data/rank-name' but then read the files
# from './data/rank-name' — and the cleaning step above wrote to
# './data/rank-name' — so the same directory is used for both here.
mypath = './data/rank-name'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# keep only the cleaned outputs (replaces the Series * boolean hack)
onlyfiles = [f for f in onlyfiles if 'clean' in f]
print((onlyfiles),)
['clean_1997_Jun.csv', 'clean_1986_Jan.csv', 'clean_1965_Feb.csv', 'clean_2002_Jun.csv', 'clean_2008_Aug.csv', 'clean_1974_Mar.csv', 'clean_2014_Dec.csv', 'clean_1992_Spring.csv', 'clean_2001_Aug.csv', 'clean_2012_Nov.csv', 'clean_1987_Jan.csv', 'clean_1975_Mar.csv', 'clean_2007_Dec.csv', 'clean_1966_Feb.csv', 'clean_1985_Jan.csv', 'clean_2010_Nov.csv', 'clean_1984_Jan.csv', 'clean_2016_Feb.csv', 'clean_2013_Dec.csv', 'clean_1988_Jan.csv', 'clean_1981_Jan.csv', 'clean_1972_Feb.csv', 'clean_1991_Jan.csv', 'clean_1996_Mar.csv', 'clean_1990_Jan.csv', 'clean_2004_Jun.csv', 'clean_2005_Nov.csv', 'clean_2018_Apr.csv', 'clean_1993_Spring.csv', 'clean_1998_Jun.csv', 'clean_1989_Jan.csv', 'clean_1994_Spring.csv', 'clean_2009_Dec.csv', 'clean_1982_Jan.csv', 'clean_2021_Jan.csv', 'clean_1979_Mar.csv', 'clean_2022_Oct.csv', 'clean_1968_Feb.csv', 'clean_2011_Dec.csv', 'clean_1978_Mar.csv', 'clean_1969_Feb.csv', 'clean_1970_Feb.csv', 'clean_1983_Jan.csv']
Compile all .csv files into a master cleaned dataframe:
# Compile all per-year csvs into a master cleaned dataframe,
# tagging each row with the year and month from its filename.
cols = ['year', 'mo', 'rank', 'name', 'first_clean', 'middle', 'last', 'suffix', 'firstlast']
frames = []
for file in onlyfiles:
    clean = pd.read_csv('./data/rank-name/' + file)
    # 'clean_<YYYY>_<Mon>.csv' -> year / month columns
    filename = file.split('.csv')[0].split('clean_')[1]
    clean['year'] = filename.split('_')[0]
    clean['mo'] = filename.split('_')[1]
    frames.append(clean[cols])
# single concat instead of the original's quadratic concat-per-file
master_cleaned = pd.concat(frames) if frames else pd.DataFrame(columns=cols)
master_cleaned = master_cleaned.sort_values('year')
master_cleaned.to_csv('./fp21/data/out/master_cleaned.csv', index=False)
master_cleaned
Preview of master cleaned dataset:
year | mo | rank | name | first_clean | middle | last | suffix | firstlast |
---|---|---|---|---|---|---|---|---|
1965 | Feb | AGRICULTURAL ATTACHE | Marshal D. Fox | Marshal | D. | Fox | nan | Marshal Fox |
1965 | Feb | DEPUTY CHIEF OF MISSION | C. Robert Moore | Robert | Robert | Moore | nan | Robert Moore |
1965 | Feb | ECONOMIC SECTION | James H. Cheatham | James | H. | Cheatham | nan | James Cheatham |
1965 | Feb | CONSUL GENERAL | W. Garland Richardson | Garland | Garland | Richardson | nan | Garland Richardson |
1965 | Feb | CONSULATE GENERAL | Pamrmaribo | Pamrmaribo | nan | nan | nan | Pamrmaribo |
1965 | Feb | AID MISSION DIRECTOR | William 0. Cooper | William | 0. | Cooper | nan | William Cooper |
1965 | Feb | AGRICULTURAL ATTACHE | James A. Hutchins, Jr. | James | A. | Hutchins | Jr. | James Hutchins |
1965 | Feb | ECONOMIC SECTION | James H. Ashida | James | H. | Ashida | nan | James Ashida |
1965 | Feb | AMBASSADOR | J. Graham Parsons | Graham | Graham | Parsons | nan | Graham Parsons |
1965 | Feb | DEPUTY CHIEF OF MISSION | Leo G. Cyr | Leo | G. | Cyr | nan | Leo Cyr |