Skip to content

Data Extraction and Cleaning

Using various text analysis and name processing Python packages including pandas, regex, and PyMuPDF (fitz), we wrote a script that systematically scrapes and cleans the Key Officers, transforming and compiling text files and PDFs into a dataset of approximately 112,456 officer names and their positions.

The script includes processes including removing address lines and embassy names; parsing out our desired format of officer positions and names (‘RANK: First M. Last’); and using regular expressions to separate names from positions.

Code

import pandas as pd
import nameparser
import re
import gender_guesser
import numpy as np
import math
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
from nameparser import HumanName
from nameparser.config import Constants
import pdftotext
import fitz

Functions

import iso3166
from iso3166 import countries
countries_title = pd.Series(iso3166.countries_by_name.keys()).str.title()
countries_caps = list(iso3166.countries_by_name.keys())
countries = list(countries_title)+countries_caps
countries_regex = "(" + ")|(".join(countries) + ")"
# countries_regex = map(re.compile, countries)
# Import TXT
def import_txt(filename):
    text = open("./data/inst/" + filename + ".txt", "r").read()
    text = text.split('\n')

    df = pd.DataFrame({'text':text,
                   'text_squish':0,
                   'address':0,
                   'country':0,
                   'embassy':0,
                   'consulate':0,
                   'consgen':0,
                   'pagenum':0,
                   'page':0})
    df.text_squish = df.text.str.strip()
    df = df.dropna()
    df = df[df.text_squish != ''].reset_index(drop=True)

    return df

# Import PDF
def import_pdf(filename):
    with fitz.open('./data/inst/' + filename + '.pdf') as doc:
        text = ''
        for page in doc:
            text += page.get_text()
        text = text.split('\n')

    df = pd.DataFrame({'text':text,
                   'text_squish':0,
                   'address':0,
                   'country':0,
                   'embassy':0,
                   'consulate':0,
                   'consgen':0,
                   'pagenum':0,
                   'page':0,
                    'url':0})
    df.text_squish = df.text.str.strip()
    df = df.dropna()
    df = df[df.text_squish != ''].reset_index(drop=True)

    return df

# Find where DF starts
def data_filter(df):
    # start
    algeria = df.loc[df.text.str.fullmatch('ALGERIA')].index
    algeria = algeria[0] if len(algeria)>0 else 0
    afgh = df.loc[df.text.str.fullmatch('AFGHANISTAN')].index
    afgh = afgh[0] if len(afgh)>0 else 0
    # some_expression if condition else other_expression
    start = afgh if afgh!=0 and afgh<algeria else algeria
    print(start)

    # end
    abbrev = df.loc[df.text.str.fullmatch('ABBREVIATIONS AND SYMBOLS')].index
    abbrev = abbrev[0] if len(abbrev)>0 else df.shape[0]
    geog = df.loc[df.text.str.fullmatch('GEOGRAPHICAL INDEX')].index
    geog = geog[0] if len(geog)>0 else df.shape[0]
    end = geog if geog < abbrev else abbrev
    end = df.shape[0] if end==0 or end<start else end
    print(end)

    df = df.iloc[start:end,:]

    return df

# Remove all addresses, embassies, etc.
def remove_addresses(df):
    # dummy code addresses, countries, embassies, consulates, page numbers
    df['address'] = df.text_squish.str.contains('B.P. Box|P.O. Box|Tel|Workweek|Telex|\d{5}|FAX', regex=True).astype(int)
    # df['country'] = df.text_squish.str.contains('^[A-Z]{4,}(?!:)', regex=True).astype(int)
    # df['country'] = [1 if i in countries or i in countries else 0 for i in df.text_squish]
    df['country'] = df.text_squish.str.contains(countries_regex, regex=True).astype(int)
    df['embassy'] = df.text_squish.str.contains('\(E\)|EMBASSY', regex=True).astype(int)
    df['consulate'] = df.text_squish.str.contains('\(C\)', regex=True).astype(int)
    df['consgen'] = df.text_squish.str.contains('\(CG\)', regex=True).astype(int)
    df['pagenum'] = df.text_squish.str.contains('#{10,}', regex=True).astype(int)
    df['page'] = df.text_squish.str.fullmatch('\d*').astype(int)

    names = df.loc[(df.address==0) & (df.country==0) &
               (df.embassy==0) & (df.consulate==0) &
               (df.consgen==0) & (df.pagenum==0) &
               (df.page==0),['text','text_squish']].reset_index(drop=True)
    return names

# If there is a rank label with (:) at the end of the line and no following name
# Join it with the following line
# Then, delete rows without the format 'RANK: First M. Last'
def find_format(df):
    df['join'] = df.text_squish.str.contains('[A-Z ]+[\s:|;|.]$', regex=True)

    joinedlist = []
    for i in range(0, len(df.text_squish)):
        if df['join'][i]==True and i!=len(df.text_squish)-1:
            a = list(df.text_squish)[i]
            b = list(df.text_squish)[i+1]
            joined = a + ' ' + b
            joinedlist += [joined]
        else:
            joinedlist += [list(df.text_squish)[i]]

    df['text_joined'] = joinedlist

    # delete rows without the format 'RANK: First M. Last'
    df['text_clean'] = df.text_joined.str.contains('^[A-Z ]{2,}[:;,.]\s[A-Za-z\. ]*', regex=True)
    df = df[df['text_clean']][['text','text_squish','text_joined']].reset_index(drop=True)

    return df

# Separate rank and name
# regex: `([A-Z\/]{2,})[\.:;]`
def separate_name(df):

    a = df['text_joined'].str.split('([A-Z/ ]{2,})[:;]', expand=True)

    if len(a.columns)>3:
        ranks = list(a[1]) + list(a[3])
        namelist = list(a[2]) + list(a[4])
    elif len(a.columns)==3:
        ranks = a[1]
        namelist = a[2]

    df['rank'] = ranks
    df['name'] = namelist

    df = df.dropna().reset_index(drop=True)

    return clean

# Identical to 'find_format', except using format 'RANK First M. Last'
def find_format2(df):
    df['join'] = df.text_squish.str.contains('[A-Z ]{2,}$', regex=True)

    joinedlist = []
    for i in range(0, len(df.text_squish)):
        if df['join'][i]==True and i!=len(df.text_squish)-1:
            a = list(df.text_squish)[i]
            b = list(df.text_squish)[i+1]
            joined = a + ' ' + b
            joinedlist += [joined]
        else:
            joinedlist += [list(df.text_squish)[i]]

    df['text_joined'] = joinedlist

    df['text_clean'] = df.text_joined.str.contains('^[A-Z ]{2,}\s[A-Za-z\. ]*$', regex=True)
    df = df[df['text_clean']][['text','text_squish','text_joined']].reset_index(drop=True)

    return df

# Identical to 'separate_name', except using format 'RANK First M. Last'
def separate_name2(df):
    a = df['text_joined'].str.extract('([A-Z/\s]{2,})[\s\.:;](.*)', expand=True)

    df['rank'] = a[0]
    df['name'] = a[1]

    df = df.dropna().reset_index(drop=True)

    return df

# Parse and split full name to first/last name
def split_firstlast(df):

    # add amb to titles
    constants = Constants()
    constants.titles.add('amb', 'amb.')
    constants.suffix_not_acronyms.add('attache')

    # separate names to title, first, mid, last, suff
    titles = []
    firsts = []
    mids = []
    lasts = []
    suffs = []
    for name in df.name:
        parse = HumanName(name)
        titles += [parse.title]
        firsts += [parse.first]
        mids += [parse.middle]
        lasts += [parse.last]
        suffs += [parse.suffix]
    df['title'] = titles
    df['first'] = firsts
    df['middle'] = mids
    df['last'] = lasts
    df['suffix'] = suffs

    # if first name is an initial -> use middle name
    df['first_clean'] = ''
    for index in range(0,len(df['first'])):
        name = df['first'][index]
        middle = df['middle'][index]
        if re.match('[A-Z]\.', name) is not None:
            df['first_clean'][index] = middle
        elif (name == 'Amb') | (name == 'Amb.'):
            df['first_clean'][index] = middle
        else:
            df['first_clean'][index] = name

    # M. First -> First
    # First M. -> First
    for index in range(0,len(df['first'])):
        name = df['first_clean'][index]
        if name is not None:
            split = name.split()
            if len(split)>1:
                if len(split[0]) > len(split[1]):
                    df['first_clean'][index] = split[0]
                else: 
                    df['first_clean'][index] = split[1]

    # finally, merge first and last to new column firstlast
    df['firstlast'] = df['first_clean'] + ' ' + df['last']
    df = df.loc[(df['firstlast']!=' ')]

    # and drop duplicates and missing full names
    df = df.drop_duplicates(['rank','name']).reset_index(drop=True)
    df = df.loc[(df['first']!='')]

    return df

Cleaning

To preserve rank/name formatting:
- Format [RANK: First M. Last]: 1965-1990, 2000-2003
- Format [RANK First M. Last]: 2004-2022
- PDF files: 2007-2022
- possibly corrupted: 1991-1998 -> only grab line items with RANK: F M. L

Get list of all .pdf and .text files to parse:

from os import listdir
from os.path import isfile, join
mypath = './data/inst'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
onlyfiles = pd.Series(onlyfiles)*pd.Series(onlyfiles).str.contains('txt|pdf')
onlyfiles = [i for i in onlyfiles if i!='']
print((onlyfiles),)
['1994_Spring.txt', '2007_Dec.pdf', '2010_Nov.pdf', '1997_Jun.txt', '2002_Jun.txt', '1965_Feb.txt', '1986_Jan.txt', '1974_Mar.txt', '1987_Jan.txt', '1975_Mar.txt', '2016_Feb.pdf', '1985_Jan.txt', '1966_Feb.txt', '2002_Jun.pdf', '2008_Aug.pdf', '2014_Dec.pdf', '2001_Aug.pdf', '2012_Nov.pdf', '1984_Jan.txt', '1993_Spring.txt', '2009_Dec.pdf', '1991_Jan.txt', '1972_Feb.txt', '1981_Jan.txt', '1982_Jan.pdf', '2021_Jan.pdf', '2022_Oct.pdf', '1988_Jan.txt', '2011_Dec.pdf', '1998_Jun.txt', '1989_Jan.txt', '1996_Mar.txt', '2004_Jun.txt', '2003_Mar.txt', '1990_Jan.txt', '2005_Nov.txt', '2013_Dec.pdf', '1979_Mar.txt', '1968_Feb.txt', '1992_Spring.txt', '1982_Jan.txt', '2004_Jun.pdf', '2005_Nov.pdf', '1983_Jan.txt', '1970_Feb.txt', '1978_Mar.txt', '2018_Apr.pdf', 'Abbreviations_Symbols.pdf', '1969_Feb.txt']

Clean file by file, based on the year and structure:

for file in onlyfiles:
    filename = re.split(r'.txt|.pdf', file)[0]
    year = filename.split('_')[0]
    print(year)

    if int(year)<2001: # txt file, format R: F M. L
        df = import_txt(filename)
        df = data_filter(df)
        names = remove_addresses(df)
        names = find_format(names)
        clean = separate_name(names)
        clean = split_firstlast(clean)
        display(clean)
        clean.to_csv('./data/rank-name/clean_' + filename + '.csv', index=False)

    elif int(year)==2001 or int(year)==2002: # pdf file, format R: F M. L
        df = import_pdf(filename)
        df = data_filter(df)
        names = remove_addresses(df)
        names = find_format(names)
        clean = separate_name(names)
        clean = split_firstlast(clean)
        display(clean)
        clean.to_csv('./data/rank-name/clean_' + filename + '.csv', index=False)

    elif int(year)>=2004 and int(year)<2007: # txt file, format R F M. L
        df = import_txt(filename)
        df = data_filter(df)
        names = remove_addresses(df)
        names = find_format2(names)
        clean = separate_name2(names)
        clean = split_firstlast(clean)
        display(clean)
        clean.to_csv('./data/rank-name/clean_' + filename + '.csv', index=False)

    elif int(year)>=2007: # pdf file, format R F M. L
        df = import_pdf(filename)
        df = data_filter(df)
        names = remove_addresses(df)
        names = find_format2(names)
        clean = separate_name2(names)
        clean = split_firstlast(clean)
        display(clean)
        clean.to_csv('./data/rank-name/clean_' + filename + '.csv', index=False)

Compile into master cleaned dataset

Get list of all cleaned .csv files:

from os import listdir
from os.path import isfile, join
mypath = './fp21/data/rank-name'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
onlyfiles = pd.Series(onlyfiles)*pd.Series(onlyfiles).str.contains('clean')
onlyfiles = [i for i in onlyfiles if i!='']
print((onlyfiles),)
['clean_1997_Jun.csv', 'clean_1986_Jan.csv', 'clean_1965_Feb.csv', 'clean_2002_Jun.csv', 'clean_2008_Aug.csv', 'clean_1974_Mar.csv', 'clean_2014_Dec.csv', 'clean_1992_Spring.csv', 'clean_2001_Aug.csv', 'clean_2012_Nov.csv', 'clean_1987_Jan.csv', 'clean_1975_Mar.csv', 'clean_2007_Dec.csv', 'clean_1966_Feb.csv', 'clean_1985_Jan.csv', 'clean_2010_Nov.csv', 'clean_1984_Jan.csv', 'clean_2016_Feb.csv', 'clean_2013_Dec.csv', 'clean_1988_Jan.csv', 'clean_1981_Jan.csv', 'clean_1972_Feb.csv', 'clean_1991_Jan.csv', 'clean_1996_Mar.csv', 'clean_1990_Jan.csv', 'clean_2004_Jun.csv', 'clean_2005_Nov.csv', 'clean_2018_Apr.csv', 'clean_1993_Spring.csv', 'clean_1998_Jun.csv', 'clean_1989_Jan.csv', 'clean_1994_Spring.csv', 'clean_2009_Dec.csv', 'clean_1982_Jan.csv', 'clean_2021_Jan.csv', 'clean_1979_Mar.csv', 'clean_2022_Oct.csv', 'clean_1968_Feb.csv', 'clean_2011_Dec.csv', 'clean_1978_Mar.csv', 'clean_1969_Feb.csv', 'clean_1970_Feb.csv', 'clean_1983_Jan.csv']

Compile all .csv files into a master cleaned dataframe:

master_cleaned = pd.DataFrame(columns=['year','mo','rank','name','first_clean','middle','last','suffix','firstlast'])

for file in onlyfiles:
    clean = pd.read_csv('./data/rank-name/'+file)
    filename = file.split('.csv')[0].split('clean_')[1]
    year = filename.split('_')[0]
    mo = filename.split('_')[1]

    clean['year'] = year
    clean['mo'] = mo
    clean = clean[['year','mo','rank','name','first_clean','middle','last','suffix','firstlast']]

    master_cleaned = pd.concat([master_cleaned, clean])
master_cleaned = master_cleaned.sort_values('year')
master_cleaned.to_csv('./fp21/data/out/master_cleaned.csv', index=False)
master_cleaned

Preview of master cleaned dataset:

year mo rank name first_clean middle last suffix firstlast
1965 Feb AGRICULTURAL ATTACHE Marshal D. Fox Marshal D. Fox nan Marshal Fox
1965 Feb DEPUTY CHIEF OF MISSION C. Robert Moore Robert Robert Moore nan Robert Moore
1965 Feb ECONOMIC SECTION James H. Cheatham James H. Cheatham nan James Cheatham
1965 Feb CONSUL GENERAL W. Garland Richardson Garland Garland Richardson nan Garland Richardson
1965 Feb CONSULATE GENERAL Pamrmaribo Pamrmaribo nan nan nan Pamrmaribo
1965 Feb AID MISSION DIRECTOR William 0. Cooper William 0. Cooper nan William Cooper
1965 Feb AGRICULTURAL ATTACHE James A. Hutchins, Jr. James A. Hutchins Jr. James Hutchins
1965 Feb ECONOMIC SECTION James H. Ashida James H. Ashida nan James Ashida
1965 Feb AMBASSADOR J. Graham Parsons Graham Graham Parsons nan Graham Parsons
1965 Feb DEPUTY CHIEF OF MISSION Leo G. Cyr Leo G. Cyr nan Leo Cyr