Skip to content

Classification Code

import pandas as pd
import nameparser
from nameparser import HumanName
import re
import gender_guesser
import numpy as np
import math
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
import gender_guesser.detector as gender
from ethnicseer import EthnicClassifier

Functions

# gender classify function
def guess_gender(df):
    """Add a ``gender_guess`` column to ``df`` and print the share of each class.

    Every value of ``df.first_clean`` is passed to ``Detector.get_gender`` from
    ``gender_guesser`` and the returned label is stored in a new
    ``gender_guess`` column. ``df`` is modified in place.

    The male, female and ``andy`` shares are printed as fractions of the rows
    whose guess is not ``'unknown'``; the unknown share is a fraction of all
    rows. ``mostly_male`` / ``mostly_female`` are counted with male / female.

    Args:
        df: DataFrame with a ``first_clean`` column of first names.

    Returns:
        The same ``df`` object, now carrying the ``gender_guess`` column.
    """
    d = gender.Detector()
    df['gender_guess'] = [d.get_gender(name) for name in df.first_clean]

    # Summarise the frame that was passed in. Reading a module-level frame
    # here would tie the function to whatever the caller's global is named.
    counts = df['gender_guess'].value_counts()
    total = df.shape[0]
    known = int((df['gender_guess'] != 'unknown').sum())

    def count_of(*labels):
        # Number of rows whose guess is any of ``labels``.
        return sum(int(counts.get(label, 0)) for label in labels)

    def share(count, denominator):
        # An empty frame, or one where every guess is 'unknown', has a zero
        # denominator; report 0.0 instead of raising ZeroDivisionError.
        return count / denominator if denominator else 0.0

    male = share(count_of('male', 'mostly_male'), known)
    female = share(count_of('female', 'mostly_female'), known)
    andy = share(count_of('andy'), known)
    unknown = share(count_of('unknown'), total)

    print('% male: ' + str(round(male,3)))
    print('% female: ' + str(round(female,3)))
    print('% andy: ' + str(round(andy,3)))
    print('% unkown: ' + str(round(unknown,3)))

    return df

import os

# Endpoint that classify_eth posts name lists to.
api_url = 'https://api.name-to-ethnicity.com/classify-names'

# Credentials come from the environment so that no secret lives in the source.
# Set NAME_TO_ETHNICITY_TOKEN (the API bearer token) and
# NAME_TO_ETHNICITY_EMAIL (the account e-mail) before running.
# NOTE: the token that used to be written out here was exposed in this file
# and should be revoked and re-issued.
headers = {"Authorization": "Bearer " + os.environ.get("NAME_TO_ETHNICITY_TOKEN", ""),
           "Email": os.environ.get("NAME_TO_ETHNICITY_EMAIL", "")}

# ethnicity classify function
def classify_eth(df, model):
    """Attach name-to-ethnicity guesses for ``df['firstlast']`` to ``df``.

    The distinct, non-null names are posted to ``api_url`` in chunks, the
    per-name results are joined back onto ``df`` by ``firstlast``, and the two
    result columns are named ``"eth_guess <model>"`` and ``"eth_prob <model>"``.
    Rows are then de-duplicated on ``(year, name)``, keeping the first, and the
    index is reset. Progress and the share of rows whose probability exceeds
    50 are printed.

    Args:
        df: DataFrame with ``firstlast``, ``year`` and ``name`` columns.
        model: Model name sent to the API as ``modelName``; also used to label
            the two new columns.

    Returns:
        A new DataFrame with the two guess columns added. When there are no
        names to classify, the columns are present and filled with NaN.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    guesslabel = "eth_guess "+str(model)
    problabel = "eth_prob "+str(model)

    # Each distinct name only needs classifying once. Sending duplicates would
    # let one name land in several chunks, appear several times in the result
    # index, and multiply rows in the join below.
    unique_names = df['firstlast'].dropna().unique()

    if len(unique_names) == 0:
        # Nothing to send: np.array_split rejects zero sections, so skip the
        # API entirely and just add the (empty) result columns.
        df = df.copy()
        df[guesslabel] = np.nan
        df[problabel] = np.nan
    else:
        # API limited to 1000 names; split name list into chunks
        # then request API one by one
        chunks = math.ceil(len(unique_names)/999)
        namechunks = np.array_split(unique_names, chunks)

        guesses = []
        for index, namechunk in enumerate(namechunks):
            print('chunk ' + str(index+1) + ' of ' + str(chunks))
            body = {"modelName": model,
                    "names": namechunk.tolist()}
            response = requests.post(api_url, data=body, headers=headers)
            # Fail loudly on an HTTP error instead of parsing an error payload
            # as if it were classification results.
            response.raise_for_status()
            # The payload is expected to map each name to its result values;
            # transposing puts the names on the index, results in columns 0, 1.
            guesses.append(pd.DataFrame(json.loads(response.text)).transpose())

        guesses = pd.concat(guesses, axis=0)
        # Defensive: a repeated index label would duplicate rows in the join.
        guesses = guesses[~guesses.index.duplicated(keep='first')]

        df = df.join(guesses, on='firstlast')
        df = df.rename(columns={0: guesslabel, 1: problabel})

    df = df.drop_duplicates(subset=['year','name']).reset_index(drop=True)

    # Guard the share against an empty frame (0 / 0).
    row_count = df.shape[0]
    prob50 = np.sum(df[problabel]>50)/row_count if row_count else 0.0
    print(model + ' probability over 50%: ' + str(round(prob50,3)))

    return df

Classify single year

# Classify one cleaned input file: add gender and ethnicity guesses to its
# names and write the result next to the other outputs.
filename = "1985_Jan"
year = "1985"

clean = pd.read_csv(f'./rank-name/clean_{filename}.csv')

# Gender guess first, then the API-based ethnicity guess on the same frame.
classified = classify_eth(guess_gender(clean), "8_nationality_groups")
# classified = classify_eth(classified, "10_nationalities_and_else")

# Second ethnicity guess, from ethnicseer's pretrained model.
ec = EthnicClassifier.load_pretrained_model()
classified['ethnicseer'] = ec.classify_names(classified['firstlast'])

display(classified)

classified.to_csv(f'./out/classified_{filename}.csv', index=False)

Classify in bulk

# Load the master cleaned dataset; the 'year' column is read as a string.
clean = pd.read_csv('./out/master_cleaned.csv', dtype={'year':'str'})
# Bare expression: presumably here so a notebook cell displays the frame.
clean

Preview of master cleaned dataset:

year mo rank name first_clean middle last suffix firstlast
1965 Feb AGRICULTURAL ATTACHE Marshal D. Fox Marshal D. Fox nan Marshal Fox
1965 Feb DEPUTY CHIEF OF MISSION C. Robert Moore Robert Robert Moore nan Robert Moore
1965 Feb ECONOMIC SECTION James H. Cheatham James H. Cheatham nan James Cheatham
1965 Feb CONSUL GENERAL W. Garland Richardson Garland Garland Richardson nan Garland Richardson
1965 Feb CONSULATE GENERAL Pamrmaribo Pamrmaribo nan nan nan Pamrmaribo
# Classify the whole master dataset: gender guess first, then the API-based
# ethnicity guess on the same frame.
classified = classify_eth(guess_gender(clean), "8_nationality_groups")
# classified = classify_eth(classified, "10_nationalities_and_else")

# Second ethnicity guess, from ethnicseer's pretrained model.
ec = EthnicClassifier.load_pretrained_model()
classified['ethnicseer'] = ec.classify_names(classified['firstlast'])

# display(classified)

classified.to_csv('./out/master_classified.csv', index=False)

Preview of master classified dataset:

year mo rank name first_clean middle last suffix firstlast gender_guess eth_guess 8_nationality_groups eth_prob 8_nationality_groups ethnicseer
1965 Feb AGRICULTURAL ATTACHE Marshal D. Fox Marshal D. Fox nan Marshal Fox unknown southAsian 32.525 eng
1965 Feb DEPUTY CHIEF OF MISSION C. Robert Moore Robert Robert Moore nan Robert Moore male celtic 77.223 eng
1965 Feb ECONOMIC SECTION James H. Cheatham James H. Cheatham nan James Cheatham male celtic 87.306 eng
1965 Feb CONSUL GENERAL W. Garland Richardson Garland Garland Richardson nan Garland Richardson male celtic 56.789 eng
1965 Feb CONSULATE GENERAL Pamrmaribo Pamrmaribo nan nan nan Pamrmaribo unknown african 64.201 ita