Classification Code
import pandas as pd
import nameparser
from nameparser import HumanName
import re
import gender_guesser
import numpy as np
import math
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
import gender_guesser.detector as gender
from ethnicseer import EthnicClassifier
Functions
# gender classify function
def guess_gender(df):
    """Guess a gender for every row of ``df`` and print the class shares.

    Each value of ``df.first_clean`` is passed to gender_guesser's
    ``Detector.get_gender`` and the result is stored in a new
    ``gender_guess`` column. ``df`` is modified in place and also returned.

    The printed male / female / andy shares are fractions of the rows whose
    guess is not ``'unknown'``; the unknown share is a fraction of all rows.
    A share whose denominator is zero is reported as ``nan``.

    Args:
        df: DataFrame with a ``first_clean`` column of first names.

    Returns:
        The same DataFrame object, with ``gender_guess`` added.
    """
    detector = gender.Detector()
    df['gender_guess'] = [detector.get_gender(name) for name in df.first_clean]

    # Statistics are taken from the frame that was passed in, not from a
    # module-level variable, so the function works for any caller.
    guesses = df['gender_guess']
    total = len(guesses)
    known = int((guesses != 'unknown').sum())

    def share(count, denominator):
        # Avoid ZeroDivisionError on an empty frame or when nothing was recognised.
        return count / denominator if denominator else float('nan')

    male = share(int(guesses.isin(['male', 'mostly_male']).sum()), known)
    female = share(int(guesses.isin(['female', 'mostly_female']).sum()), known)
    andy = share(int((guesses == 'andy').sum()), known)
    unknown = share(total - known, total)

    print(f'% male: {round(male, 3)}')
    print(f'% female: {round(female, 3)}')
    print(f'% andy: {round(andy, 3)}')
    print(f'% unknown: {round(unknown, 3)}')
    return df
import os

# Endpoint and request headers for the name-to-ethnicity classification API.
api_url = 'https://api.name-to-ethnicity.com/classify-names'
# The bearer token is a credential, so it is read from the environment instead
# of being committed with the code. Set NAME_TO_ETHNICITY_TOKEN (and, if the
# account differs, NAME_TO_ETHNICITY_EMAIL) before running the classification.
# The token that used to be hardcoded here should be treated as leaked and revoked.
headers = {"Authorization": "Bearer " + os.environ.get("NAME_TO_ETHNICITY_TOKEN", ""),
           "Email": os.environ.get("NAME_TO_ETHNICITY_EMAIL", "ellice@fp21.org")}
# ethnicity classify function
def classify_eth(df, model):
    """Classify ``df['firstlast']`` with the name-to-ethnicity API.

    Every distinct, non-missing name is sent once. The response for each
    request is parsed as a JSON object keyed by name and its two values per
    name become the ``eth_guess <model>`` and ``eth_prob <model>`` columns,
    joined back onto ``df`` by ``firstlast``. The result is then
    de-duplicated on (``year``, ``name``) and re-indexed from 0, and the share
    of rows with a probability above 50 is printed.

    Args:
        df: DataFrame with ``firstlast``, ``year`` and ``name`` columns.
        model: Model name sent to the API as ``modelName``; also used as the
            suffix of the two result column labels.

    Returns:
        A new DataFrame with the two result columns added. The input frame is
        not modified.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    guesslabel = "eth_guess " + str(model)
    problabel = "eth_prob " + str(model)

    # Classify each distinct name once: fewer requests, and the lookup table
    # built below gets a unique index so the join cannot multiply rows.
    names = df['firstlast'].dropna().drop_duplicates().tolist()
    if not names:
        # Nothing to send (empty frame or no usable names): return the same
        # shape of result without calling the API.
        df = df.copy()
        df[guesslabel] = None
        df[problabel] = np.nan
        return df.drop_duplicates(subset=['year', 'name']).reset_index(drop=True)

    # API limited to 1000 names; split name list into chunks of at most 999
    # then request API one by one
    batch_size = 999
    namechunks = [names[start:start + batch_size]
                  for start in range(0, len(names), batch_size)]
    guesses = []
    for index, namechunk in enumerate(namechunks, start=1):
        print('chunk ' + str(index) + ' of ' + str(len(namechunks)))
        body = {"modelName": model,
                "names": namechunk}
        # NOTE(review): the body is sent form-encoded (data=), as before;
        # confirm against the API documentation whether json= is expected.
        response = requests.post(api_url, data=body, headers=headers)
        # Fail loudly on an auth/quota/server error instead of trying to
        # parse an error payload as classification results.
        response.raise_for_status()
        guesses.append(pd.DataFrame(json.loads(response.text)).transpose())
    guesses = pd.concat(guesses, axis=0)
    # Guard against the same key coming back from more than one request.
    guesses = guesses[~guesses.index.duplicated(keep='first')]

    df = df.join(guesses, on='firstlast')
    df = df.rename(columns={0: guesslabel, 1: problabel})
    df = df.drop_duplicates(subset=['year', 'name']).reset_index(drop=True)
    prob50 = np.sum(df[problabel] > 50) / df.shape[0]
    print(model + ' probability over 50%: ' + str(round(prob50, 3)))
    return df
Classify single year
# Single-file run: read one cleaned roster, add the gender guess, the
# name-to-ethnicity guess and the ethnicseer guess, show it, and save it.
filename = "1985_Jan"
year = "1985"  # not referenced in the visible code
clean = pd.read_csv(f'./rank-name/clean_{filename}.csv')
classified = guess_gender(clean)
classified = classify_eth(classified, "8_nationality_groups")
# classified = classify_eth(classified, "10_nationalities_and_else")
ec = EthnicClassifier.load_pretrained_model()
classified['ethnicseer'] = ec.classify_names(classified['firstlast'])
display(classified)
classified.to_csv(f'./out/classified_{filename}.csv', index=False)
Classify in bulk
# Load the master cleaned dataset; the 'year' column is read as strings.
clean = pd.read_csv('./out/master_cleaned.csv', dtype={'year':'str'})
# Bare expression: presumably the last line of a notebook cell, left to preview the frame.
clean
Preview of master cleaned dataset:
year | mo | rank | name | first_clean | middle | last | suffix | firstlast |
---|---|---|---|---|---|---|---|---|
1965 | Feb | AGRICULTURAL ATTACHE | Marshal D. Fox | Marshal | D. | Fox | nan | Marshal Fox |
1965 | Feb | DEPUTY CHIEF OF MISSION | C. Robert Moore | Robert | Robert | Moore | nan | Robert Moore |
1965 | Feb | ECONOMIC SECTION | James H. Cheatham | James | H. | Cheatham | nan | James Cheatham |
1965 | Feb | CONSUL GENERAL | W. Garland Richardson | Garland | Garland | Richardson | nan | Garland Richardson |
1965 | Feb | CONSULATE GENERAL | Pamrmaribo | Pamrmaribo | nan | nan | nan | Pamrmaribo |
# Bulk run over the master dataset loaded into `clean`: gender guess, then the
# name-to-ethnicity guess, then the ethnicseer guess, written out as one CSV.
classified = guess_gender(clean)
classified = classify_eth(classified, "8_nationality_groups")
# classified = classify_eth(classified, "10_nationalities_and_else")
ec = EthnicClassifier.load_pretrained_model()
ethnicseer_guesses = ec.classify_names(classified['firstlast'])
classified['ethnicseer'] = ethnicseer_guesses
# display(classified)
classified.to_csv('./out/master_classified.csv', index=False)
Preview of master classified dataset:
year | mo | rank | name | first_clean | middle | last | suffix | firstlast | gender_guess | eth_guess 8_nationality_groups | eth_prob 8_nationality_groups | ethnicseer |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1965 | Feb | AGRICULTURAL ATTACHE | Marshal D. Fox | Marshal | D. | Fox | nan | Marshal Fox | unknown | southAsian | 32.525 | eng |
1965 | Feb | DEPUTY CHIEF OF MISSION | C. Robert Moore | Robert | Robert | Moore | nan | Robert Moore | male | celtic | 77.223 | eng |
1965 | Feb | ECONOMIC SECTION | James H. Cheatham | James | H. | Cheatham | nan | James Cheatham | male | celtic | 87.306 | eng |
1965 | Feb | CONSUL GENERAL | W. Garland Richardson | Garland | Garland | Richardson | nan | Garland Richardson | male | celtic | 56.789 | eng |
1965 | Feb | CONSULATE GENERAL | Pamrmaribo | Pamrmaribo | nan | nan | nan | Pamrmaribo | unknown | african | 64.201 | ita |