Closed vlandeiro closed 9 years ago
Finding gender from the census:
def label_genders(tweets, males, females):
    """Label the author of each tweet as male, female, or unknown.

    The first whitespace-separated token of ``tw['user']['name']`` is
    lower-cased and looked up in ``males`` first, then in ``females``.

    Args:
        tweets: Iterable of tweet dicts exposing ``tw['user']['name']``.
        males: Container of lower-case male first names.
        females: Container of lower-case female first names.

    Returns:
        A NumPy string array with one entry per tweet: ``'m'`` when the
        first name is in ``males``, ``'f'`` when it is in ``females``,
        and ``'n'`` otherwise.
    """
    genders = []
    for tw in tweets:
        # A None or blank display name yields no tokens; fall back to the
        # empty string so the lookup below simply misses.
        tokens = (tw['user']['name'] or '').lower().split()
        first_name = tokens[0] if tokens else ''
        if first_name in males:
            genders.append('m')
        elif first_name in females:
            genders.append('f')
        else:
            genders.append('n')
    # dtype=str keeps the result a string array even when `tweets` is empty
    # (np.array([]) would otherwise default to float64).
    return np.array(genders, dtype=str)
def get_census_names():
    """Download the census first-name lists and drop ambiguous names.

    Each list is fetched over HTTP; the first whitespace-separated field
    of every non-blank line is taken as a name and lower-cased. Names that
    occur in both lists are removed from both.

    Returns:
        A ``(males, females)`` tuple of disjoint sets of lower-case names.

    Raises:
        requests.HTTPError: If either download returns an error status.
    """
    def fetch_names(url):
        # Return the set of lower-cased first-column names found at `url`.
        response = requests.get(url)
        # Fail loudly instead of parsing an HTTP error page as name data.
        response.raise_for_status()
        names = set()
        for line in response.text.split('\n'):
            fields = line.split()
            # Skip blank lines, including ones holding only '\r' or spaces.
            if fields:
                names.add(fields[0].lower())
        return names

    males = fetch_names('http://www.census.gov/genealogy/www/data/1990surnames/dist.male.first')
    females = fetch_names('http://www.census.gov/genealogy/www/data/1990surnames/dist.female.first')
    # Remove ambiguous names (those that appear on both lists)
    ambiguous = males & females
    return males - ambiguous, females - ambiguous
Slight changes to the functions that find a person's gender from the census: I now take into account the frequencies given by the census. If the difference between a name's male frequency and its female frequency is larger than an epsilon (in [0.1, 0.2]), the user's gender is set to the gender with the larger frequency.
def getCensusNames(self, eps=0.1):
    """Download the census first-name lists and resolve ambiguous names.

    For every non-blank line of each list, the first field is taken as the
    name (lower-cased) and the second field is parsed as a float score
    (the frequency reported by the census file). A name present in both
    lists is kept only on the side whose score exceeds the other by more
    than ``eps``; if neither side wins by that margin it is dropped from
    both.

    Args:
        eps: Minimum score gap required to assign an ambiguous name to a
            single gender. Defaults to 0.1.

    Returns:
        A ``(males, females)`` tuple of disjoint sets of lower-case names.

    Raises:
        requests.HTTPError: If either download returns an error status.
    """
    males_url = 'http://www.census.gov/genealogy/www/data/1990surnames/dist.male.first'
    females_url = 'http://www.census.gov/genealogy/www/data/1990surnames/dist.female.first'

    def fetch_scores(url):
        # Return a {lower-cased name: float score} mapping parsed from `url`.
        response = requests.get(url)
        # Fail loudly instead of parsing an HTTP error page as name data.
        response.raise_for_status()
        scores = {}
        for line in response.text.split('\n'):
            fields = line.split()
            # Skip blank lines and lines lacking a score column.
            if len(fields) >= 2:
                scores[fields[0].lower()] = float(fields[1])
        return scores

    male_scores = fetch_scores(males_url)
    female_scores = fetch_scores(females_url)
    males = set(male_scores)
    females = set(female_scores)

    # `males & females` builds a new set, so it is safe to shrink the two
    # operand sets while iterating over the intersection.
    for name in males & females:
        male_score = male_scores[name]
        female_score = female_scores[name]
        if male_score > female_score + eps:
            females.discard(name)
        elif male_score + eps < female_score:
            males.discard(name)
        else:
            # Too close to call: the name stays ambiguous and is dropped.
            males.discard(name)
            females.discard(name)
    return males, females
Find the most similar friends by matching on location (exact match first, then match on state), matching on gender, and then computing a cosine similarity over the statuses count, the followers count, and the followees count.
Features to consider when looking for the most similar person to a Twitter user:
Papers to look at: