langcog / wordbank

open repository of children's vocabulary data
http://wordbank.stanford.edu
GNU General Public License v2.0
64 stars 10 forks source link

Unilemma consistency updating #299

Open alvinwmtan opened 1 year ago

alvinwmtan commented 1 year ago

Starting a new thread for unilemma consistency updating. Rather than change these individually when they are found, we intend to update them regularly once enough have accumulated.

Current list:

New items (check whether the existing forms already contain these):

kachergis commented 1 week ago

Adding more unilemmas and odd categories discovered during SwadeshCDI work:

  # Step 1 of 2 -- tidy `category`.
  # case_when() keeps the first rule that matches, so the category-level merges
  # are listed ahead of the per-item overrides. Rows that match nothing keep
  # their existing category via `.default`.
  mutate(category = case_when(
    # Category-level merges (apply to every item carrying that label).
    category %in% c("descriptive_words (adjectives)",
                    "descriptive_words (adverbs)") ~ "descriptive_words",
    category %in% c("outside_places", "places") ~ "outside", # outside and places are combined
    category == "articles" ~ "quantifiers",
    category == "hold" ~ "household",
    category == "states" ~ "action_words",
    category == "prepositions" ~ "locations",
    category == "negation_words" ~ "games_routines", # Arabic: 'finished', 'don't want', 'not mine'...
    # "mental" is only reassigned for these three items; other "mental" rows are untouched.
    category == "mental" & uni_lemma %in% c("understand", "remember") ~ "action_words",
    category == "mental" & uni_lemma == "mad" ~ "descriptive_words",

    # Per-item rules. Each names a different uni_lemma, so their relative
    # order does not affect the result.
    # (a) items whose category is missing
    is.na(category) & uni_lemma == "although" ~ "connecting_words",
    is.na(category) & uni_lemma == "accident" ~ "other",
    is.na(category) &
      uni_lemma %in% c("expensive", "bored", "circle", "deep") ~ "descriptive_words",
    is.na(category) & uni_lemma == "material" ~ "household",
    is.na(category) & uni_lemma == "microscope" ~ "toys",
    # (b) items moved out of one specific category
    category == "toys" & uni_lemma == "album" ~ "household",
    category == "games_routines" & uni_lemma == "allowed" ~ "helping_verbs",
    # (c) items reassigned whatever their current category is
    uni_lemma %in% c("backpack", "fishtank") ~ "household",
    uni_lemma %in% c("only", "every") ~ "quantifiers",
    uni_lemma %in% c("mall", "gas") ~ "outside",
    uni_lemma %in% c("our", "I", "he") ~ "pronouns",
    uni_lemma %in% c("blouse", "bead") ~ "clothing",
    uni_lemma == "today" ~ "time_words",
    uni_lemma == "bounce" ~ "action_words",
    uni_lemma == "yet" ~ "descriptive_words",
    uni_lemma == "might" ~ "helping_verbs",
    .default = category
  )) |>
  # Step 2 of 2 -- tidy `uni_lemma`.
  # This runs after step 1 on purpose: the category rules above match on the
  # old uni_lemma spellings, and the `category` tests below see the values
  # written by step 1.
  mutate(uni_lemma = case_when(
    # Keyed on `definition` rather than `uni_lemma`, so it stays first to keep
    # its precedence over every rule below.
    definition == "haber (hay)" ~ "have",

    # Apparent typos
    uni_lemma == "Pencil" ~ "pencil", # Catalan typo
    uni_lemma == "aound" ~ "round", # Irish typo
    uni_lemma == "forger" ~ "forget",

    # Pronouns rewritten as person/number glosses
    uni_lemma == "I" ~ "1SG", # Japanese (boku, watashi)
    category == "pronouns" & uni_lemma == "self" ~ "1SG", # Estonian
    uni_lemma == "he" ~ "3SG", # Mandarin (Taiwanese)
    uni_lemma == "her" ~ "3SG.POSS", # Dutch
    uni_lemma == "our" ~ "1PL.POSS",

    # Renames that only apply within one category
    category == "vehicles" & uni_lemma == "self" ~ "car", # Spanish (Chilean)
    category == "body_parts" & uni_lemma == "back" ~ "back (body part)",
    category == "body_parts" & uni_lemma == "nail" ~ "fingernail",
    category == "outside" & uni_lemma == "baker" ~ "bakery",
    category == "toys" & uni_lemma == "bat" ~ "bat (object)",
    category == "animals" & uni_lemma == "chicken" ~ "chicken (animal)",

    # Unconditional renames
    uni_lemma == "allowed" ~ "allow",
    uni_lemma == "mop" ~ "mop (object)",
    uni_lemma == "fishtank" ~ "fish tank",
    uni_lemma == "aggrieved" ~ "upset",
    uni_lemma == "baby chair" ~ "high chair",
    uni_lemma == "bead" ~ "beads",
    uni_lemma == "chewing gum" ~ "gum",
    uni_lemma == "clothing" ~ "clothes",
    uni_lemma == "cock-a-doodle-doo" ~ "cockadoodledoo",
    uni_lemma == "child's name" ~ "child's own name",
    uni_lemma == "fireman" ~ "firefighter",
    uni_lemma == "fridge/freezer" ~ "fridge",
    uni_lemma == "fries" ~ "french fries",
    .default = uni_lemma
  ))