Open mdingemanse opened 1 year ago
@bvreede Here's some code (won't run without access to the data) that we can walk though together for some ideas of useful quality checks. These are notes I have used in the selection and curation of corpora in the ACL and LREC papers.
# turns with identical times ----------------------------------------------
# Turns with identical times should be exceedingly rare.
# take a long source as an example
duplicate_annotations <- d %>% group_by(source,begin,end) %>%
filter(n()>1) %>% ungroup() %>%
select(begin,end,duration,participant,utterance,source,language,uid,priorby)
duplications <- d %>% group_by(source,begin,end) %>%
mutate(is_dup = ifelse(n() >1,1,0)) %>% ungroup() %>%
select(begin,end,duration,participant,utterance,source,language,uid,is_dup)
duplications %>%
group_by(language) %>%
summarise(turns = n_distinct(uid),
dup = sum(is_dup),
prop = dup/turns) %>%
arrange(desc(prop))
convplot(sample(duplicate_annotations[duplicate_annotations$language != "nahuatl",]$uid,10),highlight=T,content=T)
ggsave('qc-duplicate-annotations.png',width=20,height=22,bg="white")
# total time of all included sources --------------------------------------
d %>% group_by(language,source) %>%
summarize(start=min.na(begin),finish=max.na(end),
totaltime = finish - start,
hours = (totaltime/1000) / 3600)
#sort by length and inspect: are the shortest ones worth keeping?
d %>% group_by(language,source) %>%
summarize(start=min.na(begin),finish=max.na(end),
turns=n_distinct(uid),
totaltime = finish - start,
minutes = (totaltime/1000 / 60),
hours = (totaltime/1000) / 3600) %>%
arrange(hours)
# sorting from longest shows no extreme times
d %>% group_by(language,source) %>%
summarize(start=min.na(begin),finish=max.na(end),
turns=n_distinct(uid),
totaltime = finish - start,
minutes = (totaltime/1000 / 60),
hours = (totaltime/1000) / 3600) %>%
arrange(desc(hours))
# Akhoe -------------------------------------------------------------------
# The Akhoe corpus is so small that a long annotation occurring 4 times is
# pretty striking. Is this a copy paste error?
term <- "ga gestures that a mad (blind?) person has come"
d[d$utterance %in% term,]$source
uids <- d[d$utterance %in% term,]$uid
convplot(uids,focus=T,window=6000)
# Arabic ------------------------------------------------------------------
# Compare three callhome corpora
arabic1 <- read_corpus("arabic1") %>% mutate(language="arabic1")
arabic2 <- read_corpus("arabic2") %>% mutate(language="arabic2")
arabic3 <- read_corpus("arabic3") %>% mutate(language="arabic3")
loadedcorpora <- c("arabic1","arabic2","arabic3")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T)
rm(data)
# run FTO coding lines from processing.R
d %>%
ggplot(aes(FTO)) +
theme_tufte() +
ggtitle('Floor transfer onset') +
geom_density(alpha=0.1,na.rm=T) +
xlim(-20000,20000) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position="bottom")
ggsave('qc-arabic-3corpora_FTO.png',bg="white")
d %>%
ggplot(aes(FTO,duration)) +
theme_tufte() +
ggtitle('Floor transfer onset by turn duration') +
geom_point(alpha=0.1,na.rm=T) +
ylim(0,20000) +
xlim(-20000,20000) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position="bottom")
ggsave('qc-arabic-3corpora_FTOxdur.png',bg="white")
# Chatino -----------------------------------------------------------------
# Chatino is mostly massively monologic: only 463 speaker transitions out of
# 2154 turns in total, and not necessarily because the annotations are
# splitter-type
d %>% filter(language == "Chatino") %>% drop_na(FTO)
# very few speaker transitions overall — I'm beginning to think this corpus may not be usable
d %>% filter(language == "Chatino") %>%
group_by(source,priorby) %>%
summarize(n=n())
d %>% filter(language == "Chatino") %>%
group_by(FTO) %>% summarise(n=n()) %>% arrange(desc(n))
convplot(lang="Chatino",focus=T,window=120000)
# One source especially is mostly monologue: "/zacetepec_chatino1/zac-2012_07_11-trans_mgh_mbh_amp.ea"
# Datooga -----------------------------------------------------------------
# Datooga unfortunately has only single tier transcriptions
d %>% filter(language == "Datooga") %>% View()
convplot(lang="Datooga",n=2,window=8000000)
# Dutch -------------------------------------------------------------------
tokenize_corpus <- function(langs=NULL) {
d.tokens <- list()
for (lang in langs) {
print(paste('Tokenization: ',lang))
d.temp <- d %>% filter(language == lang) %>%
ungroup() %>%
drop_na(utterance) %>% drop_na(utterance_stripped) %>%
unnest_tokens(word, utterance_stripped) %>%
count(word, sort=T, name="n") %>%
mutate(language = lang,
rank = row_number(desc(n)),
total = sum(n),
freq = n/total,
freq_log = log(freq))
d.tokens[[lang]] <- d.temp
}
d.tokens <- reduce(d.tokens, rbind)
return(d.tokens)
}
# Two corpora for Dutch
dutch1 <- read_corpus("dutch1") %>% mutate(language="dutch1")
dutch2 <- read_corpus("dutch2") %>% mutate(language="dutch2")
loadedcorpora <- c("dutch1","dutch2")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T)
rm(data)
d$translation <- NA
test <- d %>%
mutate(duration_abs = abs(duration)) %>%
filter(duration_abs > 40000)
d <- d %>% filter(uid %notin% test$uid)
d.tokens <- tokenize_corpus(c("dutch1","dutch2"))
inspect_language("dutch1",saveplot=T)
inspect_language("dutch2",saveplot=T)
convplot(lang="dutch1",window=10000,dyads=T,content=T)
ggsave('qc-panel-dutch1-convplot.png',bg="white",width=14,height=10)
# what to do with speaker = UNKNOWN ?
uids <- d[d$participant == "UNKNOWN",]$uid
test <- convplot(sample(uids,6),content=T,datamode=T)
convplot(sample(uids,6),content=T)
ggsave('qc-panel-dutch1-convplot.png',bg="white",width=14,height=10)
examples <- c("dutch-1486-387-568685", "dutch-2133-32-71095", "dutch-218-309-489890",
"dutch-2561-70-147440", "dutch-2739-305-496438", "dutch-909-74-95161")
test <- convplot(examples,datamode=T)
ggsave('qc-dutch1-convplot-unknown.png',bg="white", width=14,height=10)
test %>% select(uid,source,participant,participant_int) %>% View()
# English -----------------------------------------------------------------
# For English we have the following corpora:
english1 <- read_corpus("english1") %>% mutate(language="english1")
english2 <- read_corpus("english2") %>% mutate(language="english2")
english3 <- read_corpus("english3") %>% mutate(language="english3")
english4 <- read_corpus("english4") %>% mutate(language="english4")
english5 <- read_corpus("english5") %>% mutate(language="english5")
english6 <- read_corpus("english6") %>% mutate(language="english6")
english7 <- read_corpus("english7") %>% mutate(language="english7")
# combine all
loadedcorpora <- c("english1","english2","english3","english4","english5","english6","english7")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T)
rm(data)
d <- english7
# verdict: English 1 looks very weird (no <0 timings); English6 too (all times
# 0). English 2, 3, and 7 have a (slight) problem of a sizable set of turns that
# begin at the exact same instant (very unlikely and also seen for Spanish and
# Japanese corpora that have been excluded for same reason).
# this leaves English 2 and English 4 as the most reasonable-looking ones. We
# might want to combine them.
d %>%
ggplot(aes(FTO,duration)) +
theme_tufte() +
ggtitle('Floor transfer onset by turn duration') +
geom_point(alpha=0.1,na.rm=T) +
ylim(0,20000) +
xlim(-20000,20000) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position="bottom")
#ggsave('qc-english-7corpora.png',bg="white")
# looking into english1
test <- d %>% filter(source %in% sources[1:100])
test %>%
ggplot(aes(FTO,duration)) +
theme_tufte() +
ggtitle('Floor transfer onset by turn duration',
subtitle="CABNC, first 100 sources (18370 transitions, 7 with negative FTO)") +
geom_point(alpha=0.1,na.rm=T) +
ylim(0,20000) +
xlim(-10000,10000) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position="bottom")
ggsave('out/qc-english1-cabnc-FTO-by-duration.png',bg="white",width=6,height=4)
test %>%
ungroup() %>%
mutate(FTO_status = ifelse(FTO <0,"in overlap","not in overlap")) %>%
drop_na(FTO) %>% group_by(FTO_status) %>%
summarise(n=n())
test <- test %>% mutate(naiveFTO = begin-lag(end))
summary(test$naiveFTO)
# looking into english7
d %>%
ggplot(aes(FTO,duration)) +
theme_tufte() +
ggtitle('Floor transfer onset by turn duration',
subtitle="Santa Barbara Corpus of Spoken English (english7)") +
geom_point(alpha=0.1,na.rm=T) +
ylim(0,20000) +
xlim(-10000,10000) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position="bottom")
ggsave('out/qc-english7-FTO-by-duration.png',bg="white",width=6,height=4)
d %>%
ggplot(aes(FTO)) +
theme_tufte() +
ggtitle('Floor transfer onset by turn duration',
subtitle="Santa Barbara Corpus of Spoken English (english7)") +
geom_density() +
#ylim(0,20000) +
xlim(-5000,5000) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position="bottom")
ggsave('out/qc-english7-FTO-density.png',bg="white",width=6,height=4)
FTO_0 <- d[which(d$FTO==0),]$uid
FTO_length <- d[which(d$FTO==d$duration),]$uid
these_uids <- sample(FTO_0,10)
these_uids <- sample(FTO_length,10)
convplot(these_uids,content=T,highlight=T)
# French ------------------------------------------------------------------
french1 <- read_corpus("french1") %>% mutate(language="french1")
french2 <- read_corpus("french2") %>% mutate(language="french2")
french3 <- read_corpus("french3") %>% mutate(language="french3")
# combine all
loadedcorpora <- c("french1","french2","french3")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T)
rm(data)
d <- d %>% mutate(translation = NA)
d %>%
ggplot(aes(FTO,duration)) +
theme_tufte() +
ggtitle('Floor transfer onset by turn duration') +
geom_point(alpha=0.1,na.rm=T) +
ylim(0,20000) +
xlim(-20000,20000) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position="bottom")
ggsave('qc-french-3corpora.png',bg="white")
inspect_language("french1",saveplot = T)
inspect_language("french2",saveplot = T)
inspect_language("french3",saveplot = T)
# German ------------------------------------------------------------------
german1 <- read_corpus("german1") %>% mutate(language="german1")
german2 <- read_corpus("german2") %>% mutate(language="german2")
# combine all
loadedcorpora <- c("german1","german2")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T)
rm(data)
d <- d %>% mutate(translation = NA)
inspect_language("german1",saveplot = T)
inspect_language("german2",saveplot = T)
convplot(lang="german1")
ggsave('qc-conv-german1.png',bg="white")
convplot(lang="german2")
ggsave('qc-conv-german2.png',bg="white")
# Hausa -------------------------------------------------------------------
# Hausa prior to post-processing has 'splitter-type' segmentation, and also has
# an unlikely proportion of FTOs at exactly 0 (586 out of 1585)
d %>% filter(language == "hausa") %>%
group_by(FTO) %>% summarise(n=n()) %>% arrange(desc(n))
convplot(lang="hausa",window=20000,focus=T)
# Mixtec ------------------------------------------------------------------
mixtec <- d %>% filter(langshort=="Mixtec")
inspect_corpus("yoloxochitl_mixtec")
mixtec %>%
filter(priorby=="other")
# oof, only 212 out of 50k turns have a prior turn by other instead of self
# Polish ------------------------------------------------------------------
# polish has some outliers in duration
test <- d %>% filter(language=="polish") %>%
mutate(duration_abs = abs(duration)) %>%
filter(duration_abs > 40000)
convplot(test$uid)
test$source
# three of these are in MW_006, let's have a closer look
uids <- sample(d[d$source == "/polish/MW_006.eaf",]$uid,10)
convplot(uids,window=20000)
# Siputhi -----------------------------------------------------------------
# Siputhi is a very small corpus (430 turns, of which only 328 are speaker
# transitions). Timing appears to be weirdly spiky mainly because of this
d %>% filter(language == "Siputhi") %>%
group_by(FTO) %>% summarise(n=n()) %>% arrange(desc(n)) %>% View()
convplot(lang="Siputhi",window=20000,focus=T)
# Totoli ------------------------------------------------------------------
# Totoli has a mean annotation length of only 798, and many self-transitions,
# indicating that turns are segmented with a splitter's temperament
convplot(lang="Totoli")
theseuids <- c("totoli-1-139-150117", "totoli-1-209-259023", "totoli-1-66-76901")
convplot(theseuids)
# Zaar --------------------------------------------------------------------
# Zaar is relatively small and segmented in splitter fashion, with relatively
# few speaker transitions (520 other vs 1234 self). This is especially striking
# in BC_CONV_03.eaf. There are also relatively many FTOs of 0 (201 out of 520)
# indicating sth about transcription convention or software.
d %>% filter(language == "Zaar") %>%
group_by(source,priorby) %>% summarise(n=n())
d %>% filter(language == "Zaar") %>%
group_by(FTO) %>% summarise(n=n()) %>% arrange(desc(n)) %>% View()
convplot(lang="Zaar",window=20000,focus=T)
# Japanese and German selection -------------------------------------------
japanese1 <- read_corpus("japanese1") %>% mutate(language = "japanese1")
japanese2 <- read_corpus("japanese2") %>% mutate(language = "japanese2")
japanese3 <- read_corpus("japanese3") %>% mutate(language = "japanese3")
spanish1 <- read_corpus("spanish1") %>% mutate(language = "spanish1")
spanish2 <- read_corpus("spanish2") %>% mutate(language = "spanish2")
spanish3 <- read_corpus("spanish3") %>% mutate(language = "spanish3")
# combine all
loadedcorpora <- c("japanese1","japanese2","japanese3","spanish1","spanish2","spanish3")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T)
rm(data)
# Mandarin ----------------------------------------------------------------
# mandarin comparing 3 corpora
mandarin1 <- read_corpus("mandarin1") %>% mutate(language = "mandarin1")
mandarin2 <- read_corpus("mandarin2") %>% mutate(language = "mandarin2")
# mandarin check
mandarindata <- list(mandarin1,mandarin2)
d <- rbindlist(mandarindata,fill=T)
inspect_language("mandarin1",saveplot=T)
inspect_language("mandarin2",saveplot=T)
# Tseltal timing investigation --------------------------------------------
# The two largest Tseltal sources unfortunately have a disproportionate amount
# of transitions set at exactly zero, an artefact of cutting up annotations
# during segmentation. This means we cannot trust these to represent true
# time-aligned annotations, and we cannot use them to precision-target segments
# of audio.
d %>% filter(language == "Tseltal") %>%
ggplot(aes(FTO)) +
theme_tufte() +
ggtitle(paste0('Floor transfer onset for all turns by source')) +
geom_density(na.rm=T,size=1) +
xlim(-2000,2000) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ source,strip.position ="bottom")
d %>%
filter(language %in% c("Tseltal")) %>% group_by(source) %>%
summarise(n=n(),
medianFTO=median.na(FTO),
meanFTO=mean.na(FTO))
d %>%
filter(language %in% c("Tseltal")) %>% group_by(FTO) %>%
summarise(n=n()) %>% arrange(desc(n))
# Zauzou ------------------------------------------------------------------
# some turns are exactly 4000 with an FTO of 0?
test <- zauzou %>%
filter(duration==4000)
test$uid
convplot(test$uid,focus=T)
View(test)
# japanese and spanish timing investigation -------------------------------
whatsupwith <- c("japanese","spanish")
d %>%
filter(language %in% whatsupwith) %>%
ggplot(aes(FTO,duration)) +
theme_tufte() +
ggtitle('Floor transfer onset by turn duration') +
geom_point(alpha=0.1,na.rm=T) +
ylim(0,20000) +
xlim(-20000,20000) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position="bottom")
ggsave('ex-whatsupwith-japanese-spanish.png',bg="white")
d %>%
filter(language %in% whatsupwith) %>%
ggplot(aes(offsetCRUDE,duration)) +
theme_tufte() + theme(legend.position = "none") +
ggtitle('What process could have generated this pattern?',
subtitle='~5000 turns for which -offset = duration. Are these duplicate rows?') +
geom_point(alpha=0.1,na.rm=T) +
geom_point(data=. %>% filter(duration == -offsetCRUDE),
aes(offsetCRUDE,duration),color="red",alpha=0.3) +
ylim(0,20000) +
xlim(-20000,20000) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position="bottom")
ggsave('ex-whatsupwith-japanese-spanish-highlighted.png',bg="white")
d %>%
filter(language %in% whatsupwith) %>%
filter(duration == -offsetCRUDE) %>%
ggplot(aes(offsetCRUDE,duration)) +
theme_tufte() +
ggtitle('OffsetCRUDE by turn duration') +
geom_point(alpha=0.1,na.rm=T) +
ylim(0,20000) +
xlim(-20000,20000) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position="bottom")
weird <- d %>%
filter(language %in% whatsupwith) %>%
filter(duration == -offsetCRUDE)
View(weird)
uids <- sample(weird[weird$language %in% "spanish",]$uid,20)
uids <- c("spanish-11-500-697626","spanish-14-569-935684", "spanish-17-1047-1317073", "spanish-17-814-993060", "spanish-18-1433-1924591", "spanish-19-410-726290",
"spanish-19-506-924906", "spanish-7-113-199645")
convplot(uids,focus=T)
ggsave('ex-whatsupwith-spanish-convplot.png',bg="white")
test <- convplot(uids,datamode=T)
test %>% select(begin,end,duration,offsetCRUDE,utterance,focus)
uids <- sample(weird[weird$language %in% "japanese",]$uid,8)
uids <- c("japanese-40-228-525640", "japanese-61-99-292040", "japanese-7-117-423940", "japanese-7-12-130240", "japanese-7-128-460210", "japanese-7-219-722520", "japanese-7-90-332520", "japanese-87-120-302060")
convplot(uids,focus=T)
ggsave('ex-whatsupwith-japanese-convplot.png',bg="white")
unique(weird$source)
# Farsi subcorpora --------------------------------------------------------
# summary: below some data on weird subcorpora that might be worth excluding
# from FTO / timing-related analyses
weird_subcorpora <- c("/farsi/callfriend/fa_4699_asc.txt",
"/farsi/callfriend/fa_6936_asc.txt",
"/farsi/callfriend/fa_7003_asc.txt")
d %>% filter(language == "farsi" & source %in% weird_subcorpora) %>%
group_by(source) %>%
ggplot(aes(FTO)) +
theme_tufte() +
ggtitle(paste0('Floor transfer onset for Farsi by source')) +
geom_density(na.rm=T,size=1) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ source,strip.position ="bottom")
d %>% filter(language == "farsi" & source %notin% weird_subcorpora) %>%
group_by(source) %>%
ggplot(aes(FTO)) +
theme_tufte() +
ggtitle(paste0('Floor transfer onset for Farsi by source')) +
geom_density(na.rm=T,size=1) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ source,strip.position ="bottom")
ggsave('qc-farsi-by-source.png',bg="white",width=20,height=20)
# hey farsi source 20 looks weird, with lots of fully overlapping turns
#fa_4699
finduid("farsi-20-180-779680")
d %>% filter(source=="/farsi/callfriend/fa_4699_asc.txt") %>% View()
uids <- sample(d[d$source == "/farsi/callfriend/fa_4699_asc.txt",]$uid,20)
convplot(uids,window=200000)
ggsave("qc-farsi-fa_4699_asc.png",bg="white",height=12,width=5)
d %>% filter(source=="/farsi/callfriend/fa_4699_asc.txt") %>%
ggplot(aes(FTO)) +
theme_tufte() +
ggtitle(paste0('Floor transfer onset for fa_4699_asc')) +
geom_density(na.rm=T,size=1) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position ="bottom")
d %>% filter(language =="farsi" & source != "/farsi/callfriend/fa_4699_asc.txt") %>%
ggplot(aes(FTO)) +
theme_tufte() +
ggtitle(paste0('Floor transfer onset for all other Farsi data')) +
geom_density(na.rm=T,size=1) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position ="bottom")
# this makes me realize we can facet by source by language to find weirdly transcribed subcorpora
d %>% filter(language == "farsi") %>%
group_by(source) %>%
ggplot(aes(FTO)) +
theme_tufte() +
ggtitle(paste0('Floor transfer onset for Farsi by source')) +
geom_density(na.rm=T,size=1) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ source,strip.position ="bottom")
ggsave('qc-farsi-by-source.png',bg="white",width=20,height=20)
# fa_6936.asc
d %>% filter(source=="/farsi/callfriend/fa_6936_asc.txt") %>%
ggplot(aes(FTO)) +
theme_tufte() +
ggtitle(paste0('Floor transfer onset for fa_4699_asc')) +
geom_density(na.rm=T,size=1) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position ="bottom")
d %>% filter(source=="/farsi/callfriend/fa_6936_asc.txt") %>% View()
uids <- sample(d[d$source == "/farsi/callfriend/fa_6936_asc.txt",]$uid,20)
convplot(uids,window=200000)
ggsave("qc-farsi-fa_6936_asc.png",bg="white",height=12,width=5)
# fa_6419
# nothing much going on judging from convplots; just lots of competitive overlap
uids <- sample(d[d$source == "/farsi/callfriend/fa_6419_asc.txt",]$uid,20)
convplot(uids,window=200000)
# fa_4771
# nothing much going on judging from convplots; just lots of competitive overlap
uids <- sample(d[d$source == "/farsi/callfriend/fa_4771_asc.txt",]$uid,20)
convplot(uids,window=200000)
# fa_7003
# this one's mostly monologue, won't affect FTO or timing analyses
uids <- sample(d[d$source == "/farsi/callfriend/fa_7003_asc.txt",]$uid,20)
convplot(uids,window=200000)
d %>% filter(source=="/farsi/callfriend/fa_6936_asc.txt") %>% View()
# weird timing distributions ----------------------------------------------
# From the highly peaked distributions of a bunch of these it is clear that
# something is up. In most of these languages, the most frequent FTO is exactly
# 0, which is highly implausible and likely a transcription software quirk
# (breaking up annotations at one point). This means we cannot trust `end` value
# (and therefore also not `duration` or `FTO`), so we are not using them. One of
# them even has no FTOs <0 (Akie), showing that overlap is not marked and the
# content of annotations is not time-aligned with the actual speech.
#
# Two larger corpora, Brazilian Portugues and Czech, have lots of directly
# adjacent annotations by same speaker. Unless these can be concatenated in some
# empirically sensible way, timing information is not to be trusted, so we
# exclude those too for now.
weird_distributions <- c("akie","croatian","mambila","nganasan","khinalug","besemah","czech","brazilian_portuguese")
d.w <- d %>% filter(language %in% weird_distributions)
nturns <- d.w %>% drop_na(FTO) %>% ungroup() %>% summarize(n=n()) %>% as.integer()
pA <- d.w %>%
filter(language %in% c("akie","mambila")) %>%
ggplot(aes(FTO)) +
theme_tufte() +
ggtitle(paste0('Floor transfer onset')) +
xlim(-2000,2000) +
geom_density(na.rm=T,size=1,bw=0.5,trim=T) +
geom_vline(xintercept = 0,colour="#cccccc") +
facet_wrap(~ language,strip.position ="bottom")
d.w %>%
filter(language %in% c("akie","mambila")) %>% group_by(language) %>% summarise(n=n())
sort(table(d.w$FTO),decreasing=T)[1:10]
convplot(lang="akie",n=5,window=20000)
convplot(lang="mambila",n=5,window=20000)
uids <- c("akie-1-171-371969", "mambila-1-756-1260971")
pB <- convplot(uids)
pB <- pB +
ggtitle("Sample annotations in Akie and Mambila")
plot_grid(pB,pA,labels=c("A","B"),rel_widths=c(2,1))
ggsave('sup-akia-mambila-panel.png',bg="white",height=2,width=10)
# how weird are Brazilian Portuguese and Czech?
test <- d %>% filter(language %in% c("brazilian_portuguese","czech") & FTO < -250)
uids <- sample(test[test$language == "czech",]$uid,20)
convplot(uids,focus=T)
ggsave('ex-segmentation-weirdness_czech.png',bg="white")
View(convplot(uids,focus=T,datamode=T))
uids <- sample(test[test$language == "brazilian_portuguese",]$uid,6)
convplot(uids,focus=T)
ggsave('ex-segmentation-weirdness-brazilian_portugues.png',bg="white")
View(convplot(uids,focus=T,datamode=T))
# diagnosis: too weird to include in first run. They have lots of directly
# adjacent annotations by same speaker which I guess should be concatenated.
I have a large set of source level and corpus level notes in the original ElPaCo code — share this so that it can inspire functions and examples for quality control