assorted examples of quality checks

@bvreede Here's some code (won't run without access to the data) that we can walk though together for some ideas of useful quality checks. These are notes I have used in the selection and curation of corpora in the ACL and LREC papers.
# turns with identical times ----------------------------------------------

# Turns with identical times should be exceedingly rare.

# take a long source as an example

duplicate_annotations <- d %>% group_by(source,begin,end) %>%
  filter(n()>1) %>% ungroup() %>% 
  select(begin,end,duration,participant,utterance,source,language,uid,priorby)

duplications <- d %>% group_by(source,begin,end) %>%
  mutate(is_dup = ifelse(n() >1,1,0)) %>% ungroup() %>%
  select(begin,end,duration,participant,utterance,source,language,uid,is_dup)
duplications %>% 
  group_by(language) %>%
  summarise(turns = n_distinct(uid),
            dup = sum(is_dup),
            prop = dup/turns) %>%
  arrange(desc(prop))

convplot(sample(duplicate_annotations[duplicate_annotations$language != "nahuatl",]$uid,10),highlight=T,content=T)
ggsave('qc-duplicate-annotations.png',width=20,height=22,bg="white")

# total time of all included sources --------------------------------------

d %>% group_by(language,source) %>%
  summarize(start=min.na(begin),finish=max.na(end),
            totaltime = finish - start,
            hours = (totaltime/1000) / 3600)

#sort by length and inspect: are the shortest ones worth keeping?
d %>% group_by(language,source) %>%
  summarize(start=min.na(begin),finish=max.na(end),
            turns=n_distinct(uid),
            totaltime = finish - start,
            minutes = (totaltime/1000 / 60), 
            hours = (totaltime/1000) / 3600) %>%
  arrange(hours)

# sorting from longest shows no extreme times
d %>% group_by(language,source) %>%
  summarize(start=min.na(begin),finish=max.na(end),
            turns=n_distinct(uid),
            totaltime = finish - start,
            minutes = (totaltime/1000 / 60), 
            hours = (totaltime/1000) / 3600) %>%
  arrange(desc(hours))

# Akhoe -------------------------------------------------------------------

# The Akhoe corpus is so small that a long annotation occurring 4 times is
# pretty striking. Is this a copy paste error?

term <- "ga gestures that a mad (blind?) person has come"
d[d$utterance %in% term,]$source
uids <- d[d$utterance %in% term,]$uid
convplot(uids,focus=T,window=6000)

# Arabic ------------------------------------------------------------------

# Compare three callhome corpora

arabic1 <- read_corpus("arabic1") %>% mutate(language="arabic1")
arabic2 <- read_corpus("arabic2") %>% mutate(language="arabic2")
arabic3 <- read_corpus("arabic3") %>% mutate(language="arabic3")

loadedcorpora <- c("arabic1","arabic2","arabic3")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T)
rm(data)
# run FTO coding lines from processing.R

d %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle('Floor transfer onset') +
  geom_density(alpha=0.1,na.rm=T) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('qc-arabic-3corpora_FTO.png',bg="white")

d %>%
  ggplot(aes(FTO,duration)) +
  theme_tufte() +
  ggtitle('Floor transfer onset by turn duration') +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('qc-arabic-3corpora_FTOxdur.png',bg="white")

# Chatino -----------------------------------------------------------------

# Chatino is mostly massively monologic: only 463 speaker transitions out of
# 2154 turns in total, and not necessarily because the annotations are
# splitter-type

d %>% filter(language == "Chatino") %>% drop_na(FTO) 

# very few speaker transitions overall — I'm beginning to think this corpus may not be usable
d %>% filter(language == "Chatino") %>%
  group_by(source,priorby) %>%
  summarize(n=n())

d %>% filter(language == "Chatino") %>% 
  group_by(FTO) %>% summarise(n=n()) %>% arrange(desc(n))

convplot(lang="Chatino",focus=T,window=120000)

# One source especially is mostly monologue: "/zacetepec_chatino1/zac-2012_07_11-trans_mgh_mbh_amp.ea"

# Datooga -----------------------------------------------------------------

# Datooga unfortunately has only single tier transcriptions

d %>% filter(language == "Datooga") %>% View()

convplot(lang="Datooga",n=2,window=8000000)

# Dutch -------------------------------------------------------------------

tokenize_corpus <- function(langs=NULL) {

  d.tokens <- list()

  for (lang in langs) {

    print(paste('Tokenization: ',lang))
    d.temp <- d %>% filter(language == lang) %>%
      ungroup() %>%
      drop_na(utterance) %>% drop_na(utterance_stripped) %>%
      unnest_tokens(word, utterance_stripped) %>%
      count(word, sort=T, name="n") %>%
      mutate(language = lang,
             rank = row_number(desc(n)),
             total = sum(n),
             freq = n/total,
             freq_log = log(freq))

    d.tokens[[lang]] <- d.temp

  }

  d.tokens <- reduce(d.tokens, rbind)
  return(d.tokens)
}  

# Two corpora for Dutch
dutch1 <- read_corpus("dutch1") %>% mutate(language="dutch1")
dutch2 <- read_corpus("dutch2") %>% mutate(language="dutch2")

loadedcorpora <- c("dutch1","dutch2")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T)
rm(data)
d$translation <- NA

test <- d %>%
  mutate(duration_abs = abs(duration)) %>%
  filter(duration_abs > 40000)
d <- d %>% filter(uid %notin% test$uid)

d.tokens <- tokenize_corpus(c("dutch1","dutch2"))
inspect_language("dutch1",saveplot=T)
inspect_language("dutch2",saveplot=T)

convplot(lang="dutch1",window=10000,dyads=T,content=T)
ggsave('qc-panel-dutch1-convplot.png',bg="white",width=14,height=10)

# what to do with speaker = UNKNOWN ?

uids <- d[d$participant == "UNKNOWN",]$uid
test <- convplot(sample(uids,6),content=T,datamode=T)

convplot(sample(uids,6),content=T)
ggsave('qc-panel-dutch1-convplot.png',bg="white",width=14,height=10)

examples <- c("dutch-1486-387-568685", "dutch-2133-32-71095", "dutch-218-309-489890", 
              "dutch-2561-70-147440", "dutch-2739-305-496438", "dutch-909-74-95161")

test <- convplot(examples,datamode=T)
ggsave('qc-dutch1-convplot-unknown.png',bg="white", width=14,height=10)
test %>% select(uid,source,participant,participant_int) %>% View()

# English -----------------------------------------------------------------

# For English we have the following corpora:

english1 <- read_corpus("english1") %>% mutate(language="english1")
english2 <- read_corpus("english2") %>% mutate(language="english2")
english3 <- read_corpus("english3") %>% mutate(language="english3")
english4 <- read_corpus("english4") %>% mutate(language="english4")
english5 <- read_corpus("english5") %>% mutate(language="english5")
english6 <- read_corpus("english6") %>% mutate(language="english6")
english7 <- read_corpus("english7") %>% mutate(language="english7")

# combine all 
loadedcorpora <- c("english1","english2","english3","english4","english5","english6","english7")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T)
rm(data)
d <- english7

# verdict: English 1 looks very weird (no <0 timings); English6 too (all times
# 0). English 2, 3, and 7 have a (slight) problem of a sizable set of turns that
# begin at the exact same instant (very unlikely and also seen for Spanish and
# Japanese corpora that have been excluded for same reason).

# this leaves English 2 and English 4 as the most reasonable-looking ones. We
# might want to combine them.

d %>%
  ggplot(aes(FTO,duration)) +
  theme_tufte() +
  ggtitle('Floor transfer onset by turn duration') +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
#ggsave('qc-english-7corpora.png',bg="white")

# looking into english1
test <- d %>% filter(source %in% sources[1:100])
test %>%
  ggplot(aes(FTO,duration)) +
  theme_tufte() +
  ggtitle('Floor transfer onset by turn duration',
          subtitle="CABNC, first 100 sources (18370 transitions, 7 with negative FTO)") +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-10000,10000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('out/qc-english1-cabnc-FTO-by-duration.png',bg="white",width=6,height=4)

test %>%
  ungroup() %>%
  mutate(FTO_status = ifelse(FTO <0,"in overlap","not in overlap")) %>%
  drop_na(FTO) %>% group_by(FTO_status) %>%
  summarise(n=n())

test <- test %>% mutate(naiveFTO = begin-lag(end))
summary(test$naiveFTO)

# looking into english7

d %>%
  ggplot(aes(FTO,duration)) +
  theme_tufte() +
  ggtitle('Floor transfer onset by turn duration',
          subtitle="Santa Barbara Corpus of Spoken English (english7)") +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-10000,10000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('out/qc-english7-FTO-by-duration.png',bg="white",width=6,height=4)

d %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle('Floor transfer onset by turn duration',
          subtitle="Santa Barbara Corpus of Spoken English (english7)") +
  geom_density() +
  #ylim(0,20000) +
  xlim(-5000,5000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('out/qc-english7-FTO-density.png',bg="white",width=6,height=4)

FTO_0 <- d[which(d$FTO==0),]$uid
FTO_length <- d[which(d$FTO==d$duration),]$uid
these_uids <- sample(FTO_0,10)
these_uids <- sample(FTO_length,10)
convplot(these_uids,content=T,highlight=T)

# French ------------------------------------------------------------------

french1 <- read_corpus("french1") %>% mutate(language="french1")
french2 <- read_corpus("french2") %>% mutate(language="french2")
french3 <- read_corpus("french3") %>% mutate(language="french3")

# combine all 
loadedcorpora <- c("french1","french2","french3")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T) 
rm(data)

d <- d %>% mutate(translation = NA)
d %>%
  ggplot(aes(FTO,duration)) +
  theme_tufte() +
  ggtitle('Floor transfer onset by turn duration') +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('qc-french-3corpora.png',bg="white")

inspect_language("french1",saveplot = T)
inspect_language("french2",saveplot = T)
inspect_language("french3",saveplot = T)

# German ------------------------------------------------------------------

german1 <- read_corpus("german1") %>% mutate(language="german1")
german2 <- read_corpus("german2") %>% mutate(language="german2")

# combine all 
loadedcorpora <- c("german1","german2")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T) 
rm(data)

d <- d %>% mutate(translation = NA)

inspect_language("german1",saveplot = T)
inspect_language("german2",saveplot = T)

convplot(lang="german1")
ggsave('qc-conv-german1.png',bg="white")
convplot(lang="german2")
ggsave('qc-conv-german2.png',bg="white")

# Hausa -------------------------------------------------------------------

# Hausa prior to post-processing has 'splitter-type' segmentation, and also has
# an unlikely proportion of FTOs at exactly 0 (586 out of 1585)

d %>% filter(language == "hausa") %>% 
  group_by(FTO) %>% summarise(n=n()) %>% arrange(desc(n))

convplot(lang="hausa",window=20000,focus=T)

# Mixtec ------------------------------------------------------------------

mixtec <- d %>% filter(langshort=="Mixtec")

inspect_corpus("yoloxochitl_mixtec")

mixtec %>%
  filter(priorby=="other")

# oof, only 212 out of 50k turns have a prior turn by other instead of self

# Polish ------------------------------------------------------------------

# polish has some outliers in duration

test <- d %>% filter(language=="polish") %>%
  mutate(duration_abs = abs(duration)) %>%
  filter(duration_abs > 40000)
convplot(test$uid)
test$source
# three of these are in MW_006, let's have a closer look

uids <- sample(d[d$source == "/polish/MW_006.eaf",]$uid,10)
convplot(uids,window=20000)

# Siputhi -----------------------------------------------------------------

# Siputhi is a very small corpus (430 turns, of which only 328 are speaker
# transitions). Timing appears to be weirdly spiky mainly because of this

d %>% filter(language == "Siputhi") %>% 
  group_by(FTO) %>% summarise(n=n()) %>% arrange(desc(n)) %>% View()

convplot(lang="Siputhi",window=20000,focus=T)

# Totoli ------------------------------------------------------------------

# Totoli has a mean annotation length of only 798, and many self-transitions,
# indicating that turns are segmented with a splitter's temperament

convplot(lang="Totoli")

theseuids <- c("totoli-1-139-150117", "totoli-1-209-259023", "totoli-1-66-76901")

convplot(theseuids)

# Zaar --------------------------------------------------------------------

# Zaar is relatively small and segmented in splitter fashion, with relatively
# few speaker transitions (520 other vs 1234 self). This is especially striking
# in BC_CONV_03.eaf. There are also relatively many FTOs of 0 (201 out of 520)
# indicating sth about transcription convention or software.

d %>% filter(language == "Zaar") %>% 
  group_by(source,priorby) %>% summarise(n=n())

d %>% filter(language == "Zaar") %>% 
  group_by(FTO) %>% summarise(n=n()) %>% arrange(desc(n)) %>% View()

convplot(lang="Zaar",window=20000,focus=T)

# Japanese and German selection -------------------------------------------

japanese1 <- read_corpus("japanese1") %>% mutate(language = "japanese1")
japanese2 <- read_corpus("japanese2") %>% mutate(language = "japanese2")
japanese3 <- read_corpus("japanese3") %>% mutate(language = "japanese3")
spanish1 <- read_corpus("spanish1") %>% mutate(language = "spanish1")
spanish2 <- read_corpus("spanish2") %>% mutate(language = "spanish2")
spanish3 <- read_corpus("spanish3") %>% mutate(language = "spanish3")

# combine all 
loadedcorpora <- c("japanese1","japanese2","japanese3","spanish1","spanish2","spanish3")
data = do.call("list", mget(loadedcorpora))

d <- rbindlist(data,fill=T)
rm(data)

# Mandarin ----------------------------------------------------------------

# mandarin comparing 3 corpora
mandarin1 <- read_corpus("mandarin1") %>% mutate(language = "mandarin1")
mandarin2 <- read_corpus("mandarin2") %>% mutate(language = "mandarin2")

# mandarin check
mandarindata <- list(mandarin1,mandarin2)
d <- rbindlist(mandarindata,fill=T)

inspect_language("mandarin1",saveplot=T)
inspect_language("mandarin2",saveplot=T)

# Tseltal timing investigation --------------------------------------------

# The two largest Tseltal sources unfortunately have a disproportionate amount
# of transitions set at exactly zero, an artefact of cutting up annotations
# during segmentation. This means we cannot trust these to represent true
# time-aligned annotations, and we cannot use them to precision-target segments
# of audio.

d %>% filter(language == "Tseltal") %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for all turns by source')) +
  geom_density(na.rm=T,size=1) +
  xlim(-2000,2000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ source,strip.position ="bottom")

d %>%
  filter(language %in% c("Tseltal")) %>% group_by(source) %>% 
  summarise(n=n(),
            medianFTO=median.na(FTO),
            meanFTO=mean.na(FTO))
d %>%
  filter(language %in% c("Tseltal")) %>% group_by(FTO) %>% 
  summarise(n=n()) %>% arrange(desc(n))

# Zauzou ------------------------------------------------------------------

# some turns are exactly 4000 with an FTO of 0?

test <- zauzou %>%
  filter(duration==4000)
test$uid
convplot(test$uid,focus=T)
View(test)

# japanese and spanish timing investigation -------------------------------

whatsupwith <- c("japanese","spanish")

d %>%
  filter(language %in% whatsupwith) %>%
  ggplot(aes(FTO,duration)) +
  theme_tufte() +
  ggtitle('Floor transfer onset by turn duration') +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('ex-whatsupwith-japanese-spanish.png',bg="white")

d %>%
  filter(language %in% whatsupwith) %>%
  ggplot(aes(offsetCRUDE,duration)) +
  theme_tufte() + theme(legend.position = "none") +
  ggtitle('What process could have generated this pattern?',
          subtitle='~5000 turns for which -offset = duration. Are these duplicate rows?') +
  geom_point(alpha=0.1,na.rm=T) +
  geom_point(data=. %>% filter(duration == -offsetCRUDE),
             aes(offsetCRUDE,duration),color="red",alpha=0.3) +
  ylim(0,20000) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('ex-whatsupwith-japanese-spanish-highlighted.png',bg="white")

d %>%
  filter(language %in% whatsupwith) %>%
  filter(duration == -offsetCRUDE) %>%
  ggplot(aes(offsetCRUDE,duration)) +
  theme_tufte() +
  ggtitle('OffsetCRUDE by turn duration') +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")

weird <- d %>%
  filter(language %in% whatsupwith) %>%
  filter(duration == -offsetCRUDE)
View(weird)

uids <- sample(weird[weird$language %in% "spanish",]$uid,20)
uids <- c("spanish-11-500-697626","spanish-14-569-935684", "spanish-17-1047-1317073", "spanish-17-814-993060", "spanish-18-1433-1924591", "spanish-19-410-726290", 
          "spanish-19-506-924906", "spanish-7-113-199645")
convplot(uids,focus=T)
ggsave('ex-whatsupwith-spanish-convplot.png',bg="white")
test <- convplot(uids,datamode=T)
test %>% select(begin,end,duration,offsetCRUDE,utterance,focus)

uids <- sample(weird[weird$language %in% "japanese",]$uid,8)
uids <- c("japanese-40-228-525640", "japanese-61-99-292040", "japanese-7-117-423940", "japanese-7-12-130240", "japanese-7-128-460210", "japanese-7-219-722520", "japanese-7-90-332520", "japanese-87-120-302060")
convplot(uids,focus=T)
ggsave('ex-whatsupwith-japanese-convplot.png',bg="white")

unique(weird$source)

# Farsi subcorpora --------------------------------------------------------

# summary: below some data on weird subcorpora that might be worth excluding
# from FTO / timing-related analyses
weird_subcorpora <- c("/farsi/callfriend/fa_4699_asc.txt",
                      "/farsi/callfriend/fa_6936_asc.txt",
                      "/farsi/callfriend/fa_7003_asc.txt")

d %>% filter(language == "farsi" & source %in% weird_subcorpora) %>%
  group_by(source) %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for Farsi by source')) +
  geom_density(na.rm=T,size=1) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ source,strip.position ="bottom")

d %>% filter(language == "farsi" & source %notin% weird_subcorpora) %>%
  group_by(source) %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for Farsi by source')) +
  geom_density(na.rm=T,size=1) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ source,strip.position ="bottom")
ggsave('qc-farsi-by-source.png',bg="white",width=20,height=20)

# hey farsi source 20 looks weird, with lots of fully overlapping turns
#fa_4699
finduid("farsi-20-180-779680")
d %>% filter(source=="/farsi/callfriend/fa_4699_asc.txt") %>% View()
uids <- sample(d[d$source == "/farsi/callfriend/fa_4699_asc.txt",]$uid,20)
convplot(uids,window=200000)
ggsave("qc-farsi-fa_4699_asc.png",bg="white",height=12,width=5)

d %>% filter(source=="/farsi/callfriend/fa_4699_asc.txt") %>% 
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for fa_4699_asc')) +
  geom_density(na.rm=T,size=1) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position ="bottom")

d %>% filter(language =="farsi" & source != "/farsi/callfriend/fa_4699_asc.txt") %>% 
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for all other Farsi data')) +
  geom_density(na.rm=T,size=1) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position ="bottom")

# this makes me realize we can facet by source by language to find weirdly transcribed subcorpora
d %>% filter(language == "farsi") %>%
  group_by(source) %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for Farsi by source')) +
  geom_density(na.rm=T,size=1) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ source,strip.position ="bottom")
ggsave('qc-farsi-by-source.png',bg="white",width=20,height=20)

# fa_6936.asc
d %>% filter(source=="/farsi/callfriend/fa_6936_asc.txt") %>% 
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for fa_4699_asc')) +
  geom_density(na.rm=T,size=1) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position ="bottom")
d %>% filter(source=="/farsi/callfriend/fa_6936_asc.txt") %>% View()
uids <- sample(d[d$source == "/farsi/callfriend/fa_6936_asc.txt",]$uid,20)
convplot(uids,window=200000)
ggsave("qc-farsi-fa_6936_asc.png",bg="white",height=12,width=5)

# fa_6419
# nothing much going on judging from convplots; just lots of competitive overlap
uids <- sample(d[d$source == "/farsi/callfriend/fa_6419_asc.txt",]$uid,20)
convplot(uids,window=200000)

# fa_4771
# nothing much going on judging from convplots; just lots of competitive overlap
uids <- sample(d[d$source == "/farsi/callfriend/fa_4771_asc.txt",]$uid,20)
convplot(uids,window=200000)

# fa_7003
# this one's mostly monologue, won't affect FTO or timing analyses
uids <- sample(d[d$source == "/farsi/callfriend/fa_7003_asc.txt",]$uid,20)
convplot(uids,window=200000)
d %>% filter(source=="/farsi/callfriend/fa_6936_asc.txt") %>% View()

# weird timing distributions ----------------------------------------------

# From the highly peaked distributions of a bunch of these it is clear that
# something is up. In most of these languages, the most frequent FTO is exactly
# 0, which is highly implausible and likely a transcription software quirk
# (breaking up annotations at one point). This means we cannot trust `end` value
# (and therefore also not `duration` or `FTO`), so we are not using them. One of
# them even has no FTOs <0 (Akie), showing that overlap is not marked and the
# content of annotations is not time-aligned with the actual speech.
#
# Two larger corpora, Brazilian Portugues and Czech, have lots of directly
# adjacent annotations by same speaker. Unless these can be concatenated in some
# empirically sensible way, timing information is not to be trusted, so we
# exclude those too for now.

weird_distributions <- c("akie","croatian","mambila","nganasan","khinalug","besemah","czech","brazilian_portuguese")
d.w <- d %>% filter(language %in% weird_distributions)

nturns <- d.w %>% drop_na(FTO) %>% ungroup() %>% summarize(n=n()) %>% as.integer()

pA <- d.w %>%
  filter(language %in% c("akie","mambila")) %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset')) +
  xlim(-2000,2000) +
  geom_density(na.rm=T,size=1,bw=0.5,trim=T) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position ="bottom")

d.w %>%
  filter(language %in% c("akie","mambila")) %>% group_by(language) %>% summarise(n=n())

sort(table(d.w$FTO),decreasing=T)[1:10]

convplot(lang="akie",n=5,window=20000)
convplot(lang="mambila",n=5,window=20000)

uids <- c("akie-1-171-371969", "mambila-1-756-1260971")
pB <- convplot(uids)
pB <- pB +
  ggtitle("Sample annotations in Akie and Mambila")

plot_grid(pB,pA,labels=c("A","B"),rel_widths=c(2,1))

ggsave('sup-akia-mambila-panel.png',bg="white",height=2,width=10)

# how weird are Brazilian Portuguese and Czech?
test <- d %>% filter(language %in% c("brazilian_portuguese","czech") & FTO < -250)

uids <- sample(test[test$language == "czech",]$uid,20)
convplot(uids,focus=T)
ggsave('ex-segmentation-weirdness_czech.png',bg="white")
View(convplot(uids,focus=T,datamode=T))

uids <- sample(test[test$language == "brazilian_portuguese",]$uid,6)
convplot(uids,focus=T)
ggsave('ex-segmentation-weirdness-brazilian_portugues.png',bg="white")
View(convplot(uids,focus=T,datamode=T))

# diagnosis: too weird to include in first run. They have lots of directly
# adjacent annotations by same speaker which I guess should be concatenated.
elpaco-escience / talkr

assorted examples of quality checks #37