Open stefanocoretta opened 6 months ago
I think this should be a separate function. The hard part is coming up with a name for it. Right now, I have pivot_nested_intervals()
. I don't want to say pivot_wider
or some variation with the word wider
because I don't want to interfere with tidyr::pivot_wider()
functions.
Here is sketch of it so far:
pivot_nested_intervals <- function(data, id_cols, nesting) {
{
# assertions/validation
# - that only `tier_names` are in `nesting`
# - point tiers can only occur in last layer in nesting
}
# split in to a list for each level of nesting
l <- data |> split(~tier_name) |> _[nesting]
# prepare each list element for table joins, renaming columns, computing midpoints
{}
# for each pair in nesting i, i+1 {
# left_join(l[[i]], l[[i+1]], join_by(id_cols, ..., between(y$xmid, x$xmin, x$xmax)), relationship = "one-to-many")
}
tail(l)
}
So that for your example, you start with the sentence
rows and join on all the word
rows where the midpoint of the word
interval falls between()
the boundaries of the sentence
interval. You now have one row per word
. Repeat that process, join to each word
all the ph
intervals with a midpoint between()
the boundaries of the word
interval.
There is no validation that a ph
interval is full contained within its parent word
interval. I don't want to make that assertion because I have textgrids where the word
/phone
intervals are adjusted but not forced to exactly line up.
For slickness, I wonder whether pivot_nested_intervals(..., nesting = "ph")
(i.e., one level of nesting) should work to provide a nice wide dataframe with renamed columns.
Then maybe it should be pivot_textgrid_tiers(..., tiers)
where if tiers
is more than element, we handle nesting.
Basic work is done. For the single tier pivots, I need to make sure that tibbles remain tibbles.
data <- structure(list(file = c("nested.TextGrid", "nested.TextGrid",
"nested.TextGrid", "nested.TextGrid", "nested.TextGrid", "nested.TextGrid",
"nested.TextGrid", "nested.TextGrid", "nested.TextGrid", "nested.TextGrid",
"nested.TextGrid", "nested.TextGrid", "nested.TextGrid", "nested.TextGrid",
"nested.TextGrid", "nested.TextGrid", "nested.TextGrid"), tier_num = c(1,
1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3), tier_name = c("words",
"words", "words", "words", "words", "phones", "phones", "phones",
"phones", "phones", "phones", "phones", "phones", "phones", "phones",
"phones", "utterance"), tier_type = c("IntervalTier", "IntervalTier",
"IntervalTier", "IntervalTier", "IntervalTier", "IntervalTier",
"IntervalTier", "IntervalTier", "IntervalTier", "IntervalTier",
"IntervalTier", "IntervalTier", "IntervalTier", "IntervalTier",
"IntervalTier", "IntervalTier", "IntervalTier"), tier_xmin = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), tier_xmax = c(1.86349206349206,
1.86349206349206, 1.86349206349206, 1.86349206349206, 1.86349206349206,
1.86349206349206, 1.86349206349206, 1.86349206349206, 1.86349206349206,
1.86349206349206, 1.86349206349206, 1.86349206349206, 1.86349206349206,
1.86349206349206, 1.86349206349206, 1.86349206349206, 1.86349206349206
), xmin = c(0, 0.419353348081418, 0.760948470524494, 0.854457749852487,
1.4422303627713, 0, 0.419353348081418, 0.524312743245492, 0.636905548966953,
0.760948470524494, 0.854457749852487, 1.05292642434455, 1.2304032198038,
1.32200414649245, 1.4422303627713, 1.78573383785372, 0), xmax = c(0.419353348081418,
0.760948470524494, 0.854457749852487, 1.4422303627713, 1.86349206349206,
0.419353348081418, 0.524312743245492, 0.636905548966953, 0.760948470524494,
0.854457749852487, 1.05292642434455, 1.2304032198038, 1.32200414649245,
1.4422303627713, 1.78573383785372, 1.86349206349206, 1.86349206349206
), text = c("", "hug", "", "daddy", "", "sil", "HH", "AH1", "G",
"sp", "D", "AE1", "D", "IY0", "sp", "", "hug daddy"), annotation_num = c(1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
1L)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"
))
#' @param data a textgrid dataframe created with [read_textgrid()]
#' @param tiers character vector of tiers to pivot into wide format.
#' @param join_cols character vector of the columns that will uniquely identify
#' a textgrid file. Defaults to `c("file", "tier_xmin", "tier_xmax")` because
#' these are identical columns for tiers read from the same textgrid.
pivot_textgrid_tiers <- function(
data,
tiers,
join_cols = c("file", "tier_xmin", "tier_xmax")
) {
{
stopifnot(
`tier names must be used in textgrid` =
all(tiers %in% unique(data[["tier_name"]]))
)
# todo
# allow only point tiers "TextTier" at last point in nesting
# tier_types <- data[c("tier_name", "tier_type")]
}
data <- data[data[["tier_name"]] %in% tiers, ]
f <- function(x, y) left_join_nested_tiers(x, y, join_cols)
l <- data |>
split(~tier_name) |>
_[tiers] |>
lapply(pivot_single_tier, join_cols) |>
Reduce(f, x = _)
l[["tier_name"]] <- NULL
l
}
pivot_single_tier <- function(data, join_cols) {
tier_name <- unique(data[["tier_name"]])
stopifnot(length(tier_name) == 1)
data[["xmid"]] <- data[["xmin"]] + (data[["xmax"]] - data[["xmin"]]) / 2
names_end <- c("tier_name", "tier_xmin", "tier_xmax")
names_front <- setdiff(join_cols, names_end)
names_mid <- c(
"xmin", "xmax", "xmid",
"annotation_num", "tier_num", "tier_type"
)
name_ordering <- c(names_front, "text", names_mid, names_end)
names_new <- c(
names_front, tier_name,
paste0(tier_name, "_", names_mid),
names_end
)
data <- data[name_ordering] |>
stats::setNames(names_new)
data
}
left_join_nested_tiers <- function(data_parent, data_child, join_cols) {
name_parent <- data_parent[["tier_name"]][1]
name_child <- data_child[["tier_name"]][1]
x_names <- paste0(name_parent, "_", c("xmin", "xmax"))
y_names <- paste0(name_child, "_", "xmid")
e <- rlang::expr(
dplyr::between(
`$`(y, !! y_names),
`$`(x, !! x_names[1]),
`$`(x, !! x_names[2])
)
)
data_parent[["tier_name"]] <- NULL
dplyr::left_join(
data_parent,
data_child,
dplyr::join_by(!!! join_cols, !! e),
relationship = "one-to-many"
)
}
data |>
pivot_textgrid_tiers("utterance")
#> file utterance utterance_xmin utterance_xmax utterance_xmid
#> 17 nested.TextGrid hug daddy 0 1.863492 0.931746
#> utterance_annotation_num utterance_tier_num utterance_tier_type tier_xmin
#> 17 1 3 IntervalTier 0
#> tier_xmax
#> 17 1.863492
data |>
pivot_textgrid_tiers("words")
#> file words words_xmin words_xmax words_xmid words_annotation_num
#> 1 nested.TextGrid 0.0000000 0.4193533 0.2096767 1
#> 2 nested.TextGrid hug 0.4193533 0.7609485 0.5901509 2
#> 3 nested.TextGrid 0.7609485 0.8544577 0.8077031 3
#> 4 nested.TextGrid daddy 0.8544577 1.4422304 1.1483441 4
#> 5 nested.TextGrid 1.4422304 1.8634921 1.6528612 5
#> words_tier_num words_tier_type tier_xmin tier_xmax
#> 1 1 IntervalTier 0 1.863492
#> 2 1 IntervalTier 0 1.863492
#> 3 1 IntervalTier 0 1.863492
#> 4 1 IntervalTier 0 1.863492
#> 5 1 IntervalTier 0 1.863492
data |>
pivot_textgrid_tiers(c("utterance", "words"))
#> # A tibble: 5 × 17
#> file utterance utterance_xmin utterance_xmax utterance_xmid
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 nested.TextGrid hug daddy 0 1.86 0.932
#> 2 nested.TextGrid hug daddy 0 1.86 0.932
#> 3 nested.TextGrid hug daddy 0 1.86 0.932
#> 4 nested.TextGrid hug daddy 0 1.86 0.932
#> 5 nested.TextGrid hug daddy 0 1.86 0.932
#> # ℹ 12 more variables: utterance_annotation_num <int>,
#> # utterance_tier_num <dbl>, utterance_tier_type <chr>, tier_xmin <dbl>,
#> # tier_xmax <dbl>, words <chr>, words_xmin <dbl>, words_xmax <dbl>,
#> # words_xmid <dbl>, words_annotation_num <int>, words_tier_num <dbl>,
#> # words_tier_type <chr>
data |>
pivot_textgrid_tiers(c("utterance", "words", "phones"))
#> # A tibble: 11 × 24
#> file utterance utterance_xmin utterance_xmax utterance_xmid
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 nested.TextGrid hug daddy 0 1.86 0.932
#> 2 nested.TextGrid hug daddy 0 1.86 0.932
#> 3 nested.TextGrid hug daddy 0 1.86 0.932
#> 4 nested.TextGrid hug daddy 0 1.86 0.932
#> 5 nested.TextGrid hug daddy 0 1.86 0.932
#> 6 nested.TextGrid hug daddy 0 1.86 0.932
#> 7 nested.TextGrid hug daddy 0 1.86 0.932
#> 8 nested.TextGrid hug daddy 0 1.86 0.932
#> 9 nested.TextGrid hug daddy 0 1.86 0.932
#> 10 nested.TextGrid hug daddy 0 1.86 0.932
#> 11 nested.TextGrid hug daddy 0 1.86 0.932
#> # ℹ 19 more variables: utterance_annotation_num <int>,
#> # utterance_tier_num <dbl>, utterance_tier_type <chr>, tier_xmin <dbl>,
#> # tier_xmax <dbl>, words <chr>, words_xmin <dbl>, words_xmax <dbl>,
#> # words_xmid <dbl>, words_annotation_num <int>, words_tier_num <dbl>,
#> # words_tier_type <chr>, phones <chr>, phones_xmin <dbl>, phones_xmax <dbl>,
#> # phones_xmid <dbl>, phones_annotation_num <int>, phones_tier_num <dbl>,
#> # phones_tier_type <chr>
Created on 2024-04-26 with reprex v2.1.0
Hi! Thanks for the package, it's cool!
I wonder if you could implement a way to read TGs as "hierarchical", so that, given a hierarchical specification, you get for example:
Where the hierarchical structure is
sentence > word > ph
and each of them are a tier in the TG.