signaturescience / skater

SKATE R Utilities
https://signaturescience.github.io/skater/
Other
9 stars 5 forks source link

first pass at including ibd utils and relatedness functions #43

Closed vpnagraj closed 3 years ago

vpnagraj commented 3 years ago

work underway to bring in the IBD segment analysis utilities.

will remove WIP tag and suggest @stephenturner as reviewer when the following are done:

vpnagraj commented 3 years ago

for the demonstration of the kinship coefficient logic:

library(tidyverse)
library(skater)

plinkmap <- read_map("/data/projects/skate/data/hapmap/plink.allchr.GRCh37.map")

## read in output from IBDkin
ibdkin_out <- 
  read_tsv("/data/projects/skate/data/analysis_mbs/simulation_native/hap-ibd/MXL-gsa37-5GHS-er_0-ms_0-hom_0.out.kinship.gz") %>%
  select(id1 = ID1, id2 = ID2, ibdkin_kinship = kinship) %>%
  arrange_ids(., id1, id2)

## read in hapibd ouput for the same simulation
## compute kinship coefficient from IBD per the skater algo (adapted from original Browning Py script)
skater_out <- 
  read_ibd("/data/projects/skate/data/analysis_mbs/simulation_native/hap-ibd/MXL-gsa37-5GHS-er_0-ms_0-hom_0.out.ibd.gz", 
                       source = "hapibd") %>%
  ibd2kin(.ibd_data = ., .map = plinkmap)

## join skater kinship coef data and IBDkin
joined <-
  ibdkin_out %>%
  full_join(rename(skater_out, skater_kinship = kinship))

## the result objects arent the same length
## which pairs are missing from each tool ..

joined %>%
  filter(is.na(ibdkin_kinship) & !is.na(skater_kinship))

joined %>%
  filter(!is.na(ibdkin_kinship) & is.na(skater_kinship))

joined %>%
  filter(is.na(ibdkin_kinship) & is.na(skater_kinship))

## plot the two
joined %>%
  ggplot(aes(ibdkin_kinship, skater_kinship)) +
  geom_point(alpha = 0.05) +
  theme_classic() +
  labs(x="k.IBDKin", y = "k.skater")

skater_ibdkin_scatter

vpnagraj commented 3 years ago

as for retrieving kinship from .seg file:

library(tidyverse)
library(skater)
plinkmap <- read_map("/data/projects/skate/data/hapmap/plink.allchr.GRCh37.map")
seglikeibd <- read_ibd("/data/projects/skate/data/analysis_mbs/simulation_native/sims/MXL-gsa37-5GHS-er_0-ms_0-hom_0.seg",
                       source = "pedsim")
degree_resolution <- 4

truth <- read_fam("/data/projects/skate/data/analysis_mbs/simulation_native/sims/MXL-gsa37-5GHS-er_0-ms_0-hom_0-everyone.fam") %>%
  fam2ped() %>%
  mutate(x=map(ped, ped2kinpair)) %>%
  select(x) %>%
  unnest(x) %>%
  mutate(d.truth=kin2degree(k, max_degree=degree_resolution)) %>%
  rename(k.truth=k) %>%
  print()

res <-
  ibd2kin(.ibd_data = seglikeibd, .map = plinkmap) %>%
  mutate(degree = kin2degree(kinship)) %>%
  left_join(truth) %>%
  mutate(across(starts_with("d"), ~factor(replace_na(., "Unrelated")))) 

res %>%
  ggplot(aes(kinship)) +
  geom_density(aes(fill = d.truth), alpha = 0.5) +
  geom_vline(xintercept=2^(-seq(3,(degree_resolution*2+1+2),2)/2), lty=3) + 
  #geom_vline(data=tibble(x=.5/2^(1:degree_resolution)), aes(xintercept=x, col=factor(rev(x))), lty=1, alpha=.6) +
  geom_vline(data=tibble(x=.5/2^(1:degree_resolution), k.theoretical = factor(rev(x))), aes(xintercept=x, col=k.theoretical), lty=1, alpha=.6) +
  scale_color_manual(values=scales::hue_pal()(degree_resolution+1)[1:degree_resolution]) + 
  theme_classic() + 
  scale_x_continuous(breaks=.5/2^(1:degree_resolution)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  theme_classic() +
  labs(x = "k")

skaterkin_density_4

stephenturner commented 3 years ago

Modifying the code ever so slightly -- this draws vertical lines at the theoretical kinship coefficients (.25, .125, etc).

library(tidyverse)
library(skater)
plinkmap <- read_map("/data/projects/skate/data/hapmap/plink.allchr.GRCh37.map")
seglikeibd <- read_ibd("/data/projects/skate/data/analysis_mbs/simulation_native/sims/MXL-gsa37-5GHS-er_0-ms_0-hom_0.seg",
                       source = "pedsim")
degree_resolution <- 3

truth <- read_fam("/data/projects/skate/data/analysis_mbs/simulation_native/sims/MXL-gsa37-5GHS-er_0-ms_0-hom_0-everyone.fam") %>%
  fam2ped() %>%
  mutate(x=map(ped, ped2kinpair)) %>%
  select(x) %>%
  unnest(x) %>%
  mutate(d.truth=kin2degree(k, max_degree=degree_resolution)) %>%
  rename(k.truth=k) %>%
  print()

res <-
  ibd2kin(.ibd_data = seglikeibd, .map = plinkmap) %>%
  mutate(degree = kin2degree(kinship)) %>%
  left_join(truth) %>%
  mutate(across(starts_with("d"), ~factor(replace_na(., "Unrelated")))) 

res %>%
  ggplot(aes(kinship)) +
  geom_density(aes(fill = d.truth), alpha = 0.5) +
  geom_vline(xintercept=2^(-seq(3,(degree_resolution*2+1+2),2)/2), lty=3) + 
  geom_vline(data=tibble(x=.5/2^(1:degree_resolution)), aes(xintercept=x, col=factor(rev(x))), lty=1, alpha=.6) +
  scale_color_manual(values=scales::hue_pal()(degree_resolution+1)[1:degree_resolution]) + 
  theme_classic() + 
  scale_x_continuous(breaks=.5/2^(1:degree_resolution)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  theme_classic() +
  labs(x = "k")

image

It looks really close. The center is just slightly lower than what it should be in theory. I'm guessing this may be due to simulating with the SS genetic map but calculating K from seg using a SA map? Or, potentially something in handling IBD2 vs IBD1? (I think this is less likely given the near perfect agreement with ibdkin as shown in https://github.com/signaturescience/skater/pull/43#issuecomment-829614886.

Changing degree_resolution to 4 and rerunning:

image

vpnagraj commented 3 years ago

@stephenturner i think we are good to merge here. requesting your review to confirm.

ive added the example data now. the code below demonstrates the usage of these new functions (again) but with the small IBD segment datasets that now ship with the package.

NOTE we still dont have map data included in the package. instructions for generating the "plink map" formatted hapmap data is in ?read_map

for the .seg to kinship coefficient:

library(tidyverse)
library(skater)
plinkmap <- read_map("/data/projects/skate/data/hapmap/plink.allchr.GRCh37.map")
seglikeibd <- read_ibd(system.file("extdata", "GBR.sim.seg.gz", package="skater"),
                       source = "pedsim")
degree_resolution <- 3

truth <- read_fam(system.file("extdata", "GBR.sim.fam", package="skater")) %>%
  fam2ped() %>%
  mutate(x=map(ped, ped2kinpair)) %>%
  select(x) %>%
  unnest(x) %>%
  mutate(d.truth=kin2degree(k, max_degree=degree_resolution)) %>%
  rename(k.truth=k) %>%
  print()

res <-
  ibd2kin(.ibd_data = seglikeibd, .map = plinkmap) %>%
  mutate(degree = kin2degree(kinship)) %>%
  left_join(truth) %>%
  mutate(across(starts_with("d"), ~factor(replace_na(., "Unrelated")))) 

res %>%
  ggplot(aes(kinship)) +
  geom_density(aes(fill = d.truth), alpha = 0.5) +
  geom_vline(xintercept=2^(-seq(3,(degree_resolution*2+1+2),2)/2), lty=3) + 
  geom_vline(data=tibble(x=.5/2^(1:degree_resolution), k.theoretical = factor(rev(x))), aes(xintercept=x, col=k.theoretical), lty=1, alpha=.6) +
  scale_color_manual(values=scales::hue_pal()(degree_resolution+1)[1:degree_resolution]) + 
  theme_classic() + 
  scale_x_continuous(breaks=.5/2^(1:degree_resolution)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  theme_classic() +
  labs(x = "k")

skater_density2

for the hap-ibd output to kinship coefficient and calculating categorical accuracy:

hapibd <- 
  read_ibd(system.file("extdata", "GBR.sim.ibd.gz", package="skater"), 
           source = "hapibd") %>%
  ibd2kin(.ibd_data = ., .map = plinkmap)

truth <- read_fam(system.file("extdata", "GBR.sim.fam", package="skater")) %>%
  fam2ped() %>%
  mutate(x=map(ped, ped2kinpair)) %>%
  select(x) %>%
  unnest(x) %>%
  mutate(d.truth=kin2degree(k, max_degree=degree_resolution)) %>%
  rename(k.truth=k) %>%
  print()

hapibd_res <-
  hapibd %>%
  mutate(degree = kin2degree(kinship)) %>%
  left_join(truth) %>%
  mutate(across(starts_with("d"), ~factor(replace_na(., "Unrelated"))))

confusion_matrix(prediction=hapibd_res$degree, target=hapibd_res$d.truth)
$Accuracy
# A tibble: 1 x 5
  Accuracy `Accuracy LL` `Accuracy UL` `Accuracy Guessing` `Accuracy P-value`
     <dbl>         <dbl>         <dbl>               <dbl>              <dbl>
1    0.990         0.964         0.999               0.755           2.55e-21

$Other
# A tibble: 5 x 15
  Class     N `Sensitivity/Re… `Specificity/TN… `PPV/Precision`   NPV `F1/Dice` Prevalence `Detection Rate` `Detection Prev… `Balanced Accur…
  <chr> <dbl>            <dbl>            <dbl>           <dbl> <dbl>     <dbl>      <dbl>            <dbl>            <dbl>            <dbl>
1 1        27            1                1               1     1         1         0.138            0.138            0.138             1    
2 2        18            0.944            1               1     0.994     0.971     0.0918           0.0867           0.0867            0.972
3 3         3            0.667            0.995           0.667 0.995     0.667     0.0153           0.0102           0.0153            0.831
4 Unre…   148            1                0.979           0.993 1         0.997     0.755            0.755            0.760             0.990
5 Aver…    49            0.903            0.993           0.915 0.997     0.909     0.250            0.247            0.25              0.948
# … with 4 more variables: FDR <dbl>, FOR <dbl>, `FPR/Fallout` <dbl>, FNR <dbl>

$Table
           Target
Predicted     1   2   3 Unrelated
  1          27   0   0         0
  2           0  17   0         0
  3           0   1   2         0
  Unrelated   0   0   1       148

$recip_rmse
[1] 0.00933844
vpnagraj commented 3 years ago

fixes #37

vpnagraj commented 3 years ago

thanks for the review @stephenturner . i hit everything you suggested. back to you to resolve / merge.

heads up i did not rebuild the pkgdown site

stephenturner commented 3 years ago

I think I'm good with everything here. Before merging, just seeing if you have any thoughts on problem I'm running into. This is standard tidyverse stuff, not anything specific to functionality here. Reprex demonstrating below.

If I have data for relationships 1-2, 1-3, 1-4, 2-3, 2-4, 3-4, I need "true" simulated kinships for all 6 of those relationships. Now, given the tool results (or the results from fam file parsing to get theoretical kinship coefficients), I think I can go through the simkin pairs and left join, followed by turning NAs into zeros for the simkin values.

I thought I could use a complete() filling in zeros. But turns out this behavior isn't doing what I want, resulting in lots of duplicate entries for id1-id2 / id2-id1, with different values of sim-kinship (the actual, and a filled-in zero).

suppressPackageStartupMessages(suppressWarnings(library(tidyverse)))
suppressPackageStartupMessages(suppressWarnings(library(skater)))

# We want comparisons:
# 1-2, 1-3, 1-4
# 2-3, 2-4
# 3-4
# Let's say we got out of our analysis comparisons:
# 1-2, 1-3, --
# 3-2, --
# 3-4
# We need to fill in 1-4, switch around 3-2 to 2-3, fill in 2-4
tmp <- tibble::tribble(
  ~id1, ~id2, ~k.sim,
  1L,   2L,    0.1,
  1L,   3L,    0.2,
  3L,   2L,    0.4,
  3L,   4L,    0.6
)
tmp
#> # A tibble: 4 x 3
#>     id1   id2 k.sim
#>   <int> <int> <dbl>
#> 1     1     2   0.1
#> 2     1     3   0.2
#> 3     3     2   0.4
#> 4     3     4   0.6

# This doesn't work! The complete gets us a 3-3 (unneeded), but doesn't give us the 2-4.
tmp %>%
  complete(id1, id2, fill=list(k.sim=0)) %>%
  arrange_ids(id1, id2)
#> # A tibble: 6 x 3
#>     id1   id2 k.sim
#>   <int> <int> <dbl>
#> 1     1     2   0.1
#> 2     1     3   0.2
#> 3     1     4   0  
#> 4     2     3   0.4
#> 5     3     3   0  
#> 6     3     4   0.6

# If we arrange IDs before we complete, this doesn't work either! We now get a duplicate entry for 2-3!
# One entry has the value we want, the other has a zero filled in for the 3-2 comparison, which gets arranged to 2-3.
tmp %>%
  arrange_ids(id1, id2) %>%
  complete(id1, id2, fill=list(k.sim=0))
#> # A tibble: 9 x 3
#>     id1   id2 k.sim
#>   <int> <int> <dbl>
#> 1     1     2   0.1
#> 2     1     3   0.2
#> 3     1     4   0  
#> 4     2     2   0  
#> 5     2     3   0.4
#> 6     2     4   0  
#> 7     3     2   0  
#> 8     3     3   0  
#> 9     3     4   0.6

# Easier to see if we get rid of entries where id1==id2, and arrange, and add a count.
tmp %>%
  arrange_ids(id1, id2) %>%
  complete(id1, id2, fill=list(k.sim=0)) %>%
  arrange_ids(id1, id2) %>%
  arrange(id1, id2) %>%
  filter(id1!=id2) %>%
  add_count(id1, id2)
#> # A tibble: 7 x 4
#>     id1   id2 k.sim     n
#>   <int> <int> <dbl> <int>
#> 1     1     2   0.1     1
#> 2     1     3   0.2     1
#> 3     1     4   0       1
#> 4     2     3   0.4     2
#> 5     2     3   0       2
#> 6     2     4   0       1
#> 7     3     4   0.6     1

# Let's see what happens with real data!
sk <- read_csv("/data/projects/skate/data/analysis_mbs/simulation_downselection/downsample/ASW-gsa37-5GHS-er_0.05-ms_0.1-hom_0.simkin",
               col_types="ccd", col_names=c("id1", "id2", "k.sim"), skip=1L)
# Need to arrange IDs because this was added after the last code review.
sk <- sk %>% arrange_ids(id1, id2)

# How many pairs should we expect? (unique IDs choose 2)
choose(length(unique(c(sk$id1, sk$id2))), 2)
#> [1] 21945

# You don't have entries for every pair.
nrow(sk)
#> [1] 16089

# So what if we complete, then arrange? We have many more than n choose 2.
sk %>%
  complete(id1, id2, fill=list(k.sim=0)) %>%
  arrange_ids(id1, id2) %>%
  distinct()
#> # A tibble: 36,769 x 3
#>    id1                    id2                    k.sim
#>    <chr>                  <chr>                  <dbl>
#>  1 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b1-i1 0.236
#>  2 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b1-i2 0.236
#>  3 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b2-i1 0.236
#>  4 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b2-i2 0.236
#>  5 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b3-i1 0.236
#>  6 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b3-i2 0.236
#>  7 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b4-i1 0.236
#>  8 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b4-i2 0.236
#>  9 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b5-i1 0.236
#> 10 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b5-i2 0.236
#> # … with 36,759 more rows

# Which have >1 entries?
sk %>%
  complete(id1, id2, fill=list(k.sim=0)) %>%
  arrange_ids(id1, id2) %>%
  distinct() %>%
  count(id1, id2, sort=TRUE) %>%
  filter(n>1)
#> # A tibble: 15,210 x 3
#>    id1                    id2                        n
#>    <chr>                  <chr>                  <int>
#>  1 halfSiblings1_g2-b1-i1 halfSiblings1_g2-b1-i2     2
#>  2 halfSiblings1_g2-b1-i1 halfSiblings1_g2-b2-i1     2
#>  3 halfSiblings1_g2-b1-i1 halfSiblings1_g2-b2-i2     2
#>  4 halfSiblings1_g2-b1-i1 halfSiblings1_g2-b3-i1     2
#>  5 halfSiblings1_g2-b1-i1 halfSiblings1_g2-b3-i2     2
#>  6 halfSiblings1_g2-b1-i1 halfSiblings1_g2-b4-i1     2
#>  7 halfSiblings1_g2-b1-i1 halfSiblings1_g2-b4-i2     2
#>  8 halfSiblings1_g2-b1-i1 halfSiblings1_g2-b5-i1     2
#>  9 halfSiblings1_g2-b1-i1 halfSiblings1_g2-b5-i2     2
#> 10 halfSiblings1_g2-b1-i1 halfSiblings1_g2-b6-i1     2
#> # … with 15,200 more rows
stephenturner commented 3 years ago

FWIW, the left join by id1 and id2 against the expected kinship coefficients, then replacing NAs with 0s (unless id1==id2, then replace NA with 0.5). In fact, this is probably the preferred approach considering that for extremely distant relatives there may be no simulated/reported IBD segments at all! So, there may be certain IDs that don't appear in the segment file output at all, so even if we figured out this complete() procedure, it wouldn't work if an ID wasn't reported at all.

stephenturner commented 3 years ago
vpnagraj commented 3 years ago

@stephenturner how about this ...

library(skater)
library(tidyverse)

# Let's see what happens with real data!
sk <- read_csv("/data/projects/skate/data/analysis_mbs/simulation_downselection/downsample/ASW-gsa37-5GHS-er_0.05-ms_0.1-hom_0.simkin",
               col_types="ccd", col_names=c("id1", "id2", "k.sim"), skip=1L)
# Need to arrange IDs because this was added after the last code review.
sk <- sk %>% arrange_ids(id1, id2)

## make a tibble with two matching columns with all ids
all_ids <- unique(c(sk$id1, sk$id2))

tibble(id1 = all_ids, id2 = all_ids) %>%
  ## create all combinations of ids
  expand(id1,id2) %>%
  ## arrange the ids
  arrange_ids(id1,id2) %>%
  ## going to have dupes ... make sure distinct
  distinct() %>%
  ## and toss all the id1 == id2
  filter(id1 != id2) %>%
  ## join back to sk (with expanded all combo on left side)
  left_join(sk) %>%
  ## replace NA k vals from join with 0
  mutate(k.sim = ifelse(is.na(k.sim), 0, k.sim))
#> Joining, by = c("id1", "id2")
#> # A tibble: 21,945 x 3
#>    id1                    id2                    k.sim
#>    <chr>                  <chr>                  <dbl>
#>  1 halfSiblings1_g1-b1-i1 halfSiblings1_g1-b1-s1 0    
#>  2 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b1-i1 0.236
#>  3 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b1-i2 0.236
#>  4 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b1-s1 0    
#>  5 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b1-s2 0    
#>  6 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b2-i1 0.236
#>  7 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b2-i2 0.236
#>  8 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b2-s1 0    
#>  9 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b3-i1 0.236
#> 10 halfSiblings1_g1-b1-i1 halfSiblings1_g2-b3-i2 0.236
#> # … with 21,935 more rows
vpnagraj commented 3 years ago

or wait ... i guess thats what you were describing in https://github.com/signaturescience/skater/pull/43#issuecomment-832838850

?

🙃

stephenturner commented 3 years ago

This works, so long as all individuals are represented in id1 and id2 union. Which should be most of the time. Keeping this closed because this kind of thing probably shouldn't be written into the ibd2kin function.

vpnagraj commented 3 years ago

gotcha. well maybe the other way to get the id1 + id2 = all_ids (and to make sure that is truly all ids) is to pull from the .fam:

all_ids <-
  read_fam("/data/projects/skate/data/analysis_mbs/simulation_downselection/downsample/ASW-gsa37-5GHS-er_0.05-ms_0.1-hom_0.1-everyone.fam") %>% pull(id)

no way the id would be missing from .fam, right?

stephenturner commented 3 years ago

precisely. That's what I'm doing in the analysis code, and left joining to that. Works fine, no need to worry about individuals absent in the ibd2kin output.

vpnagraj commented 3 years ago

@stephenturner rather than pushing any code up im just going to reply on the HBD discussion point here.

tl;dr : im pretty sure the left-shift in kinship from .seg is not caused by exclusion of HBD

procedure i took was to:

  1. write a modified version of read_ibd to not filter IBD1 | IBD2
  2. read in segment file with this modified version of read_ibd
  3. read in segment file with original version of read_ibd
  4. use both as inputs to ibd2kin
  5. stack results on top of each other and re-create density plots
library(tidyverse)
library(skater)

read_ibd2 <- function(file, source) {

  if(source == "hapibd") {
    seg <-
      readr::read_delim(file,
                        delim = "\t",
                        col_names = FALSE,
                        col_types = "cdcddddd") %>%
      ## select columns by indices
      dplyr::select(1,3,5,6,7,8) %>%
      ## set names
      purrr::set_names(c("id1","id2","chr","start","end","length")) %>%
      ## make sure ids are ordered
      arrange_ids(id1,id2)
  } else if (source == "pedsim") {

    seg <-
      readr::read_tsv(file,
                      col_types="ccciicddd",
                      col_names=c("id1", "id2", "chr", "pstart", "pend", "type", "gstart", "gend", "glength")) %>%
      #dplyr::filter(type == "IBD1" | type == "IBD2") %>%
      dplyr::select(id1, id2, chr, start = pstart, end = pend, length = glength) %>%
      ## make sure ids are ordered
      arrange_ids(id1,id2)

  } else {
    stop("The 'source' argument must be one of 'hapibd' or 'pedsim'.")
  }

  return(seg)

}

plinkmap <- read_map("/data/projects/skate/data/hapmap/plink.allchr.GRCh37.map")
seglikeibd <- read_ibd(system.file("extdata", "GBR.sim.seg.gz", package="skater"),
                       source = "pedsim")
seglikeibd2 <- read_ibd2(system.file("extdata", "GBR.sim.seg.gz", package="skater"),
                       source = "pedsim")
degree_resolution <- 3

truth <- read_fam(system.file("extdata", "GBR.sim.fam", package="skater")) %>%
  fam2ped() %>%
  mutate(x=map(ped, ped2kinpair)) %>%
  select(x) %>%
  unnest(x) %>%
  mutate(d.truth=kin2degree(k, max_degree=degree_resolution)) %>%
  rename(k.truth=k) %>%
  print()

res1 <-
  ibd2kin(.ibd_data = seglikeibd, .map = plinkmap) %>%
  mutate(degree = kin2degree(kinship)) %>%
  left_join(truth) %>%
  mutate(across(starts_with("d"), ~factor(replace_na(., "Unrelated")))) %>%
  mutate(type = "Without HBD")

res2 <-
  ibd2kin(.ibd_data = seglikeibd2, .map = plinkmap) %>%
  mutate(degree = kin2degree(kinship)) %>%
  left_join(truth) %>%
  mutate(across(starts_with("d"), ~factor(replace_na(., "Unrelated")))) %>%
  mutate(type = "With HBD")

bind_rows(res1,res2) %>%
  ggplot(aes(kinship)) +
  geom_density(aes(fill = d.truth), alpha = 0.5) +
  geom_vline(xintercept=2^(-seq(3,(degree_resolution*2+1+2),2)/2), lty=3) + 
  geom_vline(data=tibble(x=.5/2^(1:degree_resolution), k.theoretical = factor(rev(x))), aes(xintercept=x, col=k.theoretical), lty=1, alpha=.6) +
  scale_color_manual(values=scales::hue_pal()(degree_resolution+1)[1:degree_resolution]) + 
  theme_classic() + 
  scale_x_continuous(breaks=.5/2^(1:degree_resolution)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  theme_classic() +
  labs(x = "k") +
  facet_wrap(~ type, ncol = 1)

hbd_explore

looking closer and this .seg file doesnt have any HBD:

readr::read_tsv(system.file("extdata", "GBR.sim.seg.gz", package="skater"),
                col_types="ccciicddd",
                col_names=c("id1", "id2", "chr", "pstart", "pend", "type", "gstart", "gend", "glength")) %>%
  pull(type) %>%
  unique(.)
[1] "IBD1" "IBD2"

so i dont think inclusion / exclusion of HBD is the cause of the left-shift in k from .seg

stephenturner commented 3 years ago

👍 right on that one.

The only, only other thing I can think of here would be a need to handle IBD2 differently than IBD1. Is there any indication of this happening in either the refinedibd companion python script or in ibdkin cc @genignored ?

genignored commented 3 years ago

That has bothered me, but I didn't know what to do about it, because I haven't been digging deeply enough. From the Ramstetter paper, it looks like there is an equation to differentiate between IBD1 and IBD2 if the tool generates that. Screenshot because I can't figure out how to paste formulae into this text box:

image

If I'm skimming that right, {IBD1}/4 + {IBD2}/2 . not sure how well that meshes with what we're doing now. But directly below that they say that browning methods don't distinguish between IBD1 and IBD2, so....