add derivative transformation and derivative matching.

wincowgerDEV commented 3 years ago

Gabriel Erni Cassola and I have been talking about adding derivative transformations to Open Specy and adding derivative matching.

Some initial code Gabriel wrote:


## Gabriel Erni Cassola
## MiP data base matching

##_______OpenSpecy________##
#__________________________#

# main github page with basic workflow: https://github.com/wincowgerDEV/OpenSpecy
# guides: https://htmlpreview.github.io/?https://github.com/wincowgerDEV/OpenSpecy/blob/main/vignettes/sop.html

# if a need to convert spectra files comes up, this may be of help:
# https://www.effemm2.de/spectragryph/about_feat.html

#############
#__LOAD LIBS AND FILES
library("dplyr")
library("OpenSpecy")
library("ggplot2")
library("signal")

#get_lib()
spec_lib.deriv <- load_lib("ftir", path = "/Users/ernica0000/Desktop/Basel_Documents/01_research/02_WaterColumn_1Year/02_protocols:material/newOpenSpecyLib/")

# obtain file paths
all_paths <- list.files(path = "/Users/ernica0000/Desktop/Basel_Documents/01_research/02_WaterColumn_1Year/01_data/pilotExtractions_<500Âµm/postBern/spectra/CSVFiles/Filter2/", pattern = "*.csv", full.names = T)
# load all files into a list
inputSpectraList <- lapply(all_paths, read_text)

# define how many top matches you want for each spectrum
n.TopMatches <- 5

#__ADAPTING THE LOOP FOR DERIVATIVES OF UNKNOWN SPECTRA

# create empty df to fill
outputSpectraList.adj.deriv <- list()
outputSpectraList.adj <- list()
matches.deriv <- data.frame(sample_name = c(), spectrum_identity = c(), rsq = c(), organization = c())

# loop to obtain derivatives of unknown spectra and save them again in the list
for (i in 1:length(inputSpectraList)) {     
    sampleSpec.adj <- inputSpectraList[[i]][1:2] %>% adj_intens(type = "none")
    outputSpectraList.adj[[i]] <- sampleSpec.adj

    # obtain 1st derivative of the intensity from the unknown spectrum; smoothing & derivative with sgolay()
    intDeriv <- as.data.frame(filter(filt = sgolay(p = 3, n = 21, m = 1), x = sampleSpec.adj$intensity))
    colnames(intDeriv) <- "intensity"
    waveN <- as.data.frame(sampleSpec.adj$wavenumber)
    colnames(waveN) <- "wavenumber"
    sampleSpec.adj.deriv <- cbind(waveN, intDeriv)

    # correct spectrum; smoothing uses Savitzky and Golay filter; baseline correction with IModPolyFit
    #sampleSpec.adj.sm <- smooth_intens(sampleSpec.adj, p = 3, n = 11)
    #sampleSpec.adj.corr <- subtr_bg(sampleSpec.adj.sm, degree = 8)
    outputSpectraList.adj.deriv[[i]] <- sampleSpec.adj.deriv

    # match spectrum and store top match; this does min-may normalization and Pearson correlation coefficient is calculated
    matchTop <- match_spec(sampleSpec.adj.deriv, library = spec_lib.deriv, which = "ftir", top_n = n.TopMatches)

    # it looks like sometimes the match_spec() output contains more than the specified "top_n"; is this when positions "top_n" & "top_n+1" are tied?
    # this statement checks for longer outputs and trims it to the specified length, warning that this happened
    if(length(matchTop$sample_name) > n.TopMatches){
        print("ATTENTION: More matches than n.TopMatches in sample:")
        print(i)
        matchTop <- slice_head(matchTop, n = n.TopMatches)
    }

    #store results
    matches.deriv <- rbind(matches.deriv, matchTop)
}

head(matches.deriv)

# convert tibble format to "classic" data frame
matches.deriv.df <- as.data.frame(matches.deriv)

length(matches.deriv.df$sample_name)
head(matches.deriv.df)
str(matches.deriv.df);tail(matches.deriv.df)

# fetch spectra sample names from each file preserving same order as loaded into the list
all_filenames <- as.data.frame(sub("\\..*$", "", basename(all_paths)))
# replicate names by number of top matches and name column
all_filenames.exp <- as.data.frame(all_filenames[rep(seq.int(1, nrow(all_filenames)), each = n.TopMatches), 1])
colnames(all_filenames.exp) <- "specName"
colnames(all_filenames) <- "specName"

# add to the spectra table
matches.deriv.df.named <- cbind(matches.deriv.df, all_filenames.exp)

# check finished data table
head(matches.deriv.df.named)

# extract "confident" matches
confidentMatches.deriv <- subset(matches.deriv.df.named, rsq >= 0.7)
confidentMatches.deriv
unique(confidentMatches.deriv$spectrum_identity) # check different types that turned up; this can be used to subset for checking specific substances...

# export the table as CSV into the spectra folder (EDIT FILE PATH ACCORDINGLY!)
write.csv(confidentMatches.deriv, "/Users/ernica0000/Desktop/Basel_Documents/01_research/02_WaterColumn_1Year/04_Results/Filter2_MatchingResults_1deriv.csv", row.names = F)

#############
#__PLOTTING

# 1. specify the exact sample name "specName" in df "confidentMatches"
desiredSpec.name <- "RhineTrial2_ap50_2cm_6sc_Extract_91"

# this retrieves the spectra from the lists generated in the loop [DONT EDIT THIS]
desiredSpec <- which(all_filenames$specName == desiredSpec.name)
plotSpec.adj.deric <- outputSpectraList.adj.deriv[[desiredSpec]][1:2]

# 2. specify corresponding database match by providing number ("sample_name" in df "confidentMatches.deriv")
sampName <- 237

# extract spectra from library and subset to specified spectrum
refs <- spec_lib.deriv[["ftir"]][["library"]]
sub <- subset(refs, sample_name == sampName)

# 3. plot the spectra
ggplot(plotSpec.adj.deric, aes(x = wavenumber, y = intensity, color = "sample")) + geom_line() + scale_x_reverse(limits = c(3350, 1150)) + geom_line(data = sub, aes(x = wavenumber, y = intensity, color = "ref")) + theme_bw() + ggtitle(desiredSpec.name) #+ geom_line(data = plotSpecRaw, aes(x = wavenumber, y = intensity, color = "raw"))

And code for converting the spectral library to derivative version:


## Gabriel Erni Cassola
## MiP data base matching

##______using_OpenSpecy________##
#_______________________________#

# main github page with basic workflow: https://github.com/wincowgerDEV/OpenSpecy
# guides: https://htmlpreview.github.io/?https://github.com/wincowgerDEV/OpenSpecy/blob/main/vignettes/sop.html

# if a need to convert spectra files comes up, this may be of help:
# https://www.effemm2.de/spectragryph/about_feat.html

library("dplyr")
library("OpenSpecy")
library("ggplot2")
library("signal")

#__CONVERT THE LIBRARY

# loading specific library
ftir.db <- readRDS("/Users/ernica0000/Desktop/Basel_Documents/01_research/02_WaterColumn_1Year/02_protocols:material/ftir_library.rds")

# checks...
is(ftir.db)
str(ftir.db); head(ftir.db)
# check if "sample_name" and "group" are identical
all(ifelse(ftir.db$group == ftir.db$sample_name, TRUE, FALSE))

# loop for taking the 1st derivatives
ftir.db.deriv <- data.frame(wavenumber = double(), intensity = double(), sample_name = integer(), group = integer())

for (i in 1:length(unique(ftir.db$sample_name))) {
    ftir.db.subset <- ftir.db[ftir.db$sample_name == i,]

    # obtain 1st derivative
    ftir.db.intDeriv <- as.data.frame(filter(filt = sgolay(p = 3, n = 11, m = 1), x = ftir.db.subset$intensity))
    colnames(ftir.db.intDeriv) <- "intensity"

    ftir.db.subset.intDeriv <- cbind(ftir.db.subset$wavenumber, ftir.db.intDeriv$intensity, ftir.db.subset$sample_name, ftir.db.subset$group)
    ftir.db.deriv <- rbind(ftir.db.deriv, ftir.db.subset.intDeriv)
}

colnames(ftir.db.deriv)[colnames(ftir.db.deriv) == "V1"] <- "wavenumber"
colnames(ftir.db.deriv)[colnames(ftir.db.deriv) == "V2"] <- "intensity"
colnames(ftir.db.deriv)[colnames(ftir.db.deriv) == "V3"] <- "sample_name"
colnames(ftir.db.deriv)[colnames(ftir.db.deriv) == "V4"] <- "group"

head(ftir.db.deriv)
saveRDS(ftir.db.deriv, "/Users/ernica0000/Desktop/Basel_Documents/01_research/02_WaterColumn_1Year/02_protocols:material/newOpenSpecyLib/ftir_library.rds")

ftir.db.subsetPlot <- ftir.db.deriv[ftir.db$sample_name == 6,]
ggplot(ftir.db.subsetPlot, aes(x = wavenumber, y = intensity)) + geom_line() + scale_x_reverse(limits = c(3350, 1150)) + theme_bw()

wincowgerDEV commented 3 years ago

@G-EC is Gabriel's username

wincowgerDEV commented 3 years ago

Wrote some more code today to support this.

library(OpenSpecy)
library(signal)
library(dplyr)
library(ggplot2)

data("raman_hdpe")

raman_hdpe$derivative <- sgolayfilt(raman_hdpe$intensity, p = 3, n = 15, m = 1)

ggplot(raman_hdpe) + geom_line(aes(x = wavenumber, y = make_rel(intensity))) +  geom_line(aes(x = wavenumber, y = make_rel(abs(derivative))), color = "blue")

#where are peaks
raman_hdpe %>%
    dplyr::top_n(1/derivative, n = 50)

hist(log10(make_rel(abs(raman_hdpe$derivative))))

im <- make_rel(abs(raman_hdpe$derivative))

#Max entropy
calculateMaxEntropy <- function(Image){

   # if(max(Image)<=1 & min(Image)>=0){
#        im = Image*255
#    }

    size = dim(im)
    im = as.vector(im)
    hn = hist(im,breaks=seq(0,1, by = 0.01),plot=FALSE)$counts
    #hn = hn / (size[1]*size[2])
    c = rep(0,101)
    c[1] = hn[1]
    for (l in 2:101){
        c[l]=c[l-1]+hn[l]   
    }
    #low and high entropy
    hl = rep(0,101);
    hh = rep(0,101);

    for (t in 1:101){
        #low entropy threshlod
        cl  = c[100]
        if(cl>0){
            for(i in 1:t){
                if (hn[i] >=0) {
                    hl[t] = hl[t]- (hn[i]/cl)*log(hn[i]/cl)
                }
            }
        }    
        #high entropy threshold
        ch = 1 - c[t]
        if(ch > 0){
            for ( i in (t+1):101 ){
                if(!is.na(hn[i])){
                    if ( hn[i]>0 ) {
                        hh[t] = hh[t] - (hn[i]/ch)*log(hn[i]/ch);
                    }
                }  
            }   
        }    
    }
    # Find histogram index with maximum entropy
    h_max =hl[1]+hh[1]
    threshold = 0;
    for(t in 2:101) {
        j = hl[t] + hh[t]
        if(!is.na(j)){
            if (j > h_max) {
                h_max = j;
                threshold = t
            }
        }
    }

    #threshold = threshold/255
    return (threshold)
}

`

wincowgerDEV commented 12 months ago

@G-EC we have this implemented now in the package! Thanks for the help.

wincowgerDEV / OpenSpecy-package

add derivative transformation and derivative matching. #93