Commit 4951ddd5 authored by harm's avatar harm
Browse files

Improved speed and lowered minimal length requirements to 80 for Guide scoring

parent 4a169510
......@@ -73,7 +73,7 @@ if (length(args)!=3) {
######################################################################
# Set Parameters
guideLength = 23
guideLength = 23 # fixed length
WindowOffset = 0 # Offset to start scoring guides. Some of the features will depend of the target sequence context upstream. Without the offset, NAs will be assigned for the respective feature
GCmin = 0.1 # probability value between 0 and 1
......@@ -93,9 +93,6 @@ Sys.setenv("PATH" = paste(Sys.getenv('PATH'), '~/bin', sep = ':'))
######################################################################
# change nothing below here
######################################################################
......@@ -504,6 +501,27 @@ GetUnpairedProb <- function( x = Guide.df , MA = log10(UnpairedProbabilities.tra
}
GetUnpairedProbSimple = function( x = dat , MA = log10(UnpairedProbabilities.tranformed)){
# Before, i did it like this.
# it ran into an error if the taregt was < 100bp due to a combination of the position j and the window size w (specifically, if j < w and length < 2*w)
# Instead, i greatly simplified it by extracting first the single relevant row, and then the values relative to the matchpos
# The extracted values were exaclty the same for at least 1 tested sequence
# slice the nucleotide density matrix for each guide to obtain a window of +/- the window size (default 50nt) centered on guide match position 1
# Slices = lapply( as.list(x$MatchPos) , FUN = SliceMatrix ,mat=MA, w=W)
# names(Slices) <- rownames(x)
# dens <- sapply(Slices , FUN = GetValue, d = Y, w = X)
# Get relevant row from Matrix
VEC = MA[23,] # is similar to Y
# Get value raltive to guide match position (offset of -11 relative to start position)
dens <- VEC[ x$MatchPos - 11]
return(dens)
}
GetTargetSiteAccessibility = function( dat = Guide.df, fa = FA ){
......@@ -522,7 +540,8 @@ GetTargetSiteAccessibility = function( dat = Guide.df, fa = FA ){
UnpairedProbabilities.tranformed <- Transform_RNAplfold_predictions(UnpairedProbabilities)
# As there was no clear pattern, this will only record the unpaired probability covering the entire guide match
Log10_Unpaired <- GetUnpairedProb(x=dat , MA = log10(UnpairedProbabilities.tranformed), W=50, X=40 , Y=23)
Log10_Unpaired <- GetUnpairedProbSimple(x=dat , MA = log10(UnpairedProbabilities.tranformed)) # in use
# Log10_Unpaired <- GetUnpairedProb(x=dat , MA = log10(UnpairedProbabilities.tranformed), W=50, X=40 , Y=23) # using slice matrix and created errors for seqs < 100nt, got replaced by function above
# clean up
tmp = file.remove( paste0('./',RanStr,'.fa') ,paste0('./',RanStr,'_lunp'), paste0('./',RanStr,'_dp.ps'))
......@@ -773,8 +792,15 @@ Guide.df = cbind.data.frame( Guide.df, LetterProbs)
# Generate Model based on original on-target screens.
cat( paste0( "Generate model started on " , date() ,"\n"))
out = PrePareModelInput(x=fullset, FIELDS=fields)
if( !file.exists('./data/RFcombined.rds') ){
cat( paste0( "Generate model started on " , date() ,"\n"))
out = PrePareModelInput(x=fullset, FIELDS=fields)
saveRDS(out , file = './data/RFcombined.rds')
}else{
cat( paste0( "Loading precomputed model started on " , date() ,"\n"))
out = readRDS(file = './data/RFcombined.rds')
}
Model = out[[1]]
# Get feature Means and SD for scaling
ModelInputMeans = out[[2]]
......@@ -832,8 +858,3 @@ if (PLOT == TRUE){
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment