bg<-c(.295,.205,.205,.295) makepwm<-function(x,bg){ # x = matrix of N aligned sites coded numerically # bg = vector (1x4) of background base frequencies L<-length(x[1,]) # Number of positions in each site N<-length(x[,1]) # Number of sites pwm<-matrix(rep(1,4*L),nrow=4) # pwm initialized to 1 for each matrix element (pseudocounts) for (j in 1:L){ for (i in 1:N){ k <- x[i,j] pwm[k,j] <- pwm[k,j]+1 } } N <- N+4 # Denominator for small sample correction pwm<-pwm/N # PWM in terms of probabilities log2pwm<-matrix(rep(0,4*L),nrow=4,ncol=L) # Initialize PWM in terms of log(base 2) of p/q for(i in 1:4){ log2pwm[i,]<-log2(pwm[i,]/bg[i]) # Scores for each [nucleotide, position], base 2 } return(pwm, log2pwm) } calcscore<-function(seq,log2pwm){ # seq is a vector representing input DNA numerically # log2pwm is a PWM (4xL) with elements as log base 2 score <- 0 for (j in 1:length(log2pwm[1,])){ score<-score+log2pwm[seq[j],j]} return(score) } gata<-read.table("C:/Documents and Settings/lynn/Desktop/gata1S.txt", header=F) gata <- as.matrix(gata) tmp<-makepwm(gata,bg) log2pwm<-tmp$log2pwm gata.score<-rep(0,length(gata[,1])) for(i in 1:length(gata[,1])){ gata.score[i]<-calcscore(gata[i,],log2pwm) #print(gata[i,]) } signif(gata.score,3) hist(gata.score,xlim=c(-12,12),ylim=c(0,0.4),nclass=10,prob=T) length(gata[gata[,1]==1,1]) length(gata[gata[,1]==2,1]) length(gata[gata[,1]==3,1]) length(gata[gata[,1]==4,1]) vectorn<-c(29,2,1,17) vector1<-(vectorn+1)/(49+4) sum(vector1)