########################################################### ## ## preprocess function: data preprocess for blast result ouput. ## Input: ## x - blast ouput ## percentage - threshold for filtering the blast results with very low alignment percentage ## ## output: ## outdata - output data which contain seq, tax, ## ## ## description, match, maxlength ## mapping - contain the summary of seq for output data and ## taxon name ########################################################### Preprocess<-function(x, percentage=0.6){ vnames<-c("seq", "gene", "tax", "des","match", "rawlength", "length") names(x)<-vnames x<-x[!is.na(x$des),] ### remove the lines without description x<-x[!is.na(x$tax),] ### remove the lines without taxon id x<-x[!is.na(x$length),] ### remove the lines without length value x$tax=as.integer(as.character(x$tax)) x$Seq.tax<-paste(x$seq,x$tax,sep=":") ### create a new variable which is the combination of seq id and tax id, ### in order to remove the duplicated combinations x<-x[!duplicated(x$Seq.tax),] x<-x[,vnames] x$seq<-factor(x$seq, levels=unique(x$seq)) maxlen<-with(x,tapply(length,seq,max)) num<-table(x$seq) x$maxlength<-rep(maxlen, num) x$new.perc<-x$match/x$maxlength x<-x[x$new.perc>percentage,] cname=c("seq", "tax", "des", "maxlength", "match") outdata<-x[,cname] ##### get the summary/frequency of short reads for each taxon, ##### and save this info with the scientific names into mapping which will be used later temp<-outdata[!duplicated(outdata$tax),] unique.tax<-temp[order(temp$tax),] blast.summary<-table(outdata$tax) mapping<-data.frame(taxid=unique.tax$tax, BLAST=unclass(blast.summary), description=unique.tax$des) return(list(outdata=outdata, mapping=mapping)) }