###########################################################
##
## preprocess function: data preprocess for blast result ouput.
## Input:
## x - blast ouput
## percentage - threshold for filtering the blast results with very low alignment percentage
##  
## output: 
## outdata - output data which contain seq, tax, ## ## ## description, match, maxlength
## mapping - contain the summary of seq for output data and 
## 	taxon name
###########################################################

Preprocess<-function(x, percentage=0.6){
	vnames<-c("seq", "gene", "tax", "des","match",  "rawlength", "length")

	names(x)<-vnames
	x<-x[!is.na(x$des),]	### remove the lines without description 
	x<-x[!is.na(x$tax),]		### remove the lines without taxon id
	x<-x[!is.na(x$length),]	### remove the lines without length value

        x$tax=as.integer(as.character(x$tax))

	x$Seq.tax<-paste(x$seq,x$tax,sep=":") 
        ### create a new variable which is the combination of seq id and tax id, 
        ### in order to remove the duplicated combinations 

	x<-x[!duplicated(x$Seq.tax),]
	x<-x[,vnames]

        x$seq<-factor(x$seq, levels=unique(x$seq))
	maxlen<-with(x,tapply(length,seq,max))
	num<-table(x$seq)
	x$maxlength<-rep(maxlen, num)
 	x$new.perc<-x$match/x$maxlength

	x<-x[x$new.perc>percentage,] 
	cname=c("seq", "tax", "des", "maxlength", "match")
	outdata<-x[,cname] 

        ##### get the summary/frequency of short reads for each taxon, 
        ##### and save this info with the scientific names into mapping which will be used later  

	temp<-outdata[!duplicated(outdata$tax),]
	unique.tax<-temp[order(temp$tax),]
	blast.summary<-table(outdata$tax)
	mapping<-data.frame(taxid=unique.tax$tax, BLAST=unclass(blast.summary), description=unique.tax$des)
	return(list(outdata=outdata, mapping=mapping))
}