##  This is the analysis of the NGBA project data from phase I, using a UPARSE based pipeline (with Vparse for a few steps 
##  where a 64 bit version was required). Most of the code follows http://drive5.com/usearch/manual/upp_454.html
##
##  It has these known limitations
##      Homopolymers are not dealt with in any way 
##      Only the primers used in the publication are analyzed
##      For all data a read length of only 250 bp is being used. This is recommended as a rule of thumb but should be checked
##      using fastq_eestats2


##Convert to fastq format
RatioMinProduct <- 0.5
source("~/GoogleDrive/R_DNA_tools/FASTA_tools.txt")

library("parallel")
library("foreach")
library("doParallel")

##Download and install python scripts from here: http://drive5.com/python/

library(ShortRead)
'%is%' <- function(x, y) { x == y & !is.na(x)}
dir <- "/Volumes/Big_Data/Next_gen_biodiversity_assessment/NGBA_454_Data/RawData"
setwd(dir)
files <- list.files()

madPiper <- function(mainPath, taxaName, taxaPrimer, primerLength, dataBase, projectName, truncateL=250)
    ## Function to run the Uparse pipeline from within the R environment
    ## based very closely on http://drive5.com/usearch/manual/upp_454.html
    {
    dir.create(paste(mainPath, taxaName, sep=""))
    setwd(paste(mainPath, taxaName, sep=""))
    fileName <- paste(projectName, "_", taxaName, ".fastq", sep="")
    if(file.exists(fileName)) file.remove(fileName)
    writeFastq(narrow(allseqdata[seqData$primer %is% taxaPrimer], 
        primerLength+as.numeric(!is.na(seqData$MID[seqData$primer %is% taxaPrimer]))*10),
        file=fileName, compress=FALSE)
    system(paste("cd ", mainPath, taxaName, sep=""))
    ##system(paste("usearch -fastq_eestats2 ", fileName, " -output eestats_", fileName, ".txt", sep=""))    ##This shows >62% at 250, 24% at 300
    system(paste("usearch -fastq_filter ", fileName, " -fastq_trunclen ", truncateL, " -fastaout trimmed.fa", sep=""))
    system(paste("usearch -fastq_filter ", fileName, " -fastq_trunclen ", truncateL, " -fastq_maxee 1.0 -fastaout filtered.fa", sep=""))
    temp <- readFAST("filtered.fa")
    system("usearch -derep_fulllength filtered.fa -sizeout -fastaout uniques.fa")
    system("usearch -cluster_otus uniques.fa -minsize 2 -otus otus.fa -relabel Otu")
    system("usearch -usearch_global trimmed.fa -db otus.fa -strand plus -id 0.97 --blast6out sequence_match_to_Otus.txt")
    }

############################################################################################################################

for(file.i in files)
    {
    print(paste("contructing fastq for", file.i))
    filesIn <- list.files(paste(dir, file.i, sep=""))
    system(paste("python ~/drive5_py/faqual2fastq2.py ", dir, file.i,"/",filesIn[grep(".fasta$",filesIn)], 
                  " ", dir, file.i,"/",filesIn[grep(".qual$",filesIn)], " > ", filesIn[grep(".fasta$",filesIn)],
                  ".fastq", sep=""))
    print("completed")
     }
##Files then manually moved into a folder up a level in hierarchy.
setwd("/Volumes/Big_Data/Next_gen_biodiversity_assessment/NGBA_454_Data/FastQ_from_NGBA_PhaseI")

##Need to write a fastq file for each primer,
files <- list.files()
files <- files[grep("fastq", files)]
allseqdata <- readFastq(files[1])
seqDataID <- rep(files[1],length(allseqdata))
for(file.i in files[-1])
    {
    t <- readFastq(file.i)
    allseqdata <- append(allseqdata, t)
    seqDataID <- append(seqDataID, rep(file.i,length(t)))
    }
seqData <- data.frame(file = seqDataID)

primers <- read.csv("~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/primers.csv", stringsAsFactors=FALSE)
primers$groupPrimer <- paste(primers$orgGroup,primers$name,sep="_")
primers$length <- nchar(primers$seq)
primers$minProduct <- RatioMinProduct*primers$productLength
                        
seqData$primer <- NA
seqData$primer[iupacGrep(primers$seq[primers$name == "PRK341"], substr(sread(allseqdata), 1, 40))] <- "Bacteria_PRK341"
seqData$primer[iupacGrep(primers$seq[primers$name == "gITS7"], substr(sread(allseqdata), 1, 40))] <- "Fungi_gITS7"
seqData$primer[iupacGrep(primers$seq[primers$name == "18S11b"], substr(sread(allseqdata), 1, 40))] <- "Metazoans_18S11b"
seqData$primer[iupacGrep(primers$seq[primers$name == "F1"], substr(sread(allseqdata), 1, 40))] <- "Nematodes_F1"
seqData$primer[iupacGrep(primers$seq[primers$name == "LCO1490"], substr(sread(allseqdata), 1, 40))] <- "Invertebrates_LCO1490"


seq.starts <- substr(sread(allseqdata), 1, 10)
MID1 <- "ACGAGTGCGT"
MID8 <- "CTCGCGTGTC"
MID2 <- "ACGCTCGACA"
MID9 <- "TAGTATCAGC"
MID3 <- "AGACGCACTC"
MID10 <- "TCTCTATGCG"
MID4 <- "AGCACTGTAG"
MID11 <- "TGATACGTCT"
MID5 <- "ATCAGACACG"
MID12 <- "TACTGAGCTA"
MID6 <- "ATATCGCGAG"
MID13 <- "CATAGTAGTG"
MID7 <- "CGTGTCTCTA"
MID14 <- "CGAGAGATAC"
MIDlist <- c(MID1, MID2, MID3, MID4, MID5, MID6, MID7, MID8, MID9, MID10, MID11, MID12, MID13, MID14)
MIDlength <- 10                #length in bp of MID tags.  At present all must be the same
seqData$MID <- match(seq.starts, MIDlist)


seqData$plot <- unlist(lapply(strsplit(as.character(seqData$file), "_"), function(x) x[5]))

seqData$plotMap <- paste(seqData$file, seqData$primer, seqData$MID, sep="/")
gapFilling <- read.csv("~/GoogleDrive/NGBA_GoogleDrive/Data_analysis/Wairau_gap_filling_codes.csv", stringsAsFactors=FALSE)
gapFilling$fullCode <- gsub("rawdata", "reads.fasta.fastq", gapFilling$fullCode) 
seqData$plot[seqData$plotMap %in% gapFilling$fullCode] <- gapFilling$Plot[match(seqData$plotMap[seqData$plotMap %in% gapFilling$fullCode], gapFilling$fullCode)]

seqData$plot[nchar(seqData$plot)>2] <- paste("I",seqData$plot[nchar(seqData$plot)>2], sep="/")  ##This line takes the DOC plots and gives them an "I" at start
seqData$plot[seqData$plot == "I/BO116"] <- "X/BO116"
seqData$plot[seqData$plot == "I/AU127"] <- "X/AU127"
seqData$plot[seqData$plot == "I/U155"] <- "X/U155"
seqData$plot[seqData$plot == "I/AL132"] <- "X/AL132"
##Note that there are more sequences left in the "Repeat" block, as I didn't match all the primers.

seqData$ID <- unlist(lapply(strsplit(as.character(id(allseqdata))," "), "[", 1))
seqData$uniqueSeq <- NA
seqData$Otu <- NA

##db <- readFAST("~/BLAST/db/UNITE_public_31.01.2016clean.fasta")
##UNID <- grep("f__uniden", db$descs)
##writeFast(db$sequences[-UNID], db$descs[-UNID], file = "~/BLAST/db/UNITE_public_31.01.2016cleanID.fasta")


###################################################################################
###This block should run on any taxa by only changing the vars at top.
##
## FUNGI:



mainPath <- "~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/"
taxaName <- "fungi"
taxaPrimer <- "Fungi_gITS7"
    
madPiper(mainPath = mainPath,
    taxaName = taxaName,
    taxaPrimer = taxaPrimer,
    primerLength =  nchar(primers$seq[primers$name == "gITS7"]),
    dataBase = "~/BLAST/db/UNITE_public_31.01.2016cleanID.fasta",
    projectName = "NGBA_phaseI")

setwd(paste(mainPath, taxaName, sep=""))
trimmedReads <- readFAST("trimmed.fa")
uniquesfa <- readFAST("uniques.fa")
uniquematch <- uniquesfa$descs[match(trimmedReads$sequences, uniquesfa$sequences)]
trimmedReads$descs <- gsub(">", "", trimmedReads$descs)
seqData$uniqueSeq[match( trimmedReads$descs,seqData$ID)]  <- gsub(">", "", uniquematch) 
OtuMatching <- read.table("sequence_match_to_Otus.txt")
OtuDBmatch <- read.table("Otus_match_to_DB.txt", sep="\t")
seqData$Otu[seqData$primer %is% taxaPrimer] <- as.character(OtuMatching$V2)[match(seqData$ID[seqData$primer %is% taxaPrimer], OtuMatching$V1)]

##
fungiResults <- table(seqData$plot[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)], seqData$Otu[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)])

##Name Otus using BLAST.  Note that Usearch DOES NOT WORK for this, as heuristic (gives a match, but not BEST match).
system(paste("cd ", mainPath, taxaName, sep=""))
##~/BLAST/bin/formatdb -i  /Users/dickiei/BLAST/db/UNITE_public_31.01.2016clean.fasta -p F
##~/BLAST/bin/blastn -query otus.fa -db /Users/dickiei/BLAST/db/MetazoaCOIJuly2016clean.fasta -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching.txt

OtusFasta <- readFAST("otus.fa")
cores <- 15
blockstarts <- floor(seq(1, length(OtusFasta$descs), length.out=cores+1))[-(cores+1)]
blockends <- c(blockstarts[-1]-1,length(OtusFasta$descs))
for(core.i in 1:cores)
    {
    writeFast(unlist(OtusFasta$sequences[blockstarts[core.i]:blockends[core.i]]),OtusFasta$descs[blockstarts[core.i]:blockends[core.i]], file=paste("tempFasta",core.i,".fasta", sep=""))
    }
cl <- makeCluster(cores)
registerDoParallel(cl, cores=cores)
tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% 
    {
    system(paste("~/BLAST/bin/blastn -query tempFasta",core.i,
    ".fasta -db ",dataBase," -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching",
    core.i, ".txt", sep=""))
    }

temp <- c()
for(core.i in 1:cores)
    {  
    temp <- append(temp, readLines(paste("OTU_Blast_matching", core.i, ".txt", sep="")))
    }
matches <- grep("# 1 hits found", temp)+1

fungiOtus <- data.frame(matrix(unlist(strsplit(temp[matches], "\t")), byrow=TRUE, ncol=length(unlist(strsplit(temp[matches][1], "\t")))), stringsAsFactors=FALSE)[,1:4]
names(fungiOtus) <- c("OTU", "OtuMatch", "identity", "length")
fungiOtus$OtuMatch[as.numeric(fungiOtus$length) < 200] <- NA

fungiOtus$nearestKingdom <-  as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), 
    function(x) gsub("k__", "", ifelse(length(x[grep("k__", x)])>0, x[grep("k__", x)], NA)))))
fungiOtus$nearestPhylum <-  as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), 
    function(x) gsub("p__", "", ifelse(length(x[grep("p__", x)])>0, x[grep("p__", x)], NA)))))
fungiOtus$nearestClass <-  as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), 
    function(x) gsub("c__", "", ifelse(length(x[grep("c__", x)])>0, x[grep("c__", x)], NA)))))
fungiOtus$nearestOrder <-  as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), 
    function(x) gsub("o__", "", ifelse(length(x[grep("o__", x)])>0, x[grep("o__", x)], NA)))))
fungiOtus$nearestFamily <-  as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), 
    function(x) gsub("f__", "", ifelse(length(x[grep("f__", x)])>0, x[grep("f__", x)], NA)))))
fungiOtus$nearestGenus <-  as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), 
    function(x) gsub("g__", "", ifelse(length(x[grep("g__", x)])>0, x[grep("g__", x)], NA)))))
fungiOtus$nearestSpecies <-  as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), 
    function(x) gsub("s__", "", ifelse(length(x[grep("s__", x)])>0, x[grep("s__", x)], NA)))))
    
save(file=paste(mainPath,"All_R_output_UPARSE_NGBA_PhaseI",sep=""), list=ls())
 
 
###################
##   Bacteria
###################

###################################################################################
###This block should run on any taxa by only changing the vars at top.
##
## Bacteria
mainPath <- "~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/"
taxaName <- "bacteria"
taxaPrimer <- "Bacteria_PRK341"

    
madPiper(mainPath = mainPath,
    taxaName = taxaName,
    taxaPrimer = taxaPrimer,
    primerLength =  nchar(primers$seq[primers$name == "PRK341"]),
    dataBase = "~/BLAST/db/current_GREENGENES_gg16S_unaligned.fasta",
    projectName = "NGBA_phaseI", truncateL=350)
    
setwd(paste(mainPath, taxaName, sep=""))
trimmedReads <- readFAST("trimmed.fa")
uniquesfa <- readFAST("uniques.fa")
    uniquematch <- uniquesfa$descs[match(trimmedReads$sequences, uniquesfa$sequences)]
    trimmedReads$descs <- gsub(">", "", trimmedReads$descs)
    seqData$uniqueSeq[match( trimmedReads$descs,seqData$ID)]  <- gsub(">", "", uniquematch) 
OtuMatching <- read.table("sequence_match_to_Otus.txt")
OtuDBmatch <- read.table("Otus_match_to_DB.txt", sep="\t")
seqData$Otu[seqData$primer %is% taxaPrimer] <- as.character(OtuMatching$V2)[match(seqData$ID[seqData$primer %is% taxaPrimer], OtuMatching$V1)]
##

bacteriaResults <- table(seqData$plot[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)], seqData$Otu[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)])

##Name Otus using BLAST.  Note that Usearch DOES NOT WORK for this, as heuristic (gives a match, but not BEST match).
system(paste("cd ", mainPath, taxaName, sep=""))
##~/BLAST/bin/formatdb -i  /Users/dickiei/BLAST/db/UNITE_public_31.01.2016clean.fasta -p F
##~/BLAST/bin/blastn -query otus.fa -db /Users/dickiei/BLAST/db/MetazoaCOIJuly2016clean.fasta -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching.txt

dataBase <- "~/BLAST/db/current_GREENGENES_gg16S_unaligned.fasta" 
OtusFasta <- readFAST("otus.fa")
cores <- 15
blockstarts <- floor(seq(1, length(OtusFasta$descs), length.out=cores+1))[-(cores+1)]
blockends <- c(blockstarts[-1]-1,length(OtusFasta$descs))
for(core.i in 1:cores)
    {
    writeFast(unlist(OtusFasta$sequences[blockstarts[core.i]:blockends[core.i]]),OtusFasta$descs[blockstarts[core.i]:blockends[core.i]], file=paste("tempFasta",core.i,".fasta", sep=""))
    }
cl <- makeCluster(cores)
registerDoParallel(cl, cores=cores)
tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% 
    {
    system(paste("~/BLAST/bin/blastn -query tempFasta",core.i,
    ".fasta -db ", dataBase , " -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching",
    core.i, ".txt", sep=""))
    }
    
temp <- c()
for(core.i in 1:cores)
    {  
    temp <- append(temp, readLines(paste("OTU_Blast_matching", core.i, ".txt", sep="")))
    }
matches <- grep("# 1 hits found", temp)+1

bacteriaOtus <- data.frame(matrix(unlist(strsplit(temp[matches], "\t")), byrow=TRUE, ncol=length(unlist(strsplit(temp[matches][1], "\t")))), stringsAsFactors=FALSE)[,1:4]
names(bacteriaOtus) <- c("OTU", "OtuMatch", "identity", "length")
bacteriaOtus$OtuMatch[as.numeric(bacteriaOtus$length) < 200] <- NA


Bacteria <- readFAST("~/GoogleDrive/NGBA_GoogleDrive/Reference_libraries/current_GREENGENES_gg16S_unaligned.fasta")
    ##Note that the above is a BIG file.
#update bacterial data
##data.frame(OTU = sapply(bacteriaDescList, function(x) x[1]), GB = sapply(bacteriaDescList, function(x) x[2]))
bacteriaDescList <- strsplit(Bacteria$descs, " ")
remove(list="Bacteria")  ##To save working memory, I remove the bacterial data after using it.
bacteriaData <- cbind(OTU = sapply(bacteriaDescList, function(x) x[1]), 
                            genBank = sapply(bacteriaDescList, function(x) x[2]),
                            kingdom = sapply(bacteriaDescList, function(x) gsub("k__", "", x[grep("k__", x)])),
                            phylum = sapply(bacteriaDescList, function(x) gsub("p__", "", x[grep("p__", x)])),
                            class = sapply(bacteriaDescList, function(x) gsub("c__", "", x[grep("c__", x)])),
                            order = sapply(bacteriaDescList, function(x) gsub("o__", "", x[grep("o__", x)])),
                            family = sapply(bacteriaDescList, function(x) gsub("f__", "", x[grep("f__", x)])),
                            genus = sapply(bacteriaDescList, function(x) gsub("g__", "", x[grep("g__", x)])),
                            species = sapply(bacteriaDescList, function(x) gsub("s__", "", x[grep("s__", x)])))

bacteriaData <- apply(bacteriaData, 2, function(x) gsub(";", "", x))
bacteriaData <- apply(bacteriaData, 2, function(x) gsub("character(0)", NA, x))
bacteriaData <- as.data.frame(bacteriaData)


bacteriaOtus$nearestKingdom <- bacteriaData$kingdom[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)]
bacteriaOtus$nearestPhylum <- bacteriaData$phylum[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)]
bacteriaOtus$nearestClass <- bacteriaData$class[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)]
bacteriaOtus$nearestOrder <- bacteriaData$order[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)]
bacteriaOtus$nearestFamily <- bacteriaData$family[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)]
bacteriaOtus$nearestGenus <- bacteriaData$genus[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)]
bacteriaOtus$nearestSpecies <- bacteriaData$species[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)]
    
save(file=paste(mainPath,"All_R_output_UPARSE_NGBA_PhaseI",sep=""), list=ls())
 
 
### ADDED TO RESTRICT BACTERIA:

bacteriaOtus$sequences <- OtusFasta$sequence

## Run with restricted database
source("~/GoogleDrive/NGBA_data/Compilation_code/DNA_data_analysis/PipelineFunctions.txt")
unmatchedFasta <- list(sequences=bacteriaOtus$sequence[bacteriaOtus$nearestClass%in% c("","character(0)")], descs= bacteriaOtus$OTU[bacteriaOtus$nearestClass%in% c("","character(0)")])

dataBase <- "~/BLAST/db/current_GREENGENES_gg16S_unaligned_classID.fasta"
cores <- 15
blockstarts <- floor(seq(1, length(unmatchedFasta$descs), length.out=cores+1))[-(cores+1)]
blockends <- c(blockstarts[-1]-1,length(unmatchedFasta$descs))
for(core.i in 1:cores)
    {
    writeFast(unlist(unmatchedFasta$sequences[blockstarts[core.i]:blockends[core.i]]),unmatchedFasta$descs[blockstarts[core.i]:blockends[core.i]], file=paste("tempFasta",core.i,".fasta", sep=""))
    }
cl <- makeCluster(cores)
registerDoParallel(cl, cores=cores)
tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% 
    {
    system(paste("~/BLAST/bin/blastn -query tempFasta",core.i,
    ".fasta -db ", dataBase , " -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching",
    core.i, ".txt", sep=""))
    }
    
temp <- c()
for(core.i in 1:cores)
    {  
    temp <- append(temp, readLines(paste("OTU_Blast_matching", core.i, ".txt", sep="")))
    }
matches <- grep("# 1 hits found", temp)+1


restrictedBacteriaOtus <- data.frame(matrix(unlist(strsplit(temp[matches], "\t")), byrow=TRUE, ncol=length(unlist(strsplit(temp[matches][1], "\t")))), stringsAsFactors=FALSE)[,1:4]
names(restrictedBacteriaOtus) <- c("OTU", "OtuMatchRestricted", "identityRestricted", "lengthRestricted")
restrictedBacteriaOtus$OtuMatch[as.numeric(restrictedBacteriaOtus$length) < 200] <- NA



bacteriaOtus$OtuMatchRestricted <- bacteriaOtus$OtuMatch
bacteriaOtus$identityRestricted <- bacteriaOtus$identity
bacteriaOtus$lengthRestricted <- bacteriaOtus$length
bacteriaOtus$OtuMatchRestricted[match(restrictedBacteriaOtus$OTU, bacteriaOtus$OTU)] <- restrictedBacteriaOtus$OtuMatchRestricted
bacteriaOtus$identityRestricted[match(restrictedBacteriaOtus$OTU, bacteriaOtus$OTU)] <- restrictedBacteriaOtus$identityRestricted
bacteriaOtus$lengthRestricted[match(restrictedBacteriaOtus$OTU, bacteriaOtus$OTU)] <- restrictedBacteriaOtus$length



bacteriaOtus$nearestKingdom <- bacteriaData$kingdom[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)]
bacteriaOtus$nearestPhylum <- bacteriaData$phylum[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)]
bacteriaOtus$nearestClass <- bacteriaData$class[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)]
bacteriaOtus$nearestOrder <- bacteriaData$order[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)]
bacteriaOtus$nearestFamily <- bacteriaData$family[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)]
bacteriaOtus$nearestGenus <- bacteriaData$genus[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)]
bacteriaOtus$nearestSpecies <- bacteriaData$species[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)]

save(file=paste(mainPath,"Updated_bacteria_for_kate_2",sep=""), list=c("bacteriaOtus"))
save(file=paste("~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/","Updated_bacteria_for_kate_2",sep=""), list=c("bacteriaOtus"))
##This file manually moved to shared drive later (5 May 2017)

 
###################
##   Metazoans_18S11b
###################


##prep Metazoans data


##FAILURE -- COULDN'T GET THE WHOLE THING TO DOWNLOAD PROBLEM AT NCBI
##("Metazoa"[Organism] AND (18S[All Fields] OR "small subunit"[All Fields]) AND ("100"[SLEN] : "5000"[SLEN])) NOT environmental[All Fields] 

##Because of issues, used http://ddbj.nig.ac.jp/arsa/search?lang=en&cond=advanced_search&pa=&an=&sl1=200&sl2=5000&mt=DNA&_mt=on&_mf=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&dt1=&dt2=&df=18S+OR+%22small+subunit%22+NOT+%22environmental%22&kw=&og=&ln=Metazoa&ra=&rt=&rj=&rp=&cm=&fq%5B0%5D.featureKey=&fq%5B0%5D.qualifierName=&fq%5B0%5D.qualifierValue=&at=&sortTarget=score&sortOrder=desc&displayFields=PrimaryAccessionNumber&displayFields=Definition&displayFields=SequenceLength&displayFields=MolecularType&displayFields=Organism&_displayFields=on&op=AND

##Saved as "all.Metazoa18S.August2016.fasta")
system("~/BLAST/bin/formatdb -i  /Users/dickiei/BLAST/db/all.Metazoa18S_ARSA.fasta -p F")


###################################################################################
###This block should run on any taxa by only changing the vars at top.
##
## metazoans
mainPath <- "~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/"
taxaName <- "metazoans"
taxaPrimer <- "Metazoans_18S11b"
dataBase <- "/Users/dickiei/BLAST/db/all.Metazoa18S_ARSA.fasta"
 
madPiper(mainPath = mainPath,
    taxaName = taxaName,
    taxaPrimer = taxaPrimer,
    primerLength =  nchar(primers$seq[primers$name == "18S11b"]),
    dataBase = dataBase,
    projectName = "NGBA_phaseI", truncateL=250)
    
setwd(paste(mainPath, taxaName, sep=""))
trimmedReads <- readFAST("trimmed.fa")
uniquesfa <- readFAST("uniques.fa")
    uniquematch <- uniquesfa$descs[match(trimmedReads$sequences, uniquesfa$sequences)]
    trimmedReads$descs <- gsub(">", "", trimmedReads$descs)
    seqData$uniqueSeq[match( trimmedReads$descs,seqData$ID)]  <- gsub(">", "", uniquematch) 
OtuMatching <- read.table("sequence_match_to_Otus.txt")
seqData$Otu[seqData$primer %is% taxaPrimer] <- as.character(OtuMatching$V2)[match(seqData$ID[seqData$primer %is% taxaPrimer], OtuMatching$V1)]
##

metazoansResults <- table(seqData$plot[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)], seqData$Otu[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)])

##Name Otus using BLAST.  Note that Usearch DOES NOT WORK for this, as heuristic (gives a match, but not BEST match).
system(paste("cd ", mainPath, taxaName, sep=""))
##~/BLAST/bin/formatdb -i  /Users/dickiei/BLAST/db/all.Metazoa18S.fasta -p F
##~/BLAST/bin/blastn -query otus.fa -db /Users/dickiei/BLAST/db/MetazoaCOIJuly2016clean.fasta -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching.txt

OtusFasta <- readFAST("otus.fa")
cores <- 15
blockstarts <- floor(seq(1, length(OtusFasta$descs), length.out=cores+1))[-(cores+1)]
blockends <- c(blockstarts[-1]-1,length(OtusFasta$descs))
for(core.i in 1:cores)
    {
    writeFast(unlist(OtusFasta$sequences[blockstarts[core.i]:blockends[core.i]]),OtusFasta$descs[blockstarts[core.i]:blockends[core.i]], file=paste("tempFasta",core.i,".fasta", sep=""))
    }
cl <- makeCluster(cores)
registerDoParallel(cl, cores=cores)
tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% 
    {
    system(paste("~/BLAST/bin/blastn -query tempFasta",core.i,
    ".fasta -db ", dataBase , " -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching",
    core.i, ".txt", sep=""))
    }
temp<-c()
for(core.i in 1:cores)
    {  
    temp <- append(temp, readLines(paste("OTU_Blast_matching", core.i, ".txt", sep="")))
    }
matches <- grep("# 1 hits found", temp)+1

metazoansOtus <- data.frame(matrix(unlist(strsplit(temp[matches], "\t")), byrow=TRUE, ncol=length(unlist(strsplit(temp[matches][1], "\t")))), stringsAsFactors=FALSE)[,1:4]
names(metazoansOtus) <- c("OTU", "OtuMatch", "identity", "length")
metazoansOtus$OtuMatch[as.numeric(metazoansOtus$length) < 200] <- NA

source("~/GoogleDrive/GenBankNames")
temp <- readFAST(dataBase)
temp$names <- unlist(lapply(strsplit(temp$descs, " "), function(x) paste(x[2],x[3])))

temp$ID <- unlist(lapply(strsplit(temp$descs, " "), function(x) x[1]))
metazoansOtus$nearestSpecies <- temp$names[match(metazoansOtus$OtuMatch, temp$ID)]
metazoansOtus$nearestGenus <- unlist(sapply(strsplit(metazoansOtus$nearestSpecies, " "), "[", 1))

taxonomyL <- lapply(unique(metazoansOtus$nearestGenus), getTaxed)
taxonomyDF <- data.frame(genus=unique(metazoansOtus$nearestGenus),
                Kingdom = unlist(sapply(taxonomyL, function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="kingdom"])[1])),
                Phylum = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error",as.character(x$name)[x$rank=="phylum"])[1])),
                Class = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="class"])[1])),
                Order = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="order"])[1])),
                Family = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="family"])[1])))
metazoansOtus$nearestKingdom <- as.character(taxonomyDF$Kingdom[match(metazoansOtus$nearestGenus, taxonomyDF$genus)])
metazoansOtus$nearestPhylum <- as.character(taxonomyDF$Phylum[match(metazoansOtus$nearestGenus, taxonomyDF$genus)])
metazoansOtus$nearestClass <- as.character(taxonomyDF$Class[match(metazoansOtus$nearestGenus, taxonomyDF$genus)])
metazoansOtus$nearestOrder <- as.character(taxonomyDF$Order[match(metazoansOtus$nearestGenus, taxonomyDF$genus)])
metazoansOtus$nearestFamily <- as.character(taxonomyDF$Family[match(metazoansOtus$nearestGenus, taxonomyDF$genus)])

##add missing species

Spp.list <- unique(metazoansOtus$nearestSpecies[is.na(metazoansOtus$nearestKingdom) & !is.na(metazoansOtus$nearestSpecies)])
taxonomyL <- lapply(Spp.list, getTaxed)
taxonomyDF <- data.frame(genus=Spp.list,
                Kingdom = unlist(sapply(taxonomyL, function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="kingdom"])[1])),
                Phylum = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="phylum"])[1])),
                Class = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="class"])[1])),
                Order = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="order"])[1])),
                Family = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="family"])[1])), stringsAsFactors=FALSE)
metazoansOtus$nearestKingdom[metazoansOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Kingdom[match(metazoansOtus$nearestSpecies[metazoansOtus$nearestSpecies %in% Spp.list], Spp.list)])
metazoansOtus$nearestPhylum[metazoansOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Phylum[match(metazoansOtus$nearestSpecies[metazoansOtus$nearestSpecies %in% Spp.list], Spp.list)])
metazoansOtus$nearestClass[metazoansOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Class[match(metazoansOtus$nearestSpecies[metazoansOtus$nearestSpecies %in% Spp.list], Spp.list)])
metazoansOtus$nearestOrder[metazoansOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Order[match(metazoansOtus$nearestSpecies[metazoansOtus$nearestSpecies %in% Spp.list], Spp.list)])
metazoansOtus$nearestFamily[metazoansOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Family[match(metazoansOtus$nearestSpecies[metazoansOtus$nearestSpecies %in% Spp.list], Spp.list)])


###################
##   nematodes
###################

###################################################################################
###This block should run on any taxa by only changing the vars at top.
##
## nematodes
mainPath <- "~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/"
taxaName <- "nematodes"
taxaPrimer <- "Nematodes_F1"
 dataBase <-  "/Users/dickiei/BLAST/db/all.Metazoa18S_ARSA.fasta"
    
madPiper(mainPath = mainPath,
    taxaName = taxaName,
    taxaPrimer = taxaPrimer,
    primerLength =  nchar(primers$seq[primers$name == "F1"]),
    dataBase = dataBase,
    projectName = "NGBA_phaseI", truncateL = 300)
    
setwd(paste(mainPath, taxaName, sep=""))



primerLength <-  nchar(primers$seq[primers$name == "F1"])
dataBase <-  "/Users/dickiei/BLAST/db/all.Metazoa18S_ARSA.fasta"
projectName <- "NGBA_phaseI"
    dir.create(paste(mainPath, taxaName, sep=""))
    setwd(paste(mainPath, taxaName, sep=""))
    fileName <- paste(projectName, "_", taxaName, ".fastq", sep="")
    if(file.exists(fileName)) file.remove(fileName)
    writeFastq(narrow(allseqdata[seqData$primer %is% taxaPrimer], 
    primerLength+as.numeric(!is.na(seqData$MID[seqData$primer %is% taxaPrimer]))*10),
    file=fileName, compress=FALSE)
    system(paste("cd ", mainPath, taxaName, sep=""))
      ##system(paste("usearch -fastq_eestats2 ", fileName, " -output eestats_", fileName, ".txt", sep=""))    #Trim to 300 bo

    system(paste("usearch -fastq_filter ", fileName, " -fastq_trunclen 300 -fastaout trimmed.fa", sep=""))
    system(paste("usearch -fastq_filter ", fileName, " -fastq_trunclen 300 -fastq_maxee 1.0 -fastaout filtered.fa", sep=""))
    system("usearch -derep_fulllength filtered.fa -sizeout -fastaout uniques.fa")
    system("usearch -cluster_otus uniques.fa -minsize 2 -otus otus.fa -relabel Otu")
    system("usearch -usearch_global trimmed.fa -db otus.fa -strand plus -id 0.97 --blast6out sequence_match_to_Otus.txt")
trimmedReads <- readFAST("trimmed.fa")
uniquesfa <- readFAST("uniques.fa")
    uniquematch <- uniquesfa$descs[match(trimmedReads$sequences, uniquesfa$sequences)]
    trimmedReads$descs <- gsub(">", "", trimmedReads$descs)
    seqData$uniqueSeq[match( trimmedReads$descs,seqData$ID)]  <- gsub(">", "", uniquematch) 
OtuMatching <- read.table("sequence_match_to_Otus.txt")
OtuDBmatch <- read.table("Otus_match_to_DB.txt", sep="\t")
seqData$Otu[seqData$primer %is% taxaPrimer] <- as.character(OtuMatching$V2)[match(seqData$ID[seqData$primer %is% taxaPrimer], OtuMatching$V1)]
##

nematodesResults <- table(seqData$plot[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)], seqData$Otu[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)])

##Name Otus using BLAST.  Note that Usearch DOES NOT WORK for this, as heuristic (gives a match, but not BEST match).
system(paste("cd ", mainPath, taxaName, sep=""))
##~/BLAST/bin/formatdb -i  /Users/dickiei/BLAST/db/UNITE_public_31.01.2016clean.fasta -p F
##~/BLAST/bin/blastn -query otus.fa -db /Users/dickiei/BLAST/db/MetazoaCOIJuly2016clean.fasta -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching.txt

OtusFasta <- readFAST("otus.fa")
cores <- 15
blockstarts <- floor(seq(1, length(OtusFasta$descs), length.out=cores+1))[-(cores+1)]
blockends <- c(blockstarts[-1]-1,length(OtusFasta$descs))
for(core.i in 1:cores)
    {
    writeFast(unlist(OtusFasta$sequences[blockstarts[core.i]:blockends[core.i]]),OtusFasta$descs[blockstarts[core.i]:blockends[core.i]], file=paste("tempFasta",core.i,".fasta", sep=""))
    }
cl <- makeCluster(cores)
registerDoParallel(cl, cores=cores)
tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% 
    {
    system(paste("~/BLAST/bin/blastn -query tempFasta",core.i,
    ".fasta -db ", dataBase , " -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching",
    core.i, ".txt", sep=""))
    }
temp <- c()
for(core.i in 1:cores)
    {  
    temp <- append(temp, readLines(paste("OTU_Blast_matching", core.i, ".txt", sep="")))
    }
matches <- grep("# 1 hits found", temp)+1

nematodesOtus <- data.frame(matrix(unlist(strsplit(temp[matches], "\t")), byrow=TRUE, ncol=length(unlist(strsplit(temp[matches][1], "\t")))), stringsAsFactors=FALSE)[,1:4]
names(nematodesOtus) <- c("OTU", "OtuMatch", "identity", "length")
nematodesOtus$OtuMatch[as.numeric(nematodesOtus$length) < 200] <- NA

source("~/GoogleDrive/GenBankNames")
temp <- readFAST(dataBase)
temp$names <- unlist(lapply(strsplit(temp$descs, " "), function(x) paste(x[2],x[3])))

temp$ID <- unlist(lapply(strsplit(temp$descs, " "), function(x) x[1]))
nematodesOtus$nearestSpecies <- temp$names[match(nematodesOtus$OtuMatch, temp$ID)]
nematodesOtus$nearestGenus <- unlist(sapply(strsplit(nematodesOtus$nearestSpecies, " "), "[", 1))

taxonomyL <- lapply(unique(nematodesOtus$nearestGenus), getTaxed)
taxonomyDF <- data.frame(genus=unique(nematodesOtus$nearestGenus),
                Kingdom = unlist(sapply(taxonomyL, function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="kingdom"])[1])),
                Phylum = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error",as.character(x$name)[x$rank=="phylum"])[1])),
                Class = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="class"])[1])),
                Order = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="order"])[1])),
                Family = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="family"])[1])))
nematodesOtus$nearestKingdom <- as.character(taxonomyDF$Kingdom[match(nematodesOtus$nearestGenus, taxonomyDF$genus)])
nematodesOtus$nearestPhylum <- as.character(taxonomyDF$Phylum[match(nematodesOtus$nearestGenus, taxonomyDF$genus)])
nematodesOtus$nearestClass <- as.character(taxonomyDF$Class[match(nematodesOtus$nearestGenus, taxonomyDF$genus)])
nematodesOtus$nearestOrder <- as.character(taxonomyDF$Order[match(nematodesOtus$nearestGenus, taxonomyDF$genus)])
nematodesOtus$nearestFamily <- as.character(taxonomyDF$Family[match(nematodesOtus$nearestGenus, taxonomyDF$genus)])

##add missing species

Spp.list <- unique(nematodesOtus$nearestSpecies[is.na(nematodesOtus$nearestKingdom) & !is.na(nematodesOtus$nearestSpecies)])
taxonomyL <- lapply(Spp.list, getTaxed)
taxonomyDF <- data.frame(genus=Spp.list,
                Kingdom = unlist(sapply(taxonomyL, function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="kingdom"])[1])),
                Phylum = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="phylum"])[1])),
                Class = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="class"])[1])),
                Order = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="order"])[1])),
                Family = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="family"])[1])), stringsAsFactors=FALSE)
nematodesOtus$nearestKingdom[nematodesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Kingdom[match(nematodesOtus$nearestSpecies[nematodesOtus$nearestSpecies %in% Spp.list], Spp.list)])
nematodesOtus$nearestPhylum[nematodesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Phylum[match(nematodesOtus$nearestSpecies[nematodesOtus$nearestSpecies %in% Spp.list], Spp.list)])
nematodesOtus$nearestClass[nematodesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Class[match(nematodesOtus$nearestSpecies[nematodesOtus$nearestSpecies %in% Spp.list], Spp.list)])
nematodesOtus$nearestOrder[nematodesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Order[match(nematodesOtus$nearestSpecies[nematodesOtus$nearestSpecies %in% Spp.list], Spp.list)])
nematodesOtus$nearestFamily[nematodesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Family[match(nematodesOtus$nearestSpecies[nematodesOtus$nearestSpecies %in% Spp.list], Spp.list)])


###################
##   invertebrates
###################


## http://ddbj.nig.ac.jp/arsa/search?lang=en&cond=advanced_search&pa=&an=&sl1=200&sl2=5000&mt=DNA&_mt=on&_mf=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&dt1=&dt2=&df=%28COI+OR+%22CO1%22+OR+%22cytochrome+oxidase%22+OR+%22COX1%22+OR+%22cytochrome+c+oxidase%22%29+NOT+%28%22unverified%22+OR+%22uncultured%22%29&kw=&og=&ln=Metazoa&ra=&rt=&rj=&rp=&cm=&fq%5B0%5D.featureKey=&fq%5B0%5D.qualifierName=&fq%5B0%5D.qualifierValue=&at=&sortTarget=score&sortOrder=desc&displayFields=PrimaryAccessionNumber&displayFields=Definition&displayFields=SequenceLength&displayFields=MolecularType&displayFields=Organism&_displayFields=on&op=AND
system("~/BLAST/bin/formatdb -i  /Users/dickiei/BLAST/db/all.MetazoaCO1_ARSA.fasta -p F")


###################################################################################
###This block should run on any taxa by only changing the vars at top.
##
## invertebrates
mainPath <- "~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/"
taxaName <- "invertebrates"
taxaPrimer <- "Invertebrates_LCO1490"
dataBase <- "~/BLAST/db/all.MetazoaCO1_ARSA.fasta"
    
madPiper(mainPath = mainPath,
    taxaName = taxaName,
    taxaPrimer = taxaPrimer,
    primerLength =  nchar(primers$seq[primers$name == "LCO1490"]),
    dataBase =   dataBase,
    projectName = "NGBA_phaseI", truncateL = 250)
    
setwd(paste(mainPath, taxaName, sep=""))



trimmedReads <- readFAST("trimmed.fa")
uniquesfa <- readFAST("uniques.fa")
    uniquematch <- uniquesfa$descs[match(trimmedReads$sequences, uniquesfa$sequences)]
    trimmedReads$descs <- gsub(">", "", trimmedReads$descs)
    seqData$uniqueSeq[match( trimmedReads$descs,seqData$ID)]  <- gsub(">", "", uniquematch) 
OtuMatching <- read.table("sequence_match_to_Otus.txt")
OtuDBmatch <- read.table("Otus_match_to_DB.txt", sep="\t")
seqData$Otu[seqData$primer %is% taxaPrimer] <- as.character(OtuMatching$V2)[match(seqData$ID[seqData$primer %is% taxaPrimer], OtuMatching$V1)]
##

invertebratesResults <- table(seqData$plot[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)], seqData$Otu[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)])

##Name Otus using BLAST.  Note that Usearch DOES NOT WORK for this, as heuristic (gives a match, but not BEST match).
system(paste("cd ", mainPath, taxaName, sep=""))
##~/BLAST/bin/formatdb -i  /Users/dickiei/BLAST/db/UNITE_public_31.01.2016clean.fasta -p F
##~/BLAST/bin/blastn -query otus.fa -db /Users/dickiei/BLAST/db/MetazoaCOIJuly2016clean.fasta -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching.txt

OtusFasta <- readFAST("otus.fa")
cores <- 15
blockstarts <- floor(seq(1, length(OtusFasta$descs), length.out=cores+1))[-(cores+1)]
blockends <- c(blockstarts[-1]-1,length(OtusFasta$descs))
for(core.i in 1:cores)
    {
    writeFast(unlist(OtusFasta$sequences[blockstarts[core.i]:blockends[core.i]]),OtusFasta$descs[blockstarts[core.i]:blockends[core.i]], file=paste("tempFasta",core.i,".fasta", sep=""))
    }
cl <- makeCluster(cores)
registerDoParallel(cl, cores=cores)
tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% 
    {
    system(paste("~/BLAST/bin/blastn -query tempFasta",core.i,
    ".fasta -db ", dataBase , " -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching",
    core.i, ".txt", sep=""))
    }
temp <- c()
for(core.i in 1:cores)
    {  
    temp <- append(temp, readLines(paste("OTU_Blast_matching", core.i, ".txt", sep="")))
    }
matches <- grep("# 1 hits found", temp)+1

invertebratesOtus <- data.frame(matrix(unlist(strsplit(temp[matches], "\t")), byrow=TRUE, ncol=length(unlist(strsplit(temp[matches][1], "\t")))), stringsAsFactors=FALSE)[,1:4]
names(invertebratesOtus) <- c("OTU", "OtuMatch", "identity", "length")
invertebratesOtus$OtuMatch[as.numeric(invertebratesOtus$length) < 200] <- NA

source("~/GoogleDrive/GenBankNames")
temp <- readFAST(dataBase)
temp$names <- unlist(lapply(strsplit(temp$descs, " "), function(x) paste(x[2],x[3])))

temp$ID <- unlist(lapply(strsplit(temp$descs, " "), function(x) x[1]))
invertebratesOtus$nearestSpecies <- temp$names[match(invertebratesOtus$OtuMatch, temp$ID)]
invertebratesOtus$nearestGenus <- unlist(sapply(strsplit(invertebratesOtus$nearestSpecies, " "), "[", 1))

taxonomyL <- lapply(unique(invertebratesOtus$nearestGenus), getTaxed)
taxonomyDF <- data.frame(genus=unique(invertebratesOtus$nearestGenus),
                Kingdom = unlist(sapply(taxonomyL, function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="kingdom"])[1])),
                Phylum = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error",as.character(x$name)[x$rank=="phylum"])[1])),
                Class = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="class"])[1])),
                Order = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="order"])[1])),
                Family = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="family"])[1])))
invertebratesOtus$nearestKingdom <- as.character(taxonomyDF$Kingdom[match(invertebratesOtus$nearestGenus, taxonomyDF$genus)])
invertebratesOtus$nearestPhylum <- as.character(taxonomyDF$Phylum[match(invertebratesOtus$nearestGenus, taxonomyDF$genus)])
invertebratesOtus$nearestClass <- as.character(taxonomyDF$Class[match(invertebratesOtus$nearestGenus, taxonomyDF$genus)])
invertebratesOtus$nearestOrder <- as.character(taxonomyDF$Order[match(invertebratesOtus$nearestGenus, taxonomyDF$genus)])
invertebratesOtus$nearestFamily <- as.character(taxonomyDF$Family[match(invertebratesOtus$nearestGenus, taxonomyDF$genus)])

##add missing species

Spp.list <- unique(invertebratesOtus$nearestSpecies[is.na(invertebratesOtus$nearestKingdom) & !is.na(invertebratesOtus$nearestSpecies)])
taxonomyL <- lapply(Spp.list, getTaxed)
taxonomyDF <- data.frame(genus=Spp.list,
                Kingdom = unlist(sapply(taxonomyL, function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="kingdom"])[1])),
                Phylum = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="phylum"])[1])),
                Class = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="class"])[1])),
                Order = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="order"])[1])),
                Family = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="family"])[1])), stringsAsFactors=FALSE)
invertebratesOtus$nearestKingdom[invertebratesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Kingdom[match(invertebratesOtus$nearestSpecies[invertebratesOtus$nearestSpecies %in% Spp.list], Spp.list)])
invertebratesOtus$nearestPhylum[invertebratesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Phylum[match(invertebratesOtus$nearestSpecies[invertebratesOtus$nearestSpecies %in% Spp.list], Spp.list)])
invertebratesOtus$nearestClass[invertebratesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Class[match(invertebratesOtus$nearestSpecies[invertebratesOtus$nearestSpecies %in% Spp.list], Spp.list)])
invertebratesOtus$nearestOrder[invertebratesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Order[match(invertebratesOtus$nearestSpecies[invertebratesOtus$nearestSpecies %in% Spp.list], Spp.list)])
invertebratesOtus$nearestFamily[invertebratesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Family[match(invertebratesOtus$nearestSpecies[invertebratesOtus$nearestSpecies %in% Spp.list], Spp.list)])




save(file=paste(mainPath,"All_R_output_UPARSE_NGBA_PhaseI",sep=""), list=ls())