## This is the analysis of the NGBA project data from phase I, using a UPARSE based pipeline (with Vparse for a few steps ## where a 64 bit version was required). Most of the code follows http://drive5.com/usearch/manual/upp_454.html ## ## It has these known limitations ## Homopolymers are not dealt with in any way ## Only the primers used in the publication are analyzed ## For all data a read length of only 250 bp is being used. This is recommended as a rule of thumb but should be checked ## using fastq_eestats2 ##Convert to fastq format RatioMinProduct <- 0.5 source("~/GoogleDrive/R_DNA_tools/FASTA_tools.txt") library("parallel") library("foreach") library("doParallel") ##Download and install python scripts from here: http://drive5.com/python/ library(ShortRead) '%is%' <- function(x, y) { x == y & !is.na(x)} dir <- "/Volumes/Big_Data/Next_gen_biodiversity_assessment/NGBA_454_Data/RawData" setwd(dir) files <- list.files() madPiper <- function(mainPath, taxaName, taxaPrimer, primerLength, dataBase, projectName, truncateL=250) ## Function to run the Uparse pipeline from within the R environment ## based very closely on http://drive5.com/usearch/manual/upp_454.html { dir.create(paste(mainPath, taxaName, sep="")) setwd(paste(mainPath, taxaName, sep="")) fileName <- paste(projectName, "_", taxaName, ".fastq", sep="") if(file.exists(fileName)) file.remove(fileName) writeFastq(narrow(allseqdata[seqData$primer %is% taxaPrimer], primerLength+as.numeric(!is.na(seqData$MID[seqData$primer %is% taxaPrimer]))*10), file=fileName, compress=FALSE) system(paste("cd ", mainPath, taxaName, sep="")) ##system(paste("usearch -fastq_eestats2 ", fileName, " -output eestats_", fileName, ".txt", sep="")) ##This shows >62% at 250, 24% at 300 system(paste("usearch -fastq_filter ", fileName, " -fastq_trunclen ", truncateL, " -fastaout trimmed.fa", sep="")) system(paste("usearch -fastq_filter ", fileName, " -fastq_trunclen ", truncateL, " -fastq_maxee 1.0 -fastaout filtered.fa", sep="")) temp <- readFAST("filtered.fa") system("usearch -derep_fulllength filtered.fa -sizeout -fastaout uniques.fa") system("usearch -cluster_otus uniques.fa -minsize 2 -otus otus.fa -relabel Otu") system("usearch -usearch_global trimmed.fa -db otus.fa -strand plus -id 0.97 --blast6out sequence_match_to_Otus.txt") } ############################################################################################################################ for(file.i in files) { print(paste("contructing fastq for", file.i)) filesIn <- list.files(paste(dir, file.i, sep="")) system(paste("python ~/drive5_py/faqual2fastq2.py ", dir, file.i,"/",filesIn[grep(".fasta$",filesIn)], " ", dir, file.i,"/",filesIn[grep(".qual$",filesIn)], " > ", filesIn[grep(".fasta$",filesIn)], ".fastq", sep="")) print("completed") } ##Files then manually moved into a folder up a level in hierarchy. setwd("/Volumes/Big_Data/Next_gen_biodiversity_assessment/NGBA_454_Data/FastQ_from_NGBA_PhaseI") ##Need to write a fastq file for each primer, files <- list.files() files <- files[grep("fastq", files)] allseqdata <- readFastq(files[1]) seqDataID <- rep(files[1],length(allseqdata)) for(file.i in files[-1]) { t <- readFastq(file.i) allseqdata <- append(allseqdata, t) seqDataID <- append(seqDataID, rep(file.i,length(t))) } seqData <- data.frame(file = seqDataID) primers <- read.csv("~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/primers.csv", stringsAsFactors=FALSE) primers$groupPrimer <- paste(primers$orgGroup,primers$name,sep="_") primers$length <- nchar(primers$seq) primers$minProduct <- RatioMinProduct*primers$productLength seqData$primer <- NA seqData$primer[iupacGrep(primers$seq[primers$name == "PRK341"], substr(sread(allseqdata), 1, 40))] <- "Bacteria_PRK341" seqData$primer[iupacGrep(primers$seq[primers$name == "gITS7"], substr(sread(allseqdata), 1, 40))] <- "Fungi_gITS7" seqData$primer[iupacGrep(primers$seq[primers$name == "18S11b"], substr(sread(allseqdata), 1, 40))] <- "Metazoans_18S11b" seqData$primer[iupacGrep(primers$seq[primers$name == "F1"], substr(sread(allseqdata), 1, 40))] <- "Nematodes_F1" seqData$primer[iupacGrep(primers$seq[primers$name == "LCO1490"], substr(sread(allseqdata), 1, 40))] <- "Invertebrates_LCO1490" seq.starts <- substr(sread(allseqdata), 1, 10) MID1 <- "ACGAGTGCGT" MID8 <- "CTCGCGTGTC" MID2 <- "ACGCTCGACA" MID9 <- "TAGTATCAGC" MID3 <- "AGACGCACTC" MID10 <- "TCTCTATGCG" MID4 <- "AGCACTGTAG" MID11 <- "TGATACGTCT" MID5 <- "ATCAGACACG" MID12 <- "TACTGAGCTA" MID6 <- "ATATCGCGAG" MID13 <- "CATAGTAGTG" MID7 <- "CGTGTCTCTA" MID14 <- "CGAGAGATAC" MIDlist <- c(MID1, MID2, MID3, MID4, MID5, MID6, MID7, MID8, MID9, MID10, MID11, MID12, MID13, MID14) MIDlength <- 10 #length in bp of MID tags. At present all must be the same seqData$MID <- match(seq.starts, MIDlist) seqData$plot <- unlist(lapply(strsplit(as.character(seqData$file), "_"), function(x) x[5])) seqData$plotMap <- paste(seqData$file, seqData$primer, seqData$MID, sep="/") gapFilling <- read.csv("~/GoogleDrive/NGBA_GoogleDrive/Data_analysis/Wairau_gap_filling_codes.csv", stringsAsFactors=FALSE) gapFilling$fullCode <- gsub("rawdata", "reads.fasta.fastq", gapFilling$fullCode) seqData$plot[seqData$plotMap %in% gapFilling$fullCode] <- gapFilling$Plot[match(seqData$plotMap[seqData$plotMap %in% gapFilling$fullCode], gapFilling$fullCode)] seqData$plot[nchar(seqData$plot)>2] <- paste("I",seqData$plot[nchar(seqData$plot)>2], sep="/") ##This line takes the DOC plots and gives them an "I" at start seqData$plot[seqData$plot == "I/BO116"] <- "X/BO116" seqData$plot[seqData$plot == "I/AU127"] <- "X/AU127" seqData$plot[seqData$plot == "I/U155"] <- "X/U155" seqData$plot[seqData$plot == "I/AL132"] <- "X/AL132" ##Note that there are more sequences left in the "Repeat" block, as I didn't match all the primers. seqData$ID <- unlist(lapply(strsplit(as.character(id(allseqdata))," "), "[", 1)) seqData$uniqueSeq <- NA seqData$Otu <- NA ##db <- readFAST("~/BLAST/db/UNITE_public_31.01.2016clean.fasta") ##UNID <- grep("f__uniden", db$descs) ##writeFast(db$sequences[-UNID], db$descs[-UNID], file = "~/BLAST/db/UNITE_public_31.01.2016cleanID.fasta") ################################################################################### ###This block should run on any taxa by only changing the vars at top. ## ## FUNGI: mainPath <- "~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/" taxaName <- "fungi" taxaPrimer <- "Fungi_gITS7" madPiper(mainPath = mainPath, taxaName = taxaName, taxaPrimer = taxaPrimer, primerLength = nchar(primers$seq[primers$name == "gITS7"]), dataBase = "~/BLAST/db/UNITE_public_31.01.2016cleanID.fasta", projectName = "NGBA_phaseI") setwd(paste(mainPath, taxaName, sep="")) trimmedReads <- readFAST("trimmed.fa") uniquesfa <- readFAST("uniques.fa") uniquematch <- uniquesfa$descs[match(trimmedReads$sequences, uniquesfa$sequences)] trimmedReads$descs <- gsub(">", "", trimmedReads$descs) seqData$uniqueSeq[match( trimmedReads$descs,seqData$ID)] <- gsub(">", "", uniquematch) OtuMatching <- read.table("sequence_match_to_Otus.txt") OtuDBmatch <- read.table("Otus_match_to_DB.txt", sep="\t") seqData$Otu[seqData$primer %is% taxaPrimer] <- as.character(OtuMatching$V2)[match(seqData$ID[seqData$primer %is% taxaPrimer], OtuMatching$V1)] ## fungiResults <- table(seqData$plot[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)], seqData$Otu[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)]) ##Name Otus using BLAST. Note that Usearch DOES NOT WORK for this, as heuristic (gives a match, but not BEST match). system(paste("cd ", mainPath, taxaName, sep="")) ##~/BLAST/bin/formatdb -i /Users/dickiei/BLAST/db/UNITE_public_31.01.2016clean.fasta -p F ##~/BLAST/bin/blastn -query otus.fa -db /Users/dickiei/BLAST/db/MetazoaCOIJuly2016clean.fasta -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching.txt OtusFasta <- readFAST("otus.fa") cores <- 15 blockstarts <- floor(seq(1, length(OtusFasta$descs), length.out=cores+1))[-(cores+1)] blockends <- c(blockstarts[-1]-1,length(OtusFasta$descs)) for(core.i in 1:cores) { writeFast(unlist(OtusFasta$sequences[blockstarts[core.i]:blockends[core.i]]),OtusFasta$descs[blockstarts[core.i]:blockends[core.i]], file=paste("tempFasta",core.i,".fasta", sep="")) } cl <- makeCluster(cores) registerDoParallel(cl, cores=cores) tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% { system(paste("~/BLAST/bin/blastn -query tempFasta",core.i, ".fasta -db ",dataBase," -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching", core.i, ".txt", sep="")) } temp <- c() for(core.i in 1:cores) { temp <- append(temp, readLines(paste("OTU_Blast_matching", core.i, ".txt", sep=""))) } matches <- grep("# 1 hits found", temp)+1 fungiOtus <- data.frame(matrix(unlist(strsplit(temp[matches], "\t")), byrow=TRUE, ncol=length(unlist(strsplit(temp[matches][1], "\t")))), stringsAsFactors=FALSE)[,1:4] names(fungiOtus) <- c("OTU", "OtuMatch", "identity", "length") fungiOtus$OtuMatch[as.numeric(fungiOtus$length) < 200] <- NA fungiOtus$nearestKingdom <- as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), function(x) gsub("k__", "", ifelse(length(x[grep("k__", x)])>0, x[grep("k__", x)], NA))))) fungiOtus$nearestPhylum <- as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), function(x) gsub("p__", "", ifelse(length(x[grep("p__", x)])>0, x[grep("p__", x)], NA))))) fungiOtus$nearestClass <- as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), function(x) gsub("c__", "", ifelse(length(x[grep("c__", x)])>0, x[grep("c__", x)], NA))))) fungiOtus$nearestOrder <- as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), function(x) gsub("o__", "", ifelse(length(x[grep("o__", x)])>0, x[grep("o__", x)], NA))))) fungiOtus$nearestFamily <- as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), function(x) gsub("f__", "", ifelse(length(x[grep("f__", x)])>0, x[grep("f__", x)], NA))))) fungiOtus$nearestGenus <- as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), function(x) gsub("g__", "", ifelse(length(x[grep("g__", x)])>0, x[grep("g__", x)], NA))))) fungiOtus$nearestSpecies <- as.factor(unlist(sapply(strsplit(fungiOtus$OtuMatch, "[;|]"), function(x) gsub("s__", "", ifelse(length(x[grep("s__", x)])>0, x[grep("s__", x)], NA))))) save(file=paste(mainPath,"All_R_output_UPARSE_NGBA_PhaseI",sep=""), list=ls()) ################### ## Bacteria ################### ################################################################################### ###This block should run on any taxa by only changing the vars at top. ## ## Bacteria mainPath <- "~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/" taxaName <- "bacteria" taxaPrimer <- "Bacteria_PRK341" madPiper(mainPath = mainPath, taxaName = taxaName, taxaPrimer = taxaPrimer, primerLength = nchar(primers$seq[primers$name == "PRK341"]), dataBase = "~/BLAST/db/current_GREENGENES_gg16S_unaligned.fasta", projectName = "NGBA_phaseI", truncateL=350) setwd(paste(mainPath, taxaName, sep="")) trimmedReads <- readFAST("trimmed.fa") uniquesfa <- readFAST("uniques.fa") uniquematch <- uniquesfa$descs[match(trimmedReads$sequences, uniquesfa$sequences)] trimmedReads$descs <- gsub(">", "", trimmedReads$descs) seqData$uniqueSeq[match( trimmedReads$descs,seqData$ID)] <- gsub(">", "", uniquematch) OtuMatching <- read.table("sequence_match_to_Otus.txt") OtuDBmatch <- read.table("Otus_match_to_DB.txt", sep="\t") seqData$Otu[seqData$primer %is% taxaPrimer] <- as.character(OtuMatching$V2)[match(seqData$ID[seqData$primer %is% taxaPrimer], OtuMatching$V1)] ## bacteriaResults <- table(seqData$plot[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)], seqData$Otu[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)]) ##Name Otus using BLAST. Note that Usearch DOES NOT WORK for this, as heuristic (gives a match, but not BEST match). system(paste("cd ", mainPath, taxaName, sep="")) ##~/BLAST/bin/formatdb -i /Users/dickiei/BLAST/db/UNITE_public_31.01.2016clean.fasta -p F ##~/BLAST/bin/blastn -query otus.fa -db /Users/dickiei/BLAST/db/MetazoaCOIJuly2016clean.fasta -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching.txt dataBase <- "~/BLAST/db/current_GREENGENES_gg16S_unaligned.fasta" OtusFasta <- readFAST("otus.fa") cores <- 15 blockstarts <- floor(seq(1, length(OtusFasta$descs), length.out=cores+1))[-(cores+1)] blockends <- c(blockstarts[-1]-1,length(OtusFasta$descs)) for(core.i in 1:cores) { writeFast(unlist(OtusFasta$sequences[blockstarts[core.i]:blockends[core.i]]),OtusFasta$descs[blockstarts[core.i]:blockends[core.i]], file=paste("tempFasta",core.i,".fasta", sep="")) } cl <- makeCluster(cores) registerDoParallel(cl, cores=cores) tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% { system(paste("~/BLAST/bin/blastn -query tempFasta",core.i, ".fasta -db ", dataBase , " -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching", core.i, ".txt", sep="")) } temp <- c() for(core.i in 1:cores) { temp <- append(temp, readLines(paste("OTU_Blast_matching", core.i, ".txt", sep=""))) } matches <- grep("# 1 hits found", temp)+1 bacteriaOtus <- data.frame(matrix(unlist(strsplit(temp[matches], "\t")), byrow=TRUE, ncol=length(unlist(strsplit(temp[matches][1], "\t")))), stringsAsFactors=FALSE)[,1:4] names(bacteriaOtus) <- c("OTU", "OtuMatch", "identity", "length") bacteriaOtus$OtuMatch[as.numeric(bacteriaOtus$length) < 200] <- NA Bacteria <- readFAST("~/GoogleDrive/NGBA_GoogleDrive/Reference_libraries/current_GREENGENES_gg16S_unaligned.fasta") ##Note that the above is a BIG file. #update bacterial data ##data.frame(OTU = sapply(bacteriaDescList, function(x) x[1]), GB = sapply(bacteriaDescList, function(x) x[2])) bacteriaDescList <- strsplit(Bacteria$descs, " ") remove(list="Bacteria") ##To save working memory, I remove the bacterial data after using it. bacteriaData <- cbind(OTU = sapply(bacteriaDescList, function(x) x[1]), genBank = sapply(bacteriaDescList, function(x) x[2]), kingdom = sapply(bacteriaDescList, function(x) gsub("k__", "", x[grep("k__", x)])), phylum = sapply(bacteriaDescList, function(x) gsub("p__", "", x[grep("p__", x)])), class = sapply(bacteriaDescList, function(x) gsub("c__", "", x[grep("c__", x)])), order = sapply(bacteriaDescList, function(x) gsub("o__", "", x[grep("o__", x)])), family = sapply(bacteriaDescList, function(x) gsub("f__", "", x[grep("f__", x)])), genus = sapply(bacteriaDescList, function(x) gsub("g__", "", x[grep("g__", x)])), species = sapply(bacteriaDescList, function(x) gsub("s__", "", x[grep("s__", x)]))) bacteriaData <- apply(bacteriaData, 2, function(x) gsub(";", "", x)) bacteriaData <- apply(bacteriaData, 2, function(x) gsub("character(0)", NA, x)) bacteriaData <- as.data.frame(bacteriaData) bacteriaOtus$nearestKingdom <- bacteriaData$kingdom[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)] bacteriaOtus$nearestPhylum <- bacteriaData$phylum[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)] bacteriaOtus$nearestClass <- bacteriaData$class[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)] bacteriaOtus$nearestOrder <- bacteriaData$order[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)] bacteriaOtus$nearestFamily <- bacteriaData$family[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)] bacteriaOtus$nearestGenus <- bacteriaData$genus[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)] bacteriaOtus$nearestSpecies <- bacteriaData$species[match(bacteriaOtus$OtuMatch, bacteriaData$OTU)] save(file=paste(mainPath,"All_R_output_UPARSE_NGBA_PhaseI",sep=""), list=ls()) ### ADDED TO RESTRICT BACTERIA: bacteriaOtus$sequences <- OtusFasta$sequence ## Run with restricted database source("~/GoogleDrive/NGBA_data/Compilation_code/DNA_data_analysis/PipelineFunctions.txt") unmatchedFasta <- list(sequences=bacteriaOtus$sequence[bacteriaOtus$nearestClass%in% c("","character(0)")], descs= bacteriaOtus$OTU[bacteriaOtus$nearestClass%in% c("","character(0)")]) dataBase <- "~/BLAST/db/current_GREENGENES_gg16S_unaligned_classID.fasta" cores <- 15 blockstarts <- floor(seq(1, length(unmatchedFasta$descs), length.out=cores+1))[-(cores+1)] blockends <- c(blockstarts[-1]-1,length(unmatchedFasta$descs)) for(core.i in 1:cores) { writeFast(unlist(unmatchedFasta$sequences[blockstarts[core.i]:blockends[core.i]]),unmatchedFasta$descs[blockstarts[core.i]:blockends[core.i]], file=paste("tempFasta",core.i,".fasta", sep="")) } cl <- makeCluster(cores) registerDoParallel(cl, cores=cores) tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% { system(paste("~/BLAST/bin/blastn -query tempFasta",core.i, ".fasta -db ", dataBase , " -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching", core.i, ".txt", sep="")) } temp <- c() for(core.i in 1:cores) { temp <- append(temp, readLines(paste("OTU_Blast_matching", core.i, ".txt", sep=""))) } matches <- grep("# 1 hits found", temp)+1 restrictedBacteriaOtus <- data.frame(matrix(unlist(strsplit(temp[matches], "\t")), byrow=TRUE, ncol=length(unlist(strsplit(temp[matches][1], "\t")))), stringsAsFactors=FALSE)[,1:4] names(restrictedBacteriaOtus) <- c("OTU", "OtuMatchRestricted", "identityRestricted", "lengthRestricted") restrictedBacteriaOtus$OtuMatch[as.numeric(restrictedBacteriaOtus$length) < 200] <- NA bacteriaOtus$OtuMatchRestricted <- bacteriaOtus$OtuMatch bacteriaOtus$identityRestricted <- bacteriaOtus$identity bacteriaOtus$lengthRestricted <- bacteriaOtus$length bacteriaOtus$OtuMatchRestricted[match(restrictedBacteriaOtus$OTU, bacteriaOtus$OTU)] <- restrictedBacteriaOtus$OtuMatchRestricted bacteriaOtus$identityRestricted[match(restrictedBacteriaOtus$OTU, bacteriaOtus$OTU)] <- restrictedBacteriaOtus$identityRestricted bacteriaOtus$lengthRestricted[match(restrictedBacteriaOtus$OTU, bacteriaOtus$OTU)] <- restrictedBacteriaOtus$length bacteriaOtus$nearestKingdom <- bacteriaData$kingdom[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)] bacteriaOtus$nearestPhylum <- bacteriaData$phylum[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)] bacteriaOtus$nearestClass <- bacteriaData$class[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)] bacteriaOtus$nearestOrder <- bacteriaData$order[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)] bacteriaOtus$nearestFamily <- bacteriaData$family[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)] bacteriaOtus$nearestGenus <- bacteriaData$genus[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)] bacteriaOtus$nearestSpecies <- bacteriaData$species[match(bacteriaOtus$OtuMatchRestricted, bacteriaData$OTU)] save(file=paste(mainPath,"Updated_bacteria_for_kate_2",sep=""), list=c("bacteriaOtus")) save(file=paste("~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/","Updated_bacteria_for_kate_2",sep=""), list=c("bacteriaOtus")) ##This file manually moved to shared drive later (5 May 2017) ################### ## Metazoans_18S11b ################### ##prep Metazoans data ##FAILURE -- COULDN'T GET THE WHOLE THING TO DOWNLOAD PROBLEM AT NCBI ##("Metazoa"[Organism] AND (18S[All Fields] OR "small subunit"[All Fields]) AND ("100"[SLEN] : "5000"[SLEN])) NOT environmental[All Fields] ##Because of issues, used http://ddbj.nig.ac.jp/arsa/search?lang=en&cond=advanced_search&pa=&an=&sl1=200&sl2=5000&mt=DNA&_mt=on&_mf=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&dt1=&dt2=&df=18S+OR+%22small+subunit%22+NOT+%22environmental%22&kw=&og=&ln=Metazoa&ra=&rt=&rj=&rp=&cm=&fq%5B0%5D.featureKey=&fq%5B0%5D.qualifierName=&fq%5B0%5D.qualifierValue=&at=&sortTarget=score&sortOrder=desc&displayFields=PrimaryAccessionNumber&displayFields=Definition&displayFields=SequenceLength&displayFields=MolecularType&displayFields=Organism&_displayFields=on&op=AND ##Saved as "all.Metazoa18S.August2016.fasta") system("~/BLAST/bin/formatdb -i /Users/dickiei/BLAST/db/all.Metazoa18S_ARSA.fasta -p F") ################################################################################### ###This block should run on any taxa by only changing the vars at top. ## ## metazoans mainPath <- "~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/" taxaName <- "metazoans" taxaPrimer <- "Metazoans_18S11b" dataBase <- "/Users/dickiei/BLAST/db/all.Metazoa18S_ARSA.fasta" madPiper(mainPath = mainPath, taxaName = taxaName, taxaPrimer = taxaPrimer, primerLength = nchar(primers$seq[primers$name == "18S11b"]), dataBase = dataBase, projectName = "NGBA_phaseI", truncateL=250) setwd(paste(mainPath, taxaName, sep="")) trimmedReads <- readFAST("trimmed.fa") uniquesfa <- readFAST("uniques.fa") uniquematch <- uniquesfa$descs[match(trimmedReads$sequences, uniquesfa$sequences)] trimmedReads$descs <- gsub(">", "", trimmedReads$descs) seqData$uniqueSeq[match( trimmedReads$descs,seqData$ID)] <- gsub(">", "", uniquematch) OtuMatching <- read.table("sequence_match_to_Otus.txt") seqData$Otu[seqData$primer %is% taxaPrimer] <- as.character(OtuMatching$V2)[match(seqData$ID[seqData$primer %is% taxaPrimer], OtuMatching$V1)] ## metazoansResults <- table(seqData$plot[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)], seqData$Otu[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)]) ##Name Otus using BLAST. Note that Usearch DOES NOT WORK for this, as heuristic (gives a match, but not BEST match). system(paste("cd ", mainPath, taxaName, sep="")) ##~/BLAST/bin/formatdb -i /Users/dickiei/BLAST/db/all.Metazoa18S.fasta -p F ##~/BLAST/bin/blastn -query otus.fa -db /Users/dickiei/BLAST/db/MetazoaCOIJuly2016clean.fasta -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching.txt OtusFasta <- readFAST("otus.fa") cores <- 15 blockstarts <- floor(seq(1, length(OtusFasta$descs), length.out=cores+1))[-(cores+1)] blockends <- c(blockstarts[-1]-1,length(OtusFasta$descs)) for(core.i in 1:cores) { writeFast(unlist(OtusFasta$sequences[blockstarts[core.i]:blockends[core.i]]),OtusFasta$descs[blockstarts[core.i]:blockends[core.i]], file=paste("tempFasta",core.i,".fasta", sep="")) } cl <- makeCluster(cores) registerDoParallel(cl, cores=cores) tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% { system(paste("~/BLAST/bin/blastn -query tempFasta",core.i, ".fasta -db ", dataBase , " -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching", core.i, ".txt", sep="")) } temp<-c() for(core.i in 1:cores) { temp <- append(temp, readLines(paste("OTU_Blast_matching", core.i, ".txt", sep=""))) } matches <- grep("# 1 hits found", temp)+1 metazoansOtus <- data.frame(matrix(unlist(strsplit(temp[matches], "\t")), byrow=TRUE, ncol=length(unlist(strsplit(temp[matches][1], "\t")))), stringsAsFactors=FALSE)[,1:4] names(metazoansOtus) <- c("OTU", "OtuMatch", "identity", "length") metazoansOtus$OtuMatch[as.numeric(metazoansOtus$length) < 200] <- NA source("~/GoogleDrive/GenBankNames") temp <- readFAST(dataBase) temp$names <- unlist(lapply(strsplit(temp$descs, " "), function(x) paste(x[2],x[3]))) temp$ID <- unlist(lapply(strsplit(temp$descs, " "), function(x) x[1])) metazoansOtus$nearestSpecies <- temp$names[match(metazoansOtus$OtuMatch, temp$ID)] metazoansOtus$nearestGenus <- unlist(sapply(strsplit(metazoansOtus$nearestSpecies, " "), "[", 1)) taxonomyL <- lapply(unique(metazoansOtus$nearestGenus), getTaxed) taxonomyDF <- data.frame(genus=unique(metazoansOtus$nearestGenus), Kingdom = unlist(sapply(taxonomyL, function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="kingdom"])[1])), Phylum = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error",as.character(x$name)[x$rank=="phylum"])[1])), Class = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="class"])[1])), Order = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="order"])[1])), Family = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="family"])[1]))) metazoansOtus$nearestKingdom <- as.character(taxonomyDF$Kingdom[match(metazoansOtus$nearestGenus, taxonomyDF$genus)]) metazoansOtus$nearestPhylum <- as.character(taxonomyDF$Phylum[match(metazoansOtus$nearestGenus, taxonomyDF$genus)]) metazoansOtus$nearestClass <- as.character(taxonomyDF$Class[match(metazoansOtus$nearestGenus, taxonomyDF$genus)]) metazoansOtus$nearestOrder <- as.character(taxonomyDF$Order[match(metazoansOtus$nearestGenus, taxonomyDF$genus)]) metazoansOtus$nearestFamily <- as.character(taxonomyDF$Family[match(metazoansOtus$nearestGenus, taxonomyDF$genus)]) ##add missing species Spp.list <- unique(metazoansOtus$nearestSpecies[is.na(metazoansOtus$nearestKingdom) & !is.na(metazoansOtus$nearestSpecies)]) taxonomyL <- lapply(Spp.list, getTaxed) taxonomyDF <- data.frame(genus=Spp.list, Kingdom = unlist(sapply(taxonomyL, function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="kingdom"])[1])), Phylum = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="phylum"])[1])), Class = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="class"])[1])), Order = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="order"])[1])), Family = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="family"])[1])), stringsAsFactors=FALSE) metazoansOtus$nearestKingdom[metazoansOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Kingdom[match(metazoansOtus$nearestSpecies[metazoansOtus$nearestSpecies %in% Spp.list], Spp.list)]) metazoansOtus$nearestPhylum[metazoansOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Phylum[match(metazoansOtus$nearestSpecies[metazoansOtus$nearestSpecies %in% Spp.list], Spp.list)]) metazoansOtus$nearestClass[metazoansOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Class[match(metazoansOtus$nearestSpecies[metazoansOtus$nearestSpecies %in% Spp.list], Spp.list)]) metazoansOtus$nearestOrder[metazoansOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Order[match(metazoansOtus$nearestSpecies[metazoansOtus$nearestSpecies %in% Spp.list], Spp.list)]) metazoansOtus$nearestFamily[metazoansOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Family[match(metazoansOtus$nearestSpecies[metazoansOtus$nearestSpecies %in% Spp.list], Spp.list)]) ################### ## nematodes ################### ################################################################################### ###This block should run on any taxa by only changing the vars at top. ## ## nematodes mainPath <- "~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/" taxaName <- "nematodes" taxaPrimer <- "Nematodes_F1" dataBase <- "/Users/dickiei/BLAST/db/all.Metazoa18S_ARSA.fasta" madPiper(mainPath = mainPath, taxaName = taxaName, taxaPrimer = taxaPrimer, primerLength = nchar(primers$seq[primers$name == "F1"]), dataBase = dataBase, projectName = "NGBA_phaseI", truncateL = 300) setwd(paste(mainPath, taxaName, sep="")) primerLength <- nchar(primers$seq[primers$name == "F1"]) dataBase <- "/Users/dickiei/BLAST/db/all.Metazoa18S_ARSA.fasta" projectName <- "NGBA_phaseI" dir.create(paste(mainPath, taxaName, sep="")) setwd(paste(mainPath, taxaName, sep="")) fileName <- paste(projectName, "_", taxaName, ".fastq", sep="") if(file.exists(fileName)) file.remove(fileName) writeFastq(narrow(allseqdata[seqData$primer %is% taxaPrimer], primerLength+as.numeric(!is.na(seqData$MID[seqData$primer %is% taxaPrimer]))*10), file=fileName, compress=FALSE) system(paste("cd ", mainPath, taxaName, sep="")) ##system(paste("usearch -fastq_eestats2 ", fileName, " -output eestats_", fileName, ".txt", sep="")) #Trim to 300 bo system(paste("usearch -fastq_filter ", fileName, " -fastq_trunclen 300 -fastaout trimmed.fa", sep="")) system(paste("usearch -fastq_filter ", fileName, " -fastq_trunclen 300 -fastq_maxee 1.0 -fastaout filtered.fa", sep="")) system("usearch -derep_fulllength filtered.fa -sizeout -fastaout uniques.fa") system("usearch -cluster_otus uniques.fa -minsize 2 -otus otus.fa -relabel Otu") system("usearch -usearch_global trimmed.fa -db otus.fa -strand plus -id 0.97 --blast6out sequence_match_to_Otus.txt") trimmedReads <- readFAST("trimmed.fa") uniquesfa <- readFAST("uniques.fa") uniquematch <- uniquesfa$descs[match(trimmedReads$sequences, uniquesfa$sequences)] trimmedReads$descs <- gsub(">", "", trimmedReads$descs) seqData$uniqueSeq[match( trimmedReads$descs,seqData$ID)] <- gsub(">", "", uniquematch) OtuMatching <- read.table("sequence_match_to_Otus.txt") OtuDBmatch <- read.table("Otus_match_to_DB.txt", sep="\t") seqData$Otu[seqData$primer %is% taxaPrimer] <- as.character(OtuMatching$V2)[match(seqData$ID[seqData$primer %is% taxaPrimer], OtuMatching$V1)] ## nematodesResults <- table(seqData$plot[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)], seqData$Otu[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)]) ##Name Otus using BLAST. Note that Usearch DOES NOT WORK for this, as heuristic (gives a match, but not BEST match). system(paste("cd ", mainPath, taxaName, sep="")) ##~/BLAST/bin/formatdb -i /Users/dickiei/BLAST/db/UNITE_public_31.01.2016clean.fasta -p F ##~/BLAST/bin/blastn -query otus.fa -db /Users/dickiei/BLAST/db/MetazoaCOIJuly2016clean.fasta -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching.txt OtusFasta <- readFAST("otus.fa") cores <- 15 blockstarts <- floor(seq(1, length(OtusFasta$descs), length.out=cores+1))[-(cores+1)] blockends <- c(blockstarts[-1]-1,length(OtusFasta$descs)) for(core.i in 1:cores) { writeFast(unlist(OtusFasta$sequences[blockstarts[core.i]:blockends[core.i]]),OtusFasta$descs[blockstarts[core.i]:blockends[core.i]], file=paste("tempFasta",core.i,".fasta", sep="")) } cl <- makeCluster(cores) registerDoParallel(cl, cores=cores) tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% { system(paste("~/BLAST/bin/blastn -query tempFasta",core.i, ".fasta -db ", dataBase , " -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching", core.i, ".txt", sep="")) } temp <- c() for(core.i in 1:cores) { temp <- append(temp, readLines(paste("OTU_Blast_matching", core.i, ".txt", sep=""))) } matches <- grep("# 1 hits found", temp)+1 nematodesOtus <- data.frame(matrix(unlist(strsplit(temp[matches], "\t")), byrow=TRUE, ncol=length(unlist(strsplit(temp[matches][1], "\t")))), stringsAsFactors=FALSE)[,1:4] names(nematodesOtus) <- c("OTU", "OtuMatch", "identity", "length") nematodesOtus$OtuMatch[as.numeric(nematodesOtus$length) < 200] <- NA source("~/GoogleDrive/GenBankNames") temp <- readFAST(dataBase) temp$names <- unlist(lapply(strsplit(temp$descs, " "), function(x) paste(x[2],x[3]))) temp$ID <- unlist(lapply(strsplit(temp$descs, " "), function(x) x[1])) nematodesOtus$nearestSpecies <- temp$names[match(nematodesOtus$OtuMatch, temp$ID)] nematodesOtus$nearestGenus <- unlist(sapply(strsplit(nematodesOtus$nearestSpecies, " "), "[", 1)) taxonomyL <- lapply(unique(nematodesOtus$nearestGenus), getTaxed) taxonomyDF <- data.frame(genus=unique(nematodesOtus$nearestGenus), Kingdom = unlist(sapply(taxonomyL, function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="kingdom"])[1])), Phylum = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error",as.character(x$name)[x$rank=="phylum"])[1])), Class = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="class"])[1])), Order = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="order"])[1])), Family = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="family"])[1]))) nematodesOtus$nearestKingdom <- as.character(taxonomyDF$Kingdom[match(nematodesOtus$nearestGenus, taxonomyDF$genus)]) nematodesOtus$nearestPhylum <- as.character(taxonomyDF$Phylum[match(nematodesOtus$nearestGenus, taxonomyDF$genus)]) nematodesOtus$nearestClass <- as.character(taxonomyDF$Class[match(nematodesOtus$nearestGenus, taxonomyDF$genus)]) nematodesOtus$nearestOrder <- as.character(taxonomyDF$Order[match(nematodesOtus$nearestGenus, taxonomyDF$genus)]) nematodesOtus$nearestFamily <- as.character(taxonomyDF$Family[match(nematodesOtus$nearestGenus, taxonomyDF$genus)]) ##add missing species Spp.list <- unique(nematodesOtus$nearestSpecies[is.na(nematodesOtus$nearestKingdom) & !is.na(nematodesOtus$nearestSpecies)]) taxonomyL <- lapply(Spp.list, getTaxed) taxonomyDF <- data.frame(genus=Spp.list, Kingdom = unlist(sapply(taxonomyL, function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="kingdom"])[1])), Phylum = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="phylum"])[1])), Class = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="class"])[1])), Order = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="order"])[1])), Family = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="family"])[1])), stringsAsFactors=FALSE) nematodesOtus$nearestKingdom[nematodesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Kingdom[match(nematodesOtus$nearestSpecies[nematodesOtus$nearestSpecies %in% Spp.list], Spp.list)]) nematodesOtus$nearestPhylum[nematodesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Phylum[match(nematodesOtus$nearestSpecies[nematodesOtus$nearestSpecies %in% Spp.list], Spp.list)]) nematodesOtus$nearestClass[nematodesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Class[match(nematodesOtus$nearestSpecies[nematodesOtus$nearestSpecies %in% Spp.list], Spp.list)]) nematodesOtus$nearestOrder[nematodesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Order[match(nematodesOtus$nearestSpecies[nematodesOtus$nearestSpecies %in% Spp.list], Spp.list)]) nematodesOtus$nearestFamily[nematodesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Family[match(nematodesOtus$nearestSpecies[nematodesOtus$nearestSpecies %in% Spp.list], Spp.list)]) ################### ## invertebrates ################### ## http://ddbj.nig.ac.jp/arsa/search?lang=en&cond=advanced_search&pa=&an=&sl1=200&sl2=5000&mt=DNA&_mt=on&_mf=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&_dv=on&dt1=&dt2=&df=%28COI+OR+%22CO1%22+OR+%22cytochrome+oxidase%22+OR+%22COX1%22+OR+%22cytochrome+c+oxidase%22%29+NOT+%28%22unverified%22+OR+%22uncultured%22%29&kw=&og=&ln=Metazoa&ra=&rt=&rj=&rp=&cm=&fq%5B0%5D.featureKey=&fq%5B0%5D.qualifierName=&fq%5B0%5D.qualifierValue=&at=&sortTarget=score&sortOrder=desc&displayFields=PrimaryAccessionNumber&displayFields=Definition&displayFields=SequenceLength&displayFields=MolecularType&displayFields=Organism&_displayFields=on&op=AND system("~/BLAST/bin/formatdb -i /Users/dickiei/BLAST/db/all.MetazoaCO1_ARSA.fasta -p F") ################################################################################### ###This block should run on any taxa by only changing the vars at top. ## ## invertebrates mainPath <- "~/BigData/Next_gen_biodiversity_assessment/NGBA_454_Data/" taxaName <- "invertebrates" taxaPrimer <- "Invertebrates_LCO1490" dataBase <- "~/BLAST/db/all.MetazoaCO1_ARSA.fasta" madPiper(mainPath = mainPath, taxaName = taxaName, taxaPrimer = taxaPrimer, primerLength = nchar(primers$seq[primers$name == "LCO1490"]), dataBase = dataBase, projectName = "NGBA_phaseI", truncateL = 250) setwd(paste(mainPath, taxaName, sep="")) trimmedReads <- readFAST("trimmed.fa") uniquesfa <- readFAST("uniques.fa") uniquematch <- uniquesfa$descs[match(trimmedReads$sequences, uniquesfa$sequences)] trimmedReads$descs <- gsub(">", "", trimmedReads$descs) seqData$uniqueSeq[match( trimmedReads$descs,seqData$ID)] <- gsub(">", "", uniquematch) OtuMatching <- read.table("sequence_match_to_Otus.txt") OtuDBmatch <- read.table("Otus_match_to_DB.txt", sep="\t") seqData$Otu[seqData$primer %is% taxaPrimer] <- as.character(OtuMatching$V2)[match(seqData$ID[seqData$primer %is% taxaPrimer], OtuMatching$V1)] ## invertebratesResults <- table(seqData$plot[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)], seqData$Otu[seqData$primer %is% taxaPrimer & !is.na(seqData$Otu)]) ##Name Otus using BLAST. Note that Usearch DOES NOT WORK for this, as heuristic (gives a match, but not BEST match). system(paste("cd ", mainPath, taxaName, sep="")) ##~/BLAST/bin/formatdb -i /Users/dickiei/BLAST/db/UNITE_public_31.01.2016clean.fasta -p F ##~/BLAST/bin/blastn -query otus.fa -db /Users/dickiei/BLAST/db/MetazoaCOIJuly2016clean.fasta -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching.txt OtusFasta <- readFAST("otus.fa") cores <- 15 blockstarts <- floor(seq(1, length(OtusFasta$descs), length.out=cores+1))[-(cores+1)] blockends <- c(blockstarts[-1]-1,length(OtusFasta$descs)) for(core.i in 1:cores) { writeFast(unlist(OtusFasta$sequences[blockstarts[core.i]:blockends[core.i]]),OtusFasta$descs[blockstarts[core.i]:blockends[core.i]], file=paste("tempFasta",core.i,".fasta", sep="")) } cl <- makeCluster(cores) registerDoParallel(cl, cores=cores) tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% { system(paste("~/BLAST/bin/blastn -query tempFasta",core.i, ".fasta -db ", dataBase , " -task blastn -dust no -outfmt 7 -num_alignments 1 -num_descriptions 1 -out OTU_Blast_matching", core.i, ".txt", sep="")) } temp <- c() for(core.i in 1:cores) { temp <- append(temp, readLines(paste("OTU_Blast_matching", core.i, ".txt", sep=""))) } matches <- grep("# 1 hits found", temp)+1 invertebratesOtus <- data.frame(matrix(unlist(strsplit(temp[matches], "\t")), byrow=TRUE, ncol=length(unlist(strsplit(temp[matches][1], "\t")))), stringsAsFactors=FALSE)[,1:4] names(invertebratesOtus) <- c("OTU", "OtuMatch", "identity", "length") invertebratesOtus$OtuMatch[as.numeric(invertebratesOtus$length) < 200] <- NA source("~/GoogleDrive/GenBankNames") temp <- readFAST(dataBase) temp$names <- unlist(lapply(strsplit(temp$descs, " "), function(x) paste(x[2],x[3]))) temp$ID <- unlist(lapply(strsplit(temp$descs, " "), function(x) x[1])) invertebratesOtus$nearestSpecies <- temp$names[match(invertebratesOtus$OtuMatch, temp$ID)] invertebratesOtus$nearestGenus <- unlist(sapply(strsplit(invertebratesOtus$nearestSpecies, " "), "[", 1)) taxonomyL <- lapply(unique(invertebratesOtus$nearestGenus), getTaxed) taxonomyDF <- data.frame(genus=unique(invertebratesOtus$nearestGenus), Kingdom = unlist(sapply(taxonomyL, function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="kingdom"])[1])), Phylum = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error",as.character(x$name)[x$rank=="phylum"])[1])), Class = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="class"])[1])), Order = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="order"])[1])), Family = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="family"])[1]))) invertebratesOtus$nearestKingdom <- as.character(taxonomyDF$Kingdom[match(invertebratesOtus$nearestGenus, taxonomyDF$genus)]) invertebratesOtus$nearestPhylum <- as.character(taxonomyDF$Phylum[match(invertebratesOtus$nearestGenus, taxonomyDF$genus)]) invertebratesOtus$nearestClass <- as.character(taxonomyDF$Class[match(invertebratesOtus$nearestGenus, taxonomyDF$genus)]) invertebratesOtus$nearestOrder <- as.character(taxonomyDF$Order[match(invertebratesOtus$nearestGenus, taxonomyDF$genus)]) invertebratesOtus$nearestFamily <- as.character(taxonomyDF$Family[match(invertebratesOtus$nearestGenus, taxonomyDF$genus)]) ##add missing species Spp.list <- unique(invertebratesOtus$nearestSpecies[is.na(invertebratesOtus$nearestKingdom) & !is.na(invertebratesOtus$nearestSpecies)]) taxonomyL <- lapply(Spp.list, getTaxed) taxonomyDF <- data.frame(genus=Spp.list, Kingdom = unlist(sapply(taxonomyL, function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="kingdom"])[1])), Phylum = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="phylum"])[1])), Class = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="class"])[1])), Order = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="order"])[1])), Family = unlist(sapply(taxonomyL , function(x) ifelse(is.na(x), "Name_lookup_error", as.character(x$name)[x$rank=="family"])[1])), stringsAsFactors=FALSE) invertebratesOtus$nearestKingdom[invertebratesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Kingdom[match(invertebratesOtus$nearestSpecies[invertebratesOtus$nearestSpecies %in% Spp.list], Spp.list)]) invertebratesOtus$nearestPhylum[invertebratesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Phylum[match(invertebratesOtus$nearestSpecies[invertebratesOtus$nearestSpecies %in% Spp.list], Spp.list)]) invertebratesOtus$nearestClass[invertebratesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Class[match(invertebratesOtus$nearestSpecies[invertebratesOtus$nearestSpecies %in% Spp.list], Spp.list)]) invertebratesOtus$nearestOrder[invertebratesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Order[match(invertebratesOtus$nearestSpecies[invertebratesOtus$nearestSpecies %in% Spp.list], Spp.list)]) invertebratesOtus$nearestFamily[invertebratesOtus$nearestSpecies %in% Spp.list] <- as.character(taxonomyDF$Family[match(invertebratesOtus$nearestSpecies[invertebratesOtus$nearestSpecies %in% Spp.list], Spp.list)]) save(file=paste(mainPath,"All_R_output_UPARSE_NGBA_PhaseI",sep=""), list=ls())