herbParse_v2<- ## DPUT output, 14sept08 11:48pm function (oracleIn, dbaseIn, edIn = NULL, usdaIn = NULL, file.names = F) { require(cba) if (file.names) { oracle <- read.csv(oracleIn, header = T) dbase = vector("list", length(dbaseIn)) for (i in seq(length(dbaseIn))) dbase[[i]] <- read.csv(dbaseIn, header = T) names(dbase) <- names(dbaseIn) ed <- read.csv(edIn, header = T) if (!identical(usdaIn, NULL)) usda <- read.csv(usdaIn, header = T) } else { if (class(dbaseIn) != "list") stop("dbaseIn must be a list of dbase tables, preferably named") oracle <- oracleIn dbase <- dbaseIn ed <- edIn usda <- usdaIn } for (i in seq(length(dbaseIn))) dbase[[i]]$SN_CONCAT_FULL <- paste(toupper(dbase[[i]]$GENUS), toupper(dbase[[i]]$HYBRID), toupper(dbase[[i]]$SPECIES), toupper(dbase[[i]]$AUTHOR1), toupper(dbase[[i]]$IFS), toupper(dbase[[i]]$TRINOM), toupper(dbase[[i]]$AUTHOR2), sep = " ") dbaseWorking <- dbase[[1]][, c("DIVISION", "CLASS", "FAMILY", "GENUS", "HYBRID", "SPECIES", "AUTHOR1", "IFS", "TRINOM", "AUTHOR2", "SN_CONCAT_FULL")] if (length(dbase) > 1) { for (i in 2:length(dbase)) dbaseWorking <- rbind(dbaseWorking, dbase[[i]][, c("DIVISION", "CLASS", "FAMILY", "GENUS", "HYBRID", "SPECIES", "AUTHOR1", "IFS", "TRINOM", "AUTHOR2", "SN_CONCAT_FULL")]) } oracle <- as.data.frame(apply(oracle, c(1, 2), function(x) gsub("(null)", "", x, fixed = T)), stringsAsFactors = F) names(usda) <- toupper(names(usda)) names(ed) <- toupper(names(ed)) G <- length(dbaseWorking$GENUS) dbaseWorking$SN_CONCAT <- paste(toupper(dbaseWorking$GENUS), toupper(dbaseWorking$HYBRID), toupper(dbaseWorking$SPECIES), toupper(dbaseWorking$TRINOM), sep = " ") dbaseWorking$SN_CONCAT_NOWHITE <- gsub(" ", "", dbaseWorking$SN_CONCAT) dbaseWorking$SN_CONCAT_FULL_NOWHITE <- gsub(" ", "", dbaseWorking$SN_CONCAT_FULL) dbaseWorking$SN_CONCAT_FULL_NOWHITE <- gsub("'", "", dbaseWorking$SN_CONCAT_FULL) oracleHybGENUS <- unlist(lapply(oracle$GEN_HYB_CD, function(x) { if (as.character(x) == "HY") "x" else "" })) oracleHybSPECIES <- unlist(lapply(oracle$SPEC_HYB_CD, function(x) { if (as.character(x) == "HY") "x" else "" })) oracleSSP <- unlist(lapply(oracle$SSP, function(x) { if (nchar(as.character(x)) > 0) "ssp." else "" })) oracleVAR <- unlist(lapply(oracle$VAR, function(x) { if (nchar(as.character(x)) > 0) "var." else "" })) oracleCULTIVAR <- unlist(lapply(oracle$CULTIVAR, function(x) { if (nchar(as.character(x)) > 0) "cv." else "" })) oracleFMA <- unlist(lapply(oracle$FMA, function(x) { if (nchar(as.character(x)) > 0) "f." else "" })) oracle$SN_CONCAT <- paste(toupper(oracleHybGENUS), toupper(oracle$GENUS), toupper(oracleHybSPECIES), toupper(oracle$SPECIES), toupper(oracle$SSP), toupper(oracle$VAR), toupper(oracle$FMA), toupper(oracleCULTIVAR), toupper(oracle$CULTIVAR), sep = " ") oracle$SN_CONCAT_NOWHITE <- gsub(" ", "", oracle$SN_CONCAT) oracle$SN_CONCAT_FULL <- paste(toupper(oracleHybGENUS), toupper(oracle$GENUS), toupper(oracleHybSPECIES), toupper(oracle$SPECIES), toupper(oracle$AUTHOR), toupper(oracleSSP), toupper(oracle$SSP), toupper(oracle$SSP_AUTHOR), toupper(oracleVAR), toupper(oracle$VAR), toupper(oracle$VAR_AUTHOR), toupper(oracleFMA), toupper(oracle$FMA), toupper(oracle$FMA_AUTHOR), toupper(oracleCULTIVAR), toupper(oracle$CULTIVAR), sep = " ") oracle$SN_CONCAT_FULL_NOWHITE <- gsub(" ", "", oracle$SN_CONCAT_FULL) usdaHybGENUS <- unlist(lapply(usda$XGENUS, function(x) { if (nchar(as.character(x)) > 0) "x" else "" })) usdaHybSPECIES <- unlist(lapply(usda$XSPECIES, function(x) { if (nchar(as.character(x)) > 0) "x" else "" })) usdaFORMA <- unlist(lapply(usda$FORMA, function(x) { if (nchar(as.character(x)) > 0) "f." else "" })) usdaVarAUTH <- as.character(usda$QUADRANOMIAL.AUTHOR) usdaSspAUTH <- character(length(usda$GENUS)) usdaFormaAUTH <- character(length(usda$GENUS)) for (i in 1:length(usda$GENUS)) { if (usdaVarAUTH[i] != "") usdaSspAUTH[i] <- as.character(usda$TRINOMIAL.AUTHOR[i]) else { if (as.character(usda$SUBSPECIES[i]) != "") usdaSspAUTH[i] <- as.character(usda$TRINOMIAL.AUTHOR[i]) if (as.character(usda$VARIETY[i]) != "") usdaVarAUTH[i] <- as.character(usda$TRINOMIAL.AUTHOR[i]) if (as.character(usda$FORMA[i]) != "") usdaFormaAUTH[i] <- as.character(usda$TRINOMIAL.AUTHOR[i]) } } usda$SN_CONCAT <- paste(toupper(usda$GENUS), toupper(usdaHybSPECIES), toupper(usda$SPECIES), toupper(usda$SUBSPECIES), toupper(usda$VARIETY), toupper(usda$FORMA), sep = " ") usda$SN_CONCAT_NOWHITE <- gsub(" ", "", usda$SN_CONCAT) usda$SN_CONCAT_FULL <- paste(toupper(usdaHybGENUS), toupper(usda$GENUS), toupper(usdaHybSPECIES), toupper(usda$SPECIES), toupper(usda$BINOMIAL.AUTHOR), toupper(usda$SSP), toupper(usda$SUBSPECIES), toupper(usdaSspAUTH), toupper(usda$VAR), toupper(usda$VARIETY), toupper(usdaVarAUTH), toupper(usdaFORMA), toupper(usda$FORMA), toupper(usdaFormaAUTH), sep = " ") edHybGENUS <- unlist(lapply(ed$GEN_DSG, function(x) { if (nchar(as.character(x)) > 0) "x" else "" })) edHybSPECIES <- unlist(lapply(ed$SP_DSG, function(x) { if (nchar(as.character(x)) > 0) "x" else "" })) edAUTHOR <- as.character(ed$AUTHOR) for (i in 1:length(edAUTHOR)) { if (nchar(as.character(ed$BASONYM[i])) > 0) edAUTHOR[i] <- paste("(", ed$BASONYM[i], ") ", edAUTHOR[i], sep = "") } ed$SN_CONCAT <- paste(toupper(ed$GENUS_NAME), toupper(edHybSPECIES), toupper(ed$SP_NAME), toupper(ed$SSP_NAME), toupper(ed$CV_NAME), sep = " ") ed$SN_CONCAT_NOWHITE <- gsub(" ", "", ed$SN_CONCAT) ed$SN_CONCAT_FULL <- paste(toupper(edHybGENUS), toupper(ed$GENUS_NAME), toupper(edHybSPECIES), toupper(ed$SP_NAME), toupper(ed$SSP_DSG), toupper(ed$SSP_NAME), toupper(ed$CV_NAME), toupper(edAUTHOR), sep = " ") ed$SN_CONCAT_FULL_NOWHITE <- gsub(" ", "", ed$SN_CONCAT_FULL) concatSort <- sort(unique(dbaseWorking$SN_CONCAT_FULL)) N = length(concatSort) message("Making matchlist") matchList <- match(concatSort, dbaseWorking$SN_CONCAT_FULL) message("Matchlist made") # save(matchList, file='matchList.Rdata') dbaseTaxa <- data.frame(CONCAT_FULL = concatSort, CONCAT_FULL_NOWHITE = dbaseWorking$SN_CONCAT_FULL_NOWHITE[matchList], CONCAT = dbaseWorking$SN_CONCAT[matchList], CONCAT_NOWHITE = dbaseWorking$SN_CONCAT_NOWHITE[matchList], summaryVerdict = character(N), GENUS = dbaseWorking$GENUS[matchList], SPECIES = dbaseWorking$SPECIES[matchList], AUTHOR1 = dbaseWorking$AUTHOR1[matchList], TRINOM = dbaseWorking$TRINOM[matchList], AUTHOR2 = dbaseWorking$AUTHOR2[matchList], generationDate = rep(date(), N)) dbaseTaxa$SN_CHECKED_DATE <- character(N) dbaseTaxa$SN_CHECKED_USER <- character(N) dbaseTaxa$WORK_NEEDED <- character(N) dbaseTaxa$BEST_ORACLE_MATCH <- character(N) dbaseTaxa$BEST_ED_MATCH <- character(N) dbaseTaxa$BEST_USDA_MATCH <- character(N) dbaseTaxa <- dbaseTaxa[-(nchar(gsub(" ","",(as.character(dbaseTaxa$CONCAT_FULL)))) == 0), ] # gets rid of blank rows N <- length(dbaseTaxa$CONCAT_FULL) # needed so that loops run the right length of time # save(dbaseTaxa,file='dbaseTaxaMidstream.Rdata') oracleGenera <- sort(toupper(unique(gsub(" ", "", oracle$GENUS)))) oracleEpithets <- sort(toupper(unique(c(gsub(" ", "", oracle$SPECIES), gsub(" ", "", oracle$SSP), gsub(" ", "", oracle$VAR), gsub(" ", "", oracle$FMA), gsub(" ", "", oracle$CULTIVAR))))) oracleAuthors <- sort(toupper(unique(c(gsub(" ", "", oracle$AUTHOR), gsub(" ", "", oracle$SSP_AUTHOR), gsub(" ", "", oracle$VAR_AUTHOR), gsub(" ", "", oracle$FMA_AUTHOR))))) for (i in seq(N)) { oracleMatch = match(dbaseTaxa$CONCAT_FULL_NOWHITE[i], oracle$SN_CONCAT_FULL_NOWHITE) if (!is.na(oracleMatch)) { dbaseTaxa$BEST_ORACLE_MATCH[i] <- oracle$SN_CONCAT_FULL[oracleMatch[1]] dbaseTaxa$WORK_NEEDED[i] <- "Oracle match" } else { oracleClose <- agrep(dbaseTaxa$CONCAT_NOWHITE[i], oracle$SN_CONCAT_NOWHITE, value = F, max.distance = 0.05) if (length(oracleClose) == 0) { dbaseTaxa$BEST_ORACLE_MATCH[i] <- "0" } else { oracleClosest <- oracleClose[sdists(as.character(dbaseTaxa$CONCAT_NOWHITE[i]), oracle$SN_CONCAT_FULL[oracleClose], weight = c(1, 1, 1)) == min(sdists(as.character(dbaseTaxa$CONCAT_NOWHITE[i]), oracle$SN_CONCAT_FULL[oracleClose], weight = c(1, 1, 1)))][1] dbaseTaxa$BEST_ORACLE_MATCH[i] <- oracle$SN_CONCAT_FULL[oracleClosest] dbaseTaxa$WORK_NEEDED[i] <- "Oracle close" } if (!identical(usda, NULL)) { usdaClose <- agrep(as.character(dbaseTaxa$CONCAT_NOWHITE[i]), usda$SN_CONCAT_NOWHITE, value = F, max.distance = 0.05) if (length(usdaClose) == 0) { dbaseTaxa$BEST_USDA_MATCH[i] <- "0" } else { usdaClosest <- usdaClose[sdists(as.character(dbaseTaxa$CONCAT_NOWHITE[i]), usda$SN_CONCAT_FULL[usdaClose], weight = c(1, 1, 1)) == min(sdists(as.character(dbaseTaxa$CONCAT_NOWHITE[i]), usda$SN_CONCAT_FULL[usdaClose], weight = c(1, 1, 1)))][1] dbaseTaxa$BEST_USDA_MATCH[i] <- usda$SN_CONCAT_FULL[usdaClosest] } } if (!identical(ed, NULL)) { edClose <- agrep(as.character(dbaseTaxa$CONCAT_NOWHITE[i]), ed$SN_CONCAT_NOWHITE, value = F, max.distance = 0.05) if (length(edClose) == 0) { dbaseTaxa$BEST_ED_MATCH[i] <- "0" } else { edClosest <- edClose[sdists(as.character(dbaseTaxa$CONCAT_NOWHITE[i]), ed$SN_CONCAT_FULL[edClose], weight = c(1, 1, 1)) == min(sdists(as.character(dbaseTaxa$CONCAT_NOWHITE[i]), ed$SN_CONCAT_FULL[edClose], weight = c(1, 1, 1)))][1] dbaseTaxa$BEST_ED_MATCH[i] <- ed$SN_CONCAT_FULL[edClosest] } } } message(paste("Matched name", i, "of", N)) if(i == 1) write.csv(dbaseTaxa[1,], file = "dbaseCheck.csv", append = F, row.names = F) else write.table(dbaseTaxa[i,], file = "dbaseRunning.csv", append = T, col.names = F, row.names = F, sep = ",") } for (i in seq(length(dbase))) write.csv(dbase[i], paste("dbaseFile", i, ".", Sys.Date(), ".csv", sep = "")) write.csv(dbaseTaxa, paste("dbaseTaxaAnalyzed.", Sys.Date(), ".csv", sep = "")) write.csv(usda, paste("usda.", Sys.Date(), ".csv", sep = "")) write.csv(ed, paste("ed.", Sys.Date(), ".csv", sep = "")) write.csv(oracle, paste("oracle.", Sys.Date(), ".csv", sep = "")) return(list(dbaseTaxa = dbaseTaxa, dbase = dbase, oracle = oracle, ed = ed, usda = usda)) }