microsat <- ## This function reads in a flat file exported from GeneMapper (ABI), with alleles scored in GeneMapper. ## You need to change the heading names to "NAME", "ALLELE1", and "ALLELE2" for this to work, and the header names all have to be single words as this is written function(fileNames = NULL, missingData = -9) { if (identical(fileNames, NULL)) fileNames = choose.files() dataset = list(length(fileNames)) markersVector = character(length(fileNames)) namesVector = NULL for (i in 1:length(fileNames)) { infile = read.delim(fileNames[i]) N = length(infile$NAME) dataset[[i]] = data.frame(names = infile$NAME, allele1 = infile$ALLELE1, allele2 = infile$ALLELE2) for (j in 1:N) {if (is.na(dataset[[i]]$allele2[j])) dataset[[i]]$allele2[j] = dataset[[i]]$allele1[j] if (is.na(dataset[[i]]$allele1[j])) dataset[[i]]$allele1[j] = missingData if (is.na(dataset[[i]]$allele2[j])) dataset[[i]]$allele2[j] = missingData } markersVector[i] = as.vector(infile$MARKER[1]) namesVector = c(namesVector, as.vector(infile$NAME))} names(dataset) = markersVector return(dataset = dataset, namesVector = sort(unique(namesVector)), markersVector = markersVector) } microsatSummary <- function(microsatData) ## Generate summary data on a dataset exported from 'microsat' ## Arguments: ## microsatData = the ~$dataset object exported from microsat ## Values: ## sizes = list of unique alleles for each marker ## sizeMatrix = matrix of maxSize, minSize (excluding -9) for each marker { markerNames = names(microsatData) sizes = vector("list", length(markerNames)); names(sizes) = markerNames sizeMatrix = matrix(nrow = length(markerNames), ncol = 3, dimnames = list(markerNames, c("minSize", "maxSize", "unknowns"))) for (i in markerNames) { sizes[[i]] = sort(unique(c(microsatData[[i]]$allele1, microsatData[[i]]$allele2))) sizeMatrix[i, "minSize"] = min(sizes[[i]][sizes[[i]]!=-9]) sizeMatrix[i, "maxSize"] = max(sizes[[i]][sizes[[i]]!=-9])} results = list(sizes, sizeMatrix) return(results)} microsatDist = function(microsatIn, individuals = "NONE", metric = "meanIdentityInState", missingData = -9, verbose = F) { ## Calculates a pairwise distance matrix between individuals ## Arguments: ## microsatIn: an object of microsat type as exported from microsat ## Distances: ## meanIdentityInState: described in Lynch 1990, MBE 7: 478--484, p. 479 ## Currently assumes all datasets in microsatIn contain the same individuals with same names -- it should produce an error if not ## Andrew Hipp (ahipp@mortonarb.org), January 2008 locusDistance = function(microsatDataset, i, j) { ## Embedded function to calculate each locus pairwise difference ## Currently assumes all data present ## Works! 4 january 2007 row.names(microsatDataset) = microsatDataset$names individualI = unlist(microsatDataset[i, ][2:3]) individualJ = unlist(microsatDataset[j, ][2:3]) distance = length(intersect(individualI, individualJ)) / max(length(unique(individualI)), length(unique(individualJ))) # Following version did not work correctly for heterozygotes #distance = length(intersect(unlist(microsatDataset[i, ][2:3]), unlist(microsatDataset[j, ][2:3]))) / # length(unique(c(unlist(microsatDataset[i, ][2:3]), unlist(microsatDataset[j, ][2:3])))[unique(c(unlist(microsatDataset[i, ][2:3]), unlist(microsatDataset[j, ][2:3]))) != -9]) return(distance) } if(individuals == "NONE") individuals = select.list(microsatIn$namesVector, multiple = T, title = "Highlight individuals to include in distance matrix") N = length(individuals) microsatDistance = matrix(ncol = N, nrow = N, dimnames = list(individuals, individuals) ) for (i in individuals) { for (j in individuals) { tempDist = unlist(lapply(microsatIn$dataset, locusDistance, i = i, j = j)) microsatDistance[i, j] = mean(tempDist[tempDist != Inf]) if(verbose) message(paste("Completed individual",i,"X individual",j))}} microsatDistance = as.dist(microsatDistance) attr(microsatDistance, "Size") <- N attr(microsatDistance, "Labels") <- individuals attr(microsatDistance, "Diag") <- F attr(microsatDistance, "Upper") <- F attr(microsatDistance, "method") <- "microsatDist" attr(microsatDistance, "call") <- match.call() class(microsatDistance) <- "dist" return(microsatDistance) } microsatExport<- function(microsatIn, format = "structure", missingData = -9) { # A Hipp, 8 august 2007, 26 nov 07 numberOfIndividuals = length(microsatIn$namesVector) numberOfLoci = length(microsatIn$markersVector) ## STRUCTURE -- 8 august 2007 if (format == "structure") { namesVector = unlist(lapply(microsatIn$namesVector, rep, times = 2)) for (i in 1:length(namesVector)) {if (as.integer(i/2) == i/2) namesVector[i] = paste(namesVector[i], "_allele2", sep = "")} alleleMatrix = matrix(nrow = (numberOfIndividuals * 2), ncol = numberOfLoci, dimnames = list(namesVector, microsatIn$markersVector)) alleleMatrix[ , ] = missingData print(alleleMatrix) for (i in 1:length(microsatIn$dataset)) { for (j in 1:length(microsatIn$dataset[[i]]$names)) { alleleMatrix[as.character(microsatIn$dataset[[i]]$names[j]), microsatIn$markersVector[i]] = as.character(microsatIn$dataset[[i]]$allele1[j]) alleleMatrix[paste(as.character(microsatIn$dataset[[i]]$names[j]), "_allele2", sep = ""), microsatIn$markersVector[i]] = as.character(microsatIn$dataset[[i]]$allele2[j]) } } message("To save the formated structure file, copy the following and paste into the command line, changing the filename and variable as needed:") message("") message("write.table(___, '___.txt', quote = FALSE)") message("") message("Once saved, do a global replace for the string '_allele2', which should be replaced with a NULL.") message("Then the file should import into STRUCTURE without modification.") message("") message(paste("Number of individuals:",numberOfIndividuals)) message(paste("Number of loci:",numberOfLoci))} ## GENEPOP -- 26 nov 2007 if (format == "genepop") { alleleMatrix = matrix(nrow = (numberOfIndividuals), ncol = numberOfLoci, dimnames = list(microsatIn$namesVector, microsatIn$markersVector)) for (i in 1:length(microsatIn$dataset)) { for (j in 1:length(microsatIn$dataset[[i]]$names)) { alleleMatrix[as.character(microsatIn$dataset[[i]]$names[j]), microsatIn$markersVector[i]] = paste(as.character(microsatIn$dataset[[i]]$allele1[j]), as.character(microsatIn$dataset[[i]]$allele2[j]), sep = "") }} dimnames(alleleMatrix) = list(paste(microsatIn$namesVector, ","), paste(microsatIn$markersVector, ",")) message("To save the formated structure file, copy the following and paste into the command line, changing the filename and variable as needed:") message("") message("write.table(___, '___.txt', quote = FALSE)") message("") message("Once saved, add in the following lines:") message(" - A title") message(" - the word POP before each population") message("*** VERY IMPORTANT: a comma needs to be deleted after the last locus name")} ## hetTable -- 26 nov 2007 if (format == "hettable") { alleleMatrix = matrix(nrow = (numberOfIndividuals), ncol = numberOfLoci, dimnames = list(microsatIn$namesVector, microsatIn$markersVector)) for (i in 1:length(microsatIn$dataset)) { for (j in 1:length(microsatIn$dataset[[i]]$names)) { alleleMatrix[as.character(microsatIn$dataset[[i]]$names[j]), microsatIn$markersVector[i]] = as.character(microsatIn$dataset[[i]]$allele1[j]) == as.character(microsatIn$dataset[[i]]$allele2[j]) }} } ## ... return return(alleleMatrix) }