Package 'hddplot'

Title: Use Known Groups in High-Dimensional Data to Derive Scores for Plots
Description: Cross-validated linear discriminant calculations determine the optimum number of features. Test and training scores from successive cross-validation steps determine, via a principal components calculation, a low-dimensional global space onto which test scores are projected, in order to plot them. Further functions are included that are intended for didactic use. The package implements, and extends, methods described in J.H. Maindonald and C.J. Burden (2005) <https://journal.austms.org.au/V46/CTAC2004/Main/home.html>.
Authors: John Maindonald
Maintainer: John Maindonald <[email protected]>
License: GPL (>=2)
Version: 0.59-1
Built: 2024-11-01 04:39:33 UTC
Source: https://github.com/jhmaindonald/hddplot

Help Index


Two subsets of data each take in turn the role of test set

Description

A division of the data into a training set and a test set is specified, for use in linear discriminant analysis. Feature selection and model fitting are performed first with subset I as training and subset II as test, then with II as training and I as test.

Usage

accTrainTest(x = matrix(rnorm(1000), ncol=20), cl = factor(rep(1:3,c(7,9,4))),
 traintest = divideUp(cl, nset=2), nfeatures = NULL, print.acc = FALSE,
 print.progress=TRUE)

Arguments

x

Matrix; rows are features, and columns are observations ('samples')

cl

Factor that classifies the columns into the groups that are used in the discriminant calculations

traintest

Values that specify a division of the observations into two groups. In the first pass (fold), one group is used for training and the other for testing; the roles are then reversed in the second pass.

nfeatures

integer: numbers of features for which calculations are required

print.acc

logical: should accuracies be printed?

print.progress

logical: should progress by feature number be printed?

Value

sub1.2

row numbers of features, in order of the values of the group separation measure, for the first subset (I) of the data

acc1.2

accuracies, with I as training set and II as test

sub2.1

row numbers of features, in order of the values of the group separation measure, for the second subset (II) of the data

acc2.1

accuracies, with II as training set and I as test

Author(s)

John Maindonald

Examples

mat <- matrix(rnorm(1000), ncol=20)
cl <- factor(rep(1:3, c(7,9,4)))
gp.id <- divideUp(cl, nset=2)
accTrainTest(x=mat, cl=cl, traintest=gp.id,
             nfeatures=1:16, print.acc=TRUE, print.progress=TRUE)
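## A possible follow-up (not part of the package examples): capture the
## returned list and plot the two accuracy curves against the number of
## features actually used, to compare the I/II and II/I splits.
acc <- accTrainTest(x=mat, cl=cl, traintest=gp.id,
                    nfeatures=1:16, print.acc=FALSE, print.progress=FALSE)
nf <- seq(along=acc$acc1.2)
plot(nf, acc$acc1.2, type="b", ylim=c(0,1),
     xlab="Number of features", ylab="Accuracy")
lines(nf, acc$acc2.1, type="b", col="red")
legend("bottomright", legend=c("I/II", "II/I"), col=c("black","red"), lty=1)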

## The function is currently defined as
function(x=matrix(rnorm(1000), ncol=20), cl = factor(rep(1:3, c(7,9,4))),
         traintest=divideUp(cl, nset=2), nfeatures=NULL, print.acc=FALSE,
         print.progress=TRUE){
    traintest <- factor(traintest)
    train <- traintest==levels(traintest)[1]
    testset <- traintest==levels(traintest)[2]
    cl1 <- cl[train]
    cl2 <- cl[testset]
    ng1 <- length(cl1)
    ng2 <- length(cl2)
    maxg <- max(c(ng1-length(unique(cl1))-2,
                  ng2-length(unique(cl2))-2))
    if(is.null(nfeatures)){
      max.features <- maxg
      nfeatures <- 1:max.features
    } else
    {
      if(max(nfeatures)>maxg)nfeatures <- nfeatures[nfeatures<=maxg]
      max.features <- max(nfeatures)
    }
    ord1 <- orderFeatures(x, cl, subset=train)[1:max.features]
    ord2 <- orderFeatures(x, cl, subset=testset)[1:max.features]
    ord <- unique(c(ord1, ord2))
    sub1 <- match(ord1, ord)
    sub2 <- match(ord2, ord)
    df1 <- data.frame(t(x[ord, train]))
    df2 <- data.frame(t(x[ord, testset]))
    acc1 <- acc2 <- numeric(max(nfeatures))
    for(i in nfeatures){
      if(print.progress)cat(paste(i, ":", sep=""))
      df1.lda <- lda(df1[, sub1[1:i], drop=FALSE], cl1)
      hat2 <- predict(df1.lda, newdata=df2[, sub1[1:i], drop=FALSE])$class
      tab <- table(hat2, cl2)
      acc1[i] <- sum(tab[row(tab)==col(tab)])/sum(tab)
      df2.lda <- lda(df2[, sub2[1:i], drop=FALSE], cl2)
      hat1 <- predict(df2.lda, newdata=df1[, sub2[1:i], drop=FALSE])$class
      tab <- table(hat1, cl1)
      acc2[i] <- sum(tab[row(tab)==col(tab)])/sum(tab)
    }
    cat("\n")
    if(print.acc){
      print(round(acc1,2))
      print(round(acc2,2))
    }
    maxacc1 <- max(acc1)
    maxacc2 <- max(acc2)
    sub1 <- match(maxacc1, acc1)
    sub2 <- match(maxacc2, acc2)
    nextacc1 <- max(acc1[acc1<1])
    nextacc2 <- max(acc2[acc2<1])
    lower1 <- maxacc1-sqrt(nextacc1*(1-nextacc1)/ng1)
    lower2 <- maxacc2-sqrt(nextacc2*(1-nextacc2)/ng2)
    lsub1 <- min(which(acc1>lower1))
    lsub2 <- min(which(acc2>lower2))
    lower <- c("Best accuracy, less 1SD  ",
               paste(paste(round(c(lower1, lower2),2), c(lsub1, lsub2),
                           sep=" ("), " features)   ", sep=""))
    best <- c("Best accuracy",
              paste(paste(round(c(maxacc1, maxacc2),2), c(sub1, sub2),
                          sep=" ("), " features)", sep=""))
    acc.df <- cbind(lower, best)
    dimnames(acc.df) <- list(c("Training/test split",
                               "I (training) / II (test)    ",
                               "II (training) / I (test)    "),c("",""))
    print(acc.df, quote=FALSE)
    invisible(list(sub1.2=ord1, acc1.2=acc1, sub2.1=ord2, acc2.1=acc2))
  }

calculate aov F-statistic for each row of a matrix

Description

Returns an aov F-statistic for each row of x

Usage

aovFbyrow(x=matrix(rnorm(1000), ncol=20), cl = factor(rep(1:3, c(7,9,4))))

Arguments

x

features by observations matrix

cl

factor that classifies the values in each row

Details

The main part of the calculation uses the functions qr() and qr.qty(), which handle the computations efficiently

Value

one F-statistic for each row of x

Author(s)

John Maindonald

See Also

See also orderFeatures

Examples

mat <- matrix(rnorm(1000), ncol=20)
cl <- factor(rep(1:3, c(7,9,4)))
Fstats <- aovFbyrow(x = mat, cl = cl)
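## A sketch check (not part of the package examples): the row-wise
## F-statistic should agree with the F-statistic from a one-way anova
## fitted directly to a single row.
row1.aov <- anova(lm(mat[1, ] ~ cl))
all.equal(unname(Fstats[1]), row1.aov[["F value"]][1])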

## The function is currently defined as
aovFbyrow <-
function(x=matrix(rnorm(1000), ncol=20),
         cl=factor(rep(1:3, c(7,9,4)))){
    y <- t(x)
    qr.obj <- qr(model.matrix(~cl))
    qty.obj <- qr.qty(qr.obj,y)
    tab <- table(factor(cl))
    dfb <- length(tab)-1
    dfw <- sum(tab)-dfb-1
    ms.between <- apply(qty.obj[2:(dfb+1), , drop=FALSE]^2, 2, sum)/dfb
    ms.within <- apply(qty.obj[-(1:(dfb+1)), , drop=FALSE]^2, 2, sum)/dfw
    Fstat <- ms.between/ms.within
    Fstat
  }

Cross-validated accuracy, in linear discriminant calculations

Description

Determine cross-validated accuracy, for each of a number of features in a specified range, with feature selection repeated at each step of the cross-validation.

Usage

cvdisc(x, cl, nfold = c(10,1), test = "f", nfeatures = 2, seed = 31,
       funda = lda, print.progress = TRUE, subset = NULL)

Arguments

x

Matrix; rows are features, and columns are observations ('samples')

cl

Factor that classifies columns into groups

nfold

Number of folds for the cross-validation. Optionally, a second number specifies the number of repeats of the cross-validation.

test

What statistic will be used to measure separation between groups? Currently "f" is the only possibility.

nfeatures

Specifies the different numbers of features (e.g., 1:10) that will be tried, to determine cross-validation accuracy in each instance

seed

This can be used to specify a starting value for the random number generator, in order to make calculations repeatable

funda

Function that will be used for discrimination. Currently lda is the only option

print.progress

Set to TRUE (default) for printing out, as calculations proceed, the number of the current fold

subset

Allows the use of a subset of the samples (observations)

Value

foldids

Each column gives, for one run of the cross-validation, numbers that identify the nfold distinct folds of the cross-validation

xUsed

The rows (features) of x that were used in at least one fold, returned as a data frame of observations by features

cl

Factor that classifies columns into groups

acc.cv

Cross-validated accuracy

genelist

Array: max(nfeatures) by number of folds by number of repeats, identifying the features chosen at each repeat of each fold. (For k < max(nfeatures) features, take the initial k rows.)

Fmatrix

Array, with the same dimensions as genelist, that gives the anova F-statistic when that feature is used on its own to separate groups

nfeatures

Specifies the different numbers of features that were tried, to determine cross-validation accuracy in each instance

Author(s)

John Maindonald

See Also

See also cvscores, scoreplot

Examples

## Use first 500 rows (expression values) of Golub, for demonstration.
data(Golub)
data(golubInfo)
attach(golubInfo)
miniG.BM <- Golub[1:500, BM.PB=="BM"]  # 1st 500 rows only
cancer.BM <- cancer[BM.PB=="BM"]
miniG.cv <- cvdisc(miniG.BM, cl=cancer.BM, nfeatures=1:10,
                    nfold=c(3,1))
## Plot cross-validated accuracy, as a function of number of features
plot(miniG.cv$acc.cv, type="l")
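## A possible addition (not part of the package examples): report the
## number of features that maximizes the cross-validated accuracy.
which.max(miniG.cv$acc.cv)
max(miniG.cv$acc.cv)
detach(golubInfo)   # golubInfo was attached above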


## The function is currently defined as
function(x, cl, nfold=NULL, test="f",
           nfeatures=2, seed=31, funda=lda, print.progress=TRUE,
           subset=NULL){
    ## If nfold is not specified, use leave-one-out CV
    if(is.null(nfold))nfold <- sum(!is.na(cl))
    ## Option to omit one or more points
    if(!is.null(subset)){cl[!is.na(cl)][!subset] <- NA
                         nfold[1] <- min(nfold[1], sum(!is.na(cl)))
                       }
    if(any(is.na(cl))){x <- x[,!is.na(cl)]
                       cl <- cl[!is.na(cl)]
                     }
    if(length(nfold)==1)nfold <- c(nfold,1)
    cl <- factor(cl)
    ngp <- length(levels(cl))
    genes <- rownames(x)
    nobs <- dim(x)[2]
    if(is.null(genes)){
      genes <- paste(1:dim(x)[1])
      print("Input rows (features) are not named. Names")
      print(paste(1,":", dim(x)[1], " will be assigned.", sep=""))
      rownames(x) <- genes
    }
    require(MASS)
    if(!is.null(seed))set.seed(seed)
    Fcut <- NULL
    maxgenes <- max(nfeatures)
    ## Cross-validation calculations
    if(nfold[1]==nobs)foldids <- matrix(sample(1:nfold[1]),ncol=1) else
    foldids <- sapply(1:nfold[2], function(x)
                     divideUp(cl, nset=nfold[1]))
    genelist <- array("", dim=c(nrow=maxgenes, ncol=nfold[1], nleaf=nfold[2]))
    Fmatrix <- array(0, dim=c(nrow=maxgenes, ncol=nfold[1], nleaf=nfold[2]))
    testscores <- NULL
    acc.cv <- numeric(maxgenes)
    if(print.progress)
      cat("\n", "Preliminary per fold calculations","\n")
    for(k in 1:nfold[2])
      {
        foldk <- foldids[,k]
    ufold <- sort(unique(foldk))
    for(i in ufold){
      if(print.progress) cat(paste(i,":",sep=""))
      trainset <- (1:nobs)[foldk!=i]
      cli <- factor(cl[trainset])

      stat <- aovFbyrow(x=x[, trainset], cl=cli)
      ordi <- order(-abs(stat))[1:maxgenes]
      genelist[,i, k] <- genes[ordi]
      Fmatrix[, i, k] <- stat[ordi]
    }
  }
    ulist <- unique(as.vector(genelist))
    df <- data.frame(t(x[ulist, , drop=FALSE]))
    names(df) <- ulist
#######################################################################
    if(print.progress)cat("\n", "Show each choice of number of features:","\n")
    for(ng in nfeatures){
      hat <- cl
      if(print.progress)cat(paste(ng,":",sep=""))
    for(k in 1:nfold[2])
      {
        foldk <- foldids[,k]
        ufold <- sort(unique(foldk))
      for(i in ufold){
        testset <- (1:nobs)[foldk==i]
        trainset <- (1:nobs)[foldk!=i]
        ntest <- length(testset)
        ntrain <- nobs-ntest
        genes.i <- genelist[1:ng, i, k]
        dfi <- df[-testset, genes.i, drop=FALSE]
        newdfi <- df[testset, genes.i, drop=FALSE]
        cli <- cl[-testset]
        xy.xda <- funda(cli~., data=dfi)
        subs <- match(colnames(dfi), rownames(df))
        newpred.xda <- predict(xy.xda, newdata=newdfi, method="debiased")
        hat[testset] <- newpred.xda$class
      }
        tabk <- table(hat,cl)
        if(k==1)tab <- tabk else tab <- tab+tabk
      }
      acc.cv[ng] <- sum(tab[row(tab)==col(tab)])/sum(tab)
    }
    cat("\n")
    if(length(nfeatures)>1&all(diff(nfeatures)==1)){
      nobs <- length(cl)
      ng1 <- length(acc.cv)
      maxacc1 <- max(acc.cv)
      sub1 <- match(maxacc1, acc.cv)
      nextacc1 <- max(acc.cv[acc.cv<1])
      lower1 <- maxacc1-sqrt(nextacc1*(1-nextacc1)/nobs)
      lsub1 <- min((1:ng1)[acc.cv>lower1])
      lower <- c("Best accuracy, less 1SD  ",
                 paste(paste(round(c(lower1),2), c(lsub1),
                             sep=" ("), " features)   ", sep=""))
      best <- c("Best accuracy",
                paste(paste(round(c(maxacc1),2), c(sub1),
                            sep=" ("), " features)", sep=""))
      acc.df <- cbind(lower, best)
      dimnames(acc.df) <- list(c("Accuracy",
                                 "(Cross-validation)"),c("",""))
      print(acc.df, quote=FALSE)
    }
    invisible(list(foldids=foldids, xUsed=df, cl=cl, acc.cv=acc.cv,
                   genelist=genelist, Fmatrix=Fmatrix, nfeatures=nfeatures))
  }

For high-dimensional data with known groups, derive scores for plotting

Description

This is designed to be used with the output from cvdisc. Test and training scores from successive cross-validation steps determine, via a principal components calculation, a low-dimensional global space onto which test scores are projected, in order to plot them.

Usage

cvscores(cvlist, nfeatures, ndisc = NULL, cl.other,
         x.other, keepcols = NULL, print.progress = TRUE)

Arguments

cvlist

Output object from cvdisc

nfeatures

Number of features to use

ndisc

Dimension of space in which scores will be formed, at most one less than the number of groups

cl.other

Classifies additional observations that are to be projected onto the same low-dimensional space

x.other

Matrix from which additional observations will be taken

keepcols

Number of sets of principal component scores to use in discriminant calculations and consequent evaluation of scores that will determine the low-dimensional global space

print.progress

Set to TRUE (default) for printing out, as calculations proceed, the number of the current fold

Value

scores

Scores that can be plotted

cl

Factor that was used to classify observations into groups

other

Other scores, if any, for plotting

cl.other

Factor that was used to classify the 'other' data into groups

nfeatures

Number of features used

Note

The methodology used here has developed beyond that described in Maindonald and Burden (2005)

Author(s)

John Maindonald

References

J. H. Maindonald and C. J. Burden, 2005. Selection bias in plots of microarray or other data that have been sampled from a high-dimensional space. In R. May and A.J. Roberts, eds., Proceedings of 12th Computational Techniques and Applications Conference CTAC-2004, volume 46, pp. C59–C74.

http://journal.austms.org.au/V46/CTAC2004/Main/home.html [March 15, 2005]

See Also

See also cvdisc, scoreplot

Examples

## Use first 500 rows (expression values) of Golub, for demonstration.
data(Golub)
data(golubInfo)
attach(golubInfo)
miniG.BM <- Golub[1:500, BM.PB=="BM"]  # 1st 500 rows only
cancer.BM <- cancer[BM.PB=="BM"]
miniG.cv <- cvdisc(miniG.BM, cl=cancer.BM, nfeatures=1:10,
                    nfold=c(3,1))
miniG.scores <- cvscores(cvlist=miniG.cv, nfeatures=4,
                         cl.other=NULL)
detach(golubInfo)
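## A minimal sketch (not part of the package examples): the returned
## scores can be plotted directly, or passed to scoreplot().
plot(miniG.scores$scores[, 1:2], col=unclass(miniG.scores$cl), pch=16,
     xlab="Discriminant function 1", ylab="Discriminant function 2")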

## The function is currently defined as
function(cvlist, nfeatures, ndisc=NULL, cl.other, x.other,
           keepcols=NULL, print.progress=TRUE
           ){
    library(MASS)
    foldids <- cvlist$foldids
    nfold <- c(length(unique(foldids)), dim(foldids)[2])

    ugenes <- unique(as.vector(cvlist$genelist[1:nfeatures, ,]))
    df <- cvlist$xUsed[, ugenes]
    cl <- cvlist$cl
    if(!length(cl)==dim(df)[1])
      stop(paste("length(cl) =", length(cl),"does not equal",
                 "dim(cvlist$df)[1] =", dim(df)[1]))
    levnames <- levels(cl)
    if(is.null(ndisc))ndisc <- length(levnames)-1
    ngp <- length(levnames)
    nobs <- dim(df)[1]
    allscores <- array(0, dim=c(nrow=nobs, ncol=ndisc*nfold[1], nleaf=nfold[2]))
    if(!is.null(cl.other)){
      cl.other <- factor(cl.other)
      if(is.null(dim(x.other)))stop("x.other must have dimension 2")
      if(!length(cl.other)==dim(x.other)[2])
        stop(paste("length(cl.other) =", length(cl.other),"does not equal",
                   "dim(x.other)[2] =", dim(x.other)[2]))
      df.other <- data.frame(t(x.other[ugenes, ,drop=FALSE]))
      colnames(df.other) <- ugenes
    }
    else other.scores <- NULL
    for(k in 1:nfold[2]){
      foldk <- foldids[,k]
      ufold <- sort(unique(foldk))
      j <- 0
      for(i in ufold){
        j <- j+1
        if(print.progress)cat(paste(if(j>1) ":" else "", i,sep=""))
        testi <- (1:nobs)[foldk==i]
        traini <- (1:nobs)[foldk!=i]
        ntest <- length(testi)
        ntrain <- nobs-ntest
        genes.i <- cvlist$genelist[1:nfeatures, i, k]
        dfi <- as.data.frame(df[-testi, genes.i, drop=FALSE])
        newdfi <- as.data.frame(df[testi, genes.i, drop=FALSE])
        cli <- cl[-testi]
        xy.xda <- lda(cli~., data=dfi)
        allscores[, ((i-1)*ndisc)+(1:ndisc), k] <-
          predict(xy.xda, newdata=df, dimen=ndisc)$x
      }
    }
    cat("\n")
    dim(allscores) <- c(nobs, ndisc*prod(nfold))
    if(is.null(keepcols))keepcols <- min(nfeatures, dim(allscores)[2])
    allscores.pcp <- data.frame(pcp(allscores, varscores=FALSE)$g[, 1:keepcols])
    globals <- predict(lda(cl ~ ., data=allscores.pcp))$x[,1:ndisc]
    fitscores <- array(0, dim=c(nrow=nobs, ncol=ndisc, nleaf=nfold[2]))
    for(k in 1:nfold[2]){
      foldk <- foldids[,k]
      ufold <- sort(unique(foldk))
##      ntimes.genes <- table(cvlist$genelist[1:nfeatures,,k])
      av <- colMeans(df)
      j <- 0
      for(i in ufold){
        j <- j+1
        cat(paste(if (j>1) ":" else "", i,sep=""))
        testi <- (1:nobs)[foldk==i]
        traini <- (1:nobs)[foldk!=i]
        genes.i <- cvlist$genelist[1:nfeatures, i, k]
        dfi <- data.frame(df[-testi, genes.i, drop=FALSE])
        newdfi <- data.frame(df[testi, genes.i, drop=FALSE])
        cli <- cl[-testi]
        traini.xda <- lda(cli~., data=dfi)
        scorei <- predict(traini.xda)$x[,1:ndisc]
        newpred.xda <- predict(traini.xda, newdata=newdfi)
        scorei.out <- newpred.xda$x[, 1:ndisc, drop=FALSE]
        scorei.all <- globals[-testi, 1:ndisc]
        avcol <- colMeans(scorei.all)
        scorei.all <- sweep(scorei.all, 2, avcol,"-")
        avi <- colMeans(scorei)
        scorei <- sweep(scorei, 2, avi,"-")
        trans <- qr.solve(scorei, scorei.all)
        scorei.out <- sweep(scorei.out, 2, avi, "-")
        fitscores[testi, , k] <- sweep(scorei.out%*%trans, 2, avcol, "+")
      }
    }
    fitscores <- apply(fitscores, 1:2, mean)

    if(!is.null(cl.other)){
      Fmatrix <- cvlist$Fmatrix
      ord <- order(Fmatrix)[1:nfeatures]
      rowcol <- cbind(as.vector(row(Fmatrix))[ord],as.vector(col(Fmatrix))[ord])
      ugenes <- unique(as.vector(cvlist$genelist[rowcol]))
      df <- cvlist$xUsed[, ugenes]
      xy.xda <- lda(cl~., data=df)
      train.scores <- predict(xy.xda, dimen=ndisc)$x
      other.scores <- predict(xy.xda, newdata=df.other,
                              dimen=ndisc)$x
      avcol <- colMeans(globals)
      all.scores <- sweep(globals, 2, avcol,"-")
      av.train <- colMeans(train.scores)
      train.scores <- sweep(train.scores, 2, av.train, "-")
      trans <- qr.solve(train.scores, all.scores)
      other.scores <- sweep(other.scores%*%trans, 2, avcol, "+")
    }
    if(print.progress)cat("\n")
    invisible(list(scores=fitscores, cl=cl, other=other.scores,
                   cl.other=cl.other, nfeatures=nfeatures))
  }

defective accuracy assessments from linear discriminant calculations

Description

Determine cross-validated accuracy, for each of a number of features in a specified range, in each case with a set of features that has been selected using the total data. The "accuracy" assessments are provided for comparative purposes only.

Usage

defectiveCVdisc(x, cl, nfold = NULL, FUN = aovFbyrow, nfeatures = 2, seed = 31,
         funda = lda, foldids = NULL, subset = NULL, print.progress = TRUE)

Arguments

x

Matrix; rows are features, and columns are observations ('samples')

cl

Factor that classifies columns into groups

nfold

Number of folds for the cross-validation. Optionally, a second number specifies the number of repeats of the cross-validation.

FUN

function used to calculate a measure, for each row, of separation into groups

nfeatures

Specifies the different numbers of features (e.g., 1:10) that will be tried, to determine cross-validation accuracy in each instance

seed

This can be used to specify a starting value for the random number generator, in order to make calculations repeatable

funda

Function that will be used for discrimination. Currently lda is the only option

foldids

Fold information, as output from cvdisc()

subset

Allows the use of a subset of the samples (observations)

print.progress

Set to TRUE (default) for printing out, as calculations proceed, the number of the current fold

Value

acc.resub

resubstitution measure of 'accuracy'

acc.sel1

'accuracy' from cross-validation, with the initially selected features

Author(s)

John Maindonald

See Also

cvdisc

Examples

mat <- matrix(rnorm(1000), ncol=20)
cl <- factor(rep(1:3, c(7,9,4)))
badaccs <- defectiveCVdisc(mat, cl, nfold=c(3,1), nfeatures=1:5)
## Note the list elements acc.resub and acc.sel1
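## A hedged comparison (not part of the package examples): set the
## defective assessments alongside properly cross-validated accuracies
## from cvdisc(); the folds will not in general be identical.
okaccs <- cvdisc(mat, cl, nfold=c(3,1), nfeatures=1:5)
rbind(resubstitution=badaccs$acc.resub,
      select.once.CV=badaccs$acc.sel1,
      full.CV=okaccs$acc.cv)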


## The function is currently defined as
function(x, cl, nfold=NULL, FUN=aovFbyrow,
           nfeatures=2, seed=31, funda=lda, foldids=NULL,
           subset=NULL, print.progress=TRUE){
    ## Option to omit one or more points
    if(!is.null(subset)) cl[!is.na(cl)][!subset] <- NA
    if(any(is.na(cl))){x <- x[,!is.na(cl)]
                       cl <- cl[!is.na(cl)]
                     }
    nobs <- dim(x)[2]
    ## Get fold information from foldids, if specified,
    ## else if nfold is not specified, use leave-one-out CV
    if(!is.null(foldids))
      nfold <- c(length(unique(foldids)), dim(foldids)[2])
    if(is.null(nfold)&is.null(foldids))nfold <- sum(!is.na(cl))
    else if(nfold[1]==nobs)foldids <- sample(1:nfold[1])
    else foldids <- sapply(1:nfold[2], function(x)
                     divideUp(cl, nset=nfold[1]))
    if(length(nfold)==1)nfold <- c(nfold,1)
    cl <- factor(cl)
    ngp <- length(levels(cl))
    genes <- rownames(x)
     if(is.null(genes)){
      genes <- paste(1:dim(x)[1])
      print("Input rows (features) are not named. Names")
      print(paste(1,":", dim(x)[1], " will be assigned.", sep=""))
      rownames(x) <- genes
    }
    require(MASS)
    if(!is.null(seed))set.seed(seed)
    Fcut <- NULL
    maxgenes <- max(nfeatures)

    stat <- FUN(x=x, cl)
    Fcut <- list(F=sort(stat, decreasing=TRUE)[nfeatures],
                 df=c(ngp-1, nobs-ngp))
    ord <- order(-abs(stat))[1:maxgenes]
    genes.ord <- genes[ord]
    selectonce.df <- data.frame(t(x[ord, , drop=FALSE]))
    acc.resub <- acc.sel1 <- numeric(maxgenes)
    if(nfold[1]==0)acc.sel1 <- NULL

    for(ng in nfeatures){
      resub.xda <- funda(cl~., data=selectonce.df[,1:ng,drop=FALSE])
      hat.rsb <- predict(resub.xda)$class
      tab.rsb <- table(hat.rsb, cl)
      acc.resub[ng] <- sum(tab.rsb[row(tab.rsb)==col(tab.rsb)])/sum(tab.rsb)
      if(nfold[1]==0)next
      if(nfold[1]==nobs){
        hat.sel1 <- funda(cl~., data=selectonce.df[,1:ng,drop=FALSE],
                          CV=TRUE)$class
        tab.one <- table(hat.sel1, cl)
        acc.sel1[ng] <- sum(tab.one[row(tab.one)==col(tab.one)])/sum(tab.one)
      } else
      {
      hat <- cl
      if(print.progress)cat(paste(ng,":",sep=""))
      for(k in 1:nfold[2])
      {
        foldk <- foldids[,k]
        ufold <- sort(unique(foldk))
        for(i in ufold){
          testset <- (1:nobs)[foldk==i]
          trainset <- (1:nobs)[foldk!=i]
          dfi <- selectonce.df[-testset, 1:ng, drop=FALSE]
          newdfi <- selectonce.df[testset, 1:ng, drop=FALSE]
          cli <- cl[-testset]
          xy.xda <- funda(cli~., data=dfi)
          subs <- match(colnames(dfi), rownames(df))
          newpred.xda <- predict(xy.xda, newdata=newdfi, method="debiased")
          hat[testset] <- newpred.xda$class
        }
        tabk <- table(hat,cl)
        if(k==1)tab <- tabk else tab <- tab+tabk
      }
      acc.sel1[ng] <- sum(tab[row(tab)==col(tab)])/sum(tab)
      }
    }
    if(print.progress)cat("\n")
    invisible(list(acc.resub=acc.resub, acc.sel1=acc.sel1, genes=genes.ord))
  }

Partition data into multiple nearly equal subsets

Description

Randomly partition data into nearly equal subsets. If balanced=TRUE, the subsets are required to be, as far as possible, balanced with respect to a classifying factor. The multiple sets are suitable for use in determining the folds in a cross-validation.

Usage

divideUp(cl, nset = 2, seed = NULL, balanced=TRUE)

Arguments

cl

classifying factor

nset

number of subsets into which to partition data

seed

set the seed, if required, in order to obtain reproducible results

balanced

logical: should subsets be as far as possible balanced with respect to the classifying factor?

Value

a set of indices that identify the nset subsets

Author(s)

John Maindonald

Examples

foldid <- divideUp(cl=rep(1:3, c(17,14,8)), nset=10)
table(rep(1:3, c(17,14,8)), foldid)
foldid <- divideUp(cl=rep(1:3, c(17,14,8)), nset=10,
       	    balanced=FALSE)
table(rep(1:3, c(17,14,8)), foldid)
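## A small sketch (not part of the package examples): repeated calls give
## the fold-by-repeat layout that cvdisc() uses internally.
gps <- rep(1:3, c(17,14,8))
foldids <- sapply(1:4, function(i) divideUp(gps, nset=10))
dim(foldids)   # one column per repeat of the 10-fold division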


## The function is currently defined as
function(cl = rep(1:3, c(7, 4, 8)), nset=2, seed=NULL, balanced=TRUE){
    if(!is.null(seed))set.seed(seed)
    if(balanced){
      ord <- order(cl)
      ordcl <- cl[ord]
      gp0 <- rep(sample(1:nset), length.out=length(cl))
      gp <- unlist(lapply(split(gp0, ordcl), function(x)sample(x)))
      gp[ord] <- gp
    } else
    gp <- sample(rep(1:nset, length.out=length(cl)))
    as.vector(gp)
  }

Golub data (7129 rows by 72 columns), after normalization

Description

This is a normalized version of the Golub leukemia data from the golubEsets package, available from:

http://www.bioconductor.org/download/experiments/

Usage

data(Golub)

Format

Numeric matrix: 7129 rows by 72 columns.

Details

Data have been normalized and are supplied, here, as a matrix.

Source

See the help page for the dataset golubMerge, in the golubEsets package, for details of the source of the original data.

References

T. R. Golub, D. K. Slonim, P. Tamayo, C. Huard, M. Gaasenbeek, J. P. Mesirov, H. Coller, M. L. Loh, J. R. Downing, M. A. Caligiuri, C. D. Bloomfield and E. S. Lander (1999). Molecular Classification of Cancer: Class Discovery and Class Prediction by Gene Expression Monitoring. Science, 531-537.

Examples

data(Golub)
## Select 20 rows from the data; show boxplots of variation across chips
boxplot(data.frame(t(Golub[sample(1:7129, 20), ])))

Classifying factors for the 72 columns of the Golub data set

Description

Details are given of the classifying factors for the 72 columns of the Golub data set.

Usage

data(golubInfo)

Format

A data frame with 72 observations on the following 6 variables, that identifies the samples (observations) in the data set Golub

Samples

a numeric vector: sample number

BM.PB

a factor with levels BM (from bone marrow) PB (from peripheral blood)

Gender

a factor with levels F M

Source

a factor with levels CALGB CCG DFCI St-Jude. These are the hospitals from which the samples came

tissue.mf

a factor with levels BM:NA BM:f BM:m PB:NA PB:f PB:m. This factor identifies the several combinations of tissue type (BM.PB) and Gender

cancer

a factor with levels allB allT aml. There are two types of Acute Lymphoblastic Leukemia (allB and allT), plus Acute Myeloid Leukemia (aml)

Source

See the help page for the dataset golubMerge, in the golubEsets package, for details of the source of the original data.

References

T. R. Golub, D. K. Slonim, P. Tamayo, C. Huard, M. Gaasenbeek, J. P. Mesirov, H. Coller, M. L. Loh, J. R. Downing, M. A. Caligiuri, C. D. Bloomfield and E. S. Lander (1999). Molecular Classification of Cancer: Class Discovery and Class Prediction by Gene Expression Monitoring. Science, 531-537.

Examples

data(golubInfo)
str(golubInfo)

Order features, based on their ability to discriminate

Description

For each row of data, an F or (potentially) other statistic is calculated, using the function FUN, that measures the extent to which this variable separates the data into groups. This statistic is then used to order the rows.

Usage

orderFeatures(x, cl, subset = NULL, FUN = aovFbyrow, values = FALSE)

Arguments

x

Matrix; rows are features, and columns are observations ('samples')

cl

Factor that classifies columns into groups

subset

allows specification of a subset of the columns of data

FUN

specifies the function used to measure separation between groups

values

if TRUE, F-values as well as the ordering are returned

Value

Either (values=FALSE) a vector that orders the rows, or (values=TRUE) a list with elements:

ord

a vector that orders the rows

stat

ordered values of the statistic

Author(s)

John Maindonald

Examples

mat <- matrix(rnorm(1000), ncol=20)
cl <- factor(rep(1:3, c(7,9,4)))
ord <- orderFeatures(mat, cl)
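## A small additional illustration: with values=TRUE the ordering is
## returned together with the ordered statistics.
ordvals <- orderFeatures(mat, cl, values=TRUE)
head(ordvals$ord)
head(ordvals$stat)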

## The function is currently defined as
function(x, cl, subset=NULL, FUN=aovFbyrow, values=FALSE){
    if(dim(x)[2]!=length(cl))stop(paste("Dimension 2 of x (=",
                  dim(x)[2], ") differs from the length of cl (=",
                  length(cl), ")"))
    ## Ensure that cl is a factor & has no redundant levels
    if(is.null(subset))
      cl <- factor(cl)
    else
      cl <- factor(cl[subset])
    if(is.null(subset))
      stat <- FUN(x, cl)
    else
      stat <- FUN(x[, subset], cl)
    ord <- order(-abs(stat))
    if(!values)ord else(list(ord=ord, stat=stat[ord]))
  }

convenience version of the singular value decomposition

Description

Packages the results of an SVD of a matrix that may have either a cases by variables (features) layout or a variables by cases layout, for use in principal components and related calculations

Usage

pcp(x = datasets::USArrests, varscores = TRUE, cases = "rows", center = "vars",
    standardize = FALSE, scale.cases = 1, log = FALSE, sc = 1, reflect = c(1, 1))

Arguments

x

matrix on which SVD is to be performed

varscores

logical: should variable scores be returned?

cases

specify either "rows" or "columns"

center

character: if set to "vars" (the default), values of variables will be centered

standardize

logical: should values of variables be standardized to zero mean and unit standard deviation? Takes precedence over the setting of center

scale.cases

set to a value in [0,1]. scale.cases=0 gives a pure rotation of the variables; scale.cases=1 weights according to the singular values

log

logical: should logarithms be taken, prior to the calculation?

sc

the variable scores are divided by sqrt(sc - 1). By default, sc = number of cases

reflect

a vector of two elements, by default c(1,1). Use of -1 in one or both positions can be useful in reconciling results with output from other software

Value

g

case scores

rotation

variable scores

av

variable means

sdev

singular values, divided by the square root of one less than the number of cases

Author(s)

John Maindonald

See Also

La.svd

Examples

USArrests.svd <- pcp(x = datasets::USArrests)
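## A sketch comparison (not part of the package examples): with the default
## settings the case scores should match the principal component scores
## from prcomp(), possibly up to sign (see the reflect argument).
pr <- prcomp(datasets::USArrests)
range(abs(USArrests.svd$g[, 1:4]) - abs(pr$x[, 1:4]))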

## The function is currently defined as
function(x=datasets::USArrests,
           varscores=TRUE,
           cases="rows",
           center="vars",
           standardize=FALSE,
           scale.cases=1,
           log=FALSE,
           sc=1,
           reflect=c(1,1))
{
  x <- as.matrix(x)
  avv <- 0
  sdv <- 1
  casedim <- 2-as.logical(cases=="rows")
  vardim <- 3-casedim
  ## casedim=1 if rows are cases; otherwise casedim=2
  ## scale.cases=0 gives a pure rotation of the variables
  ## scale.cases=1 weights a/c the singular values
  ncases <- dim(x)[casedim]
  nvar <- dim(x)[vardim]
  if(is.null(sc))sc <- dim(x)[casedim]-1
  if(log)x <- log(x, base=2)
  if(standardize){
    avv <- apply(x, vardim, mean)
    sdv <- apply(x, vardim, sd)
    x <- sweep(x, vardim, avv,"-")
    x <- sweep(x, vardim, sdv,"/")
  }
  else if(as.logical(match("vars", center, nomatch=0))){
    avv <- apply(x,vardim, mean)
    x <- sweep(x, vardim, avv,"-")}

  svdx <- La.svd(x, method = c("dgesdd"))
  h <- NULL
  if(cases=="rows"){
    g <- sweep(svdx$u, 2, svdx$d^scale.cases, "*")*sqrt(sc)
    if(varscores)
      h <- t((svdx$d^(1-scale.cases)* svdx$vt ))/sqrt(sc)
  }
  else if(cases=="columns"){
    g <- sweep(t(svdx$vt), 2, svdx$d^scale.cases, "*")*sqrt(sc)
    if(varscores)
      h <- sweep(svdx$u, 2, svdx$d^(1-scale.cases),"*")/sqrt(sc)
  }
  invisible(list(g=g, rotation=h, av=avv, sdev=svdx$d/sqrt(ncases-1)))
  }

Plot predictions for both a I/II train/test split, and the reverse

Description

A division of the data into a training set and a test set is specified, for use in linear discriminant analysis. Feature selection and model fitting are performed first with subset I as training and subset II as test, then with II as training and I as test. Two graphs are plotted: one for the I (training) / II (test) scores, and one for the II/I scores.

Usage

plotTrainTest(x, nfeatures, cl, traintest,
              titles = c("A: I/II (train with I, scores are for II)",
                         "B: II/I (train with II, scores are for I)"))

Arguments

x

Matrix; rows are features, and columns are observations ('samples')

nfeatures

integer, of length 1 or 2: the number of features to use for each of the two fits (a single value is used for both)

cl

Factor that classifies the columns into the groups that are used in the discriminant calculations

traintest

Values that specify a division of the observations into two groups. In the first pass (fold), one group is used for training and the other for testing; the roles are then reversed in the second pass.

titles

A character vector of length 2 giving titles for the two graphs

Value

Two graphs are plotted.

Author(s)

John Maindonald

Examples

mat <- matrix(rnorm(1000), ncol=20)
cl <- factor(rep(1:3, c(7,9,4)))
gp.id <- divideUp(cl, nset=2)
plotTrainTest(x=mat, cl=cl, traintest=gp.id, nfeatures=c(2,3))
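## A single value supplied for nfeatures is (on the evidence of the
## function definition below) used for both panels.
plotTrainTest(x=mat, cl=cl, traintest=gp.id, nfeatures=3)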



## The function is currently defined as
function(x, nfeatures, cl, traintest,
           titles=c("A: I/II (train with I, scores are for II)",
             "B: II/I (train with II, scores are for I)")){
    oldpar <- par(mfrow=c(1,2), pty="s")
    on.exit(par(oldpar))
    if(length(nfeatures)==1)nfeatures <- rep(nfeatures,2)
    traintest <- factor(traintest)
    train <- traintest==levels(traintest)[1]
    testset <- traintest==levels(traintest)[2]
    cl1 <- cl[train]
    cl2 <- cl[testset]
    nf1 <- nfeatures[1]
    ord1 <- orderFeatures(x, cl, subset=train)
    df1 <- data.frame(t(x[ord1[1:nf1], train]))
    df2 <- data.frame(t(x[ord1[1:nf1], testset]))
    df1.lda <- lda(df1, cl1)
    scores <- predict(df1.lda, newdata=df2)$x
    scoreplot(scorelist=list(scores=scores, cl=cl2,
             nfeatures=nfeatures[1], other=NULL, cl.other=NULL),
           prefix.title="")
    mtext(side=3, line=2, titles[1], adj=0)
    nf2 <- nfeatures[2]
    ord2 <- orderFeatures(x, cl, subset=testset)
    df2 <- data.frame(t(x[ord2[1:nf2], testset]))
    df1 <- data.frame(t(x[ord2[1:nf2], train]))
    df2.lda <- lda(df2, cl2)
    scores <- predict(df2.lda, newdata=df1)$x
    scoreplot(scorelist=list(scores=scores, cl=cl1,
             nfeatures=nfeatures[2], other=NULL, cl.other=NULL),
           prefix.title="")
    mtext(side=3, line=2, titles[2], adj=0)
  }

a version of qqplot() that thins out points that overplot

Description

QQ-plots with large numbers of points typically generate graphics files that are unhelpfully large. This function handles the problem by removing points that are, for all practical purposes, redundant

Usage

qqthin(x, y, ends = c(0.01, 0.99), eps = 0.001, xlab = deparse(substitute(x)),
       adj.xlab = NULL, ylab = deparse(substitute(y)), show.line = TRUE,
       print.thinning.details=TRUE, centerline = TRUE, ...)

Arguments

x

ordered values of x will be plotted on the x-axis

y

ordered values of y will be plotted on the y-axis

ends

outside these cumulative proportions of numbers of points, all points will be included in the graph

eps

controls the extent of overplotting

xlab

label for x-axis

adj.xlab

positioning of x-label

ylab

label for y-axis

show.line

logical; show the line y=x?

print.thinning.details

logical; print number of points after thinning?

centerline

logical; draw a line through the part of the graph where some points have been omitted?

...

additional graphics parameters

Value

Gives a qqplot. The number of points retained is returned invisibly.

Author(s)

John Maindonald

Examples

mat <- matrix(rnorm(1000), ncol=20)
cl <- factor(rep(1:3, c(7,9,4)))
Fstats <- aovFbyrow(x = mat, cl = cl)
qqthin(qf(ppoints(length(Fstats)), 2, 17), Fstats, eps=0.01)
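## The number of points retained is returned invisibly, so it can be
## captured (a small additional illustration):
nkept <- qqthin(qf(ppoints(length(Fstats)), 2, 17), Fstats, eps=0.01,
                print.thinning.details=FALSE)
nkept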


## The function is currently defined as
function(x, y, ends=c(.01,.99), eps=0.001,
           xlab = deparse(substitute(x)), adj.xlab=NULL,
           ylab = deparse(substitute(y)), show.line=TRUE,
           print.thinning.details=TRUE,
           centerline=TRUE, ...){
    ## qqthin() is a substitute for qqplot(), that thins
    ## out plotted points from the region where they are
    ## dense.  Apart from the overlaid curve that shows
    ## the region where points have been thinned, it may
    ## be hard to distinguish the result of qqthin()
    ## from that of qqplot()
    xlab <- xlab
    ylab <- ylab
    x <- sort(x)
    y <- sort(y)
    dx<-diff(x)
    epsdist <- sqrt(diff(range(x))^2+diff(range(y))^2)*eps
    dx<-0.5*(c(dx[1],dx)+c(dx,dx[length(dx)]))
    dy<-diff(y)
    dy<-0.5*(c(dy[1],dy)+c(dy,dy[length(dy)]))
    dpoints <- epsdist/sqrt(dx^2+dy^2)
    ## dpoints is a local measure of the number of points
    ## per unit distance along the diagonal, with the unit
    ## set to approximately eps*(length of diagonal)
    dig<-floor(dpoints)+1
    ## dig is, roughly, the number of points per unit distance.
    ## We wish to retain one point per unit distance.  For this
    ## retain points where cdig rounds to an integer. For such
    ## points, cdig has increased by approx 1, relative to the
    ## previous point that is retained.
    cdig<-round(cumsum(1/dig))
    subs<-match(unique(cdig), cdig)
    if(is.null(adj.xlab))
    plot(x[subs], y[subs], xlab=xlab, ylab=ylab)
    else {
      plot(x[subs], y[subs], xlab="", ylab=ylab)
      mtext(side=1, xlab, adj=adj.xlab, line=par()$mgp[1])
    }
    if(any(diff(subs)>1)){
    n1 <- min(subs[c(diff(subs),0)>1])
    n2 <- max(subs[c(0,diff(subs))>1])
    ns1 <- match(n1, subs)
    ns2 <- match(n2, subs)
    if(print.thinning.details)
       print(paste("Graph retains", length(subs), "points."))
    if(centerline)
      lines(smooth.spline(x[subs[ns1:ns2]], y[subs[ns1:ns2]]),
            col="grey", lwd=2)
	    }
    if(show.line)abline(0, 1, col="red")
invisible(length(subs))
  }

Plot discriminant function scores, with various identification

Description

There is provision for the plotting of two sets of scores on the same graph, possibly with different classifying factors. The function is designed for use with output from cvscores() or from simulateScores().

Usage

scoreplot(scorelist, plot.disc = 1:2, xlab = NULL, ylab = NULL, params = NULL,
          circle = NULL, cl.circle = NULL, circle.pos = c(1, 1), adj.circle = 1,
          adj.title = 0.5, join.legends = TRUE, prefix.title = "", cex.title = 1,
          ratio = 1, plot.folds = FALSE, ...)

Arguments

scorelist

list, with elements scores (a matrix of scores), cl (a classifying factor), other (optional, a further set of scores), cl.other (optional, a classifying factor for other) and nfeatures (optional, used to label the graph)

plot.disc

choice of the two columns (discriminant functions) of the scores to plot

xlab

label for x-axis

ylab

label for y-axis

params

List, with optional elements (lists) points, other, circle and legend. Allowed list elements for points and other are cex, lwd, pch and col. For circle they are cex, lwd and col. For legend, they are cex and cex.other

circle

identifies points that are to be circled

cl.circle

different colors may be used for different points, according to levels of cl.circle

circle.pos

This is a vector of length 2, that specifies where to place the legend information for the circling of points. Possibilities are c(0,0) (left, below), c(1,1) (right, above), etc.

adj.circle

controls positioning of circle legend

adj.title

controls positioning of title

join.legends

logical; should legends for points and other be combined?

prefix.title

prefix, to place before title

cex.title

cex for title

ratio

y-scale to x-scale ratio for graph

plot.folds

Plot individual fold information, comparing projected training scores with their projections onto the global space. This is not at present implemented

...

Other parameters to be passed to eqscplot()

Value

A graph is plotted.

Author(s)

John Maindonald

See Also

See also cvdisc, cvscores

Examples

## Use first 500 rows (expression values) of Golub, for demonstration.
data(Golub)
data(golubInfo)
attach(golubInfo)
miniG.BM <- Golub[1:500, BM.PB=="BM"]  # 1st 500 rows only
cancer.BM <- cancer[BM.PB=="BM"]
miniG.cv <- cvdisc(miniG.BM, cl=cancer.BM, nfeatures=1:10,
                    nfold=c(3,1))
miniG.scores <- cvscores(cvlist=miniG.cv, nfeatures=4,
                         cl.other=NULL)
subsetB <- (cancer=="allB") & (tissue.mf %in% c("BM:f","BM:m","PB:m"))
tissue.mfB <- tissue.mf[subsetB, drop=TRUE]
scoreplot(scorelist=miniG.scores, cl.circle=tissue.mfB,
       circle=tissue.mfB%in%c("BM:f","BM:m"),
       params=list(circle=list(col=c("cyan","gray"))),
       prefix="BM samples -")
detach(golubInfo)

## The function is currently defined as
  function(scorelist, plot.disc=1:2,
           xlab=NULL, ylab=NULL, params=NULL,
           circle=NULL, cl.circle=NULL, circle.pos=c(1,1),
           adj.circle=1,
           adj.title=0.5, join.legends=T, prefix.title="Golub data - ",
           cex.title=1.0, ratio=1, plot.folds=FALSE, ...){
    library(MASS)
    combine.params <-
      function(params=list(circle=list(col=c("cyan","gray")))){
        default.params=list(points=list(cex=1, lwd=1.25, pch=1:8, col=1:8),
          other=list(cex=0.65, lwd=1.25, pch=13:9, col=c(6:8,5:1)),
          circle=list(cex=2, lwd=1, pch=1.75, col="gray40"),
          legend=list(cex=1, cex.other=1))
        nam <- names(params)
        if(!is.null(nam))
          for(a in nam){
            nam2 <- names(params[[a]])
            for(b in nam2)default.params[[a]][[b]] <- params[[a]][[b]]
          }
        default.params
      }
    params <- combine.params(params=params)
    cl <- scorelist$cl
    cl.other <- scorelist$cl.other
    if(!is.null(cl.other)) cl.other <- factor(cl.other)
    nfeatures <- scorelist$nfeatures
    if(length(plot.disc)==2){
      n1 <- plot.disc[1]
      n2 <- plot.disc[2]
      if(is.null(xlab))xlab <- paste("Discriminant function", n1)
      if(is.null(ylab))ylab <- paste("Discriminant function", n2)
    } else stop("plot.disc must be a vector of length 2")
    if(!is.factor(cl))cl <- factor(cl)
    levnames <- levels(cl)
    fitscores <- scorelist$scores
    other.scores <- scorelist$other
    ngp <- length(levnames)
    n1lim <- range(fitscores[,n1])
    n2lim <- range(fitscores[,n2])
    if(!is.null(cl.other)){
      n1lim <- range(c(n1lim, other.scores[,n1]))
      n2lim <- range(c(n2lim, other.scores[,n2]))
      levnum <- unclass(cl.other)
      levnames.other <- levels(cl.other)
      intlev.other <- unclass(cl.other)
      ngp.other <- length(levels(cl.other))
    }
    n1 <- plot.disc[1]; n2 <- plot.disc[2]
    intlev <- unclass(cl)
    oldpar <- par(lwd=1)
    on.exit(par(oldpar))
    eqscplot(n1lim, n2lim, type="n",
             xlab=xlab, ylab=ylab, ratio=ratio, ...)
    with(params$points,
         points(fitscores[,n1], fitscores[,n2], col=col[intlev],
                pch=pch[intlev], cex=cex, lwd=lwd))
    if(!is.null(cl.other))
      with(params$other,
           points(other.scores[,n1], other.scores[,n2],
                  pch=pch[intlev.other],
                  col=col[intlev.other],
                  cex=cex, lwd=lwd))
    if(!is.null(cl.circle)){
      cl.circle <- factor(cl.circle[circle])
      lev.circle <- levels(cl.circle)
      with(params$circle,
           points(fitscores[circle, n1], fitscores[circle,n2], pch=pch,
                  cex=cex, col=col[unclass(cl.circle)], lwd=lwd))
    }
    par(xpd=TRUE)
    chw <- par()$cxy[1]
    chh <- par()$cxy[2]
    par(lwd=1.5)
    ypos <- par()$usr[4]
    xmid <- mean(par()$usr[1:2])
    top.pos <- 0
    mtext(side=3, line=(top.pos+1), paste(prefix.title,
            nfeatures, "features"), cex=cex.title, adj=adj.title)
    ypos.legend <- ypos+(top.pos-0.45)*chh*0.8

    if(join.legends&!is.null(cl.other)){
      leg.info <- legend(xmid, ypos.legend, xjust=0.5, yjust=0, plot=FALSE,
                         x.intersp=0.5, ncol=ngp, legend=levnames,
                         pt.lwd=params$points$lwd,
                         pt.cex=params$points$cex,
                         cex=params$legend$cex,
                         pch=params$points$pch)
      legother.info <- legend(xmid, ypos.legend, xjust=0.5, yjust=0,
                              plot=FALSE, x.intersp=0.5,
                              ncol=ngp.other, legend=levnames.other,
                              pt.lwd=params$other$lwd,
                              pt.cex=params$other$cex,
                              cex=params$legend$cex.other,
                              pch=params$other$pch)
      leftoff <- 0.5*legother.info$rect$w-0.5*chw
      rightoff <- 0.5*leg.info$rect$w+0.5*chw
      ypos.other <- ypos.legend
    }
    else {
      leftoff <- 0
      rightoff <- 0
      ypos.other <- ypos+(top.pos-1.5)*chh*0.8
    }
    legend(xmid-leftoff, ypos.legend, xjust=0.5, yjust=0,
           bty="n", pch=params$points$pch,
           x.intersp=0.5, col=params$points$col, ncol=ngp,
           legend=levnames,
           pt.lwd=params$points$lwd,
           pt.cex=params$points$cex,
           cex=params$legend$cex)
    par(lwd=1)
    if(!is.null(cl.other))
      lego.info <- legend(xmid+rightoff, ypos.other, xjust=0.5, yjust=0,
                          pch=params$other$pch, x.intersp=0.5,
                          col=params$other$col, ncol=ngp.other,
                          pt.lwd=params$other$lwd,
                          pt.cex=params$other$cex,
                          legend=levnames.other,
                          cex=params$legend$cex.other,
                          bty="n")
    if(!is.null(cl.other)&join.legends)
      text(lego.info$rect$left+c(0.4*chw,lego.info$rect$w-0.25*chw),
           rep(ypos.other,2)+0.8*chh, labels=c("(",")"),
           cex=params$legend$cex,
           lwd=params$legend$lwd, bty="n")
    par(lwd=params$circle$lwd)
    if(!is.null(cl.circle))if(lev.circle[1]!=""){
      pch.circle <- params$circle$pch
      xy <- par()$usr[circle.pos+c(1,3)]
      legend(xy[1], xy[2],
             xjust=adj.circle[1], yjust=circle.pos[2], bty="n", x.intersp=0.5,
             pch=rep(pch.circle,length(lev.circle)), col=params$circle$col,
             ncol=1, legend=lev.circle, cex=0.85, pt.cex=1.5)
    }
    par(lwd=1, xpd=FALSE)
    if(plot.folds){
      mtext(side=1, line=1.25, "Discriminant function 1", outer=T)
      mtext(side=2, line=1.25, "Discriminant function 2", outer=T)
    }
  }

Generate linear discriminant scores from random data, after selection

Description

Simulates the effect of generating scores from random data, possibly with predicted scores calculated also for additional 'observations'

Usage

simulateScores(nrows = 7129, cl = rep(1:3, c(19, 10, 2)), x = NULL, cl.other = NULL,
               x.other = NULL, nfeatures = 15, dimen=2, seed = NULL)

Arguments

nrows

number of rows of random data matrix

cl

classifying factor

x

data matrix, by default randomly generated

cl.other

classifying factor for additional observations

x.other

additional observations

nfeatures

number of features to select (by default uses aov F-statistic)

dimen

number of sets of discriminant scores to retain (at most one less than number of levels of cl)

seed

set, if required, so that calculations can be reproduced

Value

scores

matrix of scores

cl

classifying factor

other

matrix of 'other' scores

cl.other

classifying factor for scores.other

nfeatures

number of features used in generating the scores

Note

NB: Prior to 0.53, this function made (wrongly) a random selection of features.

Author(s)

John Maindonald

Examples

scorelist <- simulateScores(nrows=500, cl=rep(1:3, c(19,10,2)))
plot(scorelist$scores, col=unclass(scorelist$cl), pch=16)
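## A follow-on sketch (not part of the package examples): the returned list
## has the components that scoreplot() expects, so the effect of selection
## on random data can be displayed directly.
scoreplot(scorelist, prefix.title="Random data -")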


## The function is currently defined as
simulateScores <-
  function (nrows = 7129, cl = rep(1:3, c(19, 10, 2)), x = NULL,
            cl.other = NULL, x.other = NULL, nfeatures = 15, dimen = 2,
            seed = NULL)
{
  if (!is.null(seed))
    set.seed(seed)
  m <- length(cl)
  m.other <- length(cl.other)
  if (is.null(x)) {
    x <- matrix(rnorm(nrows * m), nrow = nrows)
    rownames(x) <- paste(1:nrows)
  }
  else nrows <- dim(x)[1]
  if (is.null(x.other)) {
    x.other <- matrix(rnorm(nrows * m.other), nrow = nrows)
    rownames(x.other) <- paste(1:nrows)
  }
  if (is.numeric(cl))
    cl <- paste("Gp", cl, sep = "")
  if(!is.null(cl.other)){
    if (is.numeric(cl.other))
      cl.other <- paste("Gp", cl.other, sep = "")
    cl.other <- factor(cl.other)
  }
  cl <- factor(cl)
  if (dimen > length(levels(cl)) - 1)
    dimen <- length(levels(cl)) - 1
  ordfeatures <- orderFeatures(x, cl = cl, values = TRUE)
  stat <- ordfeatures$stat[1:nfeatures]
  ord.use <- ordfeatures$ord[1:nfeatures]
  xUse.ord <- data.frame(t(x[ord.use, ]))
  xUseOther.ord <- data.frame(t(x.other[ord.use, ]))
  ordUse.lda <- lda(xUse.ord, grouping = cl)
  scores <- predict(ordUse.lda, dimen = dimen)$x
  if(!is.null(cl.other))
    scores.other <- predict(ordUse.lda, newdata = xUseOther.ord,
                            dimen = dimen)$x else
  scores.other <- NULL
  invisible(list(scores = scores, cl = cl, other = scores.other,
                 cl.other = cl.other, nfeatures = nfeatures))
}