next up previous
Next: util.character Up: CHNOSZ examples Previous: util.array

[1] util.blast

## using def2gi def <- "gi|218295810|ref|ZP_03496590.1|" stopifnot(all.equal(def2gi(def), "218295810")) ## process some of the BLAST output for proteins ## from Bison Pool metagenome (JGI, 2007) # read the file that connects taxids with the sequence identifier tfile <- system.file("extdata/bison/gi.taxid.txt.xz", package="CHNOSZ") gi.taxid <- scan(tfile, what=as.list(character(2)), flush=TRUE)
Read 4255 records
# read the file that connects names with the taxids nfile <- system.file("extdata/refseq/taxid_names.csv.xz", package="CHNOSZ") taxid.names <- read.csv(nfile) # the BLAST files sites <- c("N","S","R","Q","P") bfile <- paste("extdata/bison/bison", sites, "_vs_refseq57.blastp.xz", sep="") for(i in 1:5) { file <- system.file(bfile[i], package="CHNOSZ") # read the blast file, with default filtering settings bl <- read.blast(file) # process the blast file -- get taxon names ib <- id.blast(bl, gi.taxid, taxid.names, min.taxon=2^-7) # count each of the phyla bd <- as.matrix(sapply(unique(ib$phylum), function(x) (sum(x==ib$phylum)))) colnames(bd) <- sites[i] # make a matrix -- each column for a different file if(i==1) bardata <- bd else { bardata <- merge(bardata, bd, all=TRUE, by="row.names") rownames(bardata) <- bardata$Row.names bardata <- bardata[,-1] } }
read.blast: reading bisonN_vs_refseq57.blastp.xz read 1000 lines with 1000 query sequences similarity filtering leaves 1000 lines and 1000 query sequences evalue filtering leaves 1000 lines and 1000 query sequences max hits filtering leaves 1000 lines and 1000 query sequences id.blast: getting taxids ... getting taxid.names ... 23 unique phyla, 153 unique taxa min taxon abundance filtering leaves 790 query sequences, 5 phyla, 18 taxa min phylum abundance filtering leaves 790 query sequences, 5 phyla, 18 taxa read.blast: reading bisonS_vs_refseq57.blastp.xz read 1000 lines with 1000 query sequences similarity filtering leaves 1000 lines and 1000 query sequences evalue filtering leaves 1000 lines and 1000 query sequences max hits filtering leaves 1000 lines and 1000 query sequences id.blast: getting taxids ... getting taxid.names ... 30 unique phyla, 288 unique taxa min taxon abundance filtering leaves 557 query sequences, 4 phyla, 17 taxa min phylum abundance filtering leaves 557 query sequences, 4 phyla, 17 taxa read.blast: reading bisonR_vs_refseq57.blastp.xz read 1000 lines with 1000 query sequences similarity filtering leaves 1000 lines and 1000 query sequences evalue filtering leaves 1000 lines and 1000 query sequences max hits filtering leaves 1000 lines and 1000 query sequences id.blast: getting taxids ... getting taxid.names ... 28 unique phyla, 315 unique taxa min taxon abundance filtering leaves 498 query sequences, 6 phyla, 17 taxa min phylum abundance filtering leaves 498 query sequences, 6 phyla, 17 taxa read.blast: reading bisonQ_vs_refseq57.blastp.xz read 1000 lines with 1000 query sequences similarity filtering leaves 1000 lines and 1000 query sequences evalue filtering leaves 1000 lines and 1000 query sequences max hits filtering leaves 1000 lines and 1000 query sequences id.blast: getting taxids ... getting taxid.names ... 25 unique phyla, 252 unique taxa min taxon abundance filtering leaves 648 query sequences, 5 phyla, 12 taxa min phylum abundance filtering leaves 648 query sequences, 5 phyla, 12 taxa read.blast: reading bisonP_vs_refseq57.blastp.xz read 1000 lines with 1000 query sequences similarity filtering leaves 1000 lines and 1000 query sequences evalue filtering leaves 1000 lines and 1000 query sequences max hits filtering leaves 1000 lines and 1000 query sequences id.blast: getting taxids ... getting taxid.names ... 26 unique phyla, 327 unique taxa min taxon abundance filtering leaves 540 query sequences, 9 phyla, 17 taxa min phylum abundance filtering leaves 540 query sequences, 9 phyla, 17 taxa
# normalize the counts bardata[is.na(bardata)] <- 0 bardata <- t(t(bardata)/colSums(bardata)) # make a bar chart bp <- barplot(as.matrix(bardata), col=rainbow(nrow(bardata)), xlab="location", ylab="fractional abundance") # add labels to the bars names <- substr(row.names(bardata), 1, 3) for(i in 1:5) { bd <- bardata[,i] ib <- bd!=0 y <- (cumsum(bd) - bd/2)[ib] text(bp[i], y, names[ib]) } title(main=paste("Phylum Classification of Protein Sequences", "in Part of the Bison Pool Metagenome", sep="\n"))

Image utilblast1

 


next up previous
Next: util.character Up: CHNOSZ examples Previous: util.array