#install.packages("htmltools")
#library(htmltools)
#source("https://bioconductor.org/biocLite.R")
#biocLite("DESeq2")
library( "DESeq2" )
library(ggplot2)
countsName <- "http://bioconnector.org/workshops/data/airway_scaledcounts.csv"
download.file(countsName, destfile = "airway_scaledcounts.csv", method = "auto")
countData <- read.csv('airway_scaledcounts.csv', header = TRUE, sep = ",")
head(countData)
## ensgene SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## 1 ENSG00000000003 723 486 904 445 1170
## 2 ENSG00000000005 0 0 0 0 0
## 3 ENSG00000000419 467 523 616 371 582
## 4 ENSG00000000457 347 258 364 237 318
## 5 ENSG00000000460 96 81 73 66 118
## 6 ENSG00000000938 0 0 1 0 2
## SRR1039517 SRR1039520 SRR1039521
## 1 1097 806 604
## 2 0 0 0
## 3 781 417 509
## 4 447 330 324
## 5 94 102 74
## 6 0 0 0
metaDataName <- "http://bioconnector.org/workshops/data/airway_metadata.csv"
download.file(metaDataName, destfile = "airway_metadata.csv", method = "auto")
metaData <- read.csv('airway_metadata.csv', header = TRUE, sep = ",")
metaData
## id dex celltype geo_id
## 1 SRR1039508 control N61311 GSM1275862
## 2 SRR1039509 treated N61311 GSM1275863
## 3 SRR1039512 control N052611 GSM1275866
## 4 SRR1039513 treated N052611 GSM1275867
## 5 SRR1039516 control N080611 GSM1275870
## 6 SRR1039517 treated N080611 GSM1275871
## 7 SRR1039520 control N061011 GSM1275874
## 8 SRR1039521 treated N061011 GSM1275875
dds <- DESeqDataSetFromMatrix(countData=countData,
colData=metaData,
design=~dex, tidy = TRUE)
## converting counts to integer mode
#Design specifies how the counts from each gene depend on our variables in the metadata
#For this dataset the factor we care about is our treatment status (dex)
#tidy=TRUE argument, which tells DESeq2 to output the results table with rownames as a first #column called 'row.
#let's see what this object looks like
dds
## class: DESeqDataSet
## dim: 38694 8
## metadata(1): version
## assays(1): counts
## rownames(38694): ENSG00000000003 ENSG00000000005 ...
## ENSG00000283120 ENSG00000283123
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(4): id dex celltype geo_id
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
?DESeq
## starting httpd help server ...
## done
#estimateSizeFactors
#This calculates the relative library depth of each sample
#estimateDispersions
#estimates the dispersion of counts for each gene
#nbinomWaldTest
#calculates the significance of coefficients in a Negative Binomial GLM using the size and dispersion outputs
res <- results(dds)
head(results(dds, tidy=TRUE)) #let's look at the results table
## row baseMean log2FoldChange lfcSE stat
## 1 ENSG00000000003 747.1941954 -0.33378331 0.16000697 -2.0860548
## 2 ENSG00000000005 0.0000000 NA NA NA
## 3 ENSG00000000419 520.1341601 0.20242061 0.09882721 2.0482276
## 4 ENSG00000000457 322.6648439 0.02363948 0.13950629 0.1694510
## 5 ENSG00000000460 87.6826252 -0.13160010 0.22930400 -0.5739111
## 6 ENSG00000000938 0.3191666 -0.08436408 0.15106353 -0.5584676
## pvalue padj
## 1 0.03697366 0.1624564
## 2 NA NA
## 3 0.04053770 0.1732073
## 4 0.86544189 0.9614644
## 5 0.56602799 0.8148481
## 6 0.57652514 NA
summary(res) #summary of results
##
## out of 25258 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1564, 6.2%
## LFC < 0 (down) : 1193, 4.7%
## outliers [1] : 212, 0.84%
## low counts [2] : 9918, 39%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res <- res[order(res$padj),]
head(res)
## log2 fold change (MAP): dex treated vs control
## Wald test p-value: dex treated vs control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat
## <numeric> <numeric> <numeric> <numeric>
## ENSG00000152583 954.7709 3.966942 0.21424708 18.51573
## ENSG00000179094 743.2527 2.713956 0.16595960 16.35311
## ENSG00000116584 2277.9135 -1.026994 0.06416315 -16.00599
## ENSG00000189221 2383.7537 3.090702 0.19641099 15.73589
## ENSG00000120129 3440.7038 2.759321 0.18957216 14.55552
## ENSG00000148175 13493.9204 1.401941 0.09837916 14.25038
## pvalue padj
## <numeric> <numeric>
## ENSG00000152583 1.541835e-76 2.332488e-72
## ENSG00000179094 4.133115e-60 3.126288e-56
## ENSG00000116584 1.160615e-57 5.852596e-54
## ENSG00000189221 8.583635e-56 3.246331e-52
## ENSG00000120129 5.387437e-48 1.630023e-44
## ENSG00000148175 4.458290e-46 1.124084e-42
#we can use plotCounts fxn to compare the normalized counts
#between treated and control groups for our top 6 genes
par(mfrow=c(2,3))
plotCounts(dds, gene="ENSG00000152583", intgroup="dex")
plotCounts(dds, gene="ENSG00000179094", intgroup="dex")
plotCounts(dds, gene="ENSG00000116584", intgroup="dex")
plotCounts(dds, gene="ENSG00000189221", intgroup="dex")
plotCounts(dds, gene="ENSG00000120129", intgroup="dex")
plotCounts(dds, gene="ENSG00000148175", intgroup="dex")
#Next steps in exploring these data...BLAST to database to find associated gene function
#reset par
par(mfrow=c(1,1))
# Make a basic volcano plot
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))
# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
#First we need to transform the raw count data
#vst function will perform variance stabilizing transformation
vsdata <- vst(dds, blind=FALSE)
plotPCA(vsdata, intgroup="dex") #using the DESEQ2 plotPCA fxn we can
#look at how our samples group by treatment