RNA-Sequence Analysis Workflow

1. Quality assess and clean raw sequencing data
2. Align reads to a reference
3. Count the number of reads assigned to each contig/gene
4. Extract counts and store in a matrix
5. Create column metadata table
6. Analyze count data using DESEQ2



Install packages and load libraries
#install.packages("htmltools")
#library(htmltools)
#source("https://bioconductor.org/biocLite.R")
#biocLite("DESeq2")

library( "DESeq2" )
library(ggplot2)



Download data
countsName <- "http://bioconnector.org/workshops/data/airway_scaledcounts.csv"
download.file(countsName, destfile = "airway_scaledcounts.csv", method = "auto")

countData <- read.csv('airway_scaledcounts.csv', header = TRUE, sep = ",")
head(countData)
##           ensgene SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## 1 ENSG00000000003        723        486        904        445       1170
## 2 ENSG00000000005          0          0          0          0          0
## 3 ENSG00000000419        467        523        616        371        582
## 4 ENSG00000000457        347        258        364        237        318
## 5 ENSG00000000460         96         81         73         66        118
## 6 ENSG00000000938          0          0          1          0          2
##   SRR1039517 SRR1039520 SRR1039521
## 1       1097        806        604
## 2          0          0          0
## 3        781        417        509
## 4        447        330        324
## 5         94        102         74
## 6          0          0          0
metaDataName <- "http://bioconnector.org/workshops/data/airway_metadata.csv"
download.file(metaDataName, destfile = "airway_metadata.csv", method = "auto")

metaData <- read.csv('airway_metadata.csv', header = TRUE, sep = ",")
metaData
##           id     dex celltype     geo_id
## 1 SRR1039508 control   N61311 GSM1275862
## 2 SRR1039509 treated   N61311 GSM1275863
## 3 SRR1039512 control  N052611 GSM1275866
## 4 SRR1039513 treated  N052611 GSM1275867
## 5 SRR1039516 control  N080611 GSM1275870
## 6 SRR1039517 treated  N080611 GSM1275871
## 7 SRR1039520 control  N061011 GSM1275874
## 8 SRR1039521 treated  N061011 GSM1275875



Construct DESEQDataSet Object
dds <- DESeqDataSetFromMatrix(countData=countData, 
                              colData=metaData, 
                              design=~dex, tidy = TRUE)
## converting counts to integer mode
#Design specifies how the counts from each gene depend on our variables in the metadata
#For this dataset the factor we care about is our treatment status (dex)
#tidy=TRUE argument, which tells DESeq2 to output the results table with rownames as a first #column called 'row.

#let's see what this object looks like
dds
## class: DESeqDataSet 
## dim: 38694 8 
## metadata(1): version
## assays(1): counts
## rownames(38694): ENSG00000000003 ENSG00000000005 ...
##   ENSG00000283120 ENSG00000283123
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(4): id dex celltype geo_id
Now we’re ready to run DESEQ function
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing



What did we just do?
?DESeq
## starting httpd help server ...
##  done
#estimateSizeFactors
#This calculates the relative library depth of each sample 

#estimateDispersions
#estimates the dispersion of counts for each gene 

#nbinomWaldTest
#calculates the significance of coefficients in a Negative Binomial GLM using the size and dispersion outputs



Take a look at the results table
res <- results(dds)
head(results(dds, tidy=TRUE)) #let's look at the results table
##               row    baseMean log2FoldChange      lfcSE       stat
## 1 ENSG00000000003 747.1941954    -0.33378331 0.16000697 -2.0860548
## 2 ENSG00000000005   0.0000000             NA         NA         NA
## 3 ENSG00000000419 520.1341601     0.20242061 0.09882721  2.0482276
## 4 ENSG00000000457 322.6648439     0.02363948 0.13950629  0.1694510
## 5 ENSG00000000460  87.6826252    -0.13160010 0.22930400 -0.5739111
## 6 ENSG00000000938   0.3191666    -0.08436408 0.15106353 -0.5584676
##       pvalue      padj
## 1 0.03697366 0.1624564
## 2         NA        NA
## 3 0.04053770 0.1732073
## 4 0.86544189 0.9614644
## 5 0.56602799 0.8148481
## 6 0.57652514        NA



Summary of differential gene expression
summary(res) #summary of results
## 
## out of 25258 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)     : 1564, 6.2% 
## LFC < 0 (down)   : 1193, 4.7% 
## outliers [1]     : 212, 0.84% 
## low counts [2]   : 9918, 39% 
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results



Sort summary list by p-value
res <- res[order(res$padj),]
head(res)
## log2 fold change (MAP): dex treated vs control 
## Wald test p-value: dex treated vs control 
## DataFrame with 6 rows and 6 columns
##                   baseMean log2FoldChange      lfcSE      stat
##                  <numeric>      <numeric>  <numeric> <numeric>
## ENSG00000152583   954.7709       3.966942 0.21424708  18.51573
## ENSG00000179094   743.2527       2.713956 0.16595960  16.35311
## ENSG00000116584  2277.9135      -1.026994 0.06416315 -16.00599
## ENSG00000189221  2383.7537       3.090702 0.19641099  15.73589
## ENSG00000120129  3440.7038       2.759321 0.18957216  14.55552
## ENSG00000148175 13493.9204       1.401941 0.09837916  14.25038
##                       pvalue         padj
##                    <numeric>    <numeric>
## ENSG00000152583 1.541835e-76 2.332488e-72
## ENSG00000179094 4.133115e-60 3.126288e-56
## ENSG00000116584 1.160615e-57 5.852596e-54
## ENSG00000189221 8.583635e-56 3.246331e-52
## ENSG00000120129 5.387437e-48 1.630023e-44
## ENSG00000148175 4.458290e-46 1.124084e-42



plotCounts
#we can use plotCounts fxn to compare the normalized counts
#between treated and control groups for our top 6 genes
par(mfrow=c(2,3))

plotCounts(dds, gene="ENSG00000152583", intgroup="dex")
plotCounts(dds, gene="ENSG00000179094", intgroup="dex")
plotCounts(dds, gene="ENSG00000116584", intgroup="dex")
plotCounts(dds, gene="ENSG00000189221", intgroup="dex")
plotCounts(dds, gene="ENSG00000120129", intgroup="dex")
plotCounts(dds, gene="ENSG00000148175", intgroup="dex")

#Next steps in exploring these data...BLAST to database to find associated gene function



Volcano Plot
#reset par
par(mfrow=c(1,1))
# Make a basic volcano plot
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))



PCA
#First we need to transform the raw count data
#vst function will perform variance stabilizing transformation

vsdata <- vst(dds, blind=FALSE)
plotPCA(vsdata, intgroup="dex") #using the DESEQ2 plotPCA fxn we can

                                #look at how our samples group by treatment