# Chapter 30 Human pancreas dataset (Segerstolpe)

## 30.1 Introduction

This performs an analysis of the Segerstolpe et al. (2016) dataset, consisting of human pancreas cells from various donors.

## 30.2 Analysis code

library(scRNAseq)
sce.seger <- SegerstolpePancreasData()

### 30.2.2 Gene annotation

library(AnnotationHub)
edb <- AnnotationHub()[["AH73881"]]
symbols <- rowData(sce.seger)$symbol ens.id <- mapIds(edb, keys=symbols, keytype="SYMBOL", column="GENEID") ens.id <- ifelse(is.na(ens.id), symbols, ens.id) # Removing duplicated rows. keep <- !duplicated(ens.id) sce.seger <- sce.seger[keep,] rownames(sce.seger) <- ens.id[keep] ### 30.2.3 Sample annotation We simplify the names of some of the relevant column metadata fields for ease of access. Some editing of the cell type labels is necessary for consistency with other data sets. emtab.meta <- colData(sce.seger)[,c("cell type", "individual", "single cell well quality")] colnames(emtab.meta) <- c("CellType", "Donor", "Quality") colData(sce.seger) <- emtab.meta sce.seger$CellType <- gsub(" cell", "", sce.seger$CellType) sce.seger$CellType <- paste0(
toupper(substr(sce.seger$CellType, 1, 1)), substring(sce.seger$CellType, 2))

### 30.2.4 Quality control

unfiltered <- sce.seger

We remove low quality cells that were marked by the authors. We then perform additional quality control, as some of the remaining still have very low counts and numbers of detected features.

low.qual <- sce.seger$Quality == "low quality cell" library(scater) stats <- perCellQCMetrics(sce.seger) qc <- quickPerCellQC(stats, percent_subsets="altexps_ERCC_percent") sce.seger <- sce.seger[,!(qc$discard | low.qual)]

### 30.2.5 Normalization

We don’t normalize the spike-ins as there are some cells with no spike-in counts.

library(scran)
clusters <- quickCluster(sce.seger)
sce.seger <- computeSumFactors(sce.seger, clusters=clusters)
sce.seger <- logNormCounts(sce.seger) 

### 30.2.6 Variance modelling

We do not use cells with no spike-ins for variance modelling. Donor AZ also has very low spike-in counts and is subsequently ignored.

for.hvg <- sce.seger[,librarySizeFactors(altExp(sce.seger)) > 0
& sce.seger$Donor!="AZ"] dec.seger <- modelGeneVarWithSpikes(for.hvg, "ERCC", block=for.hvg$Donor)
chosen.hvgs <- getTopHVGs(dec.seger, n=2000)

### 30.2.7 Dimensionality reduction

library(BiocSingular)
set.seed(101011001)
sce.seger <- runPCA(sce.seger, subset_row=chosen.hvgs, ncomponents=25)
sce.seger <- runTSNE(sce.seger, dimred="PCA")

### 30.2.8 Clustering

snn.gr <- buildSNNGraph(sce.seger, use.dimred="PCA")
sce.seger$cluster <- factor(igraph::cluster_walktrap(snn.gr)$membership)

## 30.3 Results

### 30.3.1 Quality control statistics

colData(unfiltered) <- cbind(colData(unfiltered), stats)
unfiltered$discard <- qc$discard

gridExtra::grid.arrange(
plotColData(unfiltered, x="Donor", y="sum", colour_by="discard") +
scale_y_log10() + ggtitle("Total count"),
plotColData(unfiltered, x="Donor", y="detected", colour_by="discard") +
scale_y_log10() + ggtitle("Detected features"),
plotColData(unfiltered, x="Donor", y="altexps_ERCC_percent",
colour_by="discard") + ggtitle("ERCC percent"),
ncol=2
)

colSums(as.matrix(qc))
##              low_lib_size            low_n_features high_altexps_ERCC_percent
##                       197                       587                       902
##                      1028

### 30.3.2 Normalization

summary(sizeFactors(sce.seger))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   0.005   0.392   0.710   1.000   1.321  11.342
plot(librarySizeFactors(sce.seger), sizeFactors(sce.seger), pch=16,
xlab="Library size factors", ylab="Deconvolution factors", log="xy")

### 30.3.3 Variance modelling

par(mfrow=c(3,3))
blocked.stats <- dec.seger$per.block for (i in colnames(blocked.stats)) { current <- blocked.stats[[i]] plot(current$mean, current$total, main=i, pch=16, cex=0.5, xlab="Mean of log-expression", ylab="Variance of log-expression") curfit <- metadata(current) points(curfit$mean, curfit$var, col="red", pch=16) curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
}

### 30.3.4 Dimensionality reduction

ncol(reducedDim(sce.seger, "PCA"))
## [1] 25

### 30.3.5 Clustering

We see a strong donor effect, which suggests that should have called fastMNN() at some point.

table(Cluster=sce.seger$cluster, Donor=sce.seger$Donor)
##        Donor
## Cluster  AZ HP1502401 HP1504101T2D HP1504901 HP1506401 HP1507101 HP1508501T2D HP1509101
##      1    4         5            3         0         6         8           72        25
##      2    2        20            8         1        36         2           30         2
##      3   14        16           11        12         4         0            1         1
##      4    0         0            3         0         2         8          127         2
##      5   30       102          139         6         0         2           10         4
##      6    0         0            0         0         0         0            0         0
##      7    0         0            0         0        16         1            6         1
##      8    0         2            0         0         2         6            6        11
##      9    2         0            0         1        44         1            0         0
##      10   0         0            2         0         2         6           12         3
##      11   2        14           70         0         0         1            2         0
##      12   0         0            6         0         0         0            0         0
##      13   1        14            0        16         0         0            7         0
##      14   1         1            1        11         0         0            0         0
##      15   0         0            0         0        26         0            0         0
##      16   1         1            0        91         0         0            0         0
##      17   0         0            0         0         0       133            4        49
##      18   0         0            0         0        35         0            0         0
##      19   4         2            0        34         0         0            0         0
##      20   0         0            0         0        61         0            3         1
##        Donor
## Cluster HP1525301T2D HP1526901T2D
##      1           125           49
##      2            23           13
##      3             0            0
##      4             1            2
##      5             2            0
##      6            84           96
##      7            10           35
##      8             5           34
##      9             1            0
##      10           13            4
##      11            0            0
##      12           12           68
##      13            0            0
##      14            0            0
##      15            0            0
##      16            0            0
##      17            0            0
##      18            0            0
##      19            0            0
##      20            0            0
##  [ reached getOption("max.print") -- omitted 4 rows ]
table(Cluster=sce.seger$cluster, Donor=sce.seger$CellType)
##        Donor
## Cluster Acinar Alpha Beta Co-expression Delta Ductal Endothelial Epsilon Gamma MHC class II Mast
##      1       0     0    0             0     0    282           0       0     0            5    2
##      2     135     0    0             0     0      0           0       0     0            0    0
##      3       0     1    1             0    44      0           0       0     1            0    0
##      4       0   109   12            18     0      0           0       0     0            0    0
##      5       0   280    1             6     1      0           0       3     0            0    0
##      6       0   180    0             0     0      0           0       0     0            0    0
##      7       0     0    0             0     0      0           0       2    67            0    0
##      8       0     0    0             0    65      0           0       0     0            0    0
##      9      49     0    0             0     0      0           0       0     0            0    0
##      10      0     0    0             0     0      0           0       0     0            0    0
##      11      0     0    0             0     0      0           0       1    88            0    0
##      12      0     0   81             5     0      0           0       0     0            0    0
##      13      1     0    0             0     0     35           0       0     0            0    0
##        Donor
## Cluster NANA PSC Unclassified Unclassified endocrine
##      1     8   0            0                      0
##      2     2   0            0                      0
##      3     1   0            0                     11
##      4     6   0            0                      0
##      5     3   0            0                      1
##      6     0   0            0                      0
##      7     0   0            0                      0
##      8     1   0            0                      0
##      9     0   0            0                      0
##      10    0  42            0                      0
##      11    0   0            0                      0
##      12    0   0            0                      0
##      13    1   0            0                      1
##  [ reached getOption("max.print") -- omitted 11 rows ]
plotTSNE(sce.seger, colour_by="cluster")

plotTSNE(sce.seger, colour_by="Donor")

## Session Info

R version 3.6.1 (2019-07-05)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 14.04.5 LTS

Matrix products: default
BLAS:   /home/ramezqui/Rbuild/danbuild/R-3.6.1/lib/libRblas.so
LAPACK: /home/ramezqui/Rbuild/danbuild/R-3.6.1/lib/libRlapack.so

locale:
[1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8
[4] LC_COLLATE=C               LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C
[10] LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base

other attached packages:
[1] BiocSingular_1.2.0          scran_1.14.3                scater_1.14.3
[4] ggplot2_3.2.1               ensembldb_2.10.0            AnnotationFilter_1.10.0
[7] GenomicFeatures_1.38.0      AnnotationDbi_1.48.0        AnnotationHub_2.18.0
[10] BiocFileCache_1.10.2        dbplyr_1.4.2                scRNAseq_2.0.2
[13] SingleCellExperiment_1.8.0  SummarizedExperiment_1.16.0 DelayedArray_0.12.0
[16] BiocParallel_1.20.0         matrixStats_0.55.0          Biobase_2.46.0
[19] GenomicRanges_1.38.0        GenomeInfoDb_1.22.0         IRanges_2.20.0
[22] S4Vectors_0.24.0            BiocGenerics_0.32.0         Cairo_1.5-10
[25] BiocStyle_2.14.0            OSCAUtils_0.0.1

loaded via a namespace (and not attached):
[1] Rtsne_0.15                    ggbeeswarm_0.6.0              colorspace_1.4-1
[4] XVector_0.26.0                BiocNeighbors_1.4.0           bit64_0.9-7
[7] interactiveDisplayBase_1.24.0 codetools_0.2-16              knitr_1.26
[10] zeallot_0.1.0                 Rsamtools_2.2.0               shiny_1.4.0
[13] BiocManager_1.30.9            compiler_3.6.1                httr_1.4.1
[16] dqrng_0.2.1                   backports_1.1.5               assertthat_0.2.1
[19] Matrix_1.2-17                 fastmap_1.0.1                 lazyeval_0.2.2
[22] limma_3.42.0                  later_1.0.0                   htmltools_0.4.0
[25] prettyunits_1.0.2             tools_3.6.1                   rsvd_1.0.2
[28] igraph_1.2.4.1                gtable_0.3.0                  glue_1.3.1
[31] GenomeInfoDbData_1.2.2        dplyr_0.8.3                   rappdirs_0.3.1
[34] Rcpp_1.0.3                    vctrs_0.2.0                   Biostrings_2.54.0
[37] ExperimentHub_1.12.0          rtracklayer_1.46.0            DelayedMatrixStats_1.8.0
[40] xfun_0.11                     stringr_1.4.0                 mime_0.7
[43] irlba_2.3.3                   statmod_1.4.32                XML_3.98-1.20
[46] edgeR_3.28.0                  zlibbioc_1.32.0               scales_1.0.0
[49] hms_0.5.2                     promises_1.1.0                ProtGenerics_1.18.0
[52] yaml_2.2.0                    curl_4.2                      memoise_1.1.0
[55] gridExtra_2.3                 biomaRt_2.42.0                stringi_1.4.3
[58] RSQLite_2.1.2                 BiocVersion_3.10.1            rlang_0.4.1
[61] pkgconfig_2.0.3               bitops_1.0-6                  evaluate_0.14
[64] lattice_0.20-38               purrr_0.3.3                   labeling_0.3
[67] GenomicAlignments_1.22.1      cowplot_1.0.0                 bit_1.1-14
[70] tidyselect_0.2.5              magrittr_1.5                  bookdown_0.15
[73] R6_2.4.1                      DBI_1.0.0                     pillar_1.4.2
[76] withr_2.1.2                   RCurl_1.95-4.12               tibble_2.1.3
[79] crayon_1.3.4                  rmarkdown_1.17                viridis_0.5.1
[82] progress_1.2.2                locfit_1.5-9.1                grid_3.6.1
[85] blob_1.2.0                    digest_0.6.22                 xtable_1.8-4
[88] httpuv_1.5.2                  openssl_1.4.1                 munsell_0.5.0
[91] beeswarm_0.2.3                viridisLite_0.3.0             vipor_0.4.5
[94] askpass_1.1                  

### Bibliography

Segerstolpe, A., A. Palasantza, P. Eliasson, E. M. Andersson, A. C. Andreasson, X. Sun, S. Picelli, et al. 2016. “Single-Cell Transcriptome Profiling of Human Pancreatic Islets in Health and Type 2 Diabetes.” Cell Metab. 24 (4):593–607.