Objectives

This package demonstrates the use of genotypes called against the telomere-to-telomere (T2T) reference genome, T2T-CHM13v2.0. Our objectives are:

  • acquire and make the genotypes accessible to R/Bioconductor
  • bind relevant sample-level metadata to the resource
  • acquire associated RNA-seq assay outputs for a subset of the genotyped samples
  • perform a series of GWAS to identify expression quantitative trait loci (eQTL)

The reference genomic sequence

GRCh38

# Attach the GRCh38 BSgenome data package and list its sequence metadata
# (names, lengths, circularity, genome tag).
library(BSgenome.Hsapiens.NCBI.GRCh38)
seqinfo(BSgenome.Hsapiens.NCBI.GRCh38)
## Seqinfo object with 455 sequences (1 circular) from GRCh38 genome:
##   seqnames                       seqlengths isCircular genome
##   1                               248956422      FALSE GRCh38
##   2                               242193529      FALSE GRCh38
##   3                               198295559      FALSE GRCh38
##   4                               190214555      FALSE GRCh38
##   5                               181538259      FALSE GRCh38
##   ...                                   ...        ...    ...
##   HSCHR19KIR_FH08_BAX_HAP_CTG3_1     200773      FALSE GRCh38
##   HSCHR19KIR_FH13_A_HAP_CTG3_1       170148      FALSE GRCh38
##   HSCHR19KIR_FH13_BA2_HAP_CTG3_1     215732      FALSE GRCh38
##   HSCHR19KIR_FH15_A_HAP_CTG3_1       170537      FALSE GRCh38
##   HSCHR19KIR_RP5_B_HAP_CTG3_1        177381      FALSE GRCh38
# Print the BSgenome object itself: organism, provider, release date and the
# names of its 455 sequences.
BSgenome.Hsapiens.NCBI.GRCh38::Hsapiens
## | BSgenome object for Human
## | - organism: Homo sapiens
## | - provider: NCBI
## | - genome: GRCh38
## | - release date: 2013-12-17
## | - 455 sequence(s):
## |     1                                  2                                 
## |     3                                  4                                 
## |     5                                  6                                 
## |     7                                  8                                 
## |     9                                  10                                
## |     ...                                ...                               
## |     HSCHR19KIR_FH05_B_HAP_CTG3_1       HSCHR19KIR_FH06_A_HAP_CTG3_1      
## |     HSCHR19KIR_FH06_BA1_HAP_CTG3_1     HSCHR19KIR_FH08_A_HAP_CTG3_1      
## |     HSCHR19KIR_FH08_BAX_HAP_CTG3_1     HSCHR19KIR_FH13_A_HAP_CTG3_1      
## |     HSCHR19KIR_FH13_BA2_HAP_CTG3_1     HSCHR19KIR_FH15_A_HAP_CTG3_1      
## |     HSCHR19KIR_RP5_B_HAP_CTG3_1                                          
## | 
## | Tips: call 'seqnames()' on the object to get all the sequence names, call
## | 'seqinfo()' to get the full sequence info, use the '$' or '[[' operator to
## | access a given sequence, see '?BSgenome' for more information.
# Bind a short alias for the genome object, then extract sequence "1".
# The name needs backticks because it is not a syntactic R name.
# The displayed ends of the sequence consist entirely of N.
hs38 = BSgenome.Hsapiens.NCBI.GRCh38
hs38$`1`
## 248956422-letter DNAString object
## seq: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN

T2T

# Attach the T2T-CHM13v2.0 BSgenome data package and list its sequence
# metadata (names, lengths, circularity, genome tag).
library(BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0)
seqinfo(BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0)
## Seqinfo object with 25 sequences (1 circular) from T2T-CHM13v2.0 genome:
##   seqnames seqlengths isCircular        genome
##   1         248387328      FALSE T2T-CHM13v2.0
##   2         242696752      FALSE T2T-CHM13v2.0
##   3         201105948      FALSE T2T-CHM13v2.0
##   4         193574945      FALSE T2T-CHM13v2.0
##   5         182045439      FALSE T2T-CHM13v2.0
##   ...             ...        ...           ...
##   21         45090682      FALSE T2T-CHM13v2.0
##   22         51324926      FALSE T2T-CHM13v2.0
##   X         154259566      FALSE T2T-CHM13v2.0
##   Y          62460029      FALSE T2T-CHM13v2.0
##   MT            16569       TRUE T2T-CHM13v2.0
# Print the BSgenome object itself: organism, provider, release date and the
# names of its 25 sequences (1-22, X, Y, MT).
BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0::Hsapiens
## | BSgenome object for Human
## | - organism: Homo sapiens
## | - provider: NCBI
## | - genome: T2T-CHM13v2.0
## | - release date: 2022/01/24
## | - 25 sequence(s):
## |     1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18 
## |     19  20  21  22  X   Y   MT                                             
## | 
## | Tips: call 'seqnames()' on the object to get all the sequence names, call
## | 'seqinfo()' to get the full sequence info, use the '$' or '[[' operator to
## | access a given sequence, see '?BSgenome' for more information.
# Bind a short alias for the genome object, then extract sequence "1".
# The name needs backticks because it is not a syntactic R name.
# The displayed ends of the sequence are called bases (A/C/G/T), not N.
hst2t = BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0
hst2t$`1`
## 248387328-letter DNAString object
## seq: CACCCTAAACCCTAACCCCTAACCCTAACCCTAACC...AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT

On chromosome 1 alone, about 18 million positions that are uncalled (N) in GRCh38 are called in T2T-CHM13v2.0, as the base counts below show.

# Tabulate how often each letter of the DNA alphabet occurs on sequence "1"
# of the T2T assembly. Only A, C, G and T have non-zero counts; N is 0.
alphabetFrequency(hst2t$`1`)
##        A        C        G        T        M        R        W        S 
## 73600418 51017014 52064401 71705495        0        0        0        0 
##        Y        K        V        H        D        B        N        - 
##        0        0        0        0        0        0        0        0 
##        +        . 
##        0        0
# Same tabulation for sequence "1" of GRCh38: 18475408 positions are N, and
# the ambiguity codes M and R each occur once.
alphabetFrequency(hs38$`1`)
##        A        C        G        T        M        R        W        S 
## 67070277 48055043 48111528 67244164        1        1        0        0 
##        Y        K        V        H        D        B        N        - 
##        0        0        0        0        0        0 18475408        0 
##        +        . 
##        0        0

Genes and transcripts

# Make the T2T transcript annotation package available, attach it, and print
# the TxDb summary.
# NOTE(review): install_early_t2t_txdb() presumably installs
# TxDb.Hsapiens.NCBI.CHM13v2 -- confirm against the BiocT2T documentation.
BiocT2T::install_early_t2t_txdb()
library(TxDb.Hsapiens.NCBI.CHM13v2)
TxDb.Hsapiens.NCBI.CHM13v2
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: chm13v2.0_RefSeq_Liftoff_v4.gff3
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # miRBase build ID: NA
## # method: Liftoff_v4
## # source: JHU
## # Resource URL: https://ccb.jhu.edu/T2T.shtml
## # Genome: T2T-CHM13v2.0
## # Nb of transcripts: 181747
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2023-03-16 11:00:14 -0400 (Thu, 16 Mar 2023)
## # GenomicFeatures version at creation time: 1.51.4
## # RSQLite version at creation time: 2.3.0
## # DBSCHEMAVERSION: 1.2
# Bind a short alias and extract gene-level ranges.
# NOTE(review): seqnames here are "chr"-prefixed (chr19, chrM), whereas the
# T2T BSgenome object above uses 1..22, X, Y, MT -- the naming styles may need
# to be harmonised before combining the two; confirm.
txdb = TxDb.Hsapiens.NCBI.CHM13v2
genes(txdb)
## GRanges object with 45316 ranges and 1 metadata column:
##              seqnames            ranges strand |     gene_id
##                 <Rle>         <IRanges>  <Rle> | <character>
##         A1BG    chr19 61441599-61449907      - |        A1BG
##     A1BG-AS1    chr19 61448385-61451599      + |    A1BG-AS1
##         A1CF    chr10 51648044-51734261      - |        A1CF
##          A2M    chr12   9049893-9098416      - |         A2M
##      A2M-AS1    chr12   9047362-9050240      + |     A2M-AS1
##          ...      ...               ...    ... .         ...
##   gene-TRNS2     chrM       11630-11688      + |  gene-TRNS2
##    gene-TRNT     chrM       15311-15376      + |   gene-TRNT
##    gene-TRNV     chrM         1026-1094      + |   gene-TRNV
##    gene-TRNW     chrM         4935-5002      + |   gene-TRNW
##    gene-TRNY     chrM         5249-5314      - |   gene-TRNY
##   -------
##   seqinfo: 25 sequences (1 circular) from T2T-CHM13v2.0 genome; no seqlengths
# Number of transcripts; agrees with "Nb of transcripts" in the TxDb summary.
length(transcripts(txdb))
## [1] 181747
# Number of exons.
length(exons(txdb))
## [1] 374407