A1_T2Toverview.Rmd
This package demonstrates the use of genotypes obtained via calls against the T2T reference genome. Our objectives are:
library(BSgenome.Hsapiens.NCBI.GRCh38)
seqinfo(BSgenome.Hsapiens.NCBI.GRCh38)
## Seqinfo object with 455 sequences (1 circular) from GRCh38 genome:
## seqnames seqlengths isCircular genome
## 1 248956422 FALSE GRCh38
## 2 242193529 FALSE GRCh38
## 3 198295559 FALSE GRCh38
## 4 190214555 FALSE GRCh38
## 5 181538259 FALSE GRCh38
## ... ... ... ...
## HSCHR19KIR_FH08_BAX_HAP_CTG3_1 200773 FALSE GRCh38
## HSCHR19KIR_FH13_A_HAP_CTG3_1 170148 FALSE GRCh38
## HSCHR19KIR_FH13_BA2_HAP_CTG3_1 215732 FALSE GRCh38
## HSCHR19KIR_FH15_A_HAP_CTG3_1 170537 FALSE GRCh38
## HSCHR19KIR_RP5_B_HAP_CTG3_1 177381 FALSE GRCh38
BSgenome.Hsapiens.NCBI.GRCh38::Hsapiens
## | BSgenome object for Human
## | - organism: Homo sapiens
## | - provider: NCBI
## | - genome: GRCh38
## | - release date: 2013-12-17
## | - 455 sequence(s):
## | 1 2
## | 3 4
## | 5 6
## | 7 8
## | 9 10
## | ... ...
## | HSCHR19KIR_FH05_B_HAP_CTG3_1 HSCHR19KIR_FH06_A_HAP_CTG3_1
## | HSCHR19KIR_FH06_BA1_HAP_CTG3_1 HSCHR19KIR_FH08_A_HAP_CTG3_1
## | HSCHR19KIR_FH08_BAX_HAP_CTG3_1 HSCHR19KIR_FH13_A_HAP_CTG3_1
## | HSCHR19KIR_FH13_BA2_HAP_CTG3_1 HSCHR19KIR_FH15_A_HAP_CTG3_1
## | HSCHR19KIR_RP5_B_HAP_CTG3_1
## |
## | Tips: call 'seqnames()' on the object to get all the sequence names, call
## | 'seqinfo()' to get the full sequence info, use the '$' or '[[' operator to
## | access a given sequence, see '?BSgenome' for more information.
hs38 = BSgenome.Hsapiens.NCBI.GRCh38
hs38$`1`
## 248956422-letter DNAString object
## seq: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
library(BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0)
seqinfo(BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0)
## Seqinfo object with 25 sequences (1 circular) from T2T-CHM13v2.0 genome:
## seqnames seqlengths isCircular genome
## 1 248387328 FALSE T2T-CHM13v2.0
## 2 242696752 FALSE T2T-CHM13v2.0
## 3 201105948 FALSE T2T-CHM13v2.0
## 4 193574945 FALSE T2T-CHM13v2.0
## 5 182045439 FALSE T2T-CHM13v2.0
## ... ... ... ...
## 21 45090682 FALSE T2T-CHM13v2.0
## 22 51324926 FALSE T2T-CHM13v2.0
## X 154259566 FALSE T2T-CHM13v2.0
## Y 62460029 FALSE T2T-CHM13v2.0
## MT 16569 TRUE T2T-CHM13v2.0
BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0::Hsapiens
## | BSgenome object for Human
## | - organism: Homo sapiens
## | - provider: NCBI
## | - genome: T2T-CHM13v2.0
## | - release date: 2022/01/24
## | - 25 sequence(s):
## | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## | 19 20 21 22 X Y MT
## |
## | Tips: call 'seqnames()' on the object to get all the sequence names, call
## | 'seqinfo()' to get the full sequence info, use the '$' or '[[' operator to
## | access a given sequence, see '?BSgenome' for more information.
hst2t = BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0
hst2t$`1`
## 248387328-letter DNAString object
## seq: CACCCTAAACCCTAACCCCTAACCCTAACCCTAACC...AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT
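GRCh38 chromosome 1 contains about 18.5 million uncalled (`N`) positions; the T2T-CHM13v2.0 assembly of chromosome 1 contains none, as the base frequencies below show.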
alphabetFrequency(hst2t$`1`)
## A C G T M R W S
## 73600418 51017014 52064401 71705495 0 0 0 0
## Y K V H D B N -
## 0 0 0 0 0 0 0 0
## + .
## 0 0
alphabetFrequency(hs38$`1`)
## A C G T M R W S
## 67070277 48055043 48111528 67244164 1 1 0 0
## Y K V H D B N -
## 0 0 0 0 0 0 18475408 0
## + .
## 0 0
BiocT2T::install_early_t2t_txdb()
library(TxDb.Hsapiens.NCBI.CHM13v2)
TxDb.Hsapiens.NCBI.CHM13v2
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: chm13v2.0_RefSeq_Liftoff_v4.gff3
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # miRBase build ID: NA
## # method: Liftoff_v4
## # source: JHU
## # Resource URL: https://ccb.jhu.edu/T2T.shtml
## # Genome: T2T-CHM13v2.0
## # Nb of transcripts: 181747
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2023-03-16 11:00:14 -0400 (Thu, 16 Mar 2023)
## # GenomicFeatures version at creation time: 1.51.4
## # RSQLite version at creation time: 2.3.0
## # DBSCHEMAVERSION: 1.2
txdb = TxDb.Hsapiens.NCBI.CHM13v2
genes(txdb)
## GRanges object with 45316 ranges and 1 metadata column:
## seqnames ranges strand | gene_id
## <Rle> <IRanges> <Rle> | <character>
## A1BG chr19 61441599-61449907 - | A1BG
## A1BG-AS1 chr19 61448385-61451599 + | A1BG-AS1
## A1CF chr10 51648044-51734261 - | A1CF
## A2M chr12 9049893-9098416 - | A2M
## A2M-AS1 chr12 9047362-9050240 + | A2M-AS1
## ... ... ... ... . ...
## gene-TRNS2 chrM 11630-11688 + | gene-TRNS2
## gene-TRNT chrM 15311-15376 + | gene-TRNT
## gene-TRNV chrM 1026-1094 + | gene-TRNV
## gene-TRNW chrM 4935-5002 + | gene-TRNW
## gene-TRNY chrM 5249-5314 - | gene-TRNY
## -------
## seqinfo: 25 sequences (1 circular) from T2T-CHM13v2.0 genome; no seqlengths
length(transcripts(txdb))
## [1] 181747
## [1] 374407