lkparq: claude-based parquet transformation of TxDb from Gencode v49
Vincent J. Carey, stvjc at channing.harvard.edu
March 13, 2026
Source:vignettes/lkparq.Rmd
lkparq.RmdIntroduction
This is an experiment to evaluate an approach to TxDb production that is decoupled from AnnotationDbi and is based on serializations of information in Gencode GTFs to Parquet.
The code that transformed Gencode GTF to parquet is in inst/python in the sources of this package. The code was generated by prompting Anthropic Claude Opus 4.5 in March of 2026.
library(GenomicRanges)
library(arrow)
library(dplyr)
library(lkparq)
parqloc = system.file("gc49", package="lkparq")
gtf <- GTFParquet(parqloc)
print(gtf)## GTFParquet object
## Path: /private/var/folders/yw/gfhgh7k565v9w83x_k764wbc0000gp/T/RtmplBznwD/temp_libpath82ef7fcad40d/lkparq/gc49
## Genome: GRCh38
## Available tables:
## [x] genes
## [x] transcripts
## [x] exons
## [x] cds
## [x] features
## [x] metadata
## Metadata:
## description: evidence-based annotation of the human genome (GRCh38), version 49 (Ensembl 115)
## provider: GENCODE
## contact: gencode-help@ebi.ac.uk
## format: gtf
## date: 2025-07-08
## genome: GRCh38
# View GTF metadata (version, date, provider)
gtf_metadata(gtf)## description
## "evidence-based annotation of the human genome (GRCh38), version 49 (Ensembl 115)"
## provider
## "GENCODE"
## contact
## "gencode-help@ebi.ac.uk"
## format
## "gtf"
## date
## "2025-07-08"
## genome
## "GRCh38"
Rich Gene Queries
Get ALL genes with FULL attributes
## GRanges object with 6 ranges and 6 metadata columns:
## seqnames ranges strand | gene_name
## <Rle> <IRanges> <Rle> | <character>
## ENSG00000290825 chr1 11121-24894 + | DDX11L16
## ENSG00000223972 chr1 12010-13670 + | DDX11L1
## ENSG00000310526 chr1 14356-30744 - | WASH7P
## ENSG00000227232 chr1 14696-24886 - | WASH7P
## ENSG00000278267 chr1 17369-17436 - | MIR6859-1
## ENSG00000243485 chr1 28589-31109 + | MIR1302-2HG
## gene_type source level
## <character> <character> <integer>
## ENSG00000290825 lncRNA HAVANA <NA>
## ENSG00000223972 transcribed_unproces.. HAVANA <NA>
## ENSG00000310526 lncRNA HAVANA <NA>
## ENSG00000227232 transcribed_unproces.. HAVANA <NA>
## ENSG00000278267 miRNA ENSEMBL <NA>
## ENSG00000243485 lncRNA HAVANA <NA>
## tags havana_gene
## <character> <character>
## ENSG00000290825 overlaps_pseudogene <NA>
## ENSG00000223972 <NA> OTTHUMG00000000961.2
## ENSG00000310526 ncRNA_host;overlappi.. <NA>
## ENSG00000227232 overlapping_locus OTTHUMG00000000958.1
## ENSG00000278267 <NA> <NA>
## ENSG00000243485 ncRNA_host OTTHUMG00000000959.2
## -------
## seqinfo: 25 sequences from GRCh38 genome; no seqlengths
Filter by gene_type
## [1] 20097
## [1] 34880
# Convenience functions
pc_genes <- protein_coding_genes(gtf)
lnc_genes <- lncRNA_genes(gtf)
# See what gene types are available
gene_types(gtf)##
## artifact IG_C_gene
## 19 14
## IG_C_pseudogene IG_D_gene
## 9 37
## IG_J_gene IG_J_pseudogene
## 18 3
## IG_pseudogene IG_V_gene
## 1 146
## IG_V_pseudogene lncRNA
## 187 34880
## miRNA misc_RNA
## 1879 2207
## Mt_rRNA Mt_tRNA
## 2 22
## processed_pseudogene protein_coding
## 9487 20097
## ribozyme rRNA
## 8 47
## rRNA_pseudogene scaRNA
## 497 49
## snoRNA snRNA
## 942 1901
## sRNA TEC
## 5 1019
## TR_C_gene TR_D_gene
## 6 5
## TR_J_gene TR_J_pseudogene
## 79 4
## TR_V_gene TR_V_pseudogene
## 107 33
## transcribed_processed_pseudogene transcribed_unitary_pseudogene
## 1149 201
## transcribed_unprocessed_pseudogene translated_processed_pseudogene
## 1587 2
## unitary_pseudogene unprocessed_pseudogene
## 89 1949
## vault_RNA
## 4
# lncRNA miRNA protein_coding pseudogene ...Filter by annotation confidence level
- Level 1: verified loci (experimental evidence)
- Level 2: manually annotated loci
- Level 3: automatically annotated loci
verified_genes <- genes(gtf, filter = list(level = 1))
# Filter by source (HAVANA = manual, ENSEMBL = automatic)
manual_genes <- genes(gtf, filter = list(source = "HAVANA"))
# Filter by tags
# Common tags: basic, CCDS, overlaps_pseudogene, etc.
# Note: tags is a semicolon-separated string, so use grep for complex filters
all_genes <- genes(gtf)
basic_genes <- all_genes[grep("basic", mcols(all_genes)$tags)]
# Combine filters
chr1_protein_coding <- genes(gtf, filter = list(
chrom = "chr1",
gene_type = "protein_coding"
))Rich Transcript Queries
# Transcripts include transcript_type, transcript_name, support level, etc.
tx <- transcripts(gtf)
head(mcols(tx))## DataFrame with 6 rows and 11 columns
## transcript_name transcript_type gene_id gene_name
## <character> <character> <character> <character>
## ENST00000832824 DDX11L16-260 lncRNA ENSG00000290825.2 DDX11L16
## ENST00000832825 DDX11L16-261 lncRNA ENSG00000290825.2 DDX11L16
## ENST00000832826 DDX11L16-262 lncRNA ENSG00000290825.2 DDX11L16
## ENST00000832827 DDX11L16-263 lncRNA ENSG00000290825.2 DDX11L16
## ENST00000832828 DDX11L16-264 lncRNA ENSG00000290825.2 DDX11L16
## ENST00000832829 DDX11L16-265 lncRNA ENSG00000290825.2 DDX11L16
## source level tags transcript_support_level
## <character> <integer> <character> <character>
## ENST00000832824 HAVANA NA TAGENE NA
## ENST00000832825 HAVANA NA TAGENE NA
## ENST00000832826 HAVANA NA TAGENE NA
## ENST00000832827 HAVANA NA TAGENE NA
## ENST00000832828 HAVANA NA basic;TAGENE NA
## ENST00000832829 HAVANA NA TAGENE NA
## havana_transcript ccdsid protein_id
## <character> <character> <character>
## ENST00000832824 NA NA NA
## ENST00000832825 NA NA NA
## ENST00000832826 NA NA NA
## ENST00000832827 NA NA NA
## ENST00000832828 NA NA NA
## ENST00000832829 NA NA NA
# transcript_name transcript_type gene_id gene_name source level tags
# transcript_support_level havana_transcript ccdsid protein_id
# See transcript types
transcript_types(gtf)##
## artifact IG_C_gene
## 19 23
## IG_C_pseudogene IG_D_gene
## 9 37
## IG_J_gene IG_J_pseudogene
## 18 3
## IG_pseudogene IG_V_gene
## 1 146
## IG_V_pseudogene lncRNA
## 187 189152
## miRNA misc_RNA
## 1879 2207
## Mt_rRNA Mt_tRNA
## 2 22
## non_stop_decay nonsense_mediated_decay
## 105 21949
## processed_pseudogene processed_transcript
## 9488 12
## protein_coding protein_coding_CDS_not_defined
## 211446 26573
## protein_coding_LoF retained_intron
## 74 34239
## ribozyme rRNA
## 8 47
## rRNA_pseudogene scaRNA
## 497 49
## snoRNA snRNA
## 942 1901
## sRNA TEC
## 5 1108
## TR_C_gene TR_D_gene
## 6 5
## TR_J_gene TR_J_pseudogene
## 79 4
## TR_V_gene TR_V_pseudogene
## 107 33
## transcribed_processed_pseudogene transcribed_unitary_pseudogene
## 1149 201
## transcribed_unprocessed_pseudogene translated_processed_pseudogene
## 1589 2
## unitary_pseudogene unprocessed_pseudogene
## 89 1949
## vault_RNA
## 4
# Filter by transcript type
coding_tx <- transcripts(gtf, filter = list(transcript_type = "protein_coding"))
# Filter by transcript support level (TSL)
# TSL1 = all splice junctions supported by RNA-seq
# TSL2 = best supporting transcript is flagged as suspect
# TSL3 = not all splice junctions supported
# TSL4/5 = poor support
high_confidence_tx <- transcripts(gtf, filter = list(transcript_support_level = "1"))
# Find transcripts with CCDS IDs (highly curated)
all_tx <- transcripts(gtf)
ccds_tx <- all_tx[!is.na(mcols(all_tx)$ccdsid)]
length(ccds_tx)## [1] 95584
Exon Queries with Context
## DataFrame with 6 rows and 6 columns
## exon_number transcript_id gene_id source
## <integer> <character> <character> <character>
## ENSE00004248723 NA ENST00000832824.1 ENSG00000290825.2 HAVANA
## ENSE00004248721 NA ENST00000832825.1 ENSG00000290825.2 HAVANA
## ENSE00004248726 NA ENST00000832826.1 ENSG00000290825.2 HAVANA
## ENSE00004248710 NA ENST00000832827.1 ENSG00000290825.2 HAVANA
## ENSE00004248702 NA ENST00000832828.1 ENSG00000290825.2 HAVANA
## ENSE00004248719 NA ENST00000832829.1 ENSG00000290825.2 HAVANA
## level tags
## <integer> <character>
## ENSE00004248723 NA TAGENE
## ENSE00004248721 NA TAGENE
## ENSE00004248726 NA TAGENE
## ENSE00004248710 NA TAGENE
## ENSE00004248702 NA basic;TAGENE
## ENSE00004248719 NA TAGENE
# Exons for a specific gene
brca1_exons <- exons(gtf, filter = list(gene_id_stripped = "ENSG00000012048"))
# Exons grouped by transcript
exons_by_tx <- exonsBy(gtf, by = "tx")
length(exons_by_tx) # Number of transcripts## [1] 507365
# Look at one transcript's exons (sorted by exon_number)
exons_by_tx[[1]]## GRanges object with 6 ranges and 2 metadata columns:
## seqnames ranges strand | exon_number
## <Rle> <IRanges> <Rle> | <integer>
## ENSE00001872691 chr7 127588411-127588565 + | <NA>
## ENSE00003494180 chr7 127589083-127589163 + | <NA>
## ENSE00003504066 chr7 127589485-127589594 + | <NA>
## ENSE00003678978 chr7 127590066-127590137 + | <NA>
## ENSE00003676786 chr7 127590963-127591088 + | <NA>
## ENSE00000882271 chr7 127591213-127591700 + | <NA>
## gene_id_stripped
## <character>
## ENSE00001872691 ENSG00000004059
## ENSE00003494180 ENSG00000004059
## ENSE00003504066 ENSG00000004059
## ENSE00003678978 ENSG00000004059
## ENSE00003676786 ENSG00000004059
## ENSE00000882271 ENSG00000004059
## -------
## seqinfo: 25 sequences from GRCh38 genome; no seqlengths
# Exons grouped by gene
exons_by_gene <- exonsBy(gtf, by = "gene")CDS and Protein Information
## DataFrame with 6 rows and 8 columns
## transcript_id gene_id
## <character> <character>
## ENST00000641515.2:CDS:65565-65573 ENST00000641515.2 ENSG00000186092.7
## ENST00000641515.2:CDS:69037-70005 ENST00000641515.2 ENSG00000186092.7
## ENST00000426406.4:CDS:450743-451678 ENST00000426406.4 ENSG00000284733.2
## ENST00000332831.5:CDS:685719-686654 ENST00000332831.5 ENSG00000284662.2
## ENST00000968544.1:CDS:924432-924948 ENST00000968544.1 ENSG00000187634.14
## ENST00000616016.5:CDS:924432-924948 ENST00000616016.5 ENSG00000187634.14
## protein_id exon_number frame
## <character> <integer> <integer>
## ENST00000641515.2:CDS:65565-65573 ENSP00000493376.2 NA 0
## ENST00000641515.2:CDS:69037-70005 ENSP00000493376.2 NA 0
## ENST00000426406.4:CDS:450743-451678 ENSP00000409316.1 NA 0
## ENST00000332831.5:CDS:685719-686654 ENSP00000329982.2 NA 0
## ENST00000968544.1:CDS:924432-924948 ENSP00000638603.1 NA 0
## ENST00000616016.5:CDS:924432-924948 ENSP00000478421.2 NA 0
## source level
## <character> <integer>
## ENST00000641515.2:CDS:65565-65573 HAVANA NA
## ENST00000641515.2:CDS:69037-70005 HAVANA NA
## ENST00000426406.4:CDS:450743-451678 HAVANA NA
## ENST00000332831.5:CDS:685719-686654 HAVANA NA
## ENST00000968544.1:CDS:924432-924948 HAVANA NA
## ENST00000616016.5:CDS:924432-924948 HAVANA NA
## tags
## <character>
## ENST00000641515.2:CDS:65565-65573 RNA_Seq_supported_pa..
## ENST00000641515.2:CDS:69037-70005 RNA_Seq_supported_pa..
## ENST00000426406.4:CDS:450743-451678 basic;Ensembl_canoni..
## ENST00000332831.5:CDS:685719-686654 basic;Ensembl_canoni..
## ENST00000968544.1:CDS:924432-924948 basic;GENCODE_Primar..
## ENST00000616016.5:CDS:924432-924948 CAGE_supported_TSS;R..
# transcript_id gene_id protein_id exon_number frame source level tags
# CDS grouped by transcript
cds_by_tx <- cdsBy(gtf, by = "tx")
# Find all CDS for a specific protein
all_cds <- cds(gtf)
# protein_ids <- unique(mcols(all_cds)$protein_id)Transcripts by Gene
# Get transcripts grouped by gene
tx_by_gene <- transcriptsBy(gtf, by = "gene")
# How many transcripts per gene?
tx_counts <- lengths(tx_by_gene)
summary(tx_counts)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 6.448 6.000 522.000
## ENSG00000304176 ENSG00000304309 ENSG00000249859 ENSG00000227888 ENSG00000255717
## 522 448 426 411 363
## ENSG00000181722 ENSG00000291178 ENSG00000310527 ENSG00000223482 ENSG00000272168
## 360 353 331 327 326
## ENSG00000093167 ENSG00000215386 ENSG00000290549 ENSG00000228315 ENSG00000293616
## 321 317 310 289 289
## ENSG00000310526 ENSG00000231721 ENSG00000291158 ENSG00000148180 ENSG00000291154
## 285 284 276 274 270
Region Queries
# Find genes in a region
region <- GRanges("chr1", IRanges(1e6, 2e6))
genes_in_query <- genes_in_region(gtf, region)
# Find transcripts in a region
tx_in_query <- transcripts_in_region(gtf, region)
# Get promoter regions (now with gene_type!)
promoters <- promoters(genes(gtf), upstream = 2000, downstream = 200)
# promoters still has all mcols from genes
head(mcols(promoters)$gene_type)## [1] "lncRNA" "transcribed_unprocessed_pseudogene"
## [3] "lncRNA" "transcribed_unprocessed_pseudogene"
## [5] "miRNA" "lncRNA"
# Filter for protein-coding promoters
pc_promoters <- promoters[mcols(promoters)$gene_type == "protein_coding"]
head(pc_promoters)## GRanges object with 6 ranges and 6 metadata columns:
## seqnames ranges strand | gene_name gene_type
## <Rle> <IRanges> <Rle> | <character> <character>
## ENSG00000186092 chr1 63419-65618 + | OR4F5 protein_coding
## ENSG00000284733 chr1 451479-453678 - | OR4F29 protein_coding
## ENSG00000284662 chr1 686455-688654 - | OR4F16 protein_coding
## ENSG00000187634 chr1 921923-924122 + | SAMD11 protein_coding
## ENSG00000188976 chr1 960515-962714 - | NOC2L protein_coding
## ENSG00000187961 chr1 958576-960775 + | KLHL17 protein_coding
## source level tags havana_gene
## <character> <integer> <character> <character>
## ENSG00000186092 HAVANA <NA> <NA> OTTHUMG00000001094.4
## ENSG00000284733 HAVANA <NA> <NA> OTTHUMG00000002860.3
## ENSG00000284662 HAVANA <NA> <NA> OTTHUMG00000002581.3
## ENSG00000187634 HAVANA <NA> <NA> OTTHUMG00000040719.11
## ENSG00000188976 HAVANA <NA> <NA> OTTHUMG00000040720.2
## ENSG00000187961 HAVANA <NA> <NA> OTTHUMG00000040721.8
## -------
## seqinfo: 25 sequences from GRCh38 genome; no seqlengths
Performance Tips
Use filters to reduce data loaded genes(gtf, filter = list(chrom = “chr1”)) # Fast genes(gtf)[seqnames(genes(gtf)) == “chr1”] # Slower (loads all first)
Select only columns you need genes(gtf, columns = c(“gene_name”, “gene_type”))
Use versioned IDs when you need them genes(gtf, use_versioned_ids = TRUE) # ENSG00000290825.2 genes(gtf, use_versioned_ids = FALSE) # ENSG00000290825 (default)
For large operations, consider working with the Parquet directly
library(arrow)
genes_tbl <- open_dataset("parquet_output/genes.parquet")
genes_tbl |> filter(gene_type == "protein_coding") |> collect()
Session information
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.7.4
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] tools stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] lkparq_0.0.2 dplyr_1.2.0 arrow_23.0.1.1
## [4] GenomicRanges_1.62.1 Seqinfo_1.0.0 IRanges_2.44.0
## [7] S4Vectors_0.48.0 BiocGenerics_0.56.0 generics_0.1.4
## [10] BiocStyle_2.38.0
##
## loaded via a namespace (and not attached):
## [1] SummarizedExperiment_1.40.0 KEGGREST_1.50.0
## [3] rjson_0.2.23 xfun_0.56
## [5] bslib_0.10.0 htmlwidgets_1.6.4
## [7] lattice_0.22-9 Biobase_2.70.0
## [9] vctrs_0.7.1 bitops_1.0-9
## [11] parallel_4.5.2 curl_7.0.0
## [13] tibble_3.3.1 AnnotationDbi_1.72.0
## [15] RSQLite_2.4.6 blob_1.3.0
## [17] pkgconfig_2.0.3 Matrix_1.7-4
## [19] cigarillo_1.0.0 desc_1.4.3
## [21] assertthat_0.2.1 lifecycle_1.0.5
## [23] compiler_4.5.2 Rsamtools_2.26.0
## [25] textshaping_1.0.4 Biostrings_2.78.0
## [27] codetools_0.2-20 GenomeInfoDb_1.46.2
## [29] htmltools_0.5.9 sass_0.4.10
## [31] RCurl_1.98-1.17 yaml_2.3.12
## [33] pillar_1.11.1 pkgdown_2.2.0
## [35] crayon_1.5.3 jquerylib_0.1.4
## [37] BiocParallel_1.44.0 DelayedArray_0.36.0
## [39] cachem_1.1.0 abind_1.4-8
## [41] tidyselect_1.2.1 digest_0.6.39
## [43] purrr_1.2.1 restfulr_0.0.16
## [45] bookdown_0.46 grid_4.5.2
## [47] fastmap_1.2.0 SparseArray_1.10.8
## [49] cli_3.6.5 magrittr_2.0.4
## [51] S4Arrays_1.10.1 GenomicFeatures_1.62.0
## [53] XML_3.99-0.22 withr_3.0.2
## [55] UCSC.utils_1.6.1 bit64_4.6.0-1
## [57] rmarkdown_2.30 XVector_0.50.0
## [59] httr_1.4.8 matrixStats_1.5.0
## [61] bit_4.6.0 otel_0.2.0
## [63] ragg_1.5.0 png_0.1-8
## [65] memoise_2.0.1 evaluate_1.0.5
## [67] knitr_1.51 BiocIO_1.20.0
## [69] rtracklayer_1.70.1 rlang_1.1.7
## [71] glue_1.8.0 DBI_1.3.0
## [73] BiocManager_1.30.27 jsonlite_2.0.0
## [75] R6_2.6.1 MatrixGenerics_1.22.0
## [77] GenomicAlignments_1.46.0 systemfonts_1.3.1
## [79] fs_1.6.6