lkparq: claude-based parquet transformation of TxDb from Gencode v49

Introduction

This is an experiment to evaluate an approach to TxDb production that is decoupled from AnnotationDbi and is based on serializations of information in Gencode GTFs to Parquet.

The code that transformed Gencode GTF to parquet is in inst/python in the sources of this package. The code was generated by prompting Anthropic Claude Opus 4.5 in March of 2026.

library(GenomicRanges)
library(arrow)
library(dplyr)
library(lkparq)
parqloc = system.file("gc49", package="lkparq")

gtf <- GTFParquet(parqloc)
print(gtf)

## GTFParquet object
##   Path: /private/var/folders/yw/gfhgh7k565v9w83x_k764wbc0000gp/T/RtmplBznwD/temp_libpath82ef7fcad40d/lkparq/gc49 
##   Genome: GRCh38 
##   Available tables:
##     [x] genes
##     [x] transcripts
##     [x] exons
##     [x] cds
##     [x] features
##     [x] metadata
##   Metadata:
##     description: evidence-based annotation of the human genome (GRCh38), version 49 (Ensembl 115)
##     provider: GENCODE
##     contact: gencode-help@ebi.ac.uk
##     format: gtf
##     date: 2025-07-08
##     genome: GRCh38

# View GTF metadata (version, date, provider)
gtf_metadata(gtf)

##                                                                        description 
## "evidence-based annotation of the human genome (GRCh38), version 49 (Ensembl 115)" 
##                                                                           provider 
##                                                                          "GENCODE" 
##                                                                            contact 
##                                                           "gencode-help@ebi.ac.uk" 
##                                                                             format 
##                                                                              "gtf" 
##                                                                               date 
##                                                                       "2025-07-08" 
##                                                                             genome 
##                                                                           "GRCh38"

Rich Gene Queries

Get ALL genes with FULL attributes

gr_genes <- genes(gtf)
head(gr_genes)

## GRanges object with 6 ranges and 6 metadata columns:
##                   seqnames      ranges strand |   gene_name
##                      <Rle>   <IRanges>  <Rle> | <character>
##   ENSG00000290825     chr1 11121-24894      + |    DDX11L16
##   ENSG00000223972     chr1 12010-13670      + |     DDX11L1
##   ENSG00000310526     chr1 14356-30744      - |      WASH7P
##   ENSG00000227232     chr1 14696-24886      - |      WASH7P
##   ENSG00000278267     chr1 17369-17436      - |   MIR6859-1
##   ENSG00000243485     chr1 28589-31109      + | MIR1302-2HG
##                                gene_type      source     level
##                              <character> <character> <integer>
##   ENSG00000290825                 lncRNA      HAVANA      <NA>
##   ENSG00000223972 transcribed_unproces..      HAVANA      <NA>
##   ENSG00000310526                 lncRNA      HAVANA      <NA>
##   ENSG00000227232 transcribed_unproces..      HAVANA      <NA>
##   ENSG00000278267                  miRNA     ENSEMBL      <NA>
##   ENSG00000243485                 lncRNA      HAVANA      <NA>
##                                     tags          havana_gene
##                              <character>          <character>
##   ENSG00000290825    overlaps_pseudogene                 <NA>
##   ENSG00000223972                   <NA> OTTHUMG00000000961.2
##   ENSG00000310526 ncRNA_host;overlappi..                 <NA>
##   ENSG00000227232      overlapping_locus OTTHUMG00000000958.1
##   ENSG00000278267                   <NA>                 <NA>
##   ENSG00000243485             ncRNA_host OTTHUMG00000000959.2
##   -------
##   seqinfo: 25 sequences from GRCh38 genome; no seqlengths

Filter by gene_type

protein_coding <- genes(gtf, filter = list(gene_type = "protein_coding"))
length(protein_coding)

## [1] 20097

lncRNAs <- genes(gtf, filter = list(gene_type = "lncRNA"))
length(lncRNAs)

## [1] 34880

# Convenience functions
pc_genes <- protein_coding_genes(gtf)
lnc_genes <- lncRNA_genes(gtf)

# See what gene types are available
gene_types(gtf)

## 
##                           artifact                          IG_C_gene 
##                                 19                                 14 
##                    IG_C_pseudogene                          IG_D_gene 
##                                  9                                 37 
##                          IG_J_gene                    IG_J_pseudogene 
##                                 18                                  3 
##                      IG_pseudogene                          IG_V_gene 
##                                  1                                146 
##                    IG_V_pseudogene                             lncRNA 
##                                187                              34880 
##                              miRNA                           misc_RNA 
##                               1879                               2207 
##                            Mt_rRNA                            Mt_tRNA 
##                                  2                                 22 
##               processed_pseudogene                     protein_coding 
##                               9487                              20097 
##                           ribozyme                               rRNA 
##                                  8                                 47 
##                    rRNA_pseudogene                             scaRNA 
##                                497                                 49 
##                             snoRNA                              snRNA 
##                                942                               1901 
##                               sRNA                                TEC 
##                                  5                               1019 
##                          TR_C_gene                          TR_D_gene 
##                                  6                                  5 
##                          TR_J_gene                    TR_J_pseudogene 
##                                 79                                  4 
##                          TR_V_gene                    TR_V_pseudogene 
##                                107                                 33 
##   transcribed_processed_pseudogene     transcribed_unitary_pseudogene 
##                               1149                                201 
## transcribed_unprocessed_pseudogene    translated_processed_pseudogene 
##                               1587                                  2 
##                 unitary_pseudogene             unprocessed_pseudogene 
##                                 89                               1949 
##                          vault_RNA 
##                                  4

#    lncRNA      miRNA     protein_coding    pseudogene ...

Filter by annotation confidence level

Level 1: verified loci (experimental evidence)
Level 2: manually annotated loci
Level 3: automatically annotated loci

verified_genes <- genes(gtf, filter = list(level = 1))

# Filter by source (HAVANA = manual, ENSEMBL = automatic)
manual_genes <- genes(gtf, filter = list(source = "HAVANA"))

# Filter by tags
# Common tags: basic, CCDS, overlaps_pseudogene, etc.
# Note: tags is a semicolon-separated string, so use grep for complex filters
all_genes <- genes(gtf)
basic_genes <- all_genes[grep("basic", mcols(all_genes)$tags)]

# Combine filters
chr1_protein_coding <- genes(gtf, filter = list(
  chrom = "chr1",
  gene_type = "protein_coding"
))

Rich Transcript Queries

# Transcripts include transcript_type, transcript_name, support level, etc.
tx <- transcripts(gtf)
head(mcols(tx))

## DataFrame with 6 rows and 11 columns
##                 transcript_name transcript_type           gene_id   gene_name
##                     <character>     <character>       <character> <character>
## ENST00000832824    DDX11L16-260          lncRNA ENSG00000290825.2    DDX11L16
## ENST00000832825    DDX11L16-261          lncRNA ENSG00000290825.2    DDX11L16
## ENST00000832826    DDX11L16-262          lncRNA ENSG00000290825.2    DDX11L16
## ENST00000832827    DDX11L16-263          lncRNA ENSG00000290825.2    DDX11L16
## ENST00000832828    DDX11L16-264          lncRNA ENSG00000290825.2    DDX11L16
## ENST00000832829    DDX11L16-265          lncRNA ENSG00000290825.2    DDX11L16
##                      source     level         tags transcript_support_level
##                 <character> <integer>  <character>              <character>
## ENST00000832824      HAVANA        NA       TAGENE                       NA
## ENST00000832825      HAVANA        NA       TAGENE                       NA
## ENST00000832826      HAVANA        NA       TAGENE                       NA
## ENST00000832827      HAVANA        NA       TAGENE                       NA
## ENST00000832828      HAVANA        NA basic;TAGENE                       NA
## ENST00000832829      HAVANA        NA       TAGENE                       NA
##                 havana_transcript      ccdsid  protein_id
##                       <character> <character> <character>
## ENST00000832824                NA          NA          NA
## ENST00000832825                NA          NA          NA
## ENST00000832826                NA          NA          NA
## ENST00000832827                NA          NA          NA
## ENST00000832828                NA          NA          NA
## ENST00000832829                NA          NA          NA

#   transcript_name transcript_type gene_id gene_name source level tags
#   transcript_support_level havana_transcript ccdsid protein_id

# See transcript types
transcript_types(gtf)

## 
##                           artifact                          IG_C_gene 
##                                 19                                 23 
##                    IG_C_pseudogene                          IG_D_gene 
##                                  9                                 37 
##                          IG_J_gene                    IG_J_pseudogene 
##                                 18                                  3 
##                      IG_pseudogene                          IG_V_gene 
##                                  1                                146 
##                    IG_V_pseudogene                             lncRNA 
##                                187                             189152 
##                              miRNA                           misc_RNA 
##                               1879                               2207 
##                            Mt_rRNA                            Mt_tRNA 
##                                  2                                 22 
##                     non_stop_decay            nonsense_mediated_decay 
##                                105                              21949 
##               processed_pseudogene               processed_transcript 
##                               9488                                 12 
##                     protein_coding     protein_coding_CDS_not_defined 
##                             211446                              26573 
##                 protein_coding_LoF                    retained_intron 
##                                 74                              34239 
##                           ribozyme                               rRNA 
##                                  8                                 47 
##                    rRNA_pseudogene                             scaRNA 
##                                497                                 49 
##                             snoRNA                              snRNA 
##                                942                               1901 
##                               sRNA                                TEC 
##                                  5                               1108 
##                          TR_C_gene                          TR_D_gene 
##                                  6                                  5 
##                          TR_J_gene                    TR_J_pseudogene 
##                                 79                                  4 
##                          TR_V_gene                    TR_V_pseudogene 
##                                107                                 33 
##   transcribed_processed_pseudogene     transcribed_unitary_pseudogene 
##                               1149                                201 
## transcribed_unprocessed_pseudogene    translated_processed_pseudogene 
##                               1589                                  2 
##                 unitary_pseudogene             unprocessed_pseudogene 
##                                 89                               1949 
##                          vault_RNA 
##                                  4

# Filter by transcript type
coding_tx <- transcripts(gtf, filter = list(transcript_type = "protein_coding"))

# Filter by transcript support level (TSL)
# TSL1 = all splice junctions supported by RNA-seq
# TSL2 = best supporting transcript is flagged as suspect
# TSL3 = not all splice junctions supported
# TSL4/5 = poor support
high_confidence_tx <- transcripts(gtf, filter = list(transcript_support_level = "1"))

# Find transcripts with CCDS IDs (highly curated)
all_tx <- transcripts(gtf)
ccds_tx <- all_tx[!is.na(mcols(all_tx)$ccdsid)]
length(ccds_tx)

## [1] 95584

Exon Queries with Context

# Exons include exon_number, transcript_id, gene_id
ex <- exons(gtf)
head(mcols(ex))

## DataFrame with 6 rows and 6 columns
##                 exon_number     transcript_id           gene_id      source
##                   <integer>       <character>       <character> <character>
## ENSE00004248723          NA ENST00000832824.1 ENSG00000290825.2      HAVANA
## ENSE00004248721          NA ENST00000832825.1 ENSG00000290825.2      HAVANA
## ENSE00004248726          NA ENST00000832826.1 ENSG00000290825.2      HAVANA
## ENSE00004248710          NA ENST00000832827.1 ENSG00000290825.2      HAVANA
## ENSE00004248702          NA ENST00000832828.1 ENSG00000290825.2      HAVANA
## ENSE00004248719          NA ENST00000832829.1 ENSG00000290825.2      HAVANA
##                     level         tags
##                 <integer>  <character>
## ENSE00004248723        NA       TAGENE
## ENSE00004248721        NA       TAGENE
## ENSE00004248726        NA       TAGENE
## ENSE00004248710        NA       TAGENE
## ENSE00004248702        NA basic;TAGENE
## ENSE00004248719        NA       TAGENE

# Exons for a specific gene
brca1_exons <- exons(gtf, filter = list(gene_id_stripped = "ENSG00000012048"))

# Exons grouped by transcript
exons_by_tx <- exonsBy(gtf, by = "tx")
length(exons_by_tx)  # Number of transcripts

## [1] 507365

# Look at one transcript's exons (sorted by exon_number)
exons_by_tx[[1]]

## GRanges object with 6 ranges and 2 metadata columns:
##                   seqnames              ranges strand | exon_number
##                      <Rle>           <IRanges>  <Rle> |   <integer>
##   ENSE00001872691     chr7 127588411-127588565      + |        <NA>
##   ENSE00003494180     chr7 127589083-127589163      + |        <NA>
##   ENSE00003504066     chr7 127589485-127589594      + |        <NA>
##   ENSE00003678978     chr7 127590066-127590137      + |        <NA>
##   ENSE00003676786     chr7 127590963-127591088      + |        <NA>
##   ENSE00000882271     chr7 127591213-127591700      + |        <NA>
##                   gene_id_stripped
##                        <character>
##   ENSE00001872691  ENSG00000004059
##   ENSE00003494180  ENSG00000004059
##   ENSE00003504066  ENSG00000004059
##   ENSE00003678978  ENSG00000004059
##   ENSE00003676786  ENSG00000004059
##   ENSE00000882271  ENSG00000004059
##   -------
##   seqinfo: 25 sequences from GRCh38 genome; no seqlengths

# Exons grouped by gene
exons_by_gene <- exonsBy(gtf, by = "gene")

CDS and Protein Information

# CDS includes protein_id and frame
cds_regions <- cds(gtf)
head(mcols(cds_regions))

## DataFrame with 6 rows and 8 columns
##                                         transcript_id            gene_id
##                                           <character>        <character>
## ENST00000641515.2:CDS:65565-65573   ENST00000641515.2  ENSG00000186092.7
## ENST00000641515.2:CDS:69037-70005   ENST00000641515.2  ENSG00000186092.7
## ENST00000426406.4:CDS:450743-451678 ENST00000426406.4  ENSG00000284733.2
## ENST00000332831.5:CDS:685719-686654 ENST00000332831.5  ENSG00000284662.2
## ENST00000968544.1:CDS:924432-924948 ENST00000968544.1 ENSG00000187634.14
## ENST00000616016.5:CDS:924432-924948 ENST00000616016.5 ENSG00000187634.14
##                                            protein_id exon_number     frame
##                                           <character>   <integer> <integer>
## ENST00000641515.2:CDS:65565-65573   ENSP00000493376.2          NA         0
## ENST00000641515.2:CDS:69037-70005   ENSP00000493376.2          NA         0
## ENST00000426406.4:CDS:450743-451678 ENSP00000409316.1          NA         0
## ENST00000332831.5:CDS:685719-686654 ENSP00000329982.2          NA         0
## ENST00000968544.1:CDS:924432-924948 ENSP00000638603.1          NA         0
## ENST00000616016.5:CDS:924432-924948 ENSP00000478421.2          NA         0
##                                          source     level
##                                     <character> <integer>
## ENST00000641515.2:CDS:65565-65573        HAVANA        NA
## ENST00000641515.2:CDS:69037-70005        HAVANA        NA
## ENST00000426406.4:CDS:450743-451678      HAVANA        NA
## ENST00000332831.5:CDS:685719-686654      HAVANA        NA
## ENST00000968544.1:CDS:924432-924948      HAVANA        NA
## ENST00000616016.5:CDS:924432-924948      HAVANA        NA
##                                                       tags
##                                                <character>
## ENST00000641515.2:CDS:65565-65573   RNA_Seq_supported_pa..
## ENST00000641515.2:CDS:69037-70005   RNA_Seq_supported_pa..
## ENST00000426406.4:CDS:450743-451678 basic;Ensembl_canoni..
## ENST00000332831.5:CDS:685719-686654 basic;Ensembl_canoni..
## ENST00000968544.1:CDS:924432-924948 basic;GENCODE_Primar..
## ENST00000616016.5:CDS:924432-924948 CAGE_supported_TSS;R..

#   transcript_id gene_id protein_id exon_number frame source level tags

# CDS grouped by transcript
cds_by_tx <- cdsBy(gtf, by = "tx")

# Find all CDS for a specific protein
all_cds <- cds(gtf)
# protein_ids <- unique(mcols(all_cds)$protein_id)

UTRs and Codons

# 5' UTRs
utr5 <- utrs(gtf, type = "5prime")

# 3' UTRs
utr3 <- utrs(gtf, type = "3prime")

# Start codons
start_codons <- codons(gtf, type = "start")

# Stop codons
stop_codons <- codons(gtf, type = "stop")

Transcripts by Gene

# Get transcripts grouped by gene
tx_by_gene <- transcriptsBy(gtf, by = "gene")

# How many transcripts per gene?
tx_counts <- lengths(tx_by_gene)
summary(tx_counts)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   6.448   6.000 522.000

# Genes with most isoforms
head(sort(tx_counts, decreasing = TRUE), 20)

## ENSG00000304176 ENSG00000304309 ENSG00000249859 ENSG00000227888 ENSG00000255717 
##             522             448             426             411             363 
## ENSG00000181722 ENSG00000291178 ENSG00000310527 ENSG00000223482 ENSG00000272168 
##             360             353             331             327             326 
## ENSG00000093167 ENSG00000215386 ENSG00000290549 ENSG00000228315 ENSG00000293616 
##             321             317             310             289             289 
## ENSG00000310526 ENSG00000231721 ENSG00000291158 ENSG00000148180 ENSG00000291154 
##             285             284             276             274             270

Region Queries

# Find genes in a region
region <- GRanges("chr1", IRanges(1e6, 2e6))
genes_in_query <- genes_in_region(gtf, region)

# Find transcripts in a region
tx_in_query <- transcripts_in_region(gtf, region)

# Get promoter regions (now with gene_type!)
promoters <- promoters(genes(gtf), upstream = 2000, downstream = 200)
# promoters still has all mcols from genes
head(mcols(promoters)$gene_type)

## [1] "lncRNA"                             "transcribed_unprocessed_pseudogene"
## [3] "lncRNA"                             "transcribed_unprocessed_pseudogene"
## [5] "miRNA"                              "lncRNA"

# Filter for protein-coding promoters
pc_promoters <- promoters[mcols(promoters)$gene_type == "protein_coding"]
head(pc_promoters)

## GRanges object with 6 ranges and 6 metadata columns:
##                   seqnames        ranges strand |   gene_name      gene_type
##                      <Rle>     <IRanges>  <Rle> | <character>    <character>
##   ENSG00000186092     chr1   63419-65618      + |       OR4F5 protein_coding
##   ENSG00000284733     chr1 451479-453678      - |      OR4F29 protein_coding
##   ENSG00000284662     chr1 686455-688654      - |      OR4F16 protein_coding
##   ENSG00000187634     chr1 921923-924122      + |      SAMD11 protein_coding
##   ENSG00000188976     chr1 960515-962714      - |       NOC2L protein_coding
##   ENSG00000187961     chr1 958576-960775      + |      KLHL17 protein_coding
##                        source     level        tags           havana_gene
##                   <character> <integer> <character>           <character>
##   ENSG00000186092      HAVANA      <NA>        <NA>  OTTHUMG00000001094.4
##   ENSG00000284733      HAVANA      <NA>        <NA>  OTTHUMG00000002860.3
##   ENSG00000284662      HAVANA      <NA>        <NA>  OTTHUMG00000002581.3
##   ENSG00000187634      HAVANA      <NA>        <NA> OTTHUMG00000040719.11
##   ENSG00000188976      HAVANA      <NA>        <NA>  OTTHUMG00000040720.2
##   ENSG00000187961      HAVANA      <NA>        <NA>  OTTHUMG00000040721.8
##   -------
##   seqinfo: 25 sequences from GRCh38 genome; no seqlengths

Performance Tips

Use filters to reduce data loaded genes(gtf, filter = list(chrom = “chr1”)) # Fast genes(gtf)[seqnames(genes(gtf)) == “chr1”] # Slower (loads all first)
Select only columns you need genes(gtf, columns = c(“gene_name”, “gene_type”))
Use versioned IDs when you need them genes(gtf, use_versioned_ids = TRUE) # ENSG00000290825.2 genes(gtf, use_versioned_ids = FALSE) # ENSG00000290825 (default)
For large operations, consider working with the Parquet directly

    library(arrow)
    genes_tbl <- open_dataset("parquet_output/genes.parquet")
    genes_tbl |> filter(gene_type == "protein_coding") |> collect()

Session information

sessionInfo()

## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.7.4
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] tools     stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] lkparq_0.0.2         dplyr_1.2.0          arrow_23.0.1.1      
##  [4] GenomicRanges_1.62.1 Seqinfo_1.0.0        IRanges_2.44.0      
##  [7] S4Vectors_0.48.0     BiocGenerics_0.56.0  generics_0.1.4      
## [10] BiocStyle_2.38.0    
## 
## loaded via a namespace (and not attached):
##  [1] SummarizedExperiment_1.40.0 KEGGREST_1.50.0            
##  [3] rjson_0.2.23                xfun_0.56                  
##  [5] bslib_0.10.0                htmlwidgets_1.6.4          
##  [7] lattice_0.22-9              Biobase_2.70.0             
##  [9] vctrs_0.7.1                 bitops_1.0-9               
## [11] parallel_4.5.2              curl_7.0.0                 
## [13] tibble_3.3.1                AnnotationDbi_1.72.0       
## [15] RSQLite_2.4.6               blob_1.3.0                 
## [17] pkgconfig_2.0.3             Matrix_1.7-4               
## [19] cigarillo_1.0.0             desc_1.4.3                 
## [21] assertthat_0.2.1            lifecycle_1.0.5            
## [23] compiler_4.5.2              Rsamtools_2.26.0           
## [25] textshaping_1.0.4           Biostrings_2.78.0          
## [27] codetools_0.2-20            GenomeInfoDb_1.46.2        
## [29] htmltools_0.5.9             sass_0.4.10                
## [31] RCurl_1.98-1.17             yaml_2.3.12                
## [33] pillar_1.11.1               pkgdown_2.2.0              
## [35] crayon_1.5.3                jquerylib_0.1.4            
## [37] BiocParallel_1.44.0         DelayedArray_0.36.0        
## [39] cachem_1.1.0                abind_1.4-8                
## [41] tidyselect_1.2.1            digest_0.6.39              
## [43] purrr_1.2.1                 restfulr_0.0.16            
## [45] bookdown_0.46               grid_4.5.2                 
## [47] fastmap_1.2.0               SparseArray_1.10.8         
## [49] cli_3.6.5                   magrittr_2.0.4             
## [51] S4Arrays_1.10.1             GenomicFeatures_1.62.0     
## [53] XML_3.99-0.22               withr_3.0.2                
## [55] UCSC.utils_1.6.1            bit64_4.6.0-1              
## [57] rmarkdown_2.30              XVector_0.50.0             
## [59] httr_1.4.8                  matrixStats_1.5.0          
## [61] bit_4.6.0                   otel_0.2.0                 
## [63] ragg_1.5.0                  png_0.1-8                  
## [65] memoise_2.0.1               evaluate_1.0.5             
## [67] knitr_1.51                  BiocIO_1.20.0              
## [69] rtracklayer_1.70.1          rlang_1.1.7                
## [71] glue_1.8.0                  DBI_1.3.0                  
## [73] BiocManager_1.30.27         jsonlite_2.0.0             
## [75] R6_2.6.1                    MatrixGenerics_1.22.0      
## [77] GenomicAlignments_1.46.0    systemfonts_1.3.1          
## [79] fs_1.6.6

Vincent J. Carey, stvjc at channing.harvard.edu

March 13, 2026