Introduction

bedbase.org assembles genomic data on many thousands of region-based scoring files. See the about page for bedbase.org for details.

This package explores the resource using Bioconductor/R. We’ll use interesting facilities of the hca package to explore JSON outputs of the bedbase API.

A basic query

We’ll find out about resources related to the GM12878 cell line.

Metadata acquisition

The get_bb_metadata function is very simple. We’ll learn how to vary the types of data retrieved later on. One could question whether ‘GM12878’ is a ‘cell type’ and we will examine this concept another time.

library(bedbaseRClient)
q1 = get_bb_metadata(query_type="cell_type", query_val="GM12878")
q1
## Response [http://bedbase.org/_private_api/query/bedfiles/other-%3E%3E%27cell_type%27%3D%25s?query_val=GM12878&columns=name&columns=other]
##   Date: 2021-06-09 16:02
##   Status: 200
##   Content-Type: application/json
##   Size: 741 kB
cc = httr::content(q1)
names(cc[[1]][[2]])
##  [1] "GSE"                 "bigbed"              "format"             
##  [4] "genome"              "tissue"              "antibody"           
##  [7] "cell_type"           "file_name"           "treatment"          
## [10] "yaml_file"           "data_source"         "description"        
## [13] "sample_name"         "exp_protocol"        "bedbase_config"     
## [16] "output_file_path"    "open_signal_matrix"  "pipeline_interfaces"

Metadata exploration

Illustration with known fields and values

The content returned by the API is complex. The hca package can help navigate.

library(hca)
lcc = lol(cc) # list of lists
lol_path(lcc)
## # A tibble: 24 x 3
##    path                      n is_leaf
##    <chr>                 <int> <lgl>  
##  1 [*]                     881 FALSE  
##  2 [*][*]                 1762 FALSE  
##  3 [*][*].antibody           3 TRUE   
##  4 [*][*].bedbase_config   881 TRUE   
##  5 [*][*].bigbed           881 TRUE   
##  6 [*][*].cell_type        881 TRUE   
##  7 [*][*].data_source      881 TRUE   
##  8 [*][*].description      881 TRUE   
##  9 [*][*].exp_protocol     881 TRUE   
## 10 [*][*].file_acc         878 TRUE   
## # … with 14 more rows

With lol_pull, we can extract the content present along a given structural path.

exps = lol_pull(lcc, "[*][*].exp_protocol")
table(exps)
## exps
##             CAGE          ChiPseq        DNase-seq Histone ChIP-seq 
##               12                3               16               52 
##  microRNA counts          RAMPAGE      TF ChIP-seq 
##                2                3              793

That’s informative. Let’s check how many targets are transcription factors. We’ll use the Lambert table in the TFutils package.

library(TFutils)
lam = retrieve_lambert_main()
targs = lol_pull(lcc, "[*][*].target")
length(intersect(targs, lam$Name))
## [1] 129

We can pivot to a different cell type as follows:

q2 = get_bb_metadata(query_type="cell_type", query_val="K562")
kk = httr::content(q2)
ktargs = lol_pull(lol(kk), "[*][*].target")
length(intersect(ktargs, lam$Name))
## [1] 296

Enumerating available experiments

We’ll use the API to acquire 50 metadata records. Cell type information is in an API component called other.

fixc = function(x) ifelse(is.null(x), NA_character_, x)
ctcheck = httr::GET("http://bedbase.org/api/bed/all/data?ids=md5sum&ids=other&limit=50")
ddl = lol(httr::content(ctcheck))
types = vapply(lol_pull(ddl, "data[*]")[seq(2,100,2)], function(x) x[["cell_type"]], character(1))
abs = vapply(lol_pull(ddl, "data[*]")[seq(2,100,2)], function(x) fixc(x[["antibody"]]), character(1))
targs = vapply(lol_pull(ddl, "data[*]")[seq(2,100,2)], function(x) fixc(x[["target"]]), character(1))
exps = vapply(lol_pull(ddl, "data[*]")[seq(2,100,2)], function(x) fixc(x[["exp_protocol"]]), character(1))

The md5sum component is used for direct file access.

mds = unlist(lol_pull(ddl, "data[*]")[seq(1,100,2)])
tab50 = data.frame(celltype=types, expt=exps, Ab=abs, targ=targs, md5=mds)
DT::datatable(tab50)

A targeted query

sel = tab50 |> filter(expt=="ChiPseq", 
    celltype=="GM12878", Ab=="IKZF1") |> 
    select(md5) 
query_bb(sel[1,1], which=GenomicRanges::GRanges("chr17:38000000-39000000"))
## GRanges object with 36 ranges and 6 metadata columns:
##        seqnames            ranges strand |        name     score signalValue
##           <Rle>         <IRanges>  <Rle> | <character> <integer>   <numeric>
##    [1]    chr17 38005205-38005573      * |           .       986    105.7468
##    [2]    chr17 38012382-38012887      * |           .      1000     95.1338
##    [3]    chr17 38012382-38012887      * |           .      1000    364.9017
##    [4]    chr17 38026670-38027306      * |           .       608     43.4833
##    [5]    chr17 38027916-38028696      * |           .      1000    199.5565
##    ...      ...               ...    ... .         ...       ...         ...
##   [32]    chr17 38867635-38868271      * |           .       538     54.7645
##   [33]    chr17 38873198-38873637      * |           .      1000    370.8526
##   [34]    chr17 38991100-38991632      * |           .      1000     23.2802
##   [35]    chr17 38991100-38991632      * |           .      1000    257.4294
##   [36]    chr17 38992296-38992932      * |           .       543     72.8908
##           pValue    qValue      peak
##        <numeric> <numeric> <integer>
##    [1]        -1   3.98052       164
##    [2]        -1   3.92232       458
##    [3]        -1   4.66339       255
##    [4]        -1   2.65281       318
##    [5]        -1   4.66339       502
##    ...       ...       ...       ...
##   [32]        -1   3.28290       318
##   [33]        -1   4.66339       215
##   [34]        -1   1.13672       474
##   [35]        -1   4.66339       275
##   [36]        -1   3.72042       318
##   -------
##   seqinfo: 34 sequences from an unspecified genome