Introduction

This vignette works through material from the “Adding data” tutorial document. Our objective is to use R and Python together, with basilisk managing the Python infrastructure.

We will simulate records with schematized information about 10 proteins and then use write_nodes to generate a CSV file.

The primary interface

loadBiocypher connects the Biocypher modules to R via basilisk and reticulate. A completely isolated miniconda environment, currently using Python 3.9, manages all the Python code.

library(biocBiocypher)
# Obtain the basilisk-backed references to the Python modules.
bcobj <- loadBiocypher()
bcobj
## biocypher_refs produced with basilisk.
##  use $biocypher_ref for modules, $generator_ref for simulator

The data generator

# The simulator is exposed through the generator_ref element.
gen <- bcobj$generator_ref
# List the symbols the simulator provides.
names(gen)
##  [1] "BioCypher"                    "Complex"                     
##  [3] "EntrezProtein"                "Interaction"                 
##  [5] "InteractionGenerator"         "Node"                        
##  [7] "node_generator"               "Protein"                     
##  [9] "ProteinProteinInteraction"    "r"                           
## [11] "random"                       "RandomPropertyProtein"       
## [13] "RandomPropertyProteinIsoform" "string"

The following R code generates records on 10 proteins:

# Simulate ten protein records by calling gen$Protein() once per record.
prots <- lapply(seq_len(10), function(i) gen$Protein())
# Attributes exposed by the first simulated protein.
names(prots[[1]])
## [1] "get_id"         "get_label"      "get_properties" "id"            
## [5] "label"          "properties"
prots[[1]]$properties
## $sequence
## [1] "KVEKNWTWSDWTEVHHAVIGWDVYHDVPVNKEGPHDKVFTANLNLKRSNSCGSKLQACMQQMCTLKQFK"
## 
## $description
## [1] "m o p f z r u r k n"
## 
## $taxon
## [1] "9606"

However, this list is not known to the Python main module (__main__). To define the proteins there, we need to use py_run_string:

# Run Python code so that `proteins` is defined in Python's main module.
reticulate::py_run_string("proteins = [Protein() for _ in range(10)]")
names(reticulate::py)  # symbols known to main
##  [1] "BioCypher"                    "Complex"                     
##  [3] "EntrezProtein"                "Interaction"                 
##  [5] "InteractionGenerator"         "Node"                        
##  [7] "node_generator"               "Protein"                     
##  [9] "ProteinProteinInteraction"    "proteins"                    
## [11] "r"                            "random"                      
## [13] "RandomPropertyProtein"        "RandomPropertyProteinIsoform"
## [15] "string"

Producing the graph nodes

Several configuration files are defined for this specific tutorial.

# Locate the two configuration files in the "tutorial_0.5.11" directory of
# the installed biocBiocypher package.
bc_config_path <- system.file(
  "tutorial_0.5.11", "01_biocypher_config.yaml",
  package = "biocBiocypher"
)
schema_config_path <- system.file(
  "tutorial_0.5.11", "01_schema_config.yaml",
  package = "biocBiocypher"
)
# Show the contents of the schema configuration.
readLines(schema_config_path)
## [1] "protein:"                         "    represented_as: node"        
## [3] "    preferred_id: uniprot"        "    input_label: uniprot_protein"

These configurations are loaded into the main interface:

# Create a BioCypher instance configured with the two files located above.
bc <- bcobj$biocypher_ref
bc_configd <- bc$BioCypher(
  biocypher_config_path = bc_config_path,
  schema_config_path = schema_config_path
)

The node_generator function was written to use a globally defined variable, proteins, which was defined above with py_run_string.

# Write the nodes; node_generator() relies on the Python global `proteins`
# defined earlier with py_run_string().
bc_configd$write_nodes(gen$node_generator())
## [1] TRUE
# Locate the output written by write_nodes(). "biocypher-out" may hold more
# than one entry (e.g. from earlier runs), so take the most recently modified
# one instead of the alphabetically first; with a single entry this is the
# same choice.
out_dirs <- dir("biocypher-out", full.names = TRUE)
stopifnot(length(out_dirs) > 0)
o1 <- out_dirs[which.max(file.mtime(out_dirs))]
# Data rows live in the file whose name matches "part"; the column names are
# the semicolon-separated fields of the file whose name matches "head".
fi <- dir(o1, full.names = TRUE, pattern = "part")
he <- strsplit(
  readLines(dir(o1, full.names = TRUE, pattern = "head"), warn = FALSE),
  ";"
)[[1]]
dat <- read.delim(fi, sep = ";", header = FALSE)
names(dat) <- he
library(DT)
datatable(dat)
# Capture what summary() prints from Python and echo it from R.
cat(reticulate::py_capture_output(bc_configd$summary()))
## Showing ontology structure based on https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
## entity
## └── named thing
##     └── biological entity
##         └── polypeptide
##             └── protein
## 
## 
## INFO -- No duplicate nodes in input.
## INFO -- No duplicate edges in input.
## INFO -- No missing labels in input.