PubChem API in R#

by Vishank Patel and Adam M. Nguyen

Documentation:

PubChem API Documentation: https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access

These recipe examples were tested on March 24, 2023.

Attribution: This tutorial was adapted from supporting information in:

Scalfani, V. F.; Ralph, S. C.; Alshaikh, A. A.; Bara, J. E. Programmatic Compilation of Chemical Data and Literature From PubChem Using Matlab. Chemical Engineering Education, 2020, 54, 230. https://doi.org/10.18260/2-1-370.660-115508 (and vfscalfani/MATLAB-cheminformatics)

Tutorial License: This tutorial uses the knitr and imager R libraries, which are licensed as GPL-3 and LGPL-3, respectively. As a result, this tutorial code is licensed as GPL-3: UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook

Setup#

Import the necessary libraries and set up the base API URL:

library(tidyverse)  #core tidyverse packages (dplyr, purrr, tibble, ...)
library(dplyr)      #data frame manipulation verbs
library(purrr)      #functional programming tools (map() family)
library(httr)       #GET() API requests
library(jsonlite)   #parsing and generating JSON
library(knitr)      #including graphics
library(imager)     #including images
library(magick)     #Image manipulation

# Base URL for the PubChem PUG REST compound endpoints; every request below
# is built by appending a path such as "cid/<CID>/property/<name>/TXT" to it
api <- 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/'

1. PubChem Similarity#

Search for chemical structures in PubChem via a Fingerprint Tanimoto Similarity Search.

Get compound image#

# CID of the query compound used throughout this section
compoundID <- "2734162"

# Address of the compound's PNG depiction: <api>cid/<CID>/PNG
CID_URL <- sprintf("%scid/%s/PNG", api, compoundID)

# Embed the image at that address in the rendered document
include_graphics(CID_URL)

Replace the above CID value (`compoundID`) with a different CID to customize.

Retrieve InChI and SMILES#

Retrieve InChI

# Request the InChI of the query compound as plain text (TXT output)
inchi_url <- paste0(api, "cid/", compoundID, "/property/inchi/TXT")

inchi_response <- GET(inchi_url)
# Stop on an HTTP error (e.g. unknown CID, server busy) instead of silently
# storing the error message body as if it were an InChI string
stop_for_status(inchi_response)

# "$content" holds the raw bytes of the response body, without the HTTP
# metadata; rawToChar() turns those bytes into a character string
raw_inchi <- rawToChar(inchi_response$content)
# Strip the newline characters from the text response
inchi <- gsub("\n", "", raw_inchi, fixed = TRUE)
inchi
## [1] "InChI=1S/C8H15N2/c1-3-4-5-10-7-6-9(2)8-10/h6-8H,3-5H2,1-2H3/q+1"

Retrieve Isomeric SMILES

# Request the isomeric SMILES of the query compound as plain text (TXT output)
IS_url <- paste0(api, "cid/", compoundID, "/property/IsomericSMILES/TXT")

IS_response <- GET(IS_url)
# Stop on an HTTP error instead of silently storing the error message body
# as if it were a SMILES string
stop_for_status(IS_response)

# Convert the raw response body to text and strip the newline characters
raw_IS <- rawToChar(IS_response$content)
IS <- gsub("\n", "", raw_IS, fixed = TRUE)
IS
## [1] "CCCCN1C=C[N+](=C1)C"

Retrieve Identifier and Property Data#

Create an identifier/property dataset from Similarity Search results.

Retrieve the following data from CID hit results: InChI, Isomeric SMILES, MW, Heavy Atom Count, Rotatable Bond Count, and Charge

# Take at most the first 25 CIDs from the similarity search results.
# head() is used instead of [1:25] so that a result set with fewer than 25
# hits is not padded with NA values (which would be requested as CID "NA").
# NOTE(review): CIDs1_df is not defined in this section; it is presumably
# created by the similarity search step -- confirm.
short_CIDs <- head(CIDs1_df$CID, 25)

# Base URL for the PubChem compound endpoints (defined once, outside the loop)
api <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/"

# PubChem property to request for each output column (name = column name)
property_names <- c(
  InChi = "InChI",
  IsoSMI = "IsomericSMILES",
  MW = "MolecularWeight",
  Heavy_Atom_Count = "HeavyAtomCount",
  Rotatable_Bond_Count = "RotatableBondCount",
  Charge = "Charge"
)

# Download a single property of a single compound as a character string.
# Returns NA (with a warning) when PubChem answers with an HTTP error, so one
# failed request neither aborts the run nor stores error text as data.
fetch_property <- function(cid, property) {
  property_url <- paste0(
    api, "cid/", toString(cid), "/property/", property, "/TXT"
  )
  response <- GET(property_url)
  Sys.sleep(1)  # adding a delay for the PubChem server
  if (http_error(response)) {
    warning(
      "PubChem request failed for CID ", toString(cid), " (", property,
      "): HTTP ", status_code(response),
      call. = FALSE
    )
    return(NA_character_)
  }
  # Response body is raw bytes; convert to text and strip newline characters
  gsub("\n", "", rawToChar(response$content), fixed = TRUE)
}

# Empty, typed template so the result always has all seven character columns,
# even when there are no CIDs to look up
similarity_results_tibble <- tibble(
  Compound_ID = character(),
  InChi = character(),
  IsoSMI = character(),
  MW = character(),
  Heavy_Atom_Count = character(),
  Rotatable_Bond_Count = character(),
  Charge = character()
)

# Build one single-row tibble per CID, then bind them all at once instead of
# growing the tibble row by row inside a loop
result_rows <- map(short_CIDs, function(CID) {
  property_values <- map_chr(
    property_names,
    function(property) fetch_property(CID, property)
  )
  as_tibble_row(c(Compound_ID = toString(CID), property_values))
})

similarity_results_tibble <- bind_rows(similarity_results_tibble, result_rows)

similarity_results_tibble
## # A tibble: 25 × 7
##    Compound_ID InChi                         IsoSMI MW    Heavy…¹ Rotat…² Charge
##    <chr>       <chr>                         <chr>  <chr> <chr>   <chr>   <chr> 
##  1 2734161     InChI=1S/C8H15N2.ClH/c1-3-4-… CCCCN… 174.… 11      3       0     
##  2 61347       InChI=1S/C7H12N2/c1-2-3-5-9-… CCCCN… 124.… 9       3       0     
##  3 529334      InChI=1S/C8H14N2/c1-2-3-4-6-… CCCCC… 138.… 10      4       0     
##  4 304622      InChI=1S/C8H14N2/c1-3-4-6-10… CCCCN… 138.… 10      3       0     
##  5 118785      InChI=1S/C6H10N2/c1-2-4-8-5-… CCCN1… 110.… 8       2       0     
##  6 12971008    InChI=1S/C7H13N2.HI/c1-3-4-9… CCCN1… 252.… 10      2       0     
##  7 11448496    InChI=1S/C8H15N2.HI/c1-3-4-5… CCCCN… 266.… 11      3       0     
##  8 11424151    InChI=1S/C8H15N2.CHNS/c1-3-4… CCCCN… 197.… 13      3       0     
##  9 11171745    InChI=1S/C8H15N2.C2N3/c1-3-4… CCCCN… 205.… 15      3       0     
## 10 11160028    InChI=1S/C7H13N2.BrH/c1-3-4-… CCCN1… 205.… 10      2       0     
## # … with 15 more rows, and abbreviated variable names ¹​Heavy_Atom_Count,
## #   ²​Rotatable_Bond_Count

We will now export the generated dataframe as a tab-separated text file. The file will be saved in the `Data` folder inside the present working directory.

# Export the results as a tab-separated text file under "Data/".
# write.table() does not create missing directories, so make sure the target
# folder exists first (no warning is raised if it is already there).
dir.create("Data", showWarnings = FALSE)

# row.names = TRUE together with col.names = NA writes an empty header cell
# above the row-name column, keeping the header aligned with the data columns
write.table(
  similarity_results_tibble,
  file = "Data/R_Similarityq_results.txt",
  sep = "\t",
  row.names = TRUE,
  col.names = NA
)