3rd Party Embeddings

3rd Party Embeddings#

# Using 3rd party embeddings as features to the model
%matplotlib inline

import Utils as model_helpers
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

Computing Features

half_enantiomer_data = pd.read_csv("half_enantiomer_data.csv")
# Loads in embeddings from 3rd party model to use as features
gme = np.load('../data/thirdparty/enantiomer-embeddings-for-rick.npz', allow_pickle=True) # Load the file
gme = gme['embeddings'].item() # Extract the data
gme = {k: v.squeeze() for k, v in gme.items()} # Flatten the arrays
gme_df = pd.DataFrame(gme).T  # Turn into a dataframe
gme_df.head()
0 1 2 3 4 5 6 7 8 9 ... 246 247 248 249 250 251 252 253 254 255
CC1(C)[C@H]2CC[C@H](CO)[C@@H]1C2 3.361583 0.382813 0.000000 0.000000 2.153011 1.410133 2.376704 0.0 0.765796 0.000000 ... 0.000000 0.331122 0.251479 0.697581 0.569553 0.785193 0.000000 3.134834 3.344208 0.000000
CCCC[C@H](C(C)C)[C@H](O)CC 1.591573 0.000000 0.428940 1.412535 0.304237 4.056973 2.067844 0.0 0.000000 0.145670 ... 0.000000 0.000000 0.000000 0.664556 1.274769 0.000000 0.200347 2.116929 0.000000 0.557281
CCCCCCCC[C@@H](C)O 0.222926 0.000000 0.223172 0.000000 0.000000 5.710248 2.740240 0.0 1.230071 0.000000 ... 0.801922 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.291554 0.525811 2.257053
CC1(C)[C@H]2CC[C@](C)(C2)[C@H]1O 2.282184 0.912073 0.000000 0.000000 2.890187 0.176412 2.256818 0.0 0.569661 0.000000 ... 0.101928 0.000000 0.340307 1.982442 0.941730 0.164528 0.000000 3.065782 2.076539 0.000000
C=C1CC[C@@H](C)C(C)(C)[C@@H]1/C=C/C(C)=O 3.561810 0.000000 0.271860 3.111673 0.000000 0.742194 2.071393 0.0 0.000000 0.158586 ... 0.771586 0.000000 0.000000 0.382187 0.000000 1.091824 1.321833 0.000000 0.289515 0.716665

5 rows × 256 columns

# Make copy of original data and set the index to match that of the gme model
half_enantiomer_data_copy = half_enantiomer_data
half_enantiomer_data_copy = half_enantiomer_data_copy.set_index("SMILES String")
# Keep the columns in the gme df that match the index names in original dataset
common_index = half_enantiomer_data_copy.index.intersection(gme_df.index)
half_enantiomer_data_copy = half_enantiomer_data_copy.loc[common_index]
gme_df = gme_df.loc[common_index]
# Combine original dataset with gme df
g_model_embeddings = half_enantiomer_data_copy.join(gme_df, how="inner")
# Reset the index to be "Moecule Name"
g_model_embeddings = g_model_embeddings.set_index("Molecule Name")
g_model_embeddings.head()
Unnamed: 0 Unnamed: 0.1 Pubchem ID # Note Other SMILES Method Contributor Detection Threshold Detection Units Normalized Detection Threshold ... 246 247 248 249 250 251 252 253 254 255
Molecule Name
(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone 20 20 7160826 NaN NaN NaN NaN 6.10E+01 ppb 61.00 ... 0.000000 0.000000 0.245207 0.000000 0.000000 0.000000 2.003086 0.000000 2.417500 0.000000
(S)-(+)-2-methylbutanal 42 42 6971249 NaN NaN NaN NaN 1.00E+01 ppm in air 10000.00 ... 0.000000 0.866084 1.189604 1.203395 2.965221 0.000000 0.000000 2.632849 0.000000 0.000000
( 1S,5S)-(-)-\\xce\\xb1-Pinene 64 64 440968 Changed from PubChem ID 6654 CC1=CC[C@H]2C[C@@H]1C2(C)C Different PubChem ID DW 1.00E-01 ppb 0.10 ... 1.348683 0.000000 0.685428 0.290208 0.000000 0.000000 0.000000 0.955723 3.763213 0.000000
(1R,3S,4S)-(+)-neomenthol 96 96 439263 NaN CC(C)[C@@H]1CC[C@@H](C)C[C@@H]1O ChemDraw + Cactus DW 8.10E-01 ppb 0.81 ... 0.000000 0.000000 0.204114 1.303833 2.614392 0.249996 0.000000 3.942642 1.846398 0.000000
(3S)-(-)-dihydrocitronellol 106 106 92283029 NaN CC(C)CCC[C@@H](C)CCO ChemDraw + Cactus DW 2.50E+02 ppb 250.00 ... 1.637907 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.083949 1.556027 2.074914

5 rows × 271 columns

assert ((g_model_embeddings.iloc[:,15:].var() <= 0).sum() == 0), "This should be 0 if not, get rid of columns with 0 varience"

Model

# Illustrate the magnitude differences across enantiomeric pairs in the dataset
model_helpers.fold_difference_of_enantiomers(half_enantiomer_data)
_images/5fba6fa4176d09f19f218fda9b3318c07afc7c5f96e2e3d35954744ac180c86b.png
x_gme = g_model_embeddings.iloc[:,15:]
y_gme = g_model_embeddings["log_abs"]
Xn_gme = pd.DataFrame(StandardScaler().fit_transform(x_gme), index=x_gme.index, columns=x_gme.columns)
model_helpers.create_model(Xn_gme, y_gme)
100%|██████████| 13/13 [00:02<00:00,  4.60it/s]
_images/324b7b8b7eb2cb3f6a97f38c768a100e9070258bf4a7edfdcda5718c642c32f1.png