3rd Party Embeddings

3rd Party Embeddings

# Using 3rd-party embeddings as features for the model

%matplotlib inline

import Utils as model_helpers
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

Computing Features

# Original dataset; it is later re-indexed by its "SMILES String" column.
half_enantiomer_data = pd.read_csv("half_enantiomer_data.csv")

# Load the embeddings from the 3rd party model to use as features.
# NOTE(review): allow_pickle=True is needed because the "embeddings" entry is a
# pickled Python object, but unpickling can execute arbitrary code -- only load
# this archive if it comes from a trusted source.
# The `with` block closes the underlying .npz file handle once the entry is read.
with np.load('../data/thirdparty/enantiomer-embeddings-for-rick.npz', allow_pickle=True) as archive:
    # .item() unwraps the single-element object array into the mapping it holds
    gme = archive['embeddings'].item()

# Drop the length-1 axes of every embedding so that each value is a flat vector
gme = {k: v.squeeze() for k, v in gme.items()}

# One row per key of the mapping, one column per embedding dimension
gme_df = pd.DataFrame(gme).T

gme_df.head()

	0	1	2	3	4	5	6	8	9	...	246	247	248	249	250	251	252	253	254	255
CC1(C)[C@H]2CC[C@H](CO)[C@@H]1C2	3.361583	0.382813	0.000000	0.000000	2.153011	1.410133	2.376704	0.765796	0.000000	...	0.000000	0.331122	0.251479	0.697581	0.569553	0.785193	0.000000	3.134834	3.344208	0.000000
CCCC[C@H](C(C)C)[C@H](O)CC	1.591573	0.000000	0.428940	1.412535	0.304237	4.056973	2.067844	0.000000	0.145670	...	0.000000	0.000000	0.000000	0.664556	1.274769	0.000000	0.200347	2.116929	0.000000	0.557281
CCCCCCCC[C@@H](C)O	0.222926	0.000000	0.223172	0.000000	0.000000	5.710248	2.740240	1.230071	0.000000	...	0.801922	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.291554	0.525811	2.257053
CC1(C)[C@H]2CC[C@](C)(C2)[C@H]1O	2.282184	0.912073	0.000000	0.000000	2.890187	0.176412	2.256818	0.569661	0.000000	...	0.101928	0.000000	0.340307	1.982442	0.941730	0.164528	0.000000	3.065782	2.076539	0.000000
C=C1CC[C@@H](C)C(C)(C)[C@@H]1/C=C/C(C)=O	3.561810	0.000000	0.271860	3.111673	0.000000	0.742194	2.071393	0.000000	0.158586	...	0.771586	0.000000	0.000000	0.382187	0.000000	1.091824	1.321833	0.000000	0.289515	0.716665

5 rows × 256 columns

# Index the original data by SMILES string so it shares keys with the embeddings.
# set_index returns a new frame, so half_enantiomer_data itself is left unchanged.
half_enantiomer_data_copy = half_enantiomer_data.set_index("SMILES String")

# Keep only the rows (molecules) that are present in both frames
common_index = half_enantiomer_data_copy.index.intersection(gme_df.index)
# Fail early with a clear message instead of an obscure error further down
assert len(common_index) > 0, "No SMILES strings are shared between the dataset and the embeddings"
half_enantiomer_data_copy = half_enantiomer_data_copy.loc[common_index]
gme_df = gme_df.loc[common_index]

# Append the embedding columns to the right of the original dataset columns
g_model_embeddings = half_enantiomer_data_copy.join(gme_df, how="inner")

# Replace the SMILES index with "Molecule Name"
g_model_embeddings = g_model_embeddings.set_index("Molecule Name")

g_model_embeddings.head()

	Unnamed: 0	Unnamed: 0.1	Pubchem ID #	Note	Other SMILES	Method	Contributor	Detection Threshold	Detection Units	Normalized Detection Threshold	...	246	247	248	249	250	251	252	253	254	255
Molecule Name
(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone	20	20	7160826	NaN	NaN	NaN	NaN	6.10E+01	ppb	61.00	...	0.000000	0.000000	0.245207	0.000000	0.000000	0.000000	2.003086	0.000000	2.417500	0.000000
(S)-(+)-2-methylbutanal	42	42	6971249	NaN	NaN	NaN	NaN	1.00E+01	ppm in air	10000.00	...	0.000000	0.866084	1.189604	1.203395	2.965221	0.000000	0.000000	2.632849	0.000000	0.000000
( 1S,5S)-(-)-\\xce\\xb1-Pinene	64	64	440968	Changed from PubChem ID 6654	CC1=CC[C@H]2C[C@@H]1C2(C)C	Different PubChem ID	DW	1.00E-01	ppb	0.10	...	1.348683	0.000000	0.685428	0.290208	0.000000	0.000000	0.000000	0.955723	3.763213	0.000000
(1R,3S,4S)-(+)-neomenthol	96	96	439263	NaN	CC(C)[C@@H]1CC[C@@H](C)C[C@@H]1O	ChemDraw + Cactus	DW	8.10E-01	ppb	0.81	...	0.000000	0.000000	0.204114	1.303833	2.614392	0.249996	0.000000	3.942642	1.846398	0.000000
(3S)-(-)-dihydrocitronellol	106	106	92283029	NaN	CC(C)CCC[C@@H](C)CCO	ChemDraw + Cactus	DW	2.50E+02	ppb	250.00	...	1.637907	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.083949	1.556027	2.074914

5 rows × 271 columns

# Sanity check: no embedding feature may be constant across the molecules.
# The embedding columns are selected by their labels (taken from gme_df) rather
# than by a hard-coded positional offset, so the check keeps targeting the right
# columns if the number of metadata columns in the CSV changes.
assert ((g_model_embeddings.loc[:, gme_df.columns].var() <= 0).sum() == 0), "This should be 0 if not, get rid of columns with 0 varience"

Model

# Illustrate the magnitude differences across enantiomeric pairs in the dataset.
# The helper is given the full dataset as read from the CSV, not the subset that
# was matched to the embeddings.
# NOTE(review): what exactly is plotted is defined in Utils.fold_difference_of_enantiomers
# (not visible here) -- presumably the fold difference within each pair; confirm there.
model_helpers.fold_difference_of_enantiomers(half_enantiomer_data)

_images/5fba6fa4176d09f19f218fda9b3318c07afc7c5f96e2e3d35954744ac180c86b.png

# Features: the embedding columns, selected by label (taken from gme_df) instead
# of a hard-coded positional offset, so a change in the number of metadata
# columns cannot silently leak metadata into the features or drop an embedding.
x_gme = g_model_embeddings.loc[:, gme_df.columns]

# Target: the "log_abs" column of the joined frame
y_gme = g_model_embeddings["log_abs"]

# Standardise every feature. fit_transform returns a bare array, so the row and
# column labels are re-attached to keep the frame aligned with y_gme.
Xn_gme = pd.DataFrame(StandardScaler().fit_transform(x_gme), index=x_gme.index, columns=x_gme.columns)

# Fit/evaluate via the project helper (implementation lives in Utils, not shown here)
model_helpers.create_model(Xn_gme, y_gme)

100%|██████████| 13/13 [00:02<00:00,  4.60it/s]

_images/324b7b8b7eb2cb3f6a97f38c768a100e9070258bf4a7edfdcda5718c642c32f1.png