{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## 3rd Party Embeddings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Using 3rd party embeddings as features to the model" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "\n", "# Third-party libraries\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "# Project-local helpers\n", "import Utils as model_helpers" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Computing Features" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Measurement table; later cells rely on its \"SMILES String\", \"Molecule Name\" and \"log_abs\" columns\n", "half_enantiomer_data = pd.read_csv(\"half_enantiomer_data.csv\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Load the embeddings produced by the 3rd party model; they are used as model features.\n", "# NOTE(review): allow_pickle=True unpickles arbitrary Python objects - only load this archive from a trusted source.\n", "# The context manager closes the .npz archive as soon as the payload has been read.\n", "with np.load('../data/thirdparty/enantiomer-embeddings-for-rick.npz', allow_pickle=True) as embedding_archive:\n", "    gme = embedding_archive['embeddings'].item()  # unwrap the single stored object (used as a mapping below)\n", "gme = {k: v.squeeze() for k, v in gme.items()}  # drop length-1 axes from every embedding array\n", "gme_df = pd.DataFrame(gme).T  # transpose: one row per key, one column per embedding dimension" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...246247248249250251252253254255
CC1(C)[C@H]2CC[C@H](CO)[C@@H]1C23.3615830.3828130.0000000.0000002.1530111.4101332.3767040.00.7657960.000000...0.0000000.3311220.2514790.6975810.5695530.7851930.0000003.1348343.3442080.000000
CCCC[C@H](C(C)C)[C@H](O)CC1.5915730.0000000.4289401.4125350.3042374.0569732.0678440.00.0000000.145670...0.0000000.0000000.0000000.6645561.2747690.0000000.2003472.1169290.0000000.557281
CCCCCCCC[C@@H](C)O0.2229260.0000000.2231720.0000000.0000005.7102482.7402400.01.2300710.000000...0.8019220.0000000.0000000.0000000.0000000.0000000.0000001.2915540.5258112.257053
CC1(C)[C@H]2CC[C@](C)(C2)[C@H]1O2.2821840.9120730.0000000.0000002.8901870.1764122.2568180.00.5696610.000000...0.1019280.0000000.3403071.9824420.9417300.1645280.0000003.0657822.0765390.000000
C=C1CC[C@@H](C)C(C)(C)[C@@H]1/C=C/C(C)=O3.5618100.0000000.2718603.1116730.0000000.7421942.0713930.00.0000000.158586...0.7715860.0000000.0000000.3821870.0000001.0918241.3218330.0000000.2895150.716665
\n", "

5 rows × 256 columns

\n", "
" ], "text/plain": [ " 0 1 2 \\\n", "CC1(C)[C@H]2CC[C@H](CO)[C@@H]1C2 3.361583 0.382813 0.000000 \n", "CCCC[C@H](C(C)C)[C@H](O)CC 1.591573 0.000000 0.428940 \n", "CCCCCCCC[C@@H](C)O 0.222926 0.000000 0.223172 \n", "CC1(C)[C@H]2CC[C@](C)(C2)[C@H]1O 2.282184 0.912073 0.000000 \n", "C=C1CC[C@@H](C)C(C)(C)[C@@H]1/C=C/C(C)=O 3.561810 0.000000 0.271860 \n", "\n", " 3 4 5 \\\n", "CC1(C)[C@H]2CC[C@H](CO)[C@@H]1C2 0.000000 2.153011 1.410133 \n", "CCCC[C@H](C(C)C)[C@H](O)CC 1.412535 0.304237 4.056973 \n", "CCCCCCCC[C@@H](C)O 0.000000 0.000000 5.710248 \n", "CC1(C)[C@H]2CC[C@](C)(C2)[C@H]1O 0.000000 2.890187 0.176412 \n", "C=C1CC[C@@H](C)C(C)(C)[C@@H]1/C=C/C(C)=O 3.111673 0.000000 0.742194 \n", "\n", " 6 7 8 9 \\\n", "CC1(C)[C@H]2CC[C@H](CO)[C@@H]1C2 2.376704 0.0 0.765796 0.000000 \n", "CCCC[C@H](C(C)C)[C@H](O)CC 2.067844 0.0 0.000000 0.145670 \n", "CCCCCCCC[C@@H](C)O 2.740240 0.0 1.230071 0.000000 \n", "CC1(C)[C@H]2CC[C@](C)(C2)[C@H]1O 2.256818 0.0 0.569661 0.000000 \n", "C=C1CC[C@@H](C)C(C)(C)[C@@H]1/C=C/C(C)=O 2.071393 0.0 0.000000 0.158586 \n", "\n", " ... 246 247 248 \\\n", "CC1(C)[C@H]2CC[C@H](CO)[C@@H]1C2 ... 0.000000 0.331122 0.251479 \n", "CCCC[C@H](C(C)C)[C@H](O)CC ... 0.000000 0.000000 0.000000 \n", "CCCCCCCC[C@@H](C)O ... 0.801922 0.000000 0.000000 \n", "CC1(C)[C@H]2CC[C@](C)(C2)[C@H]1O ... 0.101928 0.000000 0.340307 \n", "C=C1CC[C@@H](C)C(C)(C)[C@@H]1/C=C/C(C)=O ... 
0.771586 0.000000 0.000000 \n", "\n", " 249 250 251 \\\n", "CC1(C)[C@H]2CC[C@H](CO)[C@@H]1C2 0.697581 0.569553 0.785193 \n", "CCCC[C@H](C(C)C)[C@H](O)CC 0.664556 1.274769 0.000000 \n", "CCCCCCCC[C@@H](C)O 0.000000 0.000000 0.000000 \n", "CC1(C)[C@H]2CC[C@](C)(C2)[C@H]1O 1.982442 0.941730 0.164528 \n", "C=C1CC[C@@H](C)C(C)(C)[C@@H]1/C=C/C(C)=O 0.382187 0.000000 1.091824 \n", "\n", " 252 253 254 \\\n", "CC1(C)[C@H]2CC[C@H](CO)[C@@H]1C2 0.000000 3.134834 3.344208 \n", "CCCC[C@H](C(C)C)[C@H](O)CC 0.200347 2.116929 0.000000 \n", "CCCCCCCC[C@@H](C)O 0.000000 1.291554 0.525811 \n", "CC1(C)[C@H]2CC[C@](C)(C2)[C@H]1O 0.000000 3.065782 2.076539 \n", "C=C1CC[C@@H](C)C(C)(C)[C@@H]1/C=C/C(C)=O 1.321833 0.000000 0.289515 \n", "\n", " 255 \n", "CC1(C)[C@H]2CC[C@H](CO)[C@@H]1C2 0.000000 \n", "CCCC[C@H](C(C)C)[C@H](O)CC 0.557281 \n", "CCCCCCCC[C@@H](C)O 2.257053 \n", "CC1(C)[C@H]2CC[C@](C)(C2)[C@H]1O 0.000000 \n", "C=C1CC[C@@H](C)C(C)(C)[C@@H]1/C=C/C(C)=O 0.716665 \n", "\n", "[5 rows x 256 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gme_df.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Re-index the measurements by SMILES string so their rows can be aligned with gme_df.\n", "# set_index returns a new frame, so half_enantiomer_data itself is left untouched.\n", "half_enantiomer_data_copy = half_enantiomer_data.set_index(\"SMILES String\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Keep only the rows (SMILES strings) that are present in both the measurements and the embeddings\n", "common_index = half_enantiomer_data_copy.index.intersection(gme_df.index)\n", "half_enantiomer_data_copy = half_enantiomer_data_copy.loc[common_index]\n", "gme_df = gme_df.loc[common_index]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Combine the measurements with the embeddings, row-aligned on the SMILES index\n", "g_model_embeddings_by_smiles = half_enantiomer_data_copy.join(gme_df, how=\"inner\")" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Re-index the combined table by \"Molecule Name\".\n", "# Built from the SMILES-indexed join result so this cell can be re-run without a KeyError.\n", "g_model_embeddings = g_model_embeddings_by_smiles.set_index(\"Molecule Name\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0Unnamed: 0.1Pubchem ID #NoteOther SMILESMethodContributorDetection ThresholdDetection UnitsNormalized Detection Threshold...246247248249250251252253254255
Molecule Name
(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone20207160826NaNNaNNaNNaN6.10E+01ppb61.00...0.0000000.0000000.2452070.0000000.0000000.0000002.0030860.0000002.4175000.000000
(S)-(+)-2-methylbutanal42426971249NaNNaNNaNNaN1.00E+01ppm in air10000.00...0.0000000.8660841.1896041.2033952.9652210.0000000.0000002.6328490.0000000.000000
( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene6464440968Changed from PubChem ID 6654CC1=CC[C@H]2C[C@@H]1C2(C)CDifferent PubChem IDDW1.00E-01ppb0.10...1.3486830.0000000.6854280.2902080.0000000.0000000.0000000.9557233.7632130.000000
(1R,3S,4S)-(+)-neomenthol9696439263NaNCC(C)[C@@H]1CC[C@@H](C)C[C@@H]1OChemDraw + CactusDW8.10E-01ppb0.81...0.0000000.0000000.2041141.3038332.6143920.2499960.0000003.9426421.8463980.000000
(3S)-(-)-dihydrocitronellol10610692283029NaNCC(C)CCC[C@@H](C)CCOChemDraw + CactusDW2.50E+02ppb250.00...1.6379070.0000000.0000000.0000000.0000000.0000000.0000001.0839491.5560272.074914
\n", "

5 rows × 271 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 Unnamed: 0.1 \\\n", "Molecule Name \n", "(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone 20 20 \n", "(S)-(+)-2-methylbutanal 42 42 \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene 64 64 \n", "(1R,3S,4S)-(+)-neomenthol 96 96 \n", "(3S)-(-)-dihydrocitronellol 106 106 \n", "\n", " Pubchem ID # \\\n", "Molecule Name \n", "(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone 7160826 \n", "(S)-(+)-2-methylbutanal 6971249 \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene 440968 \n", "(1R,3S,4S)-(+)-neomenthol 439263 \n", "(3S)-(-)-dihydrocitronellol 92283029 \n", "\n", " Note \\\n", "Molecule Name \n", "(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone NaN \n", "(S)-(+)-2-methylbutanal NaN \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene Changed from PubChem ID 6654 \n", "(1R,3S,4S)-(+)-neomenthol NaN \n", "(3S)-(-)-dihydrocitronellol NaN \n", "\n", " Other SMILES \\\n", "Molecule Name \n", "(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone NaN \n", "(S)-(+)-2-methylbutanal NaN \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene CC1=CC[C@H]2C[C@@H]1C2(C)C \n", "(1R,3S,4S)-(+)-neomenthol CC(C)[C@@H]1CC[C@@H](C)C[C@@H]1O \n", "(3S)-(-)-dihydrocitronellol CC(C)CCC[C@@H](C)CCO \n", "\n", " Method \\\n", "Molecule Name \n", "(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone NaN \n", "(S)-(+)-2-methylbutanal NaN \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene Different PubChem ID \n", "(1R,3S,4S)-(+)-neomenthol ChemDraw + Cactus \n", "(3S)-(-)-dihydrocitronellol ChemDraw + Cactus \n", "\n", " Contributor \\\n", "Molecule Name \n", "(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone NaN \n", "(S)-(+)-2-methylbutanal NaN \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene DW \n", "(1R,3S,4S)-(+)-neomenthol DW \n", "(3S)-(-)-dihydrocitronellol DW \n", "\n", " Detection Threshold \\\n", "Molecule Name \n", "(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone 6.10E+01 \n", "(S)-(+)-2-methylbutanal 1.00E+01 \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene 1.00E-01 \n", "(1R,3S,4S)-(+)-neomenthol 8.10E-01 
\n", "(3S)-(-)-dihydrocitronellol 2.50E+02 \n", "\n", " Detection Units \\\n", "Molecule Name \n", "(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone ppb \n", "(S)-(+)-2-methylbutanal ppm in air \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene ppb \n", "(1R,3S,4S)-(+)-neomenthol ppb \n", "(3S)-(-)-dihydrocitronellol ppb \n", "\n", " Normalized Detection Threshold \\\n", "Molecule Name \n", "(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone 61.00 \n", "(S)-(+)-2-methylbutanal 10000.00 \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene 0.10 \n", "(1R,3S,4S)-(+)-neomenthol 0.81 \n", "(3S)-(-)-dihydrocitronellol 250.00 \n", "\n", " ... 246 247 \\\n", "Molecule Name ... \n", "(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone ... 0.000000 0.000000 \n", "(S)-(+)-2-methylbutanal ... 0.000000 0.866084 \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene ... 1.348683 0.000000 \n", "(1R,3S,4S)-(+)-neomenthol ... 0.000000 0.000000 \n", "(3S)-(-)-dihydrocitronellol ... 1.637907 0.000000 \n", "\n", " 248 249 \\\n", "Molecule Name \n", "(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone 0.245207 0.000000 \n", "(S)-(+)-2-methylbutanal 1.189604 1.203395 \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene 0.685428 0.290208 \n", "(1R,3S,4S)-(+)-neomenthol 0.204114 1.303833 \n", "(3S)-(-)-dihydrocitronellol 0.000000 0.000000 \n", "\n", " 250 251 \\\n", "Molecule Name \n", "(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone 0.000000 0.000000 \n", "(S)-(+)-2-methylbutanal 2.965221 0.000000 \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene 0.000000 0.000000 \n", "(1R,3S,4S)-(+)-neomenthol 2.614392 0.249996 \n", "(3S)-(-)-dihydrocitronellol 0.000000 0.000000 \n", "\n", " 252 253 \\\n", "Molecule Name \n", "(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone 2.003086 0.000000 \n", "(S)-(+)-2-methylbutanal 0.000000 2.632849 \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene 0.000000 0.955723 \n", "(1R,3S,4S)-(+)-neomenthol 0.000000 3.942642 \n", "(3S)-(-)-dihydrocitronellol 0.000000 1.083949 \n", "\n", " 254 255 \n", "Molecule Name \n", "(R)-(-)- 
muscone/(R)--3-methyl cyclopentadecanone 2.417500 0.000000 \n", "(S)-(+)-2-methylbutanal 0.000000 0.000000 \n", " ( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene 3.763213 0.000000 \n", "(1R,3S,4S)-(+)-neomenthol 1.846398 0.000000 \n", "(3S)-(-)-dihydrocitronellol 1.556027 2.074914 \n", "\n", "[5 rows x 271 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "g_model_embeddings.head()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# Sanity check: no embedding column may be constant (zero variance) across molecules.\n", "# Embedding columns are picked by label (gme_df.columns) instead of by a hard-coded position.\n", "embedding_variance = g_model_embeddings[gme_df.columns].var()\n", "constant_columns = embedding_variance.index[embedding_variance <= 0].tolist()\n", "assert not constant_columns, f\"Embedding columns with zero variance must be dropped before modelling: {constant_columns}\"" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Model" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Illustrate the magnitude differences across enantiomeric pairs in the dataset\n", "model_helpers.fold_difference_of_enantiomers(half_enantiomer_data)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# Features: the embedding columns, selected by label so the split does not depend on\n", "# how many metadata columns happen to precede them in the joined table\n", "x_gme = g_model_embeddings[gme_df.columns]\n", "# Target column\n", "y_gme = g_model_embeddings[\"log_abs\"]\n", "# Standardise each feature column while keeping the original row and column labels\n", "Xn_gme = pd.DataFrame(StandardScaler().fit_transform(x_gme), index=x_gme.index, columns=x_gme.columns)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 13/13 [00:02<00:00, 4.60it/s]\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# NOTE(review): create_model is defined in Utils (not visible here); per the recorded output\n", "# it reports progress on stderr and renders a figure - confirm its exact behaviour there\n", "model_helpers.create_model(Xn_gme, y_gme)" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "f48e850f9ba95f784ffa8abe9e192013ea9f7c58fba7063fff0218a6f7b5b546" } } }, "nbformat": 4, "nbformat_minor": 2 }