{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## 3rd Party Embeddings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Using 3rd party embeddings as features to the model" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "\n", "import Utils as model_helpers\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.preprocessing import StandardScaler" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Computing Features" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "half_enantiomer_data = pd.read_csv(\"half_enantiomer_data.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load embeddings from a 3rd party model to use as features.\n", "# NOTE(security): allow_pickle=True unpickles the stored object, which can run arbitrary code;\n", "# only load this file from a trusted source.\n", "with np.load('../data/thirdparty/enantiomer-embeddings-for-rick.npz', allow_pickle=True) as npz_file:\n", "    # The context manager closes the archive once the data has been read into memory.\n", "    raw_embeddings = npz_file['embeddings'].item()  # Unwrap the single stored object (used as a dict below)\n", "gme = {k: v.squeeze() for k, v in raw_embeddings.items()}  # Drop size-1 dimensions from each array\n", "gme_df = pd.DataFrame(gme).T  # Transpose so each key becomes a row" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "5 | \n", "6 | \n", "7 | \n", "8 | \n", "9 | \n", "... | \n", "246 | \n", "247 | \n", "248 | \n", "249 | \n", "250 | \n", "251 | \n", "252 | \n", "253 | \n", "254 | \n", "255 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CC1(C)[C@H]2CC[C@H](CO)[C@@H]1C2 | \n", "3.361583 | \n", "0.382813 | \n", "0.000000 | \n", "0.000000 | \n", "2.153011 | \n", "1.410133 | \n", "2.376704 | \n", "0.0 | \n", "0.765796 | \n", "0.000000 | \n", "... | \n", "0.000000 | \n", "0.331122 | \n", "0.251479 | \n", "0.697581 | \n", "0.569553 | \n", "0.785193 | \n", "0.000000 | \n", "3.134834 | \n", "3.344208 | \n", "0.000000 | \n", "
CCCC[C@H](C(C)C)[C@H](O)CC | \n", "1.591573 | \n", "0.000000 | \n", "0.428940 | \n", "1.412535 | \n", "0.304237 | \n", "4.056973 | \n", "2.067844 | \n", "0.0 | \n", "0.000000 | \n", "0.145670 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.664556 | \n", "1.274769 | \n", "0.000000 | \n", "0.200347 | \n", "2.116929 | \n", "0.000000 | \n", "0.557281 | \n", "
CCCCCCCC[C@@H](C)O | \n", "0.222926 | \n", "0.000000 | \n", "0.223172 | \n", "0.000000 | \n", "0.000000 | \n", "5.710248 | \n", "2.740240 | \n", "0.0 | \n", "1.230071 | \n", "0.000000 | \n", "... | \n", "0.801922 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.291554 | \n", "0.525811 | \n", "2.257053 | \n", "
CC1(C)[C@H]2CC[C@](C)(C2)[C@H]1O | \n", "2.282184 | \n", "0.912073 | \n", "0.000000 | \n", "0.000000 | \n", "2.890187 | \n", "0.176412 | \n", "2.256818 | \n", "0.0 | \n", "0.569661 | \n", "0.000000 | \n", "... | \n", "0.101928 | \n", "0.000000 | \n", "0.340307 | \n", "1.982442 | \n", "0.941730 | \n", "0.164528 | \n", "0.000000 | \n", "3.065782 | \n", "2.076539 | \n", "0.000000 | \n", "
C=C1CC[C@@H](C)C(C)(C)[C@@H]1/C=C/C(C)=O | \n", "3.561810 | \n", "0.000000 | \n", "0.271860 | \n", "3.111673 | \n", "0.000000 | \n", "0.742194 | \n", "2.071393 | \n", "0.0 | \n", "0.000000 | \n", "0.158586 | \n", "... | \n", "0.771586 | \n", "0.000000 | \n", "0.000000 | \n", "0.382187 | \n", "0.000000 | \n", "1.091824 | \n", "1.321833 | \n", "0.000000 | \n", "0.289515 | \n", "0.716665 | \n", "
5 rows × 256 columns
\n", "\n", " | Unnamed: 0 | \n", "Unnamed: 0.1 | \n", "Pubchem ID # | \n", "Note | \n", "Other SMILES | \n", "Method | \n", "Contributor | \n", "Detection Threshold | \n", "Detection Units | \n", "Normalized Detection Threshold | \n", "... | \n", "246 | \n", "247 | \n", "248 | \n", "249 | \n", "250 | \n", "251 | \n", "252 | \n", "253 | \n", "254 | \n", "255 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Molecule Name | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
(R)-(-)- muscone/(R)--3-methyl cyclopentadecanone | \n", "20 | \n", "20 | \n", "7160826 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "6.10E+01 | \n", "ppb | \n", "61.00 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.245207 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "2.003086 | \n", "0.000000 | \n", "2.417500 | \n", "0.000000 | \n", "
(S)-(+)-2-methylbutanal | \n", "42 | \n", "42 | \n", "6971249 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "1.00E+01 | \n", "ppm in air | \n", "10000.00 | \n", "... | \n", "0.000000 | \n", "0.866084 | \n", "1.189604 | \n", "1.203395 | \n", "2.965221 | \n", "0.000000 | \n", "0.000000 | \n", "2.632849 | \n", "0.000000 | \n", "0.000000 | \n", "
( 1S,5S)-(-)-\\\\xce\\\\xb1-Pinene | \n", "64 | \n", "64 | \n", "440968 | \n", "Changed from PubChem ID 6654 | \n", "CC1=CC[C@H]2C[C@@H]1C2(C)C | \n", "Different PubChem ID | \n", "DW | \n", "1.00E-01 | \n", "ppb | \n", "0.10 | \n", "... | \n", "1.348683 | \n", "0.000000 | \n", "0.685428 | \n", "0.290208 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.955723 | \n", "3.763213 | \n", "0.000000 | \n", "
(1R,3S,4S)-(+)-neomenthol | \n", "96 | \n", "96 | \n", "439263 | \n", "NaN | \n", "CC(C)[C@@H]1CC[C@@H](C)C[C@@H]1O | \n", "ChemDraw + Cactus | \n", "DW | \n", "8.10E-01 | \n", "ppb | \n", "0.81 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.204114 | \n", "1.303833 | \n", "2.614392 | \n", "0.249996 | \n", "0.000000 | \n", "3.942642 | \n", "1.846398 | \n", "0.000000 | \n", "
(3S)-(-)-dihydrocitronellol | \n", "106 | \n", "106 | \n", "92283029 | \n", "NaN | \n", "CC(C)CCC[C@@H](C)CCO | \n", "ChemDraw + Cactus | \n", "DW | \n", "2.50E+02 | \n", "ppb | \n", "250.00 | \n", "... | \n", "1.637907 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.083949 | \n", "1.556027 | \n", "2.074914 | \n", "
5 rows × 271 columns
\n", "