{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## PCA and VCD features (candidate for deletion: the join below returned no matches)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Compute PCA and VCD features to feed into the model" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "half_enantiomer_data = pd.read_csv(\"half_enantiomer_data.csv\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Read in the PCA/VCD dataframe to use as features\n", "pca_vcd_values = pd.read_csv(\"../data/vcd/pca_vcd_values.csv\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Molecule Name01234567891011121314
0(R)-(-)-Carvone7.464317-43.591126-8.046428-0.574730-3.5802545.872903-2.6892950.9102353.2638881.241722-1.817992-10.064818-5.5171726.9568730.003187
1(1R,2S,5R)-(-)-Menthol-15.984788-48.40975511.599679-6.459996-2.895890-21.8171877.98736630.168379-4.855940-27.966750-15.59379118.8817946.680355-3.869645-18.299473
2(-)-Menthyl chloride-5.850617-48.61299725.383460-1.972609-9.623228-12.079316-5.0148429.790833-11.125811-10.94892412.772888-4.0687234.1969340.3262000.769741
3(-)-Borneol0.756580-25.884643-4.9870436.148153-18.1679097.294051-16.9543979.16687710.1600468.51254122.090822-60.1886035.2464299.022766-17.823573
4(-)-Isoplegol-12.599590-33.717238-12.966523-18.4790835.945713-6.208222-4.33739210.783442-12.6402041.7736431.9274006.9351225.8564858.257695-9.983220
...................................................
156β-Cholestanol6.415996-18.84098220.303654-29.238431-26.0580528.328641-8.52025119.75326324.65059437.600273-1.058662-16.20017537.116864-1.951506-10.766619
157Cholesterol Ethyl Carbonate230.771312139.215929-45.562654-28.4452637.019739-30.85706818.819408-6.5421387.706800-5.8963787.642116-5.1880795.985704-34.834249-7.737224
158Cholesterol n-Caprate181.67742533.15122372.048835-49.14726019.66489320.812246-3.9777175.35103814.182154-2.6077716.333818-0.251176-7.47013819.545668-8.373291
159Cholesterol Laurate-130.152846164.03522961.091725-37.136634-13.84715414.540341-4.944992-3.1050622.5088001.7000325.7006788.617406-4.49628422.1504492.389079
160Cholesterol Myristate-132.817778151.83579966.528593-50.4547387.11273121.5767476.387919-5.878909-6.0111547.6486694.8939297.8665581.41072611.2210580.188386
\n", "

161 rows × 16 columns

\n", "
" ], "text/plain": [ " Molecule Name 0 1 2 \\\n", "0 (R)-(-)-Carvone 7.464317 -43.591126 -8.046428 \n", "1 (1R,2S,5R)-(-)-Menthol -15.984788 -48.409755 11.599679 \n", "2 (-)-Menthyl chloride -5.850617 -48.612997 25.383460 \n", "3 (-)-Borneol 0.756580 -25.884643 -4.987043 \n", "4 (-)-Isoplegol -12.599590 -33.717238 -12.966523 \n", ".. ... ... ... ... \n", "156 β-Cholestanol 6.415996 -18.840982 20.303654 \n", "157 Cholesterol Ethyl Carbonate 230.771312 139.215929 -45.562654 \n", "158 Cholesterol n-Caprate 181.677425 33.151223 72.048835 \n", "159 Cholesterol Laurate -130.152846 164.035229 61.091725 \n", "160 Cholesterol Myristate -132.817778 151.835799 66.528593 \n", "\n", " 3 4 5 6 7 8 \\\n", "0 -0.574730 -3.580254 5.872903 -2.689295 0.910235 3.263888 \n", "1 -6.459996 -2.895890 -21.817187 7.987366 30.168379 -4.855940 \n", "2 -1.972609 -9.623228 -12.079316 -5.014842 9.790833 -11.125811 \n", "3 6.148153 -18.167909 7.294051 -16.954397 9.166877 10.160046 \n", "4 -18.479083 5.945713 -6.208222 -4.337392 10.783442 -12.640204 \n", ".. ... ... ... ... ... ... \n", "156 -29.238431 -26.058052 8.328641 -8.520251 19.753263 24.650594 \n", "157 -28.445263 7.019739 -30.857068 18.819408 -6.542138 7.706800 \n", "158 -49.147260 19.664893 20.812246 -3.977717 5.351038 14.182154 \n", "159 -37.136634 -13.847154 14.540341 -4.944992 -3.105062 2.508800 \n", "160 -50.454738 7.112731 21.576747 6.387919 -5.878909 -6.011154 \n", "\n", " 9 10 11 12 13 14 \n", "0 1.241722 -1.817992 -10.064818 -5.517172 6.956873 0.003187 \n", "1 -27.966750 -15.593791 18.881794 6.680355 -3.869645 -18.299473 \n", "2 -10.948924 12.772888 -4.068723 4.196934 0.326200 0.769741 \n", "3 8.512541 22.090822 -60.188603 5.246429 9.022766 -17.823573 \n", "4 1.773643 1.927400 6.935122 5.856485 8.257695 -9.983220 \n", ".. ... ... ... ... ... ... 
\n", "156 37.600273 -1.058662 -16.200175 37.116864 -1.951506 -10.766619 \n", "157 -5.896378 7.642116 -5.188079 5.985704 -34.834249 -7.737224 \n", "158 -2.607771 6.333818 -0.251176 -7.470138 19.545668 -8.373291 \n", "159 1.700032 5.700678 8.617406 -4.496284 22.150449 2.389079 \n", "160 7.648669 4.893929 7.866558 1.410726 11.221058 0.188386 \n", "\n", "[161 rows x 16 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pca_vcd_values" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Set index to Molecule Name\n", "pca_vcd_values = pca_vcd_values.set_index('Molecule Name')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567891011121314
Molecule Name
(R)-(-)-Carvone7.464317-43.591126-8.046428-0.574730-3.5802545.872903-2.6892950.9102353.2638881.241722-1.817992-10.064818-5.5171726.9568730.003187
(1R,2S,5R)-(-)-Menthol-15.984788-48.40975511.599679-6.459996-2.895890-21.8171877.98736630.168379-4.855940-27.966750-15.59379118.8817946.680355-3.869645-18.299473
(-)-Menthyl chloride-5.850617-48.61299725.383460-1.972609-9.623228-12.079316-5.0148429.790833-11.125811-10.94892412.772888-4.0687234.1969340.3262000.769741
(-)-Borneol0.756580-25.884643-4.9870436.148153-18.1679097.294051-16.9543979.16687710.1600468.51254122.090822-60.1886035.2464299.022766-17.823573
(-)-Isoplegol-12.599590-33.717238-12.966523-18.4790835.945713-6.208222-4.33739210.783442-12.6402041.7736431.9274006.9351225.8564858.257695-9.983220
\n", "
" ], "text/plain": [ " 0 1 2 3 4 \\\n", "Molecule Name \n", "(R)-(-)-Carvone 7.464317 -43.591126 -8.046428 -0.574730 -3.580254 \n", "(1R,2S,5R)-(-)-Menthol -15.984788 -48.409755 11.599679 -6.459996 -2.895890 \n", "(-)-Menthyl chloride -5.850617 -48.612997 25.383460 -1.972609 -9.623228 \n", "(-)-Borneol 0.756580 -25.884643 -4.987043 6.148153 -18.167909 \n", "(-)-Isoplegol -12.599590 -33.717238 -12.966523 -18.479083 5.945713 \n", "\n", " 5 6 7 8 9 \\\n", "Molecule Name \n", "(R)-(-)-Carvone 5.872903 -2.689295 0.910235 3.263888 1.241722 \n", "(1R,2S,5R)-(-)-Menthol -21.817187 7.987366 30.168379 -4.855940 -27.966750 \n", "(-)-Menthyl chloride -12.079316 -5.014842 9.790833 -11.125811 -10.948924 \n", "(-)-Borneol 7.294051 -16.954397 9.166877 10.160046 8.512541 \n", "(-)-Isoplegol -6.208222 -4.337392 10.783442 -12.640204 1.773643 \n", "\n", " 10 11 12 13 14 \n", "Molecule Name \n", "(R)-(-)-Carvone -1.817992 -10.064818 -5.517172 6.956873 0.003187 \n", "(1R,2S,5R)-(-)-Menthol -15.593791 18.881794 6.680355 -3.869645 -18.299473 \n", "(-)-Menthyl chloride 12.772888 -4.068723 4.196934 0.326200 0.769741 \n", "(-)-Borneol 22.090822 -60.188603 5.246429 9.022766 -17.823573 \n", "(-)-Isoplegol 1.927400 6.935122 5.856485 8.257695 -9.983220 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pca_vcd_values.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Copy the original dataframe and index it by molecule name.\n", "# set_index returns a new frame, so half_enantiomer_data itself is left untouched\n", "# (a plain assignment would only alias it), and the name index lines up with\n", "# pca_vcd_values, which is indexed by 'Molecule Name'.\n", "half_enantiomer_data_copy = half_enantiomer_data.set_index('Molecule Name')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Keep only the molecules present in both dataframes, matched on 'Molecule Name'.\n", "# Both frames must share the name index here: intersecting a default integer index\n", "# with the name index matches nothing and leaves the join empty.\n", "# NOTE: names must match exactly (spelling, whitespace) to be kept.\n", "common_index = half_enantiomer_data_copy.index.intersection(pca_vcd_values.index)\n", "half_enantiomer_data_copy = half_enantiomer_data_copy.loc[common_index]\n", "pca_vcd_features = pca_vcd_values.loc[common_index]" ] }, { "cell_type": "code", 
"execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# Combine original dataset with pca_vcd\n", "pca_vcd = half_enantiomer_data_copy.join(pca_vcd_features, how=\"inner\")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0Unnamed: 0.1Molecule NamePubchem ID #NoteSMILES StringOther SMILESMethodContributorDetection Threshold...567891011121314
\n", "

0 rows × 32 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Unnamed: 0, Unnamed: 0.1, Molecule Name, Pubchem ID #, Note, SMILES String, Other SMILES, Method, Contributor, Detection Threshold, Detection Units, Normalized Detection Threshold, Molecule Odour, Resources, N, log_abs, det, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]\n", "Index: []\n", "\n", "[0 rows x 32 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pca_vcd" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "f48e850f9ba95f784ffa8abe9e192013ea9f7c58fba7063fff0218a6f7b5b546" } } }, "nbformat": 4, "nbformat_minor": 2 }