{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Load Data" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The autoreload extension is already loaded. To reload it, use:\n", " %reload_ext autoreload\n" ] } ], "source": [ "# Load the autoreload extension and switch it to mode 2 so that edits to imported\n", "# development modules (e.g. Utils) are picked up before each cell runs.\n", "# Loading the extension on its own does not turn automatic reloading on.\n", "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Third-party: numpy (np.arange is used below to number the enantiomer pairs) and pandas.\n", "# Local: Utils supplies the loaders (load_data, load_other_smiles) and the per-pair\n", "# helpers (log_abs, harmonic) that this notebook calls through the model_helpers alias.\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import Utils as model_helpers" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 456 molecules\n" ] } ], "source": [ "# Load the data through the Utils helpers, selecting the coleman option in both calls\n", "load_smiles_strings = model_helpers.load_other_smiles(coleman=True)\n", "enantiomer_data = model_helpers.load_data(\"coleman\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Molecule NamePubchem ID #NoteSMILES StringOther SMILESMethodContributorDetection ThresholdDetection UnitsNormalized Detection ThresholdMolecule OdourResourcesN
0(R)-(-)-gamma-ionone11389922NaNCC(=O)/C=C/[C@H]1C(=C)CCCC1(C)CNaNNaNNaN1.10E+01ppb water11.00Weak green, fruity, pineapple-like odor with m...Rows 66-100 are from here: https://www.jstage....0
1(S)-(+)-gamma-ionone11194862NaNCC(=O)/C=C/[C@@H]1C(=C)CCCC1(C)CNaNNaNNaN7.00E-02ppb water0.07Linear, very pleasant, floral, green, woody od...Rows 101-121 are from here: https://github.com...0
2(4R)-(-)-carvone439570NaNCC1=CC[C@H](CC1=O)C(=C)CNaNNaNNaN2.00E+00ppb2.00sweet spearmint, fresh herbalRows 122 - 193 are from here: https://github.c...1
3(4S)-(+)-carvone16724NaNCC1=CC[C@@H](CC1=O)C(=C)CNaNNaNNaN1.30E+02ppb130.00caraway, fresh herbalRows 194-223 are from here: https://github.com...1
4(4R,7R)-(+)-galaxolide14177988NaNC[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)CNaNNaNNaN4.40E-01ppb in air0.44weak to almost odorlessRows 224-267 are from here: https://github.com...2
\n", "
" ], "text/plain": [ " Molecule Name Pubchem ID # Note \\\n", "0 (R)-(-)-gamma-ionone 11389922 NaN \n", "1 (S)-(+)-gamma-ionone 11194862 NaN \n", "2 (4R)-(-)-carvone 439570 NaN \n", "3 (4S)-(+)-carvone 16724 NaN \n", "4 (4R,7R)-(+)-galaxolide 14177988 NaN \n", "\n", " SMILES String Other SMILES Method \\\n", "0 CC(=O)/C=C/[C@H]1C(=C)CCCC1(C)C NaN NaN \n", "1 CC(=O)/C=C/[C@@H]1C(=C)CCCC1(C)C NaN NaN \n", "2 CC1=CC[C@H](CC1=O)C(=C)C NaN NaN \n", "3 CC1=CC[C@@H](CC1=O)C(=C)C NaN NaN \n", "4 C[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)C NaN NaN \n", "\n", " Contributor Detection Threshold Detection Units \\\n", "0 NaN 1.10E+01 ppb water \n", "1 NaN 7.00E-02 ppb water \n", "2 NaN 2.00E+00 ppb \n", "3 NaN 1.30E+02 ppb \n", "4 NaN 4.40E-01 ppb in air \n", "\n", " Normalized Detection Threshold \\\n", "0 11.00 \n", "1 0.07 \n", "2 2.00 \n", "3 130.00 \n", "4 0.44 \n", "\n", " Molecule Odour \\\n", "0 Weak green, fruity, pineapple-like odor with m... \n", "1 Linear, very pleasant, floral, green, woody od... \n", "2 sweet spearmint, fresh herbal \n", "3 caraway, fresh herbal \n", "4 weak to almost odorless \n", "\n", " Resources N \n", "0 Rows 66-100 are from here: https://www.jstage.... 0 \n", "1 Rows 101-121 are from here: https://github.com... 0 \n", "2 Rows 122 - 193 are from here: https://github.c... 1 \n", "3 Rows 194-223 are from here: https://github.com... 1 \n", "4 Rows 224-267 are from here: https://github.com... 
2 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Tag each pair of enantiomers (two consecutive rows) with a shared integer in column \"N\":\n", "# rows 0-1 get 0, rows 2-3 get 1, and so on.\n", "# Cast the Normalized Detection Thresholds to float so later numeric steps see a single dtype.\n", "enantiomer_data['N'] = np.arange(len(enantiomer_data)) // 2\n", "enantiomer_data['Normalized Detection Threshold'] = enantiomer_data['Normalized Detection Threshold'].astype(float)\n", "enantiomer_data.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# Merge the \"Other SMILES\" column into the official \"SMILES String\" column:\n", "# an \"Other SMILES\" entry wins wherever one is present, otherwise the existing string is kept\n", "enantiomer_data[\"SMILES String\"] = enantiomer_data[\"Other SMILES\"].combine_first(enantiomer_data[\"SMILES String\"])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Molecule NamePubchem ID #NoteSMILES StringOther SMILESMethodContributorDetection ThresholdDetection UnitsNormalized Detection ThresholdMolecule OdourResourcesN
72(3R)-(-)-linalool443158NaNCC(=CCC[C@](C)(C=C)O)CNaNNaNNaN8.00E-04ppb0.0008floral, woody lavenderNaN36
73(3S)-(+)-linalool67179NaNCC(=CCC[C@@](C)(C=C)O)CCC(=CCC[C@@](C)(C=C)O)CPubChem Isomeric SMILESDW7.40E-03ppb0.0074sweet, floral; odor reminiscent of petitgrain ...NaN36
\n", "
" ], "text/plain": [ " Molecule Name Pubchem ID # Note SMILES String \\\n", "72 (3R)-(-)-linalool 443158 NaN CC(=CCC[C@](C)(C=C)O)C \n", "73 (3S)-(+)-linalool 67179 NaN CC(=CCC[C@@](C)(C=C)O)C \n", "\n", " Other SMILES Method Contributor \\\n", "72 NaN NaN NaN \n", "73 CC(=CCC[C@@](C)(C=C)O)C PubChem Isomeric SMILES DW \n", "\n", " Detection Threshold Detection Units Normalized Detection Threshold \\\n", "72 8.00E-04 ppb 0.0008 \n", "73 7.40E-03 ppb 0.0074 \n", "\n", " Molecule Odour Resources N \n", "72 floral, woody lavender NaN 36 \n", "73 sweet, floral; odor reminiscent of petitgrain ... NaN 36 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look up one molecule by name fragment to check that the SMILES strings of its two enantiomers differ\n", "enantiomer_data.loc[enantiomer_data['Molecule Name'].str.contains('lina')]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Write the current table to a CSV file in the working directory\n", "enantiomer_data.to_csv(\"enantiomer_data.csv\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# For every enantiomeric pair (rows sharing the same N), apply the Utils log_abs and harmonic helpers\n", "half_log_abs, half_det = (\n", "    enantiomer_data.groupby('N').apply(pair_metric)\n", "    for pair_metric in (model_helpers.log_abs, model_helpers.harmonic)\n", ")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Keep one odorant per enantiomeric pair (every other row of the original table) and attach\n", "# the pair-level log_abs and det values computed above, matched by position\n", "half_enantiomer_data = enantiomer_data.iloc[::2].assign(\n", "    log_abs=half_log_abs.values,\n", "    det=half_det.values,\n", ")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# Sanity check: the new 'log_abs' column holds exactly as many null values as half_log_abs,\n", "# i.e. the positional copy neither added nor lost any existing nulls\n", "assert half_enantiomer_data['log_abs'].isna().sum() == half_log_abs.isna().sum()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Molecule NamePubchem ID #NoteSMILES StringOther SMILESMethodContributorDetection ThresholdDetection UnitsNormalized Detection ThresholdMolecule OdourResourcesNlog_absdet
0(R)-(-)-gamma-ionone11389922NaNCC(=O)/C=C/[C@H]1C(=C)CCCC1(C)CNaNNaNNaN1.10E+01ppb water11.00Weak green, fruity, pineapple-like odor with m...Rows 66-100 are from here: https://www.jstage....02.196295-0.856627
2(4R)-(-)-carvone439570NaNCC1=CC[C@H](CC1=O)C(=C)CNaNNaNNaN2.00E+00ppb2.00sweet spearmint, fresh herbalRows 122 - 193 are from here: https://github.c...11.8129130.595429
4(4R,7R)-(+)-galaxolide14177988NaNC[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)CNaNNaNNaN4.40E-01ppb in air0.44weak to almost odorlessRows 224-267 are from here: https://github.com...22.643453-2.699956
6(4R,4aS,6R)-(+) nootkatone1268142NaNC[C@@H]1CC(=O)C=C2[C@]1(C[C@@H](CC2)C(=C)C)CNaNNaNNaN1.50E+01ppm15000.00grapefruit odorRows 370-407 are from here: https://github.com...33.6434534.477023
8(2S,4R)-(+) cis-2-methyl-4-propyl-1,3-oxathiane6931728NaNCCC[C@@H]1CCO[C@@H](S1)CNaNNaNNaN2.00E+00ppb2.00ctypical sulfurous, with a rubbery onion note;...Rows 424-435 are from here: https://github.com...40.3010300.425969
\n", "
" ], "text/plain": [ " Molecule Name Pubchem ID # Note \\\n", "0 (R)-(-)-gamma-ionone 11389922 NaN \n", "2 (4R)-(-)-carvone 439570 NaN \n", "4 (4R,7R)-(+)-galaxolide 14177988 NaN \n", "6 (4R,4aS,6R)-(+) nootkatone 1268142 NaN \n", "8 (2S,4R)-(+) cis-2-methyl-4-propyl-1,3-oxathiane 6931728 NaN \n", "\n", " SMILES String Other SMILES Method \\\n", "0 CC(=O)/C=C/[C@H]1C(=C)CCCC1(C)C NaN NaN \n", "2 CC1=CC[C@H](CC1=O)C(=C)C NaN NaN \n", "4 C[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)C NaN NaN \n", "6 C[C@@H]1CC(=O)C=C2[C@]1(C[C@@H](CC2)C(=C)C)C NaN NaN \n", "8 CCC[C@@H]1CCO[C@@H](S1)C NaN NaN \n", "\n", " Contributor Detection Threshold Detection Units \\\n", "0 NaN 1.10E+01 ppb water \n", "2 NaN 2.00E+00 ppb \n", "4 NaN 4.40E-01 ppb in air \n", "6 NaN 1.50E+01 ppm \n", "8 NaN 2.00E+00 ppb \n", "\n", " Normalized Detection Threshold \\\n", "0 11.00 \n", "2 2.00 \n", "4 0.44 \n", "6 15000.00 \n", "8 2.00 \n", "\n", " Molecule Odour \\\n", "0 Weak green, fruity, pineapple-like odor with m... \n", "2 sweet spearmint, fresh herbal \n", "4 weak to almost odorless \n", "6 grapefruit odor \n", "8 ctypical sulfurous, with a rubbery onion note;... \n", "\n", " Resources N log_abs det \n", "0 Rows 66-100 are from here: https://www.jstage.... 0 2.196295 -0.856627 \n", "2 Rows 122 - 193 are from here: https://github.c... 1 1.812913 0.595429 \n", "4 Rows 224-267 are from here: https://github.com... 2 2.643453 -2.699956 \n", "6 Rows 370-407 are from here: https://github.com... 3 3.643453 4.477023 \n", "8 Rows 424-435 are from here: https://github.com... 
4 0.301030 0.425969 " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Preview the first rows to confirm that the log_abs and det columns were added\n", "half_enantiomer_data.head()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# Remove unusable SMILES strings: duplicates (so their perceptual features are not counted twice)\n", "# and entries that are missing or contain the placeholder text \"NaN\" / \"nan\"\n", "half_enantiomer_data = half_enantiomer_data.drop_duplicates(subset=['SMILES String'])\n", "half_enantiomer_data = half_enantiomer_data[~half_enantiomer_data['SMILES String'].str.contains('NaN|nan', na=True)]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# Assert that only unique SMILES strings remain\n", "assert half_enantiomer_data['SMILES String'].is_unique, \"Number of SMILES strings should equal number of unique SMILES strings at this stage\"" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# Assert that no missing or NaN-like SMILES strings remain, using the same test as the filter above\n", "assert not half_enantiomer_data['SMILES String'].str.contains('NaN|nan', na=True).any(), \"There should be no NaN SMILES strings at this point\"" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# Drop the rows whose log_abs value is null\n", "half_enantiomer_data = half_enantiomer_data[half_enantiomer_data['log_abs'].notnull()]" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# Assert that no null log_abs or det values remain\n", "assert not half_enantiomer_data['log_abs'].isnull().any()\n", "assert not half_enantiomer_data['det'].isnull().any()" ] } ],
"metadata": { "kernelspec": { "display_name": "Python 3.7.3 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)]" }, "vscode": { "interpreter": { "hash": "f48e850f9ba95f784ffa8abe9e192013ea9f7c58fba7063fff0218a6f7b5b546" } } }, "nbformat": 4, "nbformat_minor": 4 }