{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "# Enable autoreload so edits to imported development modules are picked up without a kernel restart.\n",
    "# Loading the extension alone does not turn reloading on; `%autoreload 2` does, for all modules.\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
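    "# Imports the third-party data-handling dependencies and the project's Utils helper module (aliased as model_helpers)\n",
    "# NOTE(review): this comment previously mentioned a math library and a matlab-library update; neither is imported in this cell\n",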
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import Utils as model_helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 456 molecules\n"
     ]
    }
   ],
   "source": [
    "# Loads in the data via the project helpers in Utils (imported as model_helpers).\n",
    "# NOTE(review): load_smiles_strings is not referenced again in this notebook -- confirm whether this call is still needed.\n",
    "load_smiles_strings = model_helpers.load_other_smiles(coleman=True)\n",
    "# enantiomer_data is used below as a pandas DataFrame; presumably the \"coleman\" dataset -- verify against Utils.load_data.\n",
    "enantiomer_data = model_helpers.load_data(\"coleman\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Molecule Name</th>\n",
       "      <th>Pubchem ID #</th>\n",
       "      <th>Note</th>\n",
       "      <th>SMILES String</th>\n",
       "      <th>Other SMILES</th>\n",
       "      <th>Method</th>\n",
       "      <th>Contributor</th>\n",
       "      <th>Detection Threshold</th>\n",
       "      <th>Detection Units</th>\n",
       "      <th>Normalized Detection Threshold</th>\n",
       "      <th>Molecule Odour</th>\n",
       "      <th>Resources</th>\n",
       "      <th>N</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(R)-(-)-gamma-ionone</td>\n",
       "      <td>11389922</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CC(=O)/C=C/[C@H]1C(=C)CCCC1(C)C</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.10E+01</td>\n",
       "      <td>ppb water</td>\n",
       "      <td>11.00</td>\n",
       "      <td>Weak green, fruity, pineapple-like odor with m...</td>\n",
       "      <td>Rows 66-100 are from here: https://www.jstage....</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>(S)-(+)-gamma-ionone</td>\n",
       "      <td>11194862</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CC(=O)/C=C/[C@@H]1C(=C)CCCC1(C)C</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.00E-02</td>\n",
       "      <td>ppb water</td>\n",
       "      <td>0.07</td>\n",
       "      <td>Linear, very pleasant, floral, green, woody od...</td>\n",
       "      <td>Rows 101-121 are from here: https://github.com...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>(4R)-(-)-carvone</td>\n",
       "      <td>439570</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CC1=CC[C@H](CC1=O)C(=C)C</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.00E+00</td>\n",
       "      <td>ppb</td>\n",
       "      <td>2.00</td>\n",
       "      <td>sweet spearmint, fresh herbal</td>\n",
       "      <td>Rows 122 - 193 are from here: https://github.c...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>(4S)-(+)-carvone</td>\n",
       "      <td>16724</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CC1=CC[C@@H](CC1=O)C(=C)C</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.30E+02</td>\n",
       "      <td>ppb</td>\n",
       "      <td>130.00</td>\n",
       "      <td>caraway, fresh herbal</td>\n",
       "      <td>Rows 194-223 are from here: https://github.com...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>(4R,7R)-(+)-galaxolide</td>\n",
       "      <td>14177988</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)C</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.40E-01</td>\n",
       "      <td>ppb in air</td>\n",
       "      <td>0.44</td>\n",
       "      <td>weak to almost odorless</td>\n",
       "      <td>Rows 224-267 are from here: https://github.com...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Molecule Name  Pubchem ID # Note  \\\n",
       "0    (R)-(-)-gamma-ionone      11389922  NaN   \n",
       "1    (S)-(+)-gamma-ionone      11194862  NaN   \n",
       "2        (4R)-(-)-carvone        439570  NaN   \n",
       "3        (4S)-(+)-carvone         16724  NaN   \n",
       "4  (4R,7R)-(+)-galaxolide      14177988  NaN   \n",
       "\n",
       "                                    SMILES String Other SMILES Method  \\\n",
       "0                 CC(=O)/C=C/[C@H]1C(=C)CCCC1(C)C          NaN    NaN   \n",
       "1                CC(=O)/C=C/[C@@H]1C(=C)CCCC1(C)C          NaN    NaN   \n",
       "2                        CC1=CC[C@H](CC1=O)C(=C)C          NaN    NaN   \n",
       "3                       CC1=CC[C@@H](CC1=O)C(=C)C          NaN    NaN   \n",
       "4  C[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)C          NaN    NaN   \n",
       "\n",
       "  Contributor Detection Threshold Detection Units  \\\n",
       "0         NaN            1.10E+01       ppb water   \n",
       "1         NaN            7.00E-02       ppb water   \n",
       "2         NaN            2.00E+00             ppb   \n",
       "3         NaN            1.30E+02             ppb   \n",
       "4         NaN            4.40E-01      ppb in air   \n",
       "\n",
       "   Normalized Detection Threshold  \\\n",
       "0                           11.00   \n",
       "1                            0.07   \n",
       "2                            2.00   \n",
       "3                          130.00   \n",
       "4                            0.44   \n",
       "\n",
       "                                      Molecule Odour  \\\n",
       "0  Weak green, fruity, pineapple-like odor with m...   \n",
       "1  Linear, very pleasant, floral, green, woody od...   \n",
       "2                      sweet spearmint, fresh herbal   \n",
       "3                              caraway, fresh herbal   \n",
       "4                            weak to almost odorless   \n",
       "\n",
       "                                           Resources  N  \n",
       "0  Rows 66-100 are from here: https://www.jstage....  0  \n",
       "1  Rows 101-121 are from here: https://github.com...  0  \n",
       "2  Rows 122 - 193 are from here: https://github.c...  1  \n",
       "3  Rows 194-223 are from here: https://github.com...  1  \n",
       "4  Rows 224-267 are from here: https://github.com...  2  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Groups each pair of enantiomers by giving both rows the same integer in column \"N\".\n",
    "# This relies on the two enantiomers of a pair sitting on consecutive rows (0-1, 2-3, ...).\n",
    "# Integer floor division is used instead of a float-step arange, which numpy advises against.\n",
    "enantiomer_data['N'] = np.arange(len(enantiomer_data)) // 2\n",
    "# Sets the Normalized Detection Thresholds to float so later numeric steps do not hit type errors.\n",
    "# (Per-pair statistics are computed in a later cell via groupby('N'), not here.)\n",
    "enantiomer_data['Normalized Detection Threshold'] = enantiomer_data['Normalized Detection Threshold'].astype('float')\n",
    "enantiomer_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Adding all new smiles strings from \"Other SMILES\" column to official \"SMILES String\" column.\n",
    "# combine_first keeps the \"Other SMILES\" value wherever it is non-null and falls back to the\n",
    "# existing \"SMILES String\" value otherwise, so a provided alternative overrides the original.\n",
    "enantiomer_data[\"SMILES String\"] = enantiomer_data[\"Other SMILES\"].combine_first(enantiomer_data[\"SMILES String\"]) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Molecule Name</th>\n",
       "      <th>Pubchem ID #</th>\n",
       "      <th>Note</th>\n",
       "      <th>SMILES String</th>\n",
       "      <th>Other SMILES</th>\n",
       "      <th>Method</th>\n",
       "      <th>Contributor</th>\n",
       "      <th>Detection Threshold</th>\n",
       "      <th>Detection Units</th>\n",
       "      <th>Normalized Detection Threshold</th>\n",
       "      <th>Molecule Odour</th>\n",
       "      <th>Resources</th>\n",
       "      <th>N</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>72</th>\n",
       "      <td>(3R)-(-)-linalool</td>\n",
       "      <td>443158</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CC(=CCC[C@](C)(C=C)O)C</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.00E-04</td>\n",
       "      <td>ppb</td>\n",
       "      <td>0.0008</td>\n",
       "      <td>floral, woody lavender</td>\n",
       "      <td>NaN</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73</th>\n",
       "      <td>(3S)-(+)-linalool</td>\n",
       "      <td>67179</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CC(=CCC[C@@](C)(C=C)O)C</td>\n",
       "      <td>CC(=CCC[C@@](C)(C=C)O)C</td>\n",
       "      <td>PubChem Isomeric SMILES</td>\n",
       "      <td>DW</td>\n",
       "      <td>7.40E-03</td>\n",
       "      <td>ppb</td>\n",
       "      <td>0.0074</td>\n",
       "      <td>sweet, floral; odor reminiscent of petitgrain ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        Molecule Name  Pubchem ID # Note            SMILES String  \\\n",
       "72  (3R)-(-)-linalool        443158  NaN   CC(=CCC[C@](C)(C=C)O)C   \n",
       "73  (3S)-(+)-linalool         67179  NaN  CC(=CCC[C@@](C)(C=C)O)C   \n",
       "\n",
       "               Other SMILES                   Method Contributor  \\\n",
       "72                      NaN                      NaN         NaN   \n",
       "73  CC(=CCC[C@@](C)(C=C)O)C  PubChem Isomeric SMILES          DW   \n",
       "\n",
       "   Detection Threshold Detection Units  Normalized Detection Threshold  \\\n",
       "72            8.00E-04             ppb                          0.0008   \n",
       "73            7.40E-03             ppb                          0.0074   \n",
       "\n",
       "                                       Molecule Odour Resources   N  \n",
       "72                            floral, woody lavender        NaN  36  \n",
       "73  sweet, floral; odor reminiscent of petitgrain ...       NaN  36  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Searching for a specific molecule in the dataframe. Checking that SMILES Strings differ\n",
    "# between the two enantiomers. The match is a plain substring test (regex=False) and rows\n",
    "# with a missing name count as non-matches (na=False) so the boolean mask never contains NaN.\n",
    "enantiomer_data[enantiomer_data['Molecule Name'].str.contains('lina', regex=False, na=False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Saves the current state of enantiomer_data (with the N column and merged SMILES strings) to\n",
    "# enantiomer_data.csv, a path relative to the current working directory.\n",
    "# The DataFrame index is written too (to_csv default), so it appears as an unnamed first column.\n",
    "enantiomer_data.to_csv(\"enantiomer_data.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-pair statistics: each group of rows sharing the same N is passed to the Utils helpers\n",
    "# log_abs and harmonic, producing one entry per enantiomeric pair.\n",
    "pairs_by_n = enantiomer_data.groupby('N')\n",
    "half_log_abs = pairs_by_n.apply(model_helpers.log_abs)\n",
    "half_det = pairs_by_n.apply(model_helpers.harmonic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep one odorant per enantiomeric pair (every second row of the original dataset) and attach\n",
    "# the pair-level log_abs and det values from half_log_abs / half_det, matched by position.\n",
    "half_enantiomer_data = enantiomer_data.iloc[::2].assign(\n",
    "    log_abs=half_log_abs.values,\n",
    "    det=half_det.values,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: attaching log_abs to the half-size frame must not change how many values are null\n",
    "missing_in_grouped = half_log_abs.isna().sum()\n",
    "missing_in_column = half_enantiomer_data['log_abs'].isna().sum()\n",
    "assert missing_in_grouped == missing_in_column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Molecule Name</th>\n",
       "      <th>Pubchem ID #</th>\n",
       "      <th>Note</th>\n",
       "      <th>SMILES String</th>\n",
       "      <th>Other SMILES</th>\n",
       "      <th>Method</th>\n",
       "      <th>Contributor</th>\n",
       "      <th>Detection Threshold</th>\n",
       "      <th>Detection Units</th>\n",
       "      <th>Normalized Detection Threshold</th>\n",
       "      <th>Molecule Odour</th>\n",
       "      <th>Resources</th>\n",
       "      <th>N</th>\n",
       "      <th>log_abs</th>\n",
       "      <th>det</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(R)-(-)-gamma-ionone</td>\n",
       "      <td>11389922</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CC(=O)/C=C/[C@H]1C(=C)CCCC1(C)C</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.10E+01</td>\n",
       "      <td>ppb water</td>\n",
       "      <td>11.00</td>\n",
       "      <td>Weak green, fruity, pineapple-like odor with m...</td>\n",
       "      <td>Rows 66-100 are from here: https://www.jstage....</td>\n",
       "      <td>0</td>\n",
       "      <td>2.196295</td>\n",
       "      <td>-0.856627</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>(4R)-(-)-carvone</td>\n",
       "      <td>439570</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CC1=CC[C@H](CC1=O)C(=C)C</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.00E+00</td>\n",
       "      <td>ppb</td>\n",
       "      <td>2.00</td>\n",
       "      <td>sweet spearmint, fresh herbal</td>\n",
       "      <td>Rows 122 - 193 are from here: https://github.c...</td>\n",
       "      <td>1</td>\n",
       "      <td>1.812913</td>\n",
       "      <td>0.595429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>(4R,7R)-(+)-galaxolide</td>\n",
       "      <td>14177988</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)C</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.40E-01</td>\n",
       "      <td>ppb in air</td>\n",
       "      <td>0.44</td>\n",
       "      <td>weak to almost odorless</td>\n",
       "      <td>Rows 224-267 are from here: https://github.com...</td>\n",
       "      <td>2</td>\n",
       "      <td>2.643453</td>\n",
       "      <td>-2.699956</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>(4R,4aS,6R)-(+) nootkatone</td>\n",
       "      <td>1268142</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C[C@@H]1CC(=O)C=C2[C@]1(C[C@@H](CC2)C(=C)C)C</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.50E+01</td>\n",
       "      <td>ppm</td>\n",
       "      <td>15000.00</td>\n",
       "      <td>grapefruit odor</td>\n",
       "      <td>Rows 370-407 are from here: https://github.com...</td>\n",
       "      <td>3</td>\n",
       "      <td>3.643453</td>\n",
       "      <td>4.477023</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>(2S,4R)-(+) cis-2-methyl-4-propyl-1,3-oxathiane</td>\n",
       "      <td>6931728</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CCC[C@@H]1CCO[C@@H](S1)C</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.00E+00</td>\n",
       "      <td>ppb</td>\n",
       "      <td>2.00</td>\n",
       "      <td>ctypical sulfurous, with a rubbery onion note;...</td>\n",
       "      <td>Rows 424-435 are from here: https://github.com...</td>\n",
       "      <td>4</td>\n",
       "      <td>0.301030</td>\n",
       "      <td>0.425969</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     Molecule Name  Pubchem ID # Note  \\\n",
       "0                             (R)-(-)-gamma-ionone      11389922  NaN   \n",
       "2                                 (4R)-(-)-carvone        439570  NaN   \n",
       "4                           (4R,7R)-(+)-galaxolide      14177988  NaN   \n",
       "6                       (4R,4aS,6R)-(+) nootkatone       1268142  NaN   \n",
       "8  (2S,4R)-(+) cis-2-methyl-4-propyl-1,3-oxathiane       6931728  NaN   \n",
       "\n",
       "                                    SMILES String Other SMILES Method  \\\n",
       "0                 CC(=O)/C=C/[C@H]1C(=C)CCCC1(C)C          NaN    NaN   \n",
       "2                        CC1=CC[C@H](CC1=O)C(=C)C          NaN    NaN   \n",
       "4  C[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)C          NaN    NaN   \n",
       "6    C[C@@H]1CC(=O)C=C2[C@]1(C[C@@H](CC2)C(=C)C)C          NaN    NaN   \n",
       "8                        CCC[C@@H]1CCO[C@@H](S1)C          NaN    NaN   \n",
       "\n",
       "  Contributor Detection Threshold Detection Units  \\\n",
       "0         NaN            1.10E+01       ppb water   \n",
       "2         NaN            2.00E+00             ppb   \n",
       "4         NaN            4.40E-01      ppb in air   \n",
       "6         NaN            1.50E+01             ppm   \n",
       "8         NaN            2.00E+00             ppb   \n",
       "\n",
       "   Normalized Detection Threshold  \\\n",
       "0                           11.00   \n",
       "2                            2.00   \n",
       "4                            0.44   \n",
       "6                        15000.00   \n",
       "8                            2.00   \n",
       "\n",
       "                                      Molecule Odour  \\\n",
       "0  Weak green, fruity, pineapple-like odor with m...   \n",
       "2                      sweet spearmint, fresh herbal   \n",
       "4                            weak to almost odorless   \n",
       "6                                    grapefruit odor   \n",
       "8  ctypical sulfurous, with a rubbery onion note;...   \n",
       "\n",
       "                                           Resources  N   log_abs       det  \n",
       "0  Rows 66-100 are from here: https://www.jstage....  0  2.196295 -0.856627  \n",
       "2  Rows 122 - 193 are from here: https://github.c...  1  1.812913  0.595429  \n",
       "4  Rows 224-267 are from here: https://github.com...  2  2.643453 -2.699956  \n",
       "6  Rows 370-407 are from here: https://github.com...  3  3.643453  4.477023  \n",
       "8  Rows 424-435 are from here: https://github.com...  4  0.301030  0.425969  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows to visually check that the log_abs and det columns were added properly\n",
    "half_enantiomer_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove rows whose SMILES string is unusable: duplicates (so perceptual features are not counted\n",
    "# twice) and entries that are null or contain the text \"NaN\" / \"nan\".\n",
    "unique_smiles_rows = half_enantiomer_data.drop_duplicates(subset=['SMILES String'])\n",
    "smiles_values = unique_smiles_rows['SMILES String']\n",
    "flagged_upper = smiles_values.str.contains('NaN', na=True)\n",
    "flagged_lower = smiles_values.str.contains('nan', na=True)\n",
    "half_enantiomer_data = unique_smiles_rows[~(flagged_upper | flagged_lower)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every remaining SMILES string must be unique after the de-duplication step\n",
    "assert half_enantiomer_data['SMILES String'].is_unique, \"Number of SMILES strings should equal number of unique SMILES strings at this stage\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assert that there are no more nan values in the smiles string column.\n",
    "# Checks real missing values (NaN/None) as well as the literal strings 'nan' and 'NaN'; comparing\n",
    "# against the string 'nan' alone would let genuine missing values pass unnoticed.\n",
    "remaining_smiles = half_enantiomer_data['SMILES String']\n",
    "assert not (remaining_smiles.isna() | remaining_smiles.isin(['nan', 'NaN'])).any(), \"There should be no NaN SMILES strings at this point\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows that have a log_abs value\n",
    "has_log_abs = half_enantiomer_data['log_abs'].notna()\n",
    "half_enantiomer_data = half_enantiomer_data[has_log_abs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assert that neither log_abs nor det still contains a null value\n",
    "for stat_column in ('log_abs', 'det'):\n",
    "    assert not half_enantiomer_data[stat_column].isna().any()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.7.3 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)]"
  },
  "vscode": {
   "interpreter": {
    "hash": "f48e850f9ba95f784ffa8abe9e192013ea9f7c58fba7063fff0218a6f7b5b546"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}