## Load Data

In [31]:
#Gets the updates from the development files that are imported
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
#Imports the pandas library and the project's Utils helper module (aliased as model_helpers)
#NOTE(review): this comment previously mentioned a math library and a last line updating a "matlab library"; neither appears in this cell
import numpy as np
import pandas as pd

import Utils as model_helpers

In [16]:
# Loads in the data
# Calls the Utils helper load_other_smiles with coleman=True and keeps its return value.
# NOTE(review): load_smiles_strings is not referenced again in the cells visible here - confirm it is needed later.
load_smiles_strings = model_helpers.load_other_smiles(coleman=True)
# Loads the "coleman" dataset; the result is used as a pandas DataFrame by the cells below.
enantiomer_data = model_helpers.load_data("coleman")

Loaded 456 molecules


In [17]:
# Groups each pair of enantiomers by giving both rows the same integer in column "N"
# (rows 0-1 -> 0, rows 2-3 -> 1, ...). This relies on the two enantiomers of a pair
# sitting in adjacent rows of the table.
# Integer floor division is used instead of a float-step np.arange(...).astype(int):
# numpy's documentation warns that non-integer steps can give an unexpected length.
# Note: the ratio between the two thresholds of a pair is not computed in this cell.
enantiomer_data['N'] = np.arange(len(enantiomer_data)) // 2
# Sets the Normalized Detection Thresholds to be of the same type to avoid type errors later on
enantiomer_data['Normalized Detection Threshold'] = enantiomer_data['Normalized Detection Threshold'].astype('float')
enantiomer_data.head()

Unnamed: 0,Molecule Name,Pubchem ID #,Note,SMILES String,Other SMILES,Method,Contributor,Detection Threshold,Detection Units,Normalized Detection Threshold,Molecule Odour,Resources,N
0,(R)-(-)-gamma-ionone,11389922,,CC(=O)/C=C/[C@H]1C(=C)CCCC1(C)C,,,,11.0,ppb water,11.0,"Weak green, fruity, pineapple-like odor with m...",Rows 66-100 are from here: https://www.jstage....,0
1,(S)-(+)-gamma-ionone,11194862,,CC(=O)/C=C/[C@@H]1C(=C)CCCC1(C)C,,,,0.07,ppb water,0.07,"Linear, very pleasant, floral, green, woody od...",Rows 101-121 are from here: https://github.com...,0
2,(4R)-(-)-carvone,439570,,CC1=CC[C@H](CC1=O)C(=C)C,,,,2.0,ppb,2.0,"sweet spearmint, fresh herbal",Rows 122 - 193 are from here: https://github.c...,1
3,(4S)-(+)-carvone,16724,,CC1=CC[C@@H](CC1=O)C(=C)C,,,,130.0,ppb,130.0,"caraway, fresh herbal",Rows 194-223 are from here: https://github.com...,1
4,"(4R,7R)-(+)-galaxolide",14177988,,C[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)C,,,,0.44,ppb in air,0.44,weak to almost odorless,Rows 224-267 are from here: https://github.com...,2


In [18]:
# Prefer the value in "Other SMILES" whenever one is present; rows where it is
# missing keep their existing "SMILES String" value.
enantiomer_data["SMILES String"] = enantiomer_data["Other SMILES"].fillna(
    enantiomer_data["SMILES String"]
)

In [19]:
# Searching for a specific molecule in the dataframe. Checking that SMILES Strings differ.
# regex=False treats 'lina' as a literal substring rather than a regular expression.
# na=False maps rows with a missing Molecule Name to False, so the mask never contains
# NaN entries (which boolean indexing would reject).
enantiomer_data[enantiomer_data['Molecule Name'].str.contains('lina', regex=False, na=False)]

Unnamed: 0,Molecule Name,Pubchem ID #,Note,SMILES String,Other SMILES,Method,Contributor,Detection Threshold,Detection Units,Normalized Detection Threshold,Molecule Odour,Resources,N
72,(3R)-(-)-linalool,443158,,CC(=CCC[C@](C)(C=C)O)C,,,,0.0008,ppb,0.0008,"floral, woody lavender",,36
73,(3S)-(+)-linalool,67179,,CC(=CCC[C@@](C)(C=C)O)C,CC(=CCC[C@@](C)(C=C)O)C,PubChem Isomeric SMILES,DW,0.0074,ppb,0.0074,"sweet, floral; odor reminiscent of petitgrain ...",,36


In [20]:
# Writes the current state of enantiomer_data (index included, pandas' default)
# to enantiomer_data.csv, resolved relative to the working directory.
enantiomer_data.to_csv(path_or_buf="enantiomer_data.csv")

In [21]:
# Applies the Utils helpers log_abs and harmonic to every enantiomeric pair,
# i.e. to each group of rows sharing the same value of "N".
# NOTE(review): presumably each helper returns one value per pair - confirm in Utils.
pairs_by_n = enantiomer_data.groupby('N')
half_log_abs = pairs_by_n.apply(model_helpers.log_abs)
half_det = pairs_by_n.apply(model_helpers.harmonic)

In [22]:
# Keeps every second row (the first odorant of each enantiomeric pair) as a new,
# independent frame and attaches the per-pair results as columns 'log_abs' and 'det'.
# The values are attached positionally, so the order of the groupby results must
# match the order of the retained rows.
half_enantiomer_data = enantiomer_data.iloc[::2].assign(
    log_abs=half_log_abs.values,
    det=half_det.values,
)

In [23]:
# Sanity check: the number of null values in the new frame's 'log_abs' column
# must equal the number of null values in the per-pair log_abs results.
assert half_enantiomer_data['log_abs'].isna().sum() == half_log_abs.isna().sum()

In [24]:
# Displays the first five rows to confirm visually that the log_abs and det columns are present
half_enantiomer_data.head(n=5)

Unnamed: 0,Molecule Name,Pubchem ID #,Note,SMILES String,Other SMILES,Method,Contributor,Detection Threshold,Detection Units,Normalized Detection Threshold,Molecule Odour,Resources,N,log_abs,det
0,(R)-(-)-gamma-ionone,11389922,,CC(=O)/C=C/[C@H]1C(=C)CCCC1(C)C,,,,11.0,ppb water,11.0,"Weak green, fruity, pineapple-like odor with m...",Rows 66-100 are from here: https://www.jstage....,0,2.196295,-0.856627
2,(4R)-(-)-carvone,439570,,CC1=CC[C@H](CC1=O)C(=C)C,,,,2.0,ppb,2.0,"sweet spearmint, fresh herbal",Rows 122 - 193 are from here: https://github.c...,1,1.812913,0.595429
4,"(4R,7R)-(+)-galaxolide",14177988,,C[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)C,,,,0.44,ppb in air,0.44,weak to almost odorless,Rows 224-267 are from here: https://github.com...,2,2.643453,-2.699956
6,"(4R,4aS,6R)-(+) nootkatone",1268142,,C[C@@H]1CC(=O)C=C2[C@]1(C[C@@H](CC2)C(=C)C)C,,,,15.0,ppm,15000.0,grapefruit odor,Rows 370-407 are from here: https://github.com...,3,3.643453,4.477023
8,"(2S,4R)-(+) cis-2-methyl-4-propyl-1,3-oxathiane",6931728,,CCC[C@@H]1CCO[C@@H](S1)C,,,,2.0,ppb,2.0,"ctypical sulfurous, with a rubbery onion note;...",Rows 424-435 are from here: https://github.com...,4,0.30103,0.425969


In [25]:
# Removes rows whose SMILES String is unusable:
#   1. duplicates (the first occurrence is kept), so perceptual features are not counted twice;
#   2. rows whose SMILES String is missing or contains the text 'NaN';
#   3. rows whose SMILES String is missing or contains the text 'nan'.
half_enantiomer_data = (
    half_enantiomer_data
    .drop_duplicates(subset=['SMILES String'])
    .loc[lambda frame: ~frame['SMILES String'].str.contains('NaN', na=True)]
    .loc[lambda frame: ~frame['SMILES String'].str.contains('nan', na=True)]
)

In [26]:
# Assert statement to ensure that we only have unique SMILES strings
assert half_enantiomer_data['SMILES String'].is_unique, "Number of SMILES strings should equal number of unique SMILES strings at this stage"

In [27]:
# Assert that no missing or NaN-like values remain in the SMILES String column.
# Comparing with == 'nan' alone cannot detect real missing values (NaN/None) or the
# 'NaN' spelling, so the check covers missing values and both placeholder spellings.
assert (
    half_enantiomer_data['SMILES String'].notna().all()
    and not half_enantiomer_data['SMILES String'].isin(['nan', 'NaN']).any()
), "There should be no NaN SMILES strings at this point"

In [28]:
# Keeps only the rows that have a non-null log_abs value
half_enantiomer_data = half_enantiomer_data.dropna(subset=['log_abs'])

In [29]:
# Assert that neither the log_abs nor the det column contains a null value any more
assert not half_enantiomer_data['log_abs'].isna().any()
assert not half_enantiomer_data['det'].isna().any()