Load Data

Load Data#

#Gets the updates from the development files that are imported
%load_ext autoreload
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
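#Imports the pandas library, the NumPy numerical library, and the project's Utils helper file (as model_helpers)
#NOTE(review): the "last line" that updated a matlab-related library used by the helper file is no longer part of this cell - confirm whether it is still needed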
import numpy as np
import pandas as pd

import Utils as model_helpers
# Loads in the data through the Utils helper file (imported as model_helpers).
# NOTE(review): load_other_smiles(coleman=True) presumably returns the extra "Other SMILES" strings
# for the Coleman dataset - confirm against Utils. Its result is not referenced again in the code visible here.
load_smiles_strings = model_helpers.load_other_smiles(coleman=True)
# The "coleman" table is used as a pandas DataFrame by the cells below (column assignment, groupby, to_csv).
# NOTE(review): the "Loaded ... molecules" message presumably comes from one of these helper calls - confirm.
enantiomer_data = model_helpers.load_data("coleman")
Loaded 456 molecules
# Groups each pair of enantiomers by giving both rows the same integer in column "N".
# This relies on the two enantiomers of a pair sitting in consecutive rows:
# rows 0-1 get N=0, rows 2-3 get N=1, and so on. The later groupby('N') calls use these labels
# to combine the Normalized Detection Thresholds of each pair.
# Integer floor division is used (rather than a 0.5-step float range cast to int) so that the labels
# are exact integers and there is always exactly one label per row.
enantiomer_data['N'] = np.arange(len(enantiomer_data)) // 2
# Sets the Normalized Detection Thresholds to be of the same (float) type to avoid type errors later on
enantiomer_data['Normalized Detection Threshold'] = enantiomer_data['Normalized Detection Threshold'].astype('float')
enantiomer_data.head()
Molecule Name Pubchem ID # Note SMILES String Other SMILES Method Contributor Detection Threshold Detection Units Normalized Detection Threshold Molecule Odour Resources N
0 (R)-(-)-gamma-ionone 11389922 NaN CC(=O)/C=C/[C@H]1C(=C)CCCC1(C)C NaN NaN NaN 1.10E+01 ppb water 11.00 Weak green, fruity, pineapple-like odor with m... Rows 66-100 are from here: https://www.jstage.... 0
1 (S)-(+)-gamma-ionone 11194862 NaN CC(=O)/C=C/[C@@H]1C(=C)CCCC1(C)C NaN NaN NaN 7.00E-02 ppb water 0.07 Linear, very pleasant, floral, green, woody od... Rows 101-121 are from here: https://github.com... 0
2 (4R)-(-)-carvone 439570 NaN CC1=CC[C@H](CC1=O)C(=C)C NaN NaN NaN 2.00E+00 ppb 2.00 sweet spearmint, fresh herbal Rows 122 - 193 are from here: https://github.c... 1
3 (4S)-(+)-carvone 16724 NaN CC1=CC[C@@H](CC1=O)C(=C)C NaN NaN NaN 1.30E+02 ppb 130.00 caraway, fresh herbal Rows 194-223 are from here: https://github.com... 1
4 (4R,7R)-(+)-galaxolide 14177988 NaN C[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)C NaN NaN NaN 4.40E-01 ppb in air 0.44 weak to almost odorless Rows 224-267 are from here: https://github.com... 2
# Wherever an "Other SMILES" entry exists it takes precedence; rows without one keep
# the value already stored in the official "SMILES String" column
preferred_smiles = enantiomer_data["Other SMILES"]
fallback_smiles = enantiomer_data["SMILES String"]
enantiomer_data["SMILES String"] = preferred_smiles.combine_first(fallback_smiles)
# Spot check on one molecule: show every row whose name contains 'lina' so the SMILES Strings
# of the two enantiomers can be compared by eye and seen to differ
name_has_lina = enantiomer_data['Molecule Name'].str.contains('lina')
enantiomer_data[name_has_lina]
Molecule Name Pubchem ID # Note SMILES String Other SMILES Method Contributor Detection Threshold Detection Units Normalized Detection Threshold Molecule Odour Resources N
72 (3R)-(-)-linalool 443158 NaN CC(=CCC[C@](C)(C=C)O)C NaN NaN NaN 8.00E-04 ppb 0.0008 floral, woody lavender NaN 36
73 (3S)-(+)-linalool 67179 NaN CC(=CCC[C@@](C)(C=C)O)C CC(=CCC[C@@](C)(C=C)O)C PubChem Isomeric SMILES DW 7.40E-03 ppb 0.0074 sweet, floral; odor reminiscent of petitgrain ... NaN 36
# Write the current table (with the N column and the merged SMILES strings) to enantiomer_data.csv
enantiomer_data.to_csv("enantiomer_data.csv")
# For each enantiomeric pair (rows sharing the same N), apply the helper file's log_abs and
# harmonic functions to get one absolute log value and one harmonic value per pair
pairs_by_n = enantiomer_data.groupby('N')
half_log_abs = pairs_by_n.apply(model_helpers.log_abs)
half_det = pairs_by_n.apply(model_helpers.harmonic)
# Builds a new data frame that keeps only one odorant of each enantiomeric pair (every second row
# of the original dataset) and attaches the per-pair log_abs and det values to it, matched by position
half_enantiomer_data = enantiomer_data.iloc[::2].assign(
    log_abs=half_log_abs.values,
    det=half_det.values,
)
# Sanity check: the count of null values in half_log_abs must equal the count of null values
# in the new data frame's 'log_abs' column
nulls_in_source = half_log_abs.isnull().sum()
nulls_in_column = half_enantiomer_data['log_abs'].isnull().sum()
assert nulls_in_source == nulls_in_column
# Display the first rows to check that the log_abs and det columns were added properly
half_enantiomer_data.head()
Molecule Name Pubchem ID # Note SMILES String Other SMILES Method Contributor Detection Threshold Detection Units Normalized Detection Threshold Molecule Odour Resources N log_abs det
0 (R)-(-)-gamma-ionone 11389922 NaN CC(=O)/C=C/[C@H]1C(=C)CCCC1(C)C NaN NaN NaN 1.10E+01 ppb water 11.00 Weak green, fruity, pineapple-like odor with m... Rows 66-100 are from here: https://www.jstage.... 0 2.196295 -0.856627
2 (4R)-(-)-carvone 439570 NaN CC1=CC[C@H](CC1=O)C(=C)C NaN NaN NaN 2.00E+00 ppb 2.00 sweet spearmint, fresh herbal Rows 122 - 193 are from here: https://github.c... 1 1.812913 0.595429
4 (4R,7R)-(+)-galaxolide 14177988 NaN C[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)C NaN NaN NaN 4.40E-01 ppb in air 0.44 weak to almost odorless Rows 224-267 are from here: https://github.com... 2 2.643453 -2.699956
6 (4R,4aS,6R)-(+) nootkatone 1268142 NaN C[C@@H]1CC(=O)C=C2[C@]1(C[C@@H](CC2)C(=C)C)C NaN NaN NaN 1.50E+01 ppm 15000.00 grapefruit odor Rows 370-407 are from here: https://github.com... 3 3.643453 4.477023
8 (2S,4R)-(+) cis-2-methyl-4-propyl-1,3-oxathiane 6931728 NaN CCC[C@@H]1CCO[C@@H](S1)C NaN NaN NaN 2.00E+00 ppb 2.00 ctypical sulfurous, with a rubbery onion note;... Rows 424-435 are from here: https://github.com... 4 0.301030 0.425969
# Gets rid of all the invalid SMILES Strings: the duplicates (we don't want to count their perceptual
# features twice) and the missing / "nan" / "NaN" values
half_enantiomer_data = half_enantiomer_data.drop_duplicates(subset=['SMILES String'])
# One mask covers both cases: na=True flags real missing values, the pattern flags text containing 'NaN' or 'nan'
invalid_smiles = half_enantiomer_data['SMILES String'].str.contains('NaN|nan', na=True)
half_enantiomer_data = half_enantiomer_data[~invalid_smiles]
# Assert statement to ensure that we only have unique smiles strings
assert half_enantiomer_data['SMILES String'].is_unique, "Number of SMILES strings should equal number of unique SMILES strings at this stage"
# Assert that there are no more nan values in the smiles string column. The check covers real missing
# values and both spellings of the placeholder text, i.e. everything the filter above is meant to remove
remaining_smiles = half_enantiomer_data['SMILES String']
assert not (remaining_smiles.isna() | remaining_smiles.isin(['nan', 'NaN'])).any(), "There should be no NaN SMILES strings at this point"
# Keep only the rows that have a non-null log_abs value
has_log_abs = half_enantiomer_data['log_abs'].notnull()
half_enantiomer_data = half_enantiomer_data[has_log_abs]
# Confirm that neither the log_abs nor the det column contains a null value any more
assert not half_enantiomer_data['log_abs'].isnull().any()
assert not half_enantiomer_data['det'].isnull().any()