Load Data

Load Data#

#Gets the updates from the development files that are imported
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

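# Imports the project helper module (Utils, aliased as model_helpers) and the third-party libraries used below.
# NOTE(review): this comment previously described a last line that updates a MATLAB library used in the init file; no such line is present in this cell.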
import numpy as np
import pandas as pd

import Utils as model_helpers

# Load the "coleman" dataset through the project helper module.
# NOTE(review): both helpers live in Utils, outside this file. What
# `load_other_smiles` returns, and whether it has to run before `load_data`,
# cannot be confirmed from here -- keep the call order as-is.
load_smiles_strings = model_helpers.load_other_smiles(coleman=True)
enantiomer_data = model_helpers.load_data("coleman")

Loaded 456 molecules

# Tag every enantiomeric pair with a shared integer id in column "N".
# This relies on the table storing each pair as two consecutive rows
# (rows 0-1 -> N=0, rows 2-3 -> N=1, ...), so the id is the row position
# floor-divided by 2. Integer arithmetic is used instead of stepping a float
# range by 0.5 and truncating the result.
# The per-pair comparison itself is done later via groupby('N').
enantiomer_data['N'] = np.arange(len(enantiomer_data)) // 2

# Cast the normalized thresholds to float so later numeric steps see a single
# dtype and do not hit type errors.
enantiomer_data['Normalized Detection Threshold'] = enantiomer_data['Normalized Detection Threshold'].astype('float')

# Preview the first rows, including the new "N" column.
enantiomer_data.head()

	Molecule Name	Pubchem ID #	Note	SMILES String	Other SMILES	Method	Contributor	Detection Threshold	Detection Units	Normalized Detection Threshold	Molecule Odour	Resources	N
0	(R)-(-)-gamma-ionone	11389922	NaN	CC(=O)/C=C/[C@H]1C(=C)CCCC1(C)C	NaN	NaN	NaN	1.10E+01	ppb water	11.00	Weak green, fruity, pineapple-like odor with m...	Rows 66-100 are from here: https://www.jstage....	0
1	(S)-(+)-gamma-ionone	11194862	NaN	CC(=O)/C=C/[C@@H]1C(=C)CCCC1(C)C	NaN	NaN	NaN	7.00E-02	ppb water	0.07	Linear, very pleasant, floral, green, woody od...	Rows 101-121 are from here: https://github.com...	0
2	(4R)-(-)-carvone	439570	NaN	CC1=CC[C@H](CC1=O)C(=C)C	NaN	NaN	NaN	2.00E+00	ppb	2.00	sweet spearmint, fresh herbal	Rows 122 - 193 are from here: https://github.c...	1
3	(4S)-(+)-carvone	16724	NaN	CC1=CC[C@@H](CC1=O)C(=C)C	NaN	NaN	NaN	1.30E+02	ppb	130.00	caraway, fresh herbal	Rows 194-223 are from here: https://github.com...	1
4	(4R,7R)-(+)-galaxolide	14177988	NaN	C[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)C	NaN	NaN	NaN	4.40E-01	ppb in air	0.44	weak to almost odorless	Rows 224-267 are from here: https://github.com...	2

# Wherever an "Other SMILES" entry exists it takes precedence; rows without
# one keep their existing "SMILES String" value.
other_smiles = enantiomer_data["Other SMILES"]
enantiomer_data["SMILES String"] = other_smiles.combine_first(enantiomer_data["SMILES String"])

# Spot check: show every row whose molecule name contains "lina" so the
# SMILES strings of the two enantiomers can be compared by eye.
name_matches = enantiomer_data['Molecule Name'].str.contains('lina')
enantiomer_data.loc[name_matches]

	Molecule Name	Pubchem ID #	Note	SMILES String	Other SMILES	Method	Contributor	Detection Threshold	Detection Units	Normalized Detection Threshold	Molecule Odour	Resources	N
72	(3R)-(-)-linalool	443158	NaN	CC(=CCC[C@](C)(C=C)O)C	NaN	NaN	NaN	8.00E-04	ppb	0.0008	floral, woody lavender	NaN	36
73	(3S)-(+)-linalool	67179	NaN	CC(=CCC[C@@](C)(C=C)O)C	CC(=CCC[C@@](C)(C=C)O)C	PubChem Isomeric SMILES	DW	7.40E-03	ppb	0.0074	sweet, floral; odor reminiscent of petitgrain ...	NaN	36

# Save the table, with the pair ids and merged SMILES strings, to
# enantiomer_data.csv in the current working directory. pandas writes the row
# index as the first column by default.
enantiomer_data.to_csv("enantiomer_data.csv")

# Apply the two project helpers once per enantiomeric pair (one "N" group
# each): `log_abs` gives the absolute log value and `harmonic` the harmonic
# value for the pair.
# NOTE(review): both helpers are defined in Utils; their exact formulas are
# not visible in this file.
pair_groups = enantiomer_data.groupby('N')
half_log_abs = pair_groups.apply(model_helpers.log_abs)
half_det = pair_groups.apply(model_helpers.harmonic)

# Keep one odorant per enantiomeric pair (every second row, starting with the
# first) and attach the per-pair statistics as new columns "log_abs" and "det".
# `.values` strips the group-id index so the arrays are matched to the kept
# rows purely by position.
half_enantiomer_data = enantiomer_data.iloc[::2].assign(
    log_abs=half_log_abs.values,
    det=half_det.values,
)

# Sanity check: the new 'log_abs' column must contain exactly as many nulls
# as the per-pair result it was copied from.
assert half_enantiomer_data['log_abs'].isna().sum() == half_log_abs.isna().sum()

# Preview the first rows to confirm that 'log_abs' and 'det' were added.
half_enantiomer_data.head(5)

	Molecule Name	Pubchem ID #	Note	SMILES String	Other SMILES	Method	Contributor	Detection Threshold	Detection Units	Normalized Detection Threshold	Molecule Odour	Resources	N	log_abs	det
0	(R)-(-)-gamma-ionone	11389922	NaN	CC(=O)/C=C/[C@H]1C(=C)CCCC1(C)C	NaN	NaN	NaN	1.10E+01	ppb water	11.00	Weak green, fruity, pineapple-like odor with m...	Rows 66-100 are from here: https://www.jstage....	0	2.196295	-0.856627
2	(4R)-(-)-carvone	439570	NaN	CC1=CC[C@H](CC1=O)C(=C)C	NaN	NaN	NaN	2.00E+00	ppb	2.00	sweet spearmint, fresh herbal	Rows 122 - 193 are from here: https://github.c...	1	1.812913	0.595429
4	(4R,7R)-(+)-galaxolide	14177988	NaN	C[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)C	NaN	NaN	NaN	4.40E-01	ppb in air	0.44	weak to almost odorless	Rows 224-267 are from here: https://github.com...	2	2.643453	-2.699956
6	(4R,4aS,6R)-(+) nootkatone	1268142	NaN	C[C@@H]1CC(=O)C=C2[C@]1(C[C@@H](CC2)C(=C)C)C	NaN	NaN	NaN	1.50E+01	ppm	15000.00	grapefruit odor	Rows 370-407 are from here: https://github.com...	3	3.643453	4.477023
8	(2S,4R)-(+) cis-2-methyl-4-propyl-1,3-oxathiane	6931728	NaN	CCC[C@@H]1CCO[C@@H](S1)C	NaN	NaN	NaN	2.00E+00	ppb	2.00	ctypical sulfurous, with a rubbery onion note;...	Rows 424-435 are from here: https://github.com...	4	0.301030	0.425969

# Remove rows with unusable SMILES strings in three steps:
#   1. drop repeated SMILES so their perceptual features are not counted twice,
#   2. drop rows whose SMILES is missing or contains the text "NaN",
#   3. drop rows whose SMILES is missing or contains the text "nan".
# `na=True` makes missing values count as matches, so they are removed too.
half_enantiomer_data = (
    half_enantiomer_data
    .drop_duplicates(subset=['SMILES String'])
    .loc[lambda frame: ~frame['SMILES String'].str.contains('NaN', na=True)]
    .loc[lambda frame: ~frame['SMILES String'].str.contains('nan', na=True)]
)

# Every remaining SMILES string must be unique.
assert half_enantiomer_data['SMILES String'].is_unique, "Number of SMILES strings should equal number of unique SMILES strings at this stage"

# No remaining SMILES string may be the literal text 'nan'.
assert not (half_enantiomer_data['SMILES String'] == 'nan').any(), "There should be no NaN SMILES strings at this point"

# Keep only the rows that have a log_abs value.
half_enantiomer_data = half_enantiomer_data[half_enantiomer_data['log_abs'].notna()]

# After filtering, neither log_abs nor det may contain nulls.
assert not half_enantiomer_data['log_abs'].isna().any()
assert not half_enantiomer_data['det'].isna().any()