# PCA and VCD (could delete, since there were no matches)
# Compute PCA and VCD features to feed into the model
import pandas as pd
# Load the base enantiomer dataset (path is relative to the working directory).
# No index_col is passed, so the rows get pandas' default integer index.
half_enantiomer_data = pd.read_csv("half_enantiomer_data.csv")
# Read in the PCA/VCD dataframe to use as features. Per the displayed output it
# has a 'Molecule Name' column plus 15 numeric columns labelled 0-14.
pca_vcd_values = pd.read_csv("../data/vcd/pca_vcd_values.csv")
# Bare expression: displays the dataframe as the notebook cell output
pca_vcd_values
Molecule Name | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | (R)-(-)-Carvone | 7.464317 | -43.591126 | -8.046428 | -0.574730 | -3.580254 | 5.872903 | -2.689295 | 0.910235 | 3.263888 | 1.241722 | -1.817992 | -10.064818 | -5.517172 | 6.956873 | 0.003187 |
1 | (1R,2S,5R)-(-)-Menthol | -15.984788 | -48.409755 | 11.599679 | -6.459996 | -2.895890 | -21.817187 | 7.987366 | 30.168379 | -4.855940 | -27.966750 | -15.593791 | 18.881794 | 6.680355 | -3.869645 | -18.299473 |
2 | (-)-Menthyl chloride | -5.850617 | -48.612997 | 25.383460 | -1.972609 | -9.623228 | -12.079316 | -5.014842 | 9.790833 | -11.125811 | -10.948924 | 12.772888 | -4.068723 | 4.196934 | 0.326200 | 0.769741 |
3 | (-)-Borneol | 0.756580 | -25.884643 | -4.987043 | 6.148153 | -18.167909 | 7.294051 | -16.954397 | 9.166877 | 10.160046 | 8.512541 | 22.090822 | -60.188603 | 5.246429 | 9.022766 | -17.823573 |
4 | (-)-Isoplegol | -12.599590 | -33.717238 | -12.966523 | -18.479083 | 5.945713 | -6.208222 | -4.337392 | 10.783442 | -12.640204 | 1.773643 | 1.927400 | 6.935122 | 5.856485 | 8.257695 | -9.983220 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
156 | β-Cholestanol | 6.415996 | -18.840982 | 20.303654 | -29.238431 | -26.058052 | 8.328641 | -8.520251 | 19.753263 | 24.650594 | 37.600273 | -1.058662 | -16.200175 | 37.116864 | -1.951506 | -10.766619 |
157 | Cholesterol Ethyl Carbonate | 230.771312 | 139.215929 | -45.562654 | -28.445263 | 7.019739 | -30.857068 | 18.819408 | -6.542138 | 7.706800 | -5.896378 | 7.642116 | -5.188079 | 5.985704 | -34.834249 | -7.737224 |
158 | Cholesterol n-Caprate | 181.677425 | 33.151223 | 72.048835 | -49.147260 | 19.664893 | 20.812246 | -3.977717 | 5.351038 | 14.182154 | -2.607771 | 6.333818 | -0.251176 | -7.470138 | 19.545668 | -8.373291 |
159 | Cholesterol Laurate | -130.152846 | 164.035229 | 61.091725 | -37.136634 | -13.847154 | 14.540341 | -4.944992 | -3.105062 | 2.508800 | 1.700032 | 5.700678 | 8.617406 | -4.496284 | 22.150449 | 2.389079 |
160 | Cholesterol Myristate | -132.817778 | 151.835799 | 66.528593 | -50.454738 | 7.112731 | 21.576747 | 6.387919 | -5.878909 | -6.011154 | 7.648669 | 4.893929 | 7.866558 | 1.410726 | 11.221058 | 0.188386 |
161 rows × 16 columns
# Set the index to 'Molecule Name' so feature rows can be selected by molecule
# name; the column is moved out of the data columns and becomes the row label.
pca_vcd_values = pca_vcd_values.set_index('Molecule Name')
# Preview the first rows to check the new index
pca_vcd_values.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Molecule Name | |||||||||||||||
(R)-(-)-Carvone | 7.464317 | -43.591126 | -8.046428 | -0.574730 | -3.580254 | 5.872903 | -2.689295 | 0.910235 | 3.263888 | 1.241722 | -1.817992 | -10.064818 | -5.517172 | 6.956873 | 0.003187 |
(1R,2S,5R)-(-)-Menthol | -15.984788 | -48.409755 | 11.599679 | -6.459996 | -2.895890 | -21.817187 | 7.987366 | 30.168379 | -4.855940 | -27.966750 | -15.593791 | 18.881794 | 6.680355 | -3.869645 | -18.299473 |
(-)-Menthyl chloride | -5.850617 | -48.612997 | 25.383460 | -1.972609 | -9.623228 | -12.079316 | -5.014842 | 9.790833 | -11.125811 | -10.948924 | 12.772888 | -4.068723 | 4.196934 | 0.326200 | 0.769741 |
(-)-Borneol | 0.756580 | -25.884643 | -4.987043 | 6.148153 | -18.167909 | 7.294051 | -16.954397 | 9.166877 | 10.160046 | 8.512541 | 22.090822 | -60.188603 | 5.246429 | 9.022766 | -17.823573 |
(-)-Isoplegol | -12.599590 | -33.717238 | -12.966523 | -18.479083 | 5.945713 | -6.208222 | -4.337392 | 10.783442 | -12.640204 | 1.773643 | 1.927400 | 6.935122 | 5.856485 | 8.257695 | -9.983220 |
# Copy the original dataframe. A plain assignment would only bind a second
# name to the same object, so any in-place change made through the "copy"
# would also alter half_enantiomer_data; .copy() gives an independent frame.
half_enantiomer_data_copy = half_enantiomer_data.copy()
# Trailing semicolon suppresses the notebook's display of the frame
half_enantiomer_data_copy;
# Keep only the rows of the enantiomer dataset whose molecule name also
# appears in the PCA/VCD table.
# NOTE: half_enantiomer_data_copy carries the default integer row index while
# pca_vcd_values is indexed by 'Molecule Name', so intersecting the two
# indexes directly can never match (the join came back with 0 rows). Compare
# the 'Molecule Name' column against the PCA/VCD index instead.
common_index = pca_vcd_values.index.intersection(
    half_enantiomer_data_copy["Molecule Name"]
)
half_enantiomer_data_copy = half_enantiomer_data_copy[
    half_enantiomer_data_copy["Molecule Name"].isin(common_index)
]
pca_vcd_features = pca_vcd_values.loc[common_index]
# Combine original dataset with pca_vcd: match the 'Molecule Name' column of
# the left frame against the name index of the feature frame, so
# 'Molecule Name' stays a regular column in the result.
pca_vcd = half_enantiomer_data_copy.join(
    pca_vcd_features, on="Molecule Name", how="inner"
)
# Bare expression: displays the combined dataframe as the cell output
pca_vcd
Unnamed: 0 | Unnamed: 0.1 | Molecule Name | Pubchem ID # | Note | SMILES String | Other SMILES | Method | Contributor | Detection Threshold | ... | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
---|
0 rows × 32 columns