import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# note: openpyxl must be installed; pandas uses it as the .xlsx engine
# directory containing the USDA Excel files, one per animal
directory = os.path.join(os.getcwd(), 'meats')
Data Source: USDA data
Language: Python
Libraries:
- Data cleaning and preprocessing: pandas, scikit-learn
- PCA and clustering: scikit-learn
- Visualization: plotly, seaborn
Given various cuts of meat and their chemical composition, can k-means clustering group the cuts appropriately, such that the cuts in each cluster all come from the same animal?
1. Introduction
In this project, the goal was to explore unsupervised learning techniques to cluster meats from different animals based on nutrient information. The dataset included 29 nutrient features for cuts of meat from cows (beef), pigs (pork), sheep (lamb), and calves (veal).
2. Data Preprocessing
The dataset underwent standardization to ensure all features were on a comparable scale. This preprocessing step was crucial for the subsequent application of PCA and K-means clustering.
3. Dimensionality Reduction (PCA)
Principal Component Analysis (PCA) was employed to reduce the dimensionality of the dataset. Using a scree plot and the cumulative explained variance, six principal components were retained, capturing roughly 84% of the data's variability.
4. K-means Clustering
To group the meats into meaningful clusters, K-means was applied with four clusters, one per animal. However, K-means proved sensitive to its random initialization, producing both correct and incorrect groupings across runs.
# Data downloaded from the USDA website; cuts of meat from each animal are stored in a separate Excel file.
# function for opening an Excel file
def get_excel_df(filename, file_loc):
    xls = pd.ExcelFile(file_loc + '/' + filename)
    return xls
# This function reads the nutrient data from each sheet in a given Excel file
# and appends it to a long-format dataframe.
def get_data_from_xl(file, xl_file):
    combined_df = pd.DataFrame()
    for sheet_name in xl_file.sheet_names:
        # load the nutrient name (column 0) and amount (column 3) from the current sheet
        df = pd.read_excel(xl_file, sheet_name, header=None, usecols=[0, 3])
        # keep only the rows between 'Water' and 'Vitamin B12' (the nutrient block)
        start_idx = df[df[0] == 'Water'].index[0]
        end_idx = df[df[0] == 'Vitamin B12'].index[0]
        df = df[start_idx:end_idx + 1]
        # prefix the sheet name with the animal where the sheet names alone are ambiguous
        if 'Pork' in file:
            df['Type'] = 'Pork ' + sheet_name
        elif 'Beef' in file:
            df['Type'] = 'Beef ' + sheet_name
        else:
            df['Type'] = sheet_name
        combined_df = pd.concat([combined_df, df])
    combined_df['Nutrient'] = combined_df[0]
    combined_df['Amount'] = combined_df[3]
    return combined_df
file_extension = '.xlsx'
# list all files in the directory
files = os.listdir(directory)
# keep only the Excel files
filtered_files = [file for file in files if file.endswith(file_extension)]
all_types_combined_df = pd.DataFrame()
# loop over these files and extract the nutrient data into one long dataframe
for file in filtered_files:
    xl_df = get_excel_df(file, directory)
    ret_df = get_data_from_xl(file, xl_df)
    all_types_combined_df = pd.concat([all_types_combined_df, ret_df])
all_types_combined_df = all_types_combined_df[['Type', 'Nutrient', 'Amount']]
# pivot to wide format: one row per cut, one column per nutrient
wide_df = all_types_combined_df.pivot_table(index='Type', columns='Nutrient', values='Amount')
wide_df.head()
| Type | Ash | Calcium, Ca | Calories from fat | Carbohydrate, by difference | Cholesterol | Energy | Fatty acids, total saturated | Fatty acids, total trans | Fiber, total dietary | Iron, Fe | ... | Sugars, total | Thiamin | Total lipid (fat) | Vitamin A | Vitamin B12 | Vitamin B6 | Vitamin C, total ascorbic acid | Vitamin D | Water | Zinc, Zn |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Beef Brisket | 1.0115 | 12.4891 | 76.0338 | 0.0000 | 68.6000 | 158.3250 | 3.4245 | NaN | 0.0 | 2.0211 | ... | 0.0 | 0.0792 | 8.4482 | 12.5831 | 1.8762 | 0.5966 | 0.0 | NaN | 70.5221 | 5.0533 |
| Beef Denver Cut | 0.9053 | 11.4525 | 94.5270 | 0.0942 | 67.8000 | 171.6874 | 4.3699 | 0.6694 | 0.0 | 2.3829 | ... | 0.0 | 0.0794 | 10.5030 | 13.1041 | 3.2179 | 0.4208 | 0.0 | NaN | 69.3016 | 7.5064 |
| Beef Eye round steak | 1.0815 | 12.6419 | 26.5203 | 0.0000 | 62.2000 | 119.7131 | 1.1988 | NaN | 0.0 | 1.3731 | ... | 0.0 | 0.0631 | 2.9467 | 16.6994 | 2.0573 | 0.6345 | 0.0 | NaN | 73.4599 | 3.3601 |
| Beef Flank | 0.9830 | 24.0657 | 54.4986 | 0.0000 | 62.0928 | 145.2364 | 2.5144 | NaN | 0.0 | 1.5376 | ... | 0.0 | 0.0758 | 6.0554 | 0.0000 | 1.0075 | 0.5861 | 0.0 | NaN | 71.9934 | 3.6027 |
| Beef Mock Tender | 1.1109 | 11.0122 | 37.8054 | 0.0000 | 67.9000 | 121.9910 | 1.9760 | 0.2387 | 0.0 | 2.2499 | ... | 0.0 | 0.0798 | 4.2006 | 11.0951 | 2.9941 | 0.4451 | 0.0 | NaN | 74.0081 | 7.8519 |

5 rows × 29 columns
The resulting dataframe has 49 rows and 29 columns. I want to identify the principal components that represent most of the variance in the data; this reduces the number of features in the dataset while retaining the most important information, so the transformed dataset (using just these components) can be used for clustering. This is a small demonstration dataset, but for large datasets with many features (thousands, millions, or more), dimensionality reduction can dramatically improve computational efficiency by simplifying storage and processing requirements. I will use two criteria for choosing the number of principal components:
1. Scree plot
2. Combined explained variance
# dataframe to numpy array
arr = np.array(wide_df)
# feature standardization (NaNs are ignored when computing each feature's mean and variance)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(arr)
# impute missing values with 0; since the data are already standardized,
# this is equivalent to imputing each feature with its mean
imp_zeros = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
imp_zeros.fit(scaled_data)
imputed = imp_zeros.transform(scaled_data)
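A quick sanity check, not in the original notebook, that no missing values remain after imputation:

# verify the imputed matrix is NaN-free before PCA
assert not np.isnan(imputed).any()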
# first perform PCA with all components to understand the principal contributors to the variance, then make a scree plot
pca1 = PCA()
Xt1 = pca1.fit_transform(imputed)
pca1.explained_variance_ratio_
array([4.12951882e-01, 1.68996185e-01, 8.83646153e-02, 7.40882400e-02, 5.66420316e-02, 3.77375487e-02, 3.38447948e-02, 2.61048524e-02, 2.46746744e-02, 1.69673920e-02, 1.26967701e-02, 1.15317175e-02, 9.59946520e-03, 6.99932598e-03, 5.29167127e-03, 4.03548010e-03, 3.02312997e-03, 1.70586596e-03, 1.59896732e-03, 1.28196791e-03, 8.52107271e-04, 6.64851637e-04, 2.43168713e-04, 8.19285655e-05, 1.64126770e-05, 4.95314158e-06, 2.11630937e-33, 2.11630937e-33, 2.11630937e-33])
# for each principal component, find the original feature with the largest absolute loading
n_pcs = pca1.components_.shape[0]
component_index = [np.abs(pca1.components_[i]).argmax() for i in range(n_pcs)]
initial_feature_names = wide_df.columns
most_important_names = [initial_feature_names[component_index[i]] for i in range(n_pcs)]
data = {'Feature Name': most_important_names, 'Component Index': component_index, 'Explained Variance': pca1.explained_variance_ratio_ * 100}
df = pd.DataFrame(data)
df['Explained Variance'] = df['Explained Variance'].round(2).astype(str) + '%'
df.head()
| | Feature Name | Component Index | Explained Variance |
|---|---|---|---|
| 0 | Fatty acids, total saturated | 6 | 41.3% |
| 1 | Iron, Fe | 9 | 16.9% |
| 2 | Niacin | 11 | 8.84% |
| 3 | Sodium, Na | 18 | 7.41% |
| 4 | Calcium, Ca | 1 | 5.66% |
# scree plot
plt.plot(range(0, 29), pca1.explained_variance_ratio_)
plt.title("Scree plot: explained variance as a function of principal component")
plt.xlabel("PC")
plt.ylabel("explained variance ratio")
plt.show()
#combined explained variance
sum(pca1.explained_variance_ratio_[0:6])
0.8387805031771249
From the scree plot, we can see that the variance levels off at ~5 components, and the first 6 principal components account for ~84% of the variance in the data. I'll transform the data using 6 PCs and cluster with K-means.
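As a small generalization of the sum above (a sketch, not part of the original pipeline; the 80% threshold is an assumed example): np.cumsum gives the cumulative explained variance, and argmax finds the smallest component count that reaches the threshold.

# sketch: smallest number of components whose cumulative explained variance meets a threshold
cum_var = np.cumsum(pca1.explained_variance_ratio_)
threshold = 0.80  # assumed example threshold
k = int(np.argmax(cum_var >= threshold)) + 1  # argmax returns the first True position
print(f"{k} components explain {cum_var[k - 1]:.1%} of the variance")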
# perform PCA with n_components = 6
pca = PCA(n_components = 6)
Xt = pca.fit_transform(imputed)
# K-means with 4 clusters (one per animal); random initialization makes results run-dependent
km = KMeans(n_clusters=4, init='random', n_init='auto')
rand_init_predicted = km.fit(Xt)
wide_df['Clusters_labels'] = km.labels_
# label each cut by its source animal
def categorize_meat(meat):
    # beef and pork rows were prefixed explicitly; lamb and veal keep their sheet names,
    # so compare case-insensitively and fall back to None for anything unmatched
    name = meat.lower()
    if 'beef' in name:
        return "Cow"
    elif 'pork' in name:
        return "Pig"
    elif 'lamb' in name:
        return "Sheep"
    elif 'veal' in name:
        return "Calf"
    return None
wide_df['Meat_type_labels'] = wide_df.index.map(categorize_meat)
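Before plotting, a quick cross-tabulation (a small check, not in the original code) shows how the cluster labels line up with the animals; a perfect clustering would put each animal's cuts into a single, distinct cluster:

# sketch: cluster composition per animal
print(pd.crosstab(wide_df['Meat_type_labels'], wide_df['Clusters_labels']))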
data = {"PC1":Xt[:,1],"PC2":Xt[:,2], "cluster_labels":wide_df["Clusters_labels"]}
df = pd.DataFrame(data)
df['cluster_labels'] = df['cluster_labels'].astype('category')
f = px.scatter(df, x='PC1', y='PC2', color='cluster_labels',
               color_discrete_sequence=px.colors.qualitative.Vivid,
               template='simple_white')
f.update_layout(title = {
'text': "<b>Data points colored by clustering labels oriented along PC1 and PC2</b>",
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
f.show()
# compute the average silhouette score for a range of cluster counts
range_n_clusters = list(range(2, 18))
sc = []
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, n_init="auto", random_state=10)
    cluster_labels = clusterer.fit_predict(Xt)
    silhouette_avg = silhouette_score(Xt, cluster_labels)
    sc.append(silhouette_avg)
plt.plot(range_n_clusters, sc)
plt.title("Average Silhouette Score as a function of number of clusters")
plt.xlabel("Number of clusters")
plt.ylabel("Average Silhouette Score")
plt.show()
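A quick readout of the best-scoring cluster count (a small addition, not in the original code):

# report the cluster count with the highest average silhouette score
best_k = range_n_clusters[int(np.argmax(sc))]
print(f"Best average silhouette score {max(sc):.3f} at k = {best_k}")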
fig = px.scatter(wide_df, x='Clusters_labels', y=wide_df.index,
                 color='Meat_type_labels', color_discrete_sequence=px.colors.qualitative.Vivid,
                 title="<b>K-means clustering of cuts of meat from four different animals</b>",
                 template='simple_white')
fig.update_xaxes(tickvals=[0, 1, 2, 3])
fig.show()
In this analysis, the question was: given various cuts of meat and their chemical composition, can k-means clustering group the cuts so that the cuts in each cluster belong to the same animal? The clustering results included runs where meats from the same animal were correctly grouped into the same cluster. However, the clustering varied between runs because of K-means's sensitivity to initialization. Visualizations such as the scatter plots above highlight these variations.
Evaluation metrics
Evaluation metrics such as the silhouette score and the within-cluster sum of squares (WCSS) were considered to assess clustering performance. Instances of incorrect clustering were acknowledged, prompting further investigation into the sources of variability. A WCSS check is sketched below.
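The within-cluster sum of squares is mentioned above but never computed in the notebook; here is a minimal sketch (assuming the 6-component Xt from earlier) using KMeans's inertia_ attribute, which is exactly the WCSS, plotted for an elbow check:

# sketch: within-cluster sum of squares (inertia) across cluster counts
wcss = []
for k in range(2, 11):
    km_k = KMeans(n_clusters=k, n_init="auto", random_state=10).fit(Xt)
    wcss.append(km_k.inertia_)
plt.plot(range(2, 11), wcss)
plt.title("WCSS (inertia) as a function of number of clusters")
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.show()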
Interpretation
Interpreting the clusters in the context of the original question showed how nutrient information can be associated with meat types. However, the variability in clustering raises questions about the reliability of grouping based on nutrient features alone.
Robustness and Sensitivity
The analysis recognized the sensitivity of K-means to initialization: with init='random', different runs can assign the same cuts to different clusters. One way to quantify this is sketched below.
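A minimal sketch (not part of the original analysis) of quantifying initialization sensitivity: run K-means with several random seeds and compare the label assignments with the adjusted Rand index, which is 1.0 when two runs agree perfectly regardless of label permutation:

from sklearn.metrics import adjusted_rand_score
# sketch: agreement between K-means runs with different random initializations
labels_per_seed = [KMeans(n_clusters=4, init='random', n_init=1, random_state=s).fit_predict(Xt) for s in range(5)]
for s in range(1, 5):
    print(f"ARI(seed 0 vs seed {s}) = {adjusted_rand_score(labels_per_seed[0], labels_per_seed[s]):.2f}")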
Limitations and Future Work
Acknowledging these limitations, future work could involve exploring alternative clustering algorithms, refining feature selection, or collecting additional data to enhance the robustness of the clustering analysis; one alternative is sketched below.
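As one example of an alternative algorithm (a sketch only; hierarchical clustering is not used in this project): agglomerative clustering is deterministic for a given linkage, so it sidesteps the initialization sensitivity entirely:

from sklearn.cluster import AgglomerativeClustering
# sketch: Ward-linkage hierarchical clustering as a deterministic alternative to K-means
agg = AgglomerativeClustering(n_clusters=4, linkage='ward')
agg_labels = agg.fit_predict(Xt)
print(pd.crosstab(wide_df['Meat_type_labels'], agg_labels))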
Conclusion
In conclusion, the project provided valuable insights into clustering meats based on nutrient information. While K-means demonstrated some success, the sensitivity to initialization highlighted the need for cautious interpretation. Further refinements and explorations into alternative methods could enhance the reliability of the clustering results.
amber.m.ahmed@utexas.edu
Copyright © Amber Ahmed. All rights reserved.