// Get recommendations: score every ingredient by letting the k nearest
// training recipes vote, then sort by score (highest first)
public Recommendation[] GetRecommendations(int k, DistanceChoice distance_choice, int[] recipe, Voting voting)
{
    DataManager dm = new DataManager();

    // get features (ingredient names)
    string[] ingrNames = dm.GetFeatures();

    // get training recipes
    Data[] data = dm.GetRecipes(ModelChoice.KNN, DataPurpose.TRAIN);

    // calculate all distances once; they depend only on the query recipe,
    // not on the ingredient currently being scored
    Neighbors[] distances = GetDistances(distance_choice, recipe, data, voting);

    Recommendation[] recommendations = new Recommendation[ingrNames.Length];

    // iterate through all ingredients
    for (int i = 0; i < ingrNames.Length; i++)
    {
        Ingredient current_ingr = new Ingredient(i, ingrNames[i]);

        double recommended = 0;
        double not_recommended = 0;

        // k nearest neighbors vote:
        // recommend the ingredient if the majority of neighbors contain it
        for (int top = 0; top < k; top++)
        {
            if (distances[top].recipe.Contains(i))
            {
                // neighbor contains the ingredient: vote to recommend
                if (voting.Equals(Voting.Unweighted)) { recommended++; }
                else { recommended += distances[top].distance; }
            }
            else
            {
                // neighbor lacks the ingredient: vote against
                if (voting.Equals(Voting.Unweighted)) { not_recommended++; }
                else { not_recommended += distances[top].distance; }
            }
        }

        // Laplace-smoothed score, so no ingredient ever scores exactly 0 or 1
        recommendations[i] = new Recommendation(current_ingr, (recommended + 1.0) / (not_recommended + 2.0));
    }

    // highest-scoring ingredients first
    return recommendations.OrderByDescending(r => r.score).ToArray();
}
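// A minimal usage sketch (assumptions: GetRecommendations lives on the KNN
// class whose GetDistances is called above, DistanceChoice.Jaccard and
// Voting.Weighted are valid enum members, and Recommendation exposes the
// score it was constructed with; the ingredient ids are hypothetical):
static void RecommendationExample()
{
    KNN knn = new KNN();

    // ingredient ids already in the user's recipe
    int[] query = { 3, 17, 42 };

    // score every ingredient using 5 nearest neighbors and weighted voting
    Recommendation[] recs = knn.GetRecommendations(5, DistanceChoice.Jaccard, query, Voting.Weighted);

    // print the ten best-scoring suggestions
    foreach (Recommendation r in recs.Take(10))
    {
        Console.WriteLine(r.score);
    }
}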
// Hamming Distance
// (number of ingredients that appear in exactly one of the two recipes)
static double HammingDistance(int[] a, int[] b, Voting voting)
{
    // size of the symmetric difference: |a| + |b| - 2 * |a intersect b|
    double distance = a.Length + b.Length - (a.Intersect(b).Count() * 2);

    // Weighted voting: inverse-square weight, so closer recipes vote harder
    if (voting.Equals(Voting.Weighted))
    {
        return 1 / (distance * distance);
    }
    // Unweighted voting: return the raw distance
    else
    {
        return distance;
    }
}
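// Worked example (hypothetical ingredient ids; a sketch, not part of the
// model itself): recipes {1,2,3} and {2,3,4} share two ingredients, so the
// distance is 3 + 3 - 2*2 = 2, and weighted voting turns that into a vote
// weight of 1 / (2*2) = 0.25.
static void HammingDistanceExample()
{
    int[] a = { 1, 2, 3 };
    int[] b = { 2, 3, 4 };
    Console.WriteLine(HammingDistance(a, b, Voting.Unweighted)); // 2
    Console.WriteLine(HammingDistance(a, b, Voting.Weighted));   // 0.25
}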
// Jaccard distance: 1 - |a intersect b| / |a union b|, so 0 means identical
// recipes and 1 means no shared ingredients; returning the raw similarity
// here would invert the neighbor ordering relative to the other distance
// functions, which sort nearest-first
static double JaccardSimilarity(int[] a, int[] b, Voting voting)
{
    double intersect = a.Intersect(b).Count();
    double union = a.Union(b).Count();

    // similarity lies in [0, 1]; subtract from 1 to get a distance
    double distance = 1.0 - (intersect / union);

    // Weighted voting: inverse-square weight, so closer recipes vote harder
    if (voting.Equals(Voting.Weighted))
    {
        return 1 / (distance * distance);
    }
    // Unweighted voting: return the raw distance
    else
    {
        return distance;
    }
}
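// Worked example (hypothetical ids; a sketch): recipes {1,2,3} and {2,3,4}
// have |intersection| = 2 and |union| = 4, so similarity = 0.5 and Jaccard
// distance = 1 - 0.5 = 0.5; weighted voting yields 1 / 0.5^2 = 4.
static void JaccardExample()
{
    int[] a = { 1, 2, 3 };
    int[] b = { 2, 3, 4 };
    Console.WriteLine(JaccardSimilarity(a, b, Voting.Unweighted)); // 0.5
    Console.WriteLine(JaccardSimilarity(a, b, Voting.Weighted));   // 4
}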
// Levenshtein distance: minimum number of single-element insertions,
// deletions, and substitutions needed to turn recipe a into recipe b
static double LevenshteinDistance(int[] a, int[] b, Voting voting)
{
    int[][] matrix = new int[a.Length + 1][];

    for (int i = 0; i < a.Length + 1; i++)
    {
        matrix[i] = new int[b.Length + 1];

        for (int j = 0; j < b.Length + 1; j++)
        {
            // base case: transforming a prefix into an empty prefix takes
            // as many edits as the longer prefix has elements
            if (i == 0 || j == 0)
            {
                matrix[i][j] = Math.Max(i, j);
            }
            else
            {
                int x = matrix[i - 1][j] + 1;   // delete a[i-1]
                int y = matrix[i][j - 1] + 1;   // insert b[j-1]
                int z = matrix[i - 1][j - 1];   // match or substitute
                if (a[i - 1] != b[j - 1])
                {
                    z += 1;
                }
                matrix[i][j] = Math.Min(Math.Min(x, y), z);
            }
        }
    }

    double distance = matrix[a.Length][b.Length];

    // Weighted voting: inverse-square weight, so closer recipes vote harder
    if (voting.Equals(Voting.Weighted))
    {
        return 1 / (distance * distance);
    }
    // Unweighted voting: return the raw distance
    else
    {
        return distance;
    }
}
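// Worked example (hypothetical ids; a sketch): {1,2,3} becomes {2,3,4} with
// one deletion (drop the 1) and one insertion (append the 4), so the distance
// is 2. Unlike the Hamming and Jaccard functions above, Levenshtein is
// order-sensitive, so the ingredient ordering inside a recipe affects the result.
static void LevenshteinExample()
{
    Console.WriteLine(LevenshteinDistance(new[] { 1, 2, 3 }, new[] { 2, 3, 4 }, Voting.Unweighted)); // 2
}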
// Update results (TP, TN, FP, FN) for KNN:
// an ingredient is recommended if the majority of neighbors contain it
// (ties count as a recommendation, since the test is recommended >= not_recommended)
static Results GetKNNResults(int k, Data[] test_data, string[] ingrNames, DistanceChoice distance_choice, Data[] train_data, Voting voting)
{
    DataManager dm = new DataManager();
    KNN knn = new KNN();

    // keep track of results
    Results results = new Results(0);

    // group test data by recipeId
    IGrouping<int, Data>[] recipes = test_data.GroupBy(d => d.recipeId).ToArray();

    // iterate through all test recipes
    foreach (IGrouping<int, Data> recipe in recipes)
    {
        // current test recipe
        int[] current_recipe = dm.GetRecipe(recipe.ToArray());

        // calculate all distances, sort neighbors by distance to current recipe
        Neighbors[] distances = knn.GetDistances(distance_choice, current_recipe, train_data, voting);

        // iterate through all features (unique ingredients)
        for (int i = 0; i < ingrNames.Length; i++)
        {
            // keep track of votes from neighboring recipes
            double recommended = 0;
            double not_recommended = 0;

            // k nearest neighbors vote
            for (int top = 0; top < k; top++)
            {
                if (distances[top].recipe.Contains(i))
                {
                    // neighbor contains the ingredient: vote to recommend
                    if (voting.Equals(Voting.Unweighted)) { recommended++; }
                    else { recommended += distances[top].distance; }
                }
                else
                {
                    // neighbor lacks the ingredient: vote against
                    if (voting.Equals(Voting.Unweighted)) { not_recommended++; }
                    else { not_recommended += distances[top].distance; }
                }
            }

            // compare the prediction against whether the test recipe
            // actually contains the ingredient
            results = UpdateResults(results, current_recipe.Contains(i), recommended >= not_recommended);
        }
    }

    return results;
}
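// UpdateResults and Results.getF1 are used above but defined elsewhere. A
// minimal sketch of their likely shape, assuming Results simply accumulates
// the four confusion-matrix counts and getF1 computes the standard
// F1 = 2*TP / (2*TP + FP + FN) (the TP/TN/FP/FN field names are assumptions,
// not the project's actual code):
//
// static Results UpdateResults(Results results, bool actual, bool predicted)
// {
//     if (actual && predicted) { results.TP++; }        // true positive
//     else if (actual && !predicted) { results.FN++; }  // false negative
//     else if (!actual && predicted) { results.FP++; }  // false positive
//     else { results.TN++; }                            // true negative
//     return results;
// }
//
// public double getF1()
// {
//     return (2.0 * TP) / (2.0 * TP + FP + FN);
// }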
// find the optimal k via cross validation, scoring each candidate k by its
// average F1 across folds
public void GetOptimalK(DistanceChoice distance_choice, Voting voting, int max_k)
{
    Console.WriteLine("Determining optimal k for " + distance_choice.ToString() + " distance");
    Console.WriteLine(DateTime.Now.ToLongTimeString());

    KNN knn = new KNN();
    MLContext ml = new MLContext();
    DataManager dm = new DataManager();

    // get training data
    IDataView train_dataView = dm.GetDataView(ModelChoice.KNN, ml, DataPurpose.TRAIN);

    // get features (unique ingredient names)
    IDataView features = dm.GetDataView(ModelChoice.KNN, ml, DataPurpose.FEATURES);
    string[] ingrNames = features.GetColumn<string>(features.Schema["ingrName"]).ToArray();

    // set number of folds to 5
    int num_folds = 5;
    Console.WriteLine(num_folds + "-fold cross validation...");

    // cross validation split, keeping all rows of a recipe in the same fold
    var folds = ml.Data.CrossValidationSplit(train_dataView, num_folds, samplingKeyColumnName: "recipeId");

    // keep track of the average f1 score for each value of k
    double[] f1s = new double[max_k];

    // try different values of k
    for (int k = 1; k <= max_k; k++)
    {
        // show progress
        Console.WriteLine("\nk = " + k + "\t" + DateTime.Now.ToLongTimeString());

        // keep track of fold results (TP, TN, FP, FN, used for the f1 score)
        Results[] fold_results = new Results[num_folds];

        // iterate through each fold
        for (int fold = 0; fold < num_folds; fold++)
        {
            // start the fold with empty counts
            fold_results[fold] = new Results(0);

            // get training and validation data for the current fold
            Data[] train_data = dm.GetData(folds[fold].TrainSet, features);
            Data[] validation_data = dm.GetData(folds[fold].TestSet, features);

            // group validation recipes by recipeId
            IGrouping<int, Data>[] recipes = validation_data.GroupBy(d => d.recipeId).ToArray();

            // iterate through validation recipes for the current fold
            foreach (IGrouping<int, Data> current in recipes)
            {
                // current recipe
                int[] recipe = dm.GetRecipe(current.ToArray());

                // calculate distances between the validation recipe and all
                // training recipes, and get sorted neighbors
                Neighbors[] distances = knn.GetDistances(distance_choice, recipe, train_data, voting);

                // iterate through all features (unique ingredients)
                for (int i = 0; i < ingrNames.Length; i++)
                {
                    // keep track of votes
                    double recommended = 0;
                    double not_recommended = 0;

                    // k nearest neighbors vote
                    for (int top = 0; top < k; top++)
                    {
                        if (distances[top].recipe.Contains(i))
                        {
                            // recommend ingredient
                            if (voting.Equals(Voting.Unweighted)) { recommended++; }
                            else { recommended += distances[top].distance; }
                        }
                        else
                        {
                            // do not recommend ingredient
                            if (voting.Equals(Voting.Unweighted)) { not_recommended++; }
                            else { not_recommended += distances[top].distance; }
                        }
                    }

                    // update results for the current fold
                    fold_results[fold] = UpdateResults(fold_results[fold], recipe.Contains(i), recommended >= not_recommended);
                }
            }
        }

        f1s[k - 1] = fold_results.Average(a => a.getF1());
        Console.WriteLine("Average f1: " + f1s[k - 1]);
    }

    // display the optimal k
    Console.WriteLine("\nOPTIMAL k = " + (Array.IndexOf(f1s, f1s.Max()) + 1) + " with an f1 score of " + f1s.Max());
}
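// A minimal usage sketch (the enclosing class is not shown in this section,
// so "ModelEvaluator" is a hypothetical name, and DistanceChoice.Jaccard /
// Voting.Weighted are assumed enum members): search k = 1..20 with weighted
// Jaccard voting and report the k with the best average F1.
//
// ModelEvaluator evaluator = new ModelEvaluator();
// evaluator.GetOptimalK(DistanceChoice.Jaccard, Voting.Weighted, 20);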