/// <summary>
/// The user-based KNN collaborative filtering described in the paper:
/// Resnick, P., et al., "GroupLens: an open architecture for collaborative filtering of netnews", 1994.
/// Link: http://dx.doi.org/10.1145/192844.192905
/// </summary>
/// <param name="R_train"></param>
/// <param name="R_unknown"></param>
/// <param name="neighborsByUser"></param>
/// <param name="K"></param>
/// <returns></returns>
public static DataMatrix PredictRatings(DataMatrix R_train, DataMatrix R_unknown,
    SimilarityData neighborsByUser, int K)
{
    // Debug
    Debug.Assert(R_train.UserCount == R_unknown.UserCount);
    Debug.Assert(R_train.ItemCount == R_unknown.ItemCount);
    int cappedCount = 0, globalMeanCount = 0;

    // This matrix stores predictions
    DataMatrix R_predicted = new DataMatrix(R_unknown.UserCount, R_unknown.ItemCount);

    // Basic statistics from the train set
    double globalMean = R_train.GetGlobalMean();
    Vector<double> meanByUser = R_train.GetUserMeans();
    Vector<double> meanByItem = R_train.GetItemMeans();

    // Predict ratings for each test user.
    // Single thread appears to be very fast; Parallel.ForEach may be unnecessary.
    Object lockMe = new Object();
    Parallel.ForEach(R_unknown.Users, user =>
    {
        int indexOfUser = user.Item1;
        RatingVector userRatings = new RatingVector(R_train.GetRow(indexOfUser));
        RatingVector unknownRatings = new RatingVector(user.Item2);

        Utils.PrintEpoch("Predicting user/total", indexOfUser, R_train.UserCount);

        // Note that there are more than K neighbors in the list (sorted by similarity);
        // we will use the top-K neighbors WHO HAVE RATED THE ITEM.
        // For example, we may keep 200 top neighbors and hope that at least K of them
        // have rated the item. We can't keep everyone in the neighbor list because
        // there are too many for large data sets.
        var topNeighborsOfUser = neighborsByUser[indexOfUser];
        //Dictionary<int, double> topKNeighbors = KNNCore.GetTopKNeighborsByUser(userSimilarities, indexOfUser, K);

        double meanOfUser = meanByUser[indexOfUser];

        // Loop through each rating to be predicted
        foreach (Tuple<int, double> unknownRating in unknownRatings.Ratings)
        {
            int itemIndex = unknownRating.Item1;
            double prediction;

            // TODO: we actually should use the top-K neighbors that have rated
            // this item; otherwise we may have only a few neighbors who rated it.

            // Compute the average rating on this item given by the top-K neighbors.
            // Each rating is offset by the neighbor's average and weighted by the similarity.
            double weightedSum = 0;
            double weightSum = 0;
            int currentTopKCount = 0;
            foreach (KeyValuePair<int, double> neighbor in topNeighborsOfUser)
            {
                int neighborIndex = neighbor.Key;
                double similarityOfNeighbor = neighbor.Value;
                double itemRatingOfNeighbor = R_train[neighborIndex, itemIndex];

                // We count only if the neighbor has seen this item before
                if (itemRatingOfNeighbor != 0)
                {
                    weightSum += similarityOfNeighbor;
                    weightedSum += (itemRatingOfNeighbor - meanByUser[neighborIndex]) * similarityOfNeighbor;
                    currentTopKCount++;
                    if (currentTopKCount >= K) { break; }   // Stop when we have seen K neighbors
                }
            }

            // A zero weightedSum means this is a cold item and the global mean is assigned by default
            if (weightedSum != 0)
            {
                prediction = meanOfUser + weightedSum / weightSum;
            }
            else
            {
                prediction = globalMean;
                globalMeanCount++;
            }

            // Cap the ratings
            if (prediction > Config.Ratings.MaxRating)
            {
                cappedCount++;
                prediction = Config.Ratings.MaxRating;
            }
            if (prediction < Config.Ratings.MinRating)
            {
                cappedCount++;
                prediction = Config.Ratings.MinRating;
            }

            lock (lockMe)
            {
                R_predicted[indexOfUser, itemIndex] = prediction;
            }
        }
    });
    Utils.PrintValue("# capped predictions", cappedCount.ToString("D"));
    Utils.PrintValue("# default predictions", globalMeanCount.ToString("D"));
    return R_predicted;
}
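// A minimal standalone sketch (not part of the original code) of the prediction
// rule the method above applies to each unknown cell, isolated for one user/item
// pair. All names here are hypothetical; neighbor similarities, neighbor ratings
// and mean ratings are passed in directly instead of being read from DataMatrix.
//
//   prediction(u, i) = mean(u) + sum_v sim(u, v) * (r(v, i) - mean(v)) / sum_v sim(u, v)
//
// where v ranges over the (at most K) most similar neighbors who rated the item.
private static double PredictOneRatingSketch(
    double meanOfUser,                                            // mean rating of the target user
    double globalMean,                                            // fallback for cold items
    IEnumerable<KeyValuePair<int, double>> neighborsBySimilarity, // neighbor index -> similarity, sorted descending
    Func<int, double> ratingOfNeighborOnItem,                     // 0 means the neighbor has not rated the item
    Func<int, double> meanOfNeighbor,                             // mean rating of a neighbor
    int K)
{
    double weightedSum = 0, weightSum = 0;
    int used = 0;
    foreach (KeyValuePair<int, double> neighbor in neighborsBySimilarity)
    {
        double rating = ratingOfNeighborOnItem(neighbor.Key);
        if (rating == 0) { continue; }            // skip neighbors who did not rate the item
        weightSum += neighbor.Value;
        weightedSum += (rating - meanOfNeighbor(neighbor.Key)) * neighbor.Value;
        if (++used >= K) { break; }               // use at most K raters
    }
    // Mean-offset weighted average; fall back to the global mean for cold items
    return weightedSum != 0 ? meanOfUser + weightedSum / weightSum : globalMean;
}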
public static DataMatrix PredictRatings(PrefRelations PR_train, DataMatrix R_unknown,
    int K, SimilarityData neighborsByUser)
{
    Debug.Assert(PR_train.UserCount == R_unknown.UserCount);
    Debug.Assert(PR_train.ItemCount == R_unknown.ItemCount);

    // This matrix stores predictions
    DataMatrix R_predicted = new DataMatrix(R_unknown.UserCount, R_unknown.ItemCount);

    // This can be considered as the R_train in standard UserKNN
    SparseMatrix positionMatrix = PR_train.GetPositionMatrix();
    DataMatrix ratingMatrixFromPositions = new DataMatrix(positionMatrix);

    Vector<double> meanByUser = ratingMatrixFromPositions.GetUserMeans();
    Vector<double> meanByItem = ratingMatrixFromPositions.GetItemMeans();
    double globalMean = ratingMatrixFromPositions.GetGlobalMean();

    // Predict positions for each test user.
    // Appears to be very fast; Parallel.ForEach is unnecessary.
    foreach (Tuple<int, Vector<double>> user in R_unknown.Users)
    {
        int indexOfUser = user.Item1;
        Vector<double> indexesOfUnknownRatings = user.Item2;

        Utils.PrintEpoch("Predicting user/total", indexOfUser, PR_train.UserCount);

        // Note that there are more than K neighbors in the list (sorted by similarity);
        // we will use the top-K neighbors WHO HAVE RATED THE ITEM.
        // For example, we may keep 200 top neighbors and hope that at least K of them
        // have rated the item. We can't keep everyone in the neighbor list because
        // there are too many for large data sets.
        var topNeighborsOfUser = neighborsByUser[indexOfUser];

        double meanOfUser = meanByUser[indexOfUser];

        // Loop through each position to be predicted
        foreach (Tuple<int, double> unknownRating in indexesOfUnknownRatings.EnumerateIndexed(Zeros.AllowSkip))
        {
            int indexOfUnknownItem = unknownRating.Item1;

            // Compute the position of this item for the user
            // by combining neighbors' positions on this item
            double weightedSum = 0;
            double weightSum = 0;
            int currentTopKCount = 0;
            foreach (KeyValuePair<int, double> neighbor in topNeighborsOfUser)
            {
                int indexOfNeighbor = neighbor.Key;
                double similarityOfNeighbor = neighbor.Value;
                double itemPositionOfNeighbor = ratingMatrixFromPositions[indexOfNeighbor, indexOfUnknownItem];

                // We count only if the neighbor has seen this item before
                if (itemPositionOfNeighbor != 0)
                {
                    // Recall that we use a constant to hold the position value 0;
                    // we revert it back here
                    if (itemPositionOfNeighbor == Config.ZeroInSparseMatrix)
                    {
                        Debug.Assert(false, "By using the PositionShift constant, we should not be in here.");
                        itemPositionOfNeighbor = 0;
                    }
                    weightSum += similarityOfNeighbor;
                    weightedSum += (itemPositionOfNeighbor - meanByUser[indexOfNeighbor]) * similarityOfNeighbor;
                    currentTopKCount++;
                    if (currentTopKCount >= K) { break; }   // Stop when we have seen K neighbors
                }
            }

            // If any neighbor has seen this item
            if (currentTopKCount != 0)
            {
                // TODO: Adding the user mean may improve the performance
                R_predicted[indexOfUser, indexOfUnknownItem] = meanOfUser + weightedSum / weightSum;
            }
            else
            {
                R_predicted[indexOfUser, indexOfUnknownItem] = globalMean;
            }
        }
    }
    return R_predicted;
}
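// Note on the preference-relation variant above: it reuses the same mean-offset,
// similarity-weighted KNN rule as the rating-based method, but the "ratings" are
// item positions derived from pairwise preference relations via
// PR_train.GetPositionMatrix(). The returned matrix therefore holds predicted
// positions rather than predicted ratings, and cold items fall back to the
// global mean position.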
public static DataMatrix PredictRatings(DataMatrix R_train, DataMatrix R_unknown,
    int maxEpoch, double learnRate, double regularization, int factorCount)
{
    int userCount = R_train.UserCount;
    int itemCount = R_train.ItemCount;
    int ratingCount = R_train.NonZerosCount;
    double meanOfGlobal = R_train.GetGlobalMean();
    DataMatrix R_train_unknown = R_train.IndexesOfNonZeroElements(); // For testing convergence

    // User latent vectors with the default seed
    Matrix<double> P = Utils.CreateRandomMatrixFromNormal(userCount, factorCount, 0, 0.1, Config.Seed);
    // Matrix<double> P = Utils.CreateRandomMatrixFromUniform(userCount, factorCount, 0, 0.1, Config.Seed);
    // Item latent vectors with a different seed
    Matrix<double> Q = Utils.CreateRandomMatrixFromNormal(factorCount, itemCount, 0, 0.1, Config.Seed + 1);
    //Matrix<double> Q = Utils.CreateRandomMatrixFromUniform(factorCount, itemCount, 0, 0.1, Config.Seed + 1);

    // SGD
    double e_prev = double.MaxValue;
    for (int epoch = 0; epoch < maxEpoch; ++epoch)
    {
        foreach (Tuple<int, int, double> element in R_train.Ratings)
        {
            int indexOfUser = element.Item1;
            int indexOfItem = element.Item2;
            double rating = element.Item3;

            double e_ij = rating - (meanOfGlobal + P.Row(indexOfUser).DotProduct(Q.Column(indexOfItem)));

            // Update feature vectors
            Vector<double> P_u = P.Row(indexOfUser);
            Vector<double> Q_i = Q.Column(indexOfItem);

            Vector<double> P_u_updated = P_u + (Q_i.Multiply(e_ij) - P_u.Multiply(regularization)).Multiply(learnRate);
            P.SetRow(indexOfUser, P_u_updated);

            Vector<double> Q_i_updated = Q_i + (P_u.Multiply(e_ij) - Q_i.Multiply(regularization)).Multiply(learnRate);
            Q.SetColumn(indexOfItem, Q_i_updated);

            #region Update feature vectors, loop version
            /*
            // Update feature vectors
            for (int k = 0; k < factorCount; ++k)
            {
                double factorOfUser = P[indexOfUser, k];
                double factorOfItem = Q[k, indexOfItem];
                P[indexOfUser, k] += learnRate * (e_ij * factorOfItem - regularization * factorOfUser);
                Q[k, indexOfItem] += learnRate * (e_ij * factorOfUser - regularization * factorOfItem);
            }
            */
            #endregion
        }

        // Display the current regularized error to see if it converges
        double e_curr = 0;
        if (epoch == 0 || epoch == maxEpoch - 1 || epoch % (int)Math.Ceiling(maxEpoch * 0.1) == 4)
        {
            Matrix<double> predictedMatrix = R_train_unknown.PointwiseMultiply(P.Multiply(Q));
            SparseMatrix correctMatrix = R_train.Matrix;
            double squaredError = (correctMatrix - predictedMatrix).SquaredSum();
            double regularizationPenalty = regularization * (P.SquaredSum() + Q.SquaredSum());
            double objective = squaredError + regularizationPenalty;

            #region Linear implementation
            /*
            double e = 0;
            foreach (Tuple<int, int, double> element in R_train.Ratings)
            {
                int indexOfUser = element.Item1;
                int indexOfItem = element.Item2;
                double rating = element.Item3;
                e += Math.Pow(rating - P.Row(indexOfUser).DotProduct(Q.Column(indexOfItem)), 2);
                for (int k = 0; k < factorCount; ++k)
                {
                    e += (regularization / 2) * (Math.Pow(P[indexOfUser, k], 2) + Math.Pow(Q[k, indexOfItem], 2));
                }
            }
            */
            #endregion

            // Record the current error
            e_curr = objective;

            // Stop the learning if the regularized error improves by less than a threshold
            if (e_prev - e_curr < 0.001)
            {
                Console.WriteLine("Improvement less than 0.001, learning stopped.");
                break;
            }
            e_prev = e_curr;

            Utils.PrintEpoch("Epoch", epoch, maxEpoch, "Objective cost", objective);
        }
    }

    SparseMatrix R_predicted = new SparseMatrix(R_unknown.UserCount, R_unknown.ItemCount);
    foreach (var element in R_unknown.Matrix.EnumerateIndexed(Zeros.AllowSkip))
    {
        int indexOfUser = element.Item1;
        int indexOfItem = element.Item2;
        double r_predicted = meanOfGlobal + P.Row(indexOfUser) * Q.Column(indexOfItem);

        // Cap the ratings
        if (r_predicted > Config.Ratings.MaxRating) r_predicted = Config.Ratings.MaxRating;
        if (r_predicted < Config.Ratings.MinRating) r_predicted = Config.Ratings.MinRating;

        R_predicted[indexOfUser, indexOfItem] = r_predicted;
    }
    return new DataMatrix(R_predicted);
    //return new RatingMatrix(R_unknown.PointwiseMultiply(P.Multiply(Q)));
}
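// Note on the matrix-factorization method above: the training loop is plain
// stochastic gradient descent on a regularized squared-error objective with the
// global mean as a fixed bias term. For each observed rating r_ui, with
// e_ui = r_ui - (meanOfGlobal + P_u . Q_i), the vectorized updates above are
// equivalent to:
//
//   P_u <- P_u + learnRate * (e_ui * Q_i - regularization * P_u)
//   Q_i <- Q_i + learnRate * (e_ui * P_u - regularization * Q_i)
//
// Predictions for unknown cells are then meanOfGlobal + P_u . Q_i, clamped to
// the configured rating range.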