Example #1
0
        /// <summary>
        /// The user-based KNN collaborative filtering described in paper: 
        /// Resnick, P., et al., "GroupLens: an open architecture for collaborative filtering of netnews", 1994.
        /// Link: http://dx.doi.org/10.1145/192844.192905
        /// </summary>
        /// <param name="R_train"></param>
        /// <param name="R_unknown"></param>
        /// <param name="K"></param>
        /// <returns></returns>
        public static DataMatrix PredictRatings(DataMatrix R_train, DataMatrix R_unknown, SimilarityData neighborsByUser, int K)
        {
            // Debug
            Debug.Assert(R_train.UserCount == R_unknown.UserCount);
            Debug.Assert(R_train.ItemCount == R_unknown.ItemCount);
            int cappedCount = 0, globalMeanCount = 0;

            // This matrix stores predictions
            DataMatrix R_predicted = new DataMatrix(R_unknown.UserCount, R_unknown.ItemCount);

            // Basic statistics from train set
            double globalMean = R_train.GetGlobalMean();
            Vector<double> meanByUser = R_train.GetUserMeans();
            Vector<double> meanByItem = R_train.GetItemMeans();

            // Predict ratings for each test user
            // Single thread appears to be very fast, parallel.foreach is unnecessary
            Object lockMe = new Object();
            Parallel.ForEach(R_unknown.Users, user =>
            {
                int indexOfUser = user.Item1;
                RatingVector userRatings = new RatingVector(R_train.GetRow(indexOfUser));
                RatingVector unknownRatings = new RatingVector(user.Item2);

                Utils.PrintEpoch("Predicting user/total", indexOfUser, R_train.UserCount);

                // Note that there are more than K neighbors in the list (sorted by similarity)
                // we will use the top-K neighbors WHO HAVE RATED THE ITEM
                // For example we have 200 top neighbors, and we hope there are
                // K neighbors in the list have rated the item. We can't keep
                // everyone in the neighbor list because there are too many for large data sets
                var topNeighborsOfUser = neighborsByUser[indexOfUser];
                //Dictionary<int, double> topKNeighbors = KNNCore.GetTopKNeighborsByUser(userSimilarities, indexOfUser, K);

                double meanOfUser = meanByUser[indexOfUser];

                // Loop through each ratingto be predicted
                foreach (Tuple<int, double> unknownRating in unknownRatings.Ratings)
                {
                    int itemIndex = unknownRating.Item1;
                    double prediction;

                    // TODO: we actually should use the Top-K neighbors
                    // that have rated this item, otherwise we may have
                    // only a few neighbors rated this item

                    // Compute the average rating on item iid given 
                    // by the top K neighbors. Each rating is offsetted by
                    // the neighbor's average and weighted by the similarity
                    double weightedSum = 0;
                    double weightSum = 0;
                    int currentTopKCount = 0;
                    foreach (KeyValuePair<int, double> neighbor in topNeighborsOfUser)
                    {
                        int neighborIndex = neighbor.Key;
                        double similarityOfNeighbor = neighbor.Value;
                        double itemRatingOfNeighbor = R_train[neighborIndex, itemIndex];

                        // We count only if the neighbor has seen this item before
                        if (itemRatingOfNeighbor != 0)
                        {
                            weightSum += similarityOfNeighbor;
                            weightedSum += (itemRatingOfNeighbor - meanByUser[neighborIndex]) * similarityOfNeighbor;
                            currentTopKCount++;
                            if (currentTopKCount >= K) { break; }   // Stop when we have seen K neighbors
                        }
                    }
                    // A zero weightedSum means this is a cold item and global mean will be assigned by default
                    if (weightedSum != 0)
                    {
                        prediction = meanOfUser + weightedSum / weightSum;
                    }
                    else
                    {
                        prediction = globalMean;
                        globalMeanCount++;
                    }

                    // Cap the ratings
                    if (prediction > Config.Ratings.MaxRating)
                    {
                        cappedCount++;
                        prediction = Config.Ratings.MaxRating;
                    }
                    if (prediction < Config.Ratings.MinRating)
                    {
                        cappedCount++;
                        prediction = Config.Ratings.MinRating;
                    }

                    lock (lockMe)
                    {
                        R_predicted[indexOfUser, itemIndex] = prediction;
                    }
                }
            });
            Utils.PrintValue("# capped predictions", cappedCount.ToString("D"));
            Utils.PrintValue("# default predictions", globalMeanCount.ToString("D"));
            return R_predicted;
        }
Example #2
0
        public static DataMatrix PredictRatings(PrefRelations PR_train,
            DataMatrix R_unknown, int K, SimilarityData neighborsByUser)
        {
            Debug.Assert(PR_train.UserCount == R_unknown.UserCount);
            Debug.Assert(PR_train.ItemCount == R_unknown.ItemCount);

            // This matrix stores predictions
            DataMatrix R_predicted = new DataMatrix(R_unknown.UserCount, R_unknown.ItemCount);

            // This can be considered as the R_train in standard UserKNN
            SparseMatrix positionMatrix = PR_train.GetPositionMatrix();
            DataMatrix ratingMatrixFromPositions = new DataMatrix(positionMatrix);

            Vector<double> meanByUser = ratingMatrixFromPositions.GetUserMeans();
            Vector<double> meanByItem = ratingMatrixFromPositions.GetItemMeans();
            double globalMean = ratingMatrixFromPositions.GetGlobalMean();

            // Predict positions for each test user
            // Appears to be very fast, parallel.foreach is unnecessary
            foreach (Tuple<int, Vector<double>> user in R_unknown.Users)
            {
                int indexOfUser = user.Item1;
                Vector<double> indexesOfUnknownRatings = user.Item2;

                Utils.PrintEpoch("Predicting user/total", indexOfUser, PR_train.UserCount);

                // Note that there are more than K neighbors in the list (sorted by similarity)
                // we will use the top-K neighbors WHO HAVE RATED THE ITEM
                // For example we have 200 top neighbors, and we hope there are
                // K neighbors in the list have rated the item. We can't keep
                // everyone in the neighbor list because there are too many for large data sets
                var topNeighborsOfUser = neighborsByUser[indexOfUser];

                double meanOfUser = meanByUser[indexOfUser];

                // Loop through each position to be predicted
                foreach (Tuple<int, double> unknownRating in indexesOfUnknownRatings.EnumerateIndexed(Zeros.AllowSkip))
                {
                    int indexOfUnknownItem = unknownRating.Item1;

                    // Compute the position of this item for the user
                    // by combining neighbors' positions on this item
                    double weightedSum = 0;
                    double weightSum = 0;
                    int currentTopKCount = 0;
                    foreach (KeyValuePair<int, double> neighbor in topNeighborsOfUser)
                    {
                        int indexOfNeighbor = neighbor.Key;
                        double similarityOfNeighbor = neighbor.Value;
                        double itemPositionOfNeighbor = ratingMatrixFromPositions[indexOfNeighbor, indexOfUnknownItem];

                        // We count only if the neighbor has seen this item before
                        if (itemPositionOfNeighbor != 0)
                        {
                            // Recall that we use a constant to hold position value 0
                            // we revert it back here
                            if (itemPositionOfNeighbor == Config.ZeroInSparseMatrix)
                            {
                                Debug.Assert(true, "By using the PositionShift constant, we should not be in here.");
                                itemPositionOfNeighbor = 0;
                            }
                            weightSum += similarityOfNeighbor;
                            weightedSum += (itemPositionOfNeighbor - meanByUser[indexOfNeighbor]) * similarityOfNeighbor;
                            currentTopKCount++;
                            if(currentTopKCount>= K)
                            {
                                break;
                            }
                        }
                    }

                    // If any neighbor has seen this item
                    if (currentTopKCount != 0)
                    {
                        // TODO: Add user mean may improve the performance
                        R_predicted[indexOfUser, indexOfUnknownItem] = meanOfUser + weightedSum / weightSum;
                    }
                    else
                    {
                        R_predicted[indexOfUser, indexOfUnknownItem] = globalMean;
                    }
                }
            }//);
            return R_predicted;
        }
Example #3
0
        public static DataMatrix PredictRatings(DataMatrix R_train, DataMatrix R_unknown,
            int maxEpoch, double learnRate, double regularization, int factorCount)
        {
            int userCount = R_train.UserCount;
            int itemCount = R_train.ItemCount;
            int ratingCount = R_train.NonZerosCount;
            double meanOfGlobal = R_train.GetGlobalMean();
            DataMatrix R_train_unknown = R_train.IndexesOfNonZeroElements();  // For testing convergence

            // User latent vectors with default seed
            Matrix<double> P = Utils.CreateRandomMatrixFromNormal(userCount, factorCount, 0, 0.1, Config.Seed);
           // Matrix<double> P = Utils.CreateRandomMatrixFromUniform(userCount, factorCount, 0, 0.1, Config.Seed);
            // Item latent vectors with a different seed
            Matrix<double> Q = Utils.CreateRandomMatrixFromNormal(factorCount, itemCount, 0, 0.1, Config.Seed + 1);
            //Matrix<double> Q = Utils.CreateRandomMatrixFromUniform(factorCount, itemCount, 0, 0.1, Config.Seed + 1);

            // SGD
            double e_prev = double.MaxValue;
            for (int epoch = 0; epoch < maxEpoch; ++epoch)
            {
                foreach (Tuple<int, int, double> element in R_train.Ratings)
                {
                    int indexOfUser = element.Item1;
                    int indexOfItem = element.Item2;
                    double rating = element.Item3;

                    double e_ij = rating - (meanOfGlobal + P.Row(indexOfUser).DotProduct(Q.Column(indexOfItem)));
                    
                    // Update feature vectors
                    Vector<double> P_u = P.Row(indexOfUser);
                    Vector<double> Q_i = Q.Column(indexOfItem);

                    Vector<double> P_u_updated = P_u + (Q_i.Multiply(e_ij) - P_u.Multiply(regularization)).Multiply(learnRate);
                    P.SetRow(indexOfUser, P_u_updated);

                    Vector<double> Q_i_updated = Q_i + (P_u.Multiply(e_ij) - Q_i.Multiply(regularization)).Multiply(learnRate);
                    Q.SetColumn(indexOfItem, Q_i_updated);

                    #region Update feature vectors loop version
                    /*
                    // Update feature vectors
                    for (int k = 0; k < factorCount; ++k)
                    {
                        double factorOfUser = P[indexOfUser, k];
                        double factorOfItem = Q[k, indexOfItem];

                        P[indexOfUser, k] += learnRate * (e_ij * factorOfItem - regularization * factorOfUser);
                        Q[k, indexOfItem] += learnRate * (e_ij * factorOfUser - regularization * factorOfItem);
                    }
                    */
                    #endregion
                }

                // Display the current regularized error see if it converges

                double e_curr = 0;
                if (epoch == 0 || epoch == maxEpoch - 1 || epoch % (int)Math.Ceiling(maxEpoch * 0.1) == 4)
                {
                    Matrix<double> predictedMatrix = R_train_unknown.PointwiseMultiply(P.Multiply(Q));
                    SparseMatrix correctMatrix = R_train.Matrix;
                    double squaredError = (correctMatrix - predictedMatrix).SquaredSum();
                    double regularizationPenaty = regularization * (P.SquaredSum() + Q.SquaredSum());
                    double objective = squaredError + regularizationPenaty;

                    #region Linear implementation
                    /*
                    double e = 0;
                    foreach (Tuple<int, int, double> element in R_train.Ratings)
                    {
                        int indexOfUser = element.Item1;
                        int indexOfItem = element.Item2;
                        double rating = element.Item3;

                        e += Math.Pow(rating - P.Row(indexOfUser).DotProduct(Q.Column(indexOfItem)), 2);

                        for (int k = 0; k < factorCount; ++k)
                        {
                            e += (regularization / 2) * (Math.Pow(P[indexOfUser, k], 2) + Math.Pow(Q[k, indexOfItem], 2));
                        }
                    }
                    */
                    #endregion

                    // Record the current error
                    e_curr = objective;

                    // Stop the learning if the regularized error falls below a certain threshold
                    if (e_prev - e_curr < 0.001)
                    {
                        Console.WriteLine("Improvment less than 0.001, learning stopped.");
                        break;
                    }
                    e_prev = e_curr;

                    Utils.PrintEpoch("Epoch", epoch, maxEpoch, "Objective cost", objective);
                }

            }

            SparseMatrix R_predicted = new SparseMatrix(R_unknown.UserCount, R_unknown.ItemCount);
            foreach(var element in R_unknown.Matrix.EnumerateIndexed(Zeros.AllowSkip))
            {
                int indexOfUser = element.Item1;
                int indexOfItem = element.Item2;
                double r_predicted = meanOfGlobal + P.Row(indexOfUser) * Q.Column(indexOfItem);

                if (r_predicted > Config.Ratings.MaxRating) r_predicted = Config.Ratings.MaxRating;
                if (r_predicted < Config.Ratings.MinRating) r_predicted = Config.Ratings.MinRating;

                R_predicted[indexOfUser, indexOfItem] = r_predicted;
            }
            return new DataMatrix(R_predicted);
            //return new RatingMatrix(R_unknown.PointwiseMultiply(P.Multiply(Q)));
        }
Example #4
0
        /// <summary>
        /// The user-based KNN collaborative filtering described in paper:
        /// Resnick, P., et al., "GroupLens: an open architecture for collaborative filtering of netnews", 1994.
        /// Link: http://dx.doi.org/10.1145/192844.192905
        /// </summary>
        /// <param name="R_train"></param>
        /// <param name="R_unknown"></param>
        /// <param name="K"></param>
        /// <returns></returns>
        public static DataMatrix PredictRatings(DataMatrix R_train, DataMatrix R_unknown, SimilarityData neighborsByUser, int K)
        {
            // Debug
            Debug.Assert(R_train.UserCount == R_unknown.UserCount);
            Debug.Assert(R_train.ItemCount == R_unknown.ItemCount);
            int cappedCount = 0, globalMeanCount = 0;

            // This matrix stores predictions
            DataMatrix R_predicted = new DataMatrix(R_unknown.UserCount, R_unknown.ItemCount);

            // Basic statistics from train set
            double          globalMean = R_train.GetGlobalMean();
            Vector <double> meanByUser = R_train.GetUserMeans();
            Vector <double> meanByItem = R_train.GetItemMeans();

            // Predict ratings for each test user
            // Single thread appears to be very fast, parallel.foreach is unnecessary
            Object lockMe = new Object();

            Parallel.ForEach(R_unknown.Users, user =>
            {
                int indexOfUser             = user.Item1;
                RatingVector userRatings    = new RatingVector(R_train.GetRow(indexOfUser));
                RatingVector unknownRatings = new RatingVector(user.Item2);

                Utils.PrintEpoch("Predicting user/total", indexOfUser, R_train.UserCount);

                // Note that there are more than K neighbors in the list (sorted by similarity)
                // we will use the top-K neighbors WHO HAVE RATED THE ITEM
                // For example we have 200 top neighbors, and we hope there are
                // K neighbors in the list have rated the item. We can't keep
                // everyone in the neighbor list because there are too many for large data sets
                var topNeighborsOfUser = neighborsByUser[indexOfUser];
                //Dictionary<int, double> topKNeighbors = KNNCore.GetTopKNeighborsByUser(userSimilarities, indexOfUser, K);

                double meanOfUser = meanByUser[indexOfUser];

                // Loop through each ratingto be predicted
                foreach (Tuple <int, double> unknownRating in unknownRatings.Ratings)
                {
                    int itemIndex = unknownRating.Item1;
                    double prediction;

                    // TODO: we actually should use the Top-K neighbors
                    // that have rated this item, otherwise we may have
                    // only a few neighbors rated this item

                    // Compute the average rating on item iid given
                    // by the top K neighbors. Each rating is offsetted by
                    // the neighbor's average and weighted by the similarity
                    double weightedSum   = 0;
                    double weightSum     = 0;
                    int currentTopKCount = 0;
                    foreach (KeyValuePair <int, double> neighbor in topNeighborsOfUser)
                    {
                        int neighborIndex           = neighbor.Key;
                        double similarityOfNeighbor = neighbor.Value;
                        double itemRatingOfNeighbor = R_train[neighborIndex, itemIndex];

                        // We count only if the neighbor has seen this item before
                        if (itemRatingOfNeighbor != 0)
                        {
                            weightSum   += similarityOfNeighbor;
                            weightedSum += (itemRatingOfNeighbor - meanByUser[neighborIndex]) * similarityOfNeighbor;
                            currentTopKCount++;
                            if (currentTopKCount >= K)
                            {
                                break;
                            }                                       // Stop when we have seen K neighbors
                        }
                    }
                    // A zero weightedSum means this is a cold item and global mean will be assigned by default
                    if (weightedSum != 0)
                    {
                        prediction = meanOfUser + weightedSum / weightSum;
                    }
                    else
                    {
                        prediction = globalMean;
                        globalMeanCount++;
                    }

                    // Cap the ratings
                    if (prediction > Config.Ratings.MaxRating)
                    {
                        cappedCount++;
                        prediction = Config.Ratings.MaxRating;
                    }
                    if (prediction < Config.Ratings.MinRating)
                    {
                        cappedCount++;
                        prediction = Config.Ratings.MinRating;
                    }

                    lock (lockMe)
                    {
                        R_predicted[indexOfUser, itemIndex] = prediction;
                    }
                }
            });
            Utils.PrintValue("# capped predictions", cappedCount.ToString("D"));
            Utils.PrintValue("# default predictions", globalMeanCount.ToString("D"));
            return(R_predicted);
        }