Exemple #1
0
        /// <summary>
        /// Compares <paramref name="userAnswer"/> with <paramref name="correctAnswer"/> and returns the
        /// best (lowest <c>Difference</c>, then lowest numeric <c>Certainty</c>) similarity found among the
        /// literal comparison and every relaxed comparison enabled in <paramref name="rules"/>.
        /// </summary>
        /// <param name="userAnswer">The answer typed by the user.</param>
        /// <param name="correctAnswer">The expected answer.</param>
        /// <param name="card">Card that is passed through unchanged to every created <c>SimilarityData</c>.</param>
        /// <param name="rules">Flags selecting which relaxations (whitespace, capitalization, dots, synonyms, parentheses) are tried.</param>
        /// <param name="certainty">Certainty the caller already has; relaxed comparisons never report a numerically lower value than this.</param>
        /// <returns>The best-ranked similarity candidate; at least the literal Levenshtein comparison is always considered.</returns>
        private static SimilarityData Similarity(string userAnswer, string correctAnswer, Card card, Rules rules, CorrectCertainty certainty)
        {
            var candidates = new List<SimilarityData>();

            // Math.Max keeps the worst certainty: if this method was entered with 'maybe correct',
            // a relaxed comparison must not upgrade the result to 'probably correct'.
            var probablyCorrect = (CorrectCertainty)Math.Max((int)CorrectCertainty.ProbablyCorrect, (int)certainty);
            var maybeCorrect    = (CorrectCertainty)Math.Max((int)CorrectCertainty.MaybeCorrect, (int)certainty);

            // Each single-flag relaxation recurses with its own flag cleared so that it is applied only once.
            if (rules.HasFlag(Rules.IgnoreOpeningWhitespace))
            {
                candidates.Add(Similarity(userAnswer.TrimStart(' '), correctAnswer.TrimStart(' '), card,
                                          rules & ~Rules.IgnoreOpeningWhitespace, probablyCorrect));
            }

            if (rules.HasFlag(Rules.IgnoreEndingWhitespace))
            {
                candidates.Add(Similarity(userAnswer.TrimEnd(' '), correctAnswer.TrimEnd(' '), card,
                                          rules & ~Rules.IgnoreEndingWhitespace, probablyCorrect));
            }

            if (rules.HasFlag(Rules.IgnoreFirstCapitalization))
            {
                candidates.Add(Similarity(CapitalizeFirstChar(userAnswer), CapitalizeFirstChar(correctAnswer), card,
                                          rules & ~Rules.IgnoreFirstCapitalization, probablyCorrect));
            }

            if (rules.HasFlag(Rules.IgnoreDotsInEnd))
            {
                candidates.Add(Similarity(userAnswer.TrimEnd('.'), correctAnswer.TrimEnd('.'), card,
                                          rules & ~Rules.IgnoreDotsInEnd, probablyCorrect));
            }

            if (rules.HasFlag(Rules.TreatWordsBetweenSlashAsSynonyms) && correctAnswer.Contains("/"))
            {
                // The correct synonyms do not depend on the user's input, so split them only once.
                var correctSynonyms = correctAnswer.Split('/')
                                                   .Where(x => !string.IsNullOrWhiteSpace(x))
                                                   .Select(x => x.TrimStart(' '))
                                                   .ToList();

                // A correct answer such as "/" has no usable synonym; without this guard the
                // best-match lookup below would call First() on an empty sequence and throw.
                if (correctSynonyms.Count > 0)
                {
                    var synonymSimilarities = new List<SimilarityData>();
                    foreach (var userSynonym in userAnswer.Split('/').Where(x => !string.IsNullOrWhiteSpace(x)))
                    {
                        // Keep the best match of this user synonym against all correct synonyms
                        synonymSimilarities.Add(correctSynonyms
                                                .Select(correctSynonym => Similarity(userSynonym, correctSynonym, card, rules, probablyCorrect))
                                                .OrderBy(x => x.Difference)
                                                .First());
                    }

                    if (synonymSimilarities.Count > 0)
                    {
                        // At least one synonym was entered. The answer is as wrong as its worst synonym;
                        // when every provided synonym is correct this maximum is 0.
                        candidates.Add(new SimilarityData(synonymSimilarities.Select(x => x.Difference).Max(),
                                                          probablyCorrect, correctAnswer, card));
                    }
                }
            }

            if (rules.HasFlag(Rules.TreatWordInParenthesisAsOptional)
                && correctAnswer.Contains("(") && correctAnswer.Contains(")"))
            {
                if (!correctAnswer.TrimStart().StartsWith("("))
                {
                    string beforeParenthesis = correctAnswer.Split('(')[0].TrimEnd(' '); // tarp (tarpaulin) => tarp
                    candidates.Add(Similarity(userAnswer, beforeParenthesis, card, rules, probablyCorrect));
                }

                string insideParenthesis = correctAnswer.Split('(')[1].Split(')')[0].TrimStart(' ').TrimEnd(' '); // tarp (tarpaulin) => tarpaulin
                candidates.Add(Similarity(userAnswer, insideParenthesis, card, rules, maybeCorrect));

                // Remove only the FIRST opening and the FIRST closing parenthesis: (eye)lash => eyelash
                var openingParenthesis = new Regex(Regex.Escape("("));
                var closingParenthesis = new Regex(Regex.Escape(")"));

                string withoutParenthesis = openingParenthesis.Replace(correctAnswer, "", 1);
                withoutParenthesis = closingParenthesis.Replace(withoutParenthesis, "", 1);
                candidates.Add(Similarity(userAnswer, withoutParenthesis, card, rules, probablyCorrect));

                if (!correctAnswer.TrimEnd().EndsWith(")") || correctAnswer.Count(c => c == ')') > 1)
                {
                    string afterParenthesis = correctAnswer.Split(new[] { ')' }, 2)[1].TrimStart(' '); // (eye)lash => lash
                    candidates.Add(Similarity(userAnswer, afterParenthesis, card, rules, probablyCorrect));
                }
            }

            // The literal comparison is always a candidate, with the caller's certainty unchanged.
            int difference = Fastenshtein.Levenshtein.Distance(userAnswer, correctAnswer);
            candidates.Add(new SimilarityData(difference, certainty, correctAnswer, card));

            // NOTE: the best candidate kept here is not necessarily equal to the written answer in the quiz;
            // this potentially shows a wrong answer in the "ProbablyCorrectAnswer" dialog.
            return candidates.OrderBy(x => x.Difference)
                             .ThenBy(x => (int)x.Certainty)
                             .First();
        }
Exemple #2
0
        /// <summary>
        /// The user-based KNN collaborative filtering described in paper:
        /// Resnick, P., et al., "GroupLens: an open architecture for collaborative filtering of netnews", 1994.
        /// Link: http://dx.doi.org/10.1145/192844.192905
        /// </summary>
        /// <param name="R_train">Training ratings; a value of 0 at [user, item] is treated as "neighbor has not rated this item".</param>
        /// <param name="R_unknown">Identifies, per user, the ratings to predict; must have the same user and item counts as <paramref name="R_train"/>.</param>
        /// <param name="neighborsByUser">Indexed by user index; yields (neighbor index, similarity) pairs, expected to be sorted by similarity.</param>
        /// <param name="K">Maximum number of neighbors that have rated the item to use for one prediction.</param>
        /// <returns>A matrix holding one prediction, capped to the configured rating range, for every rating listed in <paramref name="R_unknown"/>.</returns>
        public static DataMatrix PredictRatings(DataMatrix R_train, DataMatrix R_unknown, SimilarityData neighborsByUser, int K)
        {
            // Debug
            Debug.Assert(R_train.UserCount == R_unknown.UserCount);
            Debug.Assert(R_train.ItemCount == R_unknown.ItemCount);
            int cappedCount = 0, globalMeanCount = 0;

            // This matrix stores predictions
            DataMatrix R_predicted = new DataMatrix(R_unknown.UserCount, R_unknown.ItemCount);

            // Basic statistics from train set
            double          globalMean = R_train.GetGlobalMean();
            Vector <double> meanByUser = R_train.GetUserMeans();

            // Users are predicted in parallel. Everything the worker threads share and mutate
            // (the prediction matrix and both statistics counters) is only touched while holding this lock.
            object lockMe = new object();

            Parallel.ForEach(R_unknown.Users, user =>
            {
                int indexOfUser             = user.Item1;
                RatingVector unknownRatings = new RatingVector(user.Item2);

                Utils.PrintEpoch("Predicting user/total", indexOfUser, R_train.UserCount);

                // Note that there are more than K neighbors in the list (sorted by similarity)
                // we will use the top-K neighbors WHO HAVE RATED THE ITEM
                // For example we have 200 top neighbors, and we hope there are
                // K neighbors in the list have rated the item. We can't keep
                // everyone in the neighbor list because there are too many for large data sets
                var topNeighborsOfUser = neighborsByUser[indexOfUser];

                double meanOfUser = meanByUser[indexOfUser];

                // Loop through each rating to be predicted
                foreach (Tuple <int, double> unknownRating in unknownRatings.Ratings)
                {
                    int itemIndex = unknownRating.Item1;
                    double prediction;

                    // Compute the average rating on the item given
                    // by the top K neighbors. Each rating is offset by
                    // the neighbor's average and weighted by the similarity
                    double weightedSum   = 0;
                    double weightSum     = 0;
                    int currentTopKCount = 0;
                    foreach (KeyValuePair <int, double> neighbor in topNeighborsOfUser)
                    {
                        int neighborIndex           = neighbor.Key;
                        double similarityOfNeighbor = neighbor.Value;
                        double itemRatingOfNeighbor = R_train[neighborIndex, itemIndex];

                        // We count only if the neighbor has seen this item before
                        if (itemRatingOfNeighbor != 0)
                        {
                            weightSum   += similarityOfNeighbor;
                            weightedSum += (itemRatingOfNeighbor - meanByUser[neighborIndex]) * similarityOfNeighbor;
                            currentTopKCount++;
                            if (currentTopKCount >= K)
                            {
                                break; // Stop when we have seen K neighbors
                            }
                        }
                    }

                    // A zero weightedSum is treated as a cold item and the global mean is assigned by default.
                    // NOTE(review): weightedSum is also zero when the neighbors' offsets cancel out exactly;
                    // those items receive the global mean as well.
                    bool usedGlobalMean = false;
                    if (weightedSum != 0)
                    {
                        prediction = meanOfUser + weightedSum / weightSum;
                    }
                    else
                    {
                        prediction     = globalMean;
                        usedGlobalMean = true;
                    }

                    // Cap the ratings
                    int capsApplied = 0;
                    if (prediction > Config.Ratings.MaxRating)
                    {
                        capsApplied++;
                        prediction = Config.Ratings.MaxRating;
                    }
                    if (prediction < Config.Ratings.MinRating)
                    {
                        capsApplied++;
                        prediction = Config.Ratings.MinRating;
                    }

                    lock (lockMe)
                    {
                        // The counters are captured by this lambda and shared by all worker threads;
                        // incrementing them outside the lock would be a data race and lose updates.
                        cappedCount += capsApplied;
                        if (usedGlobalMean)
                        {
                            globalMeanCount++;
                        }

                        R_predicted[indexOfUser, itemIndex] = prediction;
                    }
                }
            });
            Utils.PrintValue("# capped predictions", cappedCount.ToString("D"));
            Utils.PrintValue("# default predictions", globalMeanCount.ToString("D"));
            return R_predicted;
        }
Exemple #3
0
        /// <summary>
        /// Prepares everything the rating-based (numerical) experiments need: the train/test split,
        /// the positions of the test ratings, the relevant items per user and the rating-based
        /// user-user / item-item similarity data.
        /// </summary>
        /// <param name="saveLoadedData">
        /// When similarity data has to be computed because the data files are missing,
        /// controls whether the freshly computed data is written to those files.
        /// </param>
        /// <returns>"Is ready." if the preparation was already done, otherwise the collected log text.</returns>
        public string GetReadyForNumerical(bool saveLoadedData = true)
        {
            // Nothing to do when a previous call already completed the preparation
            if (ReadyForNumerical)
            {
                return "Is ready.";
            }

            var report = new StringBuilder();

            Utils.StartTimer();

            // ---- Train / test split ----
            report.AppendLine(Utils.PrintHeading("Create R_train/R_test sets from " + DataSetFile));
            Utils.LoadMovieLensSplitByCount(DataSetFile,
                                            out R_train,
                                            out R_test,
                                            MinCountOfRatings,
                                            MaxCountOfRatings,
                                            CountOfRatingsForTrain,
                                            ShuffleData,
                                            Seed);

            // The briefs go to the console first and are then repeated in the returned log
            Console.WriteLine(R_train.DatasetBrief("Train set"));
            Console.WriteLine(R_test.DatasetBrief("Test set"));
            report.AppendLine(R_train.DatasetBrief("Train set"))
                  .AppendLine(R_test.DatasetBrief("Test set"));

            R_unknown = R_test.IndexesOfNonZeroElements();

            // ---- Relevant items ----
            report.AppendLine(Utils.PrintValue("Relevant item criteria", RelevantItemCriteria.ToString("0.0")));
            RelevantItemsByUser = ItemRecommendationCore.GetRelevantItemsByUser(R_test, RelevantItemCriteria);

            string meanRelevantItems = RelevantItemsByUser.Average(k => k.Value.Count).ToString("0");
            report.AppendLine(Utils.PrintValue("Mean # of relevant items per user", meanRelevantItems));
            report.AppendLine(Utils.StopTimer());

            #region Prepare similarity data
            // All three data files must be present to take the loading path
            bool similarityFilesExist = File.Exists(GetDataFileName("USR"))
                                        && File.Exists(GetDataFileName("ISR"))
                                        && File.Exists(GetDataFileName("SSIIR"));

            if (!similarityFilesExist)
            {
                // ---- Compute (and optionally persist) user-user similarities ----
                Utils.StartTimer();
                Utils.PrintHeading("Compute user-user similarities (rating based)");
                Metric.GetPearsonOfRows(R_train, MaxCountOfNeighbors, StrongSimilarityThreshold,
                                        out UserSimilaritiesOfRating);
                if (saveLoadedData)
                {
                    Utils.IO<SimilarityData>.SaveObject(UserSimilaritiesOfRating, GetDataFileName("USR"));
                }
                Utils.StopTimer();

                // ---- Compute (and optionally persist) item-item similarities and indicators ----
                Utils.StartTimer();
                Utils.PrintHeading("Compute item-item similarities (rating based)");
                Metric.GetPearsonOfColumns(R_train, MaxCountOfNeighbors, StrongSimilarityThreshold,
                                           out ItemSimilaritiesOfRating, out StrongSimilarityIndicatorsByItemRating);
                if (saveLoadedData)
                {
                    Utils.IO<SimilarityData>.SaveObject(ItemSimilaritiesOfRating, GetDataFileName("ISR"));
                    Utils.IO<HashSet<Tuple<int, int>>>.SaveObject(StrongSimilarityIndicatorsByItemRating,
                                                                  GetDataFileName("SSIIR"));
                }
                Utils.StopTimer();
            }
            else
            {
                // ---- Load the three data files, each timed on its own ----
                Utils.StartTimer();
                Utils.PrintHeading("Load user-user similarities (rating based)");
                UserSimilaritiesOfRating = Utils.IO<SimilarityData>.LoadObject(GetDataFileName("USR"));
                Utils.StopTimer();

                Utils.StartTimer();
                Utils.PrintHeading("Load item-item similarities (rating based)");
                ItemSimilaritiesOfRating = Utils.IO<SimilarityData>.LoadObject(GetDataFileName("ISR"));
                Utils.StopTimer();

                Utils.StartTimer();
                Utils.PrintHeading("Load item-item strong similarity indicators (rating based)");
                StrongSimilarityIndicatorsByItemRating =
                    Utils.IO<HashSet<Tuple<int, int>>>.LoadObject(GetDataFileName("SSIIR"));
                Utils.StopTimer();
            }
            #endregion

            ReadyForNumerical = true;

            return report.ToString();
        }
Exemple #4
0
        /// <summary>
        /// Prepares everything the preference-relation (ordinal) experiments need: makes sure the
        /// numerical preparation has run, converts <c>R_train</c> into <c>PR_train</c> and loads or
        /// computes the preference-based user-user / item-item similarity data.
        /// </summary>
        /// <param name="saveLoadedData">
        /// When similarity data has to be computed because the data files are missing, controls
        /// whether the freshly computed data is written to those files. The flag is also forwarded
        /// to <see cref="GetReadyForNumerical"/> when that preparation has to be triggered from here.
        /// </param>
        /// <returns>"Is ready." if the preparation was already done, otherwise the collected log text.</returns>
        public string GetReadyForOrdinal(bool saveLoadedData = true)
        {
            if (!ReadyForNumerical)
            {
                // Forward the caller's choice; calling without the argument would always use the
                // default (true) and write the numerical data files even when saving was declined.
                GetReadyForNumerical(saveLoadedData);
            }
            if (ReadyForOrdinal)
            {
                return "Is ready.";
            }

            StringBuilder log = new StringBuilder();

            Utils.StartTimer();
            log.AppendLine(Utils.PrintHeading("Prepare preferecen relation data"));

            Console.WriteLine("Converting R_train into PR_train");
            log.AppendLine("Converting R_train into PR_train");
            PR_train = PrefRelations.CreateDiscrete(R_train);

            log.AppendLine(Utils.StopTimer());

            #region Prepare similarity data
            if (File.Exists(GetDataFileName("USP")) &&
                File.Exists(GetDataFileName("ISP")) &&
                File.Exists(GetDataFileName("SSIIP")))
            {
                Utils.StartTimer();
                Utils.PrintHeading("Load user, item, indicators variables (Pref based)");
                UserSimilaritiesOfPref = Utils.IO<SimilarityData>.LoadObject(GetDataFileName("USP"));

                ItemSimilaritiesOfPref = Utils.IO<SimilarityData>.LoadObject(GetDataFileName("ISP"));

                StrongSimilarityIndicatorsByItemPref = Utils.IO<HashSet<Tuple<int, int>>>.LoadObject(GetDataFileName("SSIIP"));

                Utils.StopTimer();
            }
            else
            {
                Utils.StartTimer();
                Utils.PrintHeading("Compute user-user similarities (Pref based)");
                Metric.GetCosineOfPrefRelations(PR_train, MaxCountOfNeighbors,
                                                StrongSimilarityThreshold, out UserSimilaritiesOfPref);
                Utils.StopTimer();

                // For the moment, we use user-wise preferences to compute
                // item-item similarities, it is not the same as user-user pref similarities
                Utils.StartTimer();
                Utils.PrintHeading("Compute item-item similarities (Pref based)");
                DataMatrix PR_userwise_preferences = new DataMatrix(PR_train.GetPositionMatrix());
                Metric.GetPearsonOfColumns(PR_userwise_preferences, MaxCountOfNeighbors, StrongSimilarityThreshold,
                                           out ItemSimilaritiesOfPref, out StrongSimilarityIndicatorsByItemPref);
                Utils.StopTimer();

                // Saving happens outside the timed sections, so no StopTimer() follows it:
                // every StartTimer() above has already been matched by exactly one StopTimer().
                if (saveLoadedData)
                {
                    Utils.IO<SimilarityData>.SaveObject(UserSimilaritiesOfPref, GetDataFileName("USP"));

                    Utils.IO<SimilarityData>.SaveObject(ItemSimilaritiesOfPref, GetDataFileName("ISP"));

                    Utils.IO<HashSet<Tuple<int, int>>>
                    .SaveObject(StrongSimilarityIndicatorsByItemPref, GetDataFileName("SSIIP"));
                }
            }
            #endregion

            ReadyForOrdinal = true;

            return log.ToString();
        }
Exemple #5
0
        /// <summary>
        /// User-based KNN prediction on preference relations: the position matrix derived from
        /// <paramref name="PR_train"/> plays the role that the rating matrix plays in standard UserKNN.
        /// </summary>
        /// <param name="PR_train">Training preference relations; must have the same user and item counts as <paramref name="R_unknown"/>.</param>
        /// <param name="R_unknown">Identifies, per user, the items whose position is to be predicted.</param>
        /// <param name="K">Maximum number of neighbors that have seen the item to use for one prediction.</param>
        /// <param name="neighborsByUser">Indexed by user index; yields (neighbor index, similarity) pairs, expected to be sorted by similarity.</param>
        /// <returns>
        /// A matrix holding, for every requested (user, item), the user's mean position plus the
        /// similarity-weighted mean offset of the neighbors, or the global mean when no neighbor has seen the item.
        /// </returns>
        public static DataMatrix PredictRatings(PrefRelations PR_train,
                                                DataMatrix R_unknown, int K, SimilarityData neighborsByUser)
        {
            Debug.Assert(PR_train.UserCount == R_unknown.UserCount);
            Debug.Assert(PR_train.ItemCount == R_unknown.ItemCount);

            // This matrix stores predictions
            DataMatrix R_predicted = new DataMatrix(R_unknown.UserCount, R_unknown.ItemCount);

            // This can be considered as the R_train in standard UserKNN
            SparseMatrix positionMatrix            = PR_train.GetPositionMatrix();
            DataMatrix   ratingMatrixFromPositions = new DataMatrix(positionMatrix);

            Vector <double> meanByUser = ratingMatrixFromPositions.GetUserMeans();
            double          globalMean = ratingMatrixFromPositions.GetGlobalMean();

            // Predict positions for each test user (sequentially)
            foreach (Tuple <int, Vector <double> > user in R_unknown.Users)
            {
                int             indexOfUser             = user.Item1;
                Vector <double> indexesOfUnknownRatings = user.Item2;

                Utils.PrintEpoch("Predicting user/total", indexOfUser, PR_train.UserCount);

                // Note that there are more than K neighbors in the list (sorted by similarity)
                // we will use the top-K neighbors WHO HAVE RATED THE ITEM
                // For example we have 200 top neighbors, and we hope there are
                // K neighbors in the list have rated the item. We can't keep
                // everyone in the neighbor list because there are too many for large data sets
                var topNeighborsOfUser = neighborsByUser[indexOfUser];

                double meanOfUser = meanByUser[indexOfUser];

                // Loop through each position to be predicted
                foreach (Tuple <int, double> unknownRating in indexesOfUnknownRatings.EnumerateIndexed(Zeros.AllowSkip))
                {
                    int indexOfUnknownItem = unknownRating.Item1;

                    // Compute the position of this item for the user
                    // by combining neighbors' positions on this item
                    double weightedSum      = 0;
                    double weightSum        = 0;
                    int    currentTopKCount = 0;
                    foreach (KeyValuePair <int, double> neighbor in topNeighborsOfUser)
                    {
                        int    indexOfNeighbor        = neighbor.Key;
                        double similarityOfNeighbor   = neighbor.Value;
                        double itemPositionOfNeighbor = ratingMatrixFromPositions[indexOfNeighbor, indexOfUnknownItem];

                        // We count only if the neighbor has seen this item before
                        if (itemPositionOfNeighbor != 0)
                        {
                            // Recall that we use a constant to hold position value 0
                            // we revert it back here
                            if (itemPositionOfNeighbor == Config.ZeroInSparseMatrix)
                            {
                                // Assert(false, ...) so the check actually fires in debug builds;
                                // Assert(true, ...) can never fail and silently hid this unexpected state.
                                Debug.Assert(false, "By using the PositionShift constant, we should not be in here.");
                                itemPositionOfNeighbor = 0;
                            }
                            weightSum   += similarityOfNeighbor;
                            weightedSum += (itemPositionOfNeighbor - meanByUser[indexOfNeighbor]) * similarityOfNeighbor;
                            currentTopKCount++;
                            if (currentTopKCount >= K)
                            {
                                break; // Stop when we have seen K neighbors
                            }
                        }
                    }

                    // If any neighbor has seen this item
                    if (currentTopKCount != 0)
                    {
                        R_predicted[indexOfUser, indexOfUnknownItem] = meanOfUser + weightedSum / weightSum;
                    }
                    else
                    {
                        R_predicted[indexOfUser, indexOfUnknownItem] = globalMean;
                    }
                }
            }
            return R_predicted;
        }