Ejemplo n.º 1
        // NOTE(review): this listing appears to have lost its brace-only lines and some statements
        // (e.g. the statement heads in front of the two `new SimilarityData(...)` fragments and the
        // final return). The comments below describe only what is visible here.
        /// <summary>
        /// Compares <paramref name="userAnswer"/> with <paramref name="correctAnswer"/> and collects candidate
        /// <see cref="SimilarityData"/> results: one or more per enabled rule (obtained from recursive calls on
        /// relaxed versions of the strings) plus one for the Levenshtein distance of the unmodified strings.
        /// </summary>
        /// <param name="userAnswer">The answer string being evaluated.</param>
        /// <param name="correctAnswer">The string <paramref name="userAnswer"/> is compared against.</param>
        /// <param name="card">Passed through unchanged to recursive calls and to the <see cref="SimilarityData"/> constructor.</param>
        /// <param name="rules">Flags selecting which relaxations are tried.</param>
        /// <param name="certainty">Certainty recorded for the direct comparison. Recursive calls pass the larger
        /// (by integer cast) of this value and a rule-specific minimum.</param>
        /// <returns>Presumably the best candidate; the return statement is not visible in this listing (TODO confirm).</returns>
        private static SimilarityData Similarity(string userAnswer, string correctAnswer, Card card, Rules rules, CorrectCertainty certainty)
            // Candidate results gathered from every rule that applies.
            var similarityData = new List <SimilarityData>();

            // Local helper; its call sites are not visible in this listing.
            void KeepBestSimilarityData()
                // Sort by ascending Difference, break ties by ascending integer value of Certainty,
                // then keep only the first element.
                similarityData = similarityData.OrderBy(x => x.Difference).ThenBy(x => (int)x.Certainty).ToList();
                similarityData = similarityData.Take(1).ToList();

            // Rule: retry with leading spaces removed from both strings. The flag is cleared in the
            // recursive call so the rule is not applied again further down.
            if (rules.HasFlag(Rules.IgnoreOpeningWhitespace))
                similarityData.Add(Similarity(userAnswer.TrimStart(' '), correctAnswer.TrimStart(' '), card, rules & ~Rules.IgnoreOpeningWhitespace,
                                              (CorrectCertainty)Math.Max((int)CorrectCertainty.ProbablyCorrect, (int)certainty)));
                // Math.Max to use worst certainty (if the certainty when calling this method was 'maybe correct', new certainty can't be 'probably correct' for instance)

            // Rule: retry with trailing spaces removed from both strings (flag cleared for the recursion).
            if (rules.HasFlag(Rules.IgnoreEndingWhitespace))
                similarityData.Add(Similarity(userAnswer.TrimEnd(' '), correctAnswer.TrimEnd(' '), card, rules & ~Rules.IgnoreEndingWhitespace,
                                              (CorrectCertainty)Math.Max((int)CorrectCertainty.ProbablyCorrect, (int)certainty)));

            // Rule: retry with both strings passed through CapitalizeFirstChar (helper defined outside
            // this listing; flag cleared for the recursion).
            if (rules.HasFlag(Rules.IgnoreFirstCapitalization))
                similarityData.Add(Similarity(CapitalizeFirstChar(userAnswer), CapitalizeFirstChar(correctAnswer), card, rules & ~Rules.IgnoreFirstCapitalization,
                                              (CorrectCertainty)Math.Max((int)CorrectCertainty.ProbablyCorrect, (int)certainty)));

            // Rule: retry with trailing '.' characters removed from both strings (flag cleared for the recursion).
            if (rules.HasFlag(Rules.IgnoreDotsInEnd))
                similarityData.Add(Similarity(userAnswer.TrimEnd('.'), correctAnswer.TrimEnd('.'), card, rules & ~Rules.IgnoreDotsInEnd,
                                              (CorrectCertainty)Math.Max((int)CorrectCertainty.ProbablyCorrect, (int)certainty)));

            // Rule: when the correct answer contains '/', every non-blank '/'-separated part of the user
            // answer is compared with every non-blank part of the correct answer.
            if (rules.HasFlag(Rules.TreatWordsBetweenSlashAsSynonyms))
                if (correctAnswer.Contains("/"))
                    // Holds the best match found for each part the user entered.
                    var  synonymSimilarities = new List <SimilarityData>();
                    // Becomes true as soon as the user answer yields at least one non-blank part.
                    bool any = false;
                    foreach (var userSynonym in userAnswer.Split('/').Where(x => !string.IsNullOrWhiteSpace(x)))
                        any = true;

                        var matches = new List <SimilarityData>();
                        foreach (var correctSynonym in correctAnswer.Split('/').Where(x => !string.IsNullOrWhiteSpace(x)))
                            // Only the correct part has its leading spaces trimmed; the user part is passed
                            // as-is and `rules` is passed unchanged.
                            matches.Add(Similarity(userSynonym, correctSynonym.TrimStart(' '), card, rules,
                                                   (CorrectCertainty)Math.Max((int)CorrectCertainty.ProbablyCorrect, (int)certainty)));

                        // Add best match
                        // NOTE(review): First() throws InvalidOperationException when `matches` is empty,
                        // which happens if correctAnswer has no non-blank parts (e.g. "/") — confirm whether
                        // that input is possible.
                        synonymSimilarities.Add(matches.OrderBy(x => x.Difference).First());

                    if (any)
                        // At least one synonym was entered!

                        if (synonymSimilarities.All(x => x.Difference == 0))
                            // Provided synonyms are correct
                            // NOTE(review): the two lines below are statement fragments; their heads (and
                            // presumably an `else` between them) are missing from this listing. The first
                            // builds a result with difference 0, the second uses the largest Difference
                            // among the kept matches.
                                new SimilarityData(0, (CorrectCertainty)Math.Max((int)CorrectCertainty.ProbablyCorrect, (int)certainty), correctAnswer, card));
                                new SimilarityData(synonymSimilarities.Select(x => x.Difference).Max(),
                                                   (CorrectCertainty)Math.Max((int)CorrectCertainty.ProbablyCorrect, (int)certainty), correctAnswer, card));

            // Rule: when the correct answer contains both '(' and ')', compare the user answer with
            // several variants of the correct answer derived from the parenthesised part.
            if (rules.HasFlag(Rules.TreatWordInParenthesisAsOptional))
                if (correctAnswer.Contains("(") && correctAnswer.Contains(")"))
                    // Variant 1: the text before the first '(' (only when the answer does not start with '(').
                    if (!correctAnswer.TrimStart().StartsWith("("))
                        string w1 = correctAnswer.Split('(')[0].TrimEnd(' '); // tarp (tarpaulin) => tarp
                        similarityData.Add(Similarity(userAnswer, w1, card, rules, (CorrectCertainty)Math.Max((int)CorrectCertainty.ProbablyCorrect, (int)certainty)));

                    // Variant 2: the text inside the first pair of parentheses. This variant uses
                    // MaybeCorrect as its minimum certainty, unlike the other variants.
                    string w2 = correctAnswer.Split('(')[1].Split(')')[0].TrimStart(' ').TrimEnd(' '); // tarp (tarpaulin) => tarpaulin
                    similarityData.Add(Similarity(userAnswer, w2, card, rules, (CorrectCertainty)Math.Max((int)CorrectCertainty.MaybeCorrect, (int)certainty)));

                    //string w3 = correctAnswer.Replace("(", "").Replace(")", ""); // (eye)lash => eyelash

                    // Regexes matching a literal '(' and a literal ')'.
                    var rgp1 = new Regex(Regex.Escape("("));
                    var rgp2 = new Regex(Regex.Escape(")"));

                    // Variant 3: the answer with only the first '(' and the first ')' removed.
                    string w3 = rgp1.Replace(correctAnswer, "", 1); // (eye)lash => eye)lash (replace first occurrence of opening parenthesis)
                    w3 = rgp2.Replace(w3, "", 1);                   // eye)lash => eyelash (replace first occurrence of closing parenthesis)

                    similarityData.Add(Similarity(userAnswer, w3, card, rules, (CorrectCertainty)Math.Max((int)CorrectCertainty.ProbablyCorrect, (int)certainty)));

                    // Variant 4: the text after the first ')', tried when the answer does not end with ')'
                    // or contains more than one ')'.
                    if (!correctAnswer.TrimEnd().EndsWith(")") || correctAnswer.Count(c => c == ')') > 1)
                        string w4 = correctAnswer.Split(new[] { ')' }, 2)[1].TrimStart(' '); // (eye)lash => lash
                        similarityData.Add(Similarity(userAnswer, w4, card, rules, (CorrectCertainty)Math.Max((int)CorrectCertainty.ProbablyCorrect, (int)certainty)));

            // Baseline: Levenshtein distance between the unmodified strings, recorded with the caller's certainty.
            int difference = Fastenshtein.Levenshtein.Distance(userAnswer, correctAnswer);

            similarityData.Add(new SimilarityData(difference, certainty, correctAnswer, card));
//#warning the best similarity data that is being kept is not necessarily equal to the written answer in the quiz!!! this potentially shows a wrong answer in "ProbablyCorrectAnswer" dialog
            // NOTE(review): First() returns the first list element; this is the best candidate only if
            // KeepBestSimilarityData() ran just before — presumably that call was lost from this listing.
            SimilarityData best = similarityData.First();

Ejemplo n.º 2
        /// <summary>
        /// The user-based KNN collaborative filtering described in paper:
        /// Resnick, P., et al., "GroupLens: an open architecture for collaborative filtering of netnews", 1994.
        /// Link: http://dx.doi.org/10.1145/192844.192905
        /// </summary>
        /// <param name="R_train">Training rating matrix; must have the same user and item counts as <paramref name="R_unknown"/> (asserted in debug builds). A stored value of 0 is treated as "not rated".</param>
        /// <param name="R_unknown">Matrix whose per-user entries identify the (user, item) cells to predict.</param>
        /// <param name="neighborsByUser">Indexed by user index; yields (neighbor index, similarity) pairs for that user.</param>
        /// <param name="K">Neighbor count limit compared against the per-item counter.</param>
        /// <returns>Presumably the prediction matrix built in this method; the return statement is not visible in this listing (TODO confirm).</returns>
        // NOTE(review): this listing appears to have lost its brace-only lines and some statements
        // (e.g. `else`, `break`, counter increments, the final return).
        public static DataMatrix PredictRatings(DataMatrix R_train, DataMatrix R_unknown, SimilarityData neighborsByUser, int K)
            // Debug
            Debug.Assert(R_train.UserCount == R_unknown.UserCount);
            Debug.Assert(R_train.ItemCount == R_unknown.ItemCount);
            // NOTE(review): both counters are printed at the end, but no increment is visible in this listing.
            int cappedCount = 0, globalMeanCount = 0;

            // This matrix stores predictions
            DataMatrix R_predicted = new DataMatrix(R_unknown.UserCount, R_unknown.ItemCount);

            // Basic statistics from train set
            double          globalMean = R_train.GetGlobalMean();
            Vector <double> meanByUser = R_train.GetUserMeans();
            // NOTE(review): meanByItem is not read anywhere in the visible code.
            Vector <double> meanByItem = R_train.GetItemMeans();

            // Predict ratings for each test user
            // NOTE(review): the original comment here said a single thread is fast enough and
            // Parallel.ForEach is unnecessary, yet the code below does use Parallel.ForEach — confirm
            // which is intended.
            // Guards the writes to R_predicted made from the parallel loop body.
            Object lockMe = new Object();

            Parallel.ForEach(R_unknown.Users, user =>
                int indexOfUser             = user.Item1;
                // NOTE(review): userRatings is not read anywhere in the visible code.
                RatingVector userRatings    = new RatingVector(R_train.GetRow(indexOfUser));
                RatingVector unknownRatings = new RatingVector(user.Item2);

                Utils.PrintEpoch("Predicting user/total", indexOfUser, R_train.UserCount);

                // Note that there are more than K neighbors in the list (sorted by similarity)
                // we will use the top-K neighbors WHO HAVE RATED THE ITEM
                // For example we have 200 top neighbors, and we hope there are
                // K neighbors in the list have rated the item. We can't keep
                // everyone in the neighbor list because there are too many for large data sets
                var topNeighborsOfUser = neighborsByUser[indexOfUser];
                //Dictionary<int, double> topKNeighbors = KNNCore.GetTopKNeighborsByUser(userSimilarities, indexOfUser, K);

                double meanOfUser = meanByUser[indexOfUser];

                // Loop through each rating to be predicted
                foreach (Tuple <int, double> unknownRating in unknownRatings.Ratings)
                    int itemIndex = unknownRating.Item1;
                    double prediction;

                    // TODO: we actually should use the Top-K neighbors
                    // that have rated this item, otherwise we may have
                    // only a few neighbors rated this item

                    // Compute the average rating on item iid given
                    // by the top K neighbors. Each rating is offset by
                    // the neighbor's average and weighted by the similarity
                    double weightedSum   = 0;
                    double weightSum     = 0;
                    // NOTE(review): no increment of currentTopKCount is visible in this listing.
                    int currentTopKCount = 0;
                    foreach (KeyValuePair <int, double> neighbor in topNeighborsOfUser)
                        int neighborIndex           = neighbor.Key;
                        double similarityOfNeighbor = neighbor.Value;
                        double itemRatingOfNeighbor = R_train[neighborIndex, itemIndex];

                        // We count only if the neighbor has seen this item before
                        if (itemRatingOfNeighbor != 0)
                            weightSum   += similarityOfNeighbor;
                            weightedSum += (itemRatingOfNeighbor - meanByUser[neighborIndex]) * similarityOfNeighbor;
                            // NOTE(review): the body of this check (presumably a `break`) is missing from the listing.
                            if (currentTopKCount >= K)
                            }                                       // Stop when we have seen K neighbors
                    // A zero weightedSum means this is a cold item and global mean will be assigned by default
                    // NOTE(review): weightedSum is also zero when the neighbors' mean-offset ratings cancel
                    // out, so this test is not equivalent to "no neighbor rated the item" — confirm intent.
                    // The `else` between the two assignments below appears to be missing from the listing.
                    if (weightedSum != 0)
                        prediction = meanOfUser + weightedSum / weightSum;
                        prediction = globalMean;

                    // Cap the ratings
                    if (prediction > Config.Ratings.MaxRating)
                        prediction = Config.Ratings.MaxRating;
                    if (prediction < Config.Ratings.MinRating)
                        prediction = Config.Ratings.MinRating;

                    // Writes to the shared prediction matrix are serialized through lockMe.
                    lock (lockMe)
                        R_predicted[indexOfUser, itemIndex] = prediction;
            Utils.PrintValue("# capped predictions", cappedCount.ToString("D"));
            Utils.PrintValue("# default predictions", globalMeanCount.ToString("D"));
Ejemplo n.º 3
        // NOTE(review): this listing appears to have lost its brace-only lines and some statements
        // (e.g. the end of the File.Exists condition, the `else` before the compute branch, the final return).
        /// <summary>
        /// Prepares the rating-based data: splits the data set into train/test matrices, derives the
        /// unknown-rating index matrix and the relevant items per user, then loads or computes the
        /// user-user and item-item similarity data, and finally sets <c>ReadyForNumerical</c>.
        /// </summary>
        /// <param name="saveLoadedData">When true, similarity data computed here is written out with <c>Utils.IO.SaveObject</c>.</param>
        /// <returns>"Is ready." when <c>ReadyForNumerical</c> is already set; otherwise presumably the accumulated
        /// log text — that return statement is not visible in this listing (TODO confirm).</returns>
        public string GetReadyForNumerical(bool saveLoadedData = true)
            // Nothing to do if the preparation already ran.
            if (ReadyForNumerical)
                return("Is ready.");

            // Collects the text of the steps performed below.
            StringBuilder log = new StringBuilder();


            log.AppendLine(Utils.PrintHeading("Create R_train/R_test sets from " + DataSetFile));
            Utils.LoadMovieLensSplitByCount(DataSetFile, out R_train,
                                            out R_test, MinCountOfRatings, MaxCountOfRatings, CountOfRatingsForTrain, ShuffleData, Seed);

            // The data set summaries go both to the console and to the log.
            Console.WriteLine(R_train.DatasetBrief("Train set"));
            Console.WriteLine(R_test.DatasetBrief("Test set"));
            log.AppendLine(R_train.DatasetBrief("Train set"));
            log.AppendLine(R_test.DatasetBrief("Test set"));

            // R_unknown is derived from the non-zero entries of the test set.
            R_unknown = R_test.IndexesOfNonZeroElements();

            log.AppendLine(Utils.PrintValue("Relevant item criteria", RelevantItemCriteria.ToString("0.0")));
            RelevantItemsByUser = ItemRecommendationCore.GetRelevantItemsByUser(R_test, RelevantItemCriteria);
            log.AppendLine(Utils.PrintValue("Mean # of relevant items per user",
                                            RelevantItemsByUser.Average(k => k.Value.Count).ToString("0")));

            #region Prepare similarity data
            // Load branch: taken when the saved similarity files exist.
            // NOTE(review): the condition is cut off after the second `&&`; presumably a third
            // File.Exists check (for "SSIIR") followed. Unlike the heading at the top of the method,
            // the PrintHeading results in this section are not appended to `log`.
            if (File.Exists(GetDataFileName("USR")) &&
                File.Exists(GetDataFileName("ISR")) &&
                Utils.PrintHeading("Load user-user similarities (rating based)");
                UserSimilaritiesOfRating = Utils.IO <SimilarityData> .LoadObject(GetDataFileName("USR"));


                Utils.PrintHeading("Load item-item similarities (rating based)");
                ItemSimilaritiesOfRating = Utils.IO <SimilarityData> .LoadObject(GetDataFileName("ISR"));


                Utils.PrintHeading("Load item-item strong similarity indicators (rating based)");
                StrongSimilarityIndicatorsByItemRating = Utils.IO <HashSet <Tuple <int, int> > > .LoadObject(GetDataFileName("SSIIR"));

                // Compute branch (presumably the `else` of the check above): Pearson similarities
                // between the rows of R_train, saved under "USR" on request.
                Utils.PrintHeading("Compute user-user similarities (rating based)");
                Metric.GetPearsonOfRows(R_train, MaxCountOfNeighbors, StrongSimilarityThreshold,
                                        out UserSimilaritiesOfRating);
                if (saveLoadedData)
                    Utils.IO <SimilarityData> .SaveObject(UserSimilaritiesOfRating, GetDataFileName("USR"));

                // Pearson similarities between the columns of R_train, plus the strong-similarity
                // indicator set; saved under "ISR" and "SSIIR" on request.
                Utils.PrintHeading("Compute item-item similarities (rating based)");
                Metric.GetPearsonOfColumns(R_train, MaxCountOfNeighbors, StrongSimilarityThreshold,
                                           out ItemSimilaritiesOfRating, out StrongSimilarityIndicatorsByItemRating);
                if (saveLoadedData)
                    Utils.IO <SimilarityData> .SaveObject(ItemSimilaritiesOfRating, GetDataFileName("ISR"));

                    Utils.IO <HashSet <Tuple <int, int> > >
                    .SaveObject(StrongSimilarityIndicatorsByItemRating, GetDataFileName("SSIIR"));

            // Mark the preparation as done so later calls return early.
            ReadyForNumerical = true;

Ejemplo n.º 4
        // NOTE(review): this listing appears to have lost its brace-only lines and some statements
        // (e.g. the body of the first `if`, the end of the File.Exists condition, the `else` before the
        // compute branch, the final return).
        /// <summary>
        /// Prepares the preference-relation data: converts <c>R_train</c> into <c>PR_train</c>, then loads
        /// or computes the preference-based user-user and item-item similarity data, and finally sets
        /// <c>ReadyForOrdinal</c>.
        /// </summary>
        /// <param name="saveLoadedData">When true, similarity data computed here is written out with <c>Utils.IO.SaveObject</c>.</param>
        /// <returns>"Is ready." when <c>ReadyForOrdinal</c> is already set; otherwise presumably the accumulated
        /// log text — that return statement is not visible in this listing (TODO confirm).</returns>
        public string GetReadyForOrdinal(bool saveLoadedData = true)
            // NOTE(review): the body of this check is missing from the listing; presumably it runs the
            // numerical preparation first, since R_train is read below — confirm.
            if (!ReadyForNumerical)
            // Nothing to do if the preparation already ran.
            if (ReadyForOrdinal)
                return("Is ready.");

            // Collects the text of the steps performed below.
            StringBuilder log = new StringBuilder();

            log.AppendLine(Utils.PrintHeading("Prepare preferecen relation data"));

            // The progress message goes both to the console and to the log.
            Console.WriteLine("Converting R_train into PR_train");
            log.AppendLine("Converting R_train into PR_train");
            PR_train = PrefRelations.CreateDiscrete(R_train);

            //Console.WriteLine("Converting R_test into PR_test");
            //log.AppendLine("Converting R_test into PR_test");
            //PR_test = PrefRelations.CreateDiscrete(R_test);


            #region Prepare similarity data
            // Load branch: taken when the saved similarity files exist.
            // NOTE(review): the condition is cut off after the second `&&`; presumably a third
            // File.Exists check (for "SSIIP") followed. The PrintHeading results in this section are
            // not appended to `log`.
            if (File.Exists(GetDataFileName("USP")) &&
                File.Exists(GetDataFileName("ISP")) &&
                Utils.PrintHeading("Load user, item, indicators variables (Pref based)");
                UserSimilaritiesOfPref = Utils.IO <SimilarityData> .LoadObject(GetDataFileName("USP"));

                ItemSimilaritiesOfPref = Utils.IO <SimilarityData> .LoadObject(GetDataFileName("ISP"));

                StrongSimilarityIndicatorsByItemPref = Utils.IO <HashSet <Tuple <int, int> > > .LoadObject(GetDataFileName("SSIIP"));

                // Compute branch (presumably the `else` of the check above): cosine similarities
                // computed from the preference relations.
                Utils.PrintHeading("Compute user-user similarities (Pref based)");
                Metric.GetCosineOfPrefRelations(PR_train, MaxCountOfNeighbors,
                                                StrongSimilarityThreshold, out UserSimilaritiesOfPref);

                // For the moment, we use user-wise preferences to compute
                // item-item similarities, it is not the same as user-user pref similarities
                Utils.PrintHeading("Compute item-item similarities (Pref based)");
                // The position matrix of PR_train is wrapped in a DataMatrix and its columns are
                // compared with Pearson correlation.
                DataMatrix PR_userwise_preferences = new DataMatrix(PR_train.GetPositionMatrix());
                Metric.GetPearsonOfColumns(PR_userwise_preferences, MaxCountOfNeighbors, StrongSimilarityThreshold,
                                           out ItemSimilaritiesOfPref, out StrongSimilarityIndicatorsByItemPref);

                // Save the three computed objects under "USP", "ISP" and "SSIIP" on request.
                if (saveLoadedData)
                    Utils.IO <SimilarityData> .SaveObject(UserSimilaritiesOfPref, GetDataFileName("USP"));

                    Utils.IO <SimilarityData> .SaveObject(ItemSimilaritiesOfPref, GetDataFileName("ISP"));

                    Utils.IO <HashSet <Tuple <int, int> > >
                    .SaveObject(StrongSimilarityIndicatorsByItemPref, GetDataFileName("SSIIP"));

            // Mark the preparation as done so later calls return early.
            ReadyForOrdinal = true;

Ejemplo n.º 5
        // NOTE(review): this listing appears to have lost its brace-only lines and some statements
        // (e.g. `else`, `break`, counter increments, the final return).
        /// <summary>
        /// User-based KNN prediction on item positions derived from preference relations: for every
        /// unknown (user, item) cell, combines the neighbors' mean-offset positions weighted by similarity.
        /// </summary>
        /// <param name="PR_train">Training preference relations; must have the same user and item counts as <paramref name="R_unknown"/> (asserted in debug builds).</param>
        /// <param name="R_unknown">Matrix whose per-user vectors identify the items to predict.</param>
        /// <param name="K">Neighbor count limit compared against the per-item counter.</param>
        /// <param name="neighborsByUser">Indexed by user index; yields (neighbor index, similarity) pairs for that user.</param>
        /// <returns>Presumably the prediction matrix built in this method; the return statement is not visible in this listing (TODO confirm).</returns>
        public static DataMatrix PredictRatings(PrefRelations PR_train,
                                                DataMatrix R_unknown, int K, SimilarityData neighborsByUser)
            Debug.Assert(PR_train.UserCount == R_unknown.UserCount);
            Debug.Assert(PR_train.ItemCount == R_unknown.ItemCount);

            // This matrix stores predictions
            DataMatrix R_predicted = new DataMatrix(R_unknown.UserCount, R_unknown.ItemCount);

            // This can be considered as the R_train in standard UserKNN
            SparseMatrix positionMatrix            = PR_train.GetPositionMatrix();
            DataMatrix   ratingMatrixFromPositions = new DataMatrix(positionMatrix);

            Vector <double> meanByUser = ratingMatrixFromPositions.GetUserMeans();
            // NOTE(review): meanByItem is not read anywhere in the visible code.
            Vector <double> meanByItem = ratingMatrixFromPositions.GetItemMeans();
            double          globalMean = ratingMatrixFromPositions.GetGlobalMean();

            // Predict positions for each test user
            // Appears to be very fast, parallel.foreach is unnecessary
            foreach (Tuple <int, Vector <double> > user in R_unknown.Users)
                int             indexOfUser             = user.Item1;
                Vector <double> indexesOfUnknownRatings = user.Item2;

                Utils.PrintEpoch("Predicting user/total", indexOfUser, PR_train.UserCount);

                // Note that there are more than K neighbors in the list (sorted by similarity)
                // we will use the top-K neighbors WHO HAVE RATED THE ITEM
                // For example we have 200 top neighbors, and we hope there are
                // K neighbors in the list have rated the item. We can't keep
                // everyone in the neighbor list because there are too many for large data sets
                var topNeighborsOfUser = neighborsByUser[indexOfUser];

                double meanOfUser = meanByUser[indexOfUser];

                // Loop through each position to be predicted
                foreach (Tuple <int, double> unknownRating in indexesOfUnknownRatings.EnumerateIndexed(Zeros.AllowSkip))
                    int indexOfUnknownItem = unknownRating.Item1;

                    // Compute the position of this item for the user
                    // by combining neighbors' positions on this item
                    double weightedSum      = 0;
                    double weightSum        = 0;
                    // NOTE(review): no increment of currentTopKCount is visible in this listing, although
                    // it is tested against K and against 0 below.
                    int    currentTopKCount = 0;
                    foreach (KeyValuePair <int, double> neighbor in topNeighborsOfUser)
                        int    indexOfNeighbor        = neighbor.Key;
                        double similarityOfNeighbor   = neighbor.Value;
                        double itemPositionOfNeighbor = ratingMatrixFromPositions[indexOfNeighbor, indexOfUnknownItem];

                        // We count only if the neighbor has seen this item before
                        if (itemPositionOfNeighbor != 0)
                            // Recall that we use a constant to hold position value 0
                            // we revert it back here
                            if (itemPositionOfNeighbor == Config.ZeroInSparseMatrix)
                                // NOTE(review): Debug.Assert only reports when its condition is false, so
                                // Debug.Assert(true, ...) can never fire; `false` was probably intended.
                                Debug.Assert(true, "By using the PositionShift constant, we should not be in here.");
                                itemPositionOfNeighbor = 0;
                            weightSum   += similarityOfNeighbor;
                            weightedSum += (itemPositionOfNeighbor - meanByUser[indexOfNeighbor]) * similarityOfNeighbor;
                            // NOTE(review): the body of this check (presumably a `break`) is missing from the listing.
                            if (currentTopKCount >= K)

                    // If any neighbor has seen this item
                    // NOTE(review): the `else` between the two assignments below appears to be missing
                    // from the listing; the second one assigns the global mean as the fallback value.
                    if (currentTopKCount != 0)
                        // NOTE(review): the original TODO here suggested adding the user mean, but the
                        // line below already adds meanOfUser — the TODO looks stale.
                        R_predicted[indexOfUser, indexOfUnknownItem] = meanOfUser + weightedSum / weightSum;
                        R_predicted[indexOfUser, indexOfUnknownItem] = globalMean;