コード例 #1
0
        /// <summary>
        /// Runs <c>ErrorOptimizer.Optimize</c> with a callback that clusters the reference documents
        /// using a candidate pair of threshold values, compares the result with the original clusters
        /// and records the average error score of every attempt in a freshly named CSV file.
        /// </summary>
        /// <param name="programArgs">Program arguments forwarded to <c>LoadCorrelationMatrix</c>.</param>
        private static void FindBestThresholds(ProgramArguments programArgs)
        {
            IEnumerable<DocumentCluster> originalClusters  = GetSimilarNewsTopicFiles();
            IEnumerable<Document>        documents         = Flatten(originalClusters);
            CorrelationMatrix            correlationMatrix = LoadCorrelationMatrix(programArgs);

            // The callback reports success once the absolute average error is at or below this value.
            const double errorGoal = 0.01;

            string fileName = Guid.NewGuid().ToString() + ".csv";

            using (StreamWriter sw = new StreamWriter(fileName))
            {
                // The arguments 0, 1, 0, 1 are presumably the search bounds for the two thresholds
                // (TODO confirm against ErrorOptimizer.Optimize).
                ErrorOptimizer.Optimize(0, 1, 0, 1, (permissibleValue, variationValue) =>
                {
                    SimilarityAlgorithm similarityAlgorithm = new SimilarityAlgorithm(correlationMatrix, permissibleValue, variationValue);
                    DocumentCategorizer categorizer         = new DocumentCategorizer(similarityAlgorithm);
                    IEnumerable<DocumentCluster> resultClusters = categorizer.Cluster(documents);

                    // NOTE(review): ExperimentPandVThresholds passes these two arguments in the opposite
                    // order (result first) - confirm which order CalculateErrorScore expects.
                    IEnumerable<DocumentClusterErrorScore> errorScores = CalculateErrorScore(originalClusters, resultClusters);

                    double average = errorScores.Select(score => score.Value).Average();
                    Console.WriteLine("Average Error: " + average);

                    // Format with the invariant culture: with a comma-decimal current culture the
                    // numbers would otherwise contain commas and break the CSV columns.
                    sw.WriteLine(string.Format(
                        System.Globalization.CultureInfo.InvariantCulture,
                        "{0}, {1}, {2}",
                        permissibleValue,
                        variationValue,
                        average));

                    return Math.Abs(average) <= errorGoal;
                });
            }
        }
コード例 #2
0
        /// <summary>
        /// Writes a CSV report with one row per statement pair: the two original statements,
        /// the smaller of the two directional similarities of their stemmed forms, and the
        /// absolute difference between those two similarities.
        /// </summary>
        /// <param name="pairs">Statement pairs to compare.</param>
        /// <param name="programArgs">Program arguments forwarded to <c>LoadCorrelationMatrix</c>.</param>
        private static void OutputThresholdReport(List<Tuple<Statement, Statement>> pairs, ProgramArguments programArgs)
        {
            CorrelationMatrix   correlationMatrix = LoadCorrelationMatrix(programArgs);
            SimilarityAlgorithm sim = new SimilarityAlgorithm(correlationMatrix);
            StringBuilder       sb  = new StringBuilder();

            foreach (Tuple<Statement, Statement> pair in pairs)
            {
                Statement s1 = StemStatement(pair.Item1);
                Statement s2 = StemStatement(pair.Item2);

                // Similarity is computed in both directions.
                double s12 = sim.StatementSimilarityToStatement(s1, s2);
                double s21 = sim.StatementSimilarityToStatement(s2, s1);

                // Fixed: the last placeholder used to be {2} again, so the difference column was
                // never written. The invariant culture keeps decimal separators from colliding
                // with the CSV delimiter; commas inside the statement text are replaced for the
                // same reason.
                sb.AppendFormat(
                    System.Globalization.CultureInfo.InvariantCulture,
                    "{0},{1},{2},{3}\r\n",
                    pair.Item1.ToString().Replace(',', '.'),
                    pair.Item2.ToString().Replace(',', '.'),
                    Math.Min(s12, s21),
                    Math.Abs(s12 - s21));
            }

            string reportName = "autoRSS_thresholdReport_" + Guid.NewGuid().ToString() + ".csv";

            using (StreamWriter sw = new StreamWriter(reportName))
            {
                sw.WriteLine(sb.ToString());
            }

            Console.WriteLine("Report: " + reportName);
        }
コード例 #3
0
        /// <summary>
        /// Clusters the reference documents for every combination of the two threshold values on a
        /// 0.01-step grid over [0, 1), records the absolute average error score of each combination
        /// and saves the resulting table to a freshly named CSV file.
        /// </summary>
        /// <param name="programArgs">Program arguments forwarded to <c>LoadCorrelationMatrix</c>.</param>
        private static void ExperimentPandVThresholds(ProgramArguments programArgs)
        {
            IEnumerable<DocumentCluster> originalClusters  = GetSimilarNewsTopicFiles();
            IEnumerable<Document>        documents         = Flatten(originalClusters);
            CorrelationMatrix            correlationMatrix = LoadCorrelationMatrix(programArgs);

            const double startP = 0, endP = 1;
            const double startV = 0, endV = 1;
            const double step   = 0.01;

            // Build each axis once so the computation and the report use exactly the same values
            // and the table is sized from the real number of grid points.
            List<double> pValues = BuildThresholdGrid(startP, endP, step);
            List<double> vValues = BuildThresholdGrid(startV, endV, step);

            double[,] errorValues = new double[pValues.Count, vValues.Count];
            for (int row = 0; row < pValues.Count; row++)
            {
                for (int column = 0; column < vValues.Count; column++)
                {
                    SimilarityAlgorithm similarityAlgorithm =
                        new SimilarityAlgorithm(correlationMatrix, pValues[row], vValues[column]);
                    DocumentCategorizer categorizer = new DocumentCategorizer(similarityAlgorithm);
                    IEnumerable<DocumentCluster> resultClusters = categorizer.Cluster(documents);

                    // NOTE(review): FindBestThresholds passes these two arguments in the opposite
                    // order (original first) - confirm which order CalculateErrorScore expects.
                    IEnumerable<DocumentClusterErrorScore> errorScores = CalculateErrorScore(resultClusters, originalClusters);

                    double average = errorScores.Select(score => score.Value).Average();
                    Console.WriteLine("Average Error: " + average);

                    errorValues[row, column] = Math.Abs(average);
                }
            }

            // Numbers go into a comma-separated file, so they must not be formatted with a
            // culture that uses a comma as the decimal separator.
            IFormatProvider csvFormat = System.Globalization.CultureInfo.InvariantCulture;

            string fileName = Guid.NewGuid().ToString() + ".csv";

            using (StreamWriter sw = new StreamWriter(fileName))
            {
                // Header row: a placeholder corner cell followed by the second threshold's values.
                sw.Write("0, ");
                foreach (double v in vValues)
                {
                    sw.Write(string.Format(csvFormat, "{0}, ", v));
                }

                sw.WriteLine();

                // One row per first-threshold value: the value itself, then the error for each column.
                for (int row = 0; row < pValues.Count; row++)
                {
                    sw.Write(string.Format(csvFormat, "{0}, ", pValues[row]));
                    for (int column = 0; column < vValues.Count; column++)
                    {
                        sw.Write(string.Format(csvFormat, "{0}, ", errorValues[row, column]));
                    }

                    sw.WriteLine();
                }
            }

            Logger.Log("Saved experiment to file: " + fileName);
        }

        /// <summary>
        /// Lists the values start, start + step, ... that are strictly below <paramref name="end"/>,
        /// produced by repeatedly adding <paramref name="step"/>.
        /// </summary>
        private static List<double> BuildThresholdGrid(double start, double end, double step)
        {
            List<double> values = new List<double>();
            for (double value = start; value < end; value += step)
            {
                values.Add(value);
            }

            return values;
        }
コード例 #4
0
        /// <summary>
        /// Loads the correlation matrix once and then repeatedly runs <c>ScanTrainData</c>,
        /// waiting for the user to press Enter between scans.
        /// </summary>
        /// <param name="programArgs">Program arguments forwarded to <c>LoadCorrelationMatrix</c>.</param>
        private static void CreateThresholdTrainingData(ProgramArguments programArgs)
        {
            CorrelationMatrix   correlationMatrix = LoadCorrelationMatrix(programArgs);
            SimilarityAlgorithm sim = new SimilarityAlgorithm(correlationMatrix);

            do
            {
                ScanTrainData(sim);
                Console.WriteLine("Press Enter to rescan");

                // Console.ReadLine returns null once standard input is closed or exhausted.
                // Without this check the loop would rescan, and write a new report, forever.
            }
            while (Console.ReadLine() != null);
        }
コード例 #5
0
        /// <summary>
        /// Reads a Wikipedia XML dump, skips the first 100 <c>text</c> elements, and for up to the
        /// next 100 writes the odds ratio between each document and the one before it (the first
        /// document is compared with itself) to a freshly named CSV file, one value per line.
        /// </summary>
        /// <param name="programArgs">Program arguments forwarded to <c>LoadCorrelationMatrix</c>.</param>
        private static void CreateSimilarityReport(ProgramArguments programArgs)
        {
            const int elementsToSkip    = 100;
            const int elementsToCompare = 100;

            CorrelationMatrix correlationMatrix = LoadCorrelationMatrix(programArgs);

            SimilarityAlgorithm sim = new SimilarityAlgorithm(correlationMatrix);

            // NOTE(review): machine-specific absolute path; consider sourcing it from programArgs.
            string wikipediaPath = @"C:\Users\haabu\Downloads\enwiki-latest-pages-articles.xml\enwiki-latest-pages-articles.xml";

            // The stream gets its own using: by default XmlReader.Create(Stream) does not close the
            // underlying stream when the reader is disposed. The dump is only read, so it is opened
            // read-only instead of with the read/write access of the two-argument constructor.
            using (FileStream wikipediaStream = new FileStream(wikipediaPath, FileMode.Open, FileAccess.Read))
            using (XmlReader sr = XmlReader.Create(wikipediaStream))
            {
                for (int i = 0; i < elementsToSkip; i++)
                {
                    if (!sr.ReadToFollowing("text"))
                    {
                        break;
                    }
                }

                string filename = "autorss_test_" + Guid.NewGuid().ToString() + ".csv";
                using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
                using (StreamWriter sw = new StreamWriter(fs))
                {
                    Document prevDocument = null;
                    for (int i = 0; i < elementsToCompare; i++)
                    {
                        // Once the reader runs out of "text" elements there is nothing left to
                        // compare, so stop instead of polling the exhausted reader.
                        if (!sr.ReadToFollowing("text"))
                        {
                            break;
                        }

                        sr.ReadStartElement();
                        string pageContents = sr.ReadContentAsString();

                        Document document = ConstructDocument(pageContents);

                        // The very first document has no predecessor and is compared with itself.
                        sw.WriteLine(sim.CalculateOddsRatio(document, prevDocument ?? document));
                        prevDocument = document;
                    }

                    // Disposing the writer flushes it, even if an exception is thrown above.
                }
            }
        }
コード例 #6
0
        /// <summary>
        /// Creates the similarity algorithm implementation that corresponds to the requested
        /// <see cref="SimilarityAlgorithm"/> value.
        /// </summary>
        /// <param name="userSimilarityAlgorithm">The algorithm to instantiate.</param>
        /// <returns>A new instance of the matching algorithm.</returns>
        /// <exception cref="ArgumentException">
        /// The value is neither <c>Euclidean</c> nor <c>Pearson</c>.
        /// </exception>
        public ISimilarityAlgorithm Create(SimilarityAlgorithm userSimilarityAlgorithm)
        {
            if (userSimilarityAlgorithm == SimilarityAlgorithm.Euclidean)
            {
                return new EuclideanDistanceSimilarityAlgorithm();
            }

            if (userSimilarityAlgorithm == SimilarityAlgorithm.Pearson)
            {
                return new PearsonCorrelationSimilarityAlgorithm();
            }

            string message = $"Unexpected value: '{userSimilarityAlgorithm}'";
            throw new ArgumentException(message, nameof(userSimilarityAlgorithm));
        }
コード例 #7
0
        /// <summary>
        /// Reads every file in the <c>TrainData</c> directory, builds a document from each, takes
        /// its statements two at a time (0 and 1, 2 and 3, ...) and writes a CSV report with both
        /// statements, the smaller of their two directional similarities and the absolute
        /// difference between those similarities.
        /// </summary>
        /// <param name="sim">Similarity algorithm used to compare the statements.</param>
        private static void ScanTrainData(SimilarityAlgorithm sim)
        {
            StringBuilder sb = new StringBuilder();

            foreach (string path in Directory.GetFiles("TrainData"))
            {
                string fileContents;
                using (StreamReader sr = new StreamReader(path))
                {
                    fileContents = sr.ReadToEnd();
                }

                Document document   = ConstructDocument(fileContents);
                var      statements = document.Statements;

                // Statements are consumed in non-overlapping pairs; a trailing unpaired one is ignored.
                for (int i = 0; i < statements.Length - 1; i += 2)
                {
                    Statement s1  = statements[i];
                    Statement s2  = statements[i + 1];
                    double    s12 = sim.StatementSimilarityToStatement(s1, s2);
                    double    s21 = sim.StatementSimilarityToStatement(s2, s1);

                    // Commas inside the statement text would shift the CSV columns, so they are
                    // replaced; the invariant culture keeps the numbers free of decimal commas.
                    sb.AppendFormat(
                        System.Globalization.CultureInfo.InvariantCulture,
                        "{0},{1},{2},{3}\r\n",
                        s1.ToString().Replace(',', '.'),
                        s2.ToString().Replace(',', '.'),
                        Math.Min(s12, s21),
                        Math.Abs(s12 - s21));
                }
            }

            string filename = "autorss_Threshold_" + Guid.NewGuid().ToString() + ".csv";

            // CreateNew fails rather than overwriting if the file somehow exists already.
            // Disposing the writer flushes it and closes the underlying stream.
            using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
            using (StreamWriter sw = new StreamWriter(fs))
            {
                sw.WriteLine(sb.ToString());
            }

            Console.WriteLine("Threshold Report: " + filename);
        }
コード例 #8
0
 /// <summary>
 /// Creates a categorizer that uses the supplied similarity algorithm.
 /// </summary>
 /// <param name="similarityAlgorithm">The algorithm instance stored for later use.</param>
 public DocumentCategorizer(SimilarityAlgorithm similarityAlgorithm)
 {
     this._similarity = similarityAlgorithm;
 }
コード例 #9
0
 /// <summary>
 /// Creates a categorizer whose similarity algorithm is built from the given correlation matrix.
 /// </summary>
 /// <param name="matrix">Correlation matrix passed to the <c>SimilarityAlgorithm</c> constructor.</param>
 public DocumentCategorizer(CorrelationMatrix matrix)
 {
     SimilarityAlgorithm algorithmFromMatrix = new SimilarityAlgorithm(matrix);
     this._similarity = algorithmFromMatrix;
 }
コード例 #10
0
 /// <summary>
 /// Requests the recommendations for a user and algorithm from
 /// <c>api/recommendations/{userId}/{algorithm}</c> and returns the first item of the
 /// client's response.
 /// </summary>
 /// <param name="userId">Identifier placed in the request path.</param>
 /// <param name="algorithm">Algorithm value placed in the request path.</param>
 /// <returns>The <c>Item1</c> component of the value returned by <c>client.Get</c>.</returns>
 private IEnumerable<Recommendation> GetRecommendations(int userId, SimilarityAlgorithm algorithm)
 {
     var response = client.Get<IEnumerable<Recommendation>>($"api/recommendations/{userId}/{algorithm}");

     return response.Item1;
 }
コード例 #11
0
        /// <summary>
        /// Scores every movie the given user has not rated, using the ratings of all other users
        /// weighted by how similar each of those users is to the given user.
        /// </summary>
        /// <param name="userToFindRecommendationsFor">The user to recommend movies to.</param>
        /// <param name="allUsers">All known users (may include the target user).</param>
        /// <param name="allMovies">All known movies.</param>
        /// <param name="allRatings">All known ratings (may include the target user's).</param>
        /// <param name="userSimilarityAlgorithm">Which similarity algorithm the factory should create.</param>
        /// <returns>
        /// A lazily evaluated sequence with one recommendation per unseen movie that at least one
        /// other user has rated; the score is the sum of similarity-weighted ratings divided by
        /// the sum of similarities.
        /// </returns>
        public IEnumerable<Recommendation> FindRecommendations(
            User userToFindRecommendationsFor,
            IEnumerable<User> allUsers,
            IEnumerable<Movie> allMovies,
            IEnumerable<Rating> allRatings,
            SimilarityAlgorithm userSimilarityAlgorithm)
        {
            var userRatingsSimilarityAlgorithm =
                this.userRatingsSimilarityAlgorithmFactory.Create(userSimilarityAlgorithm);

            var allOtherUsers =
                allUsers.Where(u => u.Id != userToFindRecommendationsFor.Id);
            var allOtherUsersRatings =
                allRatings.Where(r => r.UserId != userToFindRecommendationsFor.Id);

            // The target user's ratings keyed by movie. This doubles as the "already seen" set,
            // so deciding whether a movie is unseen is a dictionary lookup rather than a scan of
            // the user's ratings for every movie.
            var userToFindRecommendationsForValues = allRatings
                .Where(r => r.UserId == userToFindRecommendationsFor.Id)
                .ToDictionary(r => r.MovieId, r => (double)r.UserRating);

            var unseenMovies =
                allMovies.Where(m => !userToFindRecommendationsForValues.ContainsKey(m.Id));

            // Group the other users' ratings once instead of filtering the whole rating list
            // again for every user.
            var otherUsersRatingsByUser = allOtherUsersRatings.ToLookup(r => r.UserId);

            var userSimilarities = allOtherUsers.Select(u => new
            {
                UserId     = u.Id,
                Similarity = userRatingsSimilarityAlgorithm.CalculateSimilarity(
                    userToFindRecommendationsForValues,
                    otherUsersRatingsByUser[u.Id]
                        .ToDictionary(r => r.MovieId, r => (double)r.UserRating))
            });

            // Every (unseen movie, other user's rating) pair, annotated with that user's similarity.
            var movieSimilaritiesAndWeightedRatings = unseenMovies
                .Join(allOtherUsersRatings, m => m.Id, r => r.MovieId, (m, r) => new { m, r })
                .Join(userSimilarities, mr => mr.r.UserId, us => us.UserId, (mr, us) => new { mr, us })
                .Select(mrus => new
                {
                    mrus.mr.r.MovieId,
                    mrus.us.Similarity,
                    WeightedRating = mrus.mr.r.UserRating * mrus.us.Similarity,
                });

            var movieScores = movieSimilaritiesAndWeightedRatings
                .GroupBy(msawr => msawr.MovieId)
                .Select(g => new
                {
                    MovieId           = g.Key,
                    WeightedRatingSum = g.Sum(msawr => msawr.WeightedRating),
                    SimilaritySum     = g.Sum(msawr => msawr.Similarity)
                });

            // NOTE(review): when the similarities for a movie sum to zero this division does not
            // produce a meaningful score (NaN/Infinity if the values are floating point) - confirm
            // whether such movies should be filtered out.
            return movieScores
                .Join(allMovies, ms => ms.MovieId, m => m.Id, (ms, m) => new Recommendation
                {
                    MovieId   = m.Id,
                    MovieName = m.Name,
                    Score     = ms.WeightedRatingSum / ms.SimilaritySum
                });
        }