/// <summary>
/// Runs <c>ErrorOptimizer.Optimize</c> over the permissible/variation threshold ranges [0, 1] x [0, 1],
/// clustering the reference documents for every probed pair and recording the average error score.
/// </summary>
/// <remarks>
/// Every probe is appended as "permissible, variation, average" to a GUID-named CSV file created in the
/// current working directory. The callback tells the optimizer whether the absolute average error is
/// within the error goal (0.01).
/// </remarks>
/// <param name="programArgs">Arguments forwarded to <c>LoadCorrelationMatrix</c>.</param>
private static void FindBestThresholds(ProgramArguments programArgs)
{
    IEnumerable<DocumentCluster> originalClusters = GetSimilarNewsTopicFiles();
    IEnumerable<Document> documents = Flatten(originalClusters);
    CorrelationMatrix correlationMatrix = LoadCorrelationMatrix(programArgs);

    string fileName = Guid.NewGuid().ToString() + ".csv";
    using (StreamWriter sw = new StreamWriter(fileName))
    {
        double errorGoal = 0.01;

        ErrorOptimizer.Optimize(0, 1, 0, 1, (permissibleValue, variationValue) =>
        {
            // Re-cluster the same documents with the candidate thresholds.
            SimilarityAlgorithm similarityAlgorithm =
                new SimilarityAlgorithm(correlationMatrix, permissibleValue, variationValue);
            DocumentCategorizer categorizer = new DocumentCategorizer(similarityAlgorithm);
            IEnumerable<DocumentCluster> resultClusters = categorizer.Cluster(documents);

            // Score the produced clusters against the reference clusters.
            IEnumerable<DocumentClusterErrorScore> errorScores =
                CalculateErrorScore(originalClusters, resultClusters);
            double average = (from score in errorScores select score.Value).Average();

            Console.WriteLine("Average Error: " + average);
            sw.WriteLine("{0}, {1}, {2}", permissibleValue, variationValue, average);

            return Math.Abs(average) <= errorGoal;
        });
    }
}
/// <summary>
/// Writes a CSV report with one row per statement pair: both statements, the smaller of the two
/// directional similarities, and the absolute difference between the two directional similarities.
/// </summary>
/// <remarks>
/// Similarities are computed on the stemmed statements, while the original (unstemmed) statement text is
/// written to the report. The report goes to a GUID-named file in the current working directory and its
/// name is printed to the console.
/// </remarks>
/// <param name="pairs">Statement pairs to compare.</param>
/// <param name="programArgs">Arguments forwarded to <c>LoadCorrelationMatrix</c>.</param>
private static void OutputThresholdReport(List<Tuple<Statement, Statement>> pairs, ProgramArguments programArgs)
{
    CorrelationMatrix correlationMatrix = LoadCorrelationMatrix(programArgs);
    SimilarityAlgorithm sim = new SimilarityAlgorithm(correlationMatrix);
    StringBuilder sb = new StringBuilder();

    foreach (Tuple<Statement, Statement> pair in pairs)
    {
        Statement s1 = StemStatement(pair.Item1);
        Statement s2 = StemStatement(pair.Item2);
        double s12 = sim.StatementSimilarityToStatement(s1, s2);
        double s21 = sim.StatementSimilarityToStatement(s2, s1);

        // Four columns, four distinct placeholders: the last one must be {3}, otherwise the
        // minimum similarity is written twice and the asymmetry column is lost.
        sb.AppendFormat(
            "{0},{1},{2},{3}\r\n",
            // Commas inside statement text would break the CSV columns, so they become dots.
            pair.Item1.ToString().Replace(',', '.'),
            pair.Item2.ToString().Replace(',', '.'),
            Math.Min(s12, s21),
            Math.Abs(s12 - s21));
    }

    string reportName = "autoRSS_thresholdReport_" + Guid.NewGuid().ToString() + ".csv";
    using (StreamWriter sw = new StreamWriter(reportName))
    {
        sw.WriteLine(sb.ToString());
    }

    Console.WriteLine("Report: " + reportName);
}
/// <summary>
/// Sweeps the P and V thresholds from 0 towards 1 in steps of 0.01, clusters the reference documents
/// for every combination and stores the absolute average error score in a grid.
/// </summary>
/// <remarks>
/// The grid is written to a GUID-named CSV file in the current working directory: the first line holds
/// the V values, every following line starts with its P value. The file name is passed to
/// <c>Logger.Log</c> at the end. The grid is allocated with one extra slot per dimension.
/// </remarks>
/// <param name="programArgs">Arguments forwarded to <c>LoadCorrelationMatrix</c>.</param>
private static void ExperimentPandVThresholds(ProgramArguments programArgs)
{
    IEnumerable<DocumentCluster> referenceClusters = GetSimilarNewsTopicFiles();
    IEnumerable<Document> allDocuments = Flatten(referenceClusters);
    CorrelationMatrix matrix = LoadCorrelationMatrix(programArgs);

    double startP = 0;
    double endP = 1;
    double startV = 0;
    double endV = 1;
    double step = 0.01;

    int rowCapacity = (int)((endP - startP) / step) + 1;
    int columnCapacity = (int)((endV - startV) / step) + 1;
    double[,] errorGrid = new double[rowCapacity, columnCapacity];

    // Fill the grid: rows follow P, columns follow V.
    int row = 0;
    for (double p = startP; p < endP; p += step)
    {
        int column = 0;
        for (double v = startV; v < endV; v += step)
        {
            SimilarityAlgorithm algorithm = new SimilarityAlgorithm(matrix, p, v);
            DocumentCategorizer categorizer = new DocumentCategorizer(algorithm);
            IEnumerable<DocumentCluster> producedClusters = categorizer.Cluster(allDocuments);

            // NOTE(review): the produced clusters are passed first and the reference clusters second;
            // confirm this is the argument order CalculateErrorScore expects.
            IEnumerable<DocumentClusterErrorScore> scores =
                CalculateErrorScore(producedClusters, referenceClusters);
            double average = scores.Select(score => score.Value).Average();

            Console.WriteLine("Average Error: " + average);
            errorGrid[row, column] = Math.Abs(average);
            column++;
        }

        row++;
    }

    string fileName = Guid.NewGuid().ToString() + ".csv";
    using (StreamWriter writer = new StreamWriter(fileName))
    {
        // Header line: a corner cell followed by every V value.
        writer.Write("0, ");
        for (double v = startV; v < endV; v += step)
        {
            writer.Write("{0}, ", v);
        }

        writer.WriteLine();

        // One line per P value: the P value itself, then the errors for every V.
        int outRow = 0;
        for (double p = startP; p < endP; p += step)
        {
            writer.Write("{0}, ", p);

            int outColumn = 0;
            for (double v = startV; v < endV; v += step)
            {
                writer.Write("{0}, ", errorGrid[outRow, outColumn]);
                outColumn++;
            }

            writer.WriteLine();
            outRow++;
        }
    }

    Logger.Log("Saved experiment to file: " + fileName);
}
/// <summary>
/// Interactive loop that runs <c>ScanTrainData</c> and then waits for the user to press Enter
/// before scanning again.
/// </summary>
/// <remarks>
/// The loop ends when standard input reports end-of-input; otherwise it repeats indefinitely.
/// </remarks>
/// <param name="programArgs">Arguments forwarded to <c>LoadCorrelationMatrix</c>.</param>
private static void CreateThresholdTrainingData(ProgramArguments programArgs)
{
    CorrelationMatrix correlationMatrix = LoadCorrelationMatrix(programArgs);
    SimilarityAlgorithm sim = new SimilarityAlgorithm(correlationMatrix);

    do
    {
        ScanTrainData(sim);
        Console.WriteLine("Press Enter to rescan");

        // Console.ReadLine returns null when no more input is available (closed or exhausted
        // stdin). Without this check the loop would rescan continuously with no pause.
    }
    while (Console.ReadLine() != null);
}
/// <summary>
/// Reads "text" elements from a Wikipedia XML dump, skips the first 100 and, for up to the next 100,
/// writes the odds ratio between each document and the document before it to a CSV file.
/// </summary>
/// <remarks>
/// The first processed document has no predecessor and is compared with itself. The output file is
/// GUID-named and created in the current working directory.
/// </remarks>
/// <param name="programArgs">Arguments forwarded to <c>LoadCorrelationMatrix</c>.</param>
private static void CreateSimilarityReport(ProgramArguments programArgs)
{
    CorrelationMatrix correlationMatrix = LoadCorrelationMatrix(programArgs);
    SimilarityAlgorithm sim = new SimilarityAlgorithm(correlationMatrix);

    // NOTE(review): machine-specific absolute path; consider sourcing it from programArgs.
    string wikipediaPath = @"C:\Users\haabu\Downloads\enwiki-latest-pages-articles.xml\enwiki-latest-pages-articles.xml";

    // The stream gets its own using: an XmlReader created with default settings does not close
    // the underlying stream when it is disposed.
    using (FileStream wikipediaStream = new FileStream(wikipediaPath, FileMode.Open))
    using (XmlReader sr = XmlReader.Create(wikipediaStream))
    {
        // Skip first 100
        for (int i = 0; i < 100; i++)
        {
            bool elementFound = sr.ReadToFollowing("text");
            if (!elementFound)
            {
                break;
            }
        }

        string filename = "autorss_test_" + Guid.NewGuid().ToString() + ".csv";
        using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
        using (StreamWriter sw = new StreamWriter(fs))
        {
            Document prevDocument = null;
            for (int i = 0; i < 100; i++)
            {
                bool elementFound = sr.ReadToFollowing("text");
                if (!elementFound)
                {
                    continue;
                }

                // Step inside the element and read its text content.
                sr.ReadStartElement();
                string pageContents = sr.ReadContentAsString();

                Document document = ConstructDocument(pageContents);
                if (null == prevDocument)
                {
                    // No predecessor yet: compare the first document with itself.
                    prevDocument = document;
                }

                sw.WriteLine(sim.CalculateOddsRatio(document, prevDocument));
                prevDocument = document;
            }

            // Disposing the writer flushes it, also when an exception unwinds this block.
        }
    }
}
/// <summary>
/// Creates the similarity algorithm implementation that corresponds to the requested kind.
/// </summary>
/// <param name="userSimilarityAlgorithm">The kind of similarity algorithm to create.</param>
/// <returns>A new algorithm instance for <paramref name="userSimilarityAlgorithm"/>.</returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="userSimilarityAlgorithm"/> is neither Euclidean nor Pearson.
/// </exception>
public ISimilarityAlgorithm Create(SimilarityAlgorithm userSimilarityAlgorithm)
{
    if (userSimilarityAlgorithm == SimilarityAlgorithm.Euclidean)
    {
        return new EuclideanDistanceSimilarityAlgorithm();
    }

    if (userSimilarityAlgorithm == SimilarityAlgorithm.Pearson)
    {
        return new PearsonCorrelationSimilarityAlgorithm();
    }

    throw new ArgumentException(
        $"Unexpected value: '{userSimilarityAlgorithm}'",
        nameof(userSimilarityAlgorithm));
}
/// <summary>
/// Reads every file in the "TrainData" directory, compares consecutive statement pairs of each
/// document and writes the results to a GUID-named CSV file in the current working directory.
/// </summary>
/// <remarks>
/// Statements are paired as (0,1), (2,3), ...; a trailing statement without a partner is ignored.
/// Each row holds both statements, the smaller of the two directional similarities and the absolute
/// difference between them. The report file name is printed to the console.
/// </remarks>
/// <param name="sim">Similarity algorithm used to compare the statements.</param>
private static void ScanTrainData(SimilarityAlgorithm sim)
{
    StringBuilder sb = new StringBuilder();

    foreach (string path in Directory.GetFiles("TrainData"))
    {
        string fileContents;
        using (StreamReader sr = new StreamReader(path))
        {
            fileContents = sr.ReadToEnd();
        }

        Document document = ConstructDocument(fileContents);
        for (int i = 0; i < document.Statements.Length - 1; i += 2)
        {
            Statement s1 = document.Statements[i];
            Statement s2 = document.Statements[i + 1];
            double s12 = sim.StatementSimilarityToStatement(s1, s2);
            double s21 = sim.StatementSimilarityToStatement(s2, s1);

            sb.AppendFormat(
                "{0},{1},{2},{3}\r\n",
                // Commas inside statement text would shift the CSV columns, so they become dots
                // (the same neutralisation OutputThresholdReport applies).
                s1.ToString().Replace(',', '.'),
                s2.ToString().Replace(',', '.'),
                Math.Min(s12, s21),
                Math.Abs(s12 - s21));
        }
    }

    string filename = "autorss_Threshold_" + Guid.NewGuid().ToString() + ".csv";
    using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
    using (StreamWriter sw = new StreamWriter(fs))
    {
        // Disposing the writer flushes it, also when an exception unwinds this block.
        sw.WriteLine(sb.ToString());
    }

    Console.WriteLine("Threshold Report: " + filename);
}
/// <summary>
/// Creates a categorizer that uses the supplied similarity algorithm.
/// </summary>
/// <param name="similarityAlgorithm">Algorithm instance stored by this categorizer.</param>
public DocumentCategorizer(SimilarityAlgorithm similarityAlgorithm)
{
    this._similarity = similarityAlgorithm;
}
/// <summary>
/// Creates a categorizer whose similarity algorithm is built from the given correlation matrix
/// using the single-argument <c>SimilarityAlgorithm</c> constructor.
/// </summary>
/// <param name="matrix">Correlation matrix handed to the new similarity algorithm.</param>
public DocumentCategorizer(CorrelationMatrix matrix)
{
    SimilarityAlgorithm algorithmFromMatrix = new SimilarityAlgorithm(matrix);
    _similarity = algorithmFromMatrix;
}
/// <summary>
/// Requests the recommendations for a user from the "api/recommendations/{userId}/{algorithm}"
/// endpoint through <c>client</c>.
/// </summary>
/// <param name="userId">Id placed in the request path.</param>
/// <param name="algorithm">Similarity algorithm placed in the request path.</param>
/// <returns>The first item (<c>Item1</c>) of the value returned by <c>client.Get</c>.</returns>
private IEnumerable<Recommendation> GetRecommendations(int userId, SimilarityAlgorithm algorithm)
{
    var response = client.Get<IEnumerable<Recommendation>>($"api/recommendations/{userId}/{algorithm}");
    IEnumerable<Recommendation> recommendations = response.Item1;
    return recommendations;
}
/// <summary>
/// Scores every movie the given user has not rated by combining the ratings of all other users,
/// each weighted by that user's similarity to the given user.
/// </summary>
/// <remarks>
/// A movie's score is the sum of (rating * similarity) over the other users who rated it, divided by
/// the sum of those similarities. Movies that no other user has rated produce no recommendation.
/// The returned sequence is evaluated lazily.
/// </remarks>
/// <param name="userToFindRecommendationsFor">User to produce recommendations for.</param>
/// <param name="allUsers">All known users; the target user is excluded internally.</param>
/// <param name="allMovies">All known movies.</param>
/// <param name="allRatings">All known ratings, for every user.</param>
/// <param name="userSimilarityAlgorithm">Kind of similarity algorithm requested from the factory.</param>
/// <returns>One recommendation per scored movie, carrying the movie id, name and score.</returns>
public IEnumerable<Recommendation> FindRecommendations(
    User userToFindRecommendationsFor,
    IEnumerable<User> allUsers,
    IEnumerable<Movie> allMovies,
    IEnumerable<Rating> allRatings,
    SimilarityAlgorithm userSimilarityAlgorithm)
{
    var userRatingsSimilarityAlgorithm =
        this.userRatingsSimilarityAlgorithmFactory.Create(userSimilarityAlgorithm);

    // Enumerate the ratings once and index them by user, so that neither the "unseen" test nor the
    // per-user similarity input has to rescan the complete rating sequence.
    var ratings = allRatings.ToList();
    var ratingsByUser = ratings.ToLookup(r => r.UserId);

    var userToFindRecommendationsForValues = ratingsByUser[userToFindRecommendationsFor.Id]
        .ToDictionary(r => r.MovieId, r => (double)r.UserRating);

    var allOtherUsers = allUsers.Where(u => u.Id != userToFindRecommendationsFor.Id);
    var allOtherUsersRatings = ratings.Where(r => r.UserId != userToFindRecommendationsFor.Id);

    // A movie is unseen when the target user has no rating for it.
    var unseenMovies = allMovies.Where(m => !userToFindRecommendationsForValues.ContainsKey(m.Id));

    var userSimilarities = allOtherUsers
        .Select(u => new
        {
            UserId = u.Id,
            Similarity = userRatingsSimilarityAlgorithm.CalculateSimilarity(
                userToFindRecommendationsForValues,
                ratingsByUser[u.Id].ToDictionary(r => r.MovieId, r => (double)r.UserRating))
        });

    // Pair every unseen movie with each rating it received from another user and with that
    // user's similarity to the target user.
    var movieSimilaritiesAndWeightedRatings = unseenMovies
        .Join(allOtherUsersRatings, m => m.Id, r => r.MovieId, (m, r) => new { m, r })
        .Join(userSimilarities, mr => mr.r.UserId, us => us.UserId, (mr, us) => new { mr, us })
        .Select(mrus => new
        {
            mrus.mr.r.MovieId,
            mrus.us.Similarity,
            WeightedRating = mrus.mr.r.UserRating * mrus.us.Similarity,
        });

    var movieScores = movieSimilaritiesAndWeightedRatings
        .GroupBy(msawr => msawr.MovieId)
        .Select(g => new
        {
            MovieId = g.Key,
            WeightedRatingSum = g.Sum(msawr => msawr.WeightedRating),
            SimilaritySum = g.Sum(msawr => msawr.Similarity)
        });

    // NOTE(review): SimilaritySum can be zero (for example when every contributing similarity is
    // zero); the division below is not guarded against that - confirm the desired handling.
    return movieScores
        .Join(allMovies, ms => ms.MovieId, m => m.Id, (ms, m) => new Recommendation
        {
            MovieId = m.Id,
            MovieName = m.Name,
            Score = ms.WeightedRatingSum / ms.SimilaritySum
        });
}