/// <summary>
/// Searches for permissible/variation threshold values that bring the average
/// clustering error within a fixed goal, writing every probed combination and
/// its average error to a uniquely named CSV file.
/// </summary>
/// <param name="programArgs">Command-line arguments used to locate the correlation matrix.</param>
private static void FindBestThresholds(ProgramArguments programArgs)
{
    IEnumerable<DocumentCluster> originalClusters = GetSimilarNewsTopicFiles();
    IEnumerable<Document> documents = Flatten(originalClusters);
    CorrelationMatrix correlationMatrix = LoadCorrelationMatrix(programArgs);

    string fileName = Guid.NewGuid().ToString() + ".csv";
    using (StreamWriter sw = new StreamWriter(fileName))
    {
        // Optimization stops once the average error is within this goal.
        const double errorGoal = 0.01;

        // NOTE(review): an unused SortedDictionary<double, SortedDictionary<double, List<double>>>
        // was declared here in the original; nothing ever read or wrote it, so it was removed.
        ErrorOptimizer.Optimize(0, 1, 0, 1, (permissibleValue, variationValue) =>
        {
            SimilarityAlgorithm similarityAlgorithm =
                new SimilarityAlgorithm(correlationMatrix, permissibleValue, variationValue);
            DocumentCategorizer categorizer = new DocumentCategorizer(similarityAlgorithm);
            IEnumerable<DocumentCluster> resultClusters = categorizer.Cluster(documents);
            IEnumerable<DocumentClusterErrorScore> errorScores =
                CalculateErrorScore(originalClusters, resultClusters);

            double average = errorScores.Select(score => score.Value).Average();
            Console.WriteLine("Average Error: " + average);
            sw.WriteLine("{0}, {1}, {2}", permissibleValue, variationValue, average);

            // True terminates the optimizer: the error goal has been met.
            return Math.Abs(average) <= errorGoal;
        });
    }
}
/// <summary>
/// Sweeps the P (permissible) and V (variation) thresholds over [0, 1) in 0.01
/// steps, records the average clustering error for every combination, and saves
/// the resulting grid to a uniquely named CSV file (first row holds the V
/// values, first column the P values).
/// </summary>
/// <param name="programArgs">Command-line arguments used to locate the correlation matrix.</param>
private static void ExperimentPandVThresholds(ProgramArguments programArgs)
{
    IEnumerable<DocumentCluster> originalClusters = GetSimilarNewsTopicFiles();
    IEnumerable<Document> documents = Flatten(originalClusters);
    CorrelationMatrix correlationMatrix = LoadCorrelationMatrix(programArgs);

    double startP = 0, endP = 1, startV = 0, endV = 1;
    double step = 0.01;
    double[,] errorValues = new double[(int)((endP - startP) / step) + 1,
                                      (int)((endV - startV) / step) + 1];

    // Integer counters index the grid; the double accumulators walk the
    // threshold values themselves. Iteration count and values are identical
    // to the original, but the lossy double-to-int casts at every array
    // access are gone.
    int i = 0;
    for (double iP = startP; iP < endP; iP += step, i++)
    {
        int j = 0;
        for (double iV = startV; iV < endV; iV += step, j++)
        {
            SimilarityAlgorithm similarityAlgorithm =
                new SimilarityAlgorithm(correlationMatrix, iP, iV);
            DocumentCategorizer categorizer = new DocumentCategorizer(similarityAlgorithm);
            IEnumerable<DocumentCluster> resultClusters = categorizer.Cluster(documents);

            // FIX(review): the original passed (resultClusters, originalClusters),
            // the reverse of every other CalculateErrorScore call in this file
            // (FindBestThresholds, CategorizeLabeledNewsArticles). Normalized to
            // (original, result) — confirm against CalculateErrorScore's contract.
            IEnumerable<DocumentClusterErrorScore> errorScores =
                CalculateErrorScore(originalClusters, resultClusters);

            double average = errorScores.Select(score => score.Value).Average();
            Console.WriteLine("Average Error: " + average);
            errorValues[i, j] = Math.Abs(average);
        }
    }

    string fileName = Guid.NewGuid().ToString() + ".csv";
    using (StreamWriter sw = new StreamWriter(fileName))
    {
        // Header row: "0" placeholder for the corner cell, then every V value.
        sw.Write("0, ");
        for (double iV = startV; iV < endV; iV += step)
        {
            sw.Write("{0}, ", iV);
        }
        sw.WriteLine();

        // One row per P value: the P threshold followed by its error scores.
        int row = 0;
        for (double iP = startP; iP < endP; iP += step, row++)
        {
            sw.Write("{0}, ", iP);
            int col = 0;
            for (double iV = startV; iV < endV; iV += step, col++)
            {
                sw.Write("{0}, ", errorValues[row, col]);
            }
            sw.WriteLine();
        }
    }

    Logger.Log("Saved experiment to file: " + fileName);
}
/// <summary>
/// Clusters the labeled news articles using a categorizer built from the
/// loaded correlation matrix, outputs the resulting clusters, and returns the
/// error scores of the result against the original labeled clusters.
/// </summary>
/// <param name="programArgs">Command-line arguments used to locate the correlation matrix.</param>
/// <returns>Per-cluster error scores comparing the original and computed clusters.</returns>
private static IEnumerable<DocumentClusterErrorScore> CategorizeLabeledNewsArticles(ProgramArguments programArgs)
{
    IEnumerable<DocumentCluster> labeledClusters = GetSimilarNewsTopicFiles();
    IEnumerable<Document> allDocuments = Flatten(labeledClusters);
    CorrelationMatrix matrix = LoadCorrelationMatrix(programArgs);

    DocumentCategorizer categorizer = new DocumentCategorizer(matrix);
    IEnumerable<DocumentCluster> computedClusters = categorizer.Cluster(allDocuments);
    OutputClusters(computedClusters);

    return CalculateErrorScore(labeledClusters, computedClusters);
}