/// <summary>
/// Loads per-article word counts from the CSV file named by <c>options.DataFile</c>.
/// Each record is read as three integers: article id, word id, word count.
/// </summary>
/// <param name="options">Execution options; only <c>DataFile</c> is read here.</param>
/// <param name="articles">
/// Articles to fill in. The article id from the file is treated as 1-based and mapped
/// to index <c>id - 1</c> of this list.
/// </param>
/// <exception cref="ArgumentException">The data file does not exist.</exception>
private static void ReadData(ExecutionOption options, List<Article> articles)
{
    // Guard: bail out early when the input file is missing.
    if (!File.Exists(options.DataFile))
    {
        throw new ArgumentException(string.Format("File not found {0}", options.DataFile));
    }

    using (var stream = new StreamReader(options.DataFile))
    {
        var csv = new CsvReader(stream);
        csv.Configuration.HasHeaderRecord = false;

        while (csv.Read())
        {
            var docId = csv.GetField<int>(0);
            var termId = csv.GetField<int>(1);
            var occurrences = csv.GetField<int>(2);

            Article target = articles[docId - 1];
            target.WordCounts[termId] = occurrences;

            // Keep k (declared outside this method) at the largest word id seen so far.
            if (termId > k)
            {
                k = termId;
            }
        }
    }
}
/// <summary>
/// Reads group names from the CSV file named by <c>options.GroupsFile</c>,
/// taking the first field of every record.
/// </summary>
/// <param name="options">Execution options; only <c>GroupsFile</c> is read here.</param>
/// <returns>The group names in file order.</returns>
/// <exception cref="ArgumentException">The groups file does not exist.</exception>
private static List<string> ReadGroups(ExecutionOption options)
{
    if (!File.Exists(options.GroupsFile))
    {
        // Report the file that was actually checked (previously this printed LabelFile).
        throw new ArgumentException(string.Format("File not found {0}", options.GroupsFile));
    }

    List<string> groups = new List<string>();
    using (var sr = new StreamReader(options.GroupsFile))
    {
        var reader = new CsvReader(sr);
        reader.Configuration.HasHeaderRecord = false;
        while (reader.Read())
        {
            groups.Add(reader.GetField<string>(0));
        }
    }

    return groups;
}
/// <summary>
/// Builds one <c>Article</c> per record of the CSV file named by <c>options.LabelFile</c>.
/// The first field of each record is read as an integer label and stored, minus one,
/// as the article's <c>GroupId</c>.
/// </summary>
/// <param name="options">Execution options; only <c>LabelFile</c> is read here.</param>
/// <param name="groups">Not used by this method; kept so the signature stays unchanged.</param>
/// <returns>The articles in file order.</returns>
/// <exception cref="ArgumentException">The label file does not exist.</exception>
private static List<Article> ReadLabels(ExecutionOption options, List<string> groups)
{
    var result = new List<Article>();

    if (!File.Exists(options.LabelFile))
    {
        throw new ArgumentException(string.Format("File not found {0}", options.LabelFile));
    }

    using (var stream = new StreamReader(options.LabelFile))
    {
        var csv = new CsvReader(stream);
        csv.Configuration.HasHeaderRecord = false;

        while (csv.Read())
        {
            int rawLabel = csv.GetField<int>(0);

            // Shift the label from the file down by one to get the stored group id.
            var article = new Article();
            article.GroupId = rawLabel - 1;
            result.Add(article);
        }
    }

    return result;
}
/// <summary>
/// Program entry point: parses the command line into an <c>ExecutionOption</c>, loads
/// groups, labels and word counts, runs the selected execution mode and writes the
/// resulting group-by-group matrix to "output.csv".
/// Nothing is done when argument parsing fails.
/// </summary>
/// <param name="args">Raw command-line arguments.</param>
/// <exception cref="NotSupportedException">
/// The selected execution mode has no implementation in this method.
/// </exception>
static void Main(string[] args)
{
    ExecutionOption options = new ExecutionOption();
    if (CommandLine.Parser.Default.ParseArguments(args, options))
    {
        List<string> groups = ReadGroups(options);
        List<Article> articles = ReadLabels(options, groups);
        ReadData(options, articles);

        if (options.ExecutionMode == ExecutionMode.AllWayAverage)
        {
            WriteAllWayAverages(options, articles, groups.Count);
        }
        else if (options.ExecutionMode == ExecutionMode.MostNearestNeighbor)
        {
            int[,] count = CountNearestNeighborGroups(
                articles,
                groups.Count,
                (first, second) => GetDistance(first.WordCounts, second.WordCounts, options.SimilarityFunction));
            WriteCountMatrix(count, groups.Count);
        }
        else if (options.ExecutionMode == ExecutionMode.MostNearestNeighborWithRandomProjection)
        {
            // Project every article through a random Normal(0, 1) matrix with k + 1 columns
            // before comparing; DimensionReduceArticles is defined outside this block.
            var M = Matrix<double>.Build;
            var matrix = M.Random(options.RandomProjectionDimCount, k + 1, new Normal(0.0, 1.0));
            DimensionReduceArticles(articles, matrix);

            int[,] count = CountNearestNeighborGroups(
                articles,
                groups.Count,
                (first, second) => GetDistance(first.WordCountVector, second.WordCountVector, options.SimilarityFunction));
            WriteCountMatrix(count, groups.Count);
        }
        else if (options.ExecutionMode == ExecutionMode.RandomProjectionCosineScatter)
        {
            // This mode is recognised but has no implementation here.
            throw new NotSupportedException("Unknown execution mode");
        }
        else
        {
            throw new NotSupportedException("Unknown execution mode");
        }

        Console.WriteLine("----------------------------------------------------------------------");
    }
}

/// <summary>
/// Averages GetDistance over every unordered pair of articles, bucketed by the pair's
/// group ids, and writes the symmetric groupCount x groupCount matrix to "output.csv".
/// Cells with no contributing pair are written as 0.0.
/// </summary>
private static void WriteAllWayAverages(ExecutionOption options, List<Article> articles, int groupCount)
{
    double[,] sum = new double[groupCount, groupCount];
    int[,] count = new int[groupCount, groupCount];

    for (int i = 0; i < articles.Count; i++)
    {
        Article ai = articles[i];
        for (int j = i + 1; j < articles.Count; j++)
        {
            Article aj = articles[j];

            // Always accumulate into the upper triangle. Output only reads [min, max],
            // so without this a pair whose higher-group article comes first in the
            // list would be silently ignored.
            int low = Math.Min(ai.GroupId, aj.GroupId);
            int high = Math.Max(ai.GroupId, aj.GroupId);

            sum[low, high] += GetDistance(ai.WordCounts, aj.WordCounts, options.SimilarityFunction);
            count[low, high] += 1;
        }
    }

    using (var sr = new StreamWriter("output.csv"))
    {
        var writer = new CsvWriter(sr);
        for (int i = 0; i < groupCount; i++)
        {
            for (int j = 0; j < groupCount; j++)
            {
                int low = Math.Min(i, j);
                int high = Math.Max(i, j);

                // Divide only when at least one pair contributed to this cell.
                if (count[low, high] > 0)
                {
                    writer.WriteField(sum[low, high] / count[low, high]);
                }
                else
                {
                    writer.WriteField(0.0);
                }
            }
            writer.NextRecord();
        }
    }
}

/// <summary>
/// For each article, finds the article from a different group with the largest
/// similarity value (strictly greater than 0.0) and tallies
/// [article's group, that article's group]. Articles with no such match add nothing.
/// </summary>
private static int[,] CountNearestNeighborGroups(List<Article> articles, int groupCount, Func<Article, Article, double> similarity)
{
    int[,] count = new int[groupCount, groupCount];

    for (int i = 0; i < articles.Count; i++)
    {
        double maxSum = 0.0;
        int maxGroupId = -1;
        Article ai = articles[i];

        for (int j = 0; j < articles.Count; j++)
        {
            Article aj = articles[j];
            if (ai.GroupId == aj.GroupId)
            {
                // Only articles from other groups are candidates.
                continue;
            }

            double distance = similarity(ai, aj);
            if (distance > maxSum)
            {
                maxSum = distance;
                maxGroupId = aj.GroupId;
            }
        }

        if (maxGroupId >= 0)
        {
            count[ai.GroupId, maxGroupId] += 1;
        }
    }

    return count;
}

/// <summary>
/// Writes the top-left groupCount x groupCount cells of <paramref name="count"/> to
/// "output.csv", one CSV record per row.
/// </summary>
private static void WriteCountMatrix(int[,] count, int groupCount)
{
    using (var sr = new StreamWriter("output.csv"))
    {
        var writer = new CsvWriter(sr);
        for (int i = 0; i < groupCount; i++)
        {
            for (int j = 0; j < groupCount; j++)
            {
                writer.WriteField(count[i, j]);
            }
            writer.NextRecord();
        }
    }
}