/// <summary>
/// Entry point. Loads the training documents, classifies every non-empty line of the test
/// file with <c>ProcessTrainEuclidean</c> or <c>ProcessTrainCosine</c>, then writes the system
/// output and the confusion matrix and prints the elapsed wall-clock time.
/// </summary>
/// <param name="args">
/// Five positional arguments:
/// [0] training file path, [1] test file path, [2] k (number of neighbours),
/// [3] similarity function code (1 = Euclidean, 2 = cosine), [4] path handed to <c>WriteSysOutput</c>.
/// </param>
/// <exception cref="ArgumentOutOfRangeException">k is smaller than 1.</exception>
/// <exception cref="ArgumentException">The similarity function code is neither 1 nor 2.</exception>
/// <exception cref="FormatException">A numeric argument or a feature token in the test file is malformed.</exception>
static void Main (string[] args)
{
    if (args == null || args.Length < 5)
    {
        Console.Error.WriteLine("usage: <training_data> <test_data> <k_val> <similarity_func: 1=Euclidean, 2=Cosine> <sys_output>");
        Environment.ExitCode = 1;
        return;
    }

    string trainingPath = args[0];
    string testPath = args[1];
    // The inputs are machine-readable, so parse numbers independently of the user's locale.
    int k_val = Convert.ToInt32(args[2], System.Globalization.CultureInfo.InvariantCulture);
    int simFunc = Convert.ToInt32(args[3], System.Globalization.CultureInfo.InvariantCulture);
    string sysOutput = args[4];

    // Fail fast, before the (potentially slow) training file is read.
    if (k_val < 1)
        throw new ArgumentOutOfRangeException("k_val", k_val, "k_val must be at least 1");
    if (simFunc != 1 && simFunc != 2)
        throw new ArgumentException("invalid simfunc code");

    List<Document> trainDocs = new List<Document>();
    int docid = 0;
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();

    // NOTE(review): ReadFile is defined elsewhere; its result is used below as the list of class labels.
    List<String> classes = ReadFile(trainingPath, ref docid, ref trainDocs);

    List<TestDoc> testDocs = new List<TestDoc>();
    Dictionary<String, int> confusionDict = new Dictionary<string, int>();

    using (StreamReader reader = new StreamReader(testPath))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (String.IsNullOrEmpty(line))
                continue;

            // Line layout: "<label> <feature>:<count> <feature>:<count> ..."
            string[] words = line.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);
            if (words.Length == 0)
                continue; // line consisted of spaces only

            TestDoc test = new TestDoc(words[0]);
            for (int i = 1; i < words.Length; i++)
            {
                int index = words[i].IndexOf(':');
                if (index < 0)
                    throw new FormatException("malformed feature '" + words[i] + "' (expected feature:count)");

                string key = words[i].Substring(0, index);
                int value = Convert.ToInt32(words[i].Substring(index + 1), System.Globalization.CultureInfo.InvariantCulture);

                double previous = 0;
                if (test.wordCounts.ContainsKey(key))
                {
                    previous = test.wordCounts[key];
                    test.wordCounts[key] += value;
                }
                else
                {
                    test.wordCounts.Add(key, value);
                }

                // Keep SumSquared equal to the squared norm of the *merged* counts: when a
                // feature repeats on a line, replace its old square with the new one instead
                // of adding the squares of the individual occurrences.
                double updated = previous + value;
                test.SumSquared += (updated * updated) - (previous * previous);
            }

            if (simFunc == 1)
                ProcessTrainEuclidean(trainDocs, k_val, ref test, ref confusionDict);
            else
                ProcessTrainCosine(trainDocs, k_val, ref test, ref confusionDict);

            testDocs.Add(test);
        }
    }

    WriteSysOutput(sysOutput, testDocs, classes, k_val);
    WriteConfusionMatrix(classes, confusionDict, "test", testDocs.Count);

    stopwatch.Stop();
    Console.WriteLine("Time elapsed: {0:hh\\:mm\\:ss}", stopwatch.Elapsed);
}
/// <summary>
/// Classifies <paramref name="test"/> by majority vote among the training documents with the
/// smallest squared Euclidean distance to it, and records the outcome.
/// </summary>
/// <param name="TrainDocs">Training documents to compare against.</param>
/// <param name="k_val">Number of nearest neighbours that vote; clamped to the number of training documents.</param>
/// <param name="test">
/// Document to classify. Its <c>ClassCount</c> is replaced by the votes ordered by descending
/// count and its <c>PredClass</c> is set to the winning class label.
/// </param>
/// <param name="ConfusionDict">Counter keyed by "actual_predicted"; the matching entry is incremented.</param>
/// <exception cref="InvalidOperationException">No neighbour could vote (no training documents, or k_val below 1).</exception>
public static void ProcessTrainEuclidean (List<Document> TrainDocs, int k_val, ref TestDoc test, ref Dictionary<String, int> ConfusionDict)
{
    List<Score> distances = new List<Score>(TrainDocs.Count);
    foreach (var item in TrainDocs)
    {
        // Only words present in both documents contribute to the dot product.
        // Intersect already yields each common key once, so no Distinct() is needed.
        double dot = 0;
        foreach (var word in item.Keys.Intersect(test.wordCounts.Keys))
        {
            // Widen before multiplying so large counts cannot overflow integer arithmetic.
            dot += (double)item.wordCounts[word] * test.wordCounts[word];
        }

        // |a - b|^2 = |a|^2 + |b|^2 - 2 * (a . b)
        double distance = test.SumSquared + item.SumSquared - (2 * dot);
        distances.Add(new Score(item.classLabel, distance));
    }

    // Smallest distance first; OrderBy is stable, so ties keep training order.
    List<Score> ranked = distances.OrderBy(s => s.ScoreValue).ToList();

    // Never read past the end of the list when k exceeds the training set size.
    int neighbours = System.Math.Min(k_val, ranked.Count);
    string key;
    for (int i = 0; i < neighbours; i++)
    {
        key = ranked[i].classLabel;
        if (test.ClassCount.ContainsKey(key))
            test.ClassCount[key]++;
        else
            test.ClassCount.Add(key, 1);
    }

    // Pick the winner from the ordered sequence itself: the enumeration order of a
    // Dictionary is not guaranteed, so reading it back from the rebuilt dictionary is unsafe.
    var rankedVotes = test.ClassCount.OrderByDescending(x => x.Value).ToList();
    if (rankedVotes.Count == 0)
        throw new InvalidOperationException("no neighbours available to vote: check k_val and the training data");

    test.PredClass = rankedVotes[0].Key;
    test.ClassCount = rankedVotes.ToDictionary(x => x.Key, x => x.Value);

    key = test.ActualClass + "_" + test.PredClass;
    if (ConfusionDict.ContainsKey(key))
        ConfusionDict[key]++;
    else
        ConfusionDict.Add(key, 1);
}
/// <summary>
/// Classifies <paramref name="test"/> by majority vote among the training documents with the
/// highest cosine similarity to it, and records the outcome.
/// </summary>
/// <param name="TrainDocs">Training documents to compare against.</param>
/// <param name="k_val">Number of nearest neighbours that vote; clamped to the number of training documents.</param>
/// <param name="test">
/// Document to classify. Its <c>ClassCount</c> is replaced by the votes ordered by descending
/// count and its <c>PredClass</c> is set to the winning class label.
/// </param>
/// <param name="ConfusionDict">Counter keyed by "actual_predicted"; the matching entry is incremented.</param>
/// <exception cref="InvalidOperationException">No neighbour could vote (no training documents, or k_val below 1).</exception>
public static void ProcessTrainCosine (List<Document> TrainDocs, int k_val, ref TestDoc test, ref Dictionary<String, int> ConfusionDict)
{
    // The test document's norm is the same for every comparison, so compute it once.
    double testNorm = System.Math.Sqrt(test.SumSquared);

    List<Score> similarities = new List<Score>(TrainDocs.Count);
    foreach (var item in TrainDocs)
    {
        double trainNorm = System.Math.Sqrt(item.SumSquared);

        // Only words present in both documents contribute to the dot product.
        // Intersect already yields each common key once, so no Distinct() is needed.
        double dot = 0;
        foreach (var word in item.Keys.Intersect(test.wordCounts.Keys))
        {
            // Widen before multiplying so large counts cannot overflow integer arithmetic.
            dot += (double)item.wordCounts[word] * test.wordCounts[word];
        }

        // cos(a, b) = (a . b) / (|a| * |b|). A zero-length vector would give 0/0 = NaN;
        // treat it as "no similarity" so the ranking only ever compares real numbers.
        double denominator = trainNorm * testNorm;
        double similarity = denominator > 0 ? dot / denominator : 0;
        similarities.Add(new Score(item.classLabel, similarity));
    }

    // Most similar first; OrderByDescending is stable, so ties keep training order.
    List<Score> ranked = similarities.OrderByDescending(s => s.ScoreValue).ToList();

    // Never read past the end of the list when k exceeds the training set size.
    int neighbours = System.Math.Min(k_val, ranked.Count);
    string key;
    for (int i = 0; i < neighbours; i++)
    {
        key = ranked[i].classLabel;
        if (test.ClassCount.ContainsKey(key))
            test.ClassCount[key]++;
        else
            test.ClassCount.Add(key, 1);
    }

    // Pick the winner from the ordered sequence itself: the enumeration order of a
    // Dictionary is not guaranteed, so reading it back from the rebuilt dictionary is unsafe.
    var rankedVotes = test.ClassCount.OrderByDescending(x => x.Value).ToList();
    if (rankedVotes.Count == 0)
        throw new InvalidOperationException("no neighbours available to vote: check k_val and the training data");

    test.PredClass = rankedVotes[0].Key;
    test.ClassCount = rankedVotes.ToDictionary(x => x.Key, x => x.Value);

    key = test.ActualClass + "_" + test.PredClass;
    if (ConfusionDict.ContainsKey(key))
        ConfusionDict[key]++;
    else
        ConfusionDict.Add(key, 1);
}