Exemple #1
0
        /// <summary>
        /// Entry point: classifies every document in the test file against the training
        /// documents with a k-nearest-neighbour vote, then writes the per-document output
        /// and the confusion matrix and prints the elapsed time.
        /// </summary>
        /// <param name="args">
        /// Five positional arguments: training file path, test file path, k (number of
        /// neighbours), similarity code (1 = Euclidean, 2 = cosine) and the path handed
        /// to WriteSysOutput.
        /// </param>
        /// <exception cref="ArgumentException">k is not positive or the similarity code is not 1 or 2.</exception>
        /// <exception cref="FormatException">A test-file token is not of the form key:count.</exception>
        static void Main (string[] args)
        {
            if (args == null || args.Length < 5)
            {
                Console.Error.WriteLine("usage: <training_path> <test_path> <k_val> <sim_func: 1=euclidean, 2=cosine> <sys_output>");
                Environment.ExitCode = 1;
                return;
            }

            string trainingPath = args[0];
            string testPath = args[1];
            int k_val = Convert.ToInt32(args[2]);
            int simFunc = Convert.ToInt32(args[3]);
            string sysOutput = args[4];

            // Reject bad settings before any file is read instead of failing on the first test line.
            if (k_val < 1)
                throw new ArgumentException("k_val must be a positive integer");
            if (simFunc != 1 && simFunc != 2)
                throw new ArgumentException("invalid simfunc code");

            List<Document> TrainDocs = new List<Document>();
            int docid = 0;
            Stopwatch stopwatch = new Stopwatch();
            stopwatch.Start();
            List<String> Classes = ReadFile(trainingPath, ref docid, ref TrainDocs);

            List<TestDoc> TestingDoc = new List<TestDoc>();
            // Filled by the ProcessTrain* methods with "actual_predicted" -> count entries.
            Dictionary<String, int> ConfusionDict = new Dictionary<string, int>();
            string line;
            using (StreamReader Sr = new StreamReader(testPath))
            {
                while ((line = Sr.ReadLine()) != null)
                {
                    if (String.IsNullOrEmpty(line))
                        continue;
                    string[] words = line.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);
                    // A line made only of spaces yields no tokens; skip it rather than index words[0].
                    if (words.Length == 0)
                        continue;

                    // First token is passed to the TestDoc constructor; the rest are key:count pairs.
                    TestDoc test = new TestDoc(words[0]);
                    for (int i = 1; i < words.Length; i++)
                    {
                        int index = words[i].IndexOf(':');
                        if (index < 0)
                            throw new FormatException("malformed feature token '" + words[i] + "' (expected key:count)");
                        string key = words[i].Substring(0, index);
                        int value = Convert.ToInt32(words[i].Substring(index + 1));
                        if (test.wordCounts.ContainsKey(key))
                            test.wordCounts[key] += value;
                        else
                            test.wordCounts.Add(key, value);
                    }

                    // Square the merged counts, not the individual tokens: when a key repeats on a
                    // line, (a+b)^2 != a^2+b^2 and the norm would no longer match wordCounts.
                    // NOTE(review): ReadFile should derive the training norms the same way - confirm.
                    foreach (var count in test.wordCounts.Values)
                        test.SumSquared += System.Math.Pow(count, 2);

                    if (simFunc == 1)
                        ProcessTrainEuclidean(TrainDocs, k_val, ref test, ref ConfusionDict);
                    else
                        ProcessTrainCosine(TrainDocs, k_val, ref test, ref ConfusionDict);
                    TestingDoc.Add(test);
                }
            }

            WriteSysOutput(sysOutput, TestingDoc, Classes, k_val);
            WriteConfusionMatrix(Classes, ConfusionDict, "test", TestingDoc.Count);
            stopwatch.Stop();
            Console.WriteLine("Time elapsed: {0:hh\\:mm\\:ss}", stopwatch.Elapsed);
        }
Exemple #2
0
 /// <summary>
 /// Classifies <paramref name="test"/> by a majority vote among the training documents
 /// with the smallest squared Euclidean distance, and records the outcome in
 /// <paramref name="ConfusionDict"/>.
 /// </summary>
 /// <param name="TrainDocs">Training documents to compare against.</param>
 /// <param name="k_val">Number of nearest neighbours that vote; capped at the number of training documents.</param>
 /// <param name="test">Document to classify; its ClassCount and PredClass are overwritten.</param>
 /// <param name="ConfusionDict">Counter keyed by "actual_predicted"; the matching entry is incremented.</param>
 /// <exception cref="InvalidOperationException">No votes could be cast (no training documents or non-positive k).</exception>
 public static void ProcessTrainEuclidean (List<Document> TrainDocs, int k_val, ref TestDoc test, ref Dictionary<String, int> ConfusionDict)
 {
     List<Score> ScoreList = new List<Score>(TrainDocs.Count);
     foreach (var item in TrainDocs)
     {
         double dot = 0;
         // Intersect is a set operation and already yields each shared key once.
         foreach (var word in item.Keys.Intersect(test.wordCounts.Keys))
             dot += (item.wordCounts[word] * test.wordCounts[word]);
         // |a-b|^2 = |a|^2 + |b|^2 - 2*(a.b), so only the shared keys need visiting.
         ScoreList.Add(new Score(item.classLabel, test.SumSquared + item.SumSquared - (2 * dot)));
     }

     // Smallest distance first.
     ScoreList = ScoreList.OrderBy(s => s.ScoreValue).ToList();

     // Never index past the end when k exceeds the number of training documents.
     int neighbours = System.Math.Min(k_val, ScoreList.Count);
     string key;
     for (int i = 0; i < neighbours; i++)
     {
         key = ScoreList[i].classLabel;
         if (test.ClassCount.ContainsKey(key))
             test.ClassCount[key]++;
         else
             test.ClassCount.Add(key, 1);
     }

     // Rank the votes once and read the winner from the ranked list: Dictionary
     // enumeration order is unspecified, so Keys.First() on the rebuilt dictionary
     // is not guaranteed to be the top-voted class.
     var ranked = test.ClassCount.OrderByDescending(x => x.Value).ToList();
     if (ranked.Count == 0)
         throw new InvalidOperationException("no neighbours voted: k_val must be positive and TrainDocs must not be empty");
     test.ClassCount = ranked.ToDictionary(x => x.Key, x => x.Value);
     test.PredClass = ranked[0].Key;

     key = test.ActualClass + "_" + test.PredClass;
     if (ConfusionDict.ContainsKey(key))
         ConfusionDict[key]++;
     else
         ConfusionDict.Add(key, 1);
 }
Exemple #3
0
        /// <summary>
        /// Classifies <paramref name="test"/> by a majority vote among the training documents
        /// with the highest cosine similarity, and records the outcome in
        /// <paramref name="ConfusionDict"/>.
        /// </summary>
        /// <param name="TrainDocs">Training documents to compare against.</param>
        /// <param name="k_val">Number of nearest neighbours that vote; capped at the number of training documents.</param>
        /// <param name="test">Document to classify; its ClassCount and PredClass are overwritten.</param>
        /// <param name="ConfusionDict">Counter keyed by "actual_predicted"; the matching entry is incremented.</param>
        /// <exception cref="InvalidOperationException">No votes could be cast (no training documents or non-positive k).</exception>
        public static void ProcessTrainCosine (List<Document> TrainDocs, int k_val, ref TestDoc test, ref Dictionary<String, int> ConfusionDict)
        {
            List<Score> ScoreList = new List<Score>(TrainDocs.Count);
            // The test vector's length is the same for every comparison, so compute it once.
            double testNorm = System.Math.Sqrt(test.SumSquared);

            foreach (var item in TrainDocs)
            {
                double trainNorm = System.Math.Sqrt(item.SumSquared);
                double dot = 0;

                // Intersect is a set operation and already yields each shared key once.
                foreach (var word in item.Keys.Intersect(test.wordCounts.Keys))
                {
                    dot += (item.wordCounts[word] * test.wordCounts[word]);
                }

                // A zero-length vector would give 0/0 = NaN; score it as "no similarity" instead.
                double denominator = trainNorm * testNorm;
                double score = denominator > 0 ? dot / denominator : 0;
                ScoreList.Add(new Score(item.classLabel, score));
            }

            // Highest similarity first.
            ScoreList = ScoreList.OrderByDescending(s => s.ScoreValue).ToList();

            // Never index past the end when k exceeds the number of training documents.
            int neighbours = System.Math.Min(k_val, ScoreList.Count);
            string key;
            for (int i = 0; i < neighbours; i++)
            {
                key = ScoreList[i].classLabel;
                if (test.ClassCount.ContainsKey(key))
                    test.ClassCount[key]++;
                else
                    test.ClassCount.Add(key, 1);
            }

            // Rank the votes once and read the winner from the ranked list: Dictionary
            // enumeration order is unspecified, so Keys.First() on the rebuilt dictionary
            // is not guaranteed to be the top-voted class.
            var ranked = test.ClassCount.OrderByDescending(x => x.Value).ToList();
            if (ranked.Count == 0)
                throw new InvalidOperationException("no neighbours voted: k_val must be positive and TrainDocs must not be empty");
            test.ClassCount = ranked.ToDictionary(x => x.Key, x => x.Value);
            test.PredClass = ranked[0].Key;

            key = test.ActualClass + "_" + test.PredClass;
            if (ConfusionDict.ContainsKey(key))
                ConfusionDict[key]++;
            else
                ConfusionDict.Add(key, 1);
        }