示例#1
0
        /// <summary>
        /// Program entry point. Builds the training data with trainModel, classifies the
        /// test set with either KNN (when KNN_MODE is defined at compile time) or SVM, and
        /// passes the collected results to genStatistic.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            int dicSize = 100;

            // Containers handed to trainModel by reference so it can fill them in.
            var trainingDocs   = new List <Dictionary <string, double> >();
            var featureIndex   = new Dictionary <string, int>();
            var trainingLabels = new List <int>();
            var idfByWord      = new Dictionary <string, double>();

            // Collects the file name of every test document, in the order they are classified.
            var testFileNames = new List <string>();

            Hashtable stopWords = genStopwordTable(STOP_WORD_PATH);

            Console.WriteLine("==> Starting prepare data...");
            NLPAdapter tagger = new NLPAdapter(NLP_MODEL_PATH);

            trainModel(TRAINING_DATA_DIR, LOG_DIR, ref trainingDocs, ref featureIndex, dicSize,
                       ref trainingLabels, ref idfByWord, stopWords, tagger);

#if KNN_MODE
            KNN classifier = new KNN();
            classifier.set(dicSize, trainingDocs.Count);
            classifier.initial(trainingDocs, featureIndex, trainingLabels);
            classifier.train(3, 20);
            classifier.getAveDistance();

            List <KeyValuePair <int, int> > results = runKnnTest(
                classifier, TEST_DATA_DIR, TEST_LOG_DIR, featureIndex, idfByWord,
                stopWords, ref testFileNames, tagger);
#else
            Console.WriteLine("==> Starting get model...");
            SVMAdapter classifier = new SVMAdapter();
            svm_model  trained    = classifier.getSVMModel(
                trainingDocs, featureIndex, trainingLabels,
                SVMAdapter.SVM_C_DEFAULT, SVMAdapter.SVM_GAMMA_DEFAULT);

            Console.WriteLine("==> Starting SVM test...");
            List <KeyValuePair <int, int> > results = runSVMTest(
                classifier, TEST_DATA_DIR, TEST_LOG_DIR, featureIndex, idfByWord,
                stopWords, ref testFileNames, trained, tagger);
            Console.WriteLine("==> Starting SVM test done!!");
#endif

            Console.WriteLine("==> Starting saving result...");
            genStatistic(results, testFileNames, LOG_DIR);
        }
示例#2
0
        /// <summary>
        /// Reads one document and returns its weighted word counts.
        /// </summary>
        /// <remarks>
        /// The file is consumed in two phases.
        /// Header phase: while lines satisfy isColumn, each one is split into a column name and
        /// its content. The columns "subject", "path", "newsgroups" and "from" are tokenized with
        /// their own splitter and weighted with the entry of <c>weight</c> of the same name; every
        /// other column first goes through processSpecialField and is weighted as "word".
        /// Empty lines that are not columns are skipped; the first non-empty line that is not a
        /// column ends the header.
        /// Body phase: that line and every following line go through processSpecialField (and,
        /// when USE_POSTAG is defined, nlpAdapter.getFilterResult) and are weighted as "word".
        /// </remarks>
        /// <param name="path">Path of the document to read.</param>
        /// <param name="stopwordTable">Stop-word table forwarded to getWord.</param>
        /// <param name="nlpAdapter">Adapter used to filter body lines when USE_POSTAG is defined.</param>
        /// <returns>Map from cleaned word to its accumulated weight in this document.</returns>
        private static Dictionary <string, double> readDoc(string path, Hashtable stopwordTable, NLPAdapter nlpAdapter)
        {
            Dictionary <string, double> docWordCount = new Dictionary <string, double>();

            // Tag set passed to the filter below; it is only consumed when USE_POSTAG is defined.
            Dictionary <String, String> posTags = new Dictionary <String, String>();
            posTags.Add(NLPAdapter.POS_TAG_NN, NLPAdapter.POS_TAG_NN);
            posTags.Add(NLPAdapter.POS_TAG_NNS, NLPAdapter.POS_TAG_NNS);
            posTags.Add(NLPAdapter.POS_TAG_NNP, NLPAdapter.POS_TAG_NNP);
            posTags.Add(NLPAdapter.POS_TAG_NNPS, NLPAdapter.POS_TAG_NNPS);

            // The using block guarantees the file handle is released, including when a helper throws.
            using (StreamReader docFile = new StreamReader(path))
            {
                string line;

                // Header phase.
                while ((line = docFile.ReadLine()) != null)
                {
                    if (!isColumn(line))
                    {
                        if (line.Length != 0)
                        {
                            // First body line: leave it in 'line' for the body phase.
                            break;
                        }
                        continue;
                    }

                    string column  = getColumnName(line);
                    string content = getColumnContent(line);

                    switch (column)
                    {
                        case "subject":
                            addWeightedWords(splitLine(content), "subject", stopwordTable, docWordCount);
                            break;

                        case "path":
                            addWeightedWords(splitPath(content), "path", stopwordTable, docWordCount);
                            break;

                        case "newsgroups":
                            addWeightedWords(splitNewsgroup(content), "newsgroups", stopwordTable, docWordCount);
                            break;

                        case "from":
                            addWeightedWords(getEmail(content), "from", stopwordTable, docWordCount);
                            break;

                        default:
                            content = processSpecialField(content, ref docWordCount);
                            addWeightedWords(splitLine(content), "word", stopwordTable, docWordCount);
                            break;
                    }
                }

                // Body phase: 'line' is null at end of file, otherwise the first body line.
                while (line != null)
                {
                    line = processSpecialField(line, ref docWordCount);

#if USE_POSTAG
                    // Sango: just left noun.
                    line = nlpAdapter.getFilterResult(line, posTags);
#endif

                    addWeightedWords(splitLine(line), "word", stopwordTable, docWordCount);
                    line = docFile.ReadLine();
                }
            }

            return(docWordCount);
        }

        // Cleans every raw token with getWord and, for each token that survives (non-null),
        // adds weight[weightKey] to that word's entry in docWordCount.
        private static void addWeightedWords(System.Collections.IEnumerable rawWords, string weightKey, Hashtable stopwordTable, Dictionary <string, double> docWordCount)
        {
            foreach (string rawWord in rawWords)
            {
                string word = getWord(rawWord, stopwordTable);
                if (word == null)
                {
                    continue;
                }

                double current;
                if (docWordCount.TryGetValue(word, out current))
                {
                    docWordCount[word] = current + weight[weightKey];
                }
                else
                {
                    docWordCount.Add(word, weight[weightKey]);
                }
            }
        }
示例#3
0
        /// <summary>
        /// Reads every file directly inside one category directory and returns, per word, the
        /// number of documents of that category in which the word has a non-zero weight.
        /// </summary>
        /// <param name="path">Directory holding the documents of one category.</param>
        /// <param name="categoryIndex">Label appended to <paramref name="trainingAnswer"/> once per document.</param>
        /// <param name="docWordDicList">Receives the readDoc result of each document, in file order.</param>
        /// <param name="trainingAnswer">Receives <paramref name="categoryIndex"/> once per document.</param>
        /// <param name="stopwordTable">Stop-word table forwarded to readDoc.</param>
        /// <param name="nlpAdapter">Adapter forwarded to readDoc.</param>
        /// <returns>Map from word to the count of documents in this category containing it.</returns>
        private static Dictionary <string, double> readCategory(string path, int categoryIndex, ref List <Dictionary <string, double> > docWordDicList, ref List <int> trainingAnswer, Hashtable stopwordTable, NLPAdapter nlpAdapter)
        {
            var documentFrequency = new Dictionary <string, double>();

            foreach (string docPath in Directory.GetFiles(path))
            {
                // The label is recorded before the document is parsed.
                trainingAnswer.Add(categoryIndex);

                Dictionary <string, double> wordsInDoc = readDoc(docPath, stopwordTable, nlpAdapter);
                docWordDicList.Add(wordsInDoc);

                foreach (KeyValuePair <string, double> entry in wordsInDoc)
                {
                    // Entries whose weight is exactly zero do not count as an occurrence.
                    if (entry.Value == 0)
                    {
                        continue;
                    }

                    // Each document contributes at most 1 per word, whatever its weight.
                    double docsSoFar;
                    if (documentFrequency.TryGetValue(entry.Key, out docsSoFar))
                    {
                        documentFrequency[entry.Key] = docsSoFar + 1;
                    }
                    else
                    {
                        documentFrequency.Add(entry.Key, 1);
                    }
                }
            }

            return documentFrequency;
        }
示例#4
0
        /// <summary>
        /// Builds all training artefacts from the category sub-directories of
        /// <paramref name="trainPath"/>.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. readCategory is called once per sub-directory; the directory's position in the
        ///    Directory.GetDirectories result is used as its label.
        /// 2. <paramref name="wordIDFDictionary"/> is set to log(categoryCount / categoriesContainingWord).
        /// 3. Words of each category are ranked by (count / category total) * IDF, descending.
        /// 4. Candidates are picked round-robin across categories, rank by rank, until
        ///    dicSize * 2 distinct words are chosen; each pick is written to dictionary.csv
        ///    under <paramref name="logPath"/>.
        /// 5. trainCooccurrence receives the candidates and its result becomes <paramref name="dictionary"/>.
        /// 6. Every non-zero entry of every document in <paramref name="docWordDicList"/> is
        ///    replaced in place by (weight / document total) * IDF.
        /// </remarks>
        /// <param name="trainPath">Directory whose sub-directories are the training categories.</param>
        /// <param name="logPath">Directory that receives dictionary.csv; also forwarded to trainCooccurrence.</param>
        /// <param name="docWordDicList">Filled with one word map per training document, then rescaled in step 6.</param>
        /// <param name="dictionary">Replaced by the result of trainCooccurrence.</param>
        /// <param name="dicSize">Half of the number of candidate words selected in step 4; also forwarded to trainCooccurrence.</param>
        /// <param name="trainingAnswer">Filled with one label per training document.</param>
        /// <param name="wordIDFDictionary">Filled with the per-word IDF values.</param>
        /// <param name="stopwordTable">Stop-word table forwarded to readCategory.</param>
        /// <param name="nlpAdapter">Adapter forwarded to readCategory.</param>
        private static void trainModel(string trainPath, string logPath, ref List <Dictionary <string, double> > docWordDicList, ref Dictionary <string, int> dictionary, int dicSize, ref List <int> trainingAnswer, ref Dictionary <string, double> wordIDFDictionary, Hashtable stopwordTable, NLPAdapter nlpAdapter)
        {
            List <Dictionary <string, double> > categoryWordCountList = new List <Dictionary <string, double> >();
            Dictionary <string, int>            tempDictionary        = new Dictionary <string, int>();

            // Step 1: traverse categories; this also generates trainingAnswer.
            string[] categories = Directory.GetDirectories(trainPath);
            for (int i = 0; i < categories.Length; i++)
            {
                categoryWordCountList.Add(readCategory(categories[i], i, ref docWordDicList, ref trainingAnswer, stopwordTable, nlpAdapter));
            }

            // Step 2a: count, per word, the number of categories that contain it.
            foreach (Dictionary <string, double> categoryWordCount in categoryWordCountList)
            {
                foreach (string word in categoryWordCount.Keys)
                {
                    if (wordIDFDictionary.ContainsKey(word))
                    {
                        wordIDFDictionary[word] += 1;
                    }
                    else
                    {
                        wordIDFDictionary.Add(word, 1);
                    }
                }
            }

            // Step 2b: turn the counts into IDF values. The keys are copied first because the
            // dictionary is written to while they are walked.
            int categoryCount = categoryWordCountList.Count;
            foreach (string key in wordIDFDictionary.Keys.ToArray())
            {
                // int / double: this is a floating-point division.
                wordIDFDictionary[key] = Math.Log(categoryCount / wordIDFDictionary[key]);
            }

            // Step 3: rank the words of every category by TF-IDF, highest first.
            List <List <KeyValuePair <string, double> > > sortedCategoryTFIDFList = new List <List <KeyValuePair <string, double> > >();
            foreach (Dictionary <string, double> categoryWordCount in categoryWordCountList)
            {
                double categoryWordCountSum = 0;
                foreach (KeyValuePair <string, double> entry in categoryWordCount)
                {
                    categoryWordCountSum += entry.Value;
                }

                List <KeyValuePair <string, double> > sortedCategoryTFIDF = new List <KeyValuePair <string, double> >();
                foreach (KeyValuePair <string, double> entry in categoryWordCount)
                {
                    sortedCategoryTFIDF.Add(new KeyValuePair <string, double>(entry.Key, (entry.Value / categoryWordCountSum) * wordIDFDictionary[entry.Key]));
                }
                sortedCategoryTFIDF.Sort((a, b) => b.Value.CompareTo(a.Value));
                sortedCategoryTFIDFList.Add(sortedCategoryTFIDF);
            }

            // Step 4: pick candidates round-robin and log them. The using block closes (and
            // flushes) dictionary.csv on every path, before trainCooccurrence runs.
            int dicLimit = dicSize * 2;
            int dicCount = 0;
            using (StreamWriter dicFile = new StreamWriter(logPath + "\\" + "dictionary.csv"))
            {
                for (int rank = 0; rank < dicLimit && dicCount < dicLimit; rank++)
                {
                    for (int j = 0; j < sortedCategoryTFIDFList.Count && dicCount < dicLimit; j++)
                    {
                        List <KeyValuePair <string, double> > ranked = sortedCategoryTFIDFList[j];
                        if (rank >= ranked.Count)
                        {
                            // This category has no word left at this rank; skip it instead of
                            // indexing past the end of its list.
                            continue;
                        }

                        KeyValuePair <string, double> candidate = ranked[rank];
                        if (!tempDictionary.ContainsKey(candidate.Key))
                        {
                            dicFile.WriteLine(candidate.Key + "," + candidate.Value);
                            tempDictionary.Add(candidate.Key, dicCount);
                            dicCount++;
                        }
                    }
                }
            }

            // Step 5.
            dictionary = trainCooccurrence(logPath, ref categoryWordCountList, ref docWordDicList, ref wordIDFDictionary, trainingAnswer, tempDictionary, dicSize);

            // Step 6: rescale every document to TF-IDF in place.
            for (int i = 0; i < docWordDicList.Count; i++)
            {
                Dictionary <string, double> docWordDic = docWordDicList[i];

                // Keys are copied because values are overwritten while they are walked.
                string[] words           = docWordDic.Keys.ToArray();
                double   docWordCountSum = 0;
                foreach (string word in words)
                {
                    docWordCountSum += docWordDic[word];
                }
                foreach (string word in words)
                {
                    if (docWordDic[word] != 0)
                    {
                        docWordDic[word] = (docWordDic[word] / docWordCountSum) * wordIDFDictionary[word];
                    }
                }
            }
        }
示例#5
0
        /// <summary>
        /// Classifies every file in every category sub-directory of <paramref name="testPath"/>
        /// with the SVM model and returns the (predicted, actual) pairs.
        /// </summary>
        /// <param name="svmAdapter">Adapter whose runSVMTest performs the prediction.</param>
        /// <param name="testPath">Directory whose sub-directories are the test categories.</param>
        /// <param name="logPath">Not used by this method.</param>
        /// <param name="dictionary">Dictionary forwarded to the adapter.</param>
        /// <param name="wordIDFDictionary">IDF values forwarded to the adapter.</param>
        /// <param name="stopWordTable">Stop-word table forwarded to readDoc.</param>
        /// <param name="testFileNameList">Receives the file name of each test document, in processing order.</param>
        /// <param name="model">Model forwarded to the adapter.</param>
        /// <param name="nlpAdapter">Adapter forwarded to readDoc.</param>
        /// <returns>
        /// One pair per test file: Key is the adapter's result, Value is the index of the
        /// file's category directory in the Directory.GetDirectories result.
        /// </returns>
        private static List <KeyValuePair <int, int> > runSVMTest(SVMAdapter svmAdapter, string testPath, string logPath, Dictionary <string, int> dictionary, Dictionary <string, double> wordIDFDictionary, Hashtable stopWordTable, ref List <string> testFileNameList, svm_model model, NLPAdapter nlpAdapter)
        {
            string[] categoryDirs = Directory.GetDirectories(testPath);
            var      outcomes     = new List <KeyValuePair <int, int> >();

            for (int categoryIndex = 0; categoryIndex < categoryDirs.Length; categoryIndex++)
            {
                string categoryDir = categoryDirs[categoryIndex];
                Console.WriteLine(categoryDir);

                foreach (string file in Directory.GetFiles(categoryDir))
                {
                    // The file name is recorded before the document is read and classified.
                    testFileNameList.Add(Path.GetFileName(file));

                    Dictionary <string, double> docWordDic = readDoc(file, stopWordTable, nlpAdapter);
                    int predicted = svmAdapter.runSVMTest(docWordDic, dictionary, wordIDFDictionary, model);

                    outcomes.Add(new KeyValuePair <int, int>(predicted, categoryIndex));
                    Console.WriteLine(predicted + "," + categoryIndex);
                }
            }

            return outcomes;
        }
示例#6
0
        /// <summary>
        /// Classifies every file in every category sub-directory of <paramref name="testPath"/>
        /// with the KNN classifier and returns the (predicted, actual) pairs.
        /// </summary>
        /// <param name="knn">Classifier whose test method performs the prediction.</param>
        /// <param name="testPath">Directory whose sub-directories are the test categories.</param>
        /// <param name="logPath">Not used by this method.</param>
        /// <param name="dictionary">Dictionary forwarded to docCooccurrence and to the classifier.</param>
        /// <param name="wordIDFDictionary">IDF values forwarded to the classifier.</param>
        /// <param name="stopWordTable">Stop-word table forwarded to readDoc.</param>
        /// <param name="testFileNameList">Receives the file name of each test document, in processing order.</param>
        /// <param name="nlpAdapter">Adapter forwarded to readDoc.</param>
        /// <returns>
        /// One pair per test file: Key is the classifier's result, Value is the index of the
        /// file's category directory in the Directory.GetDirectories result.
        /// </returns>
        private static List <KeyValuePair <int, int> > runKnnTest(KNN knn, string testPath, string logPath, Dictionary <string, int> dictionary, Dictionary <string, double> wordIDFDictionary, Hashtable stopWordTable, ref List <string> testFileNameList, NLPAdapter nlpAdapter)
        {
            string[] categoryDirs = Directory.GetDirectories(testPath);
            var      outcomes     = new List <KeyValuePair <int, int> >();

            for (int categoryIndex = 0; categoryIndex < categoryDirs.Length; categoryIndex++)
            {
                string categoryDir = categoryDirs[categoryIndex];
                Console.WriteLine(categoryDir);

                foreach (string file in Directory.GetFiles(categoryDir))
                {
                    // The file name is recorded before the document is read and classified.
                    testFileNameList.Add(Path.GetFileName(file));

                    // The raw word map is replaced by the docCooccurrence result before testing.
                    Dictionary <string, double> rawWords = readDoc(file, stopWordTable, nlpAdapter);
                    Dictionary <string, double> features = docCooccurrence(rawWords, dictionary);
                    int predicted = knn.test(features, dictionary, wordIDFDictionary);

                    outcomes.Add(new KeyValuePair <int, int>(predicted, categoryIndex));
                    Console.WriteLine(predicted + "," + categoryIndex);
                }
            }

            return outcomes;
        }