/// <summary>
/// Entry point: prepares the training data, trains either a KNN or an SVM
/// classifier (selected by the KNN_MODE compile symbol), runs it against the
/// test set, and writes the result statistics to the log directory.
/// </summary>
static void Main(string[] args)
{
    var docWordDicList = new List<Dictionary<string, double>>();
    var dictionary = new Dictionary<string, int>();
    var trainingAnswer = new List<int>();
    var wordIDFDictionary = new Dictionary<string, double>();
    var testFileNameList = new List<string>();
    Hashtable stopWordTable = genStopwordTable(STOP_WORD_PATH);
    int dicSize = 100;

    Console.WriteLine("==> Starting prepare data...");
    NLPAdapter nlpAdapter = new NLPAdapter(NLP_MODEL_PATH);
    trainModel(TRAINING_DATA_DIR, LOG_DIR, ref docWordDicList, ref dictionary, dicSize,
               ref trainingAnswer, ref wordIDFDictionary, stopWordTable, nlpAdapter);

#if KNN_MODE
    KNN knn = new KNN();
    knn.set(dicSize, docWordDicList.Count());
    knn.initial(docWordDicList, dictionary, trainingAnswer);
    knn.train(3, 20);
    knn.getAveDistance();
    //knn.genLog(@"D:\work\KPMG\learning\classification\project1_0422\log");
    List<KeyValuePair<int, int>> testAnswer =
        runKnnTest(knn, TEST_DATA_DIR, TEST_LOG_DIR, dictionary, wordIDFDictionary,
                   stopWordTable, ref testFileNameList, nlpAdapter);
#else
    Console.WriteLine("==> Starting get model...");
    SVMAdapter svmAdapter = new SVMAdapter();
    svm_model model = svmAdapter.getSVMModel(docWordDicList, dictionary, trainingAnswer,
                                             SVMAdapter.SVM_C_DEFAULT, SVMAdapter.SVM_GAMMA_DEFAULT);
    Console.WriteLine("==> Starting SVM test...");
    List<KeyValuePair<int, int>> testAnswer =
        runSVMTest(svmAdapter, TEST_DATA_DIR, TEST_LOG_DIR, dictionary, wordIDFDictionary,
                   stopWordTable, ref testFileNameList, model, nlpAdapter);
    Console.WriteLine("==> Starting SVM test done!!");
#endif

    Console.WriteLine("==> Starting saving result...");
    genStatistic(testAnswer, testFileNameList, LOG_DIR);
}
/// <summary>
/// Reads one news document and builds its weighted bag-of-words vector.
/// Header lines ("column: content") are weighted per column through the
/// <c>weight</c> table (subject/path/newsgroups/from get special token
/// splitting); the first non-empty non-column line starts the body, whose
/// tokens are weighted with weight["word"].
/// </summary>
/// <param name="path">Path of the document file to read.</param>
/// <param name="stopwordTable">Stop words dropped during word cleansing.</param>
/// <param name="nlpAdapter">POS tagger; under USE_POSTAG only noun tags are kept.</param>
/// <returns>Map of cleansed word to its accumulated weight in this document.</returns>
private static Dictionary<string, double> readDoc(string path, Hashtable stopwordTable, NLPAdapter nlpAdapter)
{
    Dictionary<string, double> docWordCount = new Dictionary<string, double>();
    // Fix: the original never disposed the StreamReader (file handle leak);
    // 'using' closes it on every path.
    using (StreamReader docFile = new StreamReader(path))
    {
        string line;
        Dictionary<String, String> posTags = new Dictionary<String, String>();
        posTags.Add(NLPAdapter.POS_TAG_NN, NLPAdapter.POS_TAG_NN);
        posTags.Add(NLPAdapter.POS_TAG_NNS, NLPAdapter.POS_TAG_NNS);
        posTags.Add(NLPAdapter.POS_TAG_NNP, NLPAdapter.POS_TAG_NNP);
        posTags.Add(NLPAdapter.POS_TAG_NNPS, NLPAdapter.POS_TAG_NNPS);

        // Phase 1: header columns. Stops at the first non-empty line that is
        // not a column; that line is the start of the message body.
        while ((line = docFile.ReadLine()) != null)
        {
            if (isColumn(line))
            {
                string column = getColumnName(line);
                string content = getColumnContent(line);
                if (column == "subject")
                {
                    addWeightedWords(splitLine(content), weight["subject"], stopwordTable, docWordCount);
                }
                else if (column == "path")
                {
                    addWeightedWords(splitPath(content), weight["path"], stopwordTable, docWordCount);
                }
                else if (column == "newsgroups")
                {
                    addWeightedWords(splitNewsgroup(content), weight["newsgroups"], stopwordTable, docWordCount);
                }
                else if (column == "from")
                {
                    addWeightedWords(getEmail(content), weight["from"], stopwordTable, docWordCount);
                }
                else
                {
                    content = processSpecialField(content, ref docWordCount);
                    addWeightedWords(splitLine(content), weight["word"], stopwordTable, docWordCount);
                }
            }
            else if (line.Length != 0)
            {
                // First body line; fall through to phase 2 with it in hand.
                break;
            }
        }

        // Phase 2: message body ('line' still holds the first body line; it is
        // null here when the file contained only header/empty lines).
        while (line != null)
        {
            line = processSpecialField(line, ref docWordCount);
#if USE_POSTAG
            // Sango: just left noun.
            line = nlpAdapter.getFilterResult(line, posTags);
#endif
            addWeightedWords(splitLine(line), weight["word"], stopwordTable, docWordCount);
            line = docFile.ReadLine();
        }
    }
    return (docWordCount);
}

/// <summary>
/// Cleanses each token via getWord and, for every surviving word, adds
/// <paramref name="w"/> to its accumulated weight in <paramref name="docWordCount"/>.
/// (Extracted from readDoc, where this loop was duplicated five times.)
/// </summary>
private static void addWeightedWords(IEnumerable<string> tokens, double w, Hashtable stopwordTable, Dictionary<string, double> docWordCount)
{
    foreach (string token in tokens)
    {
        string word = getWord(token, stopwordTable); //word cleansing done
        if (word != null)
        {
            double current;
            if (docWordCount.TryGetValue(word, out current))
            {
                docWordCount[word] = current + w;
            }
            else
            {
                docWordCount.Add(word, w);
            }
        }
    }
}
/// <summary>
/// Reads every document under one category folder. For each document it
/// appends the document's word vector to <paramref name="docWordDicList"/>
/// and the category index to <paramref name="trainingAnswer"/>.
/// </summary>
/// <returns>
/// The category's document frequency table: word -> number of documents in
/// this category containing the word with a non-zero weight.
/// </returns>
private static Dictionary<string, double> readCategory(string path, int categoryIndex, ref List<Dictionary<string, double>> docWordDicList, ref List<int> trainingAnswer, Hashtable stopwordTable, NLPAdapter nlpAdapter)
{
    Dictionary<string, double> categoryWordCount = new Dictionary<string, double>();
    string[] docs = Directory.GetFiles(path);
    for (int i = 0; i < docs.Length; i++)
    {
        trainingAnswer.Add(categoryIndex);
        Dictionary<string, double> docWordCount = readDoc(docs[i], stopwordTable, nlpAdapter);
        docWordDicList.Add(docWordCount);
        foreach (KeyValuePair<string, double> entry in docWordCount)
        {
            // Document frequency: each word counts once per document.
            // (The original checked 'entry.Value != 0' in both branches and
            // did a ContainsKey + indexer double lookup; hoisted/merged here.)
            if (entry.Value != 0)
            {
                double df;
                if (categoryWordCount.TryGetValue(entry.Key, out df))
                {
                    //categoryWordCount[word] += docWordCount[word];
                    categoryWordCount[entry.Key] = df + 1;
                }
                else
                {
                    //categoryWordCount.Add(word, docWordCount[word]);
                    categoryWordCount.Add(entry.Key, 1);
                }
            }
        }
    }
    return (categoryWordCount);
}
/// <summary>
/// Builds the training model in four steps: (1) per-category document
/// frequencies (also fills docWordDicList/trainingAnswer), (2) the word IDF
/// table, (3) the feature dictionary — top-TF-IDF words taken round-robin
/// across categories until dicSize * 2 distinct words are collected, written
/// to dictionary.csv and refined by trainCooccurrence — and (4) in-place
/// conversion of every training document's raw counts into TF-IDF weights.
/// </summary>
private static void trainModel(string trainPath, string logPath, ref List<Dictionary<string, double>> docWordDicList, ref Dictionary<string, int> dictionary, int dicSize, ref List<int> trainingAnswer, ref Dictionary<string, double> wordIDFDictionary, Hashtable stopwordTable, NLPAdapter nlpAdapter)
{
    List<Dictionary<string, double>> categoryWordCountList = new List<Dictionary<string, double>>();
    Dictionary<string, int> tempDictionary = new Dictionary<string, int>();
    string[] categories = Directory.GetDirectories(trainPath);

    // Traverse categories; readCategory also generates trainingAnswer.
    for (int i = 0; i < categories.Length; i++)
    {
        categoryWordCountList.Add(readCategory(categories[i], i, ref docWordDicList, ref trainingAnswer, stopwordTable, nlpAdapter));
    }

    // Generate wordIDFDictionary: count the categories containing each word...
    for (int i = 0; i < categoryWordCountList.Count; i++)
    {
        foreach (string word in categoryWordCountList[i].Keys)
        {
            if (wordIDFDictionary.ContainsKey(word))
            {
                wordIDFDictionary[word] += 1;
            }
            else
            {
                wordIDFDictionary.Add(word, 1);
            }
        }
    }
    // ...then turn that category count into log(#categories / count).
    string[] keys = wordIDFDictionary.Keys.ToArray();
    for (int i = 0; i < keys.Length; i++)
    {
        wordIDFDictionary[keys[i]] = Math.Log(categoryWordCountList.Count / wordIDFDictionary[keys[i]]);
    }

    // Per category: normalize counts by the category total, multiply by IDF,
    // and sort words by that TF-IDF score, descending.
    List<List<KeyValuePair<string, double>>> sortedCategoryTFIDFList = new List<List<KeyValuePair<string, double>>>();
    for (int i = 0; i < categoryWordCountList.Count; i++)
    {
        string[] words = categoryWordCountList[i].Keys.ToArray();
        double categoryWordCountSum = 0;
        for (int j = 0; j < words.Length; j++)
        {
            categoryWordCountSum += categoryWordCountList[i][words[j]];
        }
        List<KeyValuePair<string, double>> sortedCategoryTFIDF = new List<KeyValuePair<string, double>>();
        for (int j = 0; j < words.Length; j++)
        {
            sortedCategoryTFIDF.Add(new KeyValuePair<string, double>(words[j], (categoryWordCountList[i][words[j]] / categoryWordCountSum) * wordIDFDictionary[words[j]])); //category TFIDF
        }
        sortedCategoryTFIDF.Sort((a, b) => b.Value.CompareTo(a.Value));
        sortedCategoryTFIDFList.Add(sortedCategoryTFIDF);
    }

    // Generate the dictionary round-robin: rank i of every category before
    // rank i + 1 of any category, skipping duplicates.
    // Fix: the original only closed dicFile on the early-break path, leaking
    // (and possibly not flushing) it when fewer than dicSize * 2 words exist;
    // 'using' closes it on all paths. Also guard against categories with
    // fewer than dicSize * 2 words (was an IndexOutOfRangeException).
    int dicCount = 0;
    using (StreamWriter dicFile = new StreamWriter(logPath + "\\" + "dictionary.csv"))
    {
        for (int i = 0; i < dicSize * 2 && dicCount < dicSize * 2; i++)
        {
            for (int j = 0; j < sortedCategoryTFIDFList.Count; j++)
            {
                if (dicCount >= dicSize * 2)
                {
                    break;
                }
                if (i >= sortedCategoryTFIDFList[j].Count)
                {
                    continue; // this category ran out of words
                }
                KeyValuePair<string, double> entry = sortedCategoryTFIDFList[j][i];
                if (!tempDictionary.ContainsKey(entry.Key))
                {
                    dicFile.WriteLine(entry.Key + "," + entry.Value);
                    tempDictionary.Add(entry.Key, dicCount);
                    dicCount++;
                }
            }
        }
    }

    dictionary = trainCooccurrence(logPath, ref categoryWordCountList, ref docWordDicList, ref wordIDFDictionary, trainingAnswer, tempDictionary, dicSize);

    // Generate docWordDicList: replace each document's raw counts with
    // TF-IDF weights, in place.
    for (int i = 0; i < docWordDicList.Count; i++)
    {
        string[] words = docWordDicList[i].Keys.ToArray();
        double docWordCountSum = 0;
        for (int j = 0; j < words.Length; j++)
        {
            docWordCountSum += docWordDicList[i][words[j]];
        }
        for (int j = 0; j < words.Length; j++)
        {
            if (docWordDicList[i][words[j]] != 0)
            {
                docWordDicList[i][words[j]] = (docWordDicList[i][words[j]] / docWordCountSum) * wordIDFDictionary[words[j]]; //docWordDic TFIDF
            }
        }
    }
}
/// <summary>
/// Runs the trained SVM model over every document under testPath. The layout
/// is testPath\&lt;category&gt;\&lt;file&gt;, and the category folder's index is the
/// expected label. Each test file's name is appended to testFileNameList.
/// </summary>
/// <returns>One (predicted, expected) label pair per test document.</returns>
private static List<KeyValuePair<int, int>> runSVMTest(SVMAdapter svmAdapter, string testPath, string logPath, Dictionary<string, int> dictionary, Dictionary<string, double> wordIDFDictionary, Hashtable stopWordTable, ref List<string> testFileNameList, svm_model model, NLPAdapter nlpAdapter)
{
    List<KeyValuePair<int, int>> testAnswer = new List<KeyValuePair<int, int>>();
    string[] categoryDirs = Directory.GetDirectories(testPath);
    for (int expected = 0; expected < categoryDirs.Length; expected++) //traverse Categories
    {
        Console.WriteLine(categoryDirs[expected]);
        foreach (string testFile in Directory.GetFiles(categoryDirs[expected]))
        {
            testFileNameList.Add(Path.GetFileName(testFile));
            Dictionary<string, double> docVector = readDoc(testFile, stopWordTable, nlpAdapter);
            int predicted = svmAdapter.runSVMTest(docVector, dictionary, wordIDFDictionary, model);
            testAnswer.Add(new KeyValuePair<int, int>(predicted, expected));
            Console.WriteLine(predicted + "," + expected);
        }
    }
    return testAnswer;
}
/// <summary>
/// Runs the trained KNN classifier over every document under testPath. The
/// layout is testPath\&lt;category&gt;\&lt;file&gt;, and the category folder's index is
/// the expected label. Each document vector is expanded with co-occurrence
/// features before classification; file names go into testFileNameList.
/// </summary>
/// <returns>One (predicted, expected) label pair per test document.</returns>
private static List<KeyValuePair<int, int>> runKnnTest(KNN knn, string testPath, string logPath, Dictionary<string, int> dictionary, Dictionary<string, double> wordIDFDictionary, Hashtable stopWordTable, ref List<string> testFileNameList, NLPAdapter nlpAdapter)
{
    List<KeyValuePair<int, int>> testAnswer = new List<KeyValuePair<int, int>>();
    string[] categoryDirs = Directory.GetDirectories(testPath);
    for (int expected = 0; expected < categoryDirs.Length; expected++) //traverse Categories
    {
        Console.WriteLine(categoryDirs[expected]);
        foreach (string testFile in Directory.GetFiles(categoryDirs[expected]))
        {
            testFileNameList.Add(Path.GetFileName(testFile));
            Dictionary<string, double> docVector = readDoc(testFile, stopWordTable, nlpAdapter);
            docVector = docCooccurrence(docVector, dictionary);
            int predicted = knn.test(docVector, dictionary, wordIDFDictionary);
            testAnswer.Add(new KeyValuePair<int, int>(predicted, expected));
            Console.WriteLine(predicted + "," + expected);
        }
    }
    return testAnswer;
}