/// <summary>
/// Classifies every file found in each sub-directory of <paramref name="path"/>.
/// For each file a TF-IDF feature vector of length <paramref name="dicSize"/> is built
/// from the "Subject" header and the message body, and passed to <c>knn.Compute</c>.
/// </summary>
/// <param name="path">Root directory; each immediate sub-directory is treated as one category.</param>
/// <param name="dictionary">Maps a stemmed word to its index in the feature vector (values are unboxed as <c>int</c>).</param>
/// <param name="dicSize">Length of the feature vector.</param>
/// <param name="idfTable">Maps a stemmed word to its IDF weight (values are unboxed as <c>double</c>).</param>
/// <param name="knn">Classifier whose <c>Compute</c> method receives the feature vector.</param>
/// <returns>One <c>testResult</c> per file, in directory-then-file enumeration order.</returns>
public static List<testResult> RunTest(string path, Hashtable dictionary, int dicSize, Hashtable idfTable, KNearestNeighbors knn)
{
    List<testResult> result = new List<testResult>();
    string[] categories = Directory.GetDirectories(path);

    for (int i = 0; i < categories.Length; i++) // traverse categories
    {
        Console.WriteLine(Path.GetFileName(categories[i]));
        string[] file_names = Directory.GetFiles(categories[i]);

        for (int j = 0; j < file_names.Length; j++) // files in this category
        {
            Console.WriteLine(Path.GetFileName(file_names[j]));

            // A freshly allocated double[] is already zero-filled, so no explicit reset is needed.
            double[] featureV = new double[dicSize];
            Hashtable docWord = new Hashtable(); // stemmed word -> weighted count within this document
            int sumWordCount = 0;                // total weighted token count of this document

            Stemmer stemmer = new Stemmer();
            // Kept from the original: stem() is invoked once before any input is added.
            // NOTE(review): presumably this only resets the stemmer's internal state - confirm against Stemmer.
            stemmer.stem();

            // 'using' guarantees the reader is disposed even when parsing throws;
            // previously the reader was never closed at all.
            using (System.IO.StreamReader file = new System.IO.StreamReader(file_names[j]))
            {
                string line;

                /****** Structured columns (headers) ******/
                // Header parsing stops at the first line that has no ": "; that line is consumed.
                while ((line = file.ReadLine()) != null)
                {
                    if (!line.Contains(": "))
                    {
                        break;
                    }

                    string[] splitPart = line.Split(new string[] { ": " }, StringSplitOptions.None);
                    string columnName = splitPart[0].Trim();
                    // NOTE(review): this takes the text after the LAST ": " on the line, so a subject like
                    // "Re: a: b" contributes only "b". Kept unchanged so test features are extracted the
                    // same way ProcessTraining parses headers - confirm this is intended.
                    string content = splitPart[splitPart.Length - 1];

                    // Only the Subject header contributes to the features; all other headers are skipped.
                    if (columnName.ToLower() == "subject")
                    {
                        sumWordCount += AccumulateTestTokens(content, 1 * subjectWeight, stemmer, docWord);
                    }
                }

                /****** Text (body) ******/
                while ((line = file.ReadLine()) != null)
                {
                    // Skip lines that start with a quote marker.
                    if (line.StartsWith(">", StringComparison.Ordinal) || line.StartsWith("|>", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    sumWordCount += AccumulateTestTokens(line, 1, stemmer, docWord);
                }
            }

            // Fill in TF * IDF for every document word that is also present in the dictionary.
            foreach (string word in docWord.Keys)
            {
                if (dictionary.ContainsKey(word))
                {
                    int indexOfDic = (int)dictionary[word];
                    double TF = System.Convert.ToDouble((int)docWord[word]) / System.Convert.ToDouble(sumWordCount);
                    double IDF = (double)idfTable[word];
                    featureV[indexOfDic] = TF * IDF;
                }
            }

            testResult resultTemp = new testResult();
            resultTemp.docName = Path.GetFileName(file_names[j]);
            resultTemp.oriClass = i; // index of the category directory in enumeration order
            resultTemp.resultClass = knn.Compute(featureV);
            result.Add(resultTemp);
            Console.WriteLine(resultTemp.resultClass);
        } // file end
    } // category end

    return result;
}

// Splits text into tokens, lower-cases and stems each one, drops numbers, empty strings and stop words,
// then adds `weight` to docWord for every surviving token. Returns the total weight added.
private static int AccumulateTestTokens(string text, int weight, Stemmer stemmer, Hashtable docWord)
{
    int added = 0;

    foreach (string rawToken in Regex.Split(text, @"[^A-Za-z0-9_-]"))
    {
        string word = rawToken.ToLower().Trim(new Char[] { '_', '-' });

        double ignored;
        if (double.TryParse(word, out ignored))
        {
            continue; // purely numeric tokens are ignored
        }

        stemmer.add(word.ToCharArray(), word.Length);
        stemmer.stem();
        word = stemmer.ToString();

        if (word.Length == 0)
        {
            continue;
        }
        if (stopWordTable.ContainsKey(word))
        {
            continue;
        }

        // word preprocess done
        added += weight;
        int previous = docWord.ContainsKey(word) ? (int)docWord[word] : 0;
        docWord[word] = previous + weight;
    }

    return added;
}
/// <summary>
/// Test program for demonstrating the Stemmer. It reads text from a list of files,
/// stems each word, and writes the result to standard output. The stemmer expects
/// lower-case input, so letters are lower-cased here before being handed to it.
/// Usage: Stemmer file-name file-name ...
/// </summary>
/// <param name="args">Paths of the files to process, in order.</param>
public static void Main(String[] args)
{
    if (args.Length == 0)
    {
        Console.WriteLine("Usage: Stemmer <input file>");
        return;
    }

    // Word buffer: at most 500 characters are kept; beyond that the last slot is overwritten.
    char[] w = new char[501];
    Stemmer s = new Stemmer();

    for (int i = 0; i < args.Length; i++)
    {
        try
        {
            // 'using' closes the stream on every exit path; previously it was never closed.
            using (FileStream _in = new FileStream(args[i], FileMode.Open, FileAccess.Read))
            {
                try
                {
                    while (true)
                    {
                        int ch = _in.ReadByte();

                        // ReadByte returns -1 at end of stream; test for it before casting to char
                        // so the cast cannot overflow in a checked build.
                        if (ch >= 0 && Char.IsLetter((char)ch))
                        {
                            int j = 0;
                            while (true)
                            {
                                ch = Char.ToLower((char)ch);
                                w[j] = (char)ch;
                                if (j < 500)
                                {
                                    j++;
                                }

                                ch = _in.ReadByte();
                                if (ch < 0 || !Char.IsLetter((char)ch))
                                {
                                    /* to test add(char ch) */
                                    for (int c = 0; c < j; c++)
                                    {
                                        s.add(w[c]);
                                    }
                                    /* or, to test add(char[] w, int j): s.add(w, j); */

                                    s.stem();

                                    /* and now, to test ToString() : */
                                    String u = s.ToString();
                                    /* to test getResultBuffer(), getResultLength() : */
                                    /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */

                                    Console.Write(u);
                                    break;
                                }
                            }
                        }

                        if (ch < 0)
                        {
                            break; // end of file
                        }

                        // Non-letter bytes are echoed unchanged.
                        Console.Write((char)ch);
                    }
                }
                catch (IOException)
                {
                    Console.WriteLine("error reading " + args[i]);
                    break; // stop processing the remaining files
                }
            }
        }
        catch (FileNotFoundException)
        {
            Console.WriteLine("file " + args[i] + " not found");
            break; // stop processing the remaining files
        }
    }
}
/// <summary>
/// Parses one training document and updates the corpus statistics: the file path is
/// appended to <c>fileList</c>, every header name is recorded in <c>columnCountTable</c>,
/// per-word totals and document frequencies are updated in <c>wordCountTable</c>,
/// per-category totals are updated in <paramref name="wordInCategory"/>, and the
/// document's own word counts are appended to <c>docFeatureTable</c>.
/// </summary>
/// <param name="filePath">Path of the training document to read.</param>
/// <param name="wordInCategory">Stemmed word -> weighted count for the document's category; updated in place.</param>
/// <returns>The total weighted number of tokens counted in this document.</returns>
public static int ProcessTraining(string filePath, Hashtable wordInCategory)
{
    fileList.Add(filePath);

    int counter = 0;
    Hashtable docWord = new Hashtable(); // stemmed word -> weighted count within this document

    Stemmer stemmer = new Stemmer();
    // Kept from the original: stem() is invoked once before any input is added.
    // NOTE(review): presumably this only resets the stemmer's internal state - confirm against Stemmer.
    stemmer.stem();

    // 'using' disposes the reader on every exit path; previously Close() was skipped
    // whenever parsing threw.
    using (System.IO.StreamReader file = new System.IO.StreamReader(filePath))
    {
        string line;

        /****** Structured columns (headers) ******/
        // Header parsing stops at the first line that has no ": "; that line is consumed.
        while ((line = file.ReadLine()) != null)
        {
            if (!line.Contains(": "))
            {
                break;
            }

            string[] splitPart = line.Split(new string[] { ": " }, StringSplitOptions.None);
            string columnName = splitPart[0].Trim();
            columnCountTable[columnName] = filePath; // remembers the last file in which this header name was seen
            // NOTE(review): this takes the text after the LAST ": " on the line, so a subject like
            // "Re: a: b" contributes only "b". Kept unchanged so training matches how RunTest
            // parses headers - confirm this is intended.
            string content = splitPart[splitPart.Length - 1];

            // Only the Subject header contributes to the statistics; all other headers are skipped.
            if (columnName.ToLower() == "subject")
            {
                counter += AccumulateTrainingTokens(content, 1 * subjectWeight, stemmer, wordInCategory, docWord);
            }
        }

        /****** Text (body) ******/
        while ((line = file.ReadLine()) != null)
        {
            // Skip lines that start with a quote marker.
            if (line.StartsWith(">", StringComparison.Ordinal) || line.StartsWith("|>", StringComparison.Ordinal))
            {
                continue;
            }

            counter += AccumulateTrainingTokens(line, 1, stemmer, wordInCategory, docWord);
        }
    }

    docFeatureTable.Add(docWord);
    return counter;
}

// Splits text into tokens, lower-cases and stems each one, drops numbers, empty strings and stop words,
// then adds `weight` for every surviving token to wordInCategory, wordCountTable and docWord.
// Returns the total weight added.
private static int AccumulateTrainingTokens(string text, int weight, Stemmer stemmer, Hashtable wordInCategory, Hashtable docWord)
{
    int added = 0;

    foreach (string rawToken in Regex.Split(text, @"[^A-Za-z0-9_-]"))
    {
        string word = rawToken.ToLower().Trim(new Char[] { '_', '-' });

        double ignored;
        if (double.TryParse(word, out ignored))
        {
            continue; // purely numeric tokens are ignored
        }

        stemmer.add(word.ToCharArray(), word.Length);
        stemmer.stem();
        word = stemmer.ToString();

        if (word.Length == 0)
        {
            continue;
        }
        if (stopWordTable.ContainsKey(word))
        {
            continue;
        }

        // word preprocess done
        added += weight;

        // Per-category weighted count.
        if (wordInCategory.ContainsKey(word))
        {
            int count = (int)wordInCategory[word];
            count += weight;
            wordInCategory[word] = count;
        }
        else
        {
            wordInCategory[word] = weight;
        }

        // Corpus-wide count and document frequency.
        wordCount stats;
        if (wordCountTable.ContainsKey(word)) // word already seen in the corpus
        {
            stats = (wordCount)wordCountTable[word];
            stats.count += weight;
            if (!docWord.ContainsKey(word)) // first occurrence in this document -> bump DF
            {
                stats.DF += 1;
            }
        }
        else
        {
            stats.count = weight;
            stats.DF = 1;
        }

        // Per-document weighted count; must be updated after the DF check above.
        int inDocument = docWord.ContainsKey(word) ? (int)docWord[word] : 0;
        docWord[word] = inDocument + weight;

        wordCountTable[word] = stats;
    }

    return added;
}