Beispiel #1
0
        /// <summary>
        /// Classifies every file found in the sub-directories of <paramref name="path"/>
        /// and returns one <c>testResult</c> per file.
        /// </summary>
        /// <remarks>
        /// Each sub-directory of <paramref name="path"/> is treated as one category; the
        /// category's index in the array returned by <c>Directory.GetDirectories</c> is
        /// stored as <c>oriClass</c>.
        /// NOTE(review): presumably this index has to line up with the class labels the
        /// classifier was trained with - confirm against the training code.
        /// For every file a TF*IDF feature vector of length <paramref name="dicSize"/> is
        /// built: header lines (lines containing ": ") are read first and only the
        /// "subject" column is tokenized, each token weighted by <c>subjectWeight</c>;
        /// the remaining lines are tokenized with weight 1, skipping lines that start
        /// with "&gt;" or "|&gt;".
        /// </remarks>
        /// <param name="path">Directory whose sub-directories hold the test documents.</param>
        /// <param name="dictionary">Maps a preprocessed word to its (int) index in the feature vector.</param>
        /// <param name="dicSize">Length of the feature vector passed to the classifier.</param>
        /// <param name="idfTable">Maps a preprocessed word to its (double) IDF value.</param>
        /// <param name="knn">Classifier whose <c>Compute</c> result is stored as <c>resultClass</c>.</param>
        /// <returns>One entry per processed file, in traversal order.</returns>
        public static List<testResult> RunTest(string path,Hashtable dictionary ,int dicSize, Hashtable idfTable, KNearestNeighbors knn)
        {
            List<testResult> result = new List<testResult>();
            string[] categories = Directory.GetDirectories(path);
            for (int i = 0; i < categories.Length; i++) // traverse categories
            {
                Console.WriteLine(Path.GetFileName(categories[i]));
                string[] file_names = Directory.GetFiles(categories[i]);
                for (int j = 0; j < file_names.Length; j++) // files in category
                {
                    Console.WriteLine(Path.GetFileName(file_names[j]));

                    // A freshly allocated double[] is already zero-filled.
                    double[] featureV = new double[dicSize];
                    Hashtable docWord = new Hashtable();   // word -> weighted count in this document
                    Stemmer stemmer = new Stemmer();
                    int sumWordCount = 0;                  // total weighted token count of this document
                    // Kept from the original code; the effect of stemming an empty
                    // buffer is not visible from here.
                    stemmer.stem();

                    // 'using' guarantees the reader is closed even if parsing throws;
                    // previously the reader was never closed at all.
                    using (System.IO.StreamReader file = new System.IO.StreamReader(file_names[j]))
                    {
                        String line;

                        /******Structured Column*****/
                        while ((line = file.ReadLine()) != null)
                        {
                            if (!line.Contains(": "))
                            {
                                // First line without ": " ends the header section.
                                // That line itself is consumed and not tokenized.
                                break;
                            }
                            string[] splitPart = line.Split(new string[] { ": " }, StringSplitOptions.None);
                            string columnName = splitPart[0].Trim();
                            // Only the text after the LAST ": " is used as the column content.
                            string content = splitPart[splitPart.Length - 1];
                            if (columnName.ToLower() == "subject")
                            {
                                sumWordCount += AddDocumentTokens(content, subjectWeight, stemmer, docWord);
                            }
                        }

                        /******Text******/
                        while ((line = file.ReadLine()) != null)
                        {
                            if (line.StartsWith(">") || line.StartsWith("|>"))
                            {
                                continue;
                            }
                            sumWordCount += AddDocumentTokens(line, 1, stemmer, docWord);
                        }
                    }

                    // Build the TF*IDF vector for the words known to the dictionary.
                    // sumWordCount cannot be zero here when docWord is non-empty.
                    foreach (string word in docWord.Keys)
                    {
                        if (dictionary.ContainsKey(word))
                        {
                            int indexOfDic = (int)dictionary[word];
                            double TF = (int)docWord[word] / (double)sumWordCount;
                            double IDF = (double)idfTable[word];
                            featureV[indexOfDic] = TF * IDF;
                        }
                    }

                    testResult resultTemp = new testResult();
                    resultTemp.docName = Path.GetFileName(file_names[j]);
                    resultTemp.oriClass = i;
                    resultTemp.resultClass = knn.Compute(featureV);
                    result.Add(resultTemp);
                    Console.WriteLine(resultTemp.resultClass);
                }//file end
            }//category end
            return result;
        }

        // Splits text on every character outside [A-Za-z0-9_-], normalizes each token
        // (lower-case, trim '_'/'-', drop numbers, stem, drop empty and stop words) and
        // adds 'weight' to docWord for each surviving word. Returns the total weight added.
        private static int AddDocumentTokens(string text, int weight, Stemmer stemmer, Hashtable docWord)
        {
            int added = 0;
            foreach (string iter_word in Regex.Split(text, @"[^A-Za-z0-9_-]"))
            {
                String word = iter_word.ToLower().Trim(new Char[] { '_', '-' });
                double Num;
                if (double.TryParse(word, out Num))
                {
                    continue;
                }
                stemmer.add(word.ToCharArray(), word.Length);
                stemmer.stem();
                word = stemmer.ToString();
                if (word.Length == 0)
                {
                    continue;
                }
                if (stopWordTable.ContainsKey(word))
                {
                    continue;
                }
                // word preprocess done
                added += weight;
                int previous = 0;
                if (docWord.ContainsKey(word))
                {
                    previous = (int)docWord[word];
                }
                docWord[word] = previous + weight;
            }
            return added;
        }
Beispiel #2
0
        /** Test program for demonstrating the Stemmer.  It reads text from
         * a list of files, stems each word, and writes the result to standard
         * output. Note that the word stemmed is expected to be in lower case:
         * forcing lower case must be done outside the Stemmer class.
         * Usage: Stemmer file-name file-name ...
         *
         * Input is read one byte at a time and each byte is treated as one
         * character. A run of letters is lower-cased, stemmed and printed;
         * every other byte is echoed unchanged. Words longer than 500
         * characters keep overwriting the last buffer slot.
         * Processing stops at the first file that cannot be found or read.
         */
        public static void Main(String[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("Usage:  Stemmer <input file>");
                return;
            }
            char[]  w = new char[501];
            Stemmer s = new Stemmer();

            for (int i = 0; i < args.Length; i++)
            {
                try
                {
                    // 'using' closes the stream on every exit path (EOF, I/O error, break);
                    // previously the stream was never closed.
                    using (FileStream _in = new FileStream(args[i], FileMode.Open, FileAccess.Read))
                    {
                        try
                        {
                            while (true)
                            {
                                int ch = _in.ReadByte();
                                // ReadByte returns -1 at end of stream; test that before
                                // casting so the sentinel is never converted to a char.
                                if (ch >= 0 && Char.IsLetter((char)ch))
                                {
                                    int j = 0;
                                    while (true)
                                    {
                                        ch   = Char.ToLower((char)ch);
                                        w[j] = (char)ch;
                                        if (j < 500)
                                        {
                                            j++;
                                        }
                                        ch = _in.ReadByte();
                                        if (ch < 0 || !Char.IsLetter((char)ch))
                                        {
                                            /* to test add(char ch) */
                                            for (int c = 0; c < j; c++)
                                            {
                                                s.add(w[c]);
                                            }
                                            /* or, to test add(char[] w, int j) */
                                            /* s.add(w, j); */
                                            s.stem();

                                            String u;

                                            /* and now, to test toString() : */
                                            u = s.ToString();

                                            /* to test getResultBuffer(), getResultLength() : */
                                            /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */

                                            Console.Write(u);
                                            break;
                                        }
                                    }
                                }
                                if (ch < 0)
                                {
                                    break;
                                }
                                Console.Write((char)ch);
                            }
                        }
                        catch (IOException)
                        {
                            Console.WriteLine("error reading " + args[i]);
                            break;
                        }
                    }
                }
                catch (FileNotFoundException)
                {
                    Console.WriteLine("file " + args[i] + " not found");
                    break;
                }
            }
        }
Beispiel #3
0
        /// <summary>
        /// Parses one training document and folds its word statistics into the
        /// shared training tables.
        /// </summary>
        /// <remarks>
        /// Side effects visible in this method:
        /// <c>fileList</c> receives <paramref name="filePath"/>;
        /// <c>columnCountTable</c> maps every header column name seen to the path of
        /// the last file that contained it;
        /// <c>wordCountTable</c> gets the weighted total count and the document
        /// frequency (DF) of each word updated;
        /// <c>docFeatureTable</c> receives this document's word -> weighted count table.
        /// Header lines (lines containing ": ") are read first and only the "subject"
        /// column is tokenized, each token weighted by <c>subjectWeight</c>. The first
        /// line without ": " ends the header and is discarded. The remaining lines are
        /// tokenized with weight 1, skipping lines that start with "&gt;" or "|&gt;".
        /// </remarks>
        /// <param name="filePath">Path of the training document to read.</param>
        /// <param name="wordInCategory">Word -> weighted count table of the document's category; updated in place.</param>
        /// <returns>The total weighted number of words accepted from this document.</returns>
        public static int ProcessTraining(string filePath, Hashtable wordInCategory)
        {
            fileList.Add(filePath);
            int counter = 0;

            // 'using' closes the reader even when parsing throws; the original
            // only closed it on the success path.
            using (System.IO.StreamReader file = new System.IO.StreamReader(filePath))
            {
                String line;
                Hashtable docWord = new Hashtable();   // word -> weighted count in this document
                Stemmer stemmer = new Stemmer();

                // Kept from the original code; the effect of stemming an empty
                // buffer is not visible from here.
                stemmer.stem();

                /******Structured Column*****/
                while ((line = file.ReadLine()) != null)
                {
                    if (!line.Contains(": "))
                    {
                        // End of the header section; this line is consumed and not tokenized.
                        break;
                    }
                    string[] splitPart = line.Split(new string[] { ": " }, StringSplitOptions.None);
                    string columnName = splitPart[0].Trim();
                    columnCountTable[columnName] = filePath;
                    // Only the text after the LAST ": " is used as the column content.
                    string content = splitPart[splitPart.Length-1];
                    if (columnName.ToLower() == "subject")
                    {
                        counter += CountTrainingWords(content, subjectWeight, stemmer, wordInCategory, docWord);
                    }
                }

                /******Text******/
                while ((line = file.ReadLine()) != null)
                {
                    if (line.StartsWith(">") || line.StartsWith("|>"))
                    {
                        continue;
                    }
                    counter += CountTrainingWords(line, 1, stemmer, wordInCategory, docWord);
                }

                docFeatureTable.Add(docWord);
            }
            return counter;
        }

        // Splits text on every character outside [A-Za-z0-9_-], normalizes each token
        // (lower-case, trim '_'/'-', drop numbers, stem, drop empty and stop words) and
        // adds 'weight' for each surviving word to wordInCategory, wordCountTable and
        // docWord. Returns the total weight added.
        private static int CountTrainingWords(string text, int weight, Stemmer stemmer, Hashtable wordInCategory, Hashtable docWord)
        {
            int added = 0;
            foreach (string iter_word in Regex.Split(text, @"[^A-Za-z0-9_-]"))
            {
                String word = iter_word.ToLower().Trim(new Char[] { '_', '-' });
                double Num;
                if (double.TryParse(word, out Num))
                {
                    continue;
                }
                stemmer.add(word.ToCharArray(), word.Length);
                stemmer.stem();
                word = stemmer.ToString();
                if (word.Length == 0)
                {
                    continue;
                }
                if (stopWordTable.ContainsKey(word))
                {
                    continue;
                }
                // word preprocess done
                added += weight;

                // Per-category count.
                int categoryCount = 0;
                if (wordInCategory.ContainsKey(word))
                {
                    categoryCount = (int)wordInCategory[word];
                }
                wordInCategory[word] = categoryCount + weight;

                // Corpus-wide count and document frequency.
                wordCount temp;
                if (wordCountTable.ContainsKey(word)) // word already seen in the corpus
                {
                    temp = (wordCount)wordCountTable[word];
                    temp.count += weight;
                    // DF grows only on the first occurrence inside this document,
                    // so this check must run before docWord is updated below.
                    if (!docWord.ContainsKey(word))
                    {
                        temp.DF += 1;
                    }
                }
                else
                {
                    temp.count = weight;
                    temp.DF = 1;
                }
                wordCountTable[word] = temp;

                // Per-document count.
                int documentCount = 0;
                if (docWord.ContainsKey(word))
                {
                    documentCount = (int)docWord[word];
                }
                docWord[word] = documentCount + weight;
            }
            return added;
        }