C# (CSharp) porter Stemmer Examples

Programming Language: C# (CSharp)

Namespace/Package Name: porter

Class/Type: Stemmer

Examples at hotexamples.com: 3

C# (CSharp) porter Stemmer - 3 examples found. These are the top rated real world C# (CSharp) examples of porter.Stemmer extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

ToString(1)

add(1)

stem(1)

Stemmer Class Documentation

Example #1

Show file

File: Program.cs Project: sweslo17/classification

        /// <summary>
        /// Classifies every file found in each category sub-directory of
        /// <paramref name="path"/> and returns one <c>testResult</c> per file.
        /// </summary>
        /// <remarks>
        /// Each file is read in two phases from the same reader:
        /// header lines (lines containing ": ") come first, and of those only the
        /// "subject" header is tokenized, each token weighted by <c>subjectWeight</c>.
        /// The first line that does not contain ": " ends the header phase and is
        /// itself discarded. All remaining lines are body text, each token weighted 1;
        /// lines starting with "&gt;" or "|&gt;" are skipped.
        /// Handling for the "keywords", "newsgroups" and "from" headers used to exist
        /// here as commented-out code and is intentionally not performed.
        /// </remarks>
        /// <param name="path">Directory whose immediate sub-directories are the categories.</param>
        /// <param name="dictionary">Maps a stemmed word to its (int) position in the feature vector.</param>
        /// <param name="dicSize">Length of the feature vector built for each file.</param>
        /// <param name="idfTable">Maps a stemmed word to its (double) IDF value.</param>
        /// <param name="knn">Classifier; its <c>Compute</c> method is called once per file.</param>
        /// <returns>
        /// One entry per file holding the file name, the index of the category directory
        /// it was found in, and the class returned by <paramref name="knn"/>.
        /// </returns>
        public static List<testResult> RunTest(string path,Hashtable dictionary ,int dicSize, Hashtable idfTable, KNearestNeighbors knn)
        {
            List<testResult> result = new List<testResult>();
            string[] categories = Directory.GetDirectories(path);
            for (int i = 0; i < categories.Length; i++) // traverse categories
            {
                Console.WriteLine(Path.GetFileName(categories[i]));
                string[] file_names = Directory.GetFiles(categories[i]);
                for (int j = 0; j < file_names.Length; j++) // files in category
                {
                    Console.WriteLine(Path.GetFileName(file_names[j]));

                    // A freshly allocated double[] is already zero-filled.
                    double[] featureV = new double[dicSize];
                    // stemmed word -> weighted occurrence count within this file
                    Hashtable docWord = new Hashtable();
                    Stemmer stemmer = new Stemmer();
                    int sumWordCount = 0;
                    // NOTE(review): stem() on a stemmer with nothing added looks like a
                    // no-op; kept because Stemmer is not visible here - confirm and remove.
                    stemmer.stem();

                    // 'using' guarantees the file handle is released even if parsing throws.
                    using (System.IO.StreamReader file = new System.IO.StreamReader(file_names[j]))
                    {
                        String line;

                        /******Structured Column*****/
                        while ((line = file.ReadLine()) != null)
                        {
                            if (!line.Contains(": "))
                            {
                                // End of the header section; this line is consumed and dropped.
                                break;
                            }
                            string[] splitPart = line.Split(new string[] { ": " }, StringSplitOptions.None);
                            string columnName = splitPart[0].Trim();
                            // Only the text after the LAST ": " is used as the header content.
                            string content = splitPart[splitPart.Length - 1];
                            if (columnName.ToLower() == "subject")
                            {
                                sumWordCount += AccumulateTestTokens(content, subjectWeight, stemmer, docWord);
                            }
                        }

                        /******Text******/
                        while ((line = file.ReadLine()) != null)
                        {
                            if (line.StartsWith(">") || line.StartsWith("|>"))
                            {
                                continue;
                            }
                            sumWordCount += AccumulateTestTokens(line, 1, stemmer, docWord);
                        }
                    }

                    // Build the TF*IDF feature vector for the words known to the dictionary.
                    foreach (string word in docWord.Keys)
                    {
                        if (dictionary.ContainsKey(word))
                        {
                            int indexOfDic = (int)dictionary[word];
                            double TF = System.Convert.ToDouble((int)docWord[word])/System.Convert.ToDouble(sumWordCount);
                            double IDF = (double)idfTable[word];
                            featureV[indexOfDic] = TF * IDF;
                        }
                    }

                    testResult resultTemp = new testResult();
                    resultTemp.docName = Path.GetFileName(file_names[j]);
                    resultTemp.oriClass = i;
                    resultTemp.resultClass = knn.Compute(featureV);
                    result.Add(resultTemp);
                    Console.WriteLine(resultTemp.resultClass);
                }//file end
            }//category end
            return result;
        }

        /// <summary>
        /// Tokenizes <paramref name="text"/>, stems each token and adds
        /// <paramref name="weight"/> to its count in <paramref name="docWord"/>.
        /// Numeric tokens, tokens that stem to the empty string and stop words are skipped.
        /// </summary>
        /// <returns>The total weight added, i.e. <paramref name="weight"/> times the number of tokens kept.</returns>
        private static int AccumulateTestTokens(string text, int weight, Stemmer stemmer, Hashtable docWord)
        {
            int added = 0;
            foreach (string rawToken in Regex.Split(text, @"[^A-Za-z0-9_-]"))
            {
                String word = rawToken.ToLower().Trim(new Char[] { '_', '-' });
                double ignored;
                if (double.TryParse(word, out ignored))
                {
                    continue;
                }
                stemmer.add(word.ToCharArray(), word.Length);
                stemmer.stem();
                word = stemmer.ToString();
                if (word.Length == 0 || stopWordTable.ContainsKey(word))
                {
                    continue;
                }
                // word preprocess done
                added += weight;
                int previous = docWord.ContainsKey(word) ? (int)docWord[word] : 0;
                docWord[word] = previous + weight;
            }
            return added;
        }

Example #2

Show file

File: Stemmer.cs Project: TripleEmcoder/Archive

        /** Test program for demonstrating the Stemmer.  It reads text from
         * a list of files, stems each word, and writes the result to standard
         * output. Note that the word stemmed is expected to be in lower case:
         * forcing lower case must be done outside the Stemmer class.
         * Non-letter bytes are echoed to the output unchanged.
         * Processing stops at the first file that is missing or cannot be read.
         * Usage: Stemmer file-name file-name ...
         */
        public static void Main(String[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("Usage:  Stemmer <input file>");
                return;
            }
            /* Word buffer: words longer than 500 letters keep overwriting slot 500. */
            char[]  w = new char[501];
            Stemmer s = new Stemmer();

            for (int i = 0; i < args.Length; i++)
            {
                try
                {
                    /* 'using' closes the stream on every exit path, including the
                     * 'break' statements below and any exception. */
                    using (FileStream _in = new FileStream(args[i], FileMode.Open, FileAccess.Read))
                    {
                        try
                        {
                            while (true)
                            {
                                /* ReadByte returns -1 at end of stream. */
                                int ch = _in.ReadByte();
                                if (Char.IsLetter((char)ch))
                                {
                                    int j = 0;
                                    while (true)
                                    {
                                        ch   = Char.ToLower((char)ch);
                                        w[j] = (char)ch;
                                        if (j < 500)
                                        {
                                            j++;
                                        }
                                        ch = _in.ReadByte();
                                        if (!Char.IsLetter((char)ch))
                                        {
                                            /* to test add(char ch) */
                                            for (int c = 0; c < j; c++)
                                            {
                                                s.add(w[c]);
                                            }
                                            /* or, to test add(char[] w, int j) */
                                            /* s.add(w, j); */
                                            s.stem();

                                            String u;

                                            /* and now, to test toString() : */
                                            u = s.ToString();

                                            /* to test getResultBuffer(), getResultLength() : */
                                            /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */

                                            Console.Write(u);
                                            break;
                                        }
                                    }
                                }
                                if (ch < 0)
                                {
                                    break;
                                }
                                /* Echo the non-letter byte that ended the word (or stood alone). */
                                Console.Write((char)ch);
                            }
                        }
                        catch (IOException)
                        {
                            Console.WriteLine("error reading " + args[i]);
                            break;
                        }
                    }
                }
                catch (FileNotFoundException)
                {
                    Console.WriteLine("file " + args[i] + " not found");
                    break;
                }
            }
        }

Example #3

Show file

File: Program.cs Project: sweslo17/classification

        /// <summary>
        /// Reads one training file, updates the word statistics with its contents
        /// and returns the weighted number of words counted in it.
        /// </summary>
        /// <remarks>
        /// Side effects visible in this method:
        /// <paramref name="filePath"/> is appended to <c>fileList</c>;
        /// for every header line, <c>columnCountTable[columnName]</c> is set to
        /// <paramref name="filePath"/>; <paramref name="wordInCategory"/> and
        /// <c>wordCountTable</c> are updated per word; and the per-document word
        /// counts are appended to <c>docFeatureTable</c>.
        /// The file is read in two phases from the same reader: header lines (lines
        /// containing ": ") come first, and of those only "subject" is tokenized, each
        /// token weighted by <c>subjectWeight</c>. The first line without ": " ends
        /// the header phase and is itself discarded. All remaining lines are body text
        /// with weight 1; lines starting with "&gt;" or "|&gt;" are skipped.
        /// Handling for the "newsgroups", "keywords" and "from" headers used to exist
        /// here as commented-out code and is intentionally not performed.
        /// </remarks>
        /// <param name="filePath">Path of the training file to read.</param>
        /// <param name="wordInCategory">Stemmed word -> weighted (int) count; updated in place.</param>
        /// <returns>Sum of the weights of all words that were counted.</returns>
        public static int ProcessTraining(string filePath, Hashtable wordInCategory)
        {
            fileList.Add(filePath);
            int counter = 0;
            // stemmed word -> weighted occurrence count within this file
            Hashtable docWord = new Hashtable();
            Stemmer stemmer = new Stemmer();
            // NOTE(review): stem() on a stemmer with nothing added looks like a
            // no-op; kept because Stemmer is not visible here - confirm and remove.
            stemmer.stem();

            // 'using' guarantees the file handle is released even if parsing throws.
            using (System.IO.StreamReader file = new System.IO.StreamReader(filePath))
            {
                String line;

                /******Structured Column*****/
                while ((line = file.ReadLine()) != null)
                {
                    if (!line.Contains(": "))
                    {
                        // End of the header section; this line is consumed and dropped.
                        break;
                    }
                    string[] splitPart = line.Split(new string[] { ": " }, StringSplitOptions.None);
                    string columnName = splitPart[0].Trim();
                    columnCountTable[columnName] = filePath;
                    // Only the text after the LAST ": " is used as the header content.
                    string content = splitPart[splitPart.Length-1];
                    if (columnName.ToLower() == "subject")
                    {
                        counter += AccumulateTrainingTokens(content, subjectWeight, stemmer, wordInCategory, docWord);
                    }
                }

                /******Text******/
                while ((line = file.ReadLine()) != null)
                {
                    if (line.StartsWith(">") || line.StartsWith("|>"))
                    {
                        continue;
                    }
                    counter += AccumulateTrainingTokens(line, 1, stemmer, wordInCategory, docWord);
                }
            }

            docFeatureTable.Add(docWord);
            return counter;
        }

        /// <summary>
        /// Tokenizes <paramref name="text"/>, stems each token and adds
        /// <paramref name="weight"/> to its counts in <paramref name="wordInCategory"/>,
        /// <c>wordCountTable</c> and <paramref name="docWord"/>. The word's DF in
        /// <c>wordCountTable</c> goes up by one the first time the word is seen in
        /// this document. Numeric tokens, tokens that stem to the empty string and
        /// stop words are skipped.
        /// </summary>
        /// <returns>The total weight added, i.e. <paramref name="weight"/> times the number of tokens kept.</returns>
        private static int AccumulateTrainingTokens(string text, int weight, Stemmer stemmer, Hashtable wordInCategory, Hashtable docWord)
        {
            int added = 0;
            foreach (string rawToken in Regex.Split(text, @"[^A-Za-z0-9_-]"))
            {
                String word = rawToken.ToLower().Trim(new Char[] { '_', '-' });
                double ignored;
                if (double.TryParse(word, out ignored))
                {
                    continue;
                }
                stemmer.add(word.ToCharArray(), word.Length);
                stemmer.stem();
                word = stemmer.ToString();
                if (word.Length == 0 || stopWordTable.ContainsKey(word))
                {
                    continue;
                }
                // word preprocess done
                added += weight;

                int categoryCount = wordInCategory.ContainsKey(word) ? (int)wordInCategory[word] : 0;
                wordInCategory[word] = categoryCount + weight;

                // Must be evaluated before docWord is updated below.
                bool firstInDocument = !docWord.ContainsKey(word);

                wordCount totals;
                if (wordCountTable.ContainsKey(word)) // word already appeared
                {
                    totals = (wordCount)wordCountTable[word];
                    totals.count += weight;
                    if (firstInDocument) // add DF
                    {
                        totals.DF += 1;
                    }
                }
                else
                {
                    totals.count = weight;
                    totals.DF = 1;
                }
                // Written back explicitly: the value read from the table is a copy.
                wordCountTable[word] = totals;

                int documentCount = firstInDocument ? 0 : (int)docWord[word];
                docWord[word] = documentCount + weight;
            }
            return added;
        }