Example #1
0
        /// <summary>
        /// Splits a document's text on <c>delims</c>, stems every piece and records the
        /// current <c>docId</c> against each accepted term in the <c>terms</c> table.
        /// </summary>
        /// <param name="text">Text of the document being processed.</param>
        private void processDoc(string text)
        {
            // Progress message whenever docId is a multiple of 100.
            if (docId % 100 == 0)
            {
                Console.WriteLine("processing doc #" + docId);
            }

            temp = text.Split(delims);

            foreach (string rawToken in temp)
            {
                string term = stemmer.stemTerm(rawToken.ToLower().Trim());

                // Skip empty stems, entries found in sList and terms rejected by isAsciiLetters.
                if (term.Length == 0 || sList.Contains(term) || !isAsciiLetters(term))
                {
                    continue;
                }

                termInstanceCounter++;

                // Fetch the per-term document table, creating it on first sight of the term.
                Hashtable postings;
                if (terms.Contains(term))
                {
                    postings = (Hashtable)terms[term];
                }
                else
                {
                    postings = new Hashtable();
                    terms.Add(term, postings);
                }

                // Each document id is stored at most once per term.
                if (!postings.Contains(docId))
                {
                    postings.Add(docId, true);
                }
            }
        }
Example #2
0
        /// <summary>
        /// Builds an occurrence count for every distinct word of a string. Words are
        /// lower-cased and passed through the Porter stemmer before being counted.
        /// </summary>
        /// <param name="str">The text to break into distinct, stemmed words.</param>
        /// <returns>
        /// A dictionary mapping each stemmed word to its number of occurrences; an empty
        /// dictionary when <paramref name="str"/> is null or empty.
        /// </returns>
        public static Dictionary <string, double> GetWordCount(this string str)
        {
            var wordCounts = new Dictionary <string, double>();

            // Nothing to count for a null or empty input.
            if (String.IsNullOrEmpty(str))
            {
                return(wordCounts);
            }

            var stemmer = new PorterStemmerAlgorithm.PorterStemmer();

            // Every character that is not an ASCII letter or digit (including '_') becomes a space.
            string cleaned = Regex.Replace(str, "[^a-zA-Z0-9]", " ");

            // A token starts at a word character that is not a digit and continues over
            // word characters, apostrophes and hyphens. Tokens are lower-cased, stemmed,
            // grouped by stem and visited in word order.
            var stemGroups = Regex.Matches(cleaned, @"\w(?<!\d)[\w'-]*")
                             .Cast <Match>()
                             .Select(m => stemmer.stemTerm(m.Value.ToLower()))
                             .GroupBy(stem => stem)
                             .OrderBy(g => g.Key);

            foreach (var stemGroup in stemGroups)
            {
                // Throwaway target for Double.TryParse; only the success flag matters.
                Double parsed;

                // Leave out stop words and anything that parses as a number.
                if (StopWords.ContainsKey(stemGroup.Key) || Double.TryParse(stemGroup.Key, out parsed))
                {
                    continue;
                }

                wordCounts.Add(stemGroup.Key, Convert.ToDouble(stemGroup.Count()));
            }

            return(wordCounts);
        }
Example #3
0
        /// <summary>
        /// Runs every token through <c>myStemmer</c>, preserving the input order.
        /// </summary>
        /// <param name="tokens">An array of lowercase tokens</param>
        /// <returns>A new array holding the stem of each token at the same position</returns>
        public string[] StemTokens(string[] tokens)
        {
            return(tokens.Select(token => myStemmer.stemTerm(token)).ToArray());
        }
Example #4
0
 /// <summary>
 /// Stems each token with <c>myStemmer</c> and writes the token before and after
 /// stemming to the console.
 /// </summary>
 /// <param name="tokens">The tokens to stem.</param>
 /// <returns>A new array holding the stem of each token at the same position.</returns>
 public string[] StemTokens(string[] tokens)
 {
     string[] stemmedToken = new string[tokens.Length];
     int position = 0;

     foreach (string original in tokens)
     {
         Console.WriteLine("Originally: " + original);

         string stem = myStemmer.stemTerm(original);
         stemmedToken[position] = stem;
         position++;

         Console.WriteLine("After stemmed: " + stem + "\n");
     }
     return(stemmedToken);
 }
Example #5
0
        /// <summary>
        /// Stems an array of tokens using a freshly created Porter stemmer.
        /// </summary>
        /// <param name="tokens">An array of lowercase tokens</param>
        /// <returns>A new array holding the stem of each token at the same position</returns>
        private string[] StemTokens(string[] tokens)
        {
            var porter = new PorterStemmerAlgorithm.PorterStemmer();
            string[] stems = new string[tokens.Count()];

            // Walk the tokens in order, writing each stem into the matching slot.
            int slot = 0;
            foreach (string token in tokens)
            {
                stems[slot] = porter.stemTerm(token);
                slot++;
            }
            return(stems);
        }
Example #6
0
        // Made public ONLY for testing purposes!
        // Lower-cases the query, splits it on single spaces, stems every piece and
        // keeps only the stems for which index.HasTerm returns true.
        public ArrayList MakeQuery(string query)
        {
            char[]    spaceOnly  = { ' ' };
            ArrayList queryTerms = new ArrayList();

            foreach (string word in query.ToLower().Split(spaceOnly))
            {
                string stem = ps.stemTerm(word);

                // Terms the index does not know are dropped from the query.
                if (!index.HasTerm(stem))
                {
                    continue;
                }
                queryTerms.Add(stem);
            }

            return(queryTerms);
        }
Example #7
0
        /// <summary>
        /// Splits a document's text on <c>delims</c>, stems every piece and hands each
        /// term known to <c>termLoader</c> to <c>processTerm</c> together with the
        /// current <c>docId</c>.
        /// </summary>
        /// <param name="text">Text of the document being processed.</param>
        private void processDoc(string text)
        {
            // Progress message whenever docId is a multiple of 100.
            bool reportProgress = docId % 100 == 0;
            if (reportProgress)
            {
                Console.WriteLine("processing doc #" + docId);
            }

            temp = text.Split(delims);

            int position = 0;
            while (position < temp.Length)
            {
                string term = stemmer.stemTerm(temp[position].ToLower().Trim());
                position++;

                // Only terms termLoader recognises are passed on.
                if (termLoader.HasTerm(term))
                {
                    processTerm(termLoader.GetTermId(term), docId);
                }
            }
        }
Example #8
0
        /// <summary>
        /// Optional preprocessing step: lower-cases and tokenises the text, removes
        /// stop words and tokens of two characters or fewer, and stems what is left.
        /// </summary>
        /// <param name="text">Raw text to preprocess; null is treated like an empty string.</param>
        /// <returns>
        /// The stems of the surviving tokens in their original order, each followed by a
        /// single space (so a non-empty result ends with a trailing space); an empty
        /// string when no token survives.
        /// </returns>
        public string preprocessing(string text)
        {
            // The stemmer field is replaced on every call, as before.
            myStemmer = new PorterStemmerAlgorithm.PorterStemmer();

            if (string.IsNullOrEmpty(text))
            {
                return("");
            }

            string[] separators  = { ",", ".", "!", "?", ";", ":", "-", " ", "\n", "\"", "'" };
            string[] query_token = text.ToLower().Split(separators, StringSplitOptions.RemoveEmptyEntries);

            // A builder avoids re-copying the growing result string for every token.
            // Fully qualified so no additional using directive is required.
            System.Text.StringBuilder processed = new System.Text.StringBuilder();

            foreach (string token in query_token)
            {
                // Keep tokens that are not stop words and are longer than two characters.
                if (!stopWords.Contains(token) && (token.Length > 2))
                {
                    processed.Append(myStemmer.stemTerm(token)).Append(' ');
                }
            }
            return(processed.ToString());
        }
Example #9
0
        /// <summary>
        /// Ad-hoc driver that exercises the project's components one after another:
        /// the aggregate tester, the performance calculator, the document/category
        /// loaders, the index, the search, the index builder, the stemmer and the
        /// term-file processors.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            AggregateTester tester = new AggregateTester(0, 10, 1, 10, 100, 10, 5, 10, 1);
            tester.Run();

            // Two hand-built id sets handed to the performance calculator.
            Hashtable retrieved = new Hashtable { { 1, true }, { 2, true }, { 3, true } };
            Hashtable relevant  = new Hashtable { { 1, true }, { 3, true }, { 5, true }, { 7, true }, { 8, true } };

            var calculator = new d.PerformanceCalculator(retrieved, relevant);
            Console.WriteLine("Precision = " + calculator.Precision);
            Console.WriteLine("Recall = " + calculator.Recall);
            Console.WriteLine("FMeasure = " + calculator.FMeasure);

            // List the categories of document 1.
            var       docs          = new d.DocsLoader();
            var       categories    = new d.CatsLoader();
            var       docCategories = new d.DocCatsLoader(categories);
            int       docId         = 1;
            ArrayList categoryIds   = docCategories.GetDocCategories(docId);

            Console.WriteLine(docs.GetDocTitle(docId) + " has " + categoryIds.Count + " categories: ");
            foreach (int catId in categoryIds)
            {
                Console.WriteLine("  " + categories.GetCategory(catId));
            }

            // The returned terms are not used afterwards; the call itself is kept.
            d.Index         index         = new d.Index(Helper.INDEX_PATH);
            d.DocTermItem[] firstDocTerms = index.DocTerms(0);

            new SearchVS(Helper.INDEX_PATH).run();

            var source = new i.DataLoader(Helper.SOURCE_PATH);
            new i.IndexBuilder(source, Helper.INDEX_PATH).BuildIndex();

            Console.WriteLine(new PorterStemmerAlgorithm.PorterStemmer().stemTerm("beautify"));

            new TermFilter().CreateNewTermsFile();
            new TermProcessor().CreateTermsFile();

            // NOTE(review): CreateTermDocsFile is invoked twice on the same object, exactly
            // as before; confirm whether the second call is intentional.
            TermDocsProcessor termDocs = new TermDocsProcessor();
            termDocs.CreateTermDocsFile();
            termDocs.CreateTermDocsFile();
        }
Example #10
0
 /// <summary>
 /// Splits the input into tokens with <c>TokeniseString</c> and stems each token
 /// individually with <c>myStemmer</c>.
 /// </summary>
 /// <remarks>
 /// Previously the whole, untokenised string was handed to <c>stemTerm</c> as if it
 /// were one term and only the result was tokenised, so the individual tokens were
 /// never stemmed. Tokenising first lets every token be stemmed on its own.
 /// </remarks>
 /// <param name="str">The text to tokenise and stem.</param>
 /// <returns>The stem of each token, in token order.</returns>
 public string[] stemTokens (string str){
     String[] tokens = TokeniseString(str);

     // Write into a fresh array so the tokeniser's result is left untouched.
     String[] resultToken = new String[tokens.Length];
     for (int i = 0; i < tokens.Length; i++){
         resultToken[i] = myStemmer.stemTerm(tokens[i]);
     }
     return resultToken;
 }
Example #11
0
        /// <summary>
        /// Stems a single word with a newly created Porter stemmer.
        /// </summary>
        /// <param name="Word">The word to stem.</param>
        /// <returns>The value returned by <c>PorterStemmer.stemTerm</c> for the word.</returns>
        public static string GetStem(string Word)
        {
            return new PorterStemmerAlgorithm.PorterStemmer().stemTerm(Word);
        }