Ejemplo n.º 1
0
        /// <summary>
        /// Builds a per-word occurrence count for a string. The text is split into words,
        /// each word is lower-cased (culture-invariantly) and passed through the Porter
        /// stemmer, and entries that are stop words or that parse as a number are dropped.
        /// </summary>
        /// <param name="str">The text to analyse; may be null or empty.</param>
        /// <returns>
        /// A dictionary mapping each remaining stemmed word to the number of times it occurred
        /// (stored as a double). The dictionary is empty when <paramref name="str"/> is null or empty.
        /// </returns>
        public static Dictionary <string, double> GetWordCount(this string str)
        {
            // Nothing to count: hand back an empty dictionary rather than failing.
            if (String.IsNullOrEmpty(str))
            {
                return(new Dictionary <string, double>());
            }

            // Stemming folds inflected forms of a word together so they are counted as one entry.
            var stemmer = new PorterStemmerAlgorithm.PorterStemmer();

            // Throwaway target for Double.TryParse, which is used purely as an "is this a number?" test.
            Double num;

            // Every character that is not an ASCII letter or digit (this includes '_') becomes a separator.
            // The static Regex helpers are used so the parsed pattern is served from the Regex cache
            // instead of being rebuilt on every call.
            str = Regex.Replace(str, "[^a-zA-Z0-9]", " ");

            // A word starts with a non-digit word character and continues with word characters, ' or -.
            return(Regex.Matches(str, @"\w(?<!\d)[\w'-]*")
                   .Cast <Match>()
                   // Lower-case with the invariant culture: a culture-sensitive ToLower() maps "I" to a
                   // dotless "ı" under Turkish-style casing rules, which would break stop-word matching.
                   .Select(m => stemmer.stemTerm(m.Value.ToLowerInvariant()))
                   // One group per distinct stem; the group size is the occurrence count.
                   .GroupBy(p => p)
                   .Select(g => new { Word = g.Key, Count = g.Count() })
                   // Filter before sorting so that discarded entries are never sorted. Numbers are
                   // recognised with the invariant culture so the result does not vary by machine locale.
                   .Where(p => !StopWords.ContainsKey(p.Word) &&
                               !Double.TryParse(p.Word,
                                                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                                                System.Globalization.CultureInfo.InvariantCulture,
                                                out num))
                   // Entries are inserted in word order (not required by the dictionary itself).
                   .OrderBy(p => p.Word)
                   .ToDictionary(p => p.Word, p => Convert.ToDouble(p.Count)));
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Initialises a term processor: delimiters from getDelims(), a new stop list,
 /// a Porter stemmer, an empty term table, an empty string buffer and zeroed counters.
 /// </summary>
 public TermProcessor()
 {
     // Project-specific collaborators.
     delims  = getDelims();
     sList   = new StopList();
     stemmer = new PorterStemmerAlgorithm.PorterStemmer();

     // Framework containers, both starting out empty.
     sb    = new StringBuilder();
     terms = new Hashtable();

     // Counters start from zero.
     docId = 0;
     termInstanceCounter = 0;
 }
Ejemplo n.º 3
0
 /// <summary>
 /// Initialises a term-docs processor. A TermLoader is created first because its
 /// TermCount determines the length of the termDocs array; the remaining fields
 /// (delimiters, stop list, stemmer, string buffer, document id) are then set up.
 /// </summary>
 public TermDocsProcessor()
 {
     // The loader must exist before the table can be sized.
     termLoader = new TermLoader();
     var termCount = termLoader.TermCount;
     termDocs = new Hashtable[termCount];

     // Project-specific collaborators.
     delims  = getDelims();
     sList   = new StopList();
     stemmer = new PorterStemmerAlgorithm.PorterStemmer();

     // Scratch buffer and document counter.
     sb    = new StringBuilder();
     docId = 0;
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Runs every token through a freshly created Porter stemmer.
        /// </summary>
        /// <param name="tokens">The tokens to stem; callers are expected to pass them lower-cased.</param>
        /// <returns>A new array holding the stem of each token, in the same order as the input.</returns>
        private string[] StemTokens(string[] tokens)
        {
            var stemmer = new PorterStemmerAlgorithm.PorterStemmer();

            // Project each token onto its stem; ToArray materialises the result in input order.
            return(tokens.Select(token => stemmer.stemTerm(token)).ToArray());
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Sets up the search components held by this class: the analyzer (with stop words
        /// read from ../../StopWords.txt), the query parser, the similarity, the Porter
        /// stemmer and the WordNet data.
        /// </summary>
        private void InitLucene()
        {
            // Analyzers tried earlier and since replaced by HjsStandardAnalyzer:
            // WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer and StandardAnalyzer (Activity 5),
            // SnowballAnalyzer with "English" (Activity 7).
            ISet <string> stopWordSet = new HashSet <string>();

            try
            {
                // Each line of the file is one stop word. File.ReadLines streams the file
                // and the foreach closes it once enumeration ends.
                foreach (string entry in File.ReadLines(@"../../StopWords.txt"))
                {
                    stopWordSet.Add(entry);
                }
            }
            catch (Exception e)
            {
                // Best effort: report the problem and carry on with whatever was read so far.
                Console.WriteLine("The file could not be read:");
                Console.WriteLine(e.Message);
            }

            analyzer = new HjsStandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30, stopWordSet);

            // Parsers tried earlier: a QueryParser over TEXT_FN and a MultiFieldQueryParser
            // over FIELDS with BOOSTING; the parser now comes from CreateQueryParser().
            parser = CreateQueryParser();

            similarity    = new HjsSimilarity();
            porterStemmer = new PorterStemmerAlgorithm.PorterStemmer();

            // Load WordNet last.
            LoadWordNet();
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Optional pre-processing of a piece of text: tokenization, stop-word removal
        /// (tokens of one or two characters are dropped as well) and stemming.
        /// </summary>
        /// <param name="text">The raw text to process; null is treated like empty text.</param>
        /// <returns>
        /// The stems of the surviving tokens in their original order, each followed by a
        /// single space (so a non-empty result ends with a space). Empty when nothing survives.
        /// </returns>
        public string preprocessing(string text)
        {
            // The stemmer field is replaced on every call; do this first so the field is
            // always set, even when there is no text to process.
            myStemmer = new PorterStemmerAlgorithm.PorterStemmer();

            if (string.IsNullOrEmpty(text))
            {
                return("");
            }

            string[] separators = { ",", ".", "!", "?", ";", ":", "-", " ", "\n", "\"", "'" };

            // Lower-case with the invariant culture so that tokens compare against the stop
            // words identically on every machine (culture-sensitive casing can map "I" to "ı").
            string[] tokens = text.ToLowerInvariant().Split(separators, StringSplitOptions.RemoveEmptyEntries);

            // A builder avoids re-copying the whole result for every appended stem.
            var processed = new System.Text.StringBuilder();

            foreach (string token in tokens)
            {
                // Skip stop words and very short tokens.
                if (stopWords.Contains(token) || token.Length <= 2)
                {
                    continue;
                }

                processed.Append(myStemmer.stemTerm(token)).Append(' ');
            }

            return(processed.ToString());
        }
Ejemplo n.º 7
0
                                      "with",      "would",  "yet",    "you",      "your" }; //list of stopwords

        /// <summary>
        /// Replaces the analyser's working state with an empty token-count table and a
        /// new Porter stemmer.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the name reads like a constructor, but the void return type makes
        /// this an ordinary method that must be called explicitly — confirm that callers do so.
        /// </remarks>
        public void TextAnalyser()
        {
            tokenCount = new Dictionary <string, int>();
            myStemmer  = new PorterStemmerAlgorithm.PorterStemmer();
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Console driver that runs, one after another: the aggregate tester, a small
        /// precision/recall/F-measure calculation, a category lookup for one document,
        /// an index read, a vector-space search, an index build, a stemmer smoke test and
        /// the term / term-docs file generation steps.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var aggregateTester = new AggregateTester(0, 10, 1, 10, 100, 10, 5, 10, 1);
            aggregateTester.Run();

            // Performance figures for a fixed example: ids 1, 2, 3 as the result set
            // against ids 1, 3, 5, 7, 8 as the relevant set.
            var result   = new Hashtable { { 1, true }, { 2, true }, { 3, true } };
            var relevant = new Hashtable { { 1, true }, { 3, true }, { 5, true }, { 7, true }, { 8, true } };

            var calculator = new d.PerformanceCalculator(result, relevant);
            Console.WriteLine("Precision = " + calculator.Precision);
            Console.WriteLine("Recall = " + calculator.Recall);
            Console.WriteLine("FMeasure = " + calculator.FMeasure);

            // List the categories attached to document 1.
            var docsLoader    = new d.DocsLoader();
            var catsLoader    = new d.CatsLoader();
            var docCatsLoader = new d.DocCatsLoader(catsLoader);

            int       documentId  = 1;
            ArrayList categoryIds = docCatsLoader.GetDocCategories(documentId);

            Console.WriteLine(docsLoader.GetDocTitle(documentId) + " has " + categoryIds.Count + " categories: ");
            foreach (int categoryId in categoryIds)
            {
                Console.WriteLine("  " + catsLoader.GetCategory(categoryId));
            }

            // Open the index and fetch the terms of document 0; the returned array is not used afterwards.
            var             index    = new d.Index(Helper.INDEX_PATH);
            d.DocTermItem[] docTerms = index.DocTerms(0);

            // Vector-space search over the same index path.
            var searcher = new SearchVS(Helper.INDEX_PATH);
            searcher.run();

            // Build the index from the source data.
            var dataLoader   = new i.DataLoader(Helper.SOURCE_PATH);
            var indexBuilder = new i.IndexBuilder(dataLoader, Helper.INDEX_PATH);
            indexBuilder.BuildIndex();

            // Stemmer smoke test.
            var porterStemmer = new PorterStemmerAlgorithm.PorterStemmer();
            Console.WriteLine(porterStemmer.stemTerm("beautify"));

            // Term file generation.
            var termFilter = new TermFilter();
            termFilter.CreateNewTermsFile();

            var termProcessor = new TermProcessor();
            termProcessor.CreateTermsFile();

            var termDocsProcessor = new TermDocsProcessor();
            termDocsProcessor.CreateTermDocsFile();
            // NOTE(review): the term-docs file is created twice in a row — confirm whether
            // the second call is intentional.
            termDocsProcessor.CreateTermDocsFile();
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Stems a single word with a freshly created Porter stemmer.
        /// </summary>
        /// <param name="Word">The word to stem; it is passed to the stemmer unchanged.</param>
        /// <returns>Whatever the stemmer's stemTerm returns for <paramref name="Word"/>.</returns>
        public static string GetStem(string Word)
        {
            // A new stemmer per call: no state is shared between invocations.
            return new PorterStemmerAlgorithm.PorterStemmer().stemTerm(Word);
        }
Ejemplo n.º 10
0
 /// <summary>
 /// Creates a searcher over the given index, with w set to 1 and its own Porter stemmer.
 /// </summary>
 /// <param name="index">The index this searcher will use; stored as-is.</param>
 public VSSearcher(Index index)
 {
     // Values supplied by the caller or fixed by default.
     this.index = index;
     this.w     = 1f;

     // Stemmer owned by this searcher.
     ps = new PorterStemmerAlgorithm.PorterStemmer();
 }