// Matches every character that is not an ASCII letter or digit. Built once and reused,
// because constructing a Regex on every call re-parses the pattern each time.
private static readonly Regex NonAlphanumericPattern = new Regex("[^a-zA-Z0-9]");

// Matches a token whose first character is a word character but not a digit,
// followed by any run of word characters, apostrophes or hyphens.
private static readonly Regex WordPattern = new Regex(@"\w(?<!\d)[\w'-]*");

/// <summary>
/// Gets a count for each distinct (stemmed, lower-cased) word in a string.
/// </summary>
/// <param name="str">The string that will be broken into a distinct word count.</param>
/// <returns>
/// A dictionary mapping each stemmed word to its number of occurrences. Words found in
/// <c>StopWords</c> and words that parse as a number are omitted. An empty dictionary is
/// returned when <paramref name="str"/> is null or empty.
/// </returns>
public static Dictionary<string, double> GetWordCount(this string str)
{
    // Nothing to count: hand back an empty dictionary rather than failing.
    if (String.IsNullOrEmpty(str))
    {
        return new Dictionary<string, double>();
    }

    // Porter stemmer used to collapse word variants onto a common stem so that
    // variants of the same word are counted together.
    var stemmer = new PorterStemmerAlgorithm.PorterStemmer();

    // Dummy output for Double.TryParse, which is only used to detect numeric tokens.
    double num;

    // Replace everything except ASCII letters and digits (including '_') with a space.
    string cleaned = NonAlphanumericPattern.Replace(str, " ");

    return WordPattern.Matches(cleaned)
        .Cast<Match>()
        // The text is pure ASCII at this point, so lower-case with the invariant culture;
        // a culture-sensitive ToLower() would map 'I' to a dotless i under Turkish cultures.
        .Select(m => stemmer.stemTerm(m.Value.ToLowerInvariant()))
        // Group identical stems and count the occurrences of each.
        .GroupBy(p => p)
        .Select(g => new { Word = g.Key, Count = g.Count() })
        // Drop stop words (compared against the stemmed form) and purely numeric tokens
        // before sorting, so the sort only handles the entries that are kept.
        .Where(p => !StopWords.ContainsKey(p.Word) && !Double.TryParse(p.Word, out num))
        // Insert in alphabetical order (not required for correctness).
        .OrderBy(p => p.Word)
        .ToDictionary(p => p.Word, p => Convert.ToDouble(p.Count));
}
/// <summary>
/// Creates a TermProcessor with all of its working state freshly initialised.
/// </summary>
public TermProcessor()
{
    // Delimiter set comes from the getDelims() helper.
    delims = getDelims();
    sList = new StopList();
    sb = new StringBuilder();
    // Both counters start from zero.
    docId = 0;
    termInstanceCounter = 0;
    stemmer = new PorterStemmerAlgorithm.PorterStemmer();
    // Starts empty; presumably filled with the terms encountered during processing - TODO confirm.
    terms = new Hashtable();
}
/// <summary>
/// Creates a TermDocsProcessor with all of its working state freshly initialised.
/// </summary>
public TermDocsProcessor()
{
    // termLoader must be assigned first: its TermCount sizes the termDocs array below.
    termLoader = new TermLoader();
    // One slot per term reported by the loader; every slot is null until something assigns it.
    termDocs = new Hashtable[termLoader.TermCount];
    // Delimiter set comes from the getDelims() helper.
    delims = getDelims();
    sList = new StopList();
    sb = new StringBuilder();
    docId = 0;
    stemmer = new PorterStemmerAlgorithm.PorterStemmer();
}
/// <summary>
/// Runs every token through a Porter stemmer.
/// </summary>
/// <param name="tokens">An array of lowercase tokens</param>
/// <returns>An array of stems, one per input token and in the same order</returns>
private string[] StemTokens(string[] tokens)
{
    var porter = new PorterStemmerAlgorithm.PorterStemmer();

    // Project each token onto its stem; the projection preserves input order and length.
    return tokens.Select(token => porter.stemTerm(token)).ToArray();
}
/// <summary>
/// Sets up the Lucene-related members: loads the stop-word file, then creates the analyzer,
/// query parser, similarity and Porter stemmer, and finally loads WordNet.
/// </summary>
private void InitLucene()
{
    // Analyzers tried previously, kept for reference:
    //analyzer = new Lucene.Net.Analysis.WhitespaceAnalyzer();
    //analyzer = new Lucene.Net.Analysis.SimpleAnalyzer(); // Activity 5
    //analyzer = new Lucene.Net.Analysis.StopAnalyzer(); // Activity 5
    //analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30); // Activity 5
    //analyzer = new Lucene.Net.Analysis.Snowball.SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, "English"); // Activity 7

    ISet<string> stopWordSet = new HashSet<string>();

    try
    {
        // The using block closes the reader even if reading fails part-way through.
        using (StreamReader reader = new StreamReader(@"../../StopWords.txt"))
        {
            // Each line of the file is added to the set as one stop word.
            for (string entry = reader.ReadLine(); entry != null; entry = reader.ReadLine())
            {
                stopWordSet.Add(entry);
            }
        }
    }
    catch (Exception readError)
    {
        // Best effort: report the problem and carry on with whatever was loaded so far.
        Console.WriteLine("The file could not be read:");
        Console.WriteLine(readError.Message);
    }

    analyzer = new HjsStandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30, stopWordSet);

    // Parsers tried previously, kept for reference:
    //parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, TEXT_FN, analyzer);
    //parser = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_30, FIELDS, analyzer, BOOSTING);
    parser = CreateQueryParser();

    similarity = new HjsSimilarity();
    porterStemmer = new PorterStemmerAlgorithm.PorterStemmer();

    // WordNet Load
    LoadWordNet();
}
/// <summary>
/// Pre-processes text in three steps: tokenization, stop-word removal and stemming.
/// </summary>
/// <param name="text">The raw text to process. Null or empty yields an empty string.</param>
/// <returns>
/// The stems of the surviving tokens, each followed by a single space
/// (so a non-empty result ends with a trailing space).
/// </returns>
public string preprocessing(string text)
{
    // The stemmer is stored in a field; it is (re)created on every call exactly as before,
    // in case other members read it afterwards.
    myStemmer = new PorterStemmerAlgorithm.PorterStemmer();

    // Previously a null argument crashed with a NullReferenceException on text.ToLower().
    if (string.IsNullOrEmpty(text))
    {
        return "";
    }

    string[] separators = { ",", ".", "!", "?", ";", ":", "-", " ", "\n", "\"", "'" };
    string[] query_token = text.ToLower().Split(separators, StringSplitOptions.RemoveEmptyEntries);

    // Build the output in a StringBuilder: repeated string concatenation copies the whole
    // accumulated result on every iteration.
    var processed = new System.Text.StringBuilder();

    foreach (string token in query_token)
    {
        // Skip stop words and tokens of one or two characters.
        if (stopWords.Contains(token) || token.Length <= 2)
        {
            continue;
        }

        processed.Append(myStemmer.stemTerm(token)).Append(" ");
    }

    return processed.ToString();
}
"with", "would", "yet", "you", "your" }; //list of stopwords

/// <summary>
/// Creates a fresh Porter stemmer and an empty token-count dictionary.
/// </summary>
/// <remarks>
/// NOTE(review): this is declared with a void return type, so it is an ordinary method
/// that must be called explicitly, not a constructor - confirm that this is intended.
/// </remarks>
public void TextAnalyser()
{
    myStemmer = new PorterStemmerAlgorithm.PorterStemmer();
    // Keyed by token text with an integer count; starts empty.
    tokenCount = new Dictionary <string, int>();
}
/// <summary>
/// Console entry point that exercises the components one after another: the aggregate
/// tester, the performance calculator, the document/category loaders, the index, the
/// vector-space search, the index builder, the Porter stemmer and the term-file processors.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Aggregate tester with its fixed parameter set.
    AggregateTester aggregateTester = new AggregateTester(0, 10, 1, 10, 100, 10, 5, 10, 1);
    aggregateTester.Run();

    // Performance figures for a small hand-made result set versus a relevant set.
    Hashtable retrieved = new Hashtable { { 1, true }, { 2, true }, { 3, true } };
    Hashtable expected = new Hashtable { { 1, true }, { 3, true }, { 5, true }, { 7, true }, { 8, true } };

    d.PerformanceCalculator calculator = new d.PerformanceCalculator(retrieved, expected);
    Console.WriteLine("Precision = " + calculator.Precision);
    Console.WriteLine("Recall = " + calculator.Recall);
    Console.WriteLine("FMeasure = " + calculator.FMeasure);

    // Print the categories of document 1.
    d.DocsLoader docsLoader = new d.DocsLoader();
    d.CatsLoader catsLoader = new d.CatsLoader();
    d.DocCatsLoader docCatsLoader = new d.DocCatsLoader(catsLoader);

    int sampleDocId = 1;
    ArrayList categoryIds = docCatsLoader.GetDocCategories(sampleDocId);
    Console.WriteLine(docsLoader.GetDocTitle(sampleDocId) + " has " + categoryIds.Count + " categories: ");
    foreach (int categoryId in categoryIds)
    {
        Console.WriteLine(" " + catsLoader.GetCategory(categoryId));
    }

    // Open the index and fetch the terms of document 0 (the result is not used further).
    d.Index docIndex = new d.Index(Helper.INDEX_PATH);
    d.DocTermItem[] docTerms = docIndex.DocTerms(0);

    // Vector-space search over the index.
    SearchVS searcher = new SearchVS(Helper.INDEX_PATH);
    searcher.run();

    // Build the index from the source data.
    i.DataLoader dataLoader = new i.DataLoader(Helper.SOURCE_PATH);
    i.IndexBuilder indexBuilder = new i.IndexBuilder(dataLoader, Helper.INDEX_PATH);
    indexBuilder.BuildIndex();

    // Stem a sample word.
    PorterStemmerAlgorithm.PorterStemmer porter = new PorterStemmerAlgorithm.PorterStemmer();
    Console.WriteLine(porter.stemTerm("beautify"));

    // Generate the term files.
    TermFilter termFilter = new TermFilter();
    termFilter.CreateNewTermsFile();

    TermProcessor termProcessor = new TermProcessor();
    termProcessor.CreateTermsFile();

    TermDocsProcessor termDocsProcessor = new TermDocsProcessor();
    termDocsProcessor.CreateTermDocsFile();
    // NOTE(review): CreateTermDocsFile is invoked a second time here - confirm that the
    // repeated call is intentional.
    termDocsProcessor.CreateTermDocsFile();
}
/// <summary>
/// Stems a single word with a newly created Porter stemmer.
/// </summary>
/// <param name="Word">The word to stem; passed to the stemmer unchanged.</param>
/// <returns>The value produced by the stemmer's <c>stemTerm</c> for <paramref name="Word"/>.</returns>
public static string GetStem(string Word)
{
    return new PorterStemmerAlgorithm.PorterStemmer().stemTerm(Word);
}
/// <summary>
/// Creates a searcher over the given index.
/// </summary>
/// <param name="index">The index this searcher stores and works against.</param>
public VSSearcher(Index index)
{
    ps = new PorterStemmerAlgorithm.PorterStemmer();
    this.index = index;
    // w starts at 1.0; presumably a weighting factor - TODO confirm against its usages.
    this.w = 1.0f;
}