/// <summary>
/// Tokenizes a document's text, stems each token, and records which
/// documents each surviving term appears in (inverted-index style).
/// Increments termInstanceCounter once per accepted term occurrence.
/// </summary>
/// <param name="text">Raw document text to index.</param>
private void processDoc(string text)
{
    // Lightweight progress indicator for long indexing runs.
    if (docId % 100 == 0)
    {
        Console.WriteLine("processing doc #" + docId);
    }

    // NOTE: temp is a field; the split tokens remain visible after this call.
    temp = text.Split(delims);
    for (int i = 0; i < temp.Length; i++)
    {
        string term = stemmer.stemTerm(temp[i].ToLower().Trim());

        // Guard clause: skip empty tokens, stop-list words, and
        // non-ASCII-letter terms.
        if (term.Length == 0 || sList.Contains(term) || !isAsciiLetters(term))
        {
            continue;
        }

        termInstanceCounter++;

        // Single indexer fetch instead of Contains + indexer (double lookup);
        // a missing key yields null from a Hashtable indexer.
        Hashtable termDocs = (Hashtable)terms[term];
        if (termDocs == null)
        {
            termDocs = new Hashtable();
            terms.Add(term, termDocs);
        }
        if (!termDocs.Contains(docId))
        {
            termDocs.Add(docId, true);
        }
    }
}
/// <summary>
/// Produces a distinct word count for a string: tokens are stemmed with the
/// Porter algorithm, stop words and purely numeric tokens are discarded, and
/// the remaining stems are tallied.
/// </summary>
/// <param name="str">The string that will be broken into a distinct word count.</param>
/// <returns>A distinct word count in the form of a dictionary(word, count).</returns>
public static Dictionary <string, double> GetWordCount(this string str)
{
    // No input - hand back an empty dictionary rather than null.
    if (String.IsNullOrEmpty(str))
    {
        return(new Dictionary <string, double>());
    }

    // Porter stemmer collapses variants (e.g. "lovely" -> "love") so related
    // words are counted together.
    var stemmer = new PorterStemmerAlgorithm.PorterStemmer();

    // Normalize: every character that is not a letter or digit becomes a space.
    Regex rgx = new Regex("[^a-zA-Z0-9]");
    str = rgx.Replace(str, " ");

    // Tally each stemmed, lower-cased token (same token pattern as before:
    // must start with a non-digit word character).
    var tally = new Dictionary <string, int>();
    foreach (Match match in new Regex(@"\w(?<!\d)[\w'-]*").Matches(str))
    {
        string stem = stemmer.stemTerm(match.Value.ToLower());
        int seen;
        tally.TryGetValue(stem, out seen);
        tally[stem] = seen + 1;
    }

    // Drop stop words and tokens that parse as numbers, then emit the counts
    // in alphabetical order of the stem.
    Double num;
    var wordCounts = new Dictionary <string, double>();
    foreach (string word in tally.Keys.OrderBy(w => w))
    {
        if (!StopWords.ContainsKey(word) && !Double.TryParse(word, out num))
        {
            wordCounts.Add(word, Convert.ToDouble(tally[word]));
        }
    }
    return(wordCounts);
}
/// <summary>
/// Stems an array of tokens.
/// </summary>
/// <param name="tokens">An array of lowercase tokens</param>
/// <returns>An array of stems, one per input token</returns>
public string[] StemTokens(string[] tokens)
{
    // Arrays expose Length directly; no need for LINQ's Count() extension.
    int numTokens = tokens.Length;
    string[] stems = new string[numTokens];
    for (int i = 0; i < numTokens; i++)
    {
        stems[i] = myStemmer.stemTerm(tokens[i]);
    }
    return(stems);
}
/// <summary>
/// Stems each token in the given array, echoing the before/after form of
/// every token to the console.
/// </summary>
/// <param name="tokens">Tokens to stem.</param>
/// <returns>A parallel array containing the stem of each token.</returns>
public string[] StemTokens(string[] tokens)
{
    string[] result = new string[tokens.Length];
    for (int index = 0; index < tokens.Length; index++)
    {
        // NOTE(review): the console tracing below looks like leftover debug
        // output - confirm whether callers rely on it before removing.
        Console.WriteLine("Originally: " + tokens[index]);
        result[index] = myStemmer.stemTerm(tokens[index]);
        Console.WriteLine("After stemmed: " + result[index] + "\n");
    }
    return(result);
}
/// <summary>
/// Stems an array of tokens using a freshly constructed Porter stemmer.
/// </summary>
/// <param name="tokens">An array of lowercase tokens</param>
/// <returns>An array of stems, one per input token</returns>
private string[] StemTokens(string[] tokens)
{
    PorterStemmerAlgorithm.PorterStemmer myStemmer = new PorterStemmerAlgorithm.PorterStemmer();

    // Arrays expose Length directly; no need for LINQ's Count() extension.
    int numTokens = tokens.Length;
    string[] stems = new string[numTokens];
    for (int i = 0; i < numTokens; i++)
    {
        stems[i] = myStemmer.stemTerm(tokens[i]);
    }
    return(stems);
}
//made public ONLY for testing purposes!
//removes irrelevant terms from query
/// <summary>
/// Lower-cases and space-splits a raw query, stems each token, and keeps
/// only the stems that exist in the index.
/// </summary>
/// <param name="query">Raw query string.</param>
/// <returns>ArrayList of stemmed query terms present in the index.</returns>
public ArrayList MakeQuery(string query)
{
    char[] delims = { ' ' };
    string[] rawTokens = query.ToLower().Split(delims);

    ArrayList queryTerms = new ArrayList();
    foreach (string raw in rawTokens)
    {
        string stem = ps.stemTerm(raw);
        // Discard terms the index has never seen - they cannot match anything.
        if (index.HasTerm(stem))
        {
            queryTerms.Add(stem);
        }
    }
    return(queryTerms);
}
/// <summary>
/// Tokenizes a document's text, stems each token, and forwards every term
/// known to the term loader on to processTerm for the current document.
/// </summary>
/// <param name="text">Raw document text to process.</param>
private void processDoc(string text)
{
    // Lightweight progress indicator for long runs.
    if (docId % 100 == 0)
    {
        Console.WriteLine("processing doc #" + docId);
    }

    // NOTE: temp is a field; the split tokens remain visible after this call.
    temp = text.Split(delims);
    foreach (string token in temp)
    {
        string term = stemmer.stemTerm(token.ToLower().Trim());
        if (termLoader.HasTerm(term))
        {
            processTerm(termLoader.GetTermId(term), docId);
        }
    }
}
//if user chooses preprocessing, this method will do the following steps:
//tokenization, stop-word removal, and stemming
/// <summary>
/// Preprocesses query text: splits on punctuation/whitespace, lower-cases,
/// drops stop words and tokens of two characters or fewer, then stems the
/// survivors.
/// </summary>
/// <param name="text">Raw query text.</param>
/// <returns>
/// The stemmed tokens joined by single spaces; a trailing space is present
/// whenever at least one token survives filtering (behavior preserved from
/// the original concatenation loop).
/// </returns>
public string preprocessing(string text)
{
    string[] separators = { ",", ".", "!", "?", ";", ":", "-", " ", "\n", "\"", "'" };
    string[] query_token = text.ToLower().Split(separators, StringSplitOptions.RemoveEmptyEntries);
    myStemmer = new PorterStemmerAlgorithm.PorterStemmer();

    // Keep tokens that are not stop words and are longer than two characters.
    List <string> filteredTokens = new List <string>();
    for (int i = 0; i < query_token.Length; i++)
    {
        string token = query_token[i];
        if (!stopWords.Contains(token) && (token.Length > 2))
        {
            filteredTokens.Add(token);
        }
    }

    // Build the result with a StringBuilder instead of O(n^2) string
    // concatenation. (A dead, discarded filteredTokens.ToArray<string>()
    // call from the original was removed here.)
    System.Text.StringBuilder processed = new System.Text.StringBuilder();
    foreach (var word in filteredTokens)
    {
        processed.Append(myStemmer.stemTerm(word)).Append(' ');
    }
    return(processed.ToString());
}
// Ad-hoc smoke-test driver: sequentially exercises the aggregate tester,
// the precision/recall calculator, the data loaders, the index, vector-space
// search, index building, the Porter stemmer, and the term-file generators.
static void Main(string[] args)
{
    AggregateTester at = new AggregateTester(0, 10, 1, 10, 100, 10, 5, 10, 1);
    at.Run();

    // Hand-built retrieved-result set (doc ids as keys) for metric checks.
    Hashtable result = new Hashtable();
    result.Add(1, true);
    result.Add(2, true);
    result.Add(3, true);

    // Hand-built ground-truth relevant set; overlap with result is {1, 3}.
    Hashtable relevant = new Hashtable();
    relevant.Add(1, true);
    relevant.Add(3, true);
    relevant.Add(5, true);
    relevant.Add(7, true);
    relevant.Add(8, true);

    // Print precision/recall/F-measure computed from the two sets above.
    d.PerformanceCalculator pc = new d.PerformanceCalculator(result, relevant);
    Console.WriteLine("Precision = " + pc.Precision);
    Console.WriteLine("Recall = " + pc.Recall);
    Console.WriteLine("FMeasure = " + pc.FMeasure);

    // Load docs/categories and list the categories of document 1.
    d.DocsLoader dl = new d.DocsLoader();
    d.CatsLoader cl = new d.CatsLoader();
    d.DocCatsLoader dc = new d.DocCatsLoader(cl);
    int docId = 1;
    ArrayList al = dc.GetDocCategories(docId);
    Console.WriteLine(dl.GetDocTitle(docId) + " has " + al.Count + " categories: ");
    foreach (int catId in al)
    {
        Console.WriteLine(" " + cl.GetCategory(catId));
    }

    // Open the on-disk index and pull the terms of document 0.
    d.Index index = new d.Index(Helper.INDEX_PATH);
    d.DocTermItem[] dterms = index.DocTerms(0);

    // Run the interactive vector-space search against the same index.
    SearchVS s = new SearchVS(Helper.INDEX_PATH);
    s.run();

    // Rebuild the index from the raw source data.
    i.DataLoader dal = new i.DataLoader(Helper.SOURCE_PATH);
    i.IndexBuilder ib = new i.IndexBuilder(dal, Helper.INDEX_PATH);
    ib.BuildIndex();

    // Quick stemmer sanity check.
    PorterStemmerAlgorithm.PorterStemmer ps = new PorterStemmerAlgorithm.PorterStemmer();
    Console.WriteLine(ps.stemTerm("beautify"));

    // Regenerate the filtered-terms, terms, and term-docs files.
    TermFilter f = new TermFilter();
    f.CreateNewTermsFile();
    TermProcessor p = new TermProcessor();
    p.CreateTermsFile();
    TermDocsProcessor tdp = new TermDocsProcessor();
    tdp.CreateTermDocsFile();
    // NOTE(review): CreateTermDocsFile is invoked twice - confirm whether the
    // duplicate call is intentional or a copy/paste slip.
    tdp.CreateTermDocsFile();
}
/// <summary>
/// Stems the given string and then tokenizes the stemmed result.
/// </summary>
/// <param name="str">Input string to stem.</param>
/// <returns>The tokens of the stemmed string.</returns>
public string[] stemTokens (string str){
    // Stem first, then break the stemmed text back into tokens.
    string stemmed = myStemmer.stemTerm(str);
    return TokeniseString(stemmed);
}
/// <summary>
/// Returns the Porter stem of a single word.
/// </summary>
/// <param name="Word">The word to stem.</param>
/// <returns>The stemmed form of the word.</returns>
public static string GetStem(string Word)
{
    // A fresh stemmer per call, exactly as before; stateless from the
    // caller's point of view.
    return new PorterStemmerAlgorithm.PorterStemmer().stemTerm(Word);
}