// Stems every word of the given document, registers stems not seen before in
// goWordList (new IDs are the list's current Count), and counts how many times
// each (document, word) pair occurs in goDocWord.
private void IndexDocumentText(string sDocText, int nDocID)
{
    string[] words = LSICommon.Instance.GetWords(sDocText);

    for (int pos = 0; pos < words.Length; pos++)
    {
        // The stem is written back into the array, exactly as the word itself was before.
        string stem = goStemmmer.stemTerm(words[pos]);
        words[pos] = stem;

        // First sighting of this stem: give it the next free word ID.
        if (!goWordList.ContainsKey(stem))
        {
            goWordList.Add(stem, goWordList.Count);
        }

        DocWordRelation relation;
        relation.DocID = nDocID;
        relation.WordID = (int)goWordList[stem];

        // Bump the occurrence counter for this (document, word) pair.
        if (goDocWord.ContainsKey(relation))
        {
            goDocWord[relation] = (int)goDocWord[relation] + 1;
        }
        else
        {
            goDocWord.Add(relation, 1);
        }
    }
}
/// <summary>
/// Produces the stem of a single word by delegating to <c>Stemmer.stemTerm</c>.
/// The original documentation describes the underlying stemmer as the Porter algorithm.
/// </summary>
/// <param name="Word">The word to stem</param>
/// <returns>Whatever <c>Stemmer.stemTerm</c> returns for <paramref name="Word"/></returns>
public string StemWord(string Word)
{
    string stem = Stemmer.stemTerm(Word);
    return stem;
}
/// <summary>
/// Splits the given text into terms and counts how often each resulting term occurs.
/// Stop words are skipped; date-like and numeric tokens are rewritten through
/// <c>ParseDate</c> / <c>DealWithNumber</c>; words containing an upper-case letter are
/// lower-cased and, when followed by another capitalised non-stop word, the two-word
/// expression and the following word are counted as well. Terms are stemmed when
/// <c>ToStem</c> is set.
/// </summary>
/// <param name="text">the text to parse</param>
/// <param name="maxTerm">
/// receives the most frequent term and its count. A term only becomes the maximum when
/// it is seen a second (or later) time; if no term repeats, the first dictionary entry
/// is reported, and an empty result yields ("", 0).
/// </param>
/// <returns>a dictionary mapping every term to its frequency in the text</returns>
public Dictionary<string, int> ParseText(string text, out KeyValuePair<string, int> maxTerm)
{
    // running maximum, updated by RecordTermOccurrence
    int maxTermFreq = 0;
    string maxTermString = "";
    Dictionary<string, int> parsingDictionary = new Dictionary<string, int>();

    text = RemoveSymbolsFromDocument(text);
    string[] terms = text.Split(delimetersToSplitWords, StringSplitOptions.RemoveEmptyEntries);

    for (int i = 0; i < terms.Length; i++)
    {
        string new_word = terms[i];
        new_word = RemoveSpecialCahracter(new_word);
        if (IsStopWord(new_word))
        {
            continue;
        }

        // check for date: one of the two leading tokens is all digits and one is a month name
        if (i < terms.Length - 3
            && ((new_word.All(char.IsDigit) || terms[i + 1].All(char.IsDigit))
                && (Enum.IsDefined(typeof(Months), new_word.ToLower())
                    || Enum.IsDefined(typeof(Months), terms[i + 1].ToLower()))))
        {
            int retNum1;
            if (Int32.TryParse(terms[i + 2], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out retNum1))
            {
                // a third numeric token belongs to the date: consume three tokens
                new_word = ParseDate(new_word + ' ' + terms[i + 1] + ' ' + terms[i + 2]);
                i = i + 2;
            }
            else
            {
                // two-token date
                new_word = ParseDate(new_word + ' ' + terms[i + 1]);
                i++;
            }
        }
        // parse expression with numbers
        else if (new_word.Any(char.IsDigit))
        {
            string next_word = "";
            if (i < terms.Length - 1)
            {
                next_word = terms[i + 1];
            }

            int increaseI = 0;
            // note: the raw token terms[i] is passed, not the cleaned new_word
            new_word = DealWithNumber(terms[i], next_word, out increaseI);
            if (increaseI == -1)
            {
                // DealWithNumber signals "drop this token" with -1
                continue;
            }
            i = i + increaseI;
        }
        else
        {
            new_word = RemoveSpecialCahracterAfterDealingWithNumbers(new_word);
            if (IsStopWord(new_word) || new_word.ToLower() == "may")
            {
                continue;
            }

            // check for capital letters
            if (new_word.Any(c => char.IsUpper(c)))
            {
                new_word = new_word.ToLower();

                // i is not modified between the two uses of this bound below
                bool hasFollower = i < terms.Length - 2;
                string next_word = "";
                if (hasFollower)
                {
                    next_word = RemoveSpecialCahracter(terms[i + 1]).ToLower();
                    next_word = RemoveSpecialCahracterAfterDealingWithNumbers(next_word);
                }

                if (hasFollower && terms[i + 1].Any(c => char.IsUpper(c)) && !IsStopWord(next_word))
                {
                    // two consecutive capitalised words: count the follower and the
                    // combined expression, then skip the follower in the main loop
                    string new_expression = new_word + " " + next_word;
                    RecordTermOccurrence(parsingDictionary, next_word, ref maxTermString, ref maxTermFreq);
                    RecordTermOccurrence(parsingDictionary, new_expression, ref maxTermString, ref maxTermFreq);
                    i++;
                }
            }
        }

        if (ToStem)
        {
            new_word = stemmer.stemTerm(new_word);
        }

        RecordTermOccurrence(parsingDictionary, new_word, ref maxTermString, ref maxTermFreq);
    }

    // no term repeated: fall back to the first entry of the dictionary
    if (maxTermFreq == 0 && parsingDictionary.Count > 0)
    {
        maxTermString = parsingDictionary.First().Key;
        maxTermFreq = parsingDictionary[maxTermString];
    }

    maxTerm = new KeyValuePair<string, int>(maxTermString, maxTermFreq);
    return parsingDictionary;
}

/// <summary>
/// Adds one occurrence of <paramref name="term"/> to <paramref name="counts"/> and, when the
/// term was already present and its new count exceeds the running maximum, updates the maximum.
/// </summary>
private static void RecordTermOccurrence(Dictionary<string, int> counts, string term, ref string maxTermString, ref int maxTermFreq)
{
    int previous;
    if (!counts.TryGetValue(term, out previous))
    {
        // first sighting never touches the running maximum (matches the original logic)
        counts[term] = 1;
        return;
    }

    int updated = previous + 1;
    counts[term] = updated;
    if (updated > maxTermFreq)
    {
        maxTermString = term;
        maxTermFreq = updated;
    }
}
/// <summary>
/// Finds every match of <paramref name="regex"/> in <paramref name="text"/>, normalises it
/// according to <paramref name="type"/> and stores it through <c>addTermToDic</c>.
/// For types whose name does not start with 'W', 'N', 'Q', 'C' or 'H' the matched span is
/// also blanked out in <paramref name="text"/>.
/// </summary>
/// <param name="text">text of doc; matched spans may be overwritten in place (same length)</param>
/// <param name="regex">regex to use</param>
/// <param name="type">
/// type of terms it finds; "Range", "Percent", "Price", "Number", "Date" and "Year" get
/// dedicated handling, every other value is treated as a plain word
/// </param>
/// <param name="docName">the doc to search in</param>
/// <returns>
/// the counter that is handed by reference to every <c>addTermToDic</c> call
/// (presumably the number of terms stored — confirm against <c>addTermToDic</c>)
/// </returns>
private static int getTerms(ref string text, Regex regex, string type, string docName)
{
    MatchCollection terms = regex.Matches(text);
    int numOfTerms = 0;
    string termString;

    foreach (Match term in terms)
    {
        termString = term.ToString().ToLower().Replace('\n', ' ').Trim(charsToTrim);

        // Stop words
        if (StopWords.ContainsKey(termString) || termString.Length <= 0)
        {
            continue;
        }

        char kind = type[0];
        if (kind != 'W' && kind != 'N' && kind != 'Q' && kind != 'C' && kind != 'H')
        {
            // Blank the match out of the text. The replacement must have exactly the same
            // length as the match, because the indices of the remaining matches refer to
            // the original text. A one-character match cannot take the " #...# " form
            // (the '#' run would need a negative length), so it becomes a single space.
            string mask = term.Length >= 2
                ? " " + new String('#', term.Length - 2) + " "
                : new String(' ', term.Length);
            text = text.Substring(0, term.Index) + mask + text.Substring(term.Index + term.Length);
        }

        DateTime convertedDate;
        switch (type)
        {
            case "Range":
                MatchCollection numbers = justANumberReg.Matches(termString);
                int a, b;
                // a failed parse leaves the value at 0
                int.TryParse(numbers[1].ToString(), out b);
                int.TryParse(numbers[0].ToString(), out a);
                if (Math.Abs(a - b) < 20)
                {
                    // short range: store every number from a up to b
                    for (; a <= b; a++)
                    {
                        addTermToDic(d_abNumTerms, a.ToString(), docName, term.Index, ref numOfTerms, "Number");
                    }
                }
                else
                {
                    // wide range: store only the two end points
                    addTermToDic(d_abNumTerms, a.ToString(), docName, term.Index, ref numOfTerms, "Number");
                    addTermToDic(d_abNumTerms, b.ToString(), docName, term.Index, ref numOfTerms, "Number");
                }
                break;

            case "Percent":
                string[] percentSplit = termString.Split(' ', '%');
                float percent;
                float.TryParse(percentSplit[0], out percent);
                addTermToDic(d_abNumTerms, (percent * 0.01).ToString("P"), docName, term.Index, ref numOfTerms, "Percent");
                break;

            case "Price":
                MatchCollection number = numReg.Matches(termString);
                float price;
                float.TryParse(number[0].ToString(), out price);
                // scale by a magnitude hint found anywhere in the term: 'm' is checked first, then 'n'
                if (termString.Contains('m'))
                {
                    price = price * 1000000;
                }
                else if (termString.Contains('n'))
                {
                    price = price * 1000000000;
                }
                addTermToDic(d_abNumTerms, price.ToString("C", new CultureInfo("en-US")), docName, term.Index, ref numOfTerms, "Price");
                break;

            case "Number":
                string[] termSplit = termString.Split(' ');
                Double numformated;
                Double.TryParse(termSplit[0].ToString(), out numformated);
                if (termSplit.Length > 1)
                {
                    // the first letter of the second token selects the multiplier
                    if (termSplit[1][0] == 'm')
                    {
                        numformated = numformated * 1000000;
                    }
                    else if (termSplit[1][0] == 'b')
                    {
                        numformated = numformated * 1000000000;
                    }
                    else if (termSplit[1][0] == 't')
                    {
                        numformated = numformated * 1000000000000;
                    }
                    else if (termSplit[1][0] == 'h')
                    {
                        numformated = numformated * 100;
                    }
                }
                addTermToDic(d_abNumTerms, numformated.ToString(), docName, term.Index, ref numOfTerms, "Number");
                break;

            case "Date":
                try
                {
                    //with th
                    int thIndex = termString.IndexOf("th");
                    if (thIndex >= 0)
                    {
                        termString = termString.Remove(thIndex, 2);
                    }
                    convertedDate = Convert.ToDateTime(termString);
                    termString = convertedDate.ToShortDateString();
                    addTermToDic(d_abNumTerms, termString, docName, term.Index, ref numOfTerms, "Date");
                }
                catch (Exception)
                {
                    //manually convert: any failure above falls back to building dd/mm/yyyy by hand
                    string dd, mm, yyyy;
                    string[] termStringSplited = termString.Split(' ');
                    if (months.ContainsKey(termStringSplited[1]))
                    {
                        // "<day> <month> [<year>]"
                        dd = termStringSplited[0];
                        mm = months[termStringSplited[1]];
                        if (termStringSplited.Length == 3)
                        {
                            yyyy = termStringSplited[2];
                        }
                        else
                        {
                            yyyy = "2015"; // hard-coded default year
                        }
                    }
                    else
                    {
                        // "<month> <day>[,] [<year>]" or "<month> <year>"
                        mm = months[termStringSplited[0]];
                        if (termStringSplited.Length == 3)
                        {
                            dd = termStringSplited[1].Trim(',');
                            yyyy = termStringSplited[2];
                        }
                        else
                        {
                            if (termStringSplited[1].Length <= 2)
                            {
                                // one or two characters: treated as the day
                                dd = termStringSplited[1];
                                yyyy = "2015";
                            }
                            else
                            {
                                // longer token: treated as the year, day defaults to 01
                                yyyy = termStringSplited[1];
                                dd = "01";
                            }
                        }
                    }
                    addTermToDic(d_abNumTerms, dd + "/" + mm + "/" + yyyy, docName, term.Index, ref numOfTerms, "Date");
                }
                break;

            case "Year":
                convertedDate = new DateTime(int.Parse(termString), 1, 1);
                termString = convertedDate.ToShortDateString();
                addTermToDic(d_abNumTerms, termString, docName, term.Index, ref numOfTerms, "Date");
                break;

            default:
                //stemmer
                if (use_stem)
                {
                    // the stemmer instance is locked for the duration of the call
                    lock (stemmer)
                    {
                        termString = stemmer.stemTerm(termString);
                    }
                }

                //insert to correct dictionary, chosen by the first character of the term
                if (termString[0] >= 's')
                {
                    addTermToDic(d_szTerms, termString, docName, term.Index, ref numOfTerms, type);
                }
                else if (termString[0] >= 'n')
                {
                    addTermToDic(d_nrTerms, termString, docName, term.Index, ref numOfTerms, type);
                }
                else if (termString[0] >= 'g')
                {
                    addTermToDic(d_gmTerms, termString, docName, term.Index, ref numOfTerms, type);
                }
                else if (termString[0] >= 'c')
                {
                    addTermToDic(d_cfTerms, termString, docName, term.Index, ref numOfTerms, type);
                }
                else
                {
                    addTermToDic(d_abNumTerms, termString, docName, term.Index, ref numOfTerms, type);
                }
                break;
        }
    }

    return numOfTerms;
}