/// <summary>
/// Splits <paramref name="text"/> into words, finds every position whose word is a key of
/// <paramref name="allWordForms"/>, and adds that word together with up to <c>contextSize</c>
/// neighbouring words on each side to <paramref name="wordsAndCounts"/>.
/// </summary>
/// <remarks>
/// Each selected position is added exactly once with a count of 1, even when it lies inside
/// the context windows of several matches.
/// </remarks>
/// <param name="text">Source text; it is lower-cased and HTML-decoded before being split.</param>
/// <param name="allWordForms">Word forms to search for; only the keys are consulted.</param>
/// <param name="langCode">
/// Language code. <c>LANG_CODE_BG</c> and <c>LANG_CODE_RU</c> select Cyrillic word splitting;
/// any other value selects Latin (A-Z, a-z) word splitting.
/// </param>
/// <param name="wordsAndCounts">Accumulator that receives the selected words.</param>
/// <param name="phrasesToKeep">
/// Phrases handed to <c>EncodePhrases</c> before splitting and restored by <c>DecodePhrases</c>
/// afterwards (presumably so that a multi-word phrase survives as one token; confirm against
/// those helpers).
/// </param>
private static void ExtractWordsAround(
    string text,
    Dictionary<string, bool> allWordForms,
    string langCode,
    WordsAndCounts wordsAndCounts,
    List<string> phrasesToKeep)
{
    string textLowerCase = text.ToLowerInvariant();
    string textHtmlDecoded = HttpUtility.HtmlDecode(textLowerCase);
    string textWithoutPhrases = EncodePhrases(textHtmlDecoded, phrasesToKeep);

    // A run of characters that cannot be part of a word acts as a separator.
    string separatorPattern;
    if ((langCode == LANG_CODE_BG) || (langCode == LANG_CODE_RU))
    {
        // U+0410-U+042F and U+0430-U+044F are the basic upper/lower-case Cyrillic letters.
        // The range is written with \u escapes so that it does not depend on the encoding
        // of this source file: the previous literal had been corrupted by a code-page
        // mismatch and no longer described Cyrillic letters at all.
        // NOTE(review): U+0401/U+0451 (IO) lie outside this range, as they apparently did
        // in the original literal; confirm whether Russian text needs them.
        separatorPattern = "[^\u0410-\u042F\u0430-\u044F" + BULLET_OPERATOR + "]+";
    }
    else
    {
        separatorPattern = "[^A-Za-z" + BULLET_OPERATOR + "]+";
    }

    // NOTE(review): Regex.Split yields an empty first/last element when the text starts or
    // ends with a separator; unless FilterWords removes them they occupy a context slot.
    string[] allWords = Regex.Split(textWithoutPhrases, separatorPattern);
    DecodePhrases(allWords);

    List<string> validWords;
    if (removeShortAndStopWords)
    {
        // TODO: this does not work correctly for short words such as the Cyrillic "но"
        validWords = FilterWords(allWords, langCode, allWordForms);
    }
    else
    {
        validWords = new List<string>(allWords);
    }

    // First pass: flag every position that falls inside the context window of a match.
    bool[] wordsToInclude = new bool[validWords.Count];
    for (int i = 0; i < validWords.Count; i++)
    {
        if (!allWordForms.ContainsKey(validWords[i]))
        {
            continue;
        }

        int start = Math.Max(0, i - contextSize);
        int end = Math.Min(i + contextSize, validWords.Count - 1);
        for (int contextIndex = start; contextIndex <= end; contextIndex++)
        {
            wordsToInclude[contextIndex] = true;
        }
    }

    // Second pass: emit each flagged position once, in text order.
    for (int i = 0; i < validWords.Count; i++)
    {
        if (wordsToInclude[i])
        {
            wordsAndCounts.Add(validWords[i], 1);
        }
    }
}
/// <summary>
/// Builds a new <c>WordsAndCounts</c> in which every word of <paramref name="words"/> is
/// replaced by the basic forms that <c>LemmaDictionaryUtils.GetBasicForms</c> returns for it.
/// </summary>
/// <remarks>
/// Words are processed in the order of <c>words.AsSortedArray</c>. When a word has several
/// basic forms, each form is credited with the word's full count; the count is not divided
/// by the number of forms.
/// </remarks>
/// <param name="words">Words and their counts to convert.</param>
/// <param name="langCode">Language code forwarded to the lemma dictionary lookup.</param>
/// <returns>A new collection holding the basic forms and their counts.</returns>
public static WordsAndCounts GetBasicForms(WordsAndCounts words, string langCode)
{
    WordsAndCounts result = new WordsAndCounts();

    foreach (WordAndCount entry in words.AsSortedArray)
    {
        List<string> lemmas = LemmaDictionaryUtils.GetBasicForms(entry.Word, langCode);
        foreach (string lemma in lemmas)
        {
            // Every lemma receives the whole count of the originating word.
            result.Add(lemma, (double)entry.Count);
        }
    }

    return result;
}