예제 #1
0
        /// <summary>
        /// Splits <paramref name="text"/> into words and adds to
        /// <paramref name="wordsAndCounts"/> every word that lies within
        /// <c>contextSize</c> positions of a word contained in
        /// <paramref name="allWordForms"/> (the matching word itself included).
        /// Each selected position is added once with a count of 1, even when it
        /// falls inside several overlapping context windows.
        /// </summary>
        /// <param name="text">Text to analyze; it is lower-cased and HTML-decoded
        /// before splitting. Null or empty text contributes nothing.</param>
        /// <param name="allWordForms">Word forms that anchor a context window;
        /// only key membership is checked here.</param>
        /// <param name="langCode">Language code. LANG_CODE_BG and LANG_CODE_RU use
        /// Cyrillic tokenization, anything else uses Latin tokenization. Also
        /// forwarded to FilterWords.</param>
        /// <param name="wordsAndCounts">Accumulator receiving the selected words.</param>
        /// <param name="phrasesToKeep">Phrases passed to EncodePhrases before the
        /// split (presumably so they survive tokenization as single units;
        /// verify against EncodePhrases/DecodePhrases).</param>
        private static void ExtractWordsAround(
            string text, Dictionary<string, bool> allWordForms, string langCode,
            WordsAndCounts wordsAndCounts, List<string> phrasesToKeep)
        {
            if (string.IsNullOrEmpty(text))
            {
                // Nothing to tokenize; also avoids a NullReferenceException below.
                return;
            }

            string textLowerCase      = text.ToLowerInvariant();
            string textHtmlDecoded    = HttpUtility.HtmlDecode(textLowerCase);
            string textWithoutPhrases = EncodePhrases(textHtmlDecoded, phrasesToKeep);

            string separatorPattern;
            if ((langCode == LANG_CODE_BG) || (langCode == LANG_CODE_RU))
            {
                // Cyrillic letters U+0410..U+042F (upper case) and U+0430..U+044F
                // (lower case), written as escapes so the literal survives file
                // re-encoding. The previous literal had been corrupted by a wrong
                // code page and contained a reversed character range, which Regex
                // rejects at runtime.
                // NOTE(review): U+0401/U+0451 (the Russian letter "yo") lie outside
                // these ranges and therefore act as separators - confirm intended.
                separatorPattern = "[^\u0410-\u042F\u0430-\u044F" + BULLET_OPERATOR + "]+";
            }
            else
            {
                separatorPattern = "[^A-Za-z" + BULLET_OPERATOR + "]+";
            }

            string[] allWords = Regex.Split(textWithoutPhrases, separatorPattern);
            DecodePhrases(allWords);

            List<string> validWords;
            if (removeShortAndStopWords)
            {
                // TODO: this does not work correctly for short words such as the
                // Bulgarian "но" (U+043D U+043E)
                validWords = FilterWords(allWords, langCode, allWordForms);
            }
            else
            {
                validWords = new List<string>(allWords);
            }

            // Mark every position that falls inside the context window of a known word form.
            bool[] wordsToInclude = new bool[validWords.Count];
            for (int i = 0; i < validWords.Count; i++)
            {
                if (!allWordForms.ContainsKey(validWords[i]))
                {
                    continue;
                }

                int start = Math.Max(0, i - contextSize);
                int end   = Math.Min(i + contextSize, validWords.Count - 1);
                for (int contextIndex = start; contextIndex <= end; contextIndex++)
                {
                    wordsToInclude[contextIndex] = true;
                }
            }

            for (int i = 0; i < validWords.Count; i++)
            {
                // Regex.Split yields empty strings when the text starts or ends
                // with separator characters; those are not words, so skip them.
                if (wordsToInclude[i] && !string.IsNullOrEmpty(validWords[i]))
                {
                    wordsAndCounts.Add(validWords[i], 1);
                }
            }
        }
        /// <summary>
        /// Builds a new <see cref="WordsAndCounts"/> in which every word of
        /// <paramref name="words"/> is replaced by its basic forms, as returned by
        /// <c>LemmaDictionaryUtils.GetBasicForms</c>.
        /// </summary>
        /// <param name="words">Source words with their counts; read through
        /// <c>AsSortedArray</c>.</param>
        /// <param name="langCode">Language code forwarded to the lemma dictionary lookup.</param>
        /// <returns>A new collection holding the basic forms and their counts.</returns>
        public static WordsAndCounts GetBasicForms(WordsAndCounts words, string langCode)
        {
            WordsAndCounts result = new WordsAndCounts();

            WordAndCount[] sortedEntries = words.AsSortedArray;
            for (int entryIndex = 0; entryIndex < sortedEntries.Length; entryIndex++)
            {
                WordAndCount entry = sortedEntries[entryIndex];
                List<string> lemmas =
                    LemmaDictionaryUtils.GetBasicForms(entry.Word, langCode);

                // Each basic form receives the word's full count; the count is
                // not divided among the forms.
                foreach (string lemma in lemmas)
                {
                    result.Add(lemma, (double)entry.Count);
                }
            }

            return result;
        }