예제 #1
0
        /// <summary>
        /// Reduces every word to its basic (stem) form using the ru_RU Hunspell dictionary.
        /// </summary>
        /// <param name="words">Words to normalise.</param>
        /// <returns>
        /// A materialised list with one entry per input word: the first stem Hunspell
        /// reports, or the word itself when no stem is found.
        /// </returns>
        public IEnumerable<string> Preprocess(IEnumerable<string> words)
        {
            using (var hunspell = new Hunspell("Dictionaries/ru_RU.aff", "Dictionaries/ru_RU.dic"))
            {
                // Stem each word exactly once. ToList() forces evaluation while the
                // dictionary is still alive, i.e. before the using block disposes it.
                return words
                    .Select(word =>
                    {
                        var stems = hunspell.Stem(word);
                        return stems.Any() ? stems.First() : word;
                    })
                    .ToList();
            }
        }
예제 #2
0
 /// <summary>
 /// Lower-cases a word and reduces it to its first Hunspell stem.
 /// </summary>
 /// <param name="word">The word to normalise.</param>
 /// <returns>
 /// The first stem of the lower-cased word, or the lower-cased word itself when
 /// Hunspell reports no stem. If anything throws, a failed result carrying the
 /// exception message is returned instead.
 /// </returns>
 public Result<string> Normalize(string word)
 {
     try
     {
         var lowered = word.ToLower();
         var stemmed = hunspell.Stem(lowered);

         // Reuse the stems computed above rather than querying the dictionary again.
         return stemmed.Count != 0 ? stemmed[0] : lowered;
     }
     catch (Exception e)
     {
         return Result.Fail<string>(e.Message);
     }
 }
예제 #3
0
        /// <summary>
        /// Heuristically decides whether a word is a verb by inspecting the ending of
        /// its first Hunspell stem.
        /// </summary>
        /// <param name="word">The word to classify.</param>
        /// <returns>
        /// <c>true</c> when the stem ends with one of the first group of suffixes preceded
        /// by a vowel or by 'с'/'з', or with one of the second group preceded by a
        /// non-vowel; otherwise <c>false</c>. Words shorter than three characters and
        /// words without a stem are never treated as verbs.
        /// </returns>
        private bool IsVerb(string word)
        {
            if (word.Length < 3)
            {
                return false;
            }

            var stems = hunspell.Stem(word);
            if (!stems.Any())
            {
                return false;
            }

            var stem = stems[0];
            var verbConsonants = new[] { 'с', 'з' };

            var verbSuffixes = new[] { "ть", "чь", "тся", "ться", "чься" };
            var matchesFirstGroup = verbSuffixes.Any(verbSuffix =>
            {
                // The stem must be strictly longer than the suffix, otherwise there is no
                // preceding character to inspect and the index below would be -1.
                if (stem.Length <= verbSuffix.Length || !stem.EndsWith(verbSuffix))
                {
                    return false;
                }

                var preceding = stem[stem.Length - verbSuffix.Length - 1];
                return vowels.Contains(preceding) || verbConsonants.Contains(preceding);
            });

            if (matchesFirstGroup)
            {
                return true;
            }

            verbSuffixes = new[] { "ти", "тись" };
            return verbSuffixes.Any(verbSuffix =>
            {
                // Same length guard as above: a stem equal to the suffix has no preceding character.
                if (stem.Length <= verbSuffix.Length || !stem.EndsWith(verbSuffix))
                {
                    return false;
                }

                return !vowels.Contains(stem[stem.Length - verbSuffix.Length - 1]);
            });
        }
예제 #4
0
        /// <summary>
        /// Splits every line into words, reduces each word to its first Hunspell stem
        /// (ru_RU dictionary) and counts how often each stem occurs.
        /// </summary>
        /// <param name="text">Lines of text; each is split with <c>SplitToWords</c>.</param>
        /// <returns>
        /// One (word, count) pair per distinct stem. Words without a stem are counted
        /// under their own spelling; null or empty words are skipped.
        /// </returns>
        public virtual IEnumerable<(string word, int count)> GetAllWords(IEnumerable<string> text)
        {
            using (var hunspell = new Hunspell("ru_RU.aff", "ru_RU.dic"))
            {
                var frequencies = new Dictionary<string, int>();

                var tokens = text
                    .SelectMany(SplitToWords)
                    .Where(token => !string.IsNullOrEmpty(token));

                foreach (var token in tokens)
                {
                    var stems = hunspell.Stem(token);
                    var key = stems.Any() ? stems[0] : token;

                    // TryGetValue leaves 'seen' at 0 when the key is not present yet.
                    frequencies.TryGetValue(key, out var seen);
                    frequencies[key] = seen + 1;
                }

                // The projection is lazy but only touches the managed dictionary,
                // so it stays valid after the Hunspell instance is disposed.
                return frequencies.Select(pair => (pair.Key, pair.Value));
            }
        }
예제 #5
0
        /// <summary>
        /// Looks up thesaurus synonyms for the first Hunspell stem of a word.
        /// </summary>
        /// <param name="word">The word whose synonyms are requested.</param>
        /// <returns>
        /// Lower-cased synonyms from every meaning of the stem, excluding entries equal
        /// (after lower-casing) to the stem or to the original word. The list is empty when
        /// the word has no stem or the thesaurus has no meanings for it.
        /// </returns>
        private static List<string> Synonyms(string word)
        {
            var result = new List<string>();
            var thes = new MyThes(DatFilePath);

            using (var hunspell = new Hunspell(AffFilePath, DictionaryFilePath))
            {
                var stemmedWord = hunspell.Stem(word).FirstOrDefault();
                if (string.IsNullOrEmpty(stemmedWord))
                {
                    return result;
                }

                var thesaurusResult = thes.Lookup(stemmedWord);
                if (thesaurusResult == null || thesaurusResult.Meanings == null)
                {
                    return result;
                }

                // Lower-case the comparison targets once instead of once per synonym.
                var loweredStem = stemmedWord.ToLower();
                var loweredWord = word.ToLower();

                foreach (var meaning in thesaurusResult.Meanings)
                {
                    foreach (var synonym in meaning.Synonyms)
                    {
                        var loweredSynonym = synonym.ToLower();
                        if (loweredSynonym != loweredStem && loweredSynonym != loweredWord)
                        {
                            result.Add(loweredSynonym);
                        }
                    }
                }
            }

            return result;
        }
예제 #6
0
        /// <summary>
        /// Lemmatises words with the Russian Hunspell dictionary found under
        /// <c>Resources/HunspellDicts/Russian</c> in the current directory.
        /// </summary>
        /// <param name="words">Words to lemmatise.</param>
        /// <returns>
        /// The result of <c>CheckIfHunspellDictsPresent</c> when that check is not
        /// successful; otherwise the lower-cased first lemma of every word that has a
        /// lemma accepted by <c>CheckIfWordMeetsAllRequirements</c>.
        /// </returns>
        private Result<List<string>> PrepareWords(IEnumerable<string> words)
        {
            var dictionaryDir = Path.Combine(Directory.GetCurrentDirectory(), "Resources", "HunspellDicts", "Russian");
            var affPath = Path.Combine(dictionaryDir, "ru.aff");
            var dicPath = Path.Combine(dictionaryDir, "ru.dic");

            var dictsCheck = CheckIfHunspellDictsPresent(affPath, dicPath);
            if (!dictsCheck.IsSuccess)
            {
                return dictsCheck;
            }

            var lemmas = new List<string>();

            using (var hunspell = new Hunspell(affPath, dicPath))
            {
                foreach (var word in words)
                {
                    var lemma = hunspell.Stem(word).FirstOrDefault();

                    // Skip words without a lemma and lemmas that fail the requirements check.
                    if (lemma == null || !CheckIfWordMeetsAllRequirements(lemma))
                    {
                        continue;
                    }

                    lemmas.Add(lemma.ToLower());
                }
            }

            return lemmas;
        }
예제 #7
0
 /// <summary>
 /// Lazily maps each word to its first Hunspell stem (en_US dictionary).
 /// </summary>
 /// <param name="words">Words to stem.</param>
 /// <returns>
 /// A deferred sequence yielding, for every input word, its first stem or the word
 /// itself when no stem is found. The dictionary is created when enumeration starts.
 /// </returns>
 public IEnumerable<string> Find(IEnumerable<string> words)
 {
     using (var hunspell = new Hunspell("en_US.aff", "en_US.dic"))
     {
         foreach (var word in words)
         {
             var stems = hunspell.Stem(word);
             if (stems.Count > 0)
             {
                 yield return stems[0];
             }
             else
             {
                 yield return word;
             }
         }
     }
 }
예제 #8
0
        /// <summary>
        /// Resolves a word to its first Hunspell stem.
        /// </summary>
        /// <param name="word">The word to stem.</param>
        /// <returns>
        /// A result built from the error message when Hunspell reports no stem;
        /// otherwise a result built with a null error and the first stem.
        /// </returns>
        public Result<string> Process(string word)
        {
            var stem = hunspell.Stem(word).FirstOrDefault();

            return stem == null
                ? new Result<string>("HUnspell error: no such infinitive words found")
                : new Result<string>(null, stem);
        }
예제 #9
0
        /// <summary>
        /// Registers a stemming transformation backed by the supplied Hunspell instance
        /// and then delegates to the parameterless <c>Normalize()</c> overload.
        /// </summary>
        /// <param name="hunspell">Dictionary used by the registered transformation.</param>
        /// <returns>Whatever the parameterless <c>Normalize()</c> overload returns.</returns>
        public WordsFilter Normalize(Hunspell hunspell)
        {
            transformations.Add(word =>
            {
                var stems = hunspell.Stem(word);
                if (stems.Count != 0)
                {
                    return stems.First();
                }

                // No stem known: keep the word unchanged.
                return word;
            });

            return Normalize();
        }
예제 #10
0
 /// <summary>
 /// Lazily replaces each word with its first Hunspell stem.
 /// </summary>
 /// <param name="words">Words to stem.</param>
 /// <returns>
 /// A deferred sequence with one item per input word: the first stem, or the word
 /// itself when Hunspell reports none.
 /// </returns>
 private static IEnumerable<string> StemWords(IEnumerable<string> words)
 {
     using (var hunspell = new Hunspell(AffFile, DictFile))
     {
         foreach (var word in words)
         {
             var candidates = hunspell.Stem(word);
             var normalized = candidates.Count == 0 ? word : candidates[0];
             yield return normalized;
         }
     }
 }
예제 #11
0
 /// <summary>
 /// Extracts normalised words from lines of text.
 /// </summary>
 /// <param name="text">Lines of text; each is split on whitespace.</param>
 /// <param name="boringWords">Normalised words to exclude from the output.</param>
 /// <param name="hunspell">Dictionary used for stemming and spell checking.</param>
 /// <returns>
 /// A deferred sequence of normalised words. Each non-empty, punctuation-trimmed token
 /// becomes its first stem, or its lower-cased form when it has no stem. It is kept only
 /// if Hunspell accepts its spelling and it is not in <paramref name="boringWords"/>.
 /// Because the query is lazy, <paramref name="hunspell"/> is used during enumeration.
 /// </returns>
 public IEnumerable<string> GetNormalizedWords(IEnumerable<string> text, HashSet<string> boringWords,
                                               Hunspell hunspell)
 {
     return text
         .SelectMany(line => line
             .Split()
             .Select(TrimPunctuation)
             .Where(w => w.Length > 0))
         .Select(word =>
         {
             var stems = hunspell.Stem(word);
             return stems.Count > 0 ? stems[0] : word.ToLower();
         })
         .Where(normalizedWord => hunspell.Spell(normalizedWord) && !boringWords.Contains(normalizedWord));
 }
예제 #12
0
        /// <summary>
        /// Eagerly converts each word to its first Hunspell stem (ru_RU dictionary).
        /// </summary>
        /// <param name="wordFlow">Words to stem.</param>
        /// <returns>
        /// A list with one item per input word: its first stem, or the word itself when
        /// no stem is found.
        /// </returns>
        public IEnumerable<string> Prepare(IEnumerable<string> wordFlow)
        {
            var prepared = new List<string>();

            using (var hunspell = new Hunspell("ru_RU.aff", "ru_RU.dic"))
            {
                foreach (var word in wordFlow)
                {
                    var stems = hunspell.Stem(word);
                    if (stems.Any())
                    {
                        prepared.Add(stems.First());
                    }
                    else
                    {
                        prepared.Add(word);
                    }
                }
            }

            return prepared;
        }
예제 #13
0
        /// <summary>
        /// Registers the normalisation transformations: stemming (only when a Hunspell
        /// instance is available) followed by lower-casing.
        /// </summary>
        /// <returns>This filter, to allow call chaining.</returns>
        public WordsFilter Normalize()
        {
            if (hunspell != null)
            {
                transformations.Add(word =>
                {
                    var stems = hunspell.Stem(word);
                    if (stems.Count != 0)
                    {
                        return stems.First();
                    }

                    // No stem known: keep the word unchanged.
                    return word;
                });
            }

            // Lower-casing is registered last, after the optional stemming step.
            transformations.Add(word => word.ToLower());

            return this;
        }
예제 #14
0
 /// <summary>
 /// Lazily replaces each tag with its first Hunspell stem. The dictionary is built
 /// from <c>Resources.en_us_aff</c> and the UTF-8 bytes of <c>Resources.en_us_dic</c>.
 /// </summary>
 /// <param name="tags">Tags to stem.</param>
 /// <returns>
 /// A deferred sequence with one item per tag: its first stem, or the tag itself when
 /// no stem is found.
 /// </returns>
 public IEnumerable<string> PrepareTags(IEnumerable<string> tags)
 {
     using (var hunspell = new Hunspell(Resources.en_us_aff, Encoding.UTF8.GetBytes(Resources.en_us_dic)))
     {
         foreach (var tag in tags)
         {
             var stems = hunspell.Stem(tag);
             yield return stems.Any() ? stems[0] : tag;
         }
     }
 }
예제 #15
0
        /// <summary>
        /// Splits a sentence into words and replaces each word with its Hunspell stems
        /// (en_us dictionary, created on first use and cached in <c>_spell</c>).
        /// </summary>
        /// <param name="sentence">The sentence to split.</param>
        /// <param name="correct">
        /// When <c>true</c>, a word that fails the spell check is replaced by Hunspell's
        /// suggestion, but only if exactly one suggestion is offered.
        /// </param>
        /// <returns>
        /// All stems of every word, in order; a word without stems contributes its
        /// original spelling instead.
        /// </returns>
        public static IList<string> SplitToWords(string sentence, bool correct = false)
        {
            var words = SplitToWordsNoLemmatize(sentence);

            if (_spell == null)
            {
                _spell = new Hunspell("en_us.aff", "en_us.dic");
            }

            var stems = new List<string>();

            foreach (var word in words)
            {
                // NOTE(review): presumably collapses runs of a repeated character down to
                // two; depends on how _multipleCharacterRegex is defined, so confirm.
                var candidate = _multipleCharacterRegex.Replace(word, "$1$1");

                // The spell check and suggestion use the original word, not the regex-normalised one.
                if (correct && !_spell.Spell(word))
                {
                    var suggestions = _spell.Suggest(word);
                    if (suggestions != null && suggestions.Count == 1)
                    {
                        candidate = suggestions[0];
                    }
                }

                var candidateStems = _spell.Stem(candidate);
                if (candidateStems.Count == 0)
                {
                    // No stem found: fall back to the word exactly as it was split.
                    stems.Add(word);
                    continue;
                }

                stems.AddRange(candidateStems);
            }

            return stems;
        }
예제 #16
0
        /// <summary>
        /// Replaces <c>Words</c> with the first Hunspell stem (ru_RU dictionary) of every
        /// word that has one; words without a stem are dropped.
        /// </summary>
        /// <returns>This instance, to allow call chaining.</returns>
        public WordPreprocessing IgnoreInvalidWords()
        {
            using (Hunspell hunspell = new Hunspell("ru_RU.aff", "ru_RU.dic"))
            {
                // The original author abandoned a LINQ-based version because Hunspell
                // "produced strange errors in LINQ".
                // NOTE(review): presumably the deferred query ran after the dictionary had
                // been disposed; confirm. The list is therefore filled eagerly inside the using block.
                var stemmedWords = new List<string>();

                foreach (var word in Words)
                {
                    var stems = hunspell.Stem(word);
                    if (stems.Count == 0)
                    {
                        continue;
                    }

                    stemmedWords.Add(stems[0]);
                }

                Words = stemmedWords;
            }

            return this;
        }
예제 #17
0
        /// <summary>
        /// Chains a stemming step onto <c>Words</c>: every word with a Hunspell stem
        /// (ru_RU dictionary) is replaced by its first stem, and words without one are
        /// dropped. The step's error is refined with a fixed message.
        /// </summary>
        /// <returns>This instance, to allow call chaining.</returns>
        public WordPreprocessing IgnoreInvalidWords()
        {
            Words = Words
                .Then(words =>
                {
                    using (Hunspell hunspell = new Hunspell("ru_RU.aff", "ru_RU.dic"))
                    {
                        var stemmedWords = new List<string>();

                        foreach (var word in words)
                        {
                            var stems = hunspell.Stem(word);
                            if (stems.Count == 0)
                            {
                                continue;
                            }

                            stemmedWords.Add(stems[0]);
                        }

                        // Returned as IEnumerable<string> so the lambda's result type
                        // is the interface rather than List<string>.
                        return (IEnumerable<string>)stemmedWords;
                    }
                })
                .RefineError("Hunspell didn't find dictionaries");

            return this;
        }
예제 #18
0
        /// <summary>
        /// Builds normalisation tokens for a single Arabic word.
        /// </summary>
        /// <param name="input">The word to normalise.</param>
        /// <returns>
        /// An empty list when <paramref name="input"/> is a stop word. Otherwise one token
        /// per Hunspell stem, or a single token whose stem is the input itself when
        /// Hunspell reports no stem. Every token has <c>isEn</c> set to <c>false</c>.
        /// </returns>
        private static List<NormalizeToken> normalizeArabic(String input)
        {
            List<NormalizeToken> list = new List<NormalizeToken>();

            // Stop words produce no tokens, so return before loading the dictionary.
            if (arabicStopWordArray.Contains(input))
            {
                return list;
            }

            using (Hunspell hunspell = new Hunspell(ar_aff_path, ar_dic_path))
            {
                List<string> stems = hunspell.Stem(input);
                if (stems.Count == 0)
                {
                    // No stem known: the word acts as its own stem.
                    stems = new List<string> { input };
                }

                foreach (string sstem in stems)
                {
                    list.Add(new NormalizeToken()
                    {
                        source = input,
                        stem   = sstem,
                        isEn   = false
                    });
                }
            }

            return list;
        }
예제 #19
0
        /// <summary>
        /// Merges word frequencies by stem using the ru_RU Hunspell dictionary.
        /// </summary>
        /// <param name="stats">Word-to-frequency map.</param>
        /// <returns>
        /// A new map keyed by the first stem of each word (or the word itself when it has
        /// no stem). Frequencies of words sharing a stem are summed.
        /// </returns>
        public Dictionary<string, int> Processing(Dictionary<string, int> stats)
        {
            var newStats = new Dictionary<string, int>();

            using (Hunspell hunspell = new Hunspell("ru_RU.aff", "ru_RU.dic"))
            {
                foreach (var wordToFrequence in stats)
                {
                    var stem = hunspell.Stem(wordToFrequence.Key).FirstOrDefault() ?? wordToFrequence.Key;

                    // Single lookup: 'accumulated' stays 0 when the stem has not been seen yet.
                    int accumulated;
                    newStats.TryGetValue(stem, out accumulated);
                    newStats[stem] = accumulated + wordToFrequence.Value;
                }
            }

            return newStats;
        }
예제 #20
0
파일: Program.cs 프로젝트: NickZ/nhunspell
        /// <summary>
        /// Console demo of the NHunspell classes: a quick spell/suggest check, a couple of
        /// Hunspell tests, then thesaurus lookup, spell checking, morphological analysis,
        /// stemming and hyphenation. Waits for a key press before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Quick check: spell one word and print the suggestions for a misspelling.
            using (Hunspell hunspell = new Hunspell("en_us.aff", "en_us.dic"))
            {
                hunspell.Spell("houses");
                var suggest = hunspell.Suggest("haise");
                foreach (var x in suggest)
                {
                    Console.WriteLine(x);
                }
            }


            /*
             * var test = new SpellEngineTests();
             * test.CreationAndDestructionTest();
             * test.FunctionsTest();
             * return;
             */


            // var test = new HyphenTests();
            // test.CreationAndDestructionTest();
            // test.MemoryLeakTest();
            // test.UnicodeFilenameTest();
            // test.GermanUmlautTest();
            // test.CyrillicLanguagesTest();
            // test.NemethTests();

            var test = new HunspellTests();

            // test.AllDictionariesTest();
            test.SpellComplexWordsTest();
            test.AddWordTest();
            // test.GermanUmlautTest();
            // test.UnicodeFilenameTest();
            // test.MemoryLeakTest();

            /*
             * var test = new InteropTests();
             * test.Init();
             * test.ArrayInteropTests();
             * test.StringInteropTests();
             *
             *
             * Console.WriteLine("");
             * Console.WriteLine("Press any key to continue...");
             * Console.ReadKey();
             *
             * return;
             */
            Console.WriteLine("NHunspell functions and classes demo");

            /*
             * Console.WriteLine("Thesaurus with Thes");
             * Thes thes = new Thes();
             * thes.LoadOpenOffice("th_en_us_new.dat");
             */


            Console.WriteLine("");
            Console.WriteLine("Thesaurus with Thes");
            MyThes thes = new MyThes("th_en_us_new.dat");

            using (Hunspell hunspell = new Hunspell("en_us.aff", "en_us.dic"))
            {
                ThesResult result = thes.Lookup("cars", hunspell);

                // Guard against a missing thesaurus entry instead of dereferencing null.
                if (result != null)
                {
                    foreach (ThesMeaning meaning in result.Meanings)
                    {
                        Console.WriteLine("  Meaning:" + meaning.Description);
                        foreach (string synonym in meaning.Synonyms)
                        {
                            Console.WriteLine("    Synonym:" + synonym);
                        }
                    }
                }
            }

            Console.WriteLine("");
            Console.WriteLine("Spell Check with with Hunspell");

            // Important: Hunspell uses unmanaged memory, so the IDisposable pattern must be honoured.
            // Here that is done with a using block, but calling hunspell.Dispose() works as well.
            using (Hunspell hunspell = new Hunspell("en_us.aff", "en_us.dic"))
            {
                Console.WriteLine("Check if the word 'Recommendation' is spelled correct");
                bool correct = hunspell.Spell("Recommendation");
                Console.WriteLine("Recommendation is spelled " + (correct ? "correct" : "not correct"));

                Console.WriteLine("");
                Console.WriteLine("Make suggestions for the word 'Recommendatio'");
                List<string> suggestions = hunspell.Suggest("Recommendatio");
                Console.WriteLine("There are " + suggestions.Count.ToString() + " suggestions");
                foreach (string suggestion in suggestions)
                {
                    Console.WriteLine("Suggestion is: " + suggestion);
                }

                Console.WriteLine("");
                Console.WriteLine("Analyze the word 'decompressed'");
                List<string> morphs = hunspell.Analyze("decompressed");
                foreach (string morph in morphs)
                {
                    Console.WriteLine("Morph is: " + morph);
                }

                Console.WriteLine("");
                Console.WriteLine("Stem the word 'decompressed'");
                List<string> stems = hunspell.Stem("decompressed");
                foreach (string stem in stems)
                {
                    Console.WriteLine("Stem is: " + stem);
                }

                /*
                 * for (; ; )
                 * {
                 *  Console.WriteLine("");
                 *  Console.WriteLine("Word1:");
                 *  string word = Console.ReadLine();
                 *  Console.WriteLine("Word2:");
                 *  string word2 = Console.ReadLine();
                 *
                 *  List<string> generated = hunspell.Generate(word, word2); // Generate("Girl","Boys");
                 *  foreach (string stem in generated)
                 *  {
                 *      Console.WriteLine("Generated is: " + stem);
                 *  }
                 * }
                 */
            }

            Console.WriteLine("");
            Console.WriteLine("Hyphenation with Hyph");

            // Important: Hyphen uses unmanaged memory, so the IDisposable pattern must be honoured.
            // Here that is done with a using block, but calling hyphen.Dispose() works as well.
            using (Hyphen hyphen = new Hyphen("hyph_en_us.dic"))
            {
                Console.WriteLine("Get the hyphenation of the word 'Recommendation'");
                HyphenResult hyphenated = hyphen.Hyphenate("Recommendation");
                Console.WriteLine("'Recommendation' is hyphenated as: " + hyphenated.HyphenatedWord);

                // NOTE(review): the same word is hyphenated twice and the results are not
                // printed; looks like a repeat-call smoke check. Confirm before removing.
                hyphenated = hyphen.Hyphenate("eighteen");
                hyphenated = hyphen.Hyphenate("eighteen");
            }

            Console.WriteLine("");
            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();
        }
예제 #21
0
 /// <summary>
 /// Converts a word to its first Hunspell stem.
 /// </summary>
 /// <param name="word">The word to convert.</param>
 /// <returns>The first stem, or the word itself when Hunspell reports no stem.</returns>
 public string ConvertWord(string word)
 {
     // Stem once and reuse the result instead of querying the dictionary twice.
     var stems = hunspell.Stem(word);
     return stems.Any() ? stems.First() : word;
 }
예제 #22
0
 /// <summary>
 /// Returns the stems that the wrapped Hunspell instance reports for a word.
 /// </summary>
 /// <param name="word">The word to stem.</param>
 /// <returns>The stem list exactly as produced by <c>_hunspell.Stem</c>.</returns>
 public List<string> Stem(string word)
 {
     var stems = _hunspell.Stem(word);
     return stems;
 }
예제 #23
0
        /// <summary>
        /// Expands a search query with stems and thesaurus synonyms.
        /// </summary>
        /// <param name="query">Space-separated query terms.</param>
        /// <returns>
        /// A string in which every term appears as <c> term^5</c>, followed by each stem
        /// that differs from the term and by the space-joined synonyms of every thesaurus
        /// meaning found for the term and its stems. Each appended part is preceded by a
        /// single space, so a non-empty result starts with a space. An empty query yields
        /// an empty string.
        /// </returns>
        public string GetSynonyms(string query)
        {
            // Built with a StringBuilder to avoid re-copying the whole string on every append.
            var expandedQuery = new System.Text.StringBuilder();

            MyThes thes = new MyThes("th_en_us_new.dat");

            using (Hunspell hunspell = new Hunspell("en_AU.aff", "en_AU.dic"))
            {
                var words = query.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                foreach (var word in words)
                {
                    expandedQuery.Append(' ').Append(word).Append("^5");

                    // Look up the word itself first, then each of its stems.
                    var stems = new List<string> { word };
                    stems.AddRange(hunspell.Stem(word));

                    foreach (var stem in stems)
                    {
                        ThesResult tr = thes.Lookup(stem, hunspell);

                        if (!stem.Equals(word))
                        {
                            expandedQuery.Append(' ').Append(stem);
                        }

                        if (tr != null && tr.Meanings != null && tr.Meanings.Count > 0)
                        {
                            foreach (ThesMeaning meaning in tr.Meanings)
                            {
                                expandedQuery.Append(' ').Append(string.Join(" ", meaning.Synonyms));
                            }
                        }
                    }
                }
            }

            return expandedQuery.ToString();
        }
예제 #24
0
 /// <summary>
 /// Returns the stems Hunspell reports for a word.
 /// </summary>
 /// <param name="word">The word to stem.</param>
 /// <returns>The stem list exactly as produced by <c>hunspell.Stem</c>.</returns>
 public List<string> getStems(string word) => hunspell.Stem(word);
예제 #25
0
 /// <summary>
 /// Returns the stems that <c>HunspellTr</c> reports for a word.
 /// </summary>
 /// <param name="word">The word to stem.</param>
 /// <returns>The stems exactly as produced by <c>HunspellTr.Stem</c>.</returns>
 public static IEnumerable<string> GetStems(string word)
 {
     IEnumerable<string> stems = HunspellTr.Stem(word);
     return stems;
 }
예제 #26
0
        /// <summary>
        /// Converts a word to its initial form using the supplied Hunspell instance.
        /// </summary>
        /// <param name="word">The word to convert.</param>
        /// <param name="hunspell">Dictionary used for stemming.</param>
        /// <returns>The first stem, or the word itself when no stem is available.</returns>
        private string ToInitialForm(string word, Hunspell hunspell)
        {
            var stems = hunspell.Stem(word);
            return stems.FirstOrDefault() ?? word;
        }