/// <summary>
/// Replaces every word with its first Hunspell stem, using the ru_RU dictionary
/// files under <c>Dictionaries/</c>.
/// </summary>
/// <param name="words">Words to stem.</param>
/// <returns>
/// A materialized list holding, for each input word in order, the first stem
/// returned by Hunspell, or the word itself when no stem is returned.
/// </returns>
public IEnumerable<string> Preprocess(IEnumerable<string> words)
{
    using (var hunspell = new Hunspell("Dictionaries/ru_RU.aff", "Dictionaries/ru_RU.dic"))
    {
        // Stem is invoked once per word; its result serves both the emptiness
        // check and the element lookup.
        // ToList() forces evaluation while the dictionary is still alive.
        return (from word in words
                let stems = hunspell.Stem(word)
                select stems.Any() ? stems.First() : word).ToList();
    }
}
/// <summary>
/// Lower-cases a word and reduces it to its first Hunspell stem.
/// </summary>
/// <param name="word">The word to normalize.</param>
/// <returns>
/// The first stem of the lower-cased word, or the lower-cased word itself when
/// Hunspell returns no stems. Any exception is converted into a failed result
/// that carries the exception message.
/// </returns>
public Result<string> Normalize(string word)
{
    try
    {
        var lowered = word.ToLower();
        var stemmed = hunspell.Stem(lowered);

        // Reuse the stems already computed instead of stemming the word again.
        return stemmed.Count != 0 ? stemmed[0] : lowered;
    }
    catch (Exception e)
    {
        return Result.Fail<string>(e.Message);
    }
}
/// <summary>
/// Heuristically decides whether a word is a verb by inspecting the ending of
/// its first Hunspell stem.
/// </summary>
/// <param name="word">The word to classify.</param>
/// <returns>
/// <c>true</c> when the stem ends with "ть", "чь", "тся", "ться" or "чься"
/// preceded by a vowel, 'с' or 'з', or ends with "ти" or "тись" preceded by a
/// non-vowel; otherwise <c>false</c>. Words shorter than three characters and
/// words without stems are never verbs.
/// </returns>
private bool IsVerb(string word)
{
    if (word.Length < 3)
    {
        return false;
    }

    var stems = hunspell.Stem(word);
    if (!stems.Any())
    {
        return false;
    }

    var stem = stems[0];
    var verbConsonants = new[] { 'с', 'з' };

    var verbSuffixes = new[] { "ть", "чь", "тся", "ться", "чься" };
    foreach (var verbSuffix in verbSuffixes)
    {
        // The stem must be strictly longer than the suffix: the length guard
        // above applies to the word, not the stem, so without this check the
        // index of the preceding character could be -1.
        if (stem.Length <= verbSuffix.Length || !stem.EndsWith(verbSuffix))
        {
            continue;
        }

        var preceding = stem[stem.Length - verbSuffix.Length - 1];
        if (vowels.Contains(preceding) || verbConsonants.Contains(preceding))
        {
            return true;
        }
    }

    verbSuffixes = new[] { "ти", "тись" };
    foreach (var verbSuffix in verbSuffixes)
    {
        if (stem.Length <= verbSuffix.Length || !stem.EndsWith(verbSuffix))
        {
            continue;
        }

        if (!vowels.Contains(stem[stem.Length - verbSuffix.Length - 1]))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Splits every line into words, maps each word to its first Hunspell stem
/// (ru_RU dictionary) and counts how often each stem occurs.
/// </summary>
/// <param name="text">Lines of text to analyze.</param>
/// <returns>
/// A lazily projected sequence of (stem, occurrence count) pairs. Words for
/// which Hunspell returns no stems are counted under their own spelling.
/// </returns>
public virtual IEnumerable<(string word, int count)> GetAllWords(IEnumerable<string> text)
{
    using (var hunspell = new Hunspell("ru_RU.aff", "ru_RU.dic"))
    {
        var frequencies = new Dictionary<string, int>();

        foreach (var token in text.SelectMany(SplitToWords))
        {
            if (!string.IsNullOrEmpty(token))
            {
                var candidates = hunspell.Stem(token);
                var key = candidates.Count == 0 ? token : candidates[0];

                // A missing key counts as zero previous occurrences.
                frequencies[key] = frequencies.TryGetValue(key, out var seen) ? seen + 1 : 1;
            }
        }

        // The projection only reads the finished dictionary, so it stays valid
        // after the Hunspell instance has been disposed.
        return frequencies.Select(pair => (pair.Key, pair.Value));
    }
}
/// <summary>
/// Collects lower-cased thesaurus synonyms for the first Hunspell stem of a word.
/// </summary>
/// <param name="word">The word to look up.</param>
/// <returns>
/// Every synonym, lower-cased, from all meanings the thesaurus reports for the
/// stem, excluding entries equal (after lower-casing) to the stem or to
/// <paramref name="word"/>. Empty when the word has no stem or the thesaurus
/// has no meanings for it.
/// </returns>
private static List<string> Synonyms(string word)
{
    var synonyms = new List<string>();
    var thesaurus = new MyThes(DatFilePath);

    using (var hunspell = new Hunspell(AffFilePath, DictionaryFilePath))
    {
        // FirstOrDefault yields null for an empty stem list, which the
        // null-or-empty check below also covers.
        var stem = hunspell.Stem(word).FirstOrDefault();
        if (string.IsNullOrEmpty(stem))
        {
            return synonyms;
        }

        var lookup = thesaurus.Lookup(stem);
        if (lookup == null || lookup.Meanings == null || !lookup.Meanings.Any())
        {
            return synonyms;
        }

        foreach (var meaning in lookup.Meanings)
        {
            var accepted = meaning.Synonyms
                .Where(s => s.ToLower() != stem.ToLower() && s.ToLower() != word.ToLower());

            foreach (var synonym in accepted)
            {
                synonyms.Add(synonym.ToLower());
            }
        }
    }

    return synonyms;
}
/// <summary>
/// Lemmatizes words with the Russian Hunspell dictionary located under
/// <c>Resources/HunspellDicts/Russian</c> in the current directory and keeps
/// only lemmas accepted by <c>CheckIfWordMeetsAllRequirements</c>.
/// </summary>
/// <param name="words">Words to lemmatize.</param>
/// <returns>
/// The result of the dictionary-presence check when it is unsuccessful;
/// otherwise the lower-cased accepted lemmas in input order. Words without any
/// stem are dropped.
/// </returns>
private Result<List<string>> PrepareWords(IEnumerable<string> words)
{
    var dictionaryFolder = Path.Combine(Directory.GetCurrentDirectory(), "Resources", "HunspellDicts", "Russian");
    var affixPath = Path.Combine(dictionaryFolder, "ru.aff");
    var dictionaryPath = Path.Combine(dictionaryFolder, "ru.dic");

    var presenceCheck = CheckIfHunspellDictsPresent(affixPath, dictionaryPath);
    if (!presenceCheck.IsSuccess)
    {
        return presenceCheck;
    }

    var lemmas = new List<string>();
    using (var hunspell = new Hunspell(affixPath, dictionaryPath))
    {
        foreach (var word in words)
        {
            var candidate = hunspell.Stem(word).FirstOrDefault();
            if (candidate == null || !CheckIfWordMeetsAllRequirements(candidate))
            {
                continue;
            }

            lemmas.Add(candidate.ToLower());
        }
    }

    return lemmas;
}
/// <summary>
/// Lazily maps each word to its first Hunspell stem using the en_US dictionary.
/// </summary>
/// <param name="words">Words to stem.</param>
/// <returns>
/// An iterator yielding, per input word, its first stem or the word itself when
/// Hunspell returns no stems. The dictionary is opened when enumeration starts
/// and disposed when the enumerator finishes or is disposed.
/// </returns>
public IEnumerable<string> Find(IEnumerable<string> words)
{
    using (var hunspell = new Hunspell("en_US.aff", "en_US.dic"))
    {
        foreach (var current in words)
        {
            var stems = hunspell.Stem(current);
            if (stems.Count > 0)
            {
                yield return stems[0];
            }
            else
            {
                yield return current;
            }
        }
    }
}
/// <summary>
/// Looks up the first Hunspell stem of a word.
/// </summary>
/// <param name="word">The word to stem.</param>
/// <returns>
/// A result built from the first stem, or a result built from an error message
/// when Hunspell returns no stems.
/// </returns>
public Result<string> Process(string word)
{
    var firstStem = hunspell.Stem(word).FirstOrDefault();

    // NOTE(review): the single-argument constructor is presumably the error
    // case and the (null, value) form the success case — confirm against Result<T>.
    return firstStem != null
        ? new Result<string>(null, firstStem)
        : new Result<string>("HUnspell error: no such infinitive words found");
}
/// <summary>
/// Registers a stemming transformation backed by the supplied Hunspell instance
/// and then applies the parameterless <c>Normalize()</c>.
/// </summary>
/// <param name="hunspell">Hunspell instance captured by the stemming transformation.</param>
/// <returns>Whatever the parameterless <c>Normalize()</c> returns.</returns>
public WordsFilter Normalize(Hunspell hunspell)
{
    transformations.Add(word =>
    {
        var stems = hunspell.Stem(word);
        if (stems.Count > 0)
        {
            return stems.First();
        }

        // No stem known: leave the word as it is.
        return word;
    });

    return Normalize();
}
/// <summary>
/// Lazily replaces each word with its first Hunspell stem.
/// </summary>
/// <param name="words">Words to stem.</param>
/// <returns>
/// An iterator yielding the first stem of each word, or the word itself when
/// Hunspell returns no stems. The Hunspell instance lives for the duration of
/// the enumeration.
/// </returns>
private static IEnumerable<string> StemWords(IEnumerable<string> words)
{
    using (var hunspell = new Hunspell(AffFile, DictFile))
    {
        foreach (var original in words)
        {
            var candidates = hunspell.Stem(original);
            if (candidates.Count == 0)
            {
                yield return original;
            }
            else
            {
                yield return candidates[0];
            }
        }
    }
}
/// <summary>
/// Builds a deferred query that splits lines into words, normalizes each word
/// and filters out misspelled and boring words.
/// </summary>
/// <param name="text">Lines of text.</param>
/// <param name="boringWords">Normalized words to exclude.</param>
/// <param name="hunspell">Hunspell instance used for stemming and spell checking.</param>
/// <returns>
/// For every non-empty, punctuation-trimmed word: its first stem, or the
/// lower-cased word when no stem exists, provided that value passes
/// <c>hunspell.Spell</c> and is not in <paramref name="boringWords"/>.
/// The sequence is evaluated lazily, so <paramref name="hunspell"/> must remain
/// usable while it is enumerated.
/// </returns>
public IEnumerable<string> GetNormalizedWords(IEnumerable<string> text, HashSet<string> boringWords, Hunspell hunspell)
{
    return text
        .SelectMany(line => line
            .Split()
            .Select(TrimPunctuation)
            .Where(trimmed => trimmed.Length > 0))
        .Select(word =>
        {
            var stems = hunspell.Stem(word);

            // Only the fallback is lower-cased; stems are used as returned.
            return stems.Count > 0 ? stems[0] : word.ToLower();
        })
        .Where(normalized => hunspell.Spell(normalized) && !boringWords.Contains(normalized));
}
/// <summary>
/// Eagerly maps each word to its first Hunspell stem using the ru_RU dictionary.
/// </summary>
/// <param name="wordFlow">Words to stem.</param>
/// <returns>
/// A list with one entry per input word: the first stem, or the word itself
/// when Hunspell returns no stems.
/// </returns>
public IEnumerable<string> Prepare(IEnumerable<string> wordFlow)
{
    using (var hunspell = new Hunspell("ru_RU.aff", "ru_RU.dic"))
    {
        var prepared = new List<string>();

        foreach (var current in wordFlow)
        {
            var stems = hunspell.Stem(current);
            if (stems.Any())
            {
                prepared.Add(stems.First());
            }
            else
            {
                prepared.Add(current);
            }
        }

        return prepared;
    }
}
/// <summary>
/// Registers the normalization transformations: stemming first (only when a
/// Hunspell instance is available), then lower-casing.
/// </summary>
/// <returns>This filter, to allow call chaining.</returns>
public WordsFilter Normalize()
{
    if (hunspell != null)
    {
        transformations.Add(word =>
        {
            var stems = hunspell.Stem(word);
            if (stems.Count > 0)
            {
                return stems.First();
            }

            // No stem known: pass the word through unchanged.
            return word;
        });
    }

    transformations.Add(word => word.ToLower());

    return this;
}
/// <summary>
/// Lazily replaces each tag with its first Hunspell stem, using the en_us affix
/// and dictionary data embedded in <c>Resources</c>.
/// </summary>
/// <param name="tags">Tags to stem.</param>
/// <returns>
/// An iterator yielding the first stem of each tag, or the tag itself when
/// Hunspell returns no stems.
/// </returns>
public IEnumerable<string> PrepareTags(IEnumerable<string> tags)
{
    // The dictionary resource is text and is handed to Hunspell as UTF-8 bytes.
    var dictionaryBytes = Encoding.UTF8.GetBytes(Resources.en_us_dic);

    using (var hunspell = new Hunspell(Resources.en_us_aff, dictionaryBytes))
    {
        foreach (var current in tags)
        {
            var candidates = hunspell.Stem(current);
            yield return candidates.Any() ? candidates[0] : current;
        }
    }
}
/// <summary>
/// Splits a sentence into words and replaces each word with all of its
/// Hunspell stems (en_us dictionary).
/// </summary>
/// <param name="sentence">The sentence to split.</param>
/// <param name="correct">
/// When <c>true</c>, a misspelled word that has exactly one spelling suggestion
/// is replaced by that suggestion before stemming.
/// </param>
/// <returns>
/// All stems of every word in order. A word that yields no stems is added in
/// its original form.
/// </returns>
public static IList<string> SplitToWords(string sentence, bool correct = false)
{
    var tokens = SplitToWordsNoLemmatize(sentence);

    // The shared Hunspell instance is created on first use.
    if (_spell == null)
    {
        _spell = new Hunspell("en_us.aff", "en_us.dic");
    }

    var collected = new List<string>();
    foreach (var token in tokens)
    {
        // NOTE(review): presumably reduces runs of a repeated character to two
        // occurrences — confirm against the _multipleCharacterRegex pattern.
        var candidate = _multipleCharacterRegex.Replace(token, "$1$1");

        if (correct && !_spell.Spell(token))
        {
            var suggestions = _spell.Suggest(token);

            // Only an unambiguous (single) suggestion replaces the word.
            if (suggestions != null && suggestions.Count == 1)
            {
                candidate = suggestions[0];
            }
        }

        var tokenStems = _spell.Stem(candidate);
        if (tokenStems.Count == 0)
        {
            // Fall back to the untouched token, not the rewritten candidate.
            collected.Add(token);
        }
        else
        {
            collected.AddRange(tokenStems);
        }
    }

    return collected;
}
/// <summary>
/// Replaces <c>Words</c> with the first Hunspell stem (ru_RU dictionary) of
/// every word that has one; words without any stem are dropped.
/// </summary>
/// <returns>This instance, to allow call chaining.</returns>
public WordPreprocessing IgnoreInvalidWords()
{
    using (Hunspell hunspell = new Hunspell("ru_RU.aff", "ru_RU.dic"))
    {
        // An explicit loop into a list is used on purpose: Hunspell produced
        // strange errors when it was called from LINQ queries here.
        var stemmedWords = new List<string>();

        foreach (var current in Words)
        {
            var stems = hunspell.Stem(current);
            if (stems.Count == 0)
            {
                continue;
            }

            stemmedWords.Add(stems[0]);
        }

        Words = stemmedWords;
    }

    return this;
}
/// <summary>
/// Chains a stemming step onto <c>Words</c>: every word is replaced by its
/// first Hunspell stem (ru_RU dictionary) and words without stems are dropped.
/// The error of the chained result is refined with a dictionary-related message.
/// </summary>
/// <returns>This instance, to allow call chaining.</returns>
public WordPreprocessing IgnoreInvalidWords()
{
    Words = Words
        .Then(words =>
        {
            using (var hunspell = new Hunspell("ru_RU.aff", "ru_RU.dic"))
            {
                var stemmedWords = new List<string>();

                foreach (var current in words)
                {
                    var stems = hunspell.Stem(current);
                    if (stems.Count == 0)
                    {
                        continue;
                    }

                    stemmedWords.Add(stems[0]);
                }

                // Exposed as IEnumerable<string> so the lambda's result type
                // stays the same as before.
                return (IEnumerable<string>)stemmedWords;
            }
        })
        .RefineError("Hunspell didn't find dictionaries");

    return this;
}
/// <summary>
/// Produces normalization tokens for a single Arabic word.
/// </summary>
/// <param name="input">The word to normalize.</param>
/// <returns>
/// An empty list when <paramref name="input"/> is a stop word. Otherwise one
/// token per Hunspell stem of the word, or a single token whose stem is the
/// word itself when Hunspell returns no stems. Every token has
/// <c>isEn</c> set to <c>false</c>.
/// </returns>
private static List<NormalizeToken> normalizeArabic(String input)
{
    List<NormalizeToken> list = new List<NormalizeToken>();

    // Stop words produce no tokens, so return before the dictionary is loaded.
    if (arabicStopWordArray.Contains(input))
    {
        return list;
    }

    using (Hunspell hunspell = new Hunspell(ar_aff_path, ar_dic_path))
    {
        List<string> stems = hunspell.Stem(input);

        if (stems.Count == 0)
        {
            // Unknown to the dictionary: the word acts as its own stem.
            list.Add(new NormalizeToken() { source = input, stem = input, isEn = false });
        }
        else
        {
            foreach (string candidate in stems)
            {
                list.Add(new NormalizeToken() { source = input, stem = candidate, isEn = false });
            }
        }
    }

    return list;
}
/// <summary>
/// Merges word frequencies by stem: each key is replaced with its first
/// Hunspell stem (ru_RU dictionary) and the counts of keys sharing a stem are summed.
/// </summary>
/// <param name="stats">Word-to-frequency map.</param>
/// <returns>
/// A new stem-to-frequency map. Words for which Hunspell returns no stems keep
/// their original spelling as the key.
/// </returns>
public Dictionary<string, int> Processing(Dictionary<string, int> stats)
{
    var merged = new Dictionary<string, int>();

    using (Hunspell hunspell = new Hunspell("ru_RU.aff", "ru_RU.dic"))
    {
        foreach (var entry in stats)
        {
            var key = hunspell.Stem(entry.Key).FirstOrDefault() ?? entry.Key;

            // TryGetValue leaves the total at 0 for a stem not seen before.
            int runningTotal;
            merged.TryGetValue(key, out runningTotal);
            merged[key] = runningTotal + entry.Value;
        }
    }

    return merged;
}
/// <summary>
/// Console demo of the NHunspell classes: spell checking, suggestions,
/// morphological analysis and stemming with Hunspell, thesaurus lookup with
/// MyThes, and hyphenation with Hyphen.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    using (var speller = new Hunspell("en_us.aff", "en_us.dic"))
    {
        // The spelling verdict is not used; only the suggestions are printed.
        speller.Spell("houses");
        WriteLinesWithPrefix("", speller.Suggest("haise"));
    }

    // Only these two HunspellTests cases are run.
    var hunspellTests = new HunspellTests();
    hunspellTests.SpellComplexWordsTest();
    hunspellTests.AddWordTest();

    Console.WriteLine("NHunspell functions and classes demo");

    Console.WriteLine("");
    Console.WriteLine("Thesaurus with Thes");
    var thesaurus = new MyThes("th_en_us_new.dat");
    using (var speller = new Hunspell("en_us.aff", "en_us.dic"))
    {
        var lookup = thesaurus.Lookup("cars", speller);
        foreach (var meaning in lookup.Meanings)
        {
            Console.WriteLine(" Meaning:" + meaning.Description);
            WriteLinesWithPrefix(" Synonym:", meaning.Synonyms);
        }
    }

    Console.WriteLine("");
    Console.WriteLine("Spell Check with with Hunspell");

    // Important: Hunspell uses unmanaged memory, so it must be disposed.
    // A using block does that here; calling Dispose() explicitly also works.
    using (var speller = new Hunspell("en_us.aff", "en_us.dic"))
    {
        Console.WriteLine("Check if the word 'Recommendation' is spelled correct");
        var isCorrect = speller.Spell("Recommendation");
        Console.WriteLine("Recommendation is spelled " + (isCorrect ? "correct" : "not correct"));

        Console.WriteLine("");
        Console.WriteLine("Make suggestions for the word 'Recommendatio'");
        var suggestions = speller.Suggest("Recommendatio");
        Console.WriteLine("There are " + suggestions.Count.ToString() + " suggestions");
        WriteLinesWithPrefix("Suggestion is: ", suggestions);

        Console.WriteLine("");
        Console.WriteLine("Analyze the word 'decompressed'");
        WriteLinesWithPrefix("Morph is: ", speller.Analyze("decompressed"));

        Console.WriteLine("");
        Console.WriteLine("Stem the word 'decompressed'");
        WriteLinesWithPrefix("Stem is: ", speller.Stem("decompressed"));
    }

    Console.WriteLine("");
    Console.WriteLine("Hyphenation with Hyph");

    // Important: Hyphen uses unmanaged memory as well and must be disposed.
    // A using block does that here; calling Dispose() explicitly also works.
    using (var hyphenator = new Hyphen("hyph_en_us.dic"))
    {
        Console.WriteLine("Get the hyphenation of the word 'Recommendation'");
        var hyphenation = hyphenator.Hyphenate("Recommendation");
        Console.WriteLine("'Recommendation' is hyphenated as: " + hyphenation.HyphenatedWord);

        // The same word is hyphenated twice; the results are not printed.
        hyphenation = hyphenator.Hyphenate("eighteen");
        hyphenation = hyphenator.Hyphenate("eighteen");
    }

    Console.WriteLine("");
    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();
}

/// <summary>
/// Writes every item on its own console line, preceded by the given prefix.
/// </summary>
private static void WriteLinesWithPrefix(string prefix, IEnumerable<string> items)
{
    foreach (var item in items)
    {
        Console.WriteLine(prefix + item);
    }
}
/// <summary>
/// Converts a word to its first Hunspell stem.
/// </summary>
/// <param name="word">The word to convert.</param>
/// <returns>The first stem, or the word itself when Hunspell returns no stems.</returns>
public string ConvertWord(string word)
{
    // Stem once and reuse the result for both the check and the lookup.
    var stems = hunspell.Stem(word);
    return stems.Any() ? stems.First() : word;
}
/// <summary>
/// Returns the stems of a word by delegating to the wrapped Hunspell instance.
/// </summary>
/// <param name="word">The word whose stems are requested.</param>
/// <returns>The stems exactly as returned by the wrapped instance.</returns>
public List<string> Stem(string word)
{
    List<string> stems = _hunspell.Stem(word);
    return stems;
}
/// <summary>
/// Expands a search query with stems and thesaurus synonyms of each word.
/// </summary>
/// <param name="query">Space-separated query words.</param>
/// <returns>
/// A string in which every query word appears as <c> word^5</c>, followed by
/// each Hunspell stem that differs from the word and by the space-joined
/// synonyms of every thesaurus meaning found for the word and for each stem.
/// Empty when the query contains no words.
/// </returns>
public string GetSynonyms(string query)
{
    // Built incrementally; a builder avoids re-copying the whole string on
    // every append inside the nested loops.
    var expandedQuery = new System.Text.StringBuilder();

    MyThes thes = new MyThes("th_en_us_new.dat");
    using (Hunspell hunspell = new Hunspell("en_AU.aff", "en_AU.dic"))
    {
        var words = query.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        foreach (var word in words)
        {
            expandedQuery.Append(" ").Append(word).Append("^5");

            // The word itself is always looked up first, then its stems.
            // A stem equal to the word is looked up again, as before.
            var stems = new List<string> { word };
            stems.AddRange(hunspell.Stem(word));

            foreach (var stem in stems)
            {
                ThesResult tr = thes.Lookup(stem, hunspell);

                if (!stem.Equals(word))
                {
                    expandedQuery.Append(" ").Append(stem);
                }

                if (tr != null && tr.Meanings.Count > 0)
                {
                    foreach (ThesMeaning meaning in tr.Meanings)
                    {
                        expandedQuery.Append(" ").Append(string.Join(" ", meaning.Synonyms));
                    }
                }
            }
        }
    }

    return expandedQuery.ToString();
}
/// <summary>
/// Returns the stems Hunspell reports for a word.
/// </summary>
/// <param name="word">The word to stem.</param>
/// <returns>The stems exactly as returned by <c>hunspell.Stem</c>.</returns>
public List<string> getStems(string word)
{
    List<string> stems = hunspell.Stem(word);
    return stems;
}
/// <summary>
/// Returns the stems that <c>HunspellTr</c> reports for a word.
/// </summary>
/// <param name="word">The word to stem.</param>
/// <returns>The stems exactly as returned by <c>HunspellTr.Stem</c>.</returns>
public static IEnumerable<string> GetStems(string word)
{
    IEnumerable<string> stems = HunspellTr.Stem(word);
    return stems;
}
/// <summary>
/// Reduces a word to its first Hunspell stem.
/// </summary>
/// <param name="word">The word to reduce.</param>
/// <param name="hunspell">Hunspell instance used for stemming.</param>
/// <returns>The first stem, or the word itself when no stem is available.</returns>
private string ToInitialForm(string word, Hunspell hunspell)
{
    var stems = hunspell.Stem(word);
    var initialForm = stems.FirstOrDefault();

    if (initialForm == null)
    {
        return word;
    }

    return initialForm;
}