/// <summary>
/// Removes <paramref name="word"/> from <paramref name="sentence"/> when
/// <c>ShouldBeRemoved</c> reports that its value should be dropped.
/// </summary>
/// <param name="word">The word whose <c>Value</c> is checked.</param>
/// <param name="sentence">The sentence whose <c>Words</c> collection is modified.</param>
void RemoveUnnassesaryWords(TajikWord word, TajikSentence sentence)
{
    // Nothing to do for words that should be kept.
    if (!ShouldBeRemoved(word.Value))
    {
        return;
    }

    sentence.Words.Remove(word);
}
/// <summary>
/// Collects the distinct words of a sentence.
/// </summary>
/// <remarks>
/// The sentence content has every " - " collapsed to "-" and is then split on
/// single spaces. Each token has all matches of
/// <c>Statics.SentenceEndSymbols</c> removed. Tokens that are empty after that
/// step (for example those produced by consecutive spaces) are skipped, and
/// duplicates are dropped, keeping the first occurrence in sentence order.
/// </remarks>
/// <param name="sentence">The sentence whose <c>Content</c> is split.</param>
/// <returns>One <c>TajikWord</c> per distinct, non-empty cleaned token.</returns>
public IEnumerable<TajikWord> SplitWordsFromSentences(TajikSentence sentence)
{
    var wordsInstances = new List<TajikWord>();

    // Tracks cleaned tokens already emitted. De-duplicating AFTER the symbols
    // are stripped ensures that e.g. "word" and "word." yield a single entry.
    var seen = new HashSet<string>();

    var tokens = sentence.Content.Replace(" - ", "-").Split(' ');

    foreach (var token in tokens)
    {
        var cleaned = Regex.Replace(token, Statics.SentenceEndSymbols, string.Empty);

        // Skip empty tokens (consecutive/leading/trailing spaces, or tokens
        // that consisted solely of removed symbols).
        if (cleaned.Length == 0)
        {
            continue;
        }

        // HashSet.Add returns false when the value was already present.
        if (seen.Add(cleaned))
        {
            wordsInstances.Add(new TajikWord(cleaned));
        }
    }

    return wordsInstances;
}
/// <summary>
/// Text normalization: runs every normalization step on each word.
/// </summary>
/// <remarks>
/// For each word the steps are applied in this order: <c>ShakliJam</c>,
/// <c>Ishorakuni</c>, <c>BandakiI</c>, <c>BandakiE</c>, <c>BandakiU</c>, and
/// finally <c>RemoveUnnassesaryWords</c>, which may remove the word from
/// <paramref name="sentence"/>.
/// </remarks>
/// <param name="words">The words to normalize.</param>
/// <param name="sentence">The sentence passed to the removal step.</param>
public void NormalizeWords(IEnumerable<TajikWord> words, TajikSentence sentence)
{
    // Iterate over a snapshot so the loop is unaffected if the removal step
    // modifies a collection that backs <paramref name="words"/>.
    List<TajikWord> snapshot = words.ToList();

    for (int index = 0; index < snapshot.Count; index++)
    {
        TajikWord current = snapshot[index];

        ShakliJam(current);
        Ishorakuni(current);
        BandakiI(current);
        BandakiE(current);
        BandakiU(current);
        RemoveUnnassesaryWords(current, sentence);
    }
}