/// <summary>
/// Normalizes each input string into one or more <see cref="NormalizeToken"/> entries.
/// </summary>
/// <remarks>
/// For every non-empty entry the method:
/// 1. applies Unicode FormD normalization (the text is left as-is if it contains invalid Unicode),
/// 2. runs <c>TrimPunctuation</c> and then <c>removePeriods</c>,
/// 3. skips the entry if nothing is left,
/// 4. dispatches to <c>normalizeEnglish</c> or <c>normalizeArabic</c> depending on <c>isEnglish</c>,
/// 5. appends a fallback token whose source and stem are both the cleaned text when
///    <c>checkIfStrInNormalizeTokenOrNot</c> reports that the cleaned text is not already in the list.
/// </remarks>
/// <param name="input">Raw strings to normalize. Null or empty entries are skipped.</param>
/// <returns>All produced tokens, in the order of the input entries.</returns>
public static List<NormalizeToken> normalize(string[] input) {
    List<NormalizeToken> ret = new List<NormalizeToken>();

    foreach (string str in input) {
        // Null entries are skipped here rather than relying on a swallowed NullReferenceException.
        if (string.IsNullOrEmpty(str)) {
            continue;
        }

        string norm = str;
        try {
            norm = norm.Normalize(NormalizationForm.FormD);
        } catch (ArgumentException) {
            // Best effort: the string contains invalid Unicode, so continue with the raw text.
        }

        norm = TrimPunctuation(norm);
        norm = removePeriods(norm);
        if (string.IsNullOrEmpty(norm)) {
            continue;
        }

        bool isEnbool = isEnglish(norm);
        List<NormalizeToken> normsList = isEnbool
            ? normalizeEnglish(norm)
            : normalizeArabic(norm);

        // Make sure the cleaned text itself is always represented by a token.
        if (!checkIfStrInNormalizeTokenOrNot(norm, normsList)) {
            normsList.Add(new NormalizeToken() { source = norm, stem = norm, isEn = isEnbool });
        }

        ret.AddRange(normsList);
    }

    return ret;
}
/// <summary>
/// Produces stem tokens for a single non-English word using the Hunspell dictionary
/// located at <c>ar_aff_path</c> / <c>ar_dic_path</c>.
/// </summary>
/// <param name="input">The cleaned word to stem.</param>
/// <returns>
/// An empty list when <paramref name="input"/> is found in <c>arabicStopWordArray</c>;
/// otherwise one token per stem returned by Hunspell, or a single token whose stem is the
/// input itself when Hunspell returns no stems. Every token has <c>isEn = false</c>.
/// </returns>
private static List<NormalizeToken> normalizeArabic(String input) {
    List<NormalizeToken> list = new List<NormalizeToken>();

    // Check stop words before touching Hunspell so no instance is built for words
    // that are going to be discarded anyway.
    if (arabicStopWordArray.Contains(input)) {
        return list;
    }

    // NOTE(review): a new Hunspell instance is still created on every call; consider
    // sharing one if this shows up in profiles - confirm thread-safety first.
    using (Hunspell hunspell = new Hunspell(ar_aff_path, ar_dic_path)) {
        List<string> stems = hunspell.Stem(input);

        if (stems.Count == 0) {
            // No stem known: fall back to the word itself.
            list.Add(new NormalizeToken() { source = input, stem = input, isEn = false });
        } else {
            foreach (string sstem in stems) {
                list.Add(new NormalizeToken() { source = input, stem = sstem, isEn = false });
            }
        }
    }

    return list;
}