Пример #1
0
        /// <summary>
        /// Converts raw input strings into a flat list of normalized tokens.
        /// </summary>
        /// <remarks>
        /// Every non-empty entry is Unicode-decomposed (FormD), passed through
        /// <c>TrimPunctuation</c> and <c>removePeriods</c>, and then handed to
        /// <c>normalizeEnglish</c> or <c>normalizeArabic</c> depending on the result
        /// of <c>isEnglish</c>. When <c>checkIfStrInNormalizeTokenOrNot</c> reports
        /// that the cleaned text is not among the produced tokens, a token whose
        /// source and stem are both the cleaned text is appended.
        /// </remarks>
        /// <param name="input">Raw strings to normalize; null or empty entries are skipped.</param>
        /// <returns>All tokens produced for all entries, in input order.</returns>
        public static List <NormalizeToken> normalize(string[] input)
        {
            List <NormalizeToken> ret = new List <NormalizeToken>();

            foreach (string str in input)
            {
                // A null entry used to slip past the `== ""` check and reach
                // TrimPunctuation as null; treat it like an empty entry instead.
                if (string.IsNullOrEmpty(str))
                {
                    continue;
                }

                string norm = str;
                try
                {
                    norm = norm.Normalize(NormalizationForm.FormD);
                }
                catch (ArgumentException)
                {
                    // String.Normalize rejects text containing invalid Unicode
                    // (e.g. a lone surrogate). Best effort: carry on with the
                    // un-normalized text rather than dropping the entry.
                }

                norm = TrimPunctuation(norm);
                norm = removePeriods(norm);

                // Cleaning may have stripped the entry down to nothing.
                if (norm == "")
                {
                    continue;
                }

                bool isEnbool = isEnglish(norm);
                List <NormalizeToken> normsList = isEnbool
                    ? normalizeEnglish(norm)
                    : normalizeArabic(norm);

                // Make sure the cleaned text itself is represented as a token.
                if (!checkIfStrInNormalizeTokenOrNot(norm, normsList))
                {
                    normsList.Add(new NormalizeToken()
                    {
                        source = norm,
                        stem   = norm,
                        isEn   = isEnbool
                    });
                }

                ret.AddRange(normsList);
            }
            return(ret);
        }
Пример #2
0
        /// <summary>
        /// Produces the normalized tokens for a single non-English word.
        /// </summary>
        /// <remarks>
        /// Words found in <c>arabicStopWordArray</c> yield an empty list. Otherwise
        /// the word is stemmed with Hunspell using the dictionary at
        /// <c>ar_aff_path</c> / <c>ar_dic_path</c>: one token is emitted per stem,
        /// or a single token with the word as its own stem when no stem is found.
        /// </remarks>
        /// <param name="input">The word to normalize.</param>
        /// <returns>Tokens with <c>isEn</c> set to false; empty for stop words.</returns>
        private static List <NormalizeToken> normalizeArabic(String input)
        {
            List <NormalizeToken> list = new List <NormalizeToken>();

            // Check stop words first so the Hunspell dictionary is never opened
            // for a word that is going to be discarded anyway.
            if (arabicStopWordArray.Contains(input))
            {
                return(list);
            }

            // NOTE(review): a Hunspell instance is still created per call; sharing
            // one across calls would need a field outside this method.
            using (Hunspell hunspell = new Hunspell(ar_aff_path, ar_dic_path))
            {
                List <string> stems = hunspell.Stem(input);
                if (stems.Count == 0)
                {
                    // No stem known: fall back to the word itself.
                    list.Add(new NormalizeToken()
                    {
                        source = input,
                        stem   = input,
                        isEn   = false
                    });
                }
                else
                {
                    foreach (string sstem in stems)
                    {
                        list.Add(new NormalizeToken()
                        {
                            source = input,
                            stem   = sstem,
                            isEn   = false
                        });
                    }
                }
            }
            return(list);
        }