Пример #1
0
        /// <summary>
        /// Normalizes <paramref name="input"/> through the Hazm web service.
        /// Text longer than <c>safePrtionSize</c> is split into portions first and
        /// each portion is normalized with its own service call.
        /// </summary>
        /// <param name="input">Text to normalize; may be null or empty.</param>
        /// <returns>
        /// The normalized portions concatenated in their original order, or an empty
        /// string when there is no text to normalize.
        /// </returns>
        public static string Normalizer(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return "";
            }

            input = Utilizer.RemoveTroublesomeCharacters(input);

            // The clean-up step may leave nothing behind; avoid a pointless
            // service round trip (and a null dereference below) in that case.
            if (string.IsNullOrEmpty(input))
            {
                return "";
            }

            // Split the text when it exceeds the portion size considered safe for
            // the Hazm web service. The original note cites 5000 characters as a
            // safe size; the actual limit is whatever safePrtionSize is set to.
            List<string> portions;

            if (input.Length > safePrtionSize)
            {
                portions = Utilizer.SplitUsingUpssalaSentSegmenter(input, safePrtionSize);
            }
            else
            {
                portions = new List<string> { input };
            }

            for (int i = 0; i < portions.Count; i++)
            {
                // Replace each portion in place with its normalized form.
                portions[i] = HazmWebService().Normalizer(portions[i]);
            }

            // string.Concat yields "" for an empty list (a seedless Aggregate would
            // throw InvalidOperationException) and builds the result in one pass.
            return string.Concat(portions);
        }
Пример #2
0
        /// <summary>
        /// Splits <paramref name="input"/> into sentences through the Hazm web service.
        /// Text longer than <c>safePrtionSize</c> is split into portions first and
        /// each portion is tokenized with its own service call.
        /// </summary>
        /// <param name="input">Text to tokenize; may be null or empty.</param>
        /// <returns>
        /// The trimmed, non-blank sentences in the order the service returned them,
        /// or an empty array when there is no text to tokenize.
        /// </returns>
        public static string[] SentenceTokenizer(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return new string[0];
            }

            input = Utilizer.RemoveTroublesomeCharacters(input);

            // The clean-up step may leave nothing behind; avoid a pointless
            // service round trip (and a null dereference below) in that case.
            if (string.IsNullOrEmpty(input))
            {
                return new string[0];
            }

            // Split the text when it exceeds the portion size considered safe for
            // the Hazm web service. The original note cites 5000 characters as a
            // safe size; the actual limit is whatever safePrtionSize is set to.
            List<string> portions;

            if (input.Length > safePrtionSize)
            {
                portions = Utilizer.SplitUsingUpssalaSentSegmenter(input, safePrtionSize);
            }
            else
            {
                portions = new List<string> { input };
            }

            List<string> finalTokens = new List<string>();

            for (int i = 0; i < portions.Count; i++)
            {
                // Collect the sentences the service finds in this portion.
                finalTokens.AddRange(HazmWebService().SentenceTokenizer(portions[i]));
            }

            // Drop null/blank tokens BEFORE trimming so a null token cannot cause a
            // NullReferenceException in Trim(); the result for non-null tokens is
            // the same as trimming first and filtering afterwards.
            return finalTokens
                .Where(s => !string.IsNullOrWhiteSpace(s))
                .Select(s => s.Trim())
                .ToArray();
        }