/// <summary>
/// Normalizes <paramref name="input"/> through the Hazm web service, sending the text
/// in several portions when it is longer than <c>safePrtionSize</c>.
/// </summary>
/// <param name="input">Text to normalize; may be null or empty.</param>
/// <returns>
/// The normalized portions concatenated in order with no separator, or an empty string
/// when <paramref name="input"/> is null or empty.
/// </returns>
public static string Normalizer(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return "";
    }

    input = Utilizer.RemoveTroublesomeCharacters(input);

    // Split the text based on size. The original note here says 5000 characters is a
    // safe size for the Hazm web service; the actual limit is safePrtionSize, which is
    // defined outside this method.
    List<string> portions;
    if (input.Length > safePrtionSize)
    {
        portions = Utilizer.SplitUsingUpssalaSentSegmenter(input, safePrtionSize);
    }
    else
    {
        portions = new List<string>() { input };
    }

    // Normalize each portion in place.
    for (int i = 0; i < portions.Count; i++)
    {
        portions[i] = HazmWebService().Normalizer(portions[i]);
    }

    // string.Concat is used instead of a seedless Aggregate: it does not throw when the
    // list is empty, treats null portions as empty, and avoids repeated re-concatenation.
    return string.Concat(portions);
}
/// <summary>
/// Splits <paramref name="input"/> into sentences through the Hazm web service, sending
/// the text in several portions when it is longer than <c>safePrtionSize</c>.
/// </summary>
/// <param name="input">Text to tokenize; may be null or empty.</param>
/// <returns>
/// The trimmed, non-blank sentence tokens in the order the service returned them, or an
/// empty array when <paramref name="input"/> is null or empty.
/// </returns>
public static string[] SentenceTokenizer(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return new string[0];
    }

    input = Utilizer.RemoveTroublesomeCharacters(input);

    // Split the text based on size. The original note here says 5000 characters is a
    // safe size for the Hazm web service; the actual limit is safePrtionSize, which is
    // defined outside this method.
    List<string> portions;
    if (input.Length > safePrtionSize)
    {
        portions = Utilizer.SplitUsingUpssalaSentSegmenter(input, safePrtionSize);
    }
    else
    {
        portions = new List<string>() { input };
    }

    // Tokenize each portion and collect the sentences in order.
    List<string> finalTokens = new List<string>();
    foreach (string portion in portions)
    {
        finalTokens.AddRange(HazmWebService().SentenceTokenizer(portion));
    }

    // Drop null/blank tokens BEFORE trimming so a null token cannot cause a
    // NullReferenceException in Trim(); the surviving tokens are the same as with
    // trim-then-filter because both use the same definition of white space.
    return finalTokens
        .Where(s => !string.IsNullOrWhiteSpace(s))
        .Select(s => s.Trim())
        .ToArray();
}