/// <summary> /// This function tokenizes the string(content of each document one by one). It tokenize by using regular expressions /// Also, StopWords will also be removed here /// </summary> /// <param name="input"></param> /// <returns>it returns a string array whose each index contains a token.</returns> public string[] Tokenize(string documentContents) { string pattern = "[ ۔،؛:)(!؟/؎{}‘’0123456789]"; //it will match space and other punctuation marks. Regex _regex = new Regex(pattern); string[] tokens = _regex.Split(documentContents); List <string> processedList = new List <string>(); // this list will contain words after punctuation removal and stopword removal for (int i = 0; i < tokens.Length; i++) { //Below line further checks if any character in RE is still in content, maybe a space character, which will be removed in if condition MatchCollection mc = _regex.Matches(tokens[i]); //Represents the set of successful matches found by iteratively applying a regular //expression pattern to the input string. if (mc.Count <= 0 && tokens[i].Trim().Length > 0 && !StopWordsHandler.IsStopWord(tokens[i])) { processedList.Add(tokens[i]); } } return(processedList.ToArray()); }
public Tokeniser() { StopWordsHandler stopWordHandler = new StopWordsHandler(); }