/// <summary>
/// Tokenizes <paramref name="sentence"/> with the regular expression taken from
/// <paramref name="options"/>.
/// </summary>
/// <remarks>
/// When the effective pattern is exactly <c>WHITE_SPACE</c> or <c>BLANK_LINE</c> the matches are
/// treated as separators ("gaps") and the tokens are the trimmed text between them; otherwise every
/// match is itself a token. As side effects this method assigns the <c>_regex</c> field and sets
/// <c>options.IsGap</c>.
/// </remarks>
/// <param name="sentence">Text to tokenize.</param>
/// <param name="options">
/// Supplies <c>Pattern</c> and, optionally, <c>SpecialWords</c>. When <c>SpecialWords</c> is not
/// null, a space is inserted in front of every occurrence of each special word and the special
/// words are prepended to the pattern as alternatives.
/// </param>
/// <returns>The tokens in order of appearance, each with its text and start offset.</returns>
public List<Token> Tokenize(string sentence, TokenizationOptions options)
{
    string pattern = options.Pattern;

    if (options.SpecialWords != null)
    {
        // NOTE(review): each special word is used verbatim both as a regex pattern and as a
        // replacement string, so regex metacharacters or '$' in a word are interpreted rather
        // than matched literally - confirm callers only pass plain words.
        options.SpecialWords.ForEach(r =>
        {
            sentence = Regex.Replace(sentence, r, " " + r);
        });

        pattern = String.Join("|", options.SpecialWords) + "|" + pattern;
    }

    _regex = new Regex(pattern);
    var matches = _regex.Matches(sentence).Cast<Match>().ToArray();

    // Gap mode applies only when the effective pattern is exactly one of the separator patterns.
    options.IsGap = new string[] { WHITE_SPACE, BLANK_LINE }.Contains(pattern);

    if (options.IsGap)
    {
        int pos = 0;
        var tokens = new Token[matches.Length + 1];

        for (int span = 0; span <= matches.Length; span++)
        {
            bool isLast = span == matches.Length;

            // The last token runs to the end of the text; the others stop at the next separator.
            string piece = isLast
                ? sentence.Substring(pos)
                : sentence.Substring(pos, matches[span].Index - pos);

            tokens[span] = new Token { Text = piece.Trim(), Start = pos };

            if (!isLast)
            {
                // Skip the whole separator, not just its first character, so that the next
                // token's Start points at the token itself even when the separator is longer
                // than one character.
                pos = matches[span].Index + matches[span].Length;
            }
        }

        return tokens.ToList();
    }

    var m = matches.Select(x => new Token { Text = x.Value, Start = x.Index }).ToList();

    if (options.SpecialWords != null)
    {
        // Every special-word token seen so far (including the current one) was preceded by one
        // inserted space; subtract those to map Start back onto the caller's original text.
        int offset = 0;
        m.ForEach(t =>
        {
            if (options.SpecialWords.Contains(t.Text))
            {
                offset++;
            }

            t.Start = t.Start - offset;
        });
    }

    return m;
}
/// <summary>
/// Creates a factory that remembers the supplied tokenization options and language.
/// </summary>
/// <param name="options">Tokenization options stored in <c>_options</c>.</param>
/// <param name="lang">Language stored in <c>_lang</c>.</param>
public TokenizerFactory(TokenizationOptions options, SupportedLanguage lang)
{
    _options = options;
    _lang = lang;
}
/// <summary>
/// Tokenizes <paramref name="sentence"/> by running it through a fixed sequence of regex
/// replacement tables and then splitting the rewritten text on whitespace.
/// </summary>
/// <param name="sentence">Text to tokenize; it is not modified, a working copy is rewritten.</param>
/// <param name="options">
/// Only <c>ConvertParentheses</c> is read here: when true the <c>CONVERT_PARENTHESES</c> table is
/// applied as well.
/// </param>
/// <returns>
/// The whitespace-delimited pieces of the rewritten text, after
/// <c>CorrectTokenPosition</c> has been invoked on them together with the original sentence.
/// </returns>
public List<Token> Tokenize(string sentence, TokenizationOptions options)
{
    string text = sentence;

    // Each table entry is a (pattern, replacement) pair applied in table order.
    foreach (var rule in STARTING_QUOTES)
    {
        text = Regex.Replace(text, rule.Item1, rule.Item2);
    }

    foreach (var rule in PUNCTUATION)
    {
        text = Regex.Replace(text, rule.Item1, rule.Item2);
    }

    foreach (var rule in PARENS_BRACKETS)
    {
        text = Regex.Replace(text, rule.Item1, rule.Item2);
    }

    // Optional extra pass over parentheses, controlled by the caller.
    if (options.ConvertParentheses)
    {
        foreach (var rule in CONVERT_PARENTHESES)
        {
            text = Regex.Replace(text, rule.Item1, rule.Item2);
        }
    }

    // Surround runs of two or more dashes with spaces, then drop outer whitespace.
    text = Regex.Replace(text, "(-{2,})", " $1 ");
    text = text.Trim();

    foreach (var rule in ENDING_QUOTES)
    {
        text = Regex.Replace(text, rule.Item1, rule.Item2);
    }

    foreach (var rule in CONVENTIONS)
    {
        text = Regex.Replace(text, rule.Item1, rule.Item2);
    }

    // Collapse every whitespace run to a single space and append one more space so that the
    // final piece is also terminated by a separator.
    text = Regex.Replace(text, "\\s+", " ") + " ";

    // Cut the text at each whitespace character; a token spans from the previous cut to this one.
    var tokens = new List<Token>();
    int start = 0;
    foreach (Match gap in Regex.Matches(text, "\\s"))
    {
        tokens.Add(new Token
        {
            Start = start,
            Text = text.Substring(start, gap.Index - start)
        });
        start = gap.Index + 1;
    }

    // Offsets above refer to the rewritten text; hand the original sentence to the helper that
    // adjusts token positions.
    CorrectTokenPosition(sentence, tokens);

    return tokens;
}