Example #1
0
        /// <summary>
        /// Splits <paramref name="sentence"/> into tokens using the regular expression
        /// configured in <paramref name="options"/>.
        /// </summary>
        /// <remarks>
        /// If the effective pattern is exactly WHITE_SPACE or BLANK_LINE, the regex matches are
        /// treated as separators and the text between them becomes the tokens; otherwise every
        /// regex match is itself a token. As side effects this method assigns
        /// <c>options.IsGap</c> and replaces the <c>_regex</c> field.
        /// </remarks>
        /// <param name="sentence">Text to tokenize.</param>
        /// <param name="options">Supplies the pattern and the optional special words.</param>
        /// <returns>The tokens, each with its text and start offset.</returns>
        public List <Token> Tokenize(string sentence, TokenizationOptions options)
        {
            string pattern = options.Pattern;

            if (options.SpecialWords != null)
            {
                // Insert a space in front of every special word so that it can be matched as
                // its own token; the start offsets are shifted back further below.
                // NOTE(review): each special word is used verbatim as a regex pattern and as a
                // replacement string - confirm callers never pass regex metacharacters or '$'.
                foreach (var word in options.SpecialWords)
                {
                    sentence = Regex.Replace(sentence, word, " " + word);
                }

                pattern = String.Join("|", options.SpecialWords) + "|" + pattern;
            }

            _regex = new Regex(pattern);

            var matches = _regex.Matches(sentence).Cast <Match>().ToArray();

            options.IsGap = new string[] { WHITE_SPACE, BLANK_LINE }.Contains(pattern);

            if (options.IsGap)
            {
                // Matches are separators: emit the text between consecutive matches,
                // plus the remainder after the last one.
                var tokens = new List <Token>(matches.Length + 1);
                int pos    = 0;

                foreach (var gap in matches)
                {
                    tokens.Add(new Token
                    {
                        Text  = sentence.Substring(pos, gap.Index - pos).Trim(),
                        Start = pos
                    });

                    // Skip the whole separator, not just its first character, so the next
                    // token's Start points at the first character after the separator.
                    pos = gap.Index + gap.Length;
                }

                tokens.Add(new Token
                {
                    Text  = sentence.Substring(pos).Trim(),
                    Start = pos
                });

                return(tokens);
            }

            // Matches are the tokens themselves.
            var matched = matches.Select(x => new Token
            {
                Text  = x.Value,
                Start = x.Index
            }).ToList();

            if (options.SpecialWords != null)
            {
                // Every special-word token seen so far (including the current one) stands for
                // one space inserted above, so subtract that count from the start offset.
                int offset = 0;

                foreach (var token in matched)
                {
                    if (options.SpecialWords.Contains(token.Text))
                    {
                        offset++;
                    }

                    token.Start = token.Start - offset;
                }
            }

            return(matched);
        }
Example #2
0
 /// <summary>
 /// Creates a factory that remembers the tokenization options and the language it was given.
 /// </summary>
 /// <param name="options">Tokenization options stored in <c>_options</c>.</param>
 /// <param name="lang">Language stored in <c>_lang</c>.</param>
 public TokenizerFactory(TokenizationOptions options, SupportedLanguage lang)
 {
     _options = options;
     _lang    = lang;
 }
Example #3
0
        /// <summary>
        /// Tokenizes <paramref name="sentence"/> by running a fixed sequence of regex rewrite
        /// tables over it and then splitting the rewritten text on whitespace.
        /// </summary>
        /// <param name="sentence">Text to tokenize.</param>
        /// <param name="options">
        /// Only <c>ConvertParentheses</c> is read here; it switches the CONVERT_PARENTHESES pass on.
        /// </param>
        /// <returns>
        /// The tokens after <c>CorrectTokenPosition</c> has been applied against the original sentence.
        /// </returns>
        public List <Token> Tokenize(string sentence, TokenizationOptions options)
        {
            string working = sentence;

            // Each table entry is a (pattern, replacement) pair applied in table order.
            foreach (var rule in STARTING_QUOTES)
            {
                working = Regex.Replace(working, rule.Item1, rule.Item2);
            }

            foreach (var rule in PUNCTUATION)
            {
                working = Regex.Replace(working, rule.Item1, rule.Item2);
            }

            foreach (var rule in PARENS_BRACKETS)
            {
                working = Regex.Replace(working, rule.Item1, rule.Item2);
            }

            if (options.ConvertParentheses)
            {
                foreach (var rule in CONVERT_PARENTHESES)
                {
                    working = Regex.Replace(working, rule.Item1, rule.Item2);
                }
            }

            // Surround runs of two or more dashes with spaces, then trim the ends.
            working = Regex.Replace(working, "(-{2,})", " $1 ").Trim();

            foreach (var rule in ENDING_QUOTES)
            {
                working = Regex.Replace(working, rule.Item1, rule.Item2);
            }

            // Apply the CONVENTIONS rewrite table.
            foreach (var rule in CONVENTIONS)
            {
                working = Regex.Replace(working, rule.Item1, rule.Item2);
            }

            // Collapse every whitespace run to one space and append a trailing space so that
            // the last token is also terminated by a separator.
            working = Regex.Replace(working, "\\s+", " ") + " ";

            // Cut one token per whitespace character: the text from the cursor up to it.
            var tokens = new List <Token>();
            int cursor = 0;

            foreach (Match separator in Regex.Matches(working, "\\s"))
            {
                tokens.Add(new Token
                {
                    Start = cursor,
                    Text  = working.Substring(cursor, separator.Index - cursor)
                });

                cursor = separator.Index + 1;
            }

            // Offsets above refer to the rewritten text; hand the original sentence to the
            // position-correction helper.
            CorrectTokenPosition(sentence, tokens);

            return(tokens);
        }