Beispiel #1
0
 /// <summary>
 /// Splits a sentence into tokens, creating the tokenizer on first use.
 /// </summary>
 /// <param name="sentence">The sentence text handed to the tokenizer.</param>
 /// <returns>The tokens produced by <c>mTokenizer.Tokenize</c>.</returns>
 private string[] TokenizeSentence(string sentence)
 {
     // Lazily build the tokenizer from the model file located under mModelPath.
     if (mTokenizer == null)
     {
         string modelFile = mModelPath + "EnglishTok.nbin";
         mTokenizer = new OpenNLP.Tools.Tokenize.EnglishMaximumEntropyTokenizer(modelFile);
     }

     string[] tokens = mTokenizer.Tokenize(sentence);
     return tokens;
 }
        /// <summary>
        /// Tokenizes and parses each input line and concatenates the textual results.
        /// </summary>
        /// <param name="lines">The lines to parse; each one is tokenized with <c>mTokenizer</c>.</param>
        /// <param name="requestedParses">
        /// Number of parses requested from <c>mParser.FullParse</c>. When greater than one,
        /// each parse is prefixed with the initial parse's string form and the parse probability.
        /// </param>
        /// <returns>
        /// The accumulated output for all lines. A line that yields no tokens contributes "\r\n".
        /// </returns>
        public string DoParse(string[] lines, int requestedParses)
        {
            System.Text.StringBuilder output = new System.Text.StringBuilder();

            foreach (string line in lines)
            {
                string[] rawTokens = mTokenizer.Tokenize(line);

                // No tokens at all: emit a bare line break and move on.
                if (rawTokens.Length == 0)
                {
                    output.Append("\r\n");
                    continue;
                }

                // Convert every token and build the space-separated text in one pass.
                System.Text.StringBuilder lineBuilder = new System.Text.StringBuilder();
                string[] converted = new string[rawTokens.Length];
                for (int i = 0; i < rawTokens.Length; i++)
                {
                    converted[i] = ConvertToken(rawTokens[i]);
                    lineBuilder.Append(converted[i]).Append(" ");
                }

                // Drop the trailing separator to obtain the text the spans index into.
                string text = lineBuilder.ToString(0, lineBuilder.Length - 1);
                Parse root = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);

                // Insert one token node per converted token; offsets skip the single space between tokens.
                int offset = 0;
                foreach (string token in converted)
                {
                    int end = offset + token.Length;
                    root.Insert(new Parse(text, new Util.Span(offset, end), MaximumEntropyParser.TokenNode, 0));
                    offset = end + 1;
                }

                Parse[] parses = mParser.FullParse(root, requestedParses);
                bool showProbability = requestedParses > 1;

                // NOTE(review): lineBuilder still holds the space-terminated token text and is never
                // cleared, so every chunk appended below also contains that text plus all earlier
                // parses of this line. Kept as-is to preserve existing output - confirm intent.
                foreach (Parse parse in parses)
                {
                    if (showProbability)
                    {
                        lineBuilder.Append(root.ToString() + " " + parse.Probability.ToString(System.Globalization.CultureInfo.InvariantCulture) + " ");
                    }

                    lineBuilder.Append(parse.Show());
                    output.Append(lineBuilder.ToString());
                }
            }

            return output.ToString();
        }
Beispiel #3
0
        /// <summary>
        /// Sanitizes the input, tokenizes it, and removes every token found in <c>m_stopWords</c>.
        /// </summary>
        /// <param name="input">The raw text; it is passed through <c>Sanitize</c> before tokenizing.</param>
        /// <returns>A materialized list of the tokens that are not stop words, in tokenizer order.</returns>
        public IEnumerable <string> Tokenize(string input)
        {
            var cleaned = Sanitize(input);

            // NOTE(review): a new tokenizer is constructed from the model file on every call;
            // caching it would need state outside this method.
            var englishTokenizer = new OpenNLP.Tools.Tokenize.EnglishMaximumEntropyTokenizer("Resources/EnglishTok.nbin");
            var allTokens = englishTokenizer.Tokenize(cleaned);

            // Keep only tokens that are not listed as stop words.
            var kept = from token in allTokens
                       where !m_stopWords.Contains(token)
                       select token;

            return kept.ToList();
        }
 /// <summary>
 /// Tokenizes the given text with the shared <c>mTokenizer</c>.
 /// </summary>
 /// <param name="text">The text handed to the tokenizer.</param>
 /// <returns>The tokens produced by <c>mTokenizer.Tokenize</c>.</returns>
 static string[] TokenSentence(string text)
 {
     // No null check here: mTokenizer must already be initialized by the time this is called.
     string[] tokens = mTokenizer.Tokenize(text);
     return tokens;
 }