private string[] TokenizeSentence(string sentence)
{
    // Lazily create the maximum entropy tokenizer the first time a sentence is tokenized.
    if (mTokenizer == null)
    {
        mTokenizer = new OpenNLP.Tools.Tokenize.EnglishMaximumEntropyTokenizer(mModelPath + "EnglishTok.nbin");
    }
    return mTokenizer.Tokenize(sentence);
}
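A minimal usage sketch for the helper above, assuming mTokenizer and mModelPath are fields of the surrounding class and that EnglishTok.nbin is present in that model directory; the sample sentence and the PrintTokens name are illustrative only.

// Hypothetical caller: tokenizes a sample sentence with the lazily-created
// tokenizer above and writes one token per line.
private void PrintTokens()
{
    string[] tokens = TokenizeSentence("The quick brown fox jumps over the lazy dog.");
    foreach (string token in tokens)
    {
        System.Console.WriteLine(token);
    }
}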
public string DoParse(string[] lines, int requestedParses)
{
    System.Text.StringBuilder parseStringBuilder = new System.Text.StringBuilder();

    foreach (string line in lines)
    {
        System.Text.StringBuilder lineBuilder = new System.Text.StringBuilder();

        // Tokenize the line and normalize each token before parsing.
        string[] rawTokens = mTokenizer.Tokenize(line);
        ArrayList tokens = new ArrayList();
        foreach (string rawToken in rawTokens)
        {
            string convertedToken = ConvertToken(rawToken);
            tokens.Add(convertedToken);
            lineBuilder.Append(convertedToken).Append(" ");
        }

        if (lineBuilder.Length != 0)
        {
            // Drop the trailing space; ToString(int, int) already returns a string.
            string text = lineBuilder.ToString(0, lineBuilder.Length - 1);

            // Build an incomplete parse covering the whole line, then insert one
            // token node per token so the parser knows the token boundaries.
            Parse currentParse = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);
            int start = 0;
            foreach (string token in tokens)
            {
                currentParse.Insert(new Parse(text, new Util.Span(start, start + token.Length), MaximumEntropyParser.TokenNode, 0));
                start += token.Length + 1;
            }

            Parse[] parses = mParser.FullParse(currentParse, requestedParses);
            for (int currentParseIndex = 0, parseCount = parses.Length; currentParseIndex < parseCount; currentParseIndex++)
            {
                // When more than one parse is requested, prefix each parse with its probability.
                if (requestedParses > 1)
                {
                    lineBuilder.Append(currentParse.ToString() + " " + parses[currentParseIndex].Probability.ToString(System.Globalization.CultureInfo.InvariantCulture) + " ");
                }
                lineBuilder.Append(parses[currentParseIndex].Show());
                parseStringBuilder.Append(lineBuilder.ToString());
            }
        }
        else
        {
            // Preserve blank input lines in the output.
            parseStringBuilder.Append("\r\n");
        }
    }
    return parseStringBuilder.ToString();
}
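A hedged example of calling DoParse, assuming the surrounding class has already initialized mTokenizer and mParser from the same model directory; the input sentences and the ParseSample name are illustrative.

// Illustrative driver: parses two sample sentences, requesting a single best
// parse per line, and writes the bracketed parse output returned by DoParse.
public void ParseSample()
{
    string[] lines =
    {
        "The quick brown fox jumps over the lazy dog.",
        "Colorless green ideas sleep furiously."
    };
    string result = DoParse(lines, 1);
    System.Console.WriteLine(result);
}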
public IEnumerable<string> Tokenize(string input)
{
    // Clean the raw input before handing it to the maximum entropy tokenizer.
    var sanitizedInput = Sanitize(input);
    var tokenizer = new OpenNLP.Tools.Tokenize.EnglishMaximumEntropyTokenizer("Resources/EnglishTok.nbin");
    var tokenized = tokenizer.Tokenize(sanitizedInput);

    // Filter out stop words before returning the tokens.
    var output = tokenized.Where(token => !m_stopWords.Contains(token)).ToList();
    return output;
}
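The snippet above relies on a Sanitize helper and an m_stopWords collection that are defined elsewhere and not shown; one possible shape for them, offered purely as an assumption rather than the original code, is sketched below (System.Linq is assumed to be in scope, as the snippet already uses Where).

// Assumed supporting members (not part of the original snippet): a small
// case-insensitive stop-word set and a sanitizer that drops control characters
// and collapses whitespace before tokenizing. Both are illustrative guesses.
private readonly System.Collections.Generic.HashSet<string> m_stopWords =
    new System.Collections.Generic.HashSet<string>(
        new[] { "a", "an", "the", "and", "or", "of", "to" },
        System.StringComparer.OrdinalIgnoreCase);

private string Sanitize(string input)
{
    var cleaned = new string(input.Where(c => !char.IsControl(c)).ToArray());
    return System.Text.RegularExpressions.Regex.Replace(cleaned, @"\s+", " ").Trim();
}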
static string[] TokenSentence(string text)
{
    return mTokenizer.Tokenize(text);
}
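TokenSentence assumes a static mTokenizer field that has already been created; a minimal setup sketch, with an illustrative model path and helper name, might look like this.

// Assumed one-time setup for the static tokenizer used by TokenSentence above.
// The modelDirectory argument is illustrative; point it at the folder that
// contains EnglishTok.nbin.
private static OpenNLP.Tools.Tokenize.EnglishMaximumEntropyTokenizer mTokenizer;

static void InitializeTokenizer(string modelDirectory)
{
    mTokenizer = new OpenNLP.Tools.Tokenize.EnglishMaximumEntropyTokenizer(
        System.IO.Path.Combine(modelDirectory, "EnglishTok.nbin"));
}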