/// <summary>
/// Builds a <see cref="Phrase"/> from the leaves of <paramref name="tree"/>, one <c>Token</c> per leaf
/// that survives the optional filters.
/// </summary>
/// <param name="tree">The tree whose leaves are read; each leaf's parent within this tree supplies the token's tag.</param>
/// <param name="root">The tree against which depth and character edges are measured. Defaults to <paramref name="tree"/> when null.</param>
/// <param name="sentence">Passed through to the <see cref="Phrase"/> constructor.</param>
/// <param name="ignore">
/// A regular expression tested against each token's text. Matching tokens are dropped, unless
/// <c>AnalyzerOptions.IgnoreMeansInclude</c> is set, in which case only matching tokens are kept.
/// A null or empty value disables this filter.
/// </param>
/// <param name="punctuation">
/// A regular expression identifying punctuation by tag or by token text; falls back to
/// <c>Analyzer.PunctuationPatterns</c> when null. Only consulted when
/// <c>AnalyzerOptions.OmitPunctuationTokens</c> is set.
/// </param>
/// <param name="options">Flags selecting the filter behavior described above.</param>
/// <returns>The collected tokens; <c>IsPunctuationOmitted</c> is set to true if any punctuation token was dropped.</returns>
public static Phrase GetTokens(this Tree tree, Tree root = null, Rhetorica.Sentence sentence = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
{
    var tokens = new Phrase(sentence: sentence);

    // None of these depend on the current leaf, so resolve them once up front.
    root = root ?? tree;
    // A null pattern would make Regex.IsMatch throw, so null is treated like empty: no filtering.
    bool hasIgnorePattern = !string.IsNullOrEmpty(ignore);
    bool ignoreMeansInclude = options.HasFlag(AnalyzerOptions.IgnoreMeansInclude);
    bool omitPunctuation = options.HasFlag(AnalyzerOptions.OmitPunctuationTokens);
    // The default pattern is only looked up when punctuation filtering is actually requested.
    string punctuationPattern = omitPunctuation ? (punctuation ?? Analyzer.PunctuationPatterns) : null;

    java.util.List leaves = tree.getLeaves();
    for (java.util.Iterator i = leaves.iterator(); i.hasNext();)
    {
        Tree leaf = (Tree)i.next();
        string token = leaf.value().Trim();

        // A leaf with no parent inside 'tree' has no tag to report, so it is skipped.
        Tree preterminal = leaf.parent(tree);
        if (preterminal == null)
        {
            continue;
        }

        string tag = preterminal.value().Trim();

        if (hasIgnorePattern)
        {
            // Exclude mode drops matches; include mode drops non-matches.
            bool isMatch = Regex.IsMatch(token, ignore);
            if (isMatch != ignoreMeansInclude)
            {
                continue;
            }
        }

        if (omitPunctuation)
        {
            // Leave out certain types of punctuation:
            bool isPunctuation = Regex.IsMatch(tag, punctuationPattern) || Regex.IsMatch(token, punctuationPattern);
            if (isPunctuation)
            {
                tokens.IsPunctuationOmitted = true;
                continue;
            }

            // But also remove any straggler punctuation missed within a token...? Maybe not. Use RegExp 'FloatingPunctuationPatterns' if so.
        }

        int depth = root.depth() - root.depth(preterminal);
        var characterEdges = new CharacterEdges(root.leftCharEdge(leaf), root.rightCharEdge(leaf));

        tokens.Add(new Token(token, tag, depth, characterEdges));
    }

    return tokens;
}
/// <summary>
/// Converts the leaves of <paramref name="tree"/> into tokens and returns them as a <see cref="Phrase"/>.
/// Each token carries the leaf's trimmed text, the trimmed value of the leaf's parent node as its tag,
/// a depth value, and the leaf's left/right character edges.
/// </summary>
/// <param name="tree">Source of the leaves; also used to look up each leaf's parent.</param>
/// <param name="root">Reference tree for depth and character-edge calculations; when null, <paramref name="tree"/> is used.</param>
/// <param name="sentence">Handed to the <see cref="Phrase"/> constructor.</param>
/// <param name="ignore">
/// Regular expression applied to the token text. By default a match excludes the token; with
/// <c>AnalyzerOptions.IgnoreMeansInclude</c> a match is required to keep it. Null or empty means no filter.
/// </param>
/// <param name="punctuation">
/// Regular expression applied to both tag and token text when <c>AnalyzerOptions.OmitPunctuationTokens</c>
/// is set; <c>Analyzer.PunctuationPatterns</c> is used when this is null.
/// </param>
/// <param name="options">Flags controlling the two filters.</param>
/// <returns>The phrase of retained tokens, with <c>IsPunctuationOmitted</c> set if punctuation was skipped.</returns>
public static Phrase GetTokens(this Tree tree, Tree root = null, Rhetorica.Sentence sentence = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
{
    var tokens = new Phrase(sentence: sentence);

    // Settings that are constant for the whole walk are computed before the loop.
    Tree referenceRoot = root ?? tree;
    bool includeOnlyMatches = options.HasFlag(AnalyzerOptions.IgnoreMeansInclude);
    bool skipPunctuation = options.HasFlag(AnalyzerOptions.OmitPunctuationTokens);
    // Guard against a null pattern: Regex.IsMatch rejects it, and "no pattern" should mean "no filter".
    bool applyIgnoreFilter = !string.IsNullOrEmpty(ignore);
    string punctuationPattern = null;
    if (skipPunctuation)
    {
        punctuationPattern = punctuation ?? Analyzer.PunctuationPatterns;
    }

    java.util.Iterator leafIterator = tree.getLeaves().iterator();
    while (leafIterator.hasNext())
    {
        Tree leaf = (Tree)leafIterator.next();
        string token = leaf.value().Trim();

        // Without a parent node there is no tag for this leaf; move on.
        Tree preterminal = leaf.parent(tree);
        if (preterminal == null)
        {
            continue;
        }

        string tag = preterminal.value().Trim();

        if (applyIgnoreFilter)
        {
            bool isMatch = Regex.IsMatch(token, ignore);
            bool keep = includeOnlyMatches ? isMatch : !isMatch;
            if (!keep)
            {
                continue;
            }
        }

        if (skipPunctuation)
        {
            // Leave out certain types of punctuation:
            if (Regex.IsMatch(tag, punctuationPattern) || Regex.IsMatch(token, punctuationPattern))
            {
                tokens.IsPunctuationOmitted = true;
                continue;
            }

            // But also remove any straggler punctuation missed within a token...? Maybe not. Use RegExp 'FloatingPunctuationPatterns' if so.
        }

        int depth = referenceRoot.depth() - referenceRoot.depth(preterminal);
        var characterEdges = new CharacterEdges(referenceRoot.leftCharEdge(leaf), referenceRoot.rightCharEdge(leaf));

        tokens.Add(new Token(token, tag, depth, characterEdges));
    }

    return tokens;
}