/// <summary>
/// Builds one <see cref="Phrase"/> for each phrasal node produced by iterating over <paramref name="root"/>.
/// </summary>
/// <remarks>
/// A phrasal node whose only child is itself phrasal is skipped. Phrases that end up with no tokens are
/// not added to the result.
/// </remarks>
/// <param name="root">Tree whose nodes are iterated; also passed to <c>GetTokens</c> as the root.</param>
/// <param name="sentence">Forwarded unchanged to <c>GetTokens</c>.</param>
/// <param name="ignore">Forwarded unchanged to <c>GetTokens</c>.</param>
/// <param name="punctuation">Forwarded unchanged to <c>GetTokens</c>.</param>
/// <param name="options">
/// Forwarded to <c>GetTokens</c>. When <c>OmitFalseDuplicatePhrases</c> is set, a phrase that
/// <c>EqualExceptPunctuationOmission</c> reports as matching the most recently kept phrase replaces it.
/// </param>
/// <returns>The phrases that were kept, in iteration order.</returns>
public static List<Phrase> GetPhrases(this Tree root, Rhetorica.Sentence sentence = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
{
    var result = new List<Phrase>();
    bool replaceFalseDuplicates = options.HasFlag(AnalyzerOptions.OmitFalseDuplicatePhrases);

    java.util.Iterator nodes = root.iterator();
    while (nodes.hasNext())
    {
        var node = (Tree)nodes.next();
        if (!node.isPhrasal())
        {
            continue;
        }

        // Skip a phrasal node that merely wraps a single phrasal child.
        java.util.List kids = node.getChildrenAsList();
        bool wrapsSinglePhrase = kids.size() == 1 && ((Tree)kids.get(0)).isPhrasal();
        if (wrapsSinglePhrase)
        {
            continue;
        }

        var candidate = new Phrase(node.GetTokens(root, sentence, ignore, punctuation, options));

        // If the candidate matches the last kept phrase but for punctuation omission, it takes that slot.
        if (replaceFalseDuplicates
            && result.Count > 0
            && result[result.Count - 1].EqualExceptPunctuationOmission(candidate))
        {
            result[result.Count - 1] = candidate;
            continue;
        }

        if (candidate.Count != 0)
        {
            result.Add(candidate);
        }
    }

    return result;
}
/// <summary>
/// Builds phrases from the preterminal and pre-preterminal nodes of <paramref name="root"/>, then joins
/// each single-token IN/TO phrase onto the phrase that follows it.
/// </summary>
/// <remarks>
/// A preterminal node is skipped when its parent (looked up relative to <paramref name="root"/>) is
/// pre-preterminal. Phrases with no tokens are not added.
/// </remarks>
/// <param name="root">Tree whose nodes are iterated; also passed to <c>GetTokens</c> as the root.</param>
/// <param name="sentence">Forwarded unchanged to <c>GetTokens</c>.</param>
/// <param name="ignore">Forwarded unchanged to <c>GetTokens</c>.</param>
/// <param name="punctuation">Forwarded unchanged to <c>GetTokens</c>.</param>
/// <param name="options">
/// Forwarded to <c>GetTokens</c>. When <c>OmitFalseDuplicatePhrases</c> is set, a phrase that
/// <c>EqualExceptPunctuationOmission</c> reports as matching the most recently kept phrase replaces it.
/// </param>
/// <returns>The resulting phrases, in iteration order, after the IN/TO join pass.</returns>
public static List<Phrase> GetPrePreTerminalPhrases(this Tree root, Rhetorica.Sentence sentence = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
{
    var phrases = new List<Phrase>();
    bool omitFalseDuplicatePhrases = options.HasFlag(AnalyzerOptions.OmitFalseDuplicatePhrases);

    for (java.util.Iterator i = root.iterator(); i.hasNext(); )
    {
        Tree tree = (Tree)i.next();

        bool isPreTerminal = tree.isPreTerminal();
        if (!isPreTerminal && !tree.isPrePreTerminal())
        {
            continue;
        }

        if (isPreTerminal)
        {
            // Look the parent up once; skip preterminals that sit under a pre-preterminal node.
            Tree parent = tree.parent(root);
            if (parent != null && parent.isPrePreTerminal())
            {
                continue;
            }
        }

        var current = new Phrase(tree.GetTokens(root, sentence, ignore, punctuation, options));

        // If current node matches previous node but for punctuation omission, replace previous with current:
        if (omitFalseDuplicatePhrases && phrases.Count > 0)
        {
            Phrase previous = phrases[phrases.Count - 1];
            if (previous.EqualExceptPunctuationOmission(current))
            {
                phrases[phrases.Count - 1] = current;
                continue;
            }
        }

        if (current.Count == 0)
        {
            continue;
        }

        phrases.Add(current);
    }

    // If "phrase" is a single token which is a preposition (IN) or infinitival to (TO), then join it to
    // the subsequent phrase. The last phrase has no successor, so the scan stops one short of the end.
    for (int i = 0; i < phrases.Count - 1; ++i)
    {
        Phrase phrase = phrases[i];
        if (phrase.Count == 1 && Regex.IsMatch(phrase[0].TagEquivalent, @"^(IN|TO)$", RegexOptions.IgnoreCase))
        {
            phrases[i + 1].Tokens.InsertRange(0, phrase.Tokens);
            phrases.RemoveAt(i);

            // Re-examine the merged phrase that now sits at index i. Entries before i are untouched by the
            // merge, so restarting the scan from the beginning is unnecessary.
            --i;
        }
    }

    return phrases;
}
/// <summary>
/// Builds one <see cref="Phrase"/> for each node of <paramref name="root"/> whose label value is one of
/// S, SBAR, SBARQ, SINV, SQ or FRAG (compared case-insensitively).
/// </summary>
/// <remarks>
/// Phrases with no tokens are not added. If no clause is found at all, the first phrase returned by
/// <c>GetPhrases</c> (when there is one) is used as the only entry.
/// </remarks>
/// <param name="root">Tree whose nodes are iterated; also passed to <c>GetTokens</c> as the root.</param>
/// <param name="sentence">Forwarded unchanged to <c>GetTokens</c> and <c>GetPhrases</c>.</param>
/// <param name="ignore">Forwarded unchanged to <c>GetTokens</c> and <c>GetPhrases</c>.</param>
/// <param name="punctuation">Forwarded unchanged to <c>GetTokens</c> and <c>GetPhrases</c>.</param>
/// <param name="options">
/// Forwarded to <c>GetTokens</c> and <c>GetPhrases</c>. When <c>OmitFalseDuplicatePhrases</c> is set, a
/// clause that <c>EqualExceptPunctuationOmission</c> reports as matching the most recently kept clause
/// replaces it.
/// </param>
/// <returns>The clauses that were kept, in iteration order.</returns>
public static List<Phrase> GetClauses(this Tree root, Rhetorica.Sentence sentence = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
{
    const string clausalLabelPattern = @"^(S|SBAR|SBARQ|SINV|SQ|FRAG)$";

    var clauses = new List<Phrase>();
    bool replaceFalseDuplicates = options.HasFlag(AnalyzerOptions.OmitFalseDuplicatePhrases);

    java.util.Iterator nodes = root.iterator();
    while (nodes.hasNext())
    {
        var node = (Tree)nodes.next();
        var label = node.label().value();
        if (!Regex.IsMatch(label, clausalLabelPattern, RegexOptions.IgnoreCase))
        {
            continue;
        }

        var candidate = new Phrase(node.GetTokens(root, sentence, ignore, punctuation, options));

        // If the candidate matches the last kept clause but for punctuation omission, it takes that slot.
        if (replaceFalseDuplicates
            && clauses.Count > 0
            && clauses[clauses.Count - 1].EqualExceptPunctuationOmission(candidate))
        {
            clauses[clauses.Count - 1] = candidate;
            continue;
        }

        if (candidate.Count != 0)
        {
            clauses.Add(candidate);
        }
    }

    // Since 'root' has been identified as a sentence, it should have at least one clause associated with it.
    if (clauses.Count == 0)
    {
        List<Phrase> fallback = root.GetPhrases(sentence, ignore, punctuation, options);
        if (fallback.Count > 0)
        {
            clauses.Add(fallback[0]);
        }
    }

    return clauses;
}
/// <summary>
/// Walks every node of <paramref name="root"/> and returns a <see cref="Phrase"/> for each phrasal node,
/// except phrasal nodes whose single child is also phrasal.
/// </summary>
/// <param name="root">Tree to walk; it is also handed to <c>GetTokens</c> as the root argument.</param>
/// <param name="sentence">Passed through to <c>GetTokens</c>.</param>
/// <param name="ignore">Passed through to <c>GetTokens</c>.</param>
/// <param name="punctuation">Passed through to <c>GetTokens</c>.</param>
/// <param name="options">
/// Passed through to <c>GetTokens</c>; the <c>OmitFalseDuplicatePhrases</c> flag additionally makes a
/// phrase overwrite the previously collected one when <c>EqualExceptPunctuationOmission</c> is true.
/// </param>
/// <returns>Collected phrases in visiting order; phrases without tokens are left out.</returns>
public static List<Phrase> GetPhrases(this Tree root, Rhetorica.Sentence sentence = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
{
    var collected = new List<Phrase>();

    for (java.util.Iterator walker = root.iterator(); walker.hasNext(); )
    {
        Tree subtree = (Tree)walker.next();
        if (!subtree.isPhrasal())
        {
            continue;
        }

        // A phrasal node with exactly one child that is itself phrasal is passed over.
        java.util.List childList = subtree.getChildrenAsList();
        if (childList.size() == 1)
        {
            var onlyChild = (Tree)childList.get(0);
            if (onlyChild.isPhrasal())
            {
                continue;
            }
        }

        Phrase phrase = new Phrase(subtree.GetTokens(root, sentence, ignore, punctuation, options));

        // When the new phrase equals the previous one except for punctuation omission, overwrite the previous.
        if (options.HasFlag(AnalyzerOptions.OmitFalseDuplicatePhrases) && collected.Count > 0)
        {
            int lastIndex = collected.Count - 1;
            if (collected[lastIndex].EqualExceptPunctuationOmission(phrase))
            {
                collected[lastIndex] = phrase;
                continue;
            }
        }

        if (phrase.Count != 0)
        {
            collected.Add(phrase);
        }
    }

    return collected;
}
/// <summary>
/// Walks every node of <paramref name="root"/> and returns a <see cref="Phrase"/> for each node whose
/// label value matches S, SBAR, SBARQ, SINV, SQ or FRAG, ignoring case.
/// </summary>
/// <param name="root">Tree to walk; it is also handed to <c>GetTokens</c> as the root argument.</param>
/// <param name="sentence">Passed through to <c>GetTokens</c> and <c>GetPhrases</c>.</param>
/// <param name="ignore">Passed through to <c>GetTokens</c> and <c>GetPhrases</c>.</param>
/// <param name="punctuation">Passed through to <c>GetTokens</c> and <c>GetPhrases</c>.</param>
/// <param name="options">
/// Passed through to <c>GetTokens</c> and <c>GetPhrases</c>; the <c>OmitFalseDuplicatePhrases</c> flag
/// additionally makes a clause overwrite the previously collected one when
/// <c>EqualExceptPunctuationOmission</c> is true.
/// </param>
/// <returns>
/// Collected clauses in visiting order, without token-less clauses. When nothing was collected, the list
/// holds the first result of <c>GetPhrases</c> if that call returned anything.
/// </returns>
public static List<Phrase> GetClauses(this Tree root, Rhetorica.Sentence sentence = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
{
    var collected = new List<Phrase>();

    for (java.util.Iterator walker = root.iterator(); walker.hasNext(); )
    {
        Tree subtree = (Tree)walker.next();

        var labelValue = subtree.label().value();
        if (Regex.IsMatch(labelValue, @"^(S|SBAR|SBARQ|SINV|SQ|FRAG)$", RegexOptions.IgnoreCase))
        {
            Phrase clause = new Phrase(subtree.GetTokens(root, sentence, ignore, punctuation, options));

            // When the new clause equals the previous one except for punctuation omission, overwrite the previous.
            bool overwrotePrevious = false;
            if (options.HasFlag(AnalyzerOptions.OmitFalseDuplicatePhrases) && collected.Count > 0)
            {
                int lastIndex = collected.Count - 1;
                if (collected[lastIndex].EqualExceptPunctuationOmission(clause))
                {
                    collected[lastIndex] = clause;
                    overwrotePrevious = true;
                }
            }

            if (!overwrotePrevious && clause.Count != 0)
            {
                collected.Add(clause);
            }
        }
    }

    if (collected.Count != 0)
    {
        return collected;
    }

    // Since 'root' has been identified as a sentence, it should have at least one clause associated with it.
    var pseudoClauses = root.GetPhrases(sentence, ignore, punctuation, options);
    if (pseudoClauses.Count > 0)
    {
        collected.Add(pseudoClauses[0]);
    }

    return collected;
}
/// <summary>
/// Converts the leaves of <paramref name="tree"/> into a <see cref="Phrase"/> of <c>Token</c>s.
/// </summary>
/// <remarks>
/// Each token records the trimmed leaf value, the trimmed value of the leaf's parent within
/// <paramref name="tree"/> (used as the tag), a depth computed as
/// <c>root.depth() - root.depth(preterminal)</c>, and the leaf's left/right character edges in the root.
/// Leaves with no parent inside <paramref name="tree"/> are skipped.
/// </remarks>
/// <param name="tree">Subtree whose leaves are tokenized.</param>
/// <param name="root">Tree used for depth and character-edge lookups; defaults to <paramref name="tree"/> when null.</param>
/// <param name="sentence">Passed to the <see cref="Phrase"/> constructor.</param>
/// <param name="ignore">
/// Regular expression tested against each token. Matching tokens are dropped, or, when
/// <c>IgnoreMeansInclude</c> is set, non-matching tokens are dropped. Null or empty disables the filter.
/// </param>
/// <param name="punctuation">
/// Regular expression used to recognize punctuation by tag or token when <c>OmitPunctuationTokens</c> is
/// set; <c>Analyzer.PunctuationPatterns</c> is used when null.
/// </param>
/// <param name="options">Flags read here: <c>IgnoreMeansInclude</c> and <c>OmitPunctuationTokens</c>.</param>
/// <returns>
/// The phrase of kept tokens; its <c>IsPunctuationOmitted</c> is set to true when at least one
/// punctuation token was left out.
/// </returns>
public static Phrase GetTokens(this Tree tree, Tree root = null, Rhetorica.Sentence sentence = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
{
    var tokens = new Phrase(sentence: sentence);

    // Loop-invariant settings, resolved once up front.
    // A null 'ignore' is treated like the empty default instead of reaching Regex.IsMatch, which
    // rejects a null pattern with ArgumentNullException.
    bool filterTokens = !string.IsNullOrEmpty(ignore);
    bool ignoreMeansInclude = options.HasFlag(AnalyzerOptions.IgnoreMeansInclude);
    bool omitPunctuation = options.HasFlag(AnalyzerOptions.OmitPunctuationTokens);
    string punctuationPattern = omitPunctuation ? (punctuation ?? Analyzer.PunctuationPatterns) : null;

    root = root ?? tree;
    int rootDepth = root.depth();

    java.util.List leaves = tree.getLeaves();
    for (java.util.Iterator i = leaves.iterator(); i.hasNext(); )
    {
        Tree leaf = (Tree)i.next();
        string token = leaf.value().Trim();

        Tree preterminal = leaf.parent(tree);
        if (preterminal == null)
        {
            continue;
        }

        string tag = preterminal.value().Trim();

        if (filterTokens)
        {
            bool isMatch = Regex.IsMatch(token, ignore);

            // Exclude mode drops matches; include mode drops everything that does not match.
            bool keep = ignoreMeansInclude ? isMatch : !isMatch;
            if (!keep)
            {
                continue;
            }
        }

        if (omitPunctuation)
        {
            // Leave out certain types of punctuation:
            bool isPunctuation = Regex.IsMatch(tag, punctuationPattern) || Regex.IsMatch(token, punctuationPattern);
            if (isPunctuation)
            {
                tokens.IsPunctuationOmitted = true;
                continue;
            }

            // But also remove any straggler punctuation missed within a token...? Maybe not.
            // Use RegExp 'FloatingPunctuationPatterns' if so.
        }

        int depth = rootDepth - root.depth(preterminal);
        var characterEdges = new CharacterEdges(root.leftCharEdge(leaf), root.rightCharEdge(leaf));
        tokens.Add(new Token(token, tag, depth, characterEdges));
    }

    return tokens;
}
/// <summary>
/// Collects a <see cref="Phrase"/> for each preterminal or pre-preterminal node of
/// <paramref name="root"/>, then prepends every single-token IN/TO phrase to the phrase after it.
/// </summary>
/// <remarks>
/// Preterminal nodes whose parent (relative to <paramref name="root"/>) is pre-preterminal are passed
/// over. Phrases without tokens are left out.
/// </remarks>
/// <param name="root">Tree to walk; it is also handed to <c>GetTokens</c> as the root argument.</param>
/// <param name="sentence">Passed through to <c>GetTokens</c>.</param>
/// <param name="ignore">Passed through to <c>GetTokens</c>.</param>
/// <param name="punctuation">Passed through to <c>GetTokens</c>.</param>
/// <param name="options">
/// Passed through to <c>GetTokens</c>; the <c>OmitFalseDuplicatePhrases</c> flag additionally makes a
/// phrase overwrite the previously collected one when <c>EqualExceptPunctuationOmission</c> is true.
/// </param>
/// <returns>Collected phrases in visiting order, after the IN/TO merge.</returns>
public static List<Phrase> GetPrePreTerminalPhrases(this Tree root, Rhetorica.Sentence sentence = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
{
    var phrases = new List<Phrase>();
    bool omitFalseDuplicatePhrases = options.HasFlag(AnalyzerOptions.OmitFalseDuplicatePhrases);

    for (java.util.Iterator i = root.iterator(); i.hasNext(); )
    {
        Tree tree = (Tree)i.next();

        if (tree.isPreTerminal())
        {
            // A preterminal under a pre-preterminal parent is skipped; the parent is looked up once.
            Tree parent = tree.parent(root);
            if (parent != null && parent.isPrePreTerminal())
            {
                continue;
            }
        }
        else if (!tree.isPrePreTerminal())
        {
            continue;
        }

        var current = new Phrase(tree.GetTokens(root, sentence, ignore, punctuation, options));

        // If current node matches previous node but for punctuation omission, replace previous with current:
        if (omitFalseDuplicatePhrases && phrases.Count > 0)
        {
            int lastIndex = phrases.Count - 1;
            if (phrases[lastIndex].EqualExceptPunctuationOmission(current))
            {
                phrases[lastIndex] = current;
                continue;
            }
        }

        if (current.Count == 0)
        {
            continue;
        }

        phrases.Add(current);
    }

    // If "phrase" is a single token which is a preposition (IN) or infinitival to (TO), then join it to
    // the subsequent phrase. The final phrase has nothing after it, so it is never a merge source.
    int index = 0;
    while (index < phrases.Count - 1)
    {
        Phrase candidate = phrases[index];
        bool isLoneInOrTo = candidate.Count == 1
            && Regex.IsMatch(candidate[0].TagEquivalent, @"^(IN|TO)$", RegexOptions.IgnoreCase);

        if (isLoneInOrTo)
        {
            phrases[index + 1].Tokens.InsertRange(0, candidate.Tokens);
            phrases.RemoveAt(index);

            // Stay on this index so the merged phrase is re-examined. The original 'i =- 1' assigned -1 and
            // rescanned the whole list after every merge, although earlier entries never change.
            continue;
        }

        ++index;
    }

    return phrases;
}
/// <summary>
/// Produces a <see cref="Phrase"/> containing one <c>Token</c> per retained leaf of <paramref name="tree"/>.
/// </summary>
/// <remarks>
/// A leaf is retained when it has a parent within <paramref name="tree"/>, passes the
/// <paramref name="ignore"/> filter, and is not omitted as punctuation. The token's tag is the trimmed
/// value of that parent, its depth is <c>root.depth() - root.depth(preterminal)</c>, and its character
/// edges come from <c>root.leftCharEdge</c> / <c>root.rightCharEdge</c>.
/// </remarks>
/// <param name="tree">Subtree supplying the leaves.</param>
/// <param name="root">Tree used for depth and character-edge lookups; <paramref name="tree"/> is used when null.</param>
/// <param name="sentence">Handed to the <see cref="Phrase"/> constructor.</param>
/// <param name="ignore">
/// Regular expression applied to each token. By default matching tokens are skipped; with
/// <c>IgnoreMeansInclude</c> only matching tokens are kept. Null or empty means no filtering.
/// </param>
/// <param name="punctuation">
/// Regular expression identifying punctuation (tested against tag and token) when
/// <c>OmitPunctuationTokens</c> is set; falls back to <c>Analyzer.PunctuationPatterns</c> when null.
/// </param>
/// <param name="options">Flags consulted here: <c>IgnoreMeansInclude</c> and <c>OmitPunctuationTokens</c>.</param>
/// <returns>
/// The resulting phrase. <c>IsPunctuationOmitted</c> is set to true on it if any punctuation token was skipped.
/// </returns>
public static Phrase GetTokens(this Tree tree, Tree root = null, Rhetorica.Sentence sentence = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
{
    var tokens = new Phrase(sentence: sentence);

    // None of these depend on the leaf being visited, so they are computed once.
    bool ignoreMeansInclude = options.HasFlag(AnalyzerOptions.IgnoreMeansInclude);
    bool omitPunctuation = options.HasFlag(AnalyzerOptions.OmitPunctuationTokens);
    string punctuationPattern = omitPunctuation ? (punctuation ?? Analyzer.PunctuationPatterns) : null;

    // Regex.IsMatch throws ArgumentNullException for a null pattern, so a null 'ignore' is handled the
    // same way as the empty default: no filtering.
    bool hasIgnorePattern = !string.IsNullOrEmpty(ignore);

    root = root ?? tree;
    int rootDepth = root.depth();

    java.util.Iterator leafIterator = tree.getLeaves().iterator();
    while (leafIterator.hasNext())
    {
        var leaf = (Tree)leafIterator.next();
        string token = leaf.value().Trim();

        Tree preterminal = leaf.parent(tree);
        if (preterminal == null)
        {
            continue;
        }

        string tag = preterminal.value().Trim();

        // In include mode a token must match to be kept; otherwise a match means the token is skipped.
        if (hasIgnorePattern && Regex.IsMatch(token, ignore) != ignoreMeansInclude)
        {
            continue;
        }

        // Leave out certain types of punctuation:
        if (omitPunctuation
            && (Regex.IsMatch(tag, punctuationPattern) || Regex.IsMatch(token, punctuationPattern)))
        {
            tokens.IsPunctuationOmitted = true;
            continue;
        }

        // Straggler punctuation inside a token is deliberately left alone here; the original note
        // points at the 'FloatingPunctuationPatterns' RegExp should that ever be wanted.

        int depth = rootDepth - root.depth(preterminal);
        var characterEdges = new CharacterEdges(root.leftCharEdge(leaf), root.rightCharEdge(leaf));
        tokens.Add(new Token(token, tag, depth, characterEdges));
    }

    return tokens;
}