/// <summary>
/// Runs a compiled tregex pattern over a tree and reports the named nodes of
/// every match through <c>AddFoundPair</c>.
/// </summary>
/// <remarks>
/// For each match the nodes named <c>m1</c> and <c>m2</c> are reported together
/// with the matched subtree. When the pattern text contains <c>m3</c> and that
/// node is bound, the pair (<c>m2</c>, <c>m3</c>) is reported as well.
/// </remarks>
/// <param name="tree">Tree to search.</param>
/// <param name="tgrepPattern">Compiled pattern to match against <paramref name="tree"/>.</param>
/// <param name="foundPairs">Collection passed on to <c>AddFoundPair</c> for every match.</param>
/// <exception cref="System.Exception">
/// Wraps any failure raised while matching; the original failure is the inner exception.
/// </exception>
private void FindTreePattern(Tree tree, TregexPattern tgrepPattern, ICollection<Pair<int, int>> foundPairs)
{
    try
    {
        TregexMatcher m = tgrepPattern.Matcher(tree);
        // The pattern text is the same for every match, so test for the optional
        // third node name once instead of once per match.
        bool hasThirdNode = tgrepPattern.Pattern().Contains("m3");
        while (m.Find())
        {
            Tree t = m.GetMatch();
            Tree np1 = m.GetNode("m1");
            Tree np2 = m.GetNode("m2");
            Tree np3 = hasThirdNode ? m.GetNode("m3") : null;
            AddFoundPair(np1, np2, t, foundPairs);
            if (np3 != null)
            {
                AddFoundPair(np2, np3, t, foundPairs);
            }
        }
    }
    catch (Exception e)
    {
        // shouldn't happen....
        // System.Exception has no constructor taking only an Exception, so the
        // cause is attached as the inner exception instead.
        throw new Exception(e.Message, e);
    }
}
/// <summary>
/// Adds a <c>Mention</c> for every subtree of the sentence parse matched by
/// <c>npOrPrpMentionPattern</c> whose token span has not been recorded yet.
/// </summary>
/// <remarks>
/// A span lying inside a named-entity span (per <c>InsideNE</c>) is skipped
/// unless <c>lang</c> equals <c>Locale.Chinese</c>. This variant does not trim
/// a trailing comma from the span.
/// </remarks>
/// <param name="s">Sentence annotation providing tokens, parse tree and dependency graphs.</param>
/// <param name="mentions">Receives the newly created mentions.</param>
/// <param name="mentionSpanSet">Spans already used; each accepted span is added.</param>
/// <param name="namedEntitySpanSet">Named-entity spans consulted through <c>InsideNE</c>.</param>
public virtual void ExtractNPorPRP(ICoreMap s, IList<Mention> mentions, ICollection<IntPair> mentionSpanSet, ICollection<IntPair> namedEntitySpanSet)
{
    IList<CoreLabel> tokens = s.Get(typeof(CoreAnnotations.TokensAnnotation));
    Tree parse = s.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
    parse.IndexLeaves();

    SemanticGraph basicGraph = s.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
    SemanticGraph enhancedGraph = s.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
    if (enhancedGraph == null)
    {
        // No enhanced graph on this sentence: use the basic one in its place.
        enhancedGraph = s.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
    }

    TregexMatcher matcher = npOrPrpMentionPattern.Matcher(parse);
    while (matcher.Find())
    {
        Tree matched = matcher.GetMatch();
        IList<Tree> leaves = matched.GetLeaves();

        // Leaf indices are 1-based, so begin is shifted down and end is left
        // as an exclusive bound.
        CoreLabel firstLeaf = (CoreLabel)leaves[0].Label();
        int beginIdx = firstLeaf.Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
        CoreLabel lastLeaf = (CoreLabel)leaves[leaves.Count - 1].Label();
        int endIdx = lastLeaf.Get(typeof(CoreAnnotations.IndexAnnotation));

        IntPair span = new IntPair(beginIdx, endIdx);
        if (mentionSpanSet.Contains(span))
        {
            continue;
        }
        if (!(lang == Locale.Chinese) && InsideNE(span, namedEntitySpanSet))
        {
            continue;
        }

        int placeholderId = -1;
        Mention mention = new Mention(placeholderId, beginIdx, endIdx, tokens, basicGraph, enhancedGraph, new List<CoreLabel>(tokens.SubList(beginIdx, endIdx)), matched);
        mentions.Add(mention);
        mentionSpanSet.Add(span);
    }
}
/// <summary>
/// Relabels the node named <c>n</c> in a pattern that also uses the back
/// reference <c>~n</c>, and expects only the first <c>B</c> child to become <c>X</c>.
/// </summary>
public virtual void TestBackReference()
{
    const string before = "(A (B w) (B w))";
    const string after = "(A (X w) (B w))";

    TregexPattern matchPattern = TregexPattern.Compile("__ <1 B=n <2 ~n");
    TsurgeonPattern relabelOperation = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel n X");

    RunTest(matchPattern, relabelOperation, before, after);
}
/// <summary>
/// Compiles the fixed list of tregex patterns used to recognise pleonastic
/// pronouns. Every pattern names the pronoun node <c>m1</c>.
/// </summary>
/// <returns>One compiled pattern per source string, in the order listed.</returns>
private static TregexPattern[] GetPleonasticPatterns()
{
    // Maintainer notes, kept in the order they appear in this file.
    // NOTE(review): these notes are no longer next to the entries they were
    // written about; confirm placement against the upstream source before relying on them.
    //
    // cdm 2013: I spent a while on these patterns. I fixed a syntax error in five patterns
    // ($.. split with space), so it now shouldn't exception in checkPleonastic. This gave 0.02% on CoNLL11 dev
    // I tried some more precise patterns but they didn't help. Indeed, they tended to hurt vs. the higher recall patterns.
    //
    // "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (VP < (VBN $.. /S|SBAR/))))", // overmatches
    // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN < expected|hoped $.. @SBAR))))", // this one seems more accurate, but ...
    // in practice, go with this one (best results)
    // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@ADJP < (/^(?:JJ|VB)/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay)$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))", // does worse than above 2 on CoNLL11 dev
    // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@NP $.. @ADVP $.. @SBAR)))", // cleft examples, generalized to not need ADVP; but gave worse CoNLL12 dev numbers....
    // these next 5 had buggy space in "$ ..", which I fixed
    // extraposed. OK 1/2 correct; need non-adverbial case
    // OK: 3/3 good matches on dev; but 3/4 wrong on WSJ
    // certain can be either but relatively likely pleonastic with it ... be
    // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (MD $.. (@VP < ((/^V.*/ < /^(?:be|become)/) $.. (@ADJP < (/^JJ/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay))$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))))", // GOOD REPLACEMENT ; 2nd clause is for extraposed ones
    string[] sources =
    {
        "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN $.. @S|SBAR))))",
        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP $.. (/S|SBAR/))))",
        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP < (/S|SBAR/))))",
        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP < /S|SBAR/)))",
        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP $.. ADVP $.. /S|SBAR/)))",
        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (VP < (VBN $.. /S|SBAR/))))))",
        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP $.. (/S|SBAR/))))))",
        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP < (/S|SBAR/))))))",
        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP < /S|SBAR/)))))",
        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP $.. ADVP $.. /S|SBAR/)))))",
        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:seems|appears|means|follows)/) $.. /S|SBAR/))",
        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:turns|turned)/) $.. PRT $.. /S|SBAR/))"
    };

    // Compile front to back so the result lines up index-for-index with the sources.
    return System.Array.ConvertAll<string, TregexPattern>(sources, source => TregexPattern.Compile(source));
}
/// <summary>
/// Exercises the tsurgeon <c>replace</c> operation with one replacement tree,
/// with two replacement trees, and with a replacement whose leaf is a word.
/// </summary>
public virtual void TestReplaceTree()
{
    const string threeChildren = "(A (B 0) (B 1) (C 2))";

    TsurgeonPattern replaceWithOne = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("replace foo (BAR 1)");
    TregexPattern matchB = TregexPattern.Compile("B=foo");
    RunTest(matchB, replaceWithOne, threeChildren, "(A (BAR 1) (BAR 1) (C 2))");
    // A single replacement tree may take the place of the root itself.
    RunTest(matchB, replaceWithOne, "(B (C 1))", "(BAR 1)");

    TsurgeonPattern replaceWithTwo = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("replace foo (BAR 1) (BAZ 2)");
    RunTest(matchB, replaceWithTwo, threeChildren, "(A (BAR 1) (BAZ 2) (BAR 1) (BAZ 2) (C 2))");

    // Replacing the root with two nodes must be rejected at run time.
    bool rejected = false;
    try
    {
        RunTest(matchB, replaceWithTwo, "(B 0)", "(B 0)");
    }
    catch (TsurgeonRuntimeException)
    {
        rejected = true;
    }
    if (!rejected)
    {
        throw new Exception("Expected a failure");
    }

    // Numbers can parse while words do not when the tsurgeon parser is
    // incorrect, so a word leaf is checked separately.
    TsurgeonPattern replaceWithWord = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("replace foo (BAR blah)");
    TregexPattern matchBAgain = TregexPattern.Compile("B=foo");
    RunTest(matchBAgain, replaceWithWord, threeChildren, "(A (BAR blah) (BAR blah) (C 2))");
}
/// <summary>
/// Adds a <c>Mention</c> for the <c>m1</c> and <c>m2</c> nodes of every match of
/// <c>enumerationsMentionPattern</c> in the sentence parse.
/// </summary>
/// <remarks>
/// Spans are first gathered in a map keyed by span, so a span matched more than
/// once keeps the last subtree seen. A span is then skipped when it is already
/// in <paramref name="mentionSpanSet"/> or when <c>InsideNE</c> reports it as
/// lying inside a named entity.
/// </remarks>
/// <param name="s">Sentence annotation providing tokens, parse tree and enhanced dependencies.</param>
/// <param name="mentions">Receives the newly created mentions.</param>
/// <param name="mentionSpanSet">Spans already used; each accepted span is added.</param>
/// <param name="namedEntitySpanSet">Named-entity spans consulted through <c>InsideNE</c>.</param>
protected internal static void ExtractEnumerations(ICoreMap s, IList<Mention> mentions, ICollection<IntPair> mentionSpanSet, ICollection<IntPair> namedEntitySpanSet)
{
    IList<CoreLabel> tokens = s.Get(typeof(CoreAnnotations.TokensAnnotation));
    Tree parse = s.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
    SemanticGraph graph = s.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));

    TregexMatcher matcher = enumerationsMentionPattern.Matcher(parse);
    IDictionary<IntPair, Tree> subtreeBySpan = Generics.NewHashMap();
    while (matcher.Find())
    {
        matcher.GetMatch();
        Tree first = matcher.GetNode("m1");
        Tree second = matcher.GetNode("m2");
        foreach (Tree node in new Tree[] { first, second })
        {
            IList<Tree> leaves = node.GetLeaves();
            // Leaf indices are 1-based: begin is shifted down, end stays exclusive.
            int begin = ((CoreLabel)leaves[0].Label()).Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
            int end = ((CoreLabel)leaves[leaves.Count - 1].Label()).Get(typeof(CoreAnnotations.IndexAnnotation));
            subtreeBySpan[new IntPair(begin, end)] = node;
        }
    }

    foreach (IntPair span in subtreeBySpan.Keys)
    {
        if (mentionSpanSet.Contains(span) || InsideNE(span, namedEntitySpanSet))
        {
            continue;
        }
        int placeholderId = -1;
        Mention mention = new Mention(placeholderId, span.Get(0), span.Get(1), graph, new List<CoreLabel>(tokens.SubList(span.Get(0), span.Get(1))), subtreeBySpan[span]);
        mentions.Add(mention);
        mentionSpanSet.Add(span);
    }
}
/// <summary>
/// Exercises the tsurgeon <c>insert</c> operation: sibling and child positions,
/// inserting a matched node, and node names with plain, escaped and
/// backslash-preceded <c>=</c> signs.
/// </summary>
public virtual void TestInsert()
{
    const string source = "(A (B 0) (C 1))";

    // Insert as the left sibling of the match.
    TsurgeonPattern insertBefore = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("insert (D (E 6)) $+ bar");
    TregexPattern noSiblingD = TregexPattern.Compile("B=bar !$ D");
    RunTest(noSiblingD, insertBefore, source, "(A (D (E 6)) (B 0) (C 1))");

    // Insert as the right sibling of the match.
    TsurgeonPattern insertAfter = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("insert (D (E 6)) $- bar");
    RunTest(noSiblingD, insertAfter, source, "(A (B 0) (D (E 6)) (C 1))");

    // Insert as the first child of the match.
    TsurgeonPattern insertFirstChild = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("insert (D (E 6)) >0 bar");
    TregexPattern noChildD = TregexPattern.Compile("B=bar !<D");
    RunTest(noChildD, insertFirstChild, source, "(A (B (D (E 6)) 0) (C 1))");

    // Insert a copy of another matched node.
    TsurgeonPattern insertNamed = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("insert foo >0 bar");
    TregexPattern siblingC = TregexPattern.Compile("B=bar !<C $C=foo");
    RunTest(siblingC, insertNamed, source, "(A (B (C 1) 0) (C 1))");

    // the name will be cut off
    TsurgeonPattern namedNode = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("insert (D (E=blah 6)) >0 bar");
    TregexPattern noChildD2 = TregexPattern.Compile("B=bar !<D");
    RunTest(noChildD2, namedNode, source, "(A (B (D (E 6)) 0) (C 1))");

    // the name should not be cut off, with the escaped = unescaped now
    TsurgeonPattern escapedEquals = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("insert (D (E\\=blah 6)) >0 bar");
    TregexPattern noChildD3 = TregexPattern.Compile("B=bar !<D");
    RunTest(noChildD3, escapedEquals, source, "(A (B (D (E=blah 6)) 0) (C 1))");

    // the name should be cut off again, with a \ at the end of the new node
    TsurgeonPattern escapedBackslash = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("insert (D (E\\\\=blah 6)) >0 bar");
    TregexPattern noChildD4 = TregexPattern.Compile("B=bar !<D");
    RunTest(noChildD4, escapedBackslash, source, "(A (B (D (E\\ 6)) 0) (C 1))");
}
/// <summary>
/// Adds a <c>Mention</c> for every subtree of the sentence parse matched by
/// <c>npOrPrpMentionPattern</c> whose token span is new and not inside a named entity.
/// </summary>
/// <remarks>
/// When the last token of the span is a comma, the span is shortened by one
/// token before it is checked and recorded.
/// </remarks>
/// <param name="s">Sentence annotation providing tokens, parse tree and enhanced dependencies.</param>
/// <param name="mentions">Receives the newly created mentions.</param>
/// <param name="mentionSpanSet">Spans already used; each accepted span is added.</param>
/// <param name="namedEntitySpanSet">Named-entity spans consulted through <c>InsideNE</c>.</param>
protected internal static void ExtractNPorPRP(ICoreMap s, IList<Mention> mentions, ICollection<IntPair> mentionSpanSet, ICollection<IntPair> namedEntitySpanSet)
{
    IList<CoreLabel> tokens = s.Get(typeof(CoreAnnotations.TokensAnnotation));
    Tree parse = s.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
    parse.IndexLeaves();
    SemanticGraph graph = s.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));

    TregexMatcher matcher = npOrPrpMentionPattern.Matcher(parse);
    while (matcher.Find())
    {
        Tree matched = matcher.GetMatch();
        IList<Tree> leaves = matched.GetLeaves();

        // Leaf indices are 1-based: begin is shifted down, end stays exclusive.
        CoreLabel firstLeaf = (CoreLabel)leaves[0].Label();
        int beginIdx = firstLeaf.Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
        CoreLabel lastLeaf = (CoreLabel)leaves[leaves.Count - 1].Label();
        int endIdx = lastLeaf.Get(typeof(CoreAnnotations.IndexAnnotation));

        // try not to have span that ends with ,
        if (",".Equals(tokens[endIdx - 1].Word()))
        {
            endIdx--;
        }

        IntPair span = new IntPair(beginIdx, endIdx);
        if (mentionSpanSet.Contains(span) || InsideNE(span, namedEntitySpanSet))
        {
            continue;
        }

        int placeholderId = -1;
        Mention mention = new Mention(placeholderId, beginIdx, endIdx, graph, new List<CoreLabel>(tokens.SubList(beginIdx, endIdx)), matched);
        mentions.Add(mention);
        mentionSpanSet.Add(span);
    }
}
/// <summary>
/// Parses the script text in <c>editStr</c> into (tregex pattern, tsurgeon
/// operation) pairs.
/// </summary>
/// <remarks>
/// Each group starts with one line compiled as a tregex pattern. The following
/// lines are parsed as tsurgeon operations for as long as <c>Continuing</c>
/// accepts them, and are then collected into a single operation. A pattern
/// with no operations contributes nothing.
/// An <c>IOException</c> is printed and the pairs read so far are returned.
/// </remarks>
/// <returns>The pairs in script order; empty when nothing could be read.</returns>
private IList<Pair<TregexPattern, TsurgeonPattern>> LoadOps()
{
    IList<Pair<TregexPattern, TsurgeonPattern>> ops = new List<Pair<TregexPattern, TsurgeonPattern>>();
    try
    {
        BufferedReader br = new BufferedReader(new StringReader(editStr));
        try
        {
            IList<TsurgeonPattern> tsp = new List<TsurgeonPattern>();
            string line;
            // while not at end of file
            while ((line = br.ReadLine()) != null)
            {
                TregexPattern matchPattern = TregexPattern.Compile(line);
                tsp.Clear();
                while (Continuing(line = br.ReadLine()))
                {
                    TsurgeonPattern p = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation(line);
                    tsp.Add(p);
                }
                if (tsp.Count > 0)
                {
                    TsurgeonPattern tp = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.CollectOperations(tsp);
                    ops.Add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, tp));
                }
            }
        }
        finally
        {
            // Release the reader even when a pattern or operation fails to parse.
            br.Close();
        }
    }
    catch (IOException ioe)
    {
        Sharpen.Runtime.PrintStackTrace(ioe);
    }
    return ops;
}
/// <summary>
/// Checks that node labels containing non-ASCII characters work in both the
/// tregex pattern and the tsurgeon <c>relabel</c> operation.
/// </summary>
public virtual void TestForeign()
{
    const string before = "(foo atentát)";
    const string after = "(foo perform_atentát)";

    TregexPattern matchPattern = TregexPattern.Compile("atentát=test");
    TsurgeonPattern relabelOperation = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel test perform_atentát");

    RunTest(matchPattern, relabelOperation, before, after);
}
/// <summary>
/// Checks that an operation keyword (<c>relabel</c>) can also be used as the
/// new label in a <c>relabel</c> operation.
/// </summary>
public virtual void TestKeyword()
{
    const string before = "(A (B foo) (C foo) (C bar))";
    const string after = "(relabel (B foo) (C foo) (C bar))";

    TregexPattern matchPattern = TregexPattern.Compile("A=foo << B=bar << C=baz");
    // This should successfully compile, assuming the keyword parsing is correct
    TsurgeonPattern relabelOperation = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel foo relabel");

    RunTest(matchPattern, relabelOperation, before, after);
}
/// <summary>Writes the selected trees to the configured split files, one tree per line.</summary>
/// <remarks>
/// Trees are written in the order given by <paramref name="ids"/>. Once the
/// current file has received its configured number of trees (from
/// <c>fSizes</c>), output moves to the next file name in <c>fNames</c>.
/// <br />
/// Ids missing from <paramref name="treeMap"/> and trees matching either
/// bad-tree pattern are logged and skipped. Each remaining tree is passed
/// through <c>FTBCorrector</c>; when the corrected tree's first child has no
/// children, the uncorrected copy is written instead.
/// <br />
/// When <c>LemmasAsLeaves</c> or <c>AddMorphoToLeaves</c> is set, the leaves
/// are rewritten by <c>MungeLeaves</c> before output.
/// </remarks>
/// <param name="ids">Tree identifiers, in output order.</param>
/// <param name="treeMap">Trees keyed by identifier.</param>
/// <exception cref="System.IO.IOException"/>
public static void OutputSplits(IList<string> ids, IDictionary<string, Tree> treeMap)
{
    IQueue<int> fSizeQueue = new LinkedList<int>(Arrays.AsList(fSizes));
    IQueue<string> fNameQueue = new LinkedList<string>(Arrays.AsList(fNames));
    TregexPattern pBadTree = TregexPattern.Compile("@SENT <: @PUNC");
    TregexPattern pBadTree2 = TregexPattern.Compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
    ITreeTransformer tt = new FTBCorrector();
    int size = fSizeQueue.Remove();
    string filename = fNameQueue.Remove();
    log.Info("Outputing " + filename);
    PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8")));
    try
    {
        int outputCount = 0;
        foreach (string id in ids)
        {
            if (!treeMap.Contains(id))
            {
                log.Info("Missing id: " + id);
                continue;
            }
            Tree tree = treeMap[id];
            TregexMatcher m = pBadTree.Matcher(tree);
            TregexMatcher m2 = pBadTree2.Matcher(tree);
            if (m.Find() || m2.Find())
            {
                log.Info("Discarding tree: " + tree.ToString());
                continue;
            }
            // Punctuation normalization, etc.
            Tree backupCopy = tree.DeepCopy();
            tree = tt.TransformTree(tree);
            if (tree.FirstChild().Children().Length == 0)
            {
                // Some trees have only punctuation. Tregex will mangle these. Don't throw those away.
                log.Info("Saving tree: " + tree.ToString());
                log.Info("Backup: " + backupCopy.ToString());
                tree = backupCopy;
            }
            if (LemmasAsLeaves || AddMorphoToLeaves)
            {
                MungeLeaves(tree, LemmasAsLeaves, AddMorphoToLeaves);
            }
            ReplacePOSTags(tree);
            writer.Println(tree.ToString());
            ++outputCount;
            if (outputCount == size)
            {
                // Current file is full: switch to the next configured file.
                outputCount = 0;
                size = fSizeQueue.Remove();
                filename = fNameQueue.Remove();
                log.Info("Outputing " + filename);
                writer.Close();
                // Cleared so the finally block does not close the old writer a
                // second time if opening the next file fails.
                writer = null;
                writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8")));
            }
        }
    }
    finally
    {
        // Close whichever file is open, including when a failure interrupts the loop.
        if (writer != null)
        {
            writer.Close();
        }
    }
}
/// <summary>
/// Checks <c>replace</c> on a tree with Chinese text: the <c>PU</c> node over
/// <c>她{</c> is replaced by a <c>PN</c> node and a <c>PU</c> node.
/// </summary>
public virtual void TestChineseReplaceTree()
{
    TregexPattern matchPattern = TregexPattern.Compile("PU=punc < 她{");
    TsurgeonPattern splitPunctuation = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("replace punc (PN 她) (PU {)");

    RunTest(
        matchPattern,
        splitPunctuation,
        "(IP (IP (PP (P 像) (NP (NP (NR 赖斯) (PU ,) (NR 赖斯)) (NP (PN 本身)))) (PU 她{) (NP (NN breath)) (PU }) (IJ 呃) (VP (VV 担任) (NP (NN 国务卿)) (VP (ADVP (AD 比较)) (VP (VA 晚))))))",
        "(IP (IP (PP (P 像) (NP (NP (NR 赖斯) (PU ,) (NR 赖斯)) (NP (PN 本身)))) (PN 她) (PU {) (NP (NN breath)) (PU }) (IJ 呃) (VP (VV 担任) (NP (NN 国务卿)) (VP (ADVP (AD 比较)) (VP (VA 晚))))))");
}
/// <summary>
/// Checks that <c>replace</c> changes only the comma directly followed by
/// <c>CC</c> when the tree contains several identical commas.
/// </summary>
public virtual void TestReplaceWithRepeats()
{
    const string before = "(NP NP , NP , NP , CC NP)";
    const string after = "(NP NP , NP , NP COMMA CC NP)";

    TregexPattern commaBeforeConjunction = TregexPattern.Compile("@NP < (/^,/=comma $+ CC)");
    TsurgeonPattern replaceComma = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("replace comma (COMMA)");

    RunTest(commaBeforeConjunction, replaceComma, before, after);
}
/// <summary>
/// Exercises the tsurgeon <c>createSubtree</c> operation: two-endpoint and
/// one-endpoint forms, malformed operations that must fail to parse, and
/// endpoints under different parents that must fail at run time.
/// </summary>
public virtual void TestCreateSubtrees()
{
    TsurgeonPattern wrapRange = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("createSubtree FOO left right");
    TregexPattern leftBRightC = TregexPattern.Compile("A < B=left < C=right");

    // Verify when there are only two nodes
    RunTest(leftBRightC, wrapRange, "(A (B 1) (C 2))", "(A (FOO (B 1) (C 2)))");
    // We allow backwards nodes as well
    RunTest(leftBRightC, wrapRange, "(A (C 1) (B 2))", "(A (FOO (C 1) (B 2)))");
    // Check nodes in between
    RunTest(leftBRightC, wrapRange, "(A (B 1) (D 3) (C 2))", "(A (FOO (B 1) (D 3) (C 2)))");
    // Check nodes outside the span
    RunTest(leftBRightC, wrapRange, "(A (D 3) (B 1) (C 2))", "(A (D 3) (FOO (B 1) (C 2)))");
    RunTest(leftBRightC, wrapRange, "(A (B 1) (C 2) (D 3))", "(A (FOO (B 1) (C 2)) (D 3))");
    RunTest(leftBRightC, wrapRange, "(A (D 3) (B 1) (C 2) (E 4))", "(A (D 3) (FOO (B 1) (C 2)) (E 4))");

    // Check when the two endpoints are the same
    TregexPattern sameEndpoints = TregexPattern.Compile("A < B=left < B=right");
    RunTest(sameEndpoints, wrapRange, "(A (B 1) (C 2))", "(A (FOO (B 1)) (C 2))");
    // Check double operation - should make two FOO nodes and then stop
    RunTest(sameEndpoints, wrapRange, "(A (B 1) (B 2))", "(A (FOO (B 1)) (FOO (B 2)))");

    // Check when we only have one argument to createSubtree
    TsurgeonPattern wrapSingle = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("createSubtree FOO child");
    TregexPattern singleChild = TregexPattern.Compile("A < B=child");
    RunTest(singleChild, wrapSingle, "(A (B 1) (C 2))", "(A (FOO (B 1)) (C 2))");
    RunTest(singleChild, wrapSingle, "(A (B 1) (B 2))", "(A (FOO (B 1)) (FOO (B 2)))");

    // Check that incorrectly formatted operations don't successfully parse
    bool tooFewRejected = false;
    try
    {
        Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("createSubtree FOO");
    }
    catch (TsurgeonParseException)
    {
        tooFewRejected = true;
    }
    if (!tooFewRejected)
    {
        throw new AssertionError("Expected to fail parsing");
    }

    bool tooManyRejected = false;
    try
    {
        Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("createSubtree FOO a b c");
    }
    catch (TsurgeonParseException)
    {
        tooManyRejected = true;
    }
    if (!tooManyRejected)
    {
        throw new AssertionError("Expected to fail parsing");
    }

    // Verify that it fails when the parents are different
    TsurgeonPattern wrapAcrossParents = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("createSubtree FOO left right");
    TregexPattern differentParents = TregexPattern.Compile("A << B=left << C=right");
    bool differentParentsRejected = false;
    try
    {
        RunTest(differentParents, wrapAcrossParents, "(A (B 1) (D (C 2)))", "(A (B 1) (D (C 2)))");
    }
    catch (TsurgeonRuntimeException)
    {
        differentParentsRejected = true;
    }
    if (!differentParentsRejected)
    {
        throw new AssertionError("Expected a runtime failure");
    }
}
// NOTE(review): the notes below precede Main in this file. They read like
// labels for correction rules defined elsewhere in the class; confirm against
// the rest of the file.
//Delete sentence-initial punctuation
//Delete sentence final punctuation that is preceded by punctuation (first time)
//Delete sentence final punctuation that is preceded by punctuation (second time)
//Convert remaining sentence-final punctuation to either . if it is not [.!?]
//Delete medial, sentence-final punctuation
//Now move the sentence-final mark under SENT
//For those trees that lack a sentence-final punc, add one.
//Finally, delete these punctuation marks, which I can't seem to kill otherwise...
//A bad MWADV tree in the training set
// Not sure why this got a label of X. Similar trees suggest it
// should be A instead
// This also seems to be mislabeled

/// <summary>
/// Reads trees from the file named on the command line, runs each through
/// <c>FTBCorrector</c>, and prints the corrected trees to standard output.
/// </summary>
/// <remarks>
/// Trees matching either bad-tree pattern are logged and not printed. The
/// number of trees actually printed is reported on standard error. I/O and
/// pattern-parse failures are printed as stack traces.
/// </remarks>
/// <param name="args">Exactly one element: the path of the tree file to read.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        log.Info("Usage: java " + typeof(Edu.Stanford.Nlp.International.French.Pipeline.FTBCorrector).FullName + " filename\n");
        System.Environment.Exit(-1);
    }
    ITreeTransformer tt = new Edu.Stanford.Nlp.International.French.Pipeline.FTBCorrector();
    File f = new File(args[0]);
    try
    {
        //These bad trees in the Candito training set should be thrown out:
        //  (ROOT (SENT (" ") (. .)))
        //  (ROOT (SENT (. .)))
        TregexPattern pBadTree = TregexPattern.Compile("@SENT <: @PUNC");
        TregexPattern pBadTree2 = TregexPattern.Compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        try
        {
            int nTrees = 0;
            Tree t;
            while ((t = tr.ReadTree()) != null)
            {
                TregexMatcher m = pBadTree.Matcher(t);
                TregexMatcher m2 = pBadTree2.Matcher(t);
                if (m.Find() || m2.Find())
                {
                    log.Info("Discarding tree: " + t.ToString());
                }
                else
                {
                    Tree fixedT = tt.TransformTree(t);
                    System.Console.Out.WriteLine(fixedT.ToString());
                    // Count only trees that were printed, so the summary
                    // below does not include discarded trees.
                    nTrees++;
                }
            }
            System.Console.Error.Printf("Wrote %d trees%n", nTrees);
        }
        finally
        {
            // Close the tree reader even when reading or transforming fails.
            tr.Close();
        }
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (TregexParseException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/** * Parses a tsurgeon script text input and compiles a tregex pattern and a list * of tsurgeon operations into a pair. * * @param reader Reader to read patterns from * @return A pair of a tregex and tsurgeon pattern read from a file, or <code>null</code> * when the operations in the Reader have been exhausted * @throws IOException If any IO problem */ /*public static Tuple<TregexPattern, TsurgeonPattern> getOperationFromReader(BufferedReader reader, TregexPatternCompiler compiler) /*throws IOException#1# { * string patternString = getTregexPatternFromReader(reader); * if ("".equals(patternString)) { * return null; * } * TregexPattern matchPattern = compiler.compile(patternString); * * TsurgeonPattern collectedPattern = getTsurgeonOperationsFromReader(reader); * return new Pair<TregexPattern,TsurgeonPattern>(matchPattern,collectedPattern); * }*/ /** * Assumes that we are at the beginning of a tsurgeon script file and gets the string for the * tregex pattern leading the file * @return tregex pattern string */ /*public static string getTregexPatternFromReader(BufferedReader reader) throws IOException { * StringBuilder matchString = new StringBuilder(); * for (string thisLine; (thisLine = reader.readLine()) != null; ) { * if (matchString.length() > 0 && emptyLinePattern.matcher(thisLine).matches()) { * // A blank line after getting some real content (not just comments or nothing) * break; * } * Matcher m = commentPattern.matcher(thisLine); * if (m.matches()) { * // delete it * thisLine = m.replaceFirst(""); * } * if ( ! emptyLinePattern.matcher(thisLine).matches()) { * matchString.append(thisLine); * } * } * return matchString.ToString(); * }*/ /** * Assumes the given reader has only tsurgeon operations (not a tregex pattern), and parses * these out, collecting them into one operation. Stops on a whitespace line. 
* * @throws IOException */ /*public static TsurgeonPattern getTsurgeonOperationsFromReader(BufferedReader reader) throws IOException { * List<TsurgeonPattern> operations = new ArrayList<TsurgeonPattern>(); * for (string thisLine; (thisLine = reader.readLine()) != null; ) { * if (emptyLinePattern.matcher(thisLine).matches()) { * break; * } * thisLine = removeComments(thisLine); * if (emptyLinePattern.matcher(thisLine).matches()) { * continue; * } * operations.add(parseOperation(thisLine)); * } * * if (operations.size() == 0) * throw new TsurgeonParseException("No Tsurgeon operation provided."); * * return collectOperations(operations); * }*/ /*private static string removeComments(string line) { * Matcher m = commentPattern.matcher(line); * line = m.replaceFirst(""); * Matcher m1 = escapedCommentCharacterPattern.matcher(line); * line = m1.replaceAll(commentIntroducingCharacter); * return line; * }*/ /** * Assumes the given reader has only tsurgeon operations (not a tregex pattern), and returns * them as a String, mirroring the way the strings appear in the file. This is helpful * for lazy evaluation of the operations, as in a GUI, * because you do not parse the operations on load. Comments are still excised. * @throws IOException */ /*public static string getTsurgeonTextFromReader(BufferedReader reader) throws IOException { * StringBuilder sb = new StringBuilder(); * for (string thisLine; (thisLine = reader.readLine()) != null; ) { * thisLine = removeComments(thisLine); * if (emptyLinePattern.matcher(thisLine).matches()) { * continue; * } * sb.append(thisLine); * sb.append('\n'); * } * return sb.ToString(); * }*/ /** * Parses a tsurgeon script file and compiles all operations in the file into a list * of pairs of tregex and tsurgeon patterns. 
*
* @param filename file containing the tsurgeon script
* @return A pair of a tregex and tsurgeon pattern read from a file
* @throws IOException If there is any I/O problem
*/
/*public static List<Pair<TregexPattern, TsurgeonPattern>> getOperationsFromFile(string filename, string encoding, TregexPatternCompiler compiler) throws IOException {
 *   List<Pair<TregexPattern,TsurgeonPattern>> operations = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
 *   BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), encoding));
 *   for ( ; ; ) {
 *     Pair<TregexPattern, TsurgeonPattern> operation = getOperationFromReader(reader, compiler);
 *     if (operation == null) {
 *       break;
 *     }
 *     operations.add(operation);
 *   }
 *   reader.close();
 *   return operations;
 * }*/

/// <summary>
/// Applies <c>ProcessPattern</c> to every tree in a list.
/// </summary>
/// <param name="matchPattern">A <see cref="TregexPattern"/> to be matched against each <see cref="Tree"/>.</param>
/// <param name="p">A <see cref="TsurgeonPattern"/> to apply.</param>
/// <param name="inputTrees">The input trees to be processed.</param>
/// <returns>A new list holding the result for each input tree, in input order.</returns>
public static List<Tree> ProcessPatternOnTrees(TregexPattern matchPattern, TsurgeonPattern p, List<Tree> inputTrees)
{
    List<Tree> transformed = new List<Tree>();
    for (int index = 0; index < inputTrees.Count; index++)
    {
        Tree outcome = ProcessPattern(matchPattern, p, inputTrees[index]);
        transformed.Add(outcome);
    }
    return transformed;
}
/// <summary>
/// Exercises the tsurgeon <c>coindex</c> operation, including the case where
/// index 1 is already present in the tree and 2 is expected instead.
/// </summary>
public virtual void TestCoindex()
{
    TregexPattern threeNamedNodes = TregexPattern.Compile("A=foo << B=bar << C=baz");
    TsurgeonPattern coindexAll = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("coindex foo bar baz");

    RunTest(threeNamedNodes, coindexAll, "(A (B (C foo)))", "(A-1 (B-1 (C-1 foo)))");

    // note that the indexing does not happen a second time, since the labels are now changed
    RunTest(threeNamedNodes, coindexAll, "(A (B foo) (C foo) (C bar))", "(A-1 (B-1 foo) (C-1 foo) (C bar))");

    // Test that it indexes at 2 instead of 1
    RunTest(threeNamedNodes, coindexAll, "(A (B foo) (C-1 bar) (C baz))", "(A-2 (B-2 foo) (C-1 bar) (C-2 baz))");
}
/// <summary>
/// Adds a <c>Mention</c> for every subtree of the sentence parse matched by
/// <c>npOrPrpMentionPattern</c> whose token span has not been recorded yet.
/// </summary>
/// <remarks>
/// A trailing comma is dropped from the span. A span inside a named entity
/// (per <c>InsideNE</c>) is accepted only when the matched node's value starts
/// with <c>PRP</c>. When an accepted mention covers more than one token and
/// every token's tag starts with <c>NNP</c>, its span is also added to
/// <paramref name="namedEntitySpanSet"/>.
/// </remarks>
/// <param name="s">Sentence annotation providing tokens, parse tree and dependency graphs.</param>
/// <param name="mentions">Receives the newly created mentions.</param>
/// <param name="mentionSpanSet">Spans already used; each accepted span is added.</param>
/// <param name="namedEntitySpanSet">Named-entity spans; read through <c>InsideNE</c> and extended as described.</param>
private static void ExtractNPorPRP(ICoreMap s, IList<Mention> mentions, ICollection<IntPair> mentionSpanSet, ICollection<IntPair> namedEntitySpanSet)
{
    IList<CoreLabel> tokens = s.Get(typeof(CoreAnnotations.TokensAnnotation));
    Tree parse = s.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
    parse.IndexLeaves();

    SemanticGraph basicGraph = s.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
    SemanticGraph enhancedGraph = s.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
    if (enhancedGraph == null)
    {
        // No enhanced graph on this sentence: use the basic one in its place.
        enhancedGraph = s.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
    }

    TregexMatcher matcher = npOrPrpMentionPattern.Matcher(parse);
    while (matcher.Find())
    {
        Tree matched = matcher.GetMatch();
        IList<Tree> leaves = matched.GetLeaves();

        // Leaf indices are 1-based: begin is shifted down, end stays exclusive.
        CoreLabel firstLeaf = (CoreLabel)leaves[0].Label();
        int beginIdx = firstLeaf.Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
        CoreLabel lastLeaf = (CoreLabel)leaves[leaves.Count - 1].Label();
        int endIdx = lastLeaf.Get(typeof(CoreAnnotations.IndexAnnotation));

        // try not to have span that ends with ,
        if (",".Equals(tokens[endIdx - 1].Word()))
        {
            endIdx--;
        }

        IntPair span = new IntPair(beginIdx, endIdx);
        if (mentionSpanSet.Contains(span))
        {
            continue;
        }
        if (InsideNE(span, namedEntitySpanSet) && !matched.Value().StartsWith("PRP"))
        {
            continue;
        }

        int placeholderId = -1;
        Mention mention = new Mention(placeholderId, beginIdx, endIdx, tokens, basicGraph, enhancedGraph, new List<CoreLabel>(tokens.SubList(beginIdx, endIdx)), matched);
        mentions.Add(mention);
        mentionSpanSet.Add(span);

        if (mention.originalSpan.Count > 1)
        {
            bool allProperNouns = true;
            foreach (CoreLabel token in mention.originalSpan)
            {
                // Non-short-circuit AND: every token's tag is still read.
                allProperNouns &= token.Tag().StartsWith("NNP");
            }
            if (allProperNouns)
            {
                namedEntitySpanSet.Add(span);
            }
        }
    }
}
/// <summary>
/// Checks that several bracketed tsurgeon operations can be combined in one
/// script, including <c>relabel</c> operations that use regex replacement text.
/// </summary>
public virtual void TestMultiplePatterns()
{
    const string twoCChildren = "(A (B foo) (C foo) (C bar))";

    TregexPattern threeNodes = TregexPattern.Compile("A=foo < B=bar < C=baz");

    TsurgeonPattern relabelThenMove = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("[relabel baz BAZ] [move baz >-1 bar]");
    RunTest(threeNodes, relabelThenMove, twoCChildren, "(A (B foo (BAZ foo) (BAZ bar)))");

    TsurgeonPattern regexRelabelThenMove = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("[relabel baz /^.*$/={bar}={baz}FOO/] [move baz >-1 bar]");
    RunTest(threeNodes, regexRelabelThenMove, twoCChildren, "(A (B foo (BCFOO foo) (BCFOO bar)))");

    // This in particular was a problem until we required "/" to be escaped
    TregexPattern fourNodes = TregexPattern.Compile("A=foo < B=bar < C=baz < D=biff");
    TsurgeonPattern twoRegexRelabels = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("[relabel baz /^.*$/={bar}={baz}/] [relabel biff /^.*$/={bar}={biff}/]");
    RunTest(fourNodes, twoRegexRelabels, "(A (B foo) (C bar) (D baz))", "(A (B foo) (BC bar) (BD baz))");
}
/// <summary>Find syntactic pattern in a sentence by tregex</summary>
/// <remarks>
/// Compiles <paramref name="tregex"/> and delegates to the overload that takes
/// a compiled <c>TregexPattern</c>.
/// </remarks>
/// <param name="tree">Tree to search.</param>
/// <param name="tregex">Tregex pattern text to compile.</param>
/// <param name="foundPairs">Collection passed on to the compiled-pattern overload.</param>
/// <exception cref="System.Exception">
/// Wraps any failure raised while compiling or matching; the original failure
/// is the inner exception.
/// </exception>
private void FindTreePattern(Tree tree, string tregex, ICollection<Pair<int, int>> foundPairs)
{
    try
    {
        TregexPattern tgrepPattern = TregexPattern.Compile(tregex);
        FindTreePattern(tree, tgrepPattern, foundPairs);
    }
    catch (Exception e)
    {
        // shouldn't happen....
        // System.Exception has no constructor taking only an Exception, so the
        // cause is attached as the inner exception instead.
        throw new Exception(e.Message, e);
    }
}
/// <summary>
/// Reads the next tregex pattern and its tsurgeon operations from a script
/// reader and returns them as a pair.
/// </summary>
/// <param name="reader">Reader to read patterns from</param>
/// <param name="compiler">Compiler used to turn the pattern text into a <c>TregexPattern</c></param>
/// <returns>
/// The compiled pattern paired with its collected operations, or
/// <see langword="null"/>
/// when the pattern text read from the reader is empty
/// </returns>
/// <exception cref="System.IO.IOException">If any IO problem</exception>
public static Pair<TregexPattern, TsurgeonPattern> GetOperationFromReader(BufferedReader reader, TregexPatternCompiler compiler)
{
    string patternText = GetTregexPatternFromReader(reader);
    if (!patternText.IsEmpty())
    {
        TregexPattern compiledPattern = compiler.Compile(patternText);
        TsurgeonPattern operations = GetTsurgeonOperationsFromReader(reader);
        return new Pair<TregexPattern, TsurgeonPattern>(compiledPattern, operations);
    }

    // An empty pattern string means there was nothing to compile.
    return null;
}
/// <summary>
/// Exercises the tsurgeon <c>excise</c> operation on a nested duplicate node,
/// on a duplicated root, and on a root whose excision is expected to yield
/// <c>null</c>.
/// </summary>
public virtual void TestExcise()
{
    // TODO: needs more meat to this test
    TregexPattern nestedDuplicate = TregexPattern.Compile("__=repeat <: (~repeat < __)");
    TsurgeonPattern exciseDuplicate = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("excise repeat repeat");
    RunTest(nestedDuplicate, exciseDuplicate, "(A (B (B foo)))", "(A (B foo))");

    // Test that if a deleted root is excised down to a level that has
    // just one child, that one child gets returned as the new tree
    RunTest(nestedDuplicate, exciseDuplicate, "(B (B foo))", "(B foo)");

    TregexPattern rootOnly = TregexPattern.Compile("A=root");
    TsurgeonPattern exciseRoot = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("excise root root");
    RunTest(rootOnly, exciseRoot, "(A (B bar) (C foo))", null);
}
/// <summary>
/// Checks that a node named inside an <c>adjoinF</c> auxiliary tree can be the
/// target of a following <c>insert</c> operation in the same script.
/// </summary>
public virtual void TestAdjoinWithNamedNode()
{
    const string source = "(A (B C))";

    // Named inner node, new node inserted as its left sibling.
    TsurgeonPattern insertAsSibling = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation(
        "[adjoinF (D (E=target foot@)) bar] " + "[insert (G 1) $+ target]");
    TregexPattern notUnderD = TregexPattern.Compile("B=bar !>> D");
    RunTest(notUnderD, insertAsSibling, source, "(A (D (G 1) (E (B C))))");

    // Named inner node, new node inserted as its first child.
    TsurgeonPattern insertAsChild = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation(
        "[adjoinF (D (E=target foot@)) bar] " + "[insert (G 1) >0 target]");
    TregexPattern notUnderD2 = TregexPattern.Compile("B=bar !>> D");
    RunTest(notUnderD2, insertAsChild, source, "(A (D (E (G 1) (B C))))");

    // Named leaf
    TsurgeonPattern insertUnderLeaf = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation(
        "[adjoinF (D (E foot@) F=target) bar] " + "[insert (G 1) >0 target]");
    TregexPattern notUnderD3 = TregexPattern.Compile("B=bar !>> D");
    RunTest(notUnderD3, insertUnderLeaf, source, "(A (D (E (B C)) (F (G 1))))");
}
/// <summary>
/// Checks that a node named inside an inserted tree can be the target of a
/// second insert operation in the same operation sequence.
/// </summary>
public virtual void TestInsertWithNamedNode()
{
    // Second insert goes relative to the named node using ">0".
    TsurgeonPattern insertUnderTarget = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation(
        "[insert (D=target E) $+ bar] " + "[insert (F 1) >0 target]");
    TregexPattern firstMatch = TregexPattern.Compile("B=bar !$- D");
    RunTest(firstMatch, insertUnderTarget, "(A (B C))", "(A (D (F 1) E) (B C))");

    // Second insert goes relative to the named node using "$+".
    TsurgeonPattern insertBeforeTarget = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation(
        "[insert (D=target E) $+ bar] " + "[insert (F 1) $+ target]");
    TregexPattern secondMatch = TregexPattern.Compile("B=bar !$- D");
    RunTest(secondMatch, insertBeforeTarget, "(A (B C))", "(A (F 1) (D E) (B C))");

    // Named leaf
    TsurgeonPattern insertBeforeNamedLeaf = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation(
        "[insert (D E=target) $+ bar] " + "[insert (F 1) $+ target]");
    TregexPattern thirdMatch = TregexPattern.Compile("B=bar !$- D");
    RunTest(thirdMatch, insertBeforeNamedLeaf, "(A (B C))", "(A (D (F 1) E) (B C))");
}
/// <summary>
/// Checks the tsurgeon "relabel" operation with plain replacement labels and
/// with regex-based replacements that use capture groups, node-name references
/// (<c>={name}</c>) and variable references (<c>%{name}</c>).
/// </summary>
public virtual void TestRelabel()
{
    // Replacement built from a capture group plus a reference to another named node.
    TregexPattern pattern = TregexPattern.Compile("/^((?!_head).)*$/=preTerminal < (__=terminal !< __)");
    TsurgeonPattern relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel preTerminal /^(.*)$/$1_head=={terminal}/");
    RunTest(pattern, relabel, "($ $)", "($_head=$ $)");

    // Plain replacement label.
    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel foo blah");
    pattern = TregexPattern.Compile("B=foo");
    RunTest(pattern, relabel, "(A (B 0) (C 1))", "(A (blah 0) (C 1))");
    RunTest(pattern, relabel, "(A (B 0) (B 1))", "(A (blah 0) (blah 1))");

    // Replacement label that is an escaped slash.
    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel foo /\\//");
    pattern = TregexPattern.Compile("B=foo");
    RunTest(pattern, relabel, "(A (B 0) (C 1))", "(A (/ 0) (C 1))");
    RunTest(pattern, relabel, "(A (B 0) (B 1))", "(A (/ 0) (/ 1))");

    // Regex with a capture group used as the new label.
    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel foo /.*(voc.*)/$1/");
    pattern = TregexPattern.Compile("/^a.*t/=foo");
    RunTest(pattern, relabel, "(A (avocet 0) (C 1))", "(A (vocet 0) (C 1))");
    RunTest(pattern, relabel, "(A (avocet 0) (advocate 1))", "(A (vocet 0) (vocate 1))");

    // One match pattern, several replacement forms: ={foo}, %{bar}, $1 and combinations.
    pattern = TregexPattern.Compile("curlew=baz < /^a(.*)t/#1%bar=foo");
    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel baz /cu(rle)w/={foo}/");
    RunTest(pattern, relabel, "(curlew (avocet 0))", "(avocet (avocet 0))");

    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel baz /cu(rle)w/%{bar}/");
    RunTest(pattern, relabel, "(curlew (avocet 0))", "(voce (avocet 0))");

    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel baz /cu(rle)w/$1/");
    RunTest(pattern, relabel, "(curlew (avocet 0))", "(rle (avocet 0))");

    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel baz /cu(rle)w/$1={foo}/");
    RunTest(pattern, relabel, "(curlew (avocet 0))", "(rleavocet (avocet 0))");

    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel baz /cu(rle)w/%{bar}$1={foo}/");
    RunTest(pattern, relabel, "(curlew (avocet 0))", "(vocerleavocet (avocet 0))");

    // Two node-name references concatenated.
    pattern = TregexPattern.Compile("A=baz < /curlew.*/=foo < /avocet.*/=bar");
    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel baz /^.*$/={foo}={bar}/");
    RunTest(pattern, relabel, "(A (curlewfoo 0) (avocetzzz 1))", "(curlewfooavocetzzz (curlewfoo 0) (avocetzzz 1))");

    // Node-name reference, capture group and variable reference together.
    pattern = TregexPattern.Compile("A=baz < /curle.*/=foo < /avo(.*)/#1%bar");
    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel baz /^(.*)$/={foo}$1%{bar}/");
    RunTest(pattern, relabel, "(A (curlew 0) (avocet 1))", "(curlewAcet (curlew 0) (avocet 1))");

    // Without braces, "=foo" and "%bar" are expected to appear literally in the new label.
    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel baz /^(.*)$/=foo$1%bar/");
    RunTest(pattern, relabel, "(A (curlew 0) (avocet 1))", "(=fooA%bar (curlew 0) (avocet 1))");

    // The regex occurs twice in the label; both occurrences are expected to be replaced.
    pattern = TregexPattern.Compile("/foo/=foo");
    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel foo /foo/bar/");
    RunTest(pattern, relabel, "(foofoo (curlew 0) (avocet 1))", "(barbar (curlew 0) (avocet 1))");

    pattern = TregexPattern.Compile("/foo/=foo < /cur.*/=bar");
    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel foo /foo/={bar}/");
    RunTest(pattern, relabel, "(foofoo (curlew 0) (avocet 1))", "(curlewcurlew (curlew 0) (avocet 1))");

    // Capture group in the regex, reused in the replacement.
    pattern = TregexPattern.Compile("/^foo(.*)$/=foo");
    relabel = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("relabel foo /foo(.*)$/bar$1/");
    RunTest(pattern, relabel, "(foofoo (curlew 0) (avocet 1))", "(barfoo (curlew 0) (avocet 1))");
}
/// <summary>
/// Checks the tsurgeon "prune" operation, first with the name bound to B nodes
/// and then with the same operation and the name bound to C nodes.
/// </summary>
public virtual void TestPrune()
{
    TsurgeonPattern pruneBob = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("prune bob");

    // "bob" names the B nodes.
    TregexPattern matchB = TregexPattern.Compile("B=bob");
    RunTest(matchB, pruneBob, "(A (B (C 1)))", null);
    RunTest(matchB, pruneBob, "(A (foo 1) (B (C 1)))", "(A (foo 1))");
    RunTest(matchB, pruneBob, "(A (B 1) (B (C 1)))", null);
    // No B node in this tree, so it is expected back unchanged.
    RunTest(matchB, pruneBob, "(A (foo 1) (bar (C 1)))", "(A (foo 1) (bar (C 1)))");

    // "bob" names the C nodes.
    TregexPattern matchC = TregexPattern.Compile("C=bob");
    RunTest(matchC, pruneBob, "(A (B (C 1)))", null);
    RunTest(matchC, pruneBob, "(A (foo 1) (B (C 1)))", "(A (foo 1))");
    RunTest(matchC, pruneBob, "(A (B 1) (B (C 1)))", "(A (B 1))");
    RunTest(matchC, pruneBob, "(A (foo 1) (bar (C 1)))", "(A (foo 1))");
}
/// <summary>
/// Creates a normalizer on top of an <see cref="ArabicTreebankLanguagePack"/>,
/// stores the option flags, and compiles the patterns kept in the instance fields.
/// </summary>
/// <param name="retainNPTmp">Stored in the <c>retainNPTmp</c> field.</param>
/// <param name="markPRDverb">Stored in the <c>markPRDverb</c> field.</param>
/// <param name="changeNoLabels">Stored in the <c>changeNoLabels</c> field.</param>
/// <param name="retainNPSbj">Stored in the <c>retainNPSbj</c> field.</param>
/// <param name="retainPPClr">Stored in the <c>retainPPClr</c> field.</param>
public ArabicTreeNormalizer(bool retainNPTmp, bool markPRDverb, bool changeNoLabels, bool retainNPSbj, bool retainPPClr)
    : base(new ArabicTreebankLanguagePack())
{
    // Option flags, stored in parameter order.
    this.retainNPTmp = retainNPTmp;
    this.markPRDverb = markPRDverb;
    this.changeNoLabels = changeNoLabels;
    this.retainNPSbj = retainNPSbj;
    this.retainPPClr = retainPPClr;

    // The root label is the start symbol reported by the language pack.
    rootLabel = tlp.StartSymbol();

    // Patterns involving -PRD labels: a tree pattern and a plain regex.
    prdVerbPattern = TregexPattern.Compile("/^V[^P]/ > VP $ /-PRD$/=prd");
    prdPattern = Pattern.Compile("^[A-Z]+-PRD");

    //Marks NP subjects that *do not* occur in verb-initial clauses
    npSbjPattern = TregexPattern.Compile("/^NP-SBJ/ !> @VP");

    emptyFilter = new ArabicTreeNormalizer.ArabicEmptyFilter();
}
/// <summary>Repeatedly matches a pattern against a tree and applies surgery on each match.</summary>
/// <remarks>
/// While
/// <paramref name="matchPattern"/>
/// finds a match, the operations of
/// <paramref name="p"/>
/// are evaluated on the tree and matching restarts on the resulting tree.
/// Processing stops when no match is found or when an evaluation returns
/// <see langword="null"/>
/// .
/// </remarks>
/// <param name="matchPattern">
/// A
/// <see cref="Edu.Stanford.Nlp.Trees.Tregex.TregexPattern"/>
/// to be matched against a
/// <see cref="Edu.Stanford.Nlp.Trees.Tree"/>
/// .
/// </param>
/// <param name="p">
/// A
/// <see cref="TsurgeonPattern"/>
/// to apply.
/// </param>
/// <param name="t">
/// the
/// <see cref="Edu.Stanford.Nlp.Trees.Tree"/>
/// to match against and perform surgery on.
/// </param>
/// <returns>
/// The tree after all surgery, or
/// <see langword="null"/>
/// if an evaluation returned
/// <see langword="null"/>
/// .
/// </returns>
public static Tree ProcessPattern(TregexPattern matchPattern, TsurgeonPattern p, Tree t)
{
    TregexMatcher search = matchPattern.Matcher(t);
    TsurgeonMatcher surgeon = p.Matcher();
    Tree current = t;
    while (search.Find())
    {
        current = surgeon.Evaluate(current, search);
        if (current == null)
        {
            // The evaluation produced no tree, so there is nothing left to match against.
            return null;
        }

        // Start a fresh matcher on the tree returned by the evaluation.
        search = matchPattern.Matcher(current);
    }

    return current;
}
/// <summary>
/// Runs an insert operation followed by a delete operation, each with its own
/// tregex pattern, over one tree and checks the resulting tree.
/// </summary>
public virtual void TestInsertDelete()
{
    // The same bug as the Replace bug, but for a sequence of
    // insert/delete operations
    TregexPattern insertMatch = TregexPattern.Compile("(/-([0-9]+)$/#1%i=src > /^FILLER$/) : (/^-NONE-/=dest <: /-([0-9]+)$/#1%i !$ ~src)");
    TsurgeonPattern insertOperation = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("insert src $+ dest");
    TregexPattern deleteMatch = TregexPattern.Compile("(/-([0-9]+)$/#1%i=src > /^FILLER$/) : (/^-NONE-/=dest <: /-([0-9]+)$/#1%i)");
    TsurgeonPattern deleteOperation = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation("delete dest");

    // The insert step is listed before the delete step.
    IList<Pair<TregexPattern, TsurgeonPattern>> steps = new List<Pair<TregexPattern, TsurgeonPattern>>();
    steps.Add(new Pair<TregexPattern, TsurgeonPattern>(insertMatch, insertOperation));
    steps.Add(new Pair<TregexPattern, TsurgeonPattern>(deleteMatch, deleteOperation));

    string inputTree = "( (S (FILLER (NP-SBJ-1 (NNP Koito))) (VP (VBZ has) (VP (VBN refused) (S (NP-SBJ (-NONE- *-1)) (VP (TO to) (VP (VB grant) (NP (NNP Mr.) (NNP Pickens)) (NP (NP (NNS seats)) (PP-LOC (IN on) (NP (PRP$ its) (NN board))))))) (, ,) (S-ADV (NP-SBJ (-NONE- *-1)) (VP (VBG asserting) (SBAR (-NONE- 0) (S (NP-SBJ (PRP he)) (VP (VBZ is) (NP-PRD (NP (DT a) (NN greenmailer)) (VP (VBG trying) (S (NP-SBJ (-NONE- *)) (VP (TO to) (VP (VB pressure) (NP (NP (NNP Koito) (POS 's)) (JJ other) (NNS shareholders)) (PP-CLR (IN into) (S-NOM (NP-SBJ (-NONE- *)) (VP (VBG buying) (NP (PRP him)) (PRT (RP out)) (PP-MNR (IN at) (NP (DT a) (NN profit)))))))))))))))))) (. .)))";
    string expectedTree = "( (S (FILLER (NP-SBJ-1 (NNP Koito))) (VP (VBZ has) (VP (VBN refused) (S (NP-SBJ (NP-SBJ-1 (NNP Koito))) (VP (TO to) (VP (VB grant) (NP (NNP Mr.) (NNP Pickens)) (NP (NP (NNS seats)) (PP-LOC (IN on) (NP (PRP$ its) (NN board))))))) (, ,) (S-ADV (NP-SBJ (NP-SBJ-1 (NNP Koito))) (VP (VBG asserting) (SBAR (-NONE- 0) (S (NP-SBJ (PRP he)) (VP (VBZ is) (NP-PRD (NP (DT a) (NN greenmailer)) (VP (VBG trying) (S (NP-SBJ (-NONE- *)) (VP (TO to) (VP (VB pressure) (NP (NP (NNP Koito) (POS 's)) (JJ other) (NNS shareholders)) (PP-CLR (IN into) (S-NOM (NP-SBJ (-NONE- *)) (VP (VBG buying) (NP (PRP him)) (PRT (RP out)) (PP-MNR (IN at) (NP (DT a) (NN profit)))))))))))))))))) (. .)))";

    RunTest(steps, inputTree, expectedTree);
}
/**
 * Parses a tsurgeon script text input and compiles a tregex pattern and a list
 * of tsurgeon operations into a pair.
 *
 * @param reader Reader to read patterns from
 * @return A pair of a tregex and tsurgeon pattern read from a file, or <code>null</code>
 *    when the operations in the Reader have been exhausted
 * @throws IOException If any IO problem
 */
/*public static Tuple<TregexPattern, TsurgeonPattern> getOperationFromReader(BufferedReader reader, TregexPatternCompiler compiler) /*throws IOException#1# {
  string patternString = getTregexPatternFromReader(reader);
  if ("".equals(patternString)) {
    return null;
  }
  TregexPattern matchPattern = compiler.compile(patternString);

  TsurgeonPattern collectedPattern = getTsurgeonOperationsFromReader(reader);
  return new Pair<TregexPattern,TsurgeonPattern>(matchPattern,collectedPattern);
}*/

/**
 * Assumes that we are at the beginning of a tsurgeon script file and gets the string for the
 * tregex pattern leading the file
 * @return tregex pattern string
 */
/*public static string getTregexPatternFromReader(BufferedReader reader) throws IOException {
  StringBuilder matchString = new StringBuilder();
  for (string thisLine; (thisLine = reader.readLine()) != null; ) {
    if (matchString.length() > 0 && emptyLinePattern.matcher(thisLine).matches()) {
      // A blank line after getting some real content (not just comments or nothing)
      break;
    }
    Matcher m = commentPattern.matcher(thisLine);
    if (m.matches()) {
      // delete it
      thisLine = m.replaceFirst("");
    }
    if ( ! emptyLinePattern.matcher(thisLine).matches()) {
      matchString.append(thisLine);
    }
  }
  return matchString.ToString();
}*/

/**
 * Assumes the given reader has only tsurgeon operations (not a tregex pattern), and parses
 * these out, collecting them into one operation.  Stops on a whitespace line.
 *
 * @throws IOException
 */
/*public static TsurgeonPattern getTsurgeonOperationsFromReader(BufferedReader reader) throws IOException {
  List<TsurgeonPattern> operations = new ArrayList<TsurgeonPattern>();
  for (string thisLine; (thisLine = reader.readLine()) != null; ) {
    if (emptyLinePattern.matcher(thisLine).matches()) {
      break;
    }
    thisLine = removeComments(thisLine);
    if (emptyLinePattern.matcher(thisLine).matches()) {
      continue;
    }
    operations.add(parseOperation(thisLine));
  }

  if (operations.size() == 0)
    throw new TsurgeonParseException("No Tsurgeon operation provided.");

  return collectOperations(operations);
}*/

/*private static string removeComments(string line) {
  Matcher m = commentPattern.matcher(line);
  line = m.replaceFirst("");
  Matcher m1 = escapedCommentCharacterPattern.matcher(line);
  line = m1.replaceAll(commentIntroducingCharacter);
  return line;
}*/

/**
 * Assumes the given reader has only tsurgeon operations (not a tregex pattern), and returns
 * them as a String, mirroring the way the strings appear in the file. This is helpful
 * for lazy evaluation of the operations, as in a GUI,
 * because you do not parse the operations on load.  Comments are still excised.
 * @throws IOException
 */
/*public static string getTsurgeonTextFromReader(BufferedReader reader) throws IOException {
  StringBuilder sb = new StringBuilder();
  for (string thisLine; (thisLine = reader.readLine()) != null; ) {
    thisLine = removeComments(thisLine);
    if (emptyLinePattern.matcher(thisLine).matches()) {
      continue;
    }
    sb.append(thisLine);
    sb.append('\n');
  }
  return sb.ToString();
}*/

/**
 * Parses a tsurgeon script file and compiles all operations in the file into a list
 * of pairs of tregex and tsurgeon patterns.
 *
 * @param filename file containing the tsurgeon script
 * @return A pair of a tregex and tsurgeon pattern read from a file
 * @throws IOException If there is any I/O problem
 */
/*public static List<Pair<TregexPattern, TsurgeonPattern>> getOperationsFromFile(string filename, string encoding, TregexPatternCompiler compiler) throws IOException {
  List<Pair<TregexPattern,TsurgeonPattern>> operations = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
  BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), encoding));
  for ( ; ; ) {
    Pair<TregexPattern, TsurgeonPattern> operation = getOperationFromReader(reader, compiler);
    if (operation == null) {
      break;
    }
    operations.add(operation);
  }
  reader.close();
  return operations;
}*/

/// <summary>
/// Applies <see cref="ProcessPattern"/> to every tree in a list, in list order.
/// </summary>
/// <param name="matchPattern">A <see cref="TregexPattern"/> to be matched against each <see cref="Tree"/>.</param>
/// <param name="p">A <see cref="TsurgeonPattern"/> to apply.</param>
/// <param name="inputTrees">The input trees to be processed</param>
/// <returns>
/// A new list holding, for each input tree, whatever <see cref="ProcessPattern"/>
/// returned for it (including <see langword="null"/> entries).
/// </returns>
public static List<Tree> ProcessPatternOnTrees(TregexPattern matchPattern, TsurgeonPattern p, List<Tree> inputTrees)
{
    int treeCount = inputTrees.Count;
    List<Tree> transformed = new List<Tree>(treeCount);
    for (int index = 0; index < treeCount; index++)
    {
        Tree processed = ProcessPattern(matchPattern, p, inputTrees[index]);
        transformed.Add(processed);
    }

    return transformed;
}
/// <summary>
/// Repeatedly matches a pattern against a tree; on every match the surgical operations of a
/// <see cref="TsurgeonPattern"/> are evaluated and matching restarts on the resulting tree.
/// </summary>
/// <param name="matchPattern">A <see cref="TregexPattern"/> to be matched against a <see cref="Tree"/>.</param>
/// <param name="p">A <see cref="TsurgeonPattern"/> to apply.</param>
/// <param name="t">the <see cref="Tree"/> to match against and perform surgery on.</param>
/// <returns>
/// The tree after all surgery, or <see langword="null"/> if an evaluation returned <see langword="null"/>.
/// </returns>
public static Tree ProcessPattern(TregexPattern matchPattern, TsurgeonPattern p, Tree t)
{
    TregexMatcher treeMatcher = matchPattern.Matcher(t);
    TsurgeonMatcher surgeryMatcher = p.GetMatcher();
    Tree result = t;

    bool keepGoing = treeMatcher.Find();
    while (keepGoing)
    {
        result = surgeryMatcher.Evaluate(result, treeMatcher);
        if (result == null)
        {
            // The evaluation produced no tree: stop and return null.
            keepGoing = false;
        }
        else
        {
            // Match again from scratch on the tree returned by the evaluation.
            treeMatcher = matchPattern.Matcher(result);
            keepGoing = treeMatcher.Find();
        }
    }

    return result;
}