コード例 #1
0
 /// <summary>Writes one tree to the given writer in the configured output form.</summary>
 /// <param name="t">Tree to write.</param>
 /// <param name="pw">Destination writer.</param>
 private void Write(Tree t, PrintWriter pw)
 {
     // Without tagged output the tree prints itself through PennPrint.
     if (!this.taggedOutput)
     {
         t.PennPrint(pw);
         return;
     }
     // Tagged output: one line built from the tree, using the enclosing instance's
     // escape-token and word/tag delimiter settings.
     string taggedLine = ATBTreeUtils.TaggedStringFromTree(t, this._enclosing.removeEscapeTokens, this._enclosing.wordTagDelim);
     pw.Println(taggedLine);
 }
コード例 #2
0
            /// <summary>
            /// Prunes one tree, processes its preterminals, and writes it to the dev, test,
            /// or train output selected from the running count of visited trees.
            /// </summary>
            /// <param name="t">Tree to process; a null tree or one whose root value is "X" is ignored.</param>
            public override void VisitTree(Tree t)
            {
                if (t == null || t.Value().Equals("X"))
                {
                    return;
                }

                t = t.Prune(this.nullFilter, new LabeledScoredTreeFactory());

                // Traces are intentionally *not* stripped here; the ArabicTreeReader does that if needed.
                foreach (Tree subtree in t)
                {
                    if (!subtree.IsPreTerminal())
                    {
                        continue;
                    }
                    this.ProcessPreterminal(subtree);
                }

                this.treesVisited++;

                string flatString = null;
                if (this.makeFlatFile)
                {
                    flatString = ATBTreeUtils.FlattenTree(t);
                }

                // Decimation: a count divisible by 9 goes to dev; otherwise a count divisible
                // by 10 goes to test; everything else goes to train.
                string targetExtension;
                if (this.treesVisited % 9 == 0)
                {
                    targetExtension = this.devExtension;
                }
                else if (this.treesVisited % 10 == 0)
                {
                    targetExtension = this.testExtension;
                }
                else
                {
                    targetExtension = this.trainExtension;
                }

                this.Write(t, this.outFiles[targetExtension]);
                if (this.makeFlatFile)
                {
                    this.outFiles[targetExtension + this.flatExtension].Println(flatString);
                }
            }
コード例 #3
0
 /// <summary>
 /// Filters, normalizes, and writes a single tree to the output file and, when one is
 /// open, a flattened rendering to the flat file.
 /// </summary>
 /// <remarks>
 /// Trees that are null, have the root value "X", or whose yield is longer than the
 /// enclosing instance's <c>maxLen</c> are skipped without producing output.
 /// </remarks>
 /// <param name="t">Tree to process; may be null.</param>
 public virtual void VisitTree(Tree t)
 {
     // Filter out missing trees and XBar trees. The literal is the receiver so a root
     // whose value is null falls through instead of throwing; the ROOT check further
     // down already allows for a null root value.
     if (t == null || "X".Equals(t.Value()))
     {
         return;
     }
     // Skip trees whose yield exceeds the configured maximum length.
     if (t.Yield().Count > this._enclosing.maxLen)
     {
         return;
     }
     // Strip out traces and pronoun deletion markers,
     t = t.Prune(this.nullFilter, this.tf);
     t = this.ArabicAoverAFilter(t);
     // Visit nodes with a custom visitor
     if (this._enclosing.customTreeVisitor != null)
     {
         this._enclosing.customTreeVisitor.VisitTree(t);
     }
     // Process each node in the tree
     foreach (Tree node in t)
     {
         if (node.IsPreTerminal())
         {
             this.ProcessPreterminal(node);
         }
         // When dash tags are being removed, reduce every non-leaf label to its basic category.
         if (this._enclosing.removeDashTags && !node.IsLeaf())
         {
             node.SetValue(this.tlp.BasicCategory(node.Value()));
         }
     }
     // Add a ROOT node if necessary
     if (this._enclosing.addRoot && t.Value() != null && !t.Value().Equals("ROOT"))
     {
         t = this.tf.NewTreeNode("ROOT", Collections.SingletonList(t));
     }
     // Output the trees to file
     this.outfile.Println(t.ToString());
     if (this.flatFile != null)
     {
         // Flatten once; un-escape the result only when escape tokens are being removed.
         string flatString = ATBTreeUtils.FlattenTree(t);
         if (this._enclosing.removeEscapeTokens)
         {
             flatString = ATBTreeUtils.UnEscape(flatString);
         }
         this.flatFile.Println(flatString);
     }
 }
コード例 #4
0
 /// <summary>
 /// Prunes one tree, processes its preterminals, and writes it as a tagged string;
 /// a flattened rendering also goes to the flat file when one is open.
 /// </summary>
 /// <param name="t">Tree to process; a null tree or one whose root value is "X" is ignored.</param>
 public override void VisitTree(Tree t)
 {
     bool skip = t == null || t.Value().Equals("X");
     if (skip)
     {
         return;
     }

     t = t.Prune(this.nullFilter, new LabeledScoredTreeFactory());

     foreach (Tree subtree in t)
     {
         if (!subtree.IsPreTerminal())
         {
             continue;
         }
         this.ProcessPreterminal(subtree);
     }

     // Tagged rendering uses the enclosing instance's escape-token and delimiter settings.
     string taggedLine = ATBTreeUtils.TaggedStringFromTree(t, this._enclosing.removeEscapeTokens, this._enclosing.wordTagDelim);
     this.outfile.Println(taggedLine);

     if (this.flatFile == null)
     {
         return;
     }
     this.flatFile.Println(ATBTreeUtils.FlattenTree(t));
 }
コード例 #5
0
        /// <summary>Escapes a word in up to three stages.</summary>
        /// <remarks>
        /// The word is stripped of annotations and classing, escaped, and then passed through the
        /// lexical mapper. Whenever a later stage yields an empty string, the result of the
        /// preceding stage is returned instead. If escaping changes the stripped word, the
        /// escaped form is returned directly and the lexical mapper is not consulted.
        /// </remarks>
        /// <param name="word">The word to escape.</param>
        /// <returns>The escaped string</returns>
        private string EscapeString(string word)
        {
            string stripped = StripAnnotationsAndClassing(word);
            string escaped = ATBTreeUtils.Escape(stripped);

            // Escaping produced nothing: fall back to the stripped form.
            if (escaped.IsEmpty())
            {
                return stripped;
            }

            // Escaping changed the token: the escaped form is final.
            if (!stripped.Equals(escaped))
            {
                return escaped;
            }

            // Escaping was a no-op, so try the lexical mapper and keep its result only if non-empty.
            string mapped = lexMapper.Map(null, escaped);
            return mapped.IsEmpty() ? escaped : mapped;
        }
コード例 #6
0
        /// <summary>
        /// Loads the treebank from the configured paths and writes each retained tree to the
        /// output file, plus a flattened rendering to the flat file when flat output is enabled.
        /// </summary>
        /// <remarks>
        /// Trees matching any of the hard-coded Tregex discard patterns are logged and skipped.
        /// When a split set is present, trees whose Candito ID is not in it are skipped as well.
        /// Encoding, file-open, and Tregex compilation failures are reported on standard error
        /// instead of being rethrown, and both writers are closed in every case.
        /// </remarks>
        public override void Build()
        {
            foreach (File dataPath in pathsToData)
            {
                treebank.LoadPath(dataPath, treeFileExtension, false);
            }

            PrintWriter outfile = null;
            PrintWriter flatFile = null;

            try
            {
                outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8")));
                if (makeFlatFile)
                {
                    flatFile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(flatFileName), "UTF-8")));
                }

                outputFileList.Add(outFileName);
                if (makeFlatFile)
                {
                    outputFileList.Add(flatFileName);
                    toStringBuffer.Append(" Made flat files\n");
                }

                PreprocessMWEs();

                IList<TregexPattern> discardPatterns = new List<TregexPattern>
                {
                    // These trees appear in the Candito training set.
                    // They are mangled by the TreeCorrector, so discard them ahead of time.
                    TregexPattern.Compile("@SENT <: @PUNC"),
                    TregexPattern.Compile("@SENT <1 @PUNC <2 @PUNC !<3 __"),
                    // wsg2011: This filters out tree #552 in the Candito test set. We saved this tree for the
                    // EMNLP2011 paper, but since it consists entirely of punctuation, it won't be evaluated anyway.
                    // Since we aren't doing the split in this data set, just remove the tree.
                    TregexPattern.Compile("@SENT <1 @PUNC <2 @PUNC <3 @PUNC <4 @PUNC !<5 __")
                };

                foreach (Tree tree in treebank)
                {
                    // Discard the tree as soon as any pattern matches it.
                    bool matchesDiscardPattern = false;
                    foreach (TregexPattern pattern in discardPatterns)
                    {
                        if (pattern.Matcher(tree).Find())
                        {
                            matchesDiscardPattern = true;
                            break;
                        }
                    }
                    if (matchesDiscardPattern)
                    {
                        log.Info("Discarding tree: " + tree.ToString());
                        continue;
                    }

                    // Skip trees that aren't in this part of the split.
                    if (splitSet != null && !splitSet.Contains(GetCanditoTreeID(tree)))
                    {
                        continue;
                    }

                    if (customTreeVisitor != null)
                    {
                        customTreeVisitor.VisitTree(tree);
                    }

                    outfile.Println(tree.ToString());

                    if (makeFlatFile)
                    {
                        // Flatten once; un-escape only when escape tokens are being removed.
                        string flatString = ATBTreeUtils.FlattenTree(tree);
                        if (removeEscapeTokens)
                        {
                            flatString = ATBTreeUtils.UnEscape(flatString);
                        }
                        flatFile.Println(flatString);
                    }
                }
            }
            catch (UnsupportedEncodingException e)
            {
                System.Console.Error.Printf("%s: Filesystem does not support UTF-8 output%n", this.GetType().FullName);
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException)
            {
                System.Console.Error.Printf("%s: Could not open %s for writing%n", this.GetType().FullName, outFileName);
            }
            catch (TregexParseException e)
            {
                System.Console.Error.Printf("%s: Could not compile Tregex expressions%n", this.GetType().FullName);
                Sharpen.Runtime.PrintStackTrace(e);
            }
            finally
            {
                // Close whichever writers were successfully opened.
                if (outfile != null)
                {
                    outfile.Close();
                }
                if (flatFile != null)
                {
                    flatFile.Close();
                }
            }
        }