/// <summary>Builds a normalized copy of a tree for evaluation.</summary>
/// <remarks>
/// Node labels are reduced to their basic category, a first child labelled
/// "TOPP" is spliced out (its children are attached directly to
/// <paramref name="tree"/>, which mutates the input), a start-symbol node with
/// a single child is replaced by that child, and, when <c>deletePunct</c> is
/// set, preterminals whose tag is EVALB-ignored punctuation are dropped.
/// </remarks>
/// <param name="tree">The tree (or subtree) to transform.</param>
/// <returns>
/// The transformed tree, or <c>null</c> when the node itself was deleted or
/// all of its children were deleted.
/// </returns>
public virtual Tree TransformTree(Tree tree)
        {
            ILabel l = tree.Label();

            if (tree.IsLeaf())
            {
                return(tf.NewLeaf(l));
            }
            string s = l.Value();

            s = tlpp.TreebankLanguagePack().BasicCategory(s);
            if (deletePunct)
            {
                // this is broken as it's not the right thing to do when there
                // is any tag ambiguity -- and there is for ' (POS/'').  Sentences
                // can then have more or less words.  It's also unnecessary for EVALB,
                // since it ignores punctuation anyway
                if (tree.IsPreTerminal() && tlpp.TreebankLanguagePack().IsEvalBIgnoredPunctuationTag(s))
                {
                    return(null);
                }
            }
            // TEMPORARY: eliminate the TOPP constituent
            if (tree.Children()[0].Label().Value().Equals("TOPP"))
            {
                log.Info("Found a TOPP");
                tree.SetChildren(tree.Children()[0].Children());
            }
            // Negra has lots of non-unary roots; delete unary roots
            if (tlpp.TreebankLanguagePack().IsStartSymbol(s) && tree.NumChildren() == 1)
            {
                // NB: This deletes the boundary symbol, which is in the tree!
                return(TransformTree(tree.GetChild(0)));
            }
            IList <Tree> children = new List <Tree>();

            // The child count is read here, after the TOPP splice above may have changed it.
            int numC = tree.NumChildren();

            for (int cNum = 0; cNum < numC; cNum++)
            {
                Tree child    = tree.GetChild(cNum);
                Tree newChild = TransformTree(child);
                if (newChild != null)
                {
                    children.Add(newChild);
                }
            }
            // A node whose children were all deleted is deleted as well.
            if (children.Count == 0)
            {
                return(null);
            }
            return(tf.NewTreeNode(new StringLabel(s), children));
        }
Esempio n. 2
0
        /// <summary>
        /// Creates the annotator: resolves the parser-params class name from the properties and
        /// builds a simple tree binarizer from that class's head finder and language pack.
        /// </summary>
        /// <param name="annotatorName">Prefix of the <c>.tlppClass</c> property key.</param>
        /// <param name="props">Properties consulted for the class name; <c>DefaultTlppClass</c> is the fallback.</param>
        public BinarizerAnnotator(string annotatorName, Properties props)
        {
            string classNameKey = annotatorName + ".tlppClass";

            this.tlppClass = props.GetProperty(classNameKey, DefaultTlppClass);

            // The params object is instantiated reflectively from the configured class name.
            ITreebankLangParserParams parserParams = ReflectionLoading.LoadByReflection(tlppClass);
            var headFinder   = parserParams.HeadFinder();
            var languagePack = parserParams.TreebankLanguagePack();

            this.binarizer = TreeBinarizer.SimpleTreeBinarizer(headFinder, languagePack);
        }
        /// <summary>
        /// Command-line tool that prints every tree of the given treebank files, either stripping
        /// the top-level start-symbol node (option <c>b</c>) or adding one when it is missing.
        /// </summary>
        /// <param name="args">Option flags followed by the treebank path(s) to load.</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(Usage());
                System.Environment.Exit(-1);
            }
            Properties options             = StringUtils.ArgsToProperties(args, ArgDefs());
            Language   language            = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
            ITreebankLangParserParams tlpp = language.@params;
            // NOTE(review): the encoding is read from the same "l" key as the language above, which
            // looks like a typo for a dedicated encoding option -- confirm against ArgDefs()/Usage().
            string encoding      = options.GetProperty("l", "UTF-8");
            bool   removeBracket = PropertiesUtils.GetBool(options, "b", false);

            tlpp.SetInputEncoding(encoding);
            tlpp.SetOutputEncoding(encoding);
            DiskTreebank tb = tlpp.DiskTreebank();
            string[] files = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
            if (files.Length != 0)
            {
                foreach (string filename in files)
                {
                    tb.LoadPath(filename);
                }
            }
            else
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            PrintWriter  pwo         = tlpp.Pw();
            string       startSymbol = tlpp.TreebankLanguagePack().StartSymbol();
            ITreeFactory tf          = new LabeledScoredTreeFactory();
            int          nTrees      = 0;

            foreach (Tree t in tb)
            {
                // A foreach iteration variable cannot be assigned in C#, so the tree that
                // will be printed is tracked in its own local.
                Tree outTree      = t;
                bool hasTopSymbol = t.Value().Equals(startSymbol);

                if (removeBracket)
                {
                    if (hasTopSymbol)
                    {
                        outTree = t.FirstChild();
                    }
                }
                else
                {
                    if (!hasTopSymbol)
                    {
                        //Add a bracket if it isn't already there
                        outTree = tf.NewTreeNode(startSymbol, Java.Util.Collections.SingletonList(t));
                    }
                }
                pwo.Println(outTree.ToString());
                nTrees++;
            }
            pwo.Close();
            System.Console.Error.Printf("Processed %d trees.%n", nTrees);
        }
Esempio n. 4
0
 /// <summary>
 /// Wires up the annotator, the binarizer and the optional post-splitter, plus the optional
 /// counters for annotated rules and states.
 /// </summary>
 /// <param name="annotationHF">Head finder handed to the annotator.</param>
 /// <param name="binarizationHF">Head finder handed to the binarizer.</param>
 /// <param name="tlpParams">Parser parameters; also the source of the treebank language pack.</param>
 /// <param name="forceCNF">Stored in the <c>forceCNF</c> field.</param>
 /// <param name="insideFactor">Forwarded to the <c>TreeBinarizer</c> constructor.</param>
 /// <param name="doSubcategorization">
 /// When true a <c>TreeAnnotator</c> is created, otherwise a <c>TreeNullAnnotator</c>.
 /// </param>
 /// <param name="op">Parser options; its <c>trainOptions</c> drive the remaining choices.</param>
 public TreeAnnotatorAndBinarizer(IHeadFinder annotationHF, IHeadFinder binarizationHF, ITreebankLangParserParams tlpParams, bool forceCNF, bool insideFactor, bool doSubcategorization, Options op)
 {
     this.forceCNF     = forceCNF;
     this.trainOptions = op.trainOptions;

     // Annotator: the real one only when subcategorization is requested.
     if (!doSubcategorization)
     {
         annotator = new TreeAnnotatorAndBinarizer.TreeNullAnnotator(annotationHF);
     }
     else
     {
         annotator = new TreeAnnotator(annotationHF, tlpParams, op);
     }

     // Binarizer: configured entirely from the training options.
     binarizer = new TreeBinarizer(
         binarizationHF,
         tlpParams.TreebankLanguagePack(),
         insideFactor,
         trainOptions.markovFactor,
         trainOptions.markovOrder,
         trainOptions.CompactGrammar() > 0,
         trainOptions.CompactGrammar() > 1,
         trainOptions.HselCut,
         trainOptions.markFinalStates,
         trainOptions.simpleBinarizedLabels,
         trainOptions.noRebinarization);

     // The post-splitter stays null unless selective post-splitting is enabled.
     postSplitter = trainOptions.selectivePostSplit ? new PostSplitter(tlpParams, op) : null;

     this.tf  = new LabeledScoredTreeFactory(new CategoryWordTagFactory());
     this.tlp = tlpParams.TreebankLanguagePack();

     // Counters are allocated only when the matching print option is on.
     annotatedRuleCounts  = trainOptions.printAnnotatedRuleCounts ? new ClassicCounter <Tree>() : null;
     annotatedStateCounts = trainOptions.printAnnotatedStateCounts ? new ClassicCounter <string>() : null;
 }
 /// <summary>
 /// Creates an empty dependency grammar: copies the smoothing switches from the options,
 /// allocates the argument/stop counters and reads the four smoothing parameters supplied
 /// by the parser params.
 /// </summary>
 /// <param name="tagProjection">Tag projection passed to the base constructor.</param>
 /// <param name="tlpParams">Source of the language pack and of the smoothing parameters.</param>
 /// <param name="directional">Passed to the base constructor.</param>
 /// <param name="useDistance">Passed to the base constructor.</param>
 /// <param name="useCoarseDistance">Passed to the base constructor.</param>
 /// <param name="op">Parser options.</param>
 /// <param name="wordIndex">Word index passed to the base constructor.</param>
 /// <param name="tagIndex">Tag index passed to the base constructor.</param>
 public MLEDependencyGrammar(ITagProjection tagProjection, ITreebankLangParserParams tlpParams, bool directional, bool useDistance, bool useCoarseDistance, Options op, IIndex <string> wordIndex, IIndex <string> tagIndex)
     : base(tlpParams.TreebankLanguagePack(), tagProjection, directional, useDistance, useCoarseDistance, op, wordIndex, tagIndex)
 {
     // NOTE(review): the notes below appear to describe field declarations of this class
     // rather than statements of this constructor -- confirm against the field list.
     //   - reduced tag space
     //   - public double distanceDecay = 0.0;
     //   - extra smoothing hyperparameters for tag projection backoff; only used if
     //     useSmoothTagProjection is true:
     //       * back off Bayesian m-estimate of aTW given aT to aPTW given aPT
     //       * back off Bayesian m-estimate of aTW_hTd to aPTW_hPTd (?? guessed, not tuned)
     //       * back off Bayesian m-estimate of aT_hTd to aPT_hPTd (?? guessed, not tuned)
     //   - back off word prediction from tag to projected tag (only used if
     //     useUnigramWordSmoothing is true)

     // Smoothing switches come straight from the options.
     useSmoothTagProjection  = op.useSmoothTagProjection;
     useUnigramWordSmoothing = op.useUnigramWordSmoothing;

     // Fresh, empty counters.
     argCounter  = new ClassicCounter <IntDependency>();
     stopCounter = new ClassicCounter <IntDependency>();

     // Slots 0..3 of the parameter array map onto the four smoothing fields.
     double[] weights = tlpParams.MLEDependencyGrammarSmoothingParams();
     smooth_aT_hTWd  = weights[0];
     smooth_aTW_hTWd = weights[1];
     smooth_stop     = weights[2];
     interp          = weights[3];

     // cdm added Jan 2007 to play with dep grammar smoothing.  Integrate this better if we keep it!
     smoothTP = new BasicCategoryTagProjection(tlpParams.TreebankLanguagePack());
 }
Esempio n. 6
0
        /// <summary>Lets you test out the TreeBinarizer on the command line.</summary>
        /// <remarks>
        /// This main method doesn't yet handle as many flags as one would like.
        /// Flags recognized (compared case-insensitively):
        /// <list type="bullet">
        /// <item><description><c>-tlp</c> class name of a TreebankLanguagePack</description></item>
        /// <item><description><c>-tlpp</c> class name of a TreebankLangParserParams</description></item>
        /// <item><description><c>-insideFactor</c></description></item>
        /// <item><description><c>-markovOrder</c> integer</description></item>
        /// <item><description><c>-simpleLabels</c></description></item>
        /// <item><description><c>-noRebinarization</c></description></item>
        /// </list>
        /// Any other argument starting with "-" is logged as an unknown option and skipped.
        /// </remarks>
        /// <param name="args">
        /// Command line arguments: flags as above, followed by treebankPath
        /// </param>
        /// <exception cref="Exception">
        /// Thrown when the class named by <c>-tlp</c> or <c>-tlpp</c> cannot be instantiated;
        /// the original failure is attached as the inner exception.
        /// </exception>
        public static void Main(string[] args)
        {
            ITreebankLangParserParams tlpp = null;
            // TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            // TreeReaderFactory trf = new LabeledScoredTreeReaderFactory();
            // Looks like it must build CategoryWordTagFactory!!
            ITreeReaderFactory    trf     = null;
            string                fileExt = "mrg";
            IHeadFinder           hf      = new ModCollinsHeadFinder();
            ITreebankLanguagePack tlp     = new PennTreebankLanguagePack();
            bool   insideFactor           = false;
            bool   mf               = false;
            int    mo               = 1;
            bool   uwl              = false;
            bool   uat              = false;
            double sst              = 20.0;
            bool   mfs              = false;
            bool   simpleLabels     = false;
            bool   noRebinarization = false;
            int    i = 0;

            while (i < args.Length && args[i].StartsWith("-"))
            {
                // Flags that take a value are only honoured when a following argument exists;
                // otherwise they fall through to the "Unknown option" branch.
                bool hasValue = i + 1 < args.Length;

                if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tlp") && hasValue)
                {
                    try
                    {
                        tlp = (ITreebankLanguagePack)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
                    }
                    catch (Exception e)
                    {
                        log.Info("Couldn't instantiate: " + args[i + 1]);
                        // Wrap with a message and keep the cause as the inner exception.
                        throw new Exception("Couldn't instantiate: " + args[i + 1], e);
                    }
                    i++;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tlpp") && hasValue)
                {
                    try
                    {
                        tlpp = (ITreebankLangParserParams)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
                    }
                    catch (Exception e)
                    {
                        log.Info("Couldn't instantiate: " + args[i + 1]);
                        // Wrap with a message and keep the cause as the inner exception.
                        throw new Exception("Couldn't instantiate: " + args[i + 1], e);
                    }
                    i++;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-insideFactor"))
                {
                    insideFactor = true;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-markovOrder") && hasValue)
                {
                    i++;
                    mo = System.Convert.ToInt32(args[i]);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-simpleLabels"))
                {
                    simpleLabels = true;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-noRebinarization"))
                {
                    noRebinarization = true;
                }
                else
                {
                    log.Info("Unknown option:" + args[i]);
                }
                i++;
            }
            // No argument left over means no treebank path was given.
            if (i >= args.Length)
            {
                log.Info("usage: java TreeBinarizer [-tlpp class|-markovOrder int|...] treebankPath");
                System.Environment.Exit(0);
            }
            Treebank treebank;

            if (tlpp != null)
            {
                // Parser params, when supplied, override the language pack, file extension and head finder.
                treebank = tlpp.MemoryTreebank();
                tlp      = tlpp.TreebankLanguagePack();
                fileExt  = tlp.TreebankFileExtension();
                hf       = tlpp.HeadFinder();
            }
            else
            {
                // NOTE(review): trf is still null here -- confirm DiskTreebank accepts a null reader factory.
                treebank = new DiskTreebank(trf);
            }
            treebank.LoadPath(args[i], fileExt, true);
            ITreeTransformer tt = new Edu.Stanford.Nlp.Parser.Lexparser.TreeBinarizer(hf, tlp, insideFactor, mf, mo, uwl, uat, sst, mfs, simpleLabels, noRebinarization);

            foreach (Tree t in treebank)
            {
                Tree newT = tt.TransformTree(t);
                System.Console.Out.WriteLine("Original tree:");
                t.PennPrint();
                System.Console.Out.WriteLine("Binarized tree:");
                newT.PennPrint();
                System.Console.Out.WriteLine();
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Trains a <c>ChineseLexiconAndWordSegmenter</c> from a treebank: the trees are
        /// (optionally) run through the Collins punctuation transformer, annotated and binarized,
        /// and the resulting trees are used to train the lexicon obtained from <c>op.tlpParams</c>.
        /// </summary>
        /// <param name="trainTreebank">Treebank to train from.</param>
        /// <param name="op">Parser options; <c>trainOptions.splitters</c> and
        /// <c>trainOptions.postSplitters</c> are overwritten when the matching selective-split
        /// options are enabled.</param>
        /// <param name="wordIndex">Word index handed to the lexicon factory.</param>
        /// <param name="tagIndex">Tag index handed to the lexicon factory.</param>
        /// <returns>The trained lexicon/segmenter.</returns>
        private static Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter GetSegmenterDataFromTreebank(Treebank trainTreebank, Options op, IIndex <string> wordIndex, IIndex <string> tagIndex)
        {
            // DateTime.Now, not new DateTime(): the parameterless constructor yields 0001-01-01.
            System.Console.Out.WriteLine("Currently " + DateTime.Now);
            //    printOptions(true, op);
            Timing.StartTime();
            // setup tree transforms
            ITreebankLangParserParams tlpParams = op.tlpParams;

            if (op.testOptions.verbose)
            {
                System.Console.Out.Write("Training ");
                System.Console.Out.WriteLine(trainTreebank.TextualSummary());
            }
            System.Console.Out.Write("Binarizing trees...");
            TreeAnnotatorAndBinarizer binarizer;

            // initialized below
            if (!op.trainOptions.leftToRight)
            {
                binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
            }
            else
            {
                binarizer = new TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
            }
            CollinsPuncTransformer collinsPuncTransformer = null;

            if (op.trainOptions.collinsPunc)
            {
                collinsPuncTransformer = new CollinsPuncTransformer(tlpParams.TreebankLanguagePack());
            }
            IList <Tree> binaryTrainTrees = new List <Tree>();

            // List<Tree> binaryTuneTrees = new ArrayList<Tree>();
            if (op.trainOptions.selectiveSplit)
            {
                op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, true, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlpParams.TreebankLanguagePack());
                if (op.testOptions.verbose)
                {
                    log.Info("Parent split categories: " + op.trainOptions.splitters);
                }
            }
            if (op.trainOptions.selectivePostSplit)
            {
                ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
                Treebank         annotatedTB   = trainTreebank.Transform(myTransformer);
                op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlpParams.TreebankLanguagePack());
                if (op.testOptions.verbose)
                {
                    log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
                }
            }
            if (op.trainOptions.hSelSplit)
            {
                // First pass with selective splitting switched off; the transformed trees are
                // discarded, only the binarizer's internal state is of interest.
                binarizer.SetDoSelectiveSplit(false);
                foreach (Tree tree in trainTreebank)
                {
                    // foreach variables are read-only in C#, hence the separate local.
                    Tree current = tree;
                    if (op.trainOptions.collinsPunc)
                    {
                        current = collinsPuncTransformer.TransformTree(current);
                    }
                    binarizer.TransformTree(current);
                }
                binarizer.SetDoSelectiveSplit(true);
            }
            foreach (Tree tree_1 in trainTreebank)
            {
                Tree current = tree_1;
                if (op.trainOptions.collinsPunc)
                {
                    current = collinsPuncTransformer.TransformTree(current);
                }
                binaryTrainTrees.Add(binarizer.TransformTree(current));
            }
            Timing.Tick("done.");
            if (op.testOptions.verbose)
            {
                binarizer.DumpStats();
            }
            System.Console.Out.Write("Extracting Lexicon...");
            Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter clex = (Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter)op.tlpParams.Lex(op, wordIndex, tagIndex);
            clex.InitializeTraining(binaryTrainTrees.Count);
            clex.Train(binaryTrainTrees);
            clex.FinishTraining();
            Timing.Tick("done.");
            return(clex);
        }
Esempio n. 8
0
        /// <summary>
        /// Annotates and binarizes the training, secondary and tuning treebanks with one shared
        /// composite transformer (optional pre-transformer, optional Collins punctuation
        /// transformer, annotator/binarizer, optional word function).
        /// </summary>
        /// <param name="trainTreebank">Primary training treebank.</param>
        /// <param name="secondaryTreebank">Optional secondary training treebank; may be null.</param>
        /// <param name="tuneTreebank">Optional tuning treebank; may be null.</param>
        /// <param name="op">Parser options; <c>trainOptions.splitters</c> and
        /// <c>trainOptions.postSplitters</c> are overwritten when the matching selective-split
        /// options are enabled.</param>
        /// <returns>A Triple of binaryTrainTreebank, binarySecondaryTreebank, binaryTuneTreebank.</returns>
        public static Triple <Treebank, Treebank, Treebank> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank, Treebank secondaryTreebank, Treebank tuneTreebank, Options op)
        {
            // setup tree transforms
            ITreebankLangParserParams tlpParams = op.tlpParams;
            ITreebankLanguagePack     tlp       = tlpParams.TreebankLanguagePack();

            if (op.testOptions.verbose)
            {
                PrintWriter pwErr = tlpParams.Pw(System.Console.Error);
                pwErr.Print("Training ");
                pwErr.Println(trainTreebank.TextualSummary(tlp));
                if (secondaryTreebank != null)
                {
                    pwErr.Print("Secondary training ");
                    pwErr.Println(secondaryTreebank.TextualSummary(tlp));
                }
            }
            CompositeTreeTransformer trainTransformer = new CompositeTreeTransformer();

            if (op.trainOptions.preTransformer != null)
            {
                trainTransformer.AddTransformer(op.trainOptions.preTransformer);
            }
            if (op.trainOptions.collinsPunc)
            {
                CollinsPuncTransformer collinsPuncTransformer = new CollinsPuncTransformer(tlp);
                trainTransformer.AddTransformer(collinsPuncTransformer);
            }
            log.Info("Binarizing trees...");
            Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer binarizer;
            if (!op.trainOptions.leftToRight)
            {
                binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
            }
            else
            {
                binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
            }
            trainTransformer.AddTransformer(binarizer);
            if (op.wordFunction != null)
            {
                ITreeTransformer wordFunctionTransformer = new TreeLeafLabelTransformer(op.wordFunction);
                trainTransformer.AddTransformer(wordFunctionTransformer);
            }
            Treebank wholeTreebank;

            if (secondaryTreebank == null)
            {
                wholeTreebank = trainTreebank;
            }
            else
            {
                wholeTreebank = new CompositeTreebank(trainTreebank, secondaryTreebank);
            }
            if (op.trainOptions.selectiveSplit)
            {
                op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
                RemoveDeleteSplittersFromSplitters(tlp, op);
                if (op.testOptions.verbose)
                {
                    // Declared as List<T> so Sort() resolves; ordinal ordering keeps the output
                    // independent of the current culture.
                    List <string> list = new List <string>(op.trainOptions.splitters);
                    list.Sort(System.StringComparer.Ordinal);
                    // Concatenating the list itself would only print its type name.
                    log.Info("Parent split categories: " + "[" + string.Join(", ", list) + "]");
                }
            }
            if (op.trainOptions.selectivePostSplit)
            {
                // Do all the transformations once just to learn selective splits on annotated categories
                ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
                wholeTreebank = wholeTreebank.Transform(myTransformer);
                op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
                if (op.testOptions.verbose)
                {
                    log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
                }
            }
            if (op.trainOptions.hSelSplit)
            {
                // We run through all the trees once just to gather counts for hSelSplit!
                // Tree-transformation printing is silenced for this pass and restored afterwards.
                int ptt = op.trainOptions.printTreeTransformations;
                op.trainOptions.printTreeTransformations = 0;
                binarizer.SetDoSelectiveSplit(false);
                foreach (Tree tree in wholeTreebank)
                {
                    trainTransformer.TransformTree(tree);
                }
                binarizer.SetDoSelectiveSplit(true);
                op.trainOptions.printTreeTransformations = ptt;
            }
            // we've done all the setup now. here's where the train treebank is transformed.
            trainTreebank = trainTreebank.Transform(trainTransformer);
            if (secondaryTreebank != null)
            {
                secondaryTreebank = secondaryTreebank.Transform(trainTransformer);
            }
            if (op.trainOptions.printAnnotatedStateCounts)
            {
                binarizer.PrintStateCounts();
            }
            if (op.trainOptions.printAnnotatedRuleCounts)
            {
                binarizer.PrintRuleCounts();
            }
            if (tuneTreebank != null)
            {
                tuneTreebank = tuneTreebank.Transform(trainTransformer);
            }
            if (op.testOptions.verbose)
            {
                binarizer.DumpStats();
            }
            return(new Triple <Treebank, Treebank, Treebank>(trainTreebank, secondaryTreebank, tuneTreebank));
        }
Esempio n. 9
0
        /// <summary>
        /// Annotates and binarizes every tree of the treebank and returns those whose yield
        /// length minus one does not exceed <c>trainLengthLimit</c>.
        /// </summary>
        /// <param name="trainTreebank">Treebank to transform.</param>
        /// <returns>The binarized trees that passed the length filter, in treebank order.</returns>
        public virtual IList <Tree> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank)
        {
            ITreebankLangParserParams tlpParams = op.tlpParams;
            ITreebankLanguagePack     tlp       = tlpParams.TreebankLanguagePack();

            if (Verbose)
            {
                log.Info("\n\n" + trainTreebank.TextualSummary(tlp));
            }
            log.Info("Binarizing trees...");
            TreeAnnotatorAndBinarizer binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);

            Timing.Tick("done.");
            if (op.trainOptions.selectiveSplit)
            {
                op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
                RemoveDeleteSplittersFromSplitters(tlp);
                if (op.testOptions.verbose)
                {
                    // Declared as List<T> so Sort() resolves; ordinal ordering keeps the output
                    // independent of the current culture.
                    List <string> list = new List <string>(op.trainOptions.splitters);
                    list.Sort(System.StringComparer.Ordinal);
                    // Concatenating the list itself would only print its type name.
                    log.Info("Parent split categories: " + "[" + string.Join(", ", list) + "]");
                }
            }
            //		if (op.trainOptions.selectivePostSplit) {
            //			// Do all the transformations once just to learn selective splits on annotated categories
            //			TreeTransformer myTransformer = new TreeAnnotator(tlpParams.headFinder(), tlpParams);
            //			Treebank annotatedTB = trainTreebank.transform(myTransformer);
            //			op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
            //			if (op.testOptions.verbose) {
            //				log.info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
            //			}
            //		}
            if (op.trainOptions.hSelSplit)
            {
                // We run through all the trees once just to gather counts for hSelSplit!
                // Tree-transformation printing is silenced for this pass and restored afterwards.
                int ptt = op.trainOptions.printTreeTransformations;
                op.trainOptions.printTreeTransformations = 0;
                binarizer.SetDoSelectiveSplit(false);
                foreach (Tree tree in trainTreebank)
                {
                    binarizer.TransformTree(tree);
                }
                binarizer.SetDoSelectiveSplit(true);
                op.trainOptions.printTreeTransformations = ptt;
            }
            //Tree transformation
            //
            IList <Tree> binaryTrainTrees = new List <Tree>();

            foreach (Tree tree_1 in trainTreebank)
            {
                // foreach variables are read-only in C#, so the result gets its own local.
                Tree binarized = binarizer.TransformTree(tree_1);
                if (binarized.Yield().Count - 1 <= trainLengthLimit)
                {
                    binaryTrainTrees.Add(binarized);
                }
            }
            // WSGDEBUG: Lot's of stuff on the grammar
            //    if(VERBOSE) {
            //      binarizer.printStateCounts();
            //      binarizer.printRuleCounts();
            //    binarizer.dumpStats();
            //    }
            return(binaryTrainTrees);
        }
 /// <summary>
 /// Convenience constructor that picks the tag projection from a flag and delegates to the
 /// constructor taking an explicit <c>ITagProjection</c>.
 /// </summary>
 /// <param name="tlpParams">Parser parameters; supplies the treebank language pack.</param>
 /// <param name="directional">Forwarded unchanged.</param>
 /// <param name="distance">Forwarded as the distance flag.</param>
 /// <param name="coarseDistance">Forwarded as the coarse-distance flag.</param>
 /// <param name="basicCategoryTagsInDependencyGrammar">
 /// When true a <c>BasicCategoryTagProjection</c> is used, otherwise a <c>TestTagProjection</c>.
 /// </param>
 /// <param name="op">Parser options.</param>
 /// <param name="wordIndex">Word index, forwarded unchanged.</param>
 /// <param name="tagIndex">Tag index, forwarded unchanged.</param>
 public MLEDependencyGrammar(ITreebankLangParserParams tlpParams, bool directional, bool distance, bool coarseDistance, bool basicCategoryTagsInDependencyGrammar, Options op, IIndex <string> wordIndex, IIndex <string> tagIndex)
     // The cast gives both conditional branches a common type; without it the expression has
     // no natural type on compilers older than C# 9.
     : this(basicCategoryTagsInDependencyGrammar ? (ITagProjection)new BasicCategoryTagProjection(tlpParams.TreebankLanguagePack()) : new TestTagProjection(), tlpParams, directional, distance, coarseDistance, op, wordIndex, tagIndex)
 {
 }
Esempio n. 11
0
		// initial value is -0xDEADBEEF (actually positive because of 2s complement)
		// Don't change this; set with -v
		/// <summary>
		/// Builds the tree printer for output from the configured output format and format
		/// options plus the language pack and head finders of the given parser params.
		/// </summary>
		/// <param name="tlpParams">The treebank parser params</param>
		/// <returns>A suitable tree printing object</returns>
		public virtual Edu.Stanford.Nlp.Trees.TreePrint TreePrint(ITreebankLangParserParams tlpParams)
		{
			// Collect the three language-specific collaborators first, then build the printer.
			var languagePack        = tlpParams.TreebankLanguagePack();
			var headFinder          = tlpParams.HeadFinder();
			var dependencyHeadFinder = tlpParams.TypedDependencyHeadFinder();

			return new Edu.Stanford.Nlp.Trees.TreePrint(
				outputFormat,
				outputFormatOptions,
				languagePack,
				headFinder,
				dependencyHeadFinder);
		}
        /// <summary>
        /// Command-line entry point: loads a gold and a guess treebank, pairs their trees up in
        /// order and feeds each pair with matching yield lengths to a <c>CollinsDepEval</c>,
        /// then prints the evaluation summary.
        /// </summary>
        /// <param name="args">Option flags followed by the gold file and the guess file.</param>
        public static void Main(string[] args)
        {
            if (args.Length < MinArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            Properties options       = StringUtils.ArgsToProperties(args, OptionArgDefs());
            bool       verbose       = PropertiesUtils.GetBool(options, "v", false);
            Language   language      = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
            int        maxGoldYield  = PropertiesUtils.GetInt(options, "g", int.MaxValue);
            int        maxGuessYield = PropertiesUtils.GetInt(options, "y", int.MaxValue);

            string[] parsedArgs = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
            if (parsedArgs.Length != MinArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            File goldFile  = new File(parsedArgs[0]);
            File guessFile = new File(parsedArgs[1]);
            ITreebankLangParserParams tlpp = language.@params;
            PrintWriter pwOut         = tlpp.Pw();
            Treebank    guessTreebank = tlpp.DiskTreebank();

            guessTreebank.LoadPath(guessFile);
            pwOut.Println("GUESS TREEBANK:");
            pwOut.Println(guessTreebank.TextualSummary());
            Treebank goldTreebank = tlpp.DiskTreebank();

            goldTreebank.LoadPath(goldFile);
            pwOut.Println("GOLD TREEBANK:");
            pwOut.Println(goldTreebank.TextualSummary());
            Edu.Stanford.Nlp.Parser.Metrics.CollinsDepEval depEval = new Edu.Stanford.Nlp.Parser.Metrics.CollinsDepEval("CollinsDep", true, tlpp.HeadFinder(), tlpp.TreebankLanguagePack().StartSymbol());
            ITreeTransformer tc = tlpp.Collinizer();
            //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
            //don't match, we need to keep looking for the next gold tree that matches.
            //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
            //status as follows:
            //
            //   0 - Ok (yields match)
            //   1 - length mismatch
            //   2 - null parse e.g. (()).
            //
            //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
            IEnumerator <Tree> goldItr = goldTreebank.GetEnumerator();
            int goldLineId             = 0;
            int skippedGuessTrees      = 0;

            foreach (Tree guess in guessTreebank)
            {
                Tree evalGuess = tc.TransformTree(guess);
                if (guess.Yield().Count > maxGuessYield)
                {
                    skippedGuessTrees++;
                    continue;
                }
                bool doneEval = false;
                // doneEval is tested FIRST: MoveNext() advances the enumerator, so calling it
                // after a successful evaluation would silently consume the next gold tree.
                while (!doneEval && goldItr.MoveNext())
                {
                    Tree gold     = goldItr.Current;
                    Tree evalGold = tc.TransformTree(gold);
                    goldLineId++;
                    if (gold.Yield().Count > maxGoldYield)
                    {
                        // Gold tree too long: try the next gold tree for this guess.
                        continue;
                    }
                    if (evalGold.Yield().Count != evalGuess.Yield().Count)
                    {
                        //Default evalb behavior -- skip this guess tree
                        pwOut.Println("Yield mismatch at gold line " + goldLineId);
                        skippedGuessTrees++;
                        break;
                    }
                    depEval.Evaluate(evalGuess, evalGold, ((verbose) ? pwOut : null));
                    //Move to the next guess parse
                    doneEval = true;
                }
            }
            pwOut.Println("================================================================================");
            if (skippedGuessTrees != 0)
            {
                pwOut.Printf("%s %d guess trees\n", ((maxGuessYield < int.MaxValue) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
            }
            depEval.Display(true, pwOut);
            pwOut.Close();
        }
Esempio n. 13
0
        /// <summary>Do the category splitting of the tree passed in.</summary>
        /// <remarks>
        /// This is initially called on the root node of a tree, and it recursively
        /// calls itself on children.  A depth first left-to-right traversal is
        /// done whereby a tree node's children are first transformed and then
        /// the parent is transformed.  At the time of calling, the original root
        /// always sits above the current node.  This routine can be assumed to,
        /// and does, change the tree passed in: it destructively modifies tree nodes,
        /// and makes new tree structure when it needs to.
        /// </remarks>
        /// <param name="t">
        /// The tree node to subcategorize.  A null node yields null and a leaf is
        /// returned unchanged.
        /// </param>
        /// <param name="root">
        /// The root of the tree.  It must contain
        /// <paramref name="t"/>
        /// or
        /// this code will throw a NullReferenceException.  When it is null, or is
        /// <paramref name="t"/> itself, the node is treated as having no parent and
        /// no grandparent.
        /// </param>
        /// <returns>
        /// The annotated tree: <paramref name="t"/> itself for leaves and for tags when
        /// tag splitting is disabled, otherwise whatever the language-specific
        /// <c>tlpParams.TransformTree</c> returns for the relabelled node.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// The head finder returned no head (or a head without a label) for a phrasal
        /// node, or the head's label carries no word or tag annotation.
        /// </exception>
        private Tree TransformTreeHelper(Tree t, Tree root)
        {
            if (t == null)
            {
                // handle null
                return(null);
            }
            if (t.IsLeaf())
            {
                //No need to change the label
                return(t);
            }
            string cat = t.Label().Value();
            Tree   parent;
            string parentStr;
            string grandParentStr;

            if (root == null || t.Equals(root))
            {
                parent    = null;
                parentStr = string.Empty;
            }
            else
            {
                parent    = t.Parent(root);
                parentStr = parent.Label().Value();
            }
            if (parent == null || parent.Equals(root))
            {
                grandParentStr = string.Empty;
            }
            else
            {
                grandParentStr = parent.Parent(root).Label().Value();
            }
            string baseParentStr      = tlpParams.TreebankLanguagePack().BasicCategory(parentStr);
            string baseGrandParentStr = tlpParams.TreebankLanguagePack().BasicCategory(grandParentStr);

            if (t.IsPreTerminal())
            {
                // handle tags; the word below is a leaf, so it needs no root context
                Tree   childResult = TransformTreeHelper(t.Children()[0], null);
                string word        = childResult.Value();
                // would be nicer if Word/CWT ??
                if (!trainOptions.noTagSplit)
                {
                    if (trainOptions.tagPA)
                    {
                        string test = cat + "^" + baseParentStr;
                        if (!trainOptions.tagSelectiveSplit || trainOptions.splitters.Contains(test))
                        {
                            cat = test;
                        }
                    }
                    // parent is null when this tag is the root (or no root was given);
                    // such a tag has no parent to be the only child of, so it is not marked.
                    if (trainOptions.markUnaryTags && parent != null && parent.NumChildren() == 1)
                    {
                        cat = cat + "^U";
                    }
                }
                // otherwise, leave the tags alone!
                // For a tag node the (possibly split) category doubles as the tag.
                RelabelWithAnnotations(t, cat, word, cat);
                // just in case word is changed
                t.SetChild(0, childResult);
                if (trainOptions.noTagSplit)
                {
                    return(t);
                }
                // language-specific transforms
                return(tlpParams.TransformTree(t, root));
            }
            // end isPreTerminal()
            // handle phrasal categories: children are transformed before their parent
            Tree[] kids = t.Children();
            for (int childNum = 0; childNum < kids.Length; childNum++)
            {
                // recursive call
                t.SetChild(childNum, TransformTreeHelper(kids[childNum], root));
            }
            Tree headChild = hf.DetermineHead(t);

            if (headChild == null || headChild.Label() == null)
            {
                throw new InvalidOperationException("TreeAnnotator: null head found for tree [suggesting incomplete/wrong HeadFinder]:\n" + t);
            }
            ILabel headLabel = headChild.Label();

            if (!(headLabel is IHasWord))
            {
                throw new InvalidOperationException("TreeAnnotator: Head label lacks a Word annotation!");
            }
            if (!(headLabel is IHasTag))
            {
                throw new InvalidOperationException("TreeAnnotator: Head label lacks a Tag annotation!");
            }
            string headWord = ((IHasWord)headLabel).Word();
            string headTag  = ((IHasTag)headLabel).Tag();
            string baseCat  = tlpParams.TreebankLanguagePack().BasicCategory(cat);

            /* Sister annotation. Potential problem: if multiple sisters are
             * strong indicators for a single category's expansions.  This
             * happens concretely in the Chinese Treebank when NP (object)
             * has left sisters VV and AS.  Could lead to too much
             * sparseness.  The ideal solution would be to give the
             * splitting list an ordering, and take only the highest (~most
             * informative/reliable) sister annotation.
             */
            if (trainOptions.sisterAnnotate && !trainOptions.smoothing && baseParentStr.Length > 0)
            {
                IList<string> leftSis  = ListBasicCategories(SisterAnnotationStats.LeftSisterLabels(t, parent));
                IList<string> rightSis = ListBasicCategories(SisterAnnotationStats.RightSisterLabels(t, parent));
                IList<string> leftAnn  = new List<string>();
                IList<string> rightAnn = new List<string>();
                foreach (string sister in leftSis)
                {
                    leftAnn.Add(baseCat + "=l=" + tlpParams.TreebankLanguagePack().BasicCategory(sister));
                }
                foreach (string sister in rightSis)
                {
                    rightAnn.Add(baseCat + "=r=" + tlpParams.TreebankLanguagePack().BasicCategory(sister));
                }
                // Only the first matching splitter is applied.
                foreach (string annCat in trainOptions.sisterSplitters)
                {
                    if (leftAnn.Contains(annCat) || rightAnn.Contains(annCat))
                    {
                        // NOTE(review): ReplaceAll is presumably the regex-based Java-compat
                        // extension, so "^" anchors the base category at the start of
                        // annCat and only the "=l=X" / "=r=X" suffix is appended -- confirm.
                        cat = cat + annCat.ReplaceAll("^" + baseCat, string.Empty);
                        break;
                    }
                }
            }
            // Parent annotation.
            if (trainOptions.Pa && !trainOptions.smoothing && baseParentStr.Length > 0)
            {
                string cat2 = baseCat + "^" + baseParentStr;
                if (!trainOptions.selectiveSplit || trainOptions.splitters.Contains(cat2))
                {
                    cat = cat + "^" + baseParentStr;
                }
            }
            // Grandparent annotation; when selective, it is only added on top of an
            // existing "^" split.
            if (trainOptions.gPA && !trainOptions.smoothing && grandParentStr.Length > 0)
            {
                if (trainOptions.selectiveSplit)
                {
                    string cat2 = baseCat + "^" + baseParentStr + "~" + baseGrandParentStr;
                    if (cat.Contains("^") && trainOptions.splitters.Contains(cat2))
                    {
                        cat = cat + "~" + baseGrandParentStr;
                    }
                }
                else
                {
                    cat = cat + "~" + baseGrandParentStr;
                }
            }
            if (trainOptions.markUnary > 0)
            {
                if (trainOptions.markUnary == 1 && kids.Length == 1 && kids[0].Depth() >= 2)
                {
                    // mode 1: mark a node that has a single phrasal child
                    cat = cat + "-U";
                }
                else
                {
                    if (trainOptions.markUnary == 2 && parent != null && parent.NumChildren() == 1 && t.Depth() >= 2)
                    {
                        // mode 2: mark a phrasal node that is itself an only child
                        cat = cat + "-u";
                    }
                }
            }
            if (trainOptions.rightRec && RightRec(t, baseCat))
            {
                cat = cat + "-R";
            }
            if (trainOptions.leftRec && LeftRec(t, baseCat))
            {
                cat = cat + "-L";
            }
            if (trainOptions.splitPrePreT && t.IsPrePreTerminal())
            {
                cat = cat + "-PPT";
            }
            RelabelWithAnnotations(t, cat, headWord, headTag);
            return(tlpParams.TransformTree(t, root));
        }

        /// <summary>
        /// Replaces the label of <paramref name="node"/> with a fresh label made by the
        /// current label's own factory, carrying the given category, word and tag.
        /// </summary>
        /// <remarks>
        /// The value is always set; category, word and tag are set only when the new
        /// label implements the matching interface.
        /// </remarks>
        /// <param name="node">The tree node to relabel in place.</param>
        /// <param name="cat">The (annotated) category, also used as the label value.</param>
        /// <param name="word">The head word to record on the label.</param>
        /// <param name="tag">The head tag to record on the label.</param>
        private static void RelabelWithAnnotations(Tree node, string cat, string word, string tag)
        {
            ILabel label = node.Label().LabelFactory().NewLabel(node.Label());

            label.SetValue(cat);
            if (label is IHasCategory)
            {
                ((IHasCategory)label).SetCategory(cat);
            }
            if (label is IHasWord)
            {
                ((IHasWord)label).SetWord(word);
            }
            if (label is IHasTag)
            {
                ((IHasTag)label).SetTag(tag);
            }
            node.SetLabel(label);
        }
        /// <summary>
        /// Builds an annotated copy of the subtree rooted at <paramref name="t"/>,
        /// creating every node through <paramref name="tf"/>.
        /// </summary>
        /// <remarks>
        /// Leaves become new leaves labelled with a <c>Word</c> holding the original
        /// value.  Every other node is counted in <c>nonTerms</c> under its original
        /// label and rebuilt with a <c>CategoryWordTag</c> label whose word is the
        /// node's head terminal.  Phrasal (non-pre-terminal) nodes may additionally
        /// receive a "^parent" and/or "~grandparent" suffix, as governed by the
        /// post-split settings in <c>trainOptions</c>.
        /// </remarks>
        /// <param name="t">The node to rebuild.</param>
        /// <param name="root">
        /// The root of the tree containing <paramref name="t"/>; used to look up the
        /// parent and grandparent.  When it is null, or is <paramref name="t"/> itself,
        /// the node is treated as having neither.
        /// </param>
        /// <param name="tf">The factory used to create the new leaves and nodes.</param>
        /// <returns>The newly built tree corresponding to <paramref name="t"/>.</returns>
        public virtual Tree TransformTreeHelper(Tree t, Tree root, ITreeFactory tf)
        {
            // Resolve the labels of the node's parent and grandparent (empty when absent).
            Tree   mother      = null;
            string motherLabel = string.Empty;

            if (root != null && !t.Equals(root))
            {
                mother      = t.Parent(root);
                motherLabel = mother.Label().Value();
            }

            string grandmotherLabel = string.Empty;

            if (mother != null && !mother.Equals(root))
            {
                grandmotherLabel = mother.Parent(root).Label().Value();
            }

            string annotated       = t.Label().Value();
            string motherBase      = tlpParams.TreebankLanguagePack().BasicCategory(motherLabel);
            string grandmotherBase = tlpParams.TreebankLanguagePack().BasicCategory(grandmotherLabel);

            // Words are copied as-is and never annotated.
            if (t.IsLeaf())
            {
                return tf.NewLeaf(new Word(t.Label().Value()));
            }

            string headWord = t.HeadTerminal(hf).Value();

            // Tags and phrasal categories are both counted; only phrasal ones get split.
            bool isTag = t.IsPreTerminal();

            nonTerms.IncrementCount(t.Label().Value());
            if (!isTag)
            {
                // Parent annotation.
                if (trainOptions.postPA && !trainOptions.smoothing && motherBase.Length > 0)
                {
                    string candidate = annotated + '^' + (trainOptions.postSplitWithBaseCategory ? motherBase : motherLabel);

                    if (!trainOptions.selectivePostSplit || trainOptions.postSplitters.Contains(candidate))
                    {
                        annotated = candidate;
                    }
                }
                // Grandparent annotation; when selective, it requires an existing "^" split.
                if (trainOptions.postGPA && !trainOptions.smoothing && grandmotherLabel.Length > 0)
                {
                    string candidate = annotated + '~' + (trainOptions.postSplitWithBaseCategory ? grandmotherBase : grandmotherLabel);

                    if (!trainOptions.selectivePostSplit || (annotated.Contains("^") && trainOptions.postSplitters.Contains(candidate)))
                    {
                        annotated = candidate;
                    }
                }
            }

            // Create the new node first, then rebuild and attach its children in order.
            Tree        rebuilt      = tf.NewTreeNode(new CategoryWordTag(annotated, headWord, annotated), Collections.EmptyList<Tree>());
            List<Tree>  rebuiltKids  = new List<Tree>();
            Tree[]      originalKids = t.Children();

            for (int i = 0; i < originalKids.Length; i++)
            {
                rebuiltKids.Add(TransformTreeHelper(originalKids[i], root, tf));
            }
            rebuilt.SetChildren(rebuiltKids);
            return rebuilt;
        }