コード例 #1
0
        private void TestOnTreebank(LexicalizedParser pd, ITreebankLangParserParams tlpParams, Treebank testTreebank, string treebankRoot, IIndex <string> stateIndex)
        {
            Timing.StartTime();
            ITreeTransformer annotator = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);

            // CDM: Aug 2004: With new implementation of treebank split categories,
            // I've hardwired this to load English ones.  Otherwise need training data.
            // op.trainOptions.splitters = new HashSet(Arrays.asList(op.tlpParams.splitters()));
            op.trainOptions.splitters       = ParentAnnotationStats.GetEnglishSplitCategories(treebankRoot);
            op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(op.tlpParams.SisterSplitters()));
            foreach (Tree goldTree in testTreebank)
            {
                goldTree = annotator.TransformTree(goldTree);
                //      System.out.println();
                //      System.out.println("Checking tree: " + goldTree);
                foreach (Tree localTree in goldTree)
                {
                    // now try to use the grammar to score this local tree
                    if (localTree.IsLeaf() || localTree.IsPreTerminal() || localTree.Children().Length < 2)
                    {
                        continue;
                    }
                    System.Console.Out.WriteLine(LocalTreeToRule(localTree));
                    double score = ComputeLocalTreeScore(localTree, stateIndex, pd);
                    if (score == double.NegativeInfinity)
                    {
                    }
                    //          System.out.println(localTreeToRule(localTree));
                    System.Console.Out.WriteLine("score: " + score);
                }
            }
        }
コード例 #2
0
        public BinarizerAnnotator(string annotatorName, Properties props)
        {
            this.tlppClass = props.GetProperty(annotatorName + ".tlppClass", DefaultTlppClass);
            ITreebankLangParserParams tlpp = ReflectionLoading.LoadByReflection(tlppClass);

            this.binarizer = TreeBinarizer.SimpleTreeBinarizer(tlpp.HeadFinder(), tlpp.TreebankLanguagePack());
        }
コード例 #3
0
        /// <summary>Lets you test out the TreeBinarizer on the command line.</summary>
        /// <remarks>
        /// Lets you test out the TreeBinarizer on the command line.
        /// This main method doesn't yet handle as many flags as one would like.
        /// But it does have:
        /// <ul>
        /// <li> -tlp TreebankLanguagePack
        /// <li>-tlpp TreebankLangParserParams
        /// <li>-insideFactor
        /// <li>-markovOrder
        /// </ul>
        /// </remarks>
        /// <param name="args">
        /// Command line arguments: flags as above, as above followed by
        /// treebankPath
        /// </param>
        public static void Main(string[] args)
        {
            ITreebankLangParserParams tlpp = null;
            // TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            // TreeReaderFactory trf = new LabeledScoredTreeReaderFactory();
            // Looks like it must build CategoryWordTagFactory!!
            ITreeReaderFactory    trf     = null;
            string                fileExt = "mrg";
            IHeadFinder           hf      = new ModCollinsHeadFinder();
            ITreebankLanguagePack tlp     = new PennTreebankLanguagePack();
            bool   insideFactor           = false;
            bool   mf               = false;
            int    mo               = 1;
            bool   uwl              = false;
            bool   uat              = false;
            double sst              = 20.0;
            bool   mfs              = false;
            bool   simpleLabels     = false;
            bool   noRebinarization = false;
            int    i = 0;

            while (i < args.Length && args[i].StartsWith("-"))
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tlp") && i + 1 < args.Length)
                {
                    try
                    {
                        tlp = (ITreebankLanguagePack)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
                    }
                    catch (Exception e)
                    {
                        log.Info("Couldn't instantiate: " + args[i + 1]);
                        throw new Exception(e);
                    }
                    i++;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tlpp") && i + 1 < args.Length)
                    {
                        try
                        {
                            tlpp = (ITreebankLangParserParams)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
                        }
                        catch (Exception e)
                        {
                            log.Info("Couldn't instantiate: " + args[i + 1]);
                            throw new Exception(e);
                        }
                        i++;
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-insideFactor"))
                        {
                            insideFactor = true;
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-markovOrder") && i + 1 < args.Length)
                            {
                                i++;
                                mo = System.Convert.ToInt32(args[i]);
                            }
                            else
                            {
                                if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-simpleLabels"))
                                {
                                    simpleLabels = true;
                                }
                                else
                                {
                                    if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-noRebinarization"))
                                    {
                                        noRebinarization = true;
                                    }
                                    else
                                    {
                                        log.Info("Unknown option:" + args[i]);
                                    }
                                }
                            }
                        }
                    }
                }
                i++;
            }
            if (i >= args.Length)
            {
                log.Info("usage: java TreeBinarizer [-tlpp class|-markovOrder int|...] treebankPath");
                System.Environment.Exit(0);
            }
            Treebank treebank;

            if (tlpp != null)
            {
                treebank = tlpp.MemoryTreebank();
                tlp      = tlpp.TreebankLanguagePack();
                fileExt  = tlp.TreebankFileExtension();
                hf       = tlpp.HeadFinder();
            }
            else
            {
                treebank = new DiskTreebank(trf);
            }
            treebank.LoadPath(args[i], fileExt, true);
            ITreeTransformer tt = new Edu.Stanford.Nlp.Parser.Lexparser.TreeBinarizer(hf, tlp, insideFactor, mf, mo, uwl, uat, sst, mfs, simpleLabels, noRebinarization);

            foreach (Tree t in treebank)
            {
                Tree newT = tt.TransformTree(t);
                System.Console.Out.WriteLine("Original tree:");
                t.PennPrint();
                System.Console.Out.WriteLine("Binarized tree:");
                newT.PennPrint();
                System.Console.Out.WriteLine();
            }
        }
コード例 #4
0
        private static Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter GetSegmenterDataFromTreebank(Treebank trainTreebank, Options op, IIndex <string> wordIndex, IIndex <string> tagIndex)
        {
            System.Console.Out.WriteLine("Currently " + new DateTime());
            //    printOptions(true, op);
            Timing.StartTime();
            // setup tree transforms
            ITreebankLangParserParams tlpParams = op.tlpParams;

            if (op.testOptions.verbose)
            {
                System.Console.Out.Write("Training ");
                System.Console.Out.WriteLine(trainTreebank.TextualSummary());
            }
            System.Console.Out.Write("Binarizing trees...");
            TreeAnnotatorAndBinarizer binarizer;

            // initialized below
            if (!op.trainOptions.leftToRight)
            {
                binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
            }
            else
            {
                binarizer = new TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
            }
            CollinsPuncTransformer collinsPuncTransformer = null;

            if (op.trainOptions.collinsPunc)
            {
                collinsPuncTransformer = new CollinsPuncTransformer(tlpParams.TreebankLanguagePack());
            }
            IList <Tree> binaryTrainTrees = new List <Tree>();

            // List<Tree> binaryTuneTrees = new ArrayList<Tree>();
            if (op.trainOptions.selectiveSplit)
            {
                op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, true, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlpParams.TreebankLanguagePack());
                if (op.testOptions.verbose)
                {
                    log.Info("Parent split categories: " + op.trainOptions.splitters);
                }
            }
            if (op.trainOptions.selectivePostSplit)
            {
                ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
                Treebank         annotatedTB   = trainTreebank.Transform(myTransformer);
                op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlpParams.TreebankLanguagePack());
                if (op.testOptions.verbose)
                {
                    log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
                }
            }
            if (op.trainOptions.hSelSplit)
            {
                binarizer.SetDoSelectiveSplit(false);
                foreach (Tree tree in trainTreebank)
                {
                    if (op.trainOptions.collinsPunc)
                    {
                        tree = collinsPuncTransformer.TransformTree(tree);
                    }
                    tree = binarizer.TransformTree(tree);
                }
                binarizer.SetDoSelectiveSplit(true);
            }
            foreach (Tree tree_1 in trainTreebank)
            {
                if (op.trainOptions.collinsPunc)
                {
                    tree_1 = collinsPuncTransformer.TransformTree(tree_1);
                }
                tree_1 = binarizer.TransformTree(tree_1);
                binaryTrainTrees.Add(tree_1);
            }
            Timing.Tick("done.");
            if (op.testOptions.verbose)
            {
                binarizer.DumpStats();
            }
            System.Console.Out.Write("Extracting Lexicon...");
            Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter clex = (Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter)op.tlpParams.Lex(op, wordIndex, tagIndex);
            clex.InitializeTraining(binaryTrainTrees.Count);
            clex.Train(binaryTrainTrees);
            clex.FinishTraining();
            Timing.Tick("done.");
            return(clex);
        }
コード例 #5
0
 public TreeAnnotatorAndBinarizer(ITreebankLangParserParams tlpParams, bool forceCNF, bool insideFactor, bool doSubcategorization, Options op)
     : this(tlpParams.HeadFinder(), tlpParams.HeadFinder(), tlpParams, forceCNF, insideFactor, doSubcategorization, op)
 {
 }
コード例 #6
0
        /// <returns>A Triple of binaryTrainTreebank, binarySecondaryTreebank, binaryTuneTreebank.</returns>
        public static Triple <Treebank, Treebank, Treebank> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank, Treebank secondaryTreebank, Treebank tuneTreebank, Options op)
        {
            // setup tree transforms
            ITreebankLangParserParams tlpParams = op.tlpParams;
            ITreebankLanguagePack     tlp       = tlpParams.TreebankLanguagePack();

            if (op.testOptions.verbose)
            {
                PrintWriter pwErr = tlpParams.Pw(System.Console.Error);
                pwErr.Print("Training ");
                pwErr.Println(trainTreebank.TextualSummary(tlp));
                if (secondaryTreebank != null)
                {
                    pwErr.Print("Secondary training ");
                    pwErr.Println(secondaryTreebank.TextualSummary(tlp));
                }
            }
            CompositeTreeTransformer trainTransformer = new CompositeTreeTransformer();

            if (op.trainOptions.preTransformer != null)
            {
                trainTransformer.AddTransformer(op.trainOptions.preTransformer);
            }
            if (op.trainOptions.collinsPunc)
            {
                CollinsPuncTransformer collinsPuncTransformer = new CollinsPuncTransformer(tlp);
                trainTransformer.AddTransformer(collinsPuncTransformer);
            }
            log.Info("Binarizing trees...");
            Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer binarizer;
            if (!op.trainOptions.leftToRight)
            {
                binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
            }
            else
            {
                binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
            }
            trainTransformer.AddTransformer(binarizer);
            if (op.wordFunction != null)
            {
                ITreeTransformer wordFunctionTransformer = new TreeLeafLabelTransformer(op.wordFunction);
                trainTransformer.AddTransformer(wordFunctionTransformer);
            }
            Treebank wholeTreebank;

            if (secondaryTreebank == null)
            {
                wholeTreebank = trainTreebank;
            }
            else
            {
                wholeTreebank = new CompositeTreebank(trainTreebank, secondaryTreebank);
            }
            if (op.trainOptions.selectiveSplit)
            {
                op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
                RemoveDeleteSplittersFromSplitters(tlp, op);
                if (op.testOptions.verbose)
                {
                    IList <string> list = new List <string>(op.trainOptions.splitters);
                    list.Sort();
                    log.Info("Parent split categories: " + list);
                }
            }
            if (op.trainOptions.selectivePostSplit)
            {
                // Do all the transformations once just to learn selective splits on annotated categories
                ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
                wholeTreebank = wholeTreebank.Transform(myTransformer);
                op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
                if (op.testOptions.verbose)
                {
                    log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
                }
            }
            if (op.trainOptions.hSelSplit)
            {
                // We run through all the trees once just to gather counts for hSelSplit!
                int ptt = op.trainOptions.printTreeTransformations;
                op.trainOptions.printTreeTransformations = 0;
                binarizer.SetDoSelectiveSplit(false);
                foreach (Tree tree in wholeTreebank)
                {
                    trainTransformer.TransformTree(tree);
                }
                binarizer.SetDoSelectiveSplit(true);
                op.trainOptions.printTreeTransformations = ptt;
            }
            // we've done all the setup now. here's where the train treebank is transformed.
            trainTreebank = trainTreebank.Transform(trainTransformer);
            if (secondaryTreebank != null)
            {
                secondaryTreebank = secondaryTreebank.Transform(trainTransformer);
            }
            if (op.trainOptions.printAnnotatedStateCounts)
            {
                binarizer.PrintStateCounts();
            }
            if (op.trainOptions.printAnnotatedRuleCounts)
            {
                binarizer.PrintRuleCounts();
            }
            if (tuneTreebank != null)
            {
                tuneTreebank = tuneTreebank.Transform(trainTransformer);
            }
            if (op.testOptions.verbose)
            {
                binarizer.DumpStats();
            }
            return(new Triple <Treebank, Treebank, Treebank>(trainTreebank, secondaryTreebank, tuneTreebank));
        }
コード例 #7
0
		// initial value is -0xDEADBEEF (actually positive because of 2s complement)
		// Don't change this; set with -v
		/// <summary>Determines method for print trees on output.</summary>
		/// <param name="tlpParams">The treebank parser params</param>
		/// <returns>A suitable tree printing object</returns>
		public virtual Edu.Stanford.Nlp.Trees.TreePrint TreePrint(ITreebankLangParserParams tlpParams)
		{
			ITreebankLanguagePack tlp = tlpParams.TreebankLanguagePack();
			return new Edu.Stanford.Nlp.Trees.TreePrint(outputFormat, outputFormatOptions, tlp, tlpParams.HeadFinder(), tlpParams.TypedDependencyHeadFinder());
		}
コード例 #8
0
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length < MinArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            Properties options       = StringUtils.ArgsToProperties(args, OptionArgDefs());
            bool       Verbose       = PropertiesUtils.GetBool(options, "v", false);
            Language   Language      = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
            int        MaxGoldYield  = PropertiesUtils.GetInt(options, "g", int.MaxValue);
            int        MaxGuessYield = PropertiesUtils.GetInt(options, "y", int.MaxValue);

            string[] parsedArgs = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
            if (parsedArgs.Length != MinArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            File goldFile  = new File(parsedArgs[0]);
            File guessFile = new File(parsedArgs[1]);
            ITreebankLangParserParams tlpp = Language.@params;
            PrintWriter pwOut         = tlpp.Pw();
            Treebank    guessTreebank = tlpp.DiskTreebank();

            guessTreebank.LoadPath(guessFile);
            pwOut.Println("GUESS TREEBANK:");
            pwOut.Println(guessTreebank.TextualSummary());
            Treebank goldTreebank = tlpp.DiskTreebank();

            goldTreebank.LoadPath(goldFile);
            pwOut.Println("GOLD TREEBANK:");
            pwOut.Println(goldTreebank.TextualSummary());
            Edu.Stanford.Nlp.Parser.Metrics.CollinsDepEval depEval = new Edu.Stanford.Nlp.Parser.Metrics.CollinsDepEval("CollinsDep", true, tlpp.HeadFinder(), tlpp.TreebankLanguagePack().StartSymbol());
            ITreeTransformer tc = tlpp.Collinizer();
            //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
            //don't match, we need to keep looking for the next gold tree that matches.
            //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
            //status as follows:
            //
            //   0 - Ok (yields match)
            //   1 - length mismatch
            //   2 - null parse e.g. (()).
            //
            //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
            IEnumerator <Tree> goldItr = goldTreebank.GetEnumerator();
            int goldLineId             = 0;
            int skippedGuessTrees      = 0;

            foreach (Tree guess in guessTreebank)
            {
                Tree evalGuess = tc.TransformTree(guess);
                if (guess.Yield().Count > MaxGuessYield)
                {
                    skippedGuessTrees++;
                    continue;
                }
                bool doneEval = false;
                while (goldItr.MoveNext() && !doneEval)
                {
                    Tree gold     = goldItr.Current;
                    Tree evalGold = tc.TransformTree(gold);
                    goldLineId++;
                    if (gold.Yield().Count > MaxGoldYield)
                    {
                        continue;
                    }
                    else
                    {
                        if (evalGold.Yield().Count != evalGuess.Yield().Count)
                        {
                            pwOut.Println("Yield mismatch at gold line " + goldLineId);
                            skippedGuessTrees++;
                            break;
                        }
                    }
                    //Default evalb behavior -- skip this guess tree
                    depEval.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
                    doneEval = true;
                }
            }
            //Move to the next guess parse
            pwOut.Println("================================================================================");
            if (skippedGuessTrees != 0)
            {
                pwOut.Printf("%s %d guess trees\n", ((MaxGuessYield < int.MaxValue) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
            }
            depEval.Display(true, pwOut);
            pwOut.Close();
        }
コード例 #9
0
 public PostSplitter(ITreebankLangParserParams tlpParams, Options op)
 {
     this.tlpParams    = tlpParams;
     this.hf           = tlpParams.HeadFinder();
     this.trainOptions = op.trainOptions;
 }