Code Example #1
        public virtual bool Run(File trainTreebankFile, File testTreebankFile, InputStream inputStream)
        {
            op           = new Options();
            op.tlpParams = new ArabicTreebankParserParams();
            op.SetOptions("-arabicFactored");
            op.testOptions.maxLength = maxSentLen;
            op.testOptions.MaxItems  = 5000000;
            //500000 is the default for Arabic, but we have substantially more edges now
            op.testOptions.outputFormatOptions = "removeTopBracket,includePunctuationDependencies";
            // WSG: Just set this to some high value so that extractBestParse()
            // actually calls the lattice reader (e.g., this says that we can't have a word longer than
            // 80 characters...seems sensible for Arabic)
            op.testOptions.maxSpanForTags = 80;
            treePrint           = op.testOptions.TreePrint(op.tlpParams);
            debinarizer         = new Debinarizer(op.forceCNF, new CategoryWordTagFactory());
            subcategoryStripper = op.tlpParams.SubcategoryStripper();
            Timing.StartTime();
            Treebank trainTreebank = op.tlpParams.DiskTreebank();

            trainTreebank.LoadPath(trainTreebankFile);
            lp = GetParserDataFromTreebank(trainTreebank);
            MakeParsers();
            if (Verbose)
            {
                op.Display();
                string lexNumRules = (pparser != null) ? lp.lex.NumRules().ToString() : string.Empty;
                log.Info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
                log.Info("Grammar\t" + lp.stateIndex.Size() + '\t' + lp.tagIndex.Size() + '\t' + lp.wordIndex.Size() + '\t' + (pparser != null ? lp.ug.NumRules() : string.Empty) + '\t' + (pparser != null ? lp.bg.NumRules() : string.Empty) + '\t' + lexNumRules
                         );
                log.Info("ParserPack is " + op.tlpParams.GetType().FullName);
                log.Info("Lexicon is " + lp.lex.GetType().FullName);
            }
            return(Parse(inputStream));
        }
Code Example #2
        /// <summary>
        /// This is hardwired to calculate the split categories from English
        /// Penn Treebank sections 2-21 with a default cutoff of 300 (as used
        /// in ACL03PCFG).
        /// </summary>
        /// <remarks>
        /// This is hardwired to calculate the split categories from English
        /// Penn Treebank sections 2-21 with a default cutoff of 300 (as used
        /// in ACL03PCFG).  It was added to support upgrading of code in cases where no
        /// Treebank was available and the pre-stored list was being used.
        /// </remarks>
        public static ICollection <string> GetEnglishSplitCategories(string treebankRoot)
        {
            ITreebankLangParserParams tlpParams = new EnglishTreebankParserParams();
            Treebank trees = tlpParams.MemoryTreebank();

            trees.LoadPath(treebankRoot, new NumberRangeFileFilter(200, 2199, true));
            return(GetSplitCategories(trees, 300.0, tlpParams.TreebankLanguagePack()));
        }
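
A minimal usage sketch, not part of the original snippet: it calls the method above with a hypothetical treebank root (the path is a placeholder) and prints how many split categories came back.

        // Sketch: compute the hardwired English split categories (WSJ sections 2-21,
        // cutoff 300) from a local treebank copy; "/path/to/wsj/mrg" is a placeholder.
        ICollection<string> splitters =
            ParentAnnotationStats.GetEnglishSplitCategories("/path/to/wsj/mrg");
        System.Console.Out.WriteLine("Found " + splitters.Count + " split categories");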
Code Example #3
        public virtual IList <Tree> ReadBinarizedTreebank(string treebankPath, IFileFilter treebankFilter)
        {
            Treebank     treebank  = ReadTreebank(treebankPath, treebankFilter);
            IList <Tree> binarized = BinarizeTreebank(treebank, op);

            log.Info("Converted trees to binarized format");
            return(binarized);
        }
Code Example #4
        protected virtual void SetUp()
        {
            Options  op       = new Options();
            Treebank treebank = op.tlpParams.MemoryTreebank();

            Sharpen.Collections.AddAll(treebank, Arrays.AsList(correctTrees));
            binarizedTrees = ShiftReduceParser.BinarizeTreebank(treebank, op);
        }
Code Example #5
        public virtual Treebank ReadTreebank(string treebankPath, IFileFilter treebankFilter)
        {
            log.Info("Loading trees from " + treebankPath);
            Treebank treebank = op.tlpParams.MemoryTreebank();

            treebank.LoadPath(treebankPath, treebankFilter);
            log.Info("Read in " + treebank.Count + " trees from " + treebankPath);
            return(treebank);
        }
Code Example #6
        /// <summary>Call this method to get a String array of categories to split on.</summary>
        /// <remarks>
        /// Call this method to get a String array of categories to split on.
        /// It calculates parent annotation statistics suitable for doing
        /// selective parent splitting in the PCFGParser inside
        /// FactoredParser.  <p>
        /// If tlp is non-null tlp.basicCategory() will be called on parent and
        /// grandparent nodes. <p>
        /// <i>Implementation note:</i> This method is not designed for concurrent
        /// invocation: it uses static state variables.
        /// </remarks>
        public static ICollection <string> GetSplitCategories(Treebank t, bool doTags, int algorithm, double phrasalCutOff, double tagCutOff, ITreebankLanguagePack tlp)
        {
            Edu.Stanford.Nlp.Parser.Lexparser.ParentAnnotationStats pas = new Edu.Stanford.Nlp.Parser.Lexparser.ParentAnnotationStats(tlp, doTags);
            t.Apply(pas);
            ICollection <string> splitters = Generics.NewHashSet();

            Edu.Stanford.Nlp.Parser.Lexparser.ParentAnnotationStats.GetSplitters(phrasalCutOff, pas.nodeRules, pas.pRules, pas.gPRules, splitters);
            Edu.Stanford.Nlp.Parser.Lexparser.ParentAnnotationStats.GetSplitters(tagCutOff, pas.tagNodeRules, pas.tagPRules, pas.tagGPRules, splitters);
            return(splitters);
        }
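
A rough usage sketch of this overload, mirroring the selective-split setup in Example #16; the treebank path and the 300.0 cutoffs are illustrative assumptions rather than values taken from the original.

        // Sketch: load trees, gather parent-annotation statistics, and derive the
        // split categories; the path and cutoffs below are hypothetical.
        Options op = new Options();
        ITreebankLanguagePack tlp = op.tlpParams.TreebankLanguagePack();
        Treebank treebank = op.tlpParams.MemoryTreebank();
        treebank.LoadPath("/path/to/treebank");
        ICollection<string> splitters = ParentAnnotationStats.GetSplitCategories(
            treebank, true, 0, 300.0, 300.0, tlp);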
Code Example #7
        public static void Main(string[] args)
        {
            ITreebankLangParserParams tlpParams = new ChineseTreebankParserParams();
            ITreebankLanguagePack     ctlp      = tlpParams.TreebankLanguagePack();
            Options       op = new Options(tlpParams);
            TreeAnnotator ta = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);

            log.Info("Reading Trees...");
            IFileFilter trainFilter   = new NumberRangesFileFilter(args[1], true);
            Treebank    trainTreebank = tlpParams.MemoryTreebank();

            trainTreebank.LoadPath(args[0], trainFilter);
            log.Info("Annotating trees...");
            ICollection <Tree> trainTrees = new List <Tree>();

            foreach (Tree tree in trainTreebank)
            {
                trainTrees.Add(ta.TransformTree(tree));
            }
            trainTreebank = null;
            // saves memory
            log.Info("Training lexicon...");
            IIndex <string> wordIndex    = new HashIndex <string>();
            IIndex <string> tagIndex     = new HashIndex <string>();
            int             featureLevel = DefaultFeatureLevel;

            if (args.Length > 3)
            {
                featureLevel = System.Convert.ToInt32(args[3]);
            }
            Edu.Stanford.Nlp.Parser.Lexparser.ChineseMaxentLexicon lex = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseMaxentLexicon(op, wordIndex, tagIndex, featureLevel);
            lex.InitializeTraining(trainTrees.Count);
            lex.Train(trainTrees);
            lex.FinishTraining();
            log.Info("Testing");
            IFileFilter testFilter   = new NumberRangesFileFilter(args[2], true);
            Treebank    testTreebank = tlpParams.MemoryTreebank();

            testTreebank.LoadPath(args[0], testFilter);
            IList <TaggedWord> testWords = new List <TaggedWord>();

            foreach (Tree t in testTreebank)
            {
                foreach (TaggedWord tw in t.TaggedYield())
                {
                    testWords.Add(tw);
                }
            }
            //testWords.addAll(t.taggedYield());
            int[] totalAndCorrect = lex.TestOnTreebank(testWords);
            log.Info("done.");
            System.Console.Out.WriteLine(totalAndCorrect[1] + " correct out of " + totalAndCorrect[0] + " -- ACC: " + ((double)totalAndCorrect[1]) / totalAndCorrect[0]);
        }
Code Example #8
        private void Train(IList <Pair <string, IFileFilter> > trainTreebankPath, Pair <string, IFileFilter> devTreebankPath, string serializedPath)
        {
            log.Info("Training method: " + op.TrainOptions().trainingMethod);
            IList <Tree> binarizedTrees = Generics.NewArrayList();

            foreach (Pair <string, IFileFilter> treebank in trainTreebankPath)
            {
                Sharpen.Collections.AddAll(binarizedTrees, ReadBinarizedTreebank(treebank.First(), treebank.Second()));
            }
            int nThreads = op.trainOptions.trainingThreads;

            nThreads = nThreads <= 0 ? Runtime.GetRuntime().AvailableProcessors() : nThreads;
            Edu.Stanford.Nlp.Tagger.Common.Tagger tagger = null;
            if (op.testOptions.preTag)
            {
                Timing retagTimer = new Timing();
                tagger = Edu.Stanford.Nlp.Tagger.Common.Tagger.LoadModel(op.testOptions.taggerSerializedFile);
                RedoTags(binarizedTrees, tagger, nThreads);
                retagTimer.Done("Retagging");
            }
            ICollection <string> knownStates    = FindKnownStates(binarizedTrees);
            ICollection <string> rootStates     = FindRootStates(binarizedTrees);
            ICollection <string> rootOnlyStates = FindRootOnlyStates(binarizedTrees, rootStates);

            log.Info("Known states: " + knownStates);
            log.Info("States which occur at the root: " + rootStates);
            log.Info("States which only occur at the root: " + rootStates);
            Timing transitionTimer = new Timing();
            IList <IList <ITransition> > transitionLists = CreateTransitionSequence.CreateTransitionSequences(binarizedTrees, op.compoundUnaries, rootStates, rootOnlyStates);
            IIndex <ITransition>         transitionIndex = new HashIndex <ITransition>();

            foreach (IList <ITransition> transitions in transitionLists)
            {
                transitionIndex.AddAll(transitions);
            }
            transitionTimer.Done("Converting trees into transition lists");
            log.Info("Number of transitions: " + transitionIndex.Size());
            Random   random      = new Random(op.trainOptions.randomSeed);
            Treebank devTreebank = null;

            if (devTreebankPath != null)
            {
                devTreebank = ReadTreebank(devTreebankPath.First(), devTreebankPath.Second());
            }
            PerceptronModel newModel = new PerceptronModel(this.op, transitionIndex, knownStates, rootStates, rootOnlyStates);

            newModel.TrainModel(serializedPath, tagger, random, binarizedTrees, transitionLists, devTreebank, nThreads);
            this.model = newModel;
        }
Code Example #9
        public virtual void RunTest(string[] args)
        {
            // get a parser from file
            LexicalizedParser pd = ((LexicalizedParser)LexicalizedParser.LoadModel(args[0]));

            op = pd.GetOp();
            // in case serialized options were read in
            Treebank testTreebank = op.tlpParams.MemoryTreebank();
            int      testlow      = System.Convert.ToInt32(args[2]);
            int      testhigh     = System.Convert.ToInt32(args[3]);

            testTreebank.LoadPath(args[1], new NumberRangeFileFilter(testlow, testhigh, true));
            op.SetOptionsOrWarn(args, 4, args.Length);
            TestOnTreebank(pd, new EnglishTreebankParserParams(), testTreebank, args[1], pd.stateIndex);
        }
Code Example #10
        private static Treebank MakeTreebank(string treebankPath, Options op, IFileFilter filt)
        {
            log.Info("Training a segmenter from treebank dir: " + treebankPath);
            Treebank trainTreebank = op.tlpParams.MemoryTreebank();

            log.Info("Reading trees...");
            if (filt == null)
            {
                trainTreebank.LoadPath(treebankPath);
            }
            else
            {
                trainTreebank.LoadPath(treebankPath, filt);
            }
            Timing.Tick("done [read " + trainTreebank.Count + " trees].");
            return(trainTreebank);
        }
Code Example #11
        private static IList <FactoredLexiconEvent> GetTuningSet(Treebank devTreebank, Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon, ITreebankLangParserParams tlpp)
        {
            IList <Tree> devTrees = new List <Tree>(3000);

            foreach (Tree tree in devTreebank)
            {
                foreach (Tree subTree in tree)
                {
                    if (!subTree.IsLeaf())
                    {
                        tlpp.TransformTree(subTree, tree);
                    }
                }
                devTrees.Add(tree);
            }
            IList <FactoredLexiconEvent> tuningSet = TreebankToLexiconEvents(devTrees, lexicon);

            return(tuningSet);
        }
Code Example #12
 public TreeTaggedFileReader(TaggedFileRecord record)
 {
     // int numSentences = 0;
     filename    = record.file;
     trf         = record.trf == null ? new LabeledScoredTreeReaderFactory() : record.trf;
     transformer = record.treeTransformer;
     normalizer  = record.treeNormalizer;
     treeFilter  = record.treeFilter;
     treebank    = new DiskTreebank(trf, record.encoding);
     if (record.treeRange != null)
     {
         treebank.LoadPath(filename, record.treeRange);
     }
     else
     {
         treebank.LoadPath(filename);
     }
     treeIterator = treebank.GetEnumerator();
     FindNext();
 }
Code Example #13
        public virtual LexicalizedParser GetParserDataFromTreebank(Treebank trainTreebank)
        {
            log.Info("Binarizing training trees...");
            IList <Tree> binaryTrainTrees = GetAnnotatedBinaryTreebankFromTreebank(trainTreebank);

            Timing.Tick("done.");
            IIndex <string> stateIndex = new HashIndex <string>();

            log.Info("Extracting PCFG...");
            IExtractor <Pair <UnaryGrammar, BinaryGrammar> > bgExtractor = new BinaryGrammarExtractor(op, stateIndex);
            Pair <UnaryGrammar, BinaryGrammar> bgug = bgExtractor.Extract(binaryTrainTrees);
            BinaryGrammar bg = bgug.second;

            bg.SplitRules();
            UnaryGrammar ug = bgug.first;

            ug.PurgeRules();
            Timing.Tick("done.");
            log.Info("Extracting Lexicon...");
            IIndex <string> wordIndex = new HashIndex <string>();
            IIndex <string> tagIndex  = new HashIndex <string>();
            ILexicon        lex       = op.tlpParams.Lex(op, wordIndex, tagIndex);

            lex.InitializeTraining(binaryTrainTrees.Count);
            lex.Train(binaryTrainTrees);
            lex.FinishTraining();
            Timing.Tick("done.");
            IExtractor <IDependencyGrammar> dgExtractor = op.tlpParams.DependencyGrammarExtractor(op, wordIndex, tagIndex);
            IDependencyGrammar dg = null;

            if (op.doDep)
            {
                log.Info("Extracting Dependencies...");
                dg = dgExtractor.Extract(binaryTrainTrees);
                dg.SetLexicon(lex);
                Timing.Tick("done.");
            }
            log.Info("Done extracting grammars and lexicon.");
            return(new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op));
        }
Code Example #14
        private Distribution <int> GetSegmentedWordLengthDistribution(Treebank tb)
        {
            // CharacterLevelTagExtender ext = new CharacterLevelTagExtender();
            ClassicCounter <int> c = new ClassicCounter <int>();

            foreach (Tree gold in tb)
            {
                StringBuilder goldChars = new StringBuilder();
                ArrayList     goldYield = gold.Yield();
                foreach (object aGoldYield in goldYield)
                {
                    Word word = (Word)aGoldYield;
                    goldChars.Append(word);
                }
                IList <IHasWord> ourWords = Segment(goldChars.ToString());
                foreach (IHasWord ourWord in ourWords)
                {
                    c.IncrementCount(ourWord.Word().Length);
                }
            }
            return(Distribution.GetDistribution(c));
        }
Code Example #15
        public static IList <Tree> BinarizeTreebank(Treebank treebank, Options op)
        {
            TreeBinarizer binarizer = TreeBinarizer.SimpleTreeBinarizer(op.tlpParams.HeadFinder(), op.tlpParams.TreebankLanguagePack());
            BasicCategoryTreeTransformer basicTransformer = new BasicCategoryTreeTransformer(op.Langpack());
            CompositeTreeTransformer     transformer      = new CompositeTreeTransformer();

            transformer.AddTransformer(binarizer);
            transformer.AddTransformer(basicTransformer);
            treebank = treebank.Transform(transformer);
            IHeadFinder  binaryHeadFinder = new BinaryHeadFinder(op.tlpParams.HeadFinder());
            IList <Tree> binarizedTrees   = Generics.NewArrayList();

            foreach (Tree tree in treebank)
            {
                Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(tree);
                tree.PercolateHeadAnnotations(binaryHeadFinder);
                // Index from 1.  Tools downstream expect index from 1, so for
                // uses internal to the srparser we have to renormalize the
                // indices, with the result that here we have to index from 1
                tree.IndexLeaves(1, true);
                binarizedTrees.Add(tree);
            }
            return(binarizedTrees);
        }
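
A short sketch of calling BinarizeTreebank from outside the parser, following the SetUp() pattern in Example #4; the treebank path is a hypothetical placeholder.

        // Sketch: load trees into a memory treebank and binarize them for the
        // shift-reduce parser; "/path/to/trees" is a placeholder.
        Options op = new Options();
        Treebank treebank = op.tlpParams.MemoryTreebank();
        treebank.LoadPath("/path/to/trees");
        IList<Tree> binarized = ShiftReduceParser.BinarizeTreebank(treebank, op);
        System.Console.Out.WriteLine("Binarized " + binarized.Count + " trees");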
Code Example #16
        /// <returns>A Triple of binaryTrainTreebank, binarySecondaryTreebank, binaryTuneTreebank.</returns>
        public static Triple <Treebank, Treebank, Treebank> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank, Treebank secondaryTreebank, Treebank tuneTreebank, Options op)
        {
            // setup tree transforms
            ITreebankLangParserParams tlpParams = op.tlpParams;
            ITreebankLanguagePack     tlp       = tlpParams.TreebankLanguagePack();

            if (op.testOptions.verbose)
            {
                PrintWriter pwErr = tlpParams.Pw(System.Console.Error);
                pwErr.Print("Training ");
                pwErr.Println(trainTreebank.TextualSummary(tlp));
                if (secondaryTreebank != null)
                {
                    pwErr.Print("Secondary training ");
                    pwErr.Println(secondaryTreebank.TextualSummary(tlp));
                }
            }
            CompositeTreeTransformer trainTransformer = new CompositeTreeTransformer();

            if (op.trainOptions.preTransformer != null)
            {
                trainTransformer.AddTransformer(op.trainOptions.preTransformer);
            }
            if (op.trainOptions.collinsPunc)
            {
                CollinsPuncTransformer collinsPuncTransformer = new CollinsPuncTransformer(tlp);
                trainTransformer.AddTransformer(collinsPuncTransformer);
            }
            log.Info("Binarizing trees...");
            Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer binarizer;
            if (!op.trainOptions.leftToRight)
            {
                binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
            }
            else
            {
                binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
            }
            trainTransformer.AddTransformer(binarizer);
            if (op.wordFunction != null)
            {
                ITreeTransformer wordFunctionTransformer = new TreeLeafLabelTransformer(op.wordFunction);
                trainTransformer.AddTransformer(wordFunctionTransformer);
            }
            Treebank wholeTreebank;

            if (secondaryTreebank == null)
            {
                wholeTreebank = trainTreebank;
            }
            else
            {
                wholeTreebank = new CompositeTreebank(trainTreebank, secondaryTreebank);
            }
            if (op.trainOptions.selectiveSplit)
            {
                op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
                RemoveDeleteSplittersFromSplitters(tlp, op);
                if (op.testOptions.verbose)
                {
                    IList <string> list = new List <string>(op.trainOptions.splitters);
                    list.Sort();
                    log.Info("Parent split categories: " + list);
                }
            }
            if (op.trainOptions.selectivePostSplit)
            {
                // Do all the transformations once just to learn selective splits on annotated categories
                ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
                wholeTreebank = wholeTreebank.Transform(myTransformer);
                op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
                if (op.testOptions.verbose)
                {
                    log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
                }
            }
            if (op.trainOptions.hSelSplit)
            {
                // We run through all the trees once just to gather counts for hSelSplit!
                int ptt = op.trainOptions.printTreeTransformations;
                op.trainOptions.printTreeTransformations = 0;
                binarizer.SetDoSelectiveSplit(false);
                foreach (Tree tree in wholeTreebank)
                {
                    trainTransformer.TransformTree(tree);
                }
                binarizer.SetDoSelectiveSplit(true);
                op.trainOptions.printTreeTransformations = ptt;
            }
            // we've done all the setup now. here's where the train treebank is transformed.
            trainTreebank = trainTreebank.Transform(trainTransformer);
            if (secondaryTreebank != null)
            {
                secondaryTreebank = secondaryTreebank.Transform(trainTransformer);
            }
            if (op.trainOptions.printAnnotatedStateCounts)
            {
                binarizer.PrintStateCounts();
            }
            if (op.trainOptions.printAnnotatedRuleCounts)
            {
                binarizer.PrintRuleCounts();
            }
            if (tuneTreebank != null)
            {
                tuneTreebank = tuneTreebank.Transform(trainTransformer);
            }
            if (op.testOptions.verbose)
            {
                binarizer.DumpStats();
            }
            return(new Triple <Treebank, Treebank, Treebank>(trainTreebank, secondaryTreebank, tuneTreebank));
        }
Code Example #17
 /// <summary>Call this method to get a String array of categories to split on.</summary>
 /// <remarks>
 /// Call this method to get a String array of categories to split on.
 /// It calculates parent annotation statistics suitable for doing
 /// selective parent splitting in the PCFGParser inside
 /// FactoredParser.  <p>
 /// If tlp is non-null tlp.basicCategory() will be called on parent and
 /// grandparent nodes. <p>
 /// This version just defaults some parameters.
 /// <i>Implementation note:</i> This method is not designed for concurrent
 /// invocation: it uses static state variables.
 /// </remarks>
 public static ICollection <string> GetSplitCategories(Treebank t, double cutOff, ITreebankLanguagePack tlp)
 {
     return(GetSplitCategories(t, true, 0, cutOff, cutOff, tlp));
 }
Code Example #18
        /// <summary>
        /// This method lets you train and test a segmenter relative to a
        /// Treebank.
        /// </summary>
        /// <remarks>
        /// This method lets you train and test a segmenter relative to a
        /// Treebank.
        /// <p>
        /// <i>Implementation note:</i> This method is largely cloned from
        /// LexicalizedParser's main method.  Should we try to have it be able
        /// to train segmenters to stop things going out of sync?
        /// </remarks>
        public static void Main(string[] args)
        {
            bool     train = false;
            bool     saveToSerializedFile      = false;
            bool     saveToTextFile            = false;
            string   serializedInputFileOrUrl  = null;
            string   textInputFileOrUrl        = null;
            string   serializedOutputFileOrUrl = null;
            string   textOutputFileOrUrl       = null;
            string   treebankPath = null;
            Treebank testTreebank = null;
            // Treebank tuneTreebank = null;
            string      testPath    = null;
            IFileFilter testFilter  = null;
            IFileFilter trainFilter = null;
            string      encoding    = null;
            // variables needed to process the files to be parsed
            ITokenizerFactory <Word> tokenizerFactory = null;
            //    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor();
            bool tokenized = false;
            // whether or not the input file has already been tokenized
            IFunction <IList <IHasWord>, IList <IHasWord> > escaper = new ChineseEscaper();
            // int tagDelimiter = -1;
            // String sentenceDelimiter = "\n";
            // boolean fromXML = false;
            int argIndex = 0;

            if (args.Length < 1)
            {
                log.Info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
                return;
            }
            Options op = new Options();

            op.tlpParams = new ChineseTreebankParserParams();
            // while loop through option arguments
            while (argIndex < args.Length && args[argIndex][0] == '-')
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train"))
                {
                    train = true;
                    saveToSerializedFile = true;
                    int numSubArgs = NumSubArgs(args, argIndex);
                    argIndex++;
                    if (numSubArgs > 1)
                    {
                        treebankPath = args[argIndex];
                        argIndex++;
                    }
                    else
                    {
                        throw new Exception("Error: -train option must have treebankPath as first argument.");
                    }
                    if (numSubArgs == 2)
                    {
                        trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
                    }
                    else
                    {
                        if (numSubArgs >= 3)
                        {
                            try
                            {
                                int low  = System.Convert.ToInt32(args[argIndex]);
                                int high = System.Convert.ToInt32(args[argIndex + 1]);
                                trainFilter = new NumberRangeFileFilter(low, high, true);
                                argIndex   += 2;
                            }
                            catch (NumberFormatException)
                            {
                                // maybe it's a ranges expression?
                                trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                                argIndex++;
                            }
                        }
                    }
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-encoding"))
                    {
                        // sets encoding for TreebankLangParserParams
                        encoding = args[argIndex + 1];
                        op.tlpParams.SetInputEncoding(encoding);
                        op.tlpParams.SetOutputEncoding(encoding);
                        argIndex += 2;
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-loadFromSerializedFile"))
                        {
                            // load the parser from a binary serialized file
                            // the next argument must be the path to the parser file
                            serializedInputFileOrUrl = args[argIndex + 1];
                            argIndex += 2;
                        }
                        else
                        {
                            // doesn't make sense to load from TextFile -pichuan
                            //      } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
                            //        // load the parser from declarative text file
                            //        // the next argument must be the path to the parser file
                            //        textInputFileOrUrl = args[argIndex + 1];
                            //        argIndex += 2;
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToSerializedFile"))
                            {
                                saveToSerializedFile      = true;
                                serializedOutputFileOrUrl = args[argIndex + 1];
                                argIndex += 2;
                            }
                            else
                            {
                                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToTextFile"))
                                {
                                    // save the parser to declarative text file
                                    saveToTextFile      = true;
                                    textOutputFileOrUrl = args[argIndex + 1];
                                    argIndex           += 2;
                                }
                                else
                                {
                                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                                    {
                                        // the next argument is the treebank path and range for testing
                                        int numSubArgs = NumSubArgs(args, argIndex);
                                        argIndex++;
                                        if (numSubArgs == 1)
                                        {
                                            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                        }
                                        else
                                        {
                                            if (numSubArgs > 1)
                                            {
                                                testPath = args[argIndex++];
                                                if (numSubArgs == 2)
                                                {
                                                    testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                                }
                                                else
                                                {
                                                    if (numSubArgs >= 3)
                                                    {
                                                        try
                                                        {
                                                            int low  = System.Convert.ToInt32(args[argIndex]);
                                                            int high = System.Convert.ToInt32(args[argIndex + 1]);
                                                            testFilter = new NumberRangeFileFilter(low, high, true);
                                                            argIndex  += 2;
                                                        }
                                                        catch (NumberFormatException)
                                                        {
                                                            // maybe it's a ranges expression?
                                                            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }
                                    else
                                    {
                                        int j = op.tlpParams.SetOptionFlag(args, argIndex);
                                        if (j == argIndex)
                                        {
                                            log.Info("Unknown option ignored: " + args[argIndex]);
                                            j++;
                                        }
                                        argIndex = j;
                                    }
                                }
                            }
                        }
                    }
                }
            }
            // end while loop through arguments
            ITreebankLangParserParams tlpParams = op.tlpParams;

            // all other arguments are order dependent and
            // are processed in order below
            Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = null;
            if (!train && op.testOptions.verbose)
            {
                System.Console.Out.WriteLine("Currently " + new DateTime());
                PrintArgs(args, System.Console.Out);
            }
            if (train)
            {
                PrintArgs(args, System.Console.Out);
                // so we train a parser using the treebank
                if (treebankPath == null)
                {
                    // the next arg must be the treebank path, since it wasn't given earlier
                    treebankPath = args[argIndex];
                    argIndex++;
                    if (args.Length > argIndex + 1)
                    {
                        try
                        {
                            // the next two args might be the range
                            int low  = System.Convert.ToInt32(args[argIndex]);
                            int high = System.Convert.ToInt32(args[argIndex + 1]);
                            trainFilter = new NumberRangeFileFilter(low, high, true);
                            argIndex   += 2;
                        }
                        catch (NumberFormatException)
                        {
                            // maybe it's a ranges expression?
                            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                            argIndex++;
                        }
                    }
                }
                Treebank        trainTreebank = MakeTreebank(treebankPath, op, trainFilter);
                IIndex <string> wordIndex     = new HashIndex <string>();
                IIndex <string> tagIndex      = new HashIndex <string>();
                cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
            }
            else
            {
                if (textInputFileOrUrl != null)
                {
                }
                else
                {
                    // so we load the segmenter from a text grammar file
                    // XXXXX fix later -pichuan
                    //cs = new LexicalizedParser(textInputFileOrUrl, true, op);
                    // so we load a serialized segmenter
                    if (serializedInputFileOrUrl == null)
                    {
                        // the next argument must be the path to the serialized parser
                        serializedInputFileOrUrl = args[argIndex];
                        argIndex++;
                    }
                    try
                    {
                        cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
                    }
                    catch (ArgumentException)
                    {
                        log.Info("Error loading segmenter, exiting...");
                        System.Environment.Exit(0);
                    }
                }
            }
            // the following has to go after reading parser to make sure
            // op and tlpParams are the same for train and test
            TreePrint treePrint = op.testOptions.TreePrint(tlpParams);

            if (testFilter != null)
            {
                if (testPath == null)
                {
                    if (treebankPath == null)
                    {
                        throw new Exception("No test treebank path specified...");
                    }
                    else
                    {
                        log.Info("No test treebank path specified.  Using train path: \"" + treebankPath + "\"");
                        testPath = treebankPath;
                    }
                }
                testTreebank = tlpParams.TestMemoryTreebank();
                testTreebank.LoadPath(testPath, testFilter);
            }
            op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(tlpParams.SisterSplitters()));
            // at this point we should be sure that op.tlpParams is
            // set appropriately (from command line, or from grammar file),
            // and will never change again.  We also set the tlpParams of the
            // LexicalizedParser instance to be the same object.  This is
            // redundancy that we probably should take out eventually.
            //
            // -- Roger
            if (op.testOptions.verbose)
            {
                log.Info("Lexicon is " + cs.GetType().FullName);
            }
            PrintWriter pwOut = tlpParams.Pw();
            PrintWriter pwErr = tlpParams.Pw(System.Console.Error);

            // Now what do we do with the parser we've made
            if (saveToTextFile)
            {
                // save the parser to textGrammar format
                if (textOutputFileOrUrl != null)
                {
                    SaveSegmenterDataToText(cs, textOutputFileOrUrl);
                }
                else
                {
                    log.Info("Usage: must specify a text segmenter data output path");
                }
            }
            if (saveToSerializedFile)
            {
                if (serializedOutputFileOrUrl == null && argIndex < args.Length)
                {
                    // the next argument must be the path to serialize to
                    serializedOutputFileOrUrl = args[argIndex];
                    argIndex++;
                }
                if (serializedOutputFileOrUrl != null)
                {
                    SaveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
                }
                else
                {
                    if (textOutputFileOrUrl == null && testTreebank == null)
                    {
                        // no saving/parsing request has been specified
                        log.Info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + "-train trainFilesPath [start stop] serializedParserFilename");
                    }
                }
            }
            /* --------------------- Testing part!!!! ----------------------- */
            if (op.testOptions.verbose)
            {
            }
            //      printOptions(false, op);
            if (testTreebank != null || (argIndex < args.Length && Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank")))
            {
                // test parser on treebank
                if (testTreebank == null)
                {
                    // the next argument is the treebank path and range for testing
                    testTreebank = tlpParams.TestMemoryTreebank();
                    if (args.Length < argIndex + 4)
                    {
                        testTreebank.LoadPath(args[argIndex + 1]);
                    }
                    else
                    {
                        int testlow  = System.Convert.ToInt32(args[argIndex + 2]);
                        int testhigh = System.Convert.ToInt32(args[argIndex + 3]);
                        testTreebank.LoadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
                    }
                }
            }
        }
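
For reference, the usage message in the code above suggests a training invocation along these lines; the treebank path, file range, and output filename are placeholders, not values from the original.

        java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter -train /path/to/ctb/data 1 270 chineseSegmenter.ser.gz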
Code Example #19
        private void TestOnTreebank(LexicalizedParser pd, ITreebankLangParserParams tlpParams, Treebank testTreebank, string treebankRoot, IIndex <string> stateIndex)
        {
            Timing.StartTime();
            ITreeTransformer annotator = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);

            // CDM: Aug 2004: With new implementation of treebank split categories,
            // I've hardwired this to load English ones.  Otherwise need training data.
            // op.trainOptions.splitters = new HashSet(Arrays.asList(op.tlpParams.splitters()));
            op.trainOptions.splitters       = ParentAnnotationStats.GetEnglishSplitCategories(treebankRoot);
            op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(op.tlpParams.SisterSplitters()));
            foreach (Tree gold in testTreebank)
            {
                // transform into a local tree: C# does not allow reassigning the foreach variable
                Tree goldTree = annotator.TransformTree(gold);
                //      System.out.println();
                //      System.out.println("Checking tree: " + goldTree);
                foreach (Tree localTree in goldTree)
                {
                    // now try to use the grammar to score this local tree
                    if (localTree.IsLeaf() || localTree.IsPreTerminal() || localTree.Children().Length < 2)
                    {
                        continue;
                    }
                    System.Console.Out.WriteLine(LocalTreeToRule(localTree));
                    double score = ComputeLocalTreeScore(localTree, stateIndex, pd);
                    if (score == double.NegativeInfinity)
                    {
                    }
                    //          System.out.println(localTreeToRule(localTree));
                    System.Console.Out.WriteLine("score: " + score);
                }
            }
        }
Code Example #20
        private static Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter GetSegmenterDataFromTreebank(Treebank trainTreebank, Options op, IIndex <string> wordIndex, IIndex <string> tagIndex)
        {
            System.Console.Out.WriteLine("Currently " + new DateTime());
            //    printOptions(true, op);
            Timing.StartTime();
            // setup tree transforms
            ITreebankLangParserParams tlpParams = op.tlpParams;

            if (op.testOptions.verbose)
            {
                System.Console.Out.Write("Training ");
                System.Console.Out.WriteLine(trainTreebank.TextualSummary());
            }
            System.Console.Out.Write("Binarizing trees...");
            TreeAnnotatorAndBinarizer binarizer;

            // initialized below
            if (!op.trainOptions.leftToRight)
            {
                binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
            }
            else
            {
                binarizer = new TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
            }
            CollinsPuncTransformer collinsPuncTransformer = null;

            if (op.trainOptions.collinsPunc)
            {
                collinsPuncTransformer = new CollinsPuncTransformer(tlpParams.TreebankLanguagePack());
            }
            IList <Tree> binaryTrainTrees = new List <Tree>();

            // List<Tree> binaryTuneTrees = new ArrayList<Tree>();
            if (op.trainOptions.selectiveSplit)
            {
                op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, true, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlpParams.TreebankLanguagePack());
                if (op.testOptions.verbose)
                {
                    log.Info("Parent split categories: " + op.trainOptions.splitters);
                }
            }
            if (op.trainOptions.selectivePostSplit)
            {
                ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
                Treebank         annotatedTB   = trainTreebank.Transform(myTransformer);
                op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlpParams.TreebankLanguagePack());
                if (op.testOptions.verbose)
                {
                    log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
                }
            }
            if (op.trainOptions.hSelSplit)
            {
                binarizer.SetDoSelectiveSplit(false);
                foreach (Tree tree in trainTreebank)
                {
                    // use a local copy: C# does not allow reassigning the foreach variable
                    Tree transformed = tree;
                    if (op.trainOptions.collinsPunc)
                    {
                        transformed = collinsPuncTransformer.TransformTree(transformed);
                    }
                    // run only for its side effect of gathering counts for hSelSplit
                    binarizer.TransformTree(transformed);
                }
                binarizer.SetDoSelectiveSplit(true);
            }
            foreach (Tree tree_1 in trainTreebank)
            {
                // use a local copy: C# does not allow reassigning the foreach variable
                Tree transformed = tree_1;
                if (op.trainOptions.collinsPunc)
                {
                    transformed = collinsPuncTransformer.TransformTree(transformed);
                }
                transformed = binarizer.TransformTree(transformed);
                binaryTrainTrees.Add(transformed);
            }
            Timing.Tick("done.");
            if (op.testOptions.verbose)
            {
                binarizer.DumpStats();
            }
            System.Console.Out.Write("Extracting Lexicon...");
            Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter clex = (Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter)op.tlpParams.Lex(op, wordIndex, tagIndex);
            clex.InitializeTraining(binaryTrainTrees.Count);
            clex.Train(binaryTrainTrees);
            clex.FinishTraining();
            Timing.Tick("done.");
            return(clex);
        }
Code Example #21
 private ChineseLexiconAndWordSegmenter(Treebank trainTreebank, Options op, IIndex <string> wordIndex, IIndex <string> tagIndex)
 {
     Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = GetSegmenterDataFromTreebank(trainTreebank, op, wordIndex, tagIndex);
     chineseLexicon = cs.chineseLexicon;
     wordSegmenter  = cs.wordSegmenter;
 }
Code Example #22
        /// <summary>
        /// An example command line for training a new parser:
        /// <br />
        /// nohup java -mx6g edu.stanford.nlp.parser.dvparser.DVParser -cachedTrees /scr/nlp/data/dvparser/wsj/cached.wsj.train.simple.ser.gz -train -testTreebank  /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 -debugOutputFrequency 400 -nofilter -trainingThreads 5 -parser /u/nlp/data/lexparser/wsjPCFG.nocompact.simple.ser.gz -trainingIterations 40 -batchSize 25 -model /scr/nlp/data/dvparser/wsj/wsj.combine.v2.ser.gz -unkWord "*UNK*" -dvCombineCategories &gt; /scr/nlp/data/dvparser/wsj/wsj.combine.v2.out 2&gt;&amp;1 &amp;
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                Help();
                System.Environment.Exit(2);
            }
            log.Info("Running DVParser with arguments:");
            foreach (string arg in args)
            {
                log.Info("  " + arg);
            }
            log.Info();
            string         parserPath           = null;
            string         trainTreebankPath    = null;
            IFileFilter    trainTreebankFilter  = null;
            string         cachedTrainTreesPath = null;
            bool           runGradientCheck     = false;
            bool           runTraining          = false;
            string         testTreebankPath     = null;
            IFileFilter    testTreebankFilter   = null;
            string         initialModelPath     = null;
            string         modelPath            = null;
            bool           filter            = true;
            string         resultsRecordPath = null;
            IList <string> unusedArgs        = new List <string>();
            // These parameters can be null or 0 if the model was not
            // serialized with the new parameters.  Setting the options at the
            // command line will override these defaults.
            // TODO: if/when we integrate back into the main branch and
            // rebuild models, we can get rid of this
            IList <string> argsWithDefaults = new List <string>(Arrays.AsList(new string[] {
                "-wordVectorFile", Options.LexOptions.DefaultWordVectorFile, "-dvKBest", TrainOptions.DefaultKBest.ToString(), "-batchSize", TrainOptions.DefaultBatchSize.ToString(),
                "-trainingIterations", TrainOptions.DefaultTrainingIterations.ToString(), "-qnIterationsPerBatch", TrainOptions.DefaultQnIterationsPerBatch.ToString(), "-regCost", TrainOptions.DefaultRegcost.ToString(),
                "-learningRate", TrainOptions.DefaultLearningRate.ToString(), "-deltaMargin", TrainOptions.DefaultDeltaMargin.ToString(), "-unknownNumberVector", "-unknownDashedWordVectors", "-unknownCapsVector",
                "-unknownchinesepercentvector", "-unknownchinesenumbervector", "-unknownchineseyearvector", "-unkWord", "*UNK*", "-transformMatrixType", "DIAGONAL",
                "-scalingForInit", TrainOptions.DefaultScalingForInit.ToString(), "-trainWordVectors" }));

            Sharpen.Collections.AddAll(argsWithDefaults, Arrays.AsList(args));
            args = Sharpen.Collections.ToArray(argsWithDefaults, new string[argsWithDefaults.Count]);
            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parser"))
                {
                    parserPath = args[argIndex + 1];
                    argIndex  += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex           = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath   = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                        {
                            Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-treebank");
                            argIndex            = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                            trainTreebankPath   = treebankDescription.First();
                            trainTreebankFilter = treebankDescription.Second();
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-cachedTrees"))
                            {
                                cachedTrainTreesPath = args[argIndex + 1];
                                argIndex            += 2;
                            }
                            else
                            {
                                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-runGradientCheck"))
                                {
                                    runGradientCheck = true;
                                    argIndex++;
                                }
                                else
                                {
                                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train"))
                                    {
                                        runTraining = true;
                                        argIndex++;
                                    }
                                    else
                                    {
                                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
                                        {
                                            modelPath = args[argIndex + 1];
                                            argIndex += 2;
                                        }
                                        else
                                        {
                                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-nofilter"))
                                            {
                                                filter = false;
                                                argIndex++;
                                            }
                                            else
                                            {
                                                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-continueTraining"))
                                                {
                                                    runTraining      = true;
                                                    filter           = false;
                                                    initialModelPath = args[argIndex + 1];
                                                    argIndex        += 2;
                                                }
                                                else
                                                {
                                                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-resultsRecord"))
                                                    {
                                                        resultsRecordPath = args[argIndex + 1];
                                                        argIndex         += 2;
                                                    }
                                                    else
                                                    {
                                                        unusedArgs.Add(args[argIndex++]);
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            if (parserPath == null && modelPath == null)
            {
                throw new ArgumentException("Must supply either a base parser model with -parser or a serialized DVParser with -model");
            }
            if (!runTraining && modelPath == null && !runGradientCheck)
            {
                throw new ArgumentException("Need to either train a new model, run the gradient check or specify a model to load with -model");
            }
            string[] newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            Edu.Stanford.Nlp.Parser.Dvparser.DVParser dvparser = null;
            LexicalizedParser lexparser = null;

            if (initialModelPath != null)
            {
                lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(initialModelPath, newArgs));
                DVModel model = GetModelFromLexicalizedParser(lexparser);
                dvparser = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(model, lexparser);
            }
            else
            {
                if (runTraining || runGradientCheck)
                {
                    lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(parserPath, newArgs));
                    dvparser  = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(lexparser);
                }
                else
                {
                    if (modelPath != null)
                    {
                        lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
                        DVModel model = GetModelFromLexicalizedParser(lexparser);
                        dvparser = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(model, lexparser);
                    }
                }
            }
            IList <Tree> trainSentences = new List <Tree>();
            IdentityHashMap <Tree, byte[]> trainCompressedParses = Generics.NewIdentityHashMap();

            if (cachedTrainTreesPath != null)
            {
                foreach (string path in cachedTrainTreesPath.Split(","))
                {
                    IList <Pair <Tree, byte[]> > cache = IOUtils.ReadObjectFromFile(path);
                    foreach (Pair <Tree, byte[]> pair in cache)
                    {
                        trainSentences.Add(pair.First());
                        trainCompressedParses[pair.First()] = pair.Second();
                    }
                    log.Info("Read in " + cache.Count + " trees from " + path);
                }
            }
            if (trainTreebankPath != null)
            {
                // TODO: make the transformer a member of the model?
                ITreeTransformer transformer = BuildTrainTransformer(dvparser.GetOp());
                Treebank         treebank    = dvparser.GetOp().tlpParams.MemoryTreebank();
                treebank.LoadPath(trainTreebankPath, trainTreebankFilter);
                treebank = treebank.Transform(transformer);
                log.Info("Read in " + treebank.Count + " trees from " + trainTreebankPath);
                CacheParseHypotheses cacher = new CacheParseHypotheses(dvparser.parser);
                CacheParseHypotheses.CacheProcessor processor = new CacheParseHypotheses.CacheProcessor(cacher, lexparser, dvparser.op.trainOptions.dvKBest, transformer);
                foreach (Tree tree in treebank)
                {
                    trainSentences.Add(tree);
                    trainCompressedParses[tree] = processor.Process(tree).second;
                }
                //System.out.println(tree);
                log.Info("Finished parsing " + treebank.Count + " trees, getting " + dvparser.op.trainOptions.dvKBest + " hypotheses each");
            }
            if ((runTraining || runGradientCheck) && filter)
            {
                log.Info("Filtering rules for the given training set");
                dvparser.dvModel.SetRulesForTrainingSet(trainSentences, trainCompressedParses);
                log.Info("Done filtering rules; " + dvparser.dvModel.numBinaryMatrices + " binary matrices, " + dvparser.dvModel.numUnaryMatrices + " unary matrices, " + dvparser.dvModel.wordVectors.Count + " word vectors");
            }
            //dvparser.dvModel.printAllMatrices();
            Treebank testTreebank = null;

            if (testTreebankPath != null)
            {
                log.Info("Reading in trees from " + testTreebankPath);
                if (testTreebankFilter != null)
                {
                    log.Info("Filtering on " + testTreebankFilter);
                }
                testTreebank = dvparser.GetOp().tlpParams.MemoryTreebank();
                testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
                log.Info("Read in " + testTreebank.Count + " trees for testing");
            }
            //    runGradientCheck= true;
            if (runGradientCheck)
            {
                log.Info("Running gradient check on " + trainSentences.Count + " trees");
                dvparser.RunGradientCheck(trainSentences, trainCompressedParses);
            }
            if (runTraining)
            {
                log.Info("Training the RNN parser");
                log.Info("Current train options: " + dvparser.GetOp().trainOptions);
                dvparser.Train(trainSentences, trainCompressedParses, testTreebank, modelPath, resultsRecordPath);
                if (modelPath != null)
                {
                    dvparser.SaveModel(modelPath);
                }
            }
            if (testTreebankPath != null)
            {
                EvaluateTreebank evaluator = new EvaluateTreebank(dvparser.AttachModelToLexicalizedParser());
                evaluator.TestOnTreebank(testTreebank);
            }
            log.Info("Successfully ran DVParser");
        }
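
A quick reference, not part of the original source: the sketch below assembles a hypothetical argument vector from the flags parsed in the method above. The model and output paths are placeholders, and the treebank flags handled earlier in the method are omitted.

        // Hypothetical arguments for the entry point above; paths are placeholders and
        // only flags that appear in the option-parsing loop are used.
        public static string[] ExampleDvparserArgs()
        {
            return new string[]
            {
                "-parser", "englishPCFG.ser.gz",          // base LexicalizedParser model (placeholder path)
                "-train",                                 // train a new DVModel on top of it
                "-model", "englishRNN.ser.gz",            // where the trained DVParser is saved (placeholder path)
                "-resultsRecord", "dvparser-results.txt"  // checkpoint status lines are appended here
            };
        }
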
コード例 #23
        /// <exception cref="System.IO.IOException"/>
        public virtual void Train(IList <Tree> sentences, IdentityHashMap <Tree, byte[]> compressedParses, Treebank testTreebank, string modelPath, string resultsRecordPath)
        {
            // process:
            //   we come up with a cost and a derivative for the model
            //   we always use the gold tree as the example to train towards
            //   every time through, we will look at the top N trees from
            //     the LexicalizedParser and pick the best one according to
            //     our model (at the start, this is essentially random)
            // we use QN to minimize the cost function for the model
            // to do this minimization, we turn all of the matrices in the
            //   DVModel into one big Theta, which is the set of variables to
            //   be optimized by the QN.
            Timing timing             = new Timing();
            long   maxTrainTimeMillis = op.trainOptions.maxTrainTimeSeconds * 1000;
            int    batchCount         = 0;
            int    debugCycle         = 0;
            double bestLabelF1        = 0.0;

            if (op.trainOptions.useContextWords)
            {
                foreach (Tree tree in sentences)
                {
                    Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(tree);
                    tree.SetSpans();
                }
            }
            // for AdaGrad
            double[] sumGradSquare = new double[dvModel.TotalParamSize()];
            Arrays.Fill(sumGradSquare, 1.0);
            int numBatches = sentences.Count / op.trainOptions.batchSize + 1;

            log.Info("Training on " + sentences.Count + " trees in " + numBatches + " batches");
            log.Info("Times through each training batch: " + op.trainOptions.trainingIterations);
            log.Info("QN iterations per batch: " + op.trainOptions.qnIterationsPerBatch);
            for (int iter = 0; iter < op.trainOptions.trainingIterations; ++iter)
            {
                IList <Tree> shuffledSentences = new List <Tree>(sentences);
                Java.Util.Collections.Shuffle(shuffledSentences, dvModel.rand);
                for (int batch = 0; batch < numBatches; ++batch)
                {
                    ++batchCount;
                    // This did not help performance
                    //log.info("Setting AdaGrad's sum of squares to 1...");
                    //Arrays.fill(sumGradSquare, 1.0);
                    log.Info("======================================");
                    log.Info("Iteration " + iter + " batch " + batch);
                    // Each batch will be of the specified batch size, except the
                    // last batch will include any leftover trees at the end of
                    // the list
                    int startTree = batch * op.trainOptions.batchSize;
                    int endTree   = (batch + 1) * op.trainOptions.batchSize;
                    if (endTree > shuffledSentences.Count)
                    {
                        endTree = shuffledSentences.Count;
                    }
                    ExecuteOneTrainingBatch(shuffledSentences.SubList(startTree, endTree), compressedParses, sumGradSquare);
                    long totalElapsed = timing.Report();
                    log.Info("Finished iteration " + iter + " batch " + batch + "; total training time " + totalElapsed + " ms");
                    if (maxTrainTimeMillis > 0 && totalElapsed > maxTrainTimeMillis)
                    {
                        // no need to debug output, we're done now
                        break;
                    }
                    if (op.trainOptions.debugOutputFrequency > 0 && batchCount % op.trainOptions.debugOutputFrequency == 0)
                    {
                        log.Info("Finished " + batchCount + " total batches, running evaluation cycle");
                        // Time for debugging output!
                        double tagF1   = 0.0;
                        double labelF1 = 0.0;
                        if (testTreebank != null)
                        {
                            EvaluateTreebank evaluator = new EvaluateTreebank(AttachModelToLexicalizedParser());
                            evaluator.TestOnTreebank(testTreebank);
                            labelF1 = evaluator.GetLBScore();
                            tagF1   = evaluator.GetTagScore();
                            if (labelF1 > bestLabelF1)
                            {
                                bestLabelF1 = labelF1;
                            }
                            log.Info("Best label f1 on dev set so far: " + Nf.Format(bestLabelF1));
                        }
                        string tempName = null;
                        if (modelPath != null)
                        {
                            tempName = modelPath;
                            if (modelPath.EndsWith(".ser.gz"))
                            {
                                tempName = Sharpen.Runtime.Substring(modelPath, 0, modelPath.Length - 7) + "-" + Filename.Format(debugCycle) + "-" + Nf.Format(labelF1) + ".ser.gz";
                            }
                            SaveModel(tempName);
                        }
                        string statusLine = ("CHECKPOINT:" + " iteration " + iter + " batch " + batch + " labelF1 " + Nf.Format(labelF1) + " tagF1 " + Nf.Format(tagF1) + " bestLabelF1 " + Nf.Format(bestLabelF1) + " model " + tempName + op.trainOptions + " word vectors: "
                                             + op.lexOptions.wordVectorFile + " numHid: " + op.lexOptions.numHid);
                        log.Info(statusLine);
                        if (resultsRecordPath != null)
                        {
                            FileWriter fout = new FileWriter(resultsRecordPath, true);
                            // append
                            fout.Write(statusLine);
                            fout.Write("\n");
                            fout.Close();
                        }
                        ++debugCycle;
                    }
                }
                long totalElapsed_1 = timing.Report();
                if (maxTrainTimeMillis > 0 && totalElapsed_1 > maxTrainTimeMillis)
                {
                    // no need to debug output, we're done now
                    log.Info("Max training time exceeded, exiting");
                    break;
                }
            }
        }
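
The sumGradSquare vector initialized above drives an AdaGrad-style learning-rate schedule inside ExecuteOneTrainingBatch, which is not shown here. The following is a minimal sketch of that per-parameter update, assuming a flattened parameter vector and a fixed base learning rate; it is illustrative only, not the library's implementation.

        // Illustrative AdaGrad step over a flattened parameter vector ("theta").
        public static void AdaGradStep(double[] theta, double[] gradient, double[] sumGradSquare, double learningRate)
        {
            for (int i = 0; i < theta.Length; ++i)
            {
                // Accumulate the squared gradient, then shrink this parameter's step by its root.
                sumGradSquare[i] += gradient[i] * gradient[i];
                theta[i] -= learningRate * gradient[i] / System.Math.Sqrt(sumGradSquare[i]);
            }
        }

Filling sumGradSquare with 1.0 before training, as the loop above does, keeps the earliest steps bounded rather than dividing by a near-zero accumulator.
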
コード例 #24
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length != 4)
            {
                System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName);
                System.Environment.Exit(-1);
            }
            // Command line options
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;
            Treebank trainTreebank         = tlpp.DiskTreebank();

            trainTreebank.LoadPath(args[2]);
            Treebank devTreebank = tlpp.DiskTreebank();

            devTreebank.LoadPath(args[3]);
            MorphoFeatureSpecification morphoSpec;
            Options options = GetOptions(language);

            if (language.Equals(Language.Arabic))
            {
                morphoSpec = new ArabicMorphoFeatureSpecification();
                string[] languageOptions = new string[] { "-arabicFactored" };
                tlpp.SetOptionFlag(languageOptions, 0);
            }
            else
            {
                if (language.Equals(Language.French))
                {
                    morphoSpec = new FrenchMorphoFeatureSpecification();
                    string[] languageOptions = new string[] { "-frenchFactored" };
                    tlpp.SetOptionFlag(languageOptions, 0);
                }
                else
                {
                    throw new NotSupportedException();
                }
            }
            string featureList = args[1];

            string[] features = featureList.Trim().Split(",");
            foreach (string feature in features)
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
            }
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.WriteLine("Features: " + args[1]);
            // Create word and tag indices
            // Save trees in a collection since the interface requires it.
            System.Console.Out.Write("Loading training trees...");
            IList <Tree>    trainTrees = new List <Tree>(19000);
            IIndex <string> wordIndex  = new HashIndex <string>();
            IIndex <string> tagIndex   = new HashIndex <string>();

            foreach (Tree tree in trainTreebank)
            {
                foreach (Tree subTree in tree)
                {
                    if (!subTree.IsLeaf())
                    {
                        tlpp.TransformTree(subTree, tree);
                    }
                }
                trainTrees.Add(tree);
            }
            System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count);
            // Setup and train the lexicon.
            System.Console.Out.Write("Collecting sufficient statistics for lexicon...");
            Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
            lexicon.InitializeTraining(trainTrees.Count);
            lexicon.Train(trainTrees, null);
            lexicon.FinishTraining();
            System.Console.Out.WriteLine("Done!");
            trainTrees = null;
            // Load the tuning set
            System.Console.Out.Write("Loading tuning set...");
            IList <FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp);

            System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count);
            // Print the probabilities that we obtain
            // TODO(spenceg): Implement tagging accuracy with FactLex
            int nCorrect             = 0;
            ICounter <string> errors = new ClassicCounter <string>();

            foreach (FactoredLexiconEvent @event in tuningSet)
            {
                IEnumerator <IntTaggedWord> itr = lexicon.RuleIteratorByWord(@event.Word(), @event.GetLoc(), @event.FeatureStr());
                ICounter <int> logScores        = new ClassicCounter <int>();
                bool           noRules          = true;
                int            goldTagId        = -1;
                while (itr.MoveNext())
                {
                    noRules = false;
                    IntTaggedWord iTW = itr.Current;
                    if (iTW.Tag() == @event.TagId())
                    {
                        log.Info("GOLD-");
                        goldTagId = iTW.Tag();
                    }
                    float tagScore = lexicon.Score(iTW, @event.GetLoc(), @event.Word(), @event.FeatureStr());
                    logScores.IncrementCount(iTW.Tag(), tagScore);
                }
                if (noRules)
                {
                    System.Console.Error.Printf("NO TAGGINGS: %s %s%n", @event.Word(), @event.FeatureStr());
                }
                else
                {
                    // Score the tagging
                    int hypTagId = Counters.Argmax(logScores);
                    if (hypTagId == goldTagId)
                    {
                        ++nCorrect;
                    }
                    else
                    {
                        string goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.Get(goldTagId);
                        errors.IncrementCount(goldTag);
                    }
                }
                log.Info();
            }
            // Output accuracy
            double acc = (double)nCorrect / (double)tuningSet.Count;

            System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
            log.Info("% of errors by type:");
            IList <string> biggestKeys = new List <string>(errors.KeySet());

            biggestKeys.Sort(Counters.ToComparator(errors, false, true));
            Counters.Normalize(errors);
            foreach (string key in biggestKeys)
            {
                System.Console.Error.Printf("%s\t%.2f%n", key, errors.GetCount(key) * 100.0);
            }
        }
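
Per the usage string above, the program expects a language, a comma-separated feature list, a training file, and a development file. A hypothetical invocation is sketched below; the paths are placeholders and the feature names must be valid MorphoFeatureSpecification.MorphoFeatureType values for the chosen language.

        // Hypothetical arguments for the FactoredLexicon entry point above.
        public static string[] ExampleFactoredLexiconArgs()
        {
            // language, comma-separated morphological features, train file, dev file
            return new string[] { "French", "GEN,NUM", "ftb-train.mrg", "ftb-dev.mrg" };
        }
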
コード例 #25
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public static void Main(string[] args)
        {
            string         dvmodelFile        = null;
            string         lexparserFile      = null;
            string         testTreebankPath   = null;
            IFileFilter    testTreebankFilter = null;
            IList <string> unusedArgs         = new List <string>();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-lexparser"))
                {
                    lexparserFile = args[argIndex + 1];
                    argIndex     += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex           = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath   = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        unusedArgs.Add(args[argIndex++]);
                    }
                }
            }
            log.Info("Loading lexparser from: " + lexparserFile);
            string[]          newArgs   = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            LexicalizedParser lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(lexparserFile, newArgs));

            log.Info("... done");
            Treebank testTreebank = null;

            if (testTreebankPath != null)
            {
                log.Info("Reading in trees from " + testTreebankPath);
                if (testTreebankFilter != null)
                {
                    log.Info("Filtering on " + testTreebankFilter);
                }
                testTreebank = lexparser.GetOp().tlpParams.MemoryTreebank();
                testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
                log.Info("Read in " + testTreebank.Count + " trees for testing");
            }
            double[] labelResults = new double[weights.Length];
            double[] tagResults   = new double[weights.Length];
            for (int i = 0; i < weights.Length; ++i)
            {
                lexparser.GetOp().baseParserWeight = weights[i];
                EvaluateTreebank evaluator         = new EvaluateTreebank(lexparser);
                evaluator.TestOnTreebank(testTreebank);
                labelResults[i] = evaluator.GetLBScore();
                tagResults[i]   = evaluator.GetTagScore();
            }
            for (int i_1 = 0; i_1 < weights.Length; ++i_1)
            {
                log.Info("LexicalizedParser weight " + weights[i_1] + ": labeled " + labelResults[i_1] + " tag " + tagResults[i_1]);
            }
        }
コード例 #26
        public virtual IList <Tree> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank)
        {
            ITreebankLangParserParams tlpParams = op.tlpParams;
            ITreebankLanguagePack     tlp       = tlpParams.TreebankLanguagePack();

            if (Verbose)
            {
                log.Info("\n\n" + trainTreebank.TextualSummary(tlp));
            }
            log.Info("Binarizing trees...");
            TreeAnnotatorAndBinarizer binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);

            Timing.Tick("done.");
            if (op.trainOptions.selectiveSplit)
            {
                op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
                RemoveDeleteSplittersFromSplitters(tlp);
                if (op.testOptions.verbose)
                {
                    IList <string> list = new List <string>(op.trainOptions.splitters);
                    list.Sort();
                    log.Info("Parent split categories: " + list);
                }
            }
            //		if (op.trainOptions.selectivePostSplit) {
            //			// Do all the transformations once just to learn selective splits on annotated categories
            //			TreeTransformer myTransformer = new TreeAnnotator(tlpParams.headFinder(), tlpParams);
            //			Treebank annotatedTB = trainTreebank.transform(myTransformer);
            //			op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
            //			if (op.testOptions.verbose) {
            //				log.info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
            //			}
            //		}
            if (op.trainOptions.hSelSplit)
            {
                // We run through all the trees once just to gather counts for hSelSplit!
                int ptt = op.trainOptions.printTreeTransformations;
                op.trainOptions.printTreeTransformations = 0;
                binarizer.SetDoSelectiveSplit(false);
                foreach (Tree tree in trainTreebank)
                {
                    binarizer.TransformTree(tree);
                }
                binarizer.SetDoSelectiveSplit(true);
                op.trainOptions.printTreeTransformations = ptt;
            }
            //Tree transformation
            //
            IList <Tree> binaryTrainTrees = new List <Tree>();

            foreach (Tree tree_1 in trainTreebank)
            {
                tree_1 = binarizer.TransformTree(tree_1);
                if (tree_1.Yield().Count - 1 <= trainLengthLimit)
                {
                    binaryTrainTrees.Add(tree_1);
                }
            }
            // WSGDEBUG: Lots of stuff on the grammar
            //    if(VERBOSE) {
            //      binarizer.printStateCounts();
            //      binarizer.printRuleCounts();
            //    binarizer.dumpStats();
            //    }
            return(binaryTrainTrees);
        }
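
For intuition about the binary trees this method returns, here is a self-contained sketch of plain right-branching binarization with intermediate "@" categories on a toy node type. It is illustrative only: the real TreeAnnotatorAndBinarizer also performs category annotation, markovization, and the selective splitting configured above.

        // Toy node type used only for this sketch.
        public class SimpleNode
        {
            public string Label;
            public List<SimpleNode> Children = new List<SimpleNode>();

            public SimpleNode(string label)
            {
                Label = label;
            }
        }

        // Rewrites (X A B C D) as (X A (@X B (@X C D))) so every node has at most two children.
        public static SimpleNode Binarize(SimpleNode node)
        {
            foreach (SimpleNode child in node.Children)
            {
                Binarize(child);
            }
            if (node.Children.Count > 2)
            {
                string interLabel = node.Label.StartsWith("@") ? node.Label : "@" + node.Label;
                SimpleNode intermediate = new SimpleNode(interLabel);
                // Move everything except the first child under the new intermediate node.
                intermediate.Children.AddRange(node.Children.GetRange(1, node.Children.Count - 1));
                node.Children.RemoveRange(1, node.Children.Count - 1);
                node.Children.Add(Binarize(intermediate));
            }
            return node;
        }
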
コード例 #27
        /// <summary>Run the scoring metric on guess/gold input.</summary>
        /// <remarks>
        /// Run the scoring metric on guess/gold input. This method performs "Collinization."
        /// The default language is English.
        /// </remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            int    maxGoldYield            = int.MaxValue;
            bool   Verbose   = false;
            string encoding  = "UTF-8";
            string guessFile = null;
            string goldFile  = null;
            IDictionary <string, string[]> argsMap = StringUtils.ArgsToMap(args, optionArgDefs);

            foreach (KeyValuePair <string, string[]> opt in argsMap)
            {
                if (opt.Key == null)
                {
                    continue;
                }
                if (opt.Key.Equals("-l"))
                {
                    Language lang = Language.ValueOf(opt.Value[0].Trim());
                    tlpp = lang.@params;
                }
                else
                {
                    if (opt.Key.Equals("-y"))
                    {
                        maxGoldYield = System.Convert.ToInt32(opt.Value[0].Trim());
                    }
                    else
                    {
                        if (opt.Key.Equals("-v"))
                        {
                            Verbose = true;
                        }
                        else
                        {
                            if (opt.Key.Equals("-c"))
                            {
                                Edu.Stanford.Nlp.Parser.Metrics.TaggingEval.doCatLevelEval = true;
                            }
                            else
                            {
                                if (opt.Key.Equals("-e"))
                                {
                                    encoding = opt.Value[0];
                                }
                                else
                                {
                                    log.Info(usage.ToString());
                                    System.Environment.Exit(-1);
                                }
                            }
                        }
                    }
                }
                //Non-option arguments located at key null
                string[] rest = argsMap[null];
                if (rest == null || rest.Length < minArgs)
                {
                    log.Info(usage.ToString());
                    System.Environment.Exit(-1);
                }
                goldFile  = rest[0];
                guessFile = rest[1];
            }
            tlpp.SetInputEncoding(encoding);
            PrintWriter pwOut         = tlpp.Pw();
            Treebank    guessTreebank = tlpp.DiskTreebank();

            guessTreebank.LoadPath(guessFile);
            pwOut.Println("GUESS TREEBANK:");
            pwOut.Println(guessTreebank.TextualSummary());
            Treebank goldTreebank = tlpp.DiskTreebank();

            goldTreebank.LoadPath(goldFile);
            pwOut.Println("GOLD TREEBANK:");
            pwOut.Println(goldTreebank.TextualSummary());
            Edu.Stanford.Nlp.Parser.Metrics.TaggingEval metric = new Edu.Stanford.Nlp.Parser.Metrics.TaggingEval("Tagging LP/LR");
            ITreeTransformer tc = tlpp.Collinizer();
            //The evalb ref implementation assigns status for each tree pair as follows:
            //
            //   0 - Ok (yields match)
            //   1 - length mismatch
            //   2 - null parse e.g. (()).
            //
            //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
            IEnumerator <Tree> goldItr  = goldTreebank.GetEnumerator();
            IEnumerator <Tree> guessItr = guessTreebank.GetEnumerator();
            int goldLineId        = 0;
            int guessLineId       = 0;
            int skippedGuessTrees = 0;

            while (guessItr.MoveNext() && goldItr.MoveNext())
            {
                Tree           guessTree  = guessItr.Current;
                IList <ILabel> guessYield = guessTree.Yield();
                guessLineId++;
                Tree           goldTree  = goldItr.Current;
                IList <ILabel> goldYield = goldTree.Yield();
                goldLineId++;
                // Check that we should evaluate this tree
                if (goldYield.Count > maxGoldYield)
                {
                    skippedGuessTrees++;
                    continue;
                }
                // Only trees with equal yields can be evaluated
                if (goldYield.Count != guessYield.Count)
                {
                    pwOut.Printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.Count, guessYield.Count, goldLineId, guessLineId);
                    skippedGuessTrees++;
                    continue;
                }
                Tree evalGuess = tc.TransformTree(guessTree);
                Tree evalGold  = tc.TransformTree(goldTree);
                metric.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
            }
            if (guessItr.MoveNext() || goldItr.MoveNext())
            {
                System.Console.Error.Printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId);
            }
            pwOut.Println("================================================================================");
            if (skippedGuessTrees != 0)
            {
                pwOut.Printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
            }
            metric.Display(true, pwOut);
            pwOut.Println();
            pwOut.Close();
        }
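
The LP/LR figures the metric prints are labeled precision and recall. Independently of the evaluator classes used above, the sketch below shows how such counts reduce to a single F1 score.

        // Labeled precision, recall, and F1 from raw counts; illustration only.
        public static double LabeledF1(int correct, int guessed, int gold)
        {
            double precision = (guessed == 0) ? 0.0 : (double)correct / guessed;
            double recall = (gold == 0) ? 0.0 : (double)correct / gold;
            return (precision + recall == 0.0) ? 0.0 : 2.0 * precision * recall / (precision + recall);
        }
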
コード例 #28
        /// <summary>Lets you test out the TreeAnnotatorAndBinarizer on the command line.</summary>
        /// <param name="args">
        /// Command line arguments: All flags accepted by FactoredParser.setOptionFlag
        /// and -train treebankPath [fileRanges]
        /// </param>
        public static void Main(string[] args)
        {
            Options     op           = new Options();
            string      treebankPath = null;
            IFileFilter trainFilter  = null;
            int         i            = 0;

            while (i < args.Length && args[i].StartsWith("-"))
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-train"))
                {
                    int numSubArgs = NumSubArgs(args, i);
                    i++;
                    if (numSubArgs >= 1)
                    {
                        treebankPath = args[i];
                        i++;
                    }
                    else
                    {
                        throw new Exception("Error: -train option must have treebankPath as first argument.");
                    }
                    if (numSubArgs == 2)
                    {
                        trainFilter = new NumberRangesFileFilter(args[i++], true);
                    }
                    else
                    {
                        if (numSubArgs >= 3)
                        {
                            int low  = System.Convert.ToInt32(args[i]);
                            int high = System.Convert.ToInt32(args[i + 1]);
                            trainFilter = new NumberRangeFileFilter(low, high, true);
                            i          += 2;
                        }
                    }
                }
                else
                {
                    i = op.SetOption(args, i);
                }
            }
            if (i < args.Length)
            {
                log.Info("usage: java TreeAnnotatorAndBinarizer options*");
                log.Info("  Options are like for lexicalized parser including -train treebankPath fileRange]");
                return;
            }
            log.Info("Annotating from treebank dir: " + treebankPath);
            Treebank trainTreebank = op.tlpParams.DiskTreebank();

            if (trainFilter == null)
            {
                trainTreebank.LoadPath(treebankPath);
            }
            else
            {
                trainTreebank.LoadPath(treebankPath, trainFilter);
            }
            Treebank           binaryTrainTreebank = GetAnnotatedBinaryTreebankFromTreebank(trainTreebank, null, null, op).First();
            IEnumerator <Tree> it = trainTreebank.GetEnumerator();

            foreach (Tree t in binaryTrainTreebank)
            {
                System.Console.Out.WriteLine("Original tree:");
                it.MoveNext();  // advance the parallel iterator over the original (unbinarized) trees
                it.Current.PennPrint();
                System.Console.Out.WriteLine("Binarized tree:");
                t.PennPrint();
                System.Console.Out.WriteLine();
            }
        }
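
The -train handling above accepts either a single range string or an explicit low/high pair. The sketch below shows both forms as hypothetical argument vectors; the treebank path is a placeholder.

        // "-train <path> 200-2199" -> new NumberRangesFileFilter("200-2199", true)
        // "-train <path> 200 2199" -> new NumberRangeFileFilter(200, 2199, true)
        public static string[][] ExampleTrainArgs()
        {
            return new string[][]
            {
                new string[] { "-train", "/path/to/treebank", "200-2199" },
                new string[] { "-train", "/path/to/treebank", "200", "2199" }
            };
        }
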
コード例 #29
        //  private static String stripTag(String tag) {
        //    if (tag.startsWith("DT")) {
        //      String newTag = tag.substring(2, tag.length());
        //      return newTag.length() > 0 ? newTag : tag;
        //    }
        //    return tag;
        //  }
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length != 3)
            {
                System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
                System.Environment.Exit(-1);
            }
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;

            if (language.Equals(Language.Arabic))
            {
                string[] options = new string[] { "-arabicFactored" };
                tlpp.SetOptionFlag(options, 0);
            }
            else
            {
                string[] options = new string[] { "-frenchFactored" };
                tlpp.SetOptionFlag(options, 0);
            }
            Treebank tb = tlpp.DiskTreebank();

            tb.LoadPath(args[1]);
            MorphoFeatureSpecification morphoSpec = language.Equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();

            string[] features = args[2].Trim().Split(",");
            foreach (string feature in features)
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
            }
            // Counters
            ICounter <string> wordTagCounter  = new ClassicCounter <string>(30000);
            ICounter <string> morphTagCounter = new ClassicCounter <string>(500);
            //    Counter<String> signatureTagCounter = new ClassicCounter<String>();
            ICounter <string> morphCounter           = new ClassicCounter <string>(500);
            ICounter <string> wordCounter            = new ClassicCounter <string>(30000);
            ICounter <string> tagCounter             = new ClassicCounter <string>(300);
            ICounter <string> lemmaCounter           = new ClassicCounter <string>(25000);
            ICounter <string> lemmaTagCounter        = new ClassicCounter <string>(25000);
            ICounter <string> richTagCounter         = new ClassicCounter <string>(1000);
            ICounter <string> reducedTagCounter      = new ClassicCounter <string>(500);
            ICounter <string> reducedTagLemmaCounter = new ClassicCounter <string>(500);
            IDictionary <string, ICollection <string> > wordLemmaMap           = Generics.NewHashMap();
            TwoDimensionalIntCounter <string, string>   lemmaReducedTagCounter = new TwoDimensionalIntCounter <string, string>(30000);
            TwoDimensionalIntCounter <string, string>   reducedTagTagCounter   = new TwoDimensionalIntCounter <string, string>(500);
            TwoDimensionalIntCounter <string, string>   tagReducedTagCounter   = new TwoDimensionalIntCounter <string, string>(300);
            int numTrees = 0;

            foreach (Tree tree in tb)
            {
                foreach (Tree subTree in tree)
                {
                    if (!subTree.IsLeaf())
                    {
                        tlpp.TransformTree(subTree, tree);
                    }
                }
                IList <ILabel> pretermList = tree.PreTerminalYield();
                IList <ILabel> yield       = tree.Yield();
                System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);
                int yieldLen = yield.Count;
                for (int i = 0; i < yieldLen; ++i)
                {
                    string tag   = pretermList[i].Value();
                    string word  = yield[i].Value();
                    string morph = ((CoreLabel)yield[i]).OriginalText();
                    // Note: if there is no lemma, then we use the surface form.
                    Pair <string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
                    string lemma   = lemmaTag.First();
                    string richTag = lemmaTag.Second();
                    // WSGDEBUG
                    if (tag.Contains("MW"))
                    {
                        lemma += "-MWE";
                    }
                    lemmaCounter.IncrementCount(lemma);
                    lemmaTagCounter.IncrementCount(lemma + tag);
                    richTagCounter.IncrementCount(richTag);
                    string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
                    reducedTagCounter.IncrementCount(reducedTag);
                    reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
                    wordTagCounter.IncrementCount(word + tag);
                    morphTagCounter.IncrementCount(morph + tag);
                    morphCounter.IncrementCount(morph);
                    wordCounter.IncrementCount(word);
                    tagCounter.IncrementCount(tag);
                    reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;
                    if (wordLemmaMap.ContainsKey(word))
                    {
                        wordLemmaMap[word].Add(lemma);
                    }
                    else
                    {
                        ICollection <string> lemmas = Generics.NewHashSet(1);
                        lemmas.Add(lemma);  // record the first lemma seen for this word
                        wordLemmaMap[word] = lemmas;
                    }
                    lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
                    reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
                    tagReducedTagCounter.IncrementCount(tag, reducedTag);
                }
                ++numTrees;
            }
            // Barf...
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.Printf("#trees:\t%d%n", numTrees);
            System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
            System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
            System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
            System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
            System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
            System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);
            // Extra
            System.Console.Out.WriteLine("==================");
            StringBuilder sbNoLemma    = new StringBuilder();
            StringBuilder sbMultLemmas = new StringBuilder();

            foreach (KeyValuePair <string, ICollection <string> > wordLemmas in wordLemmaMap)
            {
                string word = wordLemmas.Key;
                ICollection <string> lemmas = wordLemmas.Value;
                if (lemmas.Count == 0)
                {
                    sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n");
                    continue;
                }
                if (lemmas.Count > 1)
                {
                    sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n");
                    continue;
                }
                // lemmas has exactly one element here; advance the enumerator before reading Current.
                IEnumerator<string> lemmaItr = lemmas.GetEnumerator();
                lemmaItr.MoveNext();
                string lemma = lemmaItr.Current;
                ICollection <string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet();
                if (reducedTags.Count > 1)
                {
                    System.Console.Out.Printf("%s --> %s%n", word, lemma);
                    foreach (string reducedTag in reducedTags)
                    {
                        int    count   = lemmaReducedTagCounter.GetCount(lemma, reducedTag);
                        string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet());
                        System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
                    }
                    System.Console.Out.WriteLine();
                }
            }
            System.Console.Out.WriteLine("==================");
            System.Console.Out.WriteLine(sbNoLemma.ToString());
            System.Console.Out.WriteLine(sbMultLemmas.ToString());
            System.Console.Out.WriteLine("==================");
            IList <string> tags = new List <string>(tagReducedTagCounter.FirstKeySet());

            tags.Sort();
            foreach (string tag_1 in tags)
            {
                System.Console.Out.WriteLine(tag_1);
                ICollection <string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
                foreach (string reducedTag in reducedTags)
                {
                    int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
                    //        reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
                    System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
                }
                System.Console.Out.WriteLine();
            }
            System.Console.Out.WriteLine("==================");
        }
コード例 #30
        /// <summary>Run the Evalb scoring metric on guess/gold input.</summary>
        /// <remarks>Run the Evalb scoring metric on guess/gold input. The default language is English.</remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            Properties options             = StringUtils.ArgsToProperties(args, OptionArgDefs());
            Language   language            = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
            ITreebankLangParserParams tlpp = language.@params;
            int  maxGoldYield      = PropertiesUtils.GetInt(options, "y", int.MaxValue);
            bool Verbose           = PropertiesUtils.GetBool(options, "v", false);
            bool sortByF1          = PropertiesUtils.HasProperty(options, "s");
            int  worstKTreesToEmit = PropertiesUtils.GetInt(options, "s", 0);
            PriorityQueue <Triple <double, Tree, Tree> > queue = sortByF1 ? new PriorityQueue <Triple <double, Tree, Tree> >(2000, new Evalb.F1Comparator()) : null;
            bool   doCatLevel = PropertiesUtils.GetBool(options, "c", false);
            string labelRegex = options.GetProperty("f", null);
            string encoding   = options.GetProperty("e", "UTF-8");

            // Regex split on whitespace, matching the Java String.split("\\s+") behavior.
            string[] parsedArgs = System.Text.RegularExpressions.Regex.Split(options.GetProperty(string.Empty, string.Empty), "\\s+");
            if (parsedArgs.Length != minArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            string goldFile  = parsedArgs[0];
            string guessFile = parsedArgs[1];

            // Command-line has been parsed. Configure the metric for evaluation.
            tlpp.SetInputEncoding(encoding);
            PrintWriter pwOut         = tlpp.Pw();
            Treebank    guessTreebank = tlpp.DiskTreebank();

            guessTreebank.LoadPath(guessFile);
            pwOut.Println("GUESS TREEBANK:");
            pwOut.Println(guessTreebank.TextualSummary());
            Treebank goldTreebank = tlpp.DiskTreebank();

            goldTreebank.LoadPath(goldFile);
            pwOut.Println("GOLD TREEBANK:");
            pwOut.Println(goldTreebank.TextualSummary());
            Evalb            metric   = new Evalb("Evalb LP/LR", true);
            EvalbByCat       evalbCat = (doCatLevel) ? new EvalbByCat("EvalbByCat LP/LR", true, labelRegex) : null;
            ITreeTransformer tc       = tlpp.Collinizer();
            //The evalb ref implementation assigns status for each tree pair as follows:
            //
            //   0 - Ok (yields match)
            //   1 - length mismatch
            //   2 - null parse e.g. (()).
            //
            //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
            IEnumerator <Tree> goldItr  = goldTreebank.GetEnumerator();
            IEnumerator <Tree> guessItr = guessTreebank.GetEnumerator();
            int goldLineId        = 0;
            int guessLineId       = 0;
            int skippedGuessTrees = 0;

            while (guessItr.MoveNext() && goldItr.MoveNext())
            {
                Tree           guessTree  = guessItr.Current;
                IList <ILabel> guessYield = guessTree.Yield();
                guessLineId++;
                Tree           goldTree  = goldItr.Current;
                IList <ILabel> goldYield = goldTree.Yield();
                goldLineId++;
                // Check that we should evaluate this tree
                if (goldYield.Count > maxGoldYield)
                {
                    skippedGuessTrees++;
                    continue;
                }
                // Only trees with equal yields can be evaluated
                if (goldYield.Count != guessYield.Count)
                {
                    pwOut.Printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.Count, guessYield.Count, goldLineId, guessLineId);
                    skippedGuessTrees++;
                    continue;
                }
                Tree evalGuess = tc.TransformTree(guessTree);
                Tree evalGold  = tc.TransformTree(goldTree);
                metric.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
                if (doCatLevel)
                {
                    evalbCat.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
                }
                if (sortByF1)
                {
                    StoreTrees(queue, guessTree, goldTree, metric.GetLastF1());
                }
            }
            if (guessItr.MoveNext() || goldItr.MoveNext())
            {
                System.Console.Error.Printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId);
            }
            pwOut.Println("================================================================================");
            if (skippedGuessTrees != 0)
            {
                pwOut.Printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
            }
            metric.Display(true, pwOut);
            pwOut.Println();
            if (doCatLevel)
            {
                evalbCat.Display(true, pwOut);
                pwOut.Println();
            }
            if (sortByF1)
            {
                EmitSortedTrees(queue, worstKTreesToEmit, guessFile);
            }
            pwOut.Close();
        }
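
Finally, a hypothetical invocation assembled from the options parsed above; the file names are placeholders, and the gold file precedes the guess file as in the code.

        // Hypothetical arguments for the Evalb entry point above.
        public static string[] ExampleEvalbArgs()
        {
            return new string[]
            {
                "-l", "English",         // treebank language (English is the default)
                "-y", "100",             // skip gold trees with more than 100 tokens
                "-c",                    // also report per-category scores via EvalbByCat
                "-s", "10",              // emit the 10 worst guess trees ranked by F1
                "gold.mrg", "guess.mrg"  // gold file first, then guess file
            };
        }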