Пример #1
0
 /// <summary>Creates a processor that simply remembers the collaborators it is given.</summary>
 /// <param name="cacher">Stored in the <c>cacher</c> member.</param>
 /// <param name="parser">Stored in the <c>parser</c> member.</param>
 /// <param name="dvKBest">Stored in the <c>dvKBest</c> member.</param>
 /// <param name="transformer">Stored in the <c>transformer</c> member.</param>
 public CacheProcessor(CacheParseHypotheses cacher, LexicalizedParser parser, int dvKBest, ITreeTransformer transformer)
 {
     // No validation or derived state: each argument is copied to the member of the same name.
     this.cacher = cacher;
     this.parser = parser;
     this.dvKBest = dvKBest;
     this.transformer = transformer;
 }
 /// <summary>
 /// Creates a query bound to the given reranker, building its transformer and scorer
 /// from the reranker's <c>op</c> and <c>model</c> members.
 /// </summary>
 /// <param name="_enclosing">The reranker this query belongs to.</param>
 public Query(DVModelReranker _enclosing)
 {
     this._enclosing = _enclosing;

     // The parameter and the member now refer to the same reranker, so the
     // parameter is used directly below.
     this.transformer = LexicalizedParser.BuildTrainTransformer(_enclosing.op);
     this.scorer = new DVParserCostAndGradient(null, null, _enclosing.model, _enclosing.op);
     this.deepTrees = Generics.NewArrayList();
 }
Пример #3
0
        /// <summary>
        /// Configures the parser options for Arabic, builds parser data from the training
        /// treebank, creates the parsers and finally parses <paramref name="inputStream"/>.
        /// </summary>
        /// <param name="trainTreebankFile">Path that is loaded into the training treebank.</param>
        /// <param name="testTreebankFile">Not referenced by this method.</param>
        /// <param name="inputStream">Input handed to <c>Parse</c>.</param>
        /// <returns>The result of <c>Parse(inputStream)</c>.</returns>
        public virtual bool Run(File trainTreebankFile, File testTreebankFile, InputStream inputStream)
        {
            op           = new Options();
            op.tlpParams = new ArabicTreebankParserParams();
            op.SetOptions("-arabicFactored");
            op.testOptions.maxLength = maxSentLen;
            op.testOptions.MaxItems  = 5000000;
            //500000 is the default for Arabic, but we have substantially more edges now
            op.testOptions.outputFormatOptions = "removeTopBracket,includePunctuationDependencies";
            // WSG: Just set this to some high value so that extractBestParse()
            // actually calls the lattice reader (e.g., this says that we can't have a word longer than
            // 80 characters...seems sensible for Arabic
            op.testOptions.maxSpanForTags = 80;
            treePrint           = op.testOptions.TreePrint(op.tlpParams);
            debinarizer         = new Debinarizer(op.forceCNF, new CategoryWordTagFactory());
            subcategoryStripper = op.tlpParams.SubcategoryStripper();
            Timing.StartTime();
            Treebank trainTreebank = op.tlpParams.DiskTreebank();

            trainTreebank.LoadPath(trainTreebankFile);
            lp = GetParserDataFromTreebank(trainTreebank);
            MakeParsers();
            if (Verbose)
            {
                op.Display();
                // Both arms of each conditional must be strings: C# has no common type for
                // "number or empty string", and int.ToString(value) is not a valid call.
                string lexNumRules = (pparser != null) ? lp.lex.NumRules().ToString() : string.Empty;
                log.Info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
                log.Info("Grammar\t" + lp.stateIndex.Size() + '\t' + lp.tagIndex.Size() + '\t' + lp.wordIndex.Size() + '\t'
                         + (pparser != null ? lp.ug.NumRules().ToString() : string.Empty) + '\t'
                         + (pparser != null ? lp.bg.NumRules().ToString() : string.Empty) + '\t'
                         + lexNumRules);
                log.Info("ParserPack is " + op.tlpParams.GetType().FullName);
                log.Info("Lexicon is " + lp.lex.GetType().FullName);
            }
            return(Parse(inputStream));
        }
Пример #4
0
        /// <summary>
        /// Transforms a tree with the given collinizer and collects one
        /// <see cref="Edu.Stanford.Nlp.Trees.Constituent"/> per retained node, for PARSEVAL evaluation.
        /// </summary>
        /// <remarks>
        /// Notes on this particular parseval:
        /// <list type="bullet">
        /// <item><description>It is character-based, which allows it to be used on segmentation/parsing combination evaluation.</description></item>
        /// <item><description>Whether the bracketings are labeled or unlabeled depends on <paramref name="labelConstituents"/>.</description></item>
        /// </list>
        /// (Original author's note: not yet checked rigorously against the PARSEVAL definition -- Roger.)
        /// </remarks>
        /// <param name="t">The tree to evaluate.</param>
        /// <param name="collinizer">Transformer applied to <paramref name="t"/> before spans are collected.</param>
        /// <param name="labelConstituents">
        /// If true, <c>LabeledConstituent</c>s carrying the node label are produced; otherwise <c>SimpleConstituent</c>s.
        /// </param>
        /// <returns>The collected constituents; empty when the transformed tree is null.</returns>
        public static ICollection <Constituent> ParsevalObjectify(Tree t, ITreeTransformer collinizer, bool labelConstituents)
        {
            ICollection <Constituent> result = new List <Constituent>();
            Tree root = collinizer.TransformTree(t);

            if (root == null)
            {
                return result;
            }

            foreach (Tree candidate in root)
            {
                // Leaves and preterminals never produce a span.
                bool lexicalLevel = candidate.IsLeaf() || candidate.IsPreTerminal();
                if (lexicalLevel)
                {
                    continue;
                }

                // Skip any non-root node for which the root reports no parent.
                bool noParentUnderRoot = candidate != root && candidate.Parent(root) == null;
                if (noParentUnderRoot)
                {
                    continue;
                }

                int start = root.LeftCharEdge(candidate);
                int end   = root.RightCharEdge(candidate);

                if (!labelConstituents)
                {
                    result.Add(new SimpleConstituent(start, end));
                }
                else
                {
                    result.Add(new LabeledConstituent(start, end, candidate.Label()));
                }
            }

            return result;
        }
 /// <summary>
 /// Creates a coordination transformer that uses the given head finder and owns a
 /// <c>QPTreeTransformer</c> configured with the same MWE setting.
 /// </summary>
 /// <param name="hf">The head finder.</param>
 /// <param name="performMWETransformation">
 /// Parameter for backwards compatibility.
 /// If set to false, multi-word expressions won't be attached to a new "MWE" node.
 /// </param>
 public CoordinationTransformer(IHeadFinder hf, bool performMWETransformation)
 {
     // NOTE(review): the comments that used to sit here ("to get rid of unwanted nodes and tag",
     // "to flatten date patterns", "to restructure the QP constituents", "default constructor")
     // look like field comments displaced by the Java-to-C# conversion -- confirm against the fields.
     this.headFinder = hf;
     this.performMWETransformation = performMWETransformation;

     // The QP transformer receives the same MWE flag as this instance.
     qp = new QPTreeTransformer(performMWETransformation);
 }
Пример #6
0
        /// <summary>
        /// Builds a new <c>MemoryTreebank</c> holding the result of applying
        /// <paramref name="treeTrans"/> to every tree of this treebank.
        /// </summary>
        /// <remarks>
        /// This treebank itself is unchanged, assuming the transformer correctly
        /// leaves its input trees alone.
        /// </remarks>
        /// <param name="treeTrans">The TreeTransformer to use.</param>
        /// <returns>The newly created treebank containing the transformed trees.</returns>
        public override Treebank Transform(ITreeTransformer treeTrans)
        {
            // Created with this treebank's Count and TreeReaderFactory().
            Treebank transformed = new Edu.Stanford.Nlp.Trees.MemoryTreebank(Count, TreeReaderFactory());

            foreach (Tree original in this)
            {
                Tree converted = treeTrans.TransformTree(original);
                transformed.Add(converted);
            }

            return transformed;
        }
 /// <summary>
 /// Creates a query bound to the given combined reranker, with one
 /// <c>DVParserCostAndGradient</c> scorer per entry of the reranker's <c>models</c>.
 /// </summary>
 /// <param name="_enclosing">The reranker this query belongs to.</param>
 public Query(CombinedDVModelReranker _enclosing)
 {
     this._enclosing = _enclosing;

     // The parameter and the member now refer to the same reranker, so the
     // parameter is used directly below.
     this.transformer = LexicalizedParser.BuildTrainTransformer(_enclosing.op);
     this.scorers = Generics.NewArrayList();

     foreach (DVModel dvModel in _enclosing.models)
     {
         DVParserCostAndGradient scorer = new DVParserCostAndGradient(null, null, dvModel, _enclosing.op);
         this.scorers.Add(scorer);
     }
 }
Пример #8
0
        /// <summary>Loads a treebank from the first argument and prints it under several transformations.</summary>
        /// <remarks>
        /// Just a demonstration of functionality. The path in <c>args[0]</c> is loaded into both a
        /// <c>DiskTreebank</c> and a <c>MemoryTreebank</c>, the two are combined into a
        /// <c>CompositeTreebank</c>, and the composite is printed after being transformed in three ways:
        /// chained <c>Transform</c> calls, nested <c>TransformingTreebank</c> constructors, and a single
        /// <c>CompositeTreeTransformer</c>. <br />
        /// <code>usage: TransformingTreebank treebankFilesPath</code>
        /// When no path is supplied, a usage line is written to standard error and the method returns.
        /// </remarks>
        /// <param name="args">array of command-line arguments; <c>args[0]</c> is the treebank path</param>
        public static void Main(string[] args)
        {
            // Without this guard a missing argument surfaces as an IndexOutOfRangeException at args[0].
            if (args == null || args.Length == 0)
            {
                System.Console.Error.WriteLine("usage: TransformingTreebank treebankFilesPath");
                return;
            }

            Timing.StartTime();
            Treebank treebank  = new DiskTreebank(null);
            Treebank treebank2 = new MemoryTreebank(null);

            treebank.LoadPath(args[0]);
            treebank2.LoadPath(args[0]);
            CompositeTreebank c = new CompositeTreebank(treebank, treebank2);

            Timing.EndTime();
            ITreeTransformer myTransformer  = new TransformingTreebank.MyTreeTransformer();
            ITreeTransformer myTransformer2 = new TransformingTreebank.MyTreeTransformer2();
            ITreeTransformer myTransformer3 = new TransformingTreebank.MyTreeTransformer3();

            // Variant 1: chained Transform() calls.
            Treebank         tf1            = c.Transform(myTransformer).Transform(myTransformer2).Transform(myTransformer3);
            // Variant 2: nested TransformingTreebank constructors.
            Treebank         tf2            = new Edu.Stanford.Nlp.Trees.TransformingTreebank(new Edu.Stanford.Nlp.Trees.TransformingTreebank(new Edu.Stanford.Nlp.Trees.TransformingTreebank(c, myTransformer), myTransformer2), myTransformer3);

            // Variant 3: one composite transformer wrapping all three.
            ITreeTransformer[] tta = new ITreeTransformer[] { myTransformer, myTransformer2, myTransformer3 };
            ITreeTransformer   tt3 = new CompositeTreeTransformer(Arrays.AsList(tta));
            Treebank           tf3 = c.Transform(tt3);

            System.Console.Out.WriteLine("-------------------------");
            System.Console.Out.WriteLine("COMPOSITE (DISK THEN MEMORY REPEATED VERSION OF) INPUT TREEBANK");
            System.Console.Out.WriteLine(c);
            System.Console.Out.WriteLine("-------------------------");
            System.Console.Out.WriteLine("SLOWLY TRANSFORMED TREEBANK, USING TransformingTreebank() CONSTRUCTOR");
            Treebank tx1 = new Edu.Stanford.Nlp.Trees.TransformingTreebank(c, myTransformer);

            System.Console.Out.WriteLine(tx1);
            System.Console.Out.WriteLine("-----");
            Treebank tx2 = new Edu.Stanford.Nlp.Trees.TransformingTreebank(tx1, myTransformer2);

            System.Console.Out.WriteLine(tx2);
            System.Console.Out.WriteLine("-----");
            Treebank tx3 = new Edu.Stanford.Nlp.Trees.TransformingTreebank(tx2, myTransformer3);

            System.Console.Out.WriteLine(tx3);
            System.Console.Out.WriteLine("-------------------------");
            System.Console.Out.WriteLine("TRANSFORMED TREEBANK, USING Treebank.transform()");
            System.Console.Out.WriteLine(tf1);
            System.Console.Out.WriteLine("-------------------------");
            System.Console.Out.WriteLine("PRINTING AGAIN TRANSFORMED TREEBANK, USING Treebank.transform()");
            System.Console.Out.WriteLine(tf1);
            System.Console.Out.WriteLine("-------------------------");
            System.Console.Out.WriteLine("TRANSFORMED TREEBANK, USING TransformingTreebank() CONSTRUCTOR");
            System.Console.Out.WriteLine(tf2);
            System.Console.Out.WriteLine("-------------------------");
            System.Console.Out.WriteLine("TRANSFORMED TREEBANK, USING CompositeTreeTransformer");
            System.Console.Out.WriteLine(tf3);
            System.Console.Out.WriteLine("-------------------------");
            System.Console.Out.WriteLine("COMPOSITE (DISK THEN MEMORY REPEATED VERSION OF) INPUT TREEBANK");
            System.Console.Out.WriteLine(c);
            System.Console.Out.WriteLine("-------------------------");
        }
 /// <summary>
 /// Configures <paramref name="op"/> for annotation and builds the tree transformer,
 /// un-transformer and collinizer from it.
 /// </summary>
 /// <param name="op">Options object; it is modified by this constructor and then stored.</param>
 /// <param name="treebankRoot">Passed to <c>ParentAnnotationStats.GetEnglishSplitCategories</c>.</param>
 public TreebankAnnotator(Options op, string treebankRoot)
 {
     // CDM: Aug 2004: With new implementation of treebank split categories,
     // I've hardwired this to load English ones.  Otherwise need training data.
     op.trainOptions.splitters = ParentAnnotationStats.GetEnglishSplitCategories(treebankRoot);
     op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(op.tlpParams.SisterSplitters()));

     // The options are set before op.forceCNF and op.trainOptions are read below.
     op.SetOptions("-acl03pcfg", "-cnf");

     bool outsideFactor = op.trainOptions.OutsideFactor();
     treeTransformer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !outsideFactor, true, op);
     treeUnTransformer = new Debinarizer(op.forceCNF);
     collinizer = op.tlpParams.Collinizer();
     this.op = op;
 }
 /// <summary>Creates a record by storing every argument in the member of the same name.</summary>
 private TaggedFileRecord(
     string file,
     TaggedFileRecord.Format format,
     string encoding,
     string tagSeparator,
     ITreeTransformer treeTransformer,
     TreeNormalizer treeNormalizer,
     ITreeReaderFactory trf,
     NumberRangesFileFilter treeRange,
     IPredicate <Tree> treeFilter,
     int wordColumn,
     int tagColumn)
 {
     // NOTE(review): the comments that used to sit here ("represents a tokenized file separated
     // by text", "represents a tsv file such as a conll file", "represents a file in PTB format")
     // look like descriptions of the Format values displaced by the conversion -- confirm.
     this.file = file;
     this.format = format;
     this.encoding = encoding;
     this.tagSeparator = tagSeparator;
     this.treeTransformer = treeTransformer;
     this.treeNormalizer = treeNormalizer;
     this.treeRange = treeRange;
     this.treeFilter = treeFilter;
     this.wordColumn = wordColumn;
     this.tagColumn = tagColumn;
     this.trf = trf;
 }
 /// <summary>
 /// Creates a reader from the given record: copies its settings, loads the record's file into a
 /// <c>DiskTreebank</c>, obtains the treebank's enumerator and calls <c>FindNext()</c>.
 /// </summary>
 /// <param name="record">Source of the file name, reader factory, transformer, normalizer, filter, encoding and tree range.</param>
 public TreeTaggedFileReader(TaggedFileRecord record)
 {
     filename = record.file;
     // Fall back to a LabeledScoredTreeReaderFactory when the record supplies no factory.
     trf = record.trf ?? new LabeledScoredTreeReaderFactory();
     transformer = record.treeTransformer;
     normalizer = record.treeNormalizer;
     treeFilter = record.treeFilter;

     treebank = new DiskTreebank(trf, record.encoding);
     if (record.treeRange == null)
     {
         treebank.LoadPath(filename);
     }
     else
     {
         // A tree range, when present, is handed to LoadPath together with the file name.
         treebank.LoadPath(filename, record.treeRange);
     }

     treeIterator = treebank.GetEnumerator();
     FindNext();
 }
Пример #12
0
 /// <summary>
 /// Builds the annotator, binarizer, optional post-splitter and optional counters
 /// according to the training options in <paramref name="op"/>.
 /// </summary>
 /// <param name="annotationHF">Head finder given to the annotator.</param>
 /// <param name="binarizationHF">Head finder given to the binarizer.</param>
 /// <param name="tlpParams">Language parameters; also the source of the treebank language pack.</param>
 /// <param name="forceCNF">Stored in the <c>forceCNF</c> member.</param>
 /// <param name="insideFactor">Forwarded to the <c>TreeBinarizer</c> constructor.</param>
 /// <param name="doSubcategorization">
 /// If true a <c>TreeAnnotator</c> is used; otherwise a <c>TreeNullAnnotator</c>.
 /// </param>
 /// <param name="op">Options whose <c>trainOptions</c> drive the configuration.</param>
 public TreeAnnotatorAndBinarizer(IHeadFinder annotationHF, IHeadFinder binarizationHF, ITreebankLangParserParams tlpParams, bool forceCNF, bool insideFactor, bool doSubcategorization, Options op)
 {
     this.trainOptions = op.trainOptions;

     if (!doSubcategorization)
     {
         annotator = new TreeAnnotatorAndBinarizer.TreeNullAnnotator(annotationHF);
     }
     else
     {
         annotator = new TreeAnnotator(annotationHF, tlpParams, op);
     }

     binarizer = new TreeBinarizer(
         binarizationHF,
         tlpParams.TreebankLanguagePack(),
         insideFactor,
         trainOptions.markovFactor,
         trainOptions.markovOrder,
         trainOptions.CompactGrammar() > 0,
         trainOptions.CompactGrammar() > 1,
         trainOptions.HselCut,
         trainOptions.markFinalStates,
         trainOptions.simpleBinarizedLabels,
         trainOptions.noRebinarization);

     // Optional pieces are null when the corresponding training option is off.
     postSplitter = trainOptions.selectivePostSplit ? new PostSplitter(tlpParams, op) : null;

     this.tf = new LabeledScoredTreeFactory(new CategoryWordTagFactory());
     this.tlp = tlpParams.TreebankLanguagePack();
     this.forceCNF = forceCNF;

     annotatedRuleCounts = trainOptions.printAnnotatedRuleCounts ? new ClassicCounter <Tree>() : null;
     annotatedStateCounts = trainOptions.printAnnotatedStateCounts ? new ClassicCounter <string>() : null;
 }
 /// <summary>Build a CTBErrorCorrectingTreeNormalizer.</summary>
 /// <param name="splitNPTMP">Temporal annotation on NPs</param>
 /// <param name="splitPPTMP">Temporal annotation on PPs</param>
 /// <param name="splitXPTMP">Temporal annotation on any phrase marked in CTB</param>
 /// <param name="charTags">
 /// Whether you wish to push POS tags down on to the
 /// characters of a word (for unsegmented text)
 /// </param>
 /// <exception cref="System.InvalidOperationException">
 /// Thrown when <paramref name="charTags"/> is true and the tag extender cannot be created;
 /// the original failure is available as the inner exception.
 /// </exception>
 public CTBErrorCorrectingTreeNormalizer(bool splitNPTMP, bool splitPPTMP, bool splitXPTMP, bool charTags)
 {
     this.splitNPTMP = splitNPTMP;
     this.splitPPTMP = splitPPTMP;
     this.splitXPTMP = splitXPTMP;
     if (charTags)
     {
         try
         {
             // The extender is created reflectively from its type name.
             // NOTE(review): the name is still the Java-style lower-case package path -- confirm
             // that Sharpen.Runtime.GetType resolves it in this port.
             tagExtender = (ITreeTransformer)System.Activator.CreateInstance(Sharpen.Runtime.GetType("edu.stanford.nlp.trees.international.pennchinese.CharacterLevelTagExtender"));
         }
         catch (Exception e)
         {
             // System.Exception has no constructor taking only an exception, so wrap the
             // failure with a message and keep the cause as the inner exception.
             throw new InvalidOperationException("Could not create the CharacterLevelTagExtender tree transformer", e);
         }
     }
     else
     {
         tagExtender = null;
     }
 }
Пример #14
0
        /// <summary>
        /// Returns the set of dependencies in a tree, according to some
        /// <see cref="Edu.Stanford.Nlp.Trees.IDependencyTyper{T}"/>.
        /// </summary>
        /// <param name="t">The tree to analyse.</param>
        /// <param name="hf">Head finder forwarded to the helper.</param>
        /// <param name="collinizer">Transformer applied to <paramref name="t"/> before dependencies are collected.</param>
        /// <param name="typer">Dependency typer forwarded to the helper.</param>
        /// <returns>The collected dependencies; empty when the transformed tree is null.</returns>
        public static ICollection <E> DependencyObjectify <E>(Tree t, IHeadFinder hf, ITreeTransformer collinizer, IDependencyTyper <E> typer)
        {
            ICollection <E> dependencies = new List <E>();
            Tree collinized = collinizer.TransformTree(t);

            // Only a non-null transformed tree is handed to the helper, which fills the collection.
            if (collinized != null)
            {
                DependencyObjectifyHelper(collinized, collinized, hf, dependencies, typer);
            }

            return dependencies;
        }
        /// <summary>
        /// Parses a comma-separated description into a <c>TaggedFileRecord</c>.
        /// </summary>
        /// <remarks>
        /// The last comma-separated piece is the file; every earlier piece must be a
        /// <c>key=value</c> option. A description without any comma is used as the file itself,
        /// with the Text format and the encoding and tag separator taken from <paramref name="config"/>.
        /// Option keys are compared case-insensitively against this class's option-name constants.
        /// </remarks>
        /// <param name="config">Source of the default encoding and tag separator.</param>
        /// <param name="description">The description to parse.</param>
        /// <returns>The record built from the parsed options.</returns>
        /// <exception cref="System.ArgumentException">
        /// Thrown when an option is not of the form <c>key=value</c> or its key is unknown.
        /// </exception>
        public static Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord CreateRecord(Properties config, string description)
        {
            string[] segments = description.Split(",");
            if (segments.Length == 1)
            {
                return new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(description, TaggedFileRecord.Format.Text, GetEncoding(config), GetTagSeparator(config), null, null, null, null, null, null, null);
            }

            // Everything before the last segment is an option; the last segment is the file.
            int optionCount = segments.Length - 1;
            string[] options = new string[optionCount];
            System.Array.Copy(segments, 0, options, 0, optionCount);
            string file = segments[optionCount];

            TaggedFileRecord.Format format = TaggedFileRecord.Format.Text;
            string encoding = GetEncoding(config);
            string tagSeparator = GetTagSeparator(config);
            ITreeTransformer treeTransformer = null;
            TreeNormalizer treeNormalizer = null;
            ITreeReaderFactory trf = null;
            NumberRangesFileFilter treeRange = null;
            IPredicate <Tree> treeFilter = null;
            // NOTE(review): null assigned to int looks like a leftover from the Java Integer
            // originals; kept as-is because the constructor parameters are declared int.
            int wordColumn = null;
            int tagColumn = null;

            foreach (string option in options)
            {
                string[] keyValue = option.Split("=", 2);
                if (keyValue.Length != 2)
                {
                    throw new ArgumentException("TaggedFileRecord argument " + option + " has an unexpected number of =s");
                }

                string key = keyValue[0];
                string value = keyValue[1];

                if (Sharpen.Runtime.EqualsIgnoreCase(key, Format))
                {
                    format = TaggedFileRecord.Format.ValueOf(value);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, Encoding))
                {
                    encoding = value;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TagSeparator))
                {
                    tagSeparator = value;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeTransformer))
                {
                    treeTransformer = ReflectionLoading.LoadByReflection(value);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeNormalizer))
                {
                    treeNormalizer = ReflectionLoading.LoadByReflection(value);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeReader))
                {
                    trf = ReflectionLoading.LoadByReflection(value);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeRange))
                {
                    // Colons in the range are replaced by commas before the filter is built.
                    string range = value.ReplaceAll(":", ",");
                    treeRange = new NumberRangesFileFilter(range, true);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeFilter))
                {
                    treeFilter = ReflectionLoading.LoadByReflection(value);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, WordColumn))
                {
                    wordColumn = int.Parse(value);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TagColumn))
                {
                    tagColumn = int.Parse(value);
                }
                else
                {
                    throw new ArgumentException("TaggedFileRecord argument " + key + " is unknown");
                }
            }

            return new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(file, format, encoding, tagSeparator, treeTransformer, treeNormalizer, trf, treeRange, treeFilter, wordColumn, tagColumn);
        }
 /// <summary>
 /// Convenience overload: delegates to the static <c>GetTopParses</c> using this instance's
 /// <c>parser</c> and <c>op</c>, with progress output switched off.
 /// </summary>
 /// <param name="trees">The trees to process.</param>
 /// <param name="transformer">Transformer forwarded to the static overload.</param>
 /// <returns>The map produced by the static overload.</returns>
 internal virtual IdentityHashMap <Tree, IList <Tree> > GetTopParses(IList <Tree> trees, ITreeTransformer transformer)
 {
     const bool outputUpdates = false;

     return GetTopParses(parser, op, trees, transformer, outputUpdates);
 }
        /// <summary>
        /// Maps every tree in <paramref name="trees"/> to the list returned by
        /// <c>GetTopParsesForOneTree</c> for it, using <c>op.trainOptions.dvKBest</c>.
        /// </summary>
        /// <param name="parser">Parser forwarded to <c>GetTopParsesForOneTree</c>.</param>
        /// <param name="op">Options supplying <c>trainOptions.dvKBest</c>.</param>
        /// <param name="trees">The trees to process.</param>
        /// <param name="transformer">Transformer forwarded to <c>GetTopParsesForOneTree</c>.</param>
        /// <param name="outputUpdates">
        /// If true, progress is logged whenever the map size is a multiple of 10, and once at the end.
        /// </param>
        /// <returns>The map from each input tree to its parses.</returns>
        internal static IdentityHashMap <Tree, IList <Tree> > GetTopParses(LexicalizedParser parser, Options op, ICollection <Tree> trees, ITreeTransformer transformer, bool outputUpdates)
        {
            IdentityHashMap <Tree, IList <Tree> > result = new IdentityHashMap <Tree, IList <Tree> >();

            foreach (Tree gold in trees)
            {
                result[gold] = GetTopParsesForOneTree(parser, op.trainOptions.dvKBest, gold, transformer);

                if (!outputUpdates)
                {
                    continue;
                }

                // Progress is keyed on the map size, not on the number of trees iterated.
                if (result.Count % 10 == 0)
                {
                    log.Info("Processed " + result.Count + " trees");
                }
            }

            if (outputUpdates)
            {
                log.Info("Finished processing " + result.Count + " trees");
            }

            return result;
        }
        /// <summary>
        /// An example command line for training a new parser:
        /// <br />
        /// nohup java -mx6g edu.stanford.nlp.parser.dvparser.DVParser -cachedTrees /scr/nlp/data/dvparser/wsj/cached.wsj.train.simple.ser.gz -train -testTreebank  /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 -debugOutputFrequency 400 -nofilter -trainingThreads 5 -parser /u/nlp/data/lexparser/wsjPCFG.nocompact.simple.ser.gz -trainingIterations 40 -batchSize 25 -model /scr/nlp/data/dvparser/wsj/wsj.combine.v2.ser.gz -unkWord "*UNK*" -dvCombineCategories &gt; /scr/nlp/data/dvparser/wsj/wsj.combine.v2.out 2&gt;&amp;1 &amp;
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                Help();
                System.Environment.Exit(2);
            }
            log.Info("Running DVParser with arguments:");
            foreach (string arg in args)
            {
                log.Info("  " + arg);
            }
            log.Info();
            string         parserPath           = null;
            string         trainTreebankPath    = null;
            IFileFilter    trainTreebankFilter  = null;
            string         cachedTrainTreesPath = null;
            bool           runGradientCheck     = false;
            bool           runTraining          = false;
            string         testTreebankPath     = null;
            IFileFilter    testTreebankFilter   = null;
            string         initialModelPath     = null;
            string         modelPath            = null;
            bool           filter            = true;
            string         resultsRecordPath = null;
            IList <string> unusedArgs        = new List <string>();
            // These parameters can be null or 0 if the model was not
            // serialized with the new parameters.  Setting the options at the
            // command line will override these defaults.
            // TODO: if/when we integrate back into the main branch and
            // rebuild models, we can get rid of this
            IList <string> argsWithDefaults = new List <string>(Arrays.AsList(new string[] { "-wordVectorFile", Options.LexOptions.DefaultWordVectorFile, "-dvKBest", int.ToString(TrainOptions.DefaultKBest), "-batchSize", int.ToString(TrainOptions.DefaultBatchSize
                                                                                                                                                                                                                                          ), "-trainingIterations", int.ToString(TrainOptions.DefaultTrainingIterations), "-qnIterationsPerBatch", int.ToString(TrainOptions.DefaultQnIterationsPerBatch), "-regCost", double.ToString(TrainOptions.DefaultRegcost), "-learningRate", double
                                                                                             .ToString(TrainOptions.DefaultLearningRate), "-deltaMargin", double.ToString(TrainOptions.DefaultDeltaMargin), "-unknownNumberVector", "-unknownDashedWordVectors", "-unknownCapsVector", "-unknownchinesepercentvector", "-unknownchinesenumbervector"
                                                                                             , "-unknownchineseyearvector", "-unkWord", "*UNK*", "-transformMatrixType", "DIAGONAL", "-scalingForInit", double.ToString(TrainOptions.DefaultScalingForInit), "-trainWordVectors" }));

            Sharpen.Collections.AddAll(argsWithDefaults, Arrays.AsList(args));
            args = Sharpen.Collections.ToArray(argsWithDefaults, new string[argsWithDefaults.Count]);
            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parser"))
                {
                    parserPath = args[argIndex + 1];
                    argIndex  += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex           = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath   = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                        {
                            Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-treebank");
                            argIndex            = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                            trainTreebankPath   = treebankDescription.First();
                            trainTreebankFilter = treebankDescription.Second();
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-cachedTrees"))
                            {
                                cachedTrainTreesPath = args[argIndex + 1];
                                argIndex            += 2;
                            }
                            else
                            {
                                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-runGradientCheck"))
                                {
                                    runGradientCheck = true;
                                    argIndex++;
                                }
                                else
                                {
                                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train"))
                                    {
                                        runTraining = true;
                                        argIndex++;
                                    }
                                    else
                                    {
                                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
                                        {
                                            modelPath = args[argIndex + 1];
                                            argIndex += 2;
                                        }
                                        else
                                        {
                                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-nofilter"))
                                            {
                                                filter = false;
                                                argIndex++;
                                            }
                                            else
                                            {
                                                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-continueTraining"))
                                                {
                                                    runTraining      = true;
                                                    filter           = false;
                                                    initialModelPath = args[argIndex + 1];
                                                    argIndex        += 2;
                                                }
                                                else
                                                {
                                                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-resultsRecord"))
                                                    {
                                                        resultsRecordPath = args[argIndex + 1];
                                                        argIndex         += 2;
                                                    }
                                                    else
                                                    {
                                                        unusedArgs.Add(args[argIndex++]);
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            if (parserPath == null && modelPath == null)
            {
                throw new ArgumentException("Must supply either a base parser model with -parser or a serialized DVParser with -model");
            }
            if (!runTraining && modelPath == null && !runGradientCheck)
            {
                throw new ArgumentException("Need to either train a new model, run the gradient check or specify a model to load with -model");
            }
            string[] newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            Edu.Stanford.Nlp.Parser.Dvparser.DVParser dvparser = null;
            LexicalizedParser lexparser = null;

            if (initialModelPath != null)
            {
                lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(initialModelPath, newArgs));
                DVModel model = GetModelFromLexicalizedParser(lexparser);
                dvparser = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(model, lexparser);
            }
            else
            {
                if (runTraining || runGradientCheck)
                {
                    lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(parserPath, newArgs));
                    dvparser  = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(lexparser);
                }
                else
                {
                    if (modelPath != null)
                    {
                        lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
                        DVModel model = GetModelFromLexicalizedParser(lexparser);
                        dvparser = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(model, lexparser);
                    }
                }
            }
            IList <Tree> trainSentences = new List <Tree>();
            IdentityHashMap <Tree, byte[]> trainCompressedParses = Generics.NewIdentityHashMap();

            if (cachedTrainTreesPath != null)
            {
                foreach (string path in cachedTrainTreesPath.Split(","))
                {
                    IList <Pair <Tree, byte[]> > cache = IOUtils.ReadObjectFromFile(path);
                    foreach (Pair <Tree, byte[]> pair in cache)
                    {
                        trainSentences.Add(pair.First());
                        trainCompressedParses[pair.First()] = pair.Second();
                    }
                    log.Info("Read in " + cache.Count + " trees from " + path);
                }
            }
            if (trainTreebankPath != null)
            {
                // TODO: make the transformer a member of the model?
                ITreeTransformer transformer = BuildTrainTransformer(dvparser.GetOp());
                Treebank         treebank    = dvparser.GetOp().tlpParams.MemoryTreebank();
                treebank.LoadPath(trainTreebankPath, trainTreebankFilter);
                treebank = treebank.Transform(transformer);
                log.Info("Read in " + treebank.Count + " trees from " + trainTreebankPath);
                CacheParseHypotheses cacher = new CacheParseHypotheses(dvparser.parser);
                CacheParseHypotheses.CacheProcessor processor = new CacheParseHypotheses.CacheProcessor(cacher, lexparser, dvparser.op.trainOptions.dvKBest, transformer);
                foreach (Tree tree in treebank)
                {
                    trainSentences.Add(tree);
                    trainCompressedParses[tree] = processor.Process(tree).second;
                }
                //System.out.println(tree);
                log.Info("Finished parsing " + treebank.Count + " trees, getting " + dvparser.op.trainOptions.dvKBest + " hypotheses each");
            }
            if ((runTraining || runGradientCheck) && filter)
            {
                log.Info("Filtering rules for the given training set");
                dvparser.dvModel.SetRulesForTrainingSet(trainSentences, trainCompressedParses);
                log.Info("Done filtering rules; " + dvparser.dvModel.numBinaryMatrices + " binary matrices, " + dvparser.dvModel.numUnaryMatrices + " unary matrices, " + dvparser.dvModel.wordVectors.Count + " word vectors");
            }
            //dvparser.dvModel.printAllMatrices();
            Treebank testTreebank = null;

            if (testTreebankPath != null)
            {
                log.Info("Reading in trees from " + testTreebankPath);
                if (testTreebankFilter != null)
                {
                    log.Info("Filtering on " + testTreebankFilter);
                }
                testTreebank = dvparser.GetOp().tlpParams.MemoryTreebank();
                testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
                log.Info("Read in " + testTreebank.Count + " trees for testing");
            }
            //    runGradientCheck= true;
            if (runGradientCheck)
            {
                log.Info("Running gradient check on " + trainSentences.Count + " trees");
                dvparser.RunGradientCheck(trainSentences, trainCompressedParses);
            }
            if (runTraining)
            {
                log.Info("Training the RNN parser");
                log.Info("Current train options: " + dvparser.GetOp().trainOptions);
                dvparser.Train(trainSentences, trainCompressedParses, testTreebank, modelPath, resultsRecordPath);
                if (modelPath != null)
                {
                    dvparser.SaveModel(modelPath);
                }
            }
            if (testTreebankPath != null)
            {
                EvaluateTreebank evaluator = new EvaluateTreebank(dvparser.AttachModelToLexicalizedParser());
                evaluator.TestOnTreebank(testTreebank);
            }
            log.Info("Successfully ran DVParser");
        }
Пример #19
0
        /// <summary>Run the scoring metric on guess/gold input.</summary>
        /// <remarks>
        /// Run the scoring metric on guess/gold input. This method performs "Collinization."
        /// The default language is English.
        /// <para>
        /// Recognized options: <c>-l</c> (language), <c>-y</c> (maximum gold yield),
        /// <c>-v</c> (verbose output), <c>-c</c> (enables <c>doCatLevelEval</c>) and
        /// <c>-e</c> (input encoding, default "UTF-8"). Any other option prints the usage
        /// message and exits with status -1. The non-option arguments are the gold file
        /// followed by the guess file.
        /// </para>
        /// </remarks>
        /// <param name="args">Command-line options plus the gold and guess treebank paths.</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            int    maxGoldYield            = int.MaxValue;
            bool   Verbose   = false;
            string encoding  = "UTF-8";
            string guessFile = null;
            string goldFile  = null;
            IDictionary <string, string[]> argsMap = StringUtils.ArgsToMap(args, optionArgDefs);

            foreach (KeyValuePair <string, string[]> opt in argsMap)
            {
                // The null key holds the non-option arguments; they are handled after the loop.
                if (opt.Key == null)
                {
                    continue;
                }
                switch (opt.Key)
                {
                case "-l":
                {
                    Language lang = Language.ValueOf(opt.Value[0].Trim());
                    tlpp = lang.@params;
                    break;
                }

                case "-y":
                {
                    maxGoldYield = System.Convert.ToInt32(opt.Value[0].Trim());
                    break;
                }

                case "-v":
                {
                    Verbose = true;
                    break;
                }

                case "-c":
                {
                    Edu.Stanford.Nlp.Parser.Metrics.TaggingEval.doCatLevelEval = true;
                    break;
                }

                case "-e":
                {
                    encoding = opt.Value[0];
                    break;
                }

                default:
                {
                    log.Info(usage.ToString());
                    System.Environment.Exit(-1);
                    break;
                }
                }
            }
            // Non-option arguments located at key null.
            // This lookup must live outside the option loop: inside it, an invocation
            // without any option flag never reached this code and left both paths null.
            // NOTE(review): assumes the map returned by ArgsToMap supports a null-key lookup -- confirm.
            string[] rest = argsMap[null];
            if (rest == null || rest.Length < minArgs)
            {
                log.Info(usage.ToString());
                System.Environment.Exit(-1);
            }
            goldFile  = rest[0];
            guessFile = rest[1];
            tlpp.SetInputEncoding(encoding);
            PrintWriter pwOut         = tlpp.Pw();
            Treebank    guessTreebank = tlpp.DiskTreebank();

            guessTreebank.LoadPath(guessFile);
            pwOut.Println("GUESS TREEBANK:");
            pwOut.Println(guessTreebank.TextualSummary());
            Treebank goldTreebank = tlpp.DiskTreebank();

            goldTreebank.LoadPath(goldFile);
            pwOut.Println("GOLD TREEBANK:");
            pwOut.Println(goldTreebank.TextualSummary());
            Edu.Stanford.Nlp.Parser.Metrics.TaggingEval metric = new Edu.Stanford.Nlp.Parser.Metrics.TaggingEval("Tagging LP/LR");
            ITreeTransformer tc = tlpp.Collinizer();
            //The evalb ref implementation assigns status for each tree pair as follows:
            //
            //   0 - Ok (yields match)
            //   1 - length mismatch
            //   2 - null parse e.g. (()).
            //
            //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
            IEnumerator <Tree> goldItr  = goldTreebank.GetEnumerator();
            IEnumerator <Tree> guessItr = guessTreebank.GetEnumerator();
            int  goldLineId        = 0;
            int  guessLineId       = 0;
            int  skippedGuessTrees = 0;
            bool guessHasMore      = false;
            bool goldHasMore       = false;

            while (true)
            {
                // MoveNext() consumes an element, so remember both results here instead of
                // calling it again after the loop to detect a length mismatch.
                guessHasMore = guessItr.MoveNext();
                goldHasMore  = goldItr.MoveNext();
                if (!guessHasMore || !goldHasMore)
                {
                    break;
                }
                Tree           guessTree  = guessItr.Current;
                IList <ILabel> guessYield = guessTree.Yield();
                guessLineId++;
                Tree           goldTree  = goldItr.Current;
                IList <ILabel> goldYield = goldTree.Yield();
                goldLineId++;
                // Check that we should evaluate this tree
                if (goldYield.Count > maxGoldYield)
                {
                    skippedGuessTrees++;
                    continue;
                }
                // Only trees with equal yields can be evaluated
                if (goldYield.Count != guessYield.Count)
                {
                    pwOut.Printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.Count, guessYield.Count, goldLineId, guessLineId);
                    skippedGuessTrees++;
                    continue;
                }
                Tree evalGuess = tc.TransformTree(guessTree);
                Tree evalGold  = tc.TransformTree(goldTree);
                metric.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
            }
            // Exactly one side still having trees means the two files differ in length.
            if (guessHasMore || goldHasMore)
            {
                System.Console.Error.Printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId);
            }
            pwOut.Println("================================================================================");
            if (skippedGuessTrees != 0)
            {
                pwOut.Printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
            }
            metric.Display(true, pwOut);
            pwOut.Println();
            pwOut.Close();
        }
        /// <summary>Run the scoring metric on guess/gold input.</summary>
        /// <remarks>
        /// Run the scoring metric on guess/gold input. This method performs "Collinization."
        /// The default language is English.
        /// <para>
        /// Recognized options: <c>-l</c> (language), <c>-y</c> (maximum gold yield),
        /// <c>-g</c> (maximum guess yield), <c>-t</c> (tag mode) and <c>-v</c> (verbose output).
        /// Any other option prints the usage message and exits with status -1. The first
        /// non-option argument is the gold file and the one after it is the guess file.
        /// </para>
        /// </remarks>
        /// <param name="args">Command-line options followed by the gold and guess treebank paths.</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            int    maxGoldYield            = int.MaxValue;
            int    maxGuessYield           = int.MaxValue;
            bool   Verbose   = false;
            bool   skipGuess = false;
            bool   tagMode   = false;
            string guessFile = null;
            string goldFile  = null;

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].StartsWith("-"))
                {
                    switch (args[i])
                    {
                    case "-l":
                    {
                        Language lang = Language.ValueOf(args[++i].Trim());
                        tlpp = lang.@params;
                        break;
                    }

                    case "-y":
                    {
                        maxGoldYield = System.Convert.ToInt32(args[++i].Trim());
                        break;
                    }

                    case "-t":
                    {
                        tagMode = true;
                        break;
                    }

                    case "-v":
                    {
                        Verbose = true;
                        break;
                    }

                    case "-g":
                    {
                        maxGuessYield = System.Convert.ToInt32(args[++i].Trim());
                        skipGuess     = true;
                        break;
                    }

                    default:
                    {
                        System.Console.Out.WriteLine(usage.ToString());
                        System.Environment.Exit(-1);
                        break;
                    }
                    }
                }
                else
                {
                    //Required parameters
                    goldFile  = args[i++];
                    guessFile = args[i];
                    break;
                }
            }
            PrintWriter pwOut         = tlpp.Pw();
            Treebank    guessTreebank = tlpp.DiskTreebank();

            guessTreebank.LoadPath(guessFile);
            pwOut.Println("GUESS TREEBANK:");
            pwOut.Println(guessTreebank.TextualSummary());
            Treebank goldTreebank = tlpp.DiskTreebank();

            goldTreebank.LoadPath(goldFile);
            pwOut.Println("GOLD TREEBANK:");
            pwOut.Println(goldTreebank.TextualSummary());
            string evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";

            Edu.Stanford.Nlp.Parser.Metrics.TsarfatyEval eval = new Edu.Stanford.Nlp.Parser.Metrics.TsarfatyEval(evalName, tagMode);
            ITreeTransformer tc = tlpp.Collinizer();
            //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
            //don't match, we need to keep looking for the next gold tree that matches.
            //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
            //status as follows:
            //
            //   0 - Ok (yields match)
            //   1 - length mismatch
            //   2 - null parse e.g. (()).
            //
            //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
            IEnumerator <Tree> goldItr = goldTreebank.GetEnumerator();
            int goldLineId             = 0;
            int skippedGuessTrees      = 0;

            foreach (Tree guess in guessTreebank)
            {
                Tree          evalGuess  = tc.TransformTree(guess);
                List <ILabel> guessSent  = guess.Yield();
                string        guessChars = SentenceUtils.ListToString(guessSent).ReplaceAll("\\s+", string.Empty);
                if (guessSent.Count > maxGuessYield)
                {
                    skippedGuessTrees++;
                    continue;
                }
                bool doneEval = false;
                // Test doneEval BEFORE MoveNext(): MoveNext() advances the enumerator, so
                // checking it first would throw away one gold tree after every evaluated pair.
                while (!doneEval && goldItr.MoveNext())
                {
                    Tree gold     = goldItr.Current;
                    Tree evalGold = tc.TransformTree(gold);
                    goldLineId++;
                    List <ILabel> goldSent  = gold.Yield();
                    string        goldChars = SentenceUtils.ListToString(goldSent).ReplaceAll("\\s+", string.Empty);
                    if (goldSent.Count > maxGoldYield)
                    {
                        // Gold tree is too long: try the next gold tree for this guess.
                        continue;
                    }
                    else
                    {
                        if (goldChars.Length != guessChars.Length)
                        {
                            pwOut.Printf("Char level yield mismatch at line %d (guess: %d gold: %d)\n", goldLineId, guessChars.Length, goldChars.Length);
                            skippedGuessTrees++;
                            //Default evalb behavior -- skip this guess tree
                            break;
                        }
                    }
                    eval.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
                    //Move to the next guess parse
                    doneEval = true;
                }
            }
            pwOut.Println("================================================================================");
            if (skippedGuessTrees != 0)
            {
                pwOut.Printf("%s %d guess trees\n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
            }
            eval.Display(true, pwOut);
            pwOut.Println();
            pwOut.Close();
        }
Пример #21
0
 /// <summary>Creates a debinarizer and its helper objects.</summary>
 /// <param name="forceCNF">Flag stored unchanged in the <c>forceCNF</c> member.</param>
 /// <param name="lf">Label factory passed to the <c>LabeledScoredTreeFactory</c> kept in <c>tf</c>.</param>
 public Debinarizer(bool forceCNF, ILabelFactory lf)
 {
     this.forceCNF        = forceCNF;
     this.tf              = new LabeledScoredTreeFactory(lf);
     this.boundaryRemover = new BoundaryRemover();
 }
Пример #22
0
 /// <summary>
 /// Takes a Tree and a collinizer and returns a Collection of labeled
 /// <see cref="Edu.Stanford.Nlp.Trees.Constituent"/> objects for PARSEVAL.
 /// </summary>
 /// <remarks>
 /// Convenience overload: forwards to the three-argument overload with <c>true</c>
 /// as the last argument.
 /// </remarks>
 /// <param name="t">The tree to extract constituents from</param>
 /// <param name="collinizer">
 /// The TreeTransformer used to normalize the tree for evaluation
 /// </param>
 /// <returns>The bag of Constituents for PARSEVAL.</returns>
 public static ICollection <Constituent> ParsevalObjectify(Tree t, ITreeTransformer collinizer)
 {
     ICollection <Constituent> constituents = ParsevalObjectify(t, collinizer, true);

     return constituents;
 }
Пример #23
0
        /// <summary>Run the Evalb scoring metric on guess/gold input.</summary>
        /// <remarks>
        /// Run the Evalb scoring metric on guess/gold input. The default language is English.
        /// <para>
        /// Options read from the parsed properties: <c>l</c> (language), <c>y</c> (maximum gold
        /// yield), <c>v</c> (verbose output), <c>s</c> (collect trees by F1 and emit the given
        /// number of them), <c>c</c> (per-category evaluation), <c>f</c> (label regex handed to
        /// the per-category metric) and <c>e</c> (input encoding, default "UTF-8"). The
        /// remaining arguments must be exactly the gold file followed by the guess file;
        /// otherwise the usage message is logged and the process exits with status -1.
        /// </para>
        /// </remarks>
        /// <param name="args">Command-line options plus the gold and guess treebank paths.</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            Properties options             = StringUtils.ArgsToProperties(args, OptionArgDefs());
            Language   language            = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
            ITreebankLangParserParams tlpp = language.@params;
            int  maxGoldYield      = PropertiesUtils.GetInt(options, "y", int.MaxValue);
            bool Verbose           = PropertiesUtils.GetBool(options, "v", false);
            bool sortByF1          = PropertiesUtils.HasProperty(options, "s");
            int  worstKTreesToEmit = PropertiesUtils.GetInt(options, "s", 0);
            PriorityQueue <Triple <double, Tree, Tree> > queue = sortByF1 ? new PriorityQueue <Triple <double, Tree, Tree> >(2000, new Evalb.F1Comparator()) : null;
            bool   doCatLevel = PropertiesUtils.GetBool(options, "c", false);
            string labelRegex = options.GetProperty("f", null);
            string encoding   = options.GetProperty("e", "UTF-8");

            // "\\s+" is a regular expression. string.Split would treat it as a literal
            // separator and never split the two paths, so Regex.Split is required here.
            string[] parsedArgs = System.Text.RegularExpressions.Regex.Split(options.GetProperty(string.Empty, string.Empty), "\\s+");
            if (parsedArgs.Length != minArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            string goldFile  = parsedArgs[0];
            string guessFile = parsedArgs[1];

            // Command-line has been parsed. Configure the metric for evaluation.
            tlpp.SetInputEncoding(encoding);
            PrintWriter pwOut         = tlpp.Pw();
            Treebank    guessTreebank = tlpp.DiskTreebank();

            guessTreebank.LoadPath(guessFile);
            pwOut.Println("GUESS TREEBANK:");
            pwOut.Println(guessTreebank.TextualSummary());
            Treebank goldTreebank = tlpp.DiskTreebank();

            goldTreebank.LoadPath(goldFile);
            pwOut.Println("GOLD TREEBANK:");
            pwOut.Println(goldTreebank.TextualSummary());
            Evalb            metric   = new Evalb("Evalb LP/LR", true);
            EvalbByCat       evalbCat = (doCatLevel) ? new EvalbByCat("EvalbByCat LP/LR", true, labelRegex) : null;
            ITreeTransformer tc       = tlpp.Collinizer();
            //The evalb ref implementation assigns status for each tree pair as follows:
            //
            //   0 - Ok (yields match)
            //   1 - length mismatch
            //   2 - null parse e.g. (()).
            //
            //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
            IEnumerator <Tree> goldItr  = goldTreebank.GetEnumerator();
            IEnumerator <Tree> guessItr = guessTreebank.GetEnumerator();
            int  goldLineId        = 0;
            int  guessLineId       = 0;
            int  skippedGuessTrees = 0;
            bool guessHasMore      = false;
            bool goldHasMore       = false;

            while (true)
            {
                // MoveNext() consumes an element, so remember both results here instead of
                // calling it again after the loop to detect a length mismatch.
                guessHasMore = guessItr.MoveNext();
                goldHasMore  = goldItr.MoveNext();
                if (!guessHasMore || !goldHasMore)
                {
                    break;
                }
                Tree           guessTree  = guessItr.Current;
                IList <ILabel> guessYield = guessTree.Yield();
                guessLineId++;
                Tree           goldTree  = goldItr.Current;
                IList <ILabel> goldYield = goldTree.Yield();
                goldLineId++;
                // Check that we should evaluate this tree
                if (goldYield.Count > maxGoldYield)
                {
                    skippedGuessTrees++;
                    continue;
                }
                // Only trees with equal yields can be evaluated
                if (goldYield.Count != guessYield.Count)
                {
                    pwOut.Printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.Count, guessYield.Count, goldLineId, guessLineId);
                    skippedGuessTrees++;
                    continue;
                }
                Tree evalGuess = tc.TransformTree(guessTree);
                Tree evalGold  = tc.TransformTree(goldTree);
                metric.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
                if (doCatLevel)
                {
                    evalbCat.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
                }
                if (sortByF1)
                {
                    StoreTrees(queue, guessTree, goldTree, metric.GetLastF1());
                }
            }
            // Exactly one side still having trees means the two files differ in length.
            if (guessHasMore || goldHasMore)
            {
                System.Console.Error.Printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId);
            }
            pwOut.Println("================================================================================");
            if (skippedGuessTrees != 0)
            {
                pwOut.Printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
            }
            metric.Display(true, pwOut);
            pwOut.Println();
            if (doCatLevel)
            {
                evalbCat.Display(true, pwOut);
                pwOut.Println();
            }
            if (sortByF1)
            {
                EmitSortedTrees(queue, worstKTreesToEmit, guessFile);
            }
            pwOut.Close();
        }
 /// <summary>
 /// Creates a <see cref="TreeGenerator"/> that keeps the supplied transformer and logger.
 /// </summary>
 /// <param name="transformer">The transformer of a CST tree to a transformation schema.</param>
 /// <param name="logger">The syntax errors logger; optional, <c>null</c> when omitted.</param>
 public TreeGenerator(ITreeTransformer transformer, ILogger <ITreeGenerator> logger = null)
 {
     _transformer = transformer;
     _logger      = logger;
 }
 /// <summary>
 /// Stores the given transformer in the <c>threadUnsafe</c> member.
 /// </summary>
 /// <param name="threadUnsafe">
 /// The transformer to keep. NOTE(review): going by the names, this is presumably a
 /// transformer that is not safe for concurrent use -- confirm against the rest of the class.
 /// </param>
 public SynchronizedTreeTransformer(ITreeTransformer threadUnsafe)
 {
     this.threadUnsafe = threadUnsafe;
 }
        /// <summary>
        /// Reparses the words of <paramref name="tree"/> with <paramref name="parser"/> and
        /// collects the k-best PCFG parses.
        /// </summary>
        /// <param name="parser">Parser that supplies the query used for reparsing.</param>
        /// <param name="dvKBest">Number of best PCFG parses to request.</param>
        /// <param name="tree">Tree whose word yield (minus its last word) is reparsed.</param>
        /// <param name="transformer">Applied to every returned parse when not <c>null</c>.</param>
        /// <returns>
        /// The (optionally transformed) parses, or <c>null</c> when the tree yields at most
        /// one word or the parser fails on the sentence.
        /// </returns>
        public static IList <Tree> GetTopParsesForOneTree(LexicalizedParser parser, int dvKBest, Tree tree, ITreeTransformer transformer)
        {
            IParserQuery query = parser.ParserQuery();
            IList <Word> words = tree.YieldWords();

            if (words.Count <= 1)
            {
                return null;
            }
            // Since the trees are binarized and otherwise manipulated, we
            // need to chop off the last word in order to remove the end of
            // sentence symbol
            IList <Word> trimmed = words.SubList(0, words.Count - 1);
            bool         parsed  = query.Parse(trimmed);

            if (!parsed)
            {
                log.Info("Failed to use the given parser to reparse sentence \"" + trimmed + "\"");
                return null;
            }
            IList <Tree> hypotheses = new List <Tree>();

            foreach (ScoredObject <Tree> scored in query.GetKBestPCFGParses(dvKBest))
            {
                Tree candidate = scored.Object();
                hypotheses.Add(transformer != null ? transformer.TransformTree(candidate) : candidate);
            }
            return hypotheses;
        }
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            IDictionary <string, int> flagsToNumArgs = Generics.NewHashMap();

            flagsToNumArgs["-parser"]        = int.Parse(3);
            flagsToNumArgs["-lex"]           = int.Parse(3);
            flagsToNumArgs["-test"]          = int.Parse(2);
            flagsToNumArgs["-out"]           = int.Parse(1);
            flagsToNumArgs["-lengthPenalty"] = int.Parse(1);
            flagsToNumArgs["-penaltyType"]   = int.Parse(1);
            flagsToNumArgs["-maxLength"]     = int.Parse(1);
            flagsToNumArgs["-stats"]         = int.Parse(2);
            IDictionary <string, string[]> argMap = StringUtils.ArgsToMap(args, flagsToNumArgs);
            bool        eval = argMap.Contains("-eval");
            PrintWriter pw   = null;

            if (argMap.Contains("-out"))
            {
                pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream((argMap["-out"])[0]), "GB18030"), true);
            }
            log.Info("ChineseCharacterBasedLexicon called with args:");
            ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();

            for (int i = 0; i < args.Length; i++)
            {
                ctpp.SetOptionFlag(args, i);
                log.Info(" " + args[i]);
            }
            log.Info();
            Options op = new Options(ctpp);

            if (argMap.Contains("-stats"))
            {
                string[]       statArgs         = (argMap["-stats"]);
                MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
                IFileFilter    trainFilt        = new NumberRangesFileFilter(statArgs[1], false);
                rawTrainTreebank.LoadPath(new File(statArgs[0]), trainFilt);
                log.Info("Done reading trees.");
                MemoryTreebank trainTreebank;
                if (argMap.Contains("-annotate"))
                {
                    trainTreebank = new MemoryTreebank();
                    TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
                    foreach (Tree tree in rawTrainTreebank)
                    {
                        trainTreebank.Add(annotator.TransformTree(tree));
                    }
                    log.Info("Done annotating trees.");
                }
                else
                {
                    trainTreebank = rawTrainTreebank;
                }
                PrintStats(trainTreebank, pw);
                System.Environment.Exit(0);
            }
            int maxLength = 1000000;

            //    Test.verbose = true;
            if (argMap.Contains("-norm"))
            {
                op.testOptions.lengthNormalization = true;
            }
            if (argMap.Contains("-maxLength"))
            {
                maxLength = System.Convert.ToInt32((argMap["-maxLength"])[0]);
            }
            op.testOptions.maxLength = 120;
            bool combo = argMap.Contains("-combo");

            if (combo)
            {
                ctpp.useCharacterBasedLexicon = true;
                op.testOptions.maxSpanForTags = 10;
                op.doDep  = false;
                op.dcTags = false;
            }
            LexicalizedParser lp  = null;
            ILexicon          lex = null;

            if (argMap.Contains("-parser"))
            {
                string[] parserArgs = (argMap["-parser"]);
                if (parserArgs.Length > 1)
                {
                    IFileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
                    lp = LexicalizedParser.TrainFromTreebank(parserArgs[0], trainFilt, op);
                    if (parserArgs.Length == 3)
                    {
                        string filename = parserArgs[2];
                        log.Info("Writing parser in serialized format to file " + filename + " ");
                        System.Console.Error.Flush();
                        ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                        @out.WriteObject(lp);
                        @out.Close();
                        log.Info("done.");
                    }
                }
                else
                {
                    string parserFile = parserArgs[0];
                    lp = LexicalizedParser.LoadModel(parserFile, op);
                }
                lex  = lp.GetLexicon();
                op   = lp.GetOp();
                ctpp = (ChineseTreebankParserParams)op.tlpParams;
            }
            if (argMap.Contains("-rad"))
            {
                ctpp.useUnknownCharacterModel = true;
            }
            if (argMap.Contains("-lengthPenalty"))
            {
                ctpp.lengthPenalty = double.Parse((argMap["-lengthPenalty"])[0]);
            }
            if (argMap.Contains("-penaltyType"))
            {
                ctpp.penaltyType = System.Convert.ToInt32((argMap["-penaltyType"])[0]);
            }
            if (argMap.Contains("-lex"))
            {
                string[] lexArgs = (argMap["-lex"]);
                if (lexArgs.Length > 1)
                {
                    IIndex <string> wordIndex = new HashIndex <string>();
                    IIndex <string> tagIndex  = new HashIndex <string>();
                    lex = ctpp.Lex(op, wordIndex, tagIndex);
                    MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
                    IFileFilter    trainFilt        = new NumberRangesFileFilter(lexArgs[1], false);
                    rawTrainTreebank.LoadPath(new File(lexArgs[0]), trainFilt);
                    log.Info("Done reading trees.");
                    MemoryTreebank trainTreebank;
                    if (argMap.Contains("-annotate"))
                    {
                        trainTreebank = new MemoryTreebank();
                        TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
                        foreach (Tree tree in rawTrainTreebank)
                        {
                            tree = annotator.TransformTree(tree);
                            trainTreebank.Add(tree);
                        }
                        log.Info("Done annotating trees.");
                    }
                    else
                    {
                        trainTreebank = rawTrainTreebank;
                    }
                    lex.InitializeTraining(trainTreebank.Count);
                    lex.Train(trainTreebank);
                    lex.FinishTraining();
                    log.Info("Done training lexicon.");
                    if (lexArgs.Length == 3)
                    {
                        string filename = lexArgs.Length == 3 ? lexArgs[2] : "parsers/chineseCharLex.ser.gz";
                        log.Info("Writing lexicon in serialized format to file " + filename + " ");
                        System.Console.Error.Flush();
                        ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                        @out.WriteObject(lex);
                        @out.Close();
                        log.Info("done.");
                    }
                }
                else
                {
                    string lexFile = lexArgs.Length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
                    log.Info("Reading Lexicon from file " + lexFile);
                    ObjectInputStream @in = IOUtils.ReadStreamFromString(lexFile);
                    try
                    {
                        lex = (ILexicon)@in.ReadObject();
                    }
                    catch (TypeLoadException)
                    {
                        throw new Exception("Bad serialized file: " + lexFile);
                    }
                    @in.Close();
                }
            }
            if (argMap.Contains("-test"))
            {
                bool segmentWords = ctpp.segment;
                bool parse        = lp != null;
                System.Diagnostics.Debug.Assert((parse || segmentWords));
                //      WordCatConstituent.collinizeWords = argMap.containsKey("-collinizeWords");
                //      WordCatConstituent.collinizeTags = argMap.containsKey("-collinizeTags");
                IWordSegmenter seg = null;
                if (segmentWords)
                {
                    seg = (IWordSegmenter)lex;
                }
                string[]       testArgs     = (argMap["-test"]);
                MemoryTreebank testTreebank = op.tlpParams.MemoryTreebank();
                IFileFilter    testFilt     = new NumberRangesFileFilter(testArgs[1], false);
                testTreebank.LoadPath(new File(testArgs[0]), testFilt);
                ITreeTransformer          subcategoryStripper = op.tlpParams.SubcategoryStripper();
                ITreeTransformer          collinizer          = ctpp.Collinizer();
                WordCatEquivalenceClasser eqclass             = new WordCatEquivalenceClasser();
                WordCatEqualityChecker    eqcheck             = new WordCatEqualityChecker();
                EquivalenceClassEval      basicEval           = new EquivalenceClassEval(eqclass, eqcheck, "basic");
                EquivalenceClassEval      collinsEval         = new EquivalenceClassEval(eqclass, eqcheck, "collinized");
                IList <string>            evalTypes           = new List <string>(3);
                bool goodPOS = false;
                if (segmentWords)
                {
                    evalTypes.Add(WordCatConstituent.wordType);
                    if (ctpp.segmentMarkov && !parse)
                    {
                        evalTypes.Add(WordCatConstituent.tagType);
                        goodPOS = true;
                    }
                }
                if (parse)
                {
                    evalTypes.Add(WordCatConstituent.tagType);
                    evalTypes.Add(WordCatConstituent.catType);
                    if (combo)
                    {
                        evalTypes.Add(WordCatConstituent.wordType);
                        goodPOS = true;
                    }
                }
                TreeToBracketProcessor proc = new TreeToBracketProcessor(evalTypes);
                log.Info("Testing...");
                foreach (Tree goldTop in testTreebank)
                {
                    Tree             gold         = goldTop.FirstChild();
                    IList <IHasWord> goldSentence = gold.YieldHasWord();
                    if (goldSentence.Count > maxLength)
                    {
                        log.Info("Skipping sentence; too long: " + goldSentence.Count);
                        continue;
                    }
                    else
                    {
                        log.Info("Processing sentence; length: " + goldSentence.Count);
                    }
                    IList <IHasWord> s;
                    if (segmentWords)
                    {
                        StringBuilder goldCharBuf = new StringBuilder();
                        foreach (IHasWord aGoldSentence in goldSentence)
                        {
                            StringLabel word = (StringLabel)aGoldSentence;
                            goldCharBuf.Append(word.Value());
                        }
                        string goldChars = goldCharBuf.ToString();
                        s = seg.Segment(goldChars);
                    }
                    else
                    {
                        s = goldSentence;
                    }
                    Tree tree;
                    if (parse)
                    {
                        tree = lp.ParseTree(s);
                        if (tree == null)
                        {
                            throw new Exception("PARSER RETURNED NULL!!!");
                        }
                    }
                    else
                    {
                        tree = Edu.Stanford.Nlp.Trees.Trees.ToFlatTree(s);
                        tree = subcategoryStripper.TransformTree(tree);
                    }
                    if (pw != null)
                    {
                        if (parse)
                        {
                            tree.PennPrint(pw);
                        }
                        else
                        {
                            IEnumerator sentIter = s.GetEnumerator();
                            for (; ;)
                            {
                                Word word = (Word)sentIter.Current;
                                pw.Print(word.Word());
                                if (sentIter.MoveNext())
                                {
                                    pw.Print(" ");
                                }
                                else
                                {
                                    break;
                                }
                            }
                        }
                        pw.Println();
                    }
                    if (eval)
                    {
                        ICollection ourBrackets;
                        ICollection goldBrackets;
                        ourBrackets  = proc.AllBrackets(tree);
                        goldBrackets = proc.AllBrackets(gold);
                        if (goodPOS)
                        {
                            Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(tree, gold));
                            Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(gold, tree));
                        }
                        basicEval.Eval(ourBrackets, goldBrackets);
                        System.Console.Out.WriteLine("\nScores:");
                        basicEval.DisplayLast();
                        Tree collinsTree = collinizer.TransformTree(tree);
                        Tree collinsGold = collinizer.TransformTree(gold);
                        ourBrackets  = proc.AllBrackets(collinsTree);
                        goldBrackets = proc.AllBrackets(collinsGold);
                        if (goodPOS)
                        {
                            Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsTree, collinsGold));
                            Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsGold, collinsTree));
                        }
                        collinsEval.Eval(ourBrackets, goldBrackets);
                        System.Console.Out.WriteLine("\nCollinized scores:");
                        collinsEval.DisplayLast();
                        System.Console.Out.WriteLine();
                    }
                }
                if (eval)
                {
                    basicEval.Display();
                    System.Console.Out.WriteLine();
                    collinsEval.Display();
                }
            }
        }
 /// <summary>
 /// Wraps this treebank in a TransformingTreebank that applies the given
 /// TreeTransformer.
 /// </summary>
 /// <remarks>
 /// A new TransformingTreebank is constructed from this treebank and
 /// <paramref name="treeTrans"/>; this method does nothing else.
 /// This treebank itself stays unchanged, provided the TreeTransformer
 /// does not modify the trees it is given.
 /// </remarks>
 /// <param name="treeTrans">The TreeTransformer to use</param>
 /// <returns>
 /// A Treebank (concretely a TransformingTreebank) built over this
 /// treebank and the supplied TreeTransformer.
 /// </returns>
 public virtual Edu.Stanford.Nlp.Trees.Treebank Transform(ITreeTransformer treeTrans)
 {
     TransformingTreebank transformedView = new TransformingTreebank(this, treeTrans);
     return transformedView;
 }
Пример #29
0
        /// <summary>Normalizes a complete tree; the argument may be assumed to be the root.</summary>
        /// <remarks>
        /// The tree is passed, in order, through a first transformer, a
        /// subtree pruning filter, a node splicing filter and a second
        /// transformer. The inherited description of this method says that
        /// empty elements (ones with nonterminal tag label '-NONE-') are
        /// deleted from the tree.
        /// Labels are changed on the existing nodes rather than on copies,
        /// so the tree handed in by the caller may be altered.
        /// </remarks>
        /// <param name="tree">Root of the tree to normalize.</param>
        /// <param name="tf">
        /// Factory used to build the ROOT wrapper node; it is also handed to
        /// the prune, splice and final transform steps.
        /// </param>
        /// <returns>
        /// The normalized tree, or null as soon as one of the intermediate
        /// steps produces null.
        /// </returns>
        public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
        {
            // NOTE(review): both transformers are null in this version. The
            // earlier comments in this method (possessive fixes, -TMP
            // annotation, Switchboard special cases) presumably described
            // transformer/filter bodies that are not present here -- confirm
            // whether Transform() accepts a null transformer.
            ITreeTransformer  preTransformer  = null;
            // Filter deciding which subtrees survive pruning (logic lives in _IPredicate_218).
            IPredicate <Tree> pruneFilter     = new _IPredicate_218();
            // Filter deciding which nodes are kept when splicing (logic lives in _IPredicate_238).
            IPredicate <Tree> spliceFilter    = new _IPredicate_238();
            ITreeTransformer  postTransformer = null;

            // A bare S at the top gets wrapped in a ROOT node.
            Tree root          = tree;
            bool topIsSentence = root.Label().Value().Equals("S");
            if (topIsSentence)
            {
                root = tf.NewTreeNode("ROOT", Collections.SingletonList(root));
            }

            // Repair for the phrasal VB in Switchboard (PTB version 3) that should be a VP.
            // This relabels the existing node in place.
            foreach (Tree node in root)
            {
                if (!node.IsPhrasal())
                {
                    continue;
                }
                if ("VB".Equals(node.Label().Value()))
                {
                    node.SetValue("VP");
                }
            }

            // Run the pipeline; each later stage only runs while the result is non-null.
            Tree result = root.Transform(preTransformer);
            if (result != null)
            {
                result = result.Prune(pruneFilter, tf);
            }
            if (result != null)
            {
                result = result.SpliceOut(spliceFilter, tf);
            }
            if (result == null)
            {
                return null;
            }
            return result.Transform(postTransformer, tf);
        }
Пример #30
0
 /// <summary>
 /// Builds the collection of unordered (but directed!) typed word-word
 /// dependencies for a tree.
 /// </summary>
 /// <remarks>
 /// Delegates to DependencyObjectify, supplying an
 /// UnorderedTypedDependencyTyper constructed from <paramref name="hf"/>.
 /// </remarks>
 /// <param name="t">Tree whose dependencies are requested.</param>
 /// <param name="hf">Head finder, passed both to the typer and to DependencyObjectify.</param>
 /// <param name="collinizer">Tree transformer forwarded unchanged to DependencyObjectify.</param>
 /// <returns>Whatever DependencyObjectify returns for these arguments.</returns>
 public static ICollection <IList <string> > UnorderedTypedDependencyObjectify(Tree t, IHeadFinder hf, ITreeTransformer collinizer)
 {
     AbstractTreebankParserParams.UnorderedTypedDependencyTyper dependencyTyper = new AbstractTreebankParserParams.UnorderedTypedDependencyTyper(hf);
     return DependencyObjectify(t, hf, collinizer, dependencyTyper);
 }