/// <summary>Bundles the pieces needed to cache parse hypotheses for trees.</summary>
/// <param name="cacher">Hypothesis cacher used to process trees</param>
/// <param name="parser">Parser used to produce the hypotheses</param>
/// <param name="dvKBest">How many parses to keep per tree (presumably the k-best size — confirm at call site)</param>
/// <param name="transformer">Transformer applied to trees before parsing</param>
public CacheProcessor(CacheParseHypotheses cacher, LexicalizedParser parser, int dvKBest, ITreeTransformer transformer)
{
  this.transformer = transformer;
  this.dvKBest = dvKBest;
  this.parser = parser;
  this.cacher = cacher;
}
/// <summary>
/// Per-query reranking state for a single DVModel: a train-style tree
/// transformer, one cost-and-gradient scorer, and a list collecting the
/// annotated trees seen so far.
/// </summary>
public Query(DVModelReranker _enclosing)
{
  this._enclosing = _enclosing;
  this.deepTrees = Generics.NewArrayList();
  // Trees are transformed the same way the parser builds its training transformer.
  this.transformer = LexicalizedParser.BuildTrainTransformer(this._enclosing.op);
  this.scorer = new DVParserCostAndGradient(null, null, this._enclosing.model, this._enclosing.op);
}
/// <summary>
/// Trains an Arabic factored parser from <paramref name="trainTreebankFile"/>,
/// optionally logs grammar statistics, and then parses the contents of
/// <paramref name="inputStream"/>.
/// </summary>
/// <param name="trainTreebankFile">Treebank used to build the grammar</param>
/// <param name="testTreebankFile">NOTE(review): not referenced anywhere in this body — confirm whether callers still need it</param>
/// <param name="inputStream">Input handed to <c>Parse</c></param>
/// <returns>The result of <c>Parse(inputStream)</c></returns>
public virtual bool Run(File trainTreebankFile, File testTreebankFile, InputStream inputStream) {
  op = new Options();
  op.tlpParams = new ArabicTreebankParserParams();
  op.SetOptions("-arabicFactored");
  op.testOptions.maxLength = maxSentLen;
  op.testOptions.MaxItems = 5000000;
  //500000 is the default for Arabic, but we have substantially more edges now
  op.testOptions.outputFormatOptions = "removeTopBracket,includePunctuationDependencies";
  // WSG: Just set this to some high value so that extractBestParse()
  // actually calls the lattice reader (e.g., this says that we can't have a word longer than
  // 80 characters...seems sensible for Arabic
  op.testOptions.maxSpanForTags = 80;
  treePrint = op.testOptions.TreePrint(op.tlpParams);
  debinarizer = new Debinarizer(op.forceCNF, new CategoryWordTagFactory());
  subcategoryStripper = op.tlpParams.SubcategoryStripper();
  Timing.StartTime();
  // Train a grammar from the treebank and build the actual parsers from it.
  Treebank trainTreebank = op.tlpParams.DiskTreebank();
  trainTreebank.LoadPath(trainTreebankFile);
  lp = GetParserDataFromTreebank(trainTreebank);
  MakeParsers();
  if (Verbose) {
    op.Display();
    // Grammar statistics are only meaningful when a PCFG parser exists.
    string lexNumRules = (pparser != null) ? int.ToString(lp.lex.NumRules()) : string.Empty;
    log.Info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
    log.Info("Grammar\t" + lp.stateIndex.Size() + '\t' + lp.tagIndex.Size() + '\t' + lp.wordIndex.Size() + '\t' + (pparser != null ? lp.ug.NumRules() : string.Empty) + '\t' + (pparser != null ? lp.bg.NumRules() : string.Empty) + '\t' + lexNumRules);
    log.Info("ParserPack is " + op.tlpParams.GetType().FullName);
    log.Info("Lexicon is " + lp.lex.GetType().FullName);
  }
  return(Parse(inputStream));
}
/// <summary>
/// Collinizes <paramref name="t"/> and returns the bag of
/// <see cref="Edu.Stanford.Nlp.Trees.Constituent"/>s for PARSEVAL evaluation.
/// Spans are character-based, which allows combined segmentation/parsing
/// evaluation; <paramref name="labelConstituents"/> selects labeled versus
/// unlabeled bracketings.  (The original author notes this has not been
/// rigorously checked against the PARSEVAL definition -- Roger.)
/// </summary>
public static ICollection <Constituent> ParsevalObjectify(Tree t, ITreeTransformer collinizer, bool labelConstituents)
{
  ICollection <Constituent> result = new List <Constituent>();
  Tree collinized = collinizer.TransformTree(t);
  if (collinized == null) {
    // The collinizer may delete the whole tree; report no constituents.
    return result;
  }
  foreach (Tree node in collinized) {
    // Skip terminals, preterminals, and nodes detached from the root.
    bool ignorable = node.IsLeaf() || node.IsPreTerminal() || (node != collinized && node.Parent(collinized) == null);
    if (ignorable) {
      continue;
    }
    int left = collinized.LeftCharEdge(node);
    int right = collinized.RightCharEdge(node);
    if (labelConstituents) {
      result.Add(new LabeledConstituent(left, right, node.Label()));
    } else {
      result.Add(new SimpleConstituent(left, right));
    }
  }
  return result;
}
/// <summary>Constructs a CoordinationTransformer.</summary>
/// <param name="hf">The head finder to use</param>
/// <param name="performMWETransformation">
/// Backwards-compatibility switch: when false, multi-word expressions
/// will not be attached under a new "MWE" node.
/// </param>
public CoordinationTransformer(IHeadFinder hf, bool performMWETransformation)
{
  this.performMWETransformation = performMWETransformation;
  this.headFinder = hf;
  // The QP transformer (used to restructure QP constituents) shares the MWE setting.
  qp = new QPTreeTransformer(performMWETransformation);
}
/// <summary>
/// Returns a new MemoryTreebank holding the result of applying
/// <paramref name="treeTrans"/> to every tree in this treebank.
/// </summary>
/// <remarks>
/// This treebank itself is unchanged (assuming the TreeTransformer
/// correctly does not mutate its input trees).
/// </remarks>
/// <param name="treeTrans">The TreeTransformer to apply</param>
public override Treebank Transform(ITreeTransformer treeTrans)
{
  Treebank transformed = new Edu.Stanford.Nlp.Trees.MemoryTreebank(Count, TreeReaderFactory());
  foreach (Tree original in this) {
    transformed.Add(treeTrans.TransformTree(original));
  }
  return transformed;
}
/// <summary>
/// Per-query reranking state when several DVModels are combined: one
/// cost-and-gradient scorer is created per model.
/// </summary>
public Query(CombinedDVModelReranker _enclosing)
{
  this._enclosing = _enclosing;
  this.transformer = LexicalizedParser.BuildTrainTransformer(this._enclosing.op);
  this.scorers = Generics.NewArrayList();
  // One scorer per underlying model, all sharing the same options.
  foreach (DVModel submodel in this._enclosing.models) {
    this.scorers.Add(new DVParserCostAndGradient(null, null, submodel, this._enclosing.op));
  }
}
/// <summary>Loads treebank grammar from first argument and prints it.</summary>
/// <remarks>
/// Loads the treebank named by the first argument (twice: once from disk,
/// once into memory), composes the two copies, and prints the composite
/// transformed in three equivalent ways — via chained
/// <c>Treebank.Transform</c> calls, via nested <c>TransformingTreebank</c>
/// wrappers, and via a single <c>CompositeTreeTransformer</c> — so the
/// outputs can be compared.  Just a demonstration of functionality. <br />
/// <code>usage: java MemoryTreebank treebankFilesPath</code>
/// </remarks>
/// <param name="args">array of command-line arguments; args[0] is the treebank path</param>
public static void Main(string[] args) {
  Timing.StartTime();
  Treebank treebank = new DiskTreebank(null);
  Treebank treebank2 = new MemoryTreebank(null);
  treebank.LoadPath(args[0]);
  treebank2.LoadPath(args[0]);
  // Composite of the disk-backed and in-memory copies of the same data.
  CompositeTreebank c = new CompositeTreebank(treebank, treebank2);
  Timing.EndTime();
  ITreeTransformer myTransformer = new TransformingTreebank.MyTreeTransformer();
  ITreeTransformer myTransformer2 = new TransformingTreebank.MyTreeTransformer2();
  ITreeTransformer myTransformer3 = new TransformingTreebank.MyTreeTransformer3();
  // Way 1: eager, chained Transform calls.
  Treebank tf1 = c.Transform(myTransformer).Transform(myTransformer2).Transform(myTransformer3);
  // Way 2: lazily-wrapping TransformingTreebank constructors.
  Treebank tf2 = new Edu.Stanford.Nlp.Trees.TransformingTreebank(new Edu.Stanford.Nlp.Trees.TransformingTreebank(new Edu.Stanford.Nlp.Trees.TransformingTreebank(c, myTransformer), myTransformer2), myTransformer3);
  // Way 3: one composite transformer applied once.
  ITreeTransformer[] tta = new ITreeTransformer[] { myTransformer, myTransformer2, myTransformer3 };
  ITreeTransformer tt3 = new CompositeTreeTransformer(Arrays.AsList(tta));
  Treebank tf3 = c.Transform(tt3);
  System.Console.Out.WriteLine("-------------------------");
  System.Console.Out.WriteLine("COMPOSITE (DISK THEN MEMORY REPEATED VERSION OF) INPUT TREEBANK");
  System.Console.Out.WriteLine(c);
  System.Console.Out.WriteLine("-------------------------");
  System.Console.Out.WriteLine("SLOWLY TRANSFORMED TREEBANK, USING TransformingTreebank() CONSTRUCTOR");
  Treebank tx1 = new Edu.Stanford.Nlp.Trees.TransformingTreebank(c, myTransformer);
  System.Console.Out.WriteLine(tx1);
  System.Console.Out.WriteLine("-----");
  Treebank tx2 = new Edu.Stanford.Nlp.Trees.TransformingTreebank(tx1, myTransformer2);
  System.Console.Out.WriteLine(tx2);
  System.Console.Out.WriteLine("-----");
  Treebank tx3 = new Edu.Stanford.Nlp.Trees.TransformingTreebank(tx2, myTransformer3);
  System.Console.Out.WriteLine(tx3);
  System.Console.Out.WriteLine("-------------------------");
  System.Console.Out.WriteLine("TRANSFORMED TREEBANK, USING Treebank.transform()");
  System.Console.Out.WriteLine(tf1);
  System.Console.Out.WriteLine("-------------------------");
  // Printed twice to show the transform result is stable across enumerations.
  System.Console.Out.WriteLine("PRINTING AGAIN TRANSFORMED TREEBANK, USING Treebank.transform()");
  System.Console.Out.WriteLine(tf1);
  System.Console.Out.WriteLine("-------------------------");
  System.Console.Out.WriteLine("TRANSFORMED TREEBANK, USING TransformingTreebank() CONSTRUCTOR");
  System.Console.Out.WriteLine(tf2);
  System.Console.Out.WriteLine("-------------------------");
  System.Console.Out.WriteLine("TRANSFORMED TREEBANK, USING CompositeTreeTransformer");
  System.Console.Out.WriteLine(tf3);
  System.Console.Out.WriteLine("-------------------------");
  System.Console.Out.WriteLine("COMPOSITE (DISK THEN MEMORY REPEATED VERSION OF) INPUT TREEBANK");
  System.Console.Out.WriteLine(c);
  System.Console.Out.WriteLine("-------------------------");
}
/// <summary>
/// Builds the annotate/binarize, debinarize, and collinize transformers for
/// a treebank.  NOTE: statement order matters here — <c>op</c> is mutated
/// (splitters, sisterSplitters, SetOptions) before the transformers that
/// read it are constructed.
/// </summary>
/// <param name="op">Parser options; mutated and then retained by this object</param>
/// <param name="treebankRoot">Path used to derive the English split categories</param>
public TreebankAnnotator(Options op, string treebankRoot) {
  // op.tlpParams = new EnglishTreebankParserParams();
  // CDM: Aug 2004: With new implementation of treebank split categories,
  // I've hardwired this to load English ones.  Otherwise need training data.
  // op.trainOptions.splitters = Generics.newHashSet(Arrays.asList(op.tlpParams.splitters()));
  op.trainOptions.splitters = ParentAnnotationStats.GetEnglishSplitCategories(treebankRoot);
  op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(op.tlpParams.SisterSplitters()));
  op.SetOptions("-acl03pcfg", "-cnf");
  treeTransformer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
  // BinarizerFactory.TreeAnnotator.setTreebankLang(op.tlpParams);
  treeUnTransformer = new Debinarizer(op.forceCNF);
  collinizer = op.tlpParams.Collinizer();
  this.op = op;
}
/// <summary>
/// Records the description of one tagged input file: its format, encoding,
/// separator, and the tree/column options relevant to that format.
/// Formats include plain text separated by the tag separator, TSV
/// (e.g. CoNLL), and PTB-style trees.
/// </summary>
private TaggedFileRecord(string file, TaggedFileRecord.Format format, string encoding, string tagSeparator, ITreeTransformer treeTransformer, TreeNormalizer treeNormalizer, ITreeReaderFactory trf, NumberRangesFileFilter treeRange, IPredicate <Tree> treeFilter, int wordColumn, int tagColumn)
{
  // File identity and basic format.
  this.file = file;
  this.format = format;
  this.encoding = encoding;
  this.tagSeparator = tagSeparator;
  // Tree-format options.
  this.trf = trf;
  this.treeTransformer = treeTransformer;
  this.treeNormalizer = treeNormalizer;
  this.treeRange = treeRange;
  this.treeFilter = treeFilter;
  // Column layout for TSV-style files.
  this.wordColumn = wordColumn;
  this.tagColumn = tagColumn;
}
/// <summary>
/// Opens the treebank described by <paramref name="record"/> and advances
/// to the first usable tree.
/// </summary>
public TreeTaggedFileReader(TaggedFileRecord record)
{
  filename = record.file;
  // Fall back to a default reader factory when none is supplied.
  if (record.trf == null) {
    trf = new LabeledScoredTreeReaderFactory();
  } else {
    trf = record.trf;
  }
  transformer = record.treeTransformer;
  normalizer = record.treeNormalizer;
  treeFilter = record.treeFilter;
  treebank = new DiskTreebank(trf, record.encoding);
  // A tree range, when present, restricts what gets loaded.
  if (record.treeRange == null) {
    treebank.LoadPath(filename);
  } else {
    treebank.LoadPath(filename, record.treeRange);
  }
  treeIterator = treebank.GetEnumerator();
  FindNext();
}
/// <summary>
/// Sets up the tree annotation and binarization pipeline, with optional
/// post-splitting and optional rule/state counting driven by the training
/// options.
/// </summary>
/// <param name="annotationHF">Head finder used for annotation</param>
/// <param name="binarizationHF">Head finder used for binarization</param>
/// <param name="tlpParams">Language-specific parser parameters</param>
/// <param name="forceCNF">Whether output must be forced into CNF</param>
/// <param name="insideFactor">Forwarded to the TreeBinarizer</param>
/// <param name="doSubcategorization">When false, a pass-through (null) annotator is used</param>
/// <param name="op">Options; only trainOptions is retained</param>
public TreeAnnotatorAndBinarizer(IHeadFinder annotationHF, IHeadFinder binarizationHF, ITreebankLangParserParams tlpParams, bool forceCNF, bool insideFactor, bool doSubcategorization, Options op)
{
  this.trainOptions = op.trainOptions;
  // Either full subcategorization annotation or a null annotator.
  if (doSubcategorization) {
    annotator = new TreeAnnotator(annotationHF, tlpParams, op);
  } else {
    annotator = new TreeAnnotatorAndBinarizer.TreeNullAnnotator(annotationHF);
  }
  binarizer = new TreeBinarizer(binarizationHF, tlpParams.TreebankLanguagePack(), insideFactor, trainOptions.markovFactor, trainOptions.markovOrder, trainOptions.CompactGrammar() > 0, trainOptions.CompactGrammar() > 1, trainOptions.HselCut, trainOptions.markFinalStates, trainOptions.simpleBinarizedLabels, trainOptions.noRebinarization);
  // Optional selective post-splitting.
  postSplitter = trainOptions.selectivePostSplit ? new PostSplitter(tlpParams, op) : null;
  this.tlp = tlpParams.TreebankLanguagePack();
  this.tf = new LabeledScoredTreeFactory(new CategoryWordTagFactory());
  this.forceCNF = forceCNF;
  // Counters are only allocated when the corresponding reporting is enabled.
  annotatedRuleCounts = trainOptions.printAnnotatedRuleCounts ? new ClassicCounter <Tree>() : null;
  annotatedStateCounts = trainOptions.printAnnotatedStateCounts ? new ClassicCounter <string>() : null;
}
/// <summary>Build a CTBErrorCorrectingTreeNormalizer.</summary>
/// <param name="splitNPTMP">Temporal annotation on NPs</param>
/// <param name="splitPPTMP">Temporal annotation on PPs</param>
/// <param name="splitXPTMP">Temporal annotation on any phrase marked in CTB</param>
/// <param name="charTags">
/// Whether you wish to push POS tags down on to the
/// characters of a word (for unsegmented text)
/// </param>
public CTBErrorCorrectingTreeNormalizer(bool splitNPTMP, bool splitPPTMP, bool splitXPTMP, bool charTags)
{
  this.splitNPTMP = splitNPTMP;
  this.splitPPTMP = splitPPTMP;
  this.splitXPTMP = splitXPTMP;
  if (charTags) {
    // Loaded reflectively so there is no hard compile-time dependency on
    // CharacterLevelTagExtender.
    try {
      tagExtender = (ITreeTransformer)System.Activator.CreateInstance(Sharpen.Runtime.GetType("edu.stanford.nlp.trees.international.pennchinese.CharacterLevelTagExtender"));
    } catch (Exception e) {
      // Fix: the previous code did "new Exception(e)", passing the caught
      // exception where a message string is expected; wrap it properly so
      // the cause and its stack trace are preserved as InnerException.
      throw new Exception("Failed to instantiate CharacterLevelTagExtender", e);
    }
  } else {
    tagExtender = null;
  }
}
/// <summary>
/// Returns the set of dependencies in a tree, according to some
/// <see cref="Edu.Stanford.Nlp.Trees.IDependencyTyper{T}"/>.
/// The tree is first collinized; if the collinizer deletes the whole tree,
/// an empty collection is returned.
/// </summary>
public static ICollection <E> DependencyObjectify <E>(Tree t, IHeadFinder hf, ITreeTransformer collinizer, IDependencyTyper <E> typer)
{
  ICollection <E> dependencies = new List <E>();
  Tree collinized = collinizer.TransformTree(t);
  if (collinized != null) {
    DependencyObjectifyHelper(collinized, collinized, hf, dependencies, typer);
  }
  return dependencies;
}
/// <summary>
/// Parses a file description of the form
/// <c>key1=value1,key2=value2,...,filename</c> into a TaggedFileRecord.
/// A bare description (no commas) is treated as a plain Text-format file.
/// Unknown keys, or arguments without exactly one '=', raise
/// ArgumentException.
/// </summary>
/// <param name="config">Supplies default encoding and tag separator</param>
/// <param name="description">Comma-separated options followed by the file name (last piece)</param>
public static Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord CreateRecord(Properties config, string description) {
  string[] pieces = description.Split(",");
  if (pieces.Length == 1) {
    // No options given: plain text file with config defaults.
    return(new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(description, TaggedFileRecord.Format.Text, GetEncoding(config), GetTagSeparator(config), null, null, null, null, null, null, null));
  }
  // Everything but the last piece is a key=value option.
  string[] args = new string[pieces.Length - 1];
  System.Array.Copy(pieces, 0, args, 0, pieces.Length - 1);
  string file = pieces[pieces.Length - 1];
  TaggedFileRecord.Format format = TaggedFileRecord.Format.Text;
  string encoding = GetEncoding(config);
  string tagSeparator = GetTagSeparator(config);
  ITreeTransformer treeTransformer = null;
  TreeNormalizer treeNormalizer = null;
  ITreeReaderFactory trf = null;
  NumberRangesFileFilter treeRange = null;
  IPredicate <Tree> treeFilter = null;
  // NOTE(review): "int ... = null" is a Sharpen conversion artifact (Java
  // Integer) left untouched here; these flow into the private constructor.
  int wordColumn = null;
  int tagColumn = null;
  foreach (string arg in args) {
    string[] argPieces = arg.Split("=", 2);
    if (argPieces.Length != 2) {
      throw new ArgumentException("TaggedFileRecord argument " + arg + " has an unexpected number of =s");
    }
    // Dispatch on the (case-insensitive) option name.
    if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], Format)) {
      format = TaggedFileRecord.Format.ValueOf(argPieces[1]);
    } else {
      if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], Encoding)) {
        encoding = argPieces[1];
      } else {
        if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TagSeparator)) {
          tagSeparator = argPieces[1];
        } else {
          if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeTransformer)) {
            treeTransformer = ReflectionLoading.LoadByReflection(argPieces[1]);
          } else {
            if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeNormalizer)) {
              treeNormalizer = ReflectionLoading.LoadByReflection(argPieces[1]);
            } else {
              if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeReader)) {
                trf = ReflectionLoading.LoadByReflection(argPieces[1]);
              } else {
                if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeRange)) {
                  // Ranges are written with ':' in the description because ',' is
                  // the option separator; convert back before parsing.
                  string range = argPieces[1].ReplaceAll(":", ",");
                  treeRange = new NumberRangesFileFilter(range, true);
                } else {
                  if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeFilter)) {
                    treeFilter = ReflectionLoading.LoadByReflection(argPieces[1]);
                  } else {
                    if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], WordColumn)) {
                      wordColumn = int.Parse(argPieces[1]);
                    } else {
                      if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TagColumn)) {
                        tagColumn = int.Parse(argPieces[1]);
                      } else {
                        throw new ArgumentException("TaggedFileRecord argument " + argPieces[0] + " is unknown");
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  return(new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(file, format, encoding, tagSeparator, treeTransformer, treeNormalizer, trf, treeRange, treeFilter, wordColumn, tagColumn));
}
/// <summary>
/// Convenience overload: collects the top parses for <paramref name="trees"/>
/// using this object's parser and options, without progress logging.
/// </summary>
internal virtual IdentityHashMap <Tree, IList <Tree> > GetTopParses(IList <Tree> trees, ITreeTransformer transformer)
{
  return GetTopParses(parser, op, trees, transformer, false);
}
/// <summary>
/// Parses every tree in <paramref name="trees"/> and maps each (by identity)
/// to its list of top parses (k = op.trainOptions.dvKBest).  When
/// <paramref name="outputUpdates"/> is set, progress is logged every 10 trees
/// and once at the end.
/// </summary>
internal static IdentityHashMap <Tree, IList <Tree> > GetTopParses(LexicalizedParser parser, Options op, ICollection <Tree> trees, ITreeTransformer transformer, bool outputUpdates)
{
  IdentityHashMap <Tree, IList <Tree> > results = new IdentityHashMap <Tree, IList <Tree> >();
  foreach (Tree goldTree in trees) {
    results[goldTree] = GetTopParsesForOneTree(parser, op.trainOptions.dvKBest, goldTree, transformer);
    bool logProgress = outputUpdates && results.Count % 10 == 0;
    if (logProgress) {
      log.Info("Processed " + results.Count + " trees");
    }
  }
  if (outputUpdates) {
    log.Info("Finished processing " + results.Count + " trees");
  }
  return results;
}
/// <summary>
/// Command-line entry point: loads or trains a DVParser, optionally runs a
/// gradient check, trains, saves the model, and evaluates on a test treebank.
/// An example command line for training a new parser:
/// <br />
/// nohup java -mx6g edu.stanford.nlp.parser.dvparser.DVParser -cachedTrees /scr/nlp/data/dvparser/wsj/cached.wsj.train.simple.ser.gz -train -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 -debugOutputFrequency 400 -nofilter -trainingThreads 5 -parser /u/nlp/data/lexparser/wsjPCFG.nocompact.simple.ser.gz -trainingIterations 40 -batchSize 25 -model /scr/nlp/data/dvparser/wsj/wsj.combine.v2.ser.gz -unkWord "*UNK*" -dvCombineCategories &gt; /scr/nlp/data/dvparser/wsj/wsj.combine.v2.out 2&gt;&amp;1 &amp;
/// </summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
public static void Main(string[] args) {
  if (args.Length == 0) {
    Help();
    System.Environment.Exit(2);
  }
  log.Info("Running DVParser with arguments:");
  foreach (string arg in args) {
    log.Info("  " + arg);
  }
  log.Info();
  string parserPath = null;
  string trainTreebankPath = null;
  IFileFilter trainTreebankFilter = null;
  string cachedTrainTreesPath = null;
  bool runGradientCheck = false;
  bool runTraining = false;
  string testTreebankPath = null;
  IFileFilter testTreebankFilter = null;
  string initialModelPath = null;
  string modelPath = null;
  bool filter = true;
  string resultsRecordPath = null;
  IList <string> unusedArgs = new List <string>();
  // These parameters can be null or 0 if the model was not
  // serialized with the new parameters.  Setting the options at the
  // command line will override these defaults.
  // TODO: if/when we integrate back into the main branch and
  // rebuild models, we can get rid of this
  IList <string> argsWithDefaults = new List <string>(Arrays.AsList(new string[] { "-wordVectorFile", Options.LexOptions.DefaultWordVectorFile, "-dvKBest", int.ToString(TrainOptions.DefaultKBest), "-batchSize", int.ToString(TrainOptions.DefaultBatchSize), "-trainingIterations", int.ToString(TrainOptions.DefaultTrainingIterations), "-qnIterationsPerBatch", int.ToString(TrainOptions.DefaultQnIterationsPerBatch), "-regCost", double.ToString(TrainOptions.DefaultRegcost), "-learningRate", double.ToString(TrainOptions.DefaultLearningRate), "-deltaMargin", double.ToString(TrainOptions.DefaultDeltaMargin), "-unknownNumberVector", "-unknownDashedWordVectors", "-unknownCapsVector", "-unknownchinesepercentvector", "-unknownchinesenumbervector", "-unknownchineseyearvector", "-unkWord", "*UNK*", "-transformMatrixType", "DIAGONAL", "-scalingForInit", double.ToString(TrainOptions.DefaultScalingForInit), "-trainWordVectors" }));
  // Defaults come first so explicit command-line arguments override them.
  Sharpen.Collections.AddAll(argsWithDefaults, Arrays.AsList(args));
  args = Sharpen.Collections.ToArray(argsWithDefaults, new string[argsWithDefaults.Count]);
  // Argument parsing; unrecognized flags accumulate in unusedArgs and are
  // later passed through to LexicalizedParser.LoadModel.
  for (int argIndex = 0; argIndex < args.Length;) {
    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parser")) {
      parserPath = args[argIndex + 1];
      argIndex += 2;
    } else {
      if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank")) {
        Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
        argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
        testTreebankPath = treebankDescription.First();
        testTreebankFilter = treebankDescription.Second();
      } else {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank")) {
          Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-treebank");
          argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
          trainTreebankPath = treebankDescription.First();
          trainTreebankFilter = treebankDescription.Second();
        } else {
          if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-cachedTrees")) {
            cachedTrainTreesPath = args[argIndex + 1];
            argIndex += 2;
          } else {
            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-runGradientCheck")) {
              runGradientCheck = true;
              argIndex++;
            } else {
              if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train")) {
                runTraining = true;
                argIndex++;
              } else {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model")) {
                  modelPath = args[argIndex + 1];
                  argIndex += 2;
                } else {
                  if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-nofilter")) {
                    filter = false;
                    argIndex++;
                  } else {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-continueTraining")) {
                      runTraining = true;
                      filter = false;
                      initialModelPath = args[argIndex + 1];
                      argIndex += 2;
                    } else {
                      if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-resultsRecord")) {
                        resultsRecordPath = args[argIndex + 1];
                        argIndex += 2;
                      } else {
                        unusedArgs.Add(args[argIndex++]);
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  if (parserPath == null && modelPath == null) {
    throw new ArgumentException("Must supply either a base parser model with -parser or a serialized DVParser with -model");
  }
  if (!runTraining && modelPath == null && !runGradientCheck) {
    throw new ArgumentException("Need to either train a new model, run the gradient check or specify a model to load with -model");
  }
  string[] newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
  Edu.Stanford.Nlp.Parser.Dvparser.DVParser dvparser = null;
  LexicalizedParser lexparser = null;
  // Three ways to obtain the parser: continue training an existing DVParser,
  // start fresh from a base parser, or load a serialized model.
  if (initialModelPath != null) {
    lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(initialModelPath, newArgs));
    DVModel model = GetModelFromLexicalizedParser(lexparser);
    dvparser = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(model, lexparser);
  } else {
    if (runTraining || runGradientCheck) {
      lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(parserPath, newArgs));
      dvparser = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(lexparser);
    } else {
      if (modelPath != null) {
        lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
        DVModel model = GetModelFromLexicalizedParser(lexparser);
        dvparser = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(model, lexparser);
      }
    }
  }
  IList <Tree> trainSentences = new List <Tree>();
  IdentityHashMap <Tree, byte[]> trainCompressedParses = Generics.NewIdentityHashMap();
  // Load pre-parsed, compressed hypotheses from cache files, if given.
  if (cachedTrainTreesPath != null) {
    foreach (string path in cachedTrainTreesPath.Split(",")) {
      IList <Pair <Tree, byte[]> > cache = IOUtils.ReadObjectFromFile(path);
      foreach (Pair <Tree, byte[]> pair in cache) {
        trainSentences.Add(pair.First());
        trainCompressedParses[pair.First()] = pair.Second();
      }
      log.Info("Read in " + cache.Count + " trees from " + path);
    }
  }
  // Parse a raw treebank (and compress the k-best hypotheses) if given.
  if (trainTreebankPath != null) {
    // TODO: make the transformer a member of the model?
    ITreeTransformer transformer = BuildTrainTransformer(dvparser.GetOp());
    Treebank treebank = dvparser.GetOp().tlpParams.MemoryTreebank();
    treebank.LoadPath(trainTreebankPath, trainTreebankFilter);
    treebank = treebank.Transform(transformer);
    log.Info("Read in " + treebank.Count + " trees from " + trainTreebankPath);
    CacheParseHypotheses cacher = new CacheParseHypotheses(dvparser.parser);
    CacheParseHypotheses.CacheProcessor processor = new CacheParseHypotheses.CacheProcessor(cacher, lexparser, dvparser.op.trainOptions.dvKBest, transformer);
    foreach (Tree tree in treebank) {
      trainSentences.Add(tree);
      trainCompressedParses[tree] = processor.Process(tree).second;
    }
    //System.out.println(tree);
    log.Info("Finished parsing " + treebank.Count + " trees, getting " + dvparser.op.trainOptions.dvKBest + " hypotheses each");
  }
  // Restrict the model's rules to those seen in the training set (unless -nofilter).
  if ((runTraining || runGradientCheck) && filter) {
    log.Info("Filtering rules for the given training set");
    dvparser.dvModel.SetRulesForTrainingSet(trainSentences, trainCompressedParses);
    log.Info("Done filtering rules; " + dvparser.dvModel.numBinaryMatrices + " binary matrices, " + dvparser.dvModel.numUnaryMatrices + " unary matrices, " + dvparser.dvModel.wordVectors.Count + " word vectors");
  }
  //dvparser.dvModel.printAllMatrices();
  Treebank testTreebank = null;
  if (testTreebankPath != null) {
    log.Info("Reading in trees from " + testTreebankPath);
    if (testTreebankFilter != null) {
      log.Info("Filtering on " + testTreebankFilter);
    }
    testTreebank = dvparser.GetOp().tlpParams.MemoryTreebank();
    testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
    log.Info("Read in " + testTreebank.Count + " trees for testing");
  }
  // runGradientCheck= true;
  if (runGradientCheck) {
    log.Info("Running gradient check on " + trainSentences.Count + " trees");
    dvparser.RunGradientCheck(trainSentences, trainCompressedParses);
  }
  if (runTraining) {
    log.Info("Training the RNN parser");
    log.Info("Current train options: " + dvparser.GetOp().trainOptions);
    dvparser.Train(trainSentences, trainCompressedParses, testTreebank, modelPath, resultsRecordPath);
    if (modelPath != null) {
      dvparser.SaveModel(modelPath);
    }
  }
  if (testTreebankPath != null) {
    EvaluateTreebank evaluator = new EvaluateTreebank(dvparser.AttachModelToLexicalizedParser());
    evaluator.TestOnTreebank(testTreebank);
  }
  log.Info("Successfully ran DVParser");
}
/// <summary>Run the scoring metric on guess/gold input.</summary>
/// <remarks>
/// Run the scoring metric on guess/gold input. This method performs "Collinization."
/// The default language is English.  Guess and gold treebanks are walked in
/// lockstep; pairs with unequal yields (or gold yields over the -y limit)
/// are skipped, matching evalb's behavior of excluding such pairs from the
/// LP/LR computation.
/// </remarks>
/// <param name="args">gold file and guess file, plus -l/-y/-v/-c/-e options</param>
public static void Main(string[] args) {
  if (args.Length < minArgs) {
    System.Console.Out.WriteLine(usage.ToString());
    System.Environment.Exit(-1);
  }
  ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  int maxGoldYield = int.MaxValue;
  bool Verbose = false;
  string encoding = "UTF-8";
  string guessFile = null;
  string goldFile = null;
  IDictionary <string, string[]> argsMap = StringUtils.ArgsToMap(args, optionArgDefs);
  foreach (KeyValuePair <string, string[]> opt in argsMap) {
    if (opt.Key == null) {
      continue;
    }
    if (opt.Key.Equals("-l")) {
      Language lang = Language.ValueOf(opt.Value[0].Trim());
      tlpp = lang.@params;
    } else {
      if (opt.Key.Equals("-y")) {
        maxGoldYield = System.Convert.ToInt32(opt.Value[0].Trim());
      } else {
        if (opt.Key.Equals("-v")) {
          Verbose = true;
        } else {
          if (opt.Key.Equals("-c")) {
            Edu.Stanford.Nlp.Parser.Metrics.TaggingEval.doCatLevelEval = true;
          } else {
            if (opt.Key.Equals("-e")) {
              encoding = opt.Value[0];
            } else {
              log.Info(usage.ToString());
              System.Environment.Exit(-1);
            }
          }
        }
      }
    }
    //Non-option arguments located at key null
    // NOTE(review): this block sits inside the foreach, so it re-reads the
    // non-option arguments on every iteration; it looks like it was meant
    // to run once after the loop — confirm against the Java original.
    string[] rest = argsMap[null];
    if (rest == null || rest.Length < minArgs) {
      log.Info(usage.ToString());
      System.Environment.Exit(-1);
    }
    goldFile = rest[0];
    guessFile = rest[1];
  }
  tlpp.SetInputEncoding(encoding);
  PrintWriter pwOut = tlpp.Pw();
  Treebank guessTreebank = tlpp.DiskTreebank();
  guessTreebank.LoadPath(guessFile);
  pwOut.Println("GUESS TREEBANK:");
  pwOut.Println(guessTreebank.TextualSummary());
  Treebank goldTreebank = tlpp.DiskTreebank();
  goldTreebank.LoadPath(goldFile);
  pwOut.Println("GOLD TREEBANK:");
  pwOut.Println(goldTreebank.TextualSummary());
  Edu.Stanford.Nlp.Parser.Metrics.TaggingEval metric = new Edu.Stanford.Nlp.Parser.Metrics.TaggingEval("Tagging LP/LR");
  ITreeTransformer tc = tlpp.Collinizer();
  //The evalb ref implementation assigns status for each tree pair as follows:
  //
  //   0 - Ok (yields match)
  //   1 - length mismatch
  //   2 - null parse e.g. (()).
  //
  //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
  IEnumerator <Tree> goldItr = goldTreebank.GetEnumerator();
  IEnumerator <Tree> guessItr = guessTreebank.GetEnumerator();
  int goldLineId = 0;
  int guessLineId = 0;
  int skippedGuessTrees = 0;
  while (guessItr.MoveNext() && goldItr.MoveNext()) {
    Tree guessTree = guessItr.Current;
    IList <ILabel> guessYield = guessTree.Yield();
    guessLineId++;
    Tree goldTree = goldItr.Current;
    IList <ILabel> goldYield = goldTree.Yield();
    goldLineId++;
    // Check that we should evaluate this tree
    if (goldYield.Count > maxGoldYield) {
      skippedGuessTrees++;
      continue;
    }
    // Only trees with equal yields can be evaluated
    if (goldYield.Count != guessYield.Count) {
      pwOut.Printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.Count, guessYield.Count, goldLineId, guessLineId);
      skippedGuessTrees++;
      continue;
    }
    Tree evalGuess = tc.TransformTree(guessTree);
    Tree evalGold = tc.TransformTree(goldTree);
    metric.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
  }
  // If either iterator still has items, the files were different lengths.
  if (guessItr.MoveNext() || goldItr.MoveNext()) {
    System.Console.Error.Printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId);
  }
  pwOut.Println("================================================================================");
  if (skippedGuessTrees != 0) {
    pwOut.Printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
  }
  metric.Display(true, pwOut);
  pwOut.Println();
  pwOut.Close();
}
/// <summary>Run the scoring metric on guess/gold input.</summary>
/// <remarks>
/// Run the scoring metric on guess/gold input. This method performs "Collinization."
/// The default language is English.  Comparison is character-based: for each
/// guess tree it scans forward through the gold treebank for a tree whose
/// character yield has the same length, skipping mismatches as evalb does.
/// </remarks>
/// <param name="args">gold file and guess file, plus -l/-y/-t/-v/-g options</param>
public static void Main(string[] args) {
  if (args.Length < minArgs) {
    System.Console.Out.WriteLine(usage.ToString());
    System.Environment.Exit(-1);
  }
  ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  int maxGoldYield = int.MaxValue;
  int maxGuessYield = int.MaxValue;
  bool Verbose = false;
  bool skipGuess = false;
  bool tagMode = false;
  string guessFile = null;
  string goldFile = null;
  for (int i = 0; i < args.Length; i++) {
    if (args[i].StartsWith("-")) {
      switch (args[i]) {
        case "-l": {
          Language lang = Language.ValueOf(args[++i].Trim());
          tlpp = lang.@params;
          break;
        }
        case "-y": {
          maxGoldYield = System.Convert.ToInt32(args[++i].Trim());
          break;
        }
        case "-t": {
          tagMode = true;
          break;
        }
        case "-v": {
          Verbose = true;
          break;
        }
        case "-g": {
          maxGuessYield = System.Convert.ToInt32(args[++i].Trim());
          skipGuess = true;
          break;
        }
        default: {
          System.Console.Out.WriteLine(usage.ToString());
          System.Environment.Exit(-1);
          break;
        }
      }
    } else {
      //Required parameters
      goldFile = args[i++];
      guessFile = args[i];
      break;
    }
  }
  PrintWriter pwOut = tlpp.Pw();
  Treebank guessTreebank = tlpp.DiskTreebank();
  guessTreebank.LoadPath(guessFile);
  pwOut.Println("GUESS TREEBANK:");
  pwOut.Println(guessTreebank.TextualSummary());
  Treebank goldTreebank = tlpp.DiskTreebank();
  goldTreebank.LoadPath(goldFile);
  pwOut.Println("GOLD TREEBANK:");
  pwOut.Println(goldTreebank.TextualSummary());
  string evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
  Edu.Stanford.Nlp.Parser.Metrics.TsarfatyEval eval = new Edu.Stanford.Nlp.Parser.Metrics.TsarfatyEval(evalName, tagMode);
  ITreeTransformer tc = tlpp.Collinizer();
  //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
  //don't match, we need to keep looking for the next gold tree that matches.
  //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
  //status as follows:
  //
  //   0 - Ok (yields match)
  //   1 - length mismatch
  //   2 - null parse e.g. (()).
  //
  //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
  IEnumerator <Tree> goldItr = goldTreebank.GetEnumerator();
  int goldLineId = 0;
  int skippedGuessTrees = 0;
  foreach (Tree guess in guessTreebank) {
    Tree evalGuess = tc.TransformTree(guess);
    List <ILabel> guessSent = guess.Yield();
    // Whitespace-free character string used for yield comparison.
    string guessChars = SentenceUtils.ListToString(guessSent).ReplaceAll("\\s+", string.Empty);
    if (guessSent.Count > maxGuessYield) {
      skippedGuessTrees++;
      continue;
    }
    bool doneEval = false;
    while (goldItr.MoveNext() && !doneEval) {
      Tree gold = goldItr.Current;
      Tree evalGold = tc.TransformTree(gold);
      goldLineId++;
      List <ILabel> goldSent = gold.Yield();
      string goldChars = SentenceUtils.ListToString(goldSent).ReplaceAll("\\s+", string.Empty);
      if (goldSent.Count > maxGoldYield) {
        continue;
      } else {
        if (goldChars.Length != guessChars.Length) {
          pwOut.Printf("Char level yield mismatch at line %d (guess: %d gold: %d)\n", goldLineId, guessChars.Length, goldChars.Length);
          skippedGuessTrees++;
          break;
        }
      }
      //Default evalb behavior -- skip this guess tree
      eval.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
      doneEval = true;
    }
  }
  //Move to the next guess parse
  pwOut.Println("================================================================================");
  if (skippedGuessTrees != 0) {
    pwOut.Printf("%s %d guess trees\n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
  }
  eval.Display(true, pwOut);
  pwOut.Println();
  pwOut.Close();
}
/// <summary>
/// Creates a debinarizer that rebuilds n-ary trees from binarized ones.
/// </summary>
/// <param name="forceCNF">Whether the input trees were forced into CNF.</param>
/// <param name="lf">Label factory used when constructing the output trees.</param>
public Debinarizer(bool forceCNF, ILabelFactory lf)
{
    this.forceCNF = forceCNF;
    this.boundaryRemover = new BoundaryRemover();
    this.tf = new LabeledScoredTreeFactory(lf);
}
/// <summary>
/// Takes a Tree and a collinizer and returns a Collection of labeled
/// <see cref="Edu.Stanford.Nlp.Trees.Constituent"/>s for PARSEVAL.
/// </summary>
/// <param name="t">The tree to extract constituents from</param>
/// <param name="collinizer">
/// The TreeTransformer used to normalize the tree for evaluation
/// </param>
/// <returns>The bag of Constituents for PARSEVAL.</returns>
public static ICollection<Constituent> ParsevalObjectify(Tree t, ITreeTransformer collinizer)
{
    // Delegate to the three-argument overload; the flag it receives is
    // always true for standard PARSEVAL objectification.
    return ParsevalObjectify(t, collinizer, true);
}
/// <summary>Run the Evalb scoring metric on guess/gold input.</summary>
/// <remarks>Run the Evalb scoring metric on guess/gold input. The default language is English.</remarks>
/// <param name="args">Option flags followed by the gold file and the guess file.</param>
public static void Main(string[] args)
{
    if (args.Length < minArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    // Parse flags into a Properties map; positional args land under the empty key.
    Properties options = StringUtils.ArgsToProperties(args, OptionArgDefs());
    Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
    ITreebankLangParserParams tlpp = language.@params;
    int maxGoldYield = PropertiesUtils.GetInt(options, "y", int.MaxValue);
    bool Verbose = PropertiesUtils.GetBool(options, "v", false);
    // -s enables sorting the worst-scoring k tree pairs by F1 for later emission.
    bool sortByF1 = PropertiesUtils.HasProperty(options, "s");
    int worstKTreesToEmit = PropertiesUtils.GetInt(options, "s", 0);
    PriorityQueue<Triple<double, Tree, Tree>> queue = sortByF1 ? new PriorityQueue<Triple<double, Tree, Tree>>(2000, new Evalb.F1Comparator()) : null;
    bool doCatLevel = PropertiesUtils.GetBool(options, "c", false);
    string labelRegex = options.GetProperty("f", null);
    string encoding = options.GetProperty("e", "UTF-8");
    // Positional arguments: gold file then guess file.
    string[] parsedArgs = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
    if (parsedArgs.Length != minArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    string goldFile = parsedArgs[0];
    string guessFile = parsedArgs[1];
    // Command-line has been parsed. Configure the metric for evaluation.
    tlpp.SetInputEncoding(encoding);
    PrintWriter pwOut = tlpp.Pw();
    Treebank guessTreebank = tlpp.DiskTreebank();
    guessTreebank.LoadPath(guessFile);
    pwOut.Println("GUESS TREEBANK:");
    pwOut.Println(guessTreebank.TextualSummary());
    Treebank goldTreebank = tlpp.DiskTreebank();
    goldTreebank.LoadPath(goldFile);
    pwOut.Println("GOLD TREEBANK:");
    pwOut.Println(goldTreebank.TextualSummary());
    Evalb metric = new Evalb("Evalb LP/LR", true);
    EvalbByCat evalbCat = (doCatLevel) ? new EvalbByCat("EvalbByCat LP/LR", true, labelRegex) : null;
    ITreeTransformer tc = tlpp.Collinizer();
    // The evalb ref implementation assigns status for each tree pair as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    // In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    IEnumerator<Tree> goldItr = goldTreebank.GetEnumerator();
    IEnumerator<Tree> guessItr = guessTreebank.GetEnumerator();
    int goldLineId = 0;
    int guessLineId = 0;
    int skippedGuessTrees = 0;
    // Walk the two treebanks in lock step; stop when either runs out.
    while (guessItr.MoveNext() && goldItr.MoveNext())
    {
        Tree guessTree = guessItr.Current;
        IList<ILabel> guessYield = guessTree.Yield();
        guessLineId++;
        Tree goldTree = goldItr.Current;
        IList<ILabel> goldYield = goldTree.Yield();
        goldLineId++;
        // Check that we should evaluate this tree.
        if (goldYield.Count > maxGoldYield)
        {
            skippedGuessTrees++;
            continue;
        }
        // Only trees with equal yields can be evaluated.
        if (goldYield.Count != guessYield.Count)
        {
            pwOut.Printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.Count, guessYield.Count, goldLineId, guessLineId);
            skippedGuessTrees++;
            continue;
        }
        // Normalize both trees before scoring (collinization).
        Tree evalGuess = tc.TransformTree(guessTree);
        Tree evalGold = tc.TransformTree(goldTree);
        metric.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
        if (doCatLevel)
        {
            evalbCat.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
        }
        if (sortByF1)
        {
            StoreTrees(queue, guessTree, goldTree, metric.GetLastF1());
        }
    }
    // If either iterator still has items, the files had unequal lengths.
    if (guessItr.MoveNext() || goldItr.MoveNext())
    {
        System.Console.Error.Printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId);
    }
    pwOut.Println("================================================================================");
    if (skippedGuessTrees != 0)
    {
        pwOut.Printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
    }
    metric.Display(true, pwOut);
    pwOut.Println();
    if (doCatLevel)
    {
        evalbCat.Display(true, pwOut);
        pwOut.Println();
    }
    if (sortByF1)
    {
        EmitSortedTrees(queue, worstKTreesToEmit, guessFile);
    }
    pwOut.Close();
}
/// <summary>
/// Initializes a new instance of the <see cref="TreeGenerator"/> class.
/// </summary>
/// <param name="transformer">The transformer of a CST tree to a transformation schema.</param>
/// <param name="logger">The syntax errors logger; may be omitted.</param>
public TreeGenerator(ITreeTransformer transformer, ILogger<ITreeGenerator> logger = null)
{
    this._transformer = transformer;
    this._logger = logger;
}
/// <summary>
/// Wraps a non-thread-safe transformer so it can be guarded by this class.
/// </summary>
/// <param name="threadUnsafe">The underlying transformer to delegate to.</param>
public SynchronizedTreeTransformer(ITreeTransformer threadUnsafe)
{
    this.threadUnsafe = threadUnsafe;
}
/// <summary>
/// Reparses the yield of <paramref name="tree"/> with the given parser and
/// returns up to <paramref name="dvKBest"/> PCFG parses, each optionally
/// passed through <paramref name="transformer"/>.
/// </summary>
/// <param name="parser">Parser used to produce the k-best list.</param>
/// <param name="dvKBest">How many top parses to request.</param>
/// <param name="tree">Tree whose word yield is reparsed.</param>
/// <param name="transformer">Optional transformer applied to each parse; may be null.</param>
/// <returns>The (possibly transformed) top parses, or null if the sentence
/// is trivial or the parse fails.</returns>
public static IList<Tree> GetTopParsesForOneTree(LexicalizedParser parser, int dvKBest, Tree tree, ITreeTransformer transformer)
{
    IParserQuery query = parser.ParserQuery();
    IList<Word> words = tree.YieldWords();
    // Since the trees are binarized and otherwise manipulated, the last word
    // is an end-of-sentence symbol and must be chopped off before reparsing.
    if (words.Count <= 1)
    {
        return null;
    }
    words = words.SubList(0, words.Count - 1);
    if (!query.Parse(words))
    {
        log.Info("Failed to use the given parser to reparse sentence \"" + words + "\"");
        return null;
    }
    IList<Tree> results = new List<Tree>();
    foreach (ScoredObject<Tree> scored in query.GetKBestPCFGParses(dvKBest))
    {
        Tree candidate = scored.Object();
        if (transformer != null)
        {
            candidate = transformer.TransformTree(candidate);
        }
        results.Add(candidate);
    }
    return results;
}
/// <summary>
/// Command-line driver: trains/loads a parser and/or character-based lexicon,
/// optionally segments and parses a test treebank, and reports bracketing scores.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    // Declare how many values each multi-argument flag consumes.
    IDictionary<string, int> flagsToNumArgs = Generics.NewHashMap();
    flagsToNumArgs["-parser"] = int.Parse(3);
    flagsToNumArgs["-lex"] = int.Parse(3);
    flagsToNumArgs["-test"] = int.Parse(2);
    flagsToNumArgs["-out"] = int.Parse(1);
    flagsToNumArgs["-lengthPenalty"] = int.Parse(1);
    flagsToNumArgs["-penaltyType"] = int.Parse(1);
    flagsToNumArgs["-maxLength"] = int.Parse(1);
    flagsToNumArgs["-stats"] = int.Parse(2);
    IDictionary<string, string[]> argMap = StringUtils.ArgsToMap(args, flagsToNumArgs);
    bool eval = argMap.Contains("-eval");
    PrintWriter pw = null;
    if (argMap.Contains("-out"))
    {
        // Output is written in GB18030 (Chinese text), with auto-flush enabled.
        pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream((argMap["-out"])[0]), "GB18030"), true);
    }
    log.Info("ChineseCharacterBasedLexicon called with args:");
    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    // Forward every raw argument to the parser params as a potential option flag.
    for (int i = 0; i < args.Length; i++)
    {
        ctpp.SetOptionFlag(args, i);
        log.Info(" " + args[i]);
    }
    log.Info();
    Options op = new Options(ctpp);
    if (argMap.Contains("-stats"))
    {
        // Stats-only mode: load (and optionally annotate) a treebank, print
        // statistics, and exit without training or testing.
        string[] statArgs = (argMap["-stats"]);
        MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
        IFileFilter trainFilt = new NumberRangesFileFilter(statArgs[1], false);
        rawTrainTreebank.LoadPath(new File(statArgs[0]), trainFilt);
        log.Info("Done reading trees.");
        MemoryTreebank trainTreebank;
        if (argMap.Contains("-annotate"))
        {
            trainTreebank = new MemoryTreebank();
            TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
            foreach (Tree tree in rawTrainTreebank)
            {
                trainTreebank.Add(annotator.TransformTree(tree));
            }
            log.Info("Done annotating trees.");
        }
        else
        {
            trainTreebank = rawTrainTreebank;
        }
        PrintStats(trainTreebank, pw);
        System.Environment.Exit(0);
    }
    int maxLength = 1000000;
    // Test.verbose = true;
    if (argMap.Contains("-norm"))
    {
        op.testOptions.lengthNormalization = true;
    }
    if (argMap.Contains("-maxLength"))
    {
        maxLength = System.Convert.ToInt32((argMap["-maxLength"])[0]);
    }
    // NOTE(review): this unconditionally caps the parser's maxLength at 120 even
    // when -maxLength was given; only the local sentence-skipping limit above
    // honors the flag. Confirm whether this is intentional.
    op.testOptions.maxLength = 120;
    bool combo = argMap.Contains("-combo");
    if (combo)
    {
        ctpp.useCharacterBasedLexicon = true;
        op.testOptions.maxSpanForTags = 10;
        op.doDep = false;
        op.dcTags = false;
    }
    LexicalizedParser lp = null;
    ILexicon lex = null;
    if (argMap.Contains("-parser"))
    {
        string[] parserArgs = (argMap["-parser"]);
        if (parserArgs.Length > 1)
        {
            // Train from a treebank; optionally serialize the trained parser.
            IFileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
            lp = LexicalizedParser.TrainFromTreebank(parserArgs[0], trainFilt, op);
            if (parserArgs.Length == 3)
            {
                string filename = parserArgs[2];
                log.Info("Writing parser in serialized format to file " + filename + " ");
                System.Console.Error.Flush();
                ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                @out.WriteObject(lp);
                @out.Close();
                log.Info("done.");
            }
        }
        else
        {
            // Single argument: load a previously serialized parser.
            string parserFile = parserArgs[0];
            lp = LexicalizedParser.LoadModel(parserFile, op);
        }
        // Adopt the loaded/trained parser's lexicon and options wholesale.
        lex = lp.GetLexicon();
        op = lp.GetOp();
        ctpp = (ChineseTreebankParserParams)op.tlpParams;
    }
    if (argMap.Contains("-rad"))
    {
        ctpp.useUnknownCharacterModel = true;
    }
    if (argMap.Contains("-lengthPenalty"))
    {
        ctpp.lengthPenalty = double.Parse((argMap["-lengthPenalty"])[0]);
    }
    if (argMap.Contains("-penaltyType"))
    {
        ctpp.penaltyType = System.Convert.ToInt32((argMap["-penaltyType"])[0]);
    }
    if (argMap.Contains("-lex"))
    {
        string[] lexArgs = (argMap["-lex"]);
        if (lexArgs.Length > 1)
        {
            // Train a lexicon from a treebank (optionally annotated first).
            IIndex<string> wordIndex = new HashIndex<string>();
            IIndex<string> tagIndex = new HashIndex<string>();
            lex = ctpp.Lex(op, wordIndex, tagIndex);
            MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
            IFileFilter trainFilt = new NumberRangesFileFilter(lexArgs[1], false);
            rawTrainTreebank.LoadPath(new File(lexArgs[0]), trainFilt);
            log.Info("Done reading trees.");
            MemoryTreebank trainTreebank;
            if (argMap.Contains("-annotate"))
            {
                trainTreebank = new MemoryTreebank();
                TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
                foreach (Tree tree in rawTrainTreebank)
                {
                    tree = annotator.TransformTree(tree);
                    trainTreebank.Add(tree);
                }
                log.Info("Done annotating trees.");
            }
            else
            {
                trainTreebank = rawTrainTreebank;
            }
            lex.InitializeTraining(trainTreebank.Count);
            lex.Train(trainTreebank);
            lex.FinishTraining();
            log.Info("Done training lexicon.");
            if (lexArgs.Length == 3)
            {
                // Optionally serialize the trained lexicon. (The ternary is
                // redundant inside this branch but kept as-is.)
                string filename = lexArgs.Length == 3 ? lexArgs[2] : "parsers/chineseCharLex.ser.gz";
                log.Info("Writing lexicon in serialized format to file " + filename + " ");
                System.Console.Error.Flush();
                ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                @out.WriteObject(lex);
                @out.Close();
                log.Info("done.");
            }
        }
        else
        {
            // Load a serialized lexicon from file (or the default path).
            string lexFile = lexArgs.Length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
            log.Info("Reading Lexicon from file " + lexFile);
            ObjectInputStream @in = IOUtils.ReadStreamFromString(lexFile);
            try
            {
                lex = (ILexicon)@in.ReadObject();
            }
            catch (TypeLoadException)
            {
                throw new Exception("Bad serialized file: " + lexFile);
            }
            @in.Close();
        }
    }
    if (argMap.Contains("-test"))
    {
        bool segmentWords = ctpp.segment;
        bool parse = lp != null;
        // Must be doing at least one of segmenting or parsing.
        System.Diagnostics.Debug.Assert((parse || segmentWords));
        // WordCatConstituent.collinizeWords = argMap.containsKey("-collinizeWords");
        // WordCatConstituent.collinizeTags = argMap.containsKey("-collinizeTags");
        IWordSegmenter seg = null;
        if (segmentWords)
        {
            // The lexicon doubles as the word segmenter in this mode.
            seg = (IWordSegmenter)lex;
        }
        string[] testArgs = (argMap["-test"]);
        MemoryTreebank testTreebank = op.tlpParams.MemoryTreebank();
        IFileFilter testFilt = new NumberRangesFileFilter(testArgs[1], false);
        testTreebank.LoadPath(new File(testArgs[0]), testFilt);
        ITreeTransformer subcategoryStripper = op.tlpParams.SubcategoryStripper();
        ITreeTransformer collinizer = ctpp.Collinizer();
        WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
        WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
        EquivalenceClassEval basicEval = new EquivalenceClassEval(eqclass, eqcheck, "basic");
        EquivalenceClassEval collinsEval = new EquivalenceClassEval(eqclass, eqcheck, "collinized");
        // Select which bracket types (word/tag/category) participate in evaluation.
        IList<string> evalTypes = new List<string>(3);
        bool goodPOS = false;
        if (segmentWords)
        {
            evalTypes.Add(WordCatConstituent.wordType);
            if (ctpp.segmentMarkov && !parse)
            {
                evalTypes.Add(WordCatConstituent.tagType);
                goodPOS = true;
            }
        }
        if (parse)
        {
            evalTypes.Add(WordCatConstituent.tagType);
            evalTypes.Add(WordCatConstituent.catType);
            if (combo)
            {
                evalTypes.Add(WordCatConstituent.wordType);
                goodPOS = true;
            }
        }
        TreeToBracketProcessor proc = new TreeToBracketProcessor(evalTypes);
        log.Info("Testing...");
        foreach (Tree goldTop in testTreebank)
        {
            Tree gold = goldTop.FirstChild();
            IList<IHasWord> goldSentence = gold.YieldHasWord();
            if (goldSentence.Count > maxLength)
            {
                log.Info("Skipping sentence; too long: " + goldSentence.Count);
                continue;
            }
            else
            {
                log.Info("Processing sentence; length: " + goldSentence.Count);
            }
            IList<IHasWord> s;
            if (segmentWords)
            {
                // Concatenate the gold characters and re-segment them from scratch.
                StringBuilder goldCharBuf = new StringBuilder();
                foreach (IHasWord aGoldSentence in goldSentence)
                {
                    StringLabel word = (StringLabel)aGoldSentence;
                    goldCharBuf.Append(word.Value());
                }
                string goldChars = goldCharBuf.ToString();
                s = seg.Segment(goldChars);
            }
            else
            {
                s = goldSentence;
            }
            Tree tree;
            if (parse)
            {
                tree = lp.ParseTree(s);
                if (tree == null)
                {
                    throw new Exception("PARSER RETURNED NULL!!!");
                }
            }
            else
            {
                // No parser: build a flat tree over the segmented words.
                tree = Edu.Stanford.Nlp.Trees.Trees.ToFlatTree(s);
                tree = subcategoryStripper.TransformTree(tree);
            }
            if (pw != null)
            {
                if (parse)
                {
                    tree.PennPrint(pw);
                }
                else
                {
                    // NOTE(review): this loop reads sentIter.Current before the
                    // first MoveNext() call, which is invalid for C# enumerators
                    // (likely a Java iterator.next() mis-conversion). It appears
                    // it would skip/garble the first word — confirm and fix.
                    IEnumerator sentIter = s.GetEnumerator();
                    for (; ;)
                    {
                        Word word = (Word)sentIter.Current;
                        pw.Print(word.Word());
                        if (sentIter.MoveNext())
                        {
                            pw.Print(" ");
                        }
                        else
                        {
                            break;
                        }
                    }
                }
                pw.Println();
            }
            if (eval)
            {
                // Score raw (basic) brackets, then collinized brackets.
                ICollection ourBrackets;
                ICollection goldBrackets;
                ourBrackets = proc.AllBrackets(tree);
                goldBrackets = proc.AllBrackets(gold);
                if (goodPOS)
                {
                    Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(tree, gold));
                    Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(gold, tree));
                }
                basicEval.Eval(ourBrackets, goldBrackets);
                System.Console.Out.WriteLine("\nScores:");
                basicEval.DisplayLast();
                Tree collinsTree = collinizer.TransformTree(tree);
                Tree collinsGold = collinizer.TransformTree(gold);
                ourBrackets = proc.AllBrackets(collinsTree);
                goldBrackets = proc.AllBrackets(collinsGold);
                if (goodPOS)
                {
                    Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsTree, collinsGold));
                    Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsGold, collinsTree));
                }
                collinsEval.Eval(ourBrackets, goldBrackets);
                System.Console.Out.WriteLine("\nCollinized scores:");
                collinsEval.DisplayLast();
                System.Console.Out.WriteLine();
            }
        }
        if (eval)
        {
            basicEval.Display();
            System.Console.Out.WriteLine();
            collinsEval.Display();
        }
    }
}
/// <summary>
/// Return a Treebank (actually a TransformingTreebank) where each
/// Tree in the current treebank has been transformed using the
/// TreeTransformer.
/// </summary>
/// <remarks>
/// Return a Treebank (actually a TransformingTreebank) where each
/// Tree in the current treebank has been transformed using the
/// TreeTransformer. The argument Treebank is unchanged (assuming
/// that the TreeTransformer correctly doesn't change input Trees).
/// </remarks>
/// <param name="treeTrans">The TreeTransformer to use</param>
/// <returns>
/// A Treebank (actually a TransformingTreebank) where each
/// Tree in the current treebank has been transformed using the
/// TreeTransformer.
/// </returns>
public virtual Edu.Stanford.Nlp.Trees.Treebank Transform(ITreeTransformer treeTrans)
{
    // Wrap this treebank lazily: trees are transformed on iteration,
    // not copied up front.
    Edu.Stanford.Nlp.Trees.Treebank wrapped = new TransformingTreebank(this, treeTrans);
    return wrapped;
}
/// <summary>Normalize a whole tree -- one can assume that this is the root.</summary>
/// <remarks>
/// Normalize a whole tree -- one can assume that this is the root.
/// This implementation deletes empty elements (ones with nonterminal
/// tag label '-NONE-') from the tree, wraps a bare top-level S in a
/// ROOT node, and repairs Switchboard's phrasal VB labels to VP.
/// </remarks>
public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
{
    // NOTE(review): transformer1 and transformer2 are null here; the original
    // Java source presumably supplied anonymous TreeTransformer instances that
    // the conversion tool stripped (likewise the orphaned comments below came
    // from those bodies). tree.Transform(null) is almost certainly not the
    // intended behavior -- confirm against the upstream source.
    ITreeTransformer transformer1 = null;
    // Note: this changes the tree label, rather than
    // creating a new tree node. Beware!
    // Filter that prunes whole subtrees; _IPredicate_218 is an extracted
    // anonymous predicate class defined elsewhere in this file.
    IPredicate<Tree> subtreeFilter = new _IPredicate_218();
    // The special Switchboard non-terminals clause.
    // Note that it deletes IP which other Treebanks might use!
    // Prevents deletion of the word "IP"
    // Delete empty/trace nodes (ones marked '-NONE-')
    IPredicate<Tree> nodeFilter = new _IPredicate_238();
    // The special switchboard non-terminals clause. Try keeping EDITED for now....
    // if ("EDITED".equals(t.label().value())) {
    //   return false;
    // }
    ITreeTransformer transformer2 = null;
    // (The following comments are remnants of the stripped transformer bodies:
    // special fixes for possessives -- make noun before head; retagging to -TMP;
    // allowing chains to start with PP or ADVP; changing the head to NP where
    // possible. All of them mutate tree labels in place rather than creating
    // new tree nodes. Beware!)
    // log.info("TMP: Annotating " + t);
    // If there wasn't an empty nonterminal at the top, but an S, wrap it.
    if (tree.Label().Value().Equals("S"))
    {
        tree = tf.NewTreeNode("ROOT", Collections.SingletonList(tree));
    }
    // Repair for the phrasal VB in Switchboard (PTB version 3) that should be a VP.
    foreach (Tree subtree in tree)
    {
        if (subtree.IsPhrasal() && "VB".Equals(subtree.Label().Value()))
        {
            subtree.SetValue("VP");
        }
    }
    // Apply the normalization pipeline; each stage may yield null (tree deleted).
    tree = tree.Transform(transformer1);
    if (tree == null)
    {
        return(null);
    }
    tree = tree.Prune(subtreeFilter, tf);
    if (tree == null)
    {
        return(null);
    }
    tree = tree.SpliceOut(nodeFilter, tf);
    if (tree == null)
    {
        return(null);
    }
    return(tree.Transform(transformer2, tf));
}
/// <summary>Returns a collection of unordered (but directed!) typed word-word dependencies for the tree.</summary>
/// <param name="t">The tree to extract dependencies from</param>
/// <param name="hf">Head finder used to determine dependency heads</param>
/// <param name="collinizer">Transformer used to normalize the tree first</param>
public static ICollection<IList<string>> UnorderedTypedDependencyObjectify(Tree t, IHeadFinder hf, ITreeTransformer collinizer)
{
    // Build the unordered typed-dependency typer, then delegate to the
    // generic dependency objectifier.
    AbstractTreebankParserParams.UnorderedTypedDependencyTyper typer = new AbstractTreebankParserParams.UnorderedTypedDependencyTyper(hf);
    return DependencyObjectify(t, hf, collinizer, typer);
}