/// <summary>
/// Trains an Arabic factored parser from <paramref name="trainTreebankFile"/> and then
/// parses the lattice input read from <paramref name="inputStream"/>, returning the
/// result of <c>Parse</c>.
/// Note: <paramref name="testTreebankFile"/> is accepted but never read in this body —
/// presumably kept for interface compatibility; TODO confirm with callers.
/// Side effects: (re)assigns the instance fields <c>op</c>, <c>treePrint</c>,
/// <c>debinarizer</c>, <c>subcategoryStripper</c> and <c>lp</c>.
/// </summary>
public virtual bool Run(File trainTreebankFile, File testTreebankFile, InputStream inputStream)
{
    // Fresh options, hardwired to the Arabic factored configuration.
    op = new Options();
    op.tlpParams = new ArabicTreebankParserParams();
    op.SetOptions("-arabicFactored");
    op.testOptions.maxLength = maxSentLen;
    op.testOptions.MaxItems = 5000000;
    //500000 is the default for Arabic, but we have substantially more edges now
    op.testOptions.outputFormatOptions = "removeTopBracket,includePunctuationDependencies";
    // WSG: Just set this to some high value so that extractBestParse()
    // actually calls the lattice reader (e.g., this says that we can't have a word longer than
    // 80 characters...seems sensible for Arabic
    op.testOptions.maxSpanForTags = 80;
    treePrint = op.testOptions.TreePrint(op.tlpParams);
    debinarizer = new Debinarizer(op.forceCNF, new CategoryWordTagFactory());
    subcategoryStripper = op.tlpParams.SubcategoryStripper();
    Timing.StartTime();
    // Load the training treebank from disk and train the parser data from it.
    Treebank trainTreebank = op.tlpParams.DiskTreebank();
    trainTreebank.LoadPath(trainTreebankFile);
    lp = GetParserDataFromTreebank(trainTreebank);
    MakeParsers();
    if (Verbose)
    {
        // Dump a one-line summary of grammar sizes; columns are blank when
        // there is no PCFG parser (pparser == null).
        // NOTE(review): lexNumRules is guarded by pparser != null but reads
        // lp.lex — confirm that lp.lex is always non-null when pparser is.
        op.Display();
        string lexNumRules = (pparser != null) ? int.ToString(lp.lex.NumRules()) : string.Empty;
        log.Info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
        log.Info("Grammar\t" + lp.stateIndex.Size() + '\t' + lp.tagIndex.Size() + '\t' + lp.wordIndex.Size() + '\t' + (pparser != null ? lp.ug.NumRules() : string.Empty) + '\t' + (pparser != null ? lp.bg.NumRules() : string.Empty) + '\t' + lexNumRules
            );
        log.Info("ParserPack is " + op.tlpParams.GetType().FullName);
        log.Info("Lexicon is " + lp.lex.GetType().FullName);
    }
    return(Parse(inputStream));
}
/// <summary>
/// Computes the English split categories from Penn Treebank sections 2-21
/// with the fixed cutoff of 300 used in ACL03PCFG.
/// </summary>
/// <remarks>
/// This is hardwired to PTB sections 2-21 and a 300 cutoff. It exists to
/// support upgrading old code paths that previously relied on a pre-stored
/// splitter list when no Treebank was available.
/// </remarks>
public static ICollection<string> GetEnglishSplitCategories(string treebankRoot)
{
    ITreebankLangParserParams englishParams = new EnglishTreebankParserParams();
    // PTB sections 2-21 correspond to file numbers 200 through 2199.
    Treebank trainSections = englishParams.MemoryTreebank();
    trainSections.LoadPath(treebankRoot, new NumberRangeFileFilter(200, 2199, true));
    return GetSplitCategories(trainSections, 300.0, englishParams.TreebankLanguagePack());
}
/// <summary>
/// Reads a treebank from <paramref name="treebankPath"/> (restricted by
/// <paramref name="treebankFilter"/>) and returns it as a list of binarized trees.
/// </summary>
public virtual IList<Tree> ReadBinarizedTreebank(string treebankPath, IFileFilter treebankFilter)
{
    Treebank rawTreebank = ReadTreebank(treebankPath, treebankFilter);
    IList<Tree> result = BinarizeTreebank(rawTreebank, op);
    log.Info("Converted trees to binarized format");
    return result;
}
/// <summary>
/// Test fixture setup: builds an in-memory treebank from the gold trees in
/// <c>correctTrees</c> and stores the binarized result in <c>binarizedTrees</c>.
/// </summary>
protected virtual void SetUp()
{
    Options options = new Options();
    Treebank goldTreebank = options.tlpParams.MemoryTreebank();
    Sharpen.Collections.AddAll(goldTreebank, Arrays.AsList(correctTrees));
    binarizedTrees = ShiftReduceParser.BinarizeTreebank(goldTreebank, options);
}
/// <summary>
/// Loads a memory treebank from <paramref name="treebankPath"/>, restricted by
/// <paramref name="treebankFilter"/>, logging before and after the load.
/// </summary>
public virtual Treebank ReadTreebank(string treebankPath, IFileFilter treebankFilter)
{
    log.Info("Loading trees from " + treebankPath);
    Treebank loaded = op.tlpParams.MemoryTreebank();
    loaded.LoadPath(treebankPath, treebankFilter);
    log.Info("Read in " + loaded.Count + " trees from " + treebankPath);
    return loaded;
}
/// <summary>Call this method to get the set of categories to split on.</summary>
/// <remarks>
/// Computes parent annotation statistics over the treebank, then collects the
/// splitters that pass the phrasal and tag cutoffs, respectively. Suitable for
/// selective parent splitting in the PCFGParser inside FactoredParser. <p>
/// If tlp is non-null, tlp.basicCategory() will be applied to parent and
/// grandparent nodes. <p>
/// <i>Implementation note:</i> not safe for concurrent invocation — it relies
/// on static state variables. The <c>algorithm</c> parameter is not consulted
/// in this body.
/// </remarks>
public static ICollection<string> GetSplitCategories(Treebank t, bool doTags, int algorithm, double phrasalCutOff, double tagCutOff, ITreebankLanguagePack tlp)
{
    Edu.Stanford.Nlp.Parser.Lexparser.ParentAnnotationStats stats = new Edu.Stanford.Nlp.Parser.Lexparser.ParentAnnotationStats(tlp, doTags);
    // Gather counts over every tree in the treebank.
    t.Apply(stats);
    ICollection<string> result = Generics.NewHashSet();
    // Phrasal splitters first, then tag splitters, both folded into one set.
    Edu.Stanford.Nlp.Parser.Lexparser.ParentAnnotationStats.GetSplitters(phrasalCutOff, stats.nodeRules, stats.pRules, stats.gPRules, result);
    Edu.Stanford.Nlp.Parser.Lexparser.ParentAnnotationStats.GetSplitters(tagCutOff, stats.tagNodeRules, stats.tagPRules, stats.tagGPRules, result);
    return result;
}
/// <summary>
/// Trains a ChineseMaxentLexicon on one range of treebank files and evaluates
/// its tagging accuracy on another.
/// Expected args: [0] treebank path, [1] training file ranges, [2] test file
/// ranges, [3] optional feature level (defaults to DefaultFeatureLevel).
/// </summary>
public static void Main(string[] args)
{
    ITreebankLangParserParams tlpParams = new ChineseTreebankParserParams();
    // NOTE(review): ctlp is never used below — possibly vestigial.
    ITreebankLanguagePack ctlp = tlpParams.TreebankLanguagePack();
    Options op = new Options(tlpParams);
    TreeAnnotator ta = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
    log.Info("Reading Trees...");
    IFileFilter trainFilter = new NumberRangesFileFilter(args[1], true);
    Treebank trainTreebank = tlpParams.MemoryTreebank();
    trainTreebank.LoadPath(args[0], trainFilter);
    log.Info("Annotating trees...");
    // Annotate every training tree before lexicon training.
    ICollection<Tree> trainTrees = new List<Tree>();
    foreach (Tree tree in trainTreebank)
    {
        trainTrees.Add(ta.TransformTree(tree));
    }
    trainTreebank = null;
    // saves memory
    log.Info("Training lexicon...");
    IIndex<string> wordIndex = new HashIndex<string>();
    IIndex<string> tagIndex = new HashIndex<string>();
    int featureLevel = DefaultFeatureLevel;
    if (args.Length > 3)
    {
        featureLevel = System.Convert.ToInt32(args[3]);
    }
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseMaxentLexicon lex = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseMaxentLexicon(op, wordIndex, tagIndex, featureLevel);
    lex.InitializeTraining(trainTrees.Count);
    lex.Train(trainTrees);
    lex.FinishTraining();
    log.Info("Testing");
    // Flatten the test treebank into a single list of tagged words.
    IFileFilter testFilter = new NumberRangesFileFilter(args[2], true);
    Treebank testTreebank = tlpParams.MemoryTreebank();
    testTreebank.LoadPath(args[0], testFilter);
    IList<TaggedWord> testWords = new List<TaggedWord>();
    foreach (Tree t in testTreebank)
    {
        foreach (TaggedWord tw in t.TaggedYield())
        {
            testWords.Add(tw);
        }
    }
    //testWords.addAll(t.taggedYield());
    // totalAndCorrect[0] = total tokens, totalAndCorrect[1] = correctly tagged.
    int[] totalAndCorrect = lex.TestOnTreebank(testWords);
    log.Info("done.");
    System.Console.Out.WriteLine(totalAndCorrect[1] + " correct out of " + totalAndCorrect[0] + " -- ACC: " + ((double)totalAndCorrect[1]) / totalAndCorrect[0]);
}
/// <summary>
/// Trains a new PerceptronModel from the given training treebanks, optionally
/// retagging the trees with a pre-trained tagger first, and installs the
/// trained model in <c>this.model</c>.
/// </summary>
/// <param name="trainTreebankPath">Path/filter pairs for the training treebanks.</param>
/// <param name="devTreebankPath">Optional path/filter pair for a dev treebank (may be null).</param>
/// <param name="serializedPath">Where TrainModel should serialize intermediate/final models.</param>
private void Train(IList<Pair<string, IFileFilter>> trainTreebankPath, Pair<string, IFileFilter> devTreebankPath, string serializedPath)
{
    log.Info("Training method: " + op.TrainOptions().trainingMethod);
    // Read and concatenate all training treebanks as binarized trees.
    IList<Tree> binarizedTrees = Generics.NewArrayList();
    foreach (Pair<string, IFileFilter> treebank in trainTreebankPath)
    {
        Sharpen.Collections.AddAll(binarizedTrees, ReadBinarizedTreebank(treebank.First(), treebank.Second()));
    }
    // Non-positive trainingThreads means "use all available processors".
    int nThreads = op.trainOptions.trainingThreads;
    nThreads = nThreads <= 0 ? Runtime.GetRuntime().AvailableProcessors() : nThreads;
    Edu.Stanford.Nlp.Tagger.Common.Tagger tagger = null;
    if (op.testOptions.preTag)
    {
        // Replace gold tags with the output of a serialized tagger so that
        // training matches test-time conditions.
        Timing retagTimer = new Timing();
        tagger = Edu.Stanford.Nlp.Tagger.Common.Tagger.LoadModel(op.testOptions.taggerSerializedFile);
        RedoTags(binarizedTrees, tagger, nThreads);
        retagTimer.Done("Retagging");
    }
    ICollection<string> knownStates = FindKnownStates(binarizedTrees);
    ICollection<string> rootStates = FindRootStates(binarizedTrees);
    ICollection<string> rootOnlyStates = FindRootOnlyStates(binarizedTrees, rootStates);
    log.Info("Known states: " + knownStates);
    log.Info("States which occur at the root: " + rootStates);
    // Bug fix: this line previously logged rootStates again instead of
    // rootOnlyStates, duplicating the previous message.
    log.Info("States which only occur at the root: " + rootOnlyStates);
    // Convert each binarized tree into the transition sequence that produces it.
    Timing transitionTimer = new Timing();
    IList<IList<ITransition>> transitionLists = CreateTransitionSequence.CreateTransitionSequences(binarizedTrees, op.compoundUnaries, rootStates, rootOnlyStates);
    IIndex<ITransition> transitionIndex = new HashIndex<ITransition>();
    foreach (IList<ITransition> transitions in transitionLists)
    {
        transitionIndex.AddAll(transitions);
    }
    transitionTimer.Done("Converting trees into transition lists");
    log.Info("Number of transitions: " + transitionIndex.Size());
    Random random = new Random(op.trainOptions.randomSeed);
    Treebank devTreebank = null;
    if (devTreebankPath != null)
    {
        devTreebank = ReadTreebank(devTreebankPath.First(), devTreebankPath.Second());
    }
    PerceptronModel newModel = new PerceptronModel(this.op, transitionIndex, knownStates, rootStates, rootOnlyStates);
    newModel.TrainModel(serializedPath, tagger, random, binarizedTrees, transitionLists, devTreebank, nThreads);
    this.model = newModel;
}
/// <summary>
/// Loads a serialized LexicalizedParser and scores a test treebank with it.
/// Expected args: [0] parser model path, [1] treebank path, [2] low file
/// number, [3] high file number; remaining args are parsed as options.
/// </summary>
public virtual void RunTest(string[] args)
{
    // get a parser from file
    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(args[0]));
    op = parser.GetOp();
    // in case a serialized options was read in
    int lowRange = System.Convert.ToInt32(args[2]);
    int highRange = System.Convert.ToInt32(args[3]);
    Treebank evalTreebank = op.tlpParams.MemoryTreebank();
    evalTreebank.LoadPath(args[1], new NumberRangeFileFilter(lowRange, highRange, true));
    op.SetOptionsOrWarn(args, 4, args.Length);
    // Hardwired to English treebank parameters, matching the original behavior.
    TestOnTreebank(parser, new EnglishTreebankParserParams(), evalTreebank, args[1], parser.stateIndex);
}
/// <summary>
/// Loads the training treebank for segmenter training; a null
/// <paramref name="filt"/> means load every file under the path.
/// </summary>
private static Treebank MakeTreebank(string treebankPath, Options op, IFileFilter filt)
{
    log.Info("Training a segmenter from treebank dir: " + treebankPath);
    Treebank result = op.tlpParams.MemoryTreebank();
    log.Info("Reading trees...");
    if (filt != null)
    {
        result.LoadPath(treebankPath, filt);
    }
    else
    {
        result.LoadPath(treebankPath);
    }
    Timing.Tick("done [read " + result.Count + " trees].");
    return result;
}
/// <summary>
/// Builds the tuning set of lexicon events from a dev treebank. Each tree is
/// transformed in place (every non-leaf subtree run through the language
/// pack's TransformTree) before conversion to lexicon events.
/// </summary>
private static IList<FactoredLexiconEvent> GetTuningSet(Treebank devTreebank, Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon, ITreebankLangParserParams tlpp)
{
    IList<Tree> transformedTrees = new List<Tree>(3000);
    foreach (Tree devTree in devTreebank)
    {
        foreach (Tree node in devTree)
        {
            if (!node.IsLeaf())
            {
                tlpp.TransformTree(node, devTree);
            }
        }
        transformedTrees.Add(devTree);
    }
    return TreebankToLexiconEvents(transformedTrees, lexicon);
}
/// <summary>
/// Constructs a reader over the trees of the file described by
/// <paramref name="record"/>, honoring its optional reader factory,
/// transformer, normalizer, filter, encoding and tree range, and positions
/// the iterator on the first tree via FindNext().
/// </summary>
public TreeTaggedFileReader(TaggedFileRecord record)
{
    // int numSentences = 0;
    filename = record.file;
    // Fall back to the default reader factory when the record supplies none.
    trf = record.trf ?? new LabeledScoredTreeReaderFactory();
    transformer = record.treeTransformer;
    normalizer = record.treeNormalizer;
    treeFilter = record.treeFilter;
    treebank = new DiskTreebank(trf, record.encoding);
    if (record.treeRange == null)
    {
        treebank.LoadPath(filename);
    }
    else
    {
        treebank.LoadPath(filename, record.treeRange);
    }
    treeIterator = treebank.GetEnumerator();
    FindNext();
}
/// <summary>
/// Builds a LexicalizedParser from a training treebank: binarizes the trees,
/// extracts the PCFG (unary + binary grammars), trains the lexicon, and —
/// when <c>op.doDep</c> is set — extracts a dependency grammar as well.
/// </summary>
/// <param name="trainTreebank">The raw (unbinarized) training treebank.</param>
/// <returns>A LexicalizedParser wrapping the extracted grammars, lexicon and indices.</returns>
public virtual LexicalizedParser GetParserDataFromTreebank(Treebank trainTreebank)
{
    log.Info("Binarizing training trees...");
    IList<Tree> binaryTrainTrees = GetAnnotatedBinaryTreebankFromTreebank(trainTreebank);
    Timing.Tick("done.");
    IIndex<string> stateIndex = new HashIndex<string>();
    log.Info("Extracting PCFG...");
    IExtractor<Pair<UnaryGrammar, BinaryGrammar>> bgExtractor = new BinaryGrammarExtractor(op, stateIndex);
    Pair<UnaryGrammar, BinaryGrammar> bgug = bgExtractor.Extract(binaryTrainTrees);
    BinaryGrammar bg = bgug.second;
    bg.SplitRules();
    UnaryGrammar ug = bgug.first;
    ug.PurgeRules();
    Timing.Tick("done.");
    log.Info("Extracting Lexicon...");
    // wordIndex/tagIndex are shared between the lexicon and (optionally) the
    // dependency grammar extractor below.
    IIndex<string> wordIndex = new HashIndex<string>();
    IIndex<string> tagIndex = new HashIndex<string>();
    ILexicon lex = op.tlpParams.Lex(op, wordIndex, tagIndex);
    lex.InitializeTraining(binaryTrainTrees.Count);
    lex.Train(binaryTrainTrees);
    lex.FinishTraining();
    Timing.Tick("done.");
    IExtractor<IDependencyGrammar> dgExtractor = op.tlpParams.DependencyGrammarExtractor(op, wordIndex, tagIndex);
    IDependencyGrammar dg = null;
    if (op.doDep)
    {
        log.Info("Extracting Dependencies...");
        dg = dgExtractor.Extract(binaryTrainTrees);
        dg.SetLexicon(lex);
        Timing.Tick("done.");
    }
    log.Info("Done extracting grammars and lexicon.");
    return(new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op));
}
/// <summary>
/// Estimates the distribution of word lengths produced by this segmenter:
/// for each gold tree, the yield is flattened to a character string,
/// re-segmented, and the length of every resulting word is counted.
/// </summary>
/// <param name="tb">Treebank of gold trees whose yields are re-segmented.</param>
/// <returns>A normalized Distribution over segmented word lengths.</returns>
private Distribution<int> GetSegmentedWordLengthDistribution(Treebank tb)
{
    // CharacterLevelTagExtender ext = new CharacterLevelTagExtender();
    ClassicCounter<int> c = new ClassicCounter<int>();
    foreach (Tree gold in tb)
    {
        StringBuilder goldChars = new StringBuilder();
        ArrayList goldYield = gold.Yield();
        foreach (object aGoldYield in goldYield)
        {
            Word word = (Word)aGoldYield;
            goldChars.Append(word);
        }
        IList<IHasWord> ourWords = Segment(goldChars.ToString());
        foreach (IHasWord ourWord in ourWords)
        {
            // Bug fix: the original called int.Parse(ourWord.Word().Length),
            // passing an int where int.Parse expects a string (does not
            // compile in C#). The length is already an int; count it directly.
            c.IncrementCount(ourWord.Word().Length);
        }
    }
    return(Distribution.GetDistribution(c));
}
/// <summary>
/// Binarizes every tree in the treebank and prepares the result for the
/// shift-reduce parser: categories are reduced to basic categories, labels
/// converted to CoreLabels, head annotations percolated, and leaves indexed
/// from 1.
/// </summary>
public static IList<Tree> BinarizeTreebank(Treebank treebank, Options op)
{
    TreeBinarizer binarizer = TreeBinarizer.SimpleTreeBinarizer(op.tlpParams.HeadFinder(), op.tlpParams.TreebankLanguagePack());
    BasicCategoryTreeTransformer basicTransformer = new BasicCategoryTreeTransformer(op.Langpack());
    // Binarize first, then strip to basic categories.
    CompositeTreeTransformer pipeline = new CompositeTreeTransformer();
    pipeline.AddTransformer(binarizer);
    pipeline.AddTransformer(basicTransformer);
    treebank = treebank.Transform(pipeline);
    IHeadFinder binaryHeadFinder = new BinaryHeadFinder(op.tlpParams.HeadFinder());
    IList<Tree> result = Generics.NewArrayList();
    foreach (Tree transformed in treebank)
    {
        Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(transformed);
        transformed.PercolateHeadAnnotations(binaryHeadFinder);
        // Index from 1. Tools downstream expect index from 1, so for
        // uses internal to the srparser we have to renormalize the
        // indices, with the result that here we have to index from 1
        transformed.IndexLeaves(1, true);
        result.Add(transformed);
    }
    return result;
}
/// <summary>
/// Annotates and binarizes up to three treebanks (train, optional secondary,
/// optional tune) with a shared transformer pipeline, optionally learning
/// selective split / post-split categories along the way.
/// </summary>
/// <returns>A Triple of binaryTrainTreebank, binarySecondaryTreebank, binaryTuneTreebank.</returns>
public static Triple<Treebank, Treebank, Treebank> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank, Treebank secondaryTreebank, Treebank tuneTreebank, Options op)
{
    // setup tree transforms
    ITreebankLangParserParams tlpParams = op.tlpParams;
    ITreebankLanguagePack tlp = tlpParams.TreebankLanguagePack();
    if (op.testOptions.verbose)
    {
        PrintWriter pwErr = tlpParams.Pw(System.Console.Error);
        pwErr.Print("Training ");
        pwErr.Println(trainTreebank.TextualSummary(tlp));
        if (secondaryTreebank != null)
        {
            pwErr.Print("Secondary training ");
            pwErr.Println(secondaryTreebank.TextualSummary(tlp));
        }
    }
    // The pipeline is assembled in order: optional pre-transformer, optional
    // Collins punctuation transform, annotator/binarizer, optional leaf-label
    // word function. Order matters.
    CompositeTreeTransformer trainTransformer = new CompositeTreeTransformer();
    if (op.trainOptions.preTransformer != null)
    {
        trainTransformer.AddTransformer(op.trainOptions.preTransformer);
    }
    if (op.trainOptions.collinsPunc)
    {
        CollinsPuncTransformer collinsPuncTransformer = new CollinsPuncTransformer(tlp);
        trainTransformer.AddTransformer(collinsPuncTransformer);
    }
    log.Info("Binarizing trees...");
    Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight)
    {
        binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
    }
    else
    {
        // Left-to-right binarization uses a LeftHeadFinder as the secondary head finder.
        binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
    }
    trainTransformer.AddTransformer(binarizer);
    if (op.wordFunction != null)
    {
        ITreeTransformer wordFunctionTransformer = new TreeLeafLabelTransformer(op.wordFunction);
        trainTransformer.AddTransformer(wordFunctionTransformer);
    }
    // Splitter statistics are learned over train + secondary combined.
    Treebank wholeTreebank;
    if (secondaryTreebank == null)
    {
        wholeTreebank = trainTreebank;
    }
    else
    {
        wholeTreebank = new CompositeTreebank(trainTreebank, secondaryTreebank);
    }
    if (op.trainOptions.selectiveSplit)
    {
        op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
        RemoveDeleteSplittersFromSplitters(tlp, op);
        if (op.testOptions.verbose)
        {
            IList<string> list = new List<string>(op.trainOptions.splitters);
            list.Sort();
            log.Info("Parent split categories: " + list);
        }
    }
    if (op.trainOptions.selectivePostSplit)
    {
        // Do all the transformations once just to learn selective splits on annotated categories
        ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
        wholeTreebank = wholeTreebank.Transform(myTransformer);
        op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
        if (op.testOptions.verbose)
        {
            log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
        }
    }
    if (op.trainOptions.hSelSplit)
    {
        // We run through all the trees once just to gather counts for hSelSplit!
        // Tree-transformation printing is temporarily suppressed for this pass.
        int ptt = op.trainOptions.printTreeTransformations;
        op.trainOptions.printTreeTransformations = 0;
        binarizer.SetDoSelectiveSplit(false);
        foreach (Tree tree in wholeTreebank)
        {
            trainTransformer.TransformTree(tree);
        }
        binarizer.SetDoSelectiveSplit(true);
        op.trainOptions.printTreeTransformations = ptt;
    }
    // we've done all the setup now. here's where the train treebank is transformed.
    trainTreebank = trainTreebank.Transform(trainTransformer);
    if (secondaryTreebank != null)
    {
        secondaryTreebank = secondaryTreebank.Transform(trainTransformer);
    }
    if (op.trainOptions.printAnnotatedStateCounts)
    {
        binarizer.PrintStateCounts();
    }
    if (op.trainOptions.printAnnotatedRuleCounts)
    {
        binarizer.PrintRuleCounts();
    }
    if (tuneTreebank != null)
    {
        tuneTreebank = tuneTreebank.Transform(trainTransformer);
    }
    if (op.testOptions.verbose)
    {
        binarizer.DumpStats();
    }
    return(new Triple<Treebank, Treebank, Treebank>(trainTreebank, secondaryTreebank, tuneTreebank));
}
/// <summary>Call this method to get the set of categories to split on.</summary>
/// <remarks>
/// Convenience overload of the full GetSplitCategories: defaults to splitting
/// tags as well (doTags = true), algorithm 0, and uses the same cutoff for
/// both phrasal categories and tags. Computes parent annotation statistics
/// for selective parent splitting in the PCFGParser inside FactoredParser. <p>
/// If tlp is non-null tlp.basicCategory() will be called on parent and
/// grandparent nodes. <p>
/// <i>Implementation note:</i> not safe for concurrent invocation — it relies
/// on static state variables.
/// </remarks>
public static ICollection<string> GetSplitCategories(Treebank t, double cutOff, ITreebankLanguagePack tlp)
{
    return GetSplitCategories(t, true, 0, cutOff, cutOff, tlp);
}
/// <summary>
/// This method lets you train and test a segmenter relative to a
/// Treebank.
/// </summary>
/// <remarks>
/// This method lets you train and test a segmenter relative to a
/// Treebank.
/// <p>
/// <i>Implementation note:</i> This method is largely cloned from
/// LexicalizedParser's main method. Should we try to have it be able
/// to train segmenters to stop things going out of sync?
/// </remarks>
public static void Main(string[] args)
{
    bool train = false;
    bool saveToSerializedFile = false;
    bool saveToTextFile = false;
    string serializedInputFileOrUrl = null;
    string textInputFileOrUrl = null;
    string serializedOutputFileOrUrl = null;
    string textOutputFileOrUrl = null;
    string treebankPath = null;
    Treebank testTreebank = null;
    // Treebank tuneTreebank = null;
    string testPath = null;
    IFileFilter testFilter = null;
    IFileFilter trainFilter = null;
    string encoding = null;
    // variables needed to process the files to be parsed
    // NOTE(review): tokenizerFactory, tokenized and escaper are declared but
    // never read in this body — presumably leftovers from the cloned
    // LexicalizedParser main; confirm before removing.
    ITokenizerFactory<Word> tokenizerFactory = null;
    // DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor();
    bool tokenized = false;
    // whether or not the input file has already been tokenized
    IFunction<IList<IHasWord>, IList<IHasWord>> escaper = new ChineseEscaper();
    // int tagDelimiter = -1;
    // String sentenceDelimiter = "\n";
    // boolean fromXML = false;
    int argIndex = 0;
    if (args.Length < 1)
    {
        log.Info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
        return;
    }
    Options op = new Options();
    op.tlpParams = new ChineseTreebankParserParams();
    // while loop through option arguments
    while (argIndex < args.Length && args[argIndex][0] == '-')
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train"))
        {
            train = true;
            saveToSerializedFile = true;
            int numSubArgs = NumSubArgs(args, argIndex);
            argIndex++;
            // NOTE(review): requires more than one sub-arg, so "-train path"
            // alone throws — confirm that a lone path was never intended.
            if (numSubArgs > 1)
            {
                treebankPath = args[argIndex];
                argIndex++;
            }
            else
            {
                throw new Exception("Error: -train option must have treebankPath as first argument.");
            }
            if (numSubArgs == 2)
            {
                trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
            }
            else
            {
                if (numSubArgs >= 3)
                {
                    try
                    {
                        // Prefer a numeric low/high range when both parse as ints.
                        int low = System.Convert.ToInt32(args[argIndex]);
                        int high = System.Convert.ToInt32(args[argIndex + 1]);
                        trainFilter = new NumberRangeFileFilter(low, high, true);
                        argIndex += 2;
                    }
                    catch (NumberFormatException)
                    {
                        // maybe it's a ranges expression?
                        trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                        argIndex++;
                    }
                }
            }
        }
        else
        {
            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-encoding"))
            {
                // sets encoding for TreebankLangParserParams
                encoding = args[argIndex + 1];
                op.tlpParams.SetInputEncoding(encoding);
                op.tlpParams.SetOutputEncoding(encoding);
                argIndex += 2;
            }
            else
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-loadFromSerializedFile"))
                {
                    // load the parser from a binary serialized file
                    // the next argument must be the path to the parser file
                    serializedInputFileOrUrl = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    // doesn't make sense to load from TextFile -pichuan
                    // } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
                    //   // load the parser from declarative text file
                    //   // the next argument must be the path to the parser file
                    //   textInputFileOrUrl = args[argIndex + 1];
                    //   argIndex += 2;
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToSerializedFile"))
                    {
                        saveToSerializedFile = true;
                        serializedOutputFileOrUrl = args[argIndex + 1];
                        argIndex += 2;
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToTextFile"))
                        {
                            // save the parser to declarative text file
                            saveToTextFile = true;
                            textOutputFileOrUrl = args[argIndex + 1];
                            argIndex += 2;
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                            {
                                // the next argument is the treebank path and range for testing
                                int numSubArgs = NumSubArgs(args, argIndex);
                                argIndex++;
                                if (numSubArgs == 1)
                                {
                                    // A single sub-arg is a ranges expression with no path.
                                    testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                }
                                else
                                {
                                    if (numSubArgs > 1)
                                    {
                                        testPath = args[argIndex++];
                                        if (numSubArgs == 2)
                                        {
                                            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                        }
                                        else
                                        {
                                            if (numSubArgs >= 3)
                                            {
                                                try
                                                {
                                                    int low = System.Convert.ToInt32(args[argIndex]);
                                                    int high = System.Convert.ToInt32(args[argIndex + 1]);
                                                    testFilter = new NumberRangeFileFilter(low, high, true);
                                                    argIndex += 2;
                                                }
                                                catch (NumberFormatException)
                                                {
                                                    // maybe it's a ranges expression?
                                                    testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                            else
                            {
                                // Unknown flag: let the language pack try it; if it
                                // doesn't consume anything, skip one arg with a warning.
                                int j = op.tlpParams.SetOptionFlag(args, argIndex);
                                if (j == argIndex)
                                {
                                    log.Info("Unknown option ignored: " + args[argIndex]);
                                    j++;
                                }
                                argIndex = j;
                            }
                        }
                    }
                }
            }
        }
    }
    // end while loop through arguments
    ITreebankLangParserParams tlpParams = op.tlpParams;
    // all other arguments are order dependent and
    // are processed in order below
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = null;
    if (!train && op.testOptions.verbose)
    {
        System.Console.Out.WriteLine("Currently " + new DateTime());
        PrintArgs(args, System.Console.Out);
    }
    if (train)
    {
        PrintArgs(args, System.Console.Out);
        // so we train a parser using the treebank
        if (treebankPath == null)
        {
            // the next arg must be the treebank path, since it wasn't give earlier
            treebankPath = args[argIndex];
            argIndex++;
            if (args.Length > argIndex + 1)
            {
                try
                {
                    // the next two args might be the range
                    int low = System.Convert.ToInt32(args[argIndex]);
                    int high = System.Convert.ToInt32(args[argIndex + 1]);
                    trainFilter = new NumberRangeFileFilter(low, high, true);
                    argIndex += 2;
                }
                catch (NumberFormatException)
                {
                    // maybe it's a ranges expression?
                    trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                    argIndex++;
                }
            }
        }
        Treebank trainTreebank = MakeTreebank(treebankPath, op, trainFilter);
        IIndex<string> wordIndex = new HashIndex<string>();
        IIndex<string> tagIndex = new HashIndex<string>();
        cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
    }
    else
    {
        if (textInputFileOrUrl != null)
        {
        }
        else
        {
            // so we load the segmenter from a text grammar file
            // XXXXX fix later -pichuan
            //cs = new LexicalizedParser(textInputFileOrUrl, true, op);
            // so we load a serialized segmenter
            if (serializedInputFileOrUrl == null)
            {
                // the next argument must be the path to the serialized parser
                serializedInputFileOrUrl = args[argIndex];
                argIndex++;
            }
            try
            {
                cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
            }
            catch (ArgumentException)
            {
                log.Info("Error loading segmenter, exiting...");
                System.Environment.Exit(0);
            }
        }
    }
    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    TreePrint treePrint = op.testOptions.TreePrint(tlpParams);
    if (testFilter != null)
    {
        if (testPath == null)
        {
            if (treebankPath == null)
            {
                throw new Exception("No test treebank path specified...");
            }
            else
            {
                log.Info("No test treebank path specified. Using train path: \"" + treebankPath + "\"");
                testPath = treebankPath;
            }
        }
        testTreebank = tlpParams.TestMemoryTreebank();
        testTreebank.LoadPath(testPath, testFilter);
    }
    op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(tlpParams.SisterSplitters()));
    // at this point we should be sure that op.tlpParams is
    // set appropriately (from command line, or from grammar file),
    // and will never change again. We also set the tlpParams of the
    // LexicalizedParser instance to be the same object. This is
    // redundancy that we probably should take out eventually.
    //
    // -- Roger
    if (op.testOptions.verbose)
    {
        log.Info("Lexicon is " + cs.GetType().FullName);
    }
    PrintWriter pwOut = tlpParams.Pw();
    PrintWriter pwErr = tlpParams.Pw(System.Console.Error);
    // Now what do we do with the parser we've made
    if (saveToTextFile)
    {
        // save the parser to textGrammar format
        if (textOutputFileOrUrl != null)
        {
            SaveSegmenterDataToText(cs, textOutputFileOrUrl);
        }
        else
        {
            log.Info("Usage: must specify a text segmenter data output path");
        }
    }
    if (saveToSerializedFile)
    {
        if (serializedOutputFileOrUrl == null && argIndex < args.Length)
        {
            // the next argument must be the path to serialize to
            serializedOutputFileOrUrl = args[argIndex];
            argIndex++;
        }
        if (serializedOutputFileOrUrl != null)
        {
            SaveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
        }
        else
        {
            if (textOutputFileOrUrl == null && testTreebank == null)
            {
                // no saving/parsing request has been specified
                log.Info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + "-train trainFilesPath [start stop] serializedParserFilename");
            }
        }
    }
    /* --------------------- Testing part!!!! ----------------------- */
    if (op.testOptions.verbose)
    {
    }
    // printOptions(false, op);
    if (testTreebank != null || (argIndex < args.Length && Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank")))
    {
        // test parser on treebank
        // NOTE(review): this only loads the test treebank; no evaluation
        // follows in this body — the method ends here.
        if (testTreebank == null)
        {
            // the next argument is the treebank path and range for testing
            testTreebank = tlpParams.TestMemoryTreebank();
            if (args.Length < argIndex + 4)
            {
                testTreebank.LoadPath(args[argIndex + 1]);
            }
            else
            {
                int testlow = System.Convert.ToInt32(args[argIndex + 2]);
                int testhigh = System.Convert.ToInt32(args[argIndex + 3]);
                testTreebank.LoadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
            }
        }
    }
}
/// <summary>
/// Annotates each gold tree of the test treebank and, for every internal
/// local tree with at least two children, prints the corresponding rule and
/// its score under the given parser's grammar.
/// </summary>
/// <param name="pd">The parser whose grammar is used for scoring.</param>
/// <param name="tlpParams">Language pack parameters used to build the annotator.</param>
/// <param name="testTreebank">Gold trees to score.</param>
/// <param name="treebankRoot">Path used to (re)compute the English split categories.</param>
/// <param name="stateIndex">State index for looking up rule states.</param>
private void TestOnTreebank(LexicalizedParser pd, ITreebankLangParserParams tlpParams, Treebank testTreebank, string treebankRoot, IIndex<string> stateIndex)
{
    Timing.StartTime();
    ITreeTransformer annotator = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
    // CDM: Aug 2004: With new implementation of treebank split categories,
    // I've hardwired this to load English ones.  Otherwise need training data.
    // op.trainOptions.splitters = new HashSet(Arrays.asList(op.tlpParams.splitters()));
    op.trainOptions.splitters = ParentAnnotationStats.GetEnglishSplitCategories(treebankRoot);
    op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(op.tlpParams.SisterSplitters()));
    foreach (Tree goldTree in testTreebank)
    {
        // Bug fix: the original assigned back to the foreach iteration
        // variable (goldTree = annotator.TransformTree(goldTree)), which is a
        // compile error in C# (CS1656); use a separate local instead.
        Tree annotatedTree = annotator.TransformTree(goldTree);
        // System.out.println();
        // System.out.println("Checking tree: " + goldTree);
        foreach (Tree localTree in annotatedTree)
        {
            // now try to use the grammar to score this local tree
            if (localTree.IsLeaf() || localTree.IsPreTerminal() || localTree.Children().Length < 2)
            {
                continue;
            }
            System.Console.Out.WriteLine(LocalTreeToRule(localTree));
            double score = ComputeLocalTreeScore(localTree, stateIndex, pd);
            if (score == double.NegativeInfinity)
            {
            }
            // System.out.println(localTreeToRule(localTree));
            System.Console.Out.WriteLine("score: " + score);
        }
    }
}
/// <summary>
/// Trains a ChineseLexiconAndWordSegmenter from a treebank: optionally learns
/// selective split / post-split categories, annotates and binarizes the
/// training trees (with an extra counting pass when hSelSplit is on), and
/// trains the lexicon on the binarized trees.
/// </summary>
private static Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter GetSegmenterDataFromTreebank(Treebank trainTreebank, Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
{
    System.Console.Out.WriteLine("Currently " + new DateTime());
    // printOptions(true, op);
    Timing.StartTime();
    // setup tree transforms
    ITreebankLangParserParams tlpParams = op.tlpParams;
    if (op.testOptions.verbose)
    {
        System.Console.Out.Write("Training ");
        System.Console.Out.WriteLine(trainTreebank.TextualSummary());
    }
    System.Console.Out.Write("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer;
    // initialized below
    if (!op.trainOptions.leftToRight)
    {
        binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
    }
    else
    {
        binarizer = new TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
    }
    CollinsPuncTransformer collinsPuncTransformer = null;
    if (op.trainOptions.collinsPunc)
    {
        collinsPuncTransformer = new CollinsPuncTransformer(tlpParams.TreebankLanguagePack());
    }
    IList<Tree> binaryTrainTrees = new List<Tree>();
    // List<Tree> binaryTuneTrees = new ArrayList<Tree>();
    if (op.trainOptions.selectiveSplit)
    {
        op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, true, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlpParams.TreebankLanguagePack());
        if (op.testOptions.verbose)
        {
            log.Info("Parent split categories: " + op.trainOptions.splitters);
        }
    }
    if (op.trainOptions.selectivePostSplit)
    {
        ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
        Treebank annotatedTB = trainTreebank.Transform(myTransformer);
        op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlpParams.TreebankLanguagePack());
        if (op.testOptions.verbose)
        {
            log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
        }
    }
    if (op.trainOptions.hSelSplit)
    {
        // First pass only gathers counts inside the binarizer; the transformed
        // trees themselves are discarded.
        binarizer.SetDoSelectiveSplit(false);
        foreach (Tree tree in trainTreebank)
        {
            // Bug fix: the original reassigned the foreach iteration variable,
            // which is a compile error in C# (CS1656); use a separate local.
            Tree transformed = tree;
            if (op.trainOptions.collinsPunc)
            {
                transformed = collinsPuncTransformer.TransformTree(transformed);
            }
            transformed = binarizer.TransformTree(transformed);
        }
        binarizer.SetDoSelectiveSplit(true);
    }
    // Second pass actually collects the binarized training trees.
    foreach (Tree tree_1 in trainTreebank)
    {
        // Bug fix: same CS1656 issue as above — transform via a local.
        Tree transformed_1 = tree_1;
        if (op.trainOptions.collinsPunc)
        {
            transformed_1 = collinsPuncTransformer.TransformTree(transformed_1);
        }
        transformed_1 = binarizer.TransformTree(transformed_1);
        binaryTrainTrees.Add(transformed_1);
    }
    Timing.Tick("done.");
    if (op.testOptions.verbose)
    {
        binarizer.DumpStats();
    }
    System.Console.Out.Write("Extracting Lexicon...");
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter clex = (Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter)op.tlpParams.Lex(op, wordIndex, tagIndex);
    clex.InitializeTraining(binaryTrainTrees.Count);
    clex.Train(binaryTrainTrees);
    clex.FinishTraining();
    Timing.Tick("done.");
    return(clex);
}
private ChineseLexiconAndWordSegmenter(Treebank trainTreebank, Options op, IIndex <string> wordIndex, IIndex <string> tagIndex) { Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = GetSegmenterDataFromTreebank(trainTreebank, op, wordIndex, tagIndex); chineseLexicon = cs.chineseLexicon; wordSegmenter = cs.wordSegmenter; }
/// <summary>
/// Command-line entry point for training/evaluating a DVParser.
/// An example command line for training a new parser:
/// <br />
/// nohup java -mx6g edu.stanford.nlp.parser.dvparser.DVParser -cachedTrees /scr/nlp/data/dvparser/wsj/cached.wsj.train.simple.ser.gz -train -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 -debugOutputFrequency 400 -nofilter -trainingThreads 5 -parser /u/nlp/data/lexparser/wsjPCFG.nocompact.simple.ser.gz -trainingIterations 40 -batchSize 25 -model /scr/nlp/data/dvparser/wsj/wsj.combine.v2.ser.gz -unkWord "*UNK*" -dvCombineCategories &gt; /scr/nlp/data/dvparser/wsj/wsj.combine.v2.out 2&gt;&amp;1 &amp;
/// </summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
public static void Main(string[] args) {
    if (args.Length == 0) {
        Help();
        System.Environment.Exit(2);
    }
    log.Info("Running DVParser with arguments:");
    foreach (string arg in args) {
        log.Info(" " + arg);
    }
    log.Info();
    // Flags gathered from the command line; unrecognized flags are passed through
    // to LexicalizedParser.LoadModel via unusedArgs.
    string parserPath = null;
    string trainTreebankPath = null;
    IFileFilter trainTreebankFilter = null;
    string cachedTrainTreesPath = null;
    bool runGradientCheck = false;
    bool runTraining = false;
    string testTreebankPath = null;
    IFileFilter testTreebankFilter = null;
    string initialModelPath = null;
    string modelPath = null;
    bool filter = true;
    string resultsRecordPath = null;
    IList<string> unusedArgs = new List<string>();
    // These parameters can be null or 0 if the model was not
    // serialized with the new parameters. Setting the options at the
    // command line will override these defaults.
    // TODO: if/when we integrate back into the main branch and
    // rebuild models, we can get rid of this
    // Defaults are prepended so that explicit command-line values (appended after)
    // win when the parser scans left to right.
    IList<string> argsWithDefaults = new List<string>(Arrays.AsList(new string[] { "-wordVectorFile", Options.LexOptions.DefaultWordVectorFile, "-dvKBest", int.ToString(TrainOptions.DefaultKBest), "-batchSize", int.ToString(TrainOptions.DefaultBatchSize), "-trainingIterations", int.ToString(TrainOptions.DefaultTrainingIterations), "-qnIterationsPerBatch", int.ToString(TrainOptions.DefaultQnIterationsPerBatch), "-regCost", double.ToString(TrainOptions.DefaultRegcost), "-learningRate", double.ToString(TrainOptions.DefaultLearningRate), "-deltaMargin", double.ToString(TrainOptions.DefaultDeltaMargin), "-unknownNumberVector", "-unknownDashedWordVectors", "-unknownCapsVector", "-unknownchinesepercentvector", "-unknownchinesenumbervector", "-unknownchineseyearvector", "-unkWord", "*UNK*", "-transformMatrixType", "DIAGONAL", "-scalingForInit", double.ToString(TrainOptions.DefaultScalingForInit), "-trainWordVectors" }));
    Sharpen.Collections.AddAll(argsWithDefaults, Arrays.AsList(args));
    args = Sharpen.Collections.ToArray(argsWithDefaults, new string[argsWithDefaults.Count]);
    // Scan the merged argument list; each branch consumes its own sub-arguments.
    for (int argIndex = 0; argIndex < args.Length;) {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parser")) {
            parserPath = args[argIndex + 1];
            argIndex += 2;
        } else {
            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank")) {
                Pair<string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                testTreebankPath = treebankDescription.First();
                testTreebankFilter = treebankDescription.Second();
            } else {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank")) {
                    Pair<string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-treebank");
                    argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                    trainTreebankPath = treebankDescription.First();
                    trainTreebankFilter = treebankDescription.Second();
                } else {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-cachedTrees")) {
                        cachedTrainTreesPath = args[argIndex + 1];
                        argIndex += 2;
                    } else {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-runGradientCheck")) {
                            runGradientCheck = true;
                            argIndex++;
                        } else {
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train")) {
                                runTraining = true;
                                argIndex++;
                            } else {
                                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model")) {
                                    modelPath = args[argIndex + 1];
                                    argIndex += 2;
                                } else {
                                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-nofilter")) {
                                        filter = false;
                                        argIndex++;
                                    } else {
                                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-continueTraining")) {
                                            // Resume: implies training and disables rule filtering.
                                            runTraining = true;
                                            filter = false;
                                            initialModelPath = args[argIndex + 1];
                                            argIndex += 2;
                                        } else {
                                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-resultsRecord")) {
                                                resultsRecordPath = args[argIndex + 1];
                                                argIndex += 2;
                                            } else {
                                                unusedArgs.Add(args[argIndex++]);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    if (parserPath == null && modelPath == null) {
        throw new ArgumentException("Must supply either a base parser model with -parser or a serialized DVParser with -model");
    }
    if (!runTraining && modelPath == null && !runGradientCheck) {
        throw new ArgumentException("Need to either train a new model, run the gradient check or specify a model to load with -model");
    }
    string[] newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
    // Load the base LexicalizedParser and wrap it in a DVParser; which path we load
    // from depends on whether we're resuming, training fresh, or just evaluating.
    Edu.Stanford.Nlp.Parser.Dvparser.DVParser dvparser = null;
    LexicalizedParser lexparser = null;
    if (initialModelPath != null) {
        lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(initialModelPath, newArgs));
        DVModel model = GetModelFromLexicalizedParser(lexparser);
        dvparser = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(model, lexparser);
    } else {
        if (runTraining || runGradientCheck) {
            lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(parserPath, newArgs));
            dvparser = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(lexparser);
        } else {
            if (modelPath != null) {
                lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
                DVModel model = GetModelFromLexicalizedParser(lexparser);
                dvparser = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(model, lexparser);
            }
        }
    }
    // Collect training sentences and their cached k-best parse hypotheses
    // (compressed), either from serialized caches or by parsing a treebank now.
    IList<Tree> trainSentences = new List<Tree>();
    IdentityHashMap<Tree, byte[]> trainCompressedParses = Generics.NewIdentityHashMap();
    if (cachedTrainTreesPath != null) {
        foreach (string path in cachedTrainTreesPath.Split(",")) {
            IList<Pair<Tree, byte[]>> cache = IOUtils.ReadObjectFromFile(path);
            foreach (Pair<Tree, byte[]> pair in cache) {
                trainSentences.Add(pair.First());
                trainCompressedParses[pair.First()] = pair.Second();
            }
            log.Info("Read in " + cache.Count + " trees from " + path);
        }
    }
    if (trainTreebankPath != null) {
        // TODO: make the transformer a member of the model?
        ITreeTransformer transformer = BuildTrainTransformer(dvparser.GetOp());
        Treebank treebank = dvparser.GetOp().tlpParams.MemoryTreebank();
        treebank.LoadPath(trainTreebankPath, trainTreebankFilter);
        treebank = treebank.Transform(transformer);
        log.Info("Read in " + treebank.Count + " trees from " + trainTreebankPath);
        CacheParseHypotheses cacher = new CacheParseHypotheses(dvparser.parser);
        CacheParseHypotheses.CacheProcessor processor = new CacheParseHypotheses.CacheProcessor(cacher, lexparser, dvparser.op.trainOptions.dvKBest, transformer);
        foreach (Tree tree in treebank) {
            trainSentences.Add(tree);
            trainCompressedParses[tree] = processor.Process(tree).second;
        }
        //System.out.println(tree);
        log.Info("Finished parsing " + treebank.Count + " trees, getting " + dvparser.op.trainOptions.dvKBest + " hypotheses each");
    }
    if ((runTraining || runGradientCheck) && filter) {
        // Restrict the model's rules to those that appear in the training set.
        log.Info("Filtering rules for the given training set");
        dvparser.dvModel.SetRulesForTrainingSet(trainSentences, trainCompressedParses);
        log.Info("Done filtering rules; " + dvparser.dvModel.numBinaryMatrices + " binary matrices, " + dvparser.dvModel.numUnaryMatrices + " unary matrices, " + dvparser.dvModel.wordVectors.Count + " word vectors");
    }
    //dvparser.dvModel.printAllMatrices();
    Treebank testTreebank = null;
    if (testTreebankPath != null) {
        log.Info("Reading in trees from " + testTreebankPath);
        if (testTreebankFilter != null) {
            log.Info("Filtering on " + testTreebankFilter);
        }
        testTreebank = dvparser.GetOp().tlpParams.MemoryTreebank();
        testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
        log.Info("Read in " + testTreebank.Count + " trees for testing");
    }
    // runGradientCheck= true;
    if (runGradientCheck) {
        log.Info("Running gradient check on " + trainSentences.Count + " trees");
        dvparser.RunGradientCheck(trainSentences, trainCompressedParses);
    }
    if (runTraining) {
        log.Info("Training the RNN parser");
        log.Info("Current train options: " + dvparser.GetOp().trainOptions);
        dvparser.Train(trainSentences, trainCompressedParses, testTreebank, modelPath, resultsRecordPath);
        if (modelPath != null) {
            dvparser.SaveModel(modelPath);
        }
    }
    if (testTreebankPath != null) {
        EvaluateTreebank evaluator = new EvaluateTreebank(dvparser.AttachModelToLexicalizedParser());
        evaluator.TestOnTreebank(testTreebank);
    }
    log.Info("Successfully ran DVParser");
}
/// <summary>
/// Trains the DV model over the given sentences/cached parses for the configured
/// number of iterations, periodically evaluating on <paramref name="testTreebank"/>
/// and checkpointing the model.
/// </summary>
/// <param name="sentences">gold training trees</param>
/// <param name="compressedParses">cached, compressed k-best hypotheses per tree</param>
/// <param name="testTreebank">optional dev set used for periodic evaluation (may be null)</param>
/// <param name="modelPath">optional path for checkpoint/final model saves (may be null)</param>
/// <param name="resultsRecordPath">optional file to append CHECKPOINT status lines to (may be null)</param>
/// <exception cref="System.IO.IOException"/>
public virtual void Train(IList<Tree> sentences, IdentityHashMap<Tree, byte[]> compressedParses, Treebank testTreebank, string modelPath, string resultsRecordPath) {
    // process:
    // we come up with a cost and a derivative for the model
    // we always use the gold tree as the example to train towards
    // every time through, we will look at the top N trees from
    // the LexicalizedParser and pick the best one according to
    // our model (at the start, this is essentially random)
    // we use QN to minimize the cost function for the model
    // to do this minimization, we turn all of the matrices in the
    // DVModel into one big Theta, which is the set of variables to
    // be optimized by the QN.
    Timing timing = new Timing();
    long maxTrainTimeMillis = op.trainOptions.maxTrainTimeSeconds * 1000;
    int batchCount = 0;
    int debugCycle = 0;
    double bestLabelF1 = 0.0;
    if (op.trainOptions.useContextWords) {
        // Context-word features need CoreLabels and span annotations on every tree.
        foreach (Tree tree in sentences) {
            Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(tree);
            tree.SetSpans();
        }
    }
    // for AdaGrad
    double[] sumGradSquare = new double[dvModel.TotalParamSize()];
    Arrays.Fill(sumGradSquare, 1.0);
    int numBatches = sentences.Count / op.trainOptions.batchSize + 1;
    log.Info("Training on " + sentences.Count + " trees in " + numBatches + " batches");
    log.Info("Times through each training batch: " + op.trainOptions.trainingIterations);
    log.Info("QN iterations per batch: " + op.trainOptions.qnIterationsPerBatch);
    for (int iter = 0; iter < op.trainOptions.trainingIterations; ++iter) {
        // Re-shuffle each epoch using the model's RNG for reproducibility.
        IList<Tree> shuffledSentences = new List<Tree>(sentences);
        Java.Util.Collections.Shuffle(shuffledSentences, dvModel.rand);
        for (int batch = 0; batch < numBatches; ++batch) {
            ++batchCount;
            // This did not help performance
            //log.info("Setting AdaGrad's sum of squares to 1...");
            //Arrays.fill(sumGradSquare, 1.0);
            log.Info("======================================");
            log.Info("Iteration " + iter + " batch " + batch);
            // Each batch will be of the specified batch size, except the
            // last batch will include any leftover trees at the end of
            // the list
            int startTree = batch * op.trainOptions.batchSize;
            int endTree = (batch + 1) * op.trainOptions.batchSize;
            if (endTree > shuffledSentences.Count) {
                endTree = shuffledSentences.Count;
            }
            ExecuteOneTrainingBatch(shuffledSentences.SubList(startTree, endTree), compressedParses, sumGradSquare);
            long totalElapsed = timing.Report();
            log.Info("Finished iteration " + iter + " batch " + batch + "; total training time " + totalElapsed + " ms");
            if (maxTrainTimeMillis > 0 && totalElapsed > maxTrainTimeMillis) {
                // no need to debug output, we're done now
                break;
            }
            if (op.trainOptions.debugOutputFrequency > 0 && batchCount % op.trainOptions.debugOutputFrequency == 0) {
                log.Info("Finished " + batchCount + " total batches, running evaluation cycle");
                // Time for debugging output!
                double tagF1 = 0.0;
                double labelF1 = 0.0;
                if (testTreebank != null) {
                    EvaluateTreebank evaluator = new EvaluateTreebank(AttachModelToLexicalizedParser());
                    evaluator.TestOnTreebank(testTreebank);
                    labelF1 = evaluator.GetLBScore();
                    tagF1 = evaluator.GetTagScore();
                    if (labelF1 > bestLabelF1) {
                        bestLabelF1 = labelF1;
                    }
                    log.Info("Best label f1 on dev set so far: " + Nf.Format(bestLabelF1));
                }
                string tempName = null;
                if (modelPath != null) {
                    tempName = modelPath;
                    // Embed the debug-cycle number and current F1 into checkpoint names.
                    if (modelPath.EndsWith(".ser.gz")) {
                        tempName = Sharpen.Runtime.Substring(modelPath, 0, modelPath.Length - 7) + "-" + Filename.Format(debugCycle) + "-" + Nf.Format(labelF1) + ".ser.gz";
                    }
                    SaveModel(tempName);
                }
                string statusLine = ("CHECKPOINT:" + " iteration " + iter + " batch " + batch + " labelF1 " + Nf.Format(labelF1) + " tagF1 " + Nf.Format(tagF1) + " bestLabelF1 " + Nf.Format(bestLabelF1) + " model " + tempName + op.trainOptions + " word vectors: " + op.lexOptions.wordVectorFile + " numHid: " + op.lexOptions.numHid);
                log.Info(statusLine);
                if (resultsRecordPath != null) {
                    FileWriter fout = new FileWriter(resultsRecordPath, true);
                    // append
                    fout.Write(statusLine);
                    fout.Write("\n");
                    fout.Close();
                }
                ++debugCycle;
            }
        }
        long totalElapsed_1 = timing.Report();
        if (maxTrainTimeMillis > 0 && totalElapsed_1 > maxTrainTimeMillis) {
            // no need to debug output, we're done now
            log.Info("Max training time exceeded, exiting");
            break;
        }
    }
}
/// <summary>
/// Command-line driver: trains a FactoredLexicon on a training treebank and reports
/// tagging accuracy (and an error breakdown by gold tag) on a dev treebank.
/// Usage: language features train_file dev_file.
/// </summary>
/// <param name="args"/>
public static void Main(string[] args) {
    if (args.Length != 4) {
        System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName);
        System.Environment.Exit(-1);
    }
    // Command line options
    Language language = Language.ValueOf(args[0]);
    ITreebankLangParserParams tlpp = language.@params;
    Treebank trainTreebank = tlpp.DiskTreebank();
    trainTreebank.LoadPath(args[2]);
    Treebank devTreebank = tlpp.DiskTreebank();
    devTreebank.LoadPath(args[3]);
    MorphoFeatureSpecification morphoSpec;
    Options options = GetOptions(language);
    // Only Arabic and French factored setups are supported here.
    if (language.Equals(Language.Arabic)) {
        morphoSpec = new ArabicMorphoFeatureSpecification();
        string[] languageOptions = new string[] { "-arabicFactored" };
        tlpp.SetOptionFlag(languageOptions, 0);
    } else {
        if (language.Equals(Language.French)) {
            morphoSpec = new FrenchMorphoFeatureSpecification();
            string[] languageOptions = new string[] { "-frenchFactored" };
            tlpp.SetOptionFlag(languageOptions, 0);
        } else {
            throw new NotSupportedException();
        }
    }
    // Activate the comma-separated morphological features given on the command line.
    string featureList = args[1];
    string[] features = featureList.Trim().Split(",");
    foreach (string feature in features) {
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
    }
    System.Console.Out.WriteLine("Language: " + language.ToString());
    System.Console.Out.WriteLine("Features: " + args[1]);
    // Create word and tag indices
    // Save trees in a collection since the interface requires that....
    System.Console.Out.Write("Loading training trees...");
    IList<Tree> trainTrees = new List<Tree>(19000);
    IIndex<string> wordIndex = new HashIndex<string>();
    IIndex<string> tagIndex = new HashIndex<string>();
    foreach (Tree tree in trainTreebank) {
        // Apply language-specific transforms to every non-leaf node in place.
        foreach (Tree subTree in tree) {
            if (!subTree.IsLeaf()) {
                tlpp.TransformTree(subTree, tree);
            }
        }
        trainTrees.Add(tree);
    }
    System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count);
    // Setup and train the lexicon.
    System.Console.Out.Write("Collecting sufficient statistics for lexicon...");
    Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
    lexicon.InitializeTraining(trainTrees.Count);
    lexicon.Train(trainTrees, null);
    lexicon.FinishTraining();
    System.Console.Out.WriteLine("Done!");
    trainTrees = null;
    // Load the tuning set
    System.Console.Out.Write("Loading tuning set...");
    IList<FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp);
    System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count);
    // Print the probabilities that we obtain
    // TODO(spenceg): Implement tagging accuracy with FactLex
    int nCorrect = 0;
    ICounter<string> errors = new ClassicCounter<string>();
    foreach (FactoredLexiconEvent @event in tuningSet) {
        // Score every tagging the lexicon proposes for this word/context.
        IEnumerator<IntTaggedWord> itr = lexicon.RuleIteratorByWord(@event.Word(), @event.GetLoc(), @event.FeatureStr());
        ICounter<int> logScores = new ClassicCounter<int>();
        bool noRules = true;
        int goldTagId = -1;
        while (itr.MoveNext()) {
            noRules = false;
            IntTaggedWord iTW = itr.Current;
            if (iTW.Tag() == @event.TagId()) {
                log.Info("GOLD-");
                goldTagId = iTW.Tag();
            }
            float tagScore = lexicon.Score(iTW, @event.GetLoc(), @event.Word(), @event.FeatureStr());
            logScores.IncrementCount(iTW.Tag(), tagScore);
        }
        if (noRules) {
            System.Console.Error.Printf("NO TAGGINGS: %s %s%n", @event.Word(), @event.FeatureStr());
        } else {
            // Score the tagging
            int hypTagId = Counters.Argmax(logScores);
            if (hypTagId == goldTagId) {
                ++nCorrect;
            } else {
                string goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.Get(goldTagId);
                errors.IncrementCount(goldTag);
            }
        }
        log.Info();
    }
    // Output accuracy
    double acc = (double)nCorrect / (double)tuningSet.Count;
    System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
    log.Info("% of errors by type:");
    // Sort gold tags by error count (descending) before printing normalized shares.
    IList<string> biggestKeys = new List<string>(errors.KeySet());
    biggestKeys.Sort(Counters.ToComparator(errors, false, true));
    Counters.Normalize(errors);
    foreach (string key in biggestKeys) {
        System.Console.Error.Printf("%s\t%.2f%n", key, errors.GetCount(key) * 100.0);
    }
}
/// <summary>
/// Evaluates a LexicalizedParser on a test treebank at each base-parser weight in
/// the (file-level) <c>weights</c> array, then prints labeled and tag scores per weight.
/// </summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
public static void Main(string[] args) {
    // NOTE(review): dvmodelFile is parsed nowhere and never used — dead local,
    // presumably left over from an earlier version of this tool.
    string dvmodelFile = null;
    string lexparserFile = null;
    string testTreebankPath = null;
    IFileFilter testTreebankFilter = null;
    IList<string> unusedArgs = new List<string>();
    for (int argIndex = 0; argIndex < args.Length;) {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-lexparser")) {
            lexparserFile = args[argIndex + 1];
            argIndex += 2;
        } else {
            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank")) {
                Pair<string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                testTreebankPath = treebankDescription.First();
                testTreebankFilter = treebankDescription.Second();
            } else {
                // Unrecognized flags are forwarded to LexicalizedParser.LoadModel.
                unusedArgs.Add(args[argIndex++]);
            }
        }
    }
    log.Info("Loading lexparser from: " + lexparserFile);
    string[] newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
    LexicalizedParser lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(lexparserFile, newArgs));
    log.Info("... done");
    Treebank testTreebank = null;
    if (testTreebankPath != null) {
        log.Info("Reading in trees from " + testTreebankPath);
        if (testTreebankFilter != null) {
            log.Info("Filtering on " + testTreebankFilter);
        }
        testTreebank = lexparser.GetOp().tlpParams.MemoryTreebank();
        testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
        log.Info("Read in " + testTreebank.Count + " trees for testing");
    }
    // One evaluation pass per candidate base-parser weight.
    double[] labelResults = new double[weights.Length];
    double[] tagResults = new double[weights.Length];
    for (int i = 0; i < weights.Length; ++i) {
        lexparser.GetOp().baseParserWeight = weights[i];
        EvaluateTreebank evaluator = new EvaluateTreebank(lexparser);
        evaluator.TestOnTreebank(testTreebank);
        labelResults[i] = evaluator.GetLBScore();
        tagResults[i] = evaluator.GetTagScore();
    }
    // Summarize all weights at the end so results are easy to compare.
    for (int i_1 = 0; i_1 < weights.Length; ++i_1) {
        log.Info("LexicalizedParser weight " + weights[i_1] + ": labeled " + labelResults[i_1] + " tag " + tagResults[i_1]);
    }
}
/// <summary>
/// Annotates and binarizes the training treebank, returning the binarized trees whose
/// yield length (minus 1) does not exceed <c>trainLengthLimit</c>.
/// </summary>
/// <param name="trainTreebank">training trees (iterated up to twice; see hSelSplit pass)</param>
/// <returns>the binarized training trees, filtered by length</returns>
public virtual IList<Tree> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank) {
    ITreebankLangParserParams tlpParams = op.tlpParams;
    ITreebankLanguagePack tlp = tlpParams.TreebankLanguagePack();
    if (Verbose) {
        log.Info("\n\n" + trainTreebank.TextualSummary(tlp));
    }
    log.Info("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
    Timing.Tick("done.");
    if (op.trainOptions.selectiveSplit) {
        // Learn parent-annotation splits from raw counts, then prune the deletable ones.
        op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
        RemoveDeleteSplittersFromSplitters(tlp);
        if (op.testOptions.verbose) {
            IList<string> list = new List<string>(op.trainOptions.splitters);
            list.Sort();
            log.Info("Parent split categories: " + list);
        }
    }
    // if (op.trainOptions.selectivePostSplit) {
    //   // Do all the transformations once just to learn selective splits on annotated categories
    //   TreeTransformer myTransformer = new TreeAnnotator(tlpParams.headFinder(), tlpParams);
    //   Treebank annotatedTB = trainTreebank.transform(myTransformer);
    //   op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
    //   if (op.testOptions.verbose) {
    //     log.info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
    //   }
    // }
    if (op.trainOptions.hSelSplit) {
        // We run through all the trees once just to gather counts for hSelSplit!
        // Tree-transformation printing is temporarily suppressed during this pass.
        int ptt = op.trainOptions.printTreeTransformations;
        op.trainOptions.printTreeTransformations = 0;
        binarizer.SetDoSelectiveSplit(false);
        foreach (Tree tree in trainTreebank) {
            binarizer.TransformTree(tree);
        }
        binarizer.SetDoSelectiveSplit(true);
        op.trainOptions.printTreeTransformations = ptt;
    }
    //Tree transformation
    //
    IList<Tree> binaryTrainTrees = new List<Tree>();
    foreach (Tree tree_1 in trainTreebank) {
        // NOTE(review): reassigning the foreach variable is a Java-to-C# conversion
        // artifact (invalid in standard C#) — confirm against the Sharpen runtime.
        tree_1 = binarizer.TransformTree(tree_1);
        // Yield includes the terminal symbols; the -1 presumably accounts for a
        // boundary symbol — TODO confirm against trainLengthLimit's definition.
        if (tree_1.Yield().Count - 1 <= trainLengthLimit) {
            binaryTrainTrees.Add(tree_1);
        }
    }
    // WSGDEBUG: Lot's of stuff on the grammar
    // if(VERBOSE) {
    //   binarizer.printStateCounts();
    //   binarizer.printRuleCounts();
    //   binarizer.dumpStats();
    // }
    return(binaryTrainTrees);
}
/// <summary>Run the scoring metric on guess/gold input.</summary>
/// <remarks>
/// Run the scoring metric on guess/gold input. This method performs "Collinization."
/// The default language is English.
/// </remarks>
/// <param name="args"/>
public static void Main(string[] args) {
    if (args.Length < minArgs) {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
    }
    ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = int.MaxValue;
    bool Verbose = false;
    string encoding = "UTF-8";
    string guessFile = null;
    string goldFile = null;
    IDictionary<string, string[]> argsMap = StringUtils.ArgsToMap(args, optionArgDefs);
    foreach (KeyValuePair<string, string[]> opt in argsMap) {
        if (opt.Key == null) {
            continue;
        }
        if (opt.Key.Equals("-l")) {
            Language lang = Language.ValueOf(opt.Value[0].Trim());
            tlpp = lang.@params;
        } else {
            if (opt.Key.Equals("-y")) {
                maxGoldYield = System.Convert.ToInt32(opt.Value[0].Trim());
            } else {
                if (opt.Key.Equals("-v")) {
                    Verbose = true;
                } else {
                    if (opt.Key.Equals("-c")) {
                        Edu.Stanford.Nlp.Parser.Metrics.TaggingEval.doCatLevelEval = true;
                    } else {
                        if (opt.Key.Equals("-e")) {
                            encoding = opt.Value[0];
                        } else {
                            log.Info(usage.ToString());
                            System.Environment.Exit(-1);
                        }
                    }
                }
            }
        }
        //Non-option arguments located at key null
        // NOTE(review): this re-reads gold/guess file names on every loop iteration,
        // not once after the loop — redundant but harmless; preserved as-is.
        string[] rest = argsMap[null];
        if (rest == null || rest.Length < minArgs) {
            log.Info(usage.ToString());
            System.Environment.Exit(-1);
        }
        goldFile = rest[0];
        guessFile = rest[1];
    }
    tlpp.SetInputEncoding(encoding);
    PrintWriter pwOut = tlpp.Pw();
    Treebank guessTreebank = tlpp.DiskTreebank();
    guessTreebank.LoadPath(guessFile);
    pwOut.Println("GUESS TREEBANK:");
    pwOut.Println(guessTreebank.TextualSummary());
    Treebank goldTreebank = tlpp.DiskTreebank();
    goldTreebank.LoadPath(goldFile);
    pwOut.Println("GOLD TREEBANK:");
    pwOut.Println(goldTreebank.TextualSummary());
    Edu.Stanford.Nlp.Parser.Metrics.TaggingEval metric = new Edu.Stanford.Nlp.Parser.Metrics.TaggingEval("Tagging LP/LR");
    ITreeTransformer tc = tlpp.Collinizer();
    //The evalb ref implementation assigns status for each tree pair as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    IEnumerator<Tree> goldItr = goldTreebank.GetEnumerator();
    IEnumerator<Tree> guessItr = guessTreebank.GetEnumerator();
    int goldLineId = 0;
    int guessLineId = 0;
    int skippedGuessTrees = 0;
    // Walk both treebanks in lockstep, skipping pairs that cannot be compared.
    while (guessItr.MoveNext() && goldItr.MoveNext()) {
        Tree guessTree = guessItr.Current;
        IList<ILabel> guessYield = guessTree.Yield();
        guessLineId++;
        Tree goldTree = goldItr.Current;
        IList<ILabel> goldYield = goldTree.Yield();
        goldLineId++;
        // Check that we should evaluate this tree
        if (goldYield.Count > maxGoldYield) {
            skippedGuessTrees++;
            continue;
        }
        // Only trees with equal yields can be evaluated
        if (goldYield.Count != guessYield.Count) {
            pwOut.Printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.Count, guessYield.Count, goldLineId, guessLineId);
            skippedGuessTrees++;
            continue;
        }
        Tree evalGuess = tc.TransformTree(guessTree);
        Tree evalGold = tc.TransformTree(goldTree);
        metric.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
    }
    // If either iterator still has items, the inputs were different lengths.
    if (guessItr.MoveNext() || goldItr.MoveNext()) {
        System.Console.Error.Printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId);
    }
    pwOut.Println("================================================================================");
    if (skippedGuessTrees != 0) {
        pwOut.Printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
    }
    metric.Display(true, pwOut);
    pwOut.Println();
    pwOut.Close();
}
/// <summary>Lets you test out the TreeAnnotatorAndBinarizer on the command line.</summary> /// <param name="args"> /// Command line arguments: All flags accepted by FactoredParser.setOptionFlag /// and -train treebankPath [fileRanges] /// </param> public static void Main(string[] args) { Options op = new Options(); string treebankPath = null; IFileFilter trainFilter = null; int i = 0; while (i < args.Length && args[i].StartsWith("-")) { if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-train")) { int numSubArgs = NumSubArgs(args, i); i++; if (numSubArgs >= 1) { treebankPath = args[i]; i++; } else { throw new Exception("Error: -train option must have treebankPath as first argument."); } if (numSubArgs == 2) { trainFilter = new NumberRangesFileFilter(args[i++], true); } else { if (numSubArgs >= 3) { int low = System.Convert.ToInt32(args[i]); int high = System.Convert.ToInt32(args[i + 1]); trainFilter = new NumberRangeFileFilter(low, high, true); i += 2; } } } else { i = op.SetOption(args, i); } } if (i < args.Length) { log.Info("usage: java TreeAnnotatorAndBinarizer options*"); log.Info(" Options are like for lexicalized parser including -train treebankPath fileRange]"); return; } log.Info("Annotating from treebank dir: " + treebankPath); Treebank trainTreebank = op.tlpParams.DiskTreebank(); if (trainFilter == null) { trainTreebank.LoadPath(treebankPath); } else { trainTreebank.LoadPath(treebankPath, trainFilter); } Treebank binaryTrainTreebank = GetAnnotatedBinaryTreebankFromTreebank(trainTreebank, null, null, op).First(); IEnumerator <Tree> it = trainTreebank.GetEnumerator(); foreach (Tree t in binaryTrainTreebank) { System.Console.Out.WriteLine("Original tree:"); it.Current.PennPrint(); System.Console.Out.WriteLine("Binarized tree:"); t.PennPrint(); System.Console.Out.WriteLine(); } }
/// <summary>
/// Command-line tool that prints lexicon statistics (word/tag/lemma/morphological-tag
/// counts and cross-tabulations) for an Arabic or French factored treebank.
/// Usage: language filename features (features is a comma-separated list).
/// </summary>
/// <param name="args"/>
public static void Main(string[] args) {
    if (args.Length != 3) {
        System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
        System.Environment.Exit(-1);
    }
    Language language = Language.ValueOf(args[0]);
    ITreebankLangParserParams tlpp = language.@params;
    // Anything that is not Arabic is treated as French here.
    if (language.Equals(Language.Arabic)) {
        string[] options = new string[] { "-arabicFactored" };
        tlpp.SetOptionFlag(options, 0);
    } else {
        string[] options = new string[] { "-frenchFactored" };
        tlpp.SetOptionFlag(options, 0);
    }
    Treebank tb = tlpp.DiskTreebank();
    tb.LoadPath(args[1]);
    MorphoFeatureSpecification morphoSpec = language.Equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();
    string[] features = args[2].Trim().Split(",");
    foreach (string feature in features) {
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
    }
    // Counters (initial capacities are rough expected sizes)
    ICounter<string> wordTagCounter = new ClassicCounter<string>(30000);
    ICounter<string> morphTagCounter = new ClassicCounter<string>(500);
    // Counter<String> signatureTagCounter = new ClassicCounter<String>();
    ICounter<string> morphCounter = new ClassicCounter<string>(500);
    ICounter<string> wordCounter = new ClassicCounter<string>(30000);
    ICounter<string> tagCounter = new ClassicCounter<string>(300);
    ICounter<string> lemmaCounter = new ClassicCounter<string>(25000);
    ICounter<string> lemmaTagCounter = new ClassicCounter<string>(25000);
    ICounter<string> richTagCounter = new ClassicCounter<string>(1000);
    ICounter<string> reducedTagCounter = new ClassicCounter<string>(500);
    ICounter<string> reducedTagLemmaCounter = new ClassicCounter<string>(500);
    IDictionary<string, ICollection<string>> wordLemmaMap = Generics.NewHashMap();
    TwoDimensionalIntCounter<string, string> lemmaReducedTagCounter = new TwoDimensionalIntCounter<string, string>(30000);
    TwoDimensionalIntCounter<string, string> reducedTagTagCounter = new TwoDimensionalIntCounter<string, string>(500);
    TwoDimensionalIntCounter<string, string> tagReducedTagCounter = new TwoDimensionalIntCounter<string, string>(300);
    int numTrees = 0;
    foreach (Tree tree in tb) {
        // Apply language-specific transforms to every non-leaf node in place.
        foreach (Tree subTree in tree) {
            if (!subTree.IsLeaf()) {
                tlpp.TransformTree(subTree, tree);
            }
        }
        IList<ILabel> pretermList = tree.PreTerminalYield();
        IList<ILabel> yield = tree.Yield();
        System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);
        int yieldLen = yield.Count;
        for (int i = 0; i < yieldLen; ++i) {
            string tag = pretermList[i].Value();
            string word = yield[i].Value();
            string morph = ((CoreLabel)yield[i]).OriginalText();
            // Note: if there is no lemma, then we use the surface form.
            Pair<string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
            string lemma = lemmaTag.First();
            string richTag = lemmaTag.Second();
            // WSGDEBUG
            if (tag.Contains("MW")) {
                lemma += "-MWE";
            }
            lemmaCounter.IncrementCount(lemma);
            lemmaTagCounter.IncrementCount(lemma + tag);
            richTagCounter.IncrementCount(richTag);
            string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
            reducedTagCounter.IncrementCount(reducedTag);
            reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
            wordTagCounter.IncrementCount(word + tag);
            morphTagCounter.IncrementCount(morph + tag);
            morphCounter.IncrementCount(morph);
            wordCounter.IncrementCount(word);
            tagCounter.IncrementCount(tag);
            reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;
            if (wordLemmaMap.Contains(word)) {
                wordLemmaMap[word].Add(lemma);
            } else {
                ICollection<string> lemmas = Generics.NewHashSet(1);
                // BUGFIX: the first lemma seen for a word was never added, so words
                // occurring once ended up with an empty lemma set ("NO LEMMAS FOR WORD").
                lemmas.Add(lemma);
                wordLemmaMap[word] = lemmas;
            }
            lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
            reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
            tagReducedTagCounter.IncrementCount(tag, reducedTag);
        }
        ++numTrees;
    }
    // Barf...
    System.Console.Out.WriteLine("Language: " + language.ToString());
    System.Console.Out.Printf("#trees:\t%d%n", numTrees);
    System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
    System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
    System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
    System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
    System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
    System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);
    // Extra
    System.Console.Out.WriteLine("==================");
    StringBuilder sbNoLemma = new StringBuilder();
    StringBuilder sbMultLemmas = new StringBuilder();
    foreach (KeyValuePair<string, ICollection<string>> wordLemmas in wordLemmaMap) {
        string word = wordLemmas.Key;
        ICollection<string> lemmas = wordLemmas.Value;
        if (lemmas.Count == 0) {
            // Defensive: should no longer occur now that the first lemma is recorded.
            sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n");
            continue;
        }
        if (lemmas.Count > 1) {
            sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n");
            continue;
        }
        // Exactly one lemma; fetch it via the enumerator.
        // BUGFIX: Current is undefined before MoveNext(), so the single element
        // must be read after advancing (previously Current was read immediately,
        // a mistranslation of Java's iterator().next()).
        IEnumerator<string> lemmaItr = lemmas.GetEnumerator();
        lemmaItr.MoveNext();
        string lemma = lemmaItr.Current;
        ICollection<string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet();
        if (reducedTags.Count > 1) {
            System.Console.Out.Printf("%s --> %s%n", word, lemma);
            foreach (string reducedTag in reducedTags) {
                int count = lemmaReducedTagCounter.GetCount(lemma, reducedTag);
                string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet());
                System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
            }
            System.Console.Out.WriteLine();
        }
    }
    System.Console.Out.WriteLine("==================");
    System.Console.Out.WriteLine(sbNoLemma.ToString());
    System.Console.Out.WriteLine(sbMultLemmas.ToString());
    System.Console.Out.WriteLine("==================");
    // Finally, dump the reduced-tag distribution for each POS tag, sorted by tag.
    IList<string> tags = new List<string>(tagReducedTagCounter.FirstKeySet());
    tags.Sort();
    foreach (string tag_1 in tags) {
        System.Console.Out.WriteLine(tag_1);
        ICollection<string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
        foreach (string reducedTag in reducedTags) {
            int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
            // reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
            System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
        }
        System.Console.Out.WriteLine();
    }
    System.Console.Out.WriteLine("==================");
}
/// <summary>Run the Evalb scoring metric on guess/gold input.</summary>
/// <remarks>
/// Run the Evalb scoring metric on guess/gold input. The default language is English.
/// Command-line driver: parses options, loads the gold and guess treebanks from disk,
/// and prints corpus-level (and optionally per-category) labeled precision/recall.
/// </remarks>
/// <param name="args">Option flags followed by two positional arguments: the gold
/// treebank path and the guess treebank path (see <c>Usage()</c>).</param>
public static void Main(string[] args)
{
    // Fail fast if there are not even enough arguments for the two positional paths.
    if (args.Length < minArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    // Parse flags into a Properties map; positional args end up under the empty-string key.
    Properties options = StringUtils.ArgsToProperties(args, OptionArgDefs());
    // -l: evaluation language (drives tokenization/collinization conventions); default English.
    Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
    ITreebankLangParserParams tlpp = language.@params;
    // -y: skip gold trees whose yield exceeds this length (default: no limit).
    int maxGoldYield = PropertiesUtils.GetInt(options, "y", int.MaxValue);
    // -v: per-tree verbose output.
    bool Verbose = PropertiesUtils.GetBool(options, "v", false);
    // -s K: collect the K worst-scoring trees by F1 and emit them at the end.
    bool sortByF1 = PropertiesUtils.HasProperty(options, "s");
    int worstKTreesToEmit = PropertiesUtils.GetInt(options, "s", 0);
    // Min-heap ordered by F1 so the worst parses can be recovered later; only built when -s given.
    PriorityQueue<Triple<double, Tree, Tree>> queue = sortByF1 ? 
        new PriorityQueue<Triple<double, Tree, Tree>>(2000, new Evalb.F1Comparator()) : null;
    // -c: also report per-category LP/LR, optionally restricted by the -f label regex.
    bool doCatLevel = PropertiesUtils.GetBool(options, "c", false);
    string labelRegex = options.GetProperty("f", null);
    // -e: treebank file encoding.
    string encoding = options.GetProperty("e", "UTF-8");
    // NOTE(review): this is a Java-translated idiom — Java's String.split takes a regex,
    // but C#'s string.Split takes a literal separator unless a compat extension supplies
    // regex semantics here. Verify "\\s+" actually splits on whitespace in this runtime.
    string[] parsedArgs = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
    if (parsedArgs.Length != minArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    string goldFile = parsedArgs[0];
    string guessFile = parsedArgs[1];
    // Command-line has been parsed. Configure the metric for evaluation.
    tlpp.SetInputEncoding(encoding);
    PrintWriter pwOut = tlpp.Pw();
    // Load both treebanks from disk and echo their summaries for the record.
    Treebank guessTreebank = tlpp.DiskTreebank();
    guessTreebank.LoadPath(guessFile);
    pwOut.Println("GUESS TREEBANK:");
    pwOut.Println(guessTreebank.TextualSummary());
    Treebank goldTreebank = tlpp.DiskTreebank();
    goldTreebank.LoadPath(goldFile);
    pwOut.Println("GOLD TREEBANK:");
    pwOut.Println(goldTreebank.TextualSummary());
    Evalb metric = new Evalb("Evalb LP/LR", true);
    EvalbByCat evalbCat = (doCatLevel) ? 
        new EvalbByCat("EvalbByCat LP/LR", true, labelRegex) : null;
    // The collinizer normalizes trees (punctuation, functional tags, etc.) before scoring.
    ITreeTransformer tc = tlpp.Collinizer();
    //The evalb ref implementation assigns status for each tree pair as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    IEnumerator<Tree> goldItr = goldTreebank.GetEnumerator();
    IEnumerator<Tree> guessItr = guessTreebank.GetEnumerator();
    int goldLineId = 0;
    int guessLineId = 0;
    int skippedGuessTrees = 0;
    // NOTE(review): translated from Java hasNext()/next(). In C#, MoveNext() both tests
    // and advances, and && short-circuits: if the guess file is longer than the gold file,
    // one guess tree is consumed here without a gold partner, so the trailing length-mismatch
    // check below reports based on the *next* elements. Harmless for equal-length input;
    // confirm this is acceptable for the mismatch warning.
    while (guessItr.MoveNext() && goldItr.MoveNext())
    {
        Tree guessTree = guessItr.Current;
        IList<ILabel> guessYield = guessTree.Yield();
        guessLineId++;
        Tree goldTree = goldItr.Current;
        IList<ILabel> goldYield = goldTree.Yield();
        goldLineId++;
        // Check that we should evaluate this tree
        if (goldYield.Count > maxGoldYield)
        {
            skippedGuessTrees++;
            continue;
        }
        // Only trees with equal yields can be evaluated
        if (goldYield.Count != guessYield.Count)
        {
            pwOut.Printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.Count, guessYield.Count, goldLineId, guessLineId);
            skippedGuessTrees++;
            continue;
        }
        // Normalize both trees identically, then score the pair.
        Tree evalGuess = tc.TransformTree(guessTree);
        Tree evalGold = tc.TransformTree(goldTree);
        metric.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
        if (doCatLevel)
        {
            evalbCat.Evaluate(evalGuess, evalGold, ((Verbose) ? 
                pwOut : null));
        }
        if (sortByF1)
        {
            // Record the UN-normalized tree pair with its F1 for worst-K emission later.
            StoreTrees(queue, guessTree, goldTree, metric.GetLastF1());
        }
    }
    // If either file still has trees, the inputs were not aligned one-to-one.
    if (guessItr.MoveNext() || goldItr.MoveNext())
    {
        System.Console.Error.Printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId);
    }
    // Final report: corpus-level metric, then optional per-category breakdown and worst-K trees.
    pwOut.Println("================================================================================");
    if (skippedGuessTrees != 0)
    {
        pwOut.Printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
    }
    metric.Display(true, pwOut);
    pwOut.Println();
    if (doCatLevel)
    {
        evalbCat.Display(true, pwOut);
        pwOut.Println();
    }
    if (sortByF1)
    {
        EmitSortedTrees(queue, worstKTreesToEmit, guessFile);
    }
    pwOut.Close();
}