public FilterConfusingRules(LexicalizedParser parser)
{
    BinaryGrammar binaryGrammar = parser.bg;
    UnaryGrammar unaryGrammar = parser.ug;
    Options options = parser.GetOp();
    IIndex<string> states = parser.stateIndex;
    // Record each unary rule child once, keyed by its basic category
    // (we only make one matrix per state, using the basic category).
    foreach (UnaryRule rule in unaryGrammar)
    {
        string child = states.Get(rule.child);
        unaryRules.Add(options.Langpack().BasicCategory(child));
    }
    // Likewise for binary rules: record the (left, right) basic-category pair.
    foreach (BinaryRule rule in binaryGrammar)
    {
        string left = options.Langpack().BasicCategory(states.Get(rule.leftChild));
        string right = options.Langpack().BasicCategory(states.Get(rule.rightChild));
        binaryRules.Add(left, right);
    }
}
/// <summary>
/// Loads a serialized LexicalizedParser (-input), attaches a TaggerReranker
/// built from the given tagger model (-tagger), and saves the combined
/// parser back out (-output).
/// </summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
public static void Main(string[] args)
{
    string taggerFile = null;
    string inputFile = null;
    string outputFile = null;
    // NOTE(review): weight is parsed from -weight but never used below — confirm intended.
    double weight = 1.0;
    for (int argIndex = 0; argIndex < args.Length;)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-tagger"))
        {
            taggerFile = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
        {
            inputFile = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
        {
            outputFile = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-weight"))
        {
            // Fixed: double.ValueOf does not exist in C# — it is a Sharpen
            // artifact of Java's Double.valueOf; double.Parse is the C# equivalent.
            weight = double.Parse(args[argIndex + 1]);
            argIndex += 2;
        }
        else
        {
            throw new ArgumentException("Unknown argument: " + args[argIndex]);
        }
    }
    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(inputFile));
    MaxentTagger tagger = new MaxentTagger(taggerFile);
    parser.reranker = new TaggerReranker(tagger, parser.GetOp());
    parser.SaveParserToSerialized(outputFile);
}
public virtual void RunTest(string[] args)
{
    // Deserialize a parser from the file named by args[0].
    LexicalizedParser pd = ((LexicalizedParser)LexicalizedParser.LoadModel(args[0]));
    // Adopt the options that were serialized alongside the parser.
    op = pd.GetOp();
    // Load the test trees in the numeric file range [args[2], args[3]] from args[1].
    Treebank testTreebank = op.tlpParams.MemoryTreebank();
    int rangeLow = System.Convert.ToInt32(args[2]);
    int rangeHigh = System.Convert.ToInt32(args[3]);
    testTreebank.LoadPath(args[1], new NumberRangeFileFilter(rangeLow, rangeHigh, true));
    // Remaining command-line flags become parser options.
    op.SetOptionsOrWarn(args, 4, args.Length);
    TestOnTreebank(pd, new EnglishTreebankParserParams(), testTreebank, args[1], pd.stateIndex);
}
/// <summary>
/// Builds a DVParser around an existing LexicalizedParser: fixes the random
/// seed, logs every training-relevant option, and constructs the DVModel
/// from the parser's grammar.
/// </summary>
public DVParser(LexicalizedParser parser)
{
    this.parser = parser;
    this.op = parser.GetOp();
    // Make runs reproducible: pick and record a seed when none was given.
    if (op.trainOptions.randomSeed == 0)
    {
        op.trainOptions.randomSeed = Runtime.NanoTime();
        log.Info("Random seed not set, using randomly chosen seed of " + op.trainOptions.randomSeed);
    }
    else
    {
        log.Info("Random seed set to " + op.trainOptions.randomSeed);
    }
    // Echo the full training configuration so a run can be reconstructed from its log.
    log.Info("Word vector file: " + op.lexOptions.wordVectorFile);
    log.Info("Size of word vectors: " + op.lexOptions.numHid);
    log.Info("Number of hypothesis trees to train against: " + op.trainOptions.dvKBest);
    log.Info("Number of trees in one batch: " + op.trainOptions.batchSize);
    log.Info("Number of iterations of trees: " + op.trainOptions.trainingIterations);
    log.Info("Number of qn iterations per batch: " + op.trainOptions.qnIterationsPerBatch);
    log.Info("Learning rate: " + op.trainOptions.learningRate);
    log.Info("Delta margin: " + op.trainOptions.deltaMargin);
    log.Info("regCost: " + op.trainOptions.regCost);
    log.Info("Using unknown word vector for numbers: " + op.trainOptions.unknownNumberVector);
    log.Info("Using unknown dashed word vector heuristics: " + op.trainOptions.unknownDashedWordVectors);
    log.Info("Using unknown word vector for capitalized words: " + op.trainOptions.unknownCapsVector);
    log.Info("Using unknown number vector for Chinese words: " + op.trainOptions.unknownChineseNumberVector);
    log.Info("Using unknown year vector for Chinese words: " + op.trainOptions.unknownChineseYearVector);
    log.Info("Using unknown percent vector for Chinese words: " + op.trainOptions.unknownChinesePercentVector);
    log.Info("Initial matrices scaled by: " + op.trainOptions.scalingForInit);
    log.Info("Training will use " + op.trainOptions.trainingThreads + " thread(s)");
    log.Info("Context words are " + ((op.trainOptions.useContextWords) ? "on" : "off"));
    log.Info("Model will " + ((op.trainOptions.dvSimplifiedModel) ? string.Empty : "not ") + "be simplified");
    this.dvModel = new DVModel(op, parser.stateIndex, parser.ug, parser.bg);
    // Sanity checks: transform and score matrices must be paired one-to-one.
    // (Note the asymmetry: unary maps use .Count, binary maps use .Size() —
    // presumably different container types from the Sharpen translation.)
    if (dvModel.unaryTransform.Count != dvModel.unaryScore.Count)
    {
        throw new AssertionError("Unary transform and score size not the same");
    }
    if (dvModel.binaryTransform.Size() != dvModel.binaryScore.Size())
    {
        throw new AssertionError("Binary transform and score size not the same");
    }
}
/// <summary>
/// Command-line driver for the Chinese character-based lexicon: can print
/// treebank statistics (-stats), train or load a parser (-parser) and/or a
/// lexicon (-lex), and evaluate segmentation/parsing on a test treebank (-test).
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    // Number of sub-arguments each flag consumes from the command line.
    // Fixed: the original said int.Parse(3) etc., which does not compile in
    // C# (int.Parse takes a string) — a Sharpen artifact of Java's
    // Integer.valueOf(n); the plain int literals are the correct translation.
    IDictionary<string, int> flagsToNumArgs = Generics.NewHashMap();
    flagsToNumArgs["-parser"] = 3;
    flagsToNumArgs["-lex"] = 3;
    flagsToNumArgs["-test"] = 2;
    flagsToNumArgs["-out"] = 1;
    flagsToNumArgs["-lengthPenalty"] = 1;
    flagsToNumArgs["-penaltyType"] = 1;
    flagsToNumArgs["-maxLength"] = 1;
    flagsToNumArgs["-stats"] = 2;
    IDictionary<string, string[]> argMap = StringUtils.ArgsToMap(args, flagsToNumArgs);
    bool eval = argMap.Contains("-eval");
    PrintWriter pw = null;
    if (argMap.Contains("-out"))
    {
        // GB18030 output encoding for Chinese text.
        pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream((argMap["-out"])[0]), "GB18030"), true);
    }
    log.Info("ChineseCharacterBasedLexicon called with args:");
    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    for (int i = 0; i < args.Length; i++)
    {
        ctpp.SetOptionFlag(args, i);
        log.Info(" " + args[i]);
    }
    log.Info();
    Options op = new Options(ctpp);
    if (argMap.Contains("-stats"))
    {
        // Print treebank statistics and exit.
        string[] statArgs = (argMap["-stats"]);
        MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
        IFileFilter trainFilt = new NumberRangesFileFilter(statArgs[1], false);
        rawTrainTreebank.LoadPath(new File(statArgs[0]), trainFilt);
        log.Info("Done reading trees.");
        MemoryTreebank trainTreebank;
        if (argMap.Contains("-annotate"))
        {
            trainTreebank = new MemoryTreebank();
            TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
            foreach (Tree tree in rawTrainTreebank)
            {
                trainTreebank.Add(annotator.TransformTree(tree));
            }
            log.Info("Done annotating trees.");
        }
        else
        {
            trainTreebank = rawTrainTreebank;
        }
        PrintStats(trainTreebank, pw);
        System.Environment.Exit(0);
    }
    int maxLength = 1000000;
    // Test.verbose = true;
    if (argMap.Contains("-norm"))
    {
        op.testOptions.lengthNormalization = true;
    }
    if (argMap.Contains("-maxLength"))
    {
        maxLength = System.Convert.ToInt32((argMap["-maxLength"])[0]);
    }
    // NOTE(review): this unconditionally caps the parser's internal max length
    // at 120 regardless of the -maxLength flag above (which only affects the
    // local skip threshold) — confirm this is intended.
    op.testOptions.maxLength = 120;
    bool combo = argMap.Contains("-combo");
    if (combo)
    {
        ctpp.useCharacterBasedLexicon = true;
        op.testOptions.maxSpanForTags = 10;
        op.doDep = false;
        op.dcTags = false;
    }
    LexicalizedParser lp = null;
    ILexicon lex = null;
    if (argMap.Contains("-parser"))
    {
        string[] parserArgs = (argMap["-parser"]);
        if (parserArgs.Length > 1)
        {
            // Train a parser from a treebank file range; optionally serialize it.
            IFileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
            lp = LexicalizedParser.TrainFromTreebank(parserArgs[0], trainFilt, op);
            if (parserArgs.Length == 3)
            {
                string filename = parserArgs[2];
                log.Info("Writing parser in serialized format to file " + filename + " ");
                System.Console.Error.Flush();
                ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                @out.WriteObject(lp);
                @out.Close();
                log.Info("done.");
            }
        }
        else
        {
            // Load an already-serialized parser instead of training.
            string parserFile = parserArgs[0];
            lp = LexicalizedParser.LoadModel(parserFile, op);
        }
        // Adopt the loaded/trained parser's lexicon and options.
        lex = lp.GetLexicon();
        op = lp.GetOp();
        ctpp = (ChineseTreebankParserParams)op.tlpParams;
    }
    if (argMap.Contains("-rad"))
    {
        ctpp.useUnknownCharacterModel = true;
    }
    if (argMap.Contains("-lengthPenalty"))
    {
        ctpp.lengthPenalty = double.Parse((argMap["-lengthPenalty"])[0]);
    }
    if (argMap.Contains("-penaltyType"))
    {
        ctpp.penaltyType = System.Convert.ToInt32((argMap["-penaltyType"])[0]);
    }
    if (argMap.Contains("-lex"))
    {
        string[] lexArgs = (argMap["-lex"]);
        if (lexArgs.Length > 1)
        {
            // Train a lexicon from a treebank file range; optionally serialize it.
            IIndex<string> wordIndex = new HashIndex<string>();
            IIndex<string> tagIndex = new HashIndex<string>();
            lex = ctpp.Lex(op, wordIndex, tagIndex);
            MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
            IFileFilter trainFilt = new NumberRangesFileFilter(lexArgs[1], false);
            rawTrainTreebank.LoadPath(new File(lexArgs[0]), trainFilt);
            log.Info("Done reading trees.");
            MemoryTreebank trainTreebank;
            if (argMap.Contains("-annotate"))
            {
                trainTreebank = new MemoryTreebank();
                TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
                foreach (Tree tree in rawTrainTreebank)
                {
                    Tree annotated = annotator.TransformTree(tree);
                    trainTreebank.Add(annotated);
                }
                log.Info("Done annotating trees.");
            }
            else
            {
                trainTreebank = rawTrainTreebank;
            }
            lex.InitializeTraining(trainTreebank.Count);
            lex.Train(trainTreebank);
            lex.FinishTraining();
            log.Info("Done training lexicon.");
            if (lexArgs.Length == 3)
            {
                // (The length check already guarantees lexArgs[2] exists.)
                string filename = lexArgs[2];
                log.Info("Writing lexicon in serialized format to file " + filename + " ");
                System.Console.Error.Flush();
                ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                @out.WriteObject(lex);
                @out.Close();
                log.Info("done.");
            }
        }
        else
        {
            // Read a serialized lexicon instead of training one.
            string lexFile = lexArgs.Length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
            log.Info("Reading Lexicon from file " + lexFile);
            ObjectInputStream @in = IOUtils.ReadStreamFromString(lexFile);
            try
            {
                lex = (ILexicon)@in.ReadObject();
            }
            catch (TypeLoadException)
            {
                throw new Exception("Bad serialized file: " + lexFile);
            }
            @in.Close();
        }
    }
    if (argMap.Contains("-test"))
    {
        // Evaluate segmentation and/or parsing on a test treebank.
        bool segmentWords = ctpp.segment;
        bool parse = lp != null;
        System.Diagnostics.Debug.Assert((parse || segmentWords));
        // WordCatConstituent.collinizeWords = argMap.containsKey("-collinizeWords");
        // WordCatConstituent.collinizeTags = argMap.containsKey("-collinizeTags");
        IWordSegmenter seg = null;
        if (segmentWords)
        {
            seg = (IWordSegmenter)lex;
        }
        string[] testArgs = (argMap["-test"]);
        MemoryTreebank testTreebank = op.tlpParams.MemoryTreebank();
        IFileFilter testFilt = new NumberRangesFileFilter(testArgs[1], false);
        testTreebank.LoadPath(new File(testArgs[0]), testFilt);
        ITreeTransformer subcategoryStripper = op.tlpParams.SubcategoryStripper();
        ITreeTransformer collinizer = ctpp.Collinizer();
        WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
        WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
        EquivalenceClassEval basicEval = new EquivalenceClassEval(eqclass, eqcheck, "basic");
        EquivalenceClassEval collinsEval = new EquivalenceClassEval(eqclass, eqcheck, "collinized");
        // Decide which bracket types to score, depending on the task mix.
        IList<string> evalTypes = new List<string>(3);
        bool goodPOS = false;
        if (segmentWords)
        {
            evalTypes.Add(WordCatConstituent.wordType);
            if (ctpp.segmentMarkov && !parse)
            {
                evalTypes.Add(WordCatConstituent.tagType);
                goodPOS = true;
            }
        }
        if (parse)
        {
            evalTypes.Add(WordCatConstituent.tagType);
            evalTypes.Add(WordCatConstituent.catType);
            if (combo)
            {
                evalTypes.Add(WordCatConstituent.wordType);
                goodPOS = true;
            }
        }
        TreeToBracketProcessor proc = new TreeToBracketProcessor(evalTypes);
        log.Info("Testing...");
        foreach (Tree goldTop in testTreebank)
        {
            Tree gold = goldTop.FirstChild();
            IList<IHasWord> goldSentence = gold.YieldHasWord();
            if (goldSentence.Count > maxLength)
            {
                log.Info("Skipping sentence; too long: " + goldSentence.Count);
                continue;
            }
            else
            {
                log.Info("Processing sentence; length: " + goldSentence.Count);
            }
            IList<IHasWord> s;
            if (segmentWords)
            {
                // Concatenate the gold characters and re-segment them with our segmenter.
                StringBuilder goldCharBuf = new StringBuilder();
                foreach (IHasWord aGoldSentence in goldSentence)
                {
                    StringLabel word = (StringLabel)aGoldSentence;
                    goldCharBuf.Append(word.Value());
                }
                string goldChars = goldCharBuf.ToString();
                s = seg.Segment(goldChars);
            }
            else
            {
                s = goldSentence;
            }
            Tree tree;
            if (parse)
            {
                tree = lp.ParseTree(s);
                if (tree == null)
                {
                    throw new Exception("PARSER RETURNED NULL!!!");
                }
            }
            else
            {
                // Segmentation-only mode: build a flat tree over the segmented words.
                tree = Edu.Stanford.Nlp.Trees.Trees.ToFlatTree(s);
                tree = subcategoryStripper.TransformTree(tree);
            }
            if (pw != null)
            {
                if (parse)
                {
                    tree.PennPrint(pw);
                }
                else
                {
                    // NOTE(review): this loop reads Current before MoveNext() has
                    // been called, which is invalid for .NET enumerators — it
                    // relies on Sharpen's Java-style iterator semantics; verify.
                    IEnumerator sentIter = s.GetEnumerator();
                    for (; ;)
                    {
                        Word word = (Word)sentIter.Current;
                        pw.Print(word.Word());
                        if (sentIter.MoveNext())
                        {
                            pw.Print(" ");
                        }
                        else
                        {
                            break;
                        }
                    }
                }
                pw.Println();
            }
            if (eval)
            {
                // Score this sentence, both raw and collinized.
                ICollection ourBrackets;
                ICollection goldBrackets;
                ourBrackets = proc.AllBrackets(tree);
                goldBrackets = proc.AllBrackets(gold);
                if (goodPOS)
                {
                    Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(tree, gold));
                    Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(gold, tree));
                }
                basicEval.Eval(ourBrackets, goldBrackets);
                System.Console.Out.WriteLine("\nScores:");
                basicEval.DisplayLast();
                Tree collinsTree = collinizer.TransformTree(tree);
                Tree collinsGold = collinizer.TransformTree(gold);
                ourBrackets = proc.AllBrackets(collinsTree);
                goldBrackets = proc.AllBrackets(collinsGold);
                if (goodPOS)
                {
                    Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsTree, collinsGold));
                    Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsGold, collinsTree));
                }
                collinsEval.Eval(ourBrackets, goldBrackets);
                System.Console.Out.WriteLine("\nCollinized scores:");
                collinsEval.DisplayLast();
                System.Console.Out.WriteLine();
            }
        }
        if (eval)
        {
            // Corpus-level summary scores.
            basicEval.Display();
            System.Console.Out.WriteLine();
            collinsEval.Display();
        }
    }
}
/// <summary>Wraps an already-built DVModel around the given base parser.</summary>
public DVParser(DVModel model, LexicalizedParser parser)
{
    this.dvModel = model;
    this.parser = parser;
    this.op = parser.GetOp();
}
/// <summary>This example shows a few more ways of providing input to a parser.</summary>
/// <remarks>
/// This example shows a few more ways of providing input to a parser.
/// Usage: ParserDemo2 [grammar [textFile]]
/// </remarks>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    // Grammar defaults to the distributed English PCFG model.
    string grammar = args.Length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    string[] options = new string[] { "-maxLength", "80", "-retainTmpSubcategories" };
    LexicalizedParser lp = ((LexicalizedParser)LexicalizedParser.LoadModel(grammar, options));
    ITreebankLanguagePack tlp = lp.GetOp().Langpack();
    IGrammaticalStructureFactory gsf = tlp.GrammaticalStructureFactory();
    IEnumerable<IList<IHasWord>> sentences;
    if (args.Length > 1)
    {
        // Read and sentence-split a text file named on the command line.
        DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
        IList<IList<IHasWord>> tmp = new List<IList<IHasWord>>();
        foreach (IList<IHasWord> sentence in dp)
        {
            tmp.Add(sentence);
        }
        sentences = tmp;
    }
    else
    {
        // Showing tokenization and parsing in code a couple of different ways.
        // 1) Pre-tokenized words wrapped one at a time.
        string[] sent = new string[] { "This", "is", "an", "easy", "sentence", "." };
        IList<IHasWord> sentence = new List<IHasWord>();
        foreach (string word in sent)
        {
            sentence.Add(new Word(word));
        }
        string sent2 = ("This is a slightly longer and more complex " + "sentence requiring tokenization.");
        // 2) Use the default tokenizer for this TreebankLanguagePack.
        ITokenizer<IHasWord> toke = tlp.GetTokenizerFactory().GetTokenizer(new StringReader(sent2));
        IList<IHasWord> sentence2 = toke.Tokenize();
        // 3) Caller-supplied gold POS tags.
        string[] sent3 = new string[] { "It", "can", "can", "it", "." };
        string[] tag3 = new string[] { "PRP", "MD", "VB", "PRP", "." };
        // Parser gets second "can" wrong without help
        IList<TaggedWord> sentence3 = new List<TaggedWord>();
        for (int i = 0; i < sent3.Length; i++)
        {
            sentence3.Add(new TaggedWord(sent3[i], tag3[i]));
        }
        Tree parse = lp.Parse(sentence3);
        parse.PennPrint();
        IList<IList<IHasWord>> tmp = new List<IList<IHasWord>>();
        tmp.Add(sentence);
        tmp.Add(sentence2);
        tmp.Add(sentence3);
        sentences = tmp;
    }
    // Parse each collected sentence and show several views of the result.
    foreach (IList<IHasWord> sentence_1 in sentences)
    {
        Tree parse = lp.Parse(sentence_1);
        parse.PennPrint();
        System.Console.Out.WriteLine();
        // CC-processed typed dependencies for the parse.
        GrammaticalStructure gs = gsf.NewGrammaticalStructure(parse);
        IList<TypedDependency> tdl = gs.TypedDependenciesCCprocessed();
        System.Console.Out.WriteLine(tdl);
        System.Console.Out.WriteLine();
        System.Console.Out.WriteLine("The words of the sentence:");
        foreach (ILabel lab in parse.Yield())
        {
            if (lab is CoreLabel)
            {
                System.Console.Out.WriteLine(((CoreLabel)lab).ToString(CoreLabel.OutputFormat.ValueMap));
            }
            else
            {
                System.Console.Out.WriteLine(lab);
            }
        }
        System.Console.Out.WriteLine();
        System.Console.Out.WriteLine(parse.TaggedYield());
        System.Console.Out.WriteLine();
    }
    // This method turns the String into a single sentence using the
    // default tokenizer for the TreebankLanguagePack.
    string sent3_1 = "This is one last test!";
    lp.Parse(sent3_1).PennPrint();
}
/// <summary>
/// Loads a LexicalizedParser and evaluates it on a test treebank once per
/// candidate value of baseParserWeight, reporting labeled and tag scores.
/// </summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
public static void Main(string[] args)
{
    // NOTE(review): dvmodelFile is never assigned or read after this — looks dead; confirm.
    string dvmodelFile = null;
    string lexparserFile = null;
    string testTreebankPath = null;
    IFileFilter testTreebankFilter = null;
    IList<string> unusedArgs = new List<string>();
    for (int argIndex = 0; argIndex < args.Length;)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-lexparser"))
        {
            lexparserFile = args[argIndex + 1];
            argIndex += 2;
        }
        else
        {
            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
            {
                Pair<string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                testTreebankPath = treebankDescription.First();
                testTreebankFilter = treebankDescription.Second();
            }
            else
            {
                // Unrecognized flags are forwarded to the parser loader.
                unusedArgs.Add(args[argIndex++]);
            }
        }
    }
    log.Info("Loading lexparser from: " + lexparserFile);
    string[] newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
    LexicalizedParser lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(lexparserFile, newArgs));
    log.Info("... done");
    Treebank testTreebank = null;
    if (testTreebankPath != null)
    {
        log.Info("Reading in trees from " + testTreebankPath);
        if (testTreebankFilter != null)
        {
            log.Info("Filtering on " + testTreebankFilter);
        }
        testTreebank = lexparser.GetOp().tlpParams.MemoryTreebank();
        testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
        log.Info("Read in " + testTreebank.Count + " trees for testing");
    }
    // NOTE(review): `weights` is not declared in this method — presumably a
    // class-level array of candidate baseParserWeight values; TODO confirm.
    double[] labelResults = new double[weights.Length];
    double[] tagResults = new double[weights.Length];
    // Evaluate the parser once per candidate weight.
    for (int i = 0; i < weights.Length; ++i)
    {
        lexparser.GetOp().baseParserWeight = weights[i];
        EvaluateTreebank evaluator = new EvaluateTreebank(lexparser);
        evaluator.TestOnTreebank(testTreebank);
        labelResults[i] = evaluator.GetLBScore();
        tagResults[i] = evaluator.GetTagScore();
    }
    // Summarize all results at the end for easy comparison.
    for (int i_1 = 0; i_1 < weights.Length; ++i_1)
    {
        log.Info("LexicalizedParser weight " + weights[i_1] + ": labeled " + labelResults[i_1] + " tag " + tagResults[i_1]);
    }
}
/// <summary>
/// Combines several serialized DVParser models (-baseModels) into one parser
/// whose reranker uses all of the embedded DVModels, saves it (-model), and
/// optionally evaluates it on a test treebank (-testTreebank).
/// </summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
public static void Main(string[] args)
{
    string modelPath = null;
    IList<string> baseModelPaths = null;
    string testTreebankPath = null;
    IFileFilter testTreebankFilter = null;
    IList<string> unusedArgs = new List<string>();
    for (int argIndex = 0; argIndex < args.Length;)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
        {
            modelPath = args[argIndex + 1];
            argIndex += 2;
        }
        else
        {
            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
            {
                Pair<string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                testTreebankPath = treebankDescription.First();
                testTreebankFilter = treebankDescription.Second();
            }
            else
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-baseModels"))
                {
                    // Consume all following non-flag arguments as model paths.
                    argIndex++;
                    baseModelPaths = new List<string>();
                    while (argIndex < args.Length && args[argIndex][0] != '-')
                    {
                        baseModelPaths.Add(args[argIndex++]);
                    }
                    if (baseModelPaths.Count == 0)
                    {
                        throw new ArgumentException("Found an argument -baseModels with no actual models named");
                    }
                }
                else
                {
                    // Unrecognized flags are forwarded to Options.SetOptions below.
                    unusedArgs.Add(args[argIndex++]);
                }
            }
        }
    }
    string[] newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
    LexicalizedParser underlyingParser = null;
    Options options = null;
    LexicalizedParser combinedParser = null;
    if (baseModelPaths != null)
    {
        // Collect the DVModel out of each serialized DVParser.
        IList<DVModel> dvparsers = new List<DVModel>();
        foreach (string baseModelPath in baseModelPaths)
        {
            log.Info("Loading serialized DVParser from " + baseModelPath);
            LexicalizedParser dvparser = ((LexicalizedParser)LexicalizedParser.LoadModel(baseModelPath));
            IReranker reranker = dvparser.reranker;
            if (!(reranker is DVModelReranker))
            {
                throw new ArgumentException("Expected parsers with DVModel embedded");
            }
            dvparsers.Add(((DVModelReranker)reranker).GetModel());
            if (underlyingParser == null)
            {
                // The first base model supplies the underlying parser and options.
                underlyingParser = dvparser;
                options = underlyingParser.GetOp();
                // TODO: other parser's options?
                options.SetOptions(newArgs);
            }
            log.Info("... done");
        }
        combinedParser = LexicalizedParser.CopyLexicalizedParser(underlyingParser);
        CombinedDVModelReranker reranker_1 = new CombinedDVModelReranker(options, dvparsers);
        combinedParser.reranker = reranker_1;
        combinedParser.SaveParserToSerialized(modelPath);
    }
    else
    {
        throw new ArgumentException("Need to specify -model to load an already prepared CombinedParser");
    }
    Treebank testTreebank = null;
    if (testTreebankPath != null)
    {
        log.Info("Reading in trees from " + testTreebankPath);
        if (testTreebankFilter != null)
        {
            log.Info("Filtering on " + testTreebankFilter);
        }
        testTreebank = combinedParser.GetOp().tlpParams.MemoryTreebank();
        testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
        log.Info("Read in " + testTreebank.Count + " trees for testing");
        EvaluateTreebank evaluator = new EvaluateTreebank(combinedParser.GetOp(), null, combinedParser);
        evaluator.TestOnTreebank(testTreebank);
    }
}
/// <summary>
/// An example of a command line is
/// <br />
/// java -mx1g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model /scr/horatio/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached9.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-202
/// <br />
/// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached.train.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -numThreads 6
/// <br />
/// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/chinese/xinhuaPCFG.ser.gz -output cached.xinhua.train.ser.gz -treebank /afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed 026-270,301-499,600-999
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    string parserModel = null;
    string output = null;
    IList<Pair<string, IFileFilter>> treebanks = Generics.NewArrayList();
    int dvKBest = 200;
    int numThreads = 1;
    for (int argIndex = 0; argIndex < args.Length;)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-dvKBest"))
        {
            dvKBest = System.Convert.ToInt32(args[argIndex + 1]);
            argIndex += 2;
            continue;
        }
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parser") || args[argIndex].Equals("-model"))
        {
            parserModel = args[argIndex + 1];
            argIndex += 2;
            continue;
        }
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
        {
            output = args[argIndex + 1];
            argIndex += 2;
            continue;
        }
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
        {
            Pair<string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-treebank");
            argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
            treebanks.Add(treebankDescription);
            continue;
        }
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-numThreads"))
        {
            numThreads = System.Convert.ToInt32(args[argIndex + 1]);
            argIndex += 2;
            continue;
        }
        throw new ArgumentException("Unknown argument " + args[argIndex]);
    }
    // Validate required arguments before doing any expensive work.
    if (parserModel == null)
    {
        throw new ArgumentException("Need to supply a parser model with -model");
    }
    if (output == null)
    {
        throw new ArgumentException("Need to supply an output filename with -output");
    }
    if (treebanks.IsEmpty())
    {
        throw new ArgumentException("Need to supply a treebank with -treebank");
    }
    log.Info("Writing output to " + output);
    log.Info("Loading parser model " + parserModel);
    log.Info("Writing " + dvKBest + " hypothesis trees for each tree");
    // Fixed: the original said int.ToString(dvKBest), which does not compile in
    // C# (no static overload) — a Sharpen artifact of Java's Integer.toString;
    // the instance call dvKBest.ToString() is the correct translation.
    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel, "-dvKBest", dvKBest.ToString()));
    CacheParseHypotheses cacher = new CacheParseHypotheses(parser);
    ITreeTransformer transformer = DVParser.BuildTrainTransformer(parser.GetOp());
    // Load and transform every requested treebank into one list of trees.
    IList<Tree> sentences = new List<Tree>();
    foreach (Pair<string, IFileFilter> description in treebanks)
    {
        log.Info("Reading trees from " + description.first);
        Treebank treebank = parser.GetOp().tlpParams.MemoryTreebank();
        treebank.LoadPath(description.first, description.second);
        treebank = treebank.Transform(transformer);
        Sharpen.Collections.AddAll(sentences, treebank);
    }
    log.Info("Processing " + sentences.Count + " trees");
    IList<Pair<Tree, byte[]>> cache = Generics.NewArrayList();
    // Wrap the transformer so worker threads can share it safely.
    transformer = new SynchronizedTreeTransformer(transformer);
    MulticoreWrapper<Tree, Pair<Tree, byte[]>> wrapper = new MulticoreWrapper<Tree, Pair<Tree, byte[]>>(numThreads, new CacheParseHypotheses.CacheProcessor(cacher, parser, dvKBest, transformer));
    foreach (Tree tree in sentences)
    {
        wrapper.Put(tree);
        // Opportunistically drain finished results while feeding inputs.
        while (wrapper.Peek())
        {
            cache.Add(wrapper.Poll());
            if (cache.Count % 10 == 0)
            {
                System.Console.Out.WriteLine("Processed " + cache.Count + " trees");
            }
        }
    }
    // Wait for the workers to finish, then drain any remaining results.
    wrapper.Join();
    while (wrapper.Peek())
    {
        cache.Add(wrapper.Poll());
        if (cache.Count % 10 == 0)
        {
            System.Console.Out.WriteLine("Processed " + cache.Count + " trees");
        }
    }
    System.Console.Out.WriteLine("Finished processing " + cache.Count + " trees");
    IOUtils.WriteObjectToFile(cache, output);
}
/// <summary>
/// Convenience constructor: evaluates with the parser's own options and
/// lexicon by delegating to the three-argument constructor.
/// </summary>
public EvaluateTreebank(LexicalizedParser parser)
    : this(parser.GetOp(), parser.lex, parser)
{
}
/// <summary>
/// Command line arguments for this program:
/// <br />
/// -output: the model file to output
/// -input: a list of model files to input
/// </summary>
public static void Main(string[] args)
{
    string outputModelFilename = null;
    IList<string> inputModelFilenames = Generics.NewArrayList();
    for (int argIndex = 0; argIndex < args.Length;)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
        {
            outputModelFilename = args[argIndex + 1];
            argIndex += 2;
        }
        else
        {
            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
            {
                // -input consumes every following non-flag token; each token may
                // itself be a comma-separated list of filenames.
                for (++argIndex; argIndex < args.Length && !args[argIndex].StartsWith("-"); ++argIndex)
                {
                    Sharpen.Collections.AddAll(inputModelFilenames, Arrays.AsList(args[argIndex].Split(",")));
                }
            }
            else
            {
                throw new Exception("Unknown argument " + args[argIndex]);
            }
        }
    }
    if (outputModelFilename == null)
    {
        log.Info("Need to specify output model name with -output");
        System.Environment.Exit(2);
    }
    if (inputModelFilenames.Count == 0)
    {
        log.Info("Need to specify input model names with -input");
        System.Environment.Exit(2);
    }
    log.Info("Averaging " + inputModelFilenames);
    log.Info("Outputting result to " + outputModelFilename);
    LexicalizedParser lexparser = null;
    IList<DVModel> models = Generics.NewArrayList();
    foreach (string filename in inputModelFilenames)
    {
        LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(filename));
        if (lexparser == null)
        {
            // Keep the first parser: it supplies Options for the averaged model.
            lexparser = parser;
        }
        models.Add(DVParser.GetModelFromLexicalizedParser(parser));
    }
    // NOTE(review): the second argument to TransformAsList should be a
    // transformer function; the nulls here look like lambdas/method references
    // dropped by the Sharpen translation — as written these would be lists of
    // nulls. Confirm against the original Java source.
    IList<TwoDimensionalMap<string, string, SimpleMatrix>> binaryTransformMaps = CollectionUtils.TransformAsList(models, null);
    IList<TwoDimensionalMap<string, string, SimpleMatrix>> binaryScoreMaps = CollectionUtils.TransformAsList(models, null);
    IList<IDictionary<string, SimpleMatrix>> unaryTransformMaps = CollectionUtils.TransformAsList(models, null);
    IList<IDictionary<string, SimpleMatrix>> unaryScoreMaps = CollectionUtils.TransformAsList(models, null);
    IList<IDictionary<string, SimpleMatrix>> wordMaps = CollectionUtils.TransformAsList(models, null);
    // Element-wise averages of every matrix family across the input models.
    TwoDimensionalMap<string, string, SimpleMatrix> binaryTransformAverages = AverageBinaryMatrices(binaryTransformMaps);
    TwoDimensionalMap<string, string, SimpleMatrix> binaryScoreAverages = AverageBinaryMatrices(binaryScoreMaps);
    IDictionary<string, SimpleMatrix> unaryTransformAverages = AverageUnaryMatrices(unaryTransformMaps);
    IDictionary<string, SimpleMatrix> unaryScoreAverages = AverageUnaryMatrices(unaryScoreMaps);
    IDictionary<string, SimpleMatrix> wordAverages = AverageUnaryMatrices(wordMaps);
    DVModel newModel = new DVModel(binaryTransformAverages, unaryTransformAverages, binaryScoreAverages, unaryScoreAverages, wordAverages, lexparser.GetOp());
    DVParser newParser = new DVParser(newModel, lexparser);
    newParser.SaveModel(outputModelFilename);
}
/// <summary>
/// Wires up the component parsers for one query against the given
/// LexicalizedParser: a PCFG parser (pparser), a dependency parser
/// (dparser), and a combined/factored parser (bparser), each enabled or
/// left null according to the parser's Options.
/// </summary>
internal LexicalizedParserQuery(LexicalizedParser parser)
{
    this.op = parser.GetOp();
    BinaryGrammar bg = parser.bg;
    UnaryGrammar ug = parser.ug;
    ILexicon lex = parser.lex;
    IDependencyGrammar dg = parser.dg;
    IIndex<string> stateIndex = parser.stateIndex;
    // DeltaIndex lets this query add words without mutating the parser's shared word index.
    IIndex<string> wordIndex = new DeltaIndex<string>(parser.wordIndex);
    IIndex<string> tagIndex = parser.tagIndex;
    this.debinarizer = new Debinarizer(op.forceCNF);
    this.boundaryRemover = new BoundaryRemover();
    // PCFG component: iterative-CKY or exhaustive, per test options.
    if (op.doPCFG)
    {
        if (op.testOptions.iterativeCKY)
        {
            pparser = new IterativeCKYPCFGParser(bg, ug, lex, op, stateIndex, wordIndex, tagIndex);
        }
        else
        {
            pparser = new ExhaustivePCFGParser(bg, ug, lex, op, stateIndex, wordIndex, tagIndex);
        }
    }
    else
    {
        pparser = null;
    }
    // Dependency component; skipped when fast-factored parsing is selected.
    if (op.doDep)
    {
        dg.SetLexicon(lex);
        if (!op.testOptions.useFastFactored)
        {
            dparser = new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex);
        }
        else
        {
            dparser = null;
        }
    }
    else
    {
        dparser = null;
    }
    // Combined (factored) component, only when both PCFG and dependency are on.
    if (op.doDep && op.doPCFG)
    {
        if (op.testOptions.useFastFactored)
        {
            MLEDependencyGrammar mledg = (MLEDependencyGrammar)dg;
            int numToFind = 1;
            if (op.testOptions.printFactoredKGood > 0)
            {
                numToFind = op.testOptions.printFactoredKGood;
            }
            bparser = new FastFactoredParser(pparser, mledg, op, numToFind, wordIndex, tagIndex);
        }
        else
        {
            // Combine PCFG and dependency scores for exhaustive factored parsing.
            IScorer scorer = new TwinScorer(pparser, dparser);
            //Scorer scorer = parser;
            if (op.testOptions.useN5)
            {
                bparser = new BiLexPCFGParser.N5BiLexPCFGParser(scorer, pparser, dparser, bg, ug, dg, lex, op, stateIndex, wordIndex, tagIndex);
            }
            else
            {
                bparser = new BiLexPCFGParser(scorer, pparser, dparser, bg, ug, dg, lex, op, stateIndex, wordIndex, tagIndex);
            }
        }
    }
    else
    {
        bparser = null;
    }
    subcategoryStripper = op.tlpParams.SubcategoryStripper();
}
/// <summary>
/// Loads a LexicalizedParser that has a DVModel reranker attached, parses every
/// tree in the given test treebank, writes each sentence, its best parse, and
/// the ROOT node vector to the output file, and logs the nearest-neighbor
/// subtree pairs ranked by Frobenius distance between their node vectors.
/// Usage: -model path -testTreebank path -output path (all required).
/// </summary>
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    string modelPath = null;
    string outputPath = null;
    string testTreebankPath = null;
    IFileFilter testTreebankFilter = null;
    IList<string> unusedArgs = new List<string>();
    for (int argIndex = 0; argIndex < args.Length;)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
        {
            modelPath = args[argIndex + 1];
            argIndex += 2;
        }
        else
        {
            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
            {
                Pair<string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                testTreebankPath = treebankDescription.First();
                testTreebankFilter = treebankDescription.Second();
            }
            else
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                {
                    outputPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    // Unrecognized flags are passed through to LoadModel below.
                    unusedArgs.Add(args[argIndex++]);
                }
            }
        }
    }
    if (modelPath == null)
    {
        throw new ArgumentException("Need to specify -model");
    }
    if (testTreebankPath == null)
    {
        throw new ArgumentException("Need to specify -testTreebank");
    }
    if (outputPath == null)
    {
        throw new ArgumentException("Need to specify -output");
    }
    string[] newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
    LexicalizedParser lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
    // testTreebankPath is guaranteed non-null here (checked above), so the
    // original redundant null guard around this load has been removed.
    log.Info("Reading in trees from " + testTreebankPath);
    if (testTreebankFilter != null)
    {
        log.Info("Filtering on " + testTreebankFilter);
    }
    Treebank testTreebank = lexparser.GetOp().tlpParams.MemoryTreebank();
    testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
    log.Info("Read in " + testTreebank.Count + " trees for testing");
    FileWriter @out = new FileWriter(outputPath);
    BufferedWriter bout = new BufferedWriter(@out);
    log.Info("Parsing " + testTreebank.Count + " trees");
    int count = 0;
    IList<FindNearestNeighbors.ParseRecord> records = Generics.NewArrayList();
    foreach (Tree goldTree in testTreebank)
    {
        IList<Word> tokens = goldTree.YieldWords();
        IParserQuery parserQuery = lexparser.ParserQuery();
        if (!parserQuery.Parse(tokens))
        {
            throw new AssertionError("Could not parse: " + tokens);
        }
        if (!(parserQuery is RerankingParserQuery))
        {
            throw new ArgumentException("Expected a LexicalizedParser with a Reranker attached");
        }
        RerankingParserQuery rpq = (RerankingParserQuery)parserQuery;
        if (!(rpq.RerankerQuery() is DVModelReranker.Query))
        {
            throw new ArgumentException("Expected a LexicalizedParser with a DVModel attached");
        }
        DeepTree tree = ((DVModelReranker.Query)rpq.RerankerQuery()).GetDeepTrees()[0];
        // Find the vector attached to the ROOT node of the deep tree.
        SimpleMatrix rootVector = null;
        foreach (KeyValuePair<Tree, SimpleMatrix> entry in tree.GetVectors())
        {
            if (entry.Key.Label().Value().Equals("ROOT"))
            {
                rootVector = entry.Value;
                break;
            }
        }
        if (rootVector == null)
        {
            throw new AssertionError("Could not find root nodevector");
        }
        // Write through the buffered writer; the original wrote to the raw
        // FileWriter, which left the BufferedWriter it had created unused.
        bout.Write(tokens + "\n");
        bout.Write(tree.GetTree() + "\n");
        for (int i = 0; i < rootVector.GetNumElements(); ++i)
        {
            bout.Write(" " + rootVector.Get(i));
        }
        bout.Write("\n\n\n");
        count++;
        if (count % 10 == 0)
        {
            log.Info(" " + count);
        }
        records.Add(new FindNearestNeighbors.ParseRecord(tokens, goldTree, tree.GetTree(), rootVector, tree.GetVectors()));
    }
    log.Info(" done parsing");
    // Collect all subtrees (up to maxLength leaves) with their node vectors.
    IList<Pair<Tree, SimpleMatrix>> subtrees = Generics.NewArrayList();
    foreach (FindNearestNeighbors.ParseRecord record in records)
    {
        foreach (KeyValuePair<Tree, SimpleMatrix> entry in record.nodeVectors)
        {
            if (entry.Key.GetLeaves().Count <= maxLength)
            {
                subtrees.Add(Pair.MakePair(entry.Key, entry.Value));
            }
        }
    }
    log.Info("There are " + subtrees.Count + " subtrees in the set of trees");
    // For each subtree, keep the 100 closest other subtrees by Frobenius norm
    // of the vector difference. The descending-comparator queue keeps the
    // WORST match at its head, so Poll() evicts the worst when over capacity.
    PriorityQueue<ScoredObject<Pair<Tree, Tree>>> bestmatches = new PriorityQueue<ScoredObject<Pair<Tree, Tree>>>(101, ScoredComparator.DescendingComparator);
    for (int i = 0; i < subtrees.Count; ++i)
    {
        log.Info(subtrees[i].First().YieldWords());
        log.Info(subtrees[i].First());
        for (int j = 0; j < subtrees.Count; ++j)
        {
            if (i == j)
            {
                continue;
            }
            // TODO: look at basic category?
            double normF = subtrees[i].Second().Minus(subtrees[j].Second()).NormF();
            bestmatches.Add(new ScoredObject<Pair<Tree, Tree>>(Pair.MakePair(subtrees[i].First(), subtrees[j].First()), normF));
            if (bestmatches.Count > 100)
            {
                bestmatches.Poll();
            }
        }
        // Drain the queue (worst-first) and reverse so the best match logs first.
        IList<ScoredObject<Pair<Tree, Tree>>> ordered = Generics.NewArrayList();
        while (bestmatches.Count > 0)
        {
            ordered.Add(bestmatches.Poll());
        }
        Java.Util.Collections.Reverse(ordered);
        foreach (ScoredObject<Pair<Tree, Tree>> pair in ordered)
        {
            log.Info(" MATCHED " + pair.Object().second.YieldWords() + " ... " + pair.Object().Second() + " with a score of " + pair.Score());
        }
        log.Info();
        log.Info();
        bestmatches.Clear();
    }
    // Flush the buffer before flushing/closing the underlying FileWriter.
    bout.Flush();
    @out.Flush();
    @out.Close();
}