/// <summary>
/// Computes the top parses for <paramref name="tree"/> and returns the tree paired with
/// the byte encoding of those parses produced by <c>cacher.ConvertToBytes</c>.
/// </summary>
/// <remarks>
/// Before returning, the encoding is decoded again and compared against the parses after
/// <c>cacher.treeBasicCategories</c> and <c>cacher.treeFilter</c> have been applied to them;
/// any mismatch raises an <c>AssertionError</c>.
/// </remarks>
/// <param name="tree">The base tree to generate parse hypotheses for.</param>
/// <returns>A pair holding the input tree and the encoded top parses.</returns>
public virtual Pair <Tree, byte[]> Process(Tree tree)
{
    IList <Tree> topParses = DVParser.GetTopParsesForOneTree(parser, dvKBest, tree, transformer);

    // Round-trip self-test: encode, decode, and compare with the transformed + filtered parses.
    IList <Tree> converted = CacheParseHypotheses.ConvertToTrees(cacher.ConvertToBytes(topParses));
    IList <Tree> basicCategoryTrees = CollectionUtils.TransformAsList(topParses, cacher.treeBasicCategories);
    IList <Tree> simplified = CollectionUtils.FilterAsList(basicCategoryTrees, cacher.treeFilter);

    int filteredOut = topParses.Count - simplified.Count;
    if (filteredOut != 0)
    {
        log.Info("Filtered " + filteredOut + " trees");
        if (simplified.Count == 0)
        {
            log.Info(" WARNING: filtered all trees for " + tree);
        }
    }

    bool roundTripMatches = simplified.Equals(converted);
    if (!roundTripMatches)
    {
        if (converted.Count != simplified.Count)
        {
            throw new AssertionError("horrible error: tree sizes not equal, " + converted.Count + " vs " + simplified.Count);
        }

        string separator = "=============================";
        for (int treeIndex = 0; treeIndex < converted.Count; ++treeIndex)
        {
            if (simplified[treeIndex].Equals(converted[treeIndex]))
            {
                continue;
            }
            // Print both versions of the first differing tree before failing.
            System.Console.Out.WriteLine(separator);
            System.Console.Out.WriteLine(simplified[treeIndex]);
            System.Console.Out.WriteLine(separator);
            System.Console.Out.WriteLine(converted[treeIndex]);
            System.Console.Out.WriteLine(separator);
            throw new AssertionError("horrible error: tree " + treeIndex + " not equal for base tree " + tree);
        }
    }

    byte[] encodedParses = cacher.ConvertToBytes(topParses);
    return Pair.MakePair(tree, encodedParses);
}
/// <summary>
/// Parses every sentence of an input file and writes one <c>sentenceN.txt</c> file per
/// sentence (N counted from 1) into the output directory. Each file contains the sentence
/// text, the tree of the first deep tree returned by the reranker, the word vector of each
/// word, and the tree matrices written by <c>OutputTreeMatrices</c>.
/// </summary>
/// <remarks>
/// Recognized flags (case-insensitive): <c>-model</c>, <c>-output</c>, <c>-input</c> and
/// <c>-testTreebank</c>. All other arguments are forwarded to
/// <c>LexicalizedParser.LoadModel</c>. The output directory must not exist yet.
/// </remarks>
/// <param name="args">Command line arguments.</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.ArgumentException">
/// A flag is missing its value, <c>-output</c> was not supplied, or the loaded parser does
/// not produce the expected reranking query types.
/// </exception>
public static void Main(string[] args)
{
    string modelPath = null;
    string outputPath = null;
    string inputPath = null;
    // NOTE(review): the test treebank values are parsed but not used further in this method.
    string testTreebankPath = null;
    IFileFilter testTreebankFilter = null;
    IList <string> unusedArgs = Generics.NewArrayList();

    for (int argIndex = 0; argIndex < args.Length;)
    {
        string flag = args[argIndex];
        bool isModel = Sharpen.Runtime.EqualsIgnoreCase(flag, "-model");
        bool isOutput = Sharpen.Runtime.EqualsIgnoreCase(flag, "-output");
        bool isInput = Sharpen.Runtime.EqualsIgnoreCase(flag, "-input");

        if (isModel || isOutput || isInput)
        {
            // A trailing flag without a value used to index past the end of args.
            if (argIndex + 1 >= args.Length)
            {
                throw new ArgumentException("Missing value for argument " + flag);
            }
            string value = args[argIndex + 1];
            if (isModel)
            {
                modelPath = value;
            }
            else if (isOutput)
            {
                outputPath = value;
            }
            else
            {
                inputPath = value;
            }
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-testTreebank"))
        {
            Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
            argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
            testTreebankPath = treebankDescription.First();
            testTreebankFilter = treebankDescription.Second();
        }
        else
        {
            unusedArgs.Add(args[argIndex++]);
        }
    }

    // Fail with a clear message before the model is loaded instead of passing null to File.
    if (outputPath == null)
    {
        throw new ArgumentException("Need to supply an output directory with -output");
    }

    string[] newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
    DVModel model = DVParser.GetModelFromLexicalizedParser(parser);

    File outputFile = new File(outputPath);
    FileSystem.CheckNotExistsOrFail(outputFile);
    FileSystem.MkdirOrFail(outputFile);

    if (inputPath == null)
    {
        return;
    }

    int count = 0;
    Reader input = new BufferedReader(new FileReader(inputPath));
    try
    {
        DocumentPreprocessor processor = new DocumentPreprocessor(input);
        foreach (IList <IHasWord> sentence in processor)
        {
            count++; // index from 1

            IParserQuery pq = parser.ParserQuery();
            if (!(pq is RerankingParserQuery))
            {
                throw new ArgumentException("Expected a RerankingParserQuery");
            }
            RerankingParserQuery rpq = (RerankingParserQuery)pq;
            if (!rpq.Parse(sentence))
            {
                throw new Exception("Unparsable sentence: " + sentence);
            }

            IRerankerQuery reranker = rpq.RerankerQuery();
            if (!(reranker is DVModelReranker.Query))
            {
                throw new ArgumentException("Expected a DVModelReranker");
            }
            DeepTree deepTree = ((DVModelReranker.Query)reranker).GetDeepTrees()[0];
            IdentityHashMap <Tree, SimpleMatrix> vectors = deepTree.GetVectors();
            foreach (KeyValuePair <Tree, SimpleMatrix> entry in vectors)
            {
                log.Info(entry.Key + " " + entry.Value);
            }

            FileWriter fout = new FileWriter(outputPath + File.separator + "sentence" + count + ".txt");
            try
            {
                BufferedWriter bout = new BufferedWriter(fout);
                bout.Write(SentenceUtils.ListToString(sentence));
                bout.NewLine();
                bout.Write(deepTree.GetTree().ToString());
                bout.NewLine();
                foreach (IHasWord word in sentence)
                {
                    OutputMatrix(bout, model.GetWordVector(word.Word()));
                }
                Tree rootTree = FindRootTree(vectors);
                OutputTreeMatrices(bout, rootTree, vectors);
                bout.Flush();
            }
            finally
            {
                // Release the file handle even when writing a sentence fails part-way.
                fout.Close();
            }
        }
    }
    finally
    {
        // The input reader was previously never closed.
        input.Close();
    }
}
/// <summary>
/// Reads trees from one or more treebanks, computes the top <c>dvKBest</c> parse hypotheses
/// for each tree using <c>CacheParseHypotheses.CacheProcessor</c>, and writes the resulting
/// list of (tree, encoded hypotheses) pairs to the output file.
/// <br />
/// An example of a command line is
/// <br />
/// java -mx1g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model /scr/horatio/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached9.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-202
/// <br />
/// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached.train.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -numThreads 6
/// <br />
/// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/chinese/xinhuaPCFG.ser.gz -output cached.xinhua.train.ser.gz -treebank /afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed 026-270,301-499,600-999
/// </summary>
/// <remarks>
/// Recognized flags (all case-insensitive): <c>-dvKBest</c> (default 200), <c>-parser</c> or
/// <c>-model</c>, <c>-output</c>, <c>-treebank</c> (may be repeated) and <c>-numThreads</c>
/// (default 1).
/// </remarks>
/// <param name="args">Command line arguments.</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.ArgumentException">
/// An argument is unknown, a flag is missing its value, or a required flag was not supplied.
/// </exception>
public static void Main(string[] args)
{
    string parserModel = null;
    string output = null;
    IList <Pair <string, IFileFilter> > treebanks = Generics.NewArrayList();
    int dvKBest = 200;
    int numThreads = 1;

    for (int argIndex = 0; argIndex < args.Length;)
    {
        string flag = args[argIndex];

        if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-treebank"))
        {
            Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-treebank");
            argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
            treebanks.Add(treebankDescription);
            continue;
        }

        bool isKBest = Sharpen.Runtime.EqualsIgnoreCase(flag, "-dvKBest");
        // "-model" used to be matched case-sensitively, unlike every other flag.
        bool isModel = Sharpen.Runtime.EqualsIgnoreCase(flag, "-parser") || Sharpen.Runtime.EqualsIgnoreCase(flag, "-model");
        bool isOutput = Sharpen.Runtime.EqualsIgnoreCase(flag, "-output");
        bool isThreads = Sharpen.Runtime.EqualsIgnoreCase(flag, "-numThreads");

        if (!(isKBest || isModel || isOutput || isThreads))
        {
            throw new ArgumentException("Unknown argument " + flag);
        }
        // A trailing flag without a value used to index past the end of args.
        if (argIndex + 1 >= args.Length)
        {
            throw new ArgumentException("Missing value for argument " + flag);
        }

        string value = args[argIndex + 1];
        if (isKBest)
        {
            dvKBest = System.Convert.ToInt32(value);
        }
        else if (isModel)
        {
            parserModel = value;
        }
        else if (isOutput)
        {
            output = value;
        }
        else
        {
            numThreads = System.Convert.ToInt32(value);
        }
        argIndex += 2;
    }

    if (parserModel == null)
    {
        throw new ArgumentException("Need to supply a parser model with -model");
    }
    if (output == null)
    {
        throw new ArgumentException("Need to supply an output filename with -output");
    }
    if (treebanks.Count == 0)
    {
        throw new ArgumentException("Need to supply a treebank with -treebank");
    }

    log.Info("Writing output to " + output);
    log.Info("Loading parser model " + parserModel);
    log.Info("Writing " + dvKBest + " hypothesis trees for each tree");

    // System.Convert.ToString replaces the invalid static-style call int.ToString(dvKBest).
    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel, "-dvKBest", System.Convert.ToString(dvKBest)));
    CacheParseHypotheses cacher = new CacheParseHypotheses(parser);
    ITreeTransformer transformer = DVParser.BuildTrainTransformer(parser.GetOp());

    IList <Tree> sentences = new List <Tree>();
    foreach (Pair <string, IFileFilter> description in treebanks)
    {
        log.Info("Reading trees from " + description.first);
        Treebank treebank = parser.GetOp().tlpParams.MemoryTreebank();
        treebank.LoadPath(description.first, description.second);
        treebank = treebank.Transform(transformer);
        Sharpen.Collections.AddAll(sentences, treebank);
    }
    log.Info("Processing " + sentences.Count + " trees");

    IList <Pair <Tree, byte[]> > cache = Generics.NewArrayList();
    // The transformer is shared by the worker threads from here on, so wrap it.
    transformer = new SynchronizedTreeTransformer(transformer);
    MulticoreWrapper <Tree, Pair <Tree, byte[]> > wrapper = new MulticoreWrapper <Tree, Pair <Tree, byte[]> >(numThreads, new CacheParseHypotheses.CacheProcessor(cacher, parser, dvKBest, transformer));

    foreach (Tree tree in sentences)
    {
        wrapper.Put(tree);
        CollectFinishedResults(wrapper, cache);
    }
    wrapper.Join();
    // Pick up whatever was still in flight when the last tree was submitted.
    CollectFinishedResults(wrapper, cache);

    System.Console.Out.WriteLine("Finished processing " + cache.Count + " trees");
    IOUtils.WriteObjectToFile(cache, output);
}

/// <summary>
/// Moves every result currently available from <paramref name="wrapper"/> into
/// <paramref name="cache"/>, printing a progress line after every tenth tree.
/// </summary>
private static void CollectFinishedResults(MulticoreWrapper <Tree, Pair <Tree, byte[]> > wrapper, IList <Pair <Tree, byte[]> > cache)
{
    while (wrapper.Peek())
    {
        cache.Add(wrapper.Poll());
        if (cache.Count % 10 == 0)
        {
            System.Console.Out.WriteLine("Processed " + cache.Count + " trees");
        }
    }
}
/// <summary>
/// Loads several parser models, averages their matrices with <c>AverageBinaryMatrices</c>
/// and <c>AverageUnaryMatrices</c>, and saves a new parser built from the averages.
/// <br />
/// Command line arguments for this program:
/// <br />
/// -output: the model file to output
/// <br />
/// -input: a list of model files to input; each following argument up to the next one
/// starting with "-" is split on commas
/// </summary>
/// <remarks>
/// The parser options and the base parser of the result are taken from the first input model.
/// The process exits with status 2 when <c>-output</c> or <c>-input</c> is missing.
/// </remarks>
/// <param name="args">Command line arguments.</param>
/// <exception cref="System.ArgumentException">
/// An argument is unknown or <c>-output</c> is missing its value.
/// </exception>
public static void Main(string[] args)
{
    string outputModelFilename = null;
    IList <string> inputModelFilenames = Generics.NewArrayList();

    for (int argIndex = 0; argIndex < args.Length;)
    {
        string flag = args[argIndex];
        if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-output"))
        {
            // A trailing "-output" without a value used to index past the end of args.
            if (argIndex + 1 >= args.Length)
            {
                throw new ArgumentException("Missing value for argument " + flag);
            }
            outputModelFilename = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-input"))
        {
            for (++argIndex; argIndex < args.Length && !args[argIndex].StartsWith("-"); ++argIndex)
            {
                Sharpen.Collections.AddAll(inputModelFilenames, Arrays.AsList(args[argIndex].Split(",")));
            }
        }
        else
        {
            // ArgumentException is still an Exception, so existing catch blocks keep working.
            throw new ArgumentException("Unknown argument " + flag);
        }
    }

    if (outputModelFilename == null)
    {
        log.Info("Need to specify output model name with -output");
        System.Environment.Exit(2);
    }
    if (inputModelFilenames.Count == 0)
    {
        log.Info("Need to specify input model names with -input");
        System.Environment.Exit(2);
    }

    log.Info("Averaging " + inputModelFilenames);
    log.Info("Outputting result to " + outputModelFilename);

    LexicalizedParser lexparser = null;
    IList <DVModel> models = Generics.NewArrayList();
    foreach (string filename in inputModelFilenames)
    {
        LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(filename));
        if (lexparser == null)
        {
            // Remember the first parser; it supplies the options and base parser for the result.
            lexparser = parser;
        }
        models.Add(DVParser.GetModelFromLexicalizedParser(parser));
    }

    // NOTE(review): the previous code passed null as the transform function to
    // CollectionUtils.TransformAsList for all five lists, presumably lambdas lost in code
    // conversion. The fields collected below are matched to the DVModel constructor
    // argument order used further down - confirm against DVModel.
    IList <TwoDimensionalMap <string, string, SimpleMatrix> > binaryTransformMaps = Generics.NewArrayList();
    IList <TwoDimensionalMap <string, string, SimpleMatrix> > binaryScoreMaps = Generics.NewArrayList();
    IList <IDictionary <string, SimpleMatrix> > unaryTransformMaps = Generics.NewArrayList();
    IList <IDictionary <string, SimpleMatrix> > unaryScoreMaps = Generics.NewArrayList();
    IList <IDictionary <string, SimpleMatrix> > wordMaps = Generics.NewArrayList();
    foreach (DVModel model in models)
    {
        binaryTransformMaps.Add(model.binaryTransform);
        binaryScoreMaps.Add(model.binaryScore);
        unaryTransformMaps.Add(model.unaryTransform);
        unaryScoreMaps.Add(model.unaryScore);
        wordMaps.Add(model.wordVectors);
    }

    TwoDimensionalMap <string, string, SimpleMatrix> binaryTransformAverages = AverageBinaryMatrices(binaryTransformMaps);
    TwoDimensionalMap <string, string, SimpleMatrix> binaryScoreAverages = AverageBinaryMatrices(binaryScoreMaps);
    IDictionary <string, SimpleMatrix> unaryTransformAverages = AverageUnaryMatrices(unaryTransformMaps);
    IDictionary <string, SimpleMatrix> unaryScoreAverages = AverageUnaryMatrices(unaryScoreMaps);
    IDictionary <string, SimpleMatrix> wordAverages = AverageUnaryMatrices(wordMaps);

    DVModel newModel = new DVModel(binaryTransformAverages, unaryTransformAverages, binaryScoreAverages, unaryScoreAverages, wordAverages, lexparser.GetOp());
    DVParser newParser = new DVParser(newModel, lexparser);
    newParser.SaveModel(outputModelFilename);
}
/// <summary>
/// Loads a parser model and dumps its matrices as text files into a new output directory:
/// one file per entry under <c>binaryW</c>, <c>binaryScore</c>, <c>unaryW</c> and
/// <c>unaryScore</c>, plus a single <c>embeddings.txt</c> with one word vector per line.
/// </summary>
/// <remarks>
/// Recognized flags (case-insensitive): <c>-model</c> and <c>-output</c>. Both are required;
/// on any usage error <c>Help()</c> is called and the method stops. The output directory
/// must not exist yet.
/// </remarks>
/// <param name="args">Command line arguments.</param>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    string modelPath = null;
    string outputDir = null;

    for (int argIndex = 0; argIndex < args.Length;)
    {
        string flag = args[argIndex];
        bool isModel = Sharpen.Runtime.EqualsIgnoreCase(flag, "-model");
        bool isOutput = Sharpen.Runtime.EqualsIgnoreCase(flag, "-output");

        if (!isModel && !isOutput)
        {
            log.Info("Unknown argument " + flag);
            Help();
            // NOTE(review): Help() presumably terminates the process - confirm. Returning here
            // keeps this loop from spinning forever on the same argument if it does not.
            return;
        }
        if (argIndex + 1 >= args.Length)
        {
            // A trailing flag without a value used to index past the end of args.
            log.Info("Missing value for argument " + flag);
            Help();
            return;
        }

        if (isModel)
        {
            modelPath = args[argIndex + 1];
        }
        else
        {
            outputDir = args[argIndex + 1];
        }
        argIndex += 2;
    }

    if (outputDir == null || modelPath == null)
    {
        Help();
        // Do not fall through to File/LoadModel with a null path if Help() returns.
        return;
    }

    File outputFile = new File(outputDir);
    FileSystem.CheckNotExistsOrFail(outputFile);
    FileSystem.MkdirOrFail(outputFile);

    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath));
    DVModel model = DVParser.GetModelFromLexicalizedParser(parser);

    // Binary transform matrices: one file per (first key, second key) pair.
    string binaryWDir = outputDir + File.separator + "binaryW";
    FileSystem.MkdirOrFail(binaryWDir);
    foreach (TwoDimensionalMap.Entry <string, string, SimpleMatrix> entry in model.binaryTransform)
    {
        string filename = binaryWDir + File.separator + entry.GetFirstKey() + "_" + entry.GetSecondKey() + ".txt";
        DumpMatrix(filename, entry.GetValue());
    }

    // Binary score matrices, same file naming scheme.
    string binaryScoreDir = outputDir + File.separator + "binaryScore";
    FileSystem.MkdirOrFail(binaryScoreDir);
    foreach (TwoDimensionalMap.Entry <string, string, SimpleMatrix> entry_1 in model.binaryScore)
    {
        string filename = binaryScoreDir + File.separator + entry_1.GetFirstKey() + "_" + entry_1.GetSecondKey() + ".txt";
        DumpMatrix(filename, entry_1.GetValue());
    }

    // Unary transform matrices: one file per key.
    string unaryWDir = outputDir + File.separator + "unaryW";
    FileSystem.MkdirOrFail(unaryWDir);
    foreach (KeyValuePair <string, SimpleMatrix> entry_2 in model.unaryTransform)
    {
        string filename = unaryWDir + File.separator + entry_2.Key + ".txt";
        DumpMatrix(filename, entry_2.Value);
    }

    // Unary score matrices: one file per key.
    string unaryScoreDir = outputDir + File.separator + "unaryScore";
    FileSystem.MkdirOrFail(unaryScoreDir);
    foreach (KeyValuePair <string, SimpleMatrix> entry_3 in model.unaryScore)
    {
        string filename = unaryScoreDir + File.separator + entry_3.Key + ".txt";
        DumpMatrix(filename, entry_3.Value);
    }

    // Word vectors: "<word> v0 v1 ..." per line, reading column 0 of each vector.
    string embeddingFile = outputDir + File.separator + "embeddings.txt";
    FileWriter fout = new FileWriter(embeddingFile);
    try
    {
        BufferedWriter bout = new BufferedWriter(fout);
        foreach (KeyValuePair <string, SimpleMatrix> entry_4 in model.wordVectors)
        {
            bout.Write(entry_4.Key);
            SimpleMatrix vector = entry_4.Value;
            for (int i = 0; i < vector.NumRows(); ++i)
            {
                bout.Write(" " + vector.Get(i, 0));
            }
            bout.Write("\n");
        }
        bout.Close();
    }
    finally
    {
        // Release the file handle even when writing the embeddings fails part-way.
        fout.Close();
    }
}