Ejemplo n.º 1
0
            public virtual Pair <Tree, byte[]> Process(Tree tree)
            {
                IList <Tree> topParses = DVParser.GetTopParsesForOneTree(parser, dvKBest, tree, transformer);
                // this block is a test to make sure the conversion code is working...
                IList <Tree> converted  = CacheParseHypotheses.ConvertToTrees(cacher.ConvertToBytes(topParses));
                IList <Tree> simplified = CollectionUtils.TransformAsList(topParses, cacher.treeBasicCategories);

                simplified = CollectionUtils.FilterAsList(simplified, cacher.treeFilter);
                if (simplified.Count != topParses.Count)
                {
                    log.Info("Filtered " + (topParses.Count - simplified.Count) + " trees");
                    if (simplified.Count == 0)
                    {
                        log.Info(" WARNING: filtered all trees for " + tree);
                    }
                }
                if (!simplified.Equals(converted))
                {
                    if (converted.Count != simplified.Count)
                    {
                        throw new AssertionError("horrible error: tree sizes not equal, " + converted.Count + " vs " + simplified.Count);
                    }
                    for (int i = 0; i < converted.Count; ++i)
                    {
                        if (!simplified[i].Equals(converted[i]))
                        {
                            System.Console.Out.WriteLine("=============================");
                            System.Console.Out.WriteLine(simplified[i]);
                            System.Console.Out.WriteLine("=============================");
                            System.Console.Out.WriteLine(converted[i]);
                            System.Console.Out.WriteLine("=============================");
                            throw new AssertionError("horrible error: tree " + i + " not equal for base tree " + tree);
                        }
                    }
                }
                return(Pair.MakePair(tree, cacher.ConvertToBytes(topParses)));
            }
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string         modelPath          = null;
            string         outputPath         = null;
            string         inputPath          = null;
            string         testTreebankPath   = null;
            IFileFilter    testTreebankFilter = null;
            IList <string> unusedArgs         = Generics.NewArrayList();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
                {
                    modelPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                    {
                        outputPath = args[argIndex + 1];
                        argIndex  += 2;
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
                        {
                            inputPath = args[argIndex + 1];
                            argIndex += 2;
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                            {
                                Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                                argIndex           = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                                testTreebankPath   = treebankDescription.First();
                                testTreebankFilter = treebankDescription.Second();
                            }
                            else
                            {
                                unusedArgs.Add(args[argIndex++]);
                            }
                        }
                    }
                }
            }
            string[]          newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            LexicalizedParser parser  = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
            DVModel           model   = DVParser.GetModelFromLexicalizedParser(parser);
            File outputFile           = new File(outputPath);

            FileSystem.CheckNotExistsOrFail(outputFile);
            FileSystem.MkdirOrFail(outputFile);
            int count = 0;

            if (inputPath != null)
            {
                Reader input = new BufferedReader(new FileReader(inputPath));
                DocumentPreprocessor processor = new DocumentPreprocessor(input);
                foreach (IList <IHasWord> sentence in processor)
                {
                    count++;
                    // index from 1
                    IParserQuery pq = parser.ParserQuery();
                    if (!(pq is RerankingParserQuery))
                    {
                        throw new ArgumentException("Expected a RerankingParserQuery");
                    }
                    RerankingParserQuery rpq = (RerankingParserQuery)pq;
                    if (!rpq.Parse(sentence))
                    {
                        throw new Exception("Unparsable sentence: " + sentence);
                    }
                    IRerankerQuery reranker = rpq.RerankerQuery();
                    if (!(reranker is DVModelReranker.Query))
                    {
                        throw new ArgumentException("Expected a DVModelReranker");
                    }
                    DeepTree deepTree = ((DVModelReranker.Query)reranker).GetDeepTrees()[0];
                    IdentityHashMap <Tree, SimpleMatrix> vectors = deepTree.GetVectors();
                    foreach (KeyValuePair <Tree, SimpleMatrix> entry in vectors)
                    {
                        log.Info(entry.Key + "   " + entry.Value);
                    }
                    FileWriter     fout = new FileWriter(outputPath + File.separator + "sentence" + count + ".txt");
                    BufferedWriter bout = new BufferedWriter(fout);
                    bout.Write(SentenceUtils.ListToString(sentence));
                    bout.NewLine();
                    bout.Write(deepTree.GetTree().ToString());
                    bout.NewLine();
                    foreach (IHasWord word in sentence)
                    {
                        OutputMatrix(bout, model.GetWordVector(word.Word()));
                    }
                    Tree rootTree = FindRootTree(vectors);
                    OutputTreeMatrices(bout, rootTree, vectors);
                    bout.Flush();
                    fout.Close();
                }
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// An example of a command line is
        /// <br />
        /// java -mx1g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model /scr/horatio/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached9.simple.ser.gz  -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-202
        /// <br />
        /// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached.train.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -numThreads 6
        /// <br />
        /// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/chinese/xinhuaPCFG.ser.gz -output cached.xinhua.train.ser.gz -treebank /afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed  026-270,301-499,600-999
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string parserModel = null;
            string output      = null;
            IList <Pair <string, IFileFilter> > treebanks = Generics.NewArrayList();
            int dvKBest    = 200;
            int numThreads = 1;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-dvKBest"))
                {
                    dvKBest   = System.Convert.ToInt32(args[argIndex + 1]);
                    argIndex += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parser") || args[argIndex].Equals("-model"))
                {
                    parserModel = args[argIndex + 1];
                    argIndex   += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                {
                    output    = args[argIndex + 1];
                    argIndex += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                {
                    Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-treebank");
                    argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                    treebanks.Add(treebankDescription);
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-numThreads"))
                {
                    numThreads = System.Convert.ToInt32(args[argIndex + 1]);
                    argIndex  += 2;
                    continue;
                }
                throw new ArgumentException("Unknown argument " + args[argIndex]);
            }
            if (parserModel == null)
            {
                throw new ArgumentException("Need to supply a parser model with -model");
            }
            if (output == null)
            {
                throw new ArgumentException("Need to supply an output filename with -output");
            }
            if (treebanks.IsEmpty())
            {
                throw new ArgumentException("Need to supply a treebank with -treebank");
            }
            log.Info("Writing output to " + output);
            log.Info("Loading parser model " + parserModel);
            log.Info("Writing " + dvKBest + " hypothesis trees for each tree");
            LexicalizedParser    parser      = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel, "-dvKBest", int.ToString(dvKBest)));
            CacheParseHypotheses cacher      = new CacheParseHypotheses(parser);
            ITreeTransformer     transformer = DVParser.BuildTrainTransformer(parser.GetOp());
            IList <Tree>         sentences   = new List <Tree>();

            foreach (Pair <string, IFileFilter> description in treebanks)
            {
                log.Info("Reading trees from " + description.first);
                Treebank treebank = parser.GetOp().tlpParams.MemoryTreebank();
                treebank.LoadPath(description.first, description.second);
                treebank = treebank.Transform(transformer);
                Sharpen.Collections.AddAll(sentences, treebank);
            }
            log.Info("Processing " + sentences.Count + " trees");
            IList <Pair <Tree, byte[]> > cache = Generics.NewArrayList();

            transformer = new SynchronizedTreeTransformer(transformer);
            MulticoreWrapper <Tree, Pair <Tree, byte[]> > wrapper = new MulticoreWrapper <Tree, Pair <Tree, byte[]> >(numThreads, new CacheParseHypotheses.CacheProcessor(cacher, parser, dvKBest, transformer));

            foreach (Tree tree in sentences)
            {
                wrapper.Put(tree);
                while (wrapper.Peek())
                {
                    cache.Add(wrapper.Poll());
                    if (cache.Count % 10 == 0)
                    {
                        System.Console.Out.WriteLine("Processed " + cache.Count + " trees");
                    }
                }
            }
            wrapper.Join();
            while (wrapper.Peek())
            {
                cache.Add(wrapper.Poll());
                if (cache.Count % 10 == 0)
                {
                    System.Console.Out.WriteLine("Processed " + cache.Count + " trees");
                }
            }
            System.Console.Out.WriteLine("Finished processing " + cache.Count + " trees");
            IOUtils.WriteObjectToFile(cache, output);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Command line arguments for this program:
        /// <br />
        /// -output: the model file to output
        /// -input: a list of model files to input
        /// </summary>
        public static void Main(string[] args)
        {
            string         outputModelFilename = null;
            IList <string> inputModelFilenames = Generics.NewArrayList();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                {
                    outputModelFilename = args[argIndex + 1];
                    argIndex           += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
                    {
                        for (++argIndex; argIndex < args.Length && !args[argIndex].StartsWith("-"); ++argIndex)
                        {
                            Sharpen.Collections.AddAll(inputModelFilenames, Arrays.AsList(args[argIndex].Split(",")));
                        }
                    }
                    else
                    {
                        throw new Exception("Unknown argument " + args[argIndex]);
                    }
                }
            }
            if (outputModelFilename == null)
            {
                log.Info("Need to specify output model name with -output");
                System.Environment.Exit(2);
            }
            if (inputModelFilenames.Count == 0)
            {
                log.Info("Need to specify input model names with -input");
                System.Environment.Exit(2);
            }
            log.Info("Averaging " + inputModelFilenames);
            log.Info("Outputting result to " + outputModelFilename);
            LexicalizedParser lexparser = null;
            IList <DVModel>   models    = Generics.NewArrayList();

            foreach (string filename in inputModelFilenames)
            {
                LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(filename));
                if (lexparser == null)
                {
                    lexparser = parser;
                }
                models.Add(DVParser.GetModelFromLexicalizedParser(parser));
            }
            IList <TwoDimensionalMap <string, string, SimpleMatrix> > binaryTransformMaps = CollectionUtils.TransformAsList(models, null);
            IList <TwoDimensionalMap <string, string, SimpleMatrix> > binaryScoreMaps     = CollectionUtils.TransformAsList(models, null);
            IList <IDictionary <string, SimpleMatrix> >      unaryTransformMaps           = CollectionUtils.TransformAsList(models, null);
            IList <IDictionary <string, SimpleMatrix> >      unaryScoreMaps          = CollectionUtils.TransformAsList(models, null);
            IList <IDictionary <string, SimpleMatrix> >      wordMaps                = CollectionUtils.TransformAsList(models, null);
            TwoDimensionalMap <string, string, SimpleMatrix> binaryTransformAverages = AverageBinaryMatrices(binaryTransformMaps);
            TwoDimensionalMap <string, string, SimpleMatrix> binaryScoreAverages     = AverageBinaryMatrices(binaryScoreMaps);
            IDictionary <string, SimpleMatrix> unaryTransformAverages                = AverageUnaryMatrices(unaryTransformMaps);
            IDictionary <string, SimpleMatrix> unaryScoreAverages = AverageUnaryMatrices(unaryScoreMaps);
            IDictionary <string, SimpleMatrix> wordAverages       = AverageUnaryMatrices(wordMaps);
            DVModel  newModel  = new DVModel(binaryTransformAverages, unaryTransformAverages, binaryScoreAverages, unaryScoreAverages, wordAverages, lexparser.GetOp());
            DVParser newParser = new DVParser(newModel, lexparser);

            newParser.SaveModel(outputModelFilename);
        }
Ejemplo n.º 5
0
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string modelPath = null;
            string outputDir = null;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
                {
                    modelPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                    {
                        outputDir = args[argIndex + 1];
                        argIndex += 2;
                    }
                    else
                    {
                        log.Info("Unknown argument " + args[argIndex]);
                        Help();
                    }
                }
            }
            if (outputDir == null || modelPath == null)
            {
                Help();
            }
            File outputFile = new File(outputDir);

            FileSystem.CheckNotExistsOrFail(outputFile);
            FileSystem.MkdirOrFail(outputFile);
            LexicalizedParser parser     = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath));
            DVModel           model      = DVParser.GetModelFromLexicalizedParser(parser);
            string            binaryWDir = outputDir + File.separator + "binaryW";

            FileSystem.MkdirOrFail(binaryWDir);
            foreach (TwoDimensionalMap.Entry <string, string, SimpleMatrix> entry in model.binaryTransform)
            {
                string filename = binaryWDir + File.separator + entry.GetFirstKey() + "_" + entry.GetSecondKey() + ".txt";
                DumpMatrix(filename, entry.GetValue());
            }
            string binaryScoreDir = outputDir + File.separator + "binaryScore";

            FileSystem.MkdirOrFail(binaryScoreDir);
            foreach (TwoDimensionalMap.Entry <string, string, SimpleMatrix> entry_1 in model.binaryScore)
            {
                string filename = binaryScoreDir + File.separator + entry_1.GetFirstKey() + "_" + entry_1.GetSecondKey() + ".txt";
                DumpMatrix(filename, entry_1.GetValue());
            }
            string unaryWDir = outputDir + File.separator + "unaryW";

            FileSystem.MkdirOrFail(unaryWDir);
            foreach (KeyValuePair <string, SimpleMatrix> entry_2 in model.unaryTransform)
            {
                string filename = unaryWDir + File.separator + entry_2.Key + ".txt";
                DumpMatrix(filename, entry_2.Value);
            }
            string unaryScoreDir = outputDir + File.separator + "unaryScore";

            FileSystem.MkdirOrFail(unaryScoreDir);
            foreach (KeyValuePair <string, SimpleMatrix> entry_3 in model.unaryScore)
            {
                string filename = unaryScoreDir + File.separator + entry_3.Key + ".txt";
                DumpMatrix(filename, entry_3.Value);
            }
            string         embeddingFile = outputDir + File.separator + "embeddings.txt";
            FileWriter     fout          = new FileWriter(embeddingFile);
            BufferedWriter bout          = new BufferedWriter(fout);

            foreach (KeyValuePair <string, SimpleMatrix> entry_4 in model.wordVectors)
            {
                bout.Write(entry_4.Key);
                SimpleMatrix vector = entry_4.Value;
                for (int i = 0; i < vector.NumRows(); ++i)
                {
                    bout.Write("  " + vector.Get(i, 0));
                }
                bout.Write("\n");
            }
            bout.Close();
            fout.Close();
        }