Beispiel #1
0
        /// <summary>
        /// Runs the multi-pass (coarse-to-fine) hypergraph parser over the test treebank
        /// and prints bracket precision, recall and F1 plus per-stage timings to
        /// standard error.
        /// </summary>
        /// <remarks>
        /// All input paths are hard-coded below. Sentences with more than 20 tokens are
        /// skipped. A sentence whose parse throws is counted in the FAIL total and the
        /// exception is reported on standard error.
        /// </remarks>
        static void EvaluateParser()
        {
            string modelfile = //@"/home/nan/Data/PTB/ptb.s1.smoothed.grammar";
                                @"D:\user\nyang\data\treebank\English\pcfg\ptb.s1.smoothed.grammar";

            string tagmapfile = @"E:\nyang\TFS\Common\Users\v-nayang\LAPCFGParser\en-tag-map.txt";

            string testfile = //@"/home/nan/Data/PTB/xbar/wsj.23.flat";
                            @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.23.flat";

            string trainfile = @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.02-21.flat";

            Vocabulary vocab;
            TagSet tagSet;
            LAPCFGrammar grammar;

            var traintrees = new List<PhrasalTree>();

            LoadTrees(traintrees, trainfile);

            // NOTE(review): the handler is built here but only consumed by the
            // commented-out tag-probability code inside the parse loop below.
            var rwHanlder = new RareWordHandler(traintrees, 10);

            using (var s = new TextModelReader(modelfile))
            {
                vocab = Vocabulary.LoadFromStream(s);
                tagSet = TagSet.LoadFromStream(s);
                grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
            }

            rwHanlder.Build(tagSet, 0.001);

            //grammar.Smoothing(0.1f);

            grammar.MakeCompaction();

            grammar.MakeSubruleCompaction();

            // grammars[last] is the loaded grammar; each earlier entry is the projection
            // of the next one through the corresponding subtag trace.
            var grammars = new LAPCFGrammar[grammar.subtagTraces.Count + 1];

            var traces = new int[grammars.Length][][];

            grammars[grammars.Length - 1] = grammar;

            for (int i = grammars.Length - 1; i >= 1; --i)
            {
                traces[i] = grammar.subtagTraces[i - 1];
                grammars[i - 1] = grammars[i].ProjectGrammar(traces[i]);
                grammars[i - 1].MakeCompaction();
                grammars[i - 1].MakeSubruleCompaction();
            }

            string[][] tagTiers;

            using (StreamReader sr = new StreamReader(tagmapfile))
            {
                // Each non-blank line of the tag map file is one whitespace-separated row.
                var tt = new List<string[]>();
                while (!sr.EndOfStream)
                {
                    string line = sr.ReadLine();

                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    tt.Add(line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries));
                }

                // Transpose the rows: tagTiers[i][j] is column i of row j.
                tagTiers = new string[tt[0].Length][];

                for (int i = 0; i < tagTiers.Length; ++i)
                {
                    tagTiers[i] = new string[tt.Count];
                    for (int j = 0; j < tt.Count; ++j)
                    {
                        tagTiers[i][j] = tt[j][i];
                    }
                }
            }

            // One code book per tier, filled with that tier's tag strings.
            var cbs = new CodeBook32[tagTiers.Length];

            for (int i = 0; i < cbs.Length; ++i)
            {
                cbs[i] = new CodeBook32();

                foreach (var t in tagTiers[i])
                {
                    cbs[i].Add(t);
                }
            }

            int pgcount = cbs.Length - 1;

            int[][] tagMaps = new int[pgcount][];

            for (int i = 0; i < tagMaps.Length; ++i)
            {
                tagMaps[i] = new int[grammars[0].PTCount + 1 + cbs[i + 1].Count];

                // The first PTCount + 1 tag ids map to themselves.
                for (int j = 0; j < grammars[0].PTCount + 1; ++j)
                {
                    tagMaps[i][j] = j;
                }
            }

            // The last map sends the remaining tag ids of the tag set to ids derived
            // from the second-to-last tier.
            var lastMap = tagMaps[tagMaps.Length - 1];

            for (int j = grammars[0].PTCount + 1; j < lastMap.Length; ++j)
            {
                string tstr = tagSet.GetTagString(j);
                int id = cbs[cbs.Length - 1][tstr];
                int pid = cbs[cbs.Length - 2][tagTiers[tagTiers.Length - 2][id]];

                lastMap[j] = pid + grammars[0].PTCount + 1;
            }

            for (int i = 0; i < tagMaps.Length - 1; ++i)
            {
                for (int j = grammars[0].PTCount + 1; j < tagMaps[i].Length; ++j)
                {
                    string tstr = cbs[i + 1][j - grammars[0].PTCount - 1];

                    int xid = Array.IndexOf(tagTiers[i + 1], tstr);

                    string pstr = tagTiers[i][xid];

                    int pid = cbs[i][pstr];

                    // NOTE(review): unlike lastMap above, no PTCount + 1 offset is added
                    // here - confirm this asymmetry is intended.
                    tagMaps[i][j] = pid;
                }
            }

            // Collapsed grammars: the last one is collapsed from grammars[0], every
            // earlier one from its successor.
            var cgrammars = new LAPCFGrammar[tagMaps.Length];

            cgrammars[cgrammars.Length - 1] = grammars[0].CollapseNonTerminals(tagMaps[cgrammars.Length - 1], 1 + cbs[cgrammars.Length - 1].Count);

            for (int i = cgrammars.Length - 1; i >= 1; --i)
            {
                cgrammars[i - 1] = cgrammars[i].CollapseNonTerminals(tagMaps[i - 1], 1 + cbs[i - 1].Count);
            }

            for (int i = 0; i < cgrammars.Length; ++i)
            {
                cgrammars[i].MakeCompaction();
                cgrammars[i].MakeSubruleCompaction();
            }

            var treebank = new List<PhrasalTree>();

            LoadTrees(treebank, testfile);

            foreach (var tree in treebank)
            {
                // Re-tokenize the lexical item of every leaf node.
                foreach (var node in tree.TreeNodes)
                {
                    if (node.Children.Count == 0)
                    {
                        node.Lex = SimpleTokenizor.ETokenize(node.Lex);
                    }
                }

                //PennTreeProcessor.RecoverFromXBarBinarize(tree.Root);

                tree.ComputeStartEnd();
            }

            double ccount = 0; // predicted brackets also present in the gold tree
            double pcount = 0; // predicted brackets
            double gcount = 0; // gold brackets
            int failed = 0;
            int sentcount = 0;
            HyperEdgePool epool = new HyperEdgePool(1024 * 1024);

            HyperVertexPool vpool = new HyperVertexPool(grammars[grammars.Length - 1].subTagCounts.Max());

            //EMorph.EnglishMorph.WarmUp();

            Console.Error.WriteLine("Start to parse...");
            ConsoleTimer tm = new ConsoleTimer(1);

            // Reported at the end as "G-1 Build", "G-1 Pass" and "G0 Build" respectively.
            Stopwatch g0bwatch = new Stopwatch();
            Stopwatch g0watch = new Stopwatch();
            Stopwatch bwatch = new Stopwatch();

            Stopwatch[] gwatch = new Stopwatch[grammars.Length];

            for (int i = 0; i < gwatch.Length; ++i)
            {
                gwatch[i] = new Stopwatch();
            }

            Stopwatch vwatch = new Stopwatch();

            foreach (var tree in treebank)
            {
                var words = tree.GetSentence().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

                if (words.Length > 20)
                {
                    continue;
                }

                sentcount += 1;

                int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();

                // The first word is looked up again with the flag set to true.
                wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);

                // Left unfilled: every entry stays null unless the block below is re-enabled.
                double[][] tprobs = new double[wids.Length][];

                //for (int i = 0; i < wids.Length; ++i)
                //{
                //    tprobs[i] = rwHanlder.GetLogProbs(SimpleTokenizor.ETokenize(words[i]));
                //}

                bool[][] allowedTags = null;
                    //AssignTagConstraints(vocab, tagSet, words, wids);

                try
                {
                    //var parser = new ChartParser(wids);
                    var parser = new ChartHyperGraphParser(wids, allowedTags, tprobs);
                    g0bwatch.Start();
                    parser.BuildHyperGraph(cgrammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
                    g0bwatch.Stop();

                    // Passes over the collapsed grammars, pruning at -15.0 after each.
                    g0watch.Start();
                    parser.SumForward();
                    parser.SumBackward(false);
                    parser.Prune(-15.0);
                    parser.Purge();
                    for (int i = 1; i < cgrammars.Length; ++i)
                    {
                        parser.ExpandHyperGraph(cgrammars[i], tagMaps[i - 1], epool, vpool,
                                                 grammars[grammars.Length - 1].subTagCounts);
                        parser.SumForward();
                        parser.SumBackward(false);
                        parser.Prune(-15.0);
                        parser.Purge();
                    }
                    g0watch.Stop();

                    // Expand from the last collapsed grammar to grammars[0]. This must use
                    // the last tag map (the one cgrammars[last] was collapsed with); the
                    // previous hard-coded index 2 was only right for exactly three maps.
                    bwatch.Start();
                    parser.ExpandHyperGraph(grammars[0], tagMaps[tagMaps.Length - 1], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
                    bwatch.Stop();

                    // Passes over the projected grammars, pruning at -10.0 before moving
                    // on to the next grammar.
                    for (int i = 0; i < grammars.Length - 1; ++i)
                    {
                        gwatch[i].Start();
                        parser.SumForward();
                        parser.SumBackward(false);

                        parser.Prune(-10.0);

                        parser.Purge();

                        parser.ProjectGrammar(traces[i + 1], grammars[i + 1]);
                        gwatch[i].Stop();
                    }

                    // Final pass with the full grammar; no pruning afterwards.
                    gwatch[grammars.Length - 1].Start();
                    parser.SumForward();

                    parser.SumBackward(true);
                    gwatch[grammars.Length - 1].Stop();

                    vwatch.Start();
                    parser.PosteriorViterbi();

                    var ptree = parser.ExtractPosteriorViterbi(words, tagSet);

                    vwatch.Stop();

                    //PennTreeProcessor.RecoverFromXBarBinarize(ptree.Root);

                    ptree.ComputeStartEnd();

                    var pbrackets = ptree.GetBracketsIgnorePunc();
                    var gbrackets = tree.GetBracketsIgnorePunc();

                    gcount += gbrackets.Count;
                    pcount += pbrackets.Count;

                    foreach (var b in pbrackets)
                    {
                        if (gbrackets.Contains(b))
                        {
                            ccount += 1;
                        }
                    }

                    if (pbrackets.Count == 0
                        || (pbrackets.Count < gbrackets.Count / 2))
                    {
                        Console.Error.WriteLine("\nStrange prediction: p={0}, g={1}", pbrackets.Count,
                            gbrackets.Count);
                    }

                    //Console.Error.WriteLine(tree.TextTree);
                }
                catch (Exception ex)
                {
                    // Whichever stage threw may have left its stopwatch running.
                    g0bwatch.Stop();
                    g0watch.Stop();
                    bwatch.Stop();
                    foreach (var w in gwatch)
                    {
                        w.Stop();
                    }
                    vwatch.Stop();
                    failed += 1;

                    // Keep going with the next sentence, but report what went wrong.
                    Console.Error.WriteLine("\nFailure! {0}: {1}", ex.GetType().Name, ex.Message);
                }

                tm.Up();
            }

            tm.Finish();

            Console.Error.WriteLine("SENT: {0}\tFAIL: {1}", sentcount, failed);

            double prec = ccount / pcount;
            double recall = ccount / gcount;

            double f1 = 2.0 * prec * recall / (prec + recall);

            Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);

            Console.Error.WriteLine("G-1 Build:\t{0:F6} s", g0bwatch.Elapsed.TotalSeconds);

            Console.Error.WriteLine("G-1 Pass:\t{0:F6} s", g0watch.Elapsed.TotalSeconds);

            Console.Error.WriteLine("G0 Build:\t{0:F6} s", bwatch.Elapsed.TotalSeconds);

            for (int i = 0; i < gwatch.Length; ++i)
            {
                Console.Error.WriteLine("G{0} Pass:\t{1:F6} s", i, gwatch[i].Elapsed.TotalSeconds);
            }

            Console.Error.WriteLine("Viterbi:\t{0:F6} s", vwatch.Elapsed.TotalSeconds);
        }
Beispiel #2
0
        /// <summary>
        /// Parses the test treebank with the loaded grammar on several worker threads,
        /// writes the predicted and reference parse lines to the two EVALB files and
        /// prints bracket precision, recall and F1 to standard error.
        /// </summary>
        /// <returns>
        /// The corpus-level bracket F1. The value is NaN when no predicted bracket
        /// matched a gold bracket, because the formula then divides zero by zero.
        /// </returns>
        /// <remarks>
        /// All paths are hard-coded below. A sentence whose parse throws is written as
        /// "()" to the output file, counted in the FAIL total, and the exception is
        /// reported on standard error.
        /// </remarks>
        static double EvaluateRawParser()
        {
            string modelfile = //@"/home/nan/Data/PTB/ptb.s1.smoothed.grammar";
                                @"D:\user\nyang\data\treebank\English\pcfg\ptb.s2.smoothed.grammar";

            string testfile = //@"/home/nan/Data/PTB/xbar/wsj.23.flat";
                            @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.23.flat";

            string outputfile = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.flat.out";

            string reffile = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.flat.ref";

            // Only referenced by the commented-out alternative grammar constructor below.
            string bklex = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.lexicon";
            string bkgrammar = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.grammar";

            int nthread = 16;

            Vocabulary vocab;
            TagSet tagSet;
            LAPCFGrammar grammar;

            using (var s = new TextModelReader(modelfile))
            {
                vocab = Vocabulary.LoadFromStream(s);
                tagSet = TagSet.LoadFromStream(s);
                grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
                //grammar = new LAPCFGrammar(tagSet, vocab, bklex, bkgrammar);
            }

            //grammar.Smoothing(0.01, 0.1);

            //grammar.Normalize();

            //grammar.PropMaxUnaryPath();

            grammar.MakeCompaction();

            grammar.MakeSubruleCompaction();

            var treebank = new List<PhrasalTree>();

            LoadTrees(treebank, testfile);

            foreach (var tree in treebank)
            {
                // Re-tokenize the lexical item of every leaf node.
                foreach (var node in tree.TreeNodes)
                {
                    if (node.Children.Count == 0)
                    {
                        node.Lex = SimpleTokenizor.ETokenize(node.Lex);
                    }
                }

                //PennTreeProcessor.RecoverFromXBarBinarize(tree.Root);

                tree.ComputeStartEnd();
            }

            // Keep only trees whose root End is at most 20
            // (presumably the sentence length in tokens - TODO confirm).
            treebank = treebank.Where(x => x.Root.End <= 20).ToList();

            double ccount = 0; // predicted brackets also present in the gold tree
            double pcount = 0; // predicted brackets
            double gcount = 0; // gold brackets
            int failed = 0;
            int sentcount = treebank.Count;

            Console.Error.WriteLine("Start to parse...");
            ConsoleTimer tm = new ConsoleTimer(1);

            // Slot i receives the predicted tree for treebank[i]; it stays null on failure.
            PhrasalTree[] ptrees = new PhrasalTree[treebank.Count];

            Parallel.For(0, nthread, thrID =>
            {
                // Each worker creates its own pools and handles trees
                // thrID, thrID + nthread, thrID + 2 * nthread, ...
                HyperEdgePool epool = new HyperEdgePool();

                HyperVertexPool vpool = new HyperVertexPool(grammar.subTagCounts.Max());
                for (int treeId = thrID; treeId < treebank.Count; treeId += nthread)
                {
                    var tree = treebank[treeId];
                    var words = tree.GetSentence().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

                    int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();

                    // The first word is looked up again with the flag set to true.
                    wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);

                    // Left unfilled: every entry stays null.
                    double[][] tprobs = new double[wids.Length][];

                    bool[][] allowedTags = null;
                    //AssignTagConstraints(vocab, tagSet, words, wids);

                    try
                    {
                        //var parser = new ChartParser(wids);
                        var parser = new ChartHyperGraphParser(wids, allowedTags, tprobs);
                        parser.BuildHyperGraph(grammar, epool, vpool, grammar.subTagCounts);
                        parser.SumForward();
                        parser.SumBackward(true);
                        parser.PosteriorViterbi();

                        var ptree = parser.ExtractPosteriorViterbi(words, tagSet);

                        //parser.MaxForward();

                        //var ptree = parser.ExtractViterbi(words, tagSet);

                        ptree.ComputeStartEnd();

                        ptrees[treeId] = ptree;
                    }
                    catch (Exception ex)
                    {
                        // Best effort: the slot stays null so the sentence is reported as
                        // a failure below, but the cause is no longer discarded.
                        Console.Error.WriteLine("\nFailure on tree {0}: {1}: {2}", treeId, ex.GetType().Name, ex.Message);
                    }
                }
            });

            using (StreamWriter sw = new StreamWriter(outputfile))
            using (StreamWriter swref = new StreamWriter(reffile))
            {
                for (int treeid = 0; treeid < treebank.Count; ++treeid)
                {
                    var tree = treebank[treeid];
                    var ptree = ptrees[treeid];

                    // The reference line is always written so both files stay aligned.
                    swref.WriteLine(tree.GetParseLine());

                    if (ptree == null)
                    {
                        failed += 1;
                        sw.WriteLine("()");
                        continue;
                    }

                    var pbrackets = ptree.GetBracketsIgnorePunc();
                    var gbrackets = tree.GetBracketsIgnorePunc();

                    gcount += gbrackets.Count;
                    pcount += pbrackets.Count;

                    foreach (var b in pbrackets)
                    {
                        if (gbrackets.Contains(b))
                        {
                            ccount += 1;
                        }
                    }

                    if (pbrackets.Count == 0
                        || (pbrackets.Count < gbrackets.Count / 2))
                    {
                        Console.Error.WriteLine("\nStrange prediction: p={0}, g={1}", pbrackets.Count,
                            gbrackets.Count);
                    }

                    sw.WriteLine(ptree.GetParseLine());
                }
            }

            tm.Finish();

            Console.Error.WriteLine("SENT: {0}\tFAIL: {1}", sentcount, failed);

            double prec = ccount / pcount;
            double recall = ccount / gcount;

            double f1 = 2.0 * prec * recall / (prec + recall);

            Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);

            return f1;
        }
Beispiel #3
0
        /// <summary>
        /// Trims an assembly down to the closure of the APIs listed in a code model
        /// file and writes the rewritten assembly (and its pdb, when one exists next to
        /// the input) to the output path.
        /// </summary>
        /// <param name="args">Command-line arguments; parsed into the static option fields of <c>Program</c>.</param>
        /// <exception cref="UsageException">
        /// Thrown when the arguments cannot be parsed, the input cannot be loaded as an
        /// assembly, or the code model file does not exist.
        /// </exception>
        public static void RunBclRewriter(string[] args)
        {
            #region Parse the command-line arguments.
            if (!Parser.ParseArgumentsWithUsage(args, typeof(Program)))
            {
                throw new UsageException();
            }
            #endregion

            #region Figure out paths
            s_assemblyName = Path.GetFullPath(s_assemblyName); // this has to be specified

            string outputBaseName = null;
            if (!String.IsNullOrEmpty(s_output))
            {
                s_output       = Path.GetFullPath(s_output);
                outputBaseName = Path.GetFileNameWithoutExtension(s_output);
            }
            else
            {
                // No explicit output: write "<name>.small<ext>" into the current directory.
                s_output       = Path.Combine(Directory.GetCurrentDirectory(), Path.GetFileNameWithoutExtension(s_assemblyName) + ".small" + Path.GetExtension(s_assemblyName));

                // s_assemblyName is a full path at this point. Use only its base name;
                // otherwise the comparison against the assembly identity below can never
                // match and the assembly would be renamed to a file path.
                outputBaseName = Path.GetFileNameWithoutExtension(s_assemblyName);
            }

            string pdbSourceFile = Path.ChangeExtension(s_assemblyName, "pdb");
            string outputPdb     = Path.ChangeExtension(s_output, "pdb");
            string outputFolder  = Path.GetDirectoryName(s_output);

            // if the user wants to do an in-place rewrite, we copy the file to a temp file
            if (s_output == s_assemblyName)
            {
                String tempPath    = s_assemblyName + TempExtension;
                String tempPdbPath = pdbSourceFile + TempExtension;

                File.Copy(s_assemblyName, tempPath, true);
                s_assemblyName = tempPath;

                if (File.Exists(pdbSourceFile))
                {
                    File.Copy(pdbSourceFile, tempPdbPath, true);
                    pdbSourceFile = tempPdbPath;
                }
            }

            if (!Directory.Exists(outputFolder))
            {
                Directory.CreateDirectory(outputFolder);
            }

            #endregion

            #region Load input files
            HostEnvironment host = new HostEnvironment(new NameTable(), s_assemblyDependencyPaths, s_referencedAssemblies);

            IAssembly /*?*/ assembly = host.LoadUnitFrom(s_assemblyName) as IAssembly;
            // TODO: Handle multimodule assemblies
            if (assembly == null || assembly == Dummy.Assembly)
            {
                throw new UsageException(args[0] + " is not a PE file containing a CLR assembly, or an error occurred when loading it.");
            }

            if (!File.Exists(s_includeListFile))
            {
                throw new UsageException(String.Format("ERROR: Can't find code model file '{0}'", s_includeListFile));
            }

            ThinModel model = new ThinModel(new ThinnerOptions(host, new AssemblyIdentity[] { assembly.AssemblyIdentity }));
            model.LoadModel(s_includeListFile, new ModelReaderOptions(s_platform, s_architecture, s_flavor, s_treatFxInternalAsPublic, s_defines));
            #endregion

            #region Calculate api closure.
            ConsoleTimer.StartTimer("Calculating api closure");
            model.LoadMetadataFrom(assembly);

            ThinModel apiClosure = model.CalculateApiClosure();
            if (s_keepTempFiles)
            {
                apiClosure.SaveModel(Path.ChangeExtension(s_output, ".apiClosure.xml"));
            }
            ConsoleTimer.EndTimer("Calculating api closure");
            #endregion

            #region Calculate impl closure.
            ConsoleTimer.StartTimer("Calculating implementation closure");
            apiClosure.LoadMetadataFrom(assembly);

            ThinModel implClosure = apiClosure.CalculateImplementationClosure(true, FieldOptions.KeepAll);

            if (s_keepTempFiles)
            {
                implClosure.SaveModel(Path.ChangeExtension(s_output, ".implClosure.xml"));
            }
            ConsoleTimer.EndTimer("Calculating implementation closure");
            #endregion

            #region Trim.
            ConsoleTimer.StartTimer("Trimming assembly");
            IncludeSet includeSet = new IncludeSet();
            includeSet.LoadFrom(implClosure);

            // The trimmer rewrites a deep copy of the loaded assembly.
            var      copier         = new MetadataDeepCopier(host);
            Assembly copiedAssembly = copier.Copy(assembly);

            Trimmer trimmer = new Trimmer(includeSet, true, false, true, host, s_removeSerializable);
            trimmer.RewriteChildren(copiedAssembly);
            Assembly mutableAssembly = copiedAssembly;
            assembly = mutableAssembly;

            ConsoleTimer.EndTimer("Trimming assembly");
            #endregion

            #region Update assembly name.
            ConsoleTimer.StartTimer("Updating assembly name");

            // If the output assembly name is different, update the internal assembly name to match.
            AssemblyIdentity originalAssemblyIdentity = mutableAssembly.AssemblyIdentity;
            if (!outputBaseName.Equals(originalAssemblyIdentity.Name.ToString(), StringComparison.OrdinalIgnoreCase))
            {
                mutableAssembly.Name       = host.NameTable.GetNameFor(outputBaseName);
                mutableAssembly.ModuleName = mutableAssembly.Name;
            }

            // If we changed the assembly identity, update references to it.
            if (!mutableAssembly.AssemblyIdentity.Equals(originalAssemblyIdentity))
            {
                trimmer.UpdateAssemblyReferences(originalAssemblyIdentity, mutableAssembly.AssemblyIdentity);
            }

            ConsoleTimer.EndTimer("Updating assembly name");
            #endregion

            #region Write out the assembly
            ConsoleTimer.StartTimer("Writing assembly");
            Stream pdbStream = null;
            PdbReader pdbReader = null;
            PdbWriter pdbWriter = null;
            FileStream file = null;

            // Everything that opens a file lives inside the try so that a failure at any
            // step still releases whatever was opened before it.
            try
            {
                if (File.Exists(pdbSourceFile))
                {
                    pdbStream = File.OpenRead(pdbSourceFile);
                    pdbReader = new PdbReader(pdbStream, host);
                    pdbWriter = new PdbWriter(outputPdb, pdbReader);
                    Console.WriteLine("Writing pdb: {0}", outputPdb);
                }

                Console.WriteLine("Writing assembly: {0}", s_output);
                file = File.Create(s_output);

                PeWriter.WritePeToStream(assembly, host, file, pdbReader, pdbReader, pdbWriter);
            }
            finally
            {
                if (file != null)
                {
                    file.Dispose();
                }

                if (pdbWriter != null)
                {
                    pdbWriter.Dispose();
                }

                // Close the source pdb last; the reader was constructed over this stream.
                if (pdbStream != null)
                {
                    pdbStream.Dispose();
                }
            }

            ConsoleTimer.EndTimer("Writing assembly");
            #endregion
        }