コード例 #1
0
ファイル: PhraseBox.cs プロジェクト: nyanyanya/la-pcfg
        /// <summary>
        /// Collects the phrase boxes of <paramref name="tree"/> by delegating to the
        /// recursive overload starting at the tree root, then orders them by
        /// <c>horizontalLvl</c> in ascending order.
        /// </summary>
        /// <param name="tree">Tree whose <c>Root</c> is handed to the recursive overload.</param>
        /// <returns>The collected boxes, sorted by <c>horizontalLvl</c>.</returns>
        public static List<PhraseBox> GetPhraseBoxes(PhrasalTree tree)
        {
            var boxes = new List<PhraseBox>();

            // Both level counters start at zero; the horizontal one is passed by
            // reference so the recursive overload can update it.
            int horizontalLevel = 0;
            int verticalLevel = 0;
            GetPhraseBoxes(boxes, null, tree.Root, ref horizontalLevel, verticalLevel);

            boxes.Sort((left, right) => left.horizontalLvl.CompareTo(right.horizontalLvl));
            return boxes;
        }
コード例 #2
0
ファイル: Program.cs プロジェクト: nyanyanya/la-pcfg
        /// <summary>
        /// Loads a grammar from a hard-coded model file, parses every test tree whose
        /// root <c>End</c> is at most 20, writes the predicted and reference parse
        /// lines to the hard-coded output files, and prints bracket precision, recall
        /// and F1 to standard error.
        /// </summary>
        /// <remarks>
        /// A sentence whose parse throws (or yields no tree) is written as <c>()</c>
        /// in the output file and counted in the <c>FAIL</c> total; the cause is
        /// logged to standard error instead of being silently discarded.
        /// </remarks>
        /// <returns>
        /// Corpus-level F1 computed from the accumulated bracket counts. The value is
        /// NaN when the counts make a denominator zero (for example, no brackets at all).
        /// </returns>
        static double EvaluateRawParser()
        {
            string modelfile = //@"/home/nan/Data/PTB/ptb.s1.smoothed.grammar";
                                @"D:\user\nyang\data\treebank\English\pcfg\ptb.s2.smoothed.grammar";

            string testfile = //@"/home/nan/Data/PTB/xbar/wsj.23.flat";
                            @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.23.flat";

            string outputfile = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.flat.out";

            string reffile = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.flat.ref";

            // Only referenced by the commented-out alternative grammar constructor below.
            string bklex = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.lexicon";
            string bkgrammar = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.grammar";

            int nthread = 16;

            Vocabulary vocab;
            TagSet tagSet;
            LAPCFGrammar grammar;

            // The model file stores vocabulary, tag set and grammar back to back.
            using (var s = new TextModelReader(modelfile))
            {
                vocab = Vocabulary.LoadFromStream(s);
                tagSet = TagSet.LoadFromStream(s);
                grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
                //grammar = new LAPCFGrammar(tagSet, vocab, bklex, bkgrammar);
            }

            //grammar.Smoothing(0.01, 0.1);

            //grammar.Normalize();

            //grammar.PropMaxUnaryPath();

            grammar.MakeCompaction();

            grammar.MakeSubruleCompaction();

            var treebank = new List<PhrasalTree>();

            LoadTrees(treebank, testfile);

            foreach (var tree in treebank)
            {
                // Normalize the lexical item of every leaf (node without children).
                foreach (var node in tree.TreeNodes)
                {
                    if (node.Children.Count == 0)
                    {
                        node.Lex = SimpleTokenizor.ETokenize(node.Lex);
                    }
                }

                //PennTreeProcessor.RecoverFromXBarBinarize(tree.Root);

                tree.ComputeStartEnd();
            }

            // Keep only trees whose root span ends at or before 20.
            treebank = treebank.Where(x => x.Root.End <= 20).ToList();

            double ccount = 0;
            double pcount = 0;
            double gcount = 0;
            int failed = 0;
            int sentcount = treebank.Count;

            Console.Error.WriteLine("Start to parse...");
            ConsoleTimer tm = new ConsoleTimer(1);

            // Slot i holds the prediction for treebank[i]; it stays null when parsing failed.
            PhrasalTree[] ptrees = new PhrasalTree[treebank.Count];

            // Each worker owns its pools and handles the sentences thrID, thrID + nthread, ...
            // so no two workers write the same ptrees slot.
            Parallel.For(0, nthread, thrID =>
            {
                HyperEdgePool epool = new HyperEdgePool();

                HyperVertexPool vpool = new HyperVertexPool(grammar.subTagCounts.Max());
                for (int treeId = thrID; treeId < treebank.Count; treeId += nthread)
                {
                    var tree = treebank[treeId];
                    var words = tree.GetSentence().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

                    int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();

                    // The first word is looked up again with the flag set to true.
                    wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);

                    // NOTE(review): tags is not used by the parser below; kept because
                    // GetPOS() is opaque from here — confirm it can be dropped.
                    string[] tags = tree.GetPOS().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

                    // Per-word rows are left null (the code filling them is commented out).
                    double[][] tprobs = new double[wids.Length][];

                    //for (int i = 0; i < wids.Length; ++i)
                    //{
                    //    tprobs[i] = rwHanlder.GetLogProbs(SimpleTokenizor.ETokenize(words[i]));
                    //}

                    bool[][] allowedTags = null;
                    //AssignTagConstraints(vocab, tagSet, words, wids);

                    try
                    {
                        //var parser = new ChartParser(wids);
                        var parser = new ChartHyperGraphParser(wids, allowedTags, tprobs);
                        parser.BuildHyperGraph(grammar, epool, vpool, grammar.subTagCounts);
                        parser.SumForward();
                        parser.SumBackward(true);
                        parser.PosteriorViterbi();

                        var ptree = parser.ExtractPosteriorViterbi(words, tagSet);

                        //parser.MaxForward();

                        //var ptree = parser.ExtractViterbi(words, tagSet);

                        ptree.ComputeStartEnd();

                        ptrees[treeId] = ptree;

                    }
                    catch (Exception ex)
                    {
                        // Best effort: the sentence is reported as failed later because its
                        // ptrees slot stays null. Log the cause instead of hiding it.
                        Console.Error.WriteLine("\nFailed to parse sentence {0}: {1}: {2}",
                            treeId, ex.GetType().Name, ex.Message);
                    }
                }
            });

            using (StreamWriter sw = new StreamWriter(outputfile))
            {
                using (StreamWriter swref = new StreamWriter(reffile))
                {
                    for (int treeid = 0; treeid < treebank.Count; ++treeid)
                    {
                        var tree = treebank[treeid];
                        var ptree = ptrees[treeid];

                        // The reference line is written for every sentence, parsed or not.
                        swref.WriteLine(tree.GetParseLine());

                        if (ptree == null)
                        {
                            failed += 1;
                            sw.WriteLine("()");
                            continue;
                        }

                        var pbrackets = ptree.GetBracketsIgnorePunc();
                        var gbrackets = tree.GetBracketsIgnorePunc();

                        gcount += gbrackets.Count;
                        pcount += pbrackets.Count;

                        // A predicted bracket is correct when the gold set contains it.
                        foreach (var b in pbrackets)
                        {
                            if (gbrackets.Contains(b))
                            {
                                ccount += 1;
                            }
                        }

                        if (pbrackets.Count == 0
                            || (pbrackets.Count < gbrackets.Count / 2))
                        {
                            Console.Error.WriteLine("\nStrange prediction: p={0}, g={1}", pbrackets.Count,
                                gbrackets.Count);
                        }

                        sw.WriteLine(ptree.GetParseLine());
                    }
                }
            }

            tm.Finish();

            Console.Error.WriteLine("SENT: {0}\tFAIL: {1}", sentcount, failed);

            double prec = ccount / pcount;
            double recall = ccount / gcount;

            double f1 = 2.0 * prec * recall / (prec + recall);

            Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);

            return f1;
        }
コード例 #3
0
ファイル: Program.cs プロジェクト: nyanyanya/la-pcfg
        /// <summary>
        /// Reads two hard-coded tree files in lockstep and writes one combined tree per
        /// line pair. Every node of the second tree gets the tag of the same-span node
        /// of the first tree (after unary rules are removed from it) appended to its
        /// own tag; a leading '@' on the appended tag is stripped and replaced by a
        /// trailing "x". The combined root tag is then forced to "S".
        /// </summary>
        /// <remarks>
        /// Processing stops as soon as either input is exhausted. The dictionary
        /// indexer throws <see cref="KeyNotFoundException"/> when a span of the second
        /// tree has no counterpart in the first.
        /// </remarks>
        static void CombineLabel()
        {
            string parsefilename = @"D:\user\nyang\tmp\train.pcfg.clean.txt";
            string safilename = @"D:\user\nyang\tmp\train.c.txt";
            string combinefilename = @"D:\user\nyang\tmp\train.cb.txt";

            using (StreamReader parseReader = new StreamReader(parsefilename))
            using (StreamReader labelReader = new StreamReader(safilename))
            using (StreamWriter combinedWriter = new StreamWriter(combinefilename))
            {
                while (!(parseReader.EndOfStream || labelReader.EndOfStream))
                {
                    string parseText = parseReader.ReadLine();
                    string labelText = labelReader.ReadLine();

                    PhrasalTree parseTree = new PhrasalTree(parseText);
                    parseTree.RemoveUnaryRule();
                    PhrasalTree labelTree = new PhrasalTree(labelText);

                    // A span (Start, End) is packed into one int key: Start * stride + End.
                    int stride = parseTree.Root.End + 1;

                    var nodeBySpan = new Dictionary<int, PhrasalNode>();

                    foreach (var parseNode in parseTree.TreeNodes)
                    {
                        nodeBySpan[parseNode.Start * stride + parseNode.End] = parseNode;
                    }

                    foreach (var labelNode in labelTree.TreeNodes)
                    {
                        PhrasalNode match = nodeBySpan[labelNode.Start * stride + labelNode.End];

                        var suffix = match.Tag;

                        // Tags starting with '@' lose the marker and gain a trailing "x".
                        if (suffix[0] == '@')
                        {
                            suffix = suffix.Substring(1) + "x";
                        }

                        //ptag = ptag.Substring(0, 1);

                        labelNode.Tag = labelNode.Tag + suffix;
                    }

                    labelTree.Root.Tag = "S";

                    combinedWriter.WriteLine(labelTree.GetParseLine());
                }
            }
        }
コード例 #4
0
ファイル: Program.cs プロジェクト: nyanyanya/la-pcfg
        /// <summary>
        /// Converts a hard-coded parser output file into a second hard-coded file:
        /// every trimmed line that starts with "(" but not with "()" is wrapped in a
        /// "(TOP ...)" node, its leaves are re-tokenized with
        /// <c>SimpleTokenizor.ETokenize</c>, and the resulting parse line is written.
        /// All other lines are skipped.
        /// </summary>
        static void ProcessBKOutput()
        {
            string inputfn = @"D:\user\nyang\tools\bkparser\wsj.23.s1.splitting.viterbi.out";
            string outputfn = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.s1.bk.out";

            using (StreamReader reader = new StreamReader(inputfn))
            using (StreamWriter writer = new StreamWriter(outputfn))
            {
                while (!reader.EndOfStream)
                {
                    string text = reader.ReadLine().Trim();

                    // Skip blank lines, non-tree lines, and empty "()" parses.
                    if (string.IsNullOrWhiteSpace(text)
                        || !text.StartsWith("(")
                        || text.StartsWith("()"))
                    {
                        continue;
                    }

                    var wrapped = new PhrasalTree("(TOP " + text + ")");

                    foreach (var current in wrapped.TreeNodes)
                    {
                        // Only leaves (nodes without children) carry a lexical item to normalize.
                        if (current.Children.Count != 0)
                        {
                            continue;
                        }

                        current.Lex = SimpleTokenizor.ETokenize(current.Lex);
                    }

                    writer.WriteLine(wrapped.GetParseLine());
                }
            }
        }
コード例 #5
0
ファイル: Program.cs プロジェクト: nyanyanya/la-pcfg
        /// <summary>
        /// Reads <paramref name="file"/> line by line and appends one
        /// <c>PhrasalTree</c> per non-blank line to <paramref name="trees"/>.
        /// </summary>
        /// <param name="trees">List that receives the parsed trees, in file order.</param>
        /// <param name="file">Path of the text file to read.</param>
        static void LoadTrees(List<PhrasalTree> trees, string file)
        {
            using (StreamReader reader = new StreamReader(file))
            {
                string text;

                // ReadLine returns null once the end of the stream is reached.
                while ((text = reader.ReadLine()) != null)
                {
                    if (!string.IsNullOrWhiteSpace(text))
                    {
                        trees.Add(new PhrasalTree(text));
                    }
                }
            }
        }
コード例 #6
0
        /// <summary>
        /// Builds a <c>PhrasalTree</c> from the chart cell at
        /// <c>[0, wids.Length - 1]</c>, starting at the first vertex in that cell
        /// whose tag equals <c>tagSet.ROOTID</c>.
        /// </summary>
        /// <param name="words">Words passed to <c>AnnotateLex</c> for the resulting tree.</param>
        /// <param name="tagSet">Supplies the root tag id and is forwarded to <c>ExtractViterbiParse</c>.</param>
        /// <returns>The extracted tree, or <c>null</c> when the cell has no vertex array.</returns>
        /// <exception cref="Exception">
        /// No vertex with the root tag exists in the cell, or its type is <c>VTYPE.DEAD</c>.
        /// </exception>
        public PhrasalTree ExtractViterbi(string[] words, TagSet tagSet)
        {
            var topCell = chart[0, wids.Length - 1];
            if (topCell.l2v == null)
            {
                return null;
            }

            var rootTag = tagSet.ROOTID;

            // Pick the first vertex carrying the root tag, if any.
            HyperVertex rootVertex = null;//rootCell.l2v [0];
            foreach (var candidate in topCell.l2v)
            {
                if (candidate.tag == rootTag)
                {
                    rootVertex = candidate;
                    break;
                }
            }

            bool usable = rootVertex != null && rootVertex.TYPE != VTYPE.DEAD;
            if (!usable)
            {
                throw new Exception("node is pruned!");
            }

            PhrasalTree result = new PhrasalTree(ExtractViterbiParse(rootVertex, 0, tagSet));
            result.ComputeStartEnd();
            AnnotateLex(words, result.Root);
            return result;
        }
コード例 #7
0
ファイル: HyperGraphParser.cs プロジェクト: nyanyanya/la-pcfg
 /// <summary>
 /// Creates a <c>HyperGraph</c> for <paramref name="tree"/>: allocates its
 /// <c>lbuf</c> buffer with one more slot than the largest value in
 /// <c>rules.subTagCounts</c>, builds the graph from the tree root via the
 /// recursive overload, and stores the resulting root vertex.
 /// </summary>
 /// <param name="tree">Tree whose <c>Root</c> is handed to the recursive overload.</param>
 /// <returns>The populated graph with <c>Root</c> set.</returns>
 public HyperGraph BuildHyperGraph(PhrasalTree tree)
 {
     HyperGraph graph = new HyperGraph();

     // Buffer length is the largest sub-tag count plus one.
     graph.lbuf = new double[rules.subTagCounts.Max() + 1];

     HyperVertex rootVertex;
     BuildHyperGraph(tree.Root, graph, out rootVertex);
     graph.Root = rootVertex;

     return graph;
 }