Example #1
        public void BuildHyperGraph(LAPCFGrammar grammar, HyperEdgePool epool, HyperVertexPool vpool, int[] tagCapacity = null)
        {
            epool.Recycle ();
            vpool.Recycle ();
            this.ROOTID = grammar.ROOTID;

            var maxSubTag = grammar.subTagCounts.Max ();

            lbuf = new double[maxSubTag + 1];

            // CYK
            for (int i = 0; i < wids.Length; ++i) {
                bool isRoot = i == 0 && i == wids.Length - 1;
                chart [i, i] = new HyperCell (i, i + 1, grammar.TotalTagCount);
                MatchLexicon (grammar, chart [i, i], wids [i], epool, vpool, tagCapacity,
                    allowedPoSTags == null ? null : allowedPoSTags[i],
                    rawTagProbs == null ? null : rawTagProbs[i], isRoot);

                MatchUnaryRules (grammar, chart [i, i], epool, vpool, tagCapacity, isRoot);
                chart [i, i].Finish ();
            }

            for (int spanL = 2; spanL <= wids.Length; ++spanL) {
                for (int beg = 0; beg + spanL <= wids.Length; ++beg) {
                    int end = beg + spanL;
                    int l = beg;
                    int r = end - 1;
                    bool isRoot = l == 0 && r == wids.Length - 1;
                    chart [l, r] = new HyperCell (beg, end, grammar.TotalTagCount);
                    for (int mid = l; mid < r; ++mid) {
                        MatchBinaryRules (grammar, chart [l, r], chart [l, mid], chart [mid + 1, r], epool, vpool, tagCapacity, isRoot);
                    }

                    for (int i = 0; i < chart[l, r].l1v.Length; ++i) {
                        var c = chart [l, r].l1v [i];

                        if (c != null) {
                            if (isRoot && c.tag != ROOTID) {
                                continue;
                            }
                            chart [l, r].l2v [i] = vpool.Allocate (false, c.tag, c.beta.Length, c.beta.v.Length);
                            epool.Allocate (chart [l, r].l2v [i], c);
                        }

                    }
                    MatchUnaryRules (grammar, chart [l, r], epool, vpool, tagCapacity, isRoot);
                    chart [l, r].Finish ();
                }
            }
        }
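
For reference, the driver in Example #4 below shows the full call sequence around BuildHyperGraph; a minimal sketch of that sequence, with the pool sizes and pruning threshold copied from that driver rather than dictated by this method, looks like this:

            // Hypothetical driver sketch: allocate pools once, build the chart, then run
            // the forward/backward (inside/outside) passes and prune low-posterior vertices.
            var epool = new HyperEdgePool(1024 * 1024);
            var vpool = new HyperVertexPool(grammar.subTagCounts.Max());
            var parser = new ChartHyperGraphParser(wids, allowedTags);
            parser.BuildHyperGraph(grammar, epool, vpool, grammar.subTagCounts);
            parser.SumForward();        // forward (inside) pass
            parser.SumBackward(false);  // backward (outside) pass, no Viterbi bookkeeping yet
            parser.Prune(-10.0);        // log-posterior threshold used in Example #4
            parser.Purge();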
Example #2
        private static void CreateMergeMapping(LAPCFGrammar rules, List<MergeHelper> mergeCands, out int[][] subtagMap, out bool[][] isMerged, out int[] newSubTagCounts)
        {
            subtagMap = new int[rules.TotalTagCount][];

            isMerged = new bool[rules.TotalTagCount][];

            for (int i = 0; i < subtagMap.Length; ++i) {
                subtagMap [i] = new int[rules.GetSubTagCount (i)];

                isMerged [i] = new bool[rules.GetSubTagCount (i)];

                for (int j = 0; j < subtagMap[i].Length; ++j) {
                    subtagMap [i] [j] = j;
                }
            }

            newSubTagCounts = new int[rules.TotalTagCount];

            for (int i = 0; i < newSubTagCounts.Length; ++i) {
                newSubTagCounts [i] = rules.GetSubTagCount (i);
            }

            for (int i = 0; i < mergeCands.Count / 2; ++i) {
                var cand = mergeCands [i];

                var t = cand.tag;

                var xt = cand.subtag;

                var lt = xt * 2;

                isMerged [t] [lt] = true;
                isMerged [t] [lt + 1] = true;
                newSubTagCounts [t] -= 1;

                for (int subt = lt + 1; subt < subtagMap[t].Length; ++subt) {
                    subtagMap [t] [subt] -= 1;
                }
            }
        }
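
To make the index bookkeeping above concrete, here is a self-contained walk-through with hypothetical numbers (one tag with four subtags, merging the first split pair); it mirrors the loop body exactly:

            // One tag, 4 subtags, merge candidate pair xt = 0 (old subtags 0 and 1).
            int[] subtagMap = { 0, 1, 2, 3 };
            bool[] isMerged = new bool[4];
            int newCount = 4;

            int xt = 0;
            int lt = xt * 2;
            isMerged[lt] = true;
            isMerged[lt + 1] = true;
            newCount -= 1;
            for (int subt = lt + 1; subt < subtagMap.Length; ++subt)
            {
                subtagMap[subt] -= 1;
            }
            // subtagMap is now { 0, 0, 1, 2 }: old subtags 0 and 1 both map to new
            // subtag 0, and newCount is 3.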
Example #3
        public LAPCFGrammar SplitSymbols(Random RNG, double randomness)
        {
            int[] newSubTagCounts = new int[subTagCounts.Length];

            for (int tid = 0; tid < newSubTagCounts.Length; ++tid)
            {
                if (tid == ROOTID)
                {
                    newSubTagCounts [tid] = subTagCounts [tid];
                } else
                {
                    newSubTagCounts [tid] = subTagCounts [tid] * 2;
                }
            }

            var newbRules = brules.Select(
                x => x == null ? null : x.Select(
                    y => y == null ? null : y.Select(
                        z => z == null ? null : z.SplitSymbols(subTagCounts, newSubTagCounts, RNG, randomness)
            ).ToArray()
            ).ToArray()
            ).ToArray();

            var newuRules = urules.Select(
                x => x == null ? null : x.Select(
                    y => y == null ? null : y.SplitSymbols(subTagCounts, newSubTagCounts, RNG, randomness)
            ).ToArray()
            ).ToArray();

            var newtRules = trules.Select(
                x => x == null ? null : x.Select(
                    y => y == null ? null : y.SplitSymbols(subTagCounts, newSubTagCounts, RNG, randomness)
            ).ToArray()
            ).ToArray();

            var newTable = new LAPCFGrammar();
            newTable.NTCount = NTCount;
            newTable.PTCount = PTCount;
            newTable.ROOTID = ROOTID;
            newTable.brules = newbRules;
            newTable.urules = newuRules;
            newTable.trules = newtRules;
            newTable.subTagCounts = newSubTagCounts;

            newTable.InitializeExpectedCounts();

            foreach (var trace in subtagTraces)
            {
                newTable.subtagTraces.Add(trace);
            }

            int[][] newTrace = new int[TotalTagCount][];

            for (int i = 0; i < newTrace.Length; ++i)
            {
                newTrace [i] = new int[newSubTagCounts [i]];

                int splitFactor = newSubTagCounts [i] == subTagCounts [i] ? 1 : 2;
                for (int j = 0; j < newTrace[i].Length; ++j)
                {
                    newTrace [i] [j] = j / splitFactor;
                }
            }

            newTable.subtagTraces.Add(newTrace);

            return newTable;
        }
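
The trace appended at the end records, for every new subtag, which old subtag it was split from: with a split factor of 2, new subtags 0 and 1 trace back to old subtag 0, new subtags 2 and 3 to old subtag 1, and so on, while the unsplit ROOT keeps the identity map. A tiny sketch with hypothetical counts:

            // Hypothetical: a tag whose subtag count grows from 2 to 4.
            int oldCount = 2, newCount = 4;
            int splitFactor = newCount == oldCount ? 1 : 2;
            int[] trace = new int[newCount];
            for (int j = 0; j < newCount; ++j)
            {
                trace[j] = j / splitFactor;   // trace == { 0, 0, 1, 1 }
            }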
Example #4
        static void TestParse()
        {
            //string modelfile = @"/home/nan/Data/PTB/ptb.s2.smoothed.grammar";
            string modelfile = @"D:\user\nyang\data\treebank\English\pcfg\ptb.s6.smoothed.grammar";
            string tagmapfile = @"E:\nyang\TFS\Common\Users\v-nayang\LAPCFGParser\en-tag-map.txt";

            Vocabulary vocab;
            TagSet tagSet;
            LAPCFGrammar grammar;

            using (var s = new TextModelReader(modelfile))
            {
                vocab = Vocabulary.LoadFromStream(s);
                tagSet = TagSet.LoadFromStream(s);
                grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
            }

            grammar.Smoothing(0.1f);

            grammar.MakeCompaction();

            grammar.MakeSubruleCompaction();

            var grammars = new LAPCFGrammar[grammar.subtagTraces.Count + 1];

            var traces = new int[grammars.Length][][];

            grammars[grammars.Length - 1] = grammar;

            for (int i = grammars.Length - 1; i >= 1; --i)
            {
                traces[i] = grammar.subtagTraces[i - 1];
                grammars[i - 1] = grammars[i].ProjectGrammar(traces[i]);
                grammars[i - 1].MakeCompaction();
                grammars[i - 1].MakeSubruleCompaction();
            }

            string[][] tagTiers;

            using (StreamReader sr = new StreamReader(tagmapfile))
            {
                var tt = new List<string[]>();
                while (!sr.EndOfStream)
                {
                    string line = sr.ReadLine();

                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    tt.Add(line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries));
                }

                tagTiers = new string[tt[0].Length][];//tt.ToArray();

                for (int i = 0; i < tagTiers.Length; ++i)
                {
                    tagTiers[i] = new string[tt.Count];
                    for (int j = 0; j < tt.Count; ++j)
                    {
                        tagTiers[i][j] = tt[j][i];
                    }
                }
            }

            var cbs = new CodeBook32[tagTiers.Length];

            for (int i = 0; i < cbs.Length; ++i)
            {
                cbs[i] = new CodeBook32();

                foreach (var t in tagTiers[i])
                {
                    cbs[i].Add(t);
                }
            }

            int pgcount = cbs.Length - 1;

            int[][] tagMaps = new int[pgcount][];

            for (int i = 0; i < tagMaps.Length; ++i)
            {
                tagMaps[i] = new int[grammars[0].PTCount + 1 + cbs[i + 1].Count];

                for (int j = 0; j < grammars[0].PTCount + 1; ++j)
                {
                    tagMaps[i][j] = j;
                }
            }

            var lastMap = tagMaps[tagMaps.Length - 1];

            for (int j = grammars[0].PTCount + 1; j < lastMap.Length; ++j)
            {
                string tstr = tagSet.GetTagString(j);
                int id = cbs[cbs.Length - 1][tstr];
                int pid = cbs[cbs.Length - 2][tagTiers[tagTiers.Length - 2][id]];

                lastMap[j] = pid + grammars[0].PTCount + 1;
            }

            for (int i = 0; i < tagMaps.Length - 1; ++i)
            {
                for (int j = grammars[0].PTCount + 1; j < tagMaps[i].Length; ++j)
                {
                    string tstr = cbs[i + 1][j - grammars[0].PTCount - 1];

                    int xid = Array.IndexOf(tagTiers[i + 1], tstr);

                    string pstr = tagTiers[i][xid];

                    int pid = cbs[i][pstr];

                    tagMaps[i][j] = pid;
                }
            }

            var cgrammars = new LAPCFGrammar[tagMaps.Length];

            cgrammars[cgrammars.Length - 1] = grammars[0].CollapseNonTerminals(tagMaps[cgrammars.Length - 1], 1 + cbs[cgrammars.Length - 1].Count);

            for (int i = cgrammars.Length - 1; i >= 1; --i)
            {
                cgrammars[i - 1] = cgrammars[i].CollapseNonTerminals(tagMaps[i - 1], 1 + cbs[i - 1].Count);
            }

            for (int i = 0; i < cgrammars.Length; ++i)
            {
                cgrammars[i].MakeCompaction();
                cgrammars[i].MakeSubruleCompaction();
            }

            HyperEdgePool epool = new HyperEdgePool(1024 * 1024);
            HyperVertexPool vpool = new HyperVertexPool(grammars[grammars.Length - 1].subTagCounts.Max());
            EMorph.EnglishMorph.WarmUp();
            Console.Error.WriteLine("READY");

            while(true)
            {
                string line = Console.ReadLine();

                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }

                var words = line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

                int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();

                wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);
                bool[][] allowedTags = new bool[wids.Length][];

                for (int i = 0; i < wids.Length; ++i)
                {
                    if (vocab.IsRareOrUNK(wids[i]))
                    {
                        var lemmas = EMorph.EnglishMorph.GetBaseForm(words[i]);

                        if (lemmas == null || lemmas.Count == 0)
                        {
                            continue;
                        }

                        allowedTags[i] = new bool[tagSet.PTCount];

                        if (char.IsUpper(words[i][0]))
                        {
                            allowedTags[i][tagSet.GetID("NNP")] = true;
                            allowedTags[i][tagSet.GetID("NNPS")] = true;
                        }

                        foreach (var lemma in lemmas)
                        {
                            switch (lemma.PoS)
                            {
                                case EMorph.MorphPoS.NN:
                                    allowedTags[i][tagSet.GetID("NN")] = true;
                                    allowedTags[i][tagSet.GetID("NNS")] = true;
                                    break;
                                case EMorph.MorphPoS.NNS:
                                    allowedTags[i][tagSet.GetID("NNS")] = true;
                                    allowedTags[i][tagSet.GetID("NN")] = true;
                                    break;
                                case EMorph.MorphPoS.JJ:
                                    allowedTags[i][tagSet.GetID("JJ")] = true;
                                    break;
                                case EMorph.MorphPoS.JJR:
                                    allowedTags[i][tagSet.GetID("JJR")] = true;
                                    break;
                                case EMorph.MorphPoS.JJS:
                                    allowedTags[i][tagSet.GetID("JJS")] = true;
                                    break;
                                case EMorph.MorphPoS.RB:
                                    allowedTags[i][tagSet.GetID("RB")] = true;
                                    break;
                                case EMorph.MorphPoS.RBR:
                                    allowedTags[i][tagSet.GetID("RBR")] = true;
                                    break;
                                case EMorph.MorphPoS.RBS:
                                    allowedTags[i][tagSet.GetID("RBS")] = true;
                                    break;
                                case EMorph.MorphPoS.VB:
                                    allowedTags[i][tagSet.GetID("VB")] = true;
                                    allowedTags[i][tagSet.GetID("VBP")] = true;
                                    break;
                                case EMorph.MorphPoS.VBD:
                                    allowedTags[i][tagSet.GetID("VBD")] = true;
                                    allowedTags[i][tagSet.GetID("VBN")] = true;
                                    break;
                                case EMorph.MorphPoS.VBG:
                                    allowedTags[i][tagSet.GetID("VBG")] = true;
                                    break;
                                case EMorph.MorphPoS.VBZ:
                                    allowedTags[i][tagSet.GetID("VBZ")] = true;
                                    break;
                                default:
                                    throw new Exception("not recognized morph lemma!");
                            }
                        }
                    }
                }

                try
                {
                    var parser = new ChartHyperGraphParser(wids, allowedTags);
                    parser.BuildHyperGraph(cgrammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
                    parser.SumForward();
                    parser.SumBackward(false);
                    parser.Prune(-10.0);
                    parser.Purge();
                    for (int i = 1; i < cgrammars.Length; ++i)
                    {
                        parser.ExpandHyperGraph(cgrammars[i], tagMaps[i - 1], epool, vpool,
                                                 grammars[grammars.Length - 1].subTagCounts);
                        parser.SumForward();
                        parser.SumBackward(false);
                        parser.Prune(-10.0);
                        parser.Purge();
                    }
                    // NOTE: tagMaps[2] is hard-coded and assumes the tag-map file yields three collapse maps.
                    parser.ExpandHyperGraph(grammars[0], tagMaps[2], epool, vpool, grammars[grammars.Length - 1].subTagCounts);

                    for (int i = 0; i < grammars.Length - 1; ++i)
                    {
                        parser.SumForward();
                        parser.SumBackward(false);
                        parser.Prune(-8.0);
                        parser.Purge();
                        parser.ProjectGrammar(traces[i + 1], grammars[i + 1]);
                    }

                    parser.SumForward();
                    parser.SumBackward(true);
                    parser.PosteriorViterbi();

                    var ptree = parser.ExtractPosteriorViterbi(words, tagSet);

                    PennTreeProcessor.RecoverFromXBarBinarize(ptree.Root);

                    ptree.ComputeStartEnd();

                    string treeline = ptree.TextTree;

                    string[] xlines = treeline.Split(new string[] { "\n", "\r", "\r\n" }, StringSplitOptions.RemoveEmptyEntries);

                    foreach (var xline in xlines)
                    {
                        Console.Error.WriteLine(xline);
                    }
                }
                catch
                {
                    Console.Error.WriteLine("Failure to parse!");
                }
            }
        }
Example #5
        private static double ParseGraphAndCollect(int nthread,
            List<PhrasalTree> treebank,
            LAPCFGrammar rules,
            Vocabulary vocab,
            TagSet tagSet,
            out int failed)
        {
            double llhd = 0;
            failed = 0;

            int xfail = 0;
            var handle = new object();
            var rulelist = new List<LAPCFGrammar>();
            rulelist.Add(rules);
            while (rulelist.Count < nthread)
            {
                rulelist.Add(rules.CloneWithSharedParameters());
            }
            Parallel.For(0, nthread, threadid =>
            {
                int fail = 0;
                double xllhd = 0;
                var parser = new HyperGraphParser(vocab, tagSet, rulelist [threadid]);
                for (int i = threadid; i < treebank.Count; i += nthread)
                {
                    try
                    {
                        var graph = parser.BuildHyperGraph(treebank [i]);

                        graph.SumForward();
                        graph.SumBackward();

                        if (double.IsInfinity(graph.RootScore) || double.IsNaN(graph.RootScore))
                        {
                            fail += 1;
                            continue;
                        }

                        graph.CollectExpectedCount();
                        xllhd += graph.RootScore;
                    } catch
                    {
                        fail += 1;
                    }

                }

                lock (handle)
                {
                    xfail += fail;
                    llhd += xllhd;
                }
            }
            );

            for (int i = 1; i < rulelist.Count; ++i)
            {
                LAPCFGrammar.ApplyToRules((x, y) => x.Add(y), rules.tposteriorCounts, rulelist [i].tposteriorCounts);
                LAPCFGrammar.ApplyToRules((x, y) => x.Add(y), rules.uposteriorCounts, rulelist [i].uposteriorCounts);
                LAPCFGrammar.ApplyToRules((x, y) => x.Add(y), rules.bposteriorCounts, rulelist [i].bposteriorCounts);
            }
            failed = xfail;
            //Console.Error.WriteLine("fail: {0}\tllhd: {1}", failed, llhd);
            return llhd;
        }
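
A minimal sketch of how this method is typically paired with CalculateNewScores (Example #18) for one EM iteration; the thread count is a placeholder, and the unqualified call assumes code in the same class, since ParseGraphAndCollect is private:

            // Hypothetical E-step / M-step pairing.
            int nthread = 8;   // placeholder
            int failed;
            double llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
            Console.Error.WriteLine("llhd: {0}\tfailed: {1}", llhd, failed);
            CalculateNewScores(rules);   // renormalize the collected posterior counts into rule scores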
Example #6
        private static void MatchUnaryRules(
            LAPCFGrammar grammar,
            HyperCell cell,
            HyperEdgePool epool,
            HyperVertexPool vpool,
            int[] tagCapacity,
            bool isRoot)
        {
            foreach (var cv in cell.l1v) {
                if (cv == null) {
                    continue;
                }
                var rules = grammar.urules [cv.tag];

                if (rules != null) {
                    foreach (var rule in rules) {
                        if (rule == null) {
                            break;
                        }

                        if (rule.ptag == grammar.ROOTID && !isRoot) {
                            continue;
                        }

                        if (isRoot && rule.ptag != grammar.ROOTID) {
                            continue;
                        }

                        if (cell.l2v [rule.ptag] == null) {
                            var cap = tagCapacity == null ? -1 : tagCapacity [rule.ptag];
                            cell.l2v [rule.ptag] = vpool.Allocate (false, rule.ptag, grammar.GetSubTagCount (rule.ptag), cap);
                        }

                        epool.Allocate (cell.l2v [rule.ptag], cv, rule.scores, null);
                    }
                }
            }
        }
Example #7
        private static void MatchBinaryRules(
            LAPCFGrammar grammar,
            HyperCell pcell,
            HyperCell lcell,
            HyperCell rcell,
            HyperEdgePool epool,
            HyperVertexPool vpool,
            int[] tagCapacity,
            bool isRoot)
        {
            foreach (var lv in lcell.l2v) {
                var rprules = grammar.brules [lv.tag];
                if (rprules == null) {
                    continue;
                }
                foreach (var rv in rcell.l2v) {
                    var prules = rprules [rv.tag];

                    if (prules == null) {
                        continue;
                    }

                    for (int p = 0; p < prules.Length; ++p) {
                        var rule = prules [p];

                        if (rule == null) {
                            break;
                        }

                        if (rule.ptag == grammar.ROOTID && !isRoot) {
                            continue;
                        }

                        if (pcell.l1v [rule.ptag] == null) {
                            var cap = tagCapacity == null ? -1 : tagCapacity [rule.ptag];
                            pcell.l1v [rule.ptag] = vpool.Allocate (false, rule.ptag, grammar.GetSubTagCount (rule.ptag), cap);
                        }

                        epool.Allocate (pcell.l1v [rule.ptag], lv, rv, rule.scores, null);
                    }
                }
            }
        }
Example #8
        public void ProjectGrammar(int[][] trace, LAPCFGrammar grammar)
        {
            int maxSubTag = grammar.subTagCounts.Max ();
            lbuf = new double[maxSubTag + 1];

            for (int spanL = 1; spanL <= wids.Length; ++spanL) {
                for (int beg = 0; beg + spanL <= wids.Length; ++beg) {
                    int end = beg + spanL;
                    int l = beg;
                    int r = end - 1;

                    foreach (var v in chart[l, r].l1v) {
                        if (v != null) {
                            v.ProjectGrammar (trace, grammar);
                        }
                    }

                    foreach (var v in chart[l, r].l2v) {
                        if (v != null) {
                            v.ProjectGrammar (trace, grammar);
                        }
                    }
                }
            }
        }
Example #9
 public HyperGraphParser(
     Vocabulary vocab,
     TagSet tagset,
     LAPCFGrammar rules)
 {
     this.vocab = vocab;
     this.tagset = tagset;
     this.rules = rules;
 }
Example #10
        public LAPCFGrammar CreateRuleTable(int[] newSubTagCounts)
        {
            var table = new LAPCFGrammar();

            table.NTCount = NTCount;
            table.PTCount = PTCount;
            table.ROOTID = ROOTID;
            table.subTagCounts = newSubTagCounts;

            table.brules = brules.Select(
                x => x == null ? null : x.Select(
                    y => y == null ? null : y.Select(
                        z => z == null ? null : z.CreateRule(newSubTagCounts)
            ).ToArray()
            ).ToArray()
            ).ToArray();
            //ArrayHelper.Clone<BinaryRule>(rules);

            table.urules = urules.Select(
                x => x == null ? null : x.Select(
                    y => y == null ? null : y.CreateRule(newSubTagCounts)
            ).ToArray()
            ).ToArray();

            table.trules = trules.Select(
                x => x == null ? null : x.Select(
                    y => y == null ? null : y.CreateRule(newSubTagCounts)
            ).ToArray()
            ).ToArray();

            return table;
        }
Example #11
        public LAPCFGrammar CollapseNonTerminals(int[] tagMap, int newNTCount)
        {
            var pg = new LAPCFGrammar();

            pg.NTCount = newNTCount;
            pg.PTCount = PTCount;
            pg.ROOTID = ROOTID;
            pg.subTagCounts = new int[PTCount + newNTCount];
            ArrayHelper.Fill(pg.subTagCounts, 1);

            var sprobs = ComputeSymbolProb();

            var psprobs = pg.subTagCounts.Select(x => new double[x]).ToArray();
            ArrayHelper.Fill(psprobs, double.NegativeInfinity);

            for (int i = 0; i < PTCount; ++i)
            {
                for (int j = 0; j < subTagCounts[i]; ++j)
                {
                    psprobs [i] [0] = MathHelper.LogAdd(psprobs [i] [0], sprobs [i] [j]);
                }
            }

            for (int i = PTCount; i < TotalTagCount; ++i)
            {
                for (int j = 0; j < subTagCounts[i]; ++j)
                {
                    psprobs [tagMap [i]] [0] = MathHelper.LogAdd(psprobs [tagMap [i]] [0], sprobs [i] [j]);
                }
            }

            pg.brules = ArrayHelper.AllocateArray<BinaryRule>(PTCount + newNTCount,
                                                               PTCount + newNTCount,
                                                               PTCount + newNTCount);

            foreach (var x in brules.Where(x => x != null))
            {
                foreach (var y in x.Where(y => y != null))
                {
                    foreach (var z in y.Where(z => z != null))
                    {
                        int ltag = z.ltag >= PTCount ? tagMap [z.ltag] : z.ltag;
                        int rtag = z.rtag >= PTCount ? tagMap [z.rtag] : z.rtag;
                        int ptag = z.ptag >= PTCount ? tagMap [z.ptag] : z.ptag;
                        double s = double.NegativeInfinity;

                        for (int l = 0; l < z.scores.Length; ++l)
                        {
                            if (z.scores [l] == null)
                            {
                                continue;
                            }

                            for (int r = 0; r < z.scores[l].Length; ++r)
                            {
                                if (z.scores [l] [r] == null)
                                {
                                    continue;
                                }

                                for (int p = 0; p < z.scores[l][r].Length; ++p)
                                {
                                    double xs = z.scores [l] [r] [p];

                                    if (double.IsNegativeInfinity(xs))
                                    {
                                        continue;
                                    }

                                    xs += sprobs [z.ptag] [p];

                                    s = MathHelper.LogAdd(xs, s);
                                }
                            }
                        }

                        if (double.IsNegativeInfinity(s))
                        {
                            continue;
                        }

                        if (pg.brules [ltag] [rtag] [ptag] == null)
                        {
                            pg.brules [ltag] [rtag] [ptag] = new BinaryRule(ArrayHelper.AllocateArray<double>(1, 1, 1), ptag, ltag, rtag);
                            pg.brules [ltag] [rtag] [ptag].scores [0] [0] [0] = s;
                        } else
                        {
                            pg.brules [ltag] [rtag] [ptag].scores [0] [0] [0]
                                = MathHelper.LogAdd(s, pg.brules [ltag] [rtag] [ptag].scores [0] [0] [0]);
                        }
                    }
                }
            }

            foreach (var x in pg.brules.Where(x => x != null))
            {
                foreach (var y in x.Where(y => y != null))
                {
                    foreach (var z in y.Where(z => z != null))
                    {
                        z.scores [0] [0] [0] = z.scores [0] [0] [0] - psprobs [z.ptag] [0];
                    }
                }
            }

            pg.urules = ArrayHelper.AllocateArray<UnaryRule>(PTCount + newNTCount,
                                                              PTCount + newNTCount);

            foreach (var x in urules.Where(x => x != null))
            {
                foreach (var y in x.Where(y => y != null))
                {
                    int ctag = y.ctag >= PTCount ? tagMap [y.ctag] : y.ctag;
                    int ptag = y.ptag >= PTCount ? tagMap [y.ptag] : y.ptag;
                    double s = double.NegativeInfinity;

                    for (int c = 0; c < y.scores.Length; ++c)
                    {
                        if (y.scores [c] == null)
                        {
                            continue;
                        }

                        for (int p = 0; p < y.scores[c].Length; ++p)
                        {
                            double xs = y.scores [c] [p];

                            if (double.IsNegativeInfinity(xs))
                            {
                                continue;
                            }

                            xs += sprobs [y.ptag] [p];

                            s = MathHelper.LogAdd(xs, s);
                        }
                    }

                    if (double.IsNegativeInfinity(s))
                    {
                        continue;
                    }

                    if (pg.urules [ctag] [ptag] == null)
                    {
                        pg.urules [ctag] [ptag] = new UnaryRule(ArrayHelper.AllocateArray<double>(1, 1), ptag, ctag);
                        pg.urules [ctag] [ptag].scores [0] [0] = s;
                    } else
                    {
                        pg.urules [ctag] [ptag].scores [0] [0] =
                            MathHelper.LogAdd(s, pg.urules [ctag] [ptag].scores [0] [0]);
                    }
                }
            }

            foreach (var x in pg.urules.Where(x => x != null))
            {
                foreach (var y in x.Where(y => y != null))
                {
                    y.scores [0] [0] = y.scores [0] [0] - psprobs [y.ptag] [0];
                }
            }

            var trace = subTagCounts.Select(x => new int[x]).ToArray();

            pg.trules = trules.Select(
                x => x == null ? null : x.Select(
                    y => y == null ? null : y.MergeSymbols(trace, sprobs, psprobs)
            ).ToArray()
            ).ToArray();

            pg.Normalize();
            return pg;
        }
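
The collapsing runs entirely in log space through MathHelper.LogAdd, which is not shown in these examples; a standard, numerically stable log-sum-exp of the assumed shape would be:

            // Assumed sketch of MathHelper.LogAdd: log(exp(a) + exp(b)) without overflow.
            public static double LogAdd(double a, double b)
            {
                if (double.IsNegativeInfinity(a))
                {
                    return b;
                }
                if (double.IsNegativeInfinity(b))
                {
                    return a;
                }
                double hi = Math.Max(a, b);
                double lo = Math.Min(a, b);
                return hi + Math.Log(1.0 + Math.Exp(lo - hi));
            }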
Example #12
 public LAPCFGrammar CloneWithSharedParameters()
 {
     var clone = new LAPCFGrammar();
     clone.brules = brules;
     clone.urules = urules;
     clone.trules = trules;
     clone.NTCount = NTCount;
     clone.PTCount = PTCount;
     clone.ROOTID = ROOTID;
     clone.subTagCounts = subTagCounts;
     clone.subtagTraces = subtagTraces;
     clone.InitializeExpectedCounts();
     return clone;
 }
Example #13
 public LAPCFGrammar Clone()
 {
     var clone = new LAPCFGrammar();
     clone.brules = LAPCFGrammar.CloneRules(brules);
     clone.urules = LAPCFGrammar.CloneRules(urules);
     clone.trules = LAPCFGrammar.CloneRules(trules);
     clone.NTCount = NTCount;
     clone.PTCount = PTCount;
     clone.ROOTID = ROOTID;
     clone.subTagCounts = (int[])subTagCounts.Clone();
     clone.subtagTraces = new List<int[][]>();
     foreach (var trace in subtagTraces)
     {
         clone.subtagTraces.Add(ArrayHelper.Clone(trace));
     }
     clone.InitializeExpectedCounts();
     return clone;
 }
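
The contrast with CloneWithSharedParameters (Example #12) matters in practice: the shared clone keeps pointing at the same rule arrays and only gets fresh expected-count buffers, which is why ParseGraphAndCollect (Example #5) hands one to each worker thread, whereas Clone is a full deep copy. A hedged usage note, with grammar standing in for any LAPCFGrammar instance:

            // Per-thread workers during EM can share the (read-only) parameters:
            var worker = grammar.CloneWithSharedParameters();
            // A deep copy is only needed when the copy will be modified independently:
            var snapshot = grammar.Clone();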
Example #14
        public static LAPCFGrammar LoadFromStream(TextModelReader sr, Vocabulary vocab, TagSet tagSet)
        {
            var grammar = new LAPCFGrammar();
            var name = typeof(LAPCFGrammar).FullName;

            sr.Require(name);
            sr.Require("VER", VER);

            grammar.NTCount = sr.ReadOptionInt("NTCount");
            grammar.PTCount = sr.ReadOptionInt("PTCount");
            grammar.ROOTID = sr.ReadOptionInt("ROOTID");

            sr.Require("TerminalRule");

            int lvl = sr.NestLevel;
            var truleStrings = new HashSet<string>();
            var uruleStrings = new HashSet<string>();
            var bruleStrings = new HashSet<string>();

            string line = sr.Read();
            while (sr.NestLevel > lvl)
            {
                truleStrings.Add(line);
                line = sr.Read();
            }

            if (line != "UnaryRule")
            {
                throw new Exception("wrong model!");
            }
            line = sr.Read();
            while (sr.NestLevel > lvl)
            {
                uruleStrings.Add(line);
                line = sr.Read();
            }

            if (line != "BinaryRule")
            {
                throw new Exception("wrong model!");
            }
            line = sr.Read();
            while (sr.NestLevel > lvl)
            {
                bruleStrings.Add(line);
                line = sr.Read();
            }

            string[] parts = line.Split('\t');

            if (parts [0] != "TraceCount")
            {
                throw new Exception("error in model");
            }

            int subtraceCount = int.Parse(parts [1]);

            grammar.subtagTraces = new List<int[][]>();

            for (int i = 0; i < subtraceCount; ++i)
            {
                int tlen = sr.ReadOptionInt("TRACE");
                int[][] trace = new int[tlen][];

                for (int j = 0; j < tlen; ++j)
                {
                    trace [j] = sr.ReadIntArray();
                }

                grammar.subtagTraces.Add(trace);
            }

            if (grammar.subtagTraces.Count == 0)
            {
                grammar.subTagCounts = new int[grammar.TotalTagCount];
                ArrayHelper.Fill(grammar.subTagCounts, 1);
            } else
            {
                var trace = grammar.subtagTraces [grammar.subtagTraces.Count - 1];
                grammar.subTagCounts = trace.Select(x => x.Length).ToArray();
            }

            sr.Require(name);

            foreach (var str in uruleStrings)
            {
                grammar.BuildUnaryRule(str, tagSet);
            }

            foreach (var str in truleStrings)
            {
                grammar.BuildTerminalRule(str, vocab, tagSet);
            }

            foreach (var str in bruleStrings)
            {
                grammar.BuildBinaryRule(str, tagSet);
            }

            return grammar;
        }
Example #15
        private static LAPCFGrammar MergeRuleTable(LAPCFGrammar rules, double[][] tagProb, int[][] subtagMap, bool[][] isMerged, int[] newSubTagCounts)
        {
            var newRules = rules.CreateRuleTable (newSubTagCounts);

            foreach (var x in rules.brules) {
                if (x == null) {
                    continue;
                }

                foreach (var y in x) {
                    if (y == null) {
                        continue;
                    }

                    foreach (var rule in y) {
                        if (rule == null) {
                            continue;
                        }
                        int l = rule.ltag;
                        int r = rule.rtag;
                        int p = rule.ptag;

                        for (int sl = 0; sl < rule.scores.Length; ++sl) {
                            for (int sr = 0; sr < rule.scores[sl].Length; ++sr) {
                                for (int sp = 0; sp < rule.scores[sl][sr].Length; ++sp) {
                                    double s = rule.scores [sl] [sr] [sp];

                                    int nsl = subtagMap [l] [sl];
                                    int nsr = subtagMap [r] [sr];
                                    int nsp = subtagMap [p] [sp];

                                    if (isMerged [p] [sp]) {
                                        s += tagProb [p] [sp];
                                    }

                                    var xs = newRules.brules [l] [r] [p].scores [nsl] [nsr] [nsp];

                                    newRules.brules [l] [r] [p].scores [nsl] [nsr] [nsp] = MathHelper.LogAdd (xs, s);
                                }
                            }
                        }
                    }
                }
            }

            foreach (var x in rules.urules) {
                if (x == null) {
                    continue;
                }

                foreach (var rule in x) {
                    if (rule == null) {
                        continue;
                    }

                    int c = rule.ctag;
                    int p = rule.ptag;

                    for (int sc = 0; sc < rule.scores.Length; ++sc) {
                        for (int sp = 0; sp < rule.scores[sc].Length; ++sp) {
                            double s = rule.scores [sc] [sp];

                            int nsc = subtagMap [c] [sc];
                            int nsp = subtagMap [p] [sp];

                            if (isMerged [p] [sp]) {
                                s += tagProb [p] [sp];
                            }

                            var xs = newRules.urules [c] [p].scores [nsc] [nsp];

                            newRules.urules [c] [p].scores [nsc] [nsp] = MathHelper.LogAdd (xs, s);
                        }
                    }
                }
            }

            foreach (var x in rules.trules) {
                if (x == null) {
                    continue;
                }

                foreach (var rule in x) {
                    if (rule == null) {
                        continue;
                    }

                    int w = rule.word;
                    int t = rule.tag;

                    for (int st = 0; st < rule.scores.Length; ++st) {
                        double s = rule.scores [st];

                        int nsp = subtagMap [t] [st];

                        if (isMerged [t] [st]) {
                            s += tagProb [t] [st];
                        }

                        var xs = newRules.trules [w] [t].scores [nsp];

                        newRules.trules [w] [t].scores [nsp] = MathHelper.LogAdd (xs, s);
                    }
                }
            }

            double[][] expects = newRules.subTagCounts.Select (x => new double[x]).ToArray ();

            ArrayHelper.Fill (expects, double.NegativeInfinity);

            LAPCFGrammar.CollectTagMass (expects, newRules.trules);
            LAPCFGrammar.CollectTagMass (expects, newRules.urules);
            LAPCFGrammar.CollectTagMass (expects, newRules.brules);

            LAPCFGrammar.Normalize (expects, newRules.trules);
            LAPCFGrammar.Normalize (expects, newRules.urules);
            LAPCFGrammar.Normalize (expects, newRules.brules);

            foreach (var trace in rules.subtagTraces) {
                newRules.subtagTraces.Add (trace);
            }

            int[][] oldTrace = newRules.subtagTraces [newRules.subtagTraces.Count - 1];

            int[][] newTrace = new int[oldTrace.Length][];

            for (int i = 0; i < newTrace.Length; ++i) {
                newTrace [i] = new int[newRules.subTagCounts [i]];

                for (int j = 0; j < oldTrace[i].Length; ++j) {
                    newTrace [i] [subtagMap [i] [j]] = oldTrace [i] [j];
                }
            }

            newRules.subtagTraces [newRules.subtagTraces.Count - 1] = newTrace;

            return newRules;
        }
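
The s += tagProb[p][sp] step above implements, in log space, a weighted combination over the two merged parent subtags: the merged rule probability is w1 * P(rule | sub1) + w2 * P(rule | sub2), where w1 and w2 are the within-pair subtag posteriors normalized in MergeSymbols (Example #21). A small numeric check with hypothetical values:

            // Hypothetical: within-pair weights 0.7 / 0.3, rule probabilities 0.10 / 0.40.
            double w1 = 0.7, w2 = 0.3, p1 = 0.10, p2 = 0.40;
            double mergedLinear = w1 * p1 + w2 * p2;   // 0.19
            double mergedLog = MathHelper.LogAdd(Math.Log(w1) + Math.Log(p1),
                                                 Math.Log(w2) + Math.Log(p2));
            // Math.Exp(mergedLog) == 0.19 up to floating point, matching mergedLinear.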
Example #16
        public void ProjectGrammar(int[][] traces, LAPCFGrammar grammar)
        {
            int[] trace = traces [tag];

            for (int i = trace.Length - 1; i >= 0; --i) {
                pruned [i] = pruned [trace [i]];
            }

            subtagCount = grammar.subTagCounts [tag];
            alpha.Length = subtagCount;
            beta.Length = subtagCount;

            if (TYPE != VTYPE.TERMINAL) {
                for (int i = 0; i < subtagCount; ++i) {
                    _alpha [i] = double.NegativeInfinity;
                }
            } else {
                for (int i = 0; i < subtagCount; ++i) {
                    _alpha [i] = 0;
                }
            }

            for (int i = 0; i < beta.Length; ++i) {
                _beta [i] = double.NegativeInfinity;
            }

            foreach (var e in incomings) {
                switch (e.TYPE) {
                case ETYPE.BINARY:
                    e.binaryScores = grammar.GetRuleScores (e.to.tag, e.from0.tag, e.from1.tag, true);
                    break;
                case ETYPE.UNARY:
                    e.unaryScores = grammar.GetRuleScores (e.to.tag, e.from0.tag, true);
                    break;
                case ETYPE.TERMINAL:
                    e.terminalScores = grammar.GetTerminalRuleScores (e.to.tag, e.from0.tag, true);
                    break;
                case ETYPE.DUMMY:
                    break;
                default:
                    throw new Exception ("unrecognized edge type!");
                }
            }

            posteriorScore = double.NegativeInfinity;
        }
Example #17
        public static void Build(
            Vocabulary vocab,
            TagSet tagset,
            List<PhrasalTree> treebank,
            out LAPCFGrammar rules,
            Random RNG = null)
        {
            int tagCount = tagset.NTCount + tagset.PTCount;

            double[] pmass = new double[tagset.NTCount + tagset.PTCount];

            double[][] unaries = ArrayHelper.AllocateArray<double> (tagCount, tagCount);

            double[][][] binaries = ArrayHelper.AllocateArray<double> (tagCount, tagCount, tagCount);

            foreach (var tree in treebank) {
                foreach (var node in tree.TreeNodes) {
                    if (node.Children.Count == 0) {
                        // terminals
                        continue;
                    } else if (node.Children.Count == 1) {
                        int pt = tagset.GetID (node.Tag);
                        int ct = tagset.GetID (node.Children [0].Tag);

                        pmass [pt] += 1.0f;
                        unaries [ct] [pt] += 1.0f;
                    } else if (node.Children.Count == 2) {
                        int pt = tagset.GetID (node.Tag);
                        int lt = tagset.GetID (node.Children [0].Tag);
                        int rt = tagset.GetID (node.Children [1].Tag);

                        pmass [pt] += 1.0f;
                        binaries [lt] [rt] [pt] += 1.0f;
                    } else {
                        throw new Exception ("tree node with more than 2 children!");
                    }
                }
            }

            for (int c = 0; c < unaries.Length; ++c)
            {
                bool csurvive = false;
                for (int p = 0; p < unaries[c].Length; ++p)
                {
                    if (unaries[c][p] > 0)
                    {
                        csurvive = true;
                        break;
                    }
                }

                if (!csurvive)
                {
                    unaries[c] = null;
                }
            }

            for (int l = 0; l < binaries.Length; ++l)
            {
                bool lsurvive = false;
                for (int r = 0; r < binaries[l].Length; ++r)
                {
                    bool rsurvive = false;

                    for (int p = 0; p < binaries[l][r].Length; ++p)
                    {
                        if (binaries[l][r][p] > 0)
                        {
                            rsurvive = true;
                            break;
                        }
                    }

                    if (rsurvive)
                    {
                        lsurvive = true;
                    }
                    else
                    {
                        binaries[l][r] = null;
                    }
                }

                if (!lsurvive)
                {
                    binaries[l] = null;
                }
            }

            // NOTE: the symmetry-breaking noise below assumes RNG is non-null, despite the null default.
            foreach (var x in unaries.Where(x => x != null))
            {
                for (int p = 0; p < x.Length; ++p)
                {
                    double noise = RNG.NextDouble();
                    x[p] += noise;
                    pmass[p] += noise;
                }
            }

            foreach (var x in binaries.Where(x => x != null))
            {
                foreach (var y in x.Where(y => y != null))
                {
                    for (int p = 0; p < y.Length; ++p)
                    {
                        double noise = RNG.NextDouble();
                        y[p] += noise;
                        pmass[p] += noise;
                    }
                }
            }

            for (int c = 0; c < tagCount; ++c) {
                for (int p = 0; p < tagCount; ++p) {
                    if (pmass [p] == 0) {
                        continue;
                    }
                    if (unaries[c] == null)
                    {
                        continue;
                    }
                    unaries [c] [p] /= pmass [p];
                }
            }

            for (int c = 0; c < tagCount; ++c) {
                if (unaries [c] == null) {
                    continue;
                }
                for (int p = 0; p < tagCount; ++p) {
                    if (unaries [c] [p] <= 0) {
                        unaries [c] [p] = double.NegativeInfinity;
                    } else {
                        unaries [c] [p] = (double)Math.Log (unaries [c] [p]);
                    }
                }
            }

            for (int l = 0; l < tagCount; ++l) {
                if (binaries[l] == null)
                {
                    continue;
                }
                for (int r = 0; r < tagCount; ++r) {
                    for (int p = 0; p < tagCount; ++p) {
                        if (pmass [p] == 0) {
                            continue;
                        }

                        if (binaries[l][r] == null)
                        {
                            continue;
                        }

                        binaries [l] [r] [p] /= pmass [p];
                    }
                }
            }

            for (int l = 0; l < tagCount; ++l) {
                if (binaries [l] == null) {
                    continue;
                }
                for (int r = 0; r < tagCount; ++r) {
                    if (binaries [l] [r] == null) {
                        continue;
                    }

                    for (int p = 0; p < tagCount; ++p) {

                        if (binaries [l] [r] [p] <= 0) {
                            binaries [l] [r] [p] = double.NegativeInfinity;
                        } else {
                            binaries [l] [r] [p] = (double)Math.Log (binaries [l] [r] [p]);
                        }
                    }

                }
            }

            var terminals = BuildLexSimple (treebank, tagset, vocab, RNG);

            rules = new LAPCFGrammar (tagset, binaries, unaries, terminals);
        }
Example #18
        public static void CalculateNewScores(
            //LALexiconBuilder lexicon,
            LAPCFGrammar rules,
            bool lexiconOnly = false)
        {
            //lexicon.CalculateNewScores();
            //lexicon.ClearExpectedCounts();

            double[][] expects = new double[rules.TotalTagCount][];

            for (int i = 0; i < expects.Length; ++i) {
                expects [i] = new double[rules.GetSubTagCount (i)];
            }

            ArrayHelper.Fill (expects, double.NegativeInfinity);

            if (lexiconOnly)
            {
                LAPCFGrammar.CollectTagMass(expects, rules.tposteriorCounts);
                LAPCFGrammar.CopyRules(rules.tposteriorCounts, rules.trules);
                LAPCFGrammar.Normalize(expects, rules.trules);
                LAPCFGrammar.ClearRules(rules.bposteriorCounts);
                LAPCFGrammar.ClearRules(rules.uposteriorCounts);
                LAPCFGrammar.ClearRules(rules.tposteriorCounts);
            }
            else
            {
                LAPCFGrammar.CollectTagMass(expects, rules.tposteriorCounts);
                LAPCFGrammar.CollectTagMass(expects, rules.uposteriorCounts);
                LAPCFGrammar.CollectTagMass(expects, rules.bposteriorCounts);

                LAPCFGrammar.CopyRules(rules.bposteriorCounts, rules.brules);
                LAPCFGrammar.Normalize(expects, rules.brules);
                LAPCFGrammar.CopyRules(rules.uposteriorCounts, rules.urules);
                LAPCFGrammar.Normalize(expects, rules.urules);
                LAPCFGrammar.CopyRules(rules.tposteriorCounts, rules.trules);
                LAPCFGrammar.Normalize(expects, rules.trules);
                LAPCFGrammar.ClearRules(rules.bposteriorCounts);
                LAPCFGrammar.ClearRules(rules.uposteriorCounts);
                LAPCFGrammar.ClearRules(rules.tposteriorCounts);
            }

            //rules.PropMaxUnaryPath();
        }
Example #19
        public static void CheckProbs(
            //LALexiconBuilder lexicon,
            LAPCFGrammar rules)
        {
            double[][] expects = new double[rules.TotalTagCount][];

            for (int i = 0; i < expects.Length; ++i) {
                expects [i] = new double[rules.GetSubTagCount (i)];
            }

            ArrayHelper.Fill (expects, double.NegativeInfinity);

            LAPCFGrammar.CollectTagMass (expects, rules.brules);
            LAPCFGrammar.CollectTagMass (expects, rules.urules);

            for (int p = 0; p < expects.Length; ++p) {
                for (int sp = 0; sp < expects[p].Length; ++sp) {
                    double s = expects [p] [sp];

                    if (double.IsNaN (s) || double.IsInfinity (s)) {
                        continue;
                        //throw new Exception("some rule in table has no mass!");
                    }

                    if (Math.Abs (s) > 0.01) {
                        throw new Exception ("table is not normalized!");
                    }
                }
            }
        }
Example #20
        private static void MatchLexicon(
            LAPCFGrammar table,
            HyperCell cell,
            int wid,
            HyperEdgePool epool,
            HyperVertexPool vpool,
            int[] tagCapacity,
            bool[] allowedTags,
            double[] tagProbs,
            bool isRoot)
        {
            var tv = new HyperVertex (true, wid, 1);

            var trules = table.trules [wid];

            foreach (var rule in trules) {
                if (rule == null) {
                    break;
                }
                if (rule.tag == table.ROOTID && !isRoot) {
                    continue;
                }

                if (allowedTags != null && !allowedTags[rule.tag])
                {
                    continue;
                }

                var xrule = rule;

                if (tagProbs != null)
                {
                    var xprob = tagProbs[rule.tag];
                    if (double.IsNegativeInfinity(xprob))
                    {
                        continue;
                    }
                    xrule = rule.Clone();

                    for (int i = 0; i < xrule.scores.Length; ++i)
                    {
                        if (!double.IsNegativeInfinity(xrule.scores[i]))
                        {
                            xrule.scores[i] += xprob;
                        }
                    }
                }
                var cap = tagCapacity == null ? -1 : tagCapacity [rule.tag];
                cell.l1v [rule.tag] = vpool.Allocate (false, rule.tag, table.GetSubTagCount (rule.tag), cap);
                epool.Allocate (cell.l1v [rule.tag], tv, xrule.scores, null);
                if (isRoot && rule.tag != table.ROOTID)
                {
                    continue;
                }
                cell.l2v [rule.tag] = vpool.Allocate (false, rule.tag, table.GetSubTagCount (rule.tag), cap);
                epool.Allocate (cell.l2v [rule.tag], cell.l1v [rule.tag]);
            }
        }
Example #21
        public static LAPCFGrammar MergeSymbols(double percentage,
            Vocabulary vocab,
            TagSet tagset,
            LAPCFGrammar rules,
            List<PhrasalTree> treebank,
            int nthread)
        {
            rules.InitializeExpectedCounts ();
            double[][] tagProb = SubtagExpectedCounts (nthread, vocab, tagset, rules, treebank);

            bool[] isSplit = new bool[tagProb.Length];

            for (int i = 0; i < tagProb.Length; ++i) {
                if (tagProb [i].Length == 1) {
                    tagProb [i] [0] = 0;
                    isSplit [i] = false;
                } else {
                    isSplit [i] = true;
                    for (int j = 0; j < tagProb[i].Length / 2; ++j) {
                        double z = MathHelper.LogAdd (tagProb [i] [2 * j], tagProb [i] [2 * j + 1]);
                        tagProb [i] [2 * j] -= z;
                        tagProb [i] [2 * j + 1] -= z;
                    }
                }
            }

            double[][] mergeLoss = CollectMergeLoss (nthread, vocab, tagset, rules, treebank, tagProb);

            var mergeCands = new List<MergeHelper> ();
            for (int t = 0; t < mergeLoss.Length; ++t) {
                if (mergeLoss [t] == null) {
                    continue;
                }

                for (int st = 0; st < mergeLoss[t].Length; ++st) {
                    mergeCands.Add (new MergeHelper (t, st, mergeLoss [t] [st]));
                }
            }

            mergeCands.Sort ((a, b) => {
                return a.loss.CompareTo (b.loss); }
            );

            //mergeCands.Reverse();

            int[][] subtagMap;
            bool[][] isMerged;
            int[] newSubTagCounts;

            CreateMergeMapping (rules, mergeCands, out subtagMap, out isMerged, out newSubTagCounts);

            var newRules = MergeRuleTable (rules, tagProb, subtagMap, isMerged, newSubTagCounts);

            newRules.InitializeExpectedCounts ();

            return newRules;
        }
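
Putting the pieces together, a hypothetical outline of one split-merge round built only from the methods shown in these examples; the randomness, merge percentage, and thread count are placeholders, and the unqualified calls assume the static methods are in scope:

            // Hypothetical split-merge round: split annotations, run EM, then merge
            // the least useful half of the splits back.
            LAPCFGrammar rules;
            var RNG = new Random(0);
            Build(vocab, tagset, treebank, out rules, RNG);                // Example #17
            rules = rules.SplitSymbols(RNG, 0.01);                         // Example #3
            // ... several EM iterations with ParseGraphAndCollect + CalculateNewScores ...
            rules = MergeSymbols(0.5, vocab, tagset, rules, treebank, 8);  // Example #21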
Example #22
        public int ExpandHyperGraph(LAPCFGrammar grammar, int[] tagMap, HyperEdgePool epool, HyperVertexPool vpool, int[] tagCapacity = null)
        {
            this.ROOTID = grammar.ROOTID;

            var maxSubTag = grammar.subTagCounts.Max ();

            lbuf = new double[maxSubTag + 1];

            int prunedCell = 0;

            var lv1flags = new TimedArray<bool> (grammar.TotalTagCount);
            var lv2flags = new TimedArray<bool> (grammar.TotalTagCount);

            // CYK
            for (int i = 0; i < wids.Length; ++i) {
                var oldcell = chart [i, i];
                lv1flags.Clear ();
                foreach (var v in oldcell.l1v) {
                    if (v != null) {
                        lv1flags [v.tag] = true;
                    }
                }
                lv2flags.Clear ();
                foreach (var v in oldcell.l2v) {
                    if (v != null) {
                        lv2flags [v.tag] = true;
                    }
                }
                bool isRoot = i == 0 && i == wids.Length - 1;
                chart [i, i] = new HyperCell (i, i + 1, grammar.TotalTagCount);
                ExpandLexicon (
                    grammar, chart [i, i], lv1flags, lv2flags, tagMap,
                    wids [i], epool, vpool, tagCapacity,
                    allowedPoSTags == null ? null: allowedPoSTags[i],
                    rawTagProbs == null ? null : rawTagProbs[i], isRoot);
                ExpandUnaryRules (grammar, chart [i, i], lv1flags, lv2flags, tagMap, epool, vpool, tagCapacity, isRoot);
                chart [i, i].Finish ();
            }

            for (int spanL = 2; spanL <= wids.Length; ++spanL) {
                for (int beg = 0; beg + spanL <= wids.Length; ++beg) {
                    int end = beg + spanL;
                    int l = beg;
                    int r = end - 1;

                    var oldcell = chart [l, r];
                    lv1flags.Clear ();
                    foreach (var v in oldcell.l1v) {
                        if (v != null) {
                            lv1flags [v.tag] = true;
                        }
                    }
                    lv2flags.Clear ();
                    foreach (var v in oldcell.l2v) {
                        if (v != null) {
                            lv2flags [v.tag] = true;
                        }
                    }
                    bool isRoot = l == 0 && r == wids.Length - 1;
                    chart [l, r] = new HyperCell (beg, end, grammar.TotalTagCount);
                    if (!oldcell.IsEmptyCell ()) {
                        for (int mid = l; mid < r; ++mid) {
                            ExpandBinaryRules (grammar, chart [l, r], chart [l, mid], chart [mid + 1, r],
                                               lv1flags, lv2flags, tagMap,
                                               epool, vpool, tagCapacity, isRoot);
                        }

                        for (int i = 0; i < chart[l, r].l1v.Length; ++i) {
                            var c = chart [l, r].l1v [i];

                            if (c != null) {
                                if (isRoot && c.tag != ROOTID) {
                                    continue;
                                }
                                chart [l, r].l2v [i] = vpool.Allocate (false, c.tag, c.beta.Length, c.beta.v.Length);
                                epool.Allocate (chart [l, r].l2v [i], c);
                            }

                        }
                        ExpandUnaryRules (grammar, chart [l, r],
                                          lv1flags, lv2flags, tagMap,
                                          epool, vpool, tagCapacity, isRoot);
                    } else {
                        prunedCell += 1;
                    }
                    chart [l, r].Finish ();
                }
            }
            return prunedCell;
        }
Exemplo n.º 23
0
        public static double[][] SubtagExpectedCounts(
            int nthread,
            Vocabulary vocab,
            TagSet tagset,
            //LALexiconBuilder lexicon,
            LAPCFGrammar rules,
            List<PhrasalTree> treebank)
        {
            var parser = new HyperGraphParser (vocab, tagset, rules);

            double[][][] tagExpectsArray = new double[nthread][][];

            for (int tid = 0; tid < nthread; ++tid) {
                tagExpectsArray [tid] = new double[rules.TotalTagCount][];
                var tagExpects = tagExpectsArray [tid];
                for (int i = 0; i < tagExpects.Length; ++i) {
                    tagExpects [i] = new double[rules.GetSubTagCount (i)];
                }
                ArrayHelper.Fill (tagExpects, double.NegativeInfinity);
            }

            Parallel.For (0, nthread, threadid =>
            {
                var tagExpects = tagExpectsArray [threadid];

                for (int treeId = threadid; treeId < treebank.Count; treeId += nthread) {
                    var tree = treebank [treeId];
                    var g = parser.BuildHyperGraph (tree);

                    g.SumForward ();
                    g.SumBackward ();

                    double sentS = g.RootScore;

                    if (double.IsNaN (sentS) || double.IsInfinity (sentS)) {
                        continue;
                    }
                    foreach (var v in g.Vs) {
                        if (v.TYPE == VTYPE.TERMINAL) {
                            continue;
                        }

                        int t = v.tag;

                        for (int st = 0; st < v.subtagCount; ++st) {
                            if (double.IsNaN (v.alpha.v [st]) || double.IsInfinity (v.alpha.v [st])
                                || double.IsNaN (v.beta.v [st]) || double.IsInfinity (v.beta.v [st])
                                || v.alpha.pruned [st] || v.beta.pruned [st]) {
                                continue;
                            }

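                            // posterior mass of this subtag (inside * outside / sentence score),
                            // accumulated into the expected count in the log domain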
                            tagExpects [t] [st] = MathHelper.LogAdd (v.alpha.v [st] + v.beta.v [st] - sentS, tagExpects [t] [st]);
                        }
                    }
                }
            }
            );

            var te = tagExpectsArray [0];
            for (int i = 1; i < nthread; ++i) {
                for (int j = 0; j < te.Length; ++j) {
                    for (int k = 0; k < te[j].Length; ++k) {
                        te [j] [k] = MathHelper.LogAdd (te [j] [k], tagExpectsArray [i] [j] [k]);
                    }
                }
            }

            return te;
        }
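
        // A minimal, standalone sketch (not part of the project) of the accumulation rule
        // used above: each unpruned subtag contributes alpha + beta - sentenceScore, i.e. its
        // posterior mass, LogAdd-ed into the running expected count. The scores are made up
        // and LogAdd is assumed to match MathHelper.LogAdd.
        using System;

        static class ExpectedCountSketch
        {
            static double LogAdd(double a, double b)
            {
                if (double.IsNegativeInfinity(a)) return b;
                if (double.IsNegativeInfinity(b)) return a;
                double m = Math.Max(a, b);
                return m + Math.Log(Math.Exp(a - m) + Math.Exp(b - m));
            }

            static void Main()
            {
                double sentenceScore = Math.Log(0.8);                // hypothetical root score
                double alpha = Math.Log(0.5), beta = Math.Log(0.4);  // hypothetical scores at one vertex

                double expected = double.NegativeInfinity;           // running expected count (log domain)
                expected = LogAdd(alpha + beta - sentenceScore, expected);

                Console.WriteLine("{0:F2}", Math.Exp(expected));     // 0.25
            }
        }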
Exemplo n.º 24
0
        private static double ParseGraphs(int nthread,
            List<PhrasalTree> treebank,
            LAPCFGrammar rules,
            Vocabulary vocab,
            TagSet tagSet,
            out int failed)
        {
            double llhd = 0;
            failed = 0;

            int xfail = 0;
            var handle = new object();
            Parallel.For(0, nthread, threadid =>
            {
                int fail = 0;
                double xllhd = 0;
                var parser = new HyperGraphParser(vocab, tagSet, rules);
                for (int i = threadid; i < treebank.Count; i += nthread)
                {
                    try
                    {
                        var graph = parser.BuildHyperGraph(treebank [i]);

                        graph.SumForward();
                        graph.SumBackward();

                        if (double.IsInfinity(graph.RootScore) || double.IsNaN(graph.RootScore))
                        {
                            fail += 1;
                            continue;
                        }
                        xllhd += graph.RootScore;
                    } catch
                    {
                        fail += 1;
                    }

                }

                lock (handle)
                {
                    xfail += fail;
                    llhd += xllhd;
                }
            }
            );
            failed = xfail;
            return llhd;
        }
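
        // A minimal, standalone sketch (not part of the project) of the reduction pattern
        // ParseGraphs uses: thread t handles trees t, t + nthread, t + 2 * nthread, ... and
        // folds its partial log-likelihood into the shared total under a lock. The per-tree
        // scores below are placeholders for graph.RootScore.
        using System;
        using System.Threading.Tasks;

        static class ParallelReductionSketch
        {
            static void Main()
            {
                int nthread = 4;
                double[] scores = { -1.5, -2.0, -0.5, -3.0, -1.0, -2.5 };
                double total = 0;
                var handle = new object();

                Parallel.For(0, nthread, threadid =>
                {
                    double partial = 0;
                    for (int i = threadid; i < scores.Length; i += nthread)
                    {
                        partial += scores[i];
                    }
                    lock (handle)
                    {
                        total += partial;
                    }
                });

                Console.WriteLine(total);  // -10.5
            }
        }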
Exemplo n.º 25
0
        private static double[][] CollectMergeLoss(int nthread, Vocabulary vocab, TagSet tagset, LAPCFGrammar rules, List<PhrasalTree> treebank, double[][] tagProb)
        {
            double[][][] mlossList = new double[nthread][][];
            for (int tid = 0; tid < nthread; ++tid) {
                double[][] mergeLoss = new double[rules.TotalTagCount][];

                for (int i = 0; i < mergeLoss.Length; ++i) {
                    if (tagProb [i].Length == 1) {
                        continue;
                    }

                    mergeLoss [i] = new double[tagProb [i].Length / 2];
                }

                ArrayHelper.Fill (mergeLoss, 0);

                mlossList [tid] = mergeLoss;
            }

            var parser = new HyperGraphParser (vocab, tagset, rules);

            Parallel.For (0, nthread, threadid =>
            {
                var mergeLoss = mlossList [threadid];
                for (int treeId = threadid; treeId < treebank.Count; treeId += nthread) {
                    var tree = treebank [treeId];

                    var g = parser.BuildHyperGraph (tree);

                    g.SumForward ();
                    g.SumBackward ();

                    double sentS = g.RootScore;

                    if (double.IsNaN (sentS) || double.IsInfinity (sentS)) {
                        continue;
                    }
                    foreach (var v in g.Vs) {
                        if (v.TYPE == VTYPE.TERMINAL) {
                            continue;
                        }

                        int t = v.tag;

                        if (v.subtagCount == 1) {
                            continue;
                        }

                        double[] marginals = new double[v.subtagCount];

                        for (int st = 0; st < v.subtagCount; ++st) {
                            if (!v.alpha.pruned [st]) {
                                marginals [st] = v.alpha.v [st] + v.beta.v [st];
                            }
                        }

                        for (int st = 0; st < v.subtagCount / 2; ++st) {
                            int l = st * 2;
                            int r = st * 2 + 1;
                            if (double.IsNaN (v.alpha.v [l]) || double.IsInfinity (v.alpha.v [l])
                                || double.IsNaN (v.beta.v [l]) || double.IsInfinity (v.beta.v [l])
                                || double.IsNaN (v.alpha.v [r]) || double.IsInfinity (v.alpha.v [r])
                                || double.IsNaN (v.beta.v [r]) || double.IsInfinity (v.beta.v [r])
                                || v.alpha.pruned [l] || v.alpha.pruned [r]) {
                                continue;
                            }

                            double lllhd = marginals [l];
                            double rllhd = marginals [r];

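                            // score the pair as if merged: the alpha scores are pooled with
                            // weights tagProb[t][.], while the beta scores are simply summed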
                            double mllhd = MathHelper.LogAdd (tagProb [t] [l] + v.alpha.v [l], tagProb [t] [r] + v.alpha.v [r])
                                + MathHelper.LogAdd (v.beta.v [l], v.beta.v [r]);

                            marginals [l] = mllhd;
                            marginals [r] = double.NegativeInfinity;

                            double xSentScore = MathHelper.LogAdd (marginals);

                            double sentScore = g.RootScore;

                            mergeLoss [t] [st] += sentScore - xSentScore;
                            //MathHelper.LogAdd(xSentScore - sentScore, mergeLoss[t][st]);

                            marginals [l] = lllhd;
                            marginals [r] = rllhd;
                        }
                    }
                }
            }
            );

            var ml = mlossList [0];

            for (int threadid = 1; threadid < mlossList.Length; ++threadid) {
                var xl = mlossList [threadid];
                for (int i = 0; i < ml.Length; ++i) {
                    if (ml [i] == null) {
                        continue;
                    }

                    for (int j = 0; j < ml[i].Length; ++j) {
                        ml [i] [j] += xl [i] [j];
                    }

                }
            }

            return ml;
        }
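
        // A minimal, standalone sketch (not part of the project) of the per-vertex quantity
        // behind the merge loss above, treating one split pair in isolation rather than
        // re-scoring the whole sentence as CollectMergeLoss does. The alpha/beta values and
        // tagProb weights are made up, and LogAdd is assumed to match MathHelper.LogAdd.
        using System;

        static class MergeLossSketch
        {
            static double LogAdd(double a, double b)
            {
                if (double.IsNegativeInfinity(a)) return b;
                if (double.IsNegativeInfinity(b)) return a;
                double m = Math.Max(a, b);
                return m + Math.Log(Math.Exp(a - m) + Math.Exp(b - m));
            }

            static void Main()
            {
                double alphaL = Math.Log(0.20), betaL = Math.Log(0.50);  // left subtag
                double alphaR = Math.Log(0.10), betaR = Math.Log(0.30);  // right subtag
                double probL = Math.Log(0.6), probR = Math.Log(0.4);     // tagProb[t][l], tagProb[t][r]

                double split = LogAdd(alphaL + betaL, alphaR + betaR);   // pair kept split
                double merged = LogAdd(probL + alphaL, probR + alphaR)   // alpha pooled by tagProb
                              + LogAdd(betaL, betaR);                    // beta summed

                Console.WriteLine("{0:F4}", split - merged);             // likelihood lost by merging
            }
        }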
Exemplo n.º 26
0
        static void EvaluateParser()
        {
            string modelfile = //@"/home/nan/Data/PTB/ptb.s1.smoothed.grammar";
                                @"D:\user\nyang\data\treebank\English\pcfg\ptb.s1.smoothed.grammar";

            string tagmapfile = @"E:\nyang\TFS\Common\Users\v-nayang\LAPCFGParser\en-tag-map.txt";

            string testfile = //@"/home/nan/Data/PTB/xbar/wsj.23.flat";
                            @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.23.flat";

            string trainfile = @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.02-21.flat";

            Vocabulary vocab;
            TagSet tagSet;
            LAPCFGrammar grammar;

            var traintrees = new List<PhrasalTree>();

            LoadTrees(traintrees, trainfile);

            var rwHanlder = new RareWordHandler(traintrees, 10);

            using (var s = new TextModelReader(modelfile))
            {
                vocab = Vocabulary.LoadFromStream(s);
                tagSet = TagSet.LoadFromStream(s);
                grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
            }

            rwHanlder.Build(tagSet, 0.001);

            //grammar.Smoothing(0.1f);

            grammar.MakeCompaction();

            grammar.MakeSubruleCompaction();

            var grammars = new LAPCFGrammar[grammar.subtagTraces.Count + 1];

            var traces = new int[grammars.Length][][];

            grammars [grammars.Length - 1] = grammar;

            for (int i = grammars.Length - 1; i >= 1; --i)
            {
                traces [i] = grammar.subtagTraces [i - 1];
                grammars [i - 1] = grammars [i].ProjectGrammar(traces [i]);
                grammars [i - 1].MakeCompaction();
                grammars [i - 1].MakeSubruleCompaction();
            }

            string[][] tagTiers;

            using (StreamReader sr = new StreamReader(tagmapfile))
            {
                var tt = new List<string[]>();
                while (!sr.EndOfStream)
                {
                    string line = sr.ReadLine();

                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    tt.Add(line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries));
                }

                tagTiers = new string[tt [0].Length][];//tt.ToArray();

                for (int i = 0; i < tagTiers.Length; ++i)
                {
                    tagTiers [i] = new string[tt.Count];
                    for (int j = 0; j < tt.Count; ++j)
                    {
                        tagTiers [i] [j] = tt [j] [i];
                    }
                }
            }

            var cbs = new CodeBook32[tagTiers.Length];

            for (int i = 0; i < cbs.Length; ++i)
            {
                cbs [i] = new CodeBook32();

                foreach (var t in tagTiers[i])
                {
                    cbs [i].Add(t);
                }
            }

            int pgcount = cbs.Length - 1;

            int[][] tagMaps = new int[pgcount][];

            for (int i = 0; i < tagMaps.Length; ++i)
            {
                tagMaps [i] = new int[grammars [0].PTCount + 1 + cbs [i + 1].Count];

                for (int j = 0; j < grammars[0].PTCount + 1; ++j)
                {
                    tagMaps [i] [j] = j;
                }
            }

            var lastMap = tagMaps [tagMaps.Length - 1];

            for (int j = grammars[0].PTCount + 1; j < lastMap.Length; ++j)
            {
                string tstr = tagSet.GetTagString(j);
                int id = cbs [cbs.Length - 1] [tstr];
                int pid = cbs [cbs.Length - 2] [tagTiers [tagTiers.Length - 2] [id]];

                lastMap [j] = pid + grammars [0].PTCount + 1;
            }

            for (int i = 0; i < tagMaps.Length - 1; ++i)
            {
                for (int j = grammars[0].PTCount + 1; j < tagMaps[i].Length; ++j)
                {
                    string tstr = cbs [i + 1] [j - grammars [0].PTCount - 1];

                    int xid = Array.IndexOf(tagTiers [i + 1], tstr);

                    string pstr = tagTiers [i] [xid];

                    int pid = cbs [i] [pstr];

                    tagMaps [i] [j] = pid;
                }
            }

            var cgrammars = new LAPCFGrammar[tagMaps.Length];

            cgrammars [cgrammars.Length - 1] = grammars [0].CollapseNonTerminals(tagMaps [cgrammars.Length - 1], 1 + cbs [cgrammars.Length - 1].Count);

            for (int i = cgrammars.Length - 1; i >= 1; --i)
            {
                cgrammars [i - 1] = cgrammars [i].CollapseNonTerminals(tagMaps [i - 1], 1 + cbs [i - 1].Count);
            }

            for (int i = 0; i < cgrammars.Length; ++i)
            {
                cgrammars [i].MakeCompaction();
                cgrammars [i].MakeSubruleCompaction();
            }

            var treebank = new List<PhrasalTree>();

            LoadTrees(treebank, testfile);

            foreach (var tree in treebank)
            {
                foreach (var node in tree.TreeNodes)
                {
                    if (node.Children.Count == 0)
                    {
                        node.Lex = SimpleTokenizor.ETokenize(node.Lex);
                    }
                }

                //PennTreeProcessor.RecoverFromXBarBinarize(tree.Root);

                tree.ComputeStartEnd();
            }

            double ccount = 0;
            double pcount = 0;
            double gcount = 0;
            int failed = 0;
            int sentcount = 0;
            HyperEdgePool epool = new HyperEdgePool(1024 * 1024);

            HyperVertexPool vpool = new HyperVertexPool(grammars [grammars.Length - 1].subTagCounts.Max());

            //EMorph.EnglishMorph.WarmUp();

            Console.Error.WriteLine("Start to parse...");
            ConsoleTimer tm = new ConsoleTimer(1);

            Stopwatch g0bwatch = new Stopwatch();
            Stopwatch g0watch = new Stopwatch();
            Stopwatch bwatch = new Stopwatch();

            Stopwatch[] gwatch = new Stopwatch[grammars.Length];

            for (int i = 0; i < gwatch.Length; ++i)
            {
                gwatch [i] = new Stopwatch();
            }

            Stopwatch vwatch = new Stopwatch();

            foreach (var tree in treebank)
            {
                var words = tree.GetSentence().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

                if (words.Length > 20)
                {
                    continue;
                }

                sentcount += 1;

                int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();

                wids [0] = vocab.GetId(SimpleTokenizor.ETokenize(words [0]), true);

                string[] tags = tree.GetPOS().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

                double[][] tprobs = new double[wids.Length][];

                //for (int i = 0; i < wids.Length; ++i)
                //{
                //    tprobs[i] = rwHanlder.GetLogProbs(SimpleTokenizor.ETokenize(words[i]));
                //}

                bool[][] allowedTags = null;
                //AssignTagConstraints(vocab, tagSet, words, wids);

                try
                {
                    //var parser = new ChartParser(wids);
                    var parser = new ChartHyperGraphParser(wids, allowedTags, tprobs);
                    g0bwatch.Start();
                    parser.BuildHyperGraph(cgrammars [0], epool, vpool, grammars [grammars.Length - 1].subTagCounts);
                    g0bwatch.Stop();
                    g0watch.Start();
                    parser.SumForward();
                    parser.SumBackward(false);
                    parser.Prune(-15.0);
                    parser.Purge();
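                    // coarse-to-fine: refine the chart with each collapsed grammar,
                    // pruning low-posterior items between passes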
                    for (int i = 1; i < cgrammars.Length; ++i)
                    {
                        parser.ExpandHyperGraph(cgrammars [i], tagMaps [i - 1], epool, vpool,
                                                 grammars [grammars.Length - 1].subTagCounts);
                        parser.SumForward();
                        parser.SumBackward(false);
                        parser.Prune(-15.0);
                        parser.Purge();
                    }
                    g0watch.Stop();
                    bwatch.Start();
                    parser.ExpandHyperGraph(grammars [0], tagMaps [2], epool, vpool, grammars [grammars.Length - 1].subTagCounts);
            //					parser.BuildHyperGraph (grammars [0], epool, vpool, grammars [grammars.Length - 1].subTagCounts);
                    bwatch.Stop();

                    for (int i = 0; i < grammars.Length - 1; ++i)
                    {
                        gwatch [i].Start();
                        parser.SumForward();
                        parser.SumBackward(false);

                        parser.Prune(-10.0);

                        parser.Purge();

                        parser.ProjectGrammar(traces [i + 1], grammars [i + 1]);
                        gwatch [i].Stop();
                    }

                    gwatch [grammars.Length - 1].Start();
                    parser.SumForward();

                    parser.SumBackward(true);
                    gwatch [grammars.Length - 1].Stop();

                    vwatch.Start();
                    parser.PosteriorViterbi();

                    var ptree = parser.ExtractPosteriorViterbi(words, tagSet);

                    vwatch.Stop();

                    //PennTreeProcessor.RecoverFromXBarBinarize(ptree.Root);

                    ptree.ComputeStartEnd();

                    var pbrackets = ptree.GetBracketsIgnorePunc();
                    var gbrackets = tree.GetBracketsIgnorePunc();

                    gcount += gbrackets.Count;
                    pcount += pbrackets.Count;

                    foreach (var b in pbrackets)
                    {
                        if (gbrackets.Contains(b))
                        {
                            ccount += 1;
                        }
                    }

                    if (pbrackets.Count == 0
                        || (pbrackets.Count < gbrackets.Count / 2))
                    {
                        Console.Error.WriteLine("\nStrange prediction: p={0}, g={1}", pbrackets.Count,
                            gbrackets.Count);
                    }

                    //Console.Error.WriteLine(tree.TextTree);
                } catch
                {
                    g0bwatch.Stop();
                    g0watch.Stop();
                    bwatch.Stop();
                    foreach (var w in gwatch)
                    {
                        w.Stop();
                    }
                    vwatch.Stop();
                    failed += 1;
                    Console.Error.WriteLine("\nFailure!");
                }

                tm.Up();
            }

            tm.Finish();

            Console.Error.WriteLine("SENT: {0}\tFAIL: {1}", sentcount, failed);

            double prec = ccount / pcount;
            double recall = ccount / gcount;

            double f1 = 2.0 * prec * recall / (prec + recall);

            Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);

            Console.Error.WriteLine("G-1 Build:\t{0:F6} s", g0bwatch.Elapsed.TotalSeconds);

            Console.Error.WriteLine("G-1 Pass:\t{0:F6} s", g0watch.Elapsed.TotalSeconds);

            Console.Error.WriteLine("G0 Build:\t{0:F6} s", bwatch.Elapsed.TotalSeconds);

            for (int i = 0; i < gwatch.Length; ++i)
            {
                Console.Error.WriteLine("G{0} Pass:\t{1:F6} s", i, gwatch [i].Elapsed.TotalSeconds);
            }

            Console.Error.WriteLine("Viterbi:\t{0:F6} s", vwatch.Elapsed.TotalSeconds);
        }
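
        // A minimal, standalone sketch (not part of the project) of the bracket scoring used
        // above: precision = matched / predicted, recall = matched / gold, and F1 is their
        // harmonic mean. The (start, end, label) spans are made up.
        using System;
        using System.Collections.Generic;

        static class BracketF1Sketch
        {
            static void Main()
            {
                var gold = new HashSet<(int, int, string)> { (0, 5, "S"), (0, 2, "NP"), (2, 5, "VP") };
                var pred = new HashSet<(int, int, string)> { (0, 5, "S"), (0, 3, "NP"), (2, 5, "VP") };

                double matched = 0;
                foreach (var b in pred)
                {
                    if (gold.Contains(b))
                    {
                        matched += 1;
                    }
                }

                double prec = matched / pred.Count;
                double recall = matched / gold.Count;
                double f1 = 2.0 * prec * recall / (prec + recall);

                Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);
            }
        }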
Exemplo n.º 27
0
        public LAPCFGrammar ProjectGrammar(int[][] trace)
        {
            var pg = new LAPCFGrammar();

            pg.NTCount = NTCount;
            pg.PTCount = PTCount;
            pg.ROOTID = ROOTID;
            pg.subTagCounts = trace.Select(x => x [x.Length - 1] + 1).ToArray();

            var sprobs = ComputeSymbolProb();

            var psprobs = pg.subTagCounts.Select(x => new double[x]).ToArray();
            ArrayHelper.Fill(psprobs, double.NegativeInfinity);

            for (int i = 0; i < trace.Length; ++i)
            {
                for (int j = 0; j < trace[i].Length; ++j)
                {
                    psprobs [i] [trace [i] [j]] = MathHelper.LogAdd(psprobs [i] [trace [i] [j]], sprobs [i] [j]);
                }
            }

            pg.brules = brules.Select(
                x => x == null ? null : x.Select(
                    y => y == null ? null : y.Select(
                        z => z == null ? null : z.MergeSymbols(trace, sprobs, psprobs)
            ).ToArray()
            ).ToArray()
            ).ToArray();

            pg.urules = urules.Select(
                x => x == null ? null : x.Select(
                    y => y == null ? null : y.MergeSymbols(trace, sprobs, psprobs)
            ).ToArray()
            ).ToArray();

            pg.trules = trules.Select(
                x => x == null ? null : x.Select(
                    y => y == null ? null : y.MergeSymbols(trace, sprobs, psprobs)
            ).ToArray()
            ).ToArray();

            pg.Normalize();
            return pg;
        }
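
        // A minimal, standalone sketch (not part of the project) of how a subtag trace drives
        // the projection above: trace[j] names the coarse subtag that fine subtag j collapses
        // into, the coarse subtag count is the trace's last entry + 1, and probabilities that
        // land on the same coarse subtag are pooled with LogAdd (as in the psprobs loop).
        // Values are made up and LogAdd is assumed to match MathHelper.LogAdd.
        using System;

        static class TraceProjectionSketch
        {
            static double LogAdd(double a, double b)
            {
                if (double.IsNegativeInfinity(a)) return b;
                if (double.IsNegativeInfinity(b)) return a;
                double m = Math.Max(a, b);
                return m + Math.Log(Math.Exp(a - m) + Math.Exp(b - m));
            }

            static void Main()
            {
                int[] trace = { 0, 0, 1, 1 };  // four fine subtags collapse into two coarse ones
                double[] fine = { Math.Log(0.1), Math.Log(0.2), Math.Log(0.3), Math.Log(0.4) };

                var coarse = new double[trace[trace.Length - 1] + 1];
                for (int i = 0; i < coarse.Length; ++i)
                {
                    coarse[i] = double.NegativeInfinity;
                }

                for (int j = 0; j < trace.Length; ++j)
                {
                    coarse[trace[j]] = LogAdd(coarse[trace[j]], fine[j]);
                }

                Console.WriteLine("{0:F1} {1:F1}", Math.Exp(coarse[0]), Math.Exp(coarse[1]));  // 0.3 0.7
            }
        }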