Пример #1
0
        /// <summary>
        /// Deserializes a <see cref="TagSet"/> from the reader. The stream must open and
        /// close with the TagSet type name at the same nesting level, and carry a matching
        /// "VER" option, a "ROOT" option, and two CodeBook32 sections (PTs, then NTs).
        /// </summary>
        /// <param name="sr">Reader positioned at the start of a serialized TagSet record.</param>
        /// <returns>The deserialized tag set.</returns>
        /// <exception cref="Exception">Header/footer name, version, or nesting level mismatch.</exception>
        public static TagSet LoadFromStream(TextModelReader sr)
        {
            string xname = sr.Read();

            if (xname != typeof(TagSet).FullName)
            {
                throw new Exception("model name does not match");
            }

            // Remember the nesting level so the closing record can be validated below.
            int startlvl = sr.NestLevel;

            var xver = sr.ReadOptionUInt64("VER");

            if (xver != VER)
            {
                throw new Exception("version number does not match");
            }

            var ts = new TagSet();

            ts.ROOT = sr.ReadOptionString("ROOT");

            ts.PTs = CodeBook32.LoadFromStream(sr);
            ts.NTs = CodeBook32.LoadFromStream(sr);

            xname = sr.Read();

            if (xname != typeof(TagSet).FullName || sr.NestLevel != startlvl)
            {
                // BUG FIX: the old message claimed a name mismatch even when only the
                // nesting level was wrong; report the actual reason for the failure.
                throw new Exception("model name or nesting level does not match");
            }

            return ts;
        }
Пример #2
0
        /// <summary>
        /// Extracts the Viterbi (best-scoring) parse tree from the completed chart.
        /// Returns null when the top chart cell has no vertices at all; throws when the
        /// ROOT vertex exists but was pruned away.
        /// </summary>
        /// <param name="words">Surface forms used to annotate the leaves of the tree.</param>
        /// <param name="tagSet">Tag inventory; supplies ROOTID and tag strings.</param>
        /// <returns>The best parse as a PhrasalTree, or null if the chart is empty at the root cell.</returns>
        /// <exception cref="Exception">Thrown when the ROOT vertex is absent or marked DEAD.</exception>
        public PhrasalTree ExtractViterbi(string[] words, TagSet tagSet)
        {
            // NOTE(review): the chart is indexed by the field `wids`, not the `words`
            // parameter — presumably both have the same length; confirm against callers.
            var rootCell = chart[0, wids.Length - 1];

            if (rootCell.l2v == null)
            {
                return null;
            }

            var rootid = tagSet.ROOTID;

            HyperVertex v = null;//rootCell.l2v [0];

            // Locate the vertex carrying the ROOT tag in the spanning cell.
            for (int i = 0; i < rootCell.l2v.Length; ++i)
            {
                if (rootCell.l2v[i].tag == rootid)
                {
                    v = rootCell.l2v[i];
                    break;
                }
            }

            if (v == null || v.TYPE == VTYPE.DEAD)
            {
                throw new Exception("node is pruned!");
            }

            // Follow the Viterbi back-traces starting from subtag 0 of the root.
            PhrasalNode rootNode = ExtractViterbiParse(v, 0, tagSet);

            PhrasalTree tree = new PhrasalTree(rootNode);

            tree.ComputeStartEnd();

            // Attach the original word strings to the leaf nodes.
            AnnotateLex(words, tree.Root);

            return tree;
        }
Пример #3
0
        /// <summary>
        /// Estimates, for every split tag, the likelihood loss incurred by merging each
        /// sibling subtag pair (2*st, 2*st+1) back together, summed over the treebank.
        /// Work is striped across <paramref name="nthread"/> threads, each accumulating
        /// into its own buffer; buffers are summed sequentially at the end.
        /// </summary>
        /// <param name="tagProb">Per-tag log probabilities of each subtag, normalized
        /// within each sibling pair by the caller.</param>
        /// <returns>mergeLoss[tag][pair] = total (sentence log-score − merged log-score);
        /// entries are null for tags with only one subtag.</returns>
        private static double[][] CollectMergeLoss(int nthread, Vocabulary vocab, TagSet tagset, LAPCFGrammar rules, List<PhrasalTree> treebank, double[][] tagProb)
        {
            // One loss buffer per thread so the parallel loop needs no locking.
            double[][][] mlossList = new double[nthread][][];
            for (int tid = 0; tid < nthread; ++tid) {
                double[][] mergeLoss = new double[rules.TotalTagCount][];

                for (int i = 0; i < mergeLoss.Length; ++i) {
                    // Tags that were never split have nothing to merge; leave null.
                    if (tagProb [i].Length == 1) {
                        continue;
                    }

                    mergeLoss [i] = new double[tagProb [i].Length / 2];
                }

                ArrayHelper.Fill (mergeLoss, 0);

                mlossList [tid] = mergeLoss;
            }

            var parser = new HyperGraphParser (vocab, tagset, rules);

            Parallel.For (0, nthread, threadid =>
            {
                var mergeLoss = mlossList [threadid];
                // Stripe the treebank: thread k handles trees k, k+nthread, ...
                for (int treeId = threadid; treeId < treebank.Count; treeId += nthread) {
                    var tree = treebank [treeId];

                    var g = parser.BuildHyperGraph (tree);

                    // Inside (alpha) and outside (beta) passes in log space.
                    g.SumForward ();
                    g.SumBackward ();

                    double sentS = g.RootScore;

                    // Skip sentences whose total score is not a usable number.
                    if (double.IsNaN (sentS) || double.IsInfinity (sentS)) {
                        continue;
                    }
                    foreach (var v in g.Vs) {
                        if (v.TYPE == VTYPE.TERMINAL) {
                            continue;
                        }

                        int t = v.tag;

                        if (v.subtagCount == 1) {
                            continue;
                        }

                        // Log marginal of each unpruned subtag at this vertex.
                        double[] marginals = new double[v.subtagCount];

                        for (int st = 0; st < v.subtagCount; ++st) {
                            if (!v.alpha.pruned [st]) {
                                marginals [st] = v.alpha.v [st] + v.beta.v [st];
                            }
                        }

                        for (int st = 0; st < v.subtagCount / 2; ++st) {
                            int l = st * 2;
                            int r = st * 2 + 1;
                            // Both siblings must carry finite, unpruned scores.
                            if (double.IsNaN (v.alpha.v [l]) || double.IsInfinity (v.alpha.v [l])
                                || double.IsNaN (v.beta.v [l]) || double.IsInfinity (v.beta.v [l])
                                || double.IsNaN (v.alpha.v [r]) || double.IsInfinity (v.alpha.v [r])
                                || double.IsNaN (v.beta.v [r]) || double.IsInfinity (v.beta.v [r])
                                || v.alpha.pruned [l] || v.alpha.pruned [r]) {
                                continue;
                            }

                            // Save the pair's marginals so they can be restored below.
                            double lllhd = marginals [l];
                            double rllhd = marginals [r];

                            // Marginal of the hypothetical merged subtag: mix the two
                            // inside scores by tagProb and combine the outside scores.
                            double mllhd = MathHelper.LogAdd (tagProb [t] [l] + v.alpha.v [l], tagProb [t] [r] + v.alpha.v [r])
                                + MathHelper.LogAdd (v.beta.v [l], v.beta.v [r]);

                            // Temporarily patch the pair with the merged marginal...
                            marginals [l] = mllhd;
                            marginals [r] = double.NegativeInfinity;

                            // ...to approximate the sentence score after merging.
                            double xSentScore = MathHelper.LogAdd (marginals);

                            double sentScore = g.RootScore;

                            mergeLoss [t] [st] += sentScore - xSentScore;
                            //MathHelper.LogAdd(xSentScore - sentScore, mergeLoss[t][st]);

                            // Restore the saved marginals for the next pair.
                            marginals [l] = lllhd;
                            marginals [r] = rllhd;
                        }
                    }
                }
            }
            );

            // Sequentially reduce the per-thread buffers into thread 0's buffer.
            var ml = mlossList [0];

            for (int threadid = 1; threadid < mlossList.Length; ++threadid) {
                var xl = mlossList [threadid];
                for (int i = 0; i < ml.Length; ++i) {
                    if (ml [i] == null) {
                        continue;
                    }

                    for (int j = 0; j < ml[i].Length; ++j) {
                        ml [i] [j] += xl [i] [j];
                    }

                }
            }

            return ml;
        }
Пример #4
0
        /// <summary>
        /// Split-merge step: estimates the likelihood loss of undoing each subtag split,
        /// ranks the candidate merges by loss, and builds a new grammar with the chosen
        /// merges applied.
        /// </summary>
        /// <param name="percentage">Fraction of candidate merges to perform — presumably
        /// consumed inside CreateMergeMapping; not referenced directly here (TODO confirm).</param>
        /// <param name="nthread">Degree of parallelism for the expectation passes.</param>
        /// <returns>A new grammar with merged subtags and freshly initialized expected counts.</returns>
        public static LAPCFGrammar MergeSymbols(double percentage,
            Vocabulary vocab,
            TagSet tagset,
            LAPCFGrammar rules,
            List<PhrasalTree> treebank,
            int nthread)
        {
            rules.InitializeExpectedCounts ();
            double[][] tagProb = SubtagExpectedCounts (nthread, vocab, tagset, rules, treebank);

            bool[] isSplit = new bool[tagProb.Length];

            // Normalize each sibling pair in-place so tagProb[i][2j], tagProb[i][2j+1]
            // become log conditional probabilities within the pair.
            for (int i = 0; i < tagProb.Length; ++i) {
                if (tagProb [i].Length == 1) {
                    tagProb [i] [0] = 0;
                    isSplit [i] = false;
                } else {
                    isSplit [i] = true;
                    for (int j = 0; j < tagProb[i].Length / 2; ++j) {
                        double z = MathHelper.LogAdd (tagProb [i] [2 * j], tagProb [i] [2 * j + 1]);
                        tagProb [i] [2 * j] -= z;
                        tagProb [i] [2 * j + 1] -= z;
                    }
                }
            }

            double[][] mergeLoss = CollectMergeLoss (nthread, vocab, tagset, rules, treebank, tagProb);

            // Flatten the (tag, pair) loss table into a sortable candidate list.
            var mergeCands = new List<MergeHelper> ();
            for (int t = 0; t < mergeLoss.Length; ++t) {
                if (mergeLoss [t] == null) {
                    continue;
                }

                for (int st = 0; st < mergeLoss[t].Length; ++st) {
                    mergeCands.Add (new MergeHelper (t, st, mergeLoss [t] [st]));
                }
            }

            // Cheapest merges (smallest likelihood loss) first.
            mergeCands.Sort ((a, b) => {
                return a.loss.CompareTo (b.loss); }
            );

            //mergeCands.Reverse();

            int[][] subtagMap;
            bool[][] isMerged;
            int[] newSubTagCounts;

            CreateMergeMapping (rules, mergeCands, out subtagMap, out isMerged, out newSubTagCounts);

            var newRules = MergeRuleTable (rules, tagProb, subtagMap, isMerged, newSubTagCounts);

            newRules.InitializeExpectedCounts ();

            return newRules;
        }
Пример #5
0
        /// <summary>
        /// Computes, for every (tag, subtag), the log of the expected occurrence count
        /// over the treebank: logsumexp over all vertices of alpha + beta − sentence score.
        /// Trees are striped across threads with per-thread accumulators that are
        /// log-added together at the end.
        /// </summary>
        /// <returns>result[tag][subtag] = log expected count (NegativeInfinity when never seen).</returns>
        public static double[][] SubtagExpectedCounts(
            int nthread,
            Vocabulary vocab,
            TagSet tagset,
            //LALexiconBuilder lexicon,
            LAPCFGrammar rules,
            List<PhrasalTree> treebank)
        {
            var parser = new HyperGraphParser (vocab, tagset, rules);

            // One accumulator per thread, initialized to log(0) = -inf.
            double[][][] tagExpectsArray = new double[nthread][][];

            for (int tid = 0; tid < nthread; ++tid) {
                tagExpectsArray [tid] = new double[rules.TotalTagCount][];
                var tagExpects = tagExpectsArray [tid];
                for (int i = 0; i < tagExpects.Length; ++i) {
                    tagExpects [i] = new double[rules.GetSubTagCount (i)];
                }
                ArrayHelper.Fill (tagExpects, double.NegativeInfinity);
            }

            Parallel.For (0, nthread, threadid =>
            {
                var tagExpects = tagExpectsArray [threadid];

                // Stripe the treebank: thread k handles trees k, k+nthread, ...
                for (int treeId = threadid; treeId < treebank.Count; treeId += nthread) {
                    var tree = treebank [treeId];
                    var g = parser.BuildHyperGraph (tree);

                    // Inside (alpha) and outside (beta) passes in log space.
                    g.SumForward ();
                    g.SumBackward ();

                    double sentS = g.RootScore;

                    // Skip sentences whose total score is not a usable number.
                    if (double.IsNaN (sentS) || double.IsInfinity (sentS)) {
                        continue;
                    }
                    foreach (var v in g.Vs) {
                        if (v.TYPE == VTYPE.TERMINAL) {
                            continue;
                        }

                        int t = v.tag;

                        for (int st = 0; st < v.subtagCount; ++st) {
                            // Only finite, unpruned subtags contribute.
                            if (double.IsNaN (v.alpha.v [st]) || double.IsInfinity (v.alpha.v [st])
                                || double.IsNaN (v.beta.v [st]) || double.IsInfinity (v.beta.v [st])
                                || v.alpha.pruned [st] || v.beta.pruned [st]) {
                                continue;
                            }

                            // Posterior of this subtag at this vertex, log-added in.
                            tagExpects [t] [st] = MathHelper.LogAdd (v.alpha.v [st] + v.beta.v [st] - sentS, tagExpects [t] [st]);
                        }
                    }
                }
            }
            );

            // Sequentially log-add the per-thread accumulators into thread 0's buffer.
            var te = tagExpectsArray [0];
            for (int i = 1; i < nthread; ++i) {
                for (int j = 0; j < te.Length; ++j) {
                    for (int k = 0; k < te[j].Length; ++k) {
                        te [j] [k] = MathHelper.LogAdd (te [j] [k], tagExpectsArray [i] [j] [k]);
                    }
                }
            }

            return te;
        }
Пример #6
0
        /// <summary>
        /// Computes the total log-likelihood of the treebank under the grammar,
        /// parsing trees in parallel. Trees whose score is NaN/infinite, or whose
        /// parse throws, are counted in <paramref name="failed"/> and excluded
        /// from the sum.
        /// </summary>
        /// <param name="nthread">Number of parallel workers; trees are striped across them.</param>
        /// <param name="failed">Receives the number of trees that could not be scored.</param>
        /// <returns>Sum of root log-scores over all successfully scored trees.</returns>
        private static double ParseGraphs(int nthread,
            List<PhrasalTree> treebank,
            LAPCFGrammar rules,
            Vocabulary vocab,
            TagSet tagSet,
            out int failed)
        {
            double llhd = 0;
            failed = 0;

            // Accumulate into a local because C# lambdas cannot capture `out` parameters.
            int xfail = 0;
            var handle = new object();
            Parallel.For(0, nthread, threadid =>
            {
                // Per-thread tallies; merged under the lock below.
                int fail = 0;
                double xllhd = 0;
                var parser = new HyperGraphParser(vocab, tagSet, rules);
                for (int i = threadid; i < treebank.Count; i += nthread)
                {
                    try
                    {
                        var graph = parser.BuildHyperGraph(treebank [i]);

                        graph.SumForward();
                        graph.SumBackward();

                        if (double.IsInfinity(graph.RootScore) || double.IsNaN(graph.RootScore))
                        {
                            fail += 1;
                            continue;
                        }
                        xllhd += graph.RootScore;
                    } catch
                    {
                        // Deliberate catch-all: any parse failure is counted, not propagated.
                        fail += 1;
                    }

                }

                lock (handle)
                {
                    xfail += fail;
                    llhd += xllhd;
                }
            }
            );
            failed = xfail;
            return llhd;
        }
Пример #7
0
        /// <summary>
        /// Parses one serialized terminal-rule line of the form "TAG_subtag\tword\tscore"
        /// (as written by DumpToStream) and stores the score into the trules table,
        /// lazily allocating the table, the per-word row, and the rule entry.
        /// </summary>
        /// <param name="ruleString">One rule line; split on tabs and underscores.</param>
        private void BuildTerminalRule(string ruleString, Vocabulary vocab, TagSet tagSet)
        {
            string[] parts = ruleString.Split(new string[] { "\t", "_" }, StringSplitOptions.RemoveEmptyEntries);

            int tag = tagSet.GetID(parts [0]);
            int subtag = int.Parse(parts [1]);
            int word = vocab.GetId(parts [2]);
            // NOTE(review): culture-sensitive double.Parse — model files written under a
            // different numeric culture (e.g. comma decimals) will not round-trip;
            // consider CultureInfo.InvariantCulture on both dump and load.
            double s = double.Parse(parts [3]);

            // Lazily allocate: table indexed [word][tag], one entry per (word, tag).
            if (trules == null)
            {
                trules = new TerminalRule[vocab.VocabSize][];
            }

            if (trules [word] == null)
            {
                trules [word] = new TerminalRule[tagSet.PTCount];
            }

            if (trules [word] [tag] == null)
            {
                trules [word] [tag] = new TerminalRule(new double[subTagCounts [tag]], tag, word);
                trules [word] [tag].ClearScore();
            }

            trules [word] [tag].scores [subtag] = s;
        }
Пример #8
0
        /// <summary>
        /// Computes smoothed log-probabilities log P(word | tag) for every word in
        /// wordTagCount and stores them in <c>probs</c>, indexed by preterminal tag id.
        /// Tags with a non-positive total count get NegativeInfinity.
        /// </summary>
        /// <param name="tags">Tag inventory; only the PT (preterminal) tags are used.</param>
        /// <param name="smoothCount">Add-k smoothing constant.</param>
        public void Build(TagSet tags, double smoothCount)
        {
            foreach (var entry in wordTagCount)
            {
                var logProbs = new double[tags.PTCount];

                for (int tagId = 0; tagId < logProbs.Length; ++tagId)
                {
                    string tagName = tags.GetTagString(tagId);

                    // Raw counts: word/tag co-occurrences, total tag occurrences,
                    // and the number of distinct word types seen with this tag.
                    int jointCount = entry.Value[tagName];

                    int totalCount = tagCount[tagName];

                    int typeCount = tagTypeCount[tagName];

                    // Add-k smoothing with (typeCount + 1) extra mass in the denominator.
                    logProbs[tagId] = totalCount <= 0
                        ? double.NegativeInfinity
                        : Math.Log((jointCount + smoothCount) / (totalCount + (typeCount + 1.0) * smoothCount));
                }

                probs.Add(entry.Key, logProbs);
            }
        }
Пример #9
0
        /// <summary>
        /// Builds an initial lexicon score table from the treebank:
        /// counts (tag, word) co-occurrences over leaf nodes, converts them to
        /// P(word | tag) via Bayes' rule on the relative frequencies, and returns
        /// log scores indexed [word][tag] (NegativeInfinity for unseen pairs).
        /// </summary>
        /// <returns>scores[word][tag] = log P(word | tag), or NegativeInfinity.</returns>
        public static double[][] BuildLex(List<PhrasalTree> treebank, TagSet tagSet, Vocabulary vocab)
        {
            int PTCount = tagSet.PTCount;
            int vocabCount = vocab.VocabSize;
            double[][] tagWordCounts = ArrayHelper.AllocateArray<double> (PTCount, vocabCount);
            double[][] wordTagCounts = ArrayHelper.AllocateArray<double> (vocabCount, PTCount);
            double[] tagCounts = new double[PTCount];
            double[] wordCounts = new double[vocabCount];

            // Distinct word types observed under each preterminal tag.
            HashSet<string>[] tagTypeSets = new HashSet<string>[PTCount];

            for (int i = 0; i < tagTypeSets.Length; ++i) {
                tagTypeSets [i] = new HashSet<string> ();
            }

            // Pass 1: raw counts over all leaf (terminal) nodes.
            foreach (var tree in treebank) {
                tree.ComputeStartEnd ();
                foreach (var node in tree.TreeNodes) {
                    if (node.Children.Count == 0) {
                        string word = SimpleTokenizor.ETokenize (node.Lex);
                        string tag = node.Tag;

                        int tagId = tagSet.GetPTID (tag);
                        tagTypeSets [tagId].Add (word);

                        // Sentence-initial position (Start == 0) is passed through to
                        // the vocabulary's id lookup.
                        int wordId = vocab.GetId (word, node.Start == 0);

                        tagWordCounts [tagId] [wordId] += 1.0f;
                        wordTagCounts [wordId] [tagId] += 1.0f;

                        tagCounts [tagId] += 1.0f;
                        wordCounts [wordId] += 1.0f;
                    }
                }
            }

            // NOTE(review): typeTagCount only feeds the commented-out rare-word
            // smoothing below; it is otherwise unused.
            double[] typeTagCount = new double[PTCount];

            for (int i = 0; i < typeTagCount.Length; ++i) {
                typeTagCount [i] = tagTypeSets [i].Count;
            }

            // for smoothing
            // Normalize each word's tag counts into P(tag | word).
            for (int wordId = 0; wordId < wordTagCounts.Length; ++wordId) {
                var wt = wordTagCounts [wordId];
                double wc = wordCounts [wordId];

                //bool isRare = vocab.IsRareOrUNK (wordId);

                //if (isRare) {
                //    for (int tid = 0; tid < wt.Length; ++tid) {
                //        if (wt [tid] > 0 || typeTagCount [tid] >= openTagClassThr) {
                //            wt [tid] += addXSmoothing;
                //            wc += addXSmoothing;
                //        }
                //    }
                //}

                for (int i = 0; i < wt.Length; ++i) {
                    wt [i] /= wc;
                }
            }

            // Normalize word counts into P(word).
            double totalwc = MathHelper.Sum (wordCounts);

            for (int i = 0; i < wordCounts.Length; ++i) {
                wordCounts [i] /= totalwc;
            }

            // Normalize tag counts into P(tag).
            double totaltc = MathHelper.Sum (tagCounts);

            for (int i = 0; i < tagCounts.Length; ++i) {
                tagCounts [i] /= totaltc;
            }

            // Bayes: P(word | tag) = P(tag | word) * P(word) / P(tag).
            for (int tagId = 0; tagId < tagCounts.Length; ++tagId) {
                for (int wordId = 0; wordId < wordCounts.Length; ++wordId) {
                    tagWordCounts [tagId] [wordId] = wordTagCounts [wordId] [tagId] * wordCounts [wordId] / tagCounts [tagId];
                }
            }

            // Convert to log space; unseen pairs stay at NegativeInfinity.
            double[][] scores = ArrayHelper.AllocateArray<double> (vocabCount, PTCount);

            ArrayHelper.Fill (scores, double.NegativeInfinity);

            for (int word = 0; word < scores.Length; ++word) {
                for (int tag = 0; tag < scores[word].Length; ++tag) {
                    if (tagWordCounts [tag] [word] > 0) {
                        //scores[i][j] = new double[1];
                        //expectedCounts[i][j] = new double[1];
                        scores [word] [tag] = (double)Math.Log (tagWordCounts [tag] [word]);
                        //expectedCounts[i][j][0] = double.NegativeInfinity;
                    }
                }
            }

            return scores;
        }
Пример #10
0
        /// <summary>
        /// Serializes the grammar: a header (name, VER, tag counts, ROOTID), the
        /// terminal/unary/binary rule sections (one nested line per finite subtag score,
        /// tags written as "TAG_subtag"), the subtag split traces, and the type name
        /// again as a footer. Mirrors the format consumed by LoadFromStream.
        /// </summary>
        /// <param name="sw">Destination writer; nesting level delimits the rule sections.</param>
        /// <param name="tagSet">Used to render tag ids as strings.</param>
        /// <param name="vocab">Used to render word ids as strings.</param>
        public void DumpToStream(TextModelWriter sw, TagSet tagSet, Vocabulary vocab)
        {
            var name = typeof(LAPCFGrammar).FullName;

            sw.Write(name);
            sw.WriteOption("VER", VER);
            sw.WriteOption("NTCount", NTCount);
            sw.WriteOption("PTCount", PTCount);
            sw.WriteOption("ROOTID", ROOTID);
            sw.Write("TerminalRule");
            sw.NestLevel += 1;
            // Terminal rules: "TAG_subtag \t word \t score", one line per finite score.
            // NOTE(review): string.Format uses the current culture for the score;
            // round-tripping across locales is not guaranteed.
            foreach (var x in trules)
            {
                if (x != null)
                {
                    foreach (var y in x)
                    {
                        if (y != null)
                        {
                            var word = vocab.GetWordString(y.word);
                            var tag = tagSet.GetTagString(y.tag);
                            for (int p = 0; p < y.scores.Length; ++p)
                            {
                                if (!double.IsInfinity(y.scores [p]) && !double.IsNaN(y.scores [p]))
                                {
                                    sw.Write(string.Format("{0}_{1}\t{2}\t{3}", tag, p, word, y.scores [p]));
                                }
                            }
                        }
                    }
                }
            }
            sw.NestLevel -= 1;
            sw.Write("UnaryRule");
            sw.NestLevel += 1;
            // Unary rules: "PARENT_p \t CHILD_c \t score"; scores indexed [child][parent].
            foreach (var x in urules)
            {
                if (x != null)
                {
                    foreach (var y in x)
                    {
                        if (y != null)
                        {
                            var ptag = tagSet.GetTagString(y.ptag);
                            var ctag = tagSet.GetTagString(y.ctag);
                            for (int c = 0; c < y.scores.Length; ++c)
                            {
                                for (int p = 0; p < y.scores[c].Length; ++p)
                                {
                                    if (!double.IsInfinity(y.scores [c] [p]) && !double.IsNaN(y.scores [c] [p]))
                                    {
                                        sw.Write(string.Format("{0}_{1}\t{2}_{3}\t{4}", ptag, p, ctag, c, y.scores [c] [p]));
                                    }
                                }
                            }
                        }
                    }
                }
            }
            sw.NestLevel -= 1;
            sw.Write("BinaryRule");
            sw.NestLevel += 1;
            // Binary rules: "PARENT_p \t LEFT_l \t RIGHT_r \t score";
            // scores indexed [left][right][parent].
            foreach (var x in brules)
            {
                if (x != null)
                {
                    foreach (var y in x)
                    {
                        if (y != null)
                        {
                            foreach (var z in y)
                            {
                                if (z != null)
                                {
                                    var ptag = tagSet.GetTagString(z.ptag);
                                    var ltag = tagSet.GetTagString(z.ltag);
                                    var rtag = tagSet.GetTagString(z.rtag);
                                    for (int l = 0; l < z.scores.Length; ++l)
                                    {
                                        for (int r = 0; r < z.scores[l].Length; ++r)
                                        {
                                            for (int p = 0; p < z.scores[l][r].Length; ++p)
                                            {
                                                if (!double.IsInfinity(z.scores [l] [r] [p]) && !double.IsNaN(z.scores [l] [r] [p]))
                                                {
                                                    sw.Write(
                                                        string.Format("{0}_{1}\t{2}_{3}\t{4}_{5}\t{6}",
                                                        ptag, p, ltag, l, rtag, r, z.scores [l] [r] [p])
                                                    );
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            sw.NestLevel -= 1;

            // Subtag split traces: one "TRACE" option per split round, then one
            // space-joined row per tag.
            sw.WriteOption("TraceCount", subtagTraces.Count);
            foreach (var trace in subtagTraces)
            {
                sw.WriteOption("TRACE", trace.Length);
                sw.NestLevel += 1;
                foreach (var t in trace)
                {
                    sw.Write(string.Join(" ", t));
                }
                sw.NestLevel -= 1;
            }

            // Footer: repeat the type name so readers can validate the end of record.
            sw.Write(name);
        }
Пример #11
0
 /// <summary>
 /// Creates a parser over the given grammar, using the supplied vocabulary and
 /// tag set to map words and tags to their integer ids.
 /// </summary>
 /// <param name="vocab">Word-to-id mapping.</param>
 /// <param name="tagset">Tag inventory.</param>
 /// <param name="rules">Latent-annotation PCFG rules to parse with.</param>
 public HyperGraphParser(
     Vocabulary vocab,
     TagSet tagset,
     LAPCFGrammar rules)
 {
     // Plain dependency capture; no validation is performed here.
     this.rules = rules;
     this.tagset = tagset;
     this.vocab = vocab;
 }
Пример #12
0
        /// <summary>
        /// Deserializes a grammar written by DumpToStream: header options, the three
        /// nesting-delimited rule sections (terminal, unary, binary) collected as raw
        /// lines, the subtag traces, and the type-name footer. Rule lines are only
        /// materialized into tables after subTagCounts has been derived from the traces,
        /// since the Build*Rule helpers need those counts for allocation.
        /// </summary>
        /// <returns>The fully populated grammar.</returns>
        /// <exception cref="Exception">Thrown when a section marker or trace header is malformed.</exception>
        public static LAPCFGrammar LoadFromStream(TextModelReader sr, Vocabulary vocab, TagSet tagSet)
        {
            var grammar = new LAPCFGrammar();
            var name = typeof(LAPCFGrammar).FullName;

            sr.Require(name);
            sr.Require("VER", VER);

            grammar.NTCount = sr.ReadOptionInt("NTCount");
            grammar.PTCount = sr.ReadOptionInt("PTCount");
            grammar.ROOTID = sr.ReadOptionInt("ROOTID");

            sr.Require("TerminalRule");

            // Rule lines sit one nesting level deeper than the section markers;
            // reading runs until the level drops back to `lvl`.
            int lvl = sr.NestLevel;
            var truleStrings = new HashSet<string>();
            var uruleStrings = new HashSet<string>();
            var bruleStrings = new HashSet<string>();

            string line = sr.Read();
            while (sr.NestLevel > lvl)
            {
                truleStrings.Add(line);
                line = sr.Read();
            }

            // The line that ended the loop must be the next section marker.
            if (line != "UnaryRule")
            {
                throw new Exception("wrong model!");
            }
            line = sr.Read();
            while (sr.NestLevel > lvl)
            {
                uruleStrings.Add(line);
                line = sr.Read();
            }

            if (line != "BinaryRule")
            {
                throw new Exception("wrong model!");
            }
            line = sr.Read();
            while (sr.NestLevel > lvl)
            {
                bruleStrings.Add(line);
                line = sr.Read();
            }

            // After the rule sections comes "TraceCount\t<n>".
            string[] parts = line.Split('\t');

            if (parts [0] != "TraceCount")
            {
                throw new Exception("error in model");
            }

            int subtraceCount = int.Parse(parts [1]);

            grammar.subtagTraces = new List<int[][]>();

            for (int i = 0; i < subtraceCount; ++i)
            {
                int tlen = sr.ReadOptionInt("TRACE");
                int[][] trace = new int[tlen][];

                for (int j = 0; j < tlen; ++j)
                {
                    trace [j] = sr.ReadIntArray();
                }

                grammar.subtagTraces.Add(trace);
            }

            // No traces => unsplit grammar (one subtag per tag); otherwise the last
            // trace determines the current number of subtags for each tag.
            if (grammar.subtagTraces.Count == 0)
            {
                grammar.subTagCounts = new int[grammar.TotalTagCount];
                ArrayHelper.Fill(grammar.subTagCounts, 1);
            } else
            {
                var trace = grammar.subtagTraces [grammar.subtagTraces.Count - 1];
                grammar.subTagCounts = trace.Select(x => x.Length).ToArray();
            }

            // Footer: the type name again closes the record.
            sr.Require(name);

            // Materialize the rule tables now that subTagCounts is known.
            foreach (var str in uruleStrings)
            {
                grammar.BuildUnaryRule(str, tagSet);
            }

            foreach (var str in truleStrings)
            {
                grammar.BuildTerminalRule(str, vocab, tagSet);
            }

            foreach (var str in bruleStrings)
            {
                grammar.BuildBinaryRule(str, tagSet);
            }

            return grammar;
        }
Пример #13
0
        /// <summary>
        /// Builds an unsplit (one subtag per tag) grammar from raw log-score tables.
        /// Only finite, non-NaN entries become rules; each rule's score tensor has a
        /// single cell holding the raw score.
        /// </summary>
        /// <param name="set">Tag inventory supplying NTCount, PTCount and ROOTID.</param>
        /// <param name="brawScores">Binary log scores indexed [left][right][parent]; rows may be null.</param>
        /// <param name="urawScores">Unary log scores indexed [child][parent]; rows may be null.</param>
        /// <param name="trawScores">Terminal log scores indexed [word][tag]; rows may be null.</param>
        public LAPCFGrammar(TagSet set, double[][][] brawScores, double[][] urawScores, double[][] trawScores)
        {
            NTCount = set.NTCount;
            PTCount = set.PTCount;
            ROOTID = set.ROOTID;

            var tagCount = TotalTagCount;

            // Binary rules: allocate rows lazily, skipping null score rows.
            brules = new BinaryRule[tagCount][][];

            for (int l = 0; l < tagCount; ++l)
            {
                if (brawScores [l] == null)
                {
                    continue;
                }

                brules [l] = new BinaryRule[tagCount][];

                for (int r = 0; r < tagCount; ++r)
                {
                    if (brawScores [l] [r] == null)
                    {
                        continue;
                    }

                    brules [l] [r] = new BinaryRule[tagCount];
                    for (int p = 0; p < tagCount; ++p)
                    {
                        if (!double.IsInfinity(brawScores [l] [r] [p]) && !double.IsNaN(brawScores [l] [r] [p]))
                        {
                            // Single-subtag grammar: a 1x1x1 score tensor per rule.
                            double[][][] s = new double[1][][];
                            s [0] = new double[1][];
                            s [0] [0] = new double[1];
                            s [0] [0] [0] = brawScores [l] [r] [p];
                            brules [l] [r] [p] = new BinaryRule(s, p, l, r);
                        }
                    }
                }
            }

            // Unary rules, indexed [child][parent].
            urules = new UnaryRule[tagCount][];

            for (int c = 0; c < tagCount; ++c)
            {
                if (urawScores [c] == null)
                {
                    continue;
                }

                urules [c] = new UnaryRule[tagCount];

                for (int p = 0; p < tagCount; ++p)
                {
                    if (!double.IsNaN(urawScores [c] [p]) && !double.IsInfinity(urawScores [c] [p]))
                    {
                        double[][] s = new double[1][];
                        s [0] = new double[1];
                        s [0] [0] = urawScores [c] [p];
                        urules [c] [p] = new UnaryRule(s, p, c);
                    }
                }
            }

            // Terminal rules, indexed [word][tag].
            trules = new TerminalRule[trawScores.Length][];

            for (int w = 0; w < trawScores.Length; ++w)
            {
                if (trawScores [w] == null)
                {
                    continue;
                }

                trules [w] = new TerminalRule[trawScores [w].Length];

                for (int t = 0; t < trules[w].Length; ++t)
                {
                    if (!double.IsNaN(trawScores [w] [t]) && !double.IsInfinity(trawScores [w] [t]))
                    {
                        double[] s = new double[1];
                        s [0] = trawScores [w] [t];
                        trules [w] [t] = new TerminalRule(s, t, w);
                    }
                }
            }

            // Every tag starts with exactly one subtag (no splits yet).
            subTagCounts = new int[tagCount];
            for (int i = 0; i < subTagCounts.Length; ++i)
            {
                subTagCounts [i] = 1;
            }
        }
Пример #14
0
        /// <summary>
        /// Parses one serialized unary-rule line of the form
        /// "PARENT_subptag\tCHILD\tsubctag\tscore" (as written by DumpToStream, split on
        /// tabs and underscores) and stores the score into the urules table, lazily
        /// allocating the table, the per-child row, and the rule entry.
        /// Note: urules is indexed [child][parent], and scores [subchild][subparent].
        /// </summary>
        /// <param name="ruleString">One rule line; split on tabs and underscores.</param>
        private void BuildUnaryRule(string ruleString, TagSet tagSet)
        {
            string[] parts = ruleString.Split(new string[] { "\t", "_" }, StringSplitOptions.RemoveEmptyEntries);

            int ptag = tagSet.GetID(parts [0]);
            int subptag = int.Parse(parts [1]);
            int ctag = tagSet.GetID(parts [2]);
            int subctag = int.Parse(parts [3]);
            // NOTE(review): culture-sensitive double.Parse — model files written under a
            // different numeric culture will not round-trip; consider InvariantCulture.
            double s = double.Parse(parts [4]);

            if (urules == null)
            {
                urules = new UnaryRule[TotalTagCount][];
            }

            if (urules [ctag] == null)
            {
                urules [ctag] = new UnaryRule[TotalTagCount];
            }

            if (urules [ctag] [ptag] == null)
            {
                urules [ctag] [ptag] = new UnaryRule(ArrayHelper.AllocateArray<double>(subTagCounts [ctag], subTagCounts [ptag]), ptag, ctag);
                urules [ctag] [ptag].ClearScore();
            }

            urules [ctag] [ptag].scores [subctag] [subptag] = s;
        }
Пример #15
0
        /// <summary>
        /// Recursively reconstructs the Viterbi parse subtree rooted at the given
        /// hypervertex/subtag by following the best-edge traces. Terminal vertices
        /// (and null vertices) yield null; DUMMY edges are transparent and return the
        /// child's subtree directly.
        /// </summary>
        /// <param name="v">Vertex to expand; may be null.</param>
        /// <param name="subtag">Latent subtag whose trace to follow at this vertex.</param>
        /// <returns>The reconstructed node, or null for terminal/null vertices.</returns>
        /// <exception cref="Exception">Thrown on an unrecognized edge type.</exception>
        private PhrasalNode ExtractViterbiParse(HyperVertex v, int subtag, TagSet tagSet)
        {
            if (v == null || v.TYPE == VTYPE.TERMINAL)
            {
                return null;
            }

            PhrasalNode node = new PhrasalNode();
            node.Tag = tagSet.GetTagString(v.tag);

            var bestEdge = v.traces[subtag].edge;

            // No recorded edge: leaf of the trace — return the bare tagged node.
            if (bestEdge == null)
            {
                return node;
            }

            switch (bestEdge.TYPE)
            {
                case ETYPE.BINARY:
                    // NOTE(review): if either recursive call returned null (terminal
                    // child), the Parent assignment below would throw — presumably
                    // binary edges never point at terminal vertices; confirm.
                    var l = ExtractViterbiParse(bestEdge.from0, v.traces[subtag].subtag0, tagSet);
                    var r = ExtractViterbiParse(bestEdge.from1, v.traces[subtag].subtag1, tagSet);
                    node.Children.Add(l);
                    node.Children.Add(r);
                    l.Parent = node;
                    r.Parent = node;
                    break;
                case ETYPE.UNARY:
                    var c = ExtractViterbiParse(bestEdge.from0, v.traces[subtag].subtag0, tagSet);
                    node.Children.Add(c);
                    c.Parent = node;
                    break;
                case ETYPE.TERMINAL:
                    // Preterminal: the word itself is attached later (AnnotateLex).
                    break;
                case ETYPE.DUMMY:
                    // Pass-through edge: replace this node with the child's subtree.
                    node = ExtractViterbiParse(bestEdge.from0, v.traces[subtag].subtag0, tagSet);
                    break;
                default:
                    throw new Exception("unknown edge type!");
            }
            return node;
        }
Пример #16
0
        /// <summary>
        /// Builds an initial (single-subtag) latent-annotation PCFG from a treebank:
        /// counts unary/binary rule occurrences, optionally jitters the counts with
        /// <paramref name="RNG"/>, normalizes per parent tag, and converts to log space.
        /// </summary>
        /// <param name="vocab">Word vocabulary used by the lexical model.</param>
        /// <param name="tagset">Tag inventory (preterminals + nonterminals).</param>
        /// <param name="treebank">Binarized trees; nodes must have at most 2 children.</param>
        /// <param name="rules">Receives the constructed grammar.</param>
        /// <param name="RNG">Optional noise source; when null, no noise is added
        /// (matching BuildLexSimple's null handling).</param>
        public static void Build(
            Vocabulary vocab,
            TagSet tagset,
            List<PhrasalTree> treebank,
            out LAPCFGrammar rules,
            Random RNG = null)
        {
            int tagCount = tagset.NTCount + tagset.PTCount;

            // pmass[p]: total count of rules rewriting parent tag p (normalizer).
            double[] pmass = new double[tagCount];

            // unaries[child][parent], binaries[left][right][parent]: raw counts.
            double[][] unaries = ArrayHelper.AllocateArray<double> (tagCount, tagCount);

            double[][][] binaries = ArrayHelper.AllocateArray<double> (tagCount, tagCount, tagCount);

            // Pass 1: accumulate rule counts from the treebank.
            foreach (var tree in treebank) {
                foreach (var node in tree.TreeNodes) {
                    if (node.Children.Count == 0) {
                        // terminals are handled by the lexical model below
                        continue;
                    } else if (node.Children.Count == 1) {
                        int pt = tagset.GetID (node.Tag);
                        int ct = tagset.GetID (node.Children [0].Tag);

                        pmass [pt] += 1.0;
                        unaries [ct] [pt] += 1.0;
                    } else if (node.Children.Count == 2) {
                        int pt = tagset.GetID (node.Tag);
                        int lt = tagset.GetID (node.Children [0].Tag);
                        int rt = tagset.GetID (node.Children [1].Tag);

                        pmass [pt] += 1.0;
                        binaries [lt] [rt] [pt] += 1.0;
                    } else {
                        throw new Exception ("tree node with more than 2 children!");
                    }
                }
            }

            // Pass 2: null out rows that were never observed, so later loops can
            // skip them and the grammar stays sparse.
            for (int c = 0; c < unaries.Length; ++c)
            {
                bool csurvive = false;
                for (int p = 0; p < unaries[c].Length; ++p)
                {
                    if (unaries[c][p] > 0)
                    {
                        csurvive = true;
                        break;
                    }
                }

                if (!csurvive)
                {
                    unaries[c] = null;
                }
            }

            for (int l = 0; l < binaries.Length; ++l)
            {
                bool lsurvive = false;
                for (int r = 0; r < binaries[l].Length; ++r)
                {
                    bool rsurvive = false;

                    for (int p = 0; p < binaries[l][r].Length; ++p)
                    {
                        if (binaries[l][r][p] > 0)
                        {
                            rsurvive = true;
                            break;
                        }
                    }

                    if (rsurvive)
                    {
                        lsurvive = true;
                    }
                    else
                    {
                        binaries[l][r] = null;
                    }
                }

                if (!lsurvive)
                {
                    binaries[l] = null;
                }
            }

            // Pass 3 (optional): jitter surviving counts to break symmetry.
            // BUGFIX: RNG defaults to null; the original dereferenced it
            // unconditionally here, throwing NullReferenceException when no
            // noise source was supplied.
            if (RNG != null)
            {
                foreach (var x in unaries.Where(x => x != null))
                {
                    for (int p = 0; p < x.Length; ++p)
                    {
                        double noise = RNG.NextDouble();
                        x[p] += noise;
                        pmass[p] += noise;
                    }
                }

                foreach (var x in binaries.Where(x => x != null))
                {
                    foreach (var y in x.Where(y => y != null))
                    {
                        for (int p = 0; p < y.Length; ++p)
                        {
                            double noise = RNG.NextDouble();
                            y[p] += noise;
                            pmass[p] += noise;
                        }
                    }
                }
            }

            // Pass 4: normalize counts into conditional probabilities P(rule | parent).
            for (int c = 0; c < tagCount; ++c) {
                if (unaries [c] == null) {
                    continue;
                }
                for (int p = 0; p < tagCount; ++p) {
                    if (pmass [p] == 0) {
                        continue;
                    }
                    unaries [c] [p] /= pmass [p];
                }
            }

            // Pass 5: move to log space; unseen rules get -inf.
            for (int c = 0; c < tagCount; ++c) {
                if (unaries [c] == null) {
                    continue;
                }
                for (int p = 0; p < tagCount; ++p) {
                    if (unaries [c] [p] <= 0) {
                        unaries [c] [p] = double.NegativeInfinity;
                    } else {
                        unaries [c] [p] = (double)Math.Log (unaries [c] [p]);
                    }
                }
            }

            for (int l = 0; l < tagCount; ++l) {
                if (binaries [l] == null) {
                    continue;
                }
                for (int r = 0; r < tagCount; ++r) {
                    if (binaries [l] [r] == null) {
                        continue;
                    }
                    for (int p = 0; p < tagCount; ++p) {
                        if (pmass [p] == 0) {
                            continue;
                        }
                        binaries [l] [r] [p] /= pmass [p];
                    }
                }
            }

            for (int l = 0; l < tagCount; ++l) {
                if (binaries [l] == null) {
                    continue;
                }
                for (int r = 0; r < tagCount; ++r) {
                    if (binaries [l] [r] == null) {
                        continue;
                    }

                    for (int p = 0; p < tagCount; ++p) {

                        if (binaries [l] [r] [p] <= 0) {
                            binaries [l] [r] [p] = double.NegativeInfinity;
                        } else {
                            binaries [l] [r] [p] = (double)Math.Log (binaries [l] [r] [p]);
                        }
                    }

                }
            }

            // Lexical (tag -> word) log-probabilities.
            var terminals = BuildLexSimple (treebank, tagset, vocab, RNG);

            rules = new LAPCFGrammar (tagset, binaries, unaries, terminals);
        }
Пример #17
0
        /// <summary>
        /// Runs one E-step over the treebank in parallel: builds a hypergraph for each
        /// tree, runs the inside (SumForward) and outside (SumBackward) passes, and
        /// accumulates expected rule counts into <paramref name="rules"/>.
        /// </summary>
        /// <param name="nthread">Number of worker threads (one grammar clone each).</param>
        /// <param name="treebank">Training trees; partitioned across threads by stride.</param>
        /// <param name="rules">Grammar whose posterior counts receive the totals.</param>
        /// <param name="vocab">Word vocabulary for the parser.</param>
        /// <param name="tagSet">Tag inventory for the parser.</param>
        /// <param name="failed">Receives the number of trees that failed to parse
        /// (non-finite root score or an exception during parsing).</param>
        /// <returns>Total log-likelihood over the successfully parsed trees.</returns>
        private static double ParseGraphAndCollect(int nthread,
            List<PhrasalTree> treebank,
            LAPCFGrammar rules,
            Vocabulary vocab,
            TagSet tagSet,
            out int failed)
        {
            double llhd = 0;
            failed = 0;

            int xfail = 0;
            // Lock object guarding the cross-thread accumulation of llhd/xfail.
            var handle = new object();
            // One grammar per thread; clones share parameters but keep separate
            // posterior-count buffers, merged after the parallel loop.
            var rulelist = new List<LAPCFGrammar>();
            rulelist.Add(rules);
            while (rulelist.Count < nthread)
            {
                rulelist.Add(rules.CloneWithSharedParameters());
            }
            Parallel.For(0, nthread, threadid =>
            {
                int fail = 0;
                double xllhd = 0;
                var parser = new HyperGraphParser(vocab, tagSet, rulelist [threadid]);
                // Strided partition: thread t handles trees t, t+nthread, t+2*nthread, ...
                for (int i = threadid; i < treebank.Count; i += nthread)
                {
                    try
                    {
                        var graph = parser.BuildHyperGraph(treebank [i]);

                        graph.SumForward();
                        graph.SumBackward();

                        // A non-finite root score means the tree got no probability
                        // mass under the current grammar; skip it.
                        if (double.IsInfinity(graph.RootScore) || double.IsNaN(graph.RootScore))
                        {
                            fail += 1;
                            continue;
                        }

                        graph.CollectExpectedCount();
                        xllhd += graph.RootScore;
                    } catch
                    {
                        // Best-effort: any per-tree failure is counted, not fatal.
                        fail += 1;
                    }

                }

                // Fold this thread's totals into the shared accumulators.
                lock (handle)
                {
                    xfail += fail;
                    llhd += xllhd;
                }
            }
            );

            // Merge the per-thread posterior counts (terminal, unary, binary)
            // from the clones back into the primary grammar at index 0.
            for (int i = 1; i < rulelist.Count; ++i)
            {
                LAPCFGrammar.ApplyToRules((x, y) => x.Add(y), rules.tposteriorCounts, rulelist [i].tposteriorCounts);
                LAPCFGrammar.ApplyToRules((x, y) => x.Add(y), rules.uposteriorCounts, rulelist [i].uposteriorCounts);
                LAPCFGrammar.ApplyToRules((x, y) => x.Add(y), rules.bposteriorCounts, rulelist [i].bposteriorCounts);
            }
            failed = xfail;
            //Console.Error.WriteLine("fail: {0}\tllhd: {1}", failed, llhd);
            return llhd;
        }
Пример #18
0
        /// <summary>
        /// Estimates a simple lexical model from the treebank: for each word id,
        /// the log relative frequency of each preterminal tag emitting it.
        /// </summary>
        /// <param name="treebank">Trees providing (tag, word) observations at the leaves.</param>
        /// <param name="tagSet">Tag inventory; only preterminal tags are used.</param>
        /// <param name="vocab">Maps surface forms to word ids (rare words may map to UNK signatures).</param>
        /// <param name="RNG">Optional noise source; when null each observation counts exactly 1.</param>
        /// <returns>scores[wordId][tagId] = log P(word | tag), or -inf if unseen.</returns>
        public static double[][] BuildLexSimple(List<PhrasalTree> treebank, TagSet tagSet, Vocabulary vocab, Random RNG)
        {
            int PTCount = tagSet.PTCount;
            int vocabCount = vocab.VocabSize;
            // tagWordCounts[tag][word]: (possibly jittered) emission counts.
            double[][] tagWordCounts = ArrayHelper.AllocateArray<double>(PTCount, vocabCount);
            double[] tagCounts = new double[PTCount];

            // NOTE: the original also built per-tag HashSet<string> word-type sets
            // here, but they were never read; that dead structure is removed.

            foreach (var tree in treebank)
            {
                tree.ComputeStartEnd();
                foreach (var node in tree.TreeNodes)
                {
                    if (node.Children.Count == 0)
                    {
                        string word = SimpleTokenizor.ETokenize(node.Lex);
                        string tag = node.Tag;

                        int tagId = tagSet.GetPTID(tag);

                        // Sentence-initial tokens (Start == 0) may get a different
                        // id (e.g. for capitalization handling) — decided by vocab.
                        int wordId = vocab.GetId(word, node.Start == 0);

                        // Jitter each count slightly (±0.5%) when an RNG is given,
                        // to break symmetry between latent subtags.
                        double weight = RNG == null ? 1.0 : 1.0 + (RNG.NextDouble() - 0.5) / 100;

                        tagWordCounts[tagId][wordId] += weight;
                        tagCounts[tagId] += weight;
                    }
                }
            }

            // Note the transposed layout: scores are indexed [word][tag].
            double[][] scores = ArrayHelper.AllocateArray<double>(vocabCount, PTCount);

            ArrayHelper.Fill(scores, double.NegativeInfinity);

            for (int word = 0; word < scores.Length; ++word)
            {
                for (int tag = 0; tag < scores[word].Length; ++tag)
                {
                    if (tagWordCounts[tag][word] > 0)
                    {
                        scores[word][tag] = (double)Math.Log(tagWordCounts[tag][word] / tagCounts[tag]);
                    }
                }
            }

            return scores;
        }
Пример #19
0
        /// <summary>
        /// Computes per-position POS-tag constraints for rare/unknown words using
        /// morphological analysis. A null entry means "no constraint" for that word.
        /// </summary>
        /// <param name="vocab">Used to detect rare/unknown word ids.</param>
        /// <param name="tagSet">Maps tag strings (NN, VBZ, ...) to ids.</param>
        /// <param name="words">Surface forms of the sentence.</param>
        /// <param name="wids">Word ids parallel to <paramref name="words"/>.</param>
        /// <returns>Array of per-position boolean masks over preterminal tags, or null entries.</returns>
        private static bool[][] AssignTagConstraints(Vocabulary vocab, TagSet tagSet, string[] words, int[] wids)
        {
            var allowed = new bool[wids.Length][];

            for (int pos = 0; pos < wids.Length; ++pos)
            {
                // Frequent words are left unconstrained (null mask).
                if (!vocab.IsRareOrUNK(wids[pos]))
                {
                    continue;
                }

                var lemmas = EMorph.EnglishMorph.GetBaseForm(words[pos]);

                // No morphological analysis available: leave unconstrained.
                if (lemmas == null || lemmas.Count == 0)
                {
                    continue;
                }

                var mask = new bool[tagSet.PTCount];
                allowed[pos] = mask;

                // Capitalized unknown words may be proper nouns.
                if (char.IsUpper(words[pos][0]))
                {
                    mask[tagSet.GetID("NNP")] = true;
                    mask[tagSet.GetID("NNPS")] = true;
                }

                foreach (var lemma in lemmas)
                {
                    switch (lemma.PoS)
                    {
                        case EMorph.MorphPoS.NN:
                            mask[tagSet.GetID("NN")] = true;
                            var lower = words[pos].ToLower();
                            // Invariant-plural nouns (and -ese/-ise forms) can be NNS too.
                            if (EMorph.EnglishMorph.IsNoChangeNoun(lower)
                                || lower.EndsWith("ese") || lower.EndsWith("ise"))
                            {
                                mask[tagSet.GetID("NNS")] = true;
                            }
                            break;
                        case EMorph.MorphPoS.NNS:
                            mask[tagSet.GetID("NNS")] = true;
                            break;
                        case EMorph.MorphPoS.JJ:
                            mask[tagSet.GetID("JJ")] = true;
                            break;
                        case EMorph.MorphPoS.JJR:
                            mask[tagSet.GetID("JJR")] = true;
                            break;
                        case EMorph.MorphPoS.JJS:
                            mask[tagSet.GetID("JJS")] = true;
                            break;
                        case EMorph.MorphPoS.RB:
                            mask[tagSet.GetID("RB")] = true;
                            break;
                        case EMorph.MorphPoS.RBR:
                            mask[tagSet.GetID("RBR")] = true;
                            break;
                        case EMorph.MorphPoS.RBS:
                            mask[tagSet.GetID("RBS")] = true;
                            break;
                        case EMorph.MorphPoS.VB:
                            // Base verbs can be infinitive (VB) or non-3sg present (VBP).
                            mask[tagSet.GetID("VB")] = true;
                            mask[tagSet.GetID("VBP")] = true;
                            break;
                        case EMorph.MorphPoS.VBD:
                            // Past forms are ambiguous between simple past and participle.
                            mask[tagSet.GetID("VBD")] = true;
                            mask[tagSet.GetID("VBN")] = true;
                            break;
                        case EMorph.MorphPoS.VBG:
                            mask[tagSet.GetID("VBG")] = true;
                            break;
                        case EMorph.MorphPoS.VBZ:
                            mask[tagSet.GetID("VBZ")] = true;
                            break;
                        default:
                            throw new Exception("not recognized morph lemma!");
                    }
                }
            }
            return allowed;
        }
Пример #20
0
        /// <summary>
        /// Loads a latent-annotation PCFG from text files: a lexicon file of
        /// "tag word p0 p1 ..." lines and a rule file of "parent -> children p" lines.
        /// Probabilities are converted to log space before being handed to the
        /// rule builders. Every tag gets 2 latent subtags except ROOT, which gets 1.
        /// </summary>
        /// <param name="tagset">Tag inventory (counts and ROOT id).</param>
        /// <param name="vocab">Word vocabulary for terminal rules.</param>
        /// <param name="lexfile">Path to the lexicon file.</param>
        /// <param name="rulefile">Path to the grammar-rule file.</param>
        public LAPCFGrammar(TagSet tagset, Vocabulary vocab, string lexfile, string rulefile)
        {
            // BUGFIX: numeric parsing/formatting now uses the invariant culture.
            // The model files are machine-generated with '.' decimals; under a
            // decimal-comma locale the original culture-sensitive Parse/ToString
            // would misread or corrupt probabilities. "R" keeps full precision.
            var inv = System.Globalization.CultureInfo.InvariantCulture;

            NTCount = tagset.NTCount;
            PTCount = tagset.PTCount;
            ROOTID = tagset.ROOTID;

            subTagCounts = new int[TotalTagCount];
            ArrayHelper.Fill(subTagCounts, 2);

            // ROOT is never split into subtags.
            subTagCounts[ROOTID] = 1;

            var trace = subTagCounts.Select(x => new int[x]).ToArray();

            subtagTraces.Add(trace);

            using (var sr = new System.IO.StreamReader(lexfile))
            {
                while (!sr.EndOfStream)
                {
                    string line = sr.ReadLine();
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }
                    string[] parts = line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

                    string tag = parts[0];
                    string word = parts[1];

                    // Remaining columns are per-subtag emission probabilities.
                    for (int i = 0; i < parts.Length - 2; ++i)
                    {
                        double p = double.Parse(parts[i + 2], inv);
                        if (p <= 0)
                        {
                            continue;
                        }

                        string pline = string.Format("{0}_{1}\t{2}\t{3}", tag, i, word, Math.Log(p).ToString("R", inv));

                        BuildTerminalRule(pline, vocab, tagset);
                    }
                }
            }

            using (var sr = new System.IO.StreamReader(rulefile))
            {
                while (!sr.EndOfStream)
                {
                    string line = sr.ReadLine();

                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    string[] parts = line.Split(new string[] { " ", "\t", "->" }, StringSplitOptions.RemoveEmptyEntries);

                    // Last column is the probability; replace it in place with its log.
                    int xlen = parts.Length - 1;

                    parts[xlen] = Math.Log(double.Parse(parts[xlen], inv)).ToString("R", inv);

                    string pline = string.Join("\t", parts);

                    // 3 columns => unary rule (parent, child, logp); otherwise binary.
                    if (parts.Length == 3)
                    {
                        BuildUnaryRule(pline, tagset);
                    }
                    else
                    {
                        BuildBinaryRule(pline, tagset);
                    }

                }
            }
        }