public void BuildHyperGraph(LAPCFGrammar grammar, HyperEdgePool epool, HyperVertexPool vpool, int[] tagCapacity = null)
{
    epool.Recycle();
    vpool.Recycle();
    this.ROOTID = grammar.ROOTID;
    var maxSubTag = grammar.subTagCounts.Max();
    lbuf = new double[maxSubTag + 1];

    // CYK: fill the diagonal (span-1) cells with lexical matches, then close
    // each cell under unary rules.
    for (int i = 0; i < wids.Length; ++i)
    {
        bool isRoot = i == 0 && i == wids.Length - 1;
        chart[i, i] = new HyperCell(i, i + 1, grammar.TotalTagCount);
        MatchLexicon(grammar, chart[i, i], wids[i], epool, vpool, tagCapacity,
                     allowedPoSTags == null ? null : allowedPoSTags[i],
                     rawTagProbs == null ? null : rawTagProbs[i], isRoot);
        MatchUnaryRules(grammar, chart[i, i], epool, vpool, tagCapacity, isRoot);
        chart[i, i].Finish();
    }

    // Longer spans: binary rules over every split point, then one round of unary rules.
    for (int spanL = 2; spanL <= wids.Length; ++spanL)
    {
        for (int beg = 0; beg + spanL <= wids.Length; ++beg)
        {
            int end = beg + spanL;
            int l = beg;
            int r = end - 1;
            bool isRoot = l == 0 && r == wids.Length - 1;
            chart[l, r] = new HyperCell(beg, end, grammar.TotalTagCount);

            for (int mid = l; mid < r; ++mid)
            {
                MatchBinaryRules(grammar, chart[l, r], chart[l, mid], chart[mid + 1, r], epool, vpool, tagCapacity, isRoot);
            }

            // Promote level-1 vertices (binary results) to level 2 so they can
            // serve as children of unary rules.
            for (int i = 0; i < chart[l, r].l1v.Length; ++i)
            {
                var c = chart[l, r].l1v[i];
                if (c != null)
                {
                    if (isRoot && c.tag != ROOTID)
                    {
                        continue;
                    }
                    chart[l, r].l2v[i] = vpool.Allocate(false, c.tag, c.beta.Length, c.beta.v.Length);
                    epool.Allocate(chart[l, r].l2v[i], c);
                }
            }

            MatchUnaryRules(grammar, chart[l, r], epool, vpool, tagCapacity, isRoot);
            chart[l, r].Finish();
        }
    }
}
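// A minimal usage sketch (not part of the original source): single-grammar
// parsing over this hypergraph, skipping the coarse-to-fine pruning passes
// shown in TestParse below. It only uses calls that appear elsewhere in this
// file (SumForward, SumBackward, PosteriorViterbi, ExtractPosteriorViterbi);
// passing the grammar's own subTagCounts as the capacity hint is illustrative.
static void ParseOnceSketch(int[] wids, string[] words, LAPCFGrammar grammar,
                            TagSet tagSet, HyperEdgePool epool, HyperVertexPool vpool)
{
    bool[][] allowedTags = null;                        // no PoS constraints
    var parser = new ChartHyperGraphParser(wids, allowedTags);
    parser.BuildHyperGraph(grammar, epool, vpool, grammar.subTagCounts);
    parser.SumForward();                                // inside pass
    parser.SumBackward(true);                           // outside pass with posteriors
    parser.PosteriorViterbi();                          // max-rule decoding
    var ptree = parser.ExtractPosteriorViterbi(words, tagSet);
    ptree.ComputeStartEnd();
    Console.Error.WriteLine(ptree.TextTree);
}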
private static void CreateMergeMapping(LAPCFGrammar rules, List<MergeHelper> mergeCands,
    out int[][] subtagMap, out bool[][] isMerged, out int[] newSubTagCounts)
{
    // Start from the identity mapping: every subtag maps to itself.
    subtagMap = new int[rules.TotalTagCount][];
    isMerged = new bool[rules.TotalTagCount][];
    for (int i = 0; i < subtagMap.Length; ++i)
    {
        subtagMap[i] = new int[rules.GetSubTagCount(i)];
        isMerged[i] = new bool[rules.GetSubTagCount(i)];
        for (int j = 0; j < subtagMap[i].Length; ++j)
        {
            subtagMap[i][j] = j;
        }
    }

    newSubTagCounts = new int[rules.TotalTagCount];
    for (int i = 0; i < newSubTagCounts.Length; ++i)
    {
        newSubTagCounts[i] = rules.GetSubTagCount(i);
    }

    // Merge the half of the candidates with the smallest likelihood loss
    // (mergeCands is sorted ascending by loss before this call). Candidate
    // (tag, subtag) collapses the sibling pair (2*subtag, 2*subtag + 1), and
    // every later subtag of that tag shifts down by one.
    for (int i = 0; i < mergeCands.Count / 2; ++i)
    {
        var cand = mergeCands[i];
        var t = cand.tag;
        var xt = cand.subtag;
        var lt = xt * 2;
        isMerged[t][lt] = true;
        isMerged[t][lt + 1] = true;
        newSubTagCounts[t] -= 1;
        for (int subt = lt + 1; subt < subtagMap[t].Length; ++subt)
        {
            subtagMap[t][subt] -= 1;
        }
    }
}
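// Worked example (illustrative note, not in the original source): a tag t with
// four subtags and a single merge candidate for sibling pair 0. Starting from
// the identity map [0, 1, 2, 3], merging pair 0 yields
//
//   subtagMap[t]       = [0, 0, 1, 2]   // old subtags 0 and 1 share new subtag 0
//   isMerged[t]        = [true, true, false, false]
//   newSubTagCounts[t] = 3
//
// MergeRuleTable below uses isMerged to weight the two halves of a merged pair
// by their tagProb weights before log-adding their scores into the shared slot.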
public LAPCFGrammar SplitSymbols(Random RNG, double randomness)
{
    // Double every symbol's subtag count, except ROOT, which stays unsplit.
    int[] newSubTagCounts = new int[subTagCounts.Length];
    for (int tid = 0; tid < newSubTagCounts.Length; ++tid)
    {
        if (tid == ROOTID)
        {
            newSubTagCounts[tid] = subTagCounts[tid];
        }
        else
        {
            newSubTagCounts[tid] = subTagCounts[tid] * 2;
        }
    }

    var newbRules = brules.Select(
        x => x == null ? null : x.Select(
            y => y == null ? null : y.Select(
                z => z == null ? null : z.SplitSymbols(subTagCounts, newSubTagCounts, RNG, randomness)
            ).ToArray()
        ).ToArray()
    ).ToArray();
    var newuRules = urules.Select(
        x => x == null ? null : x.Select(
            y => y == null ? null : y.SplitSymbols(subTagCounts, newSubTagCounts, RNG, randomness)
        ).ToArray()
    ).ToArray();
    var newtRules = trules.Select(
        x => x == null ? null : x.Select(
            y => y == null ? null : y.SplitSymbols(subTagCounts, newSubTagCounts, RNG, randomness)
        ).ToArray()
    ).ToArray();

    var newTable = new LAPCFGrammar();
    newTable.NTCount = NTCount;
    newTable.PTCount = PTCount;
    newTable.ROOTID = ROOTID;
    newTable.brules = newbRules;
    newTable.urules = newuRules;
    newTable.trules = newtRules;
    newTable.subTagCounts = newSubTagCounts;
    newTable.InitializeExpectedCounts();

    foreach (var trace in subtagTraces)
    {
        newTable.subtagTraces.Add(trace);
    }

    // Record how each new subtag maps back to its parent subtag: j / 2 for
    // split symbols, identity for unsplit ones.
    int[][] newTrace = new int[TotalTagCount][];
    for (int i = 0; i < newTrace.Length; ++i)
    {
        newTrace[i] = new int[newSubTagCounts[i]];
        int splitFactor = newSubTagCounts[i] == subTagCounts[i] ? 1 : 2;
        for (int j = 0; j < newTrace[i].Length; ++j)
        {
            newTrace[i][j] = j / splitFactor;
        }
    }
    newTable.subtagTraces.Add(newTrace);
    return newTable;
}
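// A hedged sketch (not in the original source) of one split-merge round,
// composed only from methods defined in this file: SplitSymbols above,
// ParseGraphAndCollect / CalculateNewScores for EM, and MergeSymbols. The
// iteration count, the randomness value 0.01, and the 0.5 merge fraction are
// illustrative, and the static helpers are assumed to be in scope here (in the
// original project they may live on different classes).
static LAPCFGrammar SplitMergeRoundSketch(LAPCFGrammar grammar, List<PhrasalTree> treebank,
                                          Vocabulary vocab, TagSet tagSet, Random rng, int nthread)
{
    // Split every latent symbol in two (ROOT stays unsplit).
    var split = grammar.SplitSymbols(rng, 0.01);

    // A few EM iterations: collect expected counts, then renormalize.
    for (int iter = 0; iter < 10; ++iter)
    {
        int failed;
        double llhd = ParseGraphAndCollect(nthread, treebank, split, vocab, tagSet, out failed);
        CalculateNewScores(split);
        Console.Error.WriteLine("EM iter {0}: llhd = {1:F2}, failed = {2}", iter, llhd, failed);
    }

    // Merge back the pairs whose removal costs the least likelihood.
    return MergeSymbols(0.5, vocab, tagSet, split, treebank, nthread);
}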
static void TestParse()
{
    string modelfile =
        //@"/home/nan/Data/PTB/ptb.s2.smoothed.grammar";
        @"D:\user\nyang\data\treebank\English\pcfg\ptb.s6.smoothed.grammar";
    string tagmapfile = @"E:\nyang\TFS\Common\Users\v-nayang\LAPCFGParser\en-tag-map.txt";

    Vocabulary vocab;
    TagSet tagSet;
    LAPCFGrammar grammar;
    using (var s = new TextModelReader(modelfile))
    {
        vocab = Vocabulary.LoadFromStream(s);
        tagSet = TagSet.LoadFromStream(s);
        grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
    }
    grammar.Smoothing(0.1f);
    grammar.MakeCompaction();
    grammar.MakeSubruleCompaction();

    // Recover the whole grammar hierarchy (one grammar per split level) by
    // projecting the finest grammar down through its subtag traces.
    var grammars = new LAPCFGrammar[grammar.subtagTraces.Count + 1];
    var traces = new int[grammars.Length][][];
    grammars[grammars.Length - 1] = grammar;
    for (int i = grammars.Length - 1; i >= 1; --i)
    {
        traces[i] = grammar.subtagTraces[i - 1];
        grammars[i - 1] = grammars[i].ProjectGrammar(traces[i]);
        grammars[i - 1].MakeCompaction();
        grammars[i - 1].MakeSubruleCompaction();
    }

    // Read the tag map: one line per tag, one column per coarseness tier.
    string[][] tagTiers;
    using (StreamReader sr = new StreamReader(tagmapfile))
    {
        var tt = new List<string[]>();
        while (!sr.EndOfStream)
        {
            string line = sr.ReadLine();
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            tt.Add(line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries));
        }
        // Transpose into tagTiers[tier][tag].
        tagTiers = new string[tt[0].Length][]; //tt.ToArray();
        for (int i = 0; i < tagTiers.Length; ++i)
        {
            tagTiers[i] = new string[tt.Count];
            for (int j = 0; j < tt.Count; ++j)
            {
                tagTiers[i][j] = tt[j][i];
            }
        }
    }

    var cbs = new CodeBook32[tagTiers.Length];
    for (int i = 0; i < cbs.Length; ++i)
    {
        cbs[i] = new CodeBook32();
        foreach (var t in tagTiers[i])
        {
            cbs[i].Add(t);
        }
    }

    // Build tag maps from each tier to the next coarser one; preterminals and
    // ROOT map to themselves.
    int pgcount = cbs.Length - 1;
    int[][] tagMaps = new int[pgcount][];
    for (int i = 0; i < tagMaps.Length; ++i)
    {
        tagMaps[i] = new int[grammars[0].PTCount + 1 + cbs[i + 1].Count];
        for (int j = 0; j < grammars[0].PTCount + 1; ++j)
        {
            tagMaps[i][j] = j;
        }
    }

    var lastMap = tagMaps[tagMaps.Length - 1];
    for (int j = grammars[0].PTCount + 1; j < lastMap.Length; ++j)
    {
        string tstr = tagSet.GetTagString(j);
        int id = cbs[cbs.Length - 1][tstr];
        int pid = cbs[cbs.Length - 2][tagTiers[tagTiers.Length - 2][id]];
        lastMap[j] = pid + grammars[0].PTCount + 1;
    }

    for (int i = 0; i < tagMaps.Length - 1; ++i)
    {
        for (int j = grammars[0].PTCount + 1; j < tagMaps[i].Length; ++j)
        {
            string tstr = cbs[i + 1][j - grammars[0].PTCount - 1];
            int xid = Array.IndexOf(tagTiers[i + 1], tstr);
            string pstr = tagTiers[i][xid];
            int pid = cbs[i][pstr];
            tagMaps[i][j] = pid;
        }
    }

    // Collapse the coarsest latent grammar into progressively smaller
    // non-terminal sets for the coarse-to-fine cascade.
    var cgrammars = new LAPCFGrammar[tagMaps.Length];
    cgrammars[cgrammars.Length - 1] =
        grammars[0].CollapseNonTerminals(tagMaps[cgrammars.Length - 1], 1 + cbs[cgrammars.Length - 1].Count);
    for (int i = cgrammars.Length - 1; i >= 1; --i)
    {
        cgrammars[i - 1] = cgrammars[i].CollapseNonTerminals(tagMaps[i - 1], 1 + cbs[i - 1].Count);
    }
    for (int i = 0; i < cgrammars.Length; ++i)
    {
        cgrammars[i].MakeCompaction();
        cgrammars[i].MakeSubruleCompaction();
    }

    HyperEdgePool epool = new HyperEdgePool(1024 * 1024);
    HyperVertexPool vpool = new HyperVertexPool(grammars[grammars.Length - 1].subTagCounts.Max());
    EMorph.EnglishMorph.WarmUp();
    Console.Error.WriteLine("READY");

    while (true)
    {
        string line = Console.ReadLine();
        if (string.IsNullOrWhiteSpace(line))
        {
            continue;
        }
        var words = line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
        int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();
        wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);

        // For rare/unknown words, restrict the candidate PoS tags using the
        // morphological analyzer.
        bool[][] allowedTags = new bool[wids.Length][];
        for (int i = 0; i < wids.Length; ++i)
        {
            if (vocab.IsRareOrUNK(wids[i]))
            {
                var lemmas = EMorph.EnglishMorph.GetBaseForm(words[i]);
                if (lemmas == null || lemmas.Count == 0)
                {
                    continue;
                }
                allowedTags[i] = new bool[tagSet.PTCount];
                if (char.IsUpper(words[i][0]))
                {
                    allowedTags[i][tagSet.GetID("NNP")] = true;
                    allowedTags[i][tagSet.GetID("NNPS")] = true;
                }
                foreach (var lemma in lemmas)
                {
                    switch (lemma.PoS)
                    {
                        case EMorph.MorphPoS.NN:
                            allowedTags[i][tagSet.GetID("NN")] = true;
                            allowedTags[i][tagSet.GetID("NNS")] = true;
                            break;
                        case EMorph.MorphPoS.NNS:
                            allowedTags[i][tagSet.GetID("NNS")] = true;
                            allowedTags[i][tagSet.GetID("NN")] = true;
                            break;
                        case EMorph.MorphPoS.JJ:
                            allowedTags[i][tagSet.GetID("JJ")] = true;
                            break;
                        case EMorph.MorphPoS.JJR:
                            allowedTags[i][tagSet.GetID("JJR")] = true;
                            break;
                        case EMorph.MorphPoS.JJS:
                            allowedTags[i][tagSet.GetID("JJS")] = true;
                            break;
                        case EMorph.MorphPoS.RB:
                            allowedTags[i][tagSet.GetID("RB")] = true;
                            break;
                        case EMorph.MorphPoS.RBR:
                            allowedTags[i][tagSet.GetID("RBR")] = true;
                            break;
                        case EMorph.MorphPoS.RBS:
                            allowedTags[i][tagSet.GetID("RBS")] = true;
                            break;
                        case EMorph.MorphPoS.VB:
                            allowedTags[i][tagSet.GetID("VB")] = true;
                            allowedTags[i][tagSet.GetID("VBP")] = true;
                            break;
                        case EMorph.MorphPoS.VBD:
                            allowedTags[i][tagSet.GetID("VBD")] = true;
                            allowedTags[i][tagSet.GetID("VBN")] = true;
                            break;
                        case EMorph.MorphPoS.VBG:
                            allowedTags[i][tagSet.GetID("VBG")] = true;
                            break;
                        case EMorph.MorphPoS.VBZ:
                            allowedTags[i][tagSet.GetID("VBZ")] = true;
                            break;
                        default:
                            throw new Exception("unrecognized morph lemma!");
                    }
                }
            }
        }

        try
        {
            // Coarse-to-fine: parse with the collapsed grammars first, pruning
            // after each pass, then expand level by level up to the full grammar.
            var parser = new ChartHyperGraphParser(wids, allowedTags);
            parser.BuildHyperGraph(cgrammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            parser.SumForward();
            parser.SumBackward(false);
            parser.Prune(-10.0);
            parser.Purge();
            for (int i = 1; i < cgrammars.Length; ++i)
            {
                parser.ExpandHyperGraph(cgrammars[i], tagMaps[i - 1], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-10.0);
                parser.Purge();
            }
            parser.ExpandHyperGraph(grammars[0], tagMaps[2], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            for (int i = 0; i < grammars.Length - 1; ++i)
            {
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-8.0);
                parser.Purge();
                parser.ProjectGrammar(traces[i + 1], grammars[i + 1]);
            }
            parser.SumForward();
            parser.SumBackward(true);
            parser.PosteriorViterbi();
            var ptree = parser.ExtractPosteriorViterbi(words, tagSet);
            PennTreeProcessor.RecoverFromXBarBinarize(ptree.Root);
            ptree.ComputeStartEnd();
            string treeline = ptree.TextTree;
            string[] xlines = treeline.Split(new string[] { "\n", "\r", "\r\n" }, StringSplitOptions.RemoveEmptyEntries);
            foreach (var xline in xlines)
            {
                Console.Error.WriteLine(xline);
            }
        }
        catch
        {
            Console.Error.WriteLine("Failure to parse!");
        }
    }
}
private static double ParseGraphAndCollect(int nthread, List<PhrasalTree> treebank, LAPCFGrammar rules,
    Vocabulary vocab, TagSet tagSet, out int failed)
{
    double llhd = 0;
    failed = 0;
    int xfail = 0;
    var handle = new object();

    // One grammar per thread: clones share the rule parameters but keep their
    // own expected-count accumulators.
    var rulelist = new List<LAPCFGrammar>();
    rulelist.Add(rules);
    while (rulelist.Count < nthread)
    {
        rulelist.Add(rules.CloneWithSharedParameters());
    }

    Parallel.For(0, nthread, threadid =>
    {
        int fail = 0;
        double xllhd = 0;
        var parser = new HyperGraphParser(vocab, tagSet, rulelist[threadid]);
        for (int i = threadid; i < treebank.Count; i += nthread)
        {
            try
            {
                var graph = parser.BuildHyperGraph(treebank[i]);
                graph.SumForward();
                graph.SumBackward();
                if (double.IsInfinity(graph.RootScore) || double.IsNaN(graph.RootScore))
                {
                    fail += 1;
                    continue;
                }
                graph.CollectExpectedCount();
                xllhd += graph.RootScore;
            }
            catch
            {
                fail += 1;
            }
        }
        lock (handle)
        {
            xfail += fail;
            llhd += xllhd;
        }
    });

    // Fold the per-thread posterior counts back into the main grammar.
    for (int i = 1; i < rulelist.Count; ++i)
    {
        LAPCFGrammar.ApplyToRules((x, y) => x.Add(y), rules.tposteriorCounts, rulelist[i].tposteriorCounts);
        LAPCFGrammar.ApplyToRules((x, y) => x.Add(y), rules.uposteriorCounts, rulelist[i].uposteriorCounts);
        LAPCFGrammar.ApplyToRules((x, y) => x.Add(y), rules.bposteriorCounts, rulelist[i].bposteriorCounts);
    }

    failed = xfail;
    //Console.Error.WriteLine("fail: {0}\tllhd: {1}", failed, llhd);
    return llhd;
}
private static void MatchUnaryRules(
    LAPCFGrammar grammar,
    HyperCell cell,
    HyperEdgePool epool,
    HyperVertexPool vpool,
    int[] tagCapacity,
    bool isRoot)
{
    foreach (var cv in cell.l1v)
    {
        if (cv == null)
        {
            continue;
        }
        var rules = grammar.urules[cv.tag];
        if (rules != null)
        {
            foreach (var rule in rules)
            {
                if (rule == null)
                {
                    break;
                }
                if (rule.ptag == grammar.ROOTID && !isRoot)
                {
                    continue;
                }
                if (isRoot && rule.ptag != grammar.ROOTID)
                {
                    continue;
                }
                if (cell.l2v[rule.ptag] == null)
                {
                    var cap = tagCapacity == null ? -1 : tagCapacity[rule.ptag];
                    cell.l2v[rule.ptag] = vpool.Allocate(false, rule.ptag, grammar.GetSubTagCount(rule.ptag), cap);
                }
                epool.Allocate(cell.l2v[rule.ptag], cv, rule.scores, null);
            }
        }
    }
}
private static void MatchBinaryRules(
    LAPCFGrammar grammar,
    HyperCell pcell,
    HyperCell lcell,
    HyperCell rcell,
    HyperEdgePool epool,
    HyperVertexPool vpool,
    int[] tagCapacity,
    bool isRoot)
{
    foreach (var lv in lcell.l2v)
    {
        var rprules = grammar.brules[lv.tag];
        if (rprules == null)
        {
            continue;
        }
        foreach (var rv in rcell.l2v)
        {
            var prules = rprules[rv.tag];
            if (prules == null)
            {
                continue;
            }
            for (int p = 0; p < prules.Length; ++p)
            {
                var rule = prules[p];
                if (rule == null)
                {
                    break;
                }
                if (rule.ptag == grammar.ROOTID && !isRoot)
                {
                    continue;
                }
                if (pcell.l1v[rule.ptag] == null)
                {
                    var cap = tagCapacity == null ? -1 : tagCapacity[rule.ptag];
                    pcell.l1v[rule.ptag] = vpool.Allocate(false, rule.ptag, grammar.GetSubTagCount(rule.ptag), cap);
                }
                epool.Allocate(pcell.l1v[rule.ptag], lv, rv, rule.scores, null);
            }
        }
    }
}
public void ProjectGrammar(int[][] trace, LAPCFGrammar grammar)
{
    int maxSubTag = grammar.subTagCounts.Max();
    lbuf = new double[maxSubTag + 1];
    for (int spanL = 1; spanL <= wids.Length; ++spanL)
    {
        for (int beg = 0; beg + spanL <= wids.Length; ++beg)
        {
            int end = beg + spanL;
            int l = beg;
            int r = end - 1;
            foreach (var v in chart[l, r].l1v)
            {
                if (v != null)
                {
                    v.ProjectGrammar(trace, grammar);
                }
            }
            foreach (var v in chart[l, r].l2v)
            {
                if (v != null)
                {
                    v.ProjectGrammar(trace, grammar);
                }
            }
        }
    }
}
public HyperGraphParser(Vocabulary vocab, TagSet tagset, LAPCFGrammar rules)
{
    this.vocab = vocab;
    this.tagset = tagset;
    this.rules = rules;
}
public LAPCFGrammar CreateRuleTable(int[] newSubTagCounts)
{
    var table = new LAPCFGrammar();
    table.NTCount = NTCount;
    table.PTCount = PTCount;
    table.ROOTID = ROOTID;
    table.subTagCounts = newSubTagCounts;
    table.brules = brules.Select(
        x => x == null ? null : x.Select(
            y => y == null ? null : y.Select(
                z => z == null ? null : z.CreateRule(newSubTagCounts)
            ).ToArray()
        ).ToArray()
    ).ToArray(); //ArrayHelper.Clone<BinaryRule>(rules);
    table.urules = urules.Select(
        x => x == null ? null : x.Select(
            y => y == null ? null : y.CreateRule(newSubTagCounts)
        ).ToArray()
    ).ToArray();
    table.trules = trules.Select(
        x => x == null ? null : x.Select(
            y => y == null ? null : y.CreateRule(newSubTagCounts)
        ).ToArray()
    ).ToArray();
    return table;
}
public LAPCFGrammar CollapseNonTerminals(int[] tagMap, int newNTCount)
{
    var pg = new LAPCFGrammar();
    pg.NTCount = newNTCount;
    pg.PTCount = PTCount;
    pg.ROOTID = ROOTID;
    pg.subTagCounts = new int[PTCount + newNTCount];
    ArrayHelper.Fill(pg.subTagCounts, 1);

    // Symbol probabilities of the fine grammar (sprobs) and their sums under
    // the collapsed symbols (psprobs); both weight and renormalize rule scores.
    var sprobs = ComputeSymbolProb();
    var psprobs = pg.subTagCounts.Select(x => new double[x]).ToArray();
    ArrayHelper.Fill(psprobs, double.NegativeInfinity);
    for (int i = 0; i < PTCount; ++i)
    {
        for (int j = 0; j < subTagCounts[i]; ++j)
        {
            psprobs[i][0] = MathHelper.LogAdd(psprobs[i][0], sprobs[i][j]);
        }
    }
    for (int i = PTCount; i < TotalTagCount; ++i)
    {
        for (int j = 0; j < subTagCounts[i]; ++j)
        {
            psprobs[tagMap[i]][0] = MathHelper.LogAdd(psprobs[tagMap[i]][0], sprobs[i][j]);
        }
    }

    // Binary rules: log-add each fine rule's scores, weighted by the parent
    // symbol probability, into the single collapsed rule slot.
    pg.brules = ArrayHelper.AllocateArray<BinaryRule>(PTCount + newNTCount, PTCount + newNTCount, PTCount + newNTCount);
    foreach (var x in brules.Where(x => x != null))
    {
        foreach (var y in x.Where(y => y != null))
        {
            foreach (var z in y.Where(z => z != null))
            {
                int ltag = z.ltag >= PTCount ? tagMap[z.ltag] : z.ltag;
                int rtag = z.rtag >= PTCount ? tagMap[z.rtag] : z.rtag;
                int ptag = z.ptag >= PTCount ? tagMap[z.ptag] : z.ptag;
                double s = double.NegativeInfinity;
                for (int l = 0; l < z.scores.Length; ++l)
                {
                    if (z.scores[l] == null)
                    {
                        continue;
                    }
                    for (int r = 0; r < z.scores[l].Length; ++r)
                    {
                        if (z.scores[l][r] == null)
                        {
                            continue;
                        }
                        for (int p = 0; p < z.scores[l][r].Length; ++p)
                        {
                            double xs = z.scores[l][r][p];
                            if (double.IsNegativeInfinity(xs))
                            {
                                continue;
                            }
                            xs += sprobs[z.ptag][p];
                            s = MathHelper.LogAdd(xs, s);
                        }
                    }
                }
                if (double.IsNegativeInfinity(s))
                {
                    continue;
                }
                if (pg.brules[ltag][rtag][ptag] == null)
                {
                    pg.brules[ltag][rtag][ptag] = new BinaryRule(ArrayHelper.AllocateArray<double>(1, 1, 1), ptag, ltag, rtag);
                    pg.brules[ltag][rtag][ptag].scores[0][0][0] = s;
                }
                else
                {
                    pg.brules[ltag][rtag][ptag].scores[0][0][0] = MathHelper.LogAdd(s, pg.brules[ltag][rtag][ptag].scores[0][0][0]);
                }
            }
        }
    }
    foreach (var x in pg.brules.Where(x => x != null))
    {
        foreach (var y in x.Where(y => y != null))
        {
            foreach (var z in y.Where(z => z != null))
            {
                z.scores[0][0][0] = z.scores[0][0][0] - psprobs[z.ptag][0];
            }
        }
    }

    // Unary rules: same weighted collapse.
    pg.urules = ArrayHelper.AllocateArray<UnaryRule>(PTCount + newNTCount, PTCount + newNTCount);
    foreach (var x in urules.Where(x => x != null))
    {
        foreach (var y in x.Where(y => y != null))
        {
            int ctag = y.ctag >= PTCount ? tagMap[y.ctag] : y.ctag;
            int ptag = y.ptag >= PTCount ? tagMap[y.ptag] : y.ptag;
            double s = double.NegativeInfinity;
            for (int c = 0; c < y.scores.Length; ++c)
            {
                if (y.scores[c] == null)
                {
                    continue;
                }
                for (int p = 0; p < y.scores[c].Length; ++p)
                {
                    double xs = y.scores[c][p];
                    if (double.IsNegativeInfinity(xs))
                    {
                        continue;
                    }
                    xs += sprobs[y.ptag][p];
                    s = MathHelper.LogAdd(xs, s);
                }
            }
            if (double.IsNegativeInfinity(s))
            {
                continue;
            }
            if (pg.urules[ctag][ptag] == null)
            {
                pg.urules[ctag][ptag] = new UnaryRule(ArrayHelper.AllocateArray<double>(1, 1), ptag, ctag);
                pg.urules[ctag][ptag].scores[0][0] = s;
            }
            else
            {
                pg.urules[ctag][ptag].scores[0][0] = MathHelper.LogAdd(s, pg.urules[ctag][ptag].scores[0][0]);
            }
        }
    }
    foreach (var x in pg.urules.Where(x => x != null))
    {
        foreach (var y in x.Where(y => y != null))
        {
            y.scores[0][0] = y.scores[0][0] - psprobs[y.ptag][0];
        }
    }

    // Terminal rules keep their tags (preterminals are not collapsed); the
    // all-zero trace below merges each tag's subtags into a single subtag.
    var trace = subTagCounts.Select(x => new int[x]).ToArray();
    pg.trules = trules.Select(
        x => x == null ? null : x.Select(
            y => y == null ? null : y.MergeSymbols(trace, sprobs, psprobs)
        ).ToArray()
    ).ToArray();
    pg.Normalize();
    return pg;
}
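// In formula form (illustrative note, not in the original source): writing
// A' = tagMap[A] for collapsed symbols, the collapsed rule weight is the
// symbol-probability-weighted average of the fine rules it covers:
//
//   P(A' -> B' C') = ( sum over fine rules A_p -> B_l C_r whose tags map to
//                      A', B', C' of  P(A_p) * P(A_p -> B_l C_r) ) / P(A')
//
// where P(A_p) comes from ComputeSymbolProb() (sprobs) and P(A') is its sum
// over the collapsed class (psprobs). The LogAdd accumulation above is this
// sum in log space, and the final subtraction of psprobs[ptag][0] is the
// division.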
public LAPCFGrammar CloneWithSharedParameters()
{
    var clone = new LAPCFGrammar();
    clone.brules = brules;
    clone.urules = urules;
    clone.trules = trules;
    clone.NTCount = NTCount;
    clone.PTCount = PTCount;
    clone.ROOTID = ROOTID;
    clone.subTagCounts = subTagCounts;
    clone.subtagTraces = subtagTraces;
    clone.InitializeExpectedCounts();
    return clone;
}
public LAPCFGrammar Clone()
{
    var clone = new LAPCFGrammar();
    clone.brules = LAPCFGrammar.CloneRules(brules);
    clone.urules = LAPCFGrammar.CloneRules(urules);
    clone.trules = LAPCFGrammar.CloneRules(trules);
    clone.NTCount = NTCount;
    clone.PTCount = PTCount;
    clone.ROOTID = ROOTID;
    clone.subTagCounts = (int[])subTagCounts.Clone();
    clone.subtagTraces = new List<int[][]>();
    foreach (var trace in subtagTraces)
    {
        clone.subtagTraces.Add(ArrayHelper.Clone(trace));
    }
    clone.InitializeExpectedCounts();
    return clone;
}
public static LAPCFGrammar LoadFromStream(TextModelReader sr, Vocabulary vocab, TagSet tagSet)
{
    var grammar = new LAPCFGrammar();
    var name = typeof(LAPCFGrammar).FullName;
    sr.Require(name);
    sr.Require("VER", VER);
    grammar.NTCount = sr.ReadOptionInt("NTCount");
    grammar.PTCount = sr.ReadOptionInt("PTCount");
    grammar.ROOTID = sr.ReadOptionInt("ROOTID");
    sr.Require("TerminalRule");

    // Rule blocks are delimited by nesting level; collect the raw strings
    // first and build the actual rules after the subtag counts are known.
    int lvl = sr.NestLevel;
    var truleStrings = new HashSet<string>();
    var uruleStrings = new HashSet<string>();
    var bruleStrings = new HashSet<string>();
    string line = sr.Read();
    while (sr.NestLevel > lvl)
    {
        truleStrings.Add(line);
        line = sr.Read();
    }
    if (line != "UnaryRule")
    {
        throw new Exception("wrong model!");
    }
    line = sr.Read();
    while (sr.NestLevel > lvl)
    {
        uruleStrings.Add(line);
        line = sr.Read();
    }
    if (line != "BinaryRule")
    {
        throw new Exception("wrong model!");
    }
    line = sr.Read();
    while (sr.NestLevel > lvl)
    {
        bruleStrings.Add(line);
        line = sr.Read();
    }

    string[] parts = line.Split('\t');
    if (parts[0] != "TraceCount")
    {
        throw new Exception("error in model");
    }
    int subtraceCount = int.Parse(parts[1]);
    grammar.subtagTraces = new List<int[][]>();
    for (int i = 0; i < subtraceCount; ++i)
    {
        int tlen = sr.ReadOptionInt("TRACE");
        int[][] trace = new int[tlen][];
        for (int j = 0; j < tlen; ++j)
        {
            trace[j] = sr.ReadIntArray();
        }
        grammar.subtagTraces.Add(trace);
    }

    // Subtag counts come from the last trace; with no traces, every tag has
    // exactly one subtag.
    if (grammar.subtagTraces.Count == 0)
    {
        grammar.subTagCounts = new int[grammar.TotalTagCount];
        ArrayHelper.Fill(grammar.subTagCounts, 1);
    }
    else
    {
        var trace = grammar.subtagTraces[grammar.subtagTraces.Count - 1];
        grammar.subTagCounts = trace.Select(x => x.Length).ToArray();
    }
    sr.Require(name);

    foreach (var str in uruleStrings)
    {
        grammar.BuildUnaryRule(str, tagSet);
    }
    foreach (var str in truleStrings)
    {
        grammar.BuildTerminalRule(str, vocab, tagSet);
    }
    foreach (var str in bruleStrings)
    {
        grammar.BuildBinaryRule(str, tagSet);
    }
    return grammar;
}
private static LAPCFGrammar MergeRuleTable(LAPCFGrammar rules, double[][] tagProb,
    int[][] subtagMap, bool[][] isMerged, int[] newSubTagCounts)
{
    var newRules = rules.CreateRuleTable(newSubTagCounts);

    // Binary rules: log-add each old score into its mapped slot; merged parent
    // subtags are first weighted by their pair-normalized probability.
    foreach (var x in rules.brules)
    {
        if (x == null) { continue; }
        foreach (var y in x)
        {
            if (y == null) { continue; }
            foreach (var rule in y)
            {
                if (rule == null) { continue; }
                int l = rule.ltag;
                int r = rule.rtag;
                int p = rule.ptag;
                for (int sl = 0; sl < rule.scores.Length; ++sl)
                {
                    for (int sr = 0; sr < rule.scores[sl].Length; ++sr)
                    {
                        for (int sp = 0; sp < rule.scores[sl][sr].Length; ++sp)
                        {
                            double s = rule.scores[sl][sr][sp];
                            int nsl = subtagMap[l][sl];
                            int nsr = subtagMap[r][sr];
                            int nsp = subtagMap[p][sp];
                            if (isMerged[p][sp])
                            {
                                s += tagProb[p][sp];
                            }
                            var xs = newRules.brules[l][r][p].scores[nsl][nsr][nsp];
                            newRules.brules[l][r][p].scores[nsl][nsr][nsp] = MathHelper.LogAdd(xs, s);
                        }
                    }
                }
            }
        }
    }

    // Unary rules: same mapping and weighting.
    foreach (var x in rules.urules)
    {
        if (x == null) { continue; }
        foreach (var rule in x)
        {
            if (rule == null) { continue; }
            int c = rule.ctag;
            int p = rule.ptag;
            for (int sc = 0; sc < rule.scores.Length; ++sc)
            {
                for (int sp = 0; sp < rule.scores[sc].Length; ++sp)
                {
                    double s = rule.scores[sc][sp];
                    int nsc = subtagMap[c][sc];
                    int nsp = subtagMap[p][sp];
                    if (isMerged[p][sp])
                    {
                        s += tagProb[p][sp];
                    }
                    var xs = newRules.urules[c][p].scores[nsc][nsp];
                    newRules.urules[c][p].scores[nsc][nsp] = MathHelper.LogAdd(xs, s);
                }
            }
        }
    }

    // Terminal rules.
    foreach (var x in rules.trules)
    {
        if (x == null) { continue; }
        foreach (var rule in x)
        {
            if (rule == null) { continue; }
            int w = rule.word;
            int t = rule.tag;
            for (int st = 0; st < rule.scores.Length; ++st)
            {
                double s = rule.scores[st];
                int nsp = subtagMap[t][st];
                if (isMerged[t][st])
                {
                    s += tagProb[t][st];
                }
                var xs = newRules.trules[w][t].scores[nsp];
                newRules.trules[w][t].scores[nsp] = MathHelper.LogAdd(xs, s);
            }
        }
    }

    // Renormalize the merged tables.
    double[][] expects = newRules.subTagCounts.Select(x => new double[x]).ToArray();
    ArrayHelper.Fill(expects, double.NegativeInfinity);
    LAPCFGrammar.CollectTagMass(expects, newRules.trules);
    LAPCFGrammar.CollectTagMass(expects, newRules.urules);
    LAPCFGrammar.CollectTagMass(expects, newRules.brules);
    LAPCFGrammar.Normalize(expects, newRules.trules);
    LAPCFGrammar.Normalize(expects, newRules.urules);
    LAPCFGrammar.Normalize(expects, newRules.brules);

    // Rewrite the last subtag trace so it points at the merged subtags.
    foreach (var trace in rules.subtagTraces)
    {
        newRules.subtagTraces.Add(trace);
    }
    int[][] oldTrace = newRules.subtagTraces[newRules.subtagTraces.Count - 1];
    int[][] newTrace = new int[oldTrace.Length][];
    for (int i = 0; i < newTrace.Length; ++i)
    {
        newTrace[i] = new int[newRules.subTagCounts[i]];
        for (int j = 0; j < oldTrace[i].Length; ++j)
        {
            newTrace[i][subtagMap[i][j]] = oldTrace[i][j];
        }
    }
    newRules.subtagTraces[newRules.subtagTraces.Count - 1] = newTrace;
    return newRules;
}
public void ProjectGrammar(int[][] traces, LAPCFGrammar grammar)
{
    int[] trace = traces[tag];

    // Expand the prune flags in place: iterate backwards so each coarse flag
    // (at index trace[i] <= i) is read before it is overwritten.
    for (int i = trace.Length - 1; i >= 0; --i)
    {
        pruned[i] = pruned[trace[i]];
    }

    subtagCount = grammar.subTagCounts[tag];
    alpha.Length = subtagCount;
    beta.Length = subtagCount;
    if (TYPE != VTYPE.TERMINAL)
    {
        for (int i = 0; i < subtagCount; ++i)
        {
            _alpha[i] = double.NegativeInfinity;
        }
    }
    else
    {
        for (int i = 0; i < subtagCount; ++i)
        {
            _alpha[i] = 0;
        }
    }
    for (int i = 0; i < beta.Length; ++i)
    {
        _beta[i] = double.NegativeInfinity;
    }

    // Swap in the finer grammar's rule scores on every incoming edge.
    foreach (var e in incomings)
    {
        switch (e.TYPE)
        {
            case ETYPE.BINARY:
                e.binaryScores = grammar.GetRuleScores(e.to.tag, e.from0.tag, e.from1.tag, true);
                break;
            case ETYPE.UNARY:
                e.unaryScores = grammar.GetRuleScores(e.to.tag, e.from0.tag, true);
                break;
            case ETYPE.TERMINAL:
                e.terminalScores = grammar.GetTerminalRuleScores(e.to.tag, e.from0.tag, true);
                break;
            case ETYPE.DUMMY:
                break;
            default:
                throw new Exception("unrecognized edge type!");
        }
    }
    posteriorScore = double.NegativeInfinity;
}
public static void Build(
    Vocabulary vocab,
    TagSet tagset,
    List<PhrasalTree> treebank,
    out LAPCFGrammar rules,
    Random RNG = null)
{
    int tagCount = tagset.NTCount + tagset.PTCount;
    double[] pmass = new double[tagset.NTCount + tagset.PTCount];
    double[][] unaries = ArrayHelper.AllocateArray<double>(tagCount, tagCount);
    double[][][] binaries = ArrayHelper.AllocateArray<double>(tagCount, tagCount, tagCount);

    // Count rule occurrences in the (binarized) treebank.
    foreach (var tree in treebank)
    {
        foreach (var node in tree.TreeNodes)
        {
            if (node.Children.Count == 0)
            {
                // terminals
                continue;
            }
            else if (node.Children.Count == 1)
            {
                int pt = tagset.GetID(node.Tag);
                int ct = tagset.GetID(node.Children[0].Tag);
                pmass[pt] += 1.0f;
                unaries[ct][pt] += 1.0f;
            }
            else if (node.Children.Count == 2)
            {
                int pt = tagset.GetID(node.Tag);
                int lt = tagset.GetID(node.Children[0].Tag);
                int rt = tagset.GetID(node.Children[1].Tag);
                pmass[pt] += 1.0f;
                binaries[lt][rt][pt] += 1.0f;
            }
            else
            {
                throw new Exception("tree node with more than 2 children!");
            }
        }
    }

    // Null out rows that never occurred.
    for (int c = 0; c < unaries.Length; ++c)
    {
        bool csurvive = false;
        for (int p = 0; p < unaries[c].Length; ++p)
        {
            if (unaries[c][p] > 0)
            {
                csurvive = true;
                break;
            }
        }
        if (!csurvive)
        {
            unaries[c] = null;
        }
    }
    for (int l = 0; l < binaries.Length; ++l)
    {
        bool lsurvive = false;
        for (int r = 0; r < binaries[l].Length; ++r)
        {
            bool rsurvive = false;
            for (int p = 0; p < binaries[l][r].Length; ++p)
            {
                if (binaries[l][r][p] > 0)
                {
                    rsurvive = true;
                    break;
                }
            }
            if (rsurvive)
            {
                lsurvive = true;
            }
            else
            {
                binaries[l][r] = null;
            }
        }
        if (!lsurvive)
        {
            binaries[l] = null;
        }
    }

    // Add random noise to break symmetry. Note: despite the default value,
    // RNG must not be null here, or these loops will throw.
    foreach (var x in unaries.Where(x => x != null))
    {
        for (int p = 0; p < x.Length; ++p)
        {
            double noise = RNG.NextDouble();
            x[p] += noise;
            pmass[p] += noise;
        }
    }
    foreach (var x in binaries.Where(x => x != null))
    {
        foreach (var y in x.Where(y => y != null))
        {
            for (int p = 0; p < y.Length; ++p)
            {
                double noise = RNG.NextDouble();
                y[p] += noise;
                pmass[p] += noise;
            }
        }
    }

    // Relative-frequency estimates, then move to log space.
    for (int c = 0; c < tagCount; ++c)
    {
        for (int p = 0; p < tagCount; ++p)
        {
            if (pmass[p] == 0) { continue; }
            if (unaries[c] == null) { continue; }
            unaries[c][p] /= pmass[p];
        }
    }
    for (int c = 0; c < tagCount; ++c)
    {
        if (unaries[c] == null) { continue; }
        for (int p = 0; p < tagCount; ++p)
        {
            if (unaries[c][p] <= 0)
            {
                unaries[c][p] = double.NegativeInfinity;
            }
            else
            {
                unaries[c][p] = (double)Math.Log(unaries[c][p]);
            }
        }
    }
    for (int l = 0; l < tagCount; ++l)
    {
        if (binaries[l] == null) { continue; }
        for (int r = 0; r < tagCount; ++r)
        {
            for (int p = 0; p < tagCount; ++p)
            {
                if (pmass[p] == 0) { continue; }
                if (binaries[l][r] == null) { continue; }
                binaries[l][r][p] /= pmass[p];
            }
        }
    }
    for (int l = 0; l < tagCount; ++l)
    {
        if (binaries[l] == null) { continue; }
        for (int r = 0; r < tagCount; ++r)
        {
            if (binaries[l][r] == null) { continue; }
            for (int p = 0; p < tagCount; ++p)
            {
                if (binaries[l][r][p] <= 0)
                {
                    binaries[l][r][p] = double.NegativeInfinity;
                }
                else
                {
                    binaries[l][r][p] = (double)Math.Log(binaries[l][r][p]);
                }
            }
        }
    }

    var terminals = BuildLexSimple(treebank, tagset, vocab, RNG);
    rules = new LAPCFGrammar(tagset, binaries, unaries, terminals);
}
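// Worked example (illustrative, not in the original source): if NP occurs 100
// times as a parent and the rule NP -> DT NN occurs 60 times, then before the
// noise step binaries[DT][NN][NP] = 60 and pmass[NP] = 100, so the rule ends
// up with score log(60 / 100) ~= -0.51 in the log-space grammar. The noise
// added above perturbs both the numerator and the denominator so that the
// subsequent subtag splits do not start from exactly symmetric parameters.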
public static void CalculateNewScores(
    //LALexiconBuilder lexicon,
    LAPCFGrammar rules,
    bool lexiconOnly = false)
{
    //lexicon.CalculateNewScores();
    //lexicon.ClearExpectedCounts();

    // M-step: turn the accumulated posterior counts into new rule scores,
    // normalized per (tag, subtag), then clear the accumulators.
    double[][] expects = new double[rules.TotalTagCount][];
    for (int i = 0; i < expects.Length; ++i)
    {
        expects[i] = new double[rules.GetSubTagCount(i)];
    }
    ArrayHelper.Fill(expects, double.NegativeInfinity);

    if (lexiconOnly)
    {
        LAPCFGrammar.CollectTagMass(expects, rules.tposteriorCounts);
        LAPCFGrammar.CopyRules(rules.tposteriorCounts, rules.trules);
        LAPCFGrammar.Normalize(expects, rules.trules);
        LAPCFGrammar.ClearRules(rules.bposteriorCounts);
        LAPCFGrammar.ClearRules(rules.uposteriorCounts);
        LAPCFGrammar.ClearRules(rules.tposteriorCounts);
    }
    else
    {
        LAPCFGrammar.CollectTagMass(expects, rules.tposteriorCounts);
        LAPCFGrammar.CollectTagMass(expects, rules.uposteriorCounts);
        LAPCFGrammar.CollectTagMass(expects, rules.bposteriorCounts);
        LAPCFGrammar.CopyRules(rules.bposteriorCounts, rules.brules);
        LAPCFGrammar.Normalize(expects, rules.brules);
        LAPCFGrammar.CopyRules(rules.uposteriorCounts, rules.urules);
        LAPCFGrammar.Normalize(expects, rules.urules);
        LAPCFGrammar.CopyRules(rules.tposteriorCounts, rules.trules);
        LAPCFGrammar.Normalize(expects, rules.trules);
        LAPCFGrammar.ClearRules(rules.bposteriorCounts);
        LAPCFGrammar.ClearRules(rules.uposteriorCounts);
        LAPCFGrammar.ClearRules(rules.tposteriorCounts);
    }
    //rules.PropMaxUnaryPath();
}
public static void CheckProbs(
    //LALexiconBuilder lexicon,
    LAPCFGrammar rules)
{
    double[][] expects = new double[rules.TotalTagCount][];
    for (int i = 0; i < expects.Length; ++i)
    {
        expects[i] = new double[rules.GetSubTagCount(i)];
    }
    ArrayHelper.Fill(expects, double.NegativeInfinity);
    LAPCFGrammar.CollectTagMass(expects, rules.brules);
    LAPCFGrammar.CollectTagMass(expects, rules.urules);
    for (int p = 0; p < expects.Length; ++p)
    {
        for (int sp = 0; sp < expects[p].Length; ++sp)
        {
            double s = expects[p][sp];
            if (double.IsNaN(s) || double.IsInfinity(s))
            {
                continue;
                //throw new Exception("some rule in table has no mass!");
            }
            if (Math.Abs(s) > 0.01)
            {
                throw new Exception("table is not normalized!");
            }
        }
    }
}
private static void MatchLexicon(
    LAPCFGrammar table,
    HyperCell cell,
    int wid,
    HyperEdgePool epool,
    HyperVertexPool vpool,
    int[] tagCapacity,
    bool[] allowedTags,
    double[] tagProbs,
    bool isRoot)
{
    var tv = new HyperVertex(true, wid, 1);
    var trules = table.trules[wid];
    foreach (var rule in trules)
    {
        if (rule == null)
        {
            break;
        }
        if (rule.tag == table.ROOTID && !isRoot)
        {
            continue;
        }
        if (allowedTags != null && !allowedTags[rule.tag])
        {
            continue;
        }

        // If external tag probabilities are given, bias the scores on a cloned
        // rule so the grammar itself is left untouched.
        var xrule = rule;
        if (tagProbs != null)
        {
            var xprob = tagProbs[rule.tag];
            if (double.IsNegativeInfinity(xprob))
            {
                continue;
            }
            xrule = rule.Clone();
            for (int i = 0; i < xrule.scores.Length; ++i)
            {
                if (!double.IsNegativeInfinity(xrule.scores[i]))
                {
                    xrule.scores[i] += xprob;
                }
            }
        }

        var cap = tagCapacity == null ? -1 : tagCapacity[rule.tag];
        cell.l1v[rule.tag] = vpool.Allocate(false, rule.tag, table.GetSubTagCount(rule.tag), cap);
        epool.Allocate(cell.l1v[rule.tag], tv, xrule.scores, null);
        if (isRoot && rule.tag != table.ROOTID)
        {
            continue;
        }
        cell.l2v[rule.tag] = vpool.Allocate(false, rule.tag, table.GetSubTagCount(rule.tag), cap);
        epool.Allocate(cell.l2v[rule.tag], cell.l1v[rule.tag]);
    }
}
public static LAPCFGrammar MergeSymbols(double percentage, Vocabulary vocab, TagSet tagset,
    LAPCFGrammar rules, List<PhrasalTree> treebank, int nthread)
{
    rules.InitializeExpectedCounts();

    // Expected counts per subtag, normalized within each sibling pair so they
    // become log merge weights.
    double[][] tagProb = SubtagExpectedCounts(nthread, vocab, tagset, rules, treebank);
    bool[] isSplit = new bool[tagProb.Length];
    for (int i = 0; i < tagProb.Length; ++i)
    {
        if (tagProb[i].Length == 1)
        {
            tagProb[i][0] = 0;
            isSplit[i] = false;
        }
        else
        {
            isSplit[i] = true;
            for (int j = 0; j < tagProb[i].Length / 2; ++j)
            {
                double z = MathHelper.LogAdd(tagProb[i][2 * j], tagProb[i][2 * j + 1]);
                tagProb[i][2 * j] -= z;
                tagProb[i][2 * j + 1] -= z;
            }
        }
    }

    // Rank sibling pairs by the likelihood lost if merged, ascending. Note:
    // CreateMergeMapping always merges the cheapest half of the candidates;
    // the percentage parameter is not consulted in the code shown here.
    double[][] mergeLoss = CollectMergeLoss(nthread, vocab, tagset, rules, treebank, tagProb);
    var mergeCands = new List<MergeHelper>();
    for (int t = 0; t < mergeLoss.Length; ++t)
    {
        if (mergeLoss[t] == null)
        {
            continue;
        }
        for (int st = 0; st < mergeLoss[t].Length; ++st)
        {
            mergeCands.Add(new MergeHelper(t, st, mergeLoss[t][st]));
        }
    }
    mergeCands.Sort((a, b) => a.loss.CompareTo(b.loss));
    //mergeCands.Reverse();

    int[][] subtagMap;
    bool[][] isMerged;
    int[] newSubTagCounts;
    CreateMergeMapping(rules, mergeCands, out subtagMap, out isMerged, out newSubTagCounts);
    var newRules = MergeRuleTable(rules, tagProb, subtagMap, isMerged, newSubTagCounts);
    newRules.InitializeExpectedCounts();
    return newRules;
}
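// Note (illustrative, not in the original source): after the pair
// normalization above, tagProb[t][2j] and tagProb[t][2j + 1] are log relative
// weights that sum to 1 in probability space. For example, raw expected counts
// of (log 3, log 1) become (log 0.75, log 0.25). These are exactly the weights
// CollectMergeLoss and MergeRuleTable use when combining the two halves of a
// merged sibling pair.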
public int ExpandHyperGraph(LAPCFGrammar grammar, int[] tagMap, HyperEdgePool epool,
    HyperVertexPool vpool, int[] tagCapacity = null)
{
    this.ROOTID = grammar.ROOTID;
    var maxSubTag = grammar.subTagCounts.Max();
    lbuf = new double[maxSubTag + 1];
    int prunedCell = 0;

    // These flags record which tags survived pruning in the previous (coarser)
    // pass; only those are expanded under the finer grammar.
    var lv1flags = new TimedArray<bool>(grammar.TotalTagCount);
    var lv2flags = new TimedArray<bool>(grammar.TotalTagCount);

    // CYK over the old chart, mirroring BuildHyperGraph.
    for (int i = 0; i < wids.Length; ++i)
    {
        var oldcell = chart[i, i];
        lv1flags.Clear();
        foreach (var v in oldcell.l1v)
        {
            if (v != null)
            {
                lv1flags[v.tag] = true;
            }
        }
        lv2flags.Clear();
        foreach (var v in oldcell.l2v)
        {
            if (v != null)
            {
                lv2flags[v.tag] = true;
            }
        }
        bool isRoot = i == 0 && i == wids.Length - 1;
        chart[i, i] = new HyperCell(i, i + 1, grammar.TotalTagCount);
        ExpandLexicon(grammar, chart[i, i], lv1flags, lv2flags, tagMap, wids[i], epool, vpool, tagCapacity,
                      allowedPoSTags == null ? null : allowedPoSTags[i],
                      rawTagProbs == null ? null : rawTagProbs[i], isRoot);
        ExpandUnaryRules(grammar, chart[i, i], lv1flags, lv2flags, tagMap, epool, vpool, tagCapacity, isRoot);
        chart[i, i].Finish();
    }

    for (int spanL = 2; spanL <= wids.Length; ++spanL)
    {
        for (int beg = 0; beg + spanL <= wids.Length; ++beg)
        {
            int end = beg + spanL;
            int l = beg;
            int r = end - 1;
            var oldcell = chart[l, r];
            lv1flags.Clear();
            foreach (var v in oldcell.l1v)
            {
                if (v != null)
                {
                    lv1flags[v.tag] = true;
                }
            }
            lv2flags.Clear();
            foreach (var v in oldcell.l2v)
            {
                if (v != null)
                {
                    lv2flags[v.tag] = true;
                }
            }
            bool isRoot = l == 0 && r == wids.Length - 1;
            chart[l, r] = new HyperCell(beg, end, grammar.TotalTagCount);

            // Cells fully pruned at the coarser level stay empty.
            if (!oldcell.IsEmptyCell())
            {
                for (int mid = l; mid < r; ++mid)
                {
                    ExpandBinaryRules(grammar, chart[l, r], chart[l, mid], chart[mid + 1, r],
                                      lv1flags, lv2flags, tagMap, epool, vpool, tagCapacity, isRoot);
                }
                for (int i = 0; i < chart[l, r].l1v.Length; ++i)
                {
                    var c = chart[l, r].l1v[i];
                    if (c != null)
                    {
                        if (isRoot && c.tag != ROOTID)
                        {
                            continue;
                        }
                        chart[l, r].l2v[i] = vpool.Allocate(false, c.tag, c.beta.Length, c.beta.v.Length);
                        epool.Allocate(chart[l, r].l2v[i], c);
                    }
                }
                ExpandUnaryRules(grammar, chart[l, r], lv1flags, lv2flags, tagMap, epool, vpool, tagCapacity, isRoot);
            }
            else
            {
                prunedCell += 1;
            }
            chart[l, r].Finish();
        }
    }
    return prunedCell;
}
public static double[][] SubtagExpectedCounts(
    int nthread,
    Vocabulary vocab,
    TagSet tagset,
    //LALexiconBuilder lexicon,
    LAPCFGrammar rules,
    List<PhrasalTree> treebank)
{
    // Note: a single parser instance is shared across threads here, unlike
    // ParseGraphAndCollect, which clones the grammar per thread.
    var parser = new HyperGraphParser(vocab, tagset, rules);
    double[][][] tagExpectsArray = new double[nthread][][];
    for (int tid = 0; tid < nthread; ++tid)
    {
        tagExpectsArray[tid] = new double[rules.TotalTagCount][];
        var tagExpects = tagExpectsArray[tid];
        for (int i = 0; i < tagExpects.Length; ++i)
        {
            tagExpects[i] = new double[rules.GetSubTagCount(i)];
        }
        ArrayHelper.Fill(tagExpects, double.NegativeInfinity);
    }

    Parallel.For(0, nthread, threadid =>
    {
        var tagExpects = tagExpectsArray[threadid];
        for (int treeId = threadid; treeId < treebank.Count; treeId += nthread)
        {
            var tree = treebank[treeId];
            var g = parser.BuildHyperGraph(tree);
            g.SumForward();
            g.SumBackward();
            double sentS = g.RootScore;
            if (double.IsNaN(sentS) || double.IsInfinity(sentS))
            {
                continue;
            }
            // Accumulate P(tag, subtag | sentence) = alpha * beta / Z in log space.
            foreach (var v in g.Vs)
            {
                if (v.TYPE == VTYPE.TERMINAL)
                {
                    continue;
                }
                int t = v.tag;
                for (int st = 0; st < v.subtagCount; ++st)
                {
                    if (double.IsNaN(v.alpha.v[st]) || double.IsInfinity(v.alpha.v[st])
                        || double.IsNaN(v.beta.v[st]) || double.IsInfinity(v.beta.v[st])
                        || v.alpha.pruned[st] || v.beta.pruned[st])
                    {
                        continue;
                    }
                    tagExpects[t][st] = MathHelper.LogAdd(v.alpha.v[st] + v.beta.v[st] - sentS, tagExpects[t][st]);
                }
            }
        }
    });

    // Reduce the per-thread accumulators into one table.
    var te = tagExpectsArray[0];
    for (int i = 1; i < nthread; ++i)
    {
        for (int j = 0; j < te.Length; ++j)
        {
            for (int k = 0; k < te[j].Length; ++k)
            {
                te[j][k] = MathHelper.LogAdd(te[j][k], tagExpectsArray[i][j][k]);
            }
        }
    }
    return te;
}
private static double ParseGraphs(int nthread, List<PhrasalTree> treebank, LAPCFGrammar rules,
    Vocabulary vocab, TagSet tagSet, out int failed)
{
    double llhd = 0;
    failed = 0;
    int xfail = 0;
    var handle = new object();
    Parallel.For(0, nthread, threadid =>
    {
        int fail = 0;
        double xllhd = 0;
        var parser = new HyperGraphParser(vocab, tagSet, rules);
        for (int i = threadid; i < treebank.Count; i += nthread)
        {
            try
            {
                var graph = parser.BuildHyperGraph(treebank[i]);
                graph.SumForward();
                graph.SumBackward();
                if (double.IsInfinity(graph.RootScore) || double.IsNaN(graph.RootScore))
                {
                    fail += 1;
                    continue;
                }
                xllhd += graph.RootScore;
            }
            catch
            {
                fail += 1;
            }
        }
        lock (handle)
        {
            xfail += fail;
            llhd += xllhd;
        }
    });
    failed = xfail;
    return llhd;
}
private static double[][] CollectMergeLoss(int nthread, Vocabulary vocab, TagSet tagset,
    LAPCFGrammar rules, List<PhrasalTree> treebank, double[][] tagProb)
{
    double[][][] mlossList = new double[nthread][][];
    for (int tid = 0; tid < nthread; ++tid)
    {
        double[][] mergeLoss = new double[rules.TotalTagCount][];
        for (int i = 0; i < mergeLoss.Length; ++i)
        {
            if (tagProb[i].Length == 1)
            {
                continue;
            }
            mergeLoss[i] = new double[tagProb[i].Length / 2];
        }
        ArrayHelper.Fill(mergeLoss, 0);
        mlossList[tid] = mergeLoss;
    }

    var parser = new HyperGraphParser(vocab, tagset, rules);
    Parallel.For(0, nthread, threadid =>
    {
        var mergeLoss = mlossList[threadid];
        for (int treeId = threadid; treeId < treebank.Count; treeId += nthread)
        {
            var tree = treebank[treeId];
            var g = parser.BuildHyperGraph(tree);
            g.SumForward();
            g.SumBackward();
            double sentS = g.RootScore;
            if (double.IsNaN(sentS) || double.IsInfinity(sentS))
            {
                continue;
            }
            foreach (var v in g.Vs)
            {
                if (v.TYPE == VTYPE.TERMINAL)
                {
                    continue;
                }
                int t = v.tag;
                if (v.subtagCount == 1)
                {
                    continue;
                }
                double[] marginals = new double[v.subtagCount];
                for (int st = 0; st < v.subtagCount; ++st)
                {
                    if (!v.alpha.pruned[st])
                    {
                        marginals[st] = v.alpha.v[st] + v.beta.v[st];
                    }
                }
                // For each sibling pair, temporarily replace its two marginals
                // with the marginal the merged subtag would have, and record
                // the resulting drop in sentence log-likelihood.
                for (int st = 0; st < v.subtagCount / 2; ++st)
                {
                    int l = st * 2;
                    int r = st * 2 + 1;
                    if (double.IsNaN(v.alpha.v[l]) || double.IsInfinity(v.alpha.v[l])
                        || double.IsNaN(v.beta.v[l]) || double.IsInfinity(v.beta.v[l])
                        || double.IsNaN(v.alpha.v[r]) || double.IsInfinity(v.alpha.v[r])
                        || double.IsNaN(v.beta.v[r]) || double.IsInfinity(v.beta.v[r])
                        || v.alpha.pruned[l] || v.alpha.pruned[r])
                    {
                        continue;
                    }
                    double lllhd = marginals[l];
                    double rllhd = marginals[r];
                    double mllhd = MathHelper.LogAdd(tagProb[t][l] + v.alpha.v[l], tagProb[t][r] + v.alpha.v[r])
                                 + MathHelper.LogAdd(v.beta.v[l], v.beta.v[r]);
                    marginals[l] = mllhd;
                    marginals[r] = double.NegativeInfinity;
                    double xSentScore = MathHelper.LogAdd(marginals);
                    double sentScore = g.RootScore;
                    mergeLoss[t][st] += sentScore - xSentScore;
                    //MathHelper.LogAdd(xSentScore - sentScore, mergeLoss[t][st]);
                    marginals[l] = lllhd;
                    marginals[r] = rllhd;
                }
            }
        }
    });

    // Reduce the per-thread losses.
    var ml = mlossList[0];
    for (int threadid = 1; threadid < mlossList.Length; ++threadid)
    {
        var xl = mlossList[threadid];
        for (int i = 0; i < ml.Length; ++i)
        {
            if (ml[i] == null)
            {
                continue;
            }
            for (int j = 0; j < ml[i].Length; ++j)
            {
                ml[i][j] += xl[i][j];
            }
        }
    }
    return ml;
}
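// In formula form (illustrative note, not in the original source): for a
// sibling pair (l, r) at a vertex, the merged marginal computed above is
//
//   merged = LogAdd(w_l + alpha_l, w_r + alpha_r) + LogAdd(beta_l, beta_r)
//
// where w_l, w_r are the pair-normalized tagProb weights. Re-log-adding the
// vertex marginals with the pair replaced gives an approximate sentence
// likelihood under the merge, and the recorded loss sentScore - xSentScore is
// how much log-likelihood the sentence loses when just this pair is merged,
// summed over the treebank.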
static void EvaluateParser()
{
    string modelfile =
        //@"/home/nan/Data/PTB/ptb.s1.smoothed.grammar";
        @"D:\user\nyang\data\treebank\English\pcfg\ptb.s1.smoothed.grammar";
    string tagmapfile = @"E:\nyang\TFS\Common\Users\v-nayang\LAPCFGParser\en-tag-map.txt";
    string testfile =
        //@"/home/nan/Data/PTB/xbar/wsj.23.flat";
        @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.23.flat";
    string trainfile = @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.02-21.flat";

    Vocabulary vocab;
    TagSet tagSet;
    LAPCFGrammar grammar;

    var traintrees = new List<PhrasalTree>();
    LoadTrees(traintrees, trainfile);
    var rwHandler = new RareWordHandler(traintrees, 10);

    using (var s = new TextModelReader(modelfile))
    {
        vocab = Vocabulary.LoadFromStream(s);
        tagSet = TagSet.LoadFromStream(s);
        grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
    }

    rwHandler.Build(tagSet, 0.001);
    //grammar.Smoothing(0.1f);
    grammar.MakeCompaction();
    grammar.MakeSubruleCompaction();

    // Rebuild the grammar hierarchy and the collapsed coarse grammars, exactly
    // as in TestParse above.
    var grammars = new LAPCFGrammar[grammar.subtagTraces.Count + 1];
    var traces = new int[grammars.Length][][];
    grammars[grammars.Length - 1] = grammar;
    for (int i = grammars.Length - 1; i >= 1; --i)
    {
        traces[i] = grammar.subtagTraces[i - 1];
        grammars[i - 1] = grammars[i].ProjectGrammar(traces[i]);
        grammars[i - 1].MakeCompaction();
        grammars[i - 1].MakeSubruleCompaction();
    }

    string[][] tagTiers;
    using (StreamReader sr = new StreamReader(tagmapfile))
    {
        var tt = new List<string[]>();
        while (!sr.EndOfStream)
        {
            string line = sr.ReadLine();
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            tt.Add(line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries));
        }
        tagTiers = new string[tt[0].Length][]; //tt.ToArray();
        for (int i = 0; i < tagTiers.Length; ++i)
        {
            tagTiers[i] = new string[tt.Count];
            for (int j = 0; j < tt.Count; ++j)
            {
                tagTiers[i][j] = tt[j][i];
            }
        }
    }

    var cbs = new CodeBook32[tagTiers.Length];
    for (int i = 0; i < cbs.Length; ++i)
    {
        cbs[i] = new CodeBook32();
        foreach (var t in tagTiers[i])
        {
            cbs[i].Add(t);
        }
    }

    int pgcount = cbs.Length - 1;
    int[][] tagMaps = new int[pgcount][];
    for (int i = 0; i < tagMaps.Length; ++i)
    {
        tagMaps[i] = new int[grammars[0].PTCount + 1 + cbs[i + 1].Count];
        for (int j = 0; j < grammars[0].PTCount + 1; ++j)
        {
            tagMaps[i][j] = j;
        }
    }

    var lastMap = tagMaps[tagMaps.Length - 1];
    for (int j = grammars[0].PTCount + 1; j < lastMap.Length; ++j)
    {
        string tstr = tagSet.GetTagString(j);
        int id = cbs[cbs.Length - 1][tstr];
        int pid = cbs[cbs.Length - 2][tagTiers[tagTiers.Length - 2][id]];
        lastMap[j] = pid + grammars[0].PTCount + 1;
    }

    for (int i = 0; i < tagMaps.Length - 1; ++i)
    {
        for (int j = grammars[0].PTCount + 1; j < tagMaps[i].Length; ++j)
        {
            string tstr = cbs[i + 1][j - grammars[0].PTCount - 1];
            int xid = Array.IndexOf(tagTiers[i + 1], tstr);
            string pstr = tagTiers[i][xid];
            int pid = cbs[i][pstr];
            tagMaps[i][j] = pid;
        }
    }

    var cgrammars = new LAPCFGrammar[tagMaps.Length];
    cgrammars[cgrammars.Length - 1] =
        grammars[0].CollapseNonTerminals(tagMaps[cgrammars.Length - 1], 1 + cbs[cgrammars.Length - 1].Count);
    for (int i = cgrammars.Length - 1; i >= 1; --i)
    {
        cgrammars[i - 1] = cgrammars[i].CollapseNonTerminals(tagMaps[i - 1], 1 + cbs[i - 1].Count);
    }
    for (int i = 0; i < cgrammars.Length; ++i)
    {
        cgrammars[i].MakeCompaction();
        cgrammars[i].MakeSubruleCompaction();
    }

    var treebank = new List<PhrasalTree>();
    LoadTrees(treebank, testfile);
    foreach (var tree in treebank)
    {
        foreach (var node in tree.TreeNodes)
        {
            if (node.Children.Count == 0)
            {
                node.Lex = SimpleTokenizor.ETokenize(node.Lex);
            }
        }
        //PennTreeProcessor.RecoverFromXBarBinarize(tree.Root);
        tree.ComputeStartEnd();
    }

    double ccount = 0;
    double pcount = 0;
    double gcount = 0;
    int failed = 0;
    int sentcount = 0;

    HyperEdgePool epool = new HyperEdgePool(1024 * 1024);
    HyperVertexPool vpool = new HyperVertexPool(grammars[grammars.Length - 1].subTagCounts.Max());
    //EMorph.EnglishMorph.WarmUp();
    Console.Error.WriteLine("Start to parse...");
    ConsoleTimer tm = new ConsoleTimer(1);

    // Separate stopwatches time each stage of the coarse-to-fine cascade.
    Stopwatch g0bwatch = new Stopwatch();
    Stopwatch g0watch = new Stopwatch();
    Stopwatch bwatch = new Stopwatch();
    Stopwatch[] gwatch = new Stopwatch[grammars.Length];
    for (int i = 0; i < gwatch.Length; ++i)
    {
        gwatch[i] = new Stopwatch();
    }
    Stopwatch vwatch = new Stopwatch();

    foreach (var tree in treebank)
    {
        var words = tree.GetSentence().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
        if (words.Length > 20)
        {
            continue;
        }
        sentcount += 1;

        int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();
        wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);
        string[] tags = tree.GetPOS().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

        double[][] tprobs = new double[wids.Length][];
        //for (int i = 0; i < wids.Length; ++i)
        //{
        //    tprobs[i] = rwHandler.GetLogProbs(SimpleTokenizor.ETokenize(words[i]));
        //}
        bool[][] allowedTags = null;
        //AssignTagConstraints(vocab, tagSet, words, wids);

        try
        {
            //var parser = new ChartParser(wids);
            var parser = new ChartHyperGraphParser(wids, allowedTags, tprobs);
            g0bwatch.Start();
            parser.BuildHyperGraph(cgrammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            g0bwatch.Stop();

            g0watch.Start();
            parser.SumForward();
            parser.SumBackward(false);
            parser.Prune(-15.0);
            parser.Purge();
            for (int i = 1; i < cgrammars.Length; ++i)
            {
                parser.ExpandHyperGraph(cgrammars[i], tagMaps[i - 1], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-15.0);
                parser.Purge();
            }
            g0watch.Stop();

            bwatch.Start();
            parser.ExpandHyperGraph(grammars[0], tagMaps[2], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            //parser.BuildHyperGraph(grammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            bwatch.Stop();

            for (int i = 0; i < grammars.Length - 1; ++i)
            {
                gwatch[i].Start();
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-10.0);
                parser.Purge();
                parser.ProjectGrammar(traces[i + 1], grammars[i + 1]);
                gwatch[i].Stop();
            }
            gwatch[grammars.Length - 1].Start();
            parser.SumForward();
            parser.SumBackward(true);
            gwatch[grammars.Length - 1].Stop();

            vwatch.Start();
            parser.PosteriorViterbi();
            var ptree = parser.ExtractPosteriorViterbi(words, tagSet);
            vwatch.Stop();

            //PennTreeProcessor.RecoverFromXBarBinarize(ptree.Root);
            ptree.ComputeStartEnd();
            var pbrackets = ptree.GetBracketsIgnorePunc();
            var gbrackets = tree.GetBracketsIgnorePunc();
            gcount += gbrackets.Count;
            pcount += pbrackets.Count;
            foreach (var b in pbrackets)
            {
                if (gbrackets.Contains(b))
                {
                    ccount += 1;
                }
            }
            if (pbrackets.Count == 0 || (pbrackets.Count < gbrackets.Count / 2))
            {
                Console.Error.WriteLine("\nStrange prediction: p={0}, g={1}", pbrackets.Count, gbrackets.Count);
            }
            //Console.Error.WriteLine(tree.TextTree);
        }
        catch
        {
            g0bwatch.Stop();
            g0watch.Stop();
            bwatch.Stop();
            foreach (var w in gwatch)
            {
                w.Stop();
            }
            vwatch.Stop();
            failed += 1;
            Console.Error.WriteLine("\nFailure!");
        }
        tm.Up();
    }
    tm.Finish();

    // Labeled-bracket precision/recall/F1 over predicted vs. gold trees.
    Console.Error.WriteLine("SENT: {0}\tFAIL: {1}", sentcount, failed);
    double prec = ccount / pcount;
    double recall = ccount / gcount;
    double f1 = 2.0 * prec * recall / (prec + recall);
    Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);
    Console.Error.WriteLine("G-1 Build:\t{0:F6} s", g0bwatch.Elapsed.TotalSeconds);
    Console.Error.WriteLine("G-1 Pass:\t{0:F6} s", g0watch.Elapsed.TotalSeconds);
    Console.Error.WriteLine("G0 Build:\t{0:F6} s", bwatch.Elapsed.TotalSeconds);
    for (int i = 0; i < gwatch.Length; ++i)
    {
        Console.Error.WriteLine("G{0} Pass:\t{1:F6} s", i, gwatch[i].Elapsed.TotalSeconds);
    }
    Console.Error.WriteLine("Viterbi:\t{0:F6} s", vwatch.Elapsed.TotalSeconds);
}
public LAPCFGrammar ProjectGrammar(int[][] trace)
{
    var pg = new LAPCFGrammar();
    pg.NTCount = NTCount;
    pg.PTCount = PTCount;
    pg.ROOTID = ROOTID;
    pg.subTagCounts = trace.Select(x => x[x.Length - 1] + 1).ToArray();

    var sprobs = ComputeSymbolProb();
    var psprobs = pg.subTagCounts.Select(x => new double[x]).ToArray();
    ArrayHelper.Fill(psprobs, double.NegativeInfinity);
    for (int i = 0; i < trace.Length; ++i)
    {
        for (int j = 0; j < trace[i].Length; ++j)
        {
            psprobs[i][trace[i][j]] = MathHelper.LogAdd(psprobs[i][trace[i][j]], sprobs[i][j]);
        }
    }

    pg.brules = brules.Select(
        x => x == null ? null : x.Select(
            y => y == null ? null : y.Select(
                z => z == null ? null : z.MergeSymbols(trace, sprobs, psprobs)
            ).ToArray()
        ).ToArray()
    ).ToArray();
    pg.urules = urules.Select(
        x => x == null ? null : x.Select(
            y => y == null ? null : y.MergeSymbols(trace, sprobs, psprobs)
        ).ToArray()
    ).ToArray();
    pg.trules = trules.Select(
        x => x == null ? null : x.Select(
            y => y == null ? null : y.MergeSymbols(trace, sprobs, psprobs)
        ).ToArray()
    ).ToArray();
    pg.Normalize();
    return pg;
}
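// Usage note (illustrative, reflecting TestParse/EvaluateParser above): the
// grammar hierarchy is rebuilt by walking the stored traces from finest to
// coarsest, e.g.
//
//   grammars[i - 1] = grammars[i].ProjectGrammar(traces[i]);
//
// Each projection log-adds sibling subtag scores, weighted by symbol
// probability, into their shared parent subtag, then renormalizes via
// pg.Normalize().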