public void BuildHyperGraph(LAPCFGrammar grammar, HyperEdgePool epool, HyperVertexPool vpool, int[] tagCapacity = null)
{
    epool.Recycle();
    vpool.Recycle();
    this.ROOTID = grammar.ROOTID;
    var maxSubTag = grammar.subTagCounts.Max();
    lbuf = new double[maxSubTag + 1];

    // CYK: fill the diagonal (single-word) cells first.
    for (int i = 0; i < wids.Length; ++i)
    {
        bool isRoot = i == 0 && i == wids.Length - 1;
        chart[i, i] = new HyperCell(i, i + 1, grammar.TotalTagCount);
        MatchLexicon(grammar, chart[i, i], wids[i], epool, vpool, tagCapacity,
            allowedPoSTags == null ? null : allowedPoSTags[i],
            rawTagProbs == null ? null : rawTagProbs[i], isRoot);
        MatchUnaryRules(grammar, chart[i, i], epool, vpool, tagCapacity, isRoot);
        chart[i, i].Finish();
    }

    // CYK: combine adjacent spans bottom-up.
    for (int spanL = 2; spanL <= wids.Length; ++spanL)
    {
        for (int beg = 0; beg + spanL <= wids.Length; ++beg)
        {
            int end = beg + spanL;
            int l = beg;
            int r = end - 1;
            bool isRoot = l == 0 && r == wids.Length - 1;
            chart[l, r] = new HyperCell(beg, end, grammar.TotalTagCount);

            // Try every split point for binary rules.
            for (int mid = l; mid < r; ++mid)
            {
                MatchBinaryRules(grammar, chart[l, r], chart[l, mid], chart[mid + 1, r], epool, vpool, tagCapacity, isRoot);
            }

            // Promote level-1 vertices (binary-rule results) to level-2 via identity edges.
            for (int i = 0; i < chart[l, r].l1v.Length; ++i)
            {
                var c = chart[l, r].l1v[i];
                if (c != null)
                {
                    if (isRoot && c.tag != ROOTID)
                    {
                        continue;
                    }
                    chart[l, r].l2v[i] = vpool.Allocate(false, c.tag, c.beta.Length, c.beta.v.Length);
                    epool.Allocate(chart[l, r].l2v[i], c);
                }
            }

            MatchUnaryRules(grammar, chart[l, r], epool, vpool, tagCapacity, isRoot);
            chart[l, r].Finish();
        }
    }
}
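// Illustrative usage sketch (added for clarity, not part of the original source):
// the typical call sequence around BuildHyperGraph, assembled from the way the
// evaluation code below drives the parser. All types are from this codebase;
// the helper name ParseSentence is hypothetical.
static PhrasalTree ParseSentence(string[] words, LAPCFGrammar grammar,
    Vocabulary vocab, TagSet tagSet, HyperEdgePool epool, HyperVertexPool vpool)
{
    int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();
    // The sentence-initial token is looked up with the second flag set, as below.
    wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);
    // No PoS constraints, no external tag probabilities.
    var parser = new ChartHyperGraphParser(wids, null, new double[wids.Length][]);
    parser.BuildHyperGraph(grammar, epool, vpool, grammar.subTagCounts);
    parser.SumForward();        // inside (sum) pass
    parser.SumBackward(true);   // outside pass, computing posteriors
    parser.PosteriorViterbi();  // max-rule decoding over posteriors
    return parser.ExtractPosteriorViterbi(words, tagSet);
}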
static double EvaluateRawParser()
{
    string modelfile = //@"/home/nan/Data/PTB/ptb.s1.smoothed.grammar";
        @"D:\user\nyang\data\treebank\English\pcfg\ptb.s2.smoothed.grammar";
    string testfile = //@"/home/nan/Data/PTB/xbar/wsj.23.flat";
        @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.23.flat";
    string outputfile = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.flat.out";
    string reffile = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.flat.ref";
    string bklex = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.lexicon";
    string bkgrammar = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.grammar";
    int nthread = 16;

    Vocabulary vocab;
    TagSet tagSet;
    LAPCFGrammar grammar;

    using (var s = new TextModelReader(modelfile))
    {
        vocab = Vocabulary.LoadFromStream(s);
        tagSet = TagSet.LoadFromStream(s);
        grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
        //grammar = new LAPCFGrammar(tagSet, vocab, bklex, bkgrammar);
    }

    //grammar.Smoothing(0.01, 0.1);
    //grammar.Normalize();
    //grammar.PropMaxUnaryPath();
    grammar.MakeCompaction();
    grammar.MakeSubruleCompaction();

    var treebank = new List<PhrasalTree>();
    LoadTrees(treebank, testfile);
    foreach (var tree in treebank)
    {
        foreach (var node in tree.TreeNodes)
        {
            if (node.Children.Count == 0)
            {
                node.Lex = SimpleTokenizor.ETokenize(node.Lex);
            }
        }
        //PennTreeProcessor.RecoverFromXBarBinarize(tree.Root);
        tree.ComputeStartEnd();
    }
    // Evaluate only on sentences of at most 20 words.
    treebank = treebank.Where(x => x.Root.End <= 20).ToList();

    double ccount = 0;  // matched brackets
    double pcount = 0;  // predicted brackets
    double gcount = 0;  // gold brackets
    int failed = 0;
    int sentcount = treebank.Count;

    Console.Error.WriteLine("Start to parse...");
    ConsoleTimer tm = new ConsoleTimer(1);

    PhrasalTree[] ptrees = new PhrasalTree[treebank.Count];

    // Static round-robin split of the treebank over nthread workers;
    // each worker keeps its own edge/vertex pools.
    Parallel.For(0, nthread, thrID =>
    {
        HyperEdgePool epool = new HyperEdgePool();
        HyperVertexPool vpool = new HyperVertexPool(grammar.subTagCounts.Max());
        for (int treeId = thrID; treeId < treebank.Count; treeId += nthread)
        {
            var tree = treebank[treeId];
            var words = tree.GetSentence().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
            int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();
            // Sentence-initial token gets a special lookup (flag = true).
            wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);
            string[] tags = tree.GetPOS().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
            double[][] tprobs = new double[wids.Length][];
            //for (int i = 0; i < wids.Length; ++i)
            //{
            //    tprobs[i] = rwHanlder.GetLogProbs(SimpleTokenizor.ETokenize(words[i]));
            //}
            bool[][] allowedTags = null;
            //AssignTagConstraints(vocab, tagSet, words, wids);
            try
            {
                //var parser = new ChartParser(wids);
                var parser = new ChartHyperGraphParser(wids, allowedTags, tprobs);
                parser.BuildHyperGraph(grammar, epool, vpool, grammar.subTagCounts);
                parser.SumForward();
                parser.SumBackward(true);
                parser.PosteriorViterbi();
                var ptree = parser.ExtractPosteriorViterbi(words, tagSet);
                //parser.MaxForward();
                //var ptree = parser.ExtractViterbi(words, tagSet);
                ptree.ComputeStartEnd();
                ptrees[treeId] = ptree;
            }
            catch
            {
                // Parse failures leave a null tree; they are counted below.
            }
        }
    });

    using (StreamWriter sw = new StreamWriter(outputfile))
    {
        using (StreamWriter swref = new StreamWriter(reffile))
        {
            for (int treeid = 0; treeid < treebank.Count; ++treeid)
            {
                var tree = treebank[treeid];
                var ptree = ptrees[treeid];
                swref.WriteLine(tree.GetParseLine());
                if (ptree == null)
                {
                    failed += 1;
                    sw.WriteLine("()");
                    continue;
                }
                var pbrackets = ptree.GetBracketsIgnorePunc();
                var gbrackets = tree.GetBracketsIgnorePunc();
                gcount += gbrackets.Count;
                pcount += pbrackets.Count;
                double xxc = 0;
                foreach (var b in pbrackets)
                {
                    if (gbrackets.Contains(b))
                    {
                        ccount += 1;
                        xxc += 1;
                    }
                }
                if (pbrackets.Count == 0 || (pbrackets.Count < gbrackets.Count / 2))
                {
                    Console.Error.WriteLine("\nStrange prediction: p={0}, g={1}", pbrackets.Count, gbrackets.Count);
                }
                string parseline = ptree.GetParseLine();
                double snt_p = xxc / pbrackets.Count;
                double snt_r = xxc / gbrackets.Count;
                double snt_f1 = 2.0 * snt_p * snt_r / (snt_p + snt_r);
                sw.WriteLine(parseline);
                //sw.WriteLine(" [Current]\tP: {0:F2} R: {1:F2} F1: {2:F3}", snt_p * 100.0, snt_r * 100.0, snt_f1 * 100.0);
            }
        }
    }

    tm.Finish();
    Console.Error.WriteLine("SENT: {0}\tFAIL: {1}", sentcount, failed);
    double prec = ccount / pcount;
    double recall = ccount / gcount;
    double f1 = 2.0 * prec * recall / (prec + recall);
    Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);
    return f1;
}
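// Worked example (added for clarity, not in the original source): bracket
// precision/recall/F1 as computed above. With 18 predicted brackets, 20 gold
// brackets, and 15 matches: P = 15/18 ~ 0.833, R = 15/20 = 0.75, and
// F1 = 2PR/(P+R) ~ 0.789. Note that with zero matched brackets the
// per-sentence snt_f1 above evaluates to NaN (0/0); it is only used in the
// commented-out diagnostic line, so the corpus-level totals are unaffected.
static double BracketF1(double matched, double predicted, double gold)
{
    double p = matched / predicted;
    double r = matched / gold;
    return 2.0 * p * r / (p + r);
}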
static void EvaluateParser()
{
    string modelfile = //@"/home/nan/Data/PTB/ptb.s1.smoothed.grammar";
        @"D:\user\nyang\data\treebank\English\pcfg\ptb.s1.smoothed.grammar";
    string tagmapfile = @"E:\nyang\TFS\Common\Users\v-nayang\LAPCFGParser\en-tag-map.txt";
    string testfile = //@"/home/nan/Data/PTB/xbar/wsj.23.flat";
        @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.23.flat";
    string trainfile = @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.02-21.flat";

    Vocabulary vocab;
    TagSet tagSet;
    LAPCFGrammar grammar;

    var traintrees = new List<PhrasalTree>();
    LoadTrees(traintrees, trainfile);
    var rwHanlder = new RareWordHandler(traintrees, 10);

    using (var s = new TextModelReader(modelfile))
    {
        vocab = Vocabulary.LoadFromStream(s);
        tagSet = TagSet.LoadFromStream(s);
        grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
    }

    rwHanlder.Build(tagSet, 0.001);
    //grammar.Smoothing(0.1f);
    grammar.MakeCompaction();
    grammar.MakeSubruleCompaction();

    // Build the projection chain: grammars[k] is the grammar after k split
    // rounds, with grammars[grammars.Length - 1] the full (finest) grammar.
    var grammars = new LAPCFGrammar[grammar.subtagTraces.Count + 1];
    var traces = new int[grammars.Length][][];
    grammars[grammars.Length - 1] = grammar;
    for (int i = grammars.Length - 1; i >= 1; --i)
    {
        traces[i] = grammar.subtagTraces[i - 1];
        grammars[i - 1] = grammars[i].ProjectGrammar(traces[i]);
        grammars[i - 1].MakeCompaction();
        grammars[i - 1].MakeSubruleCompaction();
    }

    // Read the tag map file: one tag per line, one coarseness tier per column.
    string[][] tagTiers;
    using (StreamReader sr = new StreamReader(tagmapfile))
    {
        var tt = new List<string[]>();
        while (!sr.EndOfStream)
        {
            string line = sr.ReadLine();
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            tt.Add(line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries));
        }
        // Transpose: tagTiers[tier][tag].
        tagTiers = new string[tt[0].Length][];
        for (int i = 0; i < tagTiers.Length; ++i)
        {
            tagTiers[i] = new string[tt.Count];
            for (int j = 0; j < tt.Count; ++j)
            {
                tagTiers[i][j] = tt[j][i];
            }
        }
    }

    var cbs = new CodeBook32[tagTiers.Length];
    for (int i = 0; i < cbs.Length; ++i)
    {
        cbs[i] = new CodeBook32();
        foreach (var t in tagTiers[i])
        {
            cbs[i].Add(t);
        }
    }

    // tagMaps[i] maps tag ids of tier i+1 down to tier i; preterminal ids
    // (0..PTCount) always map to themselves.
    int pgcount = cbs.Length - 1;
    int[][] tagMaps = new int[pgcount][];
    for (int i = 0; i < tagMaps.Length; ++i)
    {
        tagMaps[i] = new int[grammars[0].PTCount + 1 + cbs[i + 1].Count];
        for (int j = 0; j < grammars[0].PTCount + 1; ++j)
        {
            tagMaps[i][j] = j;
        }
    }

    var lastMap = tagMaps[tagMaps.Length - 1];
    for (int j = grammars[0].PTCount + 1; j < lastMap.Length; ++j)
    {
        string tstr = tagSet.GetTagString(j);
        int id = cbs[cbs.Length - 1][tstr];
        int pid = cbs[cbs.Length - 2][tagTiers[tagTiers.Length - 2][id]];
        lastMap[j] = pid + grammars[0].PTCount + 1;
    }

    for (int i = 0; i < tagMaps.Length - 1; ++i)
    {
        for (int j = grammars[0].PTCount + 1; j < tagMaps[i].Length; ++j)
        {
            string tstr = cbs[i + 1][j - grammars[0].PTCount - 1];
            int xid = Array.IndexOf(tagTiers[i + 1], tstr);
            string pstr = tagTiers[i][xid];
            int pid = cbs[i][pstr];
            tagMaps[i][j] = pid;
        }
    }

    // Collapse the coarsest projected grammar through the tag tiers.
    var cgrammars = new LAPCFGrammar[tagMaps.Length];
    cgrammars[cgrammars.Length - 1] = grammars[0].CollapseNonTerminals(tagMaps[cgrammars.Length - 1], 1 + cbs[cgrammars.Length - 1].Count);
    for (int i = cgrammars.Length - 1; i >= 1; --i)
    {
        cgrammars[i - 1] = cgrammars[i].CollapseNonTerminals(tagMaps[i - 1], 1 + cbs[i - 1].Count);
    }
    for (int i = 0; i < cgrammars.Length; ++i)
    {
        cgrammars[i].MakeCompaction();
        cgrammars[i].MakeSubruleCompaction();
    }

    var treebank = new List<PhrasalTree>();
    LoadTrees(treebank, testfile);
    foreach (var tree in treebank)
    {
        foreach (var node in tree.TreeNodes)
        {
            if (node.Children.Count == 0)
            {
                node.Lex = SimpleTokenizor.ETokenize(node.Lex);
            }
        }
        //PennTreeProcessor.RecoverFromXBarBinarize(tree.Root);
        tree.ComputeStartEnd();
    }

    double ccount = 0;
    double pcount = 0;
    double gcount = 0;
    int failed = 0;
    int sentcount = 0;

    HyperEdgePool epool = new HyperEdgePool(1024 * 1024);
    HyperVertexPool vpool = new HyperVertexPool(grammars[grammars.Length - 1].subTagCounts.Max());

    //EMorph.EnglishMorph.WarmUp();
    Console.Error.WriteLine("Start to parse...");
    ConsoleTimer tm = new ConsoleTimer(1);

    Stopwatch g0bwatch = new Stopwatch();
    Stopwatch g0watch = new Stopwatch();
    Stopwatch bwatch = new Stopwatch();
    Stopwatch[] gwatch = new Stopwatch[grammars.Length];
    for (int i = 0; i < gwatch.Length; ++i)
    {
        gwatch[i] = new Stopwatch();
    }
    Stopwatch vwatch = new Stopwatch();

    foreach (var tree in treebank)
    {
        var words = tree.GetSentence().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
        if (words.Length > 20)
        {
            continue;
        }
        sentcount += 1;

        int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();
        wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);
        string[] tags = tree.GetPOS().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
        double[][] tprobs = new double[wids.Length][];
        //for (int i = 0; i < wids.Length; ++i)
        //{
        //    tprobs[i] = rwHanlder.GetLogProbs(SimpleTokenizor.ETokenize(words[i]));
        //}
        bool[][] allowedTags = null;
        //AssignTagConstraints(vocab, tagSet, words, wids);

        try
        {
            //var parser = new ChartParser(wids);
            var parser = new ChartHyperGraphParser(wids, allowedTags, tprobs);

            // Stage 1: parse with the coarsest collapsed grammar, then
            // prune and refine through the collapsed tiers.
            g0bwatch.Start();
            parser.BuildHyperGraph(cgrammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            g0bwatch.Stop();

            g0watch.Start();
            parser.SumForward();
            parser.SumBackward(false);
            parser.Prune(-15.0);
            parser.Purge();
            for (int i = 1; i < cgrammars.Length; ++i)
            {
                parser.ExpandHyperGraph(cgrammars[i], tagMaps[i - 1], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-15.0);
                parser.Purge();
            }
            g0watch.Stop();

            // Stage 2: expand into the projected latent-annotation grammars.
            bwatch.Start();
            parser.ExpandHyperGraph(grammars[0], tagMaps[2], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            //parser.BuildHyperGraph(grammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            bwatch.Stop();

            for (int i = 0; i < grammars.Length - 1; ++i)
            {
                gwatch[i].Start();
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-10.0);
                parser.Purge();
                parser.ProjectGrammar(traces[i + 1], grammars[i + 1]);
                gwatch[i].Stop();
            }

            gwatch[grammars.Length - 1].Start();
            parser.SumForward();
            parser.SumBackward(true);
            gwatch[grammars.Length - 1].Stop();

            // Stage 3: decode.
            vwatch.Start();
            parser.PosteriorViterbi();
            var ptree = parser.ExtractPosteriorViterbi(words, tagSet);
            vwatch.Stop();

            //PennTreeProcessor.RecoverFromXBarBinarize(ptree.Root);
            ptree.ComputeStartEnd();
            var pbrackets = ptree.GetBracketsIgnorePunc();
            var gbrackets = tree.GetBracketsIgnorePunc();
            gcount += gbrackets.Count;
            pcount += pbrackets.Count;
            foreach (var b in pbrackets)
            {
                if (gbrackets.Contains(b))
                {
                    ccount += 1;
                }
            }
            if (pbrackets.Count == 0 || (pbrackets.Count < gbrackets.Count / 2))
            {
                Console.Error.WriteLine("\nStrange prediction: p={0}, g={1}", pbrackets.Count, gbrackets.Count);
            }
            //Console.Error.WriteLine(tree.TextTree);
        }
        catch
        {
            g0bwatch.Stop();
            g0watch.Stop();
            bwatch.Stop();
            foreach (var w in gwatch)
            {
                w.Stop();
            }
            vwatch.Stop();
            failed += 1;
            Console.Error.WriteLine("\nFailure!");
        }
        tm.Up();
    }

    tm.Finish();
    Console.Error.WriteLine("SENT: {0}\tFAIL: {1}", sentcount, failed);
    double prec = ccount / pcount;
    double recall = ccount / gcount;
    double f1 = 2.0 * prec * recall / (prec + recall);
    Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);
    Console.Error.WriteLine("G-1 Build:\t{0:F6} s", g0bwatch.Elapsed.TotalSeconds);
    Console.Error.WriteLine("G-1 Pass:\t{0:F6} s", g0watch.Elapsed.TotalSeconds);
    Console.Error.WriteLine("G0 Build:\t{0:F6} s", bwatch.Elapsed.TotalSeconds);
    for (int i = 0; i < gwatch.Length; ++i)
    {
        Console.Error.WriteLine("G{0} Pass:\t{1:F6} s", i, gwatch[i].Elapsed.TotalSeconds);
    }
    Console.Error.WriteLine("Viterbi:\t{0:F6} s", vwatch.Elapsed.TotalSeconds);
}
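// Illustrative sketch (added; the helper name and values are hypothetical):
// a tag map is just an int[] sending fine tag ids to coarse tag ids, shaped
// exactly like the tagMaps built above. Preterminal ids (0..PTCount) map to
// themselves; phrasal ids above PTCount are renumbered into the coarser tier.
// ExpandHyperGraph then admits a fine tag j into a cell only if its coarse
// image tagMap[j] survived pruning in the previous pass.
static int[] BuildIdentityPrefixedTagMap(int ptCount, int[] phrasalToCoarse)
{
    int[] map = new int[ptCount + 1 + phrasalToCoarse.Length];
    for (int j = 0; j < ptCount + 1; ++j)
    {
        map[j] = j;                                 // preterminals: identity
    }
    for (int j = 0; j < phrasalToCoarse.Length; ++j)
    {
        map[ptCount + 1 + j] = phrasalToCoarse[j];  // phrasal tags: renumbered
    }
    return map;
}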
static void TestParse()
{
    string modelfile = //@"/home/nan/Data/PTB/ptb.s2.smoothed.grammar";
        @"D:\user\nyang\data\treebank\English\pcfg\ptb.s6.smoothed.grammar";
    string tagmapfile = @"E:\nyang\TFS\Common\Users\v-nayang\LAPCFGParser\en-tag-map.txt";

    Vocabulary vocab;
    TagSet tagSet;
    LAPCFGrammar grammar;

    using (var s = new TextModelReader(modelfile))
    {
        vocab = Vocabulary.LoadFromStream(s);
        tagSet = TagSet.LoadFromStream(s);
        grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
    }

    grammar.Smoothing(0.1f);
    grammar.MakeCompaction();
    grammar.MakeSubruleCompaction();

    var grammars = new LAPCFGrammar[grammar.subtagTraces.Count + 1];
    var traces = new int[grammars.Length][][];
    grammars[grammars.Length - 1] = grammar;
    for (int i = grammars.Length - 1; i >= 1; --i)
    {
        traces[i] = grammar.subtagTraces[i - 1];
        grammars[i - 1] = grammars[i].ProjectGrammar(traces[i]);
        grammars[i - 1].MakeCompaction();
        grammars[i - 1].MakeSubruleCompaction();
    }

    string[][] tagTiers;
    using (StreamReader sr = new StreamReader(tagmapfile))
    {
        var tt = new List<string[]>();
        while (!sr.EndOfStream)
        {
            string line = sr.ReadLine();
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            tt.Add(line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries));
        }
        // Transpose: tagTiers[tier][tag].
        tagTiers = new string[tt[0].Length][];
        for (int i = 0; i < tagTiers.Length; ++i)
        {
            tagTiers[i] = new string[tt.Count];
            for (int j = 0; j < tt.Count; ++j)
            {
                tagTiers[i][j] = tt[j][i];
            }
        }
    }

    var cbs = new CodeBook32[tagTiers.Length];
    for (int i = 0; i < cbs.Length; ++i)
    {
        cbs[i] = new CodeBook32();
        foreach (var t in tagTiers[i])
        {
            cbs[i].Add(t);
        }
    }

    int pgcount = cbs.Length - 1;
    int[][] tagMaps = new int[pgcount][];
    for (int i = 0; i < tagMaps.Length; ++i)
    {
        tagMaps[i] = new int[grammars[0].PTCount + 1 + cbs[i + 1].Count];
        for (int j = 0; j < grammars[0].PTCount + 1; ++j)
        {
            tagMaps[i][j] = j;
        }
    }

    var lastMap = tagMaps[tagMaps.Length - 1];
    for (int j = grammars[0].PTCount + 1; j < lastMap.Length; ++j)
    {
        string tstr = tagSet.GetTagString(j);
        int id = cbs[cbs.Length - 1][tstr];
        int pid = cbs[cbs.Length - 2][tagTiers[tagTiers.Length - 2][id]];
        lastMap[j] = pid + grammars[0].PTCount + 1;
    }

    for (int i = 0; i < tagMaps.Length - 1; ++i)
    {
        for (int j = grammars[0].PTCount + 1; j < tagMaps[i].Length; ++j)
        {
            string tstr = cbs[i + 1][j - grammars[0].PTCount - 1];
            int xid = Array.IndexOf(tagTiers[i + 1], tstr);
            string pstr = tagTiers[i][xid];
            int pid = cbs[i][pstr];
            tagMaps[i][j] = pid;
        }
    }

    var cgrammars = new LAPCFGrammar[tagMaps.Length];
    cgrammars[cgrammars.Length - 1] = grammars[0].CollapseNonTerminals(tagMaps[cgrammars.Length - 1], 1 + cbs[cgrammars.Length - 1].Count);
    for (int i = cgrammars.Length - 1; i >= 1; --i)
    {
        cgrammars[i - 1] = cgrammars[i].CollapseNonTerminals(tagMaps[i - 1], 1 + cbs[i - 1].Count);
    }
    for (int i = 0; i < cgrammars.Length; ++i)
    {
        cgrammars[i].MakeCompaction();
        cgrammars[i].MakeSubruleCompaction();
    }

    HyperEdgePool epool = new HyperEdgePool(1024 * 1024);
    HyperVertexPool vpool = new HyperVertexPool(grammars[grammars.Length - 1].subTagCounts.Max());
    EMorph.EnglishMorph.WarmUp();
    Console.Error.WriteLine("READY");

    while (true)
    {
        string line = Console.ReadLine();
        if (string.IsNullOrWhiteSpace(line))
        {
            continue;
        }
        var words = line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
        int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();
        wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);

        // For rare/unknown words, restrict candidate PoS tags using morphology.
        bool[][] allowedTags = new bool[wids.Length][];
        for (int i = 0; i < wids.Length; ++i)
        {
            if (vocab.IsRareOrUNK(wids[i]))
            {
                var lemmas = EMorph.EnglishMorph.GetBaseForm(words[i]);
                if (lemmas == null || lemmas.Count == 0)
                {
                    continue;
                }
                allowedTags[i] = new bool[tagSet.PTCount];
                if (char.IsUpper(words[i][0]))
                {
                    allowedTags[i][tagSet.GetID("NNP")] = true;
                    allowedTags[i][tagSet.GetID("NNPS")] = true;
                }
                foreach (var lemma in lemmas)
                {
                    switch (lemma.PoS)
                    {
                        case EMorph.MorphPoS.NN:
                            allowedTags[i][tagSet.GetID("NN")] = true;
                            allowedTags[i][tagSet.GetID("NNS")] = true;
                            break;
                        case EMorph.MorphPoS.NNS:
                            allowedTags[i][tagSet.GetID("NNS")] = true;
                            allowedTags[i][tagSet.GetID("NN")] = true;
                            break;
                        case EMorph.MorphPoS.JJ:
                            allowedTags[i][tagSet.GetID("JJ")] = true;
                            break;
                        case EMorph.MorphPoS.JJR:
                            allowedTags[i][tagSet.GetID("JJR")] = true;
                            break;
                        case EMorph.MorphPoS.JJS:
                            allowedTags[i][tagSet.GetID("JJS")] = true;
                            break;
                        case EMorph.MorphPoS.RB:
                            allowedTags[i][tagSet.GetID("RB")] = true;
                            break;
                        case EMorph.MorphPoS.RBR:
                            allowedTags[i][tagSet.GetID("RBR")] = true;
                            break;
                        case EMorph.MorphPoS.RBS:
                            allowedTags[i][tagSet.GetID("RBS")] = true;
                            break;
                        case EMorph.MorphPoS.VB:
                            allowedTags[i][tagSet.GetID("VB")] = true;
                            allowedTags[i][tagSet.GetID("VBP")] = true;
                            break;
                        case EMorph.MorphPoS.VBD:
                            allowedTags[i][tagSet.GetID("VBD")] = true;
                            allowedTags[i][tagSet.GetID("VBN")] = true;
                            break;
                        case EMorph.MorphPoS.VBG:
                            allowedTags[i][tagSet.GetID("VBG")] = true;
                            break;
                        case EMorph.MorphPoS.VBZ:
                            allowedTags[i][tagSet.GetID("VBZ")] = true;
                            break;
                        default:
                            throw new Exception("not recognized morph lemma!");
                    }
                }
            }
        }

        try
        {
            var parser = new ChartHyperGraphParser(wids, allowedTags);
            parser.BuildHyperGraph(cgrammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            parser.SumForward();
            parser.SumBackward(false);
            parser.Prune(-10.0);
            parser.Purge();
            for (int i = 1; i < cgrammars.Length; ++i)
            {
                parser.ExpandHyperGraph(cgrammars[i], tagMaps[i - 1], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-10.0);
                parser.Purge();
            }
            parser.ExpandHyperGraph(grammars[0], tagMaps[2], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            for (int i = 0; i < grammars.Length - 1; ++i)
            {
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-8.0);
                parser.Purge();
                parser.ProjectGrammar(traces[i + 1], grammars[i + 1]);
            }
            parser.SumForward();
            parser.SumBackward(true);
            parser.PosteriorViterbi();
            var ptree = parser.ExtractPosteriorViterbi(words, tagSet);
            PennTreeProcessor.RecoverFromXBarBinarize(ptree.Root);
            ptree.ComputeStartEnd();
            string treeline = ptree.TextTree;
            string[] xlines = treeline.Split(new string[] { "\n", "\r", "\r\n" }, StringSplitOptions.RemoveEmptyEntries);
            foreach (var xline in xlines)
            {
                Console.Error.WriteLine(xline);
            }
        }
        catch
        {
            Console.Error.WriteLine("Failure to parse!");
        }
    }
}
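// Clarifying sketch (added; the helper name is hypothetical, TagSet usage
// mirrors the code above): the allowedTags argument consumed by
// ChartHyperGraphParser is a per-word mask, where null means "no constraint"
// and a bool[] over preterminal ids restricts the lexicon match to the
// flagged tags. A minimal mask for a capitalized unknown word:
static bool[] ProperNounMask(TagSet tagSet)
{
    var mask = new bool[tagSet.PTCount];
    mask[tagSet.GetID("NNP")] = true;   // proper noun, singular
    mask[tagSet.GetID("NNPS")] = true;  // proper noun, plural
    return mask;
}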
public int ExpandHyperGraph(LAPCFGrammar grammar, int[] tagMap, HyperEdgePool epool, HyperVertexPool vpool, int[] tagCapacity = null)
{
    this.ROOTID = grammar.ROOTID;
    var maxSubTag = grammar.subTagCounts.Max();
    lbuf = new double[maxSubTag + 1];
    int prunedCell = 0;
    var lv1flags = new TimedArray<bool>(grammar.TotalTagCount);
    var lv2flags = new TimedArray<bool>(grammar.TotalTagCount);

    // CYK over the previous (coarser) chart: record which tags survived
    // pruning in each old cell, then expand only those through tagMap.
    for (int i = 0; i < wids.Length; ++i)
    {
        var oldcell = chart[i, i];
        lv1flags.Clear();
        foreach (var v in oldcell.l1v)
        {
            if (v != null)
            {
                lv1flags[v.tag] = true;
            }
        }
        lv2flags.Clear();
        foreach (var v in oldcell.l2v)
        {
            if (v != null)
            {
                lv2flags[v.tag] = true;
            }
        }
        bool isRoot = i == 0 && i == wids.Length - 1;
        chart[i, i] = new HyperCell(i, i + 1, grammar.TotalTagCount);
        ExpandLexicon(grammar, chart[i, i], lv1flags, lv2flags, tagMap, wids[i], epool, vpool, tagCapacity,
            allowedPoSTags == null ? null : allowedPoSTags[i],
            rawTagProbs == null ? null : rawTagProbs[i], isRoot);
        ExpandUnaryRules(grammar, chart[i, i], lv1flags, lv2flags, tagMap, epool, vpool, tagCapacity, isRoot);
        chart[i, i].Finish();
    }

    for (int spanL = 2; spanL <= wids.Length; ++spanL)
    {
        for (int beg = 0; beg + spanL <= wids.Length; ++beg)
        {
            int end = beg + spanL;
            int l = beg;
            int r = end - 1;
            var oldcell = chart[l, r];
            lv1flags.Clear();
            foreach (var v in oldcell.l1v)
            {
                if (v != null)
                {
                    lv1flags[v.tag] = true;
                }
            }
            lv2flags.Clear();
            foreach (var v in oldcell.l2v)
            {
                if (v != null)
                {
                    lv2flags[v.tag] = true;
                }
            }
            bool isRoot = l == 0 && r == wids.Length - 1;
            chart[l, r] = new HyperCell(beg, end, grammar.TotalTagCount);
            if (!oldcell.IsEmptyCell())
            {
                for (int mid = l; mid < r; ++mid)
                {
                    ExpandBinaryRules(grammar, chart[l, r], chart[l, mid], chart[mid + 1, r], lv1flags, lv2flags, tagMap, epool, vpool, tagCapacity, isRoot);
                }
                for (int i = 0; i < chart[l, r].l1v.Length; ++i)
                {
                    var c = chart[l, r].l1v[i];
                    if (c != null)
                    {
                        if (isRoot && c.tag != ROOTID)
                        {
                            continue;
                        }
                        chart[l, r].l2v[i] = vpool.Allocate(false, c.tag, c.beta.Length, c.beta.v.Length);
                        epool.Allocate(chart[l, r].l2v[i], c);
                    }
                }
                ExpandUnaryRules(grammar, chart[l, r], lv1flags, lv2flags, tagMap, epool, vpool, tagCapacity, isRoot);
            }
            else
            {
                // The coarse pass emptied this cell; skip it entirely.
                prunedCell += 1;
            }
            chart[l, r].Finish();
        }
    }
    return prunedCell;
}
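// Hypothetical sketch (added; the real TimedArray<T> is defined elsewhere in
// this codebase and may differ): the usage above only needs an indexer plus a
// Clear() that is cheap to call once per cell. A timestamp-based array gives
// exactly that: Clear() bumps a version counter instead of zeroing the backing
// store, and slots written under an older version read back as default(T).
public class TimedArraySketch<T>
{
    private readonly T[] values;
    private readonly int[] stamps;  // version at which each slot was written
    private int now = 1;

    public TimedArraySketch(int size)
    {
        values = new T[size];
        stamps = new int[size];
    }

    public T this[int i]
    {
        get { return stamps[i] == now ? values[i] : default(T); }
        set { values[i] = value; stamps[i] = now; }
    }

    // O(1): invalidates all slots by advancing the version counter.
    public void Clear()
    {
        now += 1;
    }
}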
private static void MatchUnaryRules(
    LAPCFGrammar grammar,
    HyperCell cell,
    HyperEdgePool epool,
    HyperVertexPool vpool,
    int[] tagCapacity,
    bool isRoot)
{
    foreach (var cv in cell.l1v)
    {
        if (cv == null)
        {
            continue;
        }
        var rules = grammar.urules[cv.tag];
        if (rules != null)
        {
            foreach (var rule in rules)
            {
                if (rule == null)
                {
                    // Rule arrays are null-terminated.
                    break;
                }
                // ROOT may only head the root cell, and the root cell may
                // only be headed by ROOT.
                if (rule.ptag == grammar.ROOTID && !isRoot)
                {
                    continue;
                }
                if (isRoot && rule.ptag != grammar.ROOTID)
                {
                    continue;
                }
                if (cell.l2v[rule.ptag] == null)
                {
                    var cap = tagCapacity == null ? -1 : tagCapacity[rule.ptag];
                    cell.l2v[rule.ptag] = vpool.Allocate(false, rule.ptag, grammar.GetSubTagCount(rule.ptag), cap);
                }
                epool.Allocate(cell.l2v[rule.ptag], cv, rule.scores, null);
            }
        }
    }
}
private static void MatchLexicon(
    LAPCFGrammar table,
    HyperCell cell,
    int wid,
    HyperEdgePool epool,
    HyperVertexPool vpool,
    int[] tagCapacity,
    bool[] allowedTags,
    double[] tagProbs,
    bool isRoot)
{
    var tv = new HyperVertex(true, wid, 1);
    var trules = table.trules[wid];
    foreach (var rule in trules)
    {
        if (rule == null)
        {
            // Rule arrays are null-terminated.
            break;
        }
        if (rule.tag == table.ROOTID && !isRoot)
        {
            continue;
        }
        if (allowedTags != null && !allowedTags[rule.tag])
        {
            continue;
        }
        var xrule = rule;
        if (tagProbs != null)
        {
            // Blend externally supplied tag log-probabilities into the
            // lexical rule scores (log-space addition = probability product).
            var xprob = tagProbs[rule.tag];
            if (double.IsNegativeInfinity(xprob))
            {
                continue;
            }
            xrule = rule.Clone();
            for (int i = 0; i < xrule.scores.Length; ++i)
            {
                if (!double.IsNegativeInfinity(xrule.scores[i]))
                {
                    xrule.scores[i] += xprob;
                }
            }
        }
        var cap = tagCapacity == null ? -1 : tagCapacity[rule.tag];
        cell.l1v[rule.tag] = vpool.Allocate(false, rule.tag, table.GetSubTagCount(rule.tag), cap);
        epool.Allocate(cell.l1v[rule.tag], tv, xrule.scores, null);
        if (isRoot && rule.tag != table.ROOTID)
        {
            continue;
        }
        cell.l2v[rule.tag] = vpool.Allocate(false, rule.tag, table.GetSubTagCount(rule.tag), cap);
        epool.Allocate(cell.l2v[rule.tag], cell.l1v[rule.tag]);
    }
}
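// Clarifying sketch (added; the helper name is hypothetical): the tagProbs
// adjustment above, isolated. Adding a log-probability to every finite
// sub-tag score multiplies the corresponding probabilities, so the external
// tagger acts as a per-tag reweighting of the lexicon; -Infinity entries stay
// impossible.
static double[] AddExternalLogProb(double[] subTagScores, double externalLogProb)
{
    var adjusted = (double[])subTagScores.Clone();
    for (int i = 0; i < adjusted.Length; ++i)
    {
        if (!double.IsNegativeInfinity(adjusted[i]))
        {
            adjusted[i] += externalLogProb;  // log-space product
        }
    }
    return adjusted;
}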
private static void MatchBinaryRules(
    LAPCFGrammar grammar,
    HyperCell pcell,
    HyperCell lcell,
    HyperCell rcell,
    HyperEdgePool epool,
    HyperVertexPool vpool,
    int[] tagCapacity,
    bool isRoot)
{
    // Binary rules are indexed as brules[leftTag][rightTag] -> parent rules,
    // so only rules compatible with the observed child pair are visited.
    foreach (var lv in lcell.l2v)
    {
        var rprules = grammar.brules[lv.tag];
        if (rprules == null)
        {
            continue;
        }
        foreach (var rv in rcell.l2v)
        {
            var prules = rprules[rv.tag];
            if (prules == null)
            {
                continue;
            }
            for (int p = 0; p < prules.Length; ++p)
            {
                var rule = prules[p];
                if (rule == null)
                {
                    // Rule arrays are null-terminated.
                    break;
                }
                if (rule.ptag == grammar.ROOTID && !isRoot)
                {
                    continue;
                }
                if (pcell.l1v[rule.ptag] == null)
                {
                    var cap = tagCapacity == null ? -1 : tagCapacity[rule.ptag];
                    pcell.l1v[rule.ptag] = vpool.Allocate(false, rule.ptag, grammar.GetSubTagCount(rule.ptag), cap);
                }
                epool.Allocate(pcell.l1v[rule.ptag], lv, rv, rule.scores, null);
            }
        }
    }
}
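// Clarifying sketch (added; types and the helper name are illustrative, not
// the codebase's real ones): the lookups above imply a child-indexed,
// null-terminated jagged rule table. Indexing by (left, right) child tags
// keeps the inner loop over exactly the rules that can fire, and the trailing
// null sentinel avoids storing a separate count per child pair.
static int CountCandidates(object[][][] brules, int leftTag, int rightTag)
{
    var byRight = brules[leftTag];
    if (byRight == null) return 0;      // no rule has this left child
    var candidates = byRight[rightTag];
    if (candidates == null) return 0;   // no rule has this child pair
    int n = 0;
    while (n < candidates.Length && candidates[n] != null)
    {
        n += 1;                         // count up to the null sentinel
    }
    return n;
}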