public void Build(int cutOff)
{
    var knownWords = new HashSet<string>();
    var rareWords = new HashSet<string>();
    var signatures = new HashSet<string>();

    // Partition the vocabulary by frequency: words seen more than cutOff
    // times are "known"; the rest are treated as rare.
    foreach (var pair in wordCounts)
    {
        string word = pair.Key;
        int count = pair.Value;
        if (count > cutOff)
        {
            knownWords.Add(word);
        }
        else
        {
            rareWords.Add(word);
        }
    }

    // Collect the signature of each rare word, computed separately for
    // sentence-initial and non-initial occurrences. A rare word whose
    // lowercased form is known gets a different signature.
    foreach (var word in rareWords)
    {
        string lowered = word.ToLower();
        bool isKnownLC = knownWords.Contains(lowered);
        if (initialSet.Contains(word))
        {
            signatures.Add(GetSigniture(word, true, isKnownLC));
        }
        if (nonInitialSet.Contains(word))
        {
            signatures.Add(GetSigniture(word, false, isKnownLC));
        }
    }

    vocab = new CodeBook32();
    // Signature ids start after the known-word ids so the two ranges do not overlap.
    signitureVocab = new CodeBook32(knownWords.Count);

    foreach (var w in knownWords)
    {
        vocab.Add(w);
    }
    foreach (var w in signatures)
    {
        signitureVocab.Add(w);
    }
}
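/// <summary>
/// Summary reconstructed from the code below (an assumption, not original
/// documentation): loads a smoothed LA-PCFG model and a tag-clustering map,
/// derives a coarse-to-fine hierarchy of grammars, parses the flattened WSJ
/// section 23 trees with pruning between passes, and reports labeled
/// bracketing precision/recall/F1 along with per-stage timings. The
/// hard-coded paths are machine-specific and need adjusting per setup.
/// </summary>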
static void EvaluateParser()
{
    // Machine-specific paths; the commented alternatives are a Linux setup.
    string modelfile = //@"/home/nan/Data/PTB/ptb.s1.smoothed.grammar";
        @"D:\user\nyang\data\treebank\English\pcfg\ptb.s1.smoothed.grammar";

    string tagmapfile = @"E:\nyang\TFS\Common\Users\v-nayang\LAPCFGParser\en-tag-map.txt";

    string testfile = //@"/home/nan/Data/PTB/xbar/wsj.23.flat";
        @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.23.flat";

    string trainfile = @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.02-21.flat";

    Vocabulary vocab;
    TagSet tagSet;
    LAPCFGrammar grammar;

    var traintrees = new List<PhrasalTree>();
    LoadTrees(traintrees, trainfile);

    var rwHandler = new RareWordHandler(traintrees, 10);

    using (var s = new TextModelReader(modelfile))
    {
        vocab = Vocabulary.LoadFromStream(s);
        tagSet = TagSet.LoadFromStream(s);
        grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
    }

    rwHandler.Build(tagSet, 0.001);

    //grammar.Smoothing(0.1f);

    grammar.MakeCompaction();
    grammar.MakeSubruleCompaction();

    // Project the most refined grammar down through its split/merge history,
    // producing one grammar per refinement level.
    var grammars = new LAPCFGrammar[grammar.subtagTraces.Count + 1];
    var traces = new int[grammars.Length][][];
    grammars[grammars.Length - 1] = grammar;

    for (int i = grammars.Length - 1; i >= 1; --i)
    {
        traces[i] = grammar.subtagTraces[i - 1];
        grammars[i - 1] = grammars[i].ProjectGrammar(traces[i]);
        grammars[i - 1].MakeCompaction();
        grammars[i - 1].MakeSubruleCompaction();
    }

    // Read the tag map: each line holds one tag's cluster labels, one column
    // per tier of increasingly fine tag clusters.
    string[][] tagTiers;

    using (StreamReader sr = new StreamReader(tagmapfile))
    {
        var tt = new List<string[]>();

        while (!sr.EndOfStream)
        {
            string line = sr.ReadLine();
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            tt.Add(line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries));
        }

        // Transpose so that tagTiers[tier][tag] gives the cluster label.
        tagTiers = new string[tt[0].Length][];
        for (int i = 0; i < tagTiers.Length; ++i)
        {
            tagTiers[i] = new string[tt.Count];
            for (int j = 0; j < tt.Count; ++j)
            {
                tagTiers[i][j] = tt[j][i];
            }
        }
    }

    var cbs = new CodeBook32[tagTiers.Length];
    for (int i = 0; i < cbs.Length; ++i)
    {
        cbs[i] = new CodeBook32();
        foreach (var t in tagTiers[i])
        {
            cbs[i].Add(t);
        }
    }

    int pgcount = cbs.Length - 1;
    int[][] tagMaps = new int[pgcount][];

    for (int i = 0; i < tagMaps.Length; ++i)
    {
        tagMaps[i] = new int[grammars[0].PTCount + 1 + cbs[i + 1].Count];
        // Preterminals (and the root) map to themselves at every tier.
        for (int j = 0; j < grammars[0].PTCount + 1; ++j)
        {
            tagMaps[i][j] = j;
        }
    }

    // The finest map sends each grammar nonterminal to its cluster in the
    // next-coarser tier; the remaining maps chain tier i+1 down to tier i.
    var lastMap = tagMaps[tagMaps.Length - 1];
    for (int j = grammars[0].PTCount + 1; j < lastMap.Length; ++j)
    {
        string tstr = tagSet.GetTagString(j);
        int id = cbs[cbs.Length - 1][tstr];
        int pid = cbs[cbs.Length - 2][tagTiers[tagTiers.Length - 2][id]];
        lastMap[j] = pid + grammars[0].PTCount + 1;
    }

    for (int i = 0; i < tagMaps.Length - 1; ++i)
    {
        for (int j = grammars[0].PTCount + 1; j < tagMaps[i].Length; ++j)
        {
            string tstr = cbs[i + 1][j - grammars[0].PTCount - 1];
            int xid = Array.IndexOf(tagTiers[i + 1], tstr);
            string pstr = tagTiers[i][xid];
            int pid = cbs[i][pstr];
            tagMaps[i][j] = pid;
        }
    }

    // Collapse nonterminals tier by tier to get the coarse grammars used in
    // the first passes.
    var cgrammars = new LAPCFGrammar[tagMaps.Length];
    cgrammars[cgrammars.Length - 1] =
        grammars[0].CollapseNonTerminals(tagMaps[cgrammars.Length - 1], 1 + cbs[cgrammars.Length - 1].Count);
    for (int i = cgrammars.Length - 1; i >= 1; --i)
    {
        cgrammars[i - 1] = cgrammars[i].CollapseNonTerminals(tagMaps[i - 1], 1 + cbs[i - 1].Count);
    }
    for (int i = 0; i < cgrammars.Length; ++i)
    {
        cgrammars[i].MakeCompaction();
        cgrammars[i].MakeSubruleCompaction();
    }

    var treebank = new List<PhrasalTree>();
    LoadTrees(treebank, testfile);

    foreach (var tree in treebank)
    {
        foreach (var node in tree.TreeNodes)
        {
            if (node.Children.Count == 0)
            {
                node.Lex = SimpleTokenizor.ETokenize(node.Lex);
            }
        }
        //PennTreeProcessor.RecoverFromXBarBinarize(tree.Root);
        tree.ComputeStartEnd();
    }

    double ccount = 0;
    double pcount = 0;
    double gcount = 0;
    int failed = 0;
    int sentcount = 0;

    HyperEdgePool epool = new HyperEdgePool(1024 * 1024);
    HyperVertexPool vpool = new HyperVertexPool(grammars[grammars.Length - 1].subTagCounts.Max());

    //EMorph.EnglishMorph.WarmUp();

    Console.Error.WriteLine("Start to parse...");
    ConsoleTimer tm = new ConsoleTimer(1);

    Stopwatch g0bwatch = new Stopwatch();
    Stopwatch g0watch = new Stopwatch();
    Stopwatch bwatch = new Stopwatch();
    Stopwatch[] gwatch = new Stopwatch[grammars.Length];
    for (int i = 0; i < gwatch.Length; ++i)
    {
        gwatch[i] = new Stopwatch();
    }
    Stopwatch vwatch = new Stopwatch();

    foreach (var tree in treebank)
    {
        var words = tree.GetSentence().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

        // Only evaluate on short sentences.
        if (words.Length > 20)
        {
            continue;
        }

        sentcount += 1;

        // Look up word ids; the first word uses the sentence-initial lookup.
        int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();
        wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);

        string[] tags = tree.GetPOS().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

        double[][] tprobs = new double[wids.Length][];

        //for (int i = 0; i < wids.Length; ++i)
        //{
        //    tprobs[i] = rwHandler.GetLogProbs(SimpleTokenizor.ETokenize(words[i]));
        //}

        bool[][] allowedTags = null;
        //AssignTagConstraints(vocab, tagSet, words, wids);

        try
        {
            //var parser = new ChartParser(wids);
            var parser = new ChartHyperGraphParser(wids, allowedTags, tprobs);

            g0bwatch.Start();
            parser.BuildHyperGraph(cgrammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            g0bwatch.Stop();

            // Coarse passes: inside/outside pruning at each collapsed tier.
            g0watch.Start();
            parser.SumForward();
            parser.SumBackward(false);
            parser.Prune(-15.0);
            parser.Purge();
            for (int i = 1; i < cgrammars.Length; ++i)
            {
                parser.ExpandHyperGraph(cgrammars[i], tagMaps[i - 1], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-15.0);
                parser.Purge();
            }
            g0watch.Stop();

            // Expand to the unsplit grammar. Note the hard-coded tagMaps[2]:
            // this assumes the tag map file defines at least four tiers.
            // bwatch.Start() was commented out in the original, which left the
            // "G0 Build" timing below always zero; it is restored here.
            bwatch.Start();
            parser.ExpandHyperGraph(grammars[0], tagMaps[2], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            //parser.BuildHyperGraph(grammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            bwatch.Stop();

            // Fine passes: prune, then project to the next refinement level.
            for (int i = 0; i < grammars.Length - 1; ++i)
            {
                gwatch[i].Start();
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-10.0);
                parser.Purge();
                parser.ProjectGrammar(traces[i + 1], grammars[i + 1]);
                gwatch[i].Stop();
            }

            gwatch[grammars.Length - 1].Start();
            parser.SumForward();
            parser.SumBackward(true);
            gwatch[grammars.Length - 1].Stop();

            vwatch.Start();
            parser.PosteriorViterbi();
            var ptree = parser.ExtractPosteriorViterbi(words, tagSet);
            vwatch.Stop();

            //PennTreeProcessor.RecoverFromXBarBinarize(ptree.Root);
            ptree.ComputeStartEnd();

            // Count matched brackets against the gold tree.
            var pbrackets = ptree.GetBracketsIgnorePunc();
            var gbrackets = tree.GetBracketsIgnorePunc();

            gcount += gbrackets.Count;
            pcount += pbrackets.Count;

            foreach (var b in pbrackets)
            {
                if (gbrackets.Contains(b))
                {
                    ccount += 1;
                }
            }

            if (pbrackets.Count == 0 || (pbrackets.Count < gbrackets.Count / 2))
            {
                Console.Error.WriteLine("\nStrange prediction: p={0}, g={1}", pbrackets.Count, gbrackets.Count);
            }

            //Console.Error.WriteLine(tree.TextTree);
        }
        catch
        {
            g0bwatch.Stop();
            g0watch.Stop();
            bwatch.Stop();
            foreach (var w in gwatch)
            {
                w.Stop();
            }
            vwatch.Stop();
            failed += 1;
            Console.Error.WriteLine("\nFailure!");
        }

        tm.Up();
    }

    tm.Finish();

    Console.Error.WriteLine("SENT: {0}\tFAIL: {1}", sentcount, failed);

    double prec = ccount / pcount;
    double recall = ccount / gcount;
    double f1 = 2.0 * prec * recall / (prec + recall);

    Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);

    Console.Error.WriteLine("G-1 Build:\t{0:F6} s", g0bwatch.Elapsed.TotalSeconds);
    Console.Error.WriteLine("G-1 Pass:\t{0:F6} s", g0watch.Elapsed.TotalSeconds);
    Console.Error.WriteLine("G0 Build:\t{0:F6} s", bwatch.Elapsed.TotalSeconds);
    for (int i = 0; i < gwatch.Length; ++i)
    {
        Console.Error.WriteLine("G{0} Pass:\t{1:F6} s", i, gwatch[i].Elapsed.TotalSeconds);
    }
    Console.Error.WriteLine("Viterbi:\t{0:F6} s", vwatch.Elapsed.TotalSeconds);
}
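/// <summary>
/// Summary reconstructed from the code below (an assumption, not original
/// documentation): an interactive loop that reads one sentence per line from
/// stdin, constrains the POS tags of rare/unknown words via morphological
/// analysis, parses coarse-to-fine with the same grammar hierarchy as
/// EvaluateParser, and prints the un-binarized parse tree to stderr.
/// </summary>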
static void TestParse()
{
    // Machine-specific paths; the commented alternative is a Linux setup.
    string modelfile = //@"/home/nan/Data/PTB/ptb.s2.smoothed.grammar";
        @"D:\user\nyang\data\treebank\English\pcfg\ptb.s6.smoothed.grammar";

    string tagmapfile = @"E:\nyang\TFS\Common\Users\v-nayang\LAPCFGParser\en-tag-map.txt";

    Vocabulary vocab;
    TagSet tagSet;
    LAPCFGrammar grammar;

    using (var s = new TextModelReader(modelfile))
    {
        vocab = Vocabulary.LoadFromStream(s);
        tagSet = TagSet.LoadFromStream(s);
        grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
    }

    grammar.Smoothing(0.1f);
    grammar.MakeCompaction();
    grammar.MakeSubruleCompaction();

    // Project the most refined grammar down through its split/merge history,
    // as in EvaluateParser.
    var grammars = new LAPCFGrammar[grammar.subtagTraces.Count + 1];
    var traces = new int[grammars.Length][][];
    grammars[grammars.Length - 1] = grammar;

    for (int i = grammars.Length - 1; i >= 1; --i)
    {
        traces[i] = grammar.subtagTraces[i - 1];
        grammars[i - 1] = grammars[i].ProjectGrammar(traces[i]);
        grammars[i - 1].MakeCompaction();
        grammars[i - 1].MakeSubruleCompaction();
    }

    // Read and transpose the tag map (tagTiers[tier][tag]), as in EvaluateParser.
    string[][] tagTiers;

    using (StreamReader sr = new StreamReader(tagmapfile))
    {
        var tt = new List<string[]>();

        while (!sr.EndOfStream)
        {
            string line = sr.ReadLine();
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            tt.Add(line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries));
        }

        tagTiers = new string[tt[0].Length][];
        for (int i = 0; i < tagTiers.Length; ++i)
        {
            tagTiers[i] = new string[tt.Count];
            for (int j = 0; j < tt.Count; ++j)
            {
                tagTiers[i][j] = tt[j][i];
            }
        }
    }

    var cbs = new CodeBook32[tagTiers.Length];
    for (int i = 0; i < cbs.Length; ++i)
    {
        cbs[i] = new CodeBook32();
        foreach (var t in tagTiers[i])
        {
            cbs[i].Add(t);
        }
    }

    int pgcount = cbs.Length - 1;
    int[][] tagMaps = new int[pgcount][];

    for (int i = 0; i < tagMaps.Length; ++i)
    {
        tagMaps[i] = new int[grammars[0].PTCount + 1 + cbs[i + 1].Count];
        // Preterminals (and the root) map to themselves at every tier.
        for (int j = 0; j < grammars[0].PTCount + 1; ++j)
        {
            tagMaps[i][j] = j;
        }
    }

    var lastMap = tagMaps[tagMaps.Length - 1];
    for (int j = grammars[0].PTCount + 1; j < lastMap.Length; ++j)
    {
        string tstr = tagSet.GetTagString(j);
        int id = cbs[cbs.Length - 1][tstr];
        int pid = cbs[cbs.Length - 2][tagTiers[tagTiers.Length - 2][id]];
        lastMap[j] = pid + grammars[0].PTCount + 1;
    }

    for (int i = 0; i < tagMaps.Length - 1; ++i)
    {
        for (int j = grammars[0].PTCount + 1; j < tagMaps[i].Length; ++j)
        {
            string tstr = cbs[i + 1][j - grammars[0].PTCount - 1];
            int xid = Array.IndexOf(tagTiers[i + 1], tstr);
            string pstr = tagTiers[i][xid];
            int pid = cbs[i][pstr];
            tagMaps[i][j] = pid;
        }
    }

    // Collapse nonterminals tier by tier to get the coarse grammars.
    var cgrammars = new LAPCFGrammar[tagMaps.Length];
    cgrammars[cgrammars.Length - 1] =
        grammars[0].CollapseNonTerminals(tagMaps[cgrammars.Length - 1], 1 + cbs[cgrammars.Length - 1].Count);
    for (int i = cgrammars.Length - 1; i >= 1; --i)
    {
        cgrammars[i - 1] = cgrammars[i].CollapseNonTerminals(tagMaps[i - 1], 1 + cbs[i - 1].Count);
    }
    for (int i = 0; i < cgrammars.Length; ++i)
    {
        cgrammars[i].MakeCompaction();
        cgrammars[i].MakeSubruleCompaction();
    }

    HyperEdgePool epool = new HyperEdgePool(1024 * 1024);
    HyperVertexPool vpool = new HyperVertexPool(grammars[grammars.Length - 1].subTagCounts.Max());

    EMorph.EnglishMorph.WarmUp();

    Console.Error.WriteLine("READY");

    while (true)
    {
        string line = Console.ReadLine();
        if (string.IsNullOrWhiteSpace(line))
        {
            continue;
        }

        var words = line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

        // Look up word ids; the first word uses the sentence-initial lookup.
        int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();
        wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);

        bool[][] allowedTags = new bool[wids.Length][];

        for (int i = 0; i < wids.Length; ++i)
        {
            if (vocab.IsRareOrUNK(wids[i]))
            {
                var lemmas = EMorph.EnglishMorph.GetBaseForm(words[i]);

                // No analysis: leave the tag set unconstrained for this word.
                if (lemmas == null || lemmas.Count == 0)
                {
                    continue;
                }

                allowedTags[i] = new bool[tagSet.PTCount];

                // Capitalized unknown words may be proper nouns.
                if (char.IsUpper(words[i][0]))
                {
                    allowedTags[i][tagSet.GetID("NNP")] = true;
                    allowedTags[i][tagSet.GetID("NNPS")] = true;
                }

                // Map each morphological analysis to the PTB tags it licenses.
                foreach (var lemma in lemmas)
                {
                    switch (lemma.PoS)
                    {
                        case EMorph.MorphPoS.NN:
                            allowedTags[i][tagSet.GetID("NN")] = true;
                            allowedTags[i][tagSet.GetID("NNS")] = true;
                            break;
                        case EMorph.MorphPoS.NNS:
                            allowedTags[i][tagSet.GetID("NNS")] = true;
                            allowedTags[i][tagSet.GetID("NN")] = true;
                            break;
                        case EMorph.MorphPoS.JJ:
                            allowedTags[i][tagSet.GetID("JJ")] = true;
                            break;
                        case EMorph.MorphPoS.JJR:
                            allowedTags[i][tagSet.GetID("JJR")] = true;
                            break;
                        case EMorph.MorphPoS.JJS:
                            allowedTags[i][tagSet.GetID("JJS")] = true;
                            break;
                        case EMorph.MorphPoS.RB:
                            allowedTags[i][tagSet.GetID("RB")] = true;
                            break;
                        case EMorph.MorphPoS.RBR:
                            allowedTags[i][tagSet.GetID("RBR")] = true;
                            break;
                        case EMorph.MorphPoS.RBS:
                            allowedTags[i][tagSet.GetID("RBS")] = true;
                            break;
                        case EMorph.MorphPoS.VB:
                            allowedTags[i][tagSet.GetID("VB")] = true;
                            allowedTags[i][tagSet.GetID("VBP")] = true;
                            break;
                        case EMorph.MorphPoS.VBD:
                            allowedTags[i][tagSet.GetID("VBD")] = true;
                            allowedTags[i][tagSet.GetID("VBN")] = true;
                            break;
                        case EMorph.MorphPoS.VBG:
                            allowedTags[i][tagSet.GetID("VBG")] = true;
                            break;
                        case EMorph.MorphPoS.VBZ:
                            allowedTags[i][tagSet.GetID("VBZ")] = true;
                            break;
                        default:
                            throw new Exception("unrecognized morph lemma!");
                    }
                }
            }
        }

        try
        {
            // Same coarse-to-fine pipeline as EvaluateParser, with slightly
            // looser pruning thresholds.
            var parser = new ChartHyperGraphParser(wids, allowedTags);
            parser.BuildHyperGraph(cgrammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            parser.SumForward();
            parser.SumBackward(false);
            parser.Prune(-10.0);
            parser.Purge();

            for (int i = 1; i < cgrammars.Length; ++i)
            {
                parser.ExpandHyperGraph(cgrammars[i], tagMaps[i - 1], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-10.0);
                parser.Purge();
            }

            // Hard-coded tagMaps[2]: assumes the tag map file defines at
            // least four tiers.
            parser.ExpandHyperGraph(grammars[0], tagMaps[2], epool, vpool, grammars[grammars.Length - 1].subTagCounts);

            for (int i = 0; i < grammars.Length - 1; ++i)
            {
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-8.0);
                parser.Purge();
                parser.ProjectGrammar(traces[i + 1], grammars[i + 1]);
            }

            parser.SumForward();
            parser.SumBackward(true);
            parser.PosteriorViterbi();

            var ptree = parser.ExtractPosteriorViterbi(words, tagSet);

            PennTreeProcessor.RecoverFromXBarBinarize(ptree.Root);
            ptree.ComputeStartEnd();

            string treeline = ptree.TextTree;
            string[] xlines = treeline.Split(new string[] { "\n", "\r", "\r\n" }, StringSplitOptions.RemoveEmptyEntries);
            foreach (var xline in xlines)
            {
                Console.Error.WriteLine(xline);
            }
        }
        catch
        {
            Console.Error.WriteLine("Failure to parse!");
        }
    }
}
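/// <summary>
/// Summary reconstructed from the code below (an assumption, not original
/// documentation): assigns integer ids to the tag set, placing preterminals
/// in [0, PTs.Count) and nonterminals after them, with ROOT as the first
/// nonterminal.
/// </summary>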
public void Build(string ROOT)
{
    base.ROOT = ROOT;

    PTs = new CodeBook32();

    // The root symbol must be a nonterminal; reject tag sets that list it
    // among the preterminals.
    if (ptset.Contains(ROOT))
    {
        throw new Exception("ROOT symbol found in preterminal set!");
    }

    foreach (var pt in ptset)
    {
        PTs.Add(pt);
    }

    // Nonterminal ids start after the preterminal ids; ROOT always receives
    // the first nonterminal id.
    ntset.Remove(ROOT);
    NTs = new CodeBook32(PTs.Count);
    NTs.Add(ROOT);
    foreach (var nt in ntset)
    {
        NTs.Add(nt);
    }
}
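// A minimal usage sketch for Build(string) (hypothetical: the construction
// and population of ptset/ntset are assumptions about the enclosing class;
// GetID is the lookup used by the parsing code above):
//
//   var tagSet = new TagSet();           // assumed default constructor
//   // ... fill ptset/ntset while scanning a treebank's tag inventory ...
//   tagSet.Build("ROOT");
//   int nn = tagSet.GetID("NN");         // preterminal ids occupy [0, PTs.Count)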