/// <summary>
/// Collects the phrase boxes for an entire tree, sorted by horizontal level.
/// </summary>
/// <param name="tree">Tree whose root is walked by the recursive overload.</param>
/// <returns>All phrase boxes, ordered ascending by <c>horizontalLvl</c>.</returns>
public static List<PhraseBox> GetPhraseBoxes(PhrasalTree tree)
{
    var boxes = new List<PhraseBox>();
    int horizontal = 0;
    int vertical = 0;
    // The recursive overload populates the list starting from the root node.
    GetPhraseBoxes(boxes, null, tree.Root, ref horizontal, vertical);
    boxes.Sort((x, y) => x.horizontalLvl.CompareTo(y.horizontalLvl));
    return boxes;
}
/// <summary>
/// Loads a smoothed LA-PCFG model, parses the flat WSJ section-23 test set
/// (sentences of length &lt;= 20) in parallel, writes predicted and reference
/// parse files for EVALB, and prints/returns the bracketing F1 score.
/// </summary>
/// <returns>Corpus-level labeled-bracketing F1 (0..1 scale).</returns>
static double EvaluateRawParser() {
    // Hard-coded model/data paths; the commented alternatives are Linux paths.
    string modelfile = //@"/home/nan/Data/PTB/ptb.s1.smoothed.grammar";
        @"D:\user\nyang\data\treebank\English\pcfg\ptb.s2.smoothed.grammar";
    string testfile = //@"/home/nan/Data/PTB/xbar/wsj.23.flat";
        @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.23.flat";
    string outputfile = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.flat.out";
    string reffile = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.flat.ref";
    // NOTE(review): bklex/bkgrammar are only used by the commented-out
    // Berkeley-grammar constructor below; currently dead locals.
    string bklex = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.lexicon";
    string bkgrammar = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.grammar";
    int nthread = 16; // degree of parallelism for the parsing loop below
    Vocabulary vocab;
    TagSet tagSet;
    LAPCFGrammar grammar;
    // Model file is read sequentially: vocabulary, tag set, then grammar.
    using (var s = new TextModelReader(modelfile)) {
        vocab = Vocabulary.LoadFromStream(s);
        tagSet = TagSet.LoadFromStream(s);
        grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
        //grammar = new LAPCFGrammar(tagSet, vocab, bklex, bkgrammar);
    }
    //grammar.Smoothing(0.01, 0.1);
    //grammar.Normalize();
    //grammar.PropMaxUnaryPath();
    grammar.MakeCompaction();
    grammar.MakeSubruleCompaction();
    var treebank = new List<PhrasalTree>();
    LoadTrees(treebank, testfile);
    // Normalize leaf lexical items and compute span indices for every tree.
    foreach (var tree in treebank) {
        foreach (var node in tree.TreeNodes) {
            if (node.Children.Count == 0) {
                node.Lex = SimpleTokenizor.ETokenize(node.Lex);
            }
        }
        //PennTreeProcessor.RecoverFromXBarBinarize(tree.Root);
        tree.ComputeStartEnd();
    }
    // Evaluate only on short sentences (span end <= 20).
    treebank = treebank.Where(x => x.Root.End <= 20).ToList();
    double ccount = 0;  // correctly predicted brackets (corpus total)
    double pcount = 0;  // predicted brackets (corpus total)
    double gcount = 0;  // gold brackets (corpus total)
    int failed = 0;     // sentences with no parse (ptree stays null)
    int sentcount = treebank.Count;
    Console.Error.WriteLine("Start to parse...");
    ConsoleTimer tm = new ConsoleTimer(1);
    PhrasalTree[] ptrees = new PhrasalTree[treebank.Count];
    // Static round-robin partition: thread t parses trees t, t+nthread, ...
    // Each thread owns its own pools; ptrees slots are disjoint per thread.
    Parallel.For(0, nthread, thrID => {
        HyperEdgePool epool = new HyperEdgePool();
        HyperVertexPool vpool = new HyperVertexPool(grammar.subTagCounts.Max());
        for (int treeId = thrID; treeId < treebank.Count; treeId += nthread) {
            var tree = treebank[treeId];
            var words = tree.GetSentence().Split(new string[] { " ", "\t" },
                StringSplitOptions.RemoveEmptyEntries);
            int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();
            // First word is looked up again with the sentence-initial flag set.
            wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);
            // NOTE(review): tags is computed but never used below.
            string[] tags = tree.GetPOS().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
            // Per-word tag log-probabilities; entries remain null (see commented code).
            double[][] tprobs = new double[wids.Length][];
            //for (int i = 0; i < wids.Length; ++i)
            //{
            //    tprobs[i] = rwHanlder.GetLogProbs(SimpleTokenizor.ETokenize(words[i]));
            //}
            bool[][] allowedTags = null;
            //AssignTagConstraints(vocab, tagSet, words, wids);
            try {
                //var parser = new ChartParser(wids);
                var parser = new ChartHyperGraphParser(wids, allowedTags, tprobs);
                parser.BuildHyperGraph(grammar, epool, vpool, grammar.subTagCounts);
                // Inside-outside pass followed by posterior Viterbi decoding.
                parser.SumForward();
                parser.SumBackward(true);
                parser.PosteriorViterbi();
                var ptree = parser.ExtractPosteriorViterbi(words, tagSet);
                //parser.MaxForward();
                //var ptree = parser.ExtractViterbi(words, tagSet);
                ptree.ComputeStartEnd();
                ptrees[treeId] = ptree;
            } catch {
                // Deliberate best-effort: a failed parse leaves ptrees[treeId]
                // null; it is counted via `failed` in the scoring loop below.
            }
        }
    });
    // Score sequentially, writing predictions and references for EVALB.
    using (StreamWriter sw = new StreamWriter(outputfile)) {
        using (StreamWriter swref = new StreamWriter(reffile)) {
            for (int treeid = 0; treeid < treebank.Count; ++treeid) {
                var tree = treebank[treeid];
                var ptree = ptrees[treeid];
                swref.WriteLine(tree.GetParseLine());
                if (ptree == null) {
                    // No parse: emit the EVALB empty-parse marker.
                    failed += 1;
                    sw.WriteLine("()");
                    continue;
                }
                var pbrackets = ptree.GetBracketsIgnorePunc();
                var gbrackets = tree.GetBracketsIgnorePunc();
                gcount += gbrackets.Count;
                pcount += pbrackets.Count;
                double xxc = 0; // per-sentence matched-bracket count
                foreach (var b in pbrackets) {
                    if (gbrackets.Contains(b)) {
                        ccount += 1;
                        xxc += 1;
                    }
                }
                // Sanity warning for suspiciously small predictions.
                if (pbrackets.Count == 0 || (pbrackets.Count < gbrackets.Count / 2)) {
                    Console.Error.WriteLine("\nStrange prediction: p={0}, g={1}", pbrackets.Count, gbrackets.Count);
                }
                string parseline = ptree.GetParseLine();
                // NOTE(review): if pbrackets.Count == 0 these produce NaN;
                // harmless today because they are only used by the
                // commented-out per-sentence report line below.
                double snt_p = xxc / pbrackets.Count;
                double snt_r = xxc / gbrackets.Count;
                double snt_f1 = 2.0 * snt_p * snt_r / (snt_p + snt_r);
                sw.WriteLine(parseline);
                //sw.WriteLine(" [Current]\tP: {0:F2} R: {1:F2} F1: {2:F3}", snt_p * 100.0, snt_r * 100.0, snt_f1 * 100.0);
            }
        }
    }
    tm.Finish();
    Console.Error.WriteLine("SENT: {0}\tFAIL: {1}", sentcount, failed);
    // Corpus-level precision / recall / F1 over all brackets.
    double prec = ccount / pcount;
    double recall = ccount / gcount;
    double f1 = 2.0 * prec * recall / (prec + recall);
    Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);
    return f1;
}
/// <summary>
/// Merges two aligned treebank files line by line: for each sentence, the
/// tag of every span in the second (c) tree is suffixed with the tag of the
/// same span in the first (pcfg) tree, and the combined trees are written out.
/// Spans are matched via the key start * (length + 1) + end.
/// </summary>
static void CombineLabel() {
    string parsefilename = @"D:\user\nyang\tmp\train.pcfg.clean.txt";
    string safilename = @"D:\user\nyang\tmp\train.c.txt";
    string combinefilename = @"D:\user\nyang\tmp\train.cb.txt";
    using (StreamReader parseReader = new StreamReader(parsefilename))
    using (StreamReader saReader = new StreamReader(safilename))
    using (StreamWriter writer = new StreamWriter(combinefilename)) {
        // Both inputs are consumed in lock-step, one sentence per line.
        while (!parseReader.EndOfStream && !saReader.EndOfStream) {
            PhrasalTree parseTree = new PhrasalTree(parseReader.ReadLine());
            parseTree.RemoveUnaryRule();
            PhrasalTree saTree = new PhrasalTree(saReader.ReadLine());
            int sentLength = parseTree.Root.End;
            // Index every span of the parse tree by its (start, end) signature.
            var spanIndex = new Dictionary<int, PhrasalNode>();
            foreach (var pnode in parseTree.TreeNodes) {
                spanIndex[pnode.Start * (sentLength + 1) + pnode.End] = pnode;
            }
            foreach (var snode in saTree.TreeNodes) {
                var matched = spanIndex[snode.Start * (sentLength + 1) + snode.End];
                string tag = matched.Tag;
                // X-bar tags ("@X") are rewritten to "Xx" before combining.
                if (tag[0] == '@') {
                    tag = tag.Substring(1) + "x";
                }
                snode.Tag = snode.Tag + tag;
            }
            saTree.Root.Tag = "S";
            writer.WriteLine(saTree.GetParseLine());
        }
    }
}
/// <summary>
/// Converts Berkeley-parser viterbi output into EVALB-ready format: skips
/// blank and failed ("()") lines, wraps each parse in a TOP node, re-tokenizes
/// the leaf words, and writes the normalized parse lines to the output file.
/// </summary>
static void ProcessBKOutput() {
    string inputfn = @"D:\user\nyang\tools\bkparser\wsj.23.s1.splitting.viterbi.out";
    string outputfn = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.s1.bk.out";
    using (StreamReader reader = new StreamReader(inputfn))
    using (StreamWriter writer = new StreamWriter(outputfn)) {
        while (!reader.EndOfStream) {
            string trimmed = reader.ReadLine().Trim();
            // Keep only real parses: non-blank lines starting with "(" but not "()".
            if (string.IsNullOrWhiteSpace(trimmed)
                || !trimmed.StartsWith("(")
                || trimmed.StartsWith("()")) {
                continue;
            }
            var tree = new PhrasalTree("(TOP " + trimmed + ")");
            foreach (var node in tree.TreeNodes) {
                if (node.Children.Count == 0) {
                    node.Lex = SimpleTokenizor.ETokenize(node.Lex);
                }
            }
            writer.WriteLine(tree.GetParseLine());
        }
    }
}
/// <summary>
/// Reads one bracketed parse per non-blank line of <paramref name="file"/> and
/// appends the resulting <see cref="PhrasalTree"/> objects to
/// <paramref name="trees"/>.
/// </summary>
/// <param name="trees">Destination list; existing entries are kept.</param>
/// <param name="file">Path of the text file to read.</param>
static void LoadTrees(List<PhrasalTree> trees, string file) {
    using (StreamReader reader = new StreamReader(file)) {
        while (!reader.EndOfStream) {
            string line = reader.ReadLine();
            // Blank/whitespace lines separate or pad sentences; skip them.
            if (string.IsNullOrWhiteSpace(line)) {
                continue;
            }
            trees.Add(new PhrasalTree(line));
        }
    }
}
/// <summary>
/// Extracts the Viterbi parse tree from the completed chart: finds the root
/// vertex spanning the whole sentence, unwinds the Viterbi back-pointers, and
/// attaches the surface words to the leaves.
/// </summary>
/// <param name="words">Surface tokens of the sentence, one per position.</param>
/// <param name="tagSet">Tag set used to resolve the ROOT tag and label nodes.</param>
/// <returns>The extracted tree, or null when the root cell has no vertices.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the root vertex is missing or was pruned during parsing.
/// </exception>
public PhrasalTree ExtractViterbi(string[] words, TagSet tagSet) {
    var rootCell = chart[0, wids.Length - 1];
    if (rootCell.l2v == null) {
        return null;
    }
    var rootid = tagSet.ROOTID;
    // Locate the vertex carrying the ROOT tag in the whole-sentence cell.
    HyperVertex v = null;
    for (int i = 0; i < rootCell.l2v.Length; ++i) {
        if (rootCell.l2v[i].tag == rootid) {
            v = rootCell.l2v[i];
            break;
        }
    }
    if (v == null || v.TYPE == VTYPE.DEAD) {
        // More specific than the previous bare Exception; still caught by any
        // existing catch-all handler (e.g. the per-sentence try/catch upstream).
        throw new InvalidOperationException("node is pruned!");
    }
    PhrasalNode rootNode = ExtractViterbiParse(v, 0, tagSet);
    PhrasalTree tree = new PhrasalTree(rootNode);
    tree.ComputeStartEnd();
    AnnotateLex(words, tree.Root);
    return tree;
}
/// <summary>
/// Builds a hypergraph for the given tree, with a label buffer sized to the
/// largest sub-tag count in the grammar rules.
/// </summary>
/// <param name="tree">Tree whose root drives the recursive construction.</param>
/// <returns>The populated hypergraph with its root vertex assigned.</returns>
public HyperGraph BuildHyperGraph(PhrasalTree tree) {
    var graph = new HyperGraph();
    graph.lbuf = new double[rules.subTagCounts.Max() + 1];
    // The recursive overload creates the vertices and hands back the root.
    HyperVertex rootVertex;
    BuildHyperGraph(tree.Root, graph, out rootVertex);
    graph.Root = rootVertex;
    return graph;
}