// Deserializes a TagSet from a TextModelReader, checking the type-name marker, version, and nesting level.
public static TagSet LoadFromStream(TextModelReader sr) {
    string xname = sr.Read();
    if (xname != typeof(TagSet).FullName) { throw new Exception("model name does not match"); }
    int startlvl = sr.NestLevel;
    var xver = sr.ReadOptionUInt64("VER");
    if (xver != VER) { throw new Exception("version number does not match"); }
    var ts = new TagSet();
    ts.ROOT = sr.ReadOptionString("ROOT");
    ts.PTs = CodeBook32.LoadFromStream(sr);
    ts.NTs = CodeBook32.LoadFromStream(sr);
    xname = sr.Read();
    if (xname != typeof(TagSet).FullName || sr.NestLevel != startlvl) { throw new Exception("model name does not match"); }
    return ts;
}
public void TestTextReset() {
    using (var fr = File.OpenRead("model.txt")) {
        var reader = new TextModelReader(fr);
        var wv1 = reader.ReadVector();
        reader.Reset();
        var wv2 = reader.ReadVector();
        Assert.AreEqual(4501, reader.Words);
        Assert.AreEqual(100, reader.Size);
        Assert.AreEqual(wv1.Word, wv2.Word);
        CollectionAssert.AreEqual(wv1.Vector, wv2.Vector);
    }
}
public void TestReLoadingText() {
    var model = Model.Load("model.txt");
    Model m2;
    using (var s = new MemoryStream()) {
        using (var writer = new TextModelWriter(s, true)) {
            writer.Write(model);
        }
        s.Seek(0, SeekOrigin.Begin);
        using (var tmr = new TextModelReader(s)) {
            m2 = Model.Load(tmr);
        }
    }
    Assert.AreEqual(model.Words, m2.Words);
    Assert.AreEqual(model.Size, m2.Size);
}
// Deserializes a Vocabulary, verifying the signature/version and that the loaded code books match the recorded counts.
public static Vocabulary LoadFromStream(TextModelReader sr) {
    var v = new Vocabulary();
    int knownWordCount = 0;
    int sigCount = 0;
    string name = typeof(Vocabulary).FullName;
    string line = sr.Read();
    int startLvl = sr.NestLevel;
    if (line != name) { throw new Exception("error in model file!"); }
    var xsig = sr.ReadOptionUInt64("SIG");
    var xver = sr.ReadOptionUInt64("VER");
    if (xsig != SIG || xver != VER) { throw new Exception("Signature or version does not match!"); }
    knownWordCount = sr.ReadOptionInt("knownWordCount");
    sigCount = sr.ReadOptionInt("sigCount");
    v.vocab = CodeBook32.LoadFromStream(sr);
    v.signitureVocab = CodeBook32.LoadFromStream(sr);
    if (v.vocab.Count != knownWordCount || v.signitureVocab.Count != sigCount) { throw new Exception("vocab size does not match"); }
    string closeline = sr.Read();
    if (sr.NestLevel != startLvl || closeline != name) { throw new Exception("model is not closed!"); }
    return v;
}
// Parses the test set (sentences up to 20 words) in parallel with the loaded grammar and reports bracketing precision/recall/F1.
static double EvaluateRawParser() {
    string modelfile = //@"/home/nan/Data/PTB/ptb.s1.smoothed.grammar";
        @"D:\user\nyang\data\treebank\English\pcfg\ptb.s2.smoothed.grammar";
    string testfile = //@"/home/nan/Data/PTB/xbar/wsj.23.flat";
        @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.23.flat";
    string outputfile = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.flat.out";
    string reffile = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.flat.ref";
    string bklex = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.lexicon";
    string bkgrammar = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.grammar";
    int nthread = 16;
    Vocabulary vocab;
    TagSet tagSet;
    LAPCFGrammar grammar;
    using (var s = new TextModelReader(modelfile)) {
        vocab = Vocabulary.LoadFromStream(s);
        tagSet = TagSet.LoadFromStream(s);
        grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
        //grammar = new LAPCFGrammar(tagSet, vocab, bklex, bkgrammar);
    }
    //grammar.Smoothing(0.01, 0.1);
    //grammar.Normalize();
    //grammar.PropMaxUnaryPath();
    grammar.MakeCompaction();
    grammar.MakeSubruleCompaction();
    var treebank = new List<PhrasalTree>();
    LoadTrees(treebank, testfile);
    foreach (var tree in treebank) {
        foreach (var node in tree.TreeNodes) {
            if (node.Children.Count == 0) { node.Lex = SimpleTokenizor.ETokenize(node.Lex); }
        }
        //PennTreeProcessor.RecoverFromXBarBinarize(tree.Root);
        tree.ComputeStartEnd();
    }
    treebank = treebank.Where(x => x.Root.End <= 20).ToList();
    double ccount = 0;
    double pcount = 0;
    double gcount = 0;
    int failed = 0;
    int sentcount = treebank.Count;
    Console.Error.WriteLine("Start to parse...");
    ConsoleTimer tm = new ConsoleTimer(1);
    PhrasalTree[] ptrees = new PhrasalTree[treebank.Count];
    Parallel.For(0, nthread, thrID => {
        HyperEdgePool epool = new HyperEdgePool();
        HyperVertexPool vpool = new HyperVertexPool(grammar.subTagCounts.Max());
        for (int treeId = thrID; treeId < treebank.Count; treeId += nthread) {
            var tree = treebank[treeId];
            var words = tree.GetSentence().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
            int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();
            wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);
            string[] tags = tree.GetPOS().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
            double[][] tprobs = new double[wids.Length][];
            //for (int i = 0; i < wids.Length; ++i)
            //{
            //    tprobs[i] = rwHanlder.GetLogProbs(SimpleTokenizor.ETokenize(words[i]));
            //}
            bool[][] allowedTags = null;
            //AssignTagConstraints(vocab, tagSet, words, wids);
            try {
                //var parser = new ChartParser(wids);
                var parser = new ChartHyperGraphParser(wids, allowedTags, tprobs);
                parser.BuildHyperGraph(grammar, epool, vpool, grammar.subTagCounts);
                parser.SumForward();
                parser.SumBackward(true);
                parser.PosteriorViterbi();
                var ptree = parser.ExtractPosteriorViterbi(words, tagSet);
                //parser.MaxForward();
                //var ptree = parser.ExtractViterbi(words, tagSet);
                ptree.ComputeStartEnd();
                ptrees[treeId] = ptree;
            } catch { }
        }
    });
    using (StreamWriter sw = new StreamWriter(outputfile)) {
        using (StreamWriter swref = new StreamWriter(reffile)) {
            for (int treeid = 0; treeid < treebank.Count; ++treeid) {
                var tree = treebank[treeid];
                var ptree = ptrees[treeid];
                swref.WriteLine(tree.GetParseLine());
                if (ptree == null) {
                    failed += 1;
                    sw.WriteLine("()");
                    continue;
                }
                var pbrackets = ptree.GetBracketsIgnorePunc();
                var gbrackets = tree.GetBracketsIgnorePunc();
                gcount += gbrackets.Count;
                pcount += pbrackets.Count;
                double xxc = 0;
                foreach (var b in pbrackets) {
                    if (gbrackets.Contains(b)) {
                        ccount += 1;
                        xxc += 1;
                    }
                }
                if (pbrackets.Count == 0 || (pbrackets.Count < gbrackets.Count / 2)) {
                    Console.Error.WriteLine("\nStrange prediction: p={0}, g={1}", pbrackets.Count, gbrackets.Count);
                }
                string parseline = ptree.GetParseLine();
                double snt_p = xxc / pbrackets.Count;
                double snt_r = xxc / gbrackets.Count;
                double snt_f1 = 2.0 * snt_p * snt_r / (snt_p + snt_r);
                sw.WriteLine(parseline);
                //sw.WriteLine(" [Current]\tP: {0:F2} R: {1:F2} F1: {2:F3}", snt_p * 100.0, snt_r * 100.0, snt_f1 * 100.0);
            }
        }
    }
    tm.Finish();
    Console.Error.WriteLine("SENT: {0}\tFAIL: {1}", sentcount, failed);
    double prec = ccount / pcount;
    double recall = ccount / gcount;
    double f1 = 2.0 * prec * recall / (prec + recall);
    Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);
    return f1;
}
// Coarse-to-fine evaluation: builds projected/collapsed grammars from the tag map, parses each test sentence
// through the grammar hierarchy, and reports bracketing F1 plus per-stage timings.
static void EvaluateParser() {
    string modelfile = //@"/home/nan/Data/PTB/ptb.s1.smoothed.grammar";
        @"D:\user\nyang\data\treebank\English\pcfg\ptb.s1.smoothed.grammar";
    string tagmapfile = @"E:\nyang\TFS\Common\Users\v-nayang\LAPCFGParser\en-tag-map.txt";
    string testfile = //@"/home/nan/Data/PTB/xbar/wsj.23.flat";
        @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.23.flat";
    string trainfile = @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.02-21.flat";
    Vocabulary vocab;
    TagSet tagSet;
    LAPCFGrammar grammar;
    var traintrees = new List<PhrasalTree>();
    LoadTrees(traintrees, trainfile);
    var rwHanlder = new RareWordHandler(traintrees, 10);
    using (var s = new TextModelReader(modelfile)) {
        vocab = Vocabulary.LoadFromStream(s);
        tagSet = TagSet.LoadFromStream(s);
        grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
    }
    rwHanlder.Build(tagSet, 0.001);
    //grammar.Smoothing(0.1f);
    grammar.MakeCompaction();
    grammar.MakeSubruleCompaction();
    var grammars = new LAPCFGrammar[grammar.subtagTraces.Count + 1];
    var traces = new int[grammars.Length][][];
    grammars[grammars.Length - 1] = grammar;
    for (int i = grammars.Length - 1; i >= 1; --i) {
        traces[i] = grammar.subtagTraces[i - 1];
        grammars[i - 1] = grammars[i].ProjectGrammar(traces[i]);
        grammars[i - 1].MakeCompaction();
        grammars[i - 1].MakeSubruleCompaction();
    }
    string[][] tagTiers;
    using (StreamReader sr = new StreamReader(tagmapfile)) {
        var tt = new List<string[]>();
        while (!sr.EndOfStream) {
            string line = sr.ReadLine();
            if (string.IsNullOrWhiteSpace(line)) { continue; }
            tt.Add(line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries));
        }
        tagTiers = new string[tt[0].Length][]; //tt.ToArray();
        for (int i = 0; i < tagTiers.Length; ++i) {
            tagTiers[i] = new string[tt.Count];
            for (int j = 0; j < tt.Count; ++j) {
                tagTiers[i][j] = tt[j][i];
            }
        }
    }
    var cbs = new CodeBook32[tagTiers.Length];
    for (int i = 0; i < cbs.Length; ++i) {
        cbs[i] = new CodeBook32();
        foreach (var t in tagTiers[i]) { cbs[i].Add(t); }
    }
    int pgcount = cbs.Length - 1;
    int[][] tagMaps = new int[pgcount][];
    for (int i = 0; i < tagMaps.Length; ++i) {
        tagMaps[i] = new int[grammars[0].PTCount + 1 + cbs[i + 1].Count];
        for (int j = 0; j < grammars[0].PTCount + 1; ++j) {
            tagMaps[i][j] = j;
        }
    }
    var lastMap = tagMaps[tagMaps.Length - 1];
    for (int j = grammars[0].PTCount + 1; j < lastMap.Length; ++j) {
        string tstr = tagSet.GetTagString(j);
        int id = cbs[cbs.Length - 1][tstr];
        int pid = cbs[cbs.Length - 2][tagTiers[tagTiers.Length - 2][id]];
        lastMap[j] = pid + grammars[0].PTCount + 1;
    }
    for (int i = 0; i < tagMaps.Length - 1; ++i) {
        for (int j = grammars[0].PTCount + 1; j < tagMaps[i].Length; ++j) {
            string tstr = cbs[i + 1][j - grammars[0].PTCount - 1];
            int xid = Array.IndexOf(tagTiers[i + 1], tstr);
            string pstr = tagTiers[i][xid];
            int pid = cbs[i][pstr];
            tagMaps[i][j] = pid;
        }
    }
    var cgrammars = new LAPCFGrammar[tagMaps.Length];
    cgrammars[cgrammars.Length - 1] = grammars[0].CollapseNonTerminals(tagMaps[cgrammars.Length - 1], 1 + cbs[cgrammars.Length - 1].Count);
    for (int i = cgrammars.Length - 1; i >= 1; --i) {
        cgrammars[i - 1] = cgrammars[i].CollapseNonTerminals(tagMaps[i - 1], 1 + cbs[i - 1].Count);
    }
    for (int i = 0; i < cgrammars.Length; ++i) {
        cgrammars[i].MakeCompaction();
        cgrammars[i].MakeSubruleCompaction();
    }
    var treebank = new List<PhrasalTree>();
    LoadTrees(treebank, testfile);
    foreach (var tree in treebank) {
        foreach (var node in tree.TreeNodes) {
            if (node.Children.Count == 0) { node.Lex = SimpleTokenizor.ETokenize(node.Lex); }
        }
        //PennTreeProcessor.RecoverFromXBarBinarize(tree.Root);
        tree.ComputeStartEnd();
    }
    double ccount = 0;
    double pcount = 0;
    double gcount = 0;
    int failed = 0;
    int sentcount = 0;
    HyperEdgePool epool = new HyperEdgePool(1024 * 1024);
    HyperVertexPool vpool = new HyperVertexPool(grammars[grammars.Length - 1].subTagCounts.Max());
    //EMorph.EnglishMorph.WarmUp();
    Console.Error.WriteLine("Start to parse...");
    ConsoleTimer tm = new ConsoleTimer(1);
    Stopwatch g0bwatch = new Stopwatch();
    Stopwatch g0watch = new Stopwatch();
    Stopwatch bwatch = new Stopwatch();
    Stopwatch[] gwatch = new Stopwatch[grammars.Length];
    for (int i = 0; i < gwatch.Length; ++i) { gwatch[i] = new Stopwatch(); }
    Stopwatch vwatch = new Stopwatch();
    foreach (var tree in treebank) {
        var words = tree.GetSentence().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
        if (words.Length > 20) { continue; }
        sentcount += 1;
        int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();
        wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);
        string[] tags = tree.GetPOS().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
        double[][] tprobs = new double[wids.Length][];
        //for (int i = 0; i < wids.Length; ++i)
        //{
        //    tprobs[i] = rwHanlder.GetLogProbs(SimpleTokenizor.ETokenize(words[i]));
        //}
        bool[][] allowedTags = null;
        //AssignTagConstraints(vocab, tagSet, words, wids);
        try {
            //var parser = new ChartParser(wids);
            var parser = new ChartHyperGraphParser(wids, allowedTags, tprobs);
            g0bwatch.Start();
            parser.BuildHyperGraph(cgrammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            g0bwatch.Stop();
            g0watch.Start();
            parser.SumForward();
            parser.SumBackward(false);
            parser.Prune(-15.0);
            parser.Purge();
            for (int i = 1; i < cgrammars.Length; ++i) {
                parser.ExpandHyperGraph(cgrammars[i], tagMaps[i - 1], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-15.0);
                parser.Purge();
            }
            g0watch.Stop();
            bwatch.Start();
            parser.ExpandHyperGraph(grammars[0], tagMaps[2], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            //parser.BuildHyperGraph(grammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            bwatch.Stop();
            for (int i = 0; i < grammars.Length - 1; ++i) {
                gwatch[i].Start();
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-10.0);
                parser.Purge();
                parser.ProjectGrammar(traces[i + 1], grammars[i + 1]);
                gwatch[i].Stop();
            }
            gwatch[grammars.Length - 1].Start();
            parser.SumForward();
            parser.SumBackward(true);
            gwatch[grammars.Length - 1].Stop();
            vwatch.Start();
            parser.PosteriorViterbi();
            var ptree = parser.ExtractPosteriorViterbi(words, tagSet);
            vwatch.Stop();
            //PennTreeProcessor.RecoverFromXBarBinarize(ptree.Root);
            ptree.ComputeStartEnd();
            var pbrackets = ptree.GetBracketsIgnorePunc();
            var gbrackets = tree.GetBracketsIgnorePunc();
            gcount += gbrackets.Count;
            pcount += pbrackets.Count;
            foreach (var b in pbrackets) {
                if (gbrackets.Contains(b)) { ccount += 1; }
            }
            if (pbrackets.Count == 0 || (pbrackets.Count < gbrackets.Count / 2)) {
                Console.Error.WriteLine("\nStrange prediction: p={0}, g={1}", pbrackets.Count, gbrackets.Count);
            }
            //Console.Error.WriteLine(tree.TextTree);
        } catch {
            g0bwatch.Stop();
            g0watch.Stop();
            bwatch.Stop();
            foreach (var w in gwatch) { w.Stop(); }
            vwatch.Stop();
            failed += 1;
            Console.Error.WriteLine("\nFailure!");
        }
        tm.Up();
    }
    tm.Finish();
    Console.Error.WriteLine("SENT: {0}\tFAIL: {1}", sentcount, failed);
    double prec = ccount / pcount;
    double recall = ccount / gcount;
    double f1 = 2.0 * prec * recall / (prec + recall);
    Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);
    Console.Error.WriteLine("G-1 Build:\t{0:F6} s", g0bwatch.Elapsed.TotalSeconds);
    Console.Error.WriteLine("G-1 Pass:\t{0:F6} s", g0watch.Elapsed.TotalSeconds);
    Console.Error.WriteLine("G0 Build:\t{0:F6} s", bwatch.Elapsed.TotalSeconds);
    for (int i = 0; i < gwatch.Length; ++i) {
        Console.Error.WriteLine("G{0} Pass:\t{1:F6} s", i, gwatch[i].Elapsed.TotalSeconds);
    }
    Console.Error.WriteLine("Viterbi:\t{0:F6} s", vwatch.Elapsed.TotalSeconds);
}
// Interactive demo: reads sentences from stdin, constrains tags for rare/unknown words via English morphology,
// and prints the coarse-to-fine parse tree for each input line.
static void TestParse() {
    string modelfile = //@"/home/nan/Data/PTB/ptb.s2.smoothed.grammar";
        @"D:\user\nyang\data\treebank\English\pcfg\ptb.s6.smoothed.grammar";
    string tagmapfile = @"E:\nyang\TFS\Common\Users\v-nayang\LAPCFGParser\en-tag-map.txt";
    Vocabulary vocab;
    TagSet tagSet;
    LAPCFGrammar grammar;
    using (var s = new TextModelReader(modelfile)) {
        vocab = Vocabulary.LoadFromStream(s);
        tagSet = TagSet.LoadFromStream(s);
        grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
    }
    grammar.Smoothing(0.1f);
    grammar.MakeCompaction();
    grammar.MakeSubruleCompaction();
    var grammars = new LAPCFGrammar[grammar.subtagTraces.Count + 1];
    var traces = new int[grammars.Length][][];
    grammars[grammars.Length - 1] = grammar;
    for (int i = grammars.Length - 1; i >= 1; --i) {
        traces[i] = grammar.subtagTraces[i - 1];
        grammars[i - 1] = grammars[i].ProjectGrammar(traces[i]);
        grammars[i - 1].MakeCompaction();
        grammars[i - 1].MakeSubruleCompaction();
    }
    string[][] tagTiers;
    using (StreamReader sr = new StreamReader(tagmapfile)) {
        var tt = new List<string[]>();
        while (!sr.EndOfStream) {
            string line = sr.ReadLine();
            if (string.IsNullOrWhiteSpace(line)) { continue; }
            tt.Add(line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries));
        }
        tagTiers = new string[tt[0].Length][]; //tt.ToArray();
        for (int i = 0; i < tagTiers.Length; ++i) {
            tagTiers[i] = new string[tt.Count];
            for (int j = 0; j < tt.Count; ++j) {
                tagTiers[i][j] = tt[j][i];
            }
        }
    }
    var cbs = new CodeBook32[tagTiers.Length];
    for (int i = 0; i < cbs.Length; ++i) {
        cbs[i] = new CodeBook32();
        foreach (var t in tagTiers[i]) { cbs[i].Add(t); }
    }
    int pgcount = cbs.Length - 1;
    int[][] tagMaps = new int[pgcount][];
    for (int i = 0; i < tagMaps.Length; ++i) {
        tagMaps[i] = new int[grammars[0].PTCount + 1 + cbs[i + 1].Count];
        for (int j = 0; j < grammars[0].PTCount + 1; ++j) {
            tagMaps[i][j] = j;
        }
    }
    var lastMap = tagMaps[tagMaps.Length - 1];
    for (int j = grammars[0].PTCount + 1; j < lastMap.Length; ++j) {
        string tstr = tagSet.GetTagString(j);
        int id = cbs[cbs.Length - 1][tstr];
        int pid = cbs[cbs.Length - 2][tagTiers[tagTiers.Length - 2][id]];
        lastMap[j] = pid + grammars[0].PTCount + 1;
    }
    for (int i = 0; i < tagMaps.Length - 1; ++i) {
        for (int j = grammars[0].PTCount + 1; j < tagMaps[i].Length; ++j) {
            string tstr = cbs[i + 1][j - grammars[0].PTCount - 1];
            int xid = Array.IndexOf(tagTiers[i + 1], tstr);
            string pstr = tagTiers[i][xid];
            int pid = cbs[i][pstr];
            tagMaps[i][j] = pid;
        }
    }
    var cgrammars = new LAPCFGrammar[tagMaps.Length];
    cgrammars[cgrammars.Length - 1] = grammars[0].CollapseNonTerminals(tagMaps[cgrammars.Length - 1], 1 + cbs[cgrammars.Length - 1].Count);
    for (int i = cgrammars.Length - 1; i >= 1; --i) {
        cgrammars[i - 1] = cgrammars[i].CollapseNonTerminals(tagMaps[i - 1], 1 + cbs[i - 1].Count);
    }
    for (int i = 0; i < cgrammars.Length; ++i) {
        cgrammars[i].MakeCompaction();
        cgrammars[i].MakeSubruleCompaction();
    }
    HyperEdgePool epool = new HyperEdgePool(1024 * 1024);
    HyperVertexPool vpool = new HyperVertexPool(grammars[grammars.Length - 1].subTagCounts.Max());
    EMorph.EnglishMorph.WarmUp();
    Console.Error.WriteLine("READY");
    while (true) {
        string line = Console.ReadLine();
        if (string.IsNullOrWhiteSpace(line)) { continue; }
        var words = line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
        int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();
        wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);
        bool[][] allowedTags = new bool[wids.Length][];
        for (int i = 0; i < wids.Length; ++i) {
            if (vocab.IsRareOrUNK(wids[i])) {
                var lemmas = EMorph.EnglishMorph.GetBaseForm(words[i]);
                if (lemmas == null || lemmas.Count == 0) { continue; }
                allowedTags[i] = new bool[tagSet.PTCount];
                if (char.IsUpper(words[i][0])) {
                    allowedTags[i][tagSet.GetID("NNP")] = true;
                    allowedTags[i][tagSet.GetID("NNPS")] = true;
                }
                foreach (var lemma in lemmas) {
                    switch (lemma.PoS) {
                        case EMorph.MorphPoS.NN: allowedTags[i][tagSet.GetID("NN")] = true; allowedTags[i][tagSet.GetID("NNS")] = true; break;
                        case EMorph.MorphPoS.NNS: allowedTags[i][tagSet.GetID("NNS")] = true; allowedTags[i][tagSet.GetID("NN")] = true; break;
                        case EMorph.MorphPoS.JJ: allowedTags[i][tagSet.GetID("JJ")] = true; break;
                        case EMorph.MorphPoS.JJR: allowedTags[i][tagSet.GetID("JJR")] = true; break;
                        case EMorph.MorphPoS.JJS: allowedTags[i][tagSet.GetID("JJS")] = true; break;
                        case EMorph.MorphPoS.RB: allowedTags[i][tagSet.GetID("RB")] = true; break;
                        case EMorph.MorphPoS.RBR: allowedTags[i][tagSet.GetID("RBR")] = true; break;
                        case EMorph.MorphPoS.RBS: allowedTags[i][tagSet.GetID("RBS")] = true; break;
                        case EMorph.MorphPoS.VB: allowedTags[i][tagSet.GetID("VB")] = true; allowedTags[i][tagSet.GetID("VBP")] = true; break;
                        case EMorph.MorphPoS.VBD: allowedTags[i][tagSet.GetID("VBD")] = true; allowedTags[i][tagSet.GetID("VBN")] = true; break;
                        case EMorph.MorphPoS.VBG: allowedTags[i][tagSet.GetID("VBG")] = true; break;
                        case EMorph.MorphPoS.VBZ: allowedTags[i][tagSet.GetID("VBZ")] = true; break;
                        default: throw new Exception("not recognized morph lemma!");
                    }
                }
            }
        }
        try {
            var parser = new ChartHyperGraphParser(wids, allowedTags);
            parser.BuildHyperGraph(cgrammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            parser.SumForward();
            parser.SumBackward(false);
            parser.Prune(-10.0);
            parser.Purge();
            for (int i = 1; i < cgrammars.Length; ++i) {
                parser.ExpandHyperGraph(cgrammars[i], tagMaps[i - 1], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-10.0);
                parser.Purge();
            }
            parser.ExpandHyperGraph(grammars[0], tagMaps[2], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
            for (int i = 0; i < grammars.Length - 1; ++i) {
                parser.SumForward();
                parser.SumBackward(false);
                parser.Prune(-8.0);
                parser.Purge();
                parser.ProjectGrammar(traces[i + 1], grammars[i + 1]);
            }
            parser.SumForward();
            parser.SumBackward(true);
            parser.PosteriorViterbi();
            var ptree = parser.ExtractPosteriorViterbi(words, tagSet);
            PennTreeProcessor.RecoverFromXBarBinarize(ptree.Root);
            ptree.ComputeStartEnd();
            string treeline = ptree.TextTree;
            string[] xlines = treeline.Split(new string[] { "\n", "\r", "\r\n" }, StringSplitOptions.RemoveEmptyEntries);
            foreach (var xline in xlines) {
                Console.Error.WriteLine(xline);
            }
        } catch {
            Console.Error.WriteLine("Failure to parse!");
        }
    }
}
// One split/EM training run with the given random seed; saves intermediate grammars and returns the test-set F1 from EvaluateRawParser().
static double RunTrainTest(int randSeed) {
    string rawdir = //@"/home/nan/Data/PTB/flat/";
        @"D:\user\nyang\data\treebank\English\pcfg\flat";
    string inputdir = //@"/home/nan/Data/PTB/xbar/";
        @"D:\user\nyang\data\treebank\English\pcfg\xbar";
    string trainfile = "wsj.02-21.flat";
    string devfile = "wsj.22.flat";
    string testfile = "wsj.23.flat";
    string vocabfn = //@"/home/nan/Data/PTB/nw.wsj.vocab";
        @"D:\user\nyang\data\treebank\English\pcfg\nw.wsj.vocab";
    string tagfn = //@"/home/nan/Data/PTB/nw.wsj.tagset";
        @"D:\user\nyang\data\treebank\English\pcfg\nw.wsj.tagset";
    string modeldir = //@"/home/nan/Data/PTB/";
        @"D:\user\nyang\data\treebank\English\pcfg\";
    string urulefile = "wsj.urule.count";
    double pruningThr = -30.0 / Math.Log10(Math.E);
    double smoothingAlpha = 0.01f;
    int afterSplitRound = 50;
    int afterMergeRound = 20;
    int afterSmoothRound = 10;
    int nthread = 16;
    Vocabulary vocab;
    TagSet tagSet;
    using (var s = new TextModelReader(vocabfn)) { vocab = Vocabulary.LoadFromStream(s); }
    using (var s = new TextModelReader(tagfn)) { tagSet = TagSet.LoadFromStream(s); }
    var treebank = new List<PhrasalTree>();
    LoadTrees(treebank, Path.Combine(inputdir, trainfile));
    var devtrees = new List<PhrasalTree>();
    LoadTrees(devtrees, Path.Combine(inputdir, devfile));
    foreach (var tree in treebank) {
        foreach (var node in tree.TreeNodes) {
            if (node.Children.Count == 0) { node.Lex = SimpleTokenizor.ETokenize(node.Lex); }
        }
    }
    foreach (var tree in devtrees) {
        foreach (var node in tree.TreeNodes) {
            if (node.Children.Count == 0) { node.Lex = SimpleTokenizor.ETokenize(node.Lex); }
        }
    }
    Random RNG = new Random(randSeed);
    LAPCFGrammar rules;
    GrammarBuilder.Build(vocab, tagSet, treebank, out rules, RNG);
    rules.Normalize();
    rules.Prune(pruningThr);
    rules.Trim();
    rules.InitializeExpectedCounts();
    using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s0.grammar")), true)) {
        vocab.DumpToStream(sw);
        tagSet.DumpToStream(sw);
        rules.DumpToStream(sw, tagSet, vocab);
    }
    //EvaluateRawParser();
    //return;
    rules = rules.SplitSymbols(RNG, 1.0f);
    rules.Normalize();
    rules.Prune(pruningThr);
    rules.Trim();
    rules.InitializeExpectedCounts();
    int failed = 0;
    var smoothRules = rules;
    double llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
    smoothRules = rules.Clone();
    smoothRules.Smoothing(smoothingAlpha);
    smoothRules.Normalize();
    double devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
    double devbest = 0;
    int bestround = 0;
    var bestRules = rules;
    for (int round = 0; round < afterSplitRound; ++round) {
        GrammarBuilder.CalculateNewScores(rules);
        rules.Normalize();
        rules.Prune(pruningThr);
        rules.Trim();
        rules.InitializeExpectedCounts();
        llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
        smoothRules = rules.Clone();
        smoothRules.Smoothing(smoothingAlpha);
        smoothRules.Normalize();
        devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
        //Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
        Console.Error.Write(".");
        if (round == 0 || devllhd > devbest) {
            devbest = devllhd;
            bestround = round;
            bestRules = rules.Clone();
        } else {
            if (round - bestround > 3 && round >= 50) { break; }
        }
    }
    Console.Error.WriteLine();
    rules = bestRules.Clone();
    //rules.Normalize();
    //rules.Prune(pruningThr);
    //rules.Normalize();
    using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s" + "1" + ".grammar")), true)) {
        vocab.DumpToStream(sw);
        tagSet.DumpToStream(sw);
        rules.DumpToStream(sw, tagSet, vocab);
    }
    return EvaluateRawParser();
}
// Full training loop: repeated split / EM / merge / smoothing cycles, saving a grammar after each stage of every cycle.
static void Run() {
    string rawdir = //@"/home/nan/Data/PTB/flat/";
        @"D:\user\nyang\data\treebank\English\pcfg\flat";
    string inputdir = //@"/home/nan/Data/PTB/xbar/";
        @"D:\user\nyang\data\treebank\English\pcfg\xbar";
    string bklex = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.lexicon";
    string bkgrammar = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.grammar";
    string trainfile = "wsj.02-21.flat";
    string devfile = "wsj.22.flat";
    string testfile = "wsj.23.flat";
    string vocabfn = //@"/home/nan/Data/PTB/nw.wsj.vocab";
        @"D:\user\nyang\data\treebank\English\pcfg\nw.wsj.vocab";
    string tagfn = //@"/home/nan/Data/PTB/nw.wsj.tagset";
        @"D:\user\nyang\data\treebank\English\pcfg\nw.wsj.tagset";
    string modeldir = //@"/home/nan/Data/PTB/";
        @"D:\user\nyang\data\treebank\English\pcfg\";
    string urulefile = "wsj.urule.count";
    double pruningThr = -30.0 / Math.Log10(Math.E);
    double smoothingAlpha = 0.01f;
    int afterSplitRound = 50;
    int afterMergeRound = 20;
    int afterSmoothRound = 20;
    int nthread = 16;
    //Preprocess(Path.Combine(rawdir, trainfile), Path.Combine(inputdir, trainfile), Path.Combine(modeldir, urulefile));
    //Preprocess(Path.Combine(rawdir, devfile), Path.Combine(inputdir, devfile), null);
    //Preprocess(Path.Combine(rawdir, testfile), Path.Combine(inputdir, testfile), null);
    //BuildVocab(Path.Combine(inputdir, trainfile), vocabfn, 20);
    //BuildTagset(Path.Combine(inputdir, trainfile), tagfn);
    //return;
    Vocabulary vocab;
    TagSet tagSet;
    using (var s = new TextModelReader(vocabfn)) { vocab = Vocabulary.LoadFromStream(s); }
    using (var s = new TextModelReader(tagfn)) { tagSet = TagSet.LoadFromStream(s); }
    var treebank = new List<PhrasalTree>();
    LoadTrees(treebank, Path.Combine(inputdir, trainfile));
    var devtrees = new List<PhrasalTree>();
    LoadTrees(devtrees, Path.Combine(inputdir, devfile));
    foreach (var tree in treebank) {
        foreach (var node in tree.TreeNodes) {
            if (node.Children.Count == 0) { node.Lex = SimpleTokenizor.ETokenize(node.Lex); }
        }
    }
    foreach (var tree in devtrees) {
        foreach (var node in tree.TreeNodes) {
            if (node.Children.Count == 0) { node.Lex = SimpleTokenizor.ETokenize(node.Lex); }
        }
    }
    Random RNG = new Random(0);
    LAPCFGrammar rules;
    GrammarBuilder.Build(vocab, tagSet, treebank, out rules, RNG);
    rules.Normalize();
    rules.Prune(pruningThr);
    rules.Trim();
    rules.InitializeExpectedCounts();
    GC.Collect();
    GC.WaitForPendingFinalizers();
    Console.Error.WriteLine("save model");
    using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s0.grammar")), true)) {
        vocab.DumpToStream(sw);
        tagSet.DumpToStream(sw);
        rules.DumpToStream(sw, tagSet, vocab);
    }
    //EvaluateRawParser();
    //return;
    for (int smCycle = 1; smCycle <= 6; ++smCycle) {
        Console.Error.WriteLine("start cycle {0}", smCycle);
        Console.Error.WriteLine("split grammar...");
        //rules = new LAPCFGrammar(tagSet, vocab, bklex, bkgrammar);
        //rules.InitializeExpectedCounts();
        rules = rules.SplitSymbols(RNG, 1.0f);
        rules.Normalize();
        rules.Prune(pruningThr);
        rules.Trim();
        rules.InitializeExpectedCounts();
        GC.Collect();
        GC.WaitForPendingFinalizers();
        Console.Error.WriteLine("done");
        GC.Collect();
        GC.WaitForPendingFinalizers();
        int failed = 0;
        var smoothRules = rules;
        double llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
        smoothRules = rules.Clone();
        smoothRules.Smoothing(smoothingAlpha);
        smoothRules.Normalize();
        double devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
        Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
        double devbest = 0;
        int bestround = 0;
        var bestRules = rules;
        for (int round = 0; round < afterSplitRound; ++round) {
            GrammarBuilder.CalculateNewScores(rules);
            rules.Normalize();
            rules.Prune(pruningThr);
            rules.Trim();
            rules.InitializeExpectedCounts();
            GC.Collect();
            GC.WaitForPendingFinalizers();
            //rules.Normalize();
            //rules.Prune(pruningThr);
            //rules.Normalize();
            //GrammarBuilder.CheckProbs(rules);
            llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
            smoothRules = rules.Clone();
            smoothRules.Smoothing(smoothingAlpha);
            smoothRules.Normalize();
            devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
            Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
            if (round == 0 || devllhd > devbest) {
                devbest = devllhd;
                bestround = round;
                bestRules = rules.Clone();
            } else {
                if (round - bestround > 3 && round >= 50) { break; }
            }
        }
        rules = bestRules.Clone();
        //rules.Normalize();
        rules.Prune(pruningThr);
        //rules.Normalize();
        Console.Error.WriteLine("save model");
        using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s" + smCycle.ToString() + ".grammar")), true)) {
            vocab.DumpToStream(sw);
            tagSet.DumpToStream(sw);
            rules.DumpToStream(sw, tagSet, vocab);
        }
        //EvaluateRawParser();
        //return;
        rules = bestRules;
        Console.Error.WriteLine("merging symbols...");
        rules = GrammarBuilder.MergeSymbols(0.5f, vocab, tagSet, rules, treebank, nthread);
        llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
        GrammarBuilder.CalculateNewScores(rules, true);
        rules.Normalize();
        rules.Prune(pruningThr);
        rules.Trim();
        rules.InitializeExpectedCounts();
        GC.Collect();
        GC.WaitForPendingFinalizers();
        llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
        smoothRules = rules.Clone();
        smoothRules.Smoothing(smoothingAlpha);
        smoothRules.Normalize();
        devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
        Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
        for (int round = 0; round < afterMergeRound; ++round) {
            GrammarBuilder.CalculateNewScores(rules, false);
            rules.Normalize();
            rules.Prune(pruningThr);
            rules.Trim();
            rules.InitializeExpectedCounts();
            llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
            smoothRules = rules.Clone();
            smoothRules.Smoothing(smoothingAlpha);
            smoothRules.Normalize();
            devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
            Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
            if (round == 0 || devllhd > devbest) {
                devbest = devllhd;
                bestround = round;
                bestRules = rules.Clone();
            } else {
                if (round - bestround > 3 && round >= 10) { break; }
            }
        }
        rules = bestRules.Clone();
        //rules.Normalize();
        rules.Prune(pruningThr);
        //rules.Normalize();
        Console.Error.WriteLine("save model");
        using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s" + smCycle.ToString() + ".merged.grammar")), true)) {
            vocab.DumpToStream(sw);
            tagSet.DumpToStream(sw);
            rules.DumpToStream(sw, tagSet, vocab);
        }
        Console.Error.WriteLine("smoothing model...");
        rules = bestRules.Clone();
        rules.InitializeExpectedCounts();
        rules.Smoothing(smoothingAlpha);
        llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
        smoothRules = rules.Clone();
        smoothRules.Smoothing(smoothingAlpha);
        smoothRules.Normalize();
        devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
        Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
        for (int round = 0; round < afterSmoothRound; ++round) {
            GrammarBuilder.CalculateNewScores(rules, false);
            rules.Normalize();
            rules.Smoothing(0.01, 0.1);
            rules.Prune(pruningThr);
            //rules.Normalize();
            rules.Trim();
            rules.InitializeExpectedCounts();
            llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
            smoothRules = rules.Clone();
            devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
            if (round == 0 || devllhd > devbest) {
                devbest = devllhd;
                bestround = round;
                bestRules = rules.Clone();
            } else {
                if (round - bestround > 3 && round >= 10) { break; }
            }
            Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
        }
        rules = bestRules.Clone();
        rules.Prune(pruningThr);
        Console.Error.WriteLine("save model");
        using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s" + smCycle.ToString() + ".smoothed.grammar")), true)) {
            vocab.DumpToStream(sw);
            tagSet.DumpToStream(sw);
            rules.DumpToStream(sw, tagSet, vocab);
        }
        rules = bestRules;
    }
    //EvaluateRawParser();
    return;
}
static void Main(string[] args) {
    try {
        IText text = new Text();
        IConcordance concordance = new Concordance();
        var concordanceParser = new ConcordanceParser();
        var configuration = new GlobalConfiguration();
        if (args.Length != 0) {
            configuration.FileNames = args;
        } else {
            using var reader = new StreamReader("../../../config.json");
            var json = reader.ReadToEnd();
            configuration = JsonConvert.DeserializeObject<GlobalConfiguration>(json);
        }
        foreach (var fileName in configuration.FileNames) {
            using var stream = new StreamReader(new FileStream(fileName, FileMode.Open));
            var textParser = new TextParser();
            var textReader = new TextModelReader(stream);
            text = textParser.ParseText(textReader.ReadAllText());
        }
        foreach (var fileName in configuration.FileNames) {
            using var stream = new StreamReader(new FileStream(fileName, FileMode.Open));
            var textParser = new TextParser();
            var textReader = new TextModelReader(stream);
            concordance = concordanceParser.ParseText(textReader.ReadAllText());
        }
        var jsonText = JsonTextSerializer.Serialize(text);
        var jsonConc = JsonTextSerializer.Serialize(concordance);
        using (var writer = new StreamWriter("../../../text.json")) {
            var textModelWriter = new TextModelWriter(writer);
            textModelWriter.Write(jsonText);
        }
        using (var writer = new StreamWriter("../../../concordance.json")) {
            var textModelWriter = new TextModelWriter(writer);
            textModelWriter.Write(jsonConc);
        }
        Console.WriteLine();
        Console.WriteLine("----Select words from question sentences with length 10------------------------");
        Console.WriteLine();
        foreach (var word in text.GetWordsFromQuestionSentences(10)) {
            Console.WriteLine(word);
        }
        Console.WriteLine();
        Console.WriteLine("----Order sentences by words count-------------------------");
        Console.WriteLine();
        foreach (var sentence in text.OrderSentencesByWordsCount()) {
            Console.Write(sentence);
            Console.Write(" --- ");
            Console.Write($"{sentence.WordsCount} words");
            Console.WriteLine();
        }
        Console.WriteLine();
        Console.WriteLine("-----Deleting words with length 10--------------");
        Console.WriteLine();
        text.DeleteWords(10);
        foreach (var sentence in text.Sentences) {
            Console.WriteLine(sentence);
        }
        Console.WriteLine();
        Console.WriteLine("-----Replacing words: \"In\" replaced by \"In word replaced\"----------------");
        Console.WriteLine();
        text.ReplaceWord("In", "In word replaced");
        foreach (var sentence in text.Sentences) {
            Console.WriteLine(sentence);
        }
        Console.WriteLine("------------------------------------");
    } catch (Exception e) {
        Console.WriteLine(e.Message);
    }
}
// Deserializes an LAPCFGrammar: header fields, terminal/unary/binary rule strings, and subtag traces, then rebuilds the rule tables.
public static LAPCFGrammar LoadFromStream(TextModelReader sr, Vocabulary vocab, TagSet tagSet) {
    var grammar = new LAPCFGrammar();
    var name = typeof(LAPCFGrammar).FullName;
    sr.Require(name);
    sr.Require("VER", VER);
    grammar.NTCount = sr.ReadOptionInt("NTCount");
    grammar.PTCount = sr.ReadOptionInt("PTCount");
    grammar.ROOTID = sr.ReadOptionInt("ROOTID");
    sr.Require("TerminalRule");
    int lvl = sr.NestLevel;
    var truleStrings = new HashSet<string>();
    var uruleStrings = new HashSet<string>();
    var bruleStrings = new HashSet<string>();
    string line = sr.Read();
    while (sr.NestLevel > lvl) {
        truleStrings.Add(line);
        line = sr.Read();
    }
    if (line != "UnaryRule") { throw new Exception("wrong model!"); }
    line = sr.Read();
    while (sr.NestLevel > lvl) {
        uruleStrings.Add(line);
        line = sr.Read();
    }
    if (line != "BinaryRule") { throw new Exception("wrong model!"); }
    line = sr.Read();
    while (sr.NestLevel > lvl) {
        bruleStrings.Add(line);
        line = sr.Read();
    }
    string[] parts = line.Split('\t');
    if (parts[0] != "TraceCount") { throw new Exception("error in model"); }
    int subtraceCount = int.Parse(parts[1]);
    grammar.subtagTraces = new List<int[][]>();
    for (int i = 0; i < subtraceCount; ++i) {
        int tlen = sr.ReadOptionInt("TRACE");
        int[][] trace = new int[tlen][];
        for (int j = 0; j < tlen; ++j) {
            trace[j] = sr.ReadIntArray();
        }
        grammar.subtagTraces.Add(trace);
    }
    if (grammar.subtagTraces.Count == 0) {
        grammar.subTagCounts = new int[grammar.TotalTagCount];
        ArrayHelper.Fill(grammar.subTagCounts, 1);
    } else {
        var trace = grammar.subtagTraces[grammar.subtagTraces.Count - 1];
        grammar.subTagCounts = trace.Select(x => x.Length).ToArray();
    }
    sr.Require(name);
    foreach (var str in uruleStrings) { grammar.BuildUnaryRule(str, tagSet); }
    foreach (var str in truleStrings) { grammar.BuildTerminalRule(str, vocab, tagSet); }
    foreach (var str in bruleStrings) { grammar.BuildBinaryRule(str, tagSet); }
    return grammar;
}