// Serializes this TagSet to the writer: a type-name sentinel line, the VER
// option, then a nested section containing ROOT and the PT/NT sub-tables,
// closed by the same sentinel line.
public void DumpToStream(TextModelWriter sw)
{
    var sentinel = typeof(TagSet).FullName;
    sw.Write(sentinel);
    sw.WriteOption("VER", VER);
    sw.NestLevel += 1;
    sw.WriteOption("ROOT", ROOT);
    PTs.DumpToStream(sw);
    NTs.DumpToStream(sw);
    sw.NestLevel -= 1;
    sw.Write(sentinel);
}
// Round-trips a model through an in-memory text serialization and checks
// that the reloaded copy agrees with the original on Words and Size.
public void TestReLoadingText()
{
    var original = Model.Load("model.txt");
    Model reloaded;
    using (var buffer = new MemoryStream())
    {
        // The writer is disposed before rewinding so its output is flushed.
        using (var writer = new TextModelWriter(buffer, true))
        {
            writer.Write(original);
        }
        buffer.Seek(0, SeekOrigin.Begin);
        using (var reader = new TextModelReader(buffer))
        {
            reloaded = Model.Load(reader);
        }
    }
    Assert.AreEqual(original.Words, reloaded.Words);
    Assert.AreEqual(original.Size, reloaded.Size);
}
// Round-trips a model through an in-memory text serialization and checks
// that the reloaded copy agrees with the original on Words and Size.
public void TestReLoadingText()
{
    var model = Model.Load("model.txt");
    Model m2;
    using (var s = new MemoryStream())
    {
        // Dispose the writer first so everything is flushed to the stream.
        using (var writer = new TextModelWriter(s, true))
        {
            writer.Write(model);
        }
        s.Seek(0, SeekOrigin.Begin);
        // FIX: the TextModelReader was previously constructed without a
        // using block and never disposed; wrap it so it is, matching the
        // writer above and the sibling test method.
        using (var tmr = new TextModelReader(s))
        {
            m2 = Model.Load(tmr);
        }
    }
    Assert.AreEqual(model.Words, m2.Words);
    Assert.AreEqual(model.Size, m2.Size);
}
// Builds a vocabulary from the trees in inputfn and writes it to outputfn.
// Every sentence is split on spaces/tabs, each token is normalized with
// SimpleTokenizor.ETokenize, and the builder is told whether the token is
// sentence-initial. Words below the cutoff count are handled by Build.
static void BuildVocab(string inputfn, string outputfn, int cutoff)
{
    var treebank = new List<PhrasalTree>();
    LoadTrees(treebank, inputfn);

    var builder = new VocabularyBuilder();
    var separators = new string[] { " ", "\t" };
    foreach (var tree in treebank)
    {
        var tokens = tree.GetSentence().Split(separators, StringSplitOptions.RemoveEmptyEntries);
        for (int pos = 0; pos < tokens.Length; ++pos)
        {
            var token = SimpleTokenizor.ETokenize(tokens[pos]);
            tokens[pos] = token;
            builder.Add(token, pos == 0); // second arg flags sentence-initial words
        }
    }
    builder.Build(cutoff);

    using (var sw = new TextModelWriter(outputfn))
    {
        builder.DumpToStream(sw);
    }
}
// Trains a latent-annotation PCFG starting from the given RNG seed and returns
// the score produced by EvaluateRawParser(). Pipeline: load vocabulary and tag
// set, load and tokenize train/dev treebanks, build an initial grammar, save
// it, perform one symbol-split cycle followed by up to afterSplitRound EM
// rounds with held-out early stopping, save the best grammar, then evaluate.
// NOTE(review): all paths are hard-coded Windows paths (the commented-out
// strings are the old Linux locations) — TODO make these configurable.
static double RunTrainTest(int randSeed) {
    string rawdir = //@"/home/nan/Data/PTB/flat/";
        @"D:\user\nyang\data\treebank\English\pcfg\flat";
    string inputdir = //@"/home/nan/Data/PTB/xbar/";
        @"D:\user\nyang\data\treebank\English\pcfg\xbar";
    string trainfile = "wsj.02-21.flat";
    string devfile = "wsj.22.flat";
    string testfile = "wsj.23.flat";
    string vocabfn = //@"/home/nan/Data/PTB/nw.wsj.vocab";
        @"D:\user\nyang\data\treebank\English\pcfg\nw.wsj.vocab";
    string tagfn = //@"/home/nan/Data/PTB/nw.wsj.tagset";
        @"D:\user\nyang\data\treebank\English\pcfg\nw.wsj.tagset";
    string modeldir = //@"/home/nan/Data/PTB/";
        @"D:\user\nyang\data\treebank\English\pcfg\";
    string urulefile = "wsj.urule.count";
    // Pruning threshold is -30 in log10 space, converted to natural-log space.
    double pruningThr = -30.0 / Math.Log10(Math.E);
    double smoothingAlpha = 0.01f;
    int afterSplitRound = 50;
    int afterMergeRound = 20;
    int afterSmoothRound = 10;
    int nthread = 16;
    Vocabulary vocab;
    TagSet tagSet;
    using (var s = new TextModelReader(vocabfn)) {
        vocab = Vocabulary.LoadFromStream(s);
    }
    using (var s = new TextModelReader(tagfn)) {
        tagSet = TagSet.LoadFromStream(s);
    }
    var treebank = new List<PhrasalTree>();
    LoadTrees(treebank, Path.Combine(inputdir, trainfile));
    var devtrees = new List<PhrasalTree>();
    LoadTrees(devtrees, Path.Combine(inputdir, devfile));
    // Normalize the lexical items (leaf nodes) of both treebanks.
    foreach (var tree in treebank) {
        foreach (var node in tree.TreeNodes) {
            if (node.Children.Count == 0) {
                node.Lex = SimpleTokenizor.ETokenize(node.Lex);
            }
        }
    }
    foreach (var tree in devtrees) {
        foreach (var node in tree.TreeNodes) {
            if (node.Children.Count == 0) {
                node.Lex = SimpleTokenizor.ETokenize(node.Lex);
            }
        }
    }
    Random RNG = new Random(randSeed);
    LAPCFGrammar rules;
    // Build the initial (unsplit) grammar and save it as ptb.s0.grammar.
    GrammarBuilder.Build(vocab, tagSet, treebank, out rules, RNG);
    rules.Normalize();
    rules.Prune(pruningThr);
    rules.Trim();
    rules.InitializeExpectedCounts();
    using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s0.grammar")), true)) {
        vocab.DumpToStream(sw);
        tagSet.DumpToStream(sw);
        rules.DumpToStream(sw, tagSet, vocab);
    }
    //EvaluateRawParser();
    //return;
    // One split cycle: split each symbol, then run EM rounds; dev likelihood
    // (computed on a smoothed clone so the training grammar is untouched)
    // drives early stopping.
    rules = rules.SplitSymbols(RNG, 1.0f);
    rules.Normalize();
    rules.Prune(pruningThr);
    rules.Trim();
    rules.InitializeExpectedCounts();
    int failed = 0;
    var smoothRules = rules;
    double llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
    smoothRules = rules.Clone();
    smoothRules.Smoothing(smoothingAlpha);
    smoothRules.Normalize();
    double devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
    double devbest = 0;
    int bestround = 0;
    var bestRules = rules;
    for (int round = 0; round < afterSplitRound; ++round) {
        GrammarBuilder.CalculateNewScores(rules);
        rules.Normalize();
        rules.Prune(pruningThr);
        rules.Trim();
        rules.InitializeExpectedCounts();
        llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
        smoothRules = rules.Clone();
        smoothRules.Smoothing(smoothingAlpha);
        smoothRules.Normalize();
        devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
        //Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
        Console.Error.Write(".");
        // Track the round with the best held-out likelihood; stop once we are
        // more than 3 rounds past the best AND at least 50 rounds in.
        if (round == 0 || devllhd > devbest) {
            devbest = devllhd;
            bestround = round;
            bestRules = rules.Clone();
        } else {
            if (round - bestround > 3 && round >= 50) {
                break;
            }
        }
    }
    Console.Error.WriteLine();
    rules = bestRules.Clone();
    //rules.Normalize();
    //rules.Prune(pruningThr);
    //rules.Normalize();
    using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s" + "1" + ".grammar")), true)) {
        vocab.DumpToStream(sw);
        tagSet.DumpToStream(sw);
        rules.DumpToStream(sw, tagSet, vocab);
    }
    return EvaluateRawParser();
}
// Derives a tag set from the trees in inputfn and writes it to outputfn.
// Leaf nodes (other than a root) contribute preterminal tags; all other
// nodes contribute nonterminal tags. The first tree's root tag is passed
// to Build as the designated root symbol.
static void BuildTagset(string inputfn, string outputfn)
{
    var treebank = new List<PhrasalTree>();
    LoadTrees(treebank, inputfn);

    var builder = new TagSetBuilder();
    foreach (var tree in treebank)
    {
        foreach (var node in tree.TreeNodes)
        {
            bool isPreterminal = node.Children.Count == 0 && node != tree.Root;
            if (isPreterminal)
            {
                builder.AddPT(node.Tag);
            }
            else
            {
                builder.AddNT(node.Tag);
            }
        }
    }
    builder.Build(treebank[0].Root.Tag);

    using (var sw = new TextModelWriter(outputfn))
    {
        builder.DumpToStream(sw);
    }
}
// Full split-merge-smooth training loop for a latent-annotation PCFG
// (Berkeley-parser style). For each of 6 cycles: split all symbols, run EM
// with held-out early stopping, save; merge half the splits back, run EM
// again, save; smooth, run EM once more, save. Grammars are written to
// modeldir as ptb.s{cycle}[.merged|.smoothed].grammar.
// NOTE(review): paths are hard-coded Windows paths (commented-out strings are
// the old Linux locations); the RNG seed is fixed at 0 — TODO make configurable.
static void Run() {
    string rawdir = //@"/home/nan/Data/PTB/flat/";
        @"D:\user\nyang\data\treebank\English\pcfg\flat";
    string inputdir = //@"/home/nan/Data/PTB/xbar/";
        @"D:\user\nyang\data\treebank\English\pcfg\xbar";
    string bklex = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.lexicon";
    string bkgrammar = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.grammar";
    string trainfile = "wsj.02-21.flat";
    string devfile = "wsj.22.flat";
    string testfile = "wsj.23.flat";
    string vocabfn = //@"/home/nan/Data/PTB/nw.wsj.vocab";
        @"D:\user\nyang\data\treebank\English\pcfg\nw.wsj.vocab";
    string tagfn = //@"/home/nan/Data/PTB/nw.wsj.tagset";
        @"D:\user\nyang\data\treebank\English\pcfg\nw.wsj.tagset";
    string modeldir = //@"/home/nan/Data/PTB/";
        @"D:\user\nyang\data\treebank\English\pcfg\";
    string urulefile = "wsj.urule.count";
    // Pruning threshold is -30 in log10 space, converted to natural-log space.
    double pruningThr = -30.0 / Math.Log10(Math.E);
    double smoothingAlpha = 0.01f;
    int afterSplitRound = 50;
    int afterMergeRound = 20;
    int afterSmoothRound = 20;
    int nthread = 16;
    // One-time preprocessing steps, kept for reference:
    //Preprocess(Path.Combine(rawdir, trainfile), Path.Combine(inputdir, trainfile), Path.Combine(modeldir, urulefile));
    //Preprocess(Path.Combine(rawdir, devfile), Path.Combine(inputdir, devfile), null);
    //Preprocess(Path.Combine(rawdir, testfile), Path.Combine(inputdir, testfile), null);
    //BuildVocab(Path.Combine(inputdir, trainfile), vocabfn, 20);
    //BuildTagset(Path.Combine(inputdir, trainfile), tagfn);
    //return;
    Vocabulary vocab;
    TagSet tagSet;
    using (var s = new TextModelReader(vocabfn)) {
        vocab = Vocabulary.LoadFromStream(s);
    }
    using (var s = new TextModelReader(tagfn)) {
        tagSet = TagSet.LoadFromStream(s);
    }
    var treebank = new List<PhrasalTree>();
    LoadTrees(treebank, Path.Combine(inputdir, trainfile));
    var devtrees = new List<PhrasalTree>();
    LoadTrees(devtrees, Path.Combine(inputdir, devfile));
    // Normalize the lexical items (leaf nodes) of both treebanks.
    foreach (var tree in treebank) {
        foreach (var node in tree.TreeNodes) {
            if (node.Children.Count == 0) {
                node.Lex = SimpleTokenizor.ETokenize(node.Lex);
            }
        }
    }
    foreach (var tree in devtrees)
    {
        foreach (var node in tree.TreeNodes) {
            if (node.Children.Count == 0) {
                node.Lex = SimpleTokenizor.ETokenize(node.Lex);
            }
        }
    }
    Random RNG = new Random(0);
    LAPCFGrammar rules;
    // Build the initial (unsplit) grammar and save it as ptb.s0.grammar.
    GrammarBuilder.Build(vocab, tagSet, treebank, out rules, RNG);
    rules.Normalize();
    rules.Prune(pruningThr);
    rules.Trim();
    rules.InitializeExpectedCounts();
    GC.Collect();
    GC.WaitForPendingFinalizers();
    Console.Error.WriteLine("save model");
    using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s0.grammar")), true)) {
        vocab.DumpToStream(sw);
        tagSet.DumpToStream(sw);
        rules.DumpToStream(sw, tagSet, vocab);
    }
    //EvaluateRawParser();
    //return;
    for (int smCycle = 1; smCycle <= 6; ++smCycle) {
        Console.Error.WriteLine("start cycle {0}", smCycle);
        // ---- Split phase: double the subtags, then EM with early stopping ----
        Console.Error.WriteLine("split grammar...");
        //rules = new LAPCFGrammar(tagSet, vocab, bklex, bkgrammar);
        //rules.InitializeExpectedCounts();
        rules = rules.SplitSymbols(RNG, 1.0f);
        rules.Normalize();
        rules.Prune(pruningThr);
        rules.Trim();
        rules.InitializeExpectedCounts();
        GC.Collect();
        GC.WaitForPendingFinalizers();
        Console.Error.WriteLine("done");
        GC.Collect();
        GC.WaitForPendingFinalizers();
        int failed = 0;
        var smoothRules = rules;
        double llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
        // Dev likelihood is measured on a smoothed clone so the training
        // grammar itself is left untouched.
        smoothRules = rules.Clone();
        smoothRules.Smoothing(smoothingAlpha);
        smoothRules.Normalize();
        double devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
        Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
        double devbest = 0;
        int bestround = 0;
        var bestRules = rules;
        for (int round = 0; round < afterSplitRound; ++round) {
            GrammarBuilder.CalculateNewScores(rules);
            rules.Normalize();
            rules.Prune(pruningThr);
            rules.Trim();
            rules.InitializeExpectedCounts();
            GC.Collect();
            GC.WaitForPendingFinalizers();
            //rules.Normalize();
            //rules.Prune(pruningThr);
            //rules.Normalize();
            //GrammarBuilder.CheckProbs(rules);
            llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
            smoothRules = rules.Clone();
            smoothRules.Smoothing(smoothingAlpha);
            smoothRules.Normalize();
            devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
            Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
            // Early stopping: more than 3 rounds past the best AND >= 50 rounds.
            if (round == 0 || devllhd > devbest) {
                devbest = devllhd;
                bestround = round;
                bestRules = rules.Clone();
            } else {
                if (round - bestround > 3 && round >= 50) {
                    break;
                }
            }
        }
        rules = bestRules.Clone();
        //rules.Normalize();
        rules.Prune(pruningThr);
        //rules.Normalize();
        Console.Error.WriteLine("save model");
        using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s" + smCycle.ToString() + ".grammar")), true)) {
            vocab.DumpToStream(sw);
            tagSet.DumpToStream(sw);
            rules.DumpToStream(sw, tagSet, vocab);
        }
        //EvaluateRawParser();
        //return;
        // ---- Merge phase: undo the least useful half of the splits, then EM ----
        rules = bestRules;
        Console.Error.WriteLine("merging symbols...");
        rules = GrammarBuilder.MergeSymbols(0.5f, vocab, tagSet, rules, treebank, nthread);
        llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
        GrammarBuilder.CalculateNewScores(rules, true);
        rules.Normalize();
        rules.Prune(pruningThr);
        rules.Trim();
        rules.InitializeExpectedCounts();
        GC.Collect();
        GC.WaitForPendingFinalizers();
        llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
        smoothRules = rules.Clone();
        smoothRules.Smoothing(smoothingAlpha);
        smoothRules.Normalize();
        devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
        Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
        for (int round = 0; round < afterMergeRound; ++round) {
            GrammarBuilder.CalculateNewScores(rules, false);
            rules.Normalize();
            rules.Prune(pruningThr);
            rules.Trim();
            rules.InitializeExpectedCounts();
            llhd =
                ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
            smoothRules = rules.Clone();
            smoothRules.Smoothing(smoothingAlpha);
            smoothRules.Normalize();
            devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
            Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
            // NOTE(review): devbest/bestround carry over from the split phase,
            // so the first merge round resets them only via round == 0.
            if (round == 0 || devllhd > devbest) {
                devbest = devllhd;
                bestround = round;
                bestRules = rules.Clone();
            } else {
                if (round - bestround > 3 && round >= 10) {
                    break;
                }
            }
        }
        rules = bestRules.Clone();
        //rules.Normalize();
        rules.Prune(pruningThr);
        //rules.Normalize();
        Console.Error.WriteLine("save model");
        using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s" + smCycle.ToString() + ".merged.grammar")), true)) {
            vocab.DumpToStream(sw);
            tagSet.DumpToStream(sw);
            rules.DumpToStream(sw, tagSet, vocab);
        }
        // ---- Smoothing phase: smooth the merged grammar, then EM ----
        Console.Error.WriteLine("smoothing model...");
        rules = bestRules.Clone();
        rules.InitializeExpectedCounts();
        rules.Smoothing(smoothingAlpha);
        llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
        smoothRules = rules.Clone();
        smoothRules.Smoothing(smoothingAlpha);
        smoothRules.Normalize();
        devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
        Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
        for (int round = 0; round < afterSmoothRound; ++round) {
            GrammarBuilder.CalculateNewScores(rules, false);
            rules.Normalize();
            rules.Smoothing(0.01, 0.1);
            rules.Prune(pruningThr);
            //rules.Normalize();
            rules.Trim();
            rules.InitializeExpectedCounts();
            llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);
            smoothRules = rules.Clone();
            devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);
            if (round == 0 || devllhd > devbest) {
                devbest = devllhd;
                bestround = round;
                bestRules =
                    rules.Clone();
            } else {
                if (round - bestround > 3 && round >= 10) {
                    break;
                }
            }
            Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
        }
        rules = bestRules.Clone();
        rules.Prune(pruningThr);
        Console.Error.WriteLine("save model");
        using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s" + smCycle.ToString() + ".smoothed.grammar")), true)) {
            vocab.DumpToStream(sw);
            tagSet.DumpToStream(sw);
            rules.DumpToStream(sw, tagSet, vocab);
        }
        rules = bestRules;
    }
    //EvaluateRawParser();
    return;
}
// Entry point: loads file names from the command line (or config.json when no
// args are given), parses each file into a Text and a Concordance, serializes
// both to JSON files, and prints several demo queries/transformations.
static void Main(string[] args)
{
    try
    {
        IText text = new Text();
        IConcordance concordance = new Concordance();
        var concordanceParser = new ConcordanceParser();
        var configuration = new GlobalConfiguration();
        if (args.Length != 0)
        {
            configuration.FileNames = args;
        }
        else
        {
            using var reader = new StreamReader("../../../config.json");
            var json = reader.ReadToEnd();
            configuration = JsonConvert.DeserializeObject<GlobalConfiguration>(json);
        }
        // FIX: previously each file was opened and fully read twice — once per
        // parser loop. Read each file once and feed the same content to both
        // parsers. NOTE(review): as in the original, each iteration overwrites
        // the previous result, so only the LAST file's text/concordance
        // survive — confirm this is intended.
        foreach (var fileName in configuration.FileNames)
        {
            using var stream = new StreamReader(new FileStream(fileName, FileMode.Open));
            var textReader = new TextModelReader(stream);
            var content = textReader.ReadAllText();
            var textParser = new TextParser();
            text = textParser.ParseText(content);
            concordance = concordanceParser.ParseText(content);
        }
        var jsonText = JsonTextSerializer.Serialize(text);
        var jsonConc = JsonTextSerializer.Serialize(concordance);
        // FIX: dispose the TextModelWriter (previously left undisposed) so any
        // buffered output is flushed before the underlying writer closes.
        using (var writer = new StreamWriter("../../../text.json"))
        using (var textModelWriter = new TextModelWriter(writer))
        {
            textModelWriter.Write(jsonText);
        }
        using (var writer = new StreamWriter("../../../concordance.json"))
        using (var textModelWriter = new TextModelWriter(writer))
        {
            textModelWriter.Write(jsonConc);
        }
        Console.WriteLine();
        Console.WriteLine("----Select words from question sentences with length 10------------------------");
        Console.WriteLine();
        foreach (var word in text.GetWordsFromQuestionSentences(10))
        {
            Console.WriteLine(word);
        }
        Console.WriteLine();
        Console.WriteLine("----Order sentences by words count-------------------------");
        Console.WriteLine();
        foreach (var sentence in text.OrderSentencesByWordsCount())
        {
            Console.Write(sentence);
            Console.Write(" --- ");
            Console.Write($"{sentence.WordsCount} words");
            Console.WriteLine();
        }
        Console.WriteLine();
        Console.WriteLine("-----Deleting words with length 10--------------");
        Console.WriteLine();
        text.DeleteWords(10);
        foreach (var sentence in text.Sentences)
        {
            Console.WriteLine(sentence);
        }
        Console.WriteLine();
        Console.WriteLine("-----Replacing words: \"In\" replace by \"In word replaced\"----------------");
        Console.WriteLine();
        text.ReplaceWord("In", "In word replaced");
        foreach (var sentence in text.Sentences)
        {
            Console.WriteLine(sentence);
        }
        Console.WriteLine("------------------------------------");
    }
    catch (Exception e)
    {
        // NOTE(review): only the message is printed — consider logging
        // e.ToString() to keep the stack trace.
        Console.WriteLine(e.Message);
    }
}
// Serializes this vocabulary as a named section: a type-name sentinel line,
// header options (SIG, VER, and the table sizes), a nested region with the
// known-word and signature tables, then the sentinel line again.
public void DumpToStream(TextModelWriter sw)
{
    var sentinel = typeof(Vocabulary).FullName;
    sw.Write(sentinel);
    sw.WriteOption("SIG", SIG);
    sw.WriteOption("VER", VER);
    sw.WriteOption("knownWordCount", vocab.Count);
    //sw.WriteOption("rareWordCount", rareVocab.Count);
    sw.WriteOption("sigCount", signitureVocab.Count);
    sw.NestLevel += 1;
    vocab.DumpToStream(sw);
    //rareVocab.DumpToStream(sw);
    signitureVocab.DumpToStream(sw);
    sw.NestLevel -= 1;
    sw.Write(sentinel);
}
// Serializes this latent-annotation grammar in a human-readable text format:
// header options, then three nested sections (TerminalRule, UnaryRule,
// BinaryRule) with one tab-separated line per subtag-indexed rule score, then
// the subtag split/merge trace tables. Scores that are infinite or NaN
// (i.e. pruned entries) are skipped. Tag/word ids are rendered as strings via
// the supplied tagSet and vocab; subtag indices are appended as "_<i>".
public void DumpToStream(TextModelWriter sw, TagSet tagSet, Vocabulary vocab)
{
    var name = typeof(LAPCFGrammar).FullName;
    sw.Write(name);
    sw.WriteOption("VER", VER);
    sw.WriteOption("NTCount", NTCount);
    sw.WriteOption("PTCount", PTCount);
    sw.WriteOption("ROOTID", ROOTID);
    // Terminal rules: "<tag>_<p>\t<word>\t<score>" per surviving subtag score.
    sw.Write("TerminalRule");
    sw.NestLevel += 1;
    foreach (var x in trules) {
        if (x != null) {
            foreach (var y in x) {
                if (y != null) {
                    var word = vocab.GetWordString(y.word);
                    var tag = tagSet.GetTagString(y.tag);
                    for (int p = 0; p < y.scores.Length; ++p) {
                        if (!double.IsInfinity(y.scores [p]) && !double.IsNaN(y.scores [p])) {
                            sw.Write(string.Format("{0}_{1}\t{2}\t{3}", tag, p, word, y.scores [p]));
                        }
                    }
                }
            }
        }
    }
    sw.NestLevel -= 1;
    // Unary rules: "<ptag>_<p>\t<ctag>_<c>\t<score>"; scores indexed [child][parent].
    sw.Write("UnaryRule");
    sw.NestLevel += 1;
    foreach (var x in urules) {
        if (x != null) {
            foreach (var y in x) {
                if (y != null) {
                    var ptag = tagSet.GetTagString(y.ptag);
                    var ctag = tagSet.GetTagString(y.ctag);
                    for (int c = 0; c < y.scores.Length; ++c) {
                        for (int p = 0; p < y.scores[c].Length; ++p) {
                            if (!double.IsInfinity(y.scores [c] [p]) && !double.IsNaN(y.scores [c] [p])) {
                                sw.Write(string.Format("{0}_{1}\t{2}_{3}\t{4}", ptag, p, ctag, c, y.scores [c] [p]));
                            }
                        }
                    }
                }
            }
        }
    }
    sw.NestLevel -= 1;
    // Binary rules: "<ptag>_<p>\t<ltag>_<l>\t<rtag>_<r>\t<score>";
    // scores indexed [left][right][parent].
    sw.Write("BinaryRule");
    sw.NestLevel += 1;
    foreach (var x in brules) {
        if (x != null) {
            foreach (var y in x) {
                if (y != null) {
                    foreach (var z in y) {
                        if (z != null) {
                            var ptag = tagSet.GetTagString(z.ptag);
                            var ltag = tagSet.GetTagString(z.ltag);
                            var rtag = tagSet.GetTagString(z.rtag);
                            for (int l = 0; l < z.scores.Length; ++l) {
                                for (int r = 0; r < z.scores[l].Length; ++r) {
                                    for (int p = 0; p < z.scores[l][r].Length; ++p) {
                                        if (!double.IsInfinity(z.scores [l] [r] [p]) && !double.IsNaN(z.scores [l] [r] [p])) {
                                            sw.Write(
                                                string.Format("{0}_{1}\t{2}_{3}\t{4}_{5}\t{6}", ptag, p, ltag, l, rtag, r, z.scores [l] [r] [p])
                                            );
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    sw.NestLevel -= 1;
    // Subtag traces: one TRACE option per table, followed by its rows as
    // space-joined integer lines in a nested region.
    sw.WriteOption("TraceCount", subtagTraces.Count);
    foreach (var trace in subtagTraces) {
        sw.WriteOption("TRACE", trace.Length);
        sw.NestLevel += 1;
        foreach (var t in trace) {
            sw.Write(string.Join(" ", t));
        }
        sw.NestLevel -= 1;
    }
    sw.Write(name);
}