public static TagSet LoadFromStream(TextModelReader sr)
{
    string xname = sr.Read();
    if (xname != typeof(TagSet).FullName)
    {
        throw new Exception("model name does not match");
    }
    int startlvl = sr.NestLevel;
    var xver = sr.ReadOptionUInt64("VER");
    if (xver != VER)
    {
        throw new Exception("version number does not match");
    }
    var ts = new TagSet();
    ts.ROOT = sr.ReadOptionString("ROOT");
    ts.PTs = CodeBook32.LoadFromStream(sr);
    ts.NTs = CodeBook32.LoadFromStream(sr);
    // The serialized block must close with the type name at the same nesting level.
    xname = sr.Read();
    if (xname != typeof(TagSet).FullName || sr.NestLevel != startlvl)
    {
        throw new Exception("model name does not match");
    }
    return ts;
}
public PhrasalTree ExtractViterbi(string[] words, TagSet tagSet)
{
    // The top-right chart cell spans the whole sentence.
    var rootCell = chart[0, wids.Length - 1];
    if (rootCell.l2v == null)
    {
        return null;
    }
    // Find the vertex labeled with the root tag.
    var rootid = tagSet.ROOTID;
    HyperVertex v = null;
    for (int i = 0; i < rootCell.l2v.Length; ++i)
    {
        if (rootCell.l2v[i].tag == rootid)
        {
            v = rootCell.l2v[i];
            break;
        }
    }
    if (v == null || v.TYPE == VTYPE.DEAD)
    {
        throw new Exception("node is pruned!");
    }
    PhrasalNode rootNode = ExtractViterbiParse(v, 0, tagSet);
    PhrasalTree tree = new PhrasalTree(rootNode);
    tree.ComputeStartEnd();
    AnnotateLex(words, tree.Root);
    return tree;
}
private static double[][] CollectMergeLoss(int nthread, Vocabulary vocab, TagSet tagset,
    LAPCFGrammar rules, List<PhrasalTree> treebank, double[][] tagProb)
{
    // One loss table per worker thread; reduced at the end.
    double[][][] mlossList = new double[nthread][][];
    for (int tid = 0; tid < nthread; ++tid)
    {
        double[][] mergeLoss = new double[rules.TotalTagCount][];
        for (int i = 0; i < mergeLoss.Length; ++i)
        {
            if (tagProb[i].Length == 1)
            {
                continue; // unsplit tag: nothing to merge
            }
            mergeLoss[i] = new double[tagProb[i].Length / 2];
        }
        ArrayHelper.Fill(mergeLoss, 0);
        mlossList[tid] = mergeLoss;
    }

    var parser = new HyperGraphParser(vocab, tagset, rules);
    Parallel.For(0, nthread, threadid =>
    {
        var mergeLoss = mlossList[threadid];
        for (int treeId = threadid; treeId < treebank.Count; treeId += nthread)
        {
            var tree = treebank[treeId];
            var g = parser.BuildHyperGraph(tree);
            g.SumForward();
            g.SumBackward();
            double sentS = g.RootScore;
            if (double.IsNaN(sentS) || double.IsInfinity(sentS))
            {
                continue;
            }
            foreach (var v in g.Vs)
            {
                if (v.TYPE == VTYPE.TERMINAL)
                {
                    continue;
                }
                int t = v.tag;
                if (v.subtagCount == 1)
                {
                    continue;
                }
                // Log-domain marginals for every subtag of this vertex. Pruned
                // subtags must be log-zero; the original left them at the array
                // default 0.0, which is log-domain probability one.
                double[] marginals = new double[v.subtagCount];
                for (int st = 0; st < v.subtagCount; ++st)
                {
                    marginals[st] = v.alpha.pruned[st]
                        ? double.NegativeInfinity
                        : v.alpha.v[st] + v.beta.v[st];
                }
                for (int st = 0; st < v.subtagCount / 2; ++st)
                {
                    int l = st * 2;
                    int r = st * 2 + 1;
                    if (double.IsNaN(v.alpha.v[l]) || double.IsInfinity(v.alpha.v[l])
                        || double.IsNaN(v.beta.v[l]) || double.IsInfinity(v.beta.v[l])
                        || double.IsNaN(v.alpha.v[r]) || double.IsInfinity(v.alpha.v[r])
                        || double.IsNaN(v.beta.v[r]) || double.IsInfinity(v.beta.v[r])
                        || v.alpha.pruned[l] || v.alpha.pruned[r])
                    {
                        continue;
                    }
                    double lllhd = marginals[l];
                    double rllhd = marginals[r];
                    // Sentence score if the sibling pair (l, r) were merged: the
                    // inside scores are mixed by the subtag priors, the outside
                    // scores are summed.
                    double mllhd = MathHelper.LogAdd(tagProb[t][l] + v.alpha.v[l], tagProb[t][r] + v.alpha.v[r])
                                 + MathHelper.LogAdd(v.beta.v[l], v.beta.v[r]);
                    marginals[l] = mllhd;
                    marginals[r] = double.NegativeInfinity;
                    double xSentScore = MathHelper.LogAdd(marginals);
                    double sentScore = g.RootScore;
                    // Likelihood lost on this sentence if the pair were merged.
                    mergeLoss[t][st] += sentScore - xSentScore;
                    marginals[l] = lllhd;
                    marginals[r] = rllhd;
                }
            }
        }
    });

    // Reduce the per-thread tables into the first one.
    var ml = mlossList[0];
    for (int threadid = 1; threadid < mlossList.Length; ++threadid)
    {
        var xl = mlossList[threadid];
        for (int i = 0; i < ml.Length; ++i)
        {
            if (ml[i] == null)
            {
                continue;
            }
            for (int j = 0; j < ml[i].Length; ++j)
            {
                ml[i][j] += xl[i][j];
            }
        }
    }
    return ml;
}
public static LAPCFGrammar MergeSymbols(double percentage, Vocabulary vocab, TagSet tagset,
    LAPCFGrammar rules, List<PhrasalTree> treebank, int nthread)
{
    rules.InitializeExpectedCounts();

    // Posterior mass of each subtag, normalized within each sibling pair so
    // the two halves of a split sum to one.
    double[][] tagProb = SubtagExpectedCounts(nthread, vocab, tagset, rules, treebank);
    bool[] isSplit = new bool[tagProb.Length];
    for (int i = 0; i < tagProb.Length; ++i)
    {
        if (tagProb[i].Length == 1)
        {
            tagProb[i][0] = 0;
            isSplit[i] = false;
        }
        else
        {
            isSplit[i] = true;
            for (int j = 0; j < tagProb[i].Length / 2; ++j)
            {
                double z = MathHelper.LogAdd(tagProb[i][2 * j], tagProb[i][2 * j + 1]);
                tagProb[i][2 * j] -= z;
                tagProb[i][2 * j + 1] -= z;
            }
        }
    }

    // Estimate how much likelihood each merge would cost, then merge the
    // cheapest candidates first. (Note: the percentage argument is not used
    // in this method body; the merge fraction is presumably applied inside
    // CreateMergeMapping.)
    double[][] mergeLoss = CollectMergeLoss(nthread, vocab, tagset, rules, treebank, tagProb);
    var mergeCands = new List<MergeHelper>();
    for (int t = 0; t < mergeLoss.Length; ++t)
    {
        if (mergeLoss[t] == null)
        {
            continue;
        }
        for (int st = 0; st < mergeLoss[t].Length; ++st)
        {
            mergeCands.Add(new MergeHelper(t, st, mergeLoss[t][st]));
        }
    }
    mergeCands.Sort((a, b) => a.loss.CompareTo(b.loss));

    int[][] subtagMap;
    bool[][] isMerged;
    int[] newSubTagCounts;
    CreateMergeMapping(rules, mergeCands, out subtagMap, out isMerged, out newSubTagCounts);
    var newRules = MergeRuleTable(rules, tagProb, subtagMap, isMerged, newSubTagCounts);
    newRules.InitializeExpectedCounts();
    return newRules;
}
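// Usage sketch (not part of the original source): one merge step of a
// split-merge cycle, using only routines defined in this file. Assumes
// vocab, tagset, rules, and treebank were built elsewhere; the thread count
// is illustrative, and the 0.5 merge fraction mirrors the usual split-merge
// recipe rather than anything this code prescribes.
public static LAPCFGrammar MergeStepExample(Vocabulary vocab, TagSet tagset,
    LAPCFGrammar rules, List<PhrasalTree> treebank)
{
    int nthread = Environment.ProcessorCount;
    return MergeSymbols(0.5, vocab, tagset, rules, treebank, nthread);
}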
public static double[][] SubtagExpectedCounts(
    int nthread,
    Vocabulary vocab,
    TagSet tagset,
    LAPCFGrammar rules,
    List<PhrasalTree> treebank)
{
    var parser = new HyperGraphParser(vocab, tagset, rules);

    // One log-domain accumulator per thread.
    double[][][] tagExpectsArray = new double[nthread][][];
    for (int tid = 0; tid < nthread; ++tid)
    {
        tagExpectsArray[tid] = new double[rules.TotalTagCount][];
        var tagExpects = tagExpectsArray[tid];
        for (int i = 0; i < tagExpects.Length; ++i)
        {
            tagExpects[i] = new double[rules.GetSubTagCount(i)];
        }
        ArrayHelper.Fill(tagExpects, double.NegativeInfinity);
    }

    Parallel.For(0, nthread, threadid =>
    {
        var tagExpects = tagExpectsArray[threadid];
        for (int treeId = threadid; treeId < treebank.Count; treeId += nthread)
        {
            var tree = treebank[treeId];
            var g = parser.BuildHyperGraph(tree);
            g.SumForward();
            g.SumBackward();
            double sentS = g.RootScore;
            if (double.IsNaN(sentS) || double.IsInfinity(sentS))
            {
                continue;
            }
            foreach (var v in g.Vs)
            {
                if (v.TYPE == VTYPE.TERMINAL)
                {
                    continue;
                }
                int t = v.tag;
                for (int st = 0; st < v.subtagCount; ++st)
                {
                    if (double.IsNaN(v.alpha.v[st]) || double.IsInfinity(v.alpha.v[st])
                        || double.IsNaN(v.beta.v[st]) || double.IsInfinity(v.beta.v[st])
                        || v.alpha.pruned[st] || v.beta.pruned[st])
                    {
                        continue;
                    }
                    // Posterior of subtag st at this vertex: inside + outside - sentence.
                    tagExpects[t][st] = MathHelper.LogAdd(
                        v.alpha.v[st] + v.beta.v[st] - sentS, tagExpects[t][st]);
                }
            }
        }
    });

    // Log-sum the per-thread accumulators into the first one.
    var te = tagExpectsArray[0];
    for (int i = 1; i < nthread; ++i)
    {
        for (int j = 0; j < te.Length; ++j)
        {
            for (int k = 0; k < te[j].Length; ++k)
            {
                te[j][k] = MathHelper.LogAdd(te[j][k], tagExpectsArray[i][j][k]);
            }
        }
    }
    return te;
}
private static double ParseGraphs(int nthread, List<PhrasalTree> treebank, LAPCFGrammar rules,
    Vocabulary vocab, TagSet tagSet, out int failed)
{
    // Sum treebank log-likelihood in parallel; trees that fail to parse are
    // counted rather than scored.
    double llhd = 0;
    failed = 0;
    int xfail = 0;
    var handle = new object();
    Parallel.For(0, nthread, threadid =>
    {
        int fail = 0;
        double xllhd = 0;
        var parser = new HyperGraphParser(vocab, tagSet, rules);
        for (int i = threadid; i < treebank.Count; i += nthread)
        {
            try
            {
                var graph = parser.BuildHyperGraph(treebank[i]);
                graph.SumForward();
                graph.SumBackward();
                if (double.IsInfinity(graph.RootScore) || double.IsNaN(graph.RootScore))
                {
                    fail += 1;
                    continue;
                }
                xllhd += graph.RootScore;
            }
            catch
            {
                fail += 1;
            }
        }
        lock (handle)
        {
            xfail += fail;
            llhd += xllhd;
        }
    });
    failed = xfail;
    return llhd;
}
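// Usage sketch (not part of the original source): report treebank
// log-likelihood, e.g. before and after a merge, using ParseGraphs above.
// Assumes the surrounding objects are already loaded.
public static void ReportLikelihoodExample(List<PhrasalTree> treebank, LAPCFGrammar rules,
    Vocabulary vocab, TagSet tagSet)
{
    int failed;
    double llhd = ParseGraphs(Environment.ProcessorCount, treebank, rules, vocab, tagSet, out failed);
    Console.Error.WriteLine("llhd: {0}\tfailed: {1}", llhd, failed);
}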
private void BuildTerminalRule(string ruleString, Vocabulary vocab, TagSet tagSet)
{
    // Line format (see DumpToStream): TAG_subtag <TAB> word <TAB> logscore
    string[] parts = ruleString.Split(new string[] { "\t", "_" }, StringSplitOptions.RemoveEmptyEntries);
    int tag = tagSet.GetID(parts[0]);
    int subtag = int.Parse(parts[1]);
    int word = vocab.GetId(parts[2]);
    double s = double.Parse(parts[3]);
    if (trules == null)
    {
        trules = new TerminalRule[vocab.VocabSize][];
    }
    if (trules[word] == null)
    {
        trules[word] = new TerminalRule[tagSet.PTCount];
    }
    if (trules[word][tag] == null)
    {
        trules[word][tag] = new TerminalRule(new double[subTagCounts[tag]], tag, word);
        trules[word][tag].ClearScore();
    }
    trules[word][tag].scores[subtag] = s;
}
public void Build(TagSet tags, double smoothCount)
{
    foreach (var kv in wordTagCount)
    {
        var word = kv.Key;
        double[] tp = new double[tags.PTCount];
        for (int tid = 0; tid < tp.Length; ++tid)
        {
            var tstr = tags.GetTagString(tid);
            int wtc = kv.Value[tstr];      // count of (word, tag)
            int tc = tagCount[tstr];       // count of tag
            int ttc = tagTypeCount[tstr];  // number of word types seen with tag
            if (tc <= 0)
            {
                tp[tid] = double.NegativeInfinity;
            }
            else
            {
                // Add-k smoothed log P(word | tag); the extra (ttc + 1) * k mass
                // reserves probability for unseen words under this tag.
                tp[tid] = Math.Log((wtc + smoothCount) / (tc + (ttc + 1.0) * smoothCount));
            }
        }
        probs.Add(word, tp);
    }
}
public static double[][] BuildLex(List<PhrasalTree> treebank, TagSet tagSet, Vocabulary vocab)
{
    int PTCount = tagSet.PTCount;
    int vocabCount = vocab.VocabSize;
    double[][] tagWordCounts = ArrayHelper.AllocateArray<double>(PTCount, vocabCount);
    double[][] wordTagCounts = ArrayHelper.AllocateArray<double>(vocabCount, PTCount);
    double[] tagCounts = new double[PTCount];
    double[] wordCounts = new double[vocabCount];
    HashSet<string>[] tagTypeSets = new HashSet<string>[PTCount];
    for (int i = 0; i < tagTypeSets.Length; ++i)
    {
        tagTypeSets[i] = new HashSet<string>();
    }

    // Collect (tag, word) counts from the leaves of the treebank.
    foreach (var tree in treebank)
    {
        tree.ComputeStartEnd();
        foreach (var node in tree.TreeNodes)
        {
            if (node.Children.Count == 0)
            {
                string word = SimpleTokenizor.ETokenize(node.Lex);
                string tag = node.Tag;
                int tagId = tagSet.GetPTID(tag);
                tagTypeSets[tagId].Add(word);
                int wordId = vocab.GetId(word, node.Start == 0);
                tagWordCounts[tagId][wordId] += 1.0f;
                wordTagCounts[wordId][tagId] += 1.0f;
                tagCounts[tagId] += 1.0f;
                wordCounts[wordId] += 1.0f;
            }
        }
    }

    // Distinct word types per tag; only referenced by the disabled
    // rare-word smoothing below.
    double[] typeTagCount = new double[PTCount];
    for (int i = 0; i < typeTagCount.Length; ++i)
    {
        typeTagCount[i] = tagTypeSets[i].Count;
    }

    // Normalize wordTagCounts into P(tag | word).
    for (int wordId = 0; wordId < wordTagCounts.Length; ++wordId)
    {
        var wt = wordTagCounts[wordId];
        double wc = wordCounts[wordId];
        // Disabled rare-word smoothing:
        //bool isRare = vocab.IsRareOrUNK(wordId);
        //if (isRare) {
        //    for (int tid = 0; tid < wt.Length; ++tid) {
        //        if (wt[tid] > 0 || typeTagCount[tid] >= openTagClassThr) {
        //            wt[tid] += addXSmoothing;
        //            wc += addXSmoothing;
        //        }
        //    }
        //}
        for (int i = 0; i < wt.Length; ++i)
        {
            wt[i] /= wc;
        }
    }

    // Normalize to P(word) and P(tag).
    double totalwc = MathHelper.Sum(wordCounts);
    for (int i = 0; i < wordCounts.Length; ++i)
    {
        wordCounts[i] /= totalwc;
    }
    double totaltc = MathHelper.Sum(tagCounts);
    for (int i = 0; i < tagCounts.Length; ++i)
    {
        tagCounts[i] /= totaltc;
    }

    // Bayes inversion: P(word | tag) = P(tag | word) * P(word) / P(tag).
    for (int tagId = 0; tagId < tagCounts.Length; ++tagId)
    {
        for (int wordId = 0; wordId < wordCounts.Length; ++wordId)
        {
            tagWordCounts[tagId][wordId] =
                wordTagCounts[wordId][tagId] * wordCounts[wordId] / tagCounts[tagId];
        }
    }

    double[][] scores = ArrayHelper.AllocateArray<double>(vocabCount, PTCount);
    ArrayHelper.Fill(scores, double.NegativeInfinity);
    for (int word = 0; word < scores.Length; ++word)
    {
        for (int tag = 0; tag < scores[word].Length; ++tag)
        {
            if (tagWordCounts[tag][word] > 0)
            {
                scores[word][tag] = Math.Log(tagWordCounts[tag][word]);
            }
        }
    }
    return scores;
}
public void DumpToStream(TextModelWriter sw, TagSet tagSet, Vocabulary vocab)
{
    var name = typeof(LAPCFGrammar).FullName;
    sw.Write(name);
    sw.WriteOption("VER", VER);
    sw.WriteOption("NTCount", NTCount);
    sw.WriteOption("PTCount", PTCount);
    sw.WriteOption("ROOTID", ROOTID);

    // Terminal rules: TAG_subtag <TAB> word <TAB> logscore
    sw.Write("TerminalRule");
    sw.NestLevel += 1;
    foreach (var x in trules)
    {
        if (x == null)
        {
            continue;
        }
        foreach (var y in x)
        {
            if (y == null)
            {
                continue;
            }
            var word = vocab.GetWordString(y.word);
            var tag = tagSet.GetTagString(y.tag);
            for (int p = 0; p < y.scores.Length; ++p)
            {
                if (!double.IsInfinity(y.scores[p]) && !double.IsNaN(y.scores[p]))
                {
                    sw.Write(string.Format("{0}_{1}\t{2}\t{3}", tag, p, word, y.scores[p]));
                }
            }
        }
    }
    sw.NestLevel -= 1;

    // Unary rules: PTAG_subp <TAB> CTAG_subc <TAB> logscore
    sw.Write("UnaryRule");
    sw.NestLevel += 1;
    foreach (var x in urules)
    {
        if (x == null)
        {
            continue;
        }
        foreach (var y in x)
        {
            if (y == null)
            {
                continue;
            }
            var ptag = tagSet.GetTagString(y.ptag);
            var ctag = tagSet.GetTagString(y.ctag);
            for (int c = 0; c < y.scores.Length; ++c)
            {
                for (int p = 0; p < y.scores[c].Length; ++p)
                {
                    if (!double.IsInfinity(y.scores[c][p]) && !double.IsNaN(y.scores[c][p]))
                    {
                        sw.Write(string.Format("{0}_{1}\t{2}_{3}\t{4}", ptag, p, ctag, c, y.scores[c][p]));
                    }
                }
            }
        }
    }
    sw.NestLevel -= 1;

    // Binary rules: PTAG_subp <TAB> LTAG_subl <TAB> RTAG_subr <TAB> logscore
    sw.Write("BinaryRule");
    sw.NestLevel += 1;
    foreach (var x in brules)
    {
        if (x == null)
        {
            continue;
        }
        foreach (var y in x)
        {
            if (y == null)
            {
                continue;
            }
            foreach (var z in y)
            {
                if (z == null)
                {
                    continue;
                }
                var ptag = tagSet.GetTagString(z.ptag);
                var ltag = tagSet.GetTagString(z.ltag);
                var rtag = tagSet.GetTagString(z.rtag);
                for (int l = 0; l < z.scores.Length; ++l)
                {
                    for (int r = 0; r < z.scores[l].Length; ++r)
                    {
                        for (int p = 0; p < z.scores[l][r].Length; ++p)
                        {
                            if (!double.IsInfinity(z.scores[l][r][p]) && !double.IsNaN(z.scores[l][r][p]))
                            {
                                sw.Write(string.Format("{0}_{1}\t{2}_{3}\t{4}_{5}\t{6}",
                                    ptag, p, ltag, l, rtag, r, z.scores[l][r][p]));
                            }
                        }
                    }
                }
            }
        }
    }
    sw.NestLevel -= 1;

    // Split history, so subtag counts can be reconstructed on load.
    sw.WriteOption("TraceCount", subtagTraces.Count);
    foreach (var trace in subtagTraces)
    {
        sw.WriteOption("TRACE", trace.Length);
        sw.NestLevel += 1;
        foreach (var t in trace)
        {
            sw.Write(string.Join(" ", t));
        }
        sw.NestLevel -= 1;
    }
    sw.Write(name);
}
public HyperGraphParser(Vocabulary vocab, TagSet tagset, LAPCFGrammar rules)
{
    this.vocab = vocab;
    this.tagset = tagset;
    this.rules = rules;
}
public static LAPCFGrammar LoadFromStream(TextModelReader sr, Vocabulary vocab, TagSet tagSet)
{
    var grammar = new LAPCFGrammar();
    var name = typeof(LAPCFGrammar).FullName;
    sr.Require(name);
    sr.Require("VER", VER);
    grammar.NTCount = sr.ReadOptionInt("NTCount");
    grammar.PTCount = sr.ReadOptionInt("PTCount");
    grammar.ROOTID = sr.ReadOptionInt("ROOTID");

    // Slurp the three rule sections as raw lines first; the rules are only
    // materialized after the subtag traces below, because rule array sizes
    // depend on subTagCounts.
    sr.Require("TerminalRule");
    int lvl = sr.NestLevel;
    var truleStrings = new HashSet<string>();
    var uruleStrings = new HashSet<string>();
    var bruleStrings = new HashSet<string>();
    string line = sr.Read();
    while (sr.NestLevel > lvl)
    {
        truleStrings.Add(line);
        line = sr.Read();
    }
    if (line != "UnaryRule")
    {
        throw new Exception("wrong model!");
    }
    line = sr.Read();
    while (sr.NestLevel > lvl)
    {
        uruleStrings.Add(line);
        line = sr.Read();
    }
    if (line != "BinaryRule")
    {
        throw new Exception("wrong model!");
    }
    line = sr.Read();
    while (sr.NestLevel > lvl)
    {
        bruleStrings.Add(line);
        line = sr.Read();
    }

    string[] parts = line.Split('\t');
    if (parts[0] != "TraceCount")
    {
        throw new Exception("error in model");
    }
    int subtraceCount = int.Parse(parts[1]);
    grammar.subtagTraces = new List<int[][]>();
    for (int i = 0; i < subtraceCount; ++i)
    {
        int tlen = sr.ReadOptionInt("TRACE");
        int[][] trace = new int[tlen][];
        for (int j = 0; j < tlen; ++j)
        {
            trace[j] = sr.ReadIntArray();
        }
        grammar.subtagTraces.Add(trace);
    }
    if (grammar.subtagTraces.Count == 0)
    {
        // No split history: every tag has a single subtag.
        grammar.subTagCounts = new int[grammar.TotalTagCount];
        ArrayHelper.Fill(grammar.subTagCounts, 1);
    }
    else
    {
        var trace = grammar.subtagTraces[grammar.subtagTraces.Count - 1];
        grammar.subTagCounts = trace.Select(x => x.Length).ToArray();
    }
    sr.Require(name);

    foreach (var str in uruleStrings)
    {
        grammar.BuildUnaryRule(str, tagSet);
    }
    foreach (var str in truleStrings)
    {
        grammar.BuildTerminalRule(str, vocab, tagSet);
    }
    foreach (var str in bruleStrings)
    {
        grammar.BuildBinaryRule(str, tagSet);
    }
    return grammar;
}
public LAPCFGrammar(TagSet set, double[][][] brawScores, double[][] urawScores, double[][] trawScores)
{
    NTCount = set.NTCount;
    PTCount = set.PTCount;
    ROOTID = set.ROOTID;
    var tagCount = TotalTagCount;

    // Wrap raw log scores into rule objects; every tag starts with one subtag.
    brules = new BinaryRule[tagCount][][];
    for (int l = 0; l < tagCount; ++l)
    {
        if (brawScores[l] == null)
        {
            continue;
        }
        brules[l] = new BinaryRule[tagCount][];
        for (int r = 0; r < tagCount; ++r)
        {
            if (brawScores[l][r] == null)
            {
                continue;
            }
            brules[l][r] = new BinaryRule[tagCount];
            for (int p = 0; p < tagCount; ++p)
            {
                if (!double.IsInfinity(brawScores[l][r][p]) && !double.IsNaN(brawScores[l][r][p]))
                {
                    double[][][] s = new double[1][][];
                    s[0] = new double[1][];
                    s[0][0] = new double[1];
                    s[0][0][0] = brawScores[l][r][p];
                    brules[l][r][p] = new BinaryRule(s, p, l, r);
                }
            }
        }
    }

    urules = new UnaryRule[tagCount][];
    for (int c = 0; c < tagCount; ++c)
    {
        if (urawScores[c] == null)
        {
            continue;
        }
        urules[c] = new UnaryRule[tagCount];
        for (int p = 0; p < tagCount; ++p)
        {
            if (!double.IsNaN(urawScores[c][p]) && !double.IsInfinity(urawScores[c][p]))
            {
                double[][] s = new double[1][];
                s[0] = new double[1];
                s[0][0] = urawScores[c][p];
                urules[c][p] = new UnaryRule(s, p, c);
            }
        }
    }

    trules = new TerminalRule[trawScores.Length][];
    for (int w = 0; w < trawScores.Length; ++w)
    {
        if (trawScores[w] == null)
        {
            continue;
        }
        trules[w] = new TerminalRule[trawScores[w].Length];
        for (int t = 0; t < trules[w].Length; ++t)
        {
            if (!double.IsNaN(trawScores[w][t]) && !double.IsInfinity(trawScores[w][t]))
            {
                double[] s = new double[1];
                s[0] = trawScores[w][t];
                trules[w][t] = new TerminalRule(s, t, w);
            }
        }
    }

    subTagCounts = new int[tagCount];
    ArrayHelper.Fill(subTagCounts, 1);
}
private void BuildUnaryRule(string ruleString, TagSet tagSet)
{
    // Line format (see DumpToStream): PTAG_subp <TAB> CTAG_subc <TAB> logscore
    string[] parts = ruleString.Split(new string[] { "\t", "_" }, StringSplitOptions.RemoveEmptyEntries);
    int ptag = tagSet.GetID(parts[0]);
    int subptag = int.Parse(parts[1]);
    int ctag = tagSet.GetID(parts[2]);
    int subctag = int.Parse(parts[3]);
    double s = double.Parse(parts[4]);
    if (urules == null)
    {
        urules = new UnaryRule[TotalTagCount][];
    }
    if (urules[ctag] == null)
    {
        urules[ctag] = new UnaryRule[TotalTagCount];
    }
    if (urules[ctag][ptag] == null)
    {
        // Unary rules are indexed child-first: urules[child][parent].
        urules[ctag][ptag] = new UnaryRule(
            ArrayHelper.AllocateArray<double>(subTagCounts[ctag], subTagCounts[ptag]), ptag, ctag);
        urules[ctag][ptag].ClearScore();
    }
    urules[ctag][ptag].scores[subctag][subptag] = s;
}
private PhrasalNode ExtractViterbiParse(HyperVertex v, int subtag, TagSet tagSet)
{
    if (v == null || v.TYPE == VTYPE.TERMINAL)
    {
        return null;
    }
    PhrasalNode node = new PhrasalNode();
    node.Tag = tagSet.GetTagString(v.tag);
    var bestEdge = v.traces[subtag].edge;
    if (bestEdge == null)
    {
        return node;
    }
    switch (bestEdge.TYPE)
    {
        case ETYPE.BINARY:
            var l = ExtractViterbiParse(bestEdge.from0, v.traces[subtag].subtag0, tagSet);
            var r = ExtractViterbiParse(bestEdge.from1, v.traces[subtag].subtag1, tagSet);
            node.Children.Add(l);
            node.Children.Add(r);
            l.Parent = node;
            r.Parent = node;
            break;
        case ETYPE.UNARY:
            var c = ExtractViterbiParse(bestEdge.from0, v.traces[subtag].subtag0, tagSet);
            node.Children.Add(c);
            c.Parent = node;
            break;
        case ETYPE.TERMINAL:
            break;
        case ETYPE.DUMMY:
            // Dummy edges splice their single child in directly.
            node = ExtractViterbiParse(bestEdge.from0, v.traces[subtag].subtag0, tagSet);
            break;
        default:
            throw new Exception("unknown edge type!");
    }
    return node;
}
public static void Build(
    Vocabulary vocab,
    TagSet tagset,
    List<PhrasalTree> treebank,
    out LAPCFGrammar rules,
    Random RNG = null)
{
    int tagCount = tagset.NTCount + tagset.PTCount;
    double[] pmass = new double[tagCount];
    double[][] unaries = ArrayHelper.AllocateArray<double>(tagCount, tagCount);
    double[][][] binaries = ArrayHelper.AllocateArray<double>(tagCount, tagCount, tagCount);

    // Count unary and binary productions in the (binarized) treebank.
    foreach (var tree in treebank)
    {
        foreach (var node in tree.TreeNodes)
        {
            if (node.Children.Count == 0)
            {
                // terminals
                continue;
            }
            else if (node.Children.Count == 1)
            {
                int pt = tagset.GetID(node.Tag);
                int ct = tagset.GetID(node.Children[0].Tag);
                pmass[pt] += 1.0f;
                unaries[ct][pt] += 1.0f;
            }
            else if (node.Children.Count == 2)
            {
                int pt = tagset.GetID(node.Tag);
                int lt = tagset.GetID(node.Children[0].Tag);
                int rt = tagset.GetID(node.Children[1].Tag);
                pmass[pt] += 1.0f;
                binaries[lt][rt][pt] += 1.0f;
            }
            else
            {
                throw new Exception("tree node with more than 2 children!");
            }
        }
    }

    // Drop rows that never fired.
    for (int c = 0; c < unaries.Length; ++c)
    {
        bool csurvive = false;
        for (int p = 0; p < unaries[c].Length; ++p)
        {
            if (unaries[c][p] > 0)
            {
                csurvive = true;
                break;
            }
        }
        if (!csurvive)
        {
            unaries[c] = null;
        }
    }
    for (int l = 0; l < binaries.Length; ++l)
    {
        bool lsurvive = false;
        for (int r = 0; r < binaries[l].Length; ++r)
        {
            bool rsurvive = false;
            for (int p = 0; p < binaries[l][r].Length; ++p)
            {
                if (binaries[l][r][p] > 0)
                {
                    rsurvive = true;
                    break;
                }
            }
            if (rsurvive)
            {
                lsurvive = true;
            }
            else
            {
                binaries[l][r] = null;
            }
        }
        if (!lsurvive)
        {
            binaries[l] = null;
        }
    }

    // Symmetry-breaking noise. RNG defaults to null, so guard it the same way
    // BuildLexSimple does; the original called RNG.NextDouble() unconditionally,
    // which throws when no generator is supplied.
    foreach (var x in unaries.Where(x => x != null))
    {
        for (int p = 0; p < x.Length; ++p)
        {
            double noise = RNG == null ? 0.0 : RNG.NextDouble();
            x[p] += noise;
            pmass[p] += noise;
        }
    }
    foreach (var x in binaries.Where(x => x != null))
    {
        foreach (var y in x.Where(y => y != null))
        {
            for (int p = 0; p < y.Length; ++p)
            {
                double noise = RNG == null ? 0.0 : RNG.NextDouble();
                y[p] += noise;
                pmass[p] += noise;
            }
        }
    }

    // Normalize by parent mass, then move to log space.
    for (int c = 0; c < tagCount; ++c)
    {
        if (unaries[c] == null)
        {
            continue;
        }
        for (int p = 0; p < tagCount; ++p)
        {
            if (pmass[p] == 0)
            {
                continue;
            }
            unaries[c][p] /= pmass[p];
        }
    }
    for (int c = 0; c < tagCount; ++c)
    {
        if (unaries[c] == null)
        {
            continue;
        }
        for (int p = 0; p < tagCount; ++p)
        {
            unaries[c][p] = unaries[c][p] <= 0 ? double.NegativeInfinity : Math.Log(unaries[c][p]);
        }
    }
    for (int l = 0; l < tagCount; ++l)
    {
        if (binaries[l] == null)
        {
            continue;
        }
        for (int r = 0; r < tagCount; ++r)
        {
            if (binaries[l][r] == null)
            {
                continue;
            }
            for (int p = 0; p < tagCount; ++p)
            {
                if (pmass[p] == 0)
                {
                    continue;
                }
                binaries[l][r][p] /= pmass[p];
            }
        }
    }
    for (int l = 0; l < tagCount; ++l)
    {
        if (binaries[l] == null)
        {
            continue;
        }
        for (int r = 0; r < tagCount; ++r)
        {
            if (binaries[l][r] == null)
            {
                continue;
            }
            for (int p = 0; p < tagCount; ++p)
            {
                binaries[l][r][p] = binaries[l][r][p] <= 0 ? double.NegativeInfinity : Math.Log(binaries[l][r][p]);
            }
        }
    }

    var terminals = BuildLexSimple(treebank, tagset, vocab, RNG);
    rules = new LAPCFGrammar(tagset, binaries, unaries, terminals);
}
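// Usage sketch (not part of the original source): build the initial
// one-subtag grammar from a treebank and wrap it in a parser. The fixed seed
// is illustrative; passing a null RNG skips the symmetry-breaking noise.
public static HyperGraphParser BuildInitialParserExample(Vocabulary vocab, TagSet tagset,
    List<PhrasalTree> treebank)
{
    LAPCFGrammar rules;
    Build(vocab, tagset, treebank, out rules, new Random(0));
    return new HyperGraphParser(vocab, tagset, rules);
}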
private static double ParseGraphAndCollect(int nthread, List<PhrasalTree> treebank,
    LAPCFGrammar rules, Vocabulary vocab, TagSet tagSet, out int failed)
{
    double llhd = 0;
    failed = 0;
    int xfail = 0;
    var handle = new object();

    // Each thread gets a grammar view that shares the rule parameters but
    // accumulates its own expected counts.
    var rulelist = new List<LAPCFGrammar>();
    rulelist.Add(rules);
    while (rulelist.Count < nthread)
    {
        rulelist.Add(rules.CloneWithSharedParameters());
    }

    Parallel.For(0, nthread, threadid =>
    {
        int fail = 0;
        double xllhd = 0;
        var parser = new HyperGraphParser(vocab, tagSet, rulelist[threadid]);
        for (int i = threadid; i < treebank.Count; i += nthread)
        {
            try
            {
                var graph = parser.BuildHyperGraph(treebank[i]);
                graph.SumForward();
                graph.SumBackward();
                if (double.IsInfinity(graph.RootScore) || double.IsNaN(graph.RootScore))
                {
                    fail += 1;
                    continue;
                }
                graph.CollectExpectedCount();
                xllhd += graph.RootScore;
            }
            catch
            {
                fail += 1;
            }
        }
        lock (handle)
        {
            xfail += fail;
            llhd += xllhd;
        }
    });

    // Fold the per-thread posterior counts back into the main grammar.
    for (int i = 1; i < rulelist.Count; ++i)
    {
        LAPCFGrammar.ApplyToRules((x, y) => x.Add(y), rules.tposteriorCounts, rulelist[i].tposteriorCounts);
        LAPCFGrammar.ApplyToRules((x, y) => x.Add(y), rules.uposteriorCounts, rulelist[i].uposteriorCounts);
        LAPCFGrammar.ApplyToRules((x, y) => x.Add(y), rules.bposteriorCounts, rulelist[i].bposteriorCounts);
    }
    failed = xfail;
    return llhd;
}
public static double[][] BuildLexSimple(List<PhrasalTree> treebank, TagSet tagSet,
    Vocabulary vocab, Random RNG)
{
    int PTCount = tagSet.PTCount;
    int vocabCount = vocab.VocabSize;
    double[][] tagWordCounts = ArrayHelper.AllocateArray<double>(PTCount, vocabCount);
    double[] tagCounts = new double[PTCount];
    HashSet<string>[] tagTypeSets = new HashSet<string>[PTCount];
    for (int i = 0; i < tagTypeSets.Length; ++i)
    {
        tagTypeSets[i] = new HashSet<string>();
    }
    foreach (var tree in treebank)
    {
        tree.ComputeStartEnd();
        foreach (var node in tree.TreeNodes)
        {
            if (node.Children.Count == 0)
            {
                string word = SimpleTokenizor.ETokenize(node.Lex);
                string tag = node.Tag;
                int tagId = tagSet.GetPTID(tag);
                tagTypeSets[tagId].Add(word);
                int wordId = vocab.GetId(word, node.Start == 0);
                // Optional +/-0.5% noise on each count breaks ties.
                double weight = RNG == null ? 1.0 : 1.0 + (RNG.NextDouble() - 0.5) / 100;
                tagWordCounts[tagId][wordId] += weight;
                tagCounts[tagId] += weight;
            }
        }
    }
    // Relative-frequency log P(word | tag); unseen pairs stay at -infinity.
    double[][] scores = ArrayHelper.AllocateArray<double>(vocabCount, PTCount);
    ArrayHelper.Fill(scores, double.NegativeInfinity);
    for (int word = 0; word < scores.Length; ++word)
    {
        for (int tag = 0; tag < scores[word].Length; ++tag)
        {
            if (tagWordCounts[tag][word] > 0)
            {
                scores[word][tag] = Math.Log(tagWordCounts[tag][word] / tagCounts[tag]);
            }
        }
    }
    return scores;
}
private static bool[][] AssignTagConstraints(Vocabulary vocab, TagSet tagSet, string[] words, int[] wids)
{
    // For rare/unknown words with a known morphological base form, restrict
    // the candidate preterminals; entries left null are unconstrained.
    bool[][] allowedTags = new bool[wids.Length][];
    for (int i = 0; i < wids.Length; ++i)
    {
        if (!vocab.IsRareOrUNK(wids[i]))
        {
            continue;
        }
        var lemmas = EMorph.EnglishMorph.GetBaseForm(words[i]);
        if (lemmas == null || lemmas.Count == 0)
        {
            continue;
        }
        allowedTags[i] = new bool[tagSet.PTCount];
        if (char.IsUpper(words[i][0]))
        {
            allowedTags[i][tagSet.GetID("NNP")] = true;
            allowedTags[i][tagSet.GetID("NNPS")] = true;
        }
        foreach (var lemma in lemmas)
        {
            switch (lemma.PoS)
            {
                case EMorph.MorphPoS.NN:
                    allowedTags[i][tagSet.GetID("NN")] = true;
                    var w = words[i].ToLower();
                    if (EMorph.EnglishMorph.IsNoChangeNoun(w) || w.EndsWith("ese") || w.EndsWith("ise"))
                    {
                        allowedTags[i][tagSet.GetID("NNS")] = true;
                    }
                    break;
                case EMorph.MorphPoS.NNS:
                    allowedTags[i][tagSet.GetID("NNS")] = true;
                    break;
                case EMorph.MorphPoS.JJ:
                    allowedTags[i][tagSet.GetID("JJ")] = true;
                    break;
                case EMorph.MorphPoS.JJR:
                    allowedTags[i][tagSet.GetID("JJR")] = true;
                    break;
                case EMorph.MorphPoS.JJS:
                    allowedTags[i][tagSet.GetID("JJS")] = true;
                    break;
                case EMorph.MorphPoS.RB:
                    allowedTags[i][tagSet.GetID("RB")] = true;
                    break;
                case EMorph.MorphPoS.RBR:
                    allowedTags[i][tagSet.GetID("RBR")] = true;
                    break;
                case EMorph.MorphPoS.RBS:
                    allowedTags[i][tagSet.GetID("RBS")] = true;
                    break;
                case EMorph.MorphPoS.VB:
                    allowedTags[i][tagSet.GetID("VB")] = true;
                    allowedTags[i][tagSet.GetID("VBP")] = true;
                    break;
                case EMorph.MorphPoS.VBD:
                    allowedTags[i][tagSet.GetID("VBD")] = true;
                    allowedTags[i][tagSet.GetID("VBN")] = true;
                    break;
                case EMorph.MorphPoS.VBG:
                    allowedTags[i][tagSet.GetID("VBG")] = true;
                    break;
                case EMorph.MorphPoS.VBZ:
                    allowedTags[i][tagSet.GetID("VBZ")] = true;
                    break;
                default:
                    throw new Exception("not recognized morph lemma!");
            }
        }
    }
    return allowedTags;
}
public LAPCFGrammar(TagSet tagset, Vocabulary vocab, string lexfile, string rulefile)
{
    NTCount = tagset.NTCount;
    PTCount = tagset.PTCount;
    ROOTID = tagset.ROOTID;

    // Every tag starts with two subtags except the root, which keeps one.
    subTagCounts = new int[TotalTagCount];
    ArrayHelper.Fill(subTagCounts, 2);
    subTagCounts[ROOTID] = 1;
    var trace = subTagCounts.Select(x => new int[x]).ToArray();
    subtagTraces.Add(trace);

    // Lexicon file: "TAG WORD p(sub0) p(sub1) ..." with plain probabilities;
    // zeros are skipped, the rest are converted to log space.
    using (var sr = new System.IO.StreamReader(lexfile))
    {
        while (!sr.EndOfStream)
        {
            string line = sr.ReadLine();
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            string[] parts = line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);
            string tag = parts[0];
            string word = parts[1];
            for (int i = 0; i < parts.Length - 2; ++i)
            {
                double p = double.Parse(parts[i + 2]);
                if (p <= 0)
                {
                    continue;
                }
                string pline = string.Format("{0}_{1}\t{2}\t{3}", tag, i, word, Math.Log(p));
                BuildTerminalRule(pline, vocab, tagset);
            }
        }
    }

    // Rule file: "P_i -> C_j p" (unary) or "P_i -> L_j R_k p" (binary), with
    // the trailing plain probability converted to log space.
    using (var sr = new System.IO.StreamReader(rulefile))
    {
        while (!sr.EndOfStream)
        {
            string line = sr.ReadLine();
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            string[] parts = line.Split(new string[] { " ", "\t", "->" }, StringSplitOptions.RemoveEmptyEntries);
            int xlen = parts.Length - 1;
            parts[xlen] = Math.Log(double.Parse(parts[xlen])).ToString();
            string pline = string.Join("\t", parts);
            if (parts.Length == 3)
            {
                BuildUnaryRule(pline, tagset);
            }
            else
            {
                BuildBinaryRule(pline, tagset);
            }
        }
    }
}
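// Usage sketch (not part of the original source): load a two-subtag grammar
// from text files. The file names are placeholders and the line formats are
// inferred from the constructor above, so treat both as assumptions:
//   lexicon line:  "NN dog 0.7 0.3"          (one plain probability per subtag)
//   rule line:     "NP_0 -> DT_1 NN_0 0.5"   (plain probability, logged on load)
public static LAPCFGrammar LoadTextGrammarExample(TagSet tagset, Vocabulary vocab)
{
    return new LAPCFGrammar(tagset, vocab, "lex.txt", "rules.txt");
}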