/// <summary>
/// Computes, for every rare or unknown word, the set of preterminal tags that the
/// English morphological analyser considers possible.
/// </summary>
/// <param name="vocab">Vocabulary used to decide whether a word id is rare/unknown.</param>
/// <param name="tagSet">Tag set used to map tag names to ids and to size each mask.</param>
/// <param name="words">Surface forms of the sentence, parallel to <paramref name="wids"/>.</param>
/// <param name="wids">Word ids of the sentence.</param>
/// <returns>
/// One entry per word. The entry is <c>null</c> when the word is not rare/unknown or
/// when the analyser returns no lemma (presumably treated as "unconstrained" by the
/// caller - confirm); otherwise it is a mask of length <c>tagSet.PTCount</c> whose
/// <c>true</c> cells mark the allowed tags.
/// </returns>
/// <exception cref="Exception">The analyser returned a part of speech not handled here.</exception>
private static bool[][] AssignTagConstraints(Vocabulary vocab, TagSet tagSet, string[] words, int[] wids)
{
    bool[][] allowedTags = new bool[wids.Length][];

    for (int i = 0; i < wids.Length; ++i)
    {
        if (!vocab.IsRareOrUNK(wids[i]))
        {
            continue;
        }

        string word = words[i];
        var lemmas = EMorph.EnglishMorph.GetBaseForm(word);
        if (lemmas == null || lemmas.Count == 0)
        {
            continue;
        }

        bool[] allowed = new bool[tagSet.PTCount];
        allowedTags[i] = allowed;

        // A capitalised token may always be a proper noun. The emptiness guard
        // prevents an IndexOutOfRangeException on a zero-length token.
        if (!string.IsNullOrEmpty(word) && char.IsUpper(word[0]))
        {
            allowed[tagSet.GetID("NNP")] = true;
            allowed[tagSet.GetID("NNPS")] = true;
        }

        foreach (var lemma in lemmas)
        {
            switch (lemma.PoS)
            {
                case EMorph.MorphPoS.NN:
                    allowed[tagSet.GetID("NN")] = true;

                    // Culture-invariant lowering and ordinal suffix tests: the data is
                    // English, so the result must not depend on the thread culture
                    // (e.g. the Turkish dotless-i mapping of 'I').
                    string lowered = word.ToLowerInvariant();
                    if (EMorph.EnglishMorph.IsNoChangeNoun(lowered)
                        || lowered.EndsWith("ese", StringComparison.Ordinal)
                        || lowered.EndsWith("ise", StringComparison.Ordinal))
                    {
                        // The same surface form can also be used as a plural.
                        allowed[tagSet.GetID("NNS")] = true;
                    }

                    break;

                case EMorph.MorphPoS.NNS:
                    // Only NNS is enabled here; NN is not added for plural analyses.
                    allowed[tagSet.GetID("NNS")] = true;
                    break;

                case EMorph.MorphPoS.JJ:
                    allowed[tagSet.GetID("JJ")] = true;
                    break;

                case EMorph.MorphPoS.JJR:
                    allowed[tagSet.GetID("JJR")] = true;
                    break;

                case EMorph.MorphPoS.JJS:
                    allowed[tagSet.GetID("JJS")] = true;
                    break;

                case EMorph.MorphPoS.RB:
                    allowed[tagSet.GetID("RB")] = true;
                    break;

                case EMorph.MorphPoS.RBR:
                    allowed[tagSet.GetID("RBR")] = true;
                    break;

                case EMorph.MorphPoS.RBS:
                    allowed[tagSet.GetID("RBS")] = true;
                    break;

                case EMorph.MorphPoS.VB:
                    // A base form is allowed both as VB and as VBP.
                    allowed[tagSet.GetID("VB")] = true;
                    allowed[tagSet.GetID("VBP")] = true;
                    break;

                case EMorph.MorphPoS.VBD:
                    // A VBD analysis enables both VBD and VBN.
                    allowed[tagSet.GetID("VBD")] = true;
                    allowed[tagSet.GetID("VBN")] = true;
                    break;

                case EMorph.MorphPoS.VBG:
                    allowed[tagSet.GetID("VBG")] = true;
                    break;

                case EMorph.MorphPoS.VBZ:
                    allowed[tagSet.GetID("VBZ")] = true;
                    break;

                default:
                    throw new Exception("not recognized morph lemma!");
            }
        }
    }

    return allowedTags;
}
/// <summary>
/// Parses one textual unary rule and stores its score in <c>urules</c>.
/// </summary>
/// <remarks>
/// The line is split on tabs and underscores into five fields, read in this order:
/// parent tag, parent subtag index, child tag, child subtag index, score.
/// The rule table is indexed as <c>urules[child][parent]</c> and each rule's score
/// matrix as <c>scores[childSubtag][parentSubtag]</c>; missing table rows and rules
/// are allocated on first use.
/// NOTE(review): the numeric fields are parsed with the current thread culture -
/// confirm that whatever writes these rule strings uses the same culture.
/// </remarks>
/// <param name="ruleString">The serialized rule.</param>
/// <param name="tagSet">Tag set used to resolve tag names to ids.</param>
private void BuildUnaryRule(string ruleString, TagSet tagSet)
{
    string[] separators = new string[] { "\t", "_" };
    string[] fields = ruleString.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    int parentTag = tagSet.GetID(fields[0]);
    int parentSub = int.Parse(fields[1]);
    int childTag = tagSet.GetID(fields[2]);
    int childSub = int.Parse(fields[3]);
    double score = double.Parse(fields[4]);

    // Lazily create the outer table.
    if (urules == null)
    {
        urules = new UnaryRule[TotalTagCount][];
    }

    // Lazily create the row for this child tag.
    UnaryRule[] rulesForChild = urules[childTag];
    if (rulesForChild == null)
    {
        rulesForChild = new UnaryRule[TotalTagCount];
        urules[childTag] = rulesForChild;
    }

    // Lazily create the rule itself, sized child-subtags x parent-subtags.
    UnaryRule rule = rulesForChild[parentTag];
    if (rule == null)
    {
        rule = new UnaryRule(
            ArrayHelper.AllocateArray<double>(subTagCounts[childTag], subTagCounts[parentTag]),
            parentTag,
            childTag);
        rulesForChild[parentTag] = rule;
        rule.ClearScore();
    }

    rule.scores[childSub][parentSub] = score;
}
/// <summary>
/// Builds an initial grammar from a binarized treebank by counting unary and binary
/// productions, adding random noise, normalising by parent mass and converting the
/// result to natural-log scores.
/// </summary>
/// <remarks>
/// Count tables are indexed <c>unaries[child][parent]</c> and
/// <c>binaries[left][right][parent]</c>. Rows without any observed count are set to
/// <c>null</c>. Every cell of a surviving row (including cells with a zero count)
/// receives noise drawn from <c>RNG.NextDouble()</c>, and the same noise is added to
/// the parent's total mass before normalisation.
/// </remarks>
/// <param name="vocab">Vocabulary passed through to the lexicon builder.</param>
/// <param name="tagset">Tag set; <c>NTCount + PTCount</c> defines the table size.</param>
/// <param name="treebank">Trees whose nodes have at most two children.</param>
/// <param name="rules">Receives the constructed grammar.</param>
/// <param name="RNG">
/// Source of the noise. When omitted or <c>null</c>, a new default-constructed
/// <see cref="Random"/> is used (previously this case threw a NullReferenceException).
/// </param>
/// <exception cref="Exception">A tree node has more than two children.</exception>
public static void Build(
    Vocabulary vocab,
    TagSet tagset,
    List<PhrasalTree> treebank,
    out LAPCFGrammar rules,
    Random RNG = null)
{
    // The parameter defaults to null but is dereferenced below; fall back to a
    // fresh generator so the documented default is actually usable.
    if (RNG == null)
    {
        RNG = new Random();
    }

    int tagCount = tagset.NTCount + tagset.PTCount;
    double[] pmass = new double[tagCount];
    double[][] unaries = ArrayHelper.AllocateArray<double>(tagCount, tagCount);
    double[][][] binaries = ArrayHelper.AllocateArray<double>(tagCount, tagCount, tagCount);

    // 1. Count productions.
    foreach (var tree in treebank)
    {
        foreach (var node in tree.TreeNodes)
        {
            int childCount = node.Children.Count;
            if (childCount == 0)
            {
                // Terminals contribute no production here.
                continue;
            }

            if (childCount > 2)
            {
                throw new Exception("tree node with more than 2 children!");
            }

            int pt = tagset.GetID(node.Tag);
            if (childCount == 1)
            {
                int ct = tagset.GetID(node.Children[0].Tag);
                pmass[pt] += 1.0;
                unaries[ct][pt] += 1.0;
            }
            else
            {
                int lt = tagset.GetID(node.Children[0].Tag);
                int rt = tagset.GetID(node.Children[1].Tag);
                pmass[pt] += 1.0;
                binaries[lt][rt][pt] += 1.0;
            }
        }
    }

    // 2. Drop rows that were never observed.
    for (int c = 0; c < unaries.Length; ++c)
    {
        if (!unaries[c].Any(count => count > 0))
        {
            unaries[c] = null;
        }
    }

    for (int l = 0; l < binaries.Length; ++l)
    {
        bool leftSurvives = false;
        for (int r = 0; r < binaries[l].Length; ++r)
        {
            if (binaries[l][r].Any(count => count > 0))
            {
                leftSurvives = true;
            }
            else
            {
                binaries[l][r] = null;
            }
        }

        if (!leftSurvives)
        {
            binaries[l] = null;
        }
    }

    // 3. Add noise to every cell of the surviving rows. The draw order (all unaries
    //    first, then binaries, each in index order) is kept stable so that a seeded
    //    generator reproduces the same grammar.
    foreach (double[] row in unaries)
    {
        if (row == null)
        {
            continue;
        }

        for (int p = 0; p < row.Length; ++p)
        {
            double noise = RNG.NextDouble();
            row[p] += noise;
            pmass[p] += noise;
        }
    }

    foreach (double[][] plane in binaries)
    {
        if (plane == null)
        {
            continue;
        }

        foreach (double[] row in plane)
        {
            if (row == null)
            {
                continue;
            }

            for (int p = 0; p < row.Length; ++p)
            {
                double noise = RNG.NextDouble();
                row[p] += noise;
                pmass[p] += noise;
            }
        }
    }

    // 4. Normalise by the parent's mass and move to log space. Parents with zero
    //    mass are left undivided; non-positive values become negative infinity.
    foreach (double[] row in unaries)
    {
        if (row == null)
        {
            continue;
        }

        for (int p = 0; p < tagCount; ++p)
        {
            double value = row[p];
            if (pmass[p] != 0)
            {
                value /= pmass[p];
            }

            row[p] = value <= 0 ? double.NegativeInfinity : Math.Log(value);
        }
    }

    foreach (double[][] plane in binaries)
    {
        if (plane == null)
        {
            continue;
        }

        foreach (double[] row in plane)
        {
            if (row == null)
            {
                continue;
            }

            for (int p = 0; p < tagCount; ++p)
            {
                double value = row[p];
                if (pmass[p] != 0)
                {
                    value /= pmass[p];
                }

                row[p] = value <= 0 ? double.NegativeInfinity : Math.Log(value);
            }
        }
    }

    // 5. Build the lexicon with the same generator and assemble the grammar.
    var terminals = BuildLexSimple(treebank, tagset, vocab, RNG);
    rules = new LAPCFGrammar(tagset, binaries, unaries, terminals);
}
/// <summary>
/// Parses one textual terminal (lexical) rule and stores its score in <c>trules</c>.
/// </summary>
/// <remarks>
/// The line is split on tabs and underscores into four fields, read in this order:
/// tag, subtag index, word, score. The rule table is indexed as
/// <c>trules[wordId][tag]</c>; missing table rows and rules are allocated on first use.
/// NOTE(review): because underscore is a separator, a word containing '_' would be
/// split into several fields - confirm the input never contains such words.
/// NOTE(review): the numeric fields are parsed with the current thread culture -
/// confirm that whatever writes these rule strings uses the same culture.
/// </remarks>
/// <param name="ruleString">The serialized rule.</param>
/// <param name="vocab">Vocabulary used to resolve the word to an id and to size the table.</param>
/// <param name="tagSet">Tag set used to resolve the tag name and to size each row.</param>
private void BuildTerminalRule(string ruleString, Vocabulary vocab, TagSet tagSet)
{
    string[] separators = new string[] { "\t", "_" };
    string[] fields = ruleString.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    int tagId = tagSet.GetID(fields[0]);
    int subtagIndex = int.Parse(fields[1]);
    int wordId = vocab.GetId(fields[2]);
    double score = double.Parse(fields[3]);

    // Lazily create the outer table, one row per vocabulary entry.
    if (trules == null)
    {
        trules = new TerminalRule[vocab.VocabSize][];
    }

    // Lazily create the row for this word, one slot per preterminal tag.
    TerminalRule[] rulesForWord = trules[wordId];
    if (rulesForWord == null)
    {
        rulesForWord = new TerminalRule[tagSet.PTCount];
        trules[wordId] = rulesForWord;
    }

    // Lazily create the rule itself with one score per subtag of the tag.
    TerminalRule rule = rulesForWord[tagId];
    if (rule == null)
    {
        rule = new TerminalRule(new double[subTagCounts[tagId]], tagId, wordId);
        rulesForWord[tagId] = rule;
        rule.ClearScore();
    }

    rule.scores[subtagIndex] = score;
}