// Builds a lexicon of log P(word | tag) scores by relative frequency over the treebank.
// If RNG is non-null, each count is jittered by up to +/-0.5% to break ties between
// otherwise symmetric solutions.
public static double[][] BuildLexSimple(List<PhrasalTree> treebank, TagSet tagSet, Vocabulary vocab, Random RNG)
{
    int PTCount = tagSet.PTCount;
    int vocabCount = vocab.VocabSize;
    double[][] tagWordCounts = ArrayHelper.AllocateArray<double>(PTCount, vocabCount);
    double[] tagCounts = new double[PTCount];
    HashSet<string>[] tagTypeSets = new HashSet<string>[PTCount];
    for (int i = 0; i < tagTypeSets.Length; ++i)
    {
        tagTypeSets[i] = new HashSet<string>();
    }
    foreach (var tree in treebank)
    {
        tree.ComputeStartEnd();
        foreach (var node in tree.TreeNodes)
        {
            // Leaf nodes carry the surface word; node.Tag is its POS tag.
            if (node.Children.Count == 0)
            {
                string word = SimpleTokenizor.ETokenize(node.Lex);
                string tag = node.Tag;
                int tagId = tagSet.GetPTID(tag);
                tagTypeSets[tagId].Add(word); // distinct word types per tag (unused in the simple build)
                // Sentence-initial tokens (Start == 0) may be normalized differently by the vocabulary.
                int wordId = vocab.GetId(word, node.Start == 0);
                double weight = RNG == null ? 1.0 : 1.0 + (RNG.NextDouble() - 0.5) / 100;
                tagWordCounts[tagId][wordId] += weight;
                tagCounts[tagId] += weight;
            }
        }
    }
    // scores[word][tag] = log P(word | tag); unseen (word, tag) pairs stay at -inf.
    double[][] scores = ArrayHelper.AllocateArray<double>(vocabCount, PTCount);
    ArrayHelper.Fill(scores, double.NegativeInfinity);
    for (int word = 0; word < scores.Length; ++word)
    {
        for (int tag = 0; tag < scores[word].Length; ++tag)
        {
            if (tagWordCounts[tag][word] > 0)
            {
                scores[word][tag] = Math.Log(tagWordCounts[tag][word] / tagCounts[tag]);
            }
        }
    }
    return scores;
}
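// Illustrative usage sketch (hypothetical helper, not part of the original source): looks up
// the log P(word | tag) score that BuildLexSimple produced, assuming Vocabulary.GetId and
// TagSet.GetPTID behave as they do above. The name ScoreWord is an assumption for this example.
public static double ScoreWord(double[][] lex, Vocabulary vocab, TagSet tagSet,
                               string word, string tag, bool sentenceInitial)
{
    // lex is indexed [wordId][tagId]; unseen (word, tag) pairs hold double.NegativeInfinity.
    int wordId = vocab.GetId(SimpleTokenizor.ETokenize(word), sentenceInitial);
    int tagId = tagSet.GetPTID(tag);
    return lex[wordId][tagId];
}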
// Builds a lexicon of log P(word | tag) scores via Bayes inversion:
// P(word | tag) = P(tag | word) * P(word) / P(tag).
public static double[][] BuildLex(List<PhrasalTree> treebank, TagSet tagSet, Vocabulary vocab)
{
    int PTCount = tagSet.PTCount;
    int vocabCount = vocab.VocabSize;
    double[][] tagWordCounts = ArrayHelper.AllocateArray<double>(PTCount, vocabCount);
    double[][] wordTagCounts = ArrayHelper.AllocateArray<double>(vocabCount, PTCount);
    double[] tagCounts = new double[PTCount];
    double[] wordCounts = new double[vocabCount];
    HashSet<string>[] tagTypeSets = new HashSet<string>[PTCount];
    for (int i = 0; i < tagTypeSets.Length; ++i)
    {
        tagTypeSets[i] = new HashSet<string>();
    }
    foreach (var tree in treebank)
    {
        tree.ComputeStartEnd();
        foreach (var node in tree.TreeNodes)
        {
            if (node.Children.Count == 0)
            {
                string word = SimpleTokenizor.ETokenize(node.Lex);
                string tag = node.Tag;
                int tagId = tagSet.GetPTID(tag);
                tagTypeSets[tagId].Add(word);
                int wordId = vocab.GetId(word, node.Start == 0);
                tagWordCounts[tagId][wordId] += 1.0;
                wordTagCounts[wordId][tagId] += 1.0;
                tagCounts[tagId] += 1.0;
                wordCounts[wordId] += 1.0;
            }
        }
    }
    // Number of distinct word types per tag; an open-class tag admits many types.
    double[] typeTagCount = new double[PTCount];
    for (int i = 0; i < typeTagCount.Length; ++i)
    {
        typeTagCount[i] = tagTypeSets[i].Count;
    }
    // Normalize each wordTagCounts row into P(tag | word). The disabled block below
    // add-X smoothed rare/UNK words toward open tag classes.
    for (int wordId = 0; wordId < wordTagCounts.Length; ++wordId)
    {
        var wt = wordTagCounts[wordId];
        double wc = wordCounts[wordId];
        //bool isRare = vocab.IsRareOrUNK(wordId);
        //if (isRare) {
        //    for (int tid = 0; tid < wt.Length; ++tid) {
        //        if (wt[tid] > 0 || typeTagCount[tid] >= openTagClassThr) {
        //            wt[tid] += addXSmoothing;
        //            wc += addXSmoothing;
        //        }
        //    }
        //}
        if (wc > 0) // guard against vocab entries that never occur in the treebank
        {
            for (int i = 0; i < wt.Length; ++i)
            {
                wt[i] /= wc;
            }
        }
    }
    // Normalize wordCounts into P(word) and tagCounts into P(tag).
    double totalwc = MathHelper.Sum(wordCounts);
    for (int i = 0; i < wordCounts.Length; ++i)
    {
        wordCounts[i] /= totalwc;
    }
    double totaltc = MathHelper.Sum(tagCounts);
    for (int i = 0; i < tagCounts.Length; ++i)
    {
        tagCounts[i] /= totaltc;
    }
    // Bayes inversion: tagWordCounts[tag][word] now holds P(word | tag).
    for (int tagId = 0; tagId < tagCounts.Length; ++tagId)
    {
        for (int wordId = 0; wordId < wordCounts.Length; ++wordId)
        {
            tagWordCounts[tagId][wordId] =
                wordTagCounts[wordId][tagId] * wordCounts[wordId] / tagCounts[tagId];
        }
    }
    // scores[word][tag] = log P(word | tag); unseen (word, tag) pairs stay at -inf.
    double[][] scores = ArrayHelper.AllocateArray<double>(vocabCount, PTCount);
    ArrayHelper.Fill(scores, double.NegativeInfinity);
    for (int word = 0; word < scores.Length; ++word)
    {
        for (int tag = 0; tag < scores[word].Length; ++tag)
        {
            if (tagWordCounts[tag][word] > 0)
            {
                scores[word][tag] = Math.Log(tagWordCounts[tag][word]);
            }
        }
    }
    return scores;
}
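// Worked check (hypothetical, self-contained sketch): on a toy 2-tag / 2-word count table,
// the Bayes inversion P(word | tag) = P(tag | word) * P(word) / P(tag) used in BuildLex
// reproduces the relative-frequency estimate count(tag, word) / count(tag) that
// BuildLexSimple computes directly. CheckBayesInversion is a name assumed for this example.
public static void CheckBayesInversion()
{
    double[,] counts = { { 3, 1 }, { 2, 4 } }; // counts[tag, word]
    double total = 3 + 1 + 2 + 4;
    for (int tag = 0; tag < 2; ++tag)
    {
        double tagCount = counts[tag, 0] + counts[tag, 1];
        for (int word = 0; word < 2; ++word)
        {
            double wordCount = counts[0, word] + counts[1, word];
            double direct = counts[tag, word] / tagCount;         // P(word | tag) by relative frequency
            double pTagGivenWord = counts[tag, word] / wordCount; // P(tag | word)
            double inverted = pTagGivenWord * (wordCount / total) / (tagCount / total);
            System.Diagnostics.Debug.Assert(Math.Abs(direct - inverted) < 1e-12);
        }
    }
}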