Exemplo n.º 1
0
        /// <summary>
        /// Builds a log-score table over (word, tag) pairs from the leaf nodes of a treebank.
        /// </summary>
        /// <remarks>
        /// For every pair observed at least once the score is
        /// <c>log( (c(w,t) / c(w)) * (c(w) / N) / (c(t) / N) )</c>, where <c>c</c> are leaf
        /// counts and <c>N</c> is the total number of leaves. This is algebraically
        /// <c>log(c(w,t) / c(t))</c>; the factors are deliberately evaluated in the order
        /// shown so the floating-point results stay the same as in earlier versions.
        /// Pairs that were never observed keep <see cref="double.NegativeInfinity"/>.
        /// </remarks>
        /// <param name="treebank">Trees whose leaves (nodes without children) supply the counts.</param>
        /// <param name="tagSet">Maps a leaf's tag to an index in <c>[0, PTCount)</c> via <c>GetPTID</c>.</param>
        /// <param name="vocab">Maps a tokenized word to an index in <c>[0, VocabSize)</c> via <c>GetId</c>.</param>
        /// <returns>A jagged array indexed as <c>scores[wordId][tagId]</c>.</returns>
        public static double[][] BuildLex(List<PhrasalTree> treebank, TagSet tagSet, Vocabulary vocab)
        {
            int PTCount = tagSet.PTCount;
            int vocabCount = vocab.VocabSize;
            double[][] wordTagCounts = ArrayHelper.AllocateArray<double> (vocabCount, PTCount);
            double[] tagCounts = new double[PTCount];
            double[] wordCounts = new double[vocabCount];

            foreach (var tree in treebank) {
                // Called before node.Start is read below; presumably it fills in the
                // Start/End positions of the nodes (confirm in PhrasalTree).
                tree.ComputeStartEnd ();
                foreach (var node in tree.TreeNodes) {
                    if (node.Children.Count != 0) {
                        continue; // only leaves carry a word
                    }

                    string word = SimpleTokenizor.ETokenize (node.Lex);
                    int tagId = tagSet.GetPTID (node.Tag);
                    // The flag is true for a leaf whose Start is 0.
                    int wordId = vocab.GetId (word, node.Start == 0);

                    wordTagCounts [wordId] [tagId] += 1.0;
                    tagCounts [tagId] += 1.0;
                    wordCounts [wordId] += 1.0;
                }
            }

            double[][] scores = ArrayHelper.AllocateArray<double> (vocabCount, PTCount);

            ArrayHelper.Fill (scores, double.NegativeInfinity);

            double totalwc = MathHelper.Sum (wordCounts);
            double totaltc = MathHelper.Sum (tagCounts);

            if (totalwc <= 0 || totaltc <= 0) {
                // No leaves were counted: every pair is unobserved, and normalizing
                // would only produce 0/0.
                return scores;
            }

            // Turn each word's row into relative frequencies: c(w,t) / c(w).
            for (int wordId = 0; wordId < wordTagCounts.Length; ++wordId) {
                double wc = wordCounts [wordId];

                if (wc <= 0) {
                    continue; // unseen word: the row stays all zero instead of becoming NaN
                }

                var wt = wordTagCounts [wordId];

                for (int i = 0; i < wt.Length; ++i) {
                    wt [i] /= wc;
                }
            }

            // Relative frequency of each word: c(w) / N.
            for (int i = 0; i < wordCounts.Length; ++i) {
                wordCounts [i] /= totalwc;
            }

            // Relative frequency of each tag: c(t) / N.
            for (int i = 0; i < tagCounts.Length; ++i) {
                tagCounts [i] /= totaltc;
            }

            for (int word = 0; word < scores.Length; ++word) {
                for (int tag = 0; tag < scores [word].Length; ++tag) {
                    double tagGivenWord = wordTagCounts [word] [tag];

                    if (tagGivenWord <= 0) {
                        continue; // pair never observed; this also keeps unseen tags out of the division
                    }

                    double wordGivenTag = tagGivenWord * wordCounts [word] / tagCounts [tag];

                    if (wordGivenTag > 0) {
                        scores [word] [tag] = Math.Log (wordGivenTag);
                    }
                }
            }

            return scores;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Builds a log-score table over (word, tag) pairs from the leaf nodes of a treebank,
        /// scoring each observed pair as <c>log(weight(tag, word) / weight(tag))</c>.
        /// </summary>
        /// <remarks>
        /// Each leaf contributes a weight of exactly 1 when <paramref name="RNG"/> is null;
        /// otherwise it contributes <c>1 + (RNG.NextDouble() - 0.5) / 100</c>, i.e. a value
        /// within half a percent of 1. Exactly one random number is drawn per leaf, in
        /// treebank order. Pairs that were never observed keep
        /// <see cref="double.NegativeInfinity"/>.
        /// </remarks>
        /// <param name="treebank">Trees whose leaves (nodes without children) supply the counts.</param>
        /// <param name="tagSet">Maps a leaf's tag to an index in <c>[0, PTCount)</c> via <c>GetPTID</c>.</param>
        /// <param name="vocab">Maps a tokenized word to an index in <c>[0, VocabSize)</c> via <c>GetId</c>.</param>
        /// <param name="RNG">Optional source of jitter for the per-leaf weight; may be null.</param>
        /// <returns>A jagged array indexed as <c>scores[wordId][tagId]</c>.</returns>
        public static double[][] BuildLexSimple(List<PhrasalTree> treebank, TagSet tagSet, Vocabulary vocab, Random RNG)
        {
            int PTCount = tagSet.PTCount;
            int vocabCount = vocab.VocabSize;
            double[][] tagWordCounts = ArrayHelper.AllocateArray<double>(PTCount, vocabCount);
            double[] tagCounts = new double[PTCount];

            foreach (var tree in treebank)
            {
                // Called before node.Start is read below; presumably it fills in the
                // Start/End positions of the nodes (confirm in PhrasalTree).
                tree.ComputeStartEnd();
                foreach (var node in tree.TreeNodes)
                {
                    if (node.Children.Count != 0)
                    {
                        continue; // only leaves carry a word
                    }

                    string word = SimpleTokenizor.ETokenize(node.Lex);
                    int tagId = tagSet.GetPTID(node.Tag);
                    // The flag is true for a leaf whose Start is 0.
                    int wordId = vocab.GetId(word, node.Start == 0);

                    double weight = RNG == null ? 1.0 : 1.0 + (RNG.NextDouble() - 0.5) / 100;

                    tagWordCounts[tagId][wordId] += weight;
                    tagCounts[tagId] += weight;
                }
            }

            double[][] scores = ArrayHelper.AllocateArray<double>(vocabCount, PTCount);

            ArrayHelper.Fill(scores, double.NegativeInfinity);

            for (int word = 0; word < scores.Length; ++word)
            {
                for (int tag = 0; tag < scores[word].Length; ++tag)
                {
                    double pairWeight = tagWordCounts[tag][word];

                    // A positive pair weight implies a positive tag total, so the
                    // division below never sees a zero denominator.
                    if (pairWeight > 0)
                    {
                        scores[word][tag] = Math.Log(pairWeight / tagCounts[tag]);
                    }
                }
            }

            return scores;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Parses one serialized terminal rule and stores its score in <c>trules</c>,
        /// creating the table, the word's row and the rule object as needed.
        /// </summary>
        /// <remarks>
        /// The line is split on tab and underscore (empty pieces are dropped) and the
        /// first four pieces are read as: tag name, subtag index, word, score.
        /// NOTE(review): the score is parsed with the current culture, and a tag or word
        /// that itself contains '_' would shift the pieces. Confirm against whatever
        /// writes these lines.
        /// </remarks>
        /// <param name="ruleString">The serialized rule line.</param>
        /// <param name="vocab">Resolves the word to its index and supplies the table height.</param>
        /// <param name="tagSet">Resolves the tag name to its index and supplies the row width.</param>
        private void BuildTerminalRule(string ruleString, Vocabulary vocab, TagSet tagSet)
        {
            string[] separators = new string[] { "\t", "_" };
            string[] fields = ruleString.Split(separators, StringSplitOptions.RemoveEmptyEntries);

            int tagId = tagSet.GetID(fields [0]);
            int subtagId = int.Parse(fields [1]);
            int wordId = vocab.GetId(fields [2]);
            double score = double.Parse(fields [3]);

            // The outer table is created on the first rule that is read.
            if (trules == null)
            {
                trules = new TerminalRule[vocab.VocabSize][];
            }

            TerminalRule[] rulesForWord = trules [wordId];

            if (rulesForWord == null)
            {
                rulesForWord = new TerminalRule[tagSet.PTCount];
                trules [wordId] = rulesForWord;
            }

            TerminalRule rule = rulesForWord [tagId];

            if (rule == null)
            {
                // One score slot per subtag of this tag; the scores are reset before first use.
                rule = new TerminalRule(new double[subTagCounts [tagId]], tagId, wordId);
                rulesForWord [tagId] = rule;
                rule.ClearScore();
            }

            rule.scores [subtagId] = score;
        }