예제 #1
0
        /// <summary>
        /// Derives per-token part-of-speech constraints for rare or unknown words by
        /// running each such word through the English morphological analyser.
        /// </summary>
        /// <param name="vocab">Vocabulary used to decide whether a word id is rare or unknown.</param>
        /// <param name="tagSet">Tag set that supplies tag ids and the mask length (<c>PTCount</c>).</param>
        /// <param name="words">Surface forms of the tokens, indexed in parallel with <paramref name="wids"/>.</param>
        /// <param name="wids">Vocabulary ids of the tokens.</param>
        /// <returns>
        /// One entry per token. An entry is <c>null</c> when the token is not rare/unknown
        /// or the analyser produced no lemma; otherwise it is a mask of length
        /// <c>PTCount</c> whose <c>true</c> positions are the tag ids the token may take.
        /// </returns>
        /// <exception cref="Exception">The analyser returned a part of speech this method does not map.</exception>
        private static bool[][] AssignTagConstraints(Vocabulary vocab, TagSet tagSet, string[] words, int[] wids)
        {
            bool[][] allowedTags = new bool[wids.Length][];

            for (int i = 0; i < wids.Length; ++i)
            {
                // Only rare/unknown words are constrained; everything else keeps a null entry.
                if (!vocab.IsRareOrUNK(wids[i]))
                {
                    continue;
                }

                string word = words[i];
                var lemmas = EMorph.EnglishMorph.GetBaseForm(word);

                if (lemmas == null || lemmas.Count == 0)
                {
                    continue;
                }

                bool[] allowed = new bool[tagSet.PTCount];
                allowedTags[i] = allowed;

                // A capitalised rare word may always be a proper noun.
                // The length guard avoids indexing into an empty token.
                if (word.Length > 0 && char.IsUpper(word[0]))
                {
                    allowed[tagSet.GetID("NNP")] = true;
                    allowed[tagSet.GetID("NNPS")] = true;
                }

                foreach (var lemma in lemmas)
                {
                    switch (lemma.PoS)
                    {
                        case EMorph.MorphPoS.NN:
                            allowed[tagSet.GetID("NN")] = true;

                            // Culture-invariant lowering and ordinal suffix tests keep the
                            // result independent of the machine's locale (e.g. Turkish 'I').
                            string lowered = word.ToLowerInvariant();
                            if (EMorph.EnglishMorph.IsNoChangeNoun(lowered)
                                || lowered.EndsWith("ese", StringComparison.Ordinal)
                                || lowered.EndsWith("ise", StringComparison.Ordinal))
                            {
                                // These nouns are also allowed the plural tag
                                // (presumably because singular and plural forms coincide).
                                allowed[tagSet.GetID("NNS")] = true;
                            }
                            break;
                        case EMorph.MorphPoS.NNS:
                            allowed[tagSet.GetID("NNS")] = true;
                            break;
                        case EMorph.MorphPoS.JJ:
                            allowed[tagSet.GetID("JJ")] = true;
                            break;
                        case EMorph.MorphPoS.JJR:
                            allowed[tagSet.GetID("JJR")] = true;
                            break;
                        case EMorph.MorphPoS.JJS:
                            allowed[tagSet.GetID("JJS")] = true;
                            break;
                        case EMorph.MorphPoS.RB:
                            allowed[tagSet.GetID("RB")] = true;
                            break;
                        case EMorph.MorphPoS.RBR:
                            allowed[tagSet.GetID("RBR")] = true;
                            break;
                        case EMorph.MorphPoS.RBS:
                            allowed[tagSet.GetID("RBS")] = true;
                            break;
                        case EMorph.MorphPoS.VB:
                            // A base-form analysis allows both VB and VBP.
                            allowed[tagSet.GetID("VB")] = true;
                            allowed[tagSet.GetID("VBP")] = true;
                            break;
                        case EMorph.MorphPoS.VBD:
                            // A past-tense analysis allows both VBD and VBN.
                            allowed[tagSet.GetID("VBD")] = true;
                            allowed[tagSet.GetID("VBN")] = true;
                            break;
                        case EMorph.MorphPoS.VBG:
                            allowed[tagSet.GetID("VBG")] = true;
                            break;
                        case EMorph.MorphPoS.VBZ:
                            allowed[tagSet.GetID("VBZ")] = true;
                            break;
                        default:
                            throw new Exception("not recognized morph lemma!");
                    }
                }
            }

            return allowedTags;
        }
예제 #2
0
        /// <summary>
        /// Parses one textual unary rule and stores its score in <c>urules</c>.
        /// </summary>
        /// <remarks>
        /// The line is split on tabs and underscores into five fields, read in order as:
        /// parent tag, parent subtag index, child tag, child subtag index, score.
        /// Numbers are parsed with the invariant culture so that a rule file is read the
        /// same way regardless of the machine's regional settings.
        /// NOTE(review): assumes the rule file was written with '.' as the decimal
        /// separator; confirm the writer also uses the invariant culture.
        /// </remarks>
        /// <param name="ruleString">One line of the rule file describing a unary rule.</param>
        /// <param name="tagSet">Tag set used to map tag names to ids.</param>
        private void BuildUnaryRule(string ruleString, TagSet tagSet)
        {
            string[] parts = ruleString.Split(new string[] { "\t", "_" }, StringSplitOptions.RemoveEmptyEntries);

            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            int ptag = tagSet.GetID(parts[0]);
            int subptag = int.Parse(parts[1], invariant);
            int ctag = tagSet.GetID(parts[2]);
            int subctag = int.Parse(parts[3], invariant);
            double s = double.Parse(parts[4], invariant);

            // The table is indexed [child][parent] and is allocated lazily, level by level.
            if (urules == null)
            {
                urules = new UnaryRule[TotalTagCount][];
            }

            if (urules[ctag] == null)
            {
                urules[ctag] = new UnaryRule[TotalTagCount];
            }

            if (urules[ctag][ptag] == null)
            {
                // Score matrix is [child subtag][parent subtag].
                urules[ctag][ptag] = new UnaryRule(
                    ArrayHelper.AllocateArray<double>(subTagCounts[ctag], subTagCounts[ptag]),
                    ptag,
                    ctag);
                urules[ctag][ptag].ClearScore();
            }

            urules[ctag][ptag].scores[subctag][subptag] = s;
        }
예제 #3
0
        /// <summary>
        /// Builds an initial grammar from a treebank by counting unary and binary
        /// productions, adding random noise to the surviving entries, normalising by
        /// the per-parent mass and converting the result to natural-log scores.
        /// </summary>
        /// <param name="vocab">Vocabulary, forwarded to <c>BuildLexSimple</c>.</param>
        /// <param name="tagset">Tag set; the tables have <c>NTCount + PTCount</c> entries per dimension.</param>
        /// <param name="treebank">Trees whose nodes have at most two children.</param>
        /// <param name="rules">Receives the constructed grammar.</param>
        /// <param name="RNG">
        /// Source of the noise. When omitted (<c>null</c>) a new <see cref="Random"/> is
        /// created; the original code dereferenced the null default and crashed.
        /// </param>
        /// <exception cref="Exception">A tree node has more than two children.</exception>
        public static void Build(
            Vocabulary vocab,
            TagSet tagset,
            List<PhrasalTree> treebank,
            out LAPCFGrammar rules,
            Random RNG = null)
        {
            // Fall back to a default generator so the optional parameter is actually optional.
            Random rng = RNG ?? new Random();

            int tagCount = tagset.NTCount + tagset.PTCount;

            // pmass[p]      : total (count + noise) mass of parent tag p
            // unaries[c][p] : mass of the unary rule p -> c
            // binaries[l][r][p] : mass of the binary rule p -> l r
            double[] pmass = new double[tagCount];
            double[][] unaries = ArrayHelper.AllocateArray<double>(tagCount, tagCount);
            double[][][] binaries = ArrayHelper.AllocateArray<double>(tagCount, tagCount, tagCount);

            // 1. Count productions observed in the treebank.
            foreach (var tree in treebank)
            {
                foreach (var node in tree.TreeNodes)
                {
                    int childCount = node.Children.Count;

                    if (childCount == 0)
                    {
                        // Terminal node: handled by the lexicon builder below.
                        continue;
                    }

                    if (childCount == 1)
                    {
                        int pt = tagset.GetID(node.Tag);
                        int ct = tagset.GetID(node.Children[0].Tag);

                        pmass[pt] += 1.0;
                        unaries[ct][pt] += 1.0;
                    }
                    else if (childCount == 2)
                    {
                        int pt = tagset.GetID(node.Tag);
                        int lt = tagset.GetID(node.Children[0].Tag);
                        int rt = tagset.GetID(node.Children[1].Tag);

                        pmass[pt] += 1.0;
                        binaries[lt][rt][pt] += 1.0;
                    }
                    else
                    {
                        throw new Exception("tree node with more than 2 children!");
                    }
                }
            }

            // 2. Drop (set to null) every unary row that was never observed.
            for (int c = 0; c < unaries.Length; ++c)
            {
                bool observed = false;
                for (int p = 0; p < unaries[c].Length; ++p)
                {
                    if (unaries[c][p] > 0)
                    {
                        observed = true;
                        break;
                    }
                }

                if (!observed)
                {
                    unaries[c] = null;
                }
            }

            // 3. Same pruning for binaries, at both the [l][r] and the [l] level.
            for (int l = 0; l < binaries.Length; ++l)
            {
                bool leftObserved = false;
                for (int r = 0; r < binaries[l].Length; ++r)
                {
                    bool pairObserved = false;
                    for (int p = 0; p < binaries[l][r].Length; ++p)
                    {
                        if (binaries[l][r][p] > 0)
                        {
                            pairObserved = true;
                            break;
                        }
                    }

                    if (pairObserved)
                    {
                        leftObserved = true;
                    }
                    else
                    {
                        binaries[l][r] = null;
                    }
                }

                if (!leftObserved)
                {
                    binaries[l] = null;
                }
            }

            // 4. Add noise to every entry of the surviving rows (unaries first, then
            //    binaries, so the sequence of draws for a given seed is unchanged).
            for (int c = 0; c < unaries.Length; ++c)
            {
                double[] row = unaries[c];
                if (row == null)
                {
                    continue;
                }

                for (int p = 0; p < row.Length; ++p)
                {
                    double noise = rng.NextDouble();
                    row[p] += noise;
                    pmass[p] += noise;
                }
            }

            for (int l = 0; l < binaries.Length; ++l)
            {
                double[][] plane = binaries[l];
                if (plane == null)
                {
                    continue;
                }

                for (int r = 0; r < plane.Length; ++r)
                {
                    double[] row = plane[r];
                    if (row == null)
                    {
                        continue;
                    }

                    for (int p = 0; p < row.Length; ++p)
                    {
                        double noise = rng.NextDouble();
                        row[p] += noise;
                        pmass[p] += noise;
                    }
                }
            }

            // 5. Normalise by parent mass and convert to log scores.
            //    Entries whose parent has zero mass are left unnormalised, as before;
            //    non-positive values become negative infinity.
            for (int c = 0; c < tagCount; ++c)
            {
                double[] row = unaries[c];
                if (row == null)
                {
                    continue;
                }

                for (int p = 0; p < tagCount; ++p)
                {
                    double value = row[p];
                    if (pmass[p] != 0)
                    {
                        value /= pmass[p];
                    }

                    row[p] = value <= 0 ? double.NegativeInfinity : Math.Log(value);
                }
            }

            for (int l = 0; l < tagCount; ++l)
            {
                double[][] plane = binaries[l];
                if (plane == null)
                {
                    continue;
                }

                for (int r = 0; r < tagCount; ++r)
                {
                    double[] row = plane[r];
                    if (row == null)
                    {
                        continue;
                    }

                    for (int p = 0; p < tagCount; ++p)
                    {
                        double value = row[p];
                        if (pmass[p] != 0)
                        {
                            value /= pmass[p];
                        }

                        row[p] = value <= 0 ? double.NegativeInfinity : Math.Log(value);
                    }
                }
            }

            // 6. Terminal rules come from the lexicon builder; assemble the grammar.
            var terminals = BuildLexSimple(treebank, tagset, vocab, rng);

            rules = new LAPCFGrammar(tagset, binaries, unaries, terminals);
        }
예제 #4
0
        /// <summary>
        /// Parses one textual terminal (lexical) rule and stores its score in <c>trules</c>.
        /// </summary>
        /// <remarks>
        /// The line is split on tabs and underscores into four fields, read in order as:
        /// tag, subtag index, word, score. Numbers are parsed with the invariant culture
        /// so that a rule file is read the same way regardless of regional settings.
        /// NOTE(review): assumes the rule file was written with '.' as the decimal
        /// separator; confirm the writer also uses the invariant culture.
        /// NOTE(review): because '_' is a delimiter, a word that itself contains '_'
        /// would be split into several fields; confirm such tokens never appear.
        /// </remarks>
        /// <param name="ruleString">One line of the rule file describing a terminal rule.</param>
        /// <param name="vocab">Vocabulary used to map the word to its id and to size the table.</param>
        /// <param name="tagSet">Tag set used to map the tag name to its id.</param>
        private void BuildTerminalRule(string ruleString, Vocabulary vocab, TagSet tagSet)
        {
            string[] parts = ruleString.Split(new string[] { "\t", "_" }, StringSplitOptions.RemoveEmptyEntries);

            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            int tag = tagSet.GetID(parts[0]);
            int subtag = int.Parse(parts[1], invariant);
            int word = vocab.GetId(parts[2]);
            double s = double.Parse(parts[3], invariant);

            // The table is indexed [word][tag] and is allocated lazily, level by level.
            if (trules == null)
            {
                trules = new TerminalRule[vocab.VocabSize][];
            }

            if (trules[word] == null)
            {
                trules[word] = new TerminalRule[tagSet.PTCount];
            }

            if (trules[word][tag] == null)
            {
                // One score per subtag of this tag.
                trules[word][tag] = new TerminalRule(new double[subTagCounts[tag]], tag, word);
                trules[word][tag].ClearScore();
            }

            trules[word][tag].scores[subtag] = s;
        }