Example #1
0
        /// <summary>
        /// Tags a sentence: formats the text, splits it into words behind a
        /// start-of-sentence marker, assigns an initial tag to every word and then
        /// optionally applies the lexical and the contextual rule sets.
        /// </summary>
        /// <param name="TheSentence">Sentence text; it is passed through <c>Formatter.FormatText</c> before being split on spaces.</param>
        /// <param name="DoLexical">When true, <c>DoLexicalTagging</c> runs after the basic pass.</param>
        /// <param name="DoContextual">When true, <c>DoContextualTagging</c> runs last.</param>
        /// <param name="DoClean">Forwarded unchanged to <c>Formatter.FormatText</c>.</param>
        /// <returns>
        /// The job object holding the words (index 0 is the start marker), the tags
        /// produced by the enabled passes, and <c>LastWord</c> (index of the final word).
        /// </returns>
        public BrillTagJob BrillTag(string TheSentence, bool DoLexical, bool DoContextual, bool DoClean)
        {
            var tagger   = new BrillTagJob();
            var TheWords = tagger.TheWords;

            TheSentence = Formatter.FormatText(TheSentence, DoClean);

            // The lexical tagger requires the first word of the sentence to be S-T-A-R-T.
            // Splitting on single spaces can yield empty tokens (empty sentence, doubled
            // or trailing spaces); those carry no taggable text, so they are dropped
            // instead of being handed to the tagging passes.
            TheWords.AddRange(("S-T-A-R-T " + TheSentence).Split(new[] { ' ' }, System.StringSplitOptions.RemoveEmptyEntries));
            tagger.LastWord = TheWords.Count - 1;

            DoBasicTagging(tagger);

            if (DoLexical)
            {
                DoLexicalTagging(tagger);
            }

            if (DoContextual)
            {
                // The contextual tagger expects the start marker to be spelled STAART.
                TheWords[0] = "STAART";
                DoContextualTagging(tagger);
            }

            return tagger;
        }
Example #2
0
        /// <summary>
        /// Appends one initial tag to <c>tagger.TheTags</c> for every word from index 0
        /// to <c>tagger.LastWord</c>, so that words and tags stay index-aligned.
        /// </summary>
        /// <remarks>
        /// Tag choice, based on the first character of the word:
        /// <list type="bullet">
        /// <item>letter or apostrophe, word in the lexicon: the first tag listed for it;</item>
        /// <item>lower-case letter, unknown word: <c>NN</c>;</item>
        /// <item>a lone apostrophe: empty tag;</item>
        /// <item>any other unknown word in this group (e.g. capitalised): <c>NNP</c>;</item>
        /// <item>digit: <c>NN</c> (deliberately not <c>CD</c>, see the note in the body);</item>
        /// <item>anything else, including an empty word: empty tag.</item>
        /// </list>
        /// </remarks>
        /// <param name="tagger">Job whose <c>TheWords</c> are read and whose <c>TheTags</c> are appended to.</param>
        private void DoBasicTagging(BrillTagJob tagger)
        {
            var TheWords = tagger.TheWords;
            var TheTags  = tagger.TheTags;

            for (int i = 0; i <= tagger.LastWord; i++)
            {
                string word = TheWords[i];

                // An empty token has no first character to classify; give it the same
                // empty tag as other non-alphanumeric tokens instead of letting
                // Substring(0, 1) throw.
                if (string.IsNullOrEmpty(word))
                {
                    TheTags.Add("");
                    continue;
                }

                string first = word.Substring(0, 1);

                if (vbLike(first, "[a-zA-Z\']"))
                {
                    if (Lexicon.ContainsKey(word))
                    {
                        // Known word: take the first tag listed in the lexicon entry.
                        string[] entry = (string[])Lexicon[word];
                        TheTags.Add(entry[0]);
                    }
                    else if (vbLike(first, "[a-z]"))
                    {
                        TheTags.Add("NN");
                    }
                    else if (word == "\'")
                    {
                        TheTags.Add("");
                    }
                    else
                    {
                        TheTags.Add("NNP");
                    }
                }
                else if (vbLike(first, "[0-9]"))
                {
                    // TDMS 18 Nov 2005 - unknown words starting with a digit are tagged
                    // as nouns, duplicating the original Brill Tagger, rather than CD.
                    TheTags.Add("NN");
                }
                else
                {
                    TheTags.Add("");
                }
            }
        }
Example #3
0
        /// <summary>
        /// Applies every contextual rule in <c>TheContext</c>, in order, to every word
        /// of the job, rewriting entries of <c>tagger.TheTags</c> in place.
        /// </summary>
        /// <remarks>
        /// Each rule is a space-separated line read as
        /// <c>fromTag toTag RULETYPE arg1 [arg2]</c>. A word at index j is retagged
        /// from <c>fromTag</c> to <c>toTag</c> when
        /// <list type="number">
        /// <item>its current tag equals <c>fromTag</c>;</item>
        /// <item>the word is not in <c>Lexicon</c>, or its lexicon entry lists <c>toTag</c>;</item>
        /// <item>the context test named by <c>RULETYPE</c> holds for the neighbouring
        /// tags/words. A neighbour outside the range 0..LastWord never matches.</item>
        /// </list>
        /// Tags are updated immediately, so later words (and later rules) see the
        /// changes made earlier in the same pass. Rules are assumed to be well formed
        /// (at least three fields); unknown rule types are ignored.
        /// </remarks>
        /// <param name="tagger">Job whose words are read and whose tags are rewritten.</param>
        public void DoContextualTagging(BrillTagJob tagger)
        {
            var TheWords = tagger.TheWords;
            var TheTags  = tagger.TheTags;
            var LastWord = tagger.LastWord;

            for (int i = 0; i < TheContext.Count; i++)
            {
                string[] SubRule = TheContext[i].Split(' ');

                // Name the rule fields once; the tests below read much better this way.
                string fromTag  = SubRule[0];
                string toTag    = SubRule[1];
                string ruleType = SubRule[2];
                string arg1     = (SubRule.Length >= 4) ? SubRule[3] : null;
                // ">= 5" (not "== 5") so a rule with a stray trailing token keeps its
                // second argument, matching how the lexical rules are parsed.
                string arg2     = (SubRule.Length >= 5) ? SubRule[4] : null;

                for (int j = 0; j <= LastWord; j++)
                {
                    // Every rule type only ever rewrites words currently tagged fromTag.
                    if (TheTags[j] != fromTag)
                    {
                        continue;
                    }

                    // The norm is to only try a substitution if the new tag already
                    // exists in the word's list of possible tags. If the word is unknown
                    // it is probably best to try the substitution anyway.
                    string[] possibleTags = (string[])Lexicon[TheWords[j]];
                    if (possibleTags != null && System.Array.IndexOf(possibleTags, toTag) < 0)
                    {
                        continue;
                    }

                    bool applies;
                    switch (ruleType)
                    {
                    // ---- tests on neighbouring tags ----
                    case "PREVTAG":
                        applies = (j > 0) && (TheTags[j - 1] == arg1);
                        break;

                    case "PREV1OR2TAG":
                        applies = ((j > 0) && (TheTags[j - 1] == arg1)) ||
                                  ((j > 1) && (TheTags[j - 2] == arg1));
                        break;

                    case "PREV1OR2OR3TAG":
                        applies = ((j > 0) && (TheTags[j - 1] == arg1)) ||
                                  ((j > 1) && (TheTags[j - 2] == arg1)) ||
                                  ((j > 2) && (TheTags[j - 3] == arg1));
                        break;

                    case "PREV2TAG":
                        applies = (j > 1) && (TheTags[j - 2] == arg1);
                        break;

                    case "NEXTTAG":
                        applies = (j < LastWord) && (TheTags[j + 1] == arg1);
                        break;

                    case "NEXT1OR2TAG":
                        // The second clause covers j == LastWord - 2 as well, where
                        // j + 2 is still a valid index.
                        applies = ((j < LastWord) && (TheTags[j + 1] == arg1)) ||
                                  ((j < LastWord - 1) && (TheTags[j + 2] == arg1));
                        break;

                    case "NEXT1OR2OR3TAG":
                        applies = ((j < LastWord) && (TheTags[j + 1] == arg1)) ||
                                  ((j < LastWord - 1) && (TheTags[j + 2] == arg1)) ||
                                  ((j < LastWord - 2) && (TheTags[j + 3] == arg1));
                        break;

                    case "NEXT2TAG":
                        applies = (j < LastWord - 1) && (TheTags[j + 2] == arg1);
                        break;

                    case "PREVBIGRAM":
                        applies = (j > 1) &&
                                  (TheTags[j - 2] == arg1) &&
                                  (TheTags[j - 1] == arg2);
                        break;

                    case "NEXTBIGRAM":
                        applies = (j < LastWord - 1) &&
                                  (TheTags[j + 1] == arg1) &&
                                  (TheTags[j + 2] == arg2);
                        break;

                    case "SURROUNDTAG":
                        applies = (j > 0) && (j < LastWord) &&
                                  (TheTags[j - 1] == arg1) &&
                                  (TheTags[j + 1] == arg2);
                        break;

                    // ---- tests on the current / neighbouring words ----
                    case "CURWD":
                        applies = (TheWords[j] == arg1);
                        break;

                    case "PREVWD":
                        applies = (j > 0) && (TheWords[j - 1] == arg1);
                        break;

                    case "PREV1OR2WD":
                        applies = ((j > 0) && (TheWords[j - 1] == arg1)) ||
                                  ((j > 1) && (TheWords[j - 2] == arg1));
                        break;

                    case "PREV2WD":
                        applies = (j > 1) && (TheWords[j - 2] == arg1);
                        break;

                    case "NEXTWD":
                        applies = (j < LastWord) && (TheWords[j + 1] == arg1);
                        break;

                    case "NEXT1OR2WD":
                        // Looks at the FOLLOWING one or two words; for the second-to-last
                        // word only j + 1 exists.
                        applies = ((j < LastWord) && (TheWords[j + 1] == arg1)) ||
                                  ((j < LastWord - 1) && (TheWords[j + 2] == arg1));
                        break;

                    case "NEXT2WD":
                        applies = (j < LastWord - 1) && (TheWords[j + 2] == arg1);
                        break;

                    case "LBIGRAM":
                        applies = (j > 0) &&
                                  (TheWords[j - 1] == arg1) &&
                                  (TheWords[j] == arg2);
                        break;

                    case "RBIGRAM":
                        applies = (j < LastWord) &&
                                  (TheWords[j] == arg1) &&
                                  (TheWords[j + 1] == arg2);
                        break;

                    case "WDAND2BFR":
                        applies = (j > 1) &&
                                  (TheWords[j - 2] == arg1) &&
                                  (TheWords[j] == arg2);
                        break;

                    case "WDAND2AFT":
                        applies = (j < LastWord - 1) &&
                                  (TheWords[j] == arg1) &&
                                  (TheWords[j + 2] == arg2);
                        break;

                    // ---- mixed word + tag tests ----
                    case "WDPREVTAG":
                        applies = (j > 0) &&
                                  (TheTags[j - 1] == arg1) &&
                                  (TheWords[j] == arg2);
                        break;

                    case "WDNEXTTAG":
                        applies = (j < LastWord) &&
                                  (TheWords[j] == arg1) &&
                                  (TheTags[j + 1] == arg2);
                        break;

                    case "WDAND2TAGBFR":
                        applies = (j > 1) &&
                                  (TheTags[j - 2] == arg1) &&
                                  (TheWords[j] == arg2);
                        break;

                    case "WDAND2TAGAFT":
                        applies = (j < LastWord - 1) &&
                                  (TheWords[j] == arg1) &&
                                  (TheTags[j + 2] == arg2);
                        break;

                    default:
                        // Unknown rule type: leave the tag alone.
                        applies = false;
                        break;
                    }

                    if (applies)
                    {
                        TheTags[j] = toTag;
                    }
                }
            }
        }
Example #4
0
        /// <summary>
        /// Applies every lexical rule in <c>TheRules</c>, in order, to every word of
        /// the job, rewriting entries of <c>tagger.TheTags</c> in place.
        /// </summary>
        /// <remarks>
        /// Each rule is a space-separated line in one of two layouts, distinguished by
        /// whether the third field starts with 'f':
        /// <list type="bullet">
        /// <item>unconditional: <c>text RULETYPE x newTag</c> (for <c>char</c>:
        /// <c>text char newTag</c>) - the rule type is field 1;</item>
        /// <item>tag-conditioned: <c>currentTag text fRULETYPE x newTag</c> (for
        /// <c>fchar</c>: <c>currentTag text fchar newTag</c>) - the rule type is
        /// field 2 and the rule only fires on words currently tagged
        /// <c>currentTag</c>.</item>
        /// </list>
        /// Rules are applied to every word, whether or not it is in the lexicon (the
        /// restriction to unknown words was disabled by the original author). The
        /// bigram-based rule types (goodright/goodleft and their f-variants) are not
        /// implemented and are ignored. Rules are assumed to be well formed, so do
        /// not hand edit the rule file. All text comparisons are ordinal.
        /// </remarks>
        /// <param name="tagger">Job whose words are read and whose tags are rewritten.</param>
        private void DoLexicalTagging(BrillTagJob tagger)
        {
            var TheWords = tagger.TheWords;
            var TheTags  = tagger.TheTags;

            for (int i = 0; i < TheRules.Count; i++)
            {
                string[] SubRule = TheRules[i].Split(' ');

                // The meaning of each field depends on the rule layout (see remarks),
                // so they keep positional names.
                string SR0 = SubRule[0];
                string SR1 = SubRule[1];
                string SR2 = SubRule[2];
                string SR3 = (SubRule.Length >= 4) ? SubRule[3] : null;
                string SR4 = (SubRule.Length >= 5) ? SubRule[4] : null;

                // Length guard so an empty third field is treated as "not an f-rule"
                // instead of throwing.
                bool tagConditioned = (SR2.Length > 0) && (SR2[0] == 'f');

                for (int j = 0; j <= tagger.LastWord; j++)
                {
                    string word = TheWords[j];

                    if (!tagConditioned)
                    {
                        switch (SR1)
                        {
                        case "haspref":
                            if (word.StartsWith(SR0, System.StringComparison.Ordinal))
                            {
                                TheTags[j] = SR3;
                            }
                            break;

                        case "deletepref":
                            // Strip the prefix and see whether what is left is a known word.
                            if (word.StartsWith(SR0, System.StringComparison.Ordinal) &&
                                Lexicon.ContainsKey(word.Substring(SR0.Length)))
                            {
                                TheTags[j] = SR3;
                            }
                            break;

                        case "addpref":
                            if (Lexicon.ContainsKey(SR0 + word))
                            {
                                TheTags[j] = SR3;
                            }
                            break;

                        case "hassuf":
                            if (word.EndsWith(SR0, System.StringComparison.Ordinal))
                            {
                                TheTags[j] = SR3;
                            }
                            break;

                        case "deletesuf":
                            // Strip the suffix and see whether what is left is a known word.
                            if (word.EndsWith(SR0, System.StringComparison.Ordinal) &&
                                Lexicon.ContainsKey(word.Substring(0, word.Length - SR0.Length)))
                            {
                                TheTags[j] = SR3;
                            }
                            break;

                        case "addsuf":
                            if (Lexicon.ContainsKey(word + SR0))
                            {
                                TheTags[j] = SR3;
                            }
                            break;

                        case "char":
                            // "text char newTag": the new tag is the third field.
                            if (word.IndexOf(SR0, System.StringComparison.Ordinal) != -1)
                            {
                                TheTags[j] = SR2;
                            }
                            break;

                        // "goodright" / "goodleft" are not implemented: they depend on
                        // bigrams, and the standard Brill sources ship no useful ones.
                        }
                    }
                    else
                    {
                        // Tag-conditioned rules only apply to words currently tagged SR0.
                        if (TheTags[j] != SR0)
                        {
                            continue;
                        }

                        switch (SR2)
                        {
                        case "fhaspref":
                            if (word.StartsWith(SR1, System.StringComparison.Ordinal))
                            {
                                TheTags[j] = SR4;
                            }
                            break;

                        case "fdeletepref":
                            // Strip the prefix and see whether what is left is a known word.
                            if (word.StartsWith(SR1, System.StringComparison.Ordinal) &&
                                Lexicon.ContainsKey(word.Substring(SR1.Length)))
                            {
                                TheTags[j] = SR4;
                            }
                            break;

                        case "faddpref":
                            if (Lexicon.ContainsKey(SR1 + word))
                            {
                                TheTags[j] = SR4;
                            }
                            break;

                        case "fhassuf":
                            if (word.EndsWith(SR1, System.StringComparison.Ordinal))
                            {
                                TheTags[j] = SR4;
                            }
                            break;

                        case "fdeletesuf":
                            // Strip the suffix and see whether what is left is a known word.
                            if (word.EndsWith(SR1, System.StringComparison.Ordinal) &&
                                Lexicon.ContainsKey(word.Substring(0, word.Length - SR1.Length)))
                            {
                                TheTags[j] = SR4;
                            }
                            break;

                        case "faddsuf":
                            if (Lexicon.ContainsKey(word + SR1))
                            {
                                TheTags[j] = SR4;
                            }
                            break;

                        case "fchar":
                            // "currentTag text fchar newTag": the new tag is the fourth field.
                            if (word.IndexOf(SR1, System.StringComparison.Ordinal) != -1)
                            {
                                TheTags[j] = SR3;
                            }
                            break;

                        // "fgoodright" / "fgoodleft" are not implemented (see above).
                        }
                    }
                }
            }
        }