Beispiel #1
0
        /// <summary>
        /// Applies every lexical (word-shape) rule in <c>TheRules</c> to the sentence held by
        /// <paramref name="tagger"/>, rewriting entries of <c>tagger.TheTags</c> in place.
        /// </summary>
        /// <remarks>
        /// Rules are applied in list order and each rule is tried against every word from
        /// index 0 through <c>tagger.LastWord</c> (inclusive), so a later rule may overwrite
        /// a tag assigned by an earlier one. Every word is checked, whether or not it is in
        /// the lexicon (the original author's "unknown words only" filter was disabled).
        ///
        /// A rule is a space-separated line in one of two layouts, told apart by whether the
        /// third field starts with a lower-case "f":
        ///   unconditional:  AFFIX OPERATION X NEWTAG      ("char":  AFFIX char NEWTAG)
        ///   conditional:    OLDTAG AFFIX fOPERATION X NEWTAG   ("fchar": OLDTAG AFFIX fchar NEWTAG)
        /// A conditional rule only fires on words whose current tag equals OLDTAG. The X field
        /// is not read by this method (presumably the affix length in Brill's rule-file
        /// format - TODO confirm); the affix's own length is used instead.
        ///
        /// Rules that are too short to contain the fields they need are skipped rather than
        /// crashing or storing a null tag. Operations this method does not know
        /// (e.g. "goodright"/"goodleft" and their "f" variants, which depend on bigrams that
        /// are not implemented) never match and are therefore no-ops.
        /// </remarks>
        /// <param name="tagger">Job providing the words, their current tags and the index of the last word.</param>
        private void DoLexicalTagging(BrillTagJob tagger)
        {
            var words = tagger.TheWords;
            var tags  = tagger.TheTags;

            for (int ruleIndex = 0; ruleIndex < TheRules.Count; ruleIndex++)
            {
                string[] fields = TheRules[ruleIndex].Split(' ');
                if (fields.Length < 3)
                {
                    // Not enough fields for any known rule layout (e.g. a blank line).
                    continue;
                }

                // Normalize both layouts to (requiredTag, affix, operation, newTag) once per
                // rule so the per-word loop below does not have to care which layout it was.
                bool isConditional = fields[2].StartsWith("f", System.StringComparison.Ordinal);
                string requiredTag = isConditional ? fields[0] : null;
                string affix       = isConditional ? fields[1] : fields[0];
                string operation   = isConditional ? fields[2].Substring(1) : fields[1];

                // "char" rules have no X field, so their replacement tag sits one field
                // earlier; conditional rules have one extra leading field (OLDTAG).
                int newTagIndex = (operation == "char") ? 2 : 3;
                if (isConditional)
                {
                    newTagIndex++;
                }
                if (newTagIndex >= fields.Length)
                {
                    // The rule has no replacement tag; applying it would store null tags.
                    continue;
                }
                string newTag = fields[newTagIndex];

                for (int wordIndex = 0; wordIndex <= tagger.LastWord; wordIndex++)
                {
                    if (isConditional && !(tags[wordIndex] == requiredTag))
                    {
                        continue;
                    }

                    if (LexicalRuleMatches(operation, affix, words[wordIndex]))
                    {
                        tags[wordIndex] = newTag;
                    }
                }
            }
        }

        // Returns true when the affix test named by 'operation' holds for 'word'.
        // All string tests are ordinal so that the affix length used for Substring always
        // agrees with what StartsWith/EndsWith actually matched.
        private bool LexicalRuleMatches(string operation, string affix, string word)
        {
            const System.StringComparison ordinal = System.StringComparison.Ordinal;

            switch (operation)
            {
            case "haspref":
                return word.StartsWith(affix, ordinal);

            case "deletepref":
                // The word with the prefix removed must be a known word.
                return word.StartsWith(affix, ordinal) &&
                       Lexicon.ContainsKey(word.Substring(affix.Length));

            case "addpref":
                return Lexicon.ContainsKey(affix + word);

            case "hassuf":
                return word.EndsWith(affix, ordinal);

            case "deletesuf":
                // The word with the suffix removed must be a known word.
                return word.EndsWith(affix, ordinal) &&
                       Lexicon.ContainsKey(word.Substring(0, word.Length - affix.Length));

            case "addsuf":
                return Lexicon.ContainsKey(word + affix);

            case "char":
                return word.IndexOf(affix, ordinal) != -1;

            default:
                // Unknown or unimplemented operation (e.g. goodright / goodleft).
                return false;
            }
        }