/// <summary>
/// Applies every lexical rule in <c>TheRules</c>, in order, to every word of the job
/// (indices 0 through <c>tagger.LastWord</c>), overwriting entries of
/// <c>tagger.TheTags</c> in place whenever a rule matches.
/// </summary>
/// <remarks>
/// Each rule is one string of space-separated fields, in one of two layouts:
/// <para>
/// Unconditional: <c>affix kind x newTag</c>, or <c>chars char newTag</c> for the
/// <c>char</c> kind.
/// </para>
/// <para>
/// Conditional (third field starts with 'f'): <c>currentTag affix fkind x newTag</c>,
/// or <c>currentTag chars fchar newTag</c>. These only fire when the word's current tag
/// equals <c>currentTag</c>.
/// </para>
/// The field marked <c>x</c> is never read here (presumably the affix length from the
/// Brill rule format - confirm against the rule file). Recognised kinds are
/// <c>haspref</c>, <c>deletepref</c>, <c>addpref</c>, <c>hassuf</c>, <c>deletesuf</c>,
/// <c>addsuf</c> and <c>char</c>; any other kind (for example the bigram-based
/// <c>goodright</c>/<c>goodleft</c>) is ignored. Rules that have fewer than three fields,
/// an empty third field, or no replacement-tag field are skipped.
/// </remarks>
/// <param name="tagger">Job supplying <c>TheWords</c>, <c>TheTags</c> and <c>LastWord</c>.</param>
private void DoLexicalTagging(BrillTagJob tagger)
{
    // Affix tests are ordinal so the matched length always equals affix.Length, which the
    // Substring calls below rely on.
    const System.StringComparison Ordinal = System.StringComparison.Ordinal;

    var words = tagger.TheWords;
    var tags = tagger.TheTags;

    for (int ruleIndex = 0; ruleIndex < TheRules.Count; ruleIndex++)
    {
        string[] fields = TheRules[ruleIndex].Split(' ');

        // Blank or truncated lines cannot be interpreted; skip them instead of throwing.
        if (fields.Length < 3 || fields[2].Length == 0)
        {
            continue;
        }

        // Conditional rules carry the required current tag in front, shifting every other
        // field one position to the right and prefixing the kind with 'f'.
        bool isConditional = fields[2][0] == 'f';
        string requiredTag = isConditional ? fields[0] : null;
        string affix = isConditional ? fields[1] : fields[0];
        string kind = isConditional ? fields[2].Substring(1) : fields[1];

        // The "char" kind has no 'x' field, so its replacement tag sits one field earlier.
        int newTagIndex = (kind == "char" ? 2 : 3) + (isConditional ? 1 : 0);
        if (fields.Length <= newTagIndex)
        {
            // No replacement tag present: applying the rule would store null tags.
            continue;
        }

        string newTag = fields[newTagIndex];

        // Every word is tested, whether or not it is in the lexicon; the filter that once
        // restricted this pass to unknown words is disabled in the original code.
        for (int wordIndex = 0; wordIndex <= tagger.LastWord; wordIndex++)
        {
            if (isConditional && tags[wordIndex] != requiredTag)
            {
                continue;
            }

            string word = words[wordIndex];
            bool applies;

            switch (kind)
            {
                case "haspref":
                    applies = word.StartsWith(affix, Ordinal);
                    break;

                case "deletepref":
                    // Look up the word with the prefix removed (not prefix + remainder,
                    // which would just be the original word again).
                    applies = word.StartsWith(affix, Ordinal)
                        && Lexicon.ContainsKey(word.Substring(affix.Length));
                    break;

                case "addpref":
                    applies = Lexicon.ContainsKey(affix + word);
                    break;

                case "hassuf":
                    applies = word.EndsWith(affix, Ordinal);
                    break;

                case "deletesuf":
                    applies = word.EndsWith(affix, Ordinal)
                        && Lexicon.ContainsKey(word.Substring(0, word.Length - affix.Length));
                    break;

                case "addsuf":
                    applies = Lexicon.ContainsKey(word + affix);
                    break;

                case "char":
                    applies = word.IndexOf(affix, Ordinal) != -1;
                    break;

                default:
                    // Unimplemented kinds (e.g. goodright / goodleft, which need bigrams).
                    applies = false;
                    break;
            }

            if (applies)
            {
                tags[wordIndex] = newTag;
            }
        }
    }
}