/// <summary>
/// Tags a sentence. The text is formatted, split into words on spaces, given
/// initial tags by <c>DoBasicTagging</c>, and then optionally refined by the
/// lexical and contextual rule passes.
/// </summary>
/// <param name="TheSentence">Sentence text; it is passed through <c>Formatter.FormatText</c> before being split.</param>
/// <param name="DoLexical">When true, <c>DoLexicalTagging</c> runs after the basic pass.</param>
/// <param name="DoContextual">When true, <c>DoContextualTagging</c> runs last.</param>
/// <param name="DoClean">Forwarded unchanged to <c>Formatter.FormatText</c>.</param>
/// <returns>
/// The job object holding the words (index 0 is the start-of-sentence marker),
/// the tags produced for them, and <c>LastWord</c> (index of the final word).
/// </returns>
public BrillTagJob BrillTag(string TheSentence, bool DoLexical, bool DoContextual, bool DoClean)
{
    var tagger = new BrillTagJob();
    var TheWords = tagger.TheWords;

    TheSentence = Formatter.FormatText(TheSentence, DoClean);

    // The formatted text is split on single spaces to obtain the words.
    // The lexical tagger requires the first word of the sentence to be S-T-A-R-T.
    // Empty entries are dropped: an empty sentence, or doubled/trailing spaces,
    // would otherwise yield "" words, and DoBasicTagging takes Substring(0, 1)
    // of every word, which throws on an empty string.
    TheWords.AddRange(
        ("S-T-A-R-T " + TheSentence).Split(new[] { ' ' }, System.StringSplitOptions.RemoveEmptyEntries));
    tagger.LastWord = TheWords.Count - 1;

    DoBasicTagging(tagger);

    if (DoLexical)
    {
        DoLexicalTagging(tagger);
    }

    if (DoContextual)
    {
        // The contextual tagger expects the start marker to be STAART instead.
        TheWords[0] = "STAART";
        DoContextualTagging(tagger);
    }

    return tagger;
}
/// <summary>
/// Assigns an initial tag to every word of the job, appending exactly one
/// entry to <c>tagger.TheTags</c> per word from index 0 to <c>tagger.LastWord</c>.
/// </summary>
/// <param name="tagger">Job whose <c>TheWords</c> are read and whose <c>TheTags</c> are appended to.</param>
/// <remarks>
/// Rules applied, keyed on the first character of the word:
/// a letter or apostrophe: the first tag listed in the lexicon if the word is
/// there; otherwise NN for a lower-case initial, an empty tag for a lone
/// apostrophe, and NNP for anything else (i.e. a capital initial);
/// a digit: NN; anything else: an empty tag.
/// </remarks>
private void DoBasicTagging(BrillTagJob tagger)
{
    var TheWords = tagger.TheWords;
    var TheTags = tagger.TheTags;

    for (int i = 0; i <= tagger.LastWord; i++)
    {
        string word = TheWords[i];

        // An empty word has no first character to classify; Substring(0, 1)
        // would throw. Give it an empty tag so TheTags stays index-aligned
        // with TheWords.
        if (string.IsNullOrEmpty(word))
        {
            TheTags.Add("");
            continue;
        }

        string firstChar = word.Substring(0, 1);

        if (vbLike(firstChar, "[a-zA-Z\']"))
        {
            if (Lexicon.ContainsKey(word))
            {
                // Known word: take the first tag listed for it in the lexicon.
                string[] s = (string[])Lexicon[word];
                TheTags.Add(s[0]);
            }
            else if (vbLike(firstChar, "[a-z]"))
            {
                TheTags.Add("NN");
            }
            else if (word == "\'")
            {
                TheTags.Add("");
            }
            else
            {
                TheTags.Add("NNP");
            }
        }
        else if (vbLike(firstChar, "[0-9]"))
        {
            // TDMS 18 Nov 2005 - changed unknown words to noun, which duplicates
            // functionality of the original Brill Tagger. Numbers were being
            // incorrectly tagged as /CD instead of /JJ.
            TheTags.Add("NN");
        }
        else
        {
            // Anything not starting with a letter, apostrophe or digit is ignored.
            TheTags.Add("");
        }
    }
}
/// <summary>
/// Applies every contextual rule in <c>TheContext</c>, in order, to every word
/// of the job, rewriting entries of <c>tagger.TheTags</c> in place.
/// </summary>
/// <param name="tagger">Job whose <c>TheWords</c>, <c>TheTags</c> and <c>LastWord</c> are used.</param>
/// <remarks>
/// Each rule is a space-separated string: <c>FROM TO TYPE ARG1 [ARG2]</c>.
/// A word at position j is retagged from FROM to TO when its current tag is
/// FROM, the lexicon gate passes, and the TYPE-specific condition on the
/// neighbouring tags/words holds. Positions that would fall outside
/// 0..LastWord are simply treated as not matching.
/// </remarks>
public void DoContextualTagging(BrillTagJob tagger)
{
    var TheWords = tagger.TheWords;
    var TheTags = tagger.TheTags;
    var LastWord = tagger.LastWord;

    for (int i = 0; i < TheContext.Count; i++)
    {
        string[] SubRule = TheContext[i].Split(' ');

        // Name the individual items of the rule; the code is much clearer that way.
        string SR0 = SubRule[0];                               // tag the word must currently have
        string SR1 = SubRule[1];                               // tag to assign when the rule fires
        string SR2 = SubRule[2];                               // rule type
        string SR3 = SubRule.Length >= 4 ? SubRule[3] : null;  // first argument
        // ">= 5" (not "== 5") so the extraction matches DoLexicalTagging.
        string SR4 = SubRule.Length >= 5 ? SubRule[4] : null;  // second argument

        for (int j = 0; j <= LastWord; j++)
        {
            // Every rule type requires the current tag to equal SR0, so test
            // that once, before paying for the lexicon lookup.
            if (TheTags[j] != SR0)
            {
                continue;
            }

            // The norm is to only check for a substitution if the new tag
            // already exists in the word's list of possible tags. If the word
            // is unknown then it's probably best to try the substitution.
            // (Make OKtoCheck always true if you want everything checked.)
            string[] tlist = (string[])Lexicon[TheWords[j]];
            bool OKtoCheck = (tlist == null);
            if (!OKtoCheck)
            {
                foreach (string possibleTag in tlist)
                {
                    if (possibleTag == SR1)
                    {
                        OKtoCheck = true;
                        break;
                    }
                }
            }

            if (!OKtoCheck)
            {
                continue;
            }

            bool ruleFires = false;

            switch (SR2)
            {
                // ---- conditions on neighbouring tags ----
                case "PREVTAG":
                    ruleFires = (j > 0) && (TheTags[j - 1] == SR3);
                    break;

                case "PREV1OR2TAG":
                    ruleFires = ((j > 0) && (TheTags[j - 1] == SR3))
                             || ((j > 1) && (TheTags[j - 2] == SR3));
                    break;

                case "PREV1OR2OR3TAG":
                    ruleFires = ((j > 0) && (TheTags[j - 1] == SR3))
                             || ((j > 1) && (TheTags[j - 2] == SR3))
                             || ((j > 2) && (TheTags[j - 3] == SR3));
                    break;

                case "PREV2TAG":
                    ruleFires = (j > 1) && (TheTags[j - 2] == SR3);
                    break;

                case "NEXTTAG":
                    ruleFires = (j < LastWord) && (TheTags[j + 1] == SR3);
                    break;

                case "NEXT1OR2TAG":
                    // Fixed: the two-ahead check previously required
                    // j < LastWord - 2, which skipped position LastWord - 2
                    // entirely even though both j + 1 and j + 2 exist there.
                    ruleFires = ((j < LastWord) && (TheTags[j + 1] == SR3))
                             || ((j < LastWord - 1) && (TheTags[j + 2] == SR3));
                    break;

                case "NEXT1OR2OR3TAG":
                    ruleFires = ((j < LastWord) && (TheTags[j + 1] == SR3))
                             || ((j < LastWord - 1) && (TheTags[j + 2] == SR3))
                             || ((j < LastWord - 2) && (TheTags[j + 3] == SR3));
                    break;

                case "NEXT2TAG":
                    ruleFires = (j < LastWord - 1) && (TheTags[j + 2] == SR3);
                    break;

                case "PREVBIGRAM":
                    ruleFires = (j > 1)
                             && (TheTags[j - 2] == SR3)
                             && (TheTags[j - 1] == SR4);
                    break;

                case "NEXTBIGRAM":
                    ruleFires = (j < LastWord - 1)
                             && (TheTags[j + 1] == SR3)
                             && (TheTags[j + 2] == SR4);
                    break;

                case "SURROUNDTAG":
                    ruleFires = (j > 0) && (j < LastWord)
                             && (TheTags[j - 1] == SR3)
                             && (TheTags[j + 1] == SR4);
                    break;

                // ---- conditions on the current / neighbouring words ----
                case "CURWD":
                    ruleFires = (TheWords[j] == SR3);
                    break;

                case "PREVWD":
                    ruleFires = (j > 0) && (TheWords[j - 1] == SR3);
                    break;

                case "PREV1OR2WD":
                    ruleFires = ((j > 0) && (TheWords[j - 1] == SR3))
                             || ((j > 1) && (TheWords[j - 2] == SR3));
                    break;

                case "PREV2WD":
                    ruleFires = (j > 1) && (TheWords[j - 2] == SR3);
                    break;

                case "NEXTWD":
                    ruleFires = (j < LastWord) && (TheWords[j + 1] == SR3);
                    break;

                case "NEXT1OR2WD":
                    // Fixed: at j == LastWord - 1 the old code compared the
                    // PREVIOUS word (j - 1) instead of the next one (j + 1).
                    ruleFires = ((j < LastWord) && (TheWords[j + 1] == SR3))
                             || ((j < LastWord - 1) && (TheWords[j + 2] == SR3));
                    break;

                case "NEXT2WD":
                    ruleFires = (j < LastWord - 1) && (TheWords[j + 2] == SR3);
                    break;

                case "LBIGRAM":
                    ruleFires = (j > 0)
                             && (TheWords[j - 1] == SR3)
                             && (TheWords[j] == SR4);
                    break;

                case "RBIGRAM":
                    ruleFires = (j < LastWord)
                             && (TheWords[j] == SR3)
                             && (TheWords[j + 1] == SR4);
                    break;

                case "WDAND2BFR":
                    ruleFires = (j > 1)
                             && (TheWords[j - 2] == SR3)
                             && (TheWords[j] == SR4);
                    break;

                case "WDAND2AFT":
                    ruleFires = (j < LastWord - 1)
                             && (TheWords[j] == SR3)
                             && (TheWords[j + 2] == SR4);
                    break;

                // ---- mixed word + tag conditions ----
                case "WDPREVTAG":
                    ruleFires = (j > 0)
                             && (TheTags[j - 1] == SR3)
                             && (TheWords[j] == SR4);
                    break;

                case "WDNEXTTAG":
                    ruleFires = (j < LastWord)
                             && (TheWords[j] == SR3)
                             && (TheTags[j + 1] == SR4);
                    break;

                case "WDAND2TAGBFR":
                    ruleFires = (j > 1)
                             && (TheTags[j - 2] == SR3)
                             && (TheWords[j] == SR4);
                    break;

                case "WDAND2TAGAFT":
                    ruleFires = (j < LastWord - 1)
                             && (TheWords[j] == SR3)
                             && (TheTags[j + 2] == SR4);
                    break;

                // Unrecognised rule types are ignored.
            }

            if (ruleFires)
            {
                TheTags[j] = SR1;
            }
        }
    }
}
/// <summary>
/// Applies every lexical rule in <c>TheRules</c>, in order, to every word of
/// the job, rewriting entries of <c>tagger.TheTags</c> in place.
/// </summary>
/// <param name="tagger">Job whose <c>TheWords</c>, <c>TheTags</c> and <c>LastWord</c> are used.</param>
/// <remarks>
/// Rules are space-separated strings in one of two layouts:
/// <c>AFFIX TYPE N TAG</c> (or <c>CHAR char TAG</c>) applies to any word, and
/// <c>FROM AFFIX fTYPE N TAG</c> (or <c>FROM CHAR fchar TAG</c>) applies only
/// to words currently tagged FROM. The two layouts are told apart by whether
/// the third token starts with "f".
/// The code assumes the rules are all perfectly formed (at least three tokens),
/// so don't hand edit the rule file.
/// The "goodright"/"goodleft" rule types (and their f-variants) are not
/// implemented because they depend on bigrams, which are not available here.
/// </remarks>
private void DoLexicalTagging(BrillTagJob tagger)
{
    var TheWords = tagger.TheWords;
    var TheTags = tagger.TheTags;

    // Go through each of the rules; each one is tried against every word in
    // the sentence.
    for (int i = 0; i < TheRules.Count; i++)
    {
        string[] SubRule = TheRules[i].Split(' ');

        // Name the individual items of the rule; the code is much clearer that way.
        string SR0 = SubRule[0];
        string SR1 = SubRule[1];
        string SR2 = SubRule[2];
        string SR3 = SubRule.Length >= 4 ? SubRule[3] : null;
        string SR4 = SubRule.Length >= 5 ? SubRule[4] : null;

        // Two families of rules take their type from SR1 or from SR2.
        // StartsWith (rather than Substring(0, 1)) also copes with an empty token.
        bool isFromTagRule = SR2.StartsWith("f", System.StringComparison.Ordinal);

        for (int j = 0; j <= tagger.LastWord; j++)
        {
            // NOTE: the original author intended to try rules ONLY on words that
            // are not in the lexicon, but that check was disabled, so every
            // word is considered. The (unused) lexicon lookup that served that
            // check has been removed.
            string word = TheWords[j];

            // All affix/character tests are ordinal: these are exact token
            // matches, and culture-sensitive comparison can both mis-match and
            // make the Substring lengths below disagree with what was matched.
            if (!isFromTagRule)
            {
                switch (SR1)
                {
                    case "haspref":
                        if (word.StartsWith(SR0, System.StringComparison.Ordinal))
                        {
                            TheTags[j] = SR3;
                        }
                        break;

                    case "deletepref":
                        // Fixed: look up the word WITHOUT the prefix. The old code
                        // looked up SR0 + word.Substring(SR0.Length), which is just
                        // the original word again.
                        if (word.StartsWith(SR0, System.StringComparison.Ordinal)
                            && Lexicon.ContainsKey(word.Substring(SR0.Length)))
                        {
                            TheTags[j] = SR3;
                        }
                        break;

                    case "addpref":
                        if (Lexicon.ContainsKey(SR0 + word))
                        {
                            TheTags[j] = SR3;
                        }
                        break;

                    case "hassuf":
                        if (word.EndsWith(SR0, System.StringComparison.Ordinal))
                        {
                            TheTags[j] = SR3;
                        }
                        break;

                    case "deletesuf":
                        if (word.EndsWith(SR0, System.StringComparison.Ordinal)
                            && Lexicon.ContainsKey(word.Substring(0, word.Length - SR0.Length)))
                        {
                            TheTags[j] = SR3;
                        }
                        break;

                    case "addsuf":
                        if (Lexicon.ContainsKey(word + SR0))
                        {
                            TheTags[j] = SR3;
                        }
                        break;

                    case "char":
                        // For this rule type the new tag is the third token.
                        if (word.IndexOf(SR0, System.StringComparison.Ordinal) != -1)
                        {
                            TheTags[j] = SR2;
                        }
                        break;
                }
            }
            else
            {
                // Every f-rule requires the word to currently carry tag SR0.
                if (TheTags[j] != SR0)
                {
                    continue;
                }

                switch (SR2)
                {
                    case "fhaspref":
                        if (word.StartsWith(SR1, System.StringComparison.Ordinal))
                        {
                            TheTags[j] = SR4;
                        }
                        break;

                    case "fdeletepref":
                        // Fixed: same defect as "deletepref" - the stripped word
                        // must be looked up, not the prefix re-attached to it.
                        if (word.StartsWith(SR1, System.StringComparison.Ordinal)
                            && Lexicon.ContainsKey(word.Substring(SR1.Length)))
                        {
                            TheTags[j] = SR4;
                        }
                        break;

                    case "faddpref":
                        if (Lexicon.ContainsKey(SR1 + word))
                        {
                            TheTags[j] = SR4;
                        }
                        break;

                    case "fhassuf":
                        if (word.EndsWith(SR1, System.StringComparison.Ordinal))
                        {
                            TheTags[j] = SR4;
                        }
                        break;

                    case "fdeletesuf":
                        if (word.EndsWith(SR1, System.StringComparison.Ordinal)
                            && Lexicon.ContainsKey(word.Substring(0, word.Length - SR1.Length)))
                        {
                            TheTags[j] = SR4;
                        }
                        break;

                    case "faddsuf":
                        if (Lexicon.ContainsKey(word + SR1))
                        {
                            TheTags[j] = SR4;
                        }
                        break;

                    case "fchar":
                        // For this rule type the new tag is the fourth token.
                        if (word.IndexOf(SR1, System.StringComparison.Ordinal) != -1)
                        {
                            TheTags[j] = SR3;
                        }
                        break;
                }
            }
        }
    }
}