/// <summary>
/// Returns the context for finding names at the specified index.
/// </summary>
/// <param name="index">
/// The index of the token in the specified tokens array for which the context should be constructed.
/// </param>
/// <param name="tokens">
/// The tokens of the sentence.
/// </param>
/// <param name="predicates">
/// The previous decisions made in the tagging of this sequence. Only indices less than <paramref name="index"/> will be examined.
/// </param>
/// <param name="previousTags">
/// A mapping between tokens and the previous outcome for these tokens.
/// </param>
/// <returns>
/// The context for finding names at the specified index.
/// </returns>
public virtual string[] GetContext(int index, string[] tokens, string[] predicates, IDictionary<string, string> previousTags)
{
	// Outcomes of the one and two preceding decisions; Other marks "no prior decision".
	string previous = MaximumEntropyNameFinder.Other;
	string previousPrevious = MaximumEntropyNameFinder.Other;
	if (index > 1)
	{
		previousPrevious = predicates[index - 2];
	}
	if (index > 0)
	{
		previous = predicates[index - 1];
	}

	// BUG FIX: capture whether this call refers to the same sentence BEFORE the
	// cache bookkeeping below mutates mWordsKey.  Previously the static-feature
	// reuse check (mWordsKey == tokens && index == mPreviousIndex) ran AFTER
	// mWordsKey had already been reset to the new tokens array, so a new
	// sentence whose index happened to equal the leftover mPreviousIndex would
	// incorrectly reuse static features computed for the previous sentence.
	bool sameSentence = (mWordsKey == tokens);

	// The cached contexts depend only on the position and the two prior outcomes.
	string cacheKey = index.ToString(System.Globalization.CultureInfo.InvariantCulture) + previous + previousPrevious;
	if (mContextsCache != null)
	{
		if (sameSentence)
		{
			string[] cachedContexts = (string[])mContextsCache[cacheKey];
			if (cachedContexts != null)
			{
				return cachedContexts;
			}
		}
		else
		{
			// New sentence: cached contexts belong to the old one, discard them.
			mContextsCache.Clear();
			mWordsKey = tokens;
		}
	}

	List<string> features;
	if (sameSentence && index == mPreviousIndex)
	{
		// Same token of the same sentence: the outcome-independent ("static")
		// features are unchanged, so reuse the previously computed list.
		features = mPreviousStaticFeatures;
	}
	else
	{
		features = GetStaticFeatures(tokens, index, previousTags);
		mPreviousIndex = index;
		mPreviousStaticFeatures = features;
	}

	// Contexts are the static features followed by four outcome-dependent features.
	int featureCount = features.Count;
	string[] contexts = new string[featureCount + 4];
	for (int currentFeature = 0; currentFeature < featureCount; currentFeature++)
	{
		contexts[currentFeature] = features[currentFeature];
	}
	contexts[featureCount] = "po=" + previous;
	contexts[featureCount + 1] = "pow=" + previous + tokens[index];
	contexts[featureCount + 2] = "powf=" + previous + WordFeature(tokens[index]);
	contexts[featureCount + 3] = "ppo=" + previousPrevious;

	if (mContextsCache != null)
	{
		mContextsCache[cacheKey] = contexts;
	}
	return contexts;
}
/// <summary>
/// Returns the contexts for chunking of the specified index.
/// </summary>
/// <param name="index">
/// The index of the token in the specified tokens array for which the context should be constructed.
/// </param>
/// <param name="words">
/// The tokens of the sentence. The <code>ToString()</code> methods of these objects should return the token text.
/// </param>
/// <param name="predicates">
/// The previous decisions made in the tagging of this sequence. Only indices less than i will be examined.
/// </param>
/// <param name="tags">
/// The POS tags for the specified tokens.
/// </param>
/// <returns>
/// An array of predictive contexts on which a model basis its decisions.
/// </returns>
public virtual string[] GetContext(int index, object[] words, string[] tags, string[] predicates)
{
	// 19 is the exact number of features added below (11 unigram + 8 bigram).
	List<string> features = new List<string>(19);
	int currentTokenIndex = index;
	int previousPreviousTokenIndex = currentTokenIndex - 2;
	int previousTokenIndex = currentTokenIndex - 1;
	int nextNextTokenIndex = currentTokenIndex + 2;
	int nextTokenIndex = currentTokenIndex + 1;

	// Word, POS tag, and (for preceding positions) prior chunking decision for
	// the five-token window centered on the current token.
	string previousPreviousWord, previousWord, currentWord, nextWord, nextNextWord;
	string previousPreviousTag, previousTag, currentTag, nextTag, nextNextTag;
	string previousPreviousPriorDecision, previousPriorDecision;
	string[] contexts;

	// ChunkAndPosTag(-2): two tokens back, or the sentence-boundary marker
	// when the window extends past the start of the sentence.
	if (previousPreviousTokenIndex >= 0)
	{
		previousPreviousTag = tags[previousPreviousTokenIndex];
		previousPreviousPriorDecision = predicates[previousPreviousTokenIndex];
		previousPreviousWord = words[previousPreviousTokenIndex].ToString();
	}
	else
	{
		previousPreviousTag = mEndOfSentence;
		previousPreviousPriorDecision = mEndOfSentence;
		previousPreviousWord = mEndOfSentence;
	}
	// ChunkAndPosTag(-1): one token back, or the boundary marker.
	if (previousTokenIndex >= 0)
	{
		previousTag = tags[previousTokenIndex];
		previousPriorDecision = predicates[previousTokenIndex];
		previousWord = words[previousTokenIndex].ToString();
	}
	else
	{
		previousTag = mEndOfSentence;
		previousPriorDecision = mEndOfSentence;
		previousWord = mEndOfSentence;
	}
	// ChunkAndPosTag(0): the current token (no prior decision exists for it).
	currentTag = tags[currentTokenIndex];
	currentWord = words[currentTokenIndex].ToString();
	// ChunkAndPosTag(1): one token ahead, or the boundary marker past the end.
	if (nextTokenIndex < tags.Length)
	{
		nextTag = tags[nextTokenIndex];
		nextWord = words[nextTokenIndex].ToString();
	}
	else
	{
		nextTag = mEndOfSentence;
		nextWord = mEndOfSentence;
	}
	// ChunkAndPosTag(2): two tokens ahead, or the boundary marker.
	if (nextNextTokenIndex < tags.Length)
	{
		nextNextTag = tags[nextNextTokenIndex];
		nextNextWord = words[nextNextTokenIndex].ToString();
	}
	else
	{
		nextNextTag = mEndOfSentence;
		nextNextWord = mEndOfSentence;
	}

	// Cache key covers everything the features below depend on besides the
	// words themselves (word identity is handled by the mWordsKey check).
	string cacheKey = currentTokenIndex.ToString(System.Globalization.CultureInfo.InvariantCulture) + previousPreviousTag + previousTag + currentTag + nextTag + nextNextTag + previousPreviousPriorDecision + previousPriorDecision;
	if (mContextsCache != null)
	{
		if (mWordsKey == words)
		{
			contexts = (string[])mContextsCache[cacheKey];
			if (contexts != null)
			{
				return contexts;
			}
		}
		else
		{
			// New sentence (different words array): cached entries are stale.
			mContextsCache.Clear();
			mWordsKey = words;
		}
	}

	// Per-position feature strings plus back-off variants that omit the word.
	string previousPreviousChunkTag = ChunkAndPosTag(-2, previousPreviousWord, previousPreviousTag, previousPreviousPriorDecision);
	string previousPreviousChunkTagBackOff = ChunkAndPosTagBackOff(-2, previousPreviousTag, previousPreviousPriorDecision);
	string previousChunkTag = ChunkAndPosTag(-1, previousWord, previousTag, previousPriorDecision);
	string previousChunkTagBackOff = ChunkAndPosTagBackOff(-1, previousTag, previousPriorDecision);
	string currentChunkTag = ChunkAndPosTag(0, currentWord, currentTag, null);
	string currentChunkTagBackOff = ChunkAndPosTagBackOff(0, currentTag, null);
	string nextChunkTag = ChunkAndPosTag(1, nextWord, nextTag, null);
	string nextChunkTagBackOff = ChunkAndPosTagBackOff(1, nextTag, null);
	string nextNextChunkTag = ChunkAndPosTag(2, nextNextWord, nextNextTag, null);
	string nextNextChunkTagBackOff = ChunkAndPosTagBackOff(2, nextNextTag, null);

	// Unigram features: the constant prior plus each window position.
	features.Add("default");
	features.Add(previousPreviousChunkTag);
	features.Add(previousPreviousChunkTagBackOff);
	features.Add(previousChunkTag);
	features.Add(previousChunkTagBackOff);
	features.Add(currentChunkTag);
	features.Add(currentChunkTagBackOff);
	features.Add(nextChunkTag);
	features.Add(nextChunkTagBackOff);
	features.Add(nextNextChunkTag);
	features.Add(nextNextChunkTagBackOff);
	// Bigram features over ChunkAndPosTag(-1,0): all four full/back-off combinations.
	features.Add(previousChunkTag + "," + currentChunkTag);
	features.Add(previousChunkTagBackOff + "," + currentChunkTag);
	features.Add(previousChunkTag + "," + currentChunkTagBackOff);
	features.Add(previousChunkTagBackOff + "," + currentChunkTagBackOff);
	// Bigram features over ChunkAndPosTag(0,1): all four full/back-off combinations.
	features.Add(currentChunkTag + "," + nextChunkTag);
	features.Add(currentChunkTagBackOff + "," + nextChunkTag);
	features.Add(currentChunkTag + "," + nextChunkTagBackOff);
	features.Add(currentChunkTagBackOff + "," + nextChunkTagBackOff);

	contexts = features.ToArray();
	if (mContextsCache != null)
	{
		mContextsCache[cacheKey] = contexts;
	}
	return contexts;
}
/// <summary>
/// Returns the context for making a pos tag decision at the specified token index
/// given the specified tokens and previous tags.
/// </summary>
/// <param name="index">
/// The index of the token for which the context is provided.
/// </param>
/// <param name="tokens">
/// The tokens in the sentence.
/// </param>
/// <param name="tags">
/// The tags assigned to the previous words in the sentence.
/// </param>
/// <returns>
/// The context for making a pos tag decision at the specified token index
/// given the specified tokens and previous tags.
/// </returns>
public virtual string[] GetContext(int index, string[] tokens, string[] tags)
{
	string currentWord = tokens[index];

	// Surrounding words/tags.  A null value means "not emitted as a feature";
	// the boundary markers flag positions just past a sentence edge.
	string nextWord = null;
	string nextNextWord = null;
	string previousWord = null;
	string previousPreviousWord = null;
	string previousTag = null;
	string previousPreviousTag = null;

	if (index + 1 < tokens.Length)
	{
		nextWord = tokens[index + 1];
		nextNextWord = (index + 2 < tokens.Length) ? tokens[index + 2] : SentenceEnd;
	}
	else
	{
		nextWord = SentenceEnd;
	}

	if (index >= 1)
	{
		previousWord = tokens[index - 1];
		previousTag = tags[index - 1];
		if (index >= 2)
		{
			previousPreviousWord = tokens[index - 2];
			previousPreviousTag = tags[index - 2];
		}
		else
		{
			// One token into the sentence: mark the -2 word as the boundary,
			// but there is no -2 tag to report.
			previousPreviousWord = SentenceBeginning;
		}
	}
	else
	{
		previousWord = SentenceBeginning;
	}

	// Cached contexts depend on the position and the two preceding tags
	// (null tags simply concatenate as empty strings).
	string cacheKey = index.ToString(System.Globalization.CultureInfo.InvariantCulture) + previousTag + previousPreviousTag;
	if (_contextsCache != null)
	{
		if (_wordsKey == tokens)
		{
			var cached = (string[])_contextsCache[cacheKey];
			if (cached != null)
			{
				return cached;
			}
		}
		else
		{
			// Different tokens array => new sentence; cached entries are stale.
			_contextsCache.Clear();
			_wordsKey = tokens;
		}
	}

	var features = new List<string>();

	// The word itself.
	features.Add("w=" + currentWord);

	// Basic suffix/prefix analysis of the word.
	foreach (string suffix in GetSuffixes(currentWord))
	{
		features.Add("suf=" + suffix);
	}
	foreach (string prefix in GetPrefixes(currentWord))
	{
		features.Add("pre=" + prefix);
	}

	// Special-character indicators: hyphen, capital letter, digit.
	if (currentWord.IndexOf('-') != -1)
	{
		features.Add("h");
	}
	if (HasCapitalRegex.IsMatch(currentWord))
	{
		features.Add("c");
	}
	if (HasNumericRegex.IsMatch(currentWord))
	{
		features.Add("d");
	}

	// Words and tags of the surrounding context, where available.
	if ((object)previousWord != null)
	{
		features.Add("p=" + previousWord);
		if ((object)previousTag != null)
		{
			features.Add("t=" + previousTag);
		}
		if ((object)previousPreviousWord != null)
		{
			features.Add("pp=" + previousPreviousWord);
			if ((object)previousPreviousTag != null)
			{
				features.Add("tt=" + previousPreviousTag);
			}
		}
	}
	if ((object)nextWord != null)
	{
		features.Add("n=" + nextWord);
		if ((object)nextNextWord != null)
		{
			features.Add("nn=" + nextNextWord);
		}
	}

	string[] contexts = features.ToArray();
	if (_contextsCache != null)
	{
		_contextsCache[cacheKey] = contexts;
	}
	return contexts;
}