/// <summary>
/// Gets POS tag information for a mention and its context. The context is
/// narrowed to the first sentence that contains the mention.
/// </summary>
/// <param name="mention">The mention surface text.</param>
/// <param name="context">The context that contains the mention.</param>
/// <returns>
/// The POS tag information of the mention's narrowed context (a single
/// sentence), stored as a list of pairs with pair.first the token and
/// pair.second its POS tag; null if either argument is null or no sentence
/// contains the mention.
/// </returns>
/// <example>
/// context: "I like Beijing", mention: "Beijing"
/// returns: (I, PRP) (like, VBP) (Beijing, NNP)
/// </example>
protected IEnumerable<Pair<string, string>> GetPosTags(string mention, string context)
{
    if (context == null)
    {
        return null;
    }
    context = context.Trim();
    if (mention == null)
    {
        return null;
    }
    mention = mention.Trim();

    // Split the context into sentences, using a pooled sentence splitter.
    var sspliter = SSpliterPool.GetSSpliter();
    var sentences = sspliter.SplitSequence(context);
    SSpliterPool.ReturnSSpliter(sspliter);
    sspliter = null;

    // Narrow the context to the first sentence that contains the mention.
    var sentence = sentences.FirstOrDefault(item => item.Contains(mention));
    if (sentence == null)
    {
        return null;
    }

    // Tag the selected sentence with a pooled POS tagger.
    var posTagger = PosTaggerPool.GetPosTagger();
    var pairs = posTagger.TagString(sentence);
    PosTaggerPool.ReturnPosTagger(posTagger);
    return pairs;
}
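// A minimal usage sketch (PrintMentionPosTags is a hypothetical helper, not
// part of the original class): it shows how a caller might consume GetPosTags,
// assuming Pair<string, string> exposes public first/second members as the
// code above implies.
protected void PrintMentionPosTags(string mention, string context)
{
    var pairs = GetPosTags(mention, context);
    if (pairs == null)
    {
        // Null mention/context, or no sentence contains the mention.
        Console.WriteLine("No POS tags available.");
        return;
    }
    foreach (var pair in pairs)
    {
        Console.WriteLine("{0}\t{1}", pair.first, pair.second);
    }
}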
/* Extracts features from the input instance; the features are grouped by field.
 * The input should contain two items:
 *     Mention surface: the surface text of the mention      // input[0]
 *     Mention context: the context containing the mention   // input[1]
 * The output is a list of feature values, one entry per field:
 *     Mention surface
 *     Mention shape
 *     Cluster ID of mention words
 *     Mention length
 *     Mention ID
 *     Last token
 *     Last token pos tag
 *     Last token ID
 *     Next token
 *     Next token pos tag
 *     Next token ID
 *     Parent in dependency tree (Stanford CoreNLP)
 *     Dictionary                                            // TODO
 *     Topic (define topic)                                  // TODO: work with document clusters
 */
public List<string> ExtractFeature(Instance instance)
{
    var mention = instance.Mention;
    var context = instance.Context;
    this.feature.Clear();

    List<string> words = new List<string>();
    List<string> tokens = new List<string>();
    var tokenizer = TokenizerPool.GetTokenizer();
    try
    {
        // Tokenize the mention, merging a lone "." into a preceding abbreviation token.
        var ws = tokenizer.Tokenize(mention);
        for (var i = 0; i < ws.Count; i++)
        {
            if (ws[i].Equals(".") && i > 0 && ws[i - 1].EndsWith("."))
            {
                continue;
            }
            words.Add(ws[i]);
        }
        // Tokenize the context the same way.
        var ts = tokenizer.Tokenize(context);
        for (var i = 0; i < ts.Count; i++)
        {
            if (ts[i].Equals(".") && i > 0 && ts[i - 1].EndsWith("."))
            {
                continue;
            }
            tokens.Add(ts[i]);
        }
        TokenizerPool.ReturnTokenizer(tokenizer);
        tokenizer = null;
    }
    catch (Exception)
    {
        TokenizerPool.ReturnTokenizer(tokenizer);
        throw;
    }

    // Select the first sentence that contains the mention; this reduces the parse cost.
    List<string> sentences = null;
    var sspliter = SSpliterPool.GetSSpliter();
    try
    {
        sentences = sspliter.SplitSequence(tokens);
        SSpliterPool.ReturnSSpliter(sspliter);
    }
    catch (Exception)
    {
        SSpliterPool.ReturnSSpliter(sspliter);
        Console.Clear();
        Console.WriteLine("Error in sentence splitter.");
        throw;
    }
    context = GetSentenceCoverMention(sentences, words);
    if (context == null)
    {
        throw new Exception("Cannot find mention by token within context!");
    }

    // Get a parser.
    DependencyParser parser = null;
    try
    {
        parser = ParserPool.GetParser();
    }
    catch (Exception)
    {
        throw new Exception("Cannot get a parser!");
    }

    List<Pair<string, string>> pairs = null;
    Pair<int, int> pair = null;
    try
    {
        parser.Parse(context);
        pairs = parser.GetPosTags();
        pair = GetIndexOfMention(pairs, words);
        if (pair.first == -1)
        {
            throw new Exception("Cannot find mention by token within context!");
        }
        this.offset = 0;

        #region last word
        {
            // Scan left from the mention, skipping tokens matched by allCharRegex
            // (except the possessive 's); stop at sentence boundaries.
            var index = pair.first - 1;
            while (index >= 0)
            {
                if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") ||
                    pairs[index].first.Equals("!") || pairs[index].first.Equals("?") ||
                    pairs[index].first.Equals(";"))
                {
                    index = -1;
                    break;
                }
                else if (!pairs[index].first.Equals("'s") && allCharRegex.IsMatch(pairs[index].first))
                {
                    index--;
                }
                else
                {
                    break;
                }
            }
            if (index >= 0)
            {
                var word = pairs.ElementAt(index).first;
                var posTag = pairs.ElementAt(index).second;
                AddFieldToFeture(word, posTag);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        #region next word
        {
            // Scan right from the mention under the same skipping rules.
            var index = pair.second + 1;
            while (index < pairs.Count)
            {
                if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") ||
                    pairs[index].first.Equals("!") || pairs[index].first.Equals("?") ||
                    pairs[index].first.Equals(";"))
                {
                    index = pairs.Count;
                    break;
                }
                else if (!pairs[index].first.Equals("'s") && allCharRegex.IsMatch(pairs[index].first))
                {
                    index++;
                }
                else
                {
                    break;
                }
            }
            if (index < pairs.Count)
            {
                var word = pairs.ElementAt(index).first;
                var posTag = pairs.ElementAt(index).second;
                AddFieldToFeture(word, posTag);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        #region mention head
        {
            // Take the last noun before any preposition or comma as the head;
            // fall back to the last mention word.
            string head = null, posTag = null;
            for (int i = pair.first; i <= pair.second; i++)
            {
                if (pairs.ElementAt(i).second.StartsWith("N"))
                {
                    head = pairs.ElementAt(i).first;
                    posTag = pairs.ElementAt(i).second;
                }
                else if (pairs.ElementAt(i).second.Equals("IN") || pairs.ElementAt(i).second.Equals(","))
                {
                    break;
                }
            }
            if (head == null)
            {
                head = words[words.Count - 1];
                posTag = pairs.ElementAt(pair.second).second;
            }
            AddFieldToFeture(head, posTag);
        }
        #endregion

        #region mention driver
        {
            int index = parser.GetDriver(pair.first, pair.second);
            if (index > 0)
            {
                var driver = pairs.ElementAt(index).first;
                var posTag = pairs.ElementAt(index).second;
                AddFieldToFeture(driver, posTag);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        #region mention adjective modifier
        {
            int index = parser.GetAdjModifier(pair.first, pair.second);
            if (index > 0)
            {
                var adjModifier = pairs.ElementAt(index).first;
                var posTag = pairs.ElementAt(index).second;
                AddFieldToFeture(adjModifier, posTag);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        #region mention action
        {
            int index = parser.GetAction(pair.first, pair.second);
            if (index > 0)
            {
                var action = pairs.ElementAt(index).first;
                var posTag = pairs.ElementAt(index).second;
                AddFieldToFeture(action, posTag);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        ParserPool.ReturnParser(parser);
        parser = null;
    }
    catch (Exception)
    {
        if (parser != null)
        {
            ParserPool.ReturnParser(parser);
            parser = null;
        }
        throw;
    }

    #region Mention Words
    {
        // Comma-joined mention surface and its generalized form.
        var mentionWords = new StringBuilder();
        foreach (var word in words)
        {
            if (mentionWords.Length == 0)
            {
                mentionWords.Append(Generalizer.Generalize(word));
            }
            else
            {
                mentionWords.Append("," + Generalizer.Generalize(word));
            }
        }
        // Add mention surface.
        feature.Add(string.Join(",", words));
        // Add generalized mention surface.
        feature.Add(mentionWords.ToString());

        // POS tags of the mention words.
        var mentionTags = mentionWords.Clear();
        for (var i = pair.first; i <= pair.second; i++)
        {
            if (mentionTags.Length == 0)
            {
                mentionTags.Append(pairs.ElementAt(i).second);
            }
            else
            {
                mentionTags.Append("," + pairs.ElementAt(i).second);
            }
        }
        feature.Add(mentionTags.ToString());

        // Cluster IDs of the mention words.
        var mentionIDs = mentionTags.Clear();
        foreach (var word in words)
        {
            if (mentionIDs.Length == 0)
            {
                mentionIDs.Append(DataCenter.GetWordClusterID(word));
            }
            else
            {
                mentionIDs.Append("," + DataCenter.GetWordClusterID(word));
            }
        }
        feature.Add(mentionIDs.ToString());

        // Shapes of the mention words.
        var mentionShapes = mentionIDs.Clear();
        foreach (var word in words)
        {
            if (mentionShapes.Length == 0)
            {
                mentionShapes.Append(GetWordShape(word));
            }
            else
            {
                mentionShapes.Append("," + GetWordShape(word));
            }
        }
        feature.Add(mentionShapes.ToString());
    }
    #endregion

    #region mention ID
    {
        feature.Add(DataCenter.GetMentionClusterID(mention).ToString());
    }
    #endregion

    #region mention length
    {
        feature.Add(words.Count.ToString());
    }
    #endregion

    #region Stanford NER
    {
        var ner = StanfordNerPool.GetStanfordNer();
        ner.FindNer(context);
        var type = ner.GetNerType(mention);
        StanfordNerPool.ReturnStanfordNer(ner);
        ner = null;
        feature.Add(type);
    }
    #endregion

    #region OpenNLP NER
    {
        var ner = OpenNerPool.GetOpenNer();
        ner.FindNer(context);
        var type = ner.GetNerType(mention);
        OpenNerPool.ReturnOpenNer(ner);
        ner = null;
        feature.Add(type);
    }
    #endregion

    #region DBpedia dictionary
    {
        var types = string.Join(",", DataCenter.GetDBpediaType(mention));
        feature.Add(types);
    }
    #endregion

    #region Key words
    {
        var keyWords = DataCenter.ExtractKeyWords(context);
        feature.Add(string.Join(",", keyWords));
    }
    #endregion

    #region TODO: topic
    {
        // TODO
    }
    #endregion

    #region TODO: dictionary
    {
        // TODO
    }
    #endregion

    feature.Add(context);
    return feature;
}
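// GetWordShape is used above and below but not defined in this section. A
// minimal sketch of a conventional word-shape feature is given here for
// reference (GetWordShapeSketch is a hypothetical name; the project's actual
// implementation may differ): uppercase letters map to 'X', lowercase to 'x',
// digits to 'd', so "Beijing" -> "Xxxxxxx" and "iPhone6" -> "xXxxxxd".
private static string GetWordShapeSketch(string word)
{
    var shape = new StringBuilder();
    foreach (var c in word)
    {
        if (char.IsUpper(c))
        {
            shape.Append('X');
        }
        else if (char.IsLower(c))
        {
            shape.Append('x');
        }
        else if (char.IsDigit(c))
        {
            shape.Append('d');
        }
        else
        {
            // Keep punctuation and other characters as-is.
            shape.Append(c);
        }
    }
    return shape.ToString();
}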
/* Input:
 *     mention TAB context
 */
internal Dictionary<string, object> GetFeature(string mention, string context)
{
    // Narrow the context to the sentence covering the mention.
    var sspliter = SSpliterPool.GetSSpliter();
    context = GetSentenceCoverMention(sspliter.SplitSequence(context), mention);
    SSpliterPool.ReturnSSpliter(sspliter);
    sspliter = null;

    var feature = new Dictionary<string, object>();
    var words = mention.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    /************** Word Level ****************/
    // last word shape
    var lastWord = GetLastToken(mention, context);
    if (lastWord == null)
    {
        feature[Field.lastWordShape] = "NULL";
    }
    else
    {
        feature[Field.lastWordShape] = GetWordShape(lastWord);
    }

    // next word shape
    var nextWord = GetNextToken(mention, context);
    if (nextWord == null)
    {
        feature[Field.nextWordShape] = "NULL";
    }
    else
    {
        feature[Field.nextWordShape] = GetWordShape(nextWord);
    }

    // mention word shapes
    var list = (from string word in words select GetWordShape(word)).ToList();
    feature[Field.mentionWordShapes] = list;

    // pos tags of mention words
    var pairs = GetPosTags(mention, context);
    var pair = GetIndexOfMention(pairs, mention);
    list = new List<string>();
    if (pairs != null)
    {
        for (var i = pair.first; i <= pair.second; i++)
        {
            list.Add(pairs.ElementAt(i).second);
        }
    }
    feature[Field.mentionWordTags] = list;

    // pos tag of last word
    var index = 0;
    if (lastWord != null)
    {
        index = GetLastWordIndex(pairs, lastWord, pair.first);
        feature[Field.lastWordTag] = pairs.ElementAt(index).second;
    }
    else
    {
        feature[Field.lastWordTag] = "NULL";
    }

    // pos tag of next word
    if (nextWord != null)
    {
        index = GetNextWordIndex(pairs, nextWord, pair.second);
        feature[Field.nextWordTag] = pairs.ElementAt(index).second;
    }
    else
    {
        feature[Field.nextWordTag] = "NULL";
    }

    // Stem and lowercase the words. lastWord/nextWord may be null when the
    // mention starts or ends the sentence, so guard against that.
    lastWord = lastWord == null ? null : StemWord(lastWord).ToLower();
    nextWord = nextWord == null ? null : StemWord(nextWord).ToLower();
    words = (from string word in words select StemWord(word).ToLower()).ToArray();

    // stemmed last word surface
    feature[Field.lastWord] = (lastWord ?? "NULL");
    // stemmed next word surface
    feature[Field.nextWord] = (nextWord ?? "NULL");
    // stemmed mention words surface
    feature[Field.mentionWords] = words;

    /************** Mention Level ****************/
    // mention length
    feature[Field.mentionLength] = words.Length.ToString();

    // mention words 2-gram
    var gram2 = GetNGram(words, 2);
    feature[Field.gram2] = gram2;

    // mention words 3-gram
    var gram3 = GetNGram(words, 3);
    feature[Field.gram3] = gram3;

    /************** Document Level ****************/
    // TODO

    /************** External ****************/
    // TODO

    return feature;
}
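// GetNGram is used above but not defined in this section. A minimal sketch of
// a word-level n-gram helper is shown here for reference (GetNGramSketch is a
// hypothetical name; the project's own helper may join tokens or handle short
// inputs differently). For words = ["new", "york", "city"] and n = 2 it
// yields ["new york", "york city"].
private static List<string> GetNGramSketch(string[] words, int n)
{
    var grams = new List<string>();
    for (var i = 0; i + n <= words.Length; i++)
    {
        // Join n consecutive tokens into one gram.
        grams.Add(string.Join(" ", words, i, n));
    }
    return grams;
}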