/// <summary>
/// Gets the part-of-speech tags of the sentence that contains a mention.
/// The context is split into sentences and only the first sentence that
/// contains the (trimmed) mention is tagged.
/// </summary>
/// <param name="mention">Mention surface text; trimmed before it is searched for.</param>
/// <param name="context">Text surrounding the mention; trimmed before sentence splitting.</param>
/// <returns>
/// The tagger output for the selected sentence: a sequence of pairs with
/// pair.first the token string and pair.second its pos tag.
/// Returns null when <paramref name="context"/> or <paramref name="mention"/> is null,
/// or when no sentence of the context contains the mention.
/// </returns>
/// <example>
/// context: "I like Beijing", mention: "Beijing"
/// result: (I, NP) (like, VP) (Beijing, NP)
/// </example>
protected IEnumerable<Pair<string, string>> GetPosTags(string mention, string context)
{
    if (context == null || mention == null)
    {
        return null;
    }

    context = context.Trim();
    mention = mention.Trim();

    // Borrow a sentence splitter and hand it back even if splitting throws,
    // so a failing input cannot drain the pool.
    string sentence;
    var sspliter = SSpliterPool.GetSSpliter();
    try
    {
        sentence = sspliter.SplitSequence(context)
                           .FirstOrDefault(item => item.Contains(mention));
    }
    finally
    {
        SSpliterPool.ReturnSSpliter(sspliter);
    }

    if (sentence == null)
    {
        return null;
    }

    // Same borrow/return discipline for the POS tagger.
    var posTagger = PosTaggerPool.GetPosTagger();
    try
    {
        return posTagger.TagString(sentence);
    }
    finally
    {
        PosTaggerPool.ReturnPosTagger(posTagger);
    }
}
/// <summary>
/// Reads tab-separated lines from <c>source</c>, POS-tags the fourth column of each
/// line and counts, for every generalized lower-cased token whose tag starts with
/// "N", "V" or "J", the number of lines in which it occurs at least once.
/// The number of lines read and the count dictionary are stored in
/// <c>KeyWordSelector.tuples[threadID]</c>.
/// </summary>
/// <remarks>
/// Reading stops once more than 10000 lines have been processed (i.e. at most
/// 10001 lines are tagged). Every line is assumed to have at least four
/// tab-separated fields; a shorter line raises an exception, in which case the
/// reader is still closed and the tagger still returned to its pool.
/// </remarks>
public void GetKeyWordInfo()
{
    Console.WriteLine("Thread {0} start.", threadID);

    var wordOccurNumDic = new Dictionary<string, int>();
    var classNum = 0;

    var reader = new LargeFileReader(source);
    try
    {
        var tagger = PosTaggerPool.GetPosTagger();
        try
        {
            // Distinct tokens of the current line; reused across lines to avoid reallocating.
            var set = new HashSet<string>();
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (classNum > 10000)
                {
                    break;
                }
                classNum++;
                if (classNum % 1000 == 0)
                {
                    Console.WriteLine("Thread {0} has processed: {1}", threadID, classNum);
                }

                var array = line.Split('\t');
                var pairs = tagger.TagString(array[3]);

                set.Clear();
                foreach (var pair in pairs)
                {
                    // POS tags are machine identifiers, so compare ordinally rather than by culture.
                    if (pair.second.StartsWith("N", StringComparison.Ordinal) ||
                        pair.second.StartsWith("V", StringComparison.Ordinal) ||
                        pair.second.StartsWith("J", StringComparison.Ordinal))
                    {
                        var tokenStemmed = Generalizer.Generalize(pair.first).ToLower();
                        set.Add(tokenStemmed);
                    }
                }

                // Each token is counted once per line, however often it appears in that line.
                foreach (var token in set)
                {
                    int num;
                    wordOccurNumDic.TryGetValue(token, out num);
                    wordOccurNumDic[token] = num + 1;
                }
            }
        }
        finally
        {
            PosTaggerPool.ReturnPosTagger(tagger);
        }
    }
    finally
    {
        reader.Close();
    }

    KeyWordSelector.tuples[threadID] = new Tuple(classNum, wordOccurNumDic);
}
/// <summary>
/// Pattern matching any non-word character. Built once and shared, instead of
/// being reconstructed on every <c>AddFeature</c> call.
/// </summary>
private static readonly System.Text.RegularExpressions.Regex NonWordCharPattern =
    new System.Text.RegularExpressions.Regex(@"\W");

/// <summary>
/// Enriches the raw feature list of an event in place and returns it.
/// </summary>
/// <remarks>
/// Steps that always run:
/// the DBpedia types of the mention are written to the <c>dbpediaTypes</c> field, and
/// the <c>mentionID</c> field is replaced by <c>DataCenter.GetMentionClusterID(mention)</c>
/// when it equals <c>DataCenter.GetMentionClusterNumber()</c>.
/// The remaining steps (Stanford NER, OpenNLP NER, last/next word repair, key-word
/// extraction) are switched off by the local constants at the top of the method.
/// </remarks>
/// <param name="e">Event whose <c>Feature</c> is a <c>List&lt;string&gt;</c> indexed by <c>Event.Field</c>.</param>
/// <returns>The same list instance as <c>e.Feature</c>, after modification.</returns>
public List<string> AddFeature(Event e)
{
    // Feature switches. They mirror the hard-coded conditions this method has always
    // used; all optional steps are currently disabled.
    const bool useStanfordNer = false;
    const bool useOpenNlpNer = false;
    const bool repairLastWord = false;
    const bool repairNextWord = false;
    const bool replaceContextWithKeyWords = false;

    var rawFeature = (List<string>)e.Feature;
    // Commas inside the mention are replaced by spaces before any lookup.
    var mention = rawFeature[(int)Event.Field.mentionSurfaces].Replace(',', ' ');
    var context = rawFeature[(int)Event.Field.sentenceContext];

    // ---- Stanford NER -------------------------------------------------------
    if (useStanfordNer)
    {
        var ner = StanfordNerPool.GetStanfordNer();
        try
        {
            ner.FindNer(context);
            var type = ner.GetNerType(mention);
            // NOTE(review): this writes to `feature`, not `rawFeature` - confirm that is intended.
            feature.Add(type);
        }
        finally
        {
            StanfordNerPool.ReturnStanfordNer(ner);
        }
    }

    // ---- OpenNLP NER --------------------------------------------------------
    if (useOpenNlpNer)
    {
        var ner = OpenNerPool.GetOpenNer();
        try
        {
            ner.FindNer(context);
            var type = ner.GetNerType(mention);
            rawFeature[(int)Event.Field.opennlpNerType] = type;
        }
        finally
        {
            OpenNerPool.ReturnOpenNer(ner);
        }
    }

    // ---- DBpedia dictionary (always on) -------------------------------------
    var types = string.Join(",", DataCenter.GetDBpediaType(mention));
    rawFeature[(int)Event.Field.dbpediaTypes] = types;

    // Tagged context and mention span; computed lazily and shared by both repair steps.
    List<Pair<string, string>> pairs = null;
    Pair<int, int> pair = null;

    // ---- Modify last word ---------------------------------------------------
    if (repairLastWord)
    {
        var lastWord = rawFeature[(int)Event.Field.lastWord];
        if (IsSentenceBoundaryToken(lastWord))
        {
            WriteNeighborWordFeatures(rawFeature, true, null);
        }
        else if (IsNoisyToken(lastWord))
        {
            pairs = TagWithPooledTagger(context);
            pair = GetIndexOfMention(pairs, mention);
            // Walk left from the token just before the mention.
            var index = FindNeighborIndex(pairs, pair.first - 1, -1);
            WriteNeighborWordFeatures(rawFeature, true, index >= 0 ? pairs[index] : null);
        }
    }

    // ---- Modify next word ---------------------------------------------------
    if (repairNextWord)
    {
        var nextWord = rawFeature[(int)Event.Field.nextWord];
        if (IsSentenceBoundaryToken(nextWord))
        {
            WriteNeighborWordFeatures(rawFeature, false, null);
        }
        else if (IsNoisyToken(nextWord))
        {
            if (pairs == null)
            {
                pairs = TagWithPooledTagger(context);
                pair = GetIndexOfMention(pairs, mention);
            }
            // Walk right from the token just after the mention.
            var index = FindNeighborIndex(pairs, pair.second + 1, 1);
            WriteNeighborWordFeatures(rawFeature, false, index >= 0 ? pairs[index] : null);
        }
    }

    // ---- Modify mention ID (always on) --------------------------------------
    // A mention ID equal to the cluster count is re-resolved from the mention text
    // (presumably it marks "unknown cluster" - confirm against DataCenter).
    var mentionID = int.Parse(rawFeature[(int)Event.Field.mentionID]);
    var mentionClusterNum = DataCenter.GetMentionClusterNumber();
    if (mentionID == mentionClusterNum)
    {
        mentionID = DataCenter.GetMentionClusterID(mention);
        rawFeature[(int)Event.Field.mentionID] = mentionID.ToString();
    }

    // ---- Key words ----------------------------------------------------------
    if (replaceContextWithKeyWords)
    {
        var keyWords = DataCenter.ExtractKeyWords(context);
        rawFeature[(int)Event.Field.sentenceContext] = string.Join(",", keyWords);
        rawFeature.Add(context);
    }

    return rawFeature;
}

/// <summary>True for the tokens that end the search for a neighbour word: "##", ".", "!", "?" and ";".</summary>
private static bool IsSentenceBoundaryToken(string token)
{
    return token.Equals("##") || token.Equals(".") || token.Equals("!") ||
           token.Equals("?") || token.Equals(";");
}

/// <summary>True for a token that contains a non-word character and is not the possessive "'s".</summary>
private static bool IsNoisyToken(string token)
{
    return !token.Equals("'s") && NonWordCharPattern.IsMatch(token);
}

/// <summary>
/// Scans <paramref name="pairs"/> from <paramref name="start"/> in direction <paramref name="step"/>
/// (-1 = left, +1 = right), skipping noisy tokens. Returns the index of the first usable token,
/// or -1 when a sentence boundary token or the end of the list is reached first.
/// </summary>
private static int FindNeighborIndex(List<Pair<string, string>> pairs, int start, int step)
{
    var index = start;
    while (step < 0 ? index >= 0 : index < pairs.Count)
    {
        var token = pairs[index].first;
        if (IsSentenceBoundaryToken(token))
        {
            return -1;
        }
        if (!IsNoisyToken(token))
        {
            return index;
        }
        index += step;
    }
    return -1;
}

/// <summary>
/// POS-tags <paramref name="text"/> with a pooled tagger. The tagger is returned to the pool
/// exactly once, whether or not tagging throws, and exceptions keep their original stack trace.
/// </summary>
private static List<Pair<string, string>> TagWithPooledTagger(string text)
{
    var tagger = PosTaggerPool.GetPosTagger();
    try
    {
        return tagger.TagString(text);
    }
    finally
    {
        PosTaggerPool.ReturnPosTagger(tagger);
    }
}

/// <summary>
/// Writes the five last-word or next-word fields of <paramref name="rawFeature"/>.
/// With a null <paramref name="taggedWord"/> the placeholders "NULL" (and "100" for the ID) are written;
/// otherwise the fields are derived from the token (first) and its pos tag (second).
/// </summary>
private void WriteNeighborWordFeatures(List<string> rawFeature, bool isLastWord, Pair<string, string> taggedWord)
{
    var word = "NULL";
    var wordStemmed = "NULL";
    var posTag = "NULL";
    var id = "100";
    var shape = "NULL";

    if (taggedWord != null)
    {
        word = taggedWord.first;
        posTag = taggedWord.second;
        wordStemmed = Generalizer.Generalize(word);
        id = DataCenter.GetWordClusterID(word).ToString(); // id should use original surface
        shape = GetWordShape(word);
    }

    if (isLastWord)
    {
        rawFeature[(int)Event.Field.lastWord] = word;
        rawFeature[(int)Event.Field.lastWordStemmed] = wordStemmed;
        rawFeature[(int)Event.Field.lastWordTag] = posTag;
        rawFeature[(int)Event.Field.lastWordID] = id;
        rawFeature[(int)Event.Field.lastWordShape] = shape;
    }
    else
    {
        rawFeature[(int)Event.Field.nextWord] = word;
        rawFeature[(int)Event.Field.nextWordStemmed] = wordStemmed;
        rawFeature[(int)Event.Field.nextWordTag] = posTag;
        rawFeature[(int)Event.Field.nextWordID] = id;
        rawFeature[(int)Event.Field.nextWordShape] = shape;
    }
}