예제 #1
0
        /// <summary>
        /// Gets the part-of-speech tags of the sentence that contains a mention.
        /// The context is trimmed and split into sentences; only the first sentence that contains
        /// the trimmed mention is tagged, the rest of the context is ignored.
        /// </summary>
        /// <param name="mention">The mention text to look for; may be null.</param>
        /// <param name="context">The text surrounding the mention; may be null.</param>
        /// <returns>
        /// The pairs produced by the POS tagger for the selected sentence, with pair.first the token
        /// and pair.second the corresponding POS tag. Returns null when either argument is null or
        /// when no sentence of the context contains the mention.
        /// </returns>
        /// <example>
        /// context: "I like Beijing", mention: "Beijing"
        /// result (illustrative tags): (I, NP) (like, VP) (Beijing, NP)
        /// </example>
        protected IEnumerable <Pair <string, string> > GetPosTags(string mention, string context)
        {
            if (context == null || mention == null)
            {
                return null;
            }
            context = context.Trim();
            mention = mention.Trim();

            // Pick the first sentence containing the mention. The splitter is borrowed from a pool,
            // so it is handed back in a finally block even when splitting throws; the sentence
            // sequence is consumed before the splitter is released.
            string sentence;
            var sspliter = SSpliterPool.GetSSpliter();
            try
            {
                var sentences = sspliter.SplitSequence(context);
                sentence = sentences.FirstOrDefault(item => item.Contains(mention));
            }
            finally
            {
                SSpliterPool.ReturnSSpliter(sspliter);
            }

            if (sentence == null)
            {
                return null;
            }

            // Same borrowing discipline for the tagger: always give it back to the pool.
            var posTagger = PosTaggerPool.GetPosTagger();
            try
            {
                return posTagger.TagString(sentence);
            }
            finally
            {
                PosTaggerPool.ReturnPosTagger(posTagger);
            }
        }
예제 #2
0
            /// <summary>
            /// Reads tab-separated lines from <c>source</c>, POS-tags the fourth field of every line and
            /// counts, for each kept token, the number of lines it occurs in. A token is kept when its
            /// tag starts with "N", "V" or "J" (presumably nouns, verbs and adjectives in the tagger's
            /// tag set); it is generalized and lower-cased before counting.
            /// The number of lines read and the count dictionary are stored in
            /// <c>KeyWordSelector.tuples[threadID]</c> when the run completes.
            /// </summary>
            public void GetKeyWordInfo()
            {
                Console.WriteLine("Thread {0} start.", threadID);
                var wordOccurNumDic = new Dictionary <string, int>();
                var classNum        = 0;
                var reader          = new LargeFileReader(source);

                try
                {
                    var tagger = PosTaggerPool.GetPosTagger();
                    try
                    {
                        // Tokens seen in the current line; reused across lines so each token is
                        // counted at most once per line.
                        var set = new HashSet <string>();
                        string line;

                        while ((line = reader.ReadLine()) != null)
                        {
                            // NOTE(review): the limit is checked before the increment, so up to
                            // 10001 lines are processed — confirm whether 10000 was intended.
                            if (classNum > 10000)
                            {
                                break;
                            }
                            classNum++;
                            if (classNum % 1000 == 0)
                            {
                                Console.WriteLine("Thread {0} has processed: {1}", threadID, classNum);
                            }

                            // The text to tag is the fourth tab-separated field; a line with fewer
                            // fields throws here and aborts the run (resources are still released).
                            var array = line.Split('\t');
                            var pairs = tagger.TagString(array[3]);
                            set.Clear();

                            foreach (var pair in pairs)
                            {
                                if (pair.second.StartsWith("N", StringComparison.Ordinal) ||
                                    pair.second.StartsWith("V", StringComparison.Ordinal) ||
                                    pair.second.StartsWith("J", StringComparison.Ordinal))
                                {
                                    var tokenStemmed = Generalizer.Generalize(pair.first).ToLower();
                                    set.Add(tokenStemmed);
                                }
                            }
                            foreach (var token in set)
                            {
                                // TryGetValue leaves num at 0 for tokens not seen before.
                                int num;
                                wordOccurNumDic.TryGetValue(token, out num);
                                wordOccurNumDic[token] = num + 1;
                            }
                        }
                    }
                    finally
                    {
                        // Hand the pooled tagger back even when a line fails to parse or tag.
                        PosTaggerPool.ReturnPosTagger(tagger);
                    }
                }
                finally
                {
                    reader.Close();
                }

                // Only published when the whole run succeeded.
                KeyWordSelector.tuples[threadID] = new Tuple(classNum, wordOccurNumDic);
            }
예제 #3
0
        /// <summary>
        /// Matches any non-word character. Used by the neighbour-word repair steps of AddFeature to
        /// recognise tokens that contain punctuation. Built once instead of on every call.
        /// </summary>
        private static readonly System.Text.RegularExpressions.Regex NonWordCharRegex =
            new System.Text.RegularExpressions.Regex(@"\W");

        /// <summary>
        /// Tells whether a token is one of the markers that stop the search for a neighbouring word:
        /// "##", ".", "!", "?" or ";".
        /// </summary>
        private static bool IsSentenceBoundaryToken(string token)
        {
            return token.Equals("##") || token.Equals(".") || token.Equals("!") || token.Equals("?") || token.Equals(";");
        }

        /// <summary>
        /// Tells whether a token should be skipped while searching for a neighbouring word: it
        /// contains a non-word character and is not the possessive "'s".
        /// </summary>
        private static bool IsPunctuationLikeToken(string token)
        {
            return !token.Equals("'s") && NonWordCharRegex.IsMatch(token);
        }

        /// <summary>
        /// Updates the feature list of an event in place and returns it.
        /// With the current toggles only two steps run: the DBpedia types of the mention are written
        /// to the dbpediaTypes field, and a mention ID equal to the mention cluster number is replaced
        /// by the cluster ID looked up for the mention. The other steps (Stanford NER, OpenNLP NER,
        /// last/next word repair, key words) are kept but disabled by constant conditions.
        /// </summary>
        /// <param name="e">Event whose Feature is a list of strings indexed by Event.Field.</param>
        /// <returns>The same list instance that was read from <paramref name="e"/>, after modification.</returns>
        public List <string> AddFeature(Event e)
        {
            var rawFeature = (List <string>)e.Feature;
            // Commas in the mention surface are replaced by spaces before any lookup.
            var mention    = rawFeature.ElementAt((int)Event.Field.mentionSurfaces).Replace(',', ' ');
            var context    = rawFeature.ElementAt((int)Event.Field.sentenceContext);

            #region Stanford NER
            // Disabled step; set the condition to true to re-enable.
            if (false)
            {
                var ner = StanfordNerPool.GetStanfordNer();
                try
                {
                    ner.FindNer(context);
                    // NOTE(review): `feature` is not declared in this method (presumably a class
                    // member); the OpenNLP step below writes into rawFeature instead — confirm.
                    feature.Add(ner.GetNerType(mention));
                }
                finally
                {
                    StanfordNerPool.ReturnStanfordNer(ner);
                }
            }
            #endregion

            #region OpenNLP NER
            // Disabled step; set the condition to true to re-enable.
            if (false)
            {
                var ner = OpenNerPool.GetOpenNer();
                try
                {
                    ner.FindNer(context);
                    rawFeature[(int)Event.Field.opennlpNerType] = ner.GetNerType(mention);
                }
                finally
                {
                    OpenNerPool.ReturnOpenNer(ner);
                }
            }
            #endregion

            #region DBpedia dictionary
            {
                var types = string.Join(",", DataCenter.GetDBpediaType(mention));
                rawFeature[(int)Event.Field.dbpediaTypes] = types;
            }
            #endregion

            // Tagged context and mention position, shared by the two neighbour-word steps so the
            // context is tagged at most once.
            List <Pair <string, string> > pairs = null;
            Pair <int, int> pair = null;

            #region Modify last word
            // Disabled step; set the condition to true to re-enable.
            if (false)
            {
                var lastWord  = rawFeature.ElementAt((int)Event.Field.lastWord);
                var clearLast = IsSentenceBoundaryToken(lastWord);

                if (!clearLast && IsPunctuationLikeToken(lastWord))
                {
                    // The tagger is returned to the pool exactly once, whether tagging succeeds
                    // or throws; exceptions propagate with their original stack trace.
                    var pos = PosTaggerPool.GetPosTagger();
                    try
                    {
                        pairs = pos.TagString(context);
                    }
                    finally
                    {
                        PosTaggerPool.ReturnPosTagger(pos);
                    }
                    pair = GetIndexOfMention(pairs, mention);

                    // Walk left from the token before the mention, skipping punctuation-like
                    // tokens, until a usable word or a sentence boundary is reached.
                    var index = pair.first - 1;
                    while (index >= 0)
                    {
                        var token = pairs[index].first;
                        if (IsSentenceBoundaryToken(token))
                        {
                            index = -1;
                            break;
                        }
                        if (!IsPunctuationLikeToken(token))
                        {
                            break;
                        }
                        index--;
                    }

                    if (index >= 0)
                    {
                        var word        = pairs[index].first;
                        var posTag      = pairs[index].second;
                        var wordStemmed = Generalizer.Generalize(word);
                        var clusterId   = DataCenter.GetWordClusterID(word).ToString(); // id should use original surface
                        var shape       = GetWordShape(word);

                        rawFeature[(int)Event.Field.lastWord]        = word;
                        rawFeature[(int)Event.Field.lastWordStemmed] = wordStemmed;
                        rawFeature[(int)Event.Field.lastWordTag]     = posTag;
                        rawFeature[(int)Event.Field.lastWordID]      = clusterId;
                        rawFeature[(int)Event.Field.lastWordShape]   = shape;
                    }
                    else
                    {
                        clearLast = true;
                    }
                }

                if (clearLast)
                {
                    // No usable previous word: write the placeholder values.
                    rawFeature[(int)Event.Field.lastWord]        = "NULL";
                    rawFeature[(int)Event.Field.lastWordStemmed] = "NULL";
                    rawFeature[(int)Event.Field.lastWordTag]     = "NULL";
                    rawFeature[(int)Event.Field.lastWordID]      = "100";
                    rawFeature[(int)Event.Field.lastWordShape]   = "NULL";
                }
            }
            #endregion

            #region Modify next word
            // Disabled step; set the condition to true to re-enable.
            if (false)
            {
                var nextWord  = rawFeature.ElementAt((int)Event.Field.nextWord);
                var clearNext = IsSentenceBoundaryToken(nextWord);

                if (!clearNext && IsPunctuationLikeToken(nextWord))
                {
                    if (pairs == null)
                    {
                        // Context not tagged yet by the last-word step; same single-return
                        // discipline for the pooled tagger.
                        var pos = PosTaggerPool.GetPosTagger();
                        try
                        {
                            pairs = pos.TagString(context);
                        }
                        finally
                        {
                            PosTaggerPool.ReturnPosTagger(pos);
                        }
                        pair = GetIndexOfMention(pairs, mention);
                    }

                    // Walk right from the token after the mention, skipping punctuation-like
                    // tokens, until a usable word or a sentence boundary is reached.
                    var index = pair.second + 1;
                    while (index < pairs.Count)
                    {
                        var token = pairs[index].first;
                        if (IsSentenceBoundaryToken(token))
                        {
                            index = pairs.Count;
                            break;
                        }
                        if (!IsPunctuationLikeToken(token))
                        {
                            break;
                        }
                        index++;
                    }

                    if (index < pairs.Count)
                    {
                        var word        = pairs[index].first;
                        var posTag      = pairs[index].second;
                        var wordStemmed = Generalizer.Generalize(word);
                        var clusterId   = DataCenter.GetWordClusterID(word).ToString(); // id should use original surface
                        var shape       = GetWordShape(word);

                        rawFeature[(int)Event.Field.nextWord]        = word;
                        rawFeature[(int)Event.Field.nextWordStemmed] = wordStemmed;
                        rawFeature[(int)Event.Field.nextWordTag]     = posTag;
                        rawFeature[(int)Event.Field.nextWordID]      = clusterId;
                        rawFeature[(int)Event.Field.nextWordShape]   = shape;
                    }
                    else
                    {
                        clearNext = true;
                    }
                }

                if (clearNext)
                {
                    // No usable following word: write the placeholder values.
                    rawFeature[(int)Event.Field.nextWord]        = "NULL";
                    rawFeature[(int)Event.Field.nextWordStemmed] = "NULL";
                    rawFeature[(int)Event.Field.nextWordTag]     = "NULL";
                    rawFeature[(int)Event.Field.nextWordID]      = "100";
                    rawFeature[(int)Event.Field.nextWordShape]   = "NULL";
                }
            }
            #endregion

            #region   Modify mention ID
            // Enabled step.
            if (true)
            {
                var mentionID         = int.Parse(rawFeature.ElementAt((int)Event.Field.mentionID));
                var mentionClusterNum = DataCenter.GetMentionClusterNumber();
                // An ID equal to the cluster number is re-resolved from the mention text
                // (presumably it marks an unknown mention — confirm against DataCenter).
                if (mentionID == mentionClusterNum)
                {
                    mentionID = DataCenter.GetMentionClusterID(mention);
                    rawFeature[(int)Event.Field.mentionID] = mentionID.ToString();
                }
            }
            #endregion

            #region Key words
            // Disabled step; set the condition to true to re-enable.
            if (false)
            {
                // Replaces the sentence context by its key words and appends the original
                // context as an extra trailing element.
                var keyWords = DataCenter.ExtractKeyWords(context);
                rawFeature[(int)Event.Field.sentenceContext] = string.Join(",", keyWords);

                rawFeature.Add(context);
            }
            #endregion

            return rawFeature;
        }