Exemplo n.º 1
0
        /// <summary>
        /// Gets the part-of-speech tags of the first sentence of <paramref name="context"/> that contains
        /// <paramref name="mention"/>. Only that sentence is tagged, not the whole context.
        /// </summary>
        /// <param name="mention">Surface text of the mention; it is trimmed before matching.</param>
        /// <param name="context">Text surrounding the mention; it is trimmed and then split into sentences.</param>
        /// <returns>
        /// The tagger output for the selected sentence as a sequence of pairs, with pair.first the token string and
        /// pair.second the POS tag of that token; <c>null</c> when either argument is <c>null</c> or when no sentence
        /// contains the mention.
        /// </returns>
        /// <example>
        /// context: "I like Beijing", mention: "Beijing"
        /// result (illustrative tags): (I, NP) (like, VP) (Beijing, NP)
        /// </example>
        protected IEnumerable <Pair <string, string> > GetPosTags(string mention, string context)
        {
            if (context == null || mention == null)
            {
                return(null);
            }
            context = context.Trim();
            mention = mention.Trim();

            // Pick the first sentence that contains the mention. The splitter is a pooled resource, so it is
            // handed back in a finally block: an exception while splitting must not drain the pool.
            string sentence;
            var    sspliter = SSpliterPool.GetSSpliter();

            try
            {
                sentence = sspliter.SplitSequence(context).FirstOrDefault(item => item.Contains(mention));
            }
            finally
            {
                SSpliterPool.ReturnSSpliter(sspliter);
            }
            if (sentence == null)
            {
                return(null);
            }

            // Same pooling discipline for the POS tagger.
            var posTagger = PosTaggerPool.GetPosTagger();

            try
            {
                return(posTagger.TagString(sentence));
            }
            finally
            {
                PosTaggerPool.ReturnPosTagger(posTagger);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Extracts the features of one mention from the first sentence of its context that covers the mention.
        /// </summary>
        /// <remarks>
        /// The mention and the context are tokenized (a lone "." that follows a token already ending with "." is
        /// dropped), the context tokens are split into sentences, and the sentence covering the mention is parsed.
        /// Six (token, POS tag) fields are handed to <c>AddFieldToFeture</c>: previous word, next word, mention head,
        /// driver, adjective modifier and action, with (null, null) when a field is absent. The following strings
        /// are then appended to the shared <c>feature</c> list, in this order: mention words, generalized mention
        /// words, mention POS tags, word cluster IDs and word shapes (each comma separated), mention cluster ID,
        /// mention length in words, Stanford NER type, OpenNLP NER type, DBpedia types and key words of the sentence
        /// (both comma separated), and finally the sentence itself.
        /// Topic and dictionary features are still TODO.
        /// </remarks>
        /// <param name="instance">Supplies the mention surface (<c>Mention</c>) and its context (<c>Context</c>).</param>
        /// <returns>
        /// The instance-level <c>feature</c> list. It is cleared and refilled on every call, so a caller that needs
        /// to keep a result across calls must copy it.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="instance"/> is <c>null</c>.</exception>
        /// <exception cref="Exception">
        /// No sentence of the context covers the mention, the mention cannot be located in the parsed sentence,
        /// or no parser could be obtained from the pool.
        /// </exception>
        public List <string> ExtractFeature(Instance instance)
        {
            if (instance == null)
            {
                throw new ArgumentNullException("instance");
            }
            var mention = instance.Mention;
            var context = instance.Context;

            this.feature.Clear();
            List <string> words     = new List <string>();
            List <string> tokens    = new List <string>();
            var           tokenizer = TokenizerPool.GetTokenizer();

            try
            {
                var ws = tokenizer.Tokenize(mention);
                for (var i = 0; i < ws.Count; i++)
                {
                    // skip a stand-alone "." that directly follows a token already ending with "."
                    if (ws[i].Equals(".") && i > 0 && ws[i - 1].EndsWith("."))
                    {
                        continue;
                    }
                    words.Add(ws[i]);
                }
                var ts = tokenizer.Tokenize(context);
                for (var i = 0; i < ts.Count; i++)
                {
                    if (ts[i].Equals(".") && i > 0 && ts[i - 1].EndsWith("."))
                    {
                        continue;
                    }
                    tokens.Add(ts[i]);
                }
            }
            finally
            {
                // pooled resource: always hand it back, without disturbing the stack trace of a failure
                TokenizerPool.ReturnTokenizer(tokenizer);
            }

            // select the first sentence that contains the mention; this reduces the parse cost
            List <string> sentences;
            var           sspliter = SSpliterPool.GetSSpliter();

            try
            {
                sentences = sspliter.SplitSequence(tokens);
            }
            catch (Exception)
            {
                // The console is no longer cleared here: Console.Clear() can itself throw when there is no
                // interactive console, which would hide the original failure.
                Console.WriteLine("Error in sentence spliter.");
                throw;
            }
            finally
            {
                SSpliterPool.ReturnSSpliter(sspliter);
            }
            context = GetSentenceCoverMention(sentences, words);
            if (context == null)
            {
                throw new Exception("Cannot find mention by token within context!");
            }

            // get a parser
            DependencyParser parser;

            try
            {
                parser = ParserPool.GetParser();
            }
            catch (Exception e)
            {
                // keep the underlying cause attached for diagnosis
                throw new Exception("Cannot get a parser!", e);
            }
            List <Pair <string, string> > pairs;
            Pair <int, int> pair;

            try
            {
                parser.Parse(context);

                pairs = parser.GetPosTags();
                // pair.first / pair.second are used below as the first and last token index of the mention in pairs
                pair = GetIndexOfMention(pairs, words);
                if (pair.first == -1)
                {
                    throw new Exception("Cannot find mention by token within context!");
                }
                this.offset = 0;

                // last word: nearest usable token before the mention
                var lastIndex = FindContextTokenIndex(pairs, pair.first - 1, -1);
                AddContextField(pairs, lastIndex, lastIndex >= 0);

                // next word: nearest usable token after the mention
                var nextIndex = FindContextTokenIndex(pairs, pair.second + 1, 1);
                AddContextField(pairs, nextIndex, nextIndex >= 0);

                // mention head: the last noun-tagged token (tag starts with "N") seen before the first "IN" or ","
                // tag inside the mention; falls back to the last mention word
                string head = null, headTag = null;
                for (int i = pair.first; i <= pair.second; i++)
                {
                    var tag = pairs[i].second;
                    if (tag.StartsWith("N", StringComparison.Ordinal))
                    {
                        head    = pairs[i].first;
                        headTag = tag;
                    }
                    else if (tag.Equals("IN") || tag.Equals(","))
                    {
                        break;
                    }
                }
                if (head == null)
                {
                    head    = words[words.Count - 1];
                    headTag = pairs[pair.second].second;
                }
                AddFieldToFeture(head, headTag);

                // NOTE(review): for the three parser lookups below, index 0 is treated as "not found" although it
                // is a valid token position -- confirm against the parser's return convention.

                // mention driver
                int driverIndex = parser.GetDriver(pair.first, pair.second);
                AddContextField(pairs, driverIndex, driverIndex > 0);

                // mention adjective modifier
                int adjModifierIndex = parser.GetAdjModifier(pair.first, pair.second);
                AddContextField(pairs, adjModifierIndex, adjModifierIndex > 0);

                // mention action
                int actionIndex = parser.GetAction(pair.first, pair.second);
                AddContextField(pairs, actionIndex, actionIndex > 0);
            }
            finally
            {
                if (parser != null)
                {
                    ParserPool.ReturnParser(parser);
                }
            }

            // Mention words. string.Join keeps exactly one slot per word even when a value is empty, so the
            // comma-separated lists below stay aligned with each other.

            // mention surface
            feature.Add(string.Join(",", words));
            // generalized mention surface
            feature.Add(string.Join(",", words.Select(word => Generalizer.Generalize(word))));
            // mention tags
            var mentionTags = new List <string>();
            for (var i = pair.first; i <= pair.second; i++)
            {
                mentionTags.Add(pairs[i].second);
            }
            feature.Add(string.Join(",", mentionTags));
            // mention word cluster IDs
            feature.Add(string.Join(",", words.Select(word => DataCenter.GetWordClusterID(word))));
            // mention word shapes
            feature.Add(string.Join(",", words.Select(word => GetWordShape(word))));

            // mention ID
            feature.Add(DataCenter.GetMentionClusterID(mention).ToString());

            // mention length
            feature.Add(words.Count.ToString());

            // Stanford NER
            var stanfordNer = StanfordNerPool.GetStanfordNer();

            try
            {
                stanfordNer.FindNer(context);
                feature.Add(stanfordNer.GetNerType(mention));
            }
            finally
            {
                StanfordNerPool.ReturnStanfordNer(stanfordNer);
            }

            // OpenNLP NER
            var openNer = OpenNerPool.GetOpenNer();

            try
            {
                openNer.FindNer(context);
                feature.Add(openNer.GetNerType(mention));
            }
            finally
            {
                OpenNerPool.ReturnOpenNer(openNer);
            }

            // DBpedia dictionary
            feature.Add(string.Join(",", DataCenter.GetDBpediaType(mention)));

            // key words
            feature.Add(string.Join(",", DataCenter.ExtractKeyWords(context)));

            // TODO: topic
            // TODO: dictionary

            feature.Add(context);

            return(feature);
        }

        /// <summary>
        /// Tells whether a token is one of the markers ("##", ".", "!", "?", ";") at which the search for a
        /// neighbouring context word stops.
        /// </summary>
        private static bool IsContextBoundaryToken(string token)
        {
            return(token.Equals("##") || token.Equals(".") || token.Equals("!") || token.Equals("?") || token.Equals(";"));
        }

        /// <summary>
        /// Walks from <paramref name="start"/> in direction <paramref name="step"/> (-1 or +1) and returns the index
        /// of the first token that is "'s" or is not matched by <c>allCharRegex</c>; returns -1 when a boundary
        /// token or either end of the list is reached first.
        /// </summary>
        private int FindContextTokenIndex(List <Pair <string, string> > pairs, int start, int step)
        {
            for (var index = start; index >= 0 && index < pairs.Count; index += step)
            {
                var token = pairs[index].first;
                if (IsContextBoundaryToken(token))
                {
                    return(-1);
                }
                if (token.Equals("'s") || !allCharRegex.IsMatch(token))
                {
                    return(index);
                }
            }
            return(-1);
        }

        /// <summary>
        /// Passes the token and POS tag at <paramref name="index"/> to <c>AddFieldToFeture</c>, or (null, null) when
        /// <paramref name="found"/> is false.
        /// </summary>
        private void AddContextField(List <Pair <string, string> > pairs, int index, bool found)
        {
            if (found)
            {
                AddFieldToFeture(pairs[index].first, pairs[index].second);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Builds the named feature map of a mention from the sentence of its context that covers the mention.
        /// </summary>
        /// <remarks>
        /// Keys written (all from <c>Field</c>): lastWordShape, nextWordShape, mentionWordShapes, mentionWordTags,
        /// lastWordTag, nextWordTag, lastWord, nextWord, mentionWords, mentionLength, gram2 and gram3.
        /// A missing previous or next word is recorded as the string "NULL".
        /// Document-level and external features are still TODO.
        /// </remarks>
        /// <param name="mention">Surface text of the mention; it is split on spaces into mention words.</param>
        /// <param name="context">Text surrounding the mention; it is reduced to the sentence covering the mention.</param>
        /// <returns>A new dictionary mapping feature names to their values.</returns>
        internal Dictionary <string, object> GetFeature(string mention, string context)
        {
            var sspliter = SSpliterPool.GetSSpliter();

            try
            {
                context = GetSentenceCoverMention(sspliter.SplitSequence(context), mention);
            }
            finally
            {
                // pooled resource: return it even when splitting fails
                SSpliterPool.ReturnSSpliter(sspliter);
            }
            var feature = new Dictionary <string, object>();
            var words   = mention.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            /**************Word Level****************/
            // last word shape
            var lastWord = GetLastToken(mention, context);

            feature[Field.lastWordShape] = (lastWord == null) ? "NULL" : GetWordShape(lastWord);
            // next word shape
            var nextWord = GetNextToken(mention, context);

            feature[Field.nextWordShape] = (nextWord == null) ? "NULL" : GetWordShape(nextWord);
            // mention words shape
            feature[Field.mentionWordShapes] = words.Select(word => GetWordShape(word)).ToList();
            // pos tags of mention words
            // NOTE(review): GetPosTags returns null when no sentence contains the mention; only the loop below
            // guards against that, the tag lookups further down do not -- confirm the intended handling.
            var pairs        = GetPosTags(mention, context);
            var pair         = GetIndexOfMention(pairs, mention);
            var firstIndex   = pair.first;
            var lastIndex    = pair.second;
            var mentionTags  = new List <string>();

            if (pairs != null)
            {
                for (var i = firstIndex; i <= lastIndex; i++)
                {
                    mentionTags.Add(pairs.ElementAt(i).second);
                }
            }
            feature[Field.mentionWordTags] = mentionTags;
            // pos tag of last word
            if (lastWord != null)
            {
                int lastWordIndex = GetLastWordIndex(pairs, lastWord, firstIndex);
                feature[Field.lastWordTag] = pairs.ElementAt(lastWordIndex).second;
            }
            else
            {
                feature[Field.lastWordTag] = "NULL";
            }
            // pos tag of next word
            if (nextWord != null)
            {
                int nextWordIndex = GetNextWordIndex(pairs, nextWord, lastIndex);
                feature[Field.nextWordTag] = pairs.ElementAt(nextWordIndex).second;
            }
            else
            {
                feature[Field.nextWordTag] = "NULL";
            }
            // Stem and lower-case the words. A missing neighbour stays null here (it becomes "NULL" below) instead
            // of being dereferenced. ToLowerInvariant keeps the features independent of the machine's culture.
            lastWord = (lastWord == null) ? null : StemWord(lastWord).ToLowerInvariant();
            nextWord = (nextWord == null) ? null : StemWord(nextWord).ToLowerInvariant();
            words    = words.Select(word => StemWord(word).ToLowerInvariant()).ToArray();
            // stemmed last word surface
            feature[Field.lastWord] = (lastWord ?? "NULL");
            // stemmed next word surface
            feature[Field.nextWord] = (nextWord ?? "NULL");
            // stemmed mention words surface
            feature[Field.mentionWords] = words;
            /**************Mention Level****************/
            // mention length
            feature[Field.mentionLength] = words.Length.ToString();
            // mention words 2-gram
            feature[Field.gram2] = GetNGram(words, 2);
            // mention words 3-gram
            feature[Field.gram3] = GetNGram(words, 3);
            /**************Document Level****************/
            // TODO
            /**************External****************/
            // TODO

            return(feature);
        }