Exemple #1
0
        // Smoke-test entry point: parses one fixed sentence and asks the parser
        // for the driver of the span [2, 2]. The result is not used further.
        public static void Main(string[] args)
        {
            var dependencyParser = new DependencyParser();
            dependencyParser.Parse("I like Beijing.");

            int driverIndex = dependencyParser.GetDriver(2, 2);
        }
Exemple #2
0
 /// <summary>
 /// Returns a parser to the parser pool by marking its slot as available again.
 /// </summary>
 /// <param name="parser">
 /// A parser previously handed out by the pool. An instance that is not found
 /// in the pool is ignored, and returning the same parser twice has no
 /// additional effect.
 /// </param>
 public static void ReturnParser(DependencyParser parser)
 {
     // GetParser appends to `parsers` while holding this same lock, so the scan
     // must happen inside it. `locker` is deliberately NOT taken here: GetParser
     // holds it while waiting for a parser to come back, so taking it would deadlock.
     lock (availableParsers)
     {
         for (var i = 0; i < parsers.Count; i++)
         {
             if (parser == parsers[i])
             {
                 // Never list a slot twice; otherwise the same parser instance
                 // could be handed to two callers at once.
                 if (!availableParsers.Contains(i))
                 {
                     availableParsers.Add(i);
                 }
                 break;
             }
         }
     }
 }
Exemple #3
0
 /// <summary>
 /// Gets a dependency parser from the parser pool.
 /// </summary>
 /// <remarks>
 /// An idle parser is reused when one is available; otherwise a new parser is
 /// created as long as the pool holds fewer than <c>maxParserNum</c> instances.
 /// When the pool is full and nothing is idle, the call blocks, polling every
 /// 10 ms, until a parser is handed back through <c>ReturnParser</c>.
 /// </remarks>
 /// <returns>A parser that the caller must hand back with <c>ReturnParser</c>.</returns>
 public static DependencyParser GetParser()
 {
     // `locker` serialises callers of GetParser, so only one of them waits on the pool at a time.
     lock (locker)
     {
         while (true)
         {
             // All access to `availableParsers` and `parsers` happens under this lock.
             // It is released between polls so that ReturnParser can hand a parser back.
             lock (availableParsers)
             {
                 if (availableParsers.Count > 0)
                 {
                     var index = availableParsers.First();
                     availableParsers.Remove(index);
                     return parsers[index];
                 }

                 if (parsers.Count < maxParserNum)
                 {
                     var parser = new DependencyParser();
                     parsers.Add(parser);
                     return parser;
                 }
             }

             Thread.Sleep(10);
         }
     }
 }
Exemple #4
0
        /*   Extract the features of a mention within its context.
         *
         *   Input (taken from the instance):
         *      instance.Mention:   the surface text of the mention
         *      instance.Context:   the text that contains the mention
         *
         *   The context is tokenized and split into sentences, and only the sentence
         *   selected by GetSentenceCoverMention is parsed, which reduces the parse cost.
         *
         *   Output: this object's `feature` list, cleared at the start of every call
         *   (so the list returned by a previous call is overwritten). Filled in this order:
         *      1. Fields added through AddFieldToFeture, one call each for:
         *         previous token, next token, mention head, mention driver,
         *         adjective modifier, action
         *      2. Mention surface (comma-joined words)
         *      3. Generalized mention surface
         *      4. POS tags of the mention tokens
         *      5. Word cluster IDs of the mention words
         *      6. Word shapes of the mention words
         *      7. Mention cluster ID
         *      8. Mention length in words
         *      9. Stanford NER type
         *     10. OpenNLP NER type
         *     11. DBpedia types
         *     12. Key words of the selected sentence
         *     13. The selected sentence itself
         *   Not implemented yet: topic, dictionary (TODO).
         *
         *   Throws Exception when the mention cannot be located in the context
         *   or when no parser can be obtained from the parser pool.
         */
        public List<string> ExtractFeature(Instance instance)
        {
            var mention = instance.Mention;
            var context = instance.Context;

            this.feature.Clear();
            var words  = new List<string>();
            var tokens = new List<string>();

            var tokenizer = TokenizerPool.GetTokenizer();
            try
            {
                // A lone "." that follows a token already ending in "." is dropped.
                var ws = tokenizer.Tokenize(mention);
                for (var i = 0; i < ws.Count; i++)
                {
                    if (ws[i].Equals(".") && i > 0 && ws[i - 1].EndsWith(".", StringComparison.Ordinal))
                    {
                        continue;
                    }
                    words.Add(ws[i]);
                }
                var ts = tokenizer.Tokenize(context);
                for (var i = 0; i < ts.Count; i++)
                {
                    if (ts[i].Equals(".") && i > 0 && ts[i - 1].EndsWith(".", StringComparison.Ordinal))
                    {
                        continue;
                    }
                    tokens.Add(ts[i]);
                }
            }
            finally
            {
                // Hand the tokenizer back exactly once, on success and on failure.
                TokenizerPool.ReturnTokenizer(tokenizer);
            }

            // Select the first sentence that contains the mention. This reduces the parse cost.
            List<string> sentences;
            var sspliter = SSpliterPool.GetSSpliter();
            try
            {
                sentences = sspliter.SplitSequence(tokens);
            }
            catch (Exception)
            {
                // Report without clearing the console: Console.Clear() can itself
                // throw when output is redirected, which would hide the real error.
                Console.WriteLine("Error in sentence spliter.");
                throw;
            }
            finally
            {
                SSpliterPool.ReturnSSpliter(sspliter);
            }

            context = GetSentenceCoverMention(sentences, words);
            if (context == null)
            {
                throw new Exception("Cannot find mention by token within context!");
            }

            // get a parser
            DependencyParser parser;
            try
            {
                parser = ParserPool.GetParser();
            }
            catch (Exception e)
            {
                // Keep the original failure as the inner exception for diagnosis.
                throw new Exception("Cannot get a parser!", e);
            }

            List<Pair<string, string>> pairs;
            Pair<int, int> pair;
            try
            {
                parser.Parse(context);

                pairs = parser.GetPosTags();
                pair  = GetIndexOfMention(pairs, words);
                if (pair.first == -1)
                {
                    throw new Exception("Cannot find mention by token within context!");
                }
                this.offset = 0;

                #region last word
                {
                    // Walk left from the mention, skipping tokens matched by allCharRegex
                    // (except "'s"); a sentence-boundary token means there is no previous word.
                    var index = pair.first - 1;
                    while (index >= 0)
                    {
                        var token = pairs[index].first;
                        if (IsSentenceBoundaryToken(token))
                        {
                            index = -1;
                            break;
                        }
                        if (token.Equals("'s") || !allCharRegex.IsMatch(token))
                        {
                            break;
                        }
                        index--;
                    }
                    AddFieldForToken(pairs, index, index >= 0);
                }
                #endregion

                #region next word
                {
                    // Mirror image of the search above, walking right from the mention.
                    var index = pair.second + 1;
                    while (index < pairs.Count)
                    {
                        var token = pairs[index].first;
                        if (IsSentenceBoundaryToken(token))
                        {
                            index = pairs.Count;
                            break;
                        }
                        if (token.Equals("'s") || !allCharRegex.IsMatch(token))
                        {
                            break;
                        }
                        index++;
                    }
                    AddFieldForToken(pairs, index, index < pairs.Count);
                }
                #endregion

                #region mention head
                {
                    // Head = the last token tagged N* before the first "IN" or "," tag;
                    // falls back to the last mention word when no such token exists.
                    string head = null, headTag = null;
                    for (int i = pair.first; i <= pair.second; i++)
                    {
                        var tag = pairs[i].second;
                        if (tag.StartsWith("N", StringComparison.Ordinal))
                        {
                            head    = pairs[i].first;
                            headTag = tag;
                        }
                        else if (tag.Equals("IN") || tag.Equals(","))
                        {
                            break;
                        }
                    }
                    if (head == null)
                    {
                        head    = words[words.Count - 1];
                        headTag = pairs[pair.second].second;
                    }
                    AddFieldToFeture(head, headTag);
                }
                #endregion

                // NOTE(review): for the three dependency lookups below, an index of 0 is
                // treated as "not found", so the first token of the sentence can never be
                // reported. Confirm against the parser's contract whether ">= 0" was intended.

                #region mention driver
                {
                    int index = parser.GetDriver(pair.first, pair.second);
                    AddFieldForToken(pairs, index, index > 0);
                }
                #endregion

                #region mention adjective modifer
                {
                    int index = parser.GetAdjModifier(pair.first, pair.second);
                    AddFieldForToken(pairs, index, index > 0);
                }
                #endregion

                #region mention action
                {
                    int index = parser.GetAction(pair.first, pair.second);
                    AddFieldForToken(pairs, index, index > 0);
                }
                #endregion
            }
            finally
            {
                // Hand the parser back exactly once, on success and on failure.
                if (parser != null)
                {
                    ParserPool.ReturnParser(parser);
                }
            }

            #region Mention Words
            {
                // string.Join emits exactly one entry per word, so the comma-separated
                // lists below stay aligned with each other even if an entry is empty.

                // mention surface
                feature.Add(string.Join(",", words));
                // generalized mention surface
                feature.Add(string.Join(",", words.Select(w => Generalizer.Generalize(w))));
                // mention tags
                var mentionTags = new List<string>();
                for (var i = pair.first; i <= pair.second; i++)
                {
                    mentionTags.Add(pairs[i].second);
                }
                feature.Add(string.Join(",", mentionTags));
                // mention IDs
                feature.Add(string.Join(",", words.Select(w => DataCenter.GetWordClusterID(w))));
                // mention shapes
                feature.Add(string.Join(",", words.Select(w => GetWordShape(w))));
            }
            #endregion

            #region mention ID
            {
                feature.Add(DataCenter.GetMentionClusterID(mention).ToString());
            }
            #endregion

            #region mention length
            {
                feature.Add(words.Count.ToString());
            }
            #endregion

            #region Stanford NER
            {
                var stanfordNer = StanfordNerPool.GetStanfordNer();
                try
                {
                    stanfordNer.FindNer(context);
                    feature.Add(stanfordNer.GetNerType(mention));
                }
                finally
                {
                    // Return the recognizer even when recognition throws.
                    StanfordNerPool.ReturnStanfordNer(stanfordNer);
                }
            }
            #endregion

            #region OpenNLP NER
            {
                var openNer = OpenNerPool.GetOpenNer();
                try
                {
                    openNer.FindNer(context);
                    feature.Add(openNer.GetNerType(mention));
                }
                finally
                {
                    // Return the recognizer even when recognition throws.
                    OpenNerPool.ReturnOpenNer(openNer);
                }
            }
            #endregion

            #region DBpedia dictionary
            {
                var types = string.Join(",", DataCenter.GetDBpediaType(mention));
                feature.Add(types);
            }
            #endregion

            #region Key words
            {
                var keyWords = DataCenter.ExtractKeyWords(context);
                feature.Add(string.Join(",", keyWords));
            }
            #endregion

            #region TODO: topic
            {
                // TODO
            }
            #endregion

            #region TODO: dictionary
            {
                // dictionary
                // TODO
            }
            #endregion

            feature.Add(context);

            return feature;
        }

        // True for the tokens that end the search for a neighbouring word: "##", ".", "!", "?" and ";".
        private static bool IsSentenceBoundaryToken(string token)
        {
            return token.Equals("##") || token.Equals(".") || token.Equals("!") || token.Equals("?") || token.Equals(";");
        }

        // Adds the (token, POS tag) field for pairs[index] when found is true, otherwise an empty (null, null) field.
        private void AddFieldForToken(List<Pair<string, string>> pairs, int index, bool found)
        {
            if (found)
            {
                AddFieldToFeture(pairs[index].first, pairs[index].second);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }