Exemplo n.º 1
0
        /* Train file format:
         *      Mention     Type    Context
         * Extract word table and word shape table from train data
         * Every word is converted to lowercase and stemmed
         * /************************************************************************/
        public void ExtractWordTable()
        {
            FileReader reader          = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.train_data_file));
            FileWriter writer          = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.word_table_file), FileMode.Create);
            FileWriter wordShapeWriter = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.word_shape_table_file), FileMode.Create);
            //FileWriter wordShapeWriter = new LargeFileWriter("../../../Fine-ner/input/shape-table-file.txt", FileMode.Create);

            string line           = null;
            var    wordTable      = new HashSet <string>();
            var    wordShapeTable = new HashSet <string>();

            while ((line = reader.ReadLine()) != null)
            {
                try
                {
                    var array     = line.Split('\t');
                    var tokenizer = TokenizerPool.GetTokenizer();
                    var words     = tokenizer.Tokenize(array[2]);
                    TokenizerPool.ReturnTokenizer(tokenizer);
                    foreach (var w in words)
                    {
                        if (!string.IsNullOrEmpty(w))   // w should not be empty
                        {
                            var shape = Feature.GetWordShape(w);
                            if (!wordShapeTable.Contains(shape))
                            {
                                wordShapeWriter.WriteLine(shape);
                                wordShapeTable.Add(shape);
                            }
                            var word = Generalizer.Generalize(w);
                            if (!wordTable.Contains(word))
                            {
                                writer.WriteLine(word);
                                wordTable.Add(word);
                            }
                        }
                    }
                }
                catch (Exception e)
                {
                    Console.WriteLine("=================error!===============");
                    Console.WriteLine("\t" + e.Message);
                    Console.WriteLine("\t" + e.StackTrace);
                    Console.WriteLine("=================error!===============");
                    continue;
                }
            }
            reader.Close();
            writer.Close();
        }
Exemplo n.º 2
0
        /*   Extract feature from the input, and the feature is clustered by field
         *   The input should contains two items:
         *      Mention surface:   the surface text of the mention             // input[0]
         *      Mention context:   the context contains the mention         // input[1]
         *   The output are a list of pairs store the features' index and value:
         *      Mention surface
         *      Mention Shape
         *      Cluster ID of mention words
         *      Mention length
         *      Mention ID
         *      Last token
         *      Last token pos tag
         *      Last token ID
         *      Next token
         *      Next token pos tag
         *      Next token ID
         *      Parent in dependency tree(stanford corenlp)
         *      Dictionary                      :TODO
         *      Topic(Define topic)             :TODO: I am going to work with document cluster
         *
         */
        public List <string> ExtractFeature(Instance instance)
        {
            var mention = instance.Mention;
            var context = instance.Context;

            this.feature.Clear();
            List <string> words     = new List <string>();
            List <string> tokens    = new List <string>();
            var           tokenizer = TokenizerPool.GetTokenizer();

            try
            {
                var ws = tokenizer.Tokenize(mention);
                for (var i = 0; i < ws.Count; i++)
                {
                    if (ws[i].Equals(".") && i > 0 && ws[i - 1].EndsWith("."))
                    {
                        continue;
                    }
                    words.Add(ws[i]);
                }
                var ts = tokenizer.Tokenize(context);
                for (var i = 0; i < ts.Count; i++)
                {
                    if (ts[i].Equals(".") && i > 0 && ts[i - 1].EndsWith("."))
                    {
                        continue;
                    }
                    tokens.Add(ts[i]);
                }
                TokenizerPool.ReturnTokenizer(tokenizer);
                tokenizer = null;
            }
            catch (Exception e)
            {
                TokenizerPool.ReturnTokenizer(tokenizer);
                throw e;
            }
            // select the first sentence contains mention. This will reduce the parse cost.
            List <string> sentences = null;
            var           sspliter  = SSpliterPool.GetSSpliter();

            try
            {
                sentences = sspliter.SplitSequence(tokens);
                SSpliterPool.ReturnSSpliter(sspliter);
            }
            catch (Exception e)
            {
                SSpliterPool.ReturnSSpliter(sspliter);
                Console.Clear();
                Console.WriteLine("Error in sentence spliter.");
                throw e;
            }
            context = GetSentenceCoverMention(sentences, words);
            if (context == null)
            {
                throw new Exception("Cannot find mention by token within context!");
            }
            // get a parser
            DependencyParser parser = null;

            try
            {
                parser = ParserPool.GetParser();
            }
            catch (Exception)
            {
                throw new Exception("Cannot get a parser!");
            }
            List <Pair <string, string> > pairs = null;
            Pair <int, int> pair = null;

            try
            {
                parser.Parse(context);

                pairs = parser.GetPosTags();
                pair  = GetIndexOfMention(pairs, words);
                if (pair.first == -1)
                {
                    throw new Exception("Cannot find mention by token within context!");
                }
                this.offset = 0;

                #region last word
                {
                    var index = pair.first - 1;
                    while (index >= 0)
                    {
                        if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                        {
                            index = -1;
                            break;
                        }
                        else if (!pairs[index].first.Equals("'s") && allCharRegex.IsMatch(pairs[index].first))
                        {
                            index--;
                        }
                        else
                        {
                            break;
                        }
                    }
                    if (index >= 0)
                    {
                        var word   = pairs.ElementAt(index).first;
                        var posTag = pairs.ElementAt(index).second;
                        AddFieldToFeture(word, posTag);
                    }
                    else
                    {
                        AddFieldToFeture(null, null);
                    }
                }
                #endregion

                #region next word
                {
                    var index = pair.second + 1;
                    while (index < pairs.Count)
                    {
                        if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                        {
                            index = pairs.Count;
                            break;
                        }
                        else if (!pairs[index].first.Equals("'s") && allCharRegex.IsMatch(pairs[index].first))
                        {
                            index++;
                        }
                        else
                        {
                            break;
                        }
                    }
                    if (index < pairs.Count)
                    {
                        var word   = pairs.ElementAt(index).first;
                        var posTag = pairs.ElementAt(index).second;
                        AddFieldToFeture(word, posTag);
                    }
                    else
                    {
                        AddFieldToFeture(null, null);
                    }
                }
                #endregion

                #region mention head
                {
                    string head = null, posTag = null;
                    for (int i = pair.first; i <= pair.second; i++)
                    {
                        if (pairs.ElementAt(i).second.StartsWith("N"))
                        {
                            // last noun
                            head   = pairs.ElementAt(i).first;
                            posTag = pairs.ElementAt(i).second;
                        }
                        else if (pairs.ElementAt(i).second.Equals("IN") || pairs.ElementAt(i).second.Equals(","))
                        {
                            // before IN
                            break;
                        }
                    }
                    if (head == null)
                    {
                        head   = words[words.Count - 1];
                        posTag = pairs.ElementAt(pair.second).second;
                    }
                    AddFieldToFeture(head, posTag);
                }
                #endregion

                #region mention driver
                {
                    int index = parser.GetDriver(pair.first, pair.second);
                    if (index > 0)
                    {
                        var driver = pairs.ElementAt(index).first;
                        var posTag = pairs.ElementAt(index).second;
                        AddFieldToFeture(driver, posTag);
                    }
                    else
                    {
                        AddFieldToFeture(null, null);
                    }
                }
                #endregion

                #region mention adjective modifer
                {
                    int index = parser.GetAdjModifier(pair.first, pair.second);
                    if (index > 0)
                    {
                        var adjModifier = pairs.ElementAt(index).first;
                        var posTag      = pairs.ElementAt(index).second;
                        AddFieldToFeture(adjModifier, posTag);
                    }
                    else
                    {
                        AddFieldToFeture(null, null);
                    }
                }
                #endregion

                #region mention action
                {
                    int index = parser.GetAction(pair.first, pair.second);
                    if (index > 0)
                    {
                        var action = pairs.ElementAt(index).first;
                        var posTag = pairs.ElementAt(index).second;
                        AddFieldToFeture(action, posTag);
                    }
                    else
                    {
                        AddFieldToFeture(null, null);
                    }
                }
                #endregion

                ParserPool.ReturnParser(parser);
                parser = null;
            }
            catch (Exception e)
            {
                if (parser != null)
                {
                    ParserPool.ReturnParser(parser);
                    parser = null;
                }
                throw e;
            }


            #region Mention Words
            {
                // mention surfaces
                var mentionWords = new StringBuilder();
                foreach (var word in words)
                {
                    if (mentionWords.Length == 0)
                    {
                        mentionWords.Append(Generalizer.Generalize(word));
                    }
                    else
                    {
                        mentionWords.Append("," + Generalizer.Generalize(word));
                    }
                }
                // add mention surface
                feature.Add(string.Join(",", words));
                // add stemmed mention surface
                feature.Add(mentionWords.ToString());
                // mention tags
                var mentionTags = mentionWords.Clear();
                for (var i = pair.first; i <= pair.second; i++)
                {
                    if (mentionTags.Length == 0)
                    {
                        mentionTags.Append(pairs.ElementAt(i).second);
                    }
                    else
                    {
                        mentionTags.Append("," + pairs.ElementAt(i).second);
                    }
                }
                feature.Add(mentionTags.ToString());
                // mention IDs
                var mentionIDs = mentionTags.Clear();
                foreach (var word in words)
                {
                    if (mentionIDs.Length == 0)
                    {
                        mentionIDs.Append(DataCenter.GetWordClusterID(word));
                    }
                    else
                    {
                        mentionIDs.Append("," + DataCenter.GetWordClusterID(word));
                    }
                }
                feature.Add(mentionIDs.ToString());
                // mention shapes
                var mentionShapes = mentionIDs.Clear();
                foreach (var word in words)
                {
                    if (mentionShapes.Length == 0)
                    {
                        mentionShapes.Append(GetWordShape(word));
                    }
                    else
                    {
                        mentionShapes.Append("," + GetWordShape(word));
                    }
                }
                feature.Add(mentionShapes.ToString());
            }
            #endregion

            #region mention ID
            {
                feature.Add(DataCenter.GetMentionClusterID(mention).ToString());
            }
            #endregion

            #region mention length
            {
                feature.Add(words.Count.ToString());
            }
            #endregion

            #region Stanford NER
            {
                var ner = StanfordNerPool.GetStanfordNer();
                ner.FindNer(context);
                var type = ner.GetNerType(mention);
                StanfordNerPool.ReturnStanfordNer(ner);
                ner = null;
                feature.Add(type);
            }
            #endregion

            #region OpenNLP NER
            {
                var ner = OpenNerPool.GetOpenNer();
                ner.FindNer(context);
                var type = ner.GetNerType(mention);
                OpenNerPool.ReturnOpenNer(ner);
                ner = null;
                feature.Add(type);
            }
            #endregion

            #region DBpedia dictionary
            {
                var types = string.Join(",", DataCenter.GetDBpediaType(mention));
                feature.Add(types);
            }
            #endregion

            #region Key words
            {
                var keyWords = DataCenter.ExtractKeyWords(context);
                feature.Add(string.Join(",", keyWords));
            }
            #endregion

            #region TDDO: topic
            {
                // TODO
            }
            #endregion

            #region TDDO: dictionary
            {
                // dictionary
                // TODO
            }
            #endregion

            feature.Add(context);

            return(feature);
        }