Example #1
        private void OutputDicTypeValue()
        {
            var dic    = DataCenter.GetDicTyeMap();
            var writer = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.dic_type_value_file), FileMode.OpenOrCreate);

            foreach (var key in dic.Keys)
            {
                if (GlobalParameter.featureNum != 0)
                {
                    // shift the type value into the last GetDicTypeNum() slots of the global feature space
                    writer.WriteLine(key + "\t" + (GlobalParameter.featureNum - DataCenter.GetDicTypeNum() + dic[key]));
                }
                else
                {
                    // no global feature count available: write the raw type value
                    writer.WriteLine(key + "\t" + dic[key]);
                }
            }
            writer.Close();
        }
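
A minimal sketch of the offset arithmetic above, using hypothetical numbers (the real values come from GlobalParameter and DataCenter): when a global feature count is set, each dictionary type lands in the trailing GetDicTypeNum() slots of the feature space.

        // Hypothetical values for illustration only.
        private static void DicTypeOffsetSketch()
        {
            const int featureNum = 1000;  // assumed GlobalParameter.featureNum
            const int dicTypeNum = 20;    // assumed DataCenter.GetDicTypeNum()
            const int typeValue  = 3;     // assumed dic[key] for some type key

            // 1000 - 20 + 3 = 983: the type occupies one of the last 20 feature slots
            int globalIndex = featureNum - dicTypeNum + typeValue;
            System.Console.WriteLine("people.person\t" + globalIndex); // one output line: "people.person\t983"
        }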
Example #2
 private void AddFieldToFeture(string word, string posTag)
 {
     if (word != null)
     {
         string generalsurface, ID, shape;
         // generalized (stemmed) surface of the word
         generalsurface = Generalizer.Generalize(word);
         // cluster ID of the word
         ID = DataCenter.GetWordClusterID(word).ToString();
         // word shape
         shape = GetWordShape(word);
         // POS tag falls back to "NULL" when the tagger provides none
         AddToFeature(word, generalsurface, posTag ?? "NULL", ID, shape);
     }
     else
     {
         AddToFeature("NULL", "NULL", "NULL", DataCenter.GetClusterNumber().ToString(), "NULL");
     }
 }
Example #3
        /*   Extract features from the input; the features are grouped by field.
         *   The input should contain two items:
         *      Mention surface:   the surface text of the mention              // input[0]
         *      Mention context:   the context that contains the mention        // input[1]
         *   The output is a list of fields holding the feature values:
         *      Mention surface
         *      Mention shape
         *      Cluster ID of mention words
         *      Mention length
         *      Mention ID
         *      Last token
         *      Last token POS tag
         *      Last token ID
         *      Next token
         *      Next token POS tag
         *      Next token ID
         *      Parent in dependency tree (Stanford CoreNLP)
         *      Dictionary                      : TODO
         *      Topic (define topic)            : TODO: planned to work with document clusters
         *
         */
        public List <string> ExtractFeature(Instance instance)
        {
            var mention = instance.Mention;
            var context = instance.Context;

            this.feature.Clear();
            List <string> words     = new List <string>();
            List <string> tokens    = new List <string>();
            var           tokenizer = TokenizerPool.GetTokenizer();

            try
            {
                var ws = tokenizer.Tokenize(mention);
                for (var i = 0; i < ws.Count; i++)
                {
                    if (ws[i].Equals(".") && i > 0 && ws[i - 1].EndsWith("."))
                    {
                        continue;
                    }
                    words.Add(ws[i]);
                }
                var ts = tokenizer.Tokenize(context);
                for (var i = 0; i < ts.Count; i++)
                {
                    if (ts[i].Equals(".") && i > 0 && ts[i - 1].EndsWith("."))
                    {
                        continue;
                    }
                    tokens.Add(ts[i]);
                }
                TokenizerPool.ReturnTokenizer(tokenizer);
                tokenizer = null;
            }
            catch (Exception)
            {
                TokenizerPool.ReturnTokenizer(tokenizer);
                throw;
            }
            // select the first sentence contains mention. This will reduce the parse cost.
            List <string> sentences = null;
            var           sspliter  = SSpliterPool.GetSSpliter();

            try
            {
                sentences = sspliter.SplitSequence(tokens);
                SSpliterPool.ReturnSSpliter(sspliter);
            }
            catch (Exception)
            {
                SSpliterPool.ReturnSSpliter(sspliter);
                Console.Clear();
                Console.WriteLine("Error in sentence splitter.");
                throw;
            }
            context = GetSentenceCoverMention(sentences, words);
            if (context == null)
            {
                throw new Exception("Cannot find mention by token within context!");
            }
            // get a parser
            DependencyParser parser = null;

            try
            {
                parser = ParserPool.GetParser();
            }
            catch (Exception)
            {
                throw new Exception("Cannot get a parser!");
            }
            List <Pair <string, string> > pairs = null;
            Pair <int, int> pair = null;

            try
            {
                parser.Parse(context);

                pairs = parser.GetPosTags();
                pair  = GetIndexOfMention(pairs, words);
                if (pair.first == -1)
                {
                    throw new Exception("Cannot find mention by token within context!");
                }
                this.offset = 0;

                #region last word
                {
                    var index = pair.first - 1;
                    while (index >= 0)
                    {
                        if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                        {
                            index = -1;
                            break;
                        }
                        else if (!pairs[index].first.Equals("'s") && allCharRegex.IsMatch(pairs[index].first))
                        {
                            index--;
                        }
                        else
                        {
                            break;
                        }
                    }
                    if (index >= 0)
                    {
                        var word   = pairs.ElementAt(index).first;
                        var posTag = pairs.ElementAt(index).second;
                        AddFieldToFeture(word, posTag);
                    }
                    else
                    {
                        AddFieldToFeture(null, null);
                    }
                }
                #endregion

                #region next word
                {
                    var index = pair.second + 1;
                    while (index < pairs.Count)
                    {
                        if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                        {
                            index = pairs.Count;
                            break;
                        }
                        else if (!pairs[index].first.Equals("'s") && allCharRegex.IsMatch(pairs[index].first))
                        {
                            index++;
                        }
                        else
                        {
                            break;
                        }
                    }
                    if (index < pairs.Count)
                    {
                        var word   = pairs.ElementAt(index).first;
                        var posTag = pairs.ElementAt(index).second;
                        AddFieldToFeture(word, posTag);
                    }
                    else
                    {
                        AddFieldToFeture(null, null);
                    }
                }
                #endregion

                #region mention head
                {
                    string head = null, posTag = null;
                    for (int i = pair.first; i <= pair.second; i++)
                    {
                        if (pairs.ElementAt(i).second.StartsWith("N"))
                        {
                            // last noun
                            head   = pairs.ElementAt(i).first;
                            posTag = pairs.ElementAt(i).second;
                        }
                        else if (pairs.ElementAt(i).second.Equals("IN") || pairs.ElementAt(i).second.Equals(","))
                        {
                            // before IN
                            break;
                        }
                    }
                    if (head == null)
                    {
                        head   = words[words.Count - 1];
                        posTag = pairs.ElementAt(pair.second).second;
                    }
                    AddFieldToFeture(head, posTag);
                }
                #endregion

                #region mention driver
                {
                    int index = parser.GetDriver(pair.first, pair.second);
                    if (index > 0)
                    {
                        var driver = pairs.ElementAt(index).first;
                        var posTag = pairs.ElementAt(index).second;
                        AddFieldToFeture(driver, posTag);
                    }
                    else
                    {
                        AddFieldToFeture(null, null);
                    }
                }
                #endregion

                #region mention adjective modifier
                {
                    int index = parser.GetAdjModifier(pair.first, pair.second);
                    if (index > 0)
                    {
                        var adjModifier = pairs.ElementAt(index).first;
                        var posTag      = pairs.ElementAt(index).second;
                        AddFieldToFeture(adjModifier, posTag);
                    }
                    else
                    {
                        AddFieldToFeture(null, null);
                    }
                }
                #endregion

                #region mention action
                {
                    int index = parser.GetAction(pair.first, pair.second);
                    if (index > 0)
                    {
                        var action = pairs.ElementAt(index).first;
                        var posTag = pairs.ElementAt(index).second;
                        AddFieldToFeture(action, posTag);
                    }
                    else
                    {
                        AddFieldToFeture(null, null);
                    }
                }
                #endregion

                ParserPool.ReturnParser(parser);
                parser = null;
            }
            catch (Exception)
            {
                if (parser != null)
                {
                    ParserPool.ReturnParser(parser);
                    parser = null;
                }
                throw;
            }


            #region Mention Words
            {
                // mention surfaces
                var mentionWords = new StringBuilder();
                foreach (var word in words)
                {
                    if (mentionWords.Length == 0)
                    {
                        mentionWords.Append(Generalizer.Generalize(word));
                    }
                    else
                    {
                        mentionWords.Append("," + Generalizer.Generalize(word));
                    }
                }
                // add mention surface
                feature.Add(string.Join(",", words));
                // add stemmed mention surface
                feature.Add(mentionWords.ToString());
                // mention tags (reuse the same StringBuilder; Clear() returns the cleared instance)
                var mentionTags = mentionWords.Clear();
                for (var i = pair.first; i <= pair.second; i++)
                {
                    if (mentionTags.Length == 0)
                    {
                        mentionTags.Append(pairs.ElementAt(i).second);
                    }
                    else
                    {
                        mentionTags.Append("," + pairs.ElementAt(i).second);
                    }
                }
                feature.Add(mentionTags.ToString());
                // mention IDs
                var mentionIDs = mentionTags.Clear();
                foreach (var word in words)
                {
                    if (mentionIDs.Length == 0)
                    {
                        mentionIDs.Append(DataCenter.GetWordClusterID(word));
                    }
                    else
                    {
                        mentionIDs.Append("," + DataCenter.GetWordClusterID(word));
                    }
                }
                feature.Add(mentionIDs.ToString());
                // mention shapes
                var mentionShapes = mentionIDs.Clear();
                foreach (var word in words)
                {
                    if (mentionShapes.Length == 0)
                    {
                        mentionShapes.Append(GetWordShape(word));
                    }
                    else
                    {
                        mentionShapes.Append("," + GetWordShape(word));
                    }
                }
                feature.Add(mentionShapes.ToString());
            }
            #endregion

            #region mention ID
            {
                feature.Add(DataCenter.GetMentionClusterID(mention).ToString());
            }
            #endregion

            #region mention length
            {
                feature.Add(words.Count.ToString());
            }
            #endregion

            #region Stanford NER
            {
                var ner = StanfordNerPool.GetStanfordNer();
                ner.FindNer(context);
                var type = ner.GetNerType(mention);
                StanfordNerPool.ReturnStanfordNer(ner);
                ner = null;
                feature.Add(type);
            }
            #endregion

            #region OpenNLP NER
            {
                var ner = OpenNerPool.GetOpenNer();
                ner.FindNer(context);
                var type = ner.GetNerType(mention);
                OpenNerPool.ReturnOpenNer(ner);
                ner = null;
                feature.Add(type);
            }
            #endregion

            #region DBpedia dictionary
            {
                var types = string.Join(",", DataCenter.GetDBpediaType(mention));
                feature.Add(types);
            }
            #endregion

            #region Key words
            {
                var keyWords = DataCenter.ExtractKeyWords(context);
                feature.Add(string.Join(",", keyWords));
            }
            #endregion

            #region TODO: topic
            {
                // TODO
            }
            #endregion

            #region TODO: dictionary
            {
                // dictionary
                // TODO
            }
            #endregion

            feature.Add(context);

            return feature;
        }
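
A minimal usage sketch, assuming the enclosing class is the FeatureExtractor referenced elsewhere in this code and that Instance exposes settable Mention and Context properties (both assumptions; only the getters are visible above):

        // Hypothetical caller; construction of Instance is assumed.
        var extractor = new FeatureExtractor();
        var instance  = new Instance
        {
            Mention = "Barack Obama",
            Context = "Barack Obama was elected president of the United States in 2008 ."
        };
        List <string> fields = extractor.ExtractFeature(instance);
        // fields now holds the last-word, next-word, mention-head, driver, adjective-modifier
        // and action fields, followed by the mention surfaces, stemmed surfaces, tags, IDs,
        // shapes, mention cluster ID, mention length, Stanford/OpenNLP NER types,
        // DBpedia types, key words, and finally the sentence covering the mention.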
Example #4
        public List <string> AddFeature(Event e)
        {
            var rawFeature = (List <string>)e.Feature;
            var mention    = rawFeature.ElementAt((int)Event.Field.mentionSurfaces).Replace(',', ' ');
            var context    = rawFeature.ElementAt((int)Event.Field.sentenceContext);

            #region Stanford NER
            if (false) // disabled: this and the later "if (false)" blocks never run
            {
                var ner = StanfordNerPool.GetStanfordNer();
                ner.FindNer(context);
                var type = ner.GetNerType(mention);
                StanfordNerPool.ReturnStanfordNer(ner);
                ner = null;
                feature.Add(type);
            }
            #endregion

            #region OpenNLP NER
            if (false)
            {
                var ner = OpenNerPool.GetOpenNer();
                ner.FindNer(context);
                var type = ner.GetNerType(mention);
                OpenNerPool.ReturnOpenNer(ner);
                ner = null;
                rawFeature[(int)Event.Field.opennlpNerType] = type;
            }
            #endregion

            #region DBpedia dictionary
            {
                var types = string.Join(",", DataCenter.GetDBpediaType(mention));
                rawFeature[(int)Event.Field.dbpediaTypes] = types;
            }
            #endregion

            List <Pair <string, string> > pairs = null;
            Pair <int, int> pair = null;

            #region Modify last word
            System.Text.RegularExpressions.Regex regex = new System.Text.RegularExpressions.Regex(@"\W");

            if (false)
            {
                var lastWord = rawFeature.ElementAt((int)Event.Field.lastWord);
                if (lastWord.Equals("##") || lastWord.Equals(".") || lastWord.Equals("!") || lastWord.Equals("?") || lastWord.Equals(";"))
                {
                    rawFeature[(int)Event.Field.lastWord]        = "NULL";
                    rawFeature[(int)Event.Field.lastWordStemmed] = "NULL";
                    rawFeature[(int)Event.Field.lastWordTag]     = "NULL";
                    rawFeature[(int)Event.Field.lastWordID]      = "100";
                    rawFeature[(int)Event.Field.lastWordShape]   = "NULL";
                }
                else if (!lastWord.Equals("'s") && regex.IsMatch(lastWord))
                {
                    var pos = PosTaggerPool.GetPosTagger();
                    try
                    {
                        pairs = pos.TagString(context);
                        pair = GetIndexOfMention(pairs, mention);
                        var index = pair.first - 1;
                        while (index >= 0)
                        {
                            if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                            {
                                index = -1;
                                break;
                            }
                            else if (!pairs[index].first.Equals("'s") && regex.IsMatch(pairs[index].first))
                            {
                                index--;
                            }
                            else
                            {
                                break;
                            }
                        }
                        if (index >= 0)
                        {
                            var word        = pairs.ElementAt(index).first;
                            var posTag      = pairs.ElementAt(index).second;
                            var wordStemmed = Generalizer.Generalize(word);
                            var ID          = DataCenter.GetWordClusterID(word).ToString(); // id should use original surface
                            var shape       = GetWordShape(word);

                            rawFeature[(int)Event.Field.lastWord]        = word;
                            rawFeature[(int)Event.Field.lastWordStemmed] = wordStemmed;
                            rawFeature[(int)Event.Field.lastWordTag]     = posTag;
                            rawFeature[(int)Event.Field.lastWordID]      = ID;
                            rawFeature[(int)Event.Field.lastWordShape]   = shape;
                        }
                        else
                        {
                            rawFeature[(int)Event.Field.lastWord]        = "NULL";
                            rawFeature[(int)Event.Field.lastWordStemmed] = "NULL";
                            rawFeature[(int)Event.Field.lastWordTag]     = "NULL";
                            rawFeature[(int)Event.Field.lastWordID]      = "100";
                            rawFeature[(int)Event.Field.lastWordShape]   = "NULL";
                        }
                        PosTaggerPool.ReturnPosTagger(pos);
                    }
                    catch (Exception)
                    {
                        PosTaggerPool.ReturnPosTagger(pos);
                        throw;
                    }
                }
            }
            #endregion

            #region Modify next word
            if (false)
            {
                var nextWord = rawFeature.ElementAt((int)Event.Field.nextWord);
                if (nextWord.Equals("##") || nextWord.Equals(".") || nextWord.Equals("!") || nextWord.Equals("?") || nextWord.Equals(";"))
                {
                    rawFeature[(int)Event.Field.nextWord]        = "NULL";
                    rawFeature[(int)Event.Field.nextWordStemmed] = "NULL";
                    rawFeature[(int)Event.Field.nextWordTag]     = "NULL";
                    rawFeature[(int)Event.Field.nextWordID]      = "100";
                    rawFeature[(int)Event.Field.nextWordShape]   = "NULL";
                }
                else if (!nextWord.Equals("'s") && regex.IsMatch(nextWord))
                {
                    if (pairs == null)
                    {
                        var pos = PosTaggerPool.GetPosTagger();
                        try
                        {
                            pairs = pos.TagString(context);
                            pair  = GetIndexOfMention(pairs, mention);
                            PosTaggerPool.ReturnPosTagger(pos);
                        }
                        catch (Exception)
                        {
                            PosTaggerPool.ReturnPosTagger(pos);
                            throw;
                        }
                    }
                    var index = pair.second + 1;
                    while (index < pairs.Count)
                    {
                        if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                        {
                            index = pairs.Count;
                            break;
                        }
                        else if (!pairs[index].first.Equals("'s") && regex.IsMatch(pairs[index].first))
                        {
                            index++;
                        }
                        else
                        {
                            break;
                        }
                    }
                    if (index < pairs.Count)
                    {
                        var word        = pairs.ElementAt(index).first;
                        var posTag      = pairs.ElementAt(index).second;
                        var wordStemmed = Generalizer.Generalize(word);
                        var ID          = DataCenter.GetWordClusterID(word).ToString(); // id should use original surface
                        var shape       = GetWordShape(word);

                        rawFeature[(int)Event.Field.nextWord]        = word;
                        rawFeature[(int)Event.Field.nextWordStemmed] = wordStemmed;
                        rawFeature[(int)Event.Field.nextWordTag]     = posTag;
                        rawFeature[(int)Event.Field.nextWordID]      = ID;
                        rawFeature[(int)Event.Field.nextWordShape]   = shape;
                    }
                    else
                    {
                        rawFeature[(int)Event.Field.nextWord]        = "NULL";
                        rawFeature[(int)Event.Field.nextWordStemmed] = "NULL";
                        rawFeature[(int)Event.Field.nextWordTag]     = "NULL";
                        rawFeature[(int)Event.Field.nextWordID]      = "100";
                        rawFeature[(int)Event.Field.nextWordShape]   = "NULL";
                    }
                }
            }
            #endregion

            #region   Modify mention ID
            if (true)
            {
                var mentionID         = int.Parse(rawFeature.ElementAt((int)Event.Field.mentionID));
                var mentionClusterNum = DataCenter.GetMentionClusterNumber();
                if (mentionID == mentionClusterNum)
                {
                    mentionID = DataCenter.GetMentionClusterID(mention);
                    rawFeature[(int)Event.Field.mentionID] = mentionID.ToString();
                }
            }
            #endregion

            #region Key words
            if (false)
            {
                var keyWords = DataCenter.ExtractKeyWords(context);
                rawFeature[(int)Event.Field.sentenceContext] = string.Join(",", keyWords);

                rawFeature.Add(context);
            }
            #endregion

            return rawFeature;
        }
Example #5
        /*   Extract features from the input; the features are grouped by field.
         *   The input should contain two items:
         *      Mention surface:   the surface text of the mention              // input[0]
         *      Mention context:   the context that contains the mention        // input[1]
         *   The output is a list of pairs storing each feature's index and value:
         *      Mention surface
         *      Mention shape
         *      Cluster ID of mention words
         *      Mention length
         *      Mention ID
         *      Last token
         *      Last token POS tag
         *      Last token ID
         *      Next token
         *      Next token POS tag
         *      Next token ID
         *      Parent in dependency tree (Stanford CoreNLP): driver, action, adjective modifier (TO USE)
         *      Dictionary                      : TODO
         *      Topic (define topic)            : TODO: planned to work with document clusters
         *
         */
        public List <string> ExtractFeature(Event e)
        {
            this.feature.Clear();
            this.offset = 0;
            var rawFeature = e.Feature;

            feature.Add("0");

            #region last word (make last word more accurate)
            {
                AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.lastWordStemmed),
                                      rawFeature.ElementAt((int)Event.Field.lastWordID),
                                      rawFeature.ElementAt((int)Event.Field.lastWordShape),
                                      rawFeature.ElementAt((int)Event.Field.lastWordTag));
            }
            #endregion

            #region next word
            {
                AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.nextWordStemmed),
                                      rawFeature.ElementAt((int)Event.Field.nextWordID),
                                      rawFeature.ElementAt((int)Event.Field.nextWordShape),
                                      rawFeature.ElementAt((int)Event.Field.nextWordTag));
            }
            #endregion

            #region  mention head
            {
                AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.mentionHeadStemmed),
                                      rawFeature.ElementAt((int)Event.Field.mentionHeadID),
                                      rawFeature.ElementAt((int)Event.Field.mentionHeadShape),
                                      rawFeature.ElementAt((int)Event.Field.mentionHeadTag));
            }
            #endregion

            #region mention driver
            {
                AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.mentionDriverStemmed),
                                      rawFeature.ElementAt((int)Event.Field.mentionDriverID),
                                      rawFeature.ElementAt((int)Event.Field.mentionDriverShape),
                                      rawFeature.ElementAt((int)Event.Field.mentionDriverTag));
            }
            #endregion

            #region mention adjective modifier
            {
                AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.mentionAdjModifierStemmed),
                                      rawFeature.ElementAt((int)Event.Field.mentionAdjModifierID),
                                      rawFeature.ElementAt((int)Event.Field.mentionAdjModifierShape),
                                      rawFeature.ElementAt((int)Event.Field.mentionAdjModifierTag));
            }
            #endregion

            #region mention action
            {
                AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.mentionActionStemmed),
                                      rawFeature.ElementAt((int)Event.Field.mentionActionID),
                                      rawFeature.ElementAt((int)Event.Field.mentionActionShape),
                                      rawFeature.ElementAt((int)Event.Field.mentionActionTag));
            }
            #endregion

            #region mention words
            {
                string[] words = null;
                try
                {
                    words = rawFeature.ElementAt((int)Event.Field.mentionSurfacesStemmed).Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
                }
                catch (Exception)
                {
                    throw new Exception("Mention words is null");
                }
                string[] IDs = null;
                try
                {
                    IDs = rawFeature.ElementAt((int)Event.Field.mentionIDs).Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
                }
                catch (Exception)
                {
                    throw new Exception("Mention ids is null");
                }
                string[] shapes = null;
                try
                {
                    shapes = rawFeature.ElementAt((int)Event.Field.mentionShapes).Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
                }
                catch (Exception)
                {
                    throw new Exception("Mention shpaes is null");
                }
                string[] tags = null;
                try
                {
                    tags = rawFeature.ElementAt((int)Event.Field.mentionTags).Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
                }
                catch (Exception)
                {
                    throw new Exception("Mention tags is null");
                }
                var dic   = new Dictionary <int, int>();   // feature index --> occurrence count
                int value = 0;
                foreach (var w in words) // word surfaces
                {
                    var index = offset + DataCenter.GetWordIndex(w);
                    dic.TryGetValue(index, out value);     // value is 0 when the key is absent
                    dic[index] = value + 1;
                }
                var keys = dic.Keys.ToList();
                keys.Sort();
                foreach (var key in keys)
                {
                    feature.Add(key + ":" + dic[key]);
                }
                offset += DataCenter.GetWordTableSize() + 1;
                dic.Clear();
                foreach (var ID in IDs) // words' cluster id
                {
                    var index = offset + int.Parse(ID);
                    dic.TryGetValue(index, out value);
                    dic[index] = value + 1;
                }
                keys = dic.Keys.ToList();
                keys.Sort();
                foreach (var key in keys)
                {
                    feature.Add(key + ":" + dic[key]);
                }
                offset += DataCenter.GetClusterNumber() + 1;
                dic.Clear();
                foreach (var shape in shapes) // words shapes
                {
                    var index = offset + DataCenter.GetWordShapeIndex(shape);
                    dic.TryGetValue(index, out value);
                    dic[index] = value + 1;
                }
                keys = dic.Keys.ToList();
                keys.Sort();
                foreach (var key in keys)
                {
                    feature.Add(key + ":" + dic[key]);
                }
                offset += DataCenter.GetWordShapeTableSize() + 1;
                dic.Clear();
                foreach (var tag in tags)
                {   // words pos tags
                    var index = offset + DataCenter.GetPosTagIndex(tag);
                    dic.TryGetValue(index, out value);
                    dic[index] = value + 1;
                }
                keys = dic.Keys.ToList();
                keys.Sort();
                foreach (var key in keys)
                {
                    feature.Add(key + ":" + dic[key]);
                }
                offset += DataCenter.GetPosTagTableSize() + 1;
            }
            #endregion

            #region mention cluster id
            {
                var mentionID = int.Parse(rawFeature.ElementAt((int)Event.Field.mentionID));
                feature.Add((offset + mentionID) + ":1");
                offset += DataCenter.GetMentionClusterNumber() + 1;
            }
            #endregion

            #region mention length: 1, 2, 3, 4, or 5 and longer
            {
                var length = int.Parse(rawFeature.ElementAt((int)Event.Field.mentionLength));
                if (length > 5)
                {
                    length = 5;
                }
                feature.Add((offset + length - 1) + ":1");
                offset += 5;
            }
            #endregion

            #region Stanford Ner system
            {
                var stanfordNerType = rawFeature.ElementAt((int)Event.Field.stanfordNerType);
                var index           = DataCenter.GetStanfordTypeIndex(stanfordNerType);
                feature.Add((offset + index) + ":1");
                offset += DataCenter.GetStanfordNerNumber() + 1;
            }
            #endregion

            #region OpenNLP Ner system
            {
                var openNLPNerType = rawFeature.ElementAt((int)Event.Field.opennlpNerType);
                var index          = DataCenter.GetOpenNLPTypeIndex(openNLPNerType);
                feature.Add((offset + index) + ":1");
                offset += DataCenter.GetOpenNLPNerNumber() + 1;
            }
            #endregion

            #region DBpedia types
            {
                var types = rawFeature.ElementAt((int)Event.Field.dbpediaTypes).Split(',');
                var list  = new List <int>();
                foreach (var type in types)
                {
                    var index = DataCenter.GetDBpediaTypeIndex(type);
                    list.Add(index);
                }
                list.Sort();
                foreach (var index in list)
                {
                    feature.Add((offset + index) + ":1");
                }
                offset += DataCenter.GetDBpediaTypeNum(); // an index equal to the type count never occurs, so no extra slot is needed
            }
            #endregion

            #region Key words
            {
                var keywords = rawFeature.ElementAt((int)Event.Field.keyWords).Split(',');
                var list     = new List <int>();
                foreach (var word in keywords)
                {
                    var index = DataCenter.GetKeyWordIndex(word);
                    list.Add(offset + index);
                }
                list.Sort();
                foreach (var index in list)
                {
                    feature.Add(index + ":1");
                }
                offset += DataCenter.GetKeyWordNumber();
            }
            #endregion


            #region TODO: topic
            {
            }
            #endregion

            #region TODO: dictionary
            {
            }
            #endregion

            // set the feature dimension as the first element
            feature[0] = FeatureDimension.ToString();
            return feature;
        }
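
The method above emits sparse features as "index:value" strings, with each field owning a contiguous block of indices and offset advancing past each block. A minimal sketch of that encoding with made-up table sizes (the real sizes come from DataCenter; assumes the usual System.Collections.Generic using):

        // Illustrative only; the table sizes and indices below are hypothetical.
        private static List <string> SparseEncodingSketch()
        {
            var feature = new List <string>();
            var offset  = 0;

            // block 1: word-surface features (assumed table size 50000)
            const int wordTableSize = 50000;
            var wordIndex = 1234;                        // e.g. an assumed DataCenter.GetWordIndex(word)
            feature.Add((offset + wordIndex) + ":1");    // "1234:1"
            offset += wordTableSize + 1;                 // the "+ 1" mirrors the method above (presumably an out-of-vocabulary slot)

            // block 2: word-cluster features (assumed 100 clusters)
            const int clusterNumber = 100;
            var clusterId = 42;
            feature.Add((offset + clusterId) + ":1");    // "50043:1"
            offset += clusterNumber + 1;

            // the real method continues with shapes, POS tags, mention ID, mention length,
            // NER types, DBpedia types and key words, each owning its own block.
            return feature;
        }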
Example #6
        //public static string StatisticRoundTokenInformation(String sourceFile)
        //{
        //    FileReader reader = new LargeFileReader(sourceFile);
        //    FeatureExtractor extractor = new FeatureExtractor();
        //    // type-->(word-->times)
        //    Dictionary<string, Dictionary<string, int>> lastTokenNumByType = new Dictionary<string, Dictionary<string, int>>();
        //    Dictionary<string, Dictionary<string, int>> nextTokenNumByType = new Dictionary<string, Dictionary<string, int>>();
        //    Dictionary<string, int> dic = null ;

        //    string line;
        //    string lastToken;
        //    string nextToken;
        //    string type;
        //    String[] array;
        //    int count = 0;

        //    while ((line = reader.ReadLine()) != null)
        //    {
        //        if((++count) % 1000 ==0)
        //        {
        //            Console.WriteLine(count);
        //        }
        //        try
        //        {
        //            array = line.Split('\t');
        //            type = array[1];
        //            // get last token
        //            lastToken = extractor.GetLastToken(array[2], array[0]).ToLower();
        //            if (lastToken == null)
        //            {
        //                lastToken = "null";
        //            }
        //            else
        //            {
        //                lastToken = DataCenter.GetStemmedWord(lastToken);
        //            }
        //            // get next token
        //            nextToken = extractor.GetNextToken(array[2], array[0]).ToLower();
        //            if (nextToken == null)
        //            {
        //                nextToken = "null";
        //            }
        //            else
        //            {
        //                nextToken = DataCenter.GetStemmedWord(nextToken);
        //            }
        //            // deal last token
        //            lastTokenNumByType.TryGetValue(type, out dic);
        //            if (dic == null)
        //            {
        //                dic = new Dictionary<string, int>();
        //            }
        //            try
        //            {
        //                dic[lastToken] += 1;
        //            }
        //            catch (Exception)
        //            {
        //                dic[lastToken] = 1;
        //            }
        //            lastTokenNumByType[type] = dic;
        //            // deal next token
        //            nextTokenNumByType.TryGetValue(type, out dic);
        //            if (dic == null)
        //            {
        //                dic = new Dictionary<string, int>();
        //            }
        //            try
        //            {
        //                dic[nextToken] += 1;
        //            }
        //            catch (Exception)
        //            {
        //                dic[nextToken] = 1;
        //            }
        //            nextTokenNumByType[type] = dic;
        //        }
        //        catch(Exception)
        //        {
        //            continue;
        //        }
        //    }
        //    reader.Close();
        //    StringBuilder buffer = new StringBuilder();
        //    // report last token information
        //    buffer.Append("last token report: word:(per times|loc times|org times)\r");
        //    List<Pair<string, int>> list = new List<Pair<string, int>>();
        //    Comparer<Pair<string, int>> comparer = new Pair<string,int>().GetBySecondReverseComparer();
        //    foreach(String item in lastTokenNumByType["people.person"].Keys )
        //    {
        //        Pair<string, int> pair = new Pair<string, int>(item, lastTokenNumByType["people.person"][item]);
        //        list.Add(pair);
        //    }
        //    list.Sort(comparer);
        //    count = 0;
        //    int locNum;
        //    int orgNum;
        //    foreach (Pair<string,int> item in list)
        //    {
        //        count++;
        //        try
        //        {
        //            locNum = lastTokenNumByType["location.location"][item.first];
        //        }
        //        catch (Exception)
        //        {
        //            locNum = 0;
        //        }
        //        try
        //        {
        //            orgNum = lastTokenNumByType["organization.organization"][item.first];
        //        }
        //        catch (Exception)
        //        {
        //            orgNum = 0;
        //        }
        //        buffer.Append("\t" + item.first + ":(" + item.second + "|" + locNum + "|" + orgNum + ")");
        //        if (count % 5 == 0)
        //        {
        //            buffer.Append("\r");
        //        }
        //    }
        //    buffer.Append("\r");
        //    // report next token information
        //    buffer.Append("next token report:  word:(per times|loc times|org times)\r");
        //    list.Clear();
        //    foreach (String item in nextTokenNumByType["people.person"].Keys)
        //    {
        //        Pair<string, int> pair = new Pair<string, int>(item, nextTokenNumByType["people.person"][item]);
        //        list.Add(pair);
        //    }
        //    list.Sort(comparer);
        //     count = 0;
        //    foreach (Pair<string, int> item in list)
        //    {
        //        count++;
        //        try
        //        {
        //            locNum = nextTokenNumByType["location.location"][item.first];
        //        }
        //        catch (Exception)
        //        {
        //            locNum = 0;
        //        }
        //        try
        //        {
        //            orgNum = nextTokenNumByType["organization.organization"][item.first];
        //        }
        //        catch (Exception)
        //        {
        //            orgNum = 0;
        //        }
        //        buffer.Append("\t" + item.first + ":(" + item.second + "|" + locNum + "|" + orgNum + ")");
        //        if (count % 5 == 0)
        //        {
        //            buffer.Append("\r");
        //        }
        //    }
        //    return buffer.ToString();
        //}

        //public static string StatisticWithinTokenInfomation(String sourceFile)
        //{
        //    FileReader reader = new LargeFileReader(sourceFile);
        //    FeatureExtractor extractor = new FeatureExtractor();
        //    // type-->(word-->times)
        //    Dictionary<string, Dictionary<string, int>> firstTokenNumByType = new Dictionary<string, Dictionary<string, int>>();
        //    Dictionary<string, Dictionary<string, int>> finalTokenNumByType = new Dictionary<string, Dictionary<string, int>>();
        //    Dictionary<string, int> dic = null ;

        //    string line;
        //    string firstToken;
        //    string finalToken;
        //    string type;
        //    String[] array;
        //    string[] wordArray;
        //    int count = 0;

        //    while ((line = reader.ReadLine()) != null)
        //    {
        //        if ((++count) % 1000 == 0)
        //        {
        //            Console.WriteLine(count);
        //        }
        //        try
        //        {
        //            array = line.Split('\t');
        //            type = array[1];
        //            wordArray = array[0].Split('\t');
        //            // get first token
        //            firstToken = wordArray[0].ToLower();
        //            if (firstToken == null)
        //            {
        //                firstToken = "null";
        //            }
        //            else
        //            {
        //                firstToken = DataCenter.GetStemmedWord(firstToken);
        //            }
        //            // get final token
        //            finalToken = wordArray[wordArray.Length - 1].ToLower();
        //            if (finalToken == null)
        //            {
        //                finalToken = "null";
        //            }
        //            else
        //            {
        //                finalToken = DataCenter.GetStemmedWord(finalToken);
        //            }
        //            // deal first token
        //            firstTokenNumByType.TryGetValue(type, out dic);
        //            if (dic == null)
        //            {
        //                dic = new Dictionary<string, int>();
        //            }
        //            try
        //            {
        //                dic[firstToken] += 1;
        //            }
        //            catch (Exception)
        //            {
        //                dic[firstToken] = 1;
        //            }
        //            firstTokenNumByType[type] = dic;
        //            // deal final token
        //            finalTokenNumByType.TryGetValue(type, out dic);
        //            if (dic == null)
        //            {
        //                dic = new Dictionary<string, int>();
        //            }
        //            try
        //            {
        //                dic[finalToken] += 1;
        //            }
        //            catch (Exception)
        //            {
        //                dic[finalToken] = 1;
        //            }
        //            finalTokenNumByType[type] = dic;
        //        }
        //        catch(Exception)
        //        {
        //            continue;
        //        }
        //    }
        //    reader.Close();
        //    StringBuilder buffer = new StringBuilder();
        //    // report first token information
        //    buffer.Append("first token report: word:(per times|loc times|org times)\r");
        //    List<Pair<string, int>> list = new List<Pair<string, int>>();
        //    Comparer<Pair<string, int>> comparer = new Pair<string,int>().GetBySecondReverseComparer();
        //    foreach(String item in firstTokenNumByType["people.person"].Keys )
        //    {
        //        Pair<string, int> pair = new Pair<string, int>(item, firstTokenNumByType["people.person"][item]);
        //        list.Add(pair);
        //    }
        //    list.Sort(comparer);
        //    count = 0;
        //    int locNum;
        //    int orgNum;
        //    foreach (Pair<string,int> item in list)
        //    {
        //        count++;
        //        try
        //        {
        //            locNum = firstTokenNumByType["location.location"][item.first];
        //        }
        //        catch (Exception)
        //        {
        //            locNum = 0;
        //        }
        //        try
        //        {
        //            orgNum = firstTokenNumByType["organization.organization"][item.first];
        //        }
        //        catch (Exception)
        //        {
        //            orgNum = 0;
        //        }
        //        buffer.Append("\t" + item.first + ":(" + item.second + "|" + locNum + "|" + orgNum + ")");
        //        if (count % 5 == 0)
        //        {
        //            buffer.Append("\r");
        //        }
        //    }
        //    buffer.Append("\r");
        //    // report final token information
        //    buffer.Append("final token report:  word:(per times|loc times|org times)\r");
        //    list.Clear();
        //    foreach (String item in finalTokenNumByType["people.person"].Keys)
        //    {
        //        Pair<string, int> pair = new Pair<string, int>(item, finalTokenNumByType["people.person"][item]);
        //        list.Add(pair);
        //    }
        //    list.Sort(comparer);
        //     count = 0;
        //    foreach (Pair<string, int> item in list)
        //    {
        //        count++;
        //        try
        //        {
        //            locNum = finalTokenNumByType["location.location"][item.first];
        //        }
        //        catch (Exception)
        //        {
        //            locNum = 0;
        //        }
        //        try
        //        {
        //            orgNum = finalTokenNumByType["organization.organization"][item.first];
        //        }
        //        catch (Exception)
        //        {
        //            orgNum = 0;
        //        }
        //        buffer.Append("\t" + item.first + ":(" + item.second + "|" + locNum + "|" + orgNum + ")");
        //        if (count % 5 == 0)
        //        {
        //            buffer.Append("\r");
        //        }
        //    }
        //    return buffer.ToString();
        //    }

        public static void Refresh()
        {
            DataCenter.RefreshStemDic();
        }
Example #7
 /// <summary>
 ///     Stem a word.
 /// </summary>
 /// <remarks>
 ///     Input: the original word as a string.
 /// </remarks>
 /// <returns>
 ///     The stemmed word, or null if the input word is null.
 /// </returns>
 protected string StemWord(string word)
 {
     return word == null ? null : DataCenter.GetStemmedWord(word);
 }
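
A quick hypothetical usage note: because of the null check, callers can pass possibly-missing tokens straight through.

 // Hypothetical usage; the exact stemmed form depends on DataCenter's stem dictionary.
 string stemmed = StemWord("running"); // e.g. "run"
 string missing = StemWord(null);      // null, no exception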