/// <summary>
/// Writes the dictionary-type-to-value map to the configured output file,
/// one "type\tvalue" pair per line.  When a global feature count is
/// configured (featureNum != 0) the value is shifted so dictionary features
/// occupy the tail of the global feature space.
/// </summary>
private void OutputDicTypeValue()
{
    var dic = DataCenter.GetDicTyeMap();
    var writer = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.dic_type_value_file), FileMode.OpenOrCreate);
    try
    {
        // Hoisted loop invariants: neither value changes while writing.
        var featureNum = GlobalParameter.featureNum;
        var dicTypeNum = DataCenter.GetDicTypeNum();
        foreach (var key in dic.Keys)
        {
            if (featureNum != 0)
            {
                writer.WriteLine(key + "\t" + (featureNum - dicTypeNum + dic[key]));
            }
            else
            {
                writer.WriteLine(key + "\t" + dic[key]);
            }
        }
    }
    finally
    {
        // Close even if a write fails so the file handle is not leaked.
        writer.Close();
    }
}
/// <summary>
/// Emits the standard per-word feature group (surface, stemmed surface,
/// POS tag, cluster id, shape) for one token, or the "NULL" placeholder
/// group when no token is available.
/// </summary>
/// <param name="word">Token surface, or null when the field is absent.</param>
/// <param name="posTag">POS tag of the token; null is mapped to "NULL".</param>
private void AddFieldToFeture(string word, string posTag)
{
    if (word == null)
    {
        // NOTE(review): the placeholder uses the total cluster count as the
        // cluster-id slot — presumably an out-of-vocabulary marker; confirm.
        AddToFeature("NULL", "NULL", "NULL", DataCenter.GetClusterNumber().ToString(), "NULL");
        return;
    }
    var stemmed = Generalizer.Generalize(word);
    var clusterId = DataCenter.GetWordClusterID(word).ToString();
    var wordShape = GetWordShape(word);
    AddToFeature(word, stemmed, posTag ?? "NULL", clusterId, wordShape);
}
/// <summary>
/// Extracts clustered-by-field features for one mention instance.
/// The instance supplies the mention surface and a context snippet that
/// contains the mention.  The produced list holds, in order: last/next
/// token fields, mention head / driver / adjective-modifier / action
/// fields, mention word surfaces, stems, tags, cluster ids and shapes,
/// mention cluster id, mention length, Stanford and OpenNLP NER types,
/// DBpedia types, key words, and finally the covering sentence itself.
/// Dictionary and topic features are still TODO.
/// </summary>
/// <exception cref="Exception">
/// Thrown when the mention cannot be located inside the tokenized context
/// or when no dependency parser is available.
/// </exception>
public List<string> ExtractFeature(Instance instance)
{
    var mention = instance.Mention;
    var context = instance.Context;
    this.feature.Clear();
    List<string> words = new List<string>();
    List<string> tokens = new List<string>();

    // Tokenize mention and context.  A "." directly after a token that
    // already ends with "." is dropped (tokenizer abbreviation artifact).
    var tokenizer = TokenizerPool.GetTokenizer();
    try
    {
        var ws = tokenizer.Tokenize(mention);
        for (var i = 0; i < ws.Count; i++)
        {
            if (ws[i].Equals(".") && i > 0 && ws[i - 1].EndsWith("."))
            {
                continue;
            }
            words.Add(ws[i]);
        }
        var ts = tokenizer.Tokenize(context);
        for (var i = 0; i < ts.Count; i++)
        {
            if (ts[i].Equals(".") && i > 0 && ts[i - 1].EndsWith("."))
            {
                continue;
            }
            tokens.Add(ts[i]);
        }
        TokenizerPool.ReturnTokenizer(tokenizer);
        tokenizer = null;
    }
    catch (Exception)
    {
        TokenizerPool.ReturnTokenizer(tokenizer);
        throw; // was `throw e;` — rethrow without resetting the stack trace (CA2200)
    }

    // Select the first sentence that covers the mention to reduce parse cost.
    List<string> sentences = null;
    var sspliter = SSpliterPool.GetSSpliter();
    try
    {
        sentences = sspliter.SplitSequence(tokens);
        SSpliterPool.ReturnSSpliter(sspliter);
    }
    catch (Exception)
    {
        SSpliterPool.ReturnSSpliter(sspliter);
        Console.Clear(); // NOTE(review): clearing the console here looks unintentional — confirm
        Console.WriteLine("Error in sentence spliter.");
        throw; // was `throw e;` (CA2200)
    }
    context = GetSentenceCoverMention(sentences, words);
    if (context == null)
    {
        throw new Exception("Cannot find mention by token within context!");
    }

    // Acquire a dependency parser.
    DependencyParser parser = null;
    try
    {
        parser = ParserPool.GetParser();
    }
    catch (Exception)
    {
        throw new Exception("Cannot get a parser!");
    }
    List<Pair<string, string>> pairs = null;
    Pair<int, int> pair = null;
    try
    {
        parser.Parse(context);
        pairs = parser.GetPosTags();
        pair = GetIndexOfMention(pairs, words);
        if (pair.first == -1)
        {
            throw new Exception("Cannot find mention by token within context!");
        }
        this.offset = 0;

        #region last word
        {
            // Walk left from the mention, skipping symbol-only tokens
            // (except "'s"); stop at sentence boundaries (##, ., !, ?, ;).
            var index = pair.first - 1;
            while (index >= 0)
            {
                if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                {
                    index = -1;
                    break;
                }
                else if (!pairs[index].first.Equals("'s") && allCharRegex.IsMatch(pairs[index].first))
                {
                    index--;
                }
                else
                {
                    break;
                }
            }
            if (index >= 0)
            {
                var word = pairs.ElementAt(index).first;
                var posTag = pairs.ElementAt(index).second;
                AddFieldToFeture(word, posTag);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        #region next word
        {
            // Mirror of the last-word walk, moving right from the mention.
            var index = pair.second + 1;
            while (index < pairs.Count)
            {
                if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                {
                    index = pairs.Count;
                    break;
                }
                else if (!pairs[index].first.Equals("'s") && allCharRegex.IsMatch(pairs[index].first))
                {
                    index++;
                }
                else
                {
                    break;
                }
            }
            if (index < pairs.Count)
            {
                var word = pairs.ElementAt(index).first;
                var posTag = pairs.ElementAt(index).second;
                AddFieldToFeture(word, posTag);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        #region mention head
        {
            // Head = last noun before an "IN" or "," inside the mention span;
            // falls back to the mention's final word.
            string head = null, posTag = null;
            for (int i = pair.first; i <= pair.second; i++)
            {
                if (pairs.ElementAt(i).second.StartsWith("N"))
                {
                    head = pairs.ElementAt(i).first;
                    posTag = pairs.ElementAt(i).second;
                }
                else if (pairs.ElementAt(i).second.Equals("IN") || pairs.ElementAt(i).second.Equals(","))
                {
                    break;
                }
            }
            if (head == null)
            {
                head = words[words.Count - 1];
                posTag = pairs.ElementAt(pair.second).second;
            }
            AddFieldToFeture(head, posTag);
        }
        #endregion

        #region mention driver
        {
            int index = parser.GetDriver(pair.first, pair.second);
            if (index > 0)
            {
                AddFieldToFeture(pairs.ElementAt(index).first, pairs.ElementAt(index).second);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        #region mention adjective modifier
        {
            int index = parser.GetAdjModifier(pair.first, pair.second);
            if (index > 0)
            {
                AddFieldToFeture(pairs.ElementAt(index).first, pairs.ElementAt(index).second);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        #region mention action
        {
            int index = parser.GetAction(pair.first, pair.second);
            if (index > 0)
            {
                AddFieldToFeture(pairs.ElementAt(index).first, pairs.ElementAt(index).second);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        ParserPool.ReturnParser(parser);
        parser = null;
    }
    catch (Exception)
    {
        if (parser != null)
        {
            ParserPool.ReturnParser(parser);
            parser = null;
        }
        throw; // was `throw e;` (CA2200)
    }

    #region Mention Words
    {
        var mentionWords = new StringBuilder();
        foreach (var word in words)
        {
            if (mentionWords.Length == 0)
            {
                mentionWords.Append(Generalizer.Generalize(word));
            }
            else
            {
                mentionWords.Append("," + Generalizer.Generalize(word));
            }
        }
        // Raw mention surface, then the stemmed surface.
        feature.Add(string.Join(",", words));
        feature.Add(mentionWords.ToString());
        // StringBuilder.Clear() returns the same instance, so the builder is
        // deliberately reused for the tag/id/shape lists below.
        var mentionTags = mentionWords.Clear();
        for (var i = pair.first; i <= pair.second; i++)
        {
            if (mentionTags.Length == 0)
            {
                mentionTags.Append(pairs.ElementAt(i).second);
            }
            else
            {
                mentionTags.Append("," + pairs.ElementAt(i).second);
            }
        }
        feature.Add(mentionTags.ToString());
        var mentionIDs = mentionTags.Clear();
        foreach (var word in words)
        {
            if (mentionIDs.Length == 0)
            {
                mentionIDs.Append(DataCenter.GetWordClusterID(word));
            }
            else
            {
                mentionIDs.Append("," + DataCenter.GetWordClusterID(word));
            }
        }
        feature.Add(mentionIDs.ToString());
        var mentionShapes = mentionIDs.Clear();
        foreach (var word in words)
        {
            if (mentionShapes.Length == 0)
            {
                mentionShapes.Append(GetWordShape(word));
            }
            else
            {
                mentionShapes.Append("," + GetWordShape(word));
            }
        }
        feature.Add(mentionShapes.ToString());
    }
    #endregion

    #region mention ID
    {
        feature.Add(DataCenter.GetMentionClusterID(mention).ToString());
    }
    #endregion

    #region mention length
    {
        feature.Add(words.Count.ToString());
    }
    #endregion

    #region Stanford NER
    {
        var ner = StanfordNerPool.GetStanfordNer();
        try
        {
            ner.FindNer(context);
            feature.Add(ner.GetNerType(mention));
        }
        finally
        {
            // Return to the pool even when NER fails (original leaked on throw).
            StanfordNerPool.ReturnStanfordNer(ner);
        }
    }
    #endregion

    #region OpenNLP NER
    {
        var ner = OpenNerPool.GetOpenNer();
        try
        {
            ner.FindNer(context);
            feature.Add(ner.GetNerType(mention));
        }
        finally
        {
            OpenNerPool.ReturnOpenNer(ner);
        }
    }
    #endregion

    #region DBpedia dictionary
    {
        var types = string.Join(",", DataCenter.GetDBpediaType(mention));
        feature.Add(types);
    }
    #endregion

    #region Key words
    {
        var keyWords = DataCenter.ExtractKeyWords(context);
        feature.Add(string.Join(",", keyWords));
    }
    #endregion

    #region TODO: topic
    {
        // TODO
    }
    #endregion

    #region TODO: dictionary
    {
        // TODO
    }
    #endregion

    feature.Add(context);
    return feature;
}
/// <summary>
/// Post-processes a raw feature row in place: refreshes the DBpedia types
/// and, when the stored mention cluster id equals the out-of-range cluster
/// count, re-resolves the mention cluster id.  Several older fix-ups
/// (NER types, last/next word, key words) are kept but disabled with
/// `if (false)` toggles.
/// </summary>
/// <param name="e">Event whose <c>Feature</c> list is updated and returned.</param>
public List<string> AddFeature(Event e)
{
    var rawFeature = (List<string>)e.Feature;
    var mention = rawFeature.ElementAt((int)Event.Field.mentionSurfaces).Replace(',', ' ');
    var context = rawFeature.ElementAt((int)Event.Field.sentenceContext);

    #region Stanford NER (disabled)
    if (false)
    {
        var ner = StanfordNerPool.GetStanfordNer();
        ner.FindNer(context);
        var type = ner.GetNerType(mention);
        StanfordNerPool.ReturnStanfordNer(ner);
        ner = null;
        // NOTE(review): unlike the OpenNLP branch below, this appends to
        // `feature` instead of updating rawFeature — confirm before re-enabling.
        feature.Add(type);
    }
    #endregion

    #region OpenNLP NER (disabled)
    if (false)
    {
        var ner = OpenNerPool.GetOpenNer();
        ner.FindNer(context);
        var type = ner.GetNerType(mention);
        OpenNerPool.ReturnOpenNer(ner);
        ner = null;
        rawFeature[(int)Event.Field.opennlpNerType] = type;
    }
    #endregion

    #region DBpedia dictionary
    {
        var types = string.Join(",", DataCenter.GetDBpediaType(mention));
        rawFeature[(int)Event.Field.dbpediaTypes] = types;
    }
    #endregion

    List<Pair<string, string>> pairs = null;
    Pair<int, int> pair = null;

    #region Modify last word (disabled)
    // Matches any non-word character; used to detect symbol-only tokens.
    System.Text.RegularExpressions.Regex regex = new System.Text.RegularExpressions.Regex(@"\W");
    if (false)
    {
        var lastWord = rawFeature.ElementAt((int)Event.Field.lastWord);
        if (lastWord.Equals("##") || lastWord.Equals(".") || lastWord.Equals("!") || lastWord.Equals("?") || lastWord.Equals(";"))
        {
            rawFeature[(int)Event.Field.lastWord] = "NULL";
            rawFeature[(int)Event.Field.lastWordStemmed] = "NULL";
            rawFeature[(int)Event.Field.lastWordTag] = "NULL";
            rawFeature[(int)Event.Field.lastWordID] = "100";
            rawFeature[(int)Event.Field.lastWordShape] = "NULL";
        }
        else if (!lastWord.Equals("'s") && regex.IsMatch(lastWord))
        {
            var pos = PosTaggerPool.GetPosTagger();
            try
            {
                pairs = pos.TagString(context);
                pair = GetIndexOfMention(pairs, mention);
                var index = pair.first - 1;
                while (index >= 0)
                {
                    if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                    {
                        index = -1;
                        break;
                    }
                    else if (!pairs[index].first.Equals("'s") && regex.IsMatch(pairs[index].first))
                    {
                        index--;
                    }
                    else
                    {
                        break;
                    }
                }
                if (index >= 0)
                {
                    var word = pairs.ElementAt(index).first;
                    var posTag = pairs.ElementAt(index).second;
                    var wordStemmed = Generalizer.Generalize(word);
                    var ID = DataCenter.GetWordClusterID(word).ToString(); // id should use original surface
                    var shape = GetWordShape(word);
                    rawFeature[(int)Event.Field.lastWord] = word;
                    rawFeature[(int)Event.Field.lastWordStemmed] = wordStemmed;
                    rawFeature[(int)Event.Field.lastWordTag] = posTag;
                    rawFeature[(int)Event.Field.lastWordID] = ID;
                    rawFeature[(int)Event.Field.lastWordShape] = shape;
                }
                else
                {
                    rawFeature[(int)Event.Field.lastWord] = "NULL";
                    rawFeature[(int)Event.Field.lastWordStemmed] = "NULL";
                    rawFeature[(int)Event.Field.lastWordTag] = "NULL";
                    rawFeature[(int)Event.Field.lastWordID] = "100";
                    rawFeature[(int)Event.Field.lastWordShape] = "NULL";
                }
                // Single return on the success path (the original returned the
                // tagger to the pool twice here).
                PosTaggerPool.ReturnPosTagger(pos);
            }
            catch (Exception)
            {
                PosTaggerPool.ReturnPosTagger(pos);
                throw; // was `throw ex;` — preserve the stack trace (CA2200)
            }
        }
    }
    #endregion

    #region Modify next word (disabled)
    if (false)
    {
        var nextWord = rawFeature.ElementAt((int)Event.Field.nextWord);
        if (nextWord.Equals("##") || nextWord.Equals(".") || nextWord.Equals("!") || nextWord.Equals("?") || nextWord.Equals(";"))
        {
            rawFeature[(int)Event.Field.nextWord] = "NULL";
            rawFeature[(int)Event.Field.nextWordStemmed] = "NULL";
            rawFeature[(int)Event.Field.nextWordTag] = "NULL";
            rawFeature[(int)Event.Field.nextWordID] = "100";
            rawFeature[(int)Event.Field.nextWordShape] = "NULL";
        }
        else if (!nextWord.Equals("'s") && regex.IsMatch(nextWord))
        {
            if (pairs == null)
            {
                var pos = PosTaggerPool.GetPosTagger();
                try
                {
                    pairs = pos.TagString(context);
                    pair = GetIndexOfMention(pairs, mention);
                    // Return after GetIndexOfMention so a throw there cannot
                    // cause a double return (original returned in both paths).
                    PosTaggerPool.ReturnPosTagger(pos);
                }
                catch (Exception)
                {
                    PosTaggerPool.ReturnPosTagger(pos);
                    throw; // was `throw ex;` (CA2200)
                }
            }
            var index = pair.second + 1;
            while (index < pairs.Count)
            {
                if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                {
                    index = pairs.Count;
                    break;
                }
                else if (!pairs[index].first.Equals("'s") && regex.IsMatch(pairs[index].first))
                {
                    index++;
                }
                else
                {
                    break;
                }
            }
            if (index < pairs.Count)
            {
                var word = pairs.ElementAt(index).first;
                var posTag = pairs.ElementAt(index).second;
                var wordStemmed = Generalizer.Generalize(word);
                var ID = DataCenter.GetWordClusterID(word).ToString(); // id should use original surface
                var shape = GetWordShape(word);
                rawFeature[(int)Event.Field.nextWord] = word;
                rawFeature[(int)Event.Field.nextWordStemmed] = wordStemmed;
                rawFeature[(int)Event.Field.nextWordTag] = posTag;
                rawFeature[(int)Event.Field.nextWordID] = ID;
                rawFeature[(int)Event.Field.nextWordShape] = shape;
            }
            else
            {
                rawFeature[(int)Event.Field.nextWord] = "NULL";
                rawFeature[(int)Event.Field.nextWordStemmed] = "NULL";
                rawFeature[(int)Event.Field.nextWordTag] = "NULL";
                rawFeature[(int)Event.Field.nextWordID] = "100";
                rawFeature[(int)Event.Field.nextWordShape] = "NULL";
            }
        }
    }
    #endregion

    #region Modify mention ID
    if (true)
    {
        var mentionID = int.Parse(rawFeature.ElementAt((int)Event.Field.mentionID));
        var mentionClusterNum = DataCenter.GetMentionClusterNumber();
        if (mentionID == mentionClusterNum)
        {
            // Out-of-range id: resolve the mention against the cluster table again.
            mentionID = DataCenter.GetMentionClusterID(mention);
            rawFeature[(int)Event.Field.mentionID] = mentionID.ToString();
        }
    }
    #endregion

    #region Key words (disabled)
    if (false)
    {
        var keyWords = DataCenter.ExtractKeyWords(context);
        rawFeature[(int)Event.Field.sentenceContext] = string.Join(",", keyWords);
        rawFeature.Add(context);
    }
    #endregion

    return rawFeature;
}
/// <summary>
/// Converts a clustered raw feature row (see the companion
/// <c>ExtractFeature(Instance)</c> output) into a sparse numeric feature
/// vector of "index:value" entries.  Each field occupies its own slice of
/// the feature space, tracked by the running <c>offset</c>; slot 0 of the
/// result is overwritten with the total feature dimension at the end.
/// Dictionary and topic features are still TODO.
/// </summary>
/// <exception cref="Exception">
/// Thrown when one of the comma-separated mention fields is missing.
/// </exception>
public List<string> ExtractFeature(Event e)
{
    this.feature.Clear();
    this.offset = 0;
    var rawFeature = e.Feature;
    // Placeholder for the feature dimension, filled in at the end.
    feature.Add("0");

    #region last word
    {
        AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.lastWordStemmed),
                              rawFeature.ElementAt((int)Event.Field.lastWordID),
                              rawFeature.ElementAt((int)Event.Field.lastWordShape),
                              rawFeature.ElementAt((int)Event.Field.lastWordTag));
    }
    #endregion

    #region next word
    {
        AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.nextWordStemmed),
                              rawFeature.ElementAt((int)Event.Field.nextWordID),
                              rawFeature.ElementAt((int)Event.Field.nextWordShape),
                              rawFeature.ElementAt((int)Event.Field.nextWordTag));
    }
    #endregion

    #region mention head
    {
        AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.mentionHeadStemmed),
                              rawFeature.ElementAt((int)Event.Field.mentionHeadID),
                              rawFeature.ElementAt((int)Event.Field.mentionHeadShape),
                              rawFeature.ElementAt((int)Event.Field.mentionHeadTag));
    }
    #endregion

    #region mention driver
    {
        AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.mentionDriverStemmed),
                              rawFeature.ElementAt((int)Event.Field.mentionDriverID),
                              rawFeature.ElementAt((int)Event.Field.mentionDriverShape),
                              rawFeature.ElementAt((int)Event.Field.mentionDriverTag));
    }
    #endregion

    #region mention adjective modifier
    {
        AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.mentionAdjModifierStemmed),
                              rawFeature.ElementAt((int)Event.Field.mentionAdjModifierID),
                              rawFeature.ElementAt((int)Event.Field.mentionAdjModifierShape),
                              rawFeature.ElementAt((int)Event.Field.mentionAdjModifierTag));
    }
    #endregion

    #region mention action
    {
        AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.mentionActionStemmed),
                              rawFeature.ElementAt((int)Event.Field.mentionActionID),
                              rawFeature.ElementAt((int)Event.Field.mentionActionShape),
                              rawFeature.ElementAt((int)Event.Field.mentionActionTag));
    }
    #endregion

    #region mention words
    {
        string[] words = null;
        try
        {
            words = rawFeature.ElementAt((int)Event.Field.mentionSurfacesStemmed).Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
        }
        catch (Exception)
        {
            throw new Exception("Mention words is null");
        }
        string[] IDs = null;
        try
        {
            IDs = rawFeature.ElementAt((int)Event.Field.mentionIDs).Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
        }
        catch (Exception)
        {
            throw new Exception("Mention ids is null");
        }
        string[] shapes = null;
        try
        {
            shapes = rawFeature.ElementAt((int)Event.Field.mentionShapes).Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
        }
        catch (Exception)
        {
            // Typo "shpaes" in the original message fixed.
            throw new Exception("Mention shapes is null");
        }
        string[] tags = null;
        try
        {
            tags = rawFeature.ElementAt((int)Event.Field.mentionTags).Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
        }
        catch (Exception)
        {
            throw new Exception("Mention tags is null");
        }
        // Bag-of-items counts for surfaces, cluster ids, shapes and POS tags.
        // Each table advances offset by its size + 1 (the extra slot is
        // presumably reserved for out-of-table items — confirm).
        AddCountedFeatures(words.Select(w => offset + DataCenter.GetWordIndex(w)));
        offset += DataCenter.GetWordTableSize() + 1;
        AddCountedFeatures(IDs.Select(id => offset + int.Parse(id)));
        offset += DataCenter.GetClusterNumber() + 1;
        AddCountedFeatures(shapes.Select(shape => offset + DataCenter.GetWordShapeIndex(shape)));
        offset += DataCenter.GetWordShapeTableSize() + 1;
        AddCountedFeatures(tags.Select(tag => offset + DataCenter.GetPosTagIndex(tag)));
        offset += DataCenter.GetPosTagTableSize() + 1;
    }
    #endregion

    #region mention cluster id
    {
        var mentionID = int.Parse(rawFeature.ElementAt((int)Event.Field.mentionID));
        feature.Add((offset + mentionID) + ":1");
        offset += DataCenter.GetMentionClusterNumber() + 1;
    }
    #endregion

    #region mention length: 1,2,3,4 or longer than 5
    {
        var length = int.Parse(rawFeature.ElementAt((int)Event.Field.mentionLength));
        if (length > 5)
        {
            length = 5; // lengths beyond 5 share a single bucket
        }
        feature.Add((offset + length - 1) + ":1");
        offset += 5;
    }
    #endregion

    #region Stanford Ner system
    {
        var stanfordNerType = rawFeature.ElementAt((int)Event.Field.stanfordNerType);
        var index = DataCenter.GetStanfordTypeIndex(stanfordNerType);
        feature.Add((offset + index) + ":1");
        offset += DataCenter.GetStanfordNerNumber() + 1;
    }
    #endregion

    #region OpenNLP Ner system
    {
        var openNLPNerType = rawFeature.ElementAt((int)Event.Field.opennlpNerType);
        var index = DataCenter.GetOpenNLPTypeIndex(openNLPNerType);
        feature.Add((offset + index) + ":1");
        offset += DataCenter.GetOpenNLPNerNumber() + 1;
    }
    #endregion

    #region DBpedia types
    {
        var types = rawFeature.ElementAt((int)Event.Field.dbpediaTypes).Split(',');
        var list = new List<int>();
        foreach (var type in types)
        {
            list.Add(DataCenter.GetDBpediaTypeIndex(type));
        }
        list.Sort();
        foreach (var index in list)
        {
            feature.Add((offset + index) + ":1");
        }
        offset += DataCenter.GetDBpediaTypeNum(); // the index of typeNum will never occur
    }
    #endregion

    #region Key words
    {
        var keywords = rawFeature.ElementAt((int)Event.Field.keyWords).Split(',');
        var list = new List<int>();
        foreach (var word in keywords)
        {
            list.Add(offset + DataCenter.GetKeyWordIndex(word));
        }
        list.Sort();
        foreach (var index in list)
        {
            feature.Add(index + ":1");
        }
        offset += DataCenter.GetKeyWordNumber();
    }
    #endregion

    #region TODO: topic
    {
    }
    #endregion

    #region TODO: dictionary
    {
    }
    #endregion

    // Set slot 0 to the final feature dimension.
    feature[0] = FeatureDimension.ToString();
    return feature;
}

/// <summary>
/// Counts duplicate feature indices and appends one "index:count" entry per
/// distinct index to <c>feature</c>, in ascending index order.
/// </summary>
private void AddCountedFeatures(IEnumerable<int> indices)
{
    var counts = new Dictionary<int, int>();
    foreach (var index in indices)
    {
        int value;
        counts.TryGetValue(index, out value);
        counts[index] = value + 1;
    }
    var sortedKeys = counts.Keys.ToList();
    sortedKeys.Sort();
    foreach (var key in sortedKeys)
    {
        feature.Add(key + ":" + counts[key]);
    }
}
// NOTE(review): two large fully commented-out statistics helpers
// (StatisticRoundTokenInformation and StatisticWithinTokenInfomation,
// roughly 200 lines of dead code) were removed here per standard practice
// of not keeping commented-out code in source; recover them from version
// control if they are ever needed again.

/// <summary>
/// Refreshes the stem dictionary cached in <see cref="DataCenter"/>.
/// </summary>
public static void Refresh()
{
    DataCenter.RefreshStemDic();
}
/// <summary>
/// Stems a word via the shared stem dictionary.
/// </summary>
/// <param name="word">Original word; may be null.</param>
/// <returns>
/// The stemmed word, or null when <paramref name="word"/> is null.
/// </returns>
protected string StemWord(string word)
{
    if (word == null)
    {
        return null;
    }
    return DataCenter.GetStemmedWord(word);
}