/// <summary>
/// Writes the dictionary-type-to-value map to the configured output file,
/// one "type\tvalue" pair per line.  When a global feature count is
/// configured (featureNum != 0) the value is shifted so dictionary features
/// occupy the tail of the global feature space.
/// </summary>
private void OutputDicTypeValue()
{
    var dic = DataCenter.GetDicTyeMap();
    var writer = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.dic_type_value_file), FileMode.OpenOrCreate);
    try
    {
        // Hoisted loop invariants: neither value changes while writing.
        var featureNum = GlobalParameter.featureNum;
        var dicTypeNum = DataCenter.GetDicTypeNum();
        foreach (var key in dic.Keys)
        {
            if (featureNum != 0)
            {
                writer.WriteLine(key + "\t" + (featureNum - dicTypeNum + dic[key]));
            }
            else
            {
                writer.WriteLine(key + "\t" + dic[key]);
            }
        }
    }
    finally
    {
        // Close even if a write fails so the file handle is not leaked.
        writer.Close();
    }
}
/// <summary>
/// Emits the standard per-word feature group (surface, stemmed surface,
/// POS tag, cluster id, shape) for one token, or the "NULL" placeholder
/// group when no token is available.
/// </summary>
/// <param name="word">Token surface, or null when the field is absent.</param>
/// <param name="posTag">POS tag of the token; null is mapped to "NULL".</param>
private void AddFieldToFeture(string word, string posTag)
{
    if (word == null)
    {
        // NOTE(review): the placeholder uses the total cluster count as the
        // cluster-id slot — presumably an out-of-vocabulary marker; confirm.
        AddToFeature("NULL", "NULL", "NULL", DataCenter.GetClusterNumber().ToString(), "NULL");
        return;
    }
    var stemmed = Generalizer.Generalize(word);
    var clusterId = DataCenter.GetWordClusterID(word).ToString();
    var wordShape = GetWordShape(word);
    AddToFeature(word, stemmed, posTag ?? "NULL", clusterId, wordShape);
}
/// <summary>
/// Extracts clustered-by-field features for one mention instance.
/// The instance supplies the mention surface and a context snippet that
/// contains the mention.  The produced list holds, in order: last/next
/// token fields, mention head / driver / adjective-modifier / action
/// fields, mention word surfaces, stems, tags, cluster ids and shapes,
/// mention cluster id, mention length, Stanford and OpenNLP NER types,
/// DBpedia types, key words, and finally the covering sentence itself.
/// Dictionary and topic features are still TODO.
/// </summary>
/// <exception cref="Exception">
/// Thrown when the mention cannot be located inside the tokenized context
/// or when no dependency parser is available.
/// </exception>
public List<string> ExtractFeature(Instance instance)
{
    var mention = instance.Mention;
    var context = instance.Context;
    this.feature.Clear();
    List<string> words = new List<string>();
    List<string> tokens = new List<string>();

    // Tokenize mention and context.  A "." directly after a token that
    // already ends with "." is dropped (tokenizer abbreviation artifact).
    var tokenizer = TokenizerPool.GetTokenizer();
    try
    {
        var ws = tokenizer.Tokenize(mention);
        for (var i = 0; i < ws.Count; i++)
        {
            if (ws[i].Equals(".") && i > 0 && ws[i - 1].EndsWith("."))
            {
                continue;
            }
            words.Add(ws[i]);
        }
        var ts = tokenizer.Tokenize(context);
        for (var i = 0; i < ts.Count; i++)
        {
            if (ts[i].Equals(".") && i > 0 && ts[i - 1].EndsWith("."))
            {
                continue;
            }
            tokens.Add(ts[i]);
        }
        TokenizerPool.ReturnTokenizer(tokenizer);
        tokenizer = null;
    }
    catch (Exception)
    {
        TokenizerPool.ReturnTokenizer(tokenizer);
        throw; // was `throw e;` — rethrow without resetting the stack trace (CA2200)
    }

    // Select the first sentence that covers the mention to reduce parse cost.
    List<string> sentences = null;
    var sspliter = SSpliterPool.GetSSpliter();
    try
    {
        sentences = sspliter.SplitSequence(tokens);
        SSpliterPool.ReturnSSpliter(sspliter);
    }
    catch (Exception)
    {
        SSpliterPool.ReturnSSpliter(sspliter);
        Console.Clear(); // NOTE(review): clearing the console here looks unintentional — confirm
        Console.WriteLine("Error in sentence spliter.");
        throw; // was `throw e;` (CA2200)
    }
    context = GetSentenceCoverMention(sentences, words);
    if (context == null)
    {
        throw new Exception("Cannot find mention by token within context!");
    }

    // Acquire a dependency parser.
    DependencyParser parser = null;
    try
    {
        parser = ParserPool.GetParser();
    }
    catch (Exception)
    {
        throw new Exception("Cannot get a parser!");
    }
    List<Pair<string, string>> pairs = null;
    Pair<int, int> pair = null;
    try
    {
        parser.Parse(context);
        pairs = parser.GetPosTags();
        pair = GetIndexOfMention(pairs, words);
        if (pair.first == -1)
        {
            throw new Exception("Cannot find mention by token within context!");
        }
        this.offset = 0;

        #region last word
        {
            // Walk left from the mention, skipping symbol-only tokens
            // (except "'s"); stop at sentence boundaries (##, ., !, ?, ;).
            var index = pair.first - 1;
            while (index >= 0)
            {
                if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                {
                    index = -1;
                    break;
                }
                else if (!pairs[index].first.Equals("'s") && allCharRegex.IsMatch(pairs[index].first))
                {
                    index--;
                }
                else
                {
                    break;
                }
            }
            if (index >= 0)
            {
                var word = pairs.ElementAt(index).first;
                var posTag = pairs.ElementAt(index).second;
                AddFieldToFeture(word, posTag);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        #region next word
        {
            // Mirror of the last-word walk, moving right from the mention.
            var index = pair.second + 1;
            while (index < pairs.Count)
            {
                if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                {
                    index = pairs.Count;
                    break;
                }
                else if (!pairs[index].first.Equals("'s") && allCharRegex.IsMatch(pairs[index].first))
                {
                    index++;
                }
                else
                {
                    break;
                }
            }
            if (index < pairs.Count)
            {
                var word = pairs.ElementAt(index).first;
                var posTag = pairs.ElementAt(index).second;
                AddFieldToFeture(word, posTag);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        #region mention head
        {
            // Head = last noun before an "IN" or "," inside the mention span;
            // falls back to the mention's final word.
            string head = null, posTag = null;
            for (int i = pair.first; i <= pair.second; i++)
            {
                if (pairs.ElementAt(i).second.StartsWith("N"))
                {
                    head = pairs.ElementAt(i).first;
                    posTag = pairs.ElementAt(i).second;
                }
                else if (pairs.ElementAt(i).second.Equals("IN") || pairs.ElementAt(i).second.Equals(","))
                {
                    break;
                }
            }
            if (head == null)
            {
                head = words[words.Count - 1];
                posTag = pairs.ElementAt(pair.second).second;
            }
            AddFieldToFeture(head, posTag);
        }
        #endregion

        #region mention driver
        {
            int index = parser.GetDriver(pair.first, pair.second);
            if (index > 0)
            {
                AddFieldToFeture(pairs.ElementAt(index).first, pairs.ElementAt(index).second);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        #region mention adjective modifier
        {
            int index = parser.GetAdjModifier(pair.first, pair.second);
            if (index > 0)
            {
                AddFieldToFeture(pairs.ElementAt(index).first, pairs.ElementAt(index).second);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        #region mention action
        {
            int index = parser.GetAction(pair.first, pair.second);
            if (index > 0)
            {
                AddFieldToFeture(pairs.ElementAt(index).first, pairs.ElementAt(index).second);
            }
            else
            {
                AddFieldToFeture(null, null);
            }
        }
        #endregion

        ParserPool.ReturnParser(parser);
        parser = null;
    }
    catch (Exception)
    {
        if (parser != null)
        {
            ParserPool.ReturnParser(parser);
            parser = null;
        }
        throw; // was `throw e;` (CA2200)
    }

    #region Mention Words
    {
        var mentionWords = new StringBuilder();
        foreach (var word in words)
        {
            if (mentionWords.Length == 0)
            {
                mentionWords.Append(Generalizer.Generalize(word));
            }
            else
            {
                mentionWords.Append("," + Generalizer.Generalize(word));
            }
        }
        // Raw mention surface, then the stemmed surface.
        feature.Add(string.Join(",", words));
        feature.Add(mentionWords.ToString());
        // StringBuilder.Clear() returns the same instance, so the builder is
        // deliberately reused for the tag/id/shape lists below.
        var mentionTags = mentionWords.Clear();
        for (var i = pair.first; i <= pair.second; i++)
        {
            if (mentionTags.Length == 0)
            {
                mentionTags.Append(pairs.ElementAt(i).second);
            }
            else
            {
                mentionTags.Append("," + pairs.ElementAt(i).second);
            }
        }
        feature.Add(mentionTags.ToString());
        var mentionIDs = mentionTags.Clear();
        foreach (var word in words)
        {
            if (mentionIDs.Length == 0)
            {
                mentionIDs.Append(DataCenter.GetWordClusterID(word));
            }
            else
            {
                mentionIDs.Append("," + DataCenter.GetWordClusterID(word));
            }
        }
        feature.Add(mentionIDs.ToString());
        var mentionShapes = mentionIDs.Clear();
        foreach (var word in words)
        {
            if (mentionShapes.Length == 0)
            {
                mentionShapes.Append(GetWordShape(word));
            }
            else
            {
                mentionShapes.Append("," + GetWordShape(word));
            }
        }
        feature.Add(mentionShapes.ToString());
    }
    #endregion

    #region mention ID
    {
        feature.Add(DataCenter.GetMentionClusterID(mention).ToString());
    }
    #endregion

    #region mention length
    {
        feature.Add(words.Count.ToString());
    }
    #endregion

    #region Stanford NER
    {
        var ner = StanfordNerPool.GetStanfordNer();
        try
        {
            ner.FindNer(context);
            feature.Add(ner.GetNerType(mention));
        }
        finally
        {
            // Return to the pool even when NER fails (original leaked on throw).
            StanfordNerPool.ReturnStanfordNer(ner);
        }
    }
    #endregion

    #region OpenNLP NER
    {
        var ner = OpenNerPool.GetOpenNer();
        try
        {
            ner.FindNer(context);
            feature.Add(ner.GetNerType(mention));
        }
        finally
        {
            OpenNerPool.ReturnOpenNer(ner);
        }
    }
    #endregion

    #region DBpedia dictionary
    {
        var types = string.Join(",", DataCenter.GetDBpediaType(mention));
        feature.Add(types);
    }
    #endregion

    #region Key words
    {
        var keyWords = DataCenter.ExtractKeyWords(context);
        feature.Add(string.Join(",", keyWords));
    }
    #endregion

    #region TODO: topic
    {
        // TODO
    }
    #endregion

    #region TODO: dictionary
    {
        // TODO
    }
    #endregion

    feature.Add(context);
    return feature;
}
/// <summary>
/// Post-processes a raw feature row in place: refreshes the DBpedia types
/// and, when the stored mention cluster id equals the out-of-range cluster
/// count, re-resolves the mention cluster id.  Several older fix-ups
/// (NER types, last/next word, key words) are kept but disabled with
/// `if (false)` toggles.
/// </summary>
/// <param name="e">Event whose <c>Feature</c> list is updated and returned.</param>
public List<string> AddFeature(Event e)
{
    var rawFeature = (List<string>)e.Feature;
    var mention = rawFeature.ElementAt((int)Event.Field.mentionSurfaces).Replace(',', ' ');
    var context = rawFeature.ElementAt((int)Event.Field.sentenceContext);

    #region Stanford NER (disabled)
    if (false)
    {
        var ner = StanfordNerPool.GetStanfordNer();
        ner.FindNer(context);
        var type = ner.GetNerType(mention);
        StanfordNerPool.ReturnStanfordNer(ner);
        ner = null;
        // NOTE(review): unlike the OpenNLP branch below, this appends to
        // `feature` instead of updating rawFeature — confirm before re-enabling.
        feature.Add(type);
    }
    #endregion

    #region OpenNLP NER (disabled)
    if (false)
    {
        var ner = OpenNerPool.GetOpenNer();
        ner.FindNer(context);
        var type = ner.GetNerType(mention);
        OpenNerPool.ReturnOpenNer(ner);
        ner = null;
        rawFeature[(int)Event.Field.opennlpNerType] = type;
    }
    #endregion

    #region DBpedia dictionary
    {
        var types = string.Join(",", DataCenter.GetDBpediaType(mention));
        rawFeature[(int)Event.Field.dbpediaTypes] = types;
    }
    #endregion

    List<Pair<string, string>> pairs = null;
    Pair<int, int> pair = null;

    #region Modify last word (disabled)
    // Matches any non-word character; used to detect symbol-only tokens.
    System.Text.RegularExpressions.Regex regex = new System.Text.RegularExpressions.Regex(@"\W");
    if (false)
    {
        var lastWord = rawFeature.ElementAt((int)Event.Field.lastWord);
        if (lastWord.Equals("##") || lastWord.Equals(".") || lastWord.Equals("!") || lastWord.Equals("?") || lastWord.Equals(";"))
        {
            rawFeature[(int)Event.Field.lastWord] = "NULL";
            rawFeature[(int)Event.Field.lastWordStemmed] = "NULL";
            rawFeature[(int)Event.Field.lastWordTag] = "NULL";
            rawFeature[(int)Event.Field.lastWordID] = "100";
            rawFeature[(int)Event.Field.lastWordShape] = "NULL";
        }
        else if (!lastWord.Equals("'s") && regex.IsMatch(lastWord))
        {
            var pos = PosTaggerPool.GetPosTagger();
            try
            {
                pairs = pos.TagString(context);
                pair = GetIndexOfMention(pairs, mention);
                var index = pair.first - 1;
                while (index >= 0)
                {
                    if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                    {
                        index = -1;
                        break;
                    }
                    else if (!pairs[index].first.Equals("'s") && regex.IsMatch(pairs[index].first))
                    {
                        index--;
                    }
                    else
                    {
                        break;
                    }
                }
                if (index >= 0)
                {
                    var word = pairs.ElementAt(index).first;
                    var posTag = pairs.ElementAt(index).second;
                    var wordStemmed = Generalizer.Generalize(word);
                    var ID = DataCenter.GetWordClusterID(word).ToString(); // id should use original surface
                    var shape = GetWordShape(word);
                    rawFeature[(int)Event.Field.lastWord] = word;
                    rawFeature[(int)Event.Field.lastWordStemmed] = wordStemmed;
                    rawFeature[(int)Event.Field.lastWordTag] = posTag;
                    rawFeature[(int)Event.Field.lastWordID] = ID;
                    rawFeature[(int)Event.Field.lastWordShape] = shape;
                }
                else
                {
                    rawFeature[(int)Event.Field.lastWord] = "NULL";
                    rawFeature[(int)Event.Field.lastWordStemmed] = "NULL";
                    rawFeature[(int)Event.Field.lastWordTag] = "NULL";
                    rawFeature[(int)Event.Field.lastWordID] = "100";
                    rawFeature[(int)Event.Field.lastWordShape] = "NULL";
                }
                // Single return on the success path (the original returned the
                // tagger to the pool twice here).
                PosTaggerPool.ReturnPosTagger(pos);
            }
            catch (Exception)
            {
                PosTaggerPool.ReturnPosTagger(pos);
                throw; // was `throw ex;` — preserve the stack trace (CA2200)
            }
        }
    }
    #endregion

    #region Modify next word (disabled)
    if (false)
    {
        var nextWord = rawFeature.ElementAt((int)Event.Field.nextWord);
        if (nextWord.Equals("##") || nextWord.Equals(".") || nextWord.Equals("!") || nextWord.Equals("?") || nextWord.Equals(";"))
        {
            rawFeature[(int)Event.Field.nextWord] = "NULL";
            rawFeature[(int)Event.Field.nextWordStemmed] = "NULL";
            rawFeature[(int)Event.Field.nextWordTag] = "NULL";
            rawFeature[(int)Event.Field.nextWordID] = "100";
            rawFeature[(int)Event.Field.nextWordShape] = "NULL";
        }
        else if (!nextWord.Equals("'s") && regex.IsMatch(nextWord))
        {
            if (pairs == null)
            {
                var pos = PosTaggerPool.GetPosTagger();
                try
                {
                    pairs = pos.TagString(context);
                    pair = GetIndexOfMention(pairs, mention);
                    // Return after GetIndexOfMention so a throw there cannot
                    // cause a double return (original returned in both paths).
                    PosTaggerPool.ReturnPosTagger(pos);
                }
                catch (Exception)
                {
                    PosTaggerPool.ReturnPosTagger(pos);
                    throw; // was `throw ex;` (CA2200)
                }
            }
            var index = pair.second + 1;
            while (index < pairs.Count)
            {
                if (pairs[index].first.Equals("##") || pairs[index].first.Equals(".") || pairs[index].first.Equals("!") || pairs[index].first.Equals("?") || pairs[index].first.Equals(";"))
                {
                    index = pairs.Count;
                    break;
                }
                else if (!pairs[index].first.Equals("'s") && regex.IsMatch(pairs[index].first))
                {
                    index++;
                }
                else
                {
                    break;
                }
            }
            if (index < pairs.Count)
            {
                var word = pairs.ElementAt(index).first;
                var posTag = pairs.ElementAt(index).second;
                var wordStemmed = Generalizer.Generalize(word);
                var ID = DataCenter.GetWordClusterID(word).ToString(); // id should use original surface
                var shape = GetWordShape(word);
                rawFeature[(int)Event.Field.nextWord] = word;
                rawFeature[(int)Event.Field.nextWordStemmed] = wordStemmed;
                rawFeature[(int)Event.Field.nextWordTag] = posTag;
                rawFeature[(int)Event.Field.nextWordID] = ID;
                rawFeature[(int)Event.Field.nextWordShape] = shape;
            }
            else
            {
                rawFeature[(int)Event.Field.nextWord] = "NULL";
                rawFeature[(int)Event.Field.nextWordStemmed] = "NULL";
                rawFeature[(int)Event.Field.nextWordTag] = "NULL";
                rawFeature[(int)Event.Field.nextWordID] = "100";
                rawFeature[(int)Event.Field.nextWordShape] = "NULL";
            }
        }
    }
    #endregion

    #region Modify mention ID
    if (true)
    {
        var mentionID = int.Parse(rawFeature.ElementAt((int)Event.Field.mentionID));
        var mentionClusterNum = DataCenter.GetMentionClusterNumber();
        if (mentionID == mentionClusterNum)
        {
            // Out-of-range id: resolve the mention against the cluster table again.
            mentionID = DataCenter.GetMentionClusterID(mention);
            rawFeature[(int)Event.Field.mentionID] = mentionID.ToString();
        }
    }
    #endregion

    #region Key words (disabled)
    if (false)
    {
        var keyWords = DataCenter.ExtractKeyWords(context);
        rawFeature[(int)Event.Field.sentenceContext] = string.Join(",", keyWords);
        rawFeature.Add(context);
    }
    #endregion

    return rawFeature;
}
/// <summary>
/// Converts a clustered raw feature row (see the companion
/// <c>ExtractFeature(Instance)</c> output) into a sparse numeric feature
/// vector of "index:value" entries.  Each field occupies its own slice of
/// the feature space, tracked by the running <c>offset</c>; slot 0 of the
/// result is overwritten with the total feature dimension at the end.
/// Dictionary and topic features are still TODO.
/// </summary>
/// <exception cref="Exception">
/// Thrown when one of the comma-separated mention fields is missing.
/// </exception>
public List<string> ExtractFeature(Event e)
{
    this.feature.Clear();
    this.offset = 0;
    var rawFeature = e.Feature;
    // Placeholder for the feature dimension, filled in at the end.
    feature.Add("0");

    #region last word
    {
        AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.lastWordStemmed),
                              rawFeature.ElementAt((int)Event.Field.lastWordID),
                              rawFeature.ElementAt((int)Event.Field.lastWordShape),
                              rawFeature.ElementAt((int)Event.Field.lastWordTag));
    }
    #endregion

    #region next word
    {
        AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.nextWordStemmed),
                              rawFeature.ElementAt((int)Event.Field.nextWordID),
                              rawFeature.ElementAt((int)Event.Field.nextWordShape),
                              rawFeature.ElementAt((int)Event.Field.nextWordTag));
    }
    #endregion

    #region mention head
    {
        AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.mentionHeadStemmed),
                              rawFeature.ElementAt((int)Event.Field.mentionHeadID),
                              rawFeature.ElementAt((int)Event.Field.mentionHeadShape),
                              rawFeature.ElementAt((int)Event.Field.mentionHeadTag));
    }
    #endregion

    #region mention driver
    {
        AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.mentionDriverStemmed),
                              rawFeature.ElementAt((int)Event.Field.mentionDriverID),
                              rawFeature.ElementAt((int)Event.Field.mentionDriverShape),
                              rawFeature.ElementAt((int)Event.Field.mentionDriverTag));
    }
    #endregion

    #region mention adjective modifier
    {
        AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.mentionAdjModifierStemmed),
                              rawFeature.ElementAt((int)Event.Field.mentionAdjModifierID),
                              rawFeature.ElementAt((int)Event.Field.mentionAdjModifierShape),
                              rawFeature.ElementAt((int)Event.Field.mentionAdjModifierTag));
    }
    #endregion

    #region mention action
    {
        AddWordFieldToFeature(rawFeature.ElementAt((int)Event.Field.mentionActionStemmed),
                              rawFeature.ElementAt((int)Event.Field.mentionActionID),
                              rawFeature.ElementAt((int)Event.Field.mentionActionShape),
                              rawFeature.ElementAt((int)Event.Field.mentionActionTag));
    }
    #endregion

    #region mention words
    {
        string[] words = null;
        try
        {
            words = rawFeature.ElementAt((int)Event.Field.mentionSurfacesStemmed).Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
        }
        catch (Exception)
        {
            throw new Exception("Mention words is null");
        }
        string[] IDs = null;
        try
        {
            IDs = rawFeature.ElementAt((int)Event.Field.mentionIDs).Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
        }
        catch (Exception)
        {
            throw new Exception("Mention ids is null");
        }
        string[] shapes = null;
        try
        {
            shapes = rawFeature.ElementAt((int)Event.Field.mentionShapes).Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
        }
        catch (Exception)
        {
            // Typo "shpaes" in the original message fixed.
            throw new Exception("Mention shapes is null");
        }
        string[] tags = null;
        try
        {
            tags = rawFeature.ElementAt((int)Event.Field.mentionTags).Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
        }
        catch (Exception)
        {
            throw new Exception("Mention tags is null");
        }
        // Bag-of-items counts for surfaces, cluster ids, shapes and POS tags.
        // Each table advances offset by its size + 1 (the extra slot is
        // presumably reserved for out-of-table items — confirm).
        AddCountedFeatures(words.Select(w => offset + DataCenter.GetWordIndex(w)));
        offset += DataCenter.GetWordTableSize() + 1;
        AddCountedFeatures(IDs.Select(id => offset + int.Parse(id)));
        offset += DataCenter.GetClusterNumber() + 1;
        AddCountedFeatures(shapes.Select(shape => offset + DataCenter.GetWordShapeIndex(shape)));
        offset += DataCenter.GetWordShapeTableSize() + 1;
        AddCountedFeatures(tags.Select(tag => offset + DataCenter.GetPosTagIndex(tag)));
        offset += DataCenter.GetPosTagTableSize() + 1;
    }
    #endregion

    #region mention cluster id
    {
        var mentionID = int.Parse(rawFeature.ElementAt((int)Event.Field.mentionID));
        feature.Add((offset + mentionID) + ":1");
        offset += DataCenter.GetMentionClusterNumber() + 1;
    }
    #endregion

    #region mention length: 1,2,3,4 or longer than 5
    {
        var length = int.Parse(rawFeature.ElementAt((int)Event.Field.mentionLength));
        if (length > 5)
        {
            length = 5; // lengths beyond 5 share a single bucket
        }
        feature.Add((offset + length - 1) + ":1");
        offset += 5;
    }
    #endregion

    #region Stanford Ner system
    {
        var stanfordNerType = rawFeature.ElementAt((int)Event.Field.stanfordNerType);
        var index = DataCenter.GetStanfordTypeIndex(stanfordNerType);
        feature.Add((offset + index) + ":1");
        offset += DataCenter.GetStanfordNerNumber() + 1;
    }
    #endregion

    #region OpenNLP Ner system
    {
        var openNLPNerType = rawFeature.ElementAt((int)Event.Field.opennlpNerType);
        var index = DataCenter.GetOpenNLPTypeIndex(openNLPNerType);
        feature.Add((offset + index) + ":1");
        offset += DataCenter.GetOpenNLPNerNumber() + 1;
    }
    #endregion

    #region DBpedia types
    {
        var types = rawFeature.ElementAt((int)Event.Field.dbpediaTypes).Split(',');
        var list = new List<int>();
        foreach (var type in types)
        {
            list.Add(DataCenter.GetDBpediaTypeIndex(type));
        }
        list.Sort();
        foreach (var index in list)
        {
            feature.Add((offset + index) + ":1");
        }
        offset += DataCenter.GetDBpediaTypeNum(); // the index of typeNum will never occur
    }
    #endregion

    #region Key words
    {
        var keywords = rawFeature.ElementAt((int)Event.Field.keyWords).Split(',');
        var list = new List<int>();
        foreach (var word in keywords)
        {
            list.Add(offset + DataCenter.GetKeyWordIndex(word));
        }
        list.Sort();
        foreach (var index in list)
        {
            feature.Add(index + ":1");
        }
        offset += DataCenter.GetKeyWordNumber();
    }
    #endregion

    #region TODO: topic
    {
    }
    #endregion

    #region TODO: dictionary
    {
    }
    #endregion

    // Set slot 0 to the final feature dimension.
    feature[0] = FeatureDimension.ToString();
    return feature;
}

/// <summary>
/// Counts duplicate feature indices and appends one "index:count" entry per
/// distinct index to <c>feature</c>, in ascending index order.
/// </summary>
private void AddCountedFeatures(IEnumerable<int> indices)
{
    var counts = new Dictionary<int, int>();
    foreach (var index in indices)
    {
        int value;
        counts.TryGetValue(index, out value);
        counts[index] = value + 1;
    }
    var sortedKeys = counts.Keys.ToList();
    sortedKeys.Sort();
    foreach (var key in sortedKeys)
    {
        feature.Add(key + ":" + counts[key]);
    }
}
// NOTE(review): two large fully commented-out statistics helpers
// (StatisticRoundTokenInformation and StatisticWithinTokenInfomation,
// roughly 200 lines of dead code) were removed here per standard practice
// of not keeping commented-out code in source; recover them from version
// control if they are ever needed again.

/// <summary>
/// Refreshes the stem dictionary cached in <see cref="DataCenter"/>.
/// </summary>
public static void Refresh()
{
    DataCenter.RefreshStemDic();
}
/// <summary>
/// Stems a word via the shared stem dictionary.
/// </summary>
/// <param name="word">Original word; may be null.</param>
/// <returns>
/// The stemmed word, or null when <paramref name="word"/> is null.
/// </returns>
protected string StemWord(string word)
{
    if (word == null)
    {
        return null;
    }
    return DataCenter.GetStemmedWord(word);
}