/// <summary>
/// Rebuilds the shared <c>featureSet</c> from the data loaded for <paramref name="fname"/>.
/// </summary>
/// <remarks>
/// The set is cleared first. Every feature returned by <c>getNodeFeatures</c> (except the
/// placeholder "/") is counted under its id, i.e. the first element obtained by splitting the
/// feature on <c>Global.slashAry</c>. Only ids whose count is strictly greater than
/// <c>Global.featureTrim</c> are added to <c>featureSet</c>.
/// Per the original author's note, this is done for training input only; test input does not need it.
/// </remarks>
/// <param name="fname">Name handed to <c>normalize</c>, which fills the word and tag sequence lists.</param>
public static void getFeatureSet(string fname)
{
    featureSet.Clear();
    Console.Error.WriteLine("getting feature set...");

    List<string> sentences = new List<string>();
    List<string> tagSequences = new List<string>();
    normalize(fname, sentences, tagSequences);

    // Pass 1: count occurrences of every feature id over all positions of all sentences.
    baseHashMap<string, int> idCounts = new baseHashMap<string, int>();
    foreach (string sentence in sentences)
    {
        string[] tokens = sentence.Split(Global.blankAry);
        for (int pos = 0; pos < tokens.Length; pos++)
        {
            List<string> features = new List<string>();
            getNodeFeatures(pos, tokens, ref features);

            foreach (string feature in features)
            {
                // "/" is a placeholder entry and carries no feature id.
                if (feature != "/")
                {
                    string id = feature.Split(Global.slashAry)[0];
                    // NOTE(review): relies on baseHashMap's indexer coping with ids that are
                    // not present yet (defined elsewhere) - confirm.
                    idCounts[id]++;
                }
            }
        }
    }

    // Pass 2: keep only the ids that occur more often than the trim threshold.
    foreach (baseHashMap<string, int>.KeyValuePair entry in idCounts)
    {
        if (entry.Value > Global.featureTrim)
        {
            featureSet.Add(entry.Key);
        }
    }
}
/// <summary>
/// Writes one line per token to <paramref name="file"/> in the form
/// "word feature... tag", with an empty line after every sentence, and then prints the
/// sentence-length distribution to the console.
/// </summary>
/// <remarks>
/// A feature is written verbatim only when its id (the first element of the feature split on
/// <c>Global.slashAry</c>) is contained in <c>featureSet</c>; otherwise, and for the placeholder
/// "/", the text "/ " is written instead. Progress is reported roughly every tenth of the input.
/// </remarks>
/// <param name="wordSeqList">Sentences; each entry is split into tokens on <c>Global.blankAry</c>.</param>
/// <param name="tagSeqList">
/// Tag sequences, indexed in parallel with <paramref name="wordSeqList"/>. Entry <c>i</c> is indexed
/// once per token of sentence <c>i</c>, so it must provide at least as many tags as there are tokens.
/// </param>
/// <param name="file">Path of the output file to create.</param>
public static void writeFeaturesTag(List<string> wordSeqList, List<string> tagSeqList, string file)
{
    // Sentence length (in tokens) -> number of sentences with that length.
    baseHashMap<int, int> lengthCountMap = new baseHashMap<int, int>();

    int sentenceCount = wordSeqList.Count;

    // Clamp to 1: for fewer than 10 sentences the integer division yields 0 and
    // "i % 0" would throw DivideByZeroException on the first iteration.
    int interval = Math.Max(1, sentenceCount / 10);

    // "using" guarantees the file is flushed and closed even if writing fails part-way.
    using (StreamWriter swFeatureFile = new StreamWriter(file))
    {
        for (int i = 0; i < sentenceCount; i++)
        {
            if (i % interval == 0)
            {
                double percent = (double)i / (double)sentenceCount * 100.0;
                Console.WriteLine("{0}: sentence #{1} --> {2}%", file, i, percent.ToString("f2"));
            }

            string[] wordAry = wordSeqList[i].Split(Global.blankAry);
            string[] tagAry = tagSeqList[i].Split(Global.blankAry);

            lengthCountMap[wordAry.Length]++;

            for (int k = 0; k < wordAry.Length; k++)
            {
                List<string> nodeFeatures = new List<string>();
                getNodeFeatures(k, wordAry, ref nodeFeatures);

                // word
                swFeatureFile.Write(wordAry[k] + " ");

                // features: anything not in featureSet is replaced by the "/" placeholder
                foreach (string f in nodeFeatures)
                {
                    bool keep = f != "/" && featureSet.Contains(f.Split(Global.slashAry)[0]);
                    swFeatureFile.Write(keep ? f + " " : "/ ");
                }

                // tag
                swFeatureFile.Write(tagAry[k]);
                swFeatureFile.WriteLine();
            }

            // Empty line separates sentences.
            swFeatureFile.WriteLine();
        }
    }

    // Report the length distribution, ordered by ListSortFunc.compareKV_key.
    List<string> distributionLines = new List<string>();
    foreach (baseHashMap<int, int>.KeyValuePair kv in lengthCountMap)
    {
        double share = (double)kv.Value / (double)sentenceCount * 100.0;
        distributionLines.Add(string.Format("{0} count:{1} --> {2}%", kv.Key, kv.Value, share.ToString("f2")));
    }
    distributionLines.Sort(ListSortFunc.compareKV_key);

    Console.WriteLine("length distribution:");
    foreach (string line in distributionLines)
    {
        Console.WriteLine(line);
    }
}