//convert number-based list to letter-based list
//Each entry of tagList is split on Global.commaAry into integer tag indices; every
//index is replaced by tagMap[index] and the pieces are re-joined with ",".
//tagList is rewritten in place, and only after every entry converted successfully,
//so a failure leaves the caller's list untouched.
//Throws KeyNotFoundException when an index has no entry in tagMap; int.Parse
//throws on a piece that is not a valid integer.
static void getNewTagList(baseHashMap<int, string> tagMap, ref List<string> tagList)
{
    List<string> converted = new List<string>(tagList.Count);
    foreach (string entry in tagList)
    {
        string[] tagAry = entry.Split(Global.commaAry, StringSplitOptions.RemoveEmptyEntries);
        for (int i = 0; i < tagAry.Length; i++)
        {
            int index = int.Parse(tagAry[i]);
            if (!tagMap.ContainsKey(index))
            {
                //name the offending index and entry so the bad input can be located
                throw new KeyNotFoundException(
                    "tag index " + index + " (in entry \"" + entry + "\") is not present in the tag map");
            }
            tagAry[i] = tagMap[index];
        }
        converted.Add(string.Join(",", tagAry));
    }

    tagList.Clear();
    tagList.AddRange(converted);
}
//Reads a feature file and builds the feature index and the tag index.
//Each non-blank line is split on Global.blankAry: token 0 is ignored here, the
//last token is the tag, and the tokens in between are features (a lone "/" means
//"no feature at this position"). A feature is keyed as "<position>.<name>", where
//<name> is the part of the token before the first Global.slashAry separator.
//Results are stored in featureIndexMap / tagIndexMap and also written to
//"featureIndex.txt" and "tagIndex.txt" (indices start at 0).
//When Global.regMode == "GL", Global.groupStart / Global.groupEnd are rebuilt too,
//and an Exception is thrown if the number of groups differs from the feature
//count of the last line read.
//If the file does not exist a message is printed and nothing else happens.
public void getMaps(string file)
{
    if (!File.Exists(file))
    {
        Console.WriteLine("file {0} no exist!", file);
        return;
    }
    Console.WriteLine("file {0} converting...", file);

    baseHashMap<string, int> featureFreqMap = new baseHashMap<string, int>();
    baseHashSet<string> tagSet = new baseHashSet<string>();

    //get feature-freq info and tagset
    int nFeatTemp = 0;
    //using: the reader is released even if parsing throws
    using (StreamReader sr = new StreamReader(file))
    {
        while (!sr.EndOfStream)
        {
            string line = sr.ReadLine();
            line = line.Replace("\t", " ");
            line = line.Replace("\r", "");
            if (line == "")
            {
                continue;
            }

            string[] ary = line.Split(Global.blankAry, StringSplitOptions.RemoveEmptyEntries);
            if (ary.Length == 0)
            {
                //whitespace-only line: treat it like an empty line instead of
                //indexing past the end of an empty token array
                continue;
            }

            nFeatTemp = ary.Length - 2;
            for (int i = 1; i < ary.Length - 1; i++)
            {
                if (ary[i] == "/")//no feature here
                {
                    continue;
                }
                string[] ary2 = ary[i].Split(Global.slashAry, StringSplitOptions.RemoveEmptyEntries);//for real-value features
                string feature = i.ToString() + "." + ary2[0];
                if (featureFreqMap.ContainsKey(feature) == false)
                {
                    featureFreqMap[feature] = 1;
                }
                else
                {
                    featureFreqMap[feature]++;
                }
            }

            string tag = ary[ary.Length - 1];
            tagSet.Add(tag);
        }
    }

    //sort features; each entry is "<feature> <freq>"
    List<string> sortList = new List<string>();
    foreach (baseHashMap<string, int>.KeyValuePair kv in featureFreqMap)
    {
        sortList.Add(kv.Key + " " + kv.Value);
    }

    if (Global.regMode == "GL")//sort based on feature templates
    {
        sortList.Sort(listSortFunc.compareKV_key);
        Global.groupStart = new List<int>();
        Global.groupEnd = new List<int>();
        Global.groupStart.Add(0);
        //a new group starts wherever the text before the first Global.dotAry
        //separator (the position prefix of the feature key) changes
        for (int k = 1; k < sortList.Count; k++)
        {
            string[] thisAry = sortList[k].Split(Global.dotAry, StringSplitOptions.RemoveEmptyEntries);
            string[] preAry = sortList[k - 1].Split(Global.dotAry, StringSplitOptions.RemoveEmptyEntries);
            string str = thisAry[0], preStr = preAry[0];
            if (str != preStr)
            {
                Global.groupStart.Add(k);
                Global.groupEnd.Add(k);
            }
        }
        Global.groupEnd.Add(sortList.Count);

        if (nFeatTemp != Global.groupStart.Count)
        {
            throw new Exception("inconsistent # of features per line, check the feature file for consistency!");
        }
    }
    else//sort based on feature frequency
    {
        //sort feature based on freq, for 1)compress .txt file 2)better edge features
        sortList.Sort(listSortFunc.compareKV_value);
        sortList.Reverse();
    }

    //feature index should begin from 0
    using (StreamWriter swFeat = new StreamWriter("featureIndex.txt"))
    {
        for (int i = 0; i < sortList.Count; i++)
        {
            string[] ary = sortList[i].Split(Global.blankAry);
            featureIndexMap[ary[0]] = i;
            swFeat.WriteLine("{0} {1}", ary[0], i);
        }
    }

    //label index should begin from 0
    List<string> tagSortList = new List<string>();
    foreach (string tag in tagSet)
    {
        tagSortList.Add(tag);
    }
    tagSortList.Sort();//sort tags

    using (StreamWriter swTag = new StreamWriter("tagIndex.txt"))
    {
        for (int i = 0; i < tagSortList.Count; i++)
        {
            tagIndexMap[tagSortList[i]] = i;
            swTag.WriteLine("{0} {1}", tagSortList[i], i);
        }
    }
}
//for mira
//Updates the weight vector w (and the accumulator accumW) in place from one sample.
//  x          : the sequence; x.Count positions are visited
//  outStates  : the output state per position
//  goldStates : the gold state per position
//  diff       : the update targets w*a == sqrt(diff)
//  nSamples,k : each change to w[f] is added to accumW[f] scaled by (nSamples - k)
//Steps: build the sparse vector a (gold feature values minus output feature values),
//compute scale = (sqrt(diff) - w*a) / ||a||^2, then add scale * a to w.
//NOTE(review): "a[f] -= fv" on a key not yet stored relies on baseHashMap's indexer
//yielding a default value for missing keys — presumably 0; confirm against baseHashMap.
void updateWeights(dataSeq x, List<int> outStates, List<int> goldStates, float[] w, float[] accumW, int nSamples, int k, double diff)
{
    float t = nSamples - k;

    //get a_t = F(y*) - F(y)
    baseHashMap<int, double> a = new baseHashMap<int, double>();
    for (int n = 0; n < x.Count; n++)
    {
        int outState = outStates[n];
        int goldState = goldStates[n];
        List<featureTemp> fList = _fGene.getFeatureTemp(x, n);

        //node feature
        foreach (featureTemp im in fList)
        {
            double fv = im.val;
            foreach (nodeFeature feat in Global.idNodeFeatures[im.id])
            {
                int s = feat._s;
                int f = feat._id;
                if (s == outState)
                {
                    a[f] -= fv;
                }
                if (s == goldState)
                {
                    a[f] += fv;
                }
            }
        }

        //edge feature
        if (n > 0)
        {
            int outPre = outStates[n - 1];
            int goldPre = goldStates[n - 1];

            //non-rich
            if (Global.useTraditionalEdge)
            {
                int f = _fGene.getEdgeFeatID(outPre, outState);
                a[f]--;
                f = _fGene.getEdgeFeatID(goldPre, goldState);
                a[f]++;
            }

            //rich
            foreach (featureTemp im in fList)
            {
                double fv = im.val;
                foreach (edgeFeature feat in Global.idEdgeFeatures[im.id])
                {
                    int s = feat._s;
                    int sPre = feat._sPre;
                    int f = feat._id;
                    if (sPre == outPre && s == outState)
                    {
                        a[f] -= fv;
                    }
                    if (sPre == goldPre && s == goldState)
                    {
                        a[f] += fv;
                    }
                }
            }

            //rich2: same as above, but with the feature templates of position n - 1
            if (Global.richFeat2)
            {
                fList = _fGene.getFeatureTemp(x, n - 1);
                foreach (featureTemp im in fList)
                {
                    double fv = im.val;
                    foreach (edgeFeature feat in Global.idEdgeFeatures2[im.id])
                    {
                        int s = feat._s;
                        int sPre = feat._sPre;
                        int f = feat._id;
                        if (sPre == outPre && s == outState)
                        {
                            a[f] -= fv;
                        }
                        if (sPre == goldPre && s == goldState)
                        {
                            a[f] += fv;
                        }
                    }
                }
            }
        }
    }

    //compute w*a, ||a||^2
    double wa = 0, norm = 0;
    foreach (baseHashMap<int, double>.KeyValuePair kv in a)
    {
        wa += w[kv.Key] * kv.Value;
        norm += kv.Value * kv.Value;
    }

    //norm is a sum of squares, so an exact 0 means every stored value of a is 0 and
    //the update is zero. Dividing by it would give NaN/Infinity, and Infinity * 0 is
    //NaN, which would then be written into w and accumW.
    if (norm == 0.0)
    {
        return;
    }

    //compute the scalar
    double scale = (Math.Sqrt(diff) - wa) / norm;

    //compute w_{t+1}
    foreach (baseHashMap<int, double>.KeyValuePair kv in a)
    {
        int f = kv.Key;
        float val = (float)(scale * kv.Value);
        w[f] += val;
        accumW[f] += t * val;
    }
}