//Decodes X with the current model, writing per-sequence tag output to outputfile,
//and returns the score list for the metric selected by Global.evalMetric.
//Throws Exception("error") on an unrecognized metric name.
public List <double> test(dataSet X, string outputfile)
{
    string outfile = outputfile;
    Global.swOutput = new StreamWriter(outfile);
    List <double> scoreList;
    try
    {
        if (Global.evalMetric == "tok.acc")
        {
            scoreList = decode_tokAcc(X, _model);
        }
        else if (Global.evalMetric == "str.acc")
        {
            scoreList = decode_strAcc(X, _model);
            decode_tokAcc(X, _model);//this is only for record accuracy info on trainLog, this is useful in t-test
        }
        else if (Global.evalMetric == "f1")
        {
            scoreList = decode_fscore(X, _model);
            decode_tokAcc(X, _model);
        }
        else
        {
            throw new Exception("error");
        }
    }
    finally
    {
        //BUG FIX: close (and flush) the writer even when decoding throws;
        //previously an exception leaked the StreamWriter and kept the file handle open.
        Global.swOutput.Close();
    }
    return (scoreList);
}
//Splits X by relative position: items whose index falls in [v1, v2) of the data
//(v1, v2 are fractions in [0, 1]) go to X1, everything else goes to X2.
public static void dataSplit(dataSet X, double v1, double v2, dataSet X1, dataSet X2)
{
    if (v2 < v1)
    {
        throw new Exception("error");
    }
    X1.Clear();
    X2.Clear();
    X1.setDataInfo(X);
    X2.setDataInfo(X);
    int lo = (int)(X.Count * v1);
    int hi = (int)(X.Count * v2);
    for (int k = 0; k < X.Count; k++)
    {
        bool inWindow = k >= lo && k < hi;
        (inWindow ? X1 : X2).Add(X[k]);
    }
}
//Loads the test set from disk, restores the saved model/features via the
//toolbox test constructor, and runs decoding/evaluation over the whole set.
static void test()
{
    Global.swLog.WriteLine("reading test data...");
    dataSet XX = new dataSet(Global.fFeatureTest, Global.fGoldTest);
    Console.WriteLine("test data size: {0}", XX.Count);
    Global.swLog.WriteLine("Done! test data size: {0}", XX.Count);
    //load model & feature files for testing (train == false)
    toolbox tb = new toolbox(XX, false);
    List <double> scoreList = tb.test(XX, 0);
}
//Decodes X with the current model, writing tag output to the default output
//file (Global.outDir + Global.fOutput), and returns the metric's score list.
//The iter parameter is accepted for the per-iteration call sites but unused here.
public List <double> test(dataSet X, double iter)
{
    string outfile = Global.outDir + Global.fOutput;
    Global.swOutput = new StreamWriter(outfile);
    List <double> scoreList;
    try
    {
        if (Global.evalMetric == "tok.acc")
        {
            scoreList = decode_tokAcc(X, _model);
        }
        else if (Global.evalMetric == "str.acc")
        {
            scoreList = decode_strAcc(X, _model);
        }
        else if (Global.evalMetric == "f1")
        {
            scoreList = decode_fscore(X, _model);
        }
        else
        {
            throw new Exception("error");
        }
    }
    finally
    {
        //BUG FIX: close the writer even when decoding throws; previously an
        //exception leaked the StreamWriter and left the output file locked.
        Global.swOutput.Close();
    }
    return (scoreList);
}
//compute grad of: sum{-log{P(y*|x,w)}} + R(w)
//Accumulates the CRF gradient for every sequence in X into g and returns the
//total objective value, adding the L2 penalty term when Global.reg != 0.
public double getGrad_BFGS(List <double> g, model m, dataSet X)
{
    double error = 0;
    int nFeature = _fGene.NCompleteFeature;
    foreach (dataSeq x in X)
    {
        error += getGradCRF(g, m, x, null);
    }
    if (Global.reg != 0.0)
    {
        //merged the two previously duplicated (Global.reg != 0.0) blocks and
        //hoisted the loop-invariant sigma^2 out of the per-feature loop
        double reg2 = Global.reg * Global.reg;
        for (int f = 0; f < nFeature; f++)
        {
            g[f] += m.W[f] / reg2;
        }
        float[] tmpWeights = m.W;
        double sum = arrayTool.squareSum(tmpWeights);
        error += sum / (2.0 * reg2);
    }
    return (error);
}
//Returns a new dataSet holding this set's sequences in a random order;
//the receiver itself is left unmodified.
public dataSet randomShuffle()
{
    dataSet shuffled = new dataSet(this.NTag, this.NFeature);
    foreach (int idx in randomTool <int> .getShuffledIndexList(this.Count))
    {
        shuffled.Add(this[idx]);
    }
    return (shuffled);
}
//Runs Global.nCV-fold cross validation over the training data, once for each
//regularizer value in Global.regList; per-fold results go to the logs and
//(optionally) to the raw result writer.
static void crossValidation()
{
    //load data
    Console.WriteLine("reading cross validation data...");
    Global.swLog.WriteLine("reading cross validation data...");
    List <dataSet> XList = new List <dataSet>();   //per-fold training portions
    List <dataSet> XXList = new List <dataSet>();  //per-fold held-out portions
    loadDataForCV(XList, XXList);
    //start cross validation
    foreach (double r in Global.regList)//do CV for each different regularizer r (sigma)
    {
        Global.swLog.WriteLine("\ncross validation. r={0}", r);
        Console.WriteLine("\ncross validation. r={0}", r);
        if (Global.rawResWrite)
        {
            Global.swResRaw.WriteLine("% cross validation. r={0}", r);
        }
        for (int i = 0; i < Global.nCV; i++)
        {
            Global.swLog.WriteLine("\n#validation={0}", i + 1);
            Console.WriteLine("\n#validation={0}", i + 1);
            if (Global.rawResWrite)
            {
                Global.swResRaw.WriteLine("% #validation={0}", i + 1);
            }
            Global.reg = r;  //basicTrain() reads the regularizer from Global
            dataSet Xi = XList[i];
            //"rich" run modes train with rich-edge features (toolboxRich)
            if (Global.runMode.Contains("rich"))
            {
                toolboxRich tb = new toolboxRich(Xi);
                basicTrain(XXList[i], tb);
            }
            else
            {
                toolbox tb = new toolbox(Xi);
                basicTrain(XXList[i], tb);
            }
            resSummarize.write();
            if (Global.rawResWrite)
            {
                Global.swResRaw.WriteLine();
            }
        }
        if (Global.rawResWrite)
        {
            Global.swResRaw.WriteLine();
        }
    }
}
//for train & test
//Sizes the complete feature space: one node feature per (template, tag) pair
//plus one edge feature per (tag, tag) transition; _backoff1 marks where the
//edge-feature region begins.
public featureGenerator(dataSet X)
{
    _nFeatureTemp = X.NFeature;
    _nTag = X.NTag;
    Global.swLog.WriteLine("feature templates: {0}", _nFeatureTemp);
    int nodeCount = _nFeatureTemp * _nTag;
    int edgeCount = _nTag * _nTag;
    _backoff1 = nodeCount;
    _nCompleteFeature = nodeCount + edgeCount;
    Global.swLog.WriteLine("complete features: {0}", _nCompleteFeature);
}
//Loads train/test data (or tune-splits the training file when in "tune" mode),
//then trains once per regularizer in Global.regList, summarizing results and
//optionally saving the model. Returns the last score produced by basicTrain().
static double train()
{
    //if (Global.formatConvert)
    //{
    // dataFormat df = new dataFormat();
    // df.convert();
    //}
    //load data
    Console.WriteLine("\nreading training & test data...");
    Global.swLog.WriteLine("\nreading training & test data...");
    dataSet X, XX;
    if (Global.runMode.Contains("tune"))//put "tune" related code here because train() could be sub-function of tune()
    {
        //tune mode: carve the test portion out of the training file itself
        dataSet origX = new dataSet(Global.fFeatureTrain, Global.fGoldTrain);
        X = new dataSet();
        XX = new dataSet();
        dataSplit(origX, Global.tuneSplit, X, XX);
    }
    else
    {
        X = new dataSet(Global.fFeatureTrain, Global.fGoldTrain);
        XX = new dataSet(Global.fFeatureTest, Global.fGoldTest);
        dataSizeScale(X);  //rescale training-set size per Global.trainSizeScale
    }
    Console.WriteLine("done! train/test data sizes: {0}/{1}", X.Count, XX.Count);
    Global.swLog.WriteLine("done! train/test data sizes: {0}/{1}", X.Count, XX.Count);
    double score = 0;
    //start training
    foreach (double r in Global.regList)//train on different r (sigma)
    {
        Global.reg = r;
        Global.swLog.WriteLine("\nr: " + r.ToString());
        Console.WriteLine("\nr: " + r.ToString());
        if (Global.rawResWrite)
        {
            Global.swResRaw.WriteLine("\n%r: " + r.ToString());
        }
        toolbox tb = new toolbox(X, true);
        score = basicTrain(XX, tb);
        resSummarize.write();//summarize the results & output the summarized results
        if (Global.save == 1)
        {
            tb.Model.save(Global.fModel);//save model as a .txt file
        }
    }
    return (score);
}
//f-score
//Decodes X (multi-threaded), writes tag output if a writer is open, and builds
//comma-joined gold/output tag strings for chunk-based f-score computation.
//Returns [f-score, precision, recall] in train mode, an empty list otherwise.
public List <double> decode_fscore(dataSet X, model m)
{
    //multi thread
    List <dataSeqTest> X2 = new List <dataSeqTest>();
    multiThreading(X, X2);
    List <string> goldTagList = new List <string>();
    List <string> resTagList = new List <string>();
    foreach (dataSeqTest x in X2)
    {
        //PERF FIX: StringBuilder instead of repeated string concatenation,
        //which was O(n^2) per sequence; output format is unchanged (trailing comma kept)
        var res = new System.Text.StringBuilder();
        foreach (int im in x._yOutput)
        {
            res.Append(im);
            res.Append(',');
        }
        resTagList.Add(res.ToString());
        //output tag results
        if (Global.swOutput != null)
        {
            for (int i = 0; i < x._yOutput.Count; i++)
            {
                Global.swOutput.Write(x._yOutput[i] + ",");
            }
            Global.swOutput.WriteLine();
        }
        List <int> goldTags = x._x.getTags();
        var gold = new System.Text.StringBuilder();
        foreach (int im in goldTags)
        {
            gold.Append(im);
            gold.Append(',');
        }
        goldTagList.Add(gold.ToString());
    }
    List <double> scoreList = new List <double>();
    if (Global.runMode == "train")
    {
        List <double> infoList = new List <double>();
        scoreList = fscore.getFscore(goldTagList, resTagList, infoList);
        Global.swLog.WriteLine("#gold-chunk={0} #output-chunk={1} #correct-output-chunk={2} precision={3} recall={4} f-score={5}", infoList[0], infoList[1], infoList[2], scoreList[1].ToString("f2"), scoreList[2].ToString("f2"), scoreList[0].ToString("f2"));
    }
    return (scoreList);
}
//Restores the saved rich-edge model and evaluates it on the test set.
//The training set is still loaded because the test-mode toolboxRich
//constructor builds its feature generator from a dataSet.
public static double test()
{
    dataSet X = new dataSet(Global.fFeatureTrain, Global.fGoldTrain);
    dataSet XX = new dataSet(Global.fFeatureTest, Global.fGoldTest);
    Global.swLog.WriteLine("data size (test): {0}", XX.Count);
    //load model for testing (train == false)
    toolboxRich tb = new toolboxRich(X, false);
    List <double> scores = tb.test(XX, 0);
    double topScore = scores[0];
    Global.scoreListList.Add(scores);
    resSummarize.write();
    return (topScore);
}
//this function can be called by train(), cv(), & richEdge.train()
//Runs one full training session with the given toolbox, evaluating on XTest.
//BFGS-style optimizers manage their own iterations inside tb.train(); other
//optimizers loop Global.ttlIter times here, testing after every iteration.
//Returns the final evaluation score (scoreList[0]).
public static double basicTrain(dataSet XTest, toolbox tb)
{
    Global.reinitGlobal();  //reset per-session counters/accumulators
    double score = 0;
    if (Global.modelOptimizer.EndsWith("bfgs"))
    {
        //BFGS path: tb.train() drives everything; it reads Global.tb/Global.XX
        //and appends its scores to Global.scoreListList
        Global.tb = tb;
        Global.XX = XTest;
        tb.train();
        score = Global.scoreListList[Global.scoreListList.Count - 1][0];
    }
    else
    {
        for (int i = 0; i < Global.ttlIter; i++)
        {
            Global.glbIter++;
            Stopwatch timer = new Stopwatch();
            timer.Start();
            double err = tb.train();  //one optimizer iteration; returns objective value
            timer.Stop();
            double time = timer.ElapsedMilliseconds / 1000.0;
            Global.timeList.Add(time);
            Global.errList.Add(err);
            Global.diffList.Add(Global.diff);
            //evaluate on the held-out set after each iteration
            List <double> scoreList = tb.test(XTest, i);
            score = scoreList[0];
            Global.scoreListList.Add(scoreList);
            Global.swLog.WriteLine("iter{0} diff={1} train-time(sec)={2} {3}={4}%", Global.glbIter, Global.diff.ToString("e2"), time.ToString("f2"), Global.metric, score.ToString("f2"));
            Global.swLog.WriteLine("------------------------------------------------");
            Global.swLog.Flush();
            Console.WriteLine("iter{0} diff={1} train-time(sec)={2} {3}={4}%", Global.glbIter, Global.diff.ToString("e2"), time.ToString("f2"), Global.metric, score.ToString("f2"));
            //convergence-based early stopping is intentionally disabled:
            //if (Global.diff < Global.convergeTol)
            //break;
        }
    }
    return (score);
}
//string accuracy
//Decodes X (multi-threaded) and returns [accuracy%], where a sequence counts
//as correct only when every predicted tag matches its gold tag.
public List <double> decode_strAcc(dataSet X, model m)
{
    double total = X.Count;
    double nCorrect = 0;
    //multi thread decoding
    List <dataSeqTest> decoded = new List <dataSeqTest>();
    multiThreading(X, decoded);
    foreach (dataSeqTest x in decoded)
    {
        //output tag results
        if (Global.swOutput != null)
        {
            for (int i = 0; i < x._x.Count; i++)
            {
                Global.swOutput.Write(x._yOutput[i].ToString() + ",");
            }
            Global.swOutput.WriteLine();
        }
        List <int> goldTags = x._x.getTags();
        bool allMatch = true;
        for (int i = 0; i < x._x.Count; i++)
        {
            if (goldTags[i] != x._yOutput[i])
            {
                allMatch = false;
                break;
            }
        }
        if (allMatch)
        {
            nCorrect++;
        }
    }
    double acc = nCorrect / total * 100.0;
    Global.swLog.WriteLine("total-tag-strings={0} correct-tag-strings={1} string-accuracy={2}%", total, nCorrect, acc);
    List <double> scoreList = new List <double>();
    scoreList.Add(acc);
    return (scoreList);
}
//Builds a model sized to the complete feature space; weights start at zero
//(Global.random == 0) or random floats (Global.random == 1).
public model(dataSet X, featureGenerator fGen)
{
    _nTag = X.NTag;
    switch (Global.random)
    {
        case 0:
            //default float value is 0, so this is zero initialization
            _w = new float[fGen.NCompleteFeature];
            break;
        case 1:
            _w = randomDoubleTool.getRandomList_float(fGen.NCompleteFeature).ToArray();
            break;
        default:
            throw new Exception("error");
    }
}
//Rich-edge variant of MainClass.train(): loads train/test data (or tune-splits
//the training file in "tune" mode), trains a toolboxRich once per regularizer
//in Global.regList, and returns the last score from basicTrain().
public static double train()
{
    //load data
    Console.WriteLine("\nreading training & test data...");
    Global.swLog.WriteLine("\nreading training & test data...");
    dataSet X, XX;
    if (Global.runMode.Contains("tune"))
    {
        //tune mode: carve the test portion out of the training file itself
        dataSet origX = new dataSet(Global.fFeatureTrain, Global.fGoldTrain);
        X = new dataSet();
        XX = new dataSet();
        MainClass.dataSplit(origX, Global.tuneSplit, X, XX);
    }
    else
    {
        X = new dataSet(Global.fFeatureTrain, Global.fGoldTrain);
        XX = new dataSet(Global.fFeatureTest, Global.fGoldTest);
        MainClass.dataSizeScale(X);  //rescale training-set size per Global.trainSizeScale
    }
    Global.swLog.WriteLine("data sizes (train, test): {0} {1}", X.Count, XX.Count);
    double score = 0;
    foreach (double r in Global.regList)
    {
        Global.reg = r;
        Global.swLog.WriteLine("\nr: " + r.ToString());
        Console.WriteLine("\nr: " + r.ToString());
        if (Global.rawResWrite)
        {
            Global.swResRaw.WriteLine("\n%r: " + r.ToString());
        }
        toolboxRich tb = new toolboxRich(X);
        score = MainClass.basicTrain(XX, tb);
        resSummarize.write();
        //save model
        if (Global.save == 1)
        {
            tb.Model.save(Global.fModel);
        }
    }
    return (score);
}
//Decodes X in parallel: wraps each sequence in a dataSeqTest (collected into
//X2), deals them round-robin to Global.nThread worker lists, and runs one
//taskRunner_test task per worker, blocking until all finish.
//NOTE(review): permanently shrinks Global.nThread when X is smaller than the
//configured thread count — this side effect persists across calls.
public void multiThreading(dataSet X, List <dataSeqTest> X2)
{
    if (X.Count < Global.nThread)
    {
        Global.nThread = X.Count;
    }
    //data for multi thread
    for (int i = 0; i < X.Count; i++)
    {
        X2.Add(new dataSeqTest(X[i], new List <int>()));
    }
    //one work list per thread, filled round-robin so the load is balanced
    Global.threadXX = new List <List <dataSeqTest> >();
    for (int i = 0; i < Global.nThread; i++)
    {
        Global.threadXX.Add(new List <dataSeqTest>());
    }
    for (int i = 0; i < X2.Count; i++)
    {
        int idx = i % Global.nThread;
        Global.threadXX[idx].Add(X2[i]);
    }
    Stopwatch timer = new Stopwatch();
    timer.Start();
    //multi thread: task i decodes the sequences in Global.threadXX[i]
    Task[] taskAry = new Task[Global.nThread];
    for (int i = 0; i < Global.nThread; i++)
    {
        taskAry[i] = new Task(taskRunner_test, i, TaskCreationOptions.PreferFairness);
        taskAry[i].Start();
    }
    Task.WaitAll(taskAry);
    timer.Stop();
    double time = timer.ElapsedMilliseconds / 1000.0;
    Global.swLog.WriteLine("**********test run time (sec): " + time.ToString());
}
//Wires up a rich-edge toolbox. In training mode the model is built fresh from
//the data and the optimizer is initialized; in test mode the model weights are
//restored from the saved model file instead.
public toolboxRich(dataSet X, bool train = true)
{
    _X = X;
    if (train)//for training
    {
        _fGene = new featureGeneRich(X);
        _model = new model(X, _fGene);
        _inf = new inferRich(this);
        _grad = new gradRich(this);
        initOptimizer();
    }
    else//for test
    {
        _model = new model(Global.fModel);
        _fGene = new featureGeneRich(X);
        _inf = new inferRich(this);
        _grad = new gradRich(this);
    }
}
//for training & test
//Sizes the rich-edge feature space: node features per (template, tag), basic
//edge features per (tag, tag), and rich edge features per (reduced template,
//tag, tag). _backoff1/_backoff2 mark the region boundaries.
public featureGeneRich(dataSet X)
{
    _nFeatureTemp = X.NFeature;
    _nFeatureTemp_richEdge = (int)(X.NFeature * Global.edgeReduce);
    this._nTag = X.NTag;
    int nNode = _nFeatureTemp * _nTag;
    int nEdgeBasic = _nTag * _nTag;
    int nEdgeRich = _nFeatureTemp_richEdge * _nTag * _nTag;
    _backoff1 = nNode;
    _backoff2 = nNode + nEdgeBasic;
    _nCompleteFeature = nNode + nEdgeBasic + nEdgeRich;
    Global.swLog.WriteLine("feature templates & rich-edge feature templates0: {0}, {1}", _nFeatureTemp, _nFeatureTemp_richEdge);
    Global.swLog.WriteLine("nNodeFeature, nEdgeFeature1, nEdgeFeature2: {0}, {1}, {2}", nNode, nEdgeBasic, nEdgeRich);
    Global.swLog.WriteLine("complete features: {0}", _nCompleteFeature);
    Global.swLog.WriteLine();
    Global.swLog.Flush();
}
//Wires up a toolbox. In training mode the model is built fresh from the data
//and the optimizer is initialized; in test mode the model weights are restored
//from the saved model file instead.
public toolbox(dataSet X, bool train = true)
{
    _X = X;
    if (train)//to train
    {
        _fGene = new featureGenerator(X);
        _model = new model(X, _fGene);
        _inf = new inference(this);
        _grad = new gradient(this);
        initOptimizer();
    }
    else//to test
    {
        _model = new model(Global.fModel);
        _fGene = new featureGenerator(X);
        _inf = new inference(this);
        _grad = new gradient(this);
    }
}
//Splits X at the fraction v: the first v of the data goes to X1, the remainder
//to X2. Both output sets are cleared and inherit X's tag/feature dimensions.
public static void dataSplit(dataSet X, double v, dataSet X1, dataSet X2)
{
    X1.Clear();
    X2.Clear();
    X1.setDataInfo(X);
    X2.setDataInfo(X);
    int cut = (int)(X.Count * v);
    for (int k = 0; k < X.Count; k++)
    {
        (k < cut ? X1 : X2).Add(X[k]);
    }
}
//Rescales X in place to Global.trainSizeScale times its size: a scale below 1
//truncates the data; a scale above 1 repeats examples cyclically.
public static void dataSizeScale(dataSet X)
{
    dataSet XX = new dataSet();
    XX.setDataInfo(X);
    foreach (dataSeq im in X)
    {
        XX.Add(im);
    }
    X.Clear();
    int n = (int)(XX.Count * Global.trainSizeScale);
    for (int i = 0; i < n; i++)
    {
        int j = i;
        if (j > XX.Count - 1)
        {
            //BUG FIX: wrap with modulo Count (was Count - 1), which skipped the
            //last example on every wrap-around and threw DivideByZeroException
            //when the data set had exactly one example
            j %= XX.Count;
        }
        X.Add(XX[j]);
    }
    X.setDataInfo(XX);
}
//Builds the cross-validation folds: for each of Global.nCV windows over the
//training data, the window becomes a held-out set (XXList) and the remainder
//becomes the corresponding training set (XList).
public static void loadDataForCV(List <dataSet> XList, List <dataSet> XXList)
{
    XList.Clear();
    XXList.Clear();
    //load train data only: CV is based only on training data
    dataSet X = new dataSet(Global.fFeatureTrain, Global.fGoldTrain);
    double step = 1.0 / Global.nCV;
    for (double lo = 0; lo < 1; lo += step)
    {
        dataSet heldOut = new dataSet();
        dataSet rest = new dataSet();
        dataSplit(X, lo, lo + step, heldOut, rest);
        XList.Add(rest);      //training portion for this fold
        XXList.Add(heldOut);  //held-out portion for this fold
    }
    Console.WriteLine("Done! cross-validation train/test data sizes (cv_1, ..., cv_n): ");
    Global.swLog.WriteLine("Done! cross-validation train/test data sizes (cv_1, ..., cv_n): ");
    for (int i = 0; i < Global.nCV; i++)
    {
        Global.swLog.WriteLine("{0}/{1}, ", XList[i].Count, XXList[i].Count);
    }
}
//token accuracy
//Decodes X (multi-threaded) and computes per-tag and overall token-level
//precision/recall/f-score; the overall token f-score (== token accuracy when
//every token gets exactly one tag) is returned as [fscore].
public List <double> decode_tokAcc(dataSet X, model m)
{
    int nTag = m.NTag;
    //per-tag counters, indexed by tag id
    int[] tmpAry = new int[nTag];
    List <int> corrOutput = new List <int>(tmpAry);  //correct predictions per tag
    List <int> gold = new List <int>(tmpAry);        //gold occurrences per tag
    List <int> output = new List <int>(tmpAry);      //predicted occurrences per tag
    //multi thread
    List <dataSeqTest> X2 = new List <dataSeqTest>();
    multiThreading(X, X2);
    foreach (dataSeqTest x in X2)
    {
        List <int> outTags = x._yOutput;
        List <int> goldTags = x._x.getTags();
        //output tag results
        if (Global.swOutput != null)
        {
            for (int i = 0; i < outTags.Count; i++)
            {
                Global.swOutput.Write(outTags[i].ToString() + ",");
            }
            Global.swOutput.WriteLine();
        }
        //count
        for (int i = 0; i < outTags.Count; i++)
        {
            gold[goldTags[i]]++;
            output[outTags[i]]++;
            if (outTags[i] == goldTags[i])
            {
                corrOutput[outTags[i]]++;
            }
        }
    }
    Global.swLog.WriteLine("% tag-type #gold #output #correct-output token-precision token-recall token-f-score");
    double prec, rec;
    int sumGold = 0, sumOutput = 0, sumCorrOutput = 0;
    //per-tag statistics, accumulating overall totals along the way
    for (int i = 0; i < nTag; i++)
    {
        sumCorrOutput += corrOutput[i];
        sumGold += gold[i];
        sumOutput += output[i];
        if (gold[i] == 0)
        {
            rec = 0;  //avoid division by zero for tags absent from the gold data
        }
        else
        {
            rec = ((double)corrOutput[i]) * 100.0 / (double)gold[i];
        }
        if (output[i] == 0)
        {
            prec = 0;  //avoid division by zero for tags never predicted
        }
        else
        {
            prec = ((double)corrOutput[i]) * 100.0 / (double)output[i];
        }
        Global.swLog.WriteLine("% {0}: {1} {2} {3} {4} {5} {6}", i, gold[i], output[i], corrOutput[i], prec.ToString("f2"), rec.ToString("f2"), (2 * prec * rec / (prec + rec)).ToString("f2"));
    }
    //overall statistics across all tags
    if (sumGold == 0)
    {
        rec = 0;
    }
    else
    {
        rec = ((double)sumCorrOutput) * 100.0 / (double)sumGold;
    }
    if (sumOutput == 0)
    {
        prec = 0;
    }
    else
    {
        prec = ((double)sumCorrOutput) * 100.0 / (double)sumOutput;
    }
    double fscore;
    if (prec == 0 && rec == 0)
    {
        fscore = 0;
    }
    else
    {
        fscore = 2 * prec * rec / (prec + rec);//this token-based overall-f-score is also the token-based-accuracy
    }
    Global.swLog.WriteLine("% overall-tags: {0} {1} {2} {3} {4} {5}", sumGold, sumOutput, sumCorrOutput, prec.ToString("f2"), rec.ToString("f2"),
                           fscore.ToString("f2"));
    Global.swLog.Flush();
    List <double> scoreList = new List <double>();
    scoreList.Add(fscore);
    return (scoreList);
}
//Copies the tag/feature dimensions from X into this data set.
public void setDataInfo(dataSet X)
{
    (_nTag, _nFeature) = (X.NTag, X.NFeature);
}