Example #1
        public List <double> test(dataSet X, string outputfile)
        {
            string outfile = outputfile;

            Global.swOutput = new StreamWriter(outfile);
            List <double> scoreList;

            if (Global.evalMetric == "tok.acc")
            {
                scoreList = decode_tokAcc(X, _model);
            }
            else if (Global.evalMetric == "str.acc")
            {
                scoreList = decode_strAcc(X, _model);
                decode_tokAcc(X, _model);//only run to record accuracy info in trainLog; useful for t-tests
            }
            else if (Global.evalMetric == "f1")
            {
                scoreList = decode_fscore(X, _model);
                decode_tokAcc(X, _model);
            }
            else
            {
                throw new Exception("unknown evalMetric: " + Global.evalMetric);
            }
            Global.swOutput.Close();

            return(scoreList);
        }
Example #2
        public static void dataSplit(dataSet X, double v1, double v2, dataSet X1, dataSet X2)
        {
            if (v2 < v1)
            {
                throw new Exception("dataSplit: v2 must be >= v1");
            }
            X1.Clear();
            X2.Clear();
            X1.setDataInfo(X);
            X2.setDataInfo(X);
            int n1 = (int)(X.Count * v1);
            int n2 = (int)(X.Count * v2);

            for (int i = 0; i < X.Count; i++)
            {
                if (i >= n1 && i < n2)
                {
                    X1.Add(X[i]);
                }
                else
                {
                    X2.Add(X[i]);
                }
            }
        }
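
The split rule above assigns the items with index in [Count*v1, Count*v2) to X1 and everything else to X2. A minimal standalone sketch of the same rule, with plain List<int> standing in for the project's dataSet type (names here are illustrative only):

        using System;
        using System.Collections.Generic;

        static class SplitSketch
        {
            public static void Main()
            {
                var all = new List<int> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
                double v1 = 0.2, v2 = 0.4;

                int n1 = (int)(all.Count * v1);   //2
                int n2 = (int)(all.Count * v2);   //4

                var x1 = new List<int>();         //the [n1, n2) slice
                var x2 = new List<int>();         //everything else
                for (int i = 0; i < all.Count; i++)
                {
                    if (i >= n1 && i < n2) x1.Add(all[i]);
                    else x2.Add(all[i]);
                }

                Console.WriteLine(string.Join(",", x1)); //2,3
                Console.WriteLine(string.Join(",", x2)); //0,1,4,5,6,7,8,9
            }
        }
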
Example #3
        static void test()
        {
            //Console.WriteLine("test data ...");
            Global.swLog.WriteLine("reading test data...");
            dataSet XX = new dataSet(Global.fFeatureTest, Global.fGoldTest);

            Console.WriteLine("test data size: {0}", XX.Count);
            Global.swLog.WriteLine("Done! test data size: {0}", XX.Count);
            //load model & feature files for testing
            toolbox tb = new toolbox(XX, false);

            //Stopwatch timer = new Stopwatch();
            // timer.Start();

            List <double> scoreList = tb.test(XX, 0);

            //timer.Stop();
            //double time = timer.ElapsedMilliseconds / 1000.0;

            //Global.timeList.Add(time);
            //double score = scoreList[0];
            //Global.scoreListList.Add(scoreList);

            //resSummarize.write();
            //return score;
        }
Example #4
        public List <double> test(dataSet X, double iter)
        {
            string outfile = Global.outDir + Global.fOutput;

            Global.swOutput = new StreamWriter(outfile);
            List <double> scoreList;

            if (Global.evalMetric == "tok.acc")
            {
                scoreList = decode_tokAcc(X, _model);
            }
            else if (Global.evalMetric == "str.acc")
            {
                scoreList = decode_strAcc(X, _model);
            }
            else if (Global.evalMetric == "f1")
            {
                scoreList = decode_fscore(X, _model);
            }
            else
            {
                throw new Exception("unknown evalMetric: " + Global.evalMetric);
            }
            Global.swOutput.Close();

            return(scoreList);
        }
Example #5
        //compute the gradient of: sum_x { -log P(y*|x,w) } + R(w)
        public double getGrad_BFGS(List <double> g, model m, dataSet X)
        {
            double error    = 0;
            int    nFeature = _fGene.NCompleteFeature;

            foreach (dataSeq x in X)
            {
                double err = 0;
                err    = getGradCRF(g, m, x, null);
                error += err;
            }

            if (Global.reg != 0.0)
            {
                //L2 regularization (Gaussian prior, sigma = Global.reg):
                //gradient term dR/dw_f = w_f / sigma^2
                for (int f = 0; f < nFeature; f++)
                {
                    g[f] += m.W[f] / (Global.reg * Global.reg);
                }
                //objective term R(w) = sum_f w_f^2 / (2 * sigma^2)
                double sum = arrayTool.squareSum(m.W);
                error += sum / (2.0 * Global.reg * Global.reg);
            }
            return(error);
        }
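
The Global.reg block above is a standard L2 (Gaussian-prior) regularizer; with sigma = Global.reg (the code elsewhere calls this r/sigma), the added term and its gradient are

        R(w) = \frac{1}{2\sigma^2} \sum_f w_f^2, \qquad \frac{\partial R}{\partial w_f} = \frac{w_f}{\sigma^2},

which is exactly the g[f] += m.W[f] / (reg * reg) update and the error += squareSum(W) / (2 * reg * reg) term in the code.
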
Example #6
        public dataSet randomShuffle()
        {
            List <int> ri = randomTool <int> .getShuffledIndexList(this.Count);

            dataSet X = new dataSet(this.NTag, this.NFeature);

            foreach (int i in ri)
            {
                X.Add(this[i]);
            }
            return(X);
        }
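
randomTool<int>.getShuffledIndexList is not among these examples; a self-contained sketch of what such a helper plausibly does (a Fisher-Yates shuffle of the indices 0..n-1; the name and signature below are assumed for illustration, not taken from the project):

        using System;
        using System.Collections.Generic;

        static class ShuffleSketch
        {
            static readonly Random _rng = new Random();

            //hypothetical stand-in for randomTool<int>.getShuffledIndexList(n):
            //returns the indices 0..n-1 in uniformly random order (Fisher-Yates)
            public static List<int> GetShuffledIndexList(int n)
            {
                var idx = new List<int>(n);
                for (int i = 0; i < n; i++)
                {
                    idx.Add(i);
                }
                for (int i = n - 1; i > 0; i--)
                {
                    int j = _rng.Next(i + 1);//0 <= j <= i
                    int tmp = idx[i]; idx[i] = idx[j]; idx[j] = tmp;
                }
                return idx;
            }
        }
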
Example #7
        static void crossValidation()
        {
            //load data
            Console.WriteLine("reading cross validation data...");
            Global.swLog.WriteLine("reading cross validation data...");
            List <dataSet> XList  = new List <dataSet>();
            List <dataSet> XXList = new List <dataSet>();

            loadDataForCV(XList, XXList);

            //start cross validation
            foreach (double r in Global.regList)//do CV for each different regularizer r (sigma)
            {
                Global.swLog.WriteLine("\ncross validation. r={0}", r);
                Console.WriteLine("\ncross validation. r={0}", r);
                if (Global.rawResWrite)
                {
                    Global.swResRaw.WriteLine("% cross validation. r={0}", r);
                }
                for (int i = 0; i < Global.nCV; i++)
                {
                    Global.swLog.WriteLine("\n#validation={0}", i + 1);
                    Console.WriteLine("\n#validation={0}", i + 1);
                    if (Global.rawResWrite)
                    {
                        Global.swResRaw.WriteLine("% #validation={0}", i + 1);
                    }
                    Global.reg = r;
                    dataSet Xi = XList[i];
                    if (Global.runMode.Contains("rich"))
                    {
                        toolboxRich tb = new toolboxRich(Xi);
                        basicTrain(XXList[i], tb);
                    }
                    else
                    {
                        toolbox tb = new toolbox(Xi);
                        basicTrain(XXList[i], tb);
                    }

                    resSummarize.write();
                    if (Global.rawResWrite)
                    {
                        Global.swResRaw.WriteLine();
                    }
                }
                if (Global.rawResWrite)
                {
                    Global.swResRaw.WriteLine();
                }
            }
        }
Example #8
        //for train & test
        public featureGenerator(dataSet X)
        {
            _nFeatureTemp = X.NFeature;
            _nTag         = X.NTag;
            Global.swLog.WriteLine("feature templates: {0}", _nFeatureTemp);

            int nNodeFeature = _nFeatureTemp * _nTag;
            int nEdgeFeature = _nTag * _nTag;

            _backoff1         = nNodeFeature;
            _nCompleteFeature = nNodeFeature + nEdgeFeature;
            Global.swLog.WriteLine("complete features: {0}", _nCompleteFeature);
        }
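
For a concrete sense of the layout (illustrative numbers, not taken from the project): with 1,000 feature templates and 10 tags,

        nNodeFeature      = 1000 * 10     = 10,000   (one weight per template/tag pair)
        nEdgeFeature      = 10 * 10       = 100      (one weight per tag/tag transition)
        _backoff1         = 10,000                   (edge weights start at this offset)
        _nCompleteFeature = 10,000 + 100  = 10,100
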
Example #9
        static double train()
        {
            //if (Global.formatConvert)
            //{
            //    dataFormat df = new dataFormat();
            //    df.convert();
            //}
            //load data
            Console.WriteLine("\nreading training & test data...");
            Global.swLog.WriteLine("\nreading training & test data...");
            dataSet X, XX;

            if (Global.runMode.Contains("tune"))//"tune"-related code lives here because train() may be called as a sub-step of tune()
            {
                dataSet origX = new dataSet(Global.fFeatureTrain, Global.fGoldTrain);
                X  = new dataSet();
                XX = new dataSet();
                dataSplit(origX, Global.tuneSplit, X, XX);
            }
            else
            {
                X  = new dataSet(Global.fFeatureTrain, Global.fGoldTrain);
                XX = new dataSet(Global.fFeatureTest, Global.fGoldTest);
                dataSizeScale(X);
            }
            Console.WriteLine("done! train/test data sizes: {0}/{1}", X.Count, XX.Count);
            Global.swLog.WriteLine("done! train/test data sizes: {0}/{1}", X.Count, XX.Count);
            double score = 0;

            //start training
            foreach (double r in Global.regList)//train on different r (sigma)
            {
                Global.reg = r;
                Global.swLog.WriteLine("\nr: " + r.ToString());
                Console.WriteLine("\nr: " + r.ToString());
                if (Global.rawResWrite)
                {
                    Global.swResRaw.WriteLine("\n%r: " + r.ToString());
                }
                toolbox tb = new toolbox(X, true);
                score = basicTrain(XX, tb);
                resSummarize.write();//summarize the results & output the summarized results

                if (Global.save == 1)
                {
                    tb.Model.save(Global.fModel);//save model as a .txt file
                }
            }
            return(score);
        }
Example #10
        //f-score
        public List <double> decode_fscore(dataSet X, model m)
        {
            //multi thread
            List <dataSeqTest> X2 = new List <dataSeqTest>();

            multiThreading(X, X2);

            List <string> goldTagList = new List <string>();
            List <string> resTagList  = new List <string>();

            foreach (dataSeqTest x in X2)
            {
                string res = "";
                foreach (int im in x._yOutput)
                {
                    res += im.ToString() + ",";
                }
                resTagList.Add(res);

                //output tag results
                if (Global.swOutput != null)
                {
                    for (int i = 0; i < x._yOutput.Count; i++)
                    {
                        Global.swOutput.Write(x._yOutput[i] + ",");
                    }
                    Global.swOutput.WriteLine();
                }

                List <int> goldTags = x._x.getTags();
                string     gold     = "";
                foreach (int im in goldTags)
                {
                    gold += im.ToString() + ",";
                }
                goldTagList.Add(gold);
            }
            List <double> scoreList = new List <double>();

            if (Global.runMode == "train")
            {
                List <double> infoList = new List <double>();
                scoreList = fscore.getFscore(goldTagList, resTagList, infoList);
                Global.swLog.WriteLine("#gold-chunk={0}  #output-chunk={1}  #correct-output-chunk={2}  precision={3}  recall={4}  f-score={5}", infoList[0], infoList[1], infoList[2], scoreList[1].ToString("f2"), scoreList[2].ToString("f2"), scoreList[0].ToString("f2"));
            }

            return(scoreList);
        }
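
Assuming fscore.getFscore returns chunk-level [f-score, precision, recall] computed from the comma-joined tag strings (which is what the log line above suggests), the logged numbers relate as in this worked example with illustrative counts:

        #gold-chunk = 10, #output-chunk = 12, #correct-output-chunk = 8
        precision = 8 / 12 * 100 = 66.67
        recall    = 8 / 10 * 100 = 80.00
        f-score   = 2 * 66.67 * 80.00 / (66.67 + 80.00) = 72.73
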
Example #11
        public static double test()
        {
            dataSet X  = new dataSet(Global.fFeatureTrain, Global.fGoldTrain);
            dataSet XX = new dataSet(Global.fFeatureTest, Global.fGoldTest);

            Global.swLog.WriteLine("data size (test): {0}", XX.Count);
            //load model for testing
            toolboxRich tb = new toolboxRich(X, false);

            List <double> scoreList = tb.test(XX, 0);

            double score = scoreList[0];

            Global.scoreListList.Add(scoreList);
            resSummarize.write();
            return(score);
        }
Example #12
        //this function can be called by train(), cv(), & richEdge.train()
        public static double basicTrain(dataSet XTest, toolbox tb)
        {
            Global.reinitGlobal();
            double score = 0;

            if (Global.modelOptimizer.EndsWith("bfgs"))
            {
                Global.tb = tb;
                Global.XX = XTest;

                tb.train();
                score = Global.scoreListList[Global.scoreListList.Count - 1][0];
            }
            else
            {
                for (int i = 0; i < Global.ttlIter; i++)
                {
                    Global.glbIter++;
                    Stopwatch timer = new Stopwatch();
                    timer.Start();

                    double err = tb.train();

                    timer.Stop();
                    double time = timer.ElapsedMilliseconds / 1000.0;

                    Global.timeList.Add(time);
                    Global.errList.Add(err);
                    Global.diffList.Add(Global.diff);

                    List <double> scoreList = tb.test(XTest, i);
                    score = scoreList[0];
                    Global.scoreListList.Add(scoreList);

                    Global.swLog.WriteLine("iter{0}  diff={1}  train-time(sec)={2}  {3}={4}%", Global.glbIter, Global.diff.ToString("e2"), time.ToString("f2"), Global.metric, score.ToString("f2"));
                    Global.swLog.WriteLine("------------------------------------------------");
                    Global.swLog.Flush();
                    Console.WriteLine("iter{0}  diff={1}  train-time(sec)={2}  {3}={4}%", Global.glbIter, Global.diff.ToString("e2"), time.ToString("f2"), Global.metric, score.ToString("f2"));

                    //if (Global.diff < Global.convergeTol)
                    //break;
                }
            }
            return(score);
        }
Example #13
        //string accuracy
        public List <double> decode_strAcc(dataSet X, model m)
        {
            double xsize = X.Count;
            double corr  = 0;

            //multi thread
            List <dataSeqTest> X2 = new List <dataSeqTest>();

            multiThreading(X, X2);

            foreach (dataSeqTest x in X2)
            {
                //output tag results
                if (Global.swOutput != null)
                {
                    for (int i = 0; i < x._x.Count; i++)
                    {
                        Global.swOutput.Write(x._yOutput[i].ToString() + ",");
                    }
                    Global.swOutput.WriteLine();
                }

                List <int> goldTags = x._x.getTags();
                bool       ck       = true;
                for (int i = 0; i < x._x.Count; i++)
                {
                    if (goldTags[i] != x._yOutput[i])
                    {
                        ck = false;
                        break;
                    }
                }
                if (ck)
                {
                    corr++;
                }
            }
            double acc = corr / xsize * 100.0;

            Global.swLog.WriteLine("total-tag-strings={0}  correct-tag-strings={1}  string-accuracy={2}%", xsize, corr, acc);
            List <double> scoreList = new List <double>();

            scoreList.Add(acc);
            return(scoreList);
        }
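
String (sequence) accuracy counts a sequence as correct only if every position matches the gold tags; with illustrative counts:

        xsize = 200 sequences, corr = 154 exact matches  ->  string-accuracy = 154 / 200 * 100 = 77.0%
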
Example #14
 public model(dataSet X, featureGenerator fGen)
 {
     _nTag = X.NTag;
     //Global.random == 0 (the default): weights start at zero, since new float[] is zero-initialized
     if (Global.random == 0)
     {
         _w = new float[fGen.NCompleteFeature];
     }
     else if (Global.random == 1)
     {
         List <float> randList = randomDoubleTool.getRandomList_float(fGen.NCompleteFeature);
         _w = randList.ToArray();
     }
     else
     {
         throw new Exception("invalid Global.random value: " + Global.random);
     }
 }
Example #15
        public static double train()
        {
            //load data
            Console.WriteLine("\nreading training & test data...");
            Global.swLog.WriteLine("\nreading training & test data...");
            dataSet X, XX;

            if (Global.runMode.Contains("tune"))
            {
                dataSet origX = new dataSet(Global.fFeatureTrain, Global.fGoldTrain);
                X  = new dataSet();
                XX = new dataSet();
                MainClass.dataSplit(origX, Global.tuneSplit, X, XX);
            }
            else
            {
                X  = new dataSet(Global.fFeatureTrain, Global.fGoldTrain);
                XX = new dataSet(Global.fFeatureTest, Global.fGoldTest);
                MainClass.dataSizeScale(X);
            }
            Global.swLog.WriteLine("data sizes (train, test): {0} {1}", X.Count, XX.Count);

            double score = 0;

            foreach (double r in Global.regList)
            {
                Global.reg = r;
                Global.swLog.WriteLine("\nr: " + r.ToString());
                Console.WriteLine("\nr: " + r.ToString());
                if (Global.rawResWrite)
                {
                    Global.swResRaw.WriteLine("\n%r: " + r.ToString());
                }
                toolboxRich tb = new toolboxRich(X);
                score = MainClass.basicTrain(XX, tb);
                resSummarize.write();
                //save model
                if (Global.save == 1)
                {
                    tb.Model.save(Global.fModel);
                }
            }
            return(score);
        }
Example #16
        public void multiThreading(dataSet X, List <dataSeqTest> X2)
        {
            if (X.Count < Global.nThread)
            {
                Global.nThread = X.Count;
            }
            //data for multi thread
            for (int i = 0; i < X.Count; i++)
            {
                X2.Add(new dataSeqTest(X[i], new List <int>()));
            }

            Global.threadXX = new List <List <dataSeqTest> >();
            for (int i = 0; i < Global.nThread; i++)
            {
                Global.threadXX.Add(new List <dataSeqTest>());
            }
            for (int i = 0; i < X2.Count; i++)
            {
                int idx = i % Global.nThread;
                Global.threadXX[idx].Add(X2[i]);
            }

            Stopwatch timer = new Stopwatch();

            timer.Start();

            //multi thread
            Task[] taskAry = new Task[Global.nThread];
            for (int i = 0; i < Global.nThread; i++)
            {
                taskAry[i] = new Task(taskRunner_test, i, TaskCreationOptions.PreferFairness);
                taskAry[i].Start();
            }

            Task.WaitAll(taskAry);

            timer.Stop();
            double time = timer.ElapsedMilliseconds / 1000.0;

            Global.swLog.WriteLine("**********test run time (sec): " + time.ToString());
        }
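
The partition above is a plain round-robin (item i goes to slice i % nThread), with one Task per slice started and then awaited. A self-contained sketch of the same pattern over plain integers (illustrative only; the project's actual worker is taskRunner_test, which is not shown in these examples):

        using System;
        using System.Collections.Generic;
        using System.Threading.Tasks;

        static class RoundRobinSketch
        {
            public static void Main()
            {
                int nThread = 3;
                var items = new List<int> { 0, 1, 2, 3, 4, 5, 6, 7 };

                //slice i % nThread gets item i, as multiThreading() does for Global.threadXX
                var slices = new List<List<int>>();
                for (int t = 0; t < nThread; t++)
                {
                    slices.Add(new List<int>());
                }
                for (int i = 0; i < items.Count; i++)
                {
                    slices[i % nThread].Add(items[i]);
                }

                //one task per slice, then wait for all of them
                Task[] tasks = new Task[nThread];
                for (int t = 0; t < nThread; t++)
                {
                    int id = t;//capture a copy of the loop variable for the lambda
                    tasks[t] = Task.Run(() => Console.WriteLine("worker {0}: {1}", id, string.Join(",", slices[id])));
                }
                Task.WaitAll(tasks);
            }
        }
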
Example #17
 public toolboxRich(dataSet X, bool train = true)
 {
     if (train)//for training
     {
         _X     = X;
         _fGene = new featureGeneRich(X);
         _model = new model(X, _fGene);
         _inf   = new inferRich(this);
         _grad  = new gradRich(this);
         initOptimizer();
     }
     else//for test
     {
         _X     = X;
         _model = new model(Global.fModel);
         _fGene = new featureGeneRich(X);
         _inf   = new inferRich(this);
         _grad  = new gradRich(this);
     }
 }
Example #18
        //for training & test
        public featureGeneRich(dataSet X)
        {
            _nFeatureTemp          = X.NFeature;
            _nFeatureTemp_richEdge = (int)(X.NFeature * Global.edgeReduce);

            this._nTag = X.NTag;
            int nNodeFeature  = _nFeatureTemp * _nTag;
            int nEdgeFeature1 = _nTag * _nTag;
            int nEdgeFeature2 = _nFeatureTemp_richEdge * _nTag * _nTag;

            _backoff1         = nNodeFeature;
            _backoff2         = nNodeFeature + nEdgeFeature1;
            _nCompleteFeature = nNodeFeature + nEdgeFeature1 + nEdgeFeature2;

            Global.swLog.WriteLine("feature templates & rich-edge feature templates0: {0}, {1}", _nFeatureTemp, _nFeatureTemp_richEdge);
            Global.swLog.WriteLine("nNodeFeature, nEdgeFeature1, nEdgeFeature2: {0}, {1}, {2}", nNodeFeature, nEdgeFeature1, nEdgeFeature2);
            Global.swLog.WriteLine("complete features: {0}", _nCompleteFeature);
            Global.swLog.WriteLine();
            Global.swLog.Flush();
        }
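
Continuing the illustrative numbers from Example #8 (1,000 feature templates, 10 tags) with Global.edgeReduce = 0.1, the rich-edge layout adds a third weight block:

        nNodeFeature           = 1000 * 10         = 10,000
        nEdgeFeature1          = 10 * 10           = 100
        _nFeatureTemp_richEdge = (int)(1000 * 0.1) = 100
        nEdgeFeature2          = 100 * 10 * 10     = 10,000
        _backoff1 = 10,000    _backoff2 = 10,100
        _nCompleteFeature      = 10,000 + 100 + 10,000 = 20,100
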
Example #19
 public toolbox(dataSet X, bool train = true)
 {
     if (train)//to train
     {
         _X     = X;
         _fGene = new featureGenerator(X);
         _model = new model(X, _fGene);
         _inf   = new inference(this);
         _grad  = new gradient(this);
         initOptimizer();
     }
     else//to test
     {
         _X     = X;
         _model = new model(Global.fModel);
         _fGene = new featureGenerator(X);
         _inf   = new inference(this);
         _grad  = new gradient(this);
     }
 }
Example #20
        public static void dataSplit(dataSet X, double v, dataSet X1, dataSet X2)
        {
            X1.Clear();
            X2.Clear();
            X1.setDataInfo(X);
            X2.setDataInfo(X);
            int n = (int)(X.Count * v);

            for (int i = 0; i < X.Count; i++)
            {
                if (i < n)
                {
                    X1.Add(X[i]);
                }
                else
                {
                    X2.Add(X[i]);
                }
            }
        }
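
This single-ratio overload keeps the first (int)(Count * v) items in X1 and the rest in X2, which is the same partition the two-ratio overload of Example #2 produces for dataSplit(X, 0.0, v, X1, X2).
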
Example #21
        public static void dataSizeScale(dataSet X)
        {
            dataSet XX = new dataSet();

            XX.setDataInfo(X);
            foreach (dataSeq im in X)
            {
                XX.Add(im);
            }
            X.Clear();

            int n = (int)(XX.Count * Global.trainSizeScale);

            for (int i = 0; i < n; i++)
            {
                int j = i;
                if (j >= XX.Count)
                {
                    j %= XX.Count;//wrap around when Global.trainSizeScale > 1, reusing items from the start
                }
                X.Add(XX[j]);
            }
            X.setDataInfo(XX);
        }
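
For a concrete sense of Global.trainSizeScale (illustrative numbers): with 100 training sequences,

        trainSizeScale = 1.5  ->  n = 150:  X = XX[0..99] followed by XX[0..49] (wrap-around)
        trainSizeScale = 0.3  ->  n = 30:   X = XX[0..29]
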
Example #22
        public static void loadDataForCV(List <dataSet> XList, List <dataSet> XXList)
        {
            XList.Clear();
            XXList.Clear();
            //load train data only: CV is based only on training data
            dataSet X    = new dataSet(Global.fFeatureTrain, Global.fGoldTrain);
            double  step = 1.0 / Global.nCV;

            //iterate over folds by integer index: accumulating a double (i += step) can
            //produce an extra, nearly empty fold through floating-point rounding
            for (int k = 0; k < Global.nCV; k++)
            {
                dataSet Xi      = new dataSet();
                dataSet XRest_i = new dataSet();
                //pin the last fold's upper bound to 1.0 so rounding cannot drop the final items
                double hi = (k == Global.nCV - 1) ? 1.0 : (k + 1) * step;
                dataSplit(X, k * step, hi, Xi, XRest_i);
                XList.Add(XRest_i); //training part: everything outside the fold
                XXList.Add(Xi);     //test part: the fold itself
            }

            Console.WriteLine("Done! cross-validation train/test data sizes (cv_1, ..., cv_n): ");
            Global.swLog.WriteLine("Done! cross-validation train/test data sizes (cv_1, ..., cv_n): ");
            for (int i = 0; i < Global.nCV; i++)
            {
                Global.swLog.WriteLine("{0}/{1}, ", XList[i].Count, XXList[i].Count);
            }
        }
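
With Global.nCV = 5 (illustrative), step = 0.2 and the loop above builds these folds:

        fold 1: test = items in [0.0, 0.2) of X, train = the rest   (XXList[0] / XList[0])
        fold 2: test = [0.2, 0.4), train = the rest
        fold 3: test = [0.4, 0.6), train = the rest
        fold 4: test = [0.6, 0.8), train = the rest
        fold 5: test = [0.8, 1.0), train = the rest
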
Example #23
        //token accuracy
        public List <double> decode_tokAcc(dataSet X, model m)
        {
            int nTag = m.NTag;

            int[]      tmpAry     = new int[nTag];
            List <int> corrOutput = new List <int>(tmpAry);
            List <int> gold       = new List <int>(tmpAry);
            List <int> output     = new List <int>(tmpAry);

            //multi thread
            List <dataSeqTest> X2 = new List <dataSeqTest>();

            multiThreading(X, X2);

            foreach (dataSeqTest x in X2)
            {
                List <int> outTags  = x._yOutput;
                List <int> goldTags = x._x.getTags();

                //output tag results
                if (Global.swOutput != null)
                {
                    for (int i = 0; i < outTags.Count; i++)
                    {
                        Global.swOutput.Write(outTags[i].ToString() + ",");
                    }
                    Global.swOutput.WriteLine();
                }

                //count
                for (int i = 0; i < outTags.Count; i++)
                {
                    gold[goldTags[i]]++;
                    output[outTags[i]]++;

                    if (outTags[i] == goldTags[i])
                    {
                        corrOutput[outTags[i]]++;
                    }
                }
            }

            Global.swLog.WriteLine("% tag-type  #gold  #output  #correct-output  token-precision  token-recall  token-f-score");
            double prec, rec;
            int    sumGold = 0, sumOutput = 0, sumCorrOutput = 0;

            for (int i = 0; i < nTag; i++)
            {
                sumCorrOutput += corrOutput[i];
                sumGold       += gold[i];
                sumOutput     += output[i];
                if (gold[i] == 0)
                {
                    rec = 0;
                }
                else
                {
                    rec = ((double)corrOutput[i]) * 100.0 / (double)gold[i];
                }
                if (output[i] == 0)
                {
                    prec = 0;
                }
                else
                {
                    prec = ((double)corrOutput[i]) * 100.0 / (double)output[i];
                }

                Global.swLog.WriteLine("% {0}:  {1}  {2}  {3}  {4}  {5}  {6}", i, gold[i], output[i], corrOutput[i], prec.ToString("f2"), rec.ToString("f2"), (2 * prec * rec / (prec + rec)).ToString("f2"));
            }
            if (sumGold == 0)
            {
                rec = 0;
            }
            else
            {
                rec = ((double)sumCorrOutput) * 100.0 / (double)sumGold;
            }
            if (sumOutput == 0)
            {
                prec = 0;
            }
            else
            {
                prec = ((double)sumCorrOutput) * 100.0 / (double)sumOutput;
            }

            double fscore;

            if (prec == 0 && rec == 0)
            {
                fscore = 0;
            }
            else
            {
                fscore = 2 * prec * rec / (prec + rec);//each token has exactly one gold and one output tag, so precision = recall and this overall f-score equals token accuracy
            }
            Global.swLog.WriteLine("% overall-tags:  {0}  {1}  {2}  {3}  {4}  {5}", sumGold, sumOutput, sumCorrOutput, prec.ToString("f2"), rec.ToString("f2"), fscore.ToString("f2"));
            Global.swLog.Flush();
            List <double> scoreList = new List <double>();

            scoreList.Add(fscore);
            return(scoreList);
        }
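
Every token has exactly one gold tag and one predicted tag, so sumGold = sumOutput = the total token count, and the overall precision, recall, and f-score all collapse to token accuracy. With illustrative counts:

        sumGold = sumOutput = 1000,  sumCorrOutput = 934
        precision = recall = 934 / 1000 * 100 = 93.40
        f-score   = 2 * 93.40 * 93.40 / (93.40 + 93.40) = 93.40   (= token accuracy)
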
Example #24
 public void setDataInfo(dataSet X)
 {
     _nTag     = X.NTag;
     _nFeature = X.NFeature;
 }