Пример #1
0
        //encoding CRF model from training corpus
        public bool Learn(EncoderArgs args)
        {
            if (args.min_diff <= 0.0)
            {
                Console.WriteLine("eta must be > 0.0");
                return(false);
            }

            if (args.C < 0.0)
            {
                Console.WriteLine("C must be >= 0.0");
                return(false);
            }

            if (args.threads_num <= 0)
            {
                Console.WriteLine("thread must be > 0");
                return(false);
            }

            if (args.hugeLexMemLoad > 0)
            {
                Console.WriteLine("Build feature lexical dictionary in huge mode[shrink when mem used rate:{0}%]", args.hugeLexMemLoad);
            }

            Console.WriteLine("Open and check training corpus and templates...");
            ModelWritter modelWritter = new ModelWritter(args.threads_num, args.C, args.hugeLexMemLoad);

            if (modelWritter.Open(args.strTemplateFileName, args.strTrainingCorpus) == false)
            {
                Console.WriteLine("Open training corpus or template file failed.");
                return(false);
            }

            Console.WriteLine("Load training data and generate lexical features: ");
            EncoderTagger[] xList = modelWritter.ReadAllRecords();

            Console.WriteLine();

            Console.WriteLine("Shrinking feature set [frequency is less than {0}]...", args.min_feature_freq);
            modelWritter.Shrink(xList, args.min_feature_freq);

            Console.Write("Saving model meta data...");
            if (!modelWritter.SaveModelMetaData(args.strEncodedModelFileName))
            {
                Console.WriteLine("Failed!");
                return(false);
            }
            else
            {
                Console.WriteLine("Success");
            }

            Console.WriteLine("Indexing feature set with {0} maximum slot usage rate threshold...", args.slot_usage_rate_threshold);
            if (!modelWritter.BuildFeatureSetIntoIndex(args.strEncodedModelFileName, args.slot_usage_rate_threshold, args.debugLevel, args.strRetrainModelFileName))
            {
                Console.WriteLine("Failed!");
                return(false);
            }
            else
            {
                Console.WriteLine("Success");
            }

            Console.WriteLine("Number of sentences: " + xList.Length);
            Console.WriteLine("Number of features:  " + modelWritter.feature_size());
            Console.WriteLine("Number of thread(s): " + args.threads_num);
            Console.WriteLine("Regularization type: " + args.regType.ToString());
            Console.WriteLine("Freq:                " + args.min_feature_freq);
            Console.WriteLine("eta:                 " + args.min_diff);
            Console.WriteLine("C:                   " + args.C);

            bool orthant = false;

            if (args.regType == REG_TYPE.L1)
            {
                orthant = true;
            }
            if (runCRF(xList, modelWritter, orthant, args) == false)
            {
                Console.WriteLine("Some warnings are raised during encoding...");
            }
            Console.Write("Saving model feature's weight...");
            if (modelWritter.SaveFeatureWeight(args.strEncodedModelFileName) == false)
            {
                Console.WriteLine("Failed!");
                return(false);
            }
            else
            {
                Console.WriteLine("Success.");
            }
            Console.WriteLine("\nDone!");

            return(true);
        }
Пример #2
0
        private static void ShowEvaluation(int recordNum, ModelWritter feature_index, LBFGS lbfgs, int termNum, int itr, int[,] merr, int[] yfreq, double diff, DateTime startDT, long nonzero_feature_num, EncoderArgs args)
        {
            TimeSpan ts = DateTime.Now - startDT;

            if (args.debugLevel > 1)
            {
                for (int i = 0; i < feature_index.y_.Count; i++)
                {
                    int total_merr = 0;
                    SortedDictionary <double, List <string> > sdict = new SortedDictionary <double, List <string> >();
                    for (int j = 0; j < feature_index.y_.Count; j++)
                    {
                        total_merr += merr[i, j];
                        double v = (double)merr[i, j] / (double)yfreq[i];
                        if (v > 0.0001)
                        {
                            if (sdict.ContainsKey(v) == false)
                            {
                                sdict.Add(v, new List <string>());
                            }
                            sdict[v].Add(feature_index.y_[j]);
                        }
                    }
                    double vet = (double)total_merr / (double)yfreq[i];
                    vet = vet * 100.0F;

                    Console.ForegroundColor = ConsoleColor.Green;
                    Console.Write("{0} ", feature_index.y_[i]);
                    Console.ResetColor();
                    Console.Write("[FR={0}, TE=", yfreq[i]);
                    Console.ForegroundColor = ConsoleColor.Yellow;
                    Console.Write("{0:0.00}%", vet);
                    Console.ResetColor();
                    Console.WriteLine("]");

                    int n = 0;
                    foreach (KeyValuePair <double, List <string> > pair in sdict.Reverse())
                    {
                        foreach (string item in pair.Value)
                        {
                            n += item.Length + 1 + 7;
                            if (n > 80)
                            {
                                //only show data in one line, more data in tail will not be show.
                                break;
                            }
                            Console.Write("{0}:", item);
                            Console.ForegroundColor = ConsoleColor.Red;
                            Console.Write("{0:0.00}% ", pair.Key * 100);
                            Console.ResetColor();
                        }
                        if (n > 80)
                        {
                            break;
                        }
                    }
                    Console.WriteLine();
                }
            }

            double act_feature_rate = (double)(nonzero_feature_num) / (double)(feature_index.feature_size()) * 100.0;

            Console.WriteLine("iter={0} terr={1:0.00000} serr={2:0.00000} diff={3:0.000000} fsize={4}({5:0.00}% act)", itr, 1.0 * lbfgs.err / termNum, 1.0 * lbfgs.zeroone / recordNum, diff, feature_index.feature_size(), act_feature_rate);
            Console.WriteLine("Time span: {0}, Aver. time span per iter: {1}", ts, new TimeSpan(0, 0, (int)(ts.TotalSeconds / (itr + 1))));
        }
Пример #3
0
        bool runCRF(EncoderTagger[] x, ModelWritter modelWritter, bool orthant, EncoderArgs args)
        {
            double old_obj  = double.MaxValue;
            int    converge = 0;
            LBFGS  lbfgs    = new LBFGS(args.threads_num);

            lbfgs.expected = new double[modelWritter.feature_size() + 1];

            List <CRFEncoderThread> processList = new List <CRFEncoderThread>();

#if NO_SUPPORT_PARALLEL_LIB
#else
            ParallelOptions parallelOption = new ParallelOptions();
            parallelOption.MaxDegreeOfParallelism = args.threads_num;
#endif

            //Initialize encoding threads
            for (int i = 0; i < args.threads_num; i++)
            {
                CRFEncoderThread thread = new CRFEncoderThread();
                thread.start_i    = i;
                thread.thread_num = args.threads_num;
                thread.x          = x;
                thread.lbfgs      = lbfgs;
                thread.Init();
                processList.Add(thread);
            }

            //Statistic term and result tags frequency
            int   termNum = 0;
            int[] yfreq;
            yfreq = new int[modelWritter.y_.Count];
            foreach (EncoderTagger tagger in x)
            {
                termNum += tagger.word_num;
                for (int j = 0; j < tagger.word_num; j++)
                {
                    yfreq[tagger.answer_[j]]++;
                }
            }

            //Iterative training
            DateTime startDT       = DateTime.Now;
            double   dMinErrRecord = 1.0;
            for (int itr = 0; itr < args.max_iter; ++itr)
            {
                //Clear result container
                lbfgs.obj     = 0.0f;
                lbfgs.err     = 0;
                lbfgs.zeroone = 0;

                Array.Clear(lbfgs.expected, 0, lbfgs.expected.Length);

                List <Thread> threadList = new List <Thread>();
                for (int i = 0; i < args.threads_num; i++)
                {
                    Thread thread = new Thread(new ThreadStart(processList[i].Run));
                    thread.Start();
                    threadList.Add(thread);
                }

                int[,] merr;
                merr = new int[modelWritter.y_.Count, modelWritter.y_.Count];
                for (int i = 0; i < args.threads_num; ++i)
                {
                    threadList[i].Join();
                    lbfgs.obj     += processList[i].obj;
                    lbfgs.err     += processList[i].err;
                    lbfgs.zeroone += processList[i].zeroone;

                    //Calculate error
                    for (int j = 0; j < modelWritter.y_.Count; j++)
                    {
                        for (int k = 0; k < modelWritter.y_.Count; k++)
                        {
                            merr[j, k] += processList[i].merr[j, k];
                        }
                    }
                }

                long     num_nonzero = 0;
                long     fsize       = modelWritter.feature_size();
                double[] alpha       = modelWritter.alpha_;
                if (orthant == true)
                {
                    //L1 regularization
#if NO_SUPPORT_PARALLEL_LIB
                    for (long k = 1; k < fsize + 1; k++)
                    {
                        lbfgs.obj += Math.Abs(alpha[k] / modelWritter.cost_factor_);
                        if (alpha[k] != 0.0)
                        {
                            num_nonzero++;
                        }
                    }
#else
                    Parallel.For <double>(1, fsize + 1, parallelOption, () => 0, (k, loop, subtotal) =>
                    {
                        subtotal += Math.Abs(alpha[k] / modelWritter.cost_factor_);
                        if (alpha[k] != 0.0)
                        {
                            Interlocked.Increment(ref num_nonzero);
                        }
                        return(subtotal);
                    },
                                          (subtotal) => // lock free accumulator
                    {
                        double initialValue;
                        double newValue;
                        do
                        {
                            initialValue = lbfgs.obj;               // read current value
                            newValue     = initialValue + subtotal; //calculate new value
                        }while (initialValue != Interlocked.CompareExchange(ref lbfgs.obj, newValue, initialValue));
                    }
                                          );
#endif
                }
                else
                {
                    //L2 regularization
                    num_nonzero = fsize;

#if NO_SUPPORT_PARALLEL_LIB
                    for (long k = 1; k < fsize + 1; k++)
                    {
                        lbfgs.obj         += (alpha[k] * alpha[k] / (2.0 * modelWritter.cost_factor_));
                        lbfgs.expected[k] += (alpha[k] / modelWritter.cost_factor_);
                    }
#else
                    Parallel.For <double>(1, fsize + 1, parallelOption, () => 0, (k, loop, subtotal) =>
                    {
                        subtotal          += (alpha[k] * alpha[k] / (2.0 * modelWritter.cost_factor_));
                        lbfgs.expected[k] += (alpha[k] / modelWritter.cost_factor_);
                        return(subtotal);
                    },
                                          (subtotal) => // lock free accumulator
                    {
                        double initialValue;
                        double newValue;
                        do
                        {
                            initialValue = lbfgs.obj;               // read current value
                            newValue     = initialValue + subtotal; //calculate new value
                        }while (initialValue != Interlocked.CompareExchange(ref lbfgs.obj, newValue, initialValue));
                    }
                                          );
#endif
                }

                //Show each iteration result
                double diff = (itr == 0 ? 1.0f : Math.Abs(old_obj - lbfgs.obj) / old_obj);
                old_obj = lbfgs.obj;

                ShowEvaluation(x.Length, modelWritter, lbfgs, termNum, itr, merr, yfreq, diff, startDT, num_nonzero, args);
                if (diff < args.min_diff)
                {
                    converge++;
                }
                else
                {
                    converge = 0;
                }
                if (itr > args.max_iter || converge == 3)
                {
                    break;  // 3 is ad-hoc
                }

                if (args.debugLevel > 0 && (double)lbfgs.zeroone / (double)x.Length < dMinErrRecord)
                {
                    ConsoleColor cc = Console.ForegroundColor;
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.Write("[Debug Mode] ");
                    Console.ForegroundColor = cc;
                    Console.Write("Saving intermediate feature weights at current directory...");

                    //Save current best feature weight into file
                    dMinErrRecord = (double)lbfgs.zeroone / (double)x.Length;
                    modelWritter.SaveFeatureWeight("feature_weight_tmp");

                    Console.WriteLine("Done.");
                }

                int iret;
                iret = lbfgs.optimize(alpha, modelWritter.cost_factor_, orthant);
                if (iret <= 0)
                {
                    return(false);
                }
            }

            return(true);
        }