Example 1 (0 votes)
        //encoding CRF model from training corpus
        /// <summary>
        /// Encodes (trains) a CRF model from the training corpus described by <paramref name="args"/>.
        /// Validates the options, builds and shrinks the lexical feature set, indexes it,
        /// runs the CRF optimizer, and saves the resulting feature weights.
        /// </summary>
        /// <param name="args">Encoder options: training corpus, template and model file names,
        /// thread count, cost factor, regularization type and related thresholds.</param>
        /// <returns>true when the model is trained and saved; false when an option is invalid
        /// or any preparation stage fails.</returns>
        public bool Learn(EncoderOptions args)
        {
            // Reject invalid numeric options up front, before any expensive work.
            if (args.MinDifference <= 0.0)
            {
                return false;
            }

            if (args.CostFactor < 0.0)
            {
                return false;
            }

            if (args.ThreadsNum <= 0)
            {
                return false;
            }

            var modelWriter = new ModelWriter(args.ThreadsNum, args.CostFactor,
                                              args.HugeLexMemLoad, args.RetrainModelFileName);

            if (!modelWriter.Open(args.TemplateFileName, args.TrainingCorpusFileName))
            {
                return false;
            }

            var xList = modelWriter.ReadAllRecords();

            // Drop features whose observed frequency is below the configured threshold.
            modelWriter.Shrink(xList, args.MinFeatureFreq);

            if (!modelWriter.SaveModelMetaData(args.ModelFileName))
            {
                return false;
            }

            if (!modelWriter.BuildFeatureSetIntoIndex(args.ModelFileName, args.SlotUsageRateThreshold, args.DebugLevel))
            {
                return false;
            }

            if (xList.Length == 0)
            {
                return false;
            }

            // L1 regularization requires orthant-wise optimization.
            var orthant = args.RegType == REG_TYPE.L1;

            // NOTE(review): a false return from runCRF appears to indicate non-fatal
            // warnings (the sibling overload only logs a warning here), so the model
            // is still saved; confirm against runCRF's contract.
            runCRF(xList, modelWriter, orthant, args);

            modelWriter.SaveFeatureWeight(args.ModelFileName, args.BVQ);

            return true;
        }
Example 2 (0 votes)
        //encoding CRF model from training corpus
        /// <summary>
        /// Encodes (trains) a CRF model from a training corpus: validates the options,
        /// loads and shrinks the lexical feature set, indexes it, runs the CRF optimizer,
        /// and writes the resulting feature weights to the encoded model file.
        /// </summary>
        /// <param name="args">Encoder arguments: corpus/template/model file names, thread
        /// count, cost factor C, convergence eta, regularization type and thresholds.</param>
        /// <returns>true on success; false when an option is invalid or a stage fails.</returns>
        public bool Learn(EncoderArgs args)
        {
            // Guard clauses: reject invalid numeric options before touching any files.
            if (args.min_diff <= 0.0)
            {
                Logger.WriteLine(Logger.Level.err, "eta must be > 0.0");
                return false;
            }

            if (args.C < 0.0)
            {
                Logger.WriteLine(Logger.Level.err, "C must be >= 0.0");
                return false;
            }

            if (args.threads_num <= 0)
            {
                Logger.WriteLine(Logger.Level.err, "thread must be > 0");
                return false;
            }

            if (args.hugeLexMemLoad > 0)
            {
                Logger.WriteLine("Build feature lexical dictionary in huge mode[shrink when mem used rate:{0}%]", args.hugeLexMemLoad);
            }

            Logger.WriteLine("Open and check training corpus and templates...");
            var writer = new ModelWriter(args.threads_num, args.C,
                                         args.hugeLexMemLoad, args.strRetrainModelFileName);

            if (!writer.Open(args.strTemplateFileName, args.strTrainingCorpus))
            {
                Logger.WriteLine("Open training corpus or template file failed.");
                return false;
            }

            Logger.WriteLine("Load training data and generate lexical features: ");
            var sequences = writer.ReadAllRecords();

            Logger.WriteLine("");

            Logger.WriteLine("Shrinking feature set [frequency is less than {0}]...", args.min_feature_freq);
            writer.Shrink(sequences, args.min_feature_freq);

            Logger.WriteLine("Saving model meta data...");
            if (writer.SaveModelMetaData(args.strEncodedModelFileName))
            {
                Logger.WriteLine("Success");
            }
            else
            {
                Logger.WriteLine(Logger.Level.err, "Failed!");
                return false;
            }

            Logger.WriteLine("Indexing feature set with {0} maximum slot usage rate threshold...", args.slot_usage_rate_threshold);
            if (writer.BuildFeatureSetIntoIndex(args.strEncodedModelFileName, args.slot_usage_rate_threshold, args.debugLevel))
            {
                Logger.WriteLine("Success");
            }
            else
            {
                Logger.WriteLine(Logger.Level.err, "Failed!");
                return false;
            }

            // Summarize the effective training configuration before the long optimization run.
            Logger.WriteLine($"Sentences size: {sequences.Length}");
            Logger.WriteLine($"Features size:  {writer.feature_size()}");
            Logger.WriteLine($"Thread(s): {args.threads_num}");
            Logger.WriteLine($"Regularization type: {args.regType}");
            Logger.WriteLine($"Freq:                {args.min_feature_freq}");
            Logger.WriteLine($"eta:                 {args.min_diff}");
            Logger.WriteLine($"C:                   {args.C}");
            Logger.WriteLine($"Vector quantization: {args.bVQ}");

            if (sequences.Length == 0)
            {
                Logger.WriteLine(Logger.Level.err, "No sentence for training.");
                return false;
            }

            // L1 regularization requires orthant-wise optimization.
            var useOrthantWise = args.regType == REG_TYPE.L1;

            if (!runCRF(sequences, writer, useOrthantWise, args))
            {
                Logger.WriteLine(Logger.Level.warn, "Some warnings are raised during encoding...");
            }

            Logger.WriteLine("Saving model feature's weight...");
            writer.SaveFeatureWeight(args.strEncodedModelFileName, args.bVQ);

            return true;
        }