Пример #1
0
        /// <summary>
        /// Entry point of the test/evaluation tool: loads a stored boost tree, loads the input
        /// data, runs prediction for the requested number of iterations and saves the resulting
        /// error history to "testErrHistory.txt".
        /// </summary>
        /// <param name="args">
        /// Command-line arguments, parsed by <c>TestArgs</c>. There should be at least two values:
        /// the file name of the stored boost tree, and
        /// the file name of the source test/validation data.
        /// </param>
        /// <remarks>
        /// If the model or the data cannot be loaded, or the model rejects the feature names of
        /// the data, a message is written to the console, the process exit code is set to 1 and
        /// the method returns without running prediction.
        /// </remarks>
        public static void Main(string[] args)
        {
            TestArgs cmd = new TestArgs(args);

            Random r = new Random(cmd.seed);

            //Load the model first
            BoostTree boostTree = BoostTree.Read(cmd.binaryTreeFile);
            if (boostTree == null)
            {
                Debug.Assert(false, "Fail to load model");
                Console.WriteLine("Fail to load model " + cmd.binaryTreeFile);
                Environment.ExitCode = 1;
                return;
            }

            // If the iteration count is not specified (0), use the optimal validation
            // iteration found during training.
            int numIter = cmd.numIter == 0 ? boostTree.OptIter : cmd.numIter;

            //compute and output the feature importance for the specified number of iterations
            //            boostTree.SummarizeFeatureImporance(numIter, "featureImportance.txt");

            //read and process only a subset of activated features as specified in the activeFeatureFile
            string[] activeFeatureNames = null;
            if (cmd.activeFeatureFile != null)
            {
                activeFeatureNames = TsvFileLoader.ReadFeatureNames(cmd.activeFeatureFile);
            }

            //feature parser: special module that understands MSN style value encoding
            MsnFeatureParser featureParser = new MsnFeatureParser(activeFeatureNames);

            //the column name for label: values to regress to
            string[] labelName = { cmd.labelName };
            //label/rating parser: special module that understands regression values
            IParser<float> rateParser = new MsnLabelParser(labelName, cmd.labelNameValueFile);

            //data boundary: every row of data is by itself / all data is in one group / no data groups
            OnelineGroup noBoundary = new OnelineGroup();

            //Load coded data if exist
            LabelFeatureData labelFeatureData = (CLabelFeatureData)CLabelFeatureData.Load(cmd.inputFile, featureParser, rateParser, noBoundary, typeof(CLabelFeatureData), activeFeatureNames, cmd.cThreads);

            // The cast above succeeds for a null reference, so guard here: otherwise a failed
            // load would only show up as a NullReferenceException on the next statement.
            if (labelFeatureData == null)
            {
                Debug.Assert(false, "Fail to load data");
                Console.WriteLine("Fail to load data " + cmd.inputFile);
                Environment.ExitCode = 1;
                return;
            }

            if (!boostTree.SetFeatureNames(labelFeatureData.FeatureNames))
            {
                Debug.Assert(false, "Sub-model failed to initialize");
                Console.WriteLine("Sub-model failed to initialize, program exits");
                Environment.ExitCode = 1;
                return;
            }

            //All data are for test
            float[] percentage = DataGroups.DataSplit("0:0:10"); //"Train:Valid:Test"
            labelFeatureData.DataGroups.PartitionData(percentage, r);

            //Specify the data partitions to be tested
            DataPartitionType[] dataTypes = DataGroups.PartitionTypes("Train:Valid:Test"); //using all data as default

            LabelConverter labelConvert = new LabelConverterNull();

            //set up the error metrics that we like to keep track of during testing
            //labelFeatureData is the data we are evaluating
            Metrics metrics;
            // The metric name is an option keyword, not linguistic text, so compare it
            // ordinally (case-insensitive) rather than with the current culture.
            if (string.Equals(cmd.metric, "PrecRecall", StringComparison.OrdinalIgnoreCase))
            {
                metrics = new PrecRecall(labelFeatureData, labelConvert, dataTypes);
            }
            else
            {
                // "ErrRate" as well as any missing or unrecognised metric name use the
                // classification error.
                metrics = new ClassError(labelFeatureData, labelConvert, dataTypes);
            }

            boostTree.Predict(labelFeatureData, numIter, metrics, cmd.silent);

            // Output the testing error history. This should at least help validate the optimal
            // number of iterations, although it is probably better that we use NDCG history
            // for the optimum.
            metrics.SaveAllResults("testErrHistory.txt");
        }
Пример #2
0
        /// <summary>
        /// Entry point of the training tool: loads the train/validation/test data, optionally
        /// loads an initial sub-model to boost on, builds the boost tree and saves the training
        /// error history to "TrainingErrHistory.txt".
        /// </summary>
        /// <param name="args">
        /// Command-line arguments, parsed by <c>TrainArgs</c>. The values read by this method
        /// include:
        /// the file names of the training, validation and test source data,
        /// the file names for saving the tree (binary and text),
        /// the tree size, i.e., maximum number of terminal nodes, usually 16 - 20,
        /// the learning rate, usually 0.02 - 0.06,
        /// the number of iterations, usually >500 (can be pruned later).
        /// </param>
        /// <remarks>
        /// Any exception raised while training is caught: its message is written to the console
        /// and the process exit code is set to 1 so that calling scripts can detect the failure.
        /// </remarks>
        public static void Main(string[] args)
        {
            try
            {
                TrainArgs cmd = new TrainArgs(args);

                // A seed of -1 requests a default-seeded (non-reproducible) generator.
                Random r = cmd.seed == -1 ? new Random() : new Random(cmd.seed);

                //read and process only a subset of activated features as specified in the activeFeatureFile
                string[] activeFeatureNames = null;
                if (cmd.activeFeatureFile != null)
                {
                    activeFeatureNames = TsvFileLoader.ReadFeatureNames(cmd.activeFeatureFile);
                }

                //feature parser: special module that understands MSN style value encoding
                MsnFeatureParser featureParser = new MsnFeatureParser(activeFeatureNames);

                //the column name for label:
                string[] labelName = { cmd.labelName };
                //label/rating parser:
                IParser<float> rateParser = new MsnLabelParser(labelName, cmd.labelNameValueFile);

                //data boundary: no boundary
                OnelineGroup noBoundary = new OnelineGroup();

                //Load coded data if exist
                LabelFeatureDataCoded trainLabelFeatureDataCoded = (CLabelFeatureDataCoded)CLabelFeatureData.Load(cmd.trainFile, featureParser, rateParser, noBoundary, typeof(CLabelFeatureDataCoded), activeFeatureNames, cmd.cThreads, cmd.SparseCoded);
                LabelFeatureData validLabelFeatureData = (CLabelFeatureData)CLabelFeatureData.Load(cmd.validFile, featureParser, rateParser, noBoundary, typeof(CLabelFeatureData), activeFeatureNames, cmd.cThreads, cmd.SparseCoded);
                LabelFeatureData testLabelFeatureData = (CLabelFeatureData)CLabelFeatureData.Load(cmd.testFile, featureParser, rateParser, noBoundary, typeof(CLabelFeatureData), activeFeatureNames, cmd.cThreads, cmd.SparseCoded);

                //build composite data - an aggregated data object that keeps track of train/valid/test data
                CLabelFeatureDataCodedComposite labelFeatureDataCoded = CLabelFeatureDataCodedComposite.Create(trainLabelFeatureDataCoded, validLabelFeatureData, testLabelFeatureData);

                //initial submodel to boost on
                Model subModel = null;
                if (!string.IsNullOrEmpty(cmd.subModel))
                {
                    if (cmd.cLayer > 0)
                    {
                        // One file per layer: "<subModel>layer.txt" for a single layer,
                        // "<subModel>layer1.txt", "<subModel>layer2.txt", ... otherwise.
                        string[] layerNames = new string[cmd.cLayer];
                        for (int i = 0; i < cmd.cLayer; i++)
                        {
                            string num = cmd.cLayer > 1 ? (i + 1).ToString() : "";
                            layerNames[i] = cmd.subModel + "layer" + num + ".txt";
                        }
                        subModel = new NNModel(layerNames);
                    }
                    else
                    {
                        string iniName = cmd.subModel + ".ini";
                        subModel = NNModelMSN.Create(iniName);
                    }
                }

                // A sub-model that cannot be bound to the data's feature names is dropped and
                // training continues without it.
                if (subModel != null && !subModel.SetFeatureNames(labelFeatureDataCoded.FeatureNames))
                {
                    Console.WriteLine("Fail to initialize specified submodel - training with empty submodel");
                    subModel = null;
                }

                LabelConverter labelConvert = new LabelConverterNull();

                LabelFeatureData subModelScore = null;
                McBoostTreeLoss boostTreeLoss = new McBoostTreeLoss(labelFeatureDataCoded, labelConvert, cmd.learnRate);
                BoostTree boostTree = new BoostTree(labelFeatureDataCoded, subModelScore, subModel, boostTreeLoss,
                                                cmd.binaryTreeFile, cmd.textTreeFile);

                //set up the error metrics that we like to keep track of during training
                DataPartitionType[] dataTypes = { DataPartitionType.Train, DataPartitionType.Validation, DataPartitionType.Test };

                //labelFeatureDataCoded is the data we are evaluating
                Metrics metrics = new ClassError(labelFeatureDataCoded, labelConvert, dataTypes);

                DataFeatureSampleRate dataFeatureSampleRate = new DataFeatureSampleRate(cmd.sampleFeatureRate, cmd.sampleDataRate, cmd.sampleDataGroupRate);

                boostTree.Build(metrics, dataFeatureSampleRate, cmd.cLeafNodes, cmd.minNumSamples, cmd.numIter, cmd.cThreads, r);

                metrics.SaveAllResults("TrainingErrHistory.txt");
            }
            catch (Exception exc)
            {
                Console.WriteLine(exc.Message);
                // Do not let a failed run look like a success to the calling shell/script.
                Environment.ExitCode = 1;
            }
        }