/// <summary>
/// Entry point of the data processing tool: loads a tsv file, builds the uncoded and
/// the coded LabelFeatureData from it, and saves each one when the corresponding
/// output file is specified on the command line.
/// </summary>
/// <param name="args">Command line arguments, parsed by <c>DataProcArgs</c>.</param>
static void Main(string[] args)
{
    DataProcArgs options = new DataProcArgs(args);

    // A query boundary keeps track of the queries (needed for ranking);
    // otherwise every line is its own group, i.e. there is no boundary.
    IGroupBoundary groupBoundary = options.queryBoundary
        ? (IGroupBoundary)new QueryBoundary()
        : new OnelineGroup();

    IParser<float> labelParser = new MsnLabelParser(new string[] { options.labelName }, options.labelNameValueFile);

    Console.WriteLine("Loading data from tsv file " + options.tsvFile);

    // Only when an active feature file is given do we restrict parsing to the
    // features listed in it; without one, the loader receives a null feature parser.
    MsnFeatureParser activeFeatureParser = options.activeFeatureFile == null
        ? null
        : new MsnFeatureParser(TsvFileLoader.ReadFeatureNames(options.activeFeatureFile));

    TsvFileLoader loader = new TsvFileLoader(options.tsvFile, null, labelParser, activeFeatureParser, groupBoundary);
    Console.WriteLine("Finishing loading the tsv file");

    // Stage 1: uncoded data, assembled from what the loader read.
    Console.WriteLine("Create LabelFeatureData uncoded ...");
    CLabelFeatureData uncodedData = new CLabelFeatureData(
        loader.FeatureName,
        loader.Labels,
        loader.GroupId,
        loader.Feature);

    // The progress message is printed regardless of whether a file is actually written.
    Console.WriteLine("Save LabelFeatureData uncoded ...");
    if (options.binFile != null)
    {
        uncodedData.Save(options.binFile);
    }

    // Stage 2: coded data, derived from the uncoded data.
    Console.WriteLine("Create LabelFeatureData coded ...");
    CLabelFeatureDataCoded codedData = new CLabelFeatureDataCoded(
        uncodedData,
        options.cThreads,
        options.storeCodedFeature,
        options.fCodedFeatureSparse);

    Console.WriteLine("Save LabelFeatureData coded ...");
    if (options.binFileCoded != null)
    {
        codedData.Save(options.binFileCoded);
    }
}
/// <summary>
/// Main Program: loads a stored boost tree, evaluates it on the given data and
/// writes the error history to "testErrHistory.txt".
/// </summary>
/// <param name="args">
/// There should be at least two input parameters from command line:
/// file name of the stored boost tree, and
/// file name of the source test/validation data
/// </param>
/// <remarks>
/// When the model or the data cannot be loaded, or the model cannot be bound to the
/// feature names of the data, a message is printed, the process exit code is set to 1
/// and the method returns without evaluating anything.
/// </remarks>
public static void Main(string[] args)
{
    TestArgs cmd = new TestArgs(args);
    Random r = new Random(cmd.seed);

    //Load the model first
    BoostTree boostTree = BoostTree.Read(cmd.binaryTreeFile);
    if (boostTree == null)
    {
        Debug.Assert(false, "Fail to load model");
        Console.WriteLine("Fail to load model " + cmd.binaryTreeFile);
        // Report the failure to the calling shell/script instead of exiting with success
        Environment.ExitCode = 1;
        return;
    }

    // If iteration not specified, use the optimal validation iteration found during training
    int numIter = cmd.numIter;
    if (numIter == 0)
    {
        numIter = boostTree.OptIter;
    }

    //compute and output the feature importance for the specified number of iterations (currently disabled)
    // boostTree.SummarizeFeatureImporance(numIter, "featureImportance.txt");

    //read and process only a subset of activated features as specified in the activeFeatureFile
    string[] activeFeatureNames = null;
    if (cmd.activeFeatureFile != null)
    {
        activeFeatureNames = TsvFileLoader.ReadFeatureNames(cmd.activeFeatureFile);
    }

    //feature parser: special module that understand MSN style value encoding
    MsnFeatureParser featureParser = new MsnFeatureParser(activeFeatureNames);

    //the column name for label: values to regress to
    string[] labelName = { cmd.labelName };
    //label/rating parser: special module that understand regression value
    IParser<float> RateParser = new MsnLabelParser(labelName, cmd.labelNameValueFile);

    //data boundary: every row of data is by itself / all data is in one group / no data groups
    OnelineGroup noBoundary = new OnelineGroup();

    //Load coded data if exist
    LabelFeatureData labelFeatureData = (CLabelFeatureData)CLabelFeatureData.Load(cmd.inputFile, featureParser, RateParser, noBoundary, typeof(CLabelFeatureData), activeFeatureNames, cmd.cThreads);
    if (labelFeatureData == null)
    {
        // Guard against a failed load: without this check the next statement would
        // terminate the program with a NullReferenceException and no useful message.
        Console.WriteLine("Fail to load data " + cmd.inputFile);
        Environment.ExitCode = 1;
        return;
    }

    if (!boostTree.SetFeatureNames(labelFeatureData.FeatureNames))
    {
        Debug.Assert(false, "Sub-model failed to initialize");
        Console.WriteLine("Sub-model failed to initialize, program exits");
        Environment.ExitCode = 1;
        return;
    }

    //All data are for test
    float[] percentage = DataGroups.DataSplit("0:0:10"); //"Train:Valid:Test"
    labelFeatureData.DataGroups.PartitionData(percentage, r);

    //Specify the data partitions to be tested
    DataPartitionType[] dataTypes = DataGroups.PartitionTypes("Train:Valid:Test"); //using all data as default

    LabelConverter labelConvert = new LabelConverterNull();

    //set up the error metrics that we like to keep track of during testing
    //labelFeatureData is the data we are evaluating
    //"PrecRecall" selects precision/recall; "ErrRate" and every other (or missing) value
    //select the classification error. The keyword is a command line token, so it is
    //matched ordinally and case-insensitively rather than with culture-specific rules.
    Metrics metrics;
    if (string.Equals(cmd.metric, "PrecRecall", StringComparison.OrdinalIgnoreCase))
    {
        metrics = new PrecRecall(labelFeatureData, labelConvert, dataTypes);
    }
    else
    {
        metrics = new ClassError(labelFeatureData, labelConvert, dataTypes);
    }

    boostTree.Predict(labelFeatureData, numIter, metrics, cmd.silent);

    // Output the testing error history. This should at least help validate the optimal
    // number of iterations, although it is probably better that we use NDCG history
    // for the optimum.
    metrics.SaveAllResults("testErrHistory.txt");
}
/// <summary>
/// Main Program: trains a boost tree on the train/valid/test data named on the
/// command line and writes the error history to "TrainingErrHistory.txt".
/// </summary>
/// <param name="args">
/// Should contain five parameters in the order of
/// file name for training source data (DataProcess class),
/// file name for saving the tree
/// tree size, i.e., maximum number of terminal nodes, usually 16 - 20
/// learning rate, usually 0.02 - 0.06
/// number of iterations, usually >500 (can be pruned later)
/// (NOTE(review): the actual parsing is done by <c>TrainArgs</c>, which is not visible
/// here - confirm this description against it.)
/// </param>
/// <remarks>
/// Any exception raised during training is caught here: its message is written to
/// standard output, the full exception to standard error, and the process exit code
/// is set to 1.
/// </remarks>
public static void Main(string[] args)
{
    try
    {
        TrainArgs cmd = new TrainArgs(args);

        // A seed of -1 requests a default-seeded generator; any other value is used as the seed
        Random r = (cmd.seed == -1) ? new Random() : new Random(cmd.seed);

        //read and process only a subset of activated features as specified in the activeFeatureFile
        string[] activeFeatureNames = null;
        if (cmd.activeFeatureFile != null)
        {
            activeFeatureNames = TsvFileLoader.ReadFeatureNames(cmd.activeFeatureFile);
        }

        //feature parser: special module that understand MSN style value encoding
        MsnFeatureParser featureParser = new MsnFeatureParser(activeFeatureNames);

        //the column name for label:
        string[] labelName = { cmd.labelName };
        //label/rating parser:
        IParser<float> RateParser = new MsnLabelParser(labelName, cmd.labelNameValueFile);

        //data boundary: no boundary
        OnelineGroup noBoundary = new OnelineGroup();

        //Load coded data if exist
        LabelFeatureDataCoded trainLabelFeatureDataCoded = (CLabelFeatureDataCoded)CLabelFeatureData.Load(cmd.trainFile, featureParser, RateParser, noBoundary, typeof(CLabelFeatureDataCoded), activeFeatureNames, cmd.cThreads, cmd.SparseCoded);
        LabelFeatureData validLabelFeatureData = (CLabelFeatureData)CLabelFeatureData.Load(cmd.validFile, featureParser, RateParser, noBoundary, typeof(CLabelFeatureData), activeFeatureNames, cmd.cThreads, cmd.SparseCoded);
        LabelFeatureData testLabelFeatureData = (CLabelFeatureData)CLabelFeatureData.Load(cmd.testFile, featureParser, RateParser, noBoundary, typeof(CLabelFeatureData), activeFeatureNames, cmd.cThreads, cmd.SparseCoded);

        //build composite data - an aggregated data object that keeps track of train/valid/test data
        CLabelFeatureDataCodedComposite labelFeatureDataCoded = CLabelFeatureDataCodedComposite.Create(trainLabelFeatureDataCoded, validLabelFeatureData, testLabelFeatureData);

        //initial submodel to boost on
        Model subModel = null;
        if (cmd.subModel != null && cmd.subModel.Length > 0)
        {
            if (cmd.cLayer > 0)
            {
                // One text file per layer: "<subModel>layer<N>.txt", where the number N
                // (starting at 1) is only appended when there is more than one layer.
                string[] layerNames = new string[cmd.cLayer];
                for (int i = 0; i < cmd.cLayer; i++)
                {
                    string num = (cmd.cLayer > 1) ? (i + 1).ToString() : "";
                    layerNames[i] = cmd.subModel + "layer" + num + ".txt";
                }
                subModel = new NNModel(layerNames);
            }
            else
            {
                // No layer count given: the submodel is created from "<subModel>.ini"
                string iniName = cmd.subModel + ".ini";
                subModel = NNModelMSN.Create(iniName);
            }
        }

        // A submodel that cannot be bound to the feature names of the data is dropped
        if (subModel != null && !subModel.SetFeatureNames(labelFeatureDataCoded.FeatureNames))
        {
            Console.WriteLine("Fail to initialize specified submodel - training with empty submodel");
            subModel = null;
        }

        LabelConverter labelConvert = new LabelConverterNull();

        // No sub-model scores are supplied to the boost tree
        LabelFeatureData subModelScore = null;
        McBoostTreeLoss boostTreeLoss = new McBoostTreeLoss(labelFeatureDataCoded, labelConvert, cmd.learnRate);
        BoostTree boostTree = new BoostTree(labelFeatureDataCoded, subModelScore, subModel, boostTreeLoss, cmd.binaryTreeFile, cmd.textTreeFile);

        //set up the error metrics that we like to keep track of during testing
        DataPartitionType[] dataTypes = { DataPartitionType.Train, DataPartitionType.Validation, DataPartitionType.Test };

        //labelFeatureDataCoded is the data we are evaluating
        Metrics metrics = new ClassError(labelFeatureDataCoded, labelConvert, dataTypes);

        DataFeatureSampleRate dataFeatureSampleRate = new DataFeatureSampleRate(cmd.sampleFeatureRate, cmd.sampleDataRate, cmd.sampleDataGroupRate);

        boostTree.Build(metrics, dataFeatureSampleRate, cmd.cLeafNodes, cmd.minNumSamples, cmd.numIter, cmd.cThreads, r);

        metrics.SaveAllResults("TrainingErrHistory.txt");
    }
    catch (Exception exc)
    {
        // Keep the short message on standard output as before ...
        Console.WriteLine(exc.Message);
        // ... but also keep the exception type and stack trace for diagnosis, and make
        // the failure visible to the caller instead of exiting with a success code.
        Console.Error.WriteLine(exc);
        Environment.ExitCode = 1;
    }
}