Esempio n. 1
0
        private static void Main(string[] args)
        {
            var           report        = new MakeReport();
            List <string> consoleOutput = new List <string>();

            consoleOutput.Add($"{System.DateTime.Now} - Loading");

            var train = ConvertHelper.CSVToDataConstrutor(File.ReadAllLines("train.csv"), ',');
            var test  = ConvertHelper.CSVToDataConstrutor(File.ReadAllLines("test.csv"), ',');

            var fs = new DataProcessing(train, test, "SalePrice", "Id");

            fs.GetFeature("SalePrice").Transform((value) => Math.Log(1 + value));

            fs.SetInactive(new List <string> {
                "Id", "PoolArea", "LandContour", "PoolQC", "LotConfig", "Utilities", "Alley",
                "Street", "BsmtHalfBath", "LowQualFinSF", "3SsnPorch", "LandSlope", "YrSold", "Condition1", "BsmtFinType2", "RoofMatl",
                "MiscVal", "MiscFeature", "BsmtFinSF2", "Condition2", "BldgType", "ScreenPorch", "MoSold", "Functional"
            });

            fs.SetInactive(new List <string> {
                "BsmtCond", "BsmtUnfSF", "GarageCars", "PavedDrive", "SaleType", "SaleCondition",
                "BsmtExposure", "GarageCond", "Fence", "Heating", "BsmtQual",
            });

            //fs.SetInactive(new List<string> { "EnclosedPorch" });

            fs.SetTrainRowsInactiveByIds(new List <string> {
                "1299", "186", "198", "636", "1032", "1183", "1153", "1174"
            });
            //fs.SetTrainRowsInactiveByIds(new List<string> { "130", "188", "199", "268", "305", "497", "524", "530", "692", "770", "884", "1025", "1231", "1371", "1387", "1424", "1441" });
            consoleOutput.Add($"{System.DateTime.Now} - Transforming model");

            fs.GetFeature("LotFrontage").ReplaceValues("NA", "0");
            fs.GetFeature("MasVnrArea").ReplaceValues("NA", "-1");
            fs.GetFeature("GarageYrBlt").ReplaceValues("NA", "-1");

            fs.Features.Where(f => !f.IsNumeric() && !f.IsClass).All(n => { n.TransformEnumToInt(); return(true); });
            //fs.Features.Select(f => new OutlierLine { FeatureName = f.Name, Outliers = string.Join(", ", f.GetOutliers()) }).ToList().ForEach(o => consoleOutput.Add($"{o.FeatureName} => {o.Outliers}"));
            //TODO: To jakoś nie polepszyło, a nawet pogorszyło, trzeba by dodać taką funkcję zamiast wyliczać to tutaj i ująć zmienną GarageYrBlt
            //var oldestYearBuild = fs.GetFeature("YearBuilt").Values.Min(v => double.Parse(v.NewValue));
            //fs.GetFeature("YearBuilt").Transform((value) => (double.Parse(value) - oldestYearBuild).ToString());

            //fs.Features.ToList().ForEach(f => report.AddScatterPlot(f.Name, f.Values.Where(v => !v.IsTest).Select(v => new Point() { X = double.Parse(v.NewValue), Y = double.Parse(fs.GetClassForTrainId(v.RowId)) }).ToList()));
            //report.CreatePDF("Report.pdf");

            //var oldestYearRemodAdd = fs.GetFeature("YearRemodAdd").Values.Min(v => double.Parse(v.NewValue));
            //fs.GetFeature("YearRemodAdd").Transform((value) => (double.Parse(value) - oldestYearRemodAdd).ToString());

            //OUTLIERS
            //foreach (var feature in fs.Features.Where(f => f.IsActive && !f.IsClass && !f.IsId && new List<string> { "EnclosedPorch", "BsmtFinSF2", "GarageYrBlt", "OpenPorchSF", "ScreenPorch", "MasVnrArea", "LotArea", "Condition1", "MSSubClass", "MiscVal" }.IndexOf(f.Name) < 0))
            //{
            //    feature.MarkOutliers();
            //}
            //fs.PrintOutliersAmount();
            consoleOutput.Add($"{System.DateTime.Now} - Gathering data");

            var dataModel = fs.GetDataModel();

            File.WriteAllLines("output-train.csv", ConvertHelper.DataSetToCSV(dataModel.HeadersWithClass, dataModel.OutputTrain, ","));
            File.WriteAllLines("output-test.csv", ConvertHelper.DataSetToCSV(dataModel.HeadersWithoutClass, dataModel.OutputTest, ","));

            consoleOutput.Add($"{System.DateTime.Now} - Preparing SVM problem");

            //if (false)
            {
                //SVM.SetPrintStringFunction(null);
                SVMProblem testSet     = SVMLoadHelper.Load(dataModel.OutputTest, isWithClass: false);
                SVMProblem trainingSet = SVMLoadHelper.Load(dataModel.OutputTrain, isWithClass: true);

                SVMParameter parameter = new SVMParameter()
                {
                    Type   = SVMType.EPSILON_SVR,
                    Kernel = SVMKernelType.RBF,
                    //C = 10,
                    Gamma     = 0.01,
                    CacheSize = 2000,
                    Eps       = 0.1,// * Math.Pow(10, i);
                    //parameter.Probability = true;
                };

                //List<Tuple<string, double>> rmseAfterRemoveFeature = new List<Tuple<string, double>>();
                //var activeFeatures = fs.Features.Where(f => f.IsActive || !f.IsClass);
                //foreach (var feature in activeFeatures)
                //{
                //    feature.IsActive = false;
                //    var outputTrain = fs.GetTransfomedTrain();
                //    SVMProblem trainingSet = SVMLoadHelper.Load(outputTrain, isWithClass: true);

                //    consoleOutput.Add("=====================");
                //    //SVMModel model = trainingSet.Train(parameter);
                //    //double[] trainingResults = trainingSet.Predict(model);
                //    double[] crossvalidationResults;
                //    trainingSet.CrossValidation(parameter, 3, out crossvalidationResults);
                //    //consoleOutput.Add(parameter.GetOutput());
                //    var rmselog = EvaulationHelper.RMSELog(trainingSet.Y.ToArray(), crossvalidationResults);
                //    rmseAfterRemoveFeature.Add(new Tuple<string, double>(feature.Name, rmselog));
                //    consoleOutput.Add($"{System.DateTime.Now} - {feature.Name} - {rmselog}");
                //    feature.IsActive = true;
                //}

                //consoleOutput.Add($"{System.DateTime.Now} - {bestToRemove.Item1} - {bestToRemove.Item2}");
                consoleOutput.Add("=====================");
                consoleOutput.Add("Ordered");
                //consoleOutput.AddRange(rmseAfterRemoveFeature.OrderBy(t => t.Item2).Select(t => $"{t.Item1} - {t.Item2}"));
                SVMModel model        = trainingSet.Train(parameter);
                double[] trainResults = trainingSet.Predict(model);
                double[] testResults  = testSet.Predict(model);

                //double meanSquaredErr = testSet.EvaluateRegressionProblem(testResults, out correlationCoef);

                trainingSet.Y
                .Select((y, index) => Math.Abs(y - trainResults[index]))
                .Select((v, index) => new { index, v })
                .OrderByDescending(v => v.v)
                .Take(15)
                .ToList()
                .ForEach(e => consoleOutput.Add($"{e.index}:{e.v}"));

                var rmselog = EvaulationHelper.RMSELog(trainingSet.Y.ToArray(), trainResults);
                consoleOutput.Add($"{System.DateTime.Now} - {rmselog}");

                File.WriteAllLines("submission_fs.csv", ConvertHelper.ResultToCSV("Id,SalePrice", testResults.Select(v => Math.Exp(v) - 1).ToArray(), dataModel.TestIds));
                consoleOutput.Add($"{System.DateTime.Now} -I had finish");
                SVM.SaveModel(model, "model.txt");
            }

            //report.CreatePDF("Report.pdf");
            consoleOutput.ForEach(s => System.Console.WriteLine(s));
            File.WriteAllLines("consoleOutput.txt", consoleOutput);

            System.Console.ReadLine();
        }