Beispiel #1
0
        /// <summary>
        /// Saves a data view to a file using the "Binary" saver.
        /// </summary>
        /// <param name="host">environment used to create the saver, the logging channel and the output file</param>
        /// <param name="view">data view to save</param>
        /// <param name="filename">destination file; it is resolved to a full path before writing</param>
        public static void ToIdv(IHostEnvironment host, IDataView view, string filename)
        {
            var binarySaver = ComponentCreation.CreateSaver(host, "Binary");
            string absolutePath = Path.GetFullPath(filename);

            using (var channel = host.Start("ToIdv"))
            {
                // Log both the path as given and the resolved absolute path.
                channel.Info(MessageSensitivity.None, "Saving data into file '{0}' or '{1}'.", filename, absolutePath);

                using (var output = host.CreateOutputFile(absolutePath))
                {
                    DataSaverUtils.SaveDataView(channel, binarySaver, view, output, true);
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Saves a data view to a text file using the "Text" saver, always with a header line.
        /// </summary>
        /// <param name="host">environment used to create the saver, the logging channel and the output file</param>
        /// <param name="view">data view to save</param>
        /// <param name="filename">destination file; it is resolved to a full path before writing</param>
        /// <param name="sep">column separator; a tab character is passed to the saver as "tab"</param>
        /// <param name="schema">true to pass schema=+ to the saver, false to pass schema=-</param>
        public static void ToCsv(IHostEnvironment host, IDataView view, string filename, string sep = "\t", bool schema = true)
        {
            // The saver settings spell the tab separator out as a word; any other separator is passed through as is.
            string separatorToken;
            if (sep == "\t")
            {
                separatorToken = "tab";
            }
            else
            {
                separatorToken = sep;
            }

            string schemaFlag = schema ? "+" : "-";
            string saverSettings = string.Format("Text{{sep={0} header=+ schema={1}}}", separatorToken, schemaFlag);

            var textSaver = ComponentCreation.CreateSaver(host, saverSettings);
            string absolutePath = Path.GetFullPath(filename);

            using (var channel = host.Start("ToCsv"))
            {
                // Log both the path as given and the resolved absolute path.
                channel.Info(MessageSensitivity.None, "Saving data into file '{0}' or '{1}'.", filename, absolutePath);

                using (var output = host.CreateOutputFile(absolutePath))
                {
                    DataSaverUtils.SaveDataView(channel, textSaver, view, output, true);
                }
            }
        }
Beispiel #3
0
        PrepareData(MLContext mlContext)
        {
            // Builds the text loader for the transactions file, then either
            //  - reads the file, splits it 80:20 and stores both splits as .idv files under _outputPath, or
            //  - reloads the two previously stored splits.
            // A sample of the train and test data is printed before returning
            // (classification context, reader, train data, test data).
            IDataView trainData = null;
            IDataView testData  = null;

            string trainDataPath = Path.Combine(_outputPath, "trainData.idv");
            string testDataPath  = Path.Combine(_outputPath, "testData.idv");

            // Step one: read the data as an IDataView.
            // Create the reader: define the data columns
            // and where to find them in the text file.
            var reader = new TextLoader(mlContext, new TextLoader.Arguments
            {
                Column = new[] {
                    // A boolean column depicting the 'label'.
                    new TextLoader.Column("Label", DataKind.BL, 30),
                    // 29 Features V1..V28 + Amount
                    new TextLoader.Column("V1", DataKind.R4, 1),
                    new TextLoader.Column("V2", DataKind.R4, 2),
                    new TextLoader.Column("V3", DataKind.R4, 3),
                    new TextLoader.Column("V4", DataKind.R4, 4),
                    new TextLoader.Column("V5", DataKind.R4, 5),
                    new TextLoader.Column("V6", DataKind.R4, 6),
                    new TextLoader.Column("V7", DataKind.R4, 7),
                    new TextLoader.Column("V8", DataKind.R4, 8),
                    new TextLoader.Column("V9", DataKind.R4, 9),
                    new TextLoader.Column("V10", DataKind.R4, 10),
                    new TextLoader.Column("V11", DataKind.R4, 11),
                    new TextLoader.Column("V12", DataKind.R4, 12),
                    new TextLoader.Column("V13", DataKind.R4, 13),
                    new TextLoader.Column("V14", DataKind.R4, 14),
                    new TextLoader.Column("V15", DataKind.R4, 15),
                    new TextLoader.Column("V16", DataKind.R4, 16),
                    new TextLoader.Column("V17", DataKind.R4, 17),
                    new TextLoader.Column("V18", DataKind.R4, 18),
                    new TextLoader.Column("V19", DataKind.R4, 19),
                    new TextLoader.Column("V20", DataKind.R4, 20),
                    new TextLoader.Column("V21", DataKind.R4, 21),
                    new TextLoader.Column("V22", DataKind.R4, 22),
                    new TextLoader.Column("V23", DataKind.R4, 23),
                    new TextLoader.Column("V24", DataKind.R4, 24),
                    new TextLoader.Column("V25", DataKind.R4, 25),
                    new TextLoader.Column("V26", DataKind.R4, 26),
                    new TextLoader.Column("V27", DataKind.R4, 27),
                    new TextLoader.Column("V28", DataKind.R4, 28),
                    new TextLoader.Column("Amount", DataKind.R4, 29),
                },
                // First line of the file is a header, not a data row.
                HasHeader = true,
                Separator = ","
            });


            // We know that this is a Binary Classification task,
            // so we create a Binary Classification context:
            // it will give us the algorithms we need,
            // as well as the evaluation procedure.
            var classification = new BinaryClassificationContext(mlContext);

            // Rebuild the splits when EITHER cached file is missing. Requiring both to be
            // missing would send a half-populated cache to the load branch, which then
            // fails on the file that does not exist.
            if (!File.Exists(testDataPath) || !File.Exists(trainDataPath))
            {
                // Split the data 80:20 into train and test sets, train and evaluate.

                IDataView data = reader.Read(new MultiFileSource(_dataSetFile));
                ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (source)");
                ConsoleHelpers.InspectData(mlContext, data, 4);

                // Can't do stratification when column type is a boolean, is this an issue?
                //(trainData, testData) = classification.TrainTestSplit(data, testFraction: 0.2, stratificationColumn: "Label");
                (trainData, testData) = classification.TrainTestSplit(data, testFraction: 0.2);

                IHostEnvironment env = (IHostEnvironment)mlContext;

                // Writes one split to the given file with the binary saver.
                void SaveSplit(IDataView split, string splitPath)
                {
                    using (var ch = env.Start("SaveData"))
                    using (var file = env.CreateOutputFile(splitPath))
                    {
                        var saver = new BinarySaver(mlContext, new BinarySaver.Arguments());
                        DataSaverUtils.SaveDataView(ch, saver, split, file);
                    }
                }

                // save test split
                SaveSplit(testData, testDataPath);

                // save train split
                SaveSplit(trainData, trainDataPath);
            }
            else
            {
                // Load the previously saved splits.
                var binTrainData = new BinaryLoader(mlContext, new BinaryLoader.Arguments(), new MultiFileSource(trainDataPath));
                var trainRoles   = new RoleMappedData(binTrainData, roles: TransactionObservation.Roles());
                trainData = trainRoles.Data;


                var binTestData = new BinaryLoader(mlContext, new BinaryLoader.Arguments(), new MultiFileSource(testDataPath));
                var testRoles   = new RoleMappedData(binTestData, roles: TransactionObservation.Roles());
                testData = testRoles.Data;
            }

            ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (traindata)");
            ConsoleHelpers.InspectData(mlContext, trainData, 4);

            ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (testData)");
            ConsoleHelpers.InspectData(mlContext, testData, 4);

            return(classification, reader, trainData, testData);
        }
        /// <summary>
        /// Downloads <paramref name="uri"/> to <paramref name="path"/> unless that file already exists.
        /// The content is written to a uniquely named temporary file in the destination directory and
        /// moved into place once complete. A named mutex derived from <paramref name="fileName"/> is
        /// held for the whole operation.
        /// </summary>
        /// <param name="env">environment used to create the temporary output file</param>
        /// <param name="ch">channel receiving progress and error messages</param>
        /// <param name="webClient">client used to open the download stream</param>
        /// <param name="uri">address of the resource to download</param>
        /// <param name="path">final location of the downloaded file</param>
        /// <param name="fileName">name used in log messages and in the mutex name</param>
        /// <param name="ct">token checked after each block; when cancelled the download is aborted</param>
        /// <returns>
        /// null when the file already exists or was downloaded; otherwise the exception describing
        /// the failure (the caught WebException, or the channel exception for a cancelled download).
        /// </returns>
        private Exception DownloadResource(IHostEnvironment env, IChannel ch, WebClient webClient, Uri uri, string path, string fileName, CancellationToken ct)
        {
            if (File.Exists(path))
            {
                return(null);
            }

            // The mutex owns an OS handle, so it is disposed once the download is over.
            using (var mutex = new Mutex(false, "Resource" + fileName))
            {
                mutex.WaitOne();
                try
                {
                    // Re-check: the file may have appeared while we were waiting for the mutex.
                    if (File.Exists(path))
                    {
                        return(null);
                    }

                    Guid   guid     = Guid.NewGuid();
                    string tempPath = Path.GetFullPath(Path.Combine(Path.GetDirectoryName(path), "temp-resource-" + guid.ToString()));

                    try
                    {
                        const int blockSize = 4096;

                        using (var s = webClient.OpenRead(uri))
                            using (var fh = env.CreateOutputFile(tempPath))
                                using (var ws = fh.CreateWriteStream())
                                {
                                    var headers = webClient.ResponseHeaders.GetValues("Content-Length");
                                    if (Utils.Size(headers) == 0 || !long.TryParse(headers[0], out var size))
                                    {
                                        // Unknown length: assume a size so progress can still be reported.
                                        size = 10000000;
                                    }

                                    // A tenth of a payload under 10 bytes is 0; clamp to 1 so the
                                    // progress computation below never divides by zero.
                                    long printFreq = Math.Max(1L, size / 10);
                                    var  buffer    = new byte[blockSize];
                                    long total     = 0;
                                    int  count;
                                    // REVIEW: use a progress channel instead.
                                    while ((count = s.Read(buffer, 0, blockSize)) > 0)
                                    {
                                        ws.Write(buffer, 0, count);
                                        total += count;
                                        // Report roughly every tenth of the expected size.
                                        if (total % printFreq <= blockSize)
                                        {
                                            ch.Info($"{fileName}: Downloaded {total} bytes out of {size}");
                                        }
                                        if (ct.IsCancellationRequested)
                                        {
                                            ch.Error($"{fileName}: Download timed out");
                                            return(ch.Except("Download timed out"));
                                        }
                                    }
                                }
                        File.Move(tempPath, path);
                        ch.Info($"{fileName}: Download complete");
                        return(null);
                    }
                    catch (WebException e)
                    {
                        ch.Error($"{fileName}: Could not download. WebClient returned the following error: {e.Message}");
                        return(e);
                    }
                    finally
                    {
                        // After a successful move this is a no-op attempt; otherwise it removes the partial file.
                        TryDelete(ch, tempPath, warn: false);
                    }
                }
                finally
                {
                    mutex.ReleaseMutex();
                }
            }
        }
        /// <summary>
        /// Downloads <paramref name="uri"/> to <paramref name="path"/> unless that file already exists.
        /// The content is written to a uniquely named temporary file in the destination directory and
        /// moved into place once complete. A named mutex derived from <paramref name="fileName"/> is
        /// held for the whole operation.
        /// </summary>
        /// <param name="env">environment used to create the temporary output file</param>
        /// <param name="ch">channel receiving progress and error messages</param>
        /// <param name="webClient">client used to open the download stream</param>
        /// <param name="uri">address of the resource to download</param>
        /// <param name="path">final location of the downloaded file</param>
        /// <param name="fileName">name used in log messages and in the mutex name</param>
        /// <param name="ct">token passed to every read; when cancelled the download is aborted</param>
        /// <returns>
        /// null when the file already exists or was downloaded; otherwise the exception describing
        /// the failure (the caught WebException, or the channel exception for a cancelled download).
        /// </returns>
        /// <exception cref="NotSupportedException">
        /// The uri is an aka.ms link that redirects to the default page.
        /// </exception>
        private Exception DownloadResource(IHostEnvironment env, IChannel ch, WebClient webClient, Uri uri, string path, string fileName, CancellationToken ct)
        {
            if (File.Exists(path))
            {
                return(null);
            }

            // The mutex owns an OS handle, so it is disposed once the download is over.
            using (var mutex = new Mutex(false, "Resource" + fileName))
            {
                mutex.WaitOne();
                try
                {
                    // Re-check: the file may have appeared while we were waiting for the mutex.
                    if (File.Exists(path))
                    {
                        return(null);
                    }

                    Guid   guid     = Guid.NewGuid();
                    string tempPath = Path.GetFullPath(Path.Combine(Path.GetDirectoryName(path), "temp-resource-" + guid.ToString()));

                    try
                    {
                        const int blockSize = 4096;

                        using (var s = webClient.OpenRead(uri))
                            using (var fh = env.CreateOutputFile(tempPath))
                                using (var ws = fh.CreateWriteStream())
                                {
                                    var headers = webClient.ResponseHeaders.GetValues("Content-Length");
                                    if (uri.Host == "aka.ms" && IsRedirectToDefaultPage(uri.AbsoluteUri))
                                    {
                                        throw new NotSupportedException($"The provided url ({uri}) redirects to the default url ({DefaultUrl})");
                                    }
                                    if (Utils.Size(headers) == 0 || !long.TryParse(headers[0], out var size))
                                    {
                                        // Unknown length: assume a size so progress can still be reported.
                                        size = 10000000;
                                    }

                                    // A tenth of a payload under 10 bytes is 0; clamp to 1 so the
                                    // progress computation below never divides by zero.
                                    long printFreq = Math.Max(1L, size / 10);
                                    var  buffer    = new byte[blockSize];
                                    long total     = 0;

                                    // REVIEW: use a progress channel instead.
                                    while (true)
                                    {
                                        // GetResult() rethrows the read's own exception. Wait()/Result would
                                        // wrap it in an AggregateException, which slips past the handlers below.
                                        int count = s.ReadAsync(buffer, 0, blockSize, ct).GetAwaiter().GetResult();

                                        if (count <= 0)
                                        {
                                            break;
                                        }

                                        ws.Write(buffer, 0, count);
                                        total += count;
                                        // Report roughly every tenth of the expected size.
                                        if (total % printFreq <= blockSize)
                                        {
                                            ch.Info($"{fileName}: Downloaded {total} bytes out of {size}");
                                        }
                                        if (ct.IsCancellationRequested)
                                        {
                                            ch.Error($"{fileName}: Download timed out");
                                            return(ch.Except("Download timed out"));
                                        }
                                    }
                                }
                        File.Move(tempPath, path);
                        ch.Info($"{fileName}: Download complete");
                        return(null);
                    }
                    catch (OperationCanceledException)
                    {
                        // The token was cancelled during a read: report it the same way as the in-loop check.
                        ch.Error($"{fileName}: Download timed out");
                        return(ch.Except("Download timed out"));
                    }
                    catch (WebException e)
                    {
                        ch.Error($"{fileName}: Could not download. WebClient returned the following error: {e.Message}");
                        return(e);
                    }
                    finally
                    {
                        // After a successful move this is a no-op attempt; otherwise it removes the partial file.
                        TryDelete(ch, tempPath, warn: false);
                    }
                }
                finally
                {
                    mutex.ReleaseMutex();
                }
            }
        }