Example #1
        private static IDataLoader LoadStopwords(IHostEnvironment env, IChannel ch, string dataFile,
                                                 IComponentFactory<IMultiStreamSource, IDataLoader> loader, ref string stopwordsCol)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(ch, nameof(ch));

            MultiFileSource fileSource = new MultiFileSource(dataFile);
            IDataLoader     dataLoader;

            // First column using the file.
            if (loader == null)
            {
                // Determine the default loader from the extension.
                var  ext         = Path.GetExtension(dataFile);
                bool isBinary    = string.Equals(ext, ".idv", StringComparison.OrdinalIgnoreCase);
                bool isTranspose = string.Equals(ext, ".tdv", StringComparison.OrdinalIgnoreCase);
                if (isBinary || isTranspose)
                {
                    ch.Assert(isBinary != isTranspose);
                    ch.CheckUserArg(!string.IsNullOrWhiteSpace(stopwordsCol), nameof(Arguments.StopwordsColumn),
                                    "stopwordsColumn should be specified");
                    if (isBinary)
                    {
                        dataLoader = new BinaryLoader(env, new BinaryLoader.Arguments(), fileSource);
                    }
                    else
                    {
                        ch.Assert(isTranspose);
                        dataLoader = new TransposeLoader(env, new TransposeLoader.Arguments(), fileSource);
                    }
                }
                else
                {
                    if (!string.IsNullOrWhiteSpace(stopwordsCol))
                    {
                        ch.Warning("{0} should not be specified when default loader is TextLoader. Ignoring stopwordsColumn={0}",
                                   stopwordsCol);
                    }
                    dataLoader = TextLoader.Create(
                        env,
                        new TextLoader.Arguments()
                        {
                            Separator = "tab",
                            Column = new[]
                            {
                                new TextLoader.Column("Stopwords", DataKind.TX, 0)
                            }
                        },
                        fileSource);
                    stopwordsCol = "Stopwords";
                }
                ch.AssertNonEmpty(stopwordsCol);
            }
            else
            {
                dataLoader = loader.CreateComponent(env, fileSource);
            }

            return dataLoader;
        }
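The example above picks its default loader purely from the data file's extension: .idv means BinaryLoader, .tdv means TransposeLoader, and anything else falls back to a TextLoader with a single text column named "Stopwords". Below is a rough, self-contained sketch of that dispatch pattern; it is not ML.NET code, and the loader names are returned as plain strings purely for illustration.

// Illustrative sketch only: mirrors the extension-based default-loader choice above,
// but returns plain strings instead of constructing ML.NET loaders.
using System;
using System.IO;

static class LoaderDispatchSketch
{
    public static string ChooseDefaultLoader(string dataFile)
    {
        var ext = Path.GetExtension(dataFile);
        bool isBinary = string.Equals(ext, ".idv", StringComparison.OrdinalIgnoreCase);
        bool isTranspose = string.Equals(ext, ".tdv", StringComparison.OrdinalIgnoreCase);

        if (isBinary)
            return "BinaryLoader";
        if (isTranspose)
            return "TransposeLoader";
        // Everything else falls back to a text loader reading one "Stopwords" column.
        return "TextLoader";
    }

    public static void Main()
    {
        Console.WriteLine(ChooseDefaultLoader("words.idv")); // BinaryLoader
        Console.WriteLine(ChooseDefaultLoader("words.tdv")); // TransposeLoader
        Console.WriteLine(ChooseDefaultLoader("words.txt")); // TextLoader
    }
}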
Example #2
        private TermLookupTransform(IChannel ch, ModelLoadContext ctx, IHost host, IDataView input)
            : base(host, ctx, input, TestIsText)
        {
            Host.AssertValue(ch);

            // *** Binary format ***
            // <base>
            ch.AssertNonEmpty(Infos);

            // Extra streams:
            // DefaultMap.idv
            byte[] rgb = null;
            Action<BinaryReader> fn = r => rgb = ReadAllBytes(ch, r);

            if (!ctx.TryLoadBinaryStream(DefaultMapName, fn))
            {
                throw ch.ExceptDecode();
            }
            _bytes = rgb;

            // Process the bytes into the loader and map.
            _ldr = GetLoader(Host, _bytes);
            ValidateLoader(ch, _ldr);
            _valueMap = Train(ch, _ldr);
            SetMetadata();
        }
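The constructor above retrieves the extra "DefaultMap.idv" model stream through TryLoadBinaryStream and copies it into a byte array via a BinaryReader callback. The following minimal sketch shows one way to drain a BinaryReader into a byte[]; it is an assumption about what the ReadAllBytes helper does, not its actual implementation.

// Illustrative sketch: copy whatever remains of a BinaryReader's stream into a byte[].
using System;
using System.IO;

static class ReadAllBytesSketch
{
    public static byte[] ReadAllBytes(BinaryReader reader)
    {
        using (var buffer = new MemoryStream())
        {
            reader.BaseStream.CopyTo(buffer);
            return buffer.ToArray();
        }
    }

    public static void Main()
    {
        var payload = new byte[] { 1, 2, 3, 4 };
        using (var reader = new BinaryReader(new MemoryStream(payload)))
            Console.WriteLine(ReadAllBytes(reader).Length); // 4
    }
}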
Example #3
        private static void ExecCore(EnvironmentBlock* penv, IHost host, IChannel ch, string graph, int cdata, DataSourceBlock** ppdata)
        {
            Contracts.AssertValue(ch);
            ch.AssertValue(host);
            ch.AssertNonEmpty(graph);
            ch.Assert(cdata >= 0);
            ch.Assert(ppdata != null || cdata == 0);

            RunGraphCore(penv, host, graph, cdata, ppdata);
        }
Example #4
        private void RunCore(IChannel ch)
        {
            IDataLoader loader = CreateAndSaveLoader();

            using (var schemaWriter = new StringWriter())
            {
                RunOnData(schemaWriter, Args, loader);
                var str = schemaWriter.ToString();
                ch.AssertNonEmpty(str);
                ch.Info(str);
            }
        }
Example #5
        /// <summary>
        /// This method simply prints the overall metrics using MetricWriter.PrintOverallMetrics.
        /// Override if something else is needed.
        /// </summary>
        protected virtual void PrintOverallResultsCore(IChannel ch, string filename, Dictionary<string, IDataView>[] metrics)
        {
            ch.AssertNonEmpty(metrics);

            IDataView overall;

            if (!TryGetOverallMetrics(metrics, out overall))
            {
                throw ch.Except("No overall metrics found");
            }

            MetricWriter.PrintOverallMetrics(Host, ch, filename, overall, metrics.Length);
        }
Example #6
        protected override void PrintOverallResultsCore(IChannel ch, string filename, Dictionary<string, IDataView>[] metrics)
        {
            ch.AssertNonEmpty(metrics);

            IDataView overall;

            if (!TryGetOverallMetrics(metrics, out overall))
            {
                throw ch.Except("No overall metrics found");
            }

            // Show only the metrics for the requested index.
            overall = ExtractRelevantIndex(overall);
            MetricWriter.PrintOverallMetrics(Host, ch, filename, overall, metrics.Length);
        }
Example #7
        protected override void PrintAdditionalMetricsCore(IChannel ch, Dictionary<string, IDataView>[] metrics)
        {
            ch.AssertNonEmpty(metrics);

            if (!string.IsNullOrEmpty(_groupSummaryFilename))
            {
                IDataView gs;
                if (!TryGetGroupSummaryMetrics(metrics, out gs))
                {
                    throw ch.Except("Did not find group summary metrics");
                }

                ch.Trace("Saving group-summary results");
                // If the data view contains stratification columns, filter so that only the overall metrics
                // will be present, and drop them.
                gs = MetricWriter.GetNonStratifiedMetrics(Host, gs);
                MetricWriter.SavePerInstance(Host, ch, _groupSummaryFilename, gs);
            }
        }
Example #8
        private static IDataLoader LoadStopwords(IHostEnvironment env, IChannel ch, string dataFile,
                                                 SubComponent<IDataLoader, SignatureDataLoader> loader, ref string stopwordsCol)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(ch, nameof(ch));
            // First column using the file.
            if (!loader.IsGood())
            {
                // Determine the default loader from the extension.
                var  ext         = Path.GetExtension(dataFile);
                bool isBinary    = string.Equals(ext, ".idv", StringComparison.OrdinalIgnoreCase);
                bool isTranspose = string.Equals(ext, ".tdv", StringComparison.OrdinalIgnoreCase);
                if (isBinary || isTranspose)
                {
                    ch.Assert(isBinary != isTranspose);
                    ch.CheckUserArg(!string.IsNullOrWhiteSpace(stopwordsCol), nameof(Arguments.StopwordsColumn),
                                    "stopwordsColumn should be specified");
                    if (isBinary)
                    {
                        loader = new SubComponent<IDataLoader, SignatureDataLoader>("BinaryLoader");
                    }
                    else
                    {
                        ch.Assert(isTranspose);
                        loader = new SubComponent<IDataLoader, SignatureDataLoader>("TransposeLoader");
                    }
                }
                else
                {
                    if (!string.IsNullOrWhiteSpace(stopwordsCol))
                    {
                        ch.Warning("{0} should not be specified when default loader is TextLoader. Ignoring stopwordsColumn={0}",
                                   stopwordsCol);
                    }
                    loader = new SubComponent<IDataLoader, SignatureDataLoader>("TextLoader", "sep=tab col=Stopwords:TX:0");
                    stopwordsCol = "Stopwords";
                }
            }
            ch.AssertNonEmpty(stopwordsCol);

            return loader.CreateInstance(env, new MultiFileSource(dataFile));
        }
Example #9
        protected override void PrintOverallResultsCore(IChannel ch, string filename, Dictionary<string, IDataView>[] metrics)
        {
            ch.AssertNonEmpty(metrics);

            IDataView overall;

            if (!TryGetOverallMetrics(metrics, out overall))
            {
                throw ch.Except("No overall metrics found");
            }

            var args = new DropColumnsTransform.Arguments();

            args.Column = new[]
            {
                AnomalyDetectionEvaluator.OverallMetrics.NumAnomalies,
                AnomalyDetectionEvaluator.OverallMetrics.ThreshAtK,
                AnomalyDetectionEvaluator.OverallMetrics.ThreshAtP,
                AnomalyDetectionEvaluator.OverallMetrics.ThreshAtNumPos
            };
            overall = new DropColumnsTransform(Host, args, overall);
            MetricWriter.PrintOverallMetrics(Host, ch, filename, overall, metrics.Length);
        }
Example #10
        private void WriteDataCore(IChannel ch, TextWriter writer, IDataView data,
                                   out string argsLoader, out long count, out int min, out int max, params int[] cols)
        {
            _host.AssertValue(ch);
            ch.AssertValue(writer);
            ch.AssertValue(data);
            ch.AssertNonEmpty(cols);

            // Determine the active columns and whether there is header information.
            bool[] active = new bool[data.Schema.ColumnCount];
            for (int i = 0; i < cols.Length; i++)
            {
                ch.Check(0 <= cols[i] && cols[i] < active.Length);
                ch.Check(data.Schema.GetColumnType(cols[i]).ItemType.RawKind != 0);
                active[cols[i]] = true;
            }

            bool hasHeader = false;

            if (_outputHeader)
            {
                for (int i = 0; i < cols.Length; i++)
                {
                    if (hasHeader)
                    {
                        continue;
                    }
                    var type = data.Schema.GetColumnType(cols[i]);
                    if (!type.IsVector)
                    {
                        hasHeader = true;
                        continue;
                    }
                    if (!type.IsKnownSizeVector)
                    {
                        continue;
                    }
                    var typeNames = data.Schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.SlotNames, cols[i]);
                    if (typeNames != null && typeNames.VectorSize == type.VectorSize && typeNames.ItemType.IsText)
                    {
                        hasHeader = true;
                    }
                }
            }

            using (var cursor = data.GetRowCursor(i => active[i]))
            {
                var pipes = new ValueWriter[cols.Length];
                for (int i = 0; i < cols.Length; i++)
                {
                    pipes[i] = ValueWriter.Create(cursor, cols[i], _sepChar);
                }

                // REVIEW: This should be outside the cursor creation.
                string header = CreateLoaderArguments(data.Schema, pipes, hasHeader, ch);
                argsLoader = header;
                if (_outputSchema)
                {
                    WriteSchemaAsComment(writer, header);
                }

                double rowCount = data.GetRowCount(true) ?? double.NaN;
                using (var pch = !_silent ? _host.StartProgressChannel("TextSaver: saving data") : null)
                {
                    long stateCount = 0;
                    var  state      = new State(this, writer, pipes, hasHeader);
                    if (pch != null)
                    {
                        pch.SetHeader(new ProgressHeader(new[] { "rows" }), e => e.SetProgress(0, stateCount, rowCount));
                    }
                    state.Run(cursor, ref stateCount, out min, out max);
                    count = stateCount;
                    if (pch != null)
                    {
                        pch.Checkpoint(stateCount);
                    }
                }
            }
        }
Example #11
        /// <summary>
        /// Utility method to create the file-based <see cref="TermMap"/> if the <see cref="ArgumentsBase.DataFile"/>
        /// argument of <paramref name="args"/> was present.
        /// </summary>
        private static TermMap CreateFileTermMap(IHostEnvironment env, IChannel ch, ArgumentsBase args, Builder bldr)
        {
            Contracts.AssertValue(ch);
            ch.AssertValue(env);
            ch.AssertValue(args);
            ch.Assert(!string.IsNullOrWhiteSpace(args.DataFile));
            ch.AssertValue(bldr);

            string file = args.DataFile;
            // First column using the file.
            string src = args.TermsColumn;
            var    sub = args.Loader;
            // If the user manually specifies a loader, or this is already a pre-processed binary
            // file, then we assume the user knows what they're doing and do not attempt to convert
            // to the desired type ourselves.
            bool autoConvert = false;

            if (!sub.IsGood())
            {
                // Determine the default loader from the extension.
                var  ext         = Path.GetExtension(file);
                bool isBinary    = string.Equals(ext, ".idv", StringComparison.OrdinalIgnoreCase);
                bool isTranspose = string.Equals(ext, ".tdv", StringComparison.OrdinalIgnoreCase);
                if (isBinary || isTranspose)
                {
                    ch.Assert(isBinary != isTranspose);
                    ch.CheckUserArg(!string.IsNullOrWhiteSpace(src), nameof(args.TermsColumn),
                                    "Must be specified");
                    if (isBinary)
                    {
                        sub = new SubComponent<IDataLoader, SignatureDataLoader>("BinaryLoader");
                    }
                    else
                    {
                        ch.Assert(isTranspose);
                        sub = new SubComponent<IDataLoader, SignatureDataLoader>("TransposeLoader");
                    }
                }
                else
                {
                    if (!string.IsNullOrWhiteSpace(src))
                    {
                        ch.Warning(
                            "{0} should not be specified when default loader is TextLoader. Ignoring {0}={1}",
                            nameof(Arguments.TermsColumn), src);
                    }
                    sub         = new SubComponent<IDataLoader, SignatureDataLoader>("TextLoader", "sep=tab col=Term:TX:0");
                    src         = "Term";
                    autoConvert = true;
                }
            }
            ch.AssertNonEmpty(src);

            int colSrc;
            var loader = sub.CreateInstance(env, new MultiFileSource(file));

            if (!loader.Schema.TryGetColumnIndex(src, out colSrc))
            {
                throw ch.ExceptUserArg(nameof(args.TermsColumn), "Unknown column '{0}'", src);
            }
            var typeSrc = loader.Schema.GetColumnType(colSrc);

            if (!autoConvert && !typeSrc.Equals(bldr.ItemType))
            {
                throw ch.ExceptUserArg(nameof(args.TermsColumn), "Must be of type '{0}' but was '{1}'", bldr.ItemType, typeSrc);
            }

            using (var cursor = loader.GetRowCursor(col => col == colSrc))
                using (var pch = env.StartProgressChannel("Building term dictionary from file"))
                {
                    var    header   = new ProgressHeader(new[] { "Total Terms" }, new[] { "examples" });
                    var    trainer  = Trainer.Create(cursor, colSrc, autoConvert, int.MaxValue, bldr);
                    double rowCount = loader.GetRowCount(true) ?? double.NaN;
                    long   rowCur   = 0;
                    pch.SetHeader(header,
                        e =>
                        {
                            e.SetProgress(0, rowCur, rowCount);
                            // Purely feedback for the user. That the other thread might be
                            // working in the background is not a problem.
                            e.SetMetric(0, trainer.Count);
                        });
                    while (cursor.MoveNext() && trainer.ProcessRow())
                    {
                        rowCur++;
                    }
                    if (trainer.Count == 0)
                    {
                        ch.Warning("Term map loaded from file resulted in an empty map.");
                    }
                    pch.Checkpoint(trainer.Count, rowCur);
                    return trainer.Finish();
                }
        }
Example #12
        private void SaveTransposedData(IChannel ch, Stream stream, ITransposeDataView data, int[] cols)
        {
            _host.AssertValue(ch);
            ch.AssertValue(stream);
            ch.AssertValue(data);
            ch.AssertNonEmpty(cols);
            ch.Assert(stream.CanSeek);

            // Initialize what we can in the header, though we will not be writing out things in the
            // header until we have confidence that things were written out correctly.
            TransposeLoader.Header header = default(TransposeLoader.Header);
            header.Signature         = TransposeLoader.Header.SignatureValue;
            header.Version           = TransposeLoader.Header.WriterVersion;
            header.CompatibleVersion = TransposeLoader.Header.WriterVersion;
            VectorType slotType = data.TransposeSchema.GetSlotType(cols[0]);

            ch.AssertValue(slotType);
            header.RowCount    = slotType.ValueCount;
            header.ColumnCount = cols.Length;

            // We keep track of the offsets of the start of each sub-IDV, for use in writing out the
            // offsets/length table later.
            List<long> offsets = new List<long>();

            // First write a bunch of zeros at the head, as a placeholder for the header that
            // will go there assuming we can successfully load it. We'll keep this array around
            // for the real marshalling and writing of the header bytes structure.
            byte[] headerBytes = new byte[TransposeLoader.Header.HeaderSize];
            stream.Write(headerBytes, 0, headerBytes.Length);
            offsets.Add(stream.Position);

            // This is a convenient delegate to write out an IDV substream, then save the offsets
            // where writing stopped to the offsets list.
            Action<string, IDataView> viewAction =
                (name, view) =>
            {
                using (var substream = new SubsetStream(stream))
                {
                    _internalSaver.SaveData(substream, view, Utils.GetIdentityPermutation(view.Schema.ColumnCount));
                    substream.Seek(0, SeekOrigin.End);
                    ch.Info("Wrote {0} data view in {1} bytes", name, substream.Length);
                }
                offsets.Add(stream.Position);
            };

            // First write out the no-row data, limited to these columns.
            IDataView subdata = new ChooseColumnsByIndexTransform(_host,
                new ChooseColumnsByIndexTransform.Arguments() { Index = cols }, data);

            // If we want the "dual mode" row-wise and slot-wise file, don't filter out anything.
            if (!_writeRowData)
            {
                subdata = SkipTakeFilter.Create(_host, new SkipTakeFilter.TakeArguments()
                {
                    Count = 0
                }, subdata);
            }

            string msg = _writeRowData ? "row-wise data, schema, and metadata" : "schema and metadata";

            viewAction(msg, subdata);
            foreach (var col in cols)
            {
                viewAction(data.Schema.GetColumnName(col), new TransposerUtils.SlotDataView(_host, data, col));
            }

            // Wrote out the dataview. Write out the table offset.
            using (var writer = new BinaryWriter(stream, Encoding.UTF8, leaveOpen: true))
            {
                // Format of the table is offset, length, both as 8-byte integers.
                // As it happens we wrote things out as adjacent sub-IDVs, so the
                // length can be derived from the offsets. The first will be the
                // start of the first sub-IDV, and all subsequent entries will be
                // the start/end of the current/next sub-IDV, respectively, so a total
                // of cols.Length + 2 entries.
                ch.Assert(offsets.Count == cols.Length + 2);
                ch.Assert(offsets[offsets.Count - 1] == stream.Position);
                header.SubIdvTableOffset = stream.Position;
                for (int c = 1; c < offsets.Count; ++c)
                {
                    // 8-byte int for offsets, 8-byte int for length.
                    writer.Write(offsets[c - 1]);
                    writer.Write(offsets[c] - offsets[c - 1]);
                }
                header.TailOffset = stream.Position;
                writer.Write(TransposeLoader.Header.TailSignatureValue);

                // Now we are confident that things will work, so write it out.
                unsafe
                {
                    Marshal.Copy(new IntPtr(&header), headerBytes, 0, Marshal.SizeOf(typeof(TransposeLoader.Header)));
                }
                writer.Seek(0, SeekOrigin.Begin);
                writer.Write(headerBytes);
            }
        }
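The saver above reserves a zeroed placeholder for the header, writes all sub-IDV payloads while recording their offsets, and only seeks back to write the real header and the offset/length table once everything else has succeeded. Below is a reduced, self-contained sketch of that write-payload-first, patch-header-last pattern; the 16-byte header layout is invented for illustration and is not the TransposeLoader format.

// Illustrative sketch: reserve a placeholder header, write the payload,
// then seek back and fill in the header once the payload is known to be good.
using System;
using System.IO;
using System.Text;

static class PlaceholderHeaderSketch
{
    const int HeaderSize = 16; // hypothetical header: payload offset + payload length, 8 bytes each

    public static byte[] Write(byte[] payload)
    {
        using (var stream = new MemoryStream())
        using (var writer = new BinaryWriter(stream, Encoding.UTF8, leaveOpen: true))
        {
            // 1. Reserve space for the header with zeros.
            writer.Write(new byte[HeaderSize]);

            // 2. Write the payload and remember where it starts and how long it is.
            long payloadOffset = stream.Position;
            writer.Write(payload);
            long payloadLength = stream.Position - payloadOffset;

            // 3. Only now, seek back and overwrite the placeholder with the real header.
            writer.Seek(0, SeekOrigin.Begin);
            writer.Write(payloadOffset);
            writer.Write(payloadLength);
            writer.Flush();
            return stream.ToArray();
        }
    }

    public static void Main()
    {
        var bytes = Write(new byte[] { 1, 2, 3 });
        Console.WriteLine(bytes.Length); // 16-byte header + 3-byte payload = 19
    }
}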