/// <summary>
/// Creates a loader over the stopwords data file. When <paramref name="loader"/> is null, a default
/// loader is inferred from the file extension (.idv => BinaryLoader, .tdv => TransposeLoader,
/// otherwise TextLoader), and in the text-loader case <paramref name="stopwordsCol"/> is forced
/// to "Stopwords".
/// </summary>
/// <param name="env">The host environment. Must not be null.</param>
/// <param name="ch">Channel used for user-argument checks and warnings. Must not be null.</param>
/// <param name="dataFile">Path to the stopwords file.</param>
/// <param name="loader">Optional loader factory; when null the loader is inferred from the extension.</param>
/// <param name="stopwordsCol">Name of the stopwords column; overwritten with "Stopwords" when the default text loader is used.</param>
/// <returns>The loader over the stopwords file.</returns>
private static IDataLoader LoadStopwords(IHostEnvironment env, IChannel ch, string dataFile,
    IComponentFactory<IMultiStreamSource, IDataLoader> loader, ref string stopwordsCol)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(ch, nameof(ch));

    MultiFileSource fileSource = new MultiFileSource(dataFile);
    IDataLoader dataLoader;

    // First column using the file.
    if (loader == null)
    {
        // Determine the default loader from the extension.
        var ext = Path.GetExtension(dataFile);
        bool isBinary = string.Equals(ext, ".idv", StringComparison.OrdinalIgnoreCase);
        bool isTranspose = string.Equals(ext, ".tdv", StringComparison.OrdinalIgnoreCase);
        if (isBinary || isTranspose)
        {
            ch.Assert(isBinary != isTranspose);
            // For pre-processed binary/transpose files the column cannot be inferred, so it must be given.
            ch.CheckUserArg(!string.IsNullOrWhiteSpace(stopwordsCol), nameof(Arguments.StopwordsColumn),
                "stopwordsColumn should be specified");
            if (isBinary)
                dataLoader = new BinaryLoader(env, new BinaryLoader.Arguments(), fileSource);
            else
            {
                ch.Assert(isTranspose);
                dataLoader = new TransposeLoader(env, new TransposeLoader.Arguments(), fileSource);
            }
        }
        else
        {
            if (!string.IsNullOrWhiteSpace(stopwordsCol))
            {
                // BUGFIX: the original passed only the column value, so "{0}" rendered the value
                // where the argument name was intended ("<value> should not be specified...").
                // Pass both the argument name and its value, matching the analogous warning in
                // CreateFileTermMap.
                ch.Warning("{0} should not be specified when default loader is TextLoader. Ignoring {0}={1}",
                    nameof(Arguments.StopwordsColumn), stopwordsCol);
            }
            dataLoader = TextLoader.Create(
                env,
                new TextLoader.Arguments()
                {
                    Separator = "tab",
                    Column = new[]
                    {
                        new TextLoader.Column("Stopwords", DataKind.TX, 0)
                    }
                },
                fileSource);
            stopwordsCol = "Stopwords";
        }
        ch.AssertNonEmpty(stopwordsCol);
    }
    else
        dataLoader = loader.CreateComponent(env, fileSource);
    return dataLoader;
}
// Deserialization constructor: reconstructs the transform from a saved model context.
// The default term map was serialized as an extra binary stream (DefaultMap.idv); it is
// read back into _bytes, turned into a loader, validated, and trained into the value map.
private TermLookupTransform(IChannel ch, ModelLoadContext ctx, IHost host, IDataView input)
    : base(host, ctx, input, TestIsText)
{
    Host.AssertValue(ch);

    // *** Binary format ***
    // <base>
    ch.AssertNonEmpty(Infos);

    // Extra streams:
    // DefaultMap.idv
    byte[] rgb = null;
    Action<BinaryReader> fn = r => rgb = ReadAllBytes(ch, r);

    // A missing DefaultMap stream means the model is corrupt, hence the decode exception.
    if (!ctx.TryLoadBinaryStream(DefaultMapName, fn))
    {
        throw ch.ExceptDecode();
    }

    _bytes = rgb;

    // Process the bytes into the loader and map.
    _ldr = GetLoader(Host, _bytes);
    ValidateLoader(ch, _ldr);
    _valueMap = Train(ch, _ldr);
    SetMetadata();
}
// Core of the graph-execution entry point called from native interop. Validates the
// managed and unmanaged inputs, then dispatches to RunGraphCore.
// penv and ppdata are unmanaged pointers supplied by the native caller; cdata is the
// number of data-source blocks in ppdata.
private static void ExecCore(EnvironmentBlock* penv, IHost host, IChannel ch, string graph, int cdata, DataSourceBlock** ppdata)
{
    Contracts.AssertValue(ch);
    ch.AssertValue(host);
    ch.AssertNonEmpty(graph);
    ch.Assert(cdata >= 0);
    // ppdata may be null only when there are no data sources at all.
    ch.Assert(ppdata != null || cdata == 0);
    RunGraphCore(penv, host, graph, cdata, ppdata);
}
/// <summary>
/// Creates and saves the loader, runs the command over its data while capturing the
/// schema text, and reports that text to the channel.
/// </summary>
private void RunCore(IChannel ch)
{
    var dataLoader = CreateAndSaveLoader();
    using (var textBuffer = new StringWriter())
    {
        RunOnData(textBuffer, Args, dataLoader);
        string schemaText = textBuffer.ToString();
        ch.AssertNonEmpty(schemaText);
        ch.Info(schemaText);
    }
}
/// <summary>
/// This method simply prints the overall metrics using EvaluateUtils.PrintOverallMetrics.
/// Override if something else is needed.
/// </summary>
protected virtual void PrintOverallResultsCore(IChannel ch, string filename, Dictionary<string, IDataView>[] metrics)
{
    ch.AssertNonEmpty(metrics);

    IDataView overallMetrics;
    if (!TryGetOverallMetrics(metrics, out overallMetrics))
        throw ch.Except("No overall metrics found");

    MetricWriter.PrintOverallMetrics(Host, ch, filename, overallMetrics, metrics.Length);
}
/// <summary>
/// Prints the overall metrics, first restricting them to the requested index via
/// <c>ExtractRelevantIndex</c>.
/// </summary>
protected override void PrintOverallResultsCore(IChannel ch, string filename, Dictionary<string, IDataView>[] metrics)
{
    ch.AssertNonEmpty(metrics);

    IDataView overallView;
    if (!TryGetOverallMetrics(metrics, out overallView))
        throw ch.Except("No overall metrics found");

    // Show only the metrics for the requested index.
    IDataView filtered = ExtractRelevantIndex(overallView);
    MetricWriter.PrintOverallMetrics(Host, ch, filename, filtered, metrics.Length);
}
/// <summary>
/// Saves the group-summary metrics to <c>_groupSummaryFilename</c>, when a filename was supplied.
/// </summary>
protected override void PrintAdditionalMetricsCore(IChannel ch, Dictionary<string, IDataView>[] metrics)
{
    ch.AssertNonEmpty(metrics);

    // Nothing to do unless a group-summary output file was requested.
    if (string.IsNullOrEmpty(_groupSummaryFilename))
        return;

    IDataView groupSummary;
    if (!TryGetGroupSummaryMetrics(metrics, out groupSummary))
        throw ch.Except("Did not find group summary metrics");

    ch.Trace("Saving group-summary results");
    // If the data view contains stratification columns, filter so that only the overall metrics
    // will be present, and drop them.
    groupSummary = MetricWriter.GetNonStratifiedMetrics(Host, groupSummary);
    MetricWriter.SavePerInstance(Host, ch, _groupSummaryFilename, groupSummary);
}
/// <summary>
/// Creates the loader for the stopwords data file. When no loader sub-component is specified,
/// the default is inferred from the file extension (.idv => BinaryLoader, .tdv => TransposeLoader,
/// otherwise TextLoader), and in the text-loader case <paramref name="stopwordsCol"/> is set
/// to "Stopwords".
/// </summary>
/// <param name="env">The host environment. Must not be null.</param>
/// <param name="ch">Channel used for user-argument checks and warnings. Must not be null.</param>
/// <param name="dataFile">Path to the stopwords file.</param>
/// <param name="loader">Optional loader sub-component; replaced with an inferred default when not good.</param>
/// <param name="stopwordsCol">Name of the stopwords column; overwritten with "Stopwords" when the default text loader is used.</param>
/// <returns>The instantiated loader over the stopwords file.</returns>
private static IDataLoader LoadStopwords(IHostEnvironment env, IChannel ch, string dataFile,
    SubComponent<IDataLoader, SignatureDataLoader> loader, ref string stopwordsCol)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(ch, nameof(ch));

    // First column using the file.
    if (!loader.IsGood())
    {
        // Determine the default loader from the extension.
        var ext = Path.GetExtension(dataFile);
        bool isBinary = string.Equals(ext, ".idv", StringComparison.OrdinalIgnoreCase);
        bool isTranspose = string.Equals(ext, ".tdv", StringComparison.OrdinalIgnoreCase);
        if (isBinary || isTranspose)
        {
            ch.Assert(isBinary != isTranspose);
            // For pre-processed binary/transpose files the column cannot be inferred, so it must be given.
            ch.CheckUserArg(!string.IsNullOrWhiteSpace(stopwordsCol), nameof(Arguments.StopwordsColumn),
                "stopwordsColumn should be specified");
            if (isBinary)
                loader = new SubComponent<IDataLoader, SignatureDataLoader>("BinaryLoader");
            else
            {
                ch.Assert(isTranspose);
                loader = new SubComponent<IDataLoader, SignatureDataLoader>("TransposeLoader");
            }
        }
        else
        {
            if (!string.IsNullOrWhiteSpace(stopwordsCol))
            {
                // BUGFIX: the original passed only the column value, so "{0}" rendered the value
                // where the argument name was intended. Pass both the argument name and its value,
                // matching the analogous warning in CreateFileTermMap.
                ch.Warning("{0} should not be specified when default loader is TextLoader. Ignoring {0}={1}",
                    nameof(Arguments.StopwordsColumn), stopwordsCol);
            }
            loader = new SubComponent<IDataLoader, SignatureDataLoader>("TextLoader", "sep=tab col=Stopwords:TX:0");
            stopwordsCol = "Stopwords";
        }
    }
    ch.AssertNonEmpty(stopwordsCol);
    return loader.CreateInstance(env, new MultiFileSource(dataFile));
}
/// <summary>
/// Prints the overall anomaly-detection metrics, dropping the anomaly-count and
/// threshold columns before printing.
/// </summary>
protected override void PrintOverallResultsCore(IChannel ch, string filename, Dictionary<string, IDataView>[] metrics)
{
    ch.AssertNonEmpty(metrics);

    IDataView overallView;
    if (!TryGetOverallMetrics(metrics, out overallView))
        throw ch.Except("No overall metrics found");

    // Drop the columns that are excluded from the printed overall report.
    var dropArgs = new DropColumnsTransform.Arguments
    {
        Column = new[]
        {
            AnomalyDetectionEvaluator.OverallMetrics.NumAnomalies,
            AnomalyDetectionEvaluator.OverallMetrics.ThreshAtK,
            AnomalyDetectionEvaluator.OverallMetrics.ThreshAtP,
            AnomalyDetectionEvaluator.OverallMetrics.ThreshAtNumPos
        }
    };
    overallView = new DropColumnsTransform(Host, dropArgs, overallView);
    MetricWriter.PrintOverallMetrics(Host, ch, filename, overallView, metrics.Length);
}
/// <summary>
/// Writes the given columns of <paramref name="data"/> to <paramref name="writer"/> as text.
/// Outputs the loader-arguments string describing the written schema (<paramref name="argsLoader"/>),
/// the number of rows written (<paramref name="count"/>), and min/max values produced by
/// <c>State.Run</c> (presumably per-row output-length bounds — confirm in State).
/// </summary>
private void WriteDataCore(IChannel ch, TextWriter writer, IDataView data, out string argsLoader, out long count, out int min, out int max, params int[] cols)
{
    _host.AssertValue(ch);
    ch.AssertValue(writer);
    ch.AssertValue(data);
    ch.AssertNonEmpty(cols);

    // Determine the active columns and whether there is header information.
    bool[] active = new bool[data.Schema.ColumnCount];
    for (int i = 0; i < cols.Length; i++)
    {
        ch.Check(0 <= cols[i] && cols[i] < active.Length);
        // RawKind of zero means the item type is not a savable primitive kind.
        ch.Check(data.Schema.GetColumnType(cols[i]).ItemType.RawKind != 0);
        active[cols[i]] = true;
    }

    bool hasHeader = false;
    if (_outputHeader)
    {
        for (int i = 0; i < cols.Length; i++)
        {
            // Once any column establishes a header, no further checks are needed.
            if (hasHeader)
            {
                continue;
            }
            var type = data.Schema.GetColumnType(cols[i]);
            // A scalar column always contributes a header entry.
            if (!type.IsVector)
            {
                hasHeader = true;
                continue;
            }
            // Variable-size vectors never force a header.
            if (!type.IsKnownSizeVector)
            {
                continue;
            }
            // A known-size vector contributes a header only when it carries textual
            // slot names matching the vector size.
            var typeNames = data.Schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.SlotNames, cols[i]);
            if (typeNames != null && typeNames.VectorSize == type.VectorSize && typeNames.ItemType.IsText)
            {
                hasHeader = true;
            }
        }
    }

    using (var cursor = data.GetRowCursor(i => active[i]))
    {
        // One value writer per requested column, bound to the cursor.
        var pipes = new ValueWriter[cols.Length];
        for (int i = 0; i < cols.Length; i++)
        {
            pipes[i] = ValueWriter.Create(cursor, cols[i], _sepChar);
        }

        // REVIEW: This should be outside the cursor creation.
        string header = CreateLoaderArguments(data.Schema, pipes, hasHeader, ch);
        argsLoader = header;
        if (_outputSchema)
        {
            WriteSchemaAsComment(writer, header);
        }

        // NaN signals an unknown row count to the progress reporting below.
        double rowCount = data.GetRowCount(true) ?? double.NaN;

        using (var pch = !_silent ? _host.StartProgressChannel("TextSaver: saving data") : null)
        {
            long stateCount = 0;
            var state = new State(this, writer, pipes, hasHeader);
            if (pch != null)
            {
                pch.SetHeader(new ProgressHeader(new[] { "rows" }), e => e.SetProgress(0, stateCount, rowCount));
            }
            state.Run(cursor, ref stateCount, out min, out max);
            count = stateCount;
            if (pch != null)
            {
                pch.Checkpoint(stateCount);
            }
        }
    }
}
/// <summary>
/// Utility method to create the file-based <see cref="TermMap"/> if the <see cref="ArgumentsBase.DataFile"/>
/// argument of <paramref name="args"/> was present.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="ch">Channel used for checks, warnings, and user-argument errors.</param>
/// <param name="args">The arguments; <see cref="ArgumentsBase.DataFile"/> must be non-empty.</param>
/// <param name="bldr">The term-map builder; its item type constrains the terms column.</param>
/// <returns>The trained term map.</returns>
private static TermMap CreateFileTermMap(IHostEnvironment env, IChannel ch, ArgumentsBase args, Builder bldr)
{
    Contracts.AssertValue(ch);
    ch.AssertValue(env);
    ch.AssertValue(args);
    ch.Assert(!string.IsNullOrWhiteSpace(args.DataFile));
    ch.AssertValue(bldr);

    string file = args.DataFile;
    // First column using the file.
    string src = args.TermsColumn;
    var sub = args.Loader;
    // If the user manually specifies a loader, or this is already a pre-processed binary
    // file, then we assume the user knows what they're doing and do not attempt to convert
    // to the desired type ourselves.
    bool autoConvert = false;
    if (!sub.IsGood())
    {
        // Determine the default loader from the extension.
        var ext = Path.GetExtension(file);
        bool isBinary = string.Equals(ext, ".idv", StringComparison.OrdinalIgnoreCase);
        bool isTranspose = string.Equals(ext, ".tdv", StringComparison.OrdinalIgnoreCase);
        if (isBinary || isTranspose)
        {
            ch.Assert(isBinary != isTranspose);
            ch.CheckUserArg(!string.IsNullOrWhiteSpace(src), nameof(args.TermsColumn), "Must be specified");
            if (isBinary)
                sub = new SubComponent<IDataLoader, SignatureDataLoader>("BinaryLoader");
            else
            {
                ch.Assert(isTranspose);
                sub = new SubComponent<IDataLoader, SignatureDataLoader>("TransposeLoader");
            }
        }
        else
        {
            if (!string.IsNullOrWhiteSpace(src))
            {
                // BUGFIX: this literal was split across a physical line break, which is not legal
                // for a regular (non-verbatim) C# string literal; rejoined into a single line.
                ch.Warning("{0} should not be specified when default loader is TextLoader. Ignoring {0}={1}",
                    nameof(Arguments.TermsColumn), src);
            }
            sub = new SubComponent<IDataLoader, SignatureDataLoader>("TextLoader", "sep=tab col=Term:TX:0");
            src = "Term";
            // The default text loader reads raw text, so request automatic conversion
            // to the builder's item type.
            autoConvert = true;
        }
    }
    ch.AssertNonEmpty(src);

    int colSrc;
    var loader = sub.CreateInstance(env, new MultiFileSource(file));
    if (!loader.Schema.TryGetColumnIndex(src, out colSrc))
        throw ch.ExceptUserArg(nameof(args.TermsColumn), "Unknown column '{0}'", src);
    var typeSrc = loader.Schema.GetColumnType(colSrc);
    // Without auto-conversion, the source column must already have the builder's item type.
    if (!autoConvert && !typeSrc.Equals(bldr.ItemType))
        throw ch.ExceptUserArg(nameof(args.TermsColumn), "Must be of type '{0}' but was '{1}'", bldr.ItemType, typeSrc);

    using (var cursor = loader.GetRowCursor(col => col == colSrc))
    using (var pch = env.StartProgressChannel("Building term dictionary from file"))
    {
        var header = new ProgressHeader(new[] { "Total Terms" }, new[] { "examples" });
        var trainer = Trainer.Create(cursor, colSrc, autoConvert, int.MaxValue, bldr);
        double rowCount = loader.GetRowCount(true) ?? double.NaN;
        long rowCur = 0;
        pch.SetHeader(header, e =>
        {
            e.SetProgress(0, rowCur, rowCount);
            // Purely feedback for the user. That the other thread might be
            // working in the background is not a problem.
            e.SetMetric(0, trainer.Count);
        });
        // Stop early if the trainer signals it cannot accept more rows.
        while (cursor.MoveNext() && trainer.ProcessRow())
            rowCur++;
        if (trainer.Count == 0)
            ch.Warning("Term map loaded from file resulted in an empty map.");
        pch.Checkpoint(trainer.Count, rowCur);
        return trainer.Finish();
    }
}
/// <summary>
/// Saves <paramref name="data"/> to <paramref name="stream"/> in the transposed (.tdv) format:
/// a placeholder header, then one sub-IDV per section (schema/row data first, then one slot-wise
/// view per column), then an offset/length table and tail signature, and finally the real header
/// written back over the placeholder.
/// </summary>
private void SaveTransposedData(IChannel ch, Stream stream, ITransposeDataView data, int[] cols)
{
    _host.AssertValue(ch);
    ch.AssertValue(stream);
    ch.AssertValue(data);
    ch.AssertNonEmpty(cols);
    // Seeking is required because the header is rewritten at offset 0 at the end.
    ch.Assert(stream.CanSeek);

    // Initialize what we can in the header, though we will not be writing out things in the
    // header until we have confidence that things were written out correctly.
    TransposeLoader.Header header = default(TransposeLoader.Header);
    header.Signature = TransposeLoader.Header.SignatureValue;
    header.Version = TransposeLoader.Header.WriterVersion;
    header.CompatibleVersion = TransposeLoader.Header.WriterVersion;
    // The slot type's value count of the first column doubles as the row count.
    VectorType slotType = data.TransposeSchema.GetSlotType(cols[0]);
    ch.AssertValue(slotType);
    header.RowCount = slotType.ValueCount;
    header.ColumnCount = cols.Length;

    // We keep track of the offsets of the start of each sub-IDV, for use in writing out the
    // offsets/length table later.
    List<long> offsets = new List<long>();
    // First write a bunch of zeros at the head, as a placeholder for the header that
    // will go there assuming we can successfully load it. We'll keep this array around
    // for the real marshalling and writing of the header bytes structure.
    byte[] headerBytes = new byte[TransposeLoader.Header.HeaderSize];
    stream.Write(headerBytes, 0, headerBytes.Length);
    offsets.Add(stream.Position);

    // This is a convenient delegate to write out an IDV substream, then save the offsets
    // where writing stopped to the offsets list.
    Action<string, IDataView> viewAction = (name, view) =>
    {
        using (var substream = new SubsetStream(stream))
        {
            _internalSaver.SaveData(substream, view, Utils.GetIdentityPermutation(view.Schema.ColumnCount));
            substream.Seek(0, SeekOrigin.End);
            ch.Info("Wrote {0} data view in {1} bytes", name, substream.Length);
        }
        offsets.Add(stream.Position);
    };

    // First write out the no-row data, limited to these columns.
    IDataView subdata = new ChooseColumnsByIndexTransform(_host,
        new ChooseColumnsByIndexTransform.Arguments() { Index = cols }, data);
    // If we want the "dual mode" row-wise and slot-wise file, don't filter out anything.
    if (!_writeRowData)
        subdata = SkipTakeFilter.Create(_host, new SkipTakeFilter.TakeArguments() { Count = 0 }, subdata);

    string msg = _writeRowData ? "row-wise data, schema, and metadata" : "schema and metadata";
    viewAction(msg, subdata);
    // One slot-wise (transposed) sub-IDV per column.
    foreach (var col in cols)
        viewAction(data.Schema.GetColumnName(col), new TransposerUtils.SlotDataView(_host, data, col));

    // Wrote out the dataview. Write out the table offset.
    using (var writer = new BinaryWriter(stream, Encoding.UTF8, leaveOpen: true))
    {
        // Format of the table is offset, length, both as 8-byte integers.
        // As it happens we wrote things out as adjacent sub-IDVs, so the
        // length can be derived from the offsets. The first will be the
        // start of the first sub-IDV, and all subsequent entries will be
        // the start/end of the current/next sub-IDV, respectively, so a total
        // of cols.Length + 2 entries.
        ch.Assert(offsets.Count == cols.Length + 2);
        ch.Assert(offsets[offsets.Count - 1] == stream.Position);
        header.SubIdvTableOffset = stream.Position;
        for (int c = 1; c < offsets.Count; ++c)
        {
            // 8-byte int for offsets, 8-byte int for length.
            writer.Write(offsets[c - 1]);
            writer.Write(offsets[c] - offsets[c - 1]);
        }
        header.TailOffset = stream.Position;
        writer.Write(TransposeLoader.Header.TailSignatureValue);

        // Now we are confident that things will work, so write it out.
        unsafe
        {
            // Copy the header struct's raw bytes into the buffer before writing it at offset 0.
            Marshal.Copy(new IntPtr(&header), headerBytes, 0, Marshal.SizeOf(typeof(Header)));
        }
        writer.Seek(0, SeekOrigin.Begin);
        writer.Write(headerBytes);
    }
}