/// <summary>
/// Picks a loader for <paramref name="dataFile"/> and hands it, together with the two column
/// names, to <c>GetBytesFromDataView</c>. When <paramref name="loaderFactory"/> is null the loader
/// is chosen from the file extension: <c>.idv</c> uses <see cref="BinaryLoader"/> and <c>.tdv</c>
/// uses <see cref="TransposeLoader"/>; any other extension is reported as a user-argument error.
/// </summary>
/// <param name="host">Host used for assertions, exceptions, and loader construction.</param>
/// <param name="dataFile">Path of the file to load; asserted to be non-blank.</param>
/// <param name="loaderFactory">Optional factory for the loader; may be null.</param>
/// <param name="termColumn">Name of the term column; asserted to be non-empty.</param>
/// <param name="valueColumn">Name of the value column; asserted to be non-empty.</param>
/// <returns>The result of <c>GetBytesFromDataView</c> for the selected loader.</returns>
private static byte[] GetBytesOne(IHost host, string dataFile, IComponentFactory<IMultiStreamSource, IDataLoader> loaderFactory,
    string termColumn, string valueColumn)
{
    Contracts.AssertValue(host);
    host.Assert(!string.IsNullOrWhiteSpace(dataFile));
    host.AssertNonEmpty(termColumn);
    host.AssertNonEmpty(valueColumn);

    IMultiStreamSource source = new MultiFileSource(dataFile);
    IDataLoader dataLoader;

    if (loaderFactory != null)
    {
        // An explicitly supplied factory always wins over extension sniffing.
        dataLoader = loaderFactory.CreateComponent(host, source);
    }
    else
    {
        // REVIEW: Should there be defaults for loading from text?
        string extension = Path.GetExtension(dataFile);
        if (string.Equals(extension, ".idv", StringComparison.OrdinalIgnoreCase))
        {
            dataLoader = new BinaryLoader(host, new BinaryLoader.Arguments(), source);
        }
        else if (string.Equals(extension, ".tdv", StringComparison.OrdinalIgnoreCase))
        {
            dataLoader = new TransposeLoader(host, new TransposeLoader.Arguments(), source);
        }
        else
        {
            // Neither of the two recognized extensions: the caller has to say how to load the file.
            throw host.ExceptUserArg(nameof(Arguments.Loader), "must specify the loader");
        }
    }

    return GetBytesFromDataView(host, dataLoader, termColumn, valueColumn);
}
/// <summary>
/// Builds a <see cref="TermMap"/> from the file named by the <see cref="ArgumentsBase.DataFile"/>
/// argument of <paramref name="args"/>. The loader comes from <c>args.Loader</c> when present;
/// otherwise it is chosen from the file extension (<c>.idv</c>, <c>.tdv</c>, or a single-column
/// tab-separated text file as the fallback).
/// </summary>
/// <param name="env">Environment used to create loaders and the progress channel.</param>
/// <param name="ch">Channel used for assertions, warnings, and user-argument errors.</param>
/// <param name="args">Arguments supplying the data file, optional loader, and terms column.</param>
/// <param name="bldr">Builder that receives the terms; its item type is checked against the column type.</param>
/// <returns>The term map produced by the trainer once the file has been read.</returns>
private static TermMap CreateFileTermMap(IHostEnvironment env, IChannel ch, ArgumentsBase args, Builder bldr)
{
    Contracts.AssertValue(ch);
    ch.AssertValue(env);
    ch.AssertValue(args);
    ch.Assert(!string.IsNullOrWhiteSpace(args.DataFile));
    ch.AssertValue(bldr);

    string path = args.DataFile;
    // Name of the column to read terms from; replaced by "Term" when the text fallback is used.
    string termColumnName = args.TermsColumn;
    IMultiStreamSource source = new MultiFileSource(path);
    var userLoaderFactory = args.Loader;

    // Conversion to the builder's item type is attempted only for the default text loader.
    // With a user-specified loader or a pre-processed binary file we assume the user knows
    // what they're doing and require the column type to match already.
    bool convertToItemType = false;
    IDataLoader dataLoader;

    if (userLoaderFactory == null)
    {
        // No loader given: infer one from the file extension.
        string extension = Path.GetExtension(path);
        bool isBinary = string.Equals(extension, ".idv", StringComparison.OrdinalIgnoreCase);
        bool isTranspose = string.Equals(extension, ".tdv", StringComparison.OrdinalIgnoreCase);

        if (!isBinary && !isTranspose)
        {
            // Text fallback: the terms column name is meaningless here, so warn if one was given.
            if (!string.IsNullOrWhiteSpace(termColumnName))
            {
                ch.Warning(
                    "{0} should not be specified when default loader is TextLoader. Ignoring {0}={1}",
                    nameof(Arguments.TermsColumn), termColumnName);
            }

            var firstField = new TextLoader.Range() { Min = 0 };
            var termColumn = new TextLoader.Column()
            {
                Name = "Term",
                Type = DataKind.TX,
                Source = new[] { firstField }
            };
            var textArgs = new TextLoader.Arguments()
            {
                Separator = "tab",
                Column = new[] { termColumn }
            };

            dataLoader = new TextLoader(env, textArgs, source);
            termColumnName = "Term";
            convertToItemType = true;
        }
        else
        {
            ch.Assert(isBinary != isTranspose);
            ch.CheckUserArg(!string.IsNullOrWhiteSpace(termColumnName), nameof(args.TermsColumn), "Must be specified");
            if (isTranspose)
            {
                dataLoader = new TransposeLoader(env, new TransposeLoader.Arguments(), source);
            }
            else
            {
                ch.Assert(isBinary);
                dataLoader = new BinaryLoader(env, new BinaryLoader.Arguments(), source);
            }
        }
    }
    else
    {
        dataLoader = userLoaderFactory.CreateComponent(env, source);
    }

    ch.AssertNonEmpty(termColumnName);

    int termColumnIndex;
    bool found = dataLoader.Schema.TryGetColumnIndex(termColumnName, out termColumnIndex);
    if (!found)
    {
        throw ch.ExceptUserArg(nameof(args.TermsColumn), "Unknown column '{0}'", termColumnName);
    }

    var termColumnType = dataLoader.Schema.GetColumnType(termColumnIndex);
    if (!(convertToItemType || termColumnType.Equals(bldr.ItemType)))
    {
        throw ch.ExceptUserArg(nameof(args.TermsColumn), "Must be of type '{0}' but was '{1}'", bldr.ItemType, termColumnType);
    }

    // Only the terms column needs to be active in the cursor.
    using (var rows = dataLoader.GetRowCursor(col => col == termColumnIndex))
    using (var progress = env.StartProgressChannel("Building term dictionary from file"))
    {
        var progressHeader = new ProgressHeader(new[] { "Total Terms" }, new[] { "examples" });
        var termTrainer = Trainer.Create(rows, termColumnIndex, convertToItemType, int.MaxValue, bldr);
        double totalRows = dataLoader.GetRowCount(true) ?? double.NaN;
        long rowsRead = 0;

        progress.SetHeader(progressHeader,
            e =>
            {
                e.SetProgress(0, rowsRead, totalRows);
                // Purely feedback for the user. That the other thread might be
                // working in the background is not a problem.
                e.SetMetric(0, termTrainer.Count);
            });

        // Stop at the end of the data or as soon as the trainer declines another row.
        for (; ; )
        {
            if (!rows.MoveNext())
            {
                break;
            }
            if (!termTrainer.ProcessRow())
            {
                break;
            }
            rowsRead++;
        }

        if (termTrainer.Count == 0)
        {
            ch.Warning("Term map loaded from file resulted in an empty map.");
        }

        progress.Checkpoint(termTrainer.Count, rowsRead);
        return termTrainer.Finish();
    }
}