private static byte[] GetBytesOne(IHost host, string dataFile, IComponentFactory <IMultiStreamSource, IDataLoader> loaderFactory,
                                          string termColumn, string valueColumn)
        {
            Contracts.AssertValue(host);
            host.Assert(!string.IsNullOrWhiteSpace(dataFile));
            host.AssertNonEmpty(termColumn);
            host.AssertNonEmpty(valueColumn);

            IMultiStreamSource fileSource = new MultiFileSource(dataFile);
            IDataLoader        loader;

            if (loaderFactory == null)
            {
                // REVIEW: Should there be defaults for loading from text?
                var  ext         = Path.GetExtension(dataFile);
                bool isBinary    = string.Equals(ext, ".idv", StringComparison.OrdinalIgnoreCase);
                bool isTranspose = string.Equals(ext, ".tdv", StringComparison.OrdinalIgnoreCase);
                if (!isBinary && !isTranspose)
                {
                    throw host.ExceptUserArg(nameof(Arguments.Loader), "must specify the loader");
                }
                host.Assert(isBinary != isTranspose); // One or the other must be true.
                if (isBinary)
                {
                    loader = new BinaryLoader(host, new BinaryLoader.Arguments(), fileSource);
                }
                else
                {
                    loader = new TransposeLoader(host, new TransposeLoader.Arguments(), fileSource);
                }
            }
            else
            {
                loader = loaderFactory.CreateComponent(host, fileSource);
            }

            return(GetBytesFromDataView(host, loader, termColumn, valueColumn));
        }
Esempio n. 2
0
        /// <summary>
        /// Utility method to create the file-based <see cref="TermMap"/> if the <see cref="ArgumentsBase.DataFile"/>
        /// argument of <paramref name="args"/> was present.
        /// </summary>
        private static TermMap CreateFileTermMap(IHostEnvironment env, IChannel ch, ArgumentsBase args, Builder bldr)
        {
            Contracts.AssertValue(ch);
            ch.AssertValue(env);
            ch.AssertValue(args);
            ch.Assert(!string.IsNullOrWhiteSpace(args.DataFile));
            ch.AssertValue(bldr);

            string file = args.DataFile;
            // First column using the file.
            string             src        = args.TermsColumn;
            IMultiStreamSource fileSource = new MultiFileSource(file);

            var loaderFactory = args.Loader;
            // If the user manually specifies a loader, or this is already a pre-processed binary
            // file, then we assume the user knows what they're doing and do not attempt to convert
            // to the desired type ourselves.
            bool        autoConvert = false;
            IDataLoader loader;

            if (loaderFactory != null)
            {
                loader = loaderFactory.CreateComponent(env, fileSource);
            }
            else
            {
                // Determine the default loader from the extension.
                var  ext         = Path.GetExtension(file);
                bool isBinary    = string.Equals(ext, ".idv", StringComparison.OrdinalIgnoreCase);
                bool isTranspose = string.Equals(ext, ".tdv", StringComparison.OrdinalIgnoreCase);
                if (isBinary || isTranspose)
                {
                    ch.Assert(isBinary != isTranspose);
                    ch.CheckUserArg(!string.IsNullOrWhiteSpace(src), nameof(args.TermsColumn),
                                    "Must be specified");
                    if (isBinary)
                    {
                        loader = new BinaryLoader(env, new BinaryLoader.Arguments(), fileSource);
                    }
                    else
                    {
                        ch.Assert(isTranspose);
                        loader = new TransposeLoader(env, new TransposeLoader.Arguments(), fileSource);
                    }
                }
                else
                {
                    if (!string.IsNullOrWhiteSpace(src))
                    {
                        ch.Warning(
                            "{0} should not be specified when default loader is TextLoader. Ignoring {0}={1}",
                            nameof(Arguments.TermsColumn), src);
                    }
                    loader = new TextLoader(env,
                                            new TextLoader.Arguments()
                    {
                        Separator = "tab",
                        Column    = new[]
                        {
                            new TextLoader.Column()
                            {
                                Name   = "Term",
                                Type   = DataKind.TX,
                                Source = new[] { new TextLoader.Range()
                                                 {
                                                     Min = 0
                                                 } }
                            }
                        }
                    },
                                            fileSource);
                    src         = "Term";
                    autoConvert = true;
                }
            }
            ch.AssertNonEmpty(src);

            int colSrc;

            if (!loader.Schema.TryGetColumnIndex(src, out colSrc))
            {
                throw ch.ExceptUserArg(nameof(args.TermsColumn), "Unknown column '{0}'", src);
            }
            var typeSrc = loader.Schema.GetColumnType(colSrc);

            if (!autoConvert && !typeSrc.Equals(bldr.ItemType))
            {
                throw ch.ExceptUserArg(nameof(args.TermsColumn), "Must be of type '{0}' but was '{1}'", bldr.ItemType, typeSrc);
            }

            using (var cursor = loader.GetRowCursor(col => col == colSrc))
                using (var pch = env.StartProgressChannel("Building term dictionary from file"))
                {
                    var    header   = new ProgressHeader(new[] { "Total Terms" }, new[] { "examples" });
                    var    trainer  = Trainer.Create(cursor, colSrc, autoConvert, int.MaxValue, bldr);
                    double rowCount = loader.GetRowCount(true) ?? double.NaN;
                    long   rowCur   = 0;
                    pch.SetHeader(header,
                                  e =>
                    {
                        e.SetProgress(0, rowCur, rowCount);
                        // Purely feedback for the user. That the other thread might be
                        // working in the background is not a problem.
                        e.SetMetric(0, trainer.Count);
                    });
                    while (cursor.MoveNext() && trainer.ProcessRow())
                    {
                        rowCur++;
                    }
                    if (trainer.Count == 0)
                    {
                        ch.Warning("Term map loaded from file resulted in an empty map.");
                    }
                    pch.Checkpoint(trainer.Count, rowCur);
                    return(trainer.Finish());
                }
        }