Пример #1
0
        /// <summary>
        /// Attempts to load a sample of <paramref name="source"/> with the given loader arguments and
        /// decides whether the sampled rows split into a consistent number of columns.
        /// </summary>
        /// <param name="args">Text loader arguments to try; the separator, quoting and sparsity
        /// settings are copied into <paramref name="result"/> on success.</param>
        /// <param name="source">The data to sample.</param>
        /// <param name="result">On success, a <see cref="ColumnSplitResult"/> carrying the settings
        /// from <paramref name="args"/> and the most common column count; otherwise <c>null</c>.</param>
        /// <returns>
        /// <c>true</c> when at least one row was read, the most common column count covers at least
        /// <c>UniformColumnCountThreshold</c> of the sampled rows, and that count is greater than one;
        /// otherwise <c>false</c>.
        /// </returns>
        private static bool TryParseFile(TextLoader.Arguments args, IMultiStreamSource source, out ColumnSplitResult result)
        {
            result = null;
            var textLoader = new TextLoader(new MLContext(), args);

            // Only a bounded sample (Take(1000)) is inspected.
            var idv = textLoader.Read(source).Take(1000);
            var columnCounts = new List<int>();

            // NOTE(review): presumably args is set up by the caller so that every field of a line
            // lands in one vector column named "C" - confirm against the caller.
            var column = idv.Schema["C"];
            var columnIndex = column.Index;

            using (var cursor = idv.GetRowCursor(x => x == columnIndex))
            {
                var getter = cursor.GetGetter<VBuffer<ReadOnlyMemory<char>>>(columnIndex);

                VBuffer<ReadOnlyMemory<char>> line = default;
                while (cursor.MoveNext())
                {
                    getter(ref line);
                    // The vector length of the row is the number of fields it was split into.
                    columnCounts.Add(line.Length);
                }
            }

            // No rows were read (e.g. empty input): there is nothing to infer a column count from.
            // Without this guard, First() below would throw InvalidOperationException.
            if (columnCounts.Count == 0)
            {
                return false;
            }

            // Group rows by their column count and pick the count that occurs most often.
            var mostCommon = columnCounts.GroupBy(x => x).OrderByDescending(x => x.Count()).First();

            // Reject the candidate when too few rows agree on the column count.
            if (mostCommon.Count() < UniformColumnCountThreshold * columnCounts.Count)
            {
                return false;
            }

            // disallow single-column case
            if (mostCommon.Key <= 1)
            {
                return false;
            }

            result = new ColumnSplitResult(true, args.Separator, args.AllowQuoting, args.AllowSparse, mostCommon.Key);
            return true;
        }
Пример #2
0
        /// <summary>
        /// Tries to load a sample of <paramref name="source"/> with the supplied loader arguments inside a
        /// temporary environment and reports the column count shared by most of the sampled rows.
        /// </summary>
        /// <param name="ch">Channel that receives the trace line and, on success only, the messages
        /// captured from the temporary environment.</param>
        /// <param name="args">Text loader arguments to try; separator, quoting and sparsity settings
        /// are copied into <paramref name="result"/> on success.</param>
        /// <param name="source">The data to sample.</param>
        /// <param name="skipStrictValidation">When <c>true</c>, neither the uniformity threshold nor the
        /// "more than one column" requirement is enforced.</param>
        /// <param name="result">On success, the discovered split; otherwise the default value.</param>
        /// <returns><c>true</c> if the candidate arguments were accepted; <c>false</c> if they were
        /// rejected or a marked exception was raised while loading.</returns>
        private static bool TryParseFile(IChannel ch, TextLoader.Arguments args, IMultiStreamSource source, bool skipStrictValidation, out ColumnSplitResult result)
        {
            result = default(ColumnSplitResult);
            try
            {
                // The loader runs in a scratch environment so that messages from a failed attempt
                // never reach the caller; they are forwarded only if this attempt succeeds.
                using (var scratchEnv = new TlcEnvironment(0, true))
                {
                    var captured = new ConcurrentBag<ChannelMessage>();
                    scratchEnv.AddListener<ChannelMessage>((sender, message) => captured.Add(message));

                    // Only a bounded sample (Take(1000)) is inspected.
                    var sample = TextLoader.ReadFile(scratchEnv, args, source).Take(1000);

                    int colIndex;
                    bool hasColumn = sample.Schema.TryGetColumnIndex("C", out colIndex);
                    ch.Assert(hasColumn);

                    // Record, for every sampled row, the length of its "C" vector.
                    var lengths = new List<int>();
                    using (var rows = sample.GetRowCursor(c => c == colIndex))
                    {
                        var fetch = rows.GetGetter<VBuffer<DvText>>(colIndex);
                        var fields = default(VBuffer<DvText>);
                        while (rows.MoveNext())
                        {
                            fetch(ref fields);
                            lengths.Add(fields.Length);
                        }
                    }

                    Contracts.Check(lengths.Count > 0);

                    // The length that occurs most often among the sampled rows.
                    var dominant = lengths
                        .GroupBy(len => len)
                        .OrderByDescending(bucket => bucket.Count())
                        .First();

                    if (!skipStrictValidation)
                    {
                        // Too few rows agree on the column count.
                        if (dominant.Count() < UniformColumnCountThreshold * lengths.Count)
                        {
                            return false;
                        }

                        // The single-column case is accepted only when strict validation is skipped
                        // (per the original note: when the user explicitly specified the separator);
                        // otherwise the user is told that no columns could be detected.
                        if (dominant.Key <= 1)
                        {
                            return false;
                        }
                    }

                    result = new ColumnSplitResult(true, args.Separator, args.AllowQuoting, args.AllowSparse, dominant.Key);
                    ch.Trace("Discovered {0} columns using separator '{1}'", dominant.Key, args.Separator);

                    // Success: replay everything the scratch environment reported.
                    foreach (var message in captured)
                    {
                        ch.Send(message);
                    }

                    return true;
                }
            }
            catch (Exception ex)
            {
                // Marked (known) exceptions just mean this candidate failed, so the caller can
                // move on to the next separator candidate. Anything else propagates unchanged.
                if (ex.IsMarked())
                {
                    return false;
                }

                throw;
            }
        }