示例#1
0
        // This method is called when only a data file is specified, without a loader or term/value column settings.
        // It determines the type of the Value column and returns the appropriate TextLoader component factory.
        //
        // filename:  path of the lookup file. Column 0 is read as the term, column 1 as the value.
        // keyValues: when false, the Value column is loaded as a numeric column and the file is not scanned.
        //            When true, the file is scanned once to find the smallest and largest key value, and
        //            the Value column is declared as a key column (U4 or U8) based on that range.
        // host:      used for assertions, progress/warning channels and for wrapping exceptions.
        //
        // Any exception raised while scanning the file is rethrown through host.Except with a message naming
        // the file.
        private static IComponentFactory<IMultiStreamSource, IDataLoader> GetLoaderFactory(string filename, bool keyValues, IHost host)
        {
            Contracts.AssertValue(host);

            // If the user specified non-key values, we define the value column to be numeric.
            if (!keyValues)
            {
                return ComponentFactoryUtils.CreateFromFunction<IMultiStreamSource, IDataLoader>(
                    (env, files) => new TextLoader(
                        env,
                        new TextLoader.Arguments()
                        {
                            Column = new[]
                            {
                                new TextLoader.Column("Term", DataKind.TX, 0),
                                new TextLoader.Column("Value", DataKind.Num, 1)
                            }
                        },
                        files));
            }

            // If the user specified key values, we scan the values to determine the range of the key type.
            // min/max start inverted (min > max) so that "no valid key seen" can be detected after the scan.
            ulong min = ulong.MaxValue;
            ulong max = ulong.MinValue;

            try
            {
                // Load both columns as text so that the values can be inspected before committing to a type.
                var txtArgs = new TextLoader.Arguments();
                bool parsed = CmdParser.ParseArguments(host, "col=Term:TX:0 col=Value:TX:1", txtArgs);
                host.Assert(parsed);
                var txtLoader = new TextLoader(host, txtArgs, new MultiFileSource(filename));
                using (var cursor = txtLoader.GetRowCursor(c => true))
                {
                    var getTerm = cursor.GetGetter<DvText>(0);
                    var getVal = cursor.GetGetter<DvText>(1);
                    DvText txt = default(DvText);

                    using (var ch = host.Start("Creating Text Lookup Loader"))
                    {
                        long countNonKeys = 0;
                        while (cursor.MoveNext())
                        {
                            getVal(ref txt);
                            ulong res;
                            // Try to parse the text as a key value between 1 and ulong.MaxValue. If this succeeds and res>0,
                            // we update max and min accordingly. If res==0 it means the value is missing, in which case we ignore it for
                            // computing max and min.
                            if (Conversions.Instance.TryParseKey(ref txt, 1, ulong.MaxValue, out res))
                            {
                                if (res < min && res != 0)
                                {
                                    min = res;
                                }
                                if (res > max)
                                {
                                    max = res;
                                }
                            }
                            // If parsing as key did not succeed, the value can still be 0, so we try parsing it as a ulong. If it succeeds,
                            // then the value is 0, and we update min accordingly.
                            else if (Conversions.Instance.TryParse(ref txt, out res))
                            {
                                ch.Assert(res == 0);
                                min = 0;
                            }
                            // If parsing as a ulong fails, we increment the counter for the non-key values.
                            else
                            {
                                // Only the first few offending rows are reported individually, so the term is
                                // fetched only when it is actually going to be logged.
                                if (countNonKeys < 5)
                                {
                                    var term = default(DvText);
                                    getTerm(ref term);
                                    ch.Warning("Term '{0}' in mapping file is mapped to non key value '{1}'", term, txt);
                                }
                                countNonKeys++;
                            }
                        }
                        if (countNonKeys > 0)
                        {
                            ch.Warning("Found {0} non key values in the file '{1}'", countNonKeys, filename);
                        }
                        if (min > max)
                        {
                            // No valid key was seen. Fall back to a range whose span (uint.MaxValue - 1) selects
                            // the U4 branch below in which only the minimum is passed to KeyRange.
                            min = 0;
                            max = uint.MaxValue - 1;
                            ch.Warning("did not find any valid key values in the file '{0}'", filename);
                        }
                        else
                        {
                            ch.Info("Found key values in the range {0} to {1} in the file '{2}'", min, max, filename);
                        }
                        ch.Done();
                    }
                }
            }
            catch (Exception e)
            {
                throw host.Except(e, "Failed to parse the lookup file '{0}' in TermLookupTransform", filename);
            }

            // Choose the key column's raw type and range from the span of observed values (min <= max here):
            //  - span below int.MaxValue:  U4 with both ends of the range specified;
            //  - span below uint.MaxValue: U4 with only the minimum specified;
            //  - anything larger:          U8 with only the minimum specified.
            TextLoader.Column valueColumn = new TextLoader.Column("Value", DataKind.U4, 1);
            if (max - min < (ulong)int.MaxValue)
            {
                valueColumn.KeyRange = new KeyRange(min, max);
            }
            else if (max - min < (ulong)uint.MaxValue)
            {
                valueColumn.KeyRange = new KeyRange(min);
            }
            else
            {
                valueColumn.Type = DataKind.U8;
                valueColumn.KeyRange = new KeyRange(min);
            }

            return ComponentFactoryUtils.CreateFromFunction<IMultiStreamSource, IDataLoader>(
                (env, files) => new TextLoader(
                    env,
                    new TextLoader.Arguments()
                    {
                        Column = new[]
                        {
                            new TextLoader.Column("Term", DataKind.TX, 0),
                            valueColumn
                        }
                    },
                    files));
        }