/// <summary>
/// Parses a slot-map specification string into a jagged array of slot indices.
/// The string is a semicolon-separated list of groups, each group being a
/// comma-separated list of slot indices in the range [0, srcSlotCount).
/// Throws via <c>Host.Except</c> on an out-of-range/non-numeric index or a
/// duplicate index within a group.
/// </summary>
private int[][] CompileSlotMap(string slotMapString, int srcSlotCount)
{
    var groups = ReadOnlyMemoryUtils.Split(slotMapString.AsMemory(), new[] { ';' }).ToArray();
    var compiled = new int[groups.Length][];
    for (int groupIdx = 0; groupIdx < compiled.Length; groupIdx++)
    {
        var tokens = ReadOnlyMemoryUtils.Split(groups[groupIdx], new[] { ',' }).ToArray();
        var parsed = new int[tokens.Length];
        compiled[groupIdx] = parsed;
        for (int tokenIdx = 0; tokenIdx < parsed.Length; tokenIdx++)
        {
            // Reject anything that is not an integer within the source slot range.
            if (!int.TryParse(tokens[tokenIdx].ToString(), out int index) || index < 0 || index >= srcSlotCount)
            {
                throw Host.Except("Unexpected slot index '{1}' in group {0}. Expected 0 to {2}", groupIdx, tokens[tokenIdx], srcSlotCount - 1);
            }
            parsed[tokenIdx] = index;
        }
        // A group may not reference the same source slot more than once.
        if (parsed.Distinct().Count() < parsed.Length)
        {
            throw Host.Except("Group '{0}' has duplicate slot indices", groups[groupIdx]);
        }
    }
    return compiled;
}
/// <summary>
/// Entry point that, for a binary-classification predictor model with exactly two known label
/// names, renames each non-hidden score column by appending "." plus the positive class name
/// (the second label). The PredictedLabel column is left untouched. For any other predictor
/// kind, or when label names are unavailable, the data passes through unchanged (via a no-op
/// transform). The rename is implemented as a copy-to-new-name followed by a drop of the
/// original columns.
/// </summary>
/// <param name="env">Host environment used for validation and transform construction.</param>
/// <param name="input">The predictor model and data whose score columns should be renamed.</param>
/// <returns>A <see cref="CommonOutputs.TransformOutput"/> with the (possibly renamed) data and its transform model.</returns>
public static CommonOutputs.TransformOutput RenameBinaryPredictionScoreColumns(IHostEnvironment env, RenameBinaryPredictionScoreColumnsInput input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register("ScoreModel");
    host.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(host, input);
    if (input.PredictorModel.Predictor.PredictionKind == PredictionKind.BinaryClassification)
    {
        DataViewType labelType;
        var labelNames = input.PredictorModel.GetLabelInfo(host, out labelType);
        // Only rename when there are exactly two label names; labelNames[1] is treated as the positive class.
        if (labelNames != null && labelNames.Length == 2)
        {
            var positiveClass = labelNames[1];

            // Rename all the score columns.
            int colMax;
            var maxScoreId = input.Data.Schema.GetMaxAnnotationKind(out colMax, AnnotationUtils.Kinds.ScoreColumnSetId);
            var copyCols = new List<(string name, string source)>();
            for (int i = 0; i < input.Data.Schema.Count; i++)
            {
                if (input.Data.Schema[i].IsHidden)
                {
                    continue;
                }
                // Only columns belonging to the most recent score column set are renamed.
                if (!ShouldAddColumn(input.Data.Schema, i, null, maxScoreId))
                {
                    continue;
                }
                // Do not rename the PredictedLabel column.
                ReadOnlyMemory<char> tmp = default;
                if (input.Data.Schema.TryGetAnnotation(TextDataViewType.Instance, AnnotationUtils.Kinds.ScoreValueKind, i, ref tmp) && ReadOnlyMemoryUtils.EqualsStr(AnnotationUtils.Const.ScoreValueKind.PredictedLabel, tmp))
                {
                    continue;
                }
                var source = input.Data.Schema[i].Name;
                var name = source + "." + positiveClass;
                copyCols.Add((name, source));
            }
            // Copy each score column under its new name, then drop the originals.
            var copyColumn = new ColumnCopyingTransformer(env, copyCols.ToArray()).Transform(input.Data);
            var dropColumn = ColumnSelectingTransformer.CreateDrop(env, copyColumn, copyCols.Select(c => c.source).ToArray());
            return (new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, dropColumn, input.Data), OutputData = dropColumn });
        }
    }
    // Not binary classification (or label names unavailable): pass the data through unchanged.
    var newView = NopTransform.CreateIfNeeded(env, input.Data);
    return (new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, newView, input.Data), OutputData = newView });
}
/// <summary>
/// Evaluates the delegate <paramref name="del"/> once per non-empty line of
/// <paramref name="text"/> within [ichMin, ichLim). Each line is split on commas into
/// values that are parsed into <paramref name="args"/> slots via per-type getters, the
/// delegate is invoked with those arguments, and the result (formatted by the printer for
/// <paramref name="typeRes"/>) is written to <paramref name="wrt"/>. A parse failure for a
/// value is reported inline and evaluation continues.
/// </summary>
private void Evaluate(IndentedTextWriter wrt, Delegate del, DataViewType typeRes, DataViewType[] types, string text, int ichMin, int ichLim)
{
    Contracts.AssertValue(del);
    Contracts.AssertNonEmpty(types);
    // args is shared with the getters: each getter parses into its own slot of this array.
    var args = new object[types.Length];
    var getters = new Func<ReadOnlyMemory<char>, bool>[types.Length];
    for (int i = 0; i < getters.Length; i++)
    {
        getters[i] = GetGetter(i, types[i], args);
    }
    StringBuilder sb = new StringBuilder();
    Action<object> printer = GetPrinter(typeRes, sb);
    ReadOnlyMemory<char> chars = text.AsMemory().Slice(ichMin, ichLim - ichMin);
    for (bool more = true; more;)
    {
        ReadOnlyMemory<char> line;
        // Split on CR on Windows and LF elsewhere, matching the platform's line endings.
        if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
        {
            more = ReadOnlyMemoryUtils.SplitOne(chars, '\x0D', out line, out chars);
        }
        else
        {
            more = ReadOnlyMemoryUtils.SplitOne(chars, '\x0A', out line, out chars);
        }
        line = ReadOnlyMemoryUtils.TrimWhiteSpace(line);
        if (line.IsEmpty)
        {
            continue;
        }
        // Note this "hack" to map _ to empty. It's easier than fully handling quoting and is sufficient
        // for these tests.
        var vals = ReadOnlyMemoryUtils.Split(line, new char[] { ',' })
            .Select(x => ReadOnlyMemoryUtils.TrimWhiteSpace(x))
            .Select(x => ReadOnlyMemoryUtils.EqualsStr("_", x) ? ReadOnlyMemory<char>.Empty : x)
            .ToArray();
        // Each line must supply exactly one value per delegate argument.
        Contracts.Assert(vals.Length == getters.Length);
        for (int i = 0; i < getters.Length; i++)
        {
            if (!getters[i](vals[i]))
            {
                wrt.Write("*** Parsing {0} Failed *** ", vals[i]);
            }
        }
        // Invoke with the freshly parsed args and print the formatted result for this line.
        var res = del.DynamicInvoke(args);
        printer(res);
        wrt.WriteLine(sb);
    }
}
/// <summary>
/// Returns a lower-cased copy of the input text. Empty input is returned as-is
/// without allocating.
/// </summary>
public static TX Lower(TX a)
{
    // Nothing to lower-case: hand back the original memory.
    if (a.IsEmpty)
    {
        return a;
    }

    var builder = new StringBuilder();
    ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(a.Span, builder);
    var lowered = builder.ToString();
    return lowered.AsMemory();
}
/// <summary>
/// Builds the value getter for column <paramref name="iinfo"/>, which filters stop-words out of
/// a vector of text values. For each source token, the token is lower-cased into a reusable
/// StringBuilder and looked up in the stop-word pool for the active language; tokens not found
/// in the pool (and non-empty tokens only) are kept, preserving their original casing.
/// The language can come from a per-row column (when ex.LangsColIndex >= 0) or default to the
/// configured ex.Lang. No disposal is needed, so <paramref name="disposer"/> is set to null.
/// </summary>
protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer)
{
    Host.AssertValueOrNull(ch);
    Host.AssertValue(input);
    Host.Assert(0 <= iinfo && iinfo < Infos.Length);
    Host.Assert(Infos[iinfo].TypeSrc.IsVector & Infos[iinfo].TypeSrc.ItemType.IsText);
    disposer = null;
    var ex = _exes[iinfo];
    Language stopWordslang = ex.Lang;
    var lang = default(ReadOnlyMemory<char>);
    // Optional per-row language column; null means the configured default language is always used.
    var getLang = ex.LangsColIndex >= 0 ? input.GetGetter<ReadOnlyMemory<char>>(ex.LangsColIndex) : null;
    var getSrc = GetSrcGetter<VBuffer<ReadOnlyMemory<char>>>(input, iinfo);
    var src = default(VBuffer<ReadOnlyMemory<char>>);
    // buffer and list are captured by the delegate and reused across rows to avoid per-row allocation.
    var buffer = new StringBuilder();
    var list = new List<ReadOnlyMemory<char>>();
    ValueGetter<VBuffer<ReadOnlyMemory<char>>> del = (ref VBuffer<ReadOnlyMemory<char>> dst) =>
    {
        var langToUse = stopWordslang;
        // May override langToUse from the per-row language column, if one is configured.
        UpdateLanguage(ref langToUse, getLang, ref lang);
        getSrc(ref src);
        list.Clear();
        var srcValues = src.GetValues();
        for (int i = 0; i < srcValues.Length; i++)
        {
            // Empty tokens are dropped outright.
            if (srcValues[i].IsEmpty)
            {
                continue;
            }
            // Lower-case into the shared buffer for a case-insensitive stop-word lookup.
            buffer.Clear();
            ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(srcValues[i].Span, buffer);
            // REVIEW nihejazi: Consider using a trie for string matching (Aho-Corasick, etc.)
            if (StopWords[(int)langToUse].Get(buffer) == null)
            {
                // Not a stop-word: keep the original (non-lower-cased) token.
                list.Add(srcValues[i]);
            }
        }
        VBufferUtils.Copy(list, ref dst, list.Count);
    };
    return (del);
}
/// <summary>
/// Exercises the cross-validation macro with non-default label, group and name columns:
/// a sub-graph maps "Label" to key-typed "Label1" and hashes "Workclass" into "GroupId1",
/// trains a FastTree ranker on those columns, then verifies the overall NDCG metrics
/// (Average, Standard Deviation, and two per-fold rows whose sum is twice the average)
/// and that per-instance metrics carry the "Workclass" values as instance names.
/// </summary>
public void TestCrossValidationMacroWithNonDefaultNames()
{
    string dataPath = GetDataPath(@"adult.tiny.with-schema.txt");
    var env = new MLContext(42);
    // Sub-graph: key-conversion + hashing transforms feeding a ranking trainer, combined into one model.
    var subGraph = env.CreateExperiment();
    var textToKey = new Legacy.Transforms.TextToKeyConverter();
    textToKey.Column = new[] { new Legacy.Transforms.ValueToKeyMappingTransformerColumn() { Name = "Label1", Source = "Label" } };
    var textToKeyOutput = subGraph.Add(textToKey);
    var hash = new Legacy.Transforms.HashConverter();
    hash.Column = new[] { new Legacy.Transforms.HashJoiningTransformColumn() { Name = "GroupId1", Source = "Workclass" } };
    hash.Data = textToKeyOutput.OutputData;
    var hashOutput = subGraph.Add(hash);
    var learnerInput = new Legacy.Trainers.FastTreeRanker { TrainingData = hashOutput.OutputData, NumThreads = 1, LabelColumn = "Label1", GroupIdColumn = "GroupId1" };
    var learnerOutput = subGraph.Add(learnerInput);
    var modelCombine = new Legacy.Transforms.ManyHeterogeneousModelCombiner { TransformModels = new ArrayVar<TransformModel>(textToKeyOutput.Model, hashOutput.Model), PredictorModel = learnerOutput.PredictorModel };
    var modelCombineOutput = subGraph.Add(modelCombine);
    // Outer experiment: load the data and cross-validate the sub-graph with the renamed columns.
    var experiment = env.CreateExperiment();
    var importInput = new Legacy.Data.TextLoader(dataPath);
    importInput.Arguments.HasHeader = true;
    importInput.Arguments.Column = new TextLoaderColumn[] { new TextLoaderColumn { Name = "Label", Source = new[] { new TextLoaderRange(0) } }, new TextLoaderColumn { Name = "Workclass", Source = new[] { new TextLoaderRange(1) }, Type = Legacy.Data.DataKind.Text }, new TextLoaderColumn { Name = "Features", Source = new[] { new TextLoaderRange(9, 14) } } };
    var importOutput = experiment.Add(importInput);
    var crossValidate = new Legacy.Models.CrossValidator { Data = importOutput.Data, Nodes = subGraph, TransformModel = null, LabelColumn = "Label1", GroupColumn = "GroupId1", NameColumn = "Workclass", Kind = Legacy.Models.MacroUtilsTrainerKinds.SignatureRankerTrainer };
    crossValidate.Inputs.Data = textToKey.Data;
    crossValidate.Outputs.PredictorModel = modelCombineOutput.PredictorModel;
    var crossValidateOutput = experiment.Add(crossValidate);
    experiment.Compile();
    experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
    experiment.Run();
    // Verify the overall metrics: rows appear in the order Average, Standard Deviation, Fold 0, Fold 1.
    var data = experiment.GetOutput(crossValidateOutput.OverallMetrics);
    var schema = data.Schema;
    var b = schema.TryGetColumnIndex("NDCG", out int metricCol);
    Assert.True(b);
    b = schema.TryGetColumnIndex("Fold Index", out int foldCol);
    Assert.True(b);
    using (var cursor = data.GetRowCursor(col => col == metricCol || col == foldCol))
    {
        var getter = cursor.GetGetter<VBuffer<double>>(metricCol);
        var foldGetter = cursor.GetGetter<ReadOnlyMemory<char>>(foldCol);
        ReadOnlyMemory<char> fold = default;
        // Get the average.
        b = cursor.MoveNext();
        Assert.True(b);
        var avg = default(VBuffer<double>);
        getter(ref avg);
        foldGetter(ref fold);
        Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", fold));
        // Get the standard deviation.
        b = cursor.MoveNext();
        Assert.True(b);
        var stdev = default(VBuffer<double>);
        getter(ref stdev);
        foldGetter(ref fold);
        Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", fold));
        // Baseline stdev values for the first three NDCG slots.
        var stdevValues = stdev.GetValues();
        Assert.Equal(2.462, stdevValues[0], 3);
        Assert.Equal(2.763, stdevValues[1], 3);
        Assert.Equal(3.273, stdevValues[2], 3);
        // Sum the two per-fold metric vectors; their sum should be twice the average.
        var sumBldr = new BufferBuilder<double>(R8Adder.Instance);
        sumBldr.Reset(avg.Length, true);
        var val = default(VBuffer<double>);
        for (int f = 0; f < 2; f++)
        {
            b = cursor.MoveNext();
            Assert.True(b);
            getter(ref val);
            foldGetter(ref fold);
            sumBldr.AddFeatures(0, in val);
            Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + f, fold));
        }
        var sum = default(VBuffer<double>);
        sumBldr.GetResult(ref sum);
        var avgValues = avg.GetValues();
        var sumValues = sum.GetValues();
        for (int i = 0; i < avgValues.Length; i++)
        {
            Assert.Equal(avgValues[i], sumValues[i] / 2);
        }
        // No rows beyond the four metric rows.
        b = cursor.MoveNext();
        Assert.False(b);
    }
    // Per-instance metrics should use the "Workclass" name column as the Instance value.
    data = experiment.GetOutput(crossValidateOutput.PerInstanceMetrics);
    Assert.True(data.Schema.TryGetColumnIndex("Instance", out int nameCol));
    using (var cursor = data.GetRowCursor(col => col == nameCol))
    {
        var getter = cursor.GetGetter<ReadOnlyMemory<char>>(nameCol);
        while (cursor.MoveNext())
        {
            ReadOnlyMemory<char> name = default;
            getter(ref name);
            // Only the first few rows are checked; each must be one of the known Workclass values.
            Assert.Subset(new HashSet<string>() { "Private", "?", "Federal-gov" }, new HashSet<string>() { name.ToString() });
            if (cursor.Position > 4)
            {
                break;
            }
        }
    }
}
/// <summary>
/// Exercises the cross-validation macro with a stratification column: a no-op sub-graph
/// feeding an SDCA binary classifier is cross-validated with "Strat" as the stratification
/// column, then the overall AUC metrics are verified (Average, Standard Deviation with a
/// baseline value, and two per-fold rows whose sum is twice the average).
/// </summary>
public void TestCrossValidationMacroWithStratification()
{
    var dataPath = GetDataPath(@"breast-cancer.txt");
    var env = new MLContext(42);
    // Sub-graph: no-op transform + SDCA binary classifier combined into one model.
    var subGraph = env.CreateExperiment();
    var nop = new Legacy.Transforms.NoOperation();
    var nopOutput = subGraph.Add(nop);
    var learnerInput = new Legacy.Trainers.StochasticDualCoordinateAscentBinaryClassifier { TrainingData = nopOutput.OutputData, NumThreads = 1 };
    var learnerOutput = subGraph.Add(learnerInput);
    var modelCombine = new Legacy.Transforms.ManyHeterogeneousModelCombiner { TransformModels = new ArrayVar<TransformModel>(nopOutput.Model), PredictorModel = learnerOutput.PredictorModel };
    var modelCombineOutput = subGraph.Add(modelCombine);
    // Outer experiment: load Label/Strat/Features and cross-validate, stratifying on "Strat".
    var experiment = env.CreateExperiment();
    var importInput = new Legacy.Data.TextLoader(dataPath);
    importInput.Arguments.Column = new Legacy.Data.TextLoaderColumn[] { new Legacy.Data.TextLoaderColumn { Name = "Label", Source = new[] { new Legacy.Data.TextLoaderRange(0) } }, new Legacy.Data.TextLoaderColumn { Name = "Strat", Source = new[] { new Legacy.Data.TextLoaderRange(1) } }, new Legacy.Data.TextLoaderColumn { Name = "Features", Source = new[] { new Legacy.Data.TextLoaderRange(2, 9) } } };
    var importOutput = experiment.Add(importInput);
    var crossValidate = new Legacy.Models.CrossValidator { Data = importOutput.Data, Nodes = subGraph, TransformModel = null, StratificationColumn = "Strat" };
    crossValidate.Inputs.Data = nop.Data;
    crossValidate.Outputs.PredictorModel = modelCombineOutput.PredictorModel;
    var crossValidateOutput = experiment.Add(crossValidate);
    experiment.Compile();
    experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
    experiment.Run();
    // Verify the overall metrics: rows appear in the order Average, Standard Deviation, Fold 0, Fold 1.
    var data = experiment.GetOutput(crossValidateOutput.OverallMetrics);
    var schema = data.Schema;
    var b = schema.TryGetColumnIndex("AUC", out int metricCol);
    Assert.True(b);
    b = schema.TryGetColumnIndex("Fold Index", out int foldCol);
    Assert.True(b);
    using (var cursor = data.GetRowCursor(col => col == metricCol || col == foldCol))
    {
        var getter = cursor.GetGetter<double>(metricCol);
        var foldGetter = cursor.GetGetter<ReadOnlyMemory<char>>(foldCol);
        ReadOnlyMemory<char> fold = default;
        // Get the average.
        b = cursor.MoveNext();
        Assert.True(b);
        double avg = 0;
        getter(ref avg);
        foldGetter(ref fold);
        Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", fold));
        // Get the standard deviation.
        b = cursor.MoveNext();
        Assert.True(b);
        double stdev = 0;
        getter(ref stdev);
        foldGetter(ref fold);
        Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", fold));
        // Baseline stdev of the AUC across the two folds.
        Assert.Equal(0.00488, stdev, 5);
        // The two fold AUCs must sum to twice the reported average.
        double sum = 0;
        double val = 0;
        for (int f = 0; f < 2; f++)
        {
            b = cursor.MoveNext();
            Assert.True(b);
            getter(ref val);
            foldGetter(ref fold);
            sum += val;
            Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + f, fold));
        }
        Assert.Equal(avg, sum / 2);
        // No rows beyond the four metric rows.
        b = cursor.MoveNext();
        Assert.False(b);
    }
}
/// <summary>
/// Exercises the cross-validation macro for multi-class classification: a no-op sub-graph
/// feeding an SDCA multi-class classifier is cross-validated, then the test verifies the
/// overall micro-accuracy metrics (Average, Standard Deviation, two per-fold rows summing to
/// twice the average), the 10x10-per-fold confusion matrix (slot names "0".."9" and fold
/// labels per row), and that no warnings were produced.
/// </summary>
public void TestCrossValidationMacroWithMultiClass()
{
    var dataPath = GetDataPath(@"Train-Tiny-28x28.txt");
    var env = new MLContext(42);
    // Sub-graph: no-op transform + SDCA multi-class classifier combined into one model.
    var subGraph = env.CreateExperiment();
    var nop = new Legacy.Transforms.NoOperation();
    var nopOutput = subGraph.Add(nop);
    var learnerInput = new Legacy.Trainers.StochasticDualCoordinateAscentClassifier { TrainingData = nopOutput.OutputData, NumThreads = 1 };
    var learnerOutput = subGraph.Add(learnerInput);
    var modelCombine = new Legacy.Transforms.ManyHeterogeneousModelCombiner { TransformModels = new ArrayVar<TransformModel>(nopOutput.Model), PredictorModel = learnerOutput.PredictorModel };
    var modelCombineOutput = subGraph.Add(modelCombine);
    // Outer experiment: default text loader + multi-class cross-validation of the sub-graph.
    var experiment = env.CreateExperiment();
    var importInput = new Legacy.Data.TextLoader(dataPath);
    var importOutput = experiment.Add(importInput);
    var crossValidate = new Legacy.Models.CrossValidator { Data = importOutput.Data, Nodes = subGraph, Kind = Legacy.Models.MacroUtilsTrainerKinds.SignatureMultiClassClassifierTrainer, TransformModel = null };
    crossValidate.Inputs.Data = nop.Data;
    crossValidate.Outputs.PredictorModel = modelCombineOutput.PredictorModel;
    var crossValidateOutput = experiment.Add(crossValidate);
    experiment.Compile();
    importInput.SetInput(env, experiment);
    experiment.Run();
    // Verify the overall metrics: rows appear in the order Average, Standard Deviation, Fold 0, Fold 1.
    var data = experiment.GetOutput(crossValidateOutput.OverallMetrics);
    var schema = data.Schema;
    var b = schema.TryGetColumnIndex("Accuracy(micro-avg)", out int metricCol);
    Assert.True(b);
    b = schema.TryGetColumnIndex("Fold Index", out int foldCol);
    Assert.True(b);
    using (var cursor = data.GetRowCursor(col => col == metricCol || col == foldCol))
    {
        var getter = cursor.GetGetter<double>(metricCol);
        var foldGetter = cursor.GetGetter<ReadOnlyMemory<char>>(foldCol);
        ReadOnlyMemory<char> fold = default;
        // Get the average.
        b = cursor.MoveNext();
        Assert.True(b);
        double avg = 0;
        getter(ref avg);
        foldGetter(ref fold);
        Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", fold));
        // Get the standard deviation.
        b = cursor.MoveNext();
        Assert.True(b);
        double stdev = 0;
        getter(ref stdev);
        foldGetter(ref fold);
        Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", fold));
        // Baseline stdev of the micro-accuracy across the two folds.
        Assert.Equal(0.015, stdev, 3);
        // The two fold accuracies must sum to twice the reported average.
        double sum = 0;
        double val = 0;
        for (int f = 0; f < 2; f++)
        {
            b = cursor.MoveNext();
            Assert.True(b);
            getter(ref val);
            foldGetter(ref fold);
            sum += val;
            Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + f, fold));
        }
        Assert.Equal(avg, sum / 2);
        // No rows beyond the four metric rows.
        b = cursor.MoveNext();
        Assert.False(b);
    }
    // Verify the confusion matrix: a 10-slot count vector per row, slot names "0".."9",
    // and 10 rows per fold (Fold 0 then Fold 1).
    var confusion = experiment.GetOutput(crossValidateOutput.ConfusionMatrix);
    schema = confusion.Schema;
    b = schema.TryGetColumnIndex("Count", out int countCol);
    Assert.True(b);
    b = schema.TryGetColumnIndex("Fold Index", out foldCol);
    Assert.True(b);
    var type = schema[countCol].Metadata.Schema[MetadataUtils.Kinds.SlotNames].Type;
    Assert.True(type is VectorType vecType && vecType.ItemType is TextType && vecType.Size == 10);
    var slotNames = default(VBuffer<ReadOnlyMemory<char>>);
    schema[countCol].GetSlotNames(ref slotNames);
    var slotNameValues = slotNames.GetValues();
    for (int i = 0; i < slotNameValues.Length; i++)
    {
        Assert.True(ReadOnlyMemoryUtils.EqualsStr(i.ToString(), slotNameValues[i]));
    }
    using (var curs = confusion.GetRowCursor(col => true))
    {
        var countGetter = curs.GetGetter<VBuffer<double>>(countCol);
        var foldGetter = curs.GetGetter<ReadOnlyMemory<char>>(foldCol);
        var confCount = default(VBuffer<double>);
        var foldIndex = default(ReadOnlyMemory<char>);
        int rowCount = 0;
        var foldCur = "Fold 0";
        while (curs.MoveNext())
        {
            countGetter(ref confCount);
            foldGetter(ref foldIndex);
            rowCount++;
            Assert.True(ReadOnlyMemoryUtils.EqualsStr(foldCur, foldIndex));
            // After 10 rows of Fold 0 the remaining rows must belong to Fold 1.
            if (rowCount == 10)
            {
                rowCount = 0;
                foldCur = "Fold 1";
            }
        }
        // Both folds must have contributed exactly 10 rows each.
        Assert.Equal(0, rowCount);
    }
    // The run should not have produced any warnings.
    var warnings = experiment.GetOutput(crossValidateOutput.Warnings);
    using (var cursor = warnings.GetRowCursor(col => true))
        Assert.False(cursor.MoveNext());
}
/// <summary>
/// Exercises the cross-validation macro for regression with a weight column: a sub-graph
/// generates a random "Weight1" column and trains a Poisson regressor weighted by it. The
/// overall L1(avg) metrics then contain interleaved unweighted/weighted rows: for each of
/// Average and Standard Deviation one row per weighting, and per fold one row per weighting.
/// The test verifies the IsWeighted flag on every row, baseline stdev values, and that each
/// (un)weighted fold sum equals twice the corresponding average.
/// </summary>
[ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline
public void TestCrossValidationMacro()
{
    var dataPath = GetDataPath(TestDatasets.generatedRegressionDatasetmacro.trainFilename);
    var env = new MLContext(42);
    // Sub-graph: no-op + random "Weight1" generator feeding a weighted Poisson regressor, combined into one model.
    var subGraph = env.CreateExperiment();
    var nop = new Legacy.Transforms.NoOperation();
    var nopOutput = subGraph.Add(nop);
    var generate = new Legacy.Transforms.RandomNumberGenerator();
    generate.Column = new[] { new Legacy.Transforms.GenerateNumberTransformColumn() { Name = "Weight1" } };
    generate.Data = nopOutput.OutputData;
    var generateOutput = subGraph.Add(generate);
    var learnerInput = new Legacy.Trainers.PoissonRegressor { TrainingData = generateOutput.OutputData, NumThreads = 1, WeightColumn = "Weight1" };
    var learnerOutput = subGraph.Add(learnerInput);
    var modelCombine = new Legacy.Transforms.ManyHeterogeneousModelCombiner { TransformModels = new ArrayVar<TransformModel>(nopOutput.Model, generateOutput.Model), PredictorModel = learnerOutput.PredictorModel };
    var modelCombineOutput = subGraph.Add(modelCombine);
    // Outer experiment: load the semicolon-separated regression dataset and cross-validate the sub-graph.
    var experiment = env.CreateExperiment();
    var importInput = new Legacy.Data.TextLoader(dataPath)
    {
        Arguments = new Legacy.Data.TextLoaderArguments
        {
            Separator = new[] { ';' },
            HasHeader = true,
            Column = new[] { new TextLoaderColumn() { Name = "Label", Source = new [] { new TextLoaderRange(11) }, Type = Legacy.Data.DataKind.Num }, new TextLoaderColumn() { Name = "Features", Source = new [] { new TextLoaderRange(0, 10) }, Type = Legacy.Data.DataKind.Num } }
        }
    };
    var importOutput = experiment.Add(importInput);
    var crossValidate = new Legacy.Models.CrossValidator { Data = importOutput.Data, Nodes = subGraph, Kind = Legacy.Models.MacroUtilsTrainerKinds.SignatureRegressorTrainer, TransformModel = null, WeightColumn = "Weight1" };
    crossValidate.Inputs.Data = nop.Data;
    crossValidate.Outputs.PredictorModel = modelCombineOutput.PredictorModel;
    var crossValidateOutput = experiment.Add(crossValidate);
    experiment.Compile();
    importInput.SetInput(env, experiment);
    experiment.Run();
    var data = experiment.GetOutput(crossValidateOutput.OverallMetrics);
    var schema = data.Schema;
    var b = schema.TryGetColumnIndex("L1(avg)", out int metricCol);
    Assert.True(b);
    b = schema.TryGetColumnIndex("Fold Index", out int foldCol);
    Assert.True(b);
    b = schema.TryGetColumnIndex("IsWeighted", out int isWeightedCol);
    using (var cursor = data.GetRowCursor(col => col == metricCol || col == foldCol || col == isWeightedCol))
    {
        var getter = cursor.GetGetter<double>(metricCol);
        var foldGetter = cursor.GetGetter<ReadOnlyMemory<char>>(foldCol);
        ReadOnlyMemory<char> fold = default;
        var isWeightedGetter = cursor.GetGetter<bool>(isWeightedCol);
        bool isWeighted = default;
        double avg = 0;
        double weightedAvg = 0;
        // Rows alternate unweighted (w == 0) then weighted (w == 1) for Average and Standard Deviation.
        for (int w = 0; w < 2; w++)
        {
            // Get the average.
            b = cursor.MoveNext();
            Assert.True(b);
            if (w == 1)
            {
                getter(ref weightedAvg);
            }
            else
            {
                getter(ref avg);
            }
            foldGetter(ref fold);
            Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", fold));
            isWeightedGetter(ref isWeighted);
            Assert.True(isWeighted == (w == 1));
            // Get the standard deviation.
            b = cursor.MoveNext();
            Assert.True(b);
            double stdev = 0;
            getter(ref stdev);
            foldGetter(ref fold);
            Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", fold));
            // Baseline stdev values: weighted vs. unweighted.
            if (w == 1)
            {
                Assert.Equal(1.585, stdev, 3);
            }
            else
            {
                Assert.Equal(1.39, stdev, 2);
            }
            isWeightedGetter(ref isWeighted);
            Assert.True(isWeighted == (w == 1));
        }
        // Per-fold rows also alternate unweighted/weighted; each sum must be twice its average.
        double sum = 0;
        double weightedSum = 0;
        for (int f = 0; f < 2; f++)
        {
            for (int w = 0; w < 2; w++)
            {
                b = cursor.MoveNext();
                Assert.True(b);
                double val = 0;
                getter(ref val);
                foldGetter(ref fold);
                if (w == 1)
                {
                    weightedSum += val;
                }
                else
                {
                    sum += val;
                }
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + f, fold));
                isWeightedGetter(ref isWeighted);
                Assert.True(isWeighted == (w == 1));
            }
        }
        Assert.Equal(weightedAvg, weightedSum / 2);
        Assert.Equal(avg, sum / 2);
        // No rows beyond the eight metric rows.
        b = cursor.MoveNext();
        Assert.False(b);
    }
}
/// <summary>
/// Populates <paramref name="stopWordsMap"/> from one of three mutually exclusive sources, in
/// priority order: (1) the comma-separated <c>Stopwords</c> string, (2) the <c>Stopword</c>
/// array, or (3) a text column of a data file loaded via <c>DataFile</c>/<c>Loader</c>/
/// <c>StopwordsColumn</c>. Every stop-word is trimmed and lower-cased before being added to
/// the pool; empty entries are skipped with a single warning per source. If an explicit list
/// is given together with data-file arguments, the data-file arguments are ignored (with a
/// warning). Throws a user-argument error when the chosen source yields no stop-words or the
/// named column is missing or not scalar text.
/// </summary>
private void LoadStopWords(IHostEnvironment env, IChannel ch, ArgumentsBase loaderArgs, out NormStr.Pool stopWordsMap)
{
    Contracts.AssertValue(env);
    env.AssertValue(ch);
    ch.AssertValue(loaderArgs);
    // An explicit list takes precedence; warn that data-file arguments will not be used.
    if ((!string.IsNullOrEmpty(loaderArgs.Stopwords) || Utils.Size(loaderArgs.Stopword) > 0) && (!string.IsNullOrWhiteSpace(loaderArgs.DataFile) || loaderArgs.Loader != null || !string.IsNullOrWhiteSpace(loaderArgs.StopwordsColumn)))
    {
        ch.Warning("Explicit stopwords list specified. Data file arguments will be ignored");
    }
    var src = default(ReadOnlyMemory<char>);
    stopWordsMap = new NormStr.Pool();
    // Shared scratch buffer: each stop-word is lower-cased into it before insertion into the pool.
    var buffer = new StringBuilder();
    var stopwords = loaderArgs.Stopwords.AsMemory();
    stopwords = ReadOnlyMemoryUtils.TrimSpaces(stopwords);
    if (!stopwords.IsEmpty)
    {
        // Source 1: comma-separated 'stopwords' string.
        bool warnEmpty = true;
        for (bool more = true; more;)
        {
            ReadOnlyMemory<char> stopword;
            more = ReadOnlyMemoryUtils.SplitOne(stopwords, ',', out stopword, out stopwords);
            stopword = ReadOnlyMemoryUtils.TrimSpaces(stopword);
            if (!stopword.IsEmpty)
            {
                buffer.Clear();
                ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword.Span, buffer);
                stopWordsMap.Add(buffer);
            }
            else if (warnEmpty)
            {
                // Warn only once about empty entries.
                ch.Warning("Empty strings ignored in 'stopwords' specification");
                warnEmpty = false;
            }
        }
        ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.Stopwords), "stopwords is empty");
    }
    else if (Utils.Size(loaderArgs.Stopword) > 0)
    {
        // Source 2: 'stopword' array, one word per element.
        bool warnEmpty = true;
        foreach (string word in loaderArgs.Stopword)
        {
            var stopword = word.AsSpan();
            stopword = stopword.Trim(' ');
            if (!stopword.IsEmpty)
            {
                buffer.Clear();
                ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword, buffer);
                stopWordsMap.Add(buffer);
            }
            else if (warnEmpty)
            {
                // Warn only once about empty entries.
                ch.Warning("Empty strings ignored in 'stopword' specification");
                warnEmpty = false;
            }
        }
    }
    else
    {
        // Source 3: a scalar text column of a data file.
        string srcCol = loaderArgs.StopwordsColumn;
        var loader = LoadStopwords(env, ch, loaderArgs.DataFile, loaderArgs.Loader, ref srcCol);
        int colSrc;
        if (!loader.Schema.TryGetColumnIndex(srcCol, out colSrc))
        {
            throw ch.ExceptUserArg(nameof(Arguments.StopwordsColumn), "Unknown column '{0}'", srcCol);
        }
        var typeSrc = loader.Schema[colSrc].Type;
        ch.CheckUserArg(typeSrc.IsText, nameof(Arguments.StopwordsColumn), "Must be a scalar text column");
        // Accumulate the stopwords.
        using (var cursor = loader.GetRowCursor(col => col == colSrc))
        {
            bool warnEmpty = true;
            var getter = cursor.GetGetter<ReadOnlyMemory<char>>(colSrc);
            while (cursor.MoveNext())
            {
                getter(ref src);
                if (!src.IsEmpty)
                {
                    buffer.Clear();
                    ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(src.Span, buffer);
                    stopWordsMap.Add(buffer);
                }
                else if (warnEmpty)
                {
                    // Warn only once about empty rows.
                    ch.Warning("Empty rows ignored in data file");
                    warnEmpty = false;
                }
            }
        }
        ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.DataFile), "dataFile is empty");
    }
}