static void Main(string[] args)
{
    Console.Title = "Machine Intelligence (Text Analytics) with TPL Data Flows";

    // CONFIG
    // Instantiate a new ML.NET context (ctor argument is the seed, for reproducible runs).
    // Note: MLContext is thread-safe, so one instance can be shared across data-flow blocks.
    var mlContext = new MLContext(100);

    // GET the current environment folder and ensure an output
    // directory for enriched documents exists under it.
    var currentEnrichmentFolder = System.IO.Path.Combine(Environment.CurrentDirectory, "EnrichedDocuments");
    System.IO.Directory.CreateDirectory(currentEnrichmentFolder);

    // SET the stop-word language to English.
    StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English;

    // SET the max degree of parallelism.
    // Note: default is to use 75% of the workstation or server cores.
    // Note: if cores are hyper-threaded, adjust accordingly (i.e. multiply * 2).
    var isHyperThreaded = false;
    var executionDataFlowOptions = new ExecutionDataflowBlockOptions();
    executionDataFlowOptions.MaxDegreeOfParallelism =
        // Use 75% of the cores; if hyper-threading, multiply cores * 2.
        Convert.ToInt32(Math.Ceiling((Environment.ProcessorCount * 0.75) * (isHyperThreaded ? 2: 1)));

    // SET the data-flow block options.
    // This bounds buffering (BoundedCapacity) at the producer level,
    // so upstream blocks are throttled instead of queuing without limit.
    var dataFlowBlockOptions = new DataflowBlockOptions
    {
        BoundedCapacity = 5, MaxMessagesPerTask = 5
    };

    // SET the data-flow pipeline (link) options.
    // Note: set MaxMessages to the number of books to process;
    //       e.g. MaxMessages = 2 runs only two books through the pipeline.
    var dataFlowLinkOptions = new DataflowLinkOptions
    {
        PropagateCompletion = true,
        //MaxMessages = 1
    };
    // NOTE(review): this chunk ends here — the remainder of Main (and its
    // closing brace) lies outside the visible portion of the file.
/// <summary>
/// Azure Cognitive Search custom skill: tokenizes incoming text with ML.NET
/// (lower-cases, strips punctuation/numbers, splits on spaces) and removes
/// language-specific default stop words, writing the result to "words".
/// </summary>
/// <param name="req">HTTP request carrying the skill's record payload.</param>
/// <param name="log">Function logger.</param>
/// <param name="executionContext">Azure Functions execution context; supplies the skill name.</param>
/// <returns>
/// 400 when the request record array is invalid; otherwise 200 with a
/// per-record response whose Data["words"] holds the filtered tokens.
/// </returns>
public static IActionResult RunTokenizer(
    [HttpTrigger(AuthorizationLevel.Function, "post", Route = null)] HttpRequest req,
    ILogger log,
    ExecutionContext executionContext)
{
    log.LogInformation("Tokenizer Custom Skill: C# HTTP trigger function processed a request.");

    string skillName = executionContext.FunctionName;
    IEnumerable<WebApiRequestRecord> requestRecords = WebApiSkillHelpers.GetRequestRecords(req);
    if (requestRecords == null)
    {
        return new BadRequestObjectResult($"{skillName} - Invalid request record array.");
    }

    // FIX: MLContext is thread-safe and the empty schema view is identical for
    // every record, so create them once here instead of once per record.
    var mlContext = new MLContext();
    IDataView emptyDataView = mlContext.Data.LoadFromEnumerable(new List<TextData>());

    WebApiSkillResponse response = WebApiSkillHelpers.ProcessRequestRecords(skillName, requestRecords,
        (inRecord, outRecord) =>
        {
            // FIX: read "text" with TryGetValue (mirroring "languageCode" below) so a
            // record without a "text" key no longer throws KeyNotFoundException;
            // fall back to an empty string rather than feeding null into the model.
            var text = new TextData
            {
                Text = (inRecord.Data.TryGetValue("text", out object rawText) ? rawText as string : null)
                       ?? string.Empty
            };

            StopWordsRemovingEstimator.Language language = MapToMlNetLanguage(
                inRecord.Data.TryGetValue("languageCode", out object languageCode)
                    ? languageCode as string
                    : "en");

            // The stop-word list depends on each record's language, so the pipeline
            // itself is still fitted per record (languages may vary across records).
            EstimatorChain<StopWordsRemovingTransformer> textPipeline = mlContext.Transforms.Text
                .NormalizeText("Text",
                    caseMode: TextNormalizingEstimator.CaseMode.Lower,
                    keepDiacritics: true,
                    keepPunctuations: false,
                    keepNumbers: false)
                .Append(mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text", separators: new[] { ' ' }))
                .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("Words", language: language));

            TransformerChain<StopWordsRemovingTransformer> textTransformer = textPipeline.Fit(emptyDataView);
            PredictionEngine<TextData, TransformedTextData> predictionEngine =
                mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

            // Never return null for a collection — emit an empty array instead.
            outRecord.Data["words"] = predictionEngine.Predict(text).Words ?? Array.Empty<string>();
            return outRecord;
        });

    return new OkObjectResult(response);
}
/// <summary>
/// Removes stop words from the incoming tokenized text.
/// </summary>
/// <param name="input">The column to apply the transform to.</param>
/// <param name="language">Language of the input text; defaults to English.</param>
/// <returns>The input token column with stop words removed.</returns>
public static VarVector<string> RemoveStopwords(this VarVector<string> input,
    StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English)
{
    return new OutPipelineColumn(input, language);
}
/// <summary>
/// Captures the stop-word language so it can be applied when the
/// pipeline column is later reconciled into an estimator.
/// </summary>
/// <param name="language">Language selecting the default stop-word list.</param>
public Reconciler(StopWordsRemovingEstimator.Language language) => _language = language;
/// <summary>
/// Builds an output pipeline column that applies stop-word removal
/// (via a language-specific <c>Reconciler</c>) to the given input column.
/// </summary>
/// <param name="input">Source token column to filter.</param>
/// <param name="language">Language selecting the stop-word list.</param>
public OutPipelineColumn(VarVector<string> input, StopWordsRemovingEstimator.Language language)
    : base(new Reconciler(language), input)
    => Input = input;