RunFuzzyMatchPLINQ(
    string[] wordsLookup,
    IEnumerable<string> files)
{
    // LINQ query over tasks: read every file, split each text into words
    // (dropping the ignored ones), merge them into a single set, then look up
    // the best matches for every word in `wordsLookup`.
    var matchQuery =
        from texts in files.Traverse(path => File.ReadAllTextAsync(path))
        from wordGroups in texts.Traverse(content =>
            WordRegex.Value.Split(content).Where(word => !IgnoreWords.Contains(word)))
        let distinctWords = wordGroups.Flatten().AsSet()
        from matchGroups in wordsLookup.Traverse(lookup =>
            JaroWinklerModule.bestMatchTask(distinctWords, lookup, threshold))
        select matchGroups.Flatten();

    var matchSet = await matchQuery;

    // NOTES
    // Alternative version that leverages the "ReadFileLinesAndFlatten" method:
    //var matchSet = await (
    //    from contentFile in files.Traverse(f => ReadFileLinesAndFlatten(f))
    //    let wordSet = contentFile.Flatten().AsSet()
    //    from bestMatch in wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(wordSet, wl, threshold))
    //    select bestMatch.Flatten());

    return PrintSummary(matchSet.AsSet());
}
RunFuzzyMatchTaskContinuation(
    string[] wordsLookup,
    IEnumerable<string> files)
{
    // First step: make the I/O asynchronous.
    // Continuation-passing style avoids blocking any thread.
    //
    // What about error handling? And cancellation (if any)?
    var matchSet = new HashSet<WordDistanceStruct>();

    foreach (var file in files)
    {
        // Stage 1: read the file, then split it into words that are not ignored.
        var wordsTask = File.ReadAllTextAsync(file)
            .ContinueWith(readText =>
                WordRegex.Value.Split(readText.Result)
                    .Where(w => !IgnoreWords.Contains(w)));

        // Stage 2: start one match task per lookup word and wait for all of them.
        // The continuation returns a task, so Unwrap() flattens the nesting.
        var matchesTask = wordsTask
            .ContinueWith(words =>
                Task.WhenAll(
                    wordsLookup
                        .Select(wl => JaroWinklerModule.bestMatchTask(words.Result, wl, threshold))
                        .ToList()))
            .Unwrap();

        IEnumerable<WordDistanceStruct[]> bestMatches = await matchesTask;
        matchSet.AddRange(bestMatches.Flatten());
    }

    return PrintSummary(matchSet);
}
RunFuzzyMatchTaskProcessAsCompleteAbstracted(
    string[] wordsLookup,
    IEnumerable<string> files)
{
    // TODO (4) : Implement a reusable function called "ContinueAsComplete" to abstract the
    //            implementation of the previous method "RunFuzzyMatchTaskProcessAsCompleteBasic".
    //            The function "ContinueAsComplete" should satisfy the following signature:
    // Signature :
    //      IEnumerable<Task<R>> ContinueAsComplete<T, R>(this IEnumerable<T> input, Func<T, Task<R>> selector)
    //
    // C# : go to "Module 2\TaskAsComplete.cs" and add the missing code (4)
    // F# : go to the FSharp project "Module 2\TaskAsComplete.fs" and add the missing code (4)
    var matchSet = new HashSet<WordDistanceStruct>();

    var pendingReads = files.ContinueAsComplete(file => File.ReadAllTextAsync(file));
    foreach (var pendingRead in pendingReads)
    {
        var content = await pendingRead;

        // Distinct, non-ignored words of this file.
        var tokens = WordRegex.Value.Split(content);
        var wordSet = tokens.Where(token => !IgnoreWords.Contains(token)).AsSet();

        var pendingMatches = wordsLookup.ContinueAsComplete(
            wl => JaroWinklerModule.bestMatchTask(wordSet, wl, threshold));
        foreach (var pendingMatch in pendingMatches)
        {
            var found = await pendingMatch;
            matchSet.AddRange(found);
        }
    }

    return PrintSummary(matchSet);
}
RunFuzzyMatchSequential(
    string[] wordsLookup,
    IEnumerable<string> files)
{
    // Sequential workflow -> how can we parallelize this work?
    // The collection 'matchSet' cannot be shared among threads.
    var matchSet = new HashSet<WordDistanceStruct>();

    foreach (var file in files)
    {
        // Read the whole file synchronously and keep its distinct, non-ignored words.
        var words = File.ReadAllText(file)
            .Split(punctuation.Value)
            .Where(candidate => !IgnoreWords.Contains(candidate))
            .AsSet();

        foreach (var wl in wordsLookup)
        {
            matchSet.AddRange(JaroWinklerModule.bestMatch(words, wl, threshold));
        }
    }

    return PrintSummary(matchSet);
}
public static void RunFuzzyMatchPipeline(
    string[] wordsLookup,
    IList<string> files)
{
    // Stage 1: read the lines of a file.
    var pipeline = Pipeline<string, string[]>
        .Create(path => File.ReadAllLinesAsync(path));

    // Stages 2-4: split lines into distinct non-ignored words, match every
    // lookup word against them, then print the summary.
    pipeline
        .Then(fileLines =>
            fileLines
                .SelectMany(line => line.Split(punctuation.Value)
                    .Where(word => !IgnoreWords.Contains(word)))
                .AsSet())
        .Then(distinctWords =>
            wordsLookup.Traverse(lookup =>
                JaroWinklerModule.bestMatchTask(distinctWords, lookup, threshold)))
        .Then(matchGroups =>
            PrintSummary(matchGroups.Flatten().AsSet()));

    foreach (var file in files)
    {
        Console.WriteLine($"analyzing file {file}");
        pipeline.Enqueue(file);
    }
    // End C# Pipeline
}
public static void RunFuzzyMatchPipelineFSharp(
    string[] wordsLookup,
    IList<string> files)
{
    // TODO (3) : In the previous example you implemented the monadic operator SelectMany
    //            (usually called Bind). This operator lets the compiler understand the monadic
    //            (LINQ) pattern, which allows expressive/declarative code in LINQ style.
    //            Now implement a parallel Pipeline that keeps the continuation semantics,
    //            with the advantage of running the transformations in parallel.
    //
    //            Implement the "Then" operator (instance method) that can be used to create and
    //            fluently compose a pipeline, for example:
    //                  pipeline.Then( .... ).Then(...)
    //
    //            Also implement the logic in the "Enqueue" method.
    //
    // F# : go to the FSharp project "Module 2\Pipeline.fs" and add the missing code (3.a and 3.b)
    //
    // To handle a Multi-Producer/Multi-Consumer scenario correctly, take a look at these options:
    //      BlockingCollection<TInput>.TryAddToAny
    //      BlockingCollection<TInput>.TryTakeFromAny
    //
    // When you are done, uncomment the code and run it.

    // TODO (3) Start F# Pipeline
    var readStage = Pipeline.Pipeline<string, string[]>
        .Create(path => File.ReadAllLinesAsync(path));

    var pipelineFSharp = readStage
        .Then(fileLines =>
            fileLines
                .SelectMany(line => line.Split(punctuation.Value)
                    .Where(word => !IgnoreWords.Contains(word)))
                .AsSet())
        .Then(distinctWords =>
            wordsLookup.Traverse(lookup =>
                JaroWinklerModule.bestMatchTask(distinctWords, lookup, threshold)))
        .Then(matchGroups => matchGroups.Flatten().AsSet());

    pipelineFSharp.Execute(4, CancellationToken.None);

    // Unit instance created through reflection (non-public constructor allowed);
    // it is the value returned by the Enqueue callback below.
    var unit = (Unit)Activator.CreateInstance(typeof(Unit), true);

    foreach (var file in files)
    {
        pipelineFSharp.Enqueue(file, outcome =>
        {
            Console.WriteLine($"analyzing file {file}");
            PrintSummary(outcome.Item2);
            return unit;
        });
    }
    // End F# Pipeline
}
RunFuzzyMatchBetterTaskContinuation(
    string[] wordsLookup,
    IEnumerable<string> files)
{
    // Ideally, we should handle potential errors or cancellations.
    // This is a lot of code, which goes against the DRY principle.

    // Walks down nested AggregateExceptions to the innermost available exception.
    Exception Innermost(Exception error)
    {
        while (error is AggregateException && error.InnerException != null)
        {
            error = error.InnerException;
        }
        return error;
    }

    var matchSet = new HashSet<WordDistanceStruct>();

    foreach (var file in files)
    {
        var bestMatches = await File.ReadAllTextAsync(file)
            .ContinueWith(readText =>
            {
                var status = readText.Status;
                if (status == TaskStatus.Faulted)
                {
                    var cause = Innermost(readText.Exception);
                    // do something with cause
                    return null;
                }
                if (status == TaskStatus.Canceled)
                {
                    // do something because the task was cancelled
                    return null;
                }
                return WordRegex.Value.Split(readText.Result)
                    .Where(w => !IgnoreWords.Contains(w));
            })
            .ContinueWith(words =>
            {
                var status = words.Status;
                if (status == TaskStatus.Faulted)
                {
                    var cause = Innermost(words.Exception);
                    // do something with cause
                    return null;
                }
                if (status == TaskStatus.Canceled)
                {
                    // do something because the task was cancelled
                    return null;
                }
                // NOTE(review): when the previous stage returned null (faulted or
                // cancelled read), words.Result is null here — confirm that
                // bestMatchTask tolerates that.
                return wordsLookup.Traverse(wl =>
                    JaroWinklerModule.bestMatchTask(words.Result, wl, threshold));
            })
            .Unwrap();

        matchSet.AddRange(bestMatches.Flatten());
    }

    return PrintSummary(matchSet);
}
RunFuzzyMatchTaskProcessAsCompleteBasic(
    string[] wordsLookup,
    IEnumerable<string> files)
{
    // An alternative pattern to parallelize the FuzzyMatch is "process as complete".
    // The idea is to start all the operations (tasks) at the same time, and then
    // process each one as it completes instead of waiting for all of them to finish.
    // In other words, the results are observed in the order in which the original
    // tasks complete, not in the order in which they were started.
    //
    // Implementation note: each completed task is awaited directly. Passing an async
    // lambda to ContinueWith produces a Task<Task>; awaiting only the outer task would
    // let the loop (and the final PrintSummary) run before the matching had finished,
    // would mutate 'matchSet' from several continuations at once, and would drop
    // exceptions. Awaiting directly keeps all mutation on this method's own flow and
    // propagates read/match failures to the caller.
    var matchSet = new HashSet<WordDistanceStruct>();

    // Start reading every file right away.
    var readFileTasks = (from file in files
                         select File.ReadAllTextAsync(file)).ToList();

    while (readFileTasks.Count > 0)
    {
        var finishedReadTask = await Task.WhenAny(readFileTasks);
        readFileTasks.Remove(finishedReadTask);

        // Materialize once so the regex split is not re-run for every lookup word.
        var words = WordRegex.Value
            .Split(await finishedReadTask)
            .Where(w => !IgnoreWords.Contains(w))
            .ToList();

        // Start one match task per lookup word, then consume them as they complete.
        var matchTasks = (from wl in wordsLookup
                          select JaroWinklerModule.bestMatchTask(words, wl, threshold)).ToList();

        while (matchTasks.Count > 0)
        {
            var finishedMatchTask = await Task.WhenAny(matchTasks);
            matchTasks.Remove(finishedMatchTask);
            matchSet.AddRange(await finishedMatchTask);
        }
    }

    return PrintSummary(matchSet);
}
RunFuzzyMatchTaskComposition(
    string[] wordsLookup,
    IEnumerable<string> files)
{
    // A better approach is to create a custom operator that preserves the
    // continuation semantics while handling errors, exceptions and transformations.
    // Signatures :
    //      Task<TOut> Then<TIn, TOut>(this Task<TIn> task, Func<TIn, TOut> next)       : Functor
    //      Task<TOut> Then<TIn, TOut>(this Task<TIn> task, Func<TIn, Task<TOut>> next) : Bind
    //
    // TODO (1) : Implement a reusable and optimized function called "Then" that satisfies
    //            the previous signatures.
    // C# : go to "Module 1\TaskCompoistion.cs" and add the missing code in TODO (1)
    // F# : go to the FSharp project "Module 1\TaskCompoistion.fs" and add the missing code
    //
    // Optional/bonus function to implement, with signature :
    //      Task<TOut> SelectMany<TIn, TMid, TOut>(this Task<TIn> input, Func<TIn, Task<TMid>> f, Func<TIn, TMid, TOut> projection)

    // Read all the files.
    var fileContents = files.Traverse(file => File.ReadAllTextAsync(file));

    // Split every text and keep the distinct, non-ignored words.
    var distinctWords = fileContents.Then(contents =>
        contents
            .SelectMany(text => WordRegex.Value.Split(text))
            .Where(w => !IgnoreWords.Contains(w))
            .AsSet());

    // Look up the best matches for each lookup word.
    var matchGroups = distinctWords.Then(wordsSplit =>
        wordsLookup.Traverse(wl =>
            JaroWinklerModule.bestMatchTask(wordsSplit, wl, threshold)));

    return matchGroups.Then(matcheSet => PrintSummary(matcheSet.Flatten().AsSet()));

    // NOTES
    // In this scenario, and only for demo purposes, the text is read asynchronously in
    // one operation and then treated as a single string. When the text is large this
    // carries a performance penalty, especially during the Regex Split. A better approach
    // is to read the files line by line and run the Regex against chunks of strings.
    // One solution is a Task that reads, splits and flattens the input text in one
    // operation; the method "ReadFileLinesAndFlatten" implements this design.
    // Feel free to check the method and use it if you would like.
    //
    // Here is the code that replaces the previous code:
    //return
    //    files.Traverse(file => ReadFileLinesAndFlatten(file))
    //    .Then(wordsSplit =>
    //    {
    //        var words = wordsSplit.Flatten();
    //        return wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(words, wl, threshold));
    //    })
    //    .Then(matcheSet => PrintSummary(matcheSet.Flatten().AsSet()));
}
// Utility method
// Reads the lines of `file`, splits every line with WordRegex, drops the ignored
// words, and exposes the flattened word set through the returned task.
// NOTE(review): how faults/cancellation reach the returned task depends on the
// `FromTask` helper, which is not visible here — confirm.
private static Task<HashSet<string>> ReadFileLinesAndFlatten(string file)
{
    var completion = new TaskCompletionSource<HashSet<string>>();

    File.ReadAllLinesAsync(file)
        .ContinueWith(linesTask =>
            linesTask.Result.Traverse(line =>
                WordRegex.Value.Split(line)
                    .Where(word => !IgnoreWords.Contains(word))))
        .ContinueWith(splitTask =>
            completion.FromTask(splitTask, done => done.Result.Flatten().AsSet()));

    return completion.Task;
}
/// <summary>
/// Returns a spelling suggestion for <paramref name="word"/>, or the word itself
/// when it is shorter than two characters, contains a digit, is in the ignore
/// list, or the speller has no suggestion.
/// </summary>
/// <param name="word">The word to check.</param>
/// <returns>The first suggested term, or the unchanged input.</returns>
public string Process(string word)
{
    // Checks run in the same order as before; the first one that applies wins.
    bool leaveUntouched =
        word.Length < 2
        || word.Any(char.IsDigit)
        || IgnoreWords.Contains(word);

    if (leaveUntouched)
    {
        return word;
    }

    var candidates = Speller.Lookup(word, SymSpell.Verbosity.Closest, MaxDistance);
    return candidates.Count > 0 ? candidates[0].term : word;
}
RunFuzzyMatchTaskComposition(
    string[] wordsLookup,
    IEnumerable<string> files)
{
    // Pipeline of task continuations: read all files -> distinct non-ignored
    // words -> best matches per lookup word -> printed summary.
    return files
        .Traverse(path => File.ReadAllTextAsync(path))
        .Then(contents =>
        {
            var allTokens = contents.SelectMany(content => WordRegex.Value.Split(content));
            return allTokens
                .Where(token => !IgnoreWords.Contains(token))
                .AsSet();
        })
        .Then(distinctWords =>
        {
            return wordsLookup.Traverse(lookup =>
                JaroWinklerModule.bestMatchTask(distinctWords, lookup, threshold));
        })
        .Then(matchGroups =>
        {
            return PrintSummary(matchGroups.Flatten().AsSet());
        });

    // NOTES
    // In this scenario, and only for demo purposes, the text is read asynchronously in
    // one operation and then treated as a single string. When the text is large this
    // carries a performance penalty, especially during the Regex Split. A better approach
    // is to read the files line by line and run the Regex against chunks of strings.
    // One solution is a Task that reads, splits and flattens the input text in one
    // operation; the method "ReadFileLinesAndFlatten" implements this design.
    // Feel free to check the method and use it if you would like.
    //
    // Here is the code that replaces the previous code:
    //return
    //    files.Traverse(file => ReadFileLinesAndFlatten(file))
    //    .Then(wordsSplit =>
    //    {
    //        var words = wordsSplit.Flatten();
    //        return wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(words, wl, threshold));
    //    })
    //    .Then(matcheSet => PrintSummary(matcheSet.Flatten().AsSet()));
}
RunFuzzyMatchTaskProcessAsCompleteAbstracted(
    string[] wordsLookup,
    IEnumerable<string> files)
{
    // "Process as complete", expressed through the ContinueAsComplete helper:
    // each file read is awaited from the sequence the helper returns, and the
    // same is done for the match tasks of every lookup word.
    var matchSet = new HashSet<WordDistanceStruct>();

    foreach (var readTask in files.ContinueAsComplete(path => File.ReadAllTextAsync(path)))
    {
        var fileText = await readTask;

        var splitWords = WordRegex.Value.Split(fileText);
        var words = splitWords
            .Where(candidate => !IgnoreWords.Contains(candidate))
            .AsSet();

        foreach (var matchTask in wordsLookup.ContinueAsComplete(
                     lookup => JaroWinklerModule.bestMatchTask(words, lookup, threshold)))
        {
            var matches = await matchTask;
            matchSet.AddRange(matches);
        }
    }

    return PrintSummary(matchSet);
}
/// <summary>
/// Runs the fuzzy match as a TPL Dataflow network:
/// buffer -> read file -> split words -> batch (all files) -> find matches -> print.
/// </summary>
/// <param name="wordsLookup">Words to look up in the files.</param>
/// <param name="files">Paths of the files to analyze.</param>
/// <returns>A task that completes once the summary block has finished.</returns>
public static async Task RunFuzzyMatchDataFlow(string[] wordsLookup, IList<string> files)
{
    int fileCount = files.Count;
    if (fileCount == 0)
    {
        // BatchBlock requires a batch size of at least 1; with no input there
        // is nothing to analyze.
        return;
    }

    var cts = new CancellationTokenSource();
    var opt = new ExecutionDataflowBlockOptions
    {
        BoundedCapacity = 10, // TODO, change this value and check what is happening
        MaxDegreeOfParallelism = 1,
        CancellationToken = cts.Token
    };

    var inputBlock = new BufferBlock<string>(opt);

    var readLinesBlock = new TransformBlock<string, string>(
        async file => await File.ReadAllTextAsync(file, cts.Token), opt);

    var splitWordsBlock = new TransformBlock<string, HashSet<string>>(
        text => WordRegex.Value.Split(text).Where(w => !IgnoreWords.Contains(w)).AsSet(), opt);

    // Collect the word sets of all files into one batch.
    var batch = new BatchBlock<HashSet<string>>(fileCount);

    var foundMatchesBlock = new TransformBlock<HashSet<string>[], WordDistanceStruct[]>(
        async wordSet =>
        {
            var wordSetFlatten = wordSet.Flatten().AsSet();
            var matches = await wordsLookup.Traverse(wl =>
                JaroWinklerModule.bestMatchTask(wordSetFlatten, wl, threshold));
            return matches.Flatten().ToArray();
        }, opt);

    // TODO (5) — implemented: a terminal block that prints the output of
    // "foundMatchesBlock". A TransformBlock only completes after its output has
    // been consumed, so without a linked consumer the await at the end of this
    // method would never finish.
    var printBlock = new ActionBlock<WordDistanceStruct[]>(
        matches => { PrintSummary(matches.AsSet()); }, opt);

    var linkOptions = new DataflowLinkOptions { PropagateCompletion = true };

    IDisposable disposeAll = new CompositeDisposable(
        inputBlock.LinkTo(readLinesBlock, linkOptions),
        readLinesBlock.LinkTo(splitWordsBlock, linkOptions),
        splitWordsBlock.LinkTo(batch, linkOptions),
        batch.LinkTo(foundMatchesBlock, linkOptions),
        foundMatchesBlock.LinkTo(printBlock, linkOptions)
    );

    cts.Token.Register(disposeAll.Dispose);

    // TODO (6)
    // After completing TODO (5), remove or unlink the printBlock and replace the output
    // of the "foundMatchesBlock" block with Reactive Extensions "AsObservable",
    // keeping the call to the "PrintSummary" method.

    foreach (var file in files)
    {
        await inputBlock.SendAsync(file, cts.Token);
    }

    inputBlock.Complete();

    // Wait for the last block of the network, then tear the links down.
    // ContinueWith keeps the original best-effort behaviour: a faulted or
    // cancelled network does not throw here.
    await printBlock.Completion.ContinueWith(_ => disposeAll.Dispose());
}
// C# example
// Dataflow network whose final stage is a stateful agent: each batch of matches
// is folded into a word -> matches dictionary, and every new state is printed.
public static async Task RunFuzzyMatchAgentCSharp(string[] wordsLookup, IList<string> files)
{
    var cts = new CancellationTokenSource();
    var opt = new ExecutionDataflowBlockOptions
    {
        BoundedCapacity = 10,
        MaxDegreeOfParallelism = 4,
        CancellationToken = cts.Token
    };

    var inputBlock = new BufferBlock<string>(opt);

    var readLinesBlock = new TransformBlock<string, string>(
        async path => await File.ReadAllTextAsync(path, cts.Token), opt);

    var splitWordsBlock = new TransformBlock<string, string[]>(
        content => WordRegex.Value.Split(content)
            .Where(word => !IgnoreWords.Contains(word))
            .AsSet()
            .ToArray(),
        opt);

    var foundMatchesBlock = new TransformBlock<string[], WordDistanceStruct[]>(
        async words =>
        {
            var matchGroups = await wordsLookup.Traverse(lookup =>
                JaroWinklerModule.bestMatchTask(words, lookup, threshold));
            return matchGroups.Flatten().ToArray();
        },
        opt);

    var linkOptions = new DataflowLinkOptions { PropagateCompletion = true };

    // TODO (7) (for C#)
    // Implement a stateful agent using TPL Dataflow.
    // The agent should have an internal state protected from external access.
    // The function passed to the constructor applies a projection/reduction to the
    // incoming message and the current state, and returns the new state.
    // (see AgentAggregator.cs)
    var agent = Agent.Start(new Dictionary<string, HashSet<string>>(),
        (Dictionary<string, HashSet<string>> state, WordDistanceStruct[] matches) =>
        {
            // Matches of this message, grouped by looked-up word.
            var incoming = matches
                .GroupBy(m => m.Word, m => m.Match)
                .ToDictionary(group => group.Key, group => group.AsSet());

            var newState = Clone(state);
            foreach (var entry in incoming)
            {
                if (!newState.TryGetValue(entry.Key, out HashSet<string> known))
                {
                    newState.Add(entry.Key, entry.Value);
                    continue;
                }

                known.AddRange(entry.Value);
                newState[entry.Key] = known;
            }
            return newState;
        });

    IDisposable disposeAll = new CompositeDisposable(
        inputBlock.LinkTo(readLinesBlock, linkOptions),
        readLinesBlock.LinkTo(splitWordsBlock, linkOptions),
        splitWordsBlock.LinkTo(foundMatchesBlock, linkOptions),
        foundMatchesBlock.LinkTo(agent),
        agent.AsObservable()
            .Subscribe(summaryMathces => PrintSummary(summaryMathces))
    );

    cts.Token.Register(disposeAll.Dispose);

    foreach (var file in files)
    {
        await inputBlock.SendAsync(file, cts.Token);
    }

    // inputBlock.Complete();
    // await foundMatchesBlock.Completion.ContinueWith(_ =>
    //     disposeAll.Dispose());
}
RunFuzzyMatchPLINQ(
    string[] wordsLookup,
    IEnumerable<string> files)
{
    // TODO (2) : After completing TODO (1), it should be effortless to implement
    //            a LINQ pattern using Task.
    //            Rename the "Then" function you implemented to SelectMany, such that
    //            the three following signatures are satisfied :
    //
    // Task<TOut> SelectMany<TIn, TMid, TOut>(this Task<TIn> input, Func<TIn, Task<TMid>> f, Func<TIn, TMid, TOut> projection)
    // Task<TOut> SelectMany<TIn, TOut>(this Task<TIn> first, Func<TIn, Task<TOut>> next)
    // Task<TOut> Select<TIn, TOut>(this Task<TIn> task, Func<TIn, TOut> projection)
    //
    // Then uncomment the following code, add the missing code and run it.
    var matchSet = await (
        from fileTexts in files.Traverse(path => File.ReadAllTextAsync(path))
        from splitTexts in fileTexts.Traverse(fileText =>
            WordRegex.Value.Split(fileText).Where(token => !IgnoreWords.Contains(token)))
        let uniqueWords = splitTexts.Flatten().AsSet()
        // TODO (2)
        from lookupMatches in wordsLookup.Traverse(lookupWord =>
            JaroWinklerModule.bestMatchTask(uniqueWords, lookupWord, threshold))
        select lookupMatches.Flatten());

    // NOTES
    // Alternative version that leverages the "ReadFileLinesAndFlatten" method:
    //var matchSet = await (
    //    from contentFile in files.Traverse(f => ReadFileLinesAndFlatten(f))
    //    let wordSet = contentFile.Flatten().AsSet()
    //    from bestMatch in wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(wordSet, wl, threshold))
    //    select bestMatch.Flatten());

    var distinctMatches = matchSet.AsSet();
    return PrintSummary(distinctMatches);
}
// Example F#
// Same dataflow network as the C# agent example, but the final stage is the
// F# ReactiveAgent.AgentObservable that folds matches into a dictionary state.
public static async Task RunFuzzyMatchAgentFSharp(string[] wordsLookup, IList<string> files)
{
    var cts = new CancellationTokenSource();
    var opt = new ExecutionDataflowBlockOptions
    {
        BoundedCapacity = 10,
        MaxDegreeOfParallelism = 4,
        CancellationToken = cts.Token
    };

    var inputBlock = new BufferBlock<string>(opt);

    var readLinesBlock = new TransformBlock<string, string>(
        path => File.ReadAllTextAsync(path, cts.Token), opt);

    var splitWordsBlock = new TransformBlock<string, string[]>(
        content =>
        {
            var kept = WordRegex.Value.Split(content).Where(token => !IgnoreWords.Contains(token));
            return kept.AsSet().ToArray();
        },
        opt);

    var foundMatchesBlock = new TransformBlock<string[], WordDistanceStruct[]>(
        async candidateWords =>
        {
            var perLookup = await wordsLookup.Traverse(lookupWord =>
                JaroWinklerModule.bestMatchTask(candidateWords, lookupWord, threshold));
            return perLookup.Flatten().ToArray();
        },
        opt);

    var linkOptions = new DataflowLinkOptions { PropagateCompletion = true };

    // TODO (7) (for F#)
    // Implement a Reactive MailboxProcessor in F#.
    // Go to the FSharp project, Module 3, and follow the instructions (7.a);
    // then uncomment the following code and remove the previous code that uses
    // the Agent based on TPL Dataflow.
    var agent =
        new ReactiveAgent.AgentObservable<WordDistanceStruct[], Dictionary<string, HashSet<string>>>(
            new Dictionary<string, HashSet<string>>(),
            (state, matches) =>
            {
                var matchesByWord = matches
                    .GroupBy(found => found.Word)
                    .ToDictionary(
                        grouped => grouped.Key,
                        grouped => grouped.Select(found => found.Match).AsSet());

                // Clone is important to be free of race conditions
                // (or use an immutable collection).
                var newState = Clone(state);

                foreach (var pair in matchesByWord)
                {
                    HashSet<string> current;
                    bool alreadyKnown = newState.TryGetValue(pair.Key, out current);
                    if (alreadyKnown)
                    {
                        current.AddRange(pair.Value);
                        newState[pair.Key] = current;
                    }
                    else
                    {
                        newState.Add(pair.Key, pair.Value);
                    }
                }

                return newState;
            });

    IDisposable disposeAll = new CompositeDisposable(
        inputBlock.LinkTo(readLinesBlock, linkOptions),
        readLinesBlock.LinkTo(splitWordsBlock, linkOptions),
        splitWordsBlock.LinkTo(foundMatchesBlock, linkOptions),
        foundMatchesBlock.LinkTo(agent),
        agent.AsObservable().Subscribe(
            summaryMathces => PrintSummary(summaryMathces))
    );

    cts.Token.Register(disposeAll.Dispose);

    foreach (var file in files)
    {
        await inputBlock.SendAsync(file, cts.Token);
    }

    // inputBlock.Complete();
    // await foundMatchesBlock.Completion.ContinueWith(_ => disposeAll.Dispose());
}