Exemplo n.º 1
        /// <summary>
        /// Composes a staged pipeline over file paths: stage one reads all lines of a file
        /// asynchronously, stage two splits each line on <c>punctuation.Value</c> and drops
        /// words contained in <c>IgnoreWords</c>, and stage three calls
        /// <c>JaroWinklerModule.bestMatchTask</c> once per entry of
        /// <paramref name="wordsLookup"/> against the resulting word set.
        /// </summary>
        /// <param name="wordsLookup">Words to be matched against the content of each file.</param>
        /// <param name="files">Paths of the files to analyze.</param>
        // NOTE(review): this snippet has lost its braces and several lines (the final
        // ".Then(matcheSet =>" stage has no body and the foreach has no visible call that
        // feeds the pipeline). Restore them from the original source before compiling.
        public static void RunFuzzyMatchPipeline(
            string[] wordsLookup,
            IList <string> files)
            // Stage 1: asynchronous read of every line in the file.
            var pipeline = Pipeline <string, string[]> .Create(file => File.ReadAllLinesAsync(file));

            // Stage 2: tokenize each line and keep only the words that are not ignored.
            .Then(lines =>
                  lines.SelectMany(l => l.Split(punctuation.Value)
                                   .Where(w => !IgnoreWords.Contains(w))).AsSet()
            // Stage 3: one bestMatchTask call per lookup word, run over the word set.
            .Then(wordSet =>
                  wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(wordSet, wl, threshold))
            // Stage 4: consumes the matches (body not visible in this snippet).
            .Then(matcheSet =>

            // Logs each file being analyzed.
            foreach (var file in files)
                Console.WriteLine($"analyzing file {file}");
            // End C# Pipeline
            string[] wordsLookup,
            IEnumerable <string> files)
            // Let's start by converting the I/O operation to be asynchronous.
            // The continuation-passing style avoids blocking any threads.
            // What about error handling and cancellation (if any)?

            var matchSet = new HashSet <WordDistanceStruct>();

            foreach (var file in files)
                var readFileTask = File.ReadAllTextAsync(file);

                IEnumerable <WordDistanceStruct[]> bestMatches =
                    await readFileTask
                    .ContinueWith(readText =>
                           .Where(w => !IgnoreWords.Contains(w)));
                    .ContinueWith(words =>
                    var tasks = (from wl in wordsLookup
                                 select JaroWinklerModule.bestMatchTask(words.Result, wl, threshold)).ToList();


Exemplo n.º 3
            string[] wordsLookup,
            IEnumerable <string> files)
            var matchSet = new HashSet <WordDistanceStruct>();

            // TODO (4) : Implement a reusable function called "ContinueAsComplete" to abstract the implementation of the
            // previous method "RunFuzzyMatchTaskProcessAsCompleteBasic".
            // The function "ContinueAsComplete" should satisfy the following signature:
            // Signature :
            // IEnumerable<Task<R>> ContinueAsComplete<T, R>(this IEnumerable<T> input, Func<T, Task<R>> selector)
            // C# : go to "Module 2\TaskAsComplete.cs" and add the missing code (4)
            // F# : go to the FSharp project "Module 2\TaskAsComplete.fs" and add the missing code (4)

            foreach (var textTask in files.ContinueAsComplete(file => File.ReadAllTextAsync(file)))
                var text = await textTask;

                var words = WordRegex.Value
                            .Where(w => !IgnoreWords.Contains(w))

                foreach (var matchTask in wordsLookup.ContinueAsComplete(
                             wl => JaroWinklerModule.bestMatchTask(words, wl, threshold)))
                    matchSet.AddRange(await matchTask);

                string[] wordsLookup, 
                IEnumerable<string> files)
            var matchSet = await (
                from contentFile in files.Traverse(f => File.ReadAllTextAsync(f))
                from words in contentFile.Traverse(text =>
                    WordRegex.Value.Split(text).Where(w => !IgnoreWords.Contains(w)))
                let wordSet = words.Flatten().AsSet()
                from bestMatch in wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(wordSet, wl, threshold))
                select bestMatch.Flatten());
            // NOTES
            // Here is the code that leverages the "ReadFileLinesAndFlatten" method
            //var matchSet = await (
            //    from contentFile in files.Traverse(f => ReadFileLinesAndFlatten(f))
            //    let wordSet = contentFile.Flatten().AsSet()
            //    from bestMatch in wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(wordSet, wl, threshold))
            //    select bestMatch.Flatten());

            return PrintSummary(matchSet.AsSet());
Exemplo n.º 5
        /// <summary>
        /// Builds the word-matching pipeline on top of the F# <c>Pipeline.Pipeline</c> type:
        /// read all lines of a file, split them on <c>punctuation.Value</c> while dropping
        /// <c>IgnoreWords</c>, then call <c>JaroWinklerModule.bestMatchTask</c> once per
        /// entry of <paramref name="wordsLookup"/>. The pipeline is then started with
        /// <c>Execute(4, CancellationToken.None)</c>.
        /// </summary>
        /// <param name="wordsLookup">Words to be matched against the content of each file.</param>
        /// <param name="files">Paths of the files to analyze.</param>
        // NOTE(review): this snippet has lost its braces and several lines (the final
        // ".Then(matcheSet =>" stage has no body and the call that precedes "(tup =>"
        // inside the foreach is missing). Restore them from the original source before compiling.
        public static void RunFuzzyMatchPipelineFSharp(
            string[] wordsLookup,
            IList <string> files)
            // TODO (3) : In the previous example you implemented the monadic operator SelectMany (usually called Bind).
            // This operator enables the compiler to understand the monadic (LINQ) pattern, which allows you to write
            // expressive/declarative code in LINQ style.
            // Let's implement a parallel Pipeline that allows you to keep the continuation semantics,
            // with the advantage of running
            // the transformations in parallel.
            // Implement the "Then" operator (instance method) that can be used to create and fluently compose a pipeline,
            // for example:
            // pipeline.Then( .... ).Then(...)
            // Also implement the logic in the "Enqueue" method.
            // F# : go to the FSharp project "Module 2\Pipeline.fs" and add the missing code (3.a and 3.b)
            // To be able to correctly handle a Multi-Producer/Multi-Consumer scenario,
            // take a look at these options:
            //      BlockingCollection<TInput>.TryAddToAny
            //      BlockingCollection<TInput>.TryTakeFromAny
            // When you are done, uncomment the code and run it.

            // TODO (3) Start F# Pipeline
            var pipelineFSharp =
                Pipeline.Pipeline <string, string[]>
                .Create(file => File.ReadAllLinesAsync(file))
                // Tokenize each line and keep only the words that are not ignored.
                .Then(lines =>
                      lines.SelectMany(l => l.Split(punctuation.Value)
                                       .Where(w => !IgnoreWords.Contains(w))).AsSet()
                // One bestMatchTask call per lookup word, run over the word set.
                .Then(wordSet =>
                      wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(wordSet, wl, threshold))
                // Consumes the matches (body not visible in this snippet).
                .Then(matcheSet =>

            // Starts the pipeline without cancellation support.
            // NOTE(review): the first argument (4) is presumably the number of workers — confirm in Pipeline.fs.
            pipelineFSharp.Execute(4, CancellationToken.None);

            // Creates a Unit instance reflectively; passing true lets Activator use a non-public constructor.
            var unit = (Unit)Activator.CreateInstance(typeof(Unit), true);

            foreach (var file in files)
                                       (tup =>
                    Console.WriteLine($"analyzing file {file}");
            // End F# Pipeline
                string[] wordsLookup, 
                IEnumerable<string> files)
            // Ideally, we should handle potential errors or cancellations.
            // This is a lot of code, which goes against the DRY principle.

            var matchSet = new HashSet<WordDistanceStruct>();

            foreach (var file in files)
                var readFileTask = File.ReadAllTextAsync(file);
                var bestMatches = await readFileTask
                    .ContinueWith(readText =>
                        switch (readText.Status)
                            case TaskStatus.Faulted:
                                Exception ex = readText.Exception;
                                while (ex is AggregateException && ex.InnerException != null)
                                    ex = ex.InnerException;
                                // do something with ex
                                return null;
                            case TaskStatus.Canceled:
                                // do something because Task cancelled
                                return null;
                                return WordRegex.Value.Split(readText.Result)
                                    .Where(w => !IgnoreWords.Contains(w));
                    .ContinueWith(words =>
                        switch (words.Status)
                            case TaskStatus.Faulted:
                                Exception ex = words.Exception;
                                while (ex is AggregateException && ex.InnerException != null)
                                    ex = ex.InnerException;
                                // do something with ex
                                return null;
                            case TaskStatus.Canceled:
                                // do something because Task cancelled
                                return null;
                                return wordsLookup.Traverse(wl =>
                                    JaroWinklerModule.bestMatchTask(words.Result, wl, threshold));


            return PrintSummary(matchSet);
Exemplo n.º 7
            string[] wordsLookup,
            IEnumerable <string> files)
            // An alternative pattern to parallelize the FuzzyMatch is "Process as complete".
            // The idea of this pattern is to start the execution of all the operations (tasks)
            // at the same time, and then process them as they complete instead of waiting for all the operations
            // to be completed before continuing.
            // In other words, this pattern returns a sequence of tasks which will be observed to complete with the same set
            // of results as the given input tasks, but in the order in which the original tasks complete.
            // Here is a simple implementation:

            var matchSet = new HashSet <WordDistanceStruct>();

            var readFileTasks =
                (from file in files
                 select File.ReadAllTextAsync(file)

            while (readFileTasks.Count > 0)
                await Task.WhenAny(readFileTasks)
                .ContinueWith(async readTask =>
                    var finishedReadTask = readTask.Result;

                    var words = WordRegex.Value
                                .Where(w => !IgnoreWords.Contains(w));

                    var matchTasks =
                        (from wl in wordsLookup
                         select JaroWinklerModule.bestMatchTask(words, wl, threshold)

                    while (matchTasks.Count > 0)
                        await Task.WhenAny(matchTasks)
                        .ContinueWith(matchTask =>
                            var finishedMatchTask = matchTask.Result;


            string[] wordsLookup,
            IEnumerable <string> files)
            // A better approach is to create a custom operator that preserves
            // the continuation semantics, while handling cases of error, exception and transformation
            // Signatures :
            //     Task<TOut> Then<TIn, TOut>(this Task<TIn> task, Func<TIn, TOut> next)  : Functor
            //     Task<TOut> Then<TIn, TOut>(this Task<TIn> task, Func<TIn, Task<TOut>> next)   : Bind

            // Traverse the given files in parallel
            // TODO (1) : Implement a reusable and optimized function called "Then" that satisfies the previous signatures
            // C# : go to the "Module 1\TaskCompoistion.cs" and add the missing code in TODO (1)
            // F# : go to the FSharp project "Module 1\TaskCompoistion.fs" and add the missing code
            // Optional/bonus function to implement with signature :
            // Task<TOut> SelectMany<TIn, TMid, TOut>(this Task<TIn> input, Func<TIn, Task<TMid>> f, Func<TIn, TMid, TOut> projection)

                (files.Traverse(file => File.ReadAllTextAsync(file))
                 .Then(fileContent =>
                       .SelectMany(text => WordRegex.Value.Split(text))
                       .Where(w => !IgnoreWords.Contains(w))
                 .Then(wordsSplit =>
                       wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(wordsSplit, wl, threshold))
                 .Then(matcheSet => PrintSummary(matcheSet.Flatten().AsSet())));

            // NOTES
            // In this scenario, and only for demo purposes, we are reading the text asynchronously
            // in one operation, and then we are treating the text as a unique string.
            // In the case that the text is a large string, there are some performance penalties especially
            // during the Regex Split. A better approach is to read text files in line and run the Regex against
            // chunks of strings.
            // One solution is to create a Task that reads, splits and flattens the input text
            // in one operation. The method "ReadFileLinesAndFlatten" ( in the "TaskEx" static class )
            // implements this design.
            // Feel free to check the method and use it if you would like.
            // Here is the code that replaces the previous code.

            //    files.Traverse(file => ReadFileLinesAndFlatten(file))
            //        .Then(wordsSplit =>
            //        {
            //            var words = wordsSplit.Flatten();
            //            return wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(words, wl, threshold));
            //        })
            //        .Then(matcheSet => PrintSummary(matcheSet.Flatten().AsSet()));
                string[] wordsLookup, 
                IEnumerable<string> files)

                files.Traverse(file => File.ReadAllTextAsync(file))
                    .Then(fileContent =>
                             .SelectMany(text => WordRegex.Value.Split(text))
                             .Where(w => !IgnoreWords.Contains(w))
                    .Then(wordsSplit =>
                        wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(wordsSplit, wl, threshold))
                    .Then(matcheSet => PrintSummary(matcheSet.Flatten().AsSet()));

            // NOTES
            // In this scenario, and only for demo purposes, we are reading the text asynchronously 
            // in one operation, and then we are treating the text as a unique string. 
            // In the case that the text is a large string, there are some performance penalties especially  
            // during the Regex Split. A better approach is to read text files in line and run the Regex against
            // chunks of strings.
            // One solution is to create a Task that reads, splits and flattens the input text 
            // in one operation. The method "ReadFileLinesAndFlatten" ( in the "TaskEx" static class )
            // implements this design. 
            // Feel free to check the method and use it if you would like.
            // Here is the code that replaces the previous code.
            //    files.Traverse(file => ReadFileLinesAndFlatten(file))
            //        .Then(wordsSplit =>
            //        {
            //            var words = wordsSplit.Flatten();
            //            return wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(words, wl, threshold));
            //        })
            //        .Then(matcheSet => PrintSummary(matcheSet.Flatten().AsSet()));
Exemplo n.º 10
            string[] wordsLookup,
            IEnumerable <string> files)
            var matchSet = new HashSet <WordDistanceStruct>();

            foreach (var textTask in files.ContinueAsComplete(file => File.ReadAllTextAsync(file)))
                var text = await textTask;

                var words = WordRegex.Value
                            .Where(w => !IgnoreWords.Contains(w))

                foreach (var matchTask in wordsLookup.ContinueAsComplete(
                             wl => JaroWinklerModule.bestMatchTask(words, wl, threshold)))
                    matchSet.AddRange(await matchTask);

Exemplo n.º 11
            string[] wordsLookup,
            IEnumerable <string> files)
            // TODO (2) : After having completed TODO (1), we should be able to effortlessly
            // implement a LINQ pattern using the Task.
            // Rename the "Then" function you implemented to SelectMany, such that the three following signatures
            // are satisfied :
            // Task<TOut> SelectMany<TIn, TMid, TOut>(this Task<TIn> input, Func<TIn, Task<TMid>> f, Func<TIn, TMid, TOut> projection)
            // Task<TOut> SelectMany<TIn, TOut>(this Task<TIn> first, Func<TIn, Task<TOut>> next)
            // Task<TOut> Select<TIn, TOut>(this Task<TIn> task, Func<TIn, TOut> projection)
            // Then uncomment the following code, add the missing code and run it

            var matchSet = await(
                from contentFile in files.Traverse(f => File.ReadAllTextAsync(f))
                from words in contentFile.Traverse(text =>
                                                   WordRegex.Value.Split(text).Where(w => !IgnoreWords.Contains(w)))
                let wordSet = words.Flatten().AsSet()
                              // TODO (2)
                              from bestMatch in wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(wordSet, wl, threshold))
                              select bestMatch.Flatten());

            // NOTES
            // Here is the code that leverages the "ReadFileLinesAndFlatten" method

            //var matchSet = await (
            //    from contentFile in files.Traverse(f => ReadFileLinesAndFlatten(f))
            //    let wordSet = contentFile.Flatten().AsSet()
            //    from bestMatch in wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(wordSet, wl, threshold))
            //    select bestMatch.Flatten());

Exemplo n.º 12
        // C# example
        /// <summary>
        /// Wires a TPL Dataflow chain (buffer -> read file text -> split into words -> find matches)
        /// and sends every path in <paramref name="files"/> into its head block. An agent created
        /// with <c>Agent.Start</c> folds the incoming <c>WordDistanceStruct[]</c> batches into a
        /// <c>Dictionary&lt;string, HashSet&lt;string&gt;&gt;</c> state.
        /// </summary>
        /// <param name="wordsLookup">Words passed one by one to <c>JaroWinklerModule.bestMatchTask</c>.</param>
        /// <param name="files">Paths of the files to send through the dataflow chain.</param>
        // NOTE(review): this snippet has lost its braces and several lines (object initializers are
        // unterminated, the match lambda has no visible return, and the link between
        // foundMatchesBlock and the agent is cut). Restore them from the original source before compiling.
        public static async Task RunFuzzyMatchAgentCSharp(string[] wordsLookup, IList <string> files)
            var cts = new CancellationTokenSource();
            // Options shared by every block below: capacity bound of 10, up to 4 parallel
            // invocations, and cancellation through cts.
            var opt = new ExecutionDataflowBlockOptions
                BoundedCapacity        = 10,
                MaxDegreeOfParallelism = 4,
                CancellationToken      = cts.Token

            // Head of the chain: receives the file paths.
            var inputBlock = new BufferBlock <string>(opt);

            // Reads the whole file as a single string, honoring the cancellation token.
            var readLinesBlock =
                new TransformBlock <string, string>(
                    async file => await File.ReadAllTextAsync(file, cts.Token), opt);

            // Splits the text with WordRegex, drops ignored words, and de-duplicates via AsSet().
            var splitWordsBlock =
                new TransformBlock <string, string[]>(
                    text => WordRegex.Value.Split(text).Where(w => !IgnoreWords.Contains(w)).AsSet().ToArray(), opt);

            // Runs one bestMatchTask call per lookup word against the word set.
            var foundMatchesBlock =
                new TransformBlock <string[], WordDistanceStruct[]>(async wordSet =>
                var matches =
                    await wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(wordSet, wl, threshold));
            }, opt);

            // Completion (and faults) of a source block are forwarded to its linked target.
            var linkOptions = new DataflowLinkOptions {
                PropagateCompletion = true

            // TODO (7) (for C#)
            // Implement a stateful agent using the TPL Dataflow.
            // The Agent should have an internal state protected from external access.
            // The function passed in the constructor applies a project/reduce to the incoming messages and the current state,
            // to return a new state
            // (see AgentAggregator.cs)
            var agent = Agent.Start(new Dictionary <string, HashSet <string> >(),
                                    (Dictionary <string, HashSet <string> > state, WordDistanceStruct[] matches) =>
                // Groups the incoming matches by looked-up word; each group keeps its distinct Match values.
                var matchesDic = matches
                                 .GroupBy(w => w.Word)
                    k => k.Key,
                    v => v.Select(w => w.Match).AsSet());

                // Works on a copy so the previous state object is not mutated.
                var newState = Clone(state);
                foreach (var match in matchesDic)
                    // NOTE(review): as shown, the set just read is written back unchanged and the
                    // Add call is not visibly guarded by an else; a merge of match.Value into
                    // values may have been lost in this snippet — confirm against the original.
                    if (newState.TryGetValue(match.Key, out HashSet <string> values))
                        newState[match.Key] = values;
                        newState.Add(match.Key, match.Value);


            // Collects the link subscriptions so they can all be disposed together.
            IDisposable disposeAll = new CompositeDisposable(
                inputBlock.LinkTo(readLinesBlock, linkOptions),
                readLinesBlock.LinkTo(splitWordsBlock, linkOptions),
                splitWordsBlock.LinkTo(foundMatchesBlock, linkOptions),
                    summaryMathces => PrintSummary(summaryMathces))


            // SendAsync (rather than Post) is awaited for each file because the blocks have a bounded capacity.
            foreach (var file in files)
                await inputBlock.SendAsync(file, cts.Token);

            // NOTE(review): with the lines below commented out, nothing visible here completes
            // inputBlock or disposes the links.
            //  inputBlock.Complete();
            //  await foundMatchesBlock.Completion.ContinueWith(_ =>
            //      disposeAll.Dispose());
Exemplo n.º 13
        /// <summary>
        /// Wires a TPL Dataflow chain (buffer -> read file text -> split into a word set ->
        /// batch of <c>files.Count</c> sets -> find matches), sends every path in
        /// <paramref name="files"/> into its head block, then awaits the completion of the
        /// match block and disposes the links.
        /// </summary>
        /// <param name="wordsLookup">Words passed one by one to <c>JaroWinklerModule.bestMatchTask</c>.</param>
        /// <param name="files">Paths of the files to process; its count is used as the batch size.</param>
        // NOTE(review): this snippet has lost its braces and several lines (object initializers are
        // unterminated and the match lambda has no visible return). Restore them from the
        // original source before compiling.
        public static async Task RunFuzzyMatchDataFlow(string[] wordsLookup, IList <string> files)
            var cts = new CancellationTokenSource();
            // Options shared by the buffer and transform blocks below.
            var opt = new ExecutionDataflowBlockOptions
                BoundedCapacity = 10,
                // TODO, change this value and check what is happening
                MaxDegreeOfParallelism = 1,
                CancellationToken      = cts.Token

            // Used as the batch size so that the word sets of all files are grouped into one array.
            int fileCount = files.Count;

            // Head of the chain: receives the file paths.
            var inputBlock = new BufferBlock <string>(opt);

            // Reads the whole file as a single string, honoring the cancellation token.
            var readLinesBlock =
                new TransformBlock <string, string>(
                    async file => await File.ReadAllTextAsync(file, cts.Token), opt);

            // Splits the text with WordRegex, drops ignored words, and de-duplicates via AsSet().
            var splitWordsBlock =
                new TransformBlock <string, HashSet <string> >(
                    text => WordRegex.Value.Split(text).Where(w => !IgnoreWords.Contains(w)).AsSet(), opt);

            // Groups fileCount word sets into a single HashSet<string>[] message.
            var batch =
                new BatchBlock <HashSet <string> >(fileCount);

            // Flattens the batched word sets into one set and runs one bestMatchTask call per lookup word.
            var foundMatchesBlock =
                new TransformBlock <HashSet <string>[], WordDistanceStruct[]>(
                    async wordSet =>
                var wordSetFlatten = wordSet.Flatten().AsSet();
                var matches        =
                    await wordsLookup.Traverse(wl =>
                                               JaroWinklerModule.bestMatchTask(wordSetFlatten, wl, threshold));
            }, opt);

            // TODO (5)
            // Implement a block named "printBlock", which prints the output of
            // the foundMatchesBlock using the "PrintSummary" method.
            // Then link the block to the "foundMatchesBlock" block
            // var printBlock = // missing code

            // Completion (and faults) of a source block are forwarded to its linked target.
            var linkOptions = new DataflowLinkOptions {
                PropagateCompletion = true

            // Collects the link subscriptions so they can all be disposed together.
            IDisposable disposeAll = new CompositeDisposable(
                inputBlock.LinkTo(readLinesBlock, linkOptions),
                readLinesBlock.LinkTo(splitWordsBlock, linkOptions),
                splitWordsBlock.LinkTo(batch, linkOptions),
                batch.LinkTo(foundMatchesBlock, linkOptions)
                // TODO uncomment this code after
                // implementing TODO (5)
                // foundMatchesBlock.LinkTo(printBlock)


            // TODO (6)
            // After completing TODO (5), remove or unlink the printBlock, and replace the output of the "foundMatchesBlock" block
            // with Reactive Extensions "AsObservable", maintaining the call to the "PrintSummary" method

            // SendAsync (rather than Post) is awaited for each file because the blocks have a bounded capacity.
            foreach (var file in files)
                await inputBlock.SendAsync(file, cts.Token);

            // NOTE(review): no inputBlock.Complete() call is visible before this await; completion only
            // propagates down the links once the head block is completed, so as shown this await
            // would not finish. Also, nothing visible consumes foundMatchesBlock's output until
            // TODO (5) is implemented. Confirm against the original source.
            await foundMatchesBlock.Completion.ContinueWith(_ => disposeAll.Dispose());
Exemplo n.º 14
        // Example F#
        /// <summary>
        /// Wires a TPL Dataflow chain (buffer -> read file text -> split into words -> find matches)
        /// and sends every path in <paramref name="files"/> into its head block. A
        /// <c>ReactiveAgent.AgentObservable</c> folds the incoming <c>WordDistanceStruct[]</c>
        /// batches into a <c>Dictionary&lt;string, HashSet&lt;string&gt;&gt;</c> state.
        /// </summary>
        /// <param name="wordsLookup">Words passed one by one to <c>JaroWinklerModule.bestMatchTask</c>.</param>
        /// <param name="files">Paths of the files to send through the dataflow chain.</param>
        // NOTE(review): this snippet has lost its braces and several lines (object initializers are
        // unterminated, the match lambda has no visible return, and the link between
        // foundMatchesBlock and the agent is cut). Restore them from the original source before compiling.
        public static async Task RunFuzzyMatchAgentFSharp(string[] wordsLookup, IList <string> files)
            var cts = new CancellationTokenSource();
            // Options shared by every block below: capacity bound of 10, up to 4 parallel
            // invocations, and cancellation through cts.
            var opt = new ExecutionDataflowBlockOptions
                BoundedCapacity        = 10,
                MaxDegreeOfParallelism = 4,
                CancellationToken      = cts.Token

            // Head of the chain: receives the file paths.
            var inputBlock = new BufferBlock <string>(opt);

            // Reads the whole file as a single string, honoring the cancellation token.
            var readLinesBlock =
                new TransformBlock <string, string>(
                    file => File.ReadAllTextAsync(file, cts.Token), opt);

            // Splits the text with WordRegex, drops ignored words, and de-duplicates via AsSet().
            var splitWordsBlock = new TransformBlock <string, string[]>(
                text => WordRegex.Value.Split(text).Where(w => !IgnoreWords.Contains(w)).AsSet().ToArray(), opt);

            // Runs one bestMatchTask call per lookup word against the word set.
            var foundMatchesBlock =
                new TransformBlock <string[], WordDistanceStruct[]>(async wordSet =>
                var matches =
                    await wordsLookup.Traverse(wl => JaroWinklerModule.bestMatchTask(wordSet, wl, threshold));
            }, opt);

            // Completion (and faults) of a source block are forwarded to its linked target.
            var linkOptions = new DataflowLinkOptions {
                PropagateCompletion = true

            // TODO (7) (for F#)
            // Implement a Reactive MailboxProcessor in F#.
            // Go to the Fsharp project, Module 3 and follow the instructions (7.a)
            // then, uncomment the following code and remove the previous code that uses
            // the Agent based on TPL Dataflow

            // The agent starts from an empty dictionary and applies the reducer below to each message.
            var agent =
                new ReactiveAgent.AgentObservable <WordDistanceStruct[], Dictionary <string, HashSet <string> > >
                    (new Dictionary <string, HashSet <string> >(),
                    (state, matches) =>
                // Groups the incoming matches by looked-up word; each group keeps its distinct Match values.
                var matchesDic = matches
                                 .GroupBy(w => w.Word).ToDictionary(k => k.Key,
                                                                    v => v.Select(w => w.Match).AsSet());

                // Cloning the state is important to stay free of race conditions;
                // alternatively, use an immutable collection.
                var newState = Clone(state);
                foreach (var match in matchesDic)
                    // NOTE(review): as shown, the set just read is written back unchanged and the
                    // Add call is not visibly guarded by an else; a merge of match.Value into
                    // values may have been lost in this snippet — confirm against the original.
                    if (newState.TryGetValue(match.Key, out HashSet <string> values))
                        newState[match.Key] = values;
                        newState.Add(match.Key, match.Value);


            // Collects the link subscriptions so they can all be disposed together.
            IDisposable disposeAll = new CompositeDisposable(
                inputBlock.LinkTo(readLinesBlock, linkOptions),
                readLinesBlock.LinkTo(splitWordsBlock, linkOptions),
                splitWordsBlock.LinkTo(foundMatchesBlock, linkOptions),
                    summaryMathces => PrintSummary(summaryMathces))


            // SendAsync (rather than Post) is awaited for each file because the blocks have a bounded capacity.
            foreach (var file in files)
                await inputBlock.SendAsync(file, cts.Token);

            // NOTE(review): with the lines below commented out, nothing visible here completes
            // inputBlock or disposes the links.
            // inputBlock.Complete();
            // await foundMatchesBlock.Completion.ContinueWith(_ => disposeAll.Dispose());