/// <summary>
/// Checks whether the downloaded content declares exactly the given media type.
/// </summary>
/// <param name="contextInfo">The downloaded item; may be <c>null</c>.</param>
/// <param name="compareType">The media type to compare against, e.g. "text/html".</param>
/// <returns>
/// <c>true</c> when the content's Content-Type media type equals
/// <paramref name="compareType"/> ignoring case; <c>false</c> when the item,
/// its content, or its Content-Type header is missing, or the types differ.
/// </returns>
private static bool IsMediaType(HttpContentInfo contextInfo, string compareType)
{
    if (contextInfo == null)
    {
        return false;
    }

    HttpContent context = contextInfo.Content;
    if (context == null)
    {
        return false;
    }

    // The Content-Type header is optional, so the parsed header value may be null.
    var contentType = context.Headers.ContentType;
    if (contentType == null)
    {
        return false;
    }

    // Media types are protocol tokens, not linguistic text: compare ordinally so
    // the result does not depend on the current culture's casing rules.
    return string.Equals(contentType.MediaType, compareType, StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Checks whether the downloaded content's media type starts with the given prefix.
/// </summary>
/// <param name="contextInfo">The downloaded item; may be <c>null</c>.</param>
/// <param name="compareType">The media type prefix to look for, e.g. "image/".</param>
/// <returns>
/// <c>true</c> when the content's Content-Type media type starts with
/// <paramref name="compareType"/> ignoring case; <c>false</c> when the item,
/// its content, its Content-Type header or its media type is missing,
/// or the prefix does not match.
/// </returns>
private static bool IsMediaTypeStartWith(HttpContentInfo contextInfo, string compareType)
{
    if (contextInfo == null)
    {
        return false;
    }

    HttpContent context = contextInfo.Content;
    if (context == null)
    {
        return false;
    }

    // The Content-Type header is optional, so the parsed header value may be null.
    var contentType = context.Headers.ContentType;
    if (contentType == null)
    {
        return false;
    }

    string mediaType = contentType.MediaType;
    if (mediaType == null)
    {
        return false;
    }

    // Ordinal comparison: media types are protocol tokens, not linguistic text.
    return mediaType.StartsWith(compareType, StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Builds and runs a TPL Dataflow crawling network: a downloader block fetches
/// URLs, a parser block extracts further links from the downloaded content and
/// feeds them back to the downloader, and a writer block saves downloaded
/// images into the "Images" directory.
/// </summary>
/// <remarks>
/// The crawl is seeded with URL_CRAWL_TARGET, left running for
/// COMPLETE_AFTER_SEC seconds and then asked to complete; a status line is
/// printed every two seconds until all three blocks have completed.
/// NOTE(review): the method is <c>async void</c> and blocks the calling thread
/// with <c>Thread.Sleep</c> during the crawl window. Both are kept because
/// callers may rely on that blocking behaviour; consider returning a Task.
/// </remarks>
static async void StartCrawling()
{
    if (!Directory.Exists("Images"))
    {
        Directory.CreateDirectory("Images");
    }

    // A single client is shared by every download instead of creating (and
    // never disposing) one per URL; it is released in the finally block.
    HttpClient client = null;
    try
    {
        client = new HttpClient();
        client.Timeout = TimeSpan.FromSeconds(DOWNLOAD_TIMEOUT_SEC);

        #region Dataflow block Options

        var downloaderOptions = new ExecutionDataflowBlockOptions
        {
            // Enforce fairness: after handling this many messages the block's
            // task is re-scheduled, which gives the other blocks a chance to
            // process their own messages.
            MaxMessagesPerTask = DOWNLOADER_MAX_MESSAGE_PER_TASK,
            // A block uses a single task by default; this raises the limit.
            MaxDegreeOfParallelism = DOWNLOADER_MAX_DEGREE_OF_PARALLELISM,
            // Size of the block's input buffer.
            BoundedCapacity = DOWNLOADER_BOUNDED_CAPACITY
        };

        var transformerOptions = new ExecutionDataflowBlockOptions
        {
            MaxMessagesPerTask = MAX_MESSAGE_PER_TASK,
        };

        var writerOptions = new ExecutionDataflowBlockOptions
        {
            // A block uses a single task by default; this raises the limit.
            MaxDegreeOfParallelism = WRITER_MAX_DEGREE_OF_PARALLELISM,
        };

        var linkOption = new DataflowLinkOptions { PropagateCompletion = true };

        #endregion // Dataflow block Options

        #region Downloader

        // Produces the downloaded content for a URL, or null when the URL was
        // already visited, is filtered out, or could not be downloaded.
        var downloader = new TransformBlock<string, HttpContentInfo>(
            async (url) =>
            {
                try
                {
                    #region Validation

                    // TryAdd is a single atomic "claim this URL" step; a separate
                    // ContainsKey check would let two parallel downloads of the
                    // same URL both pass.
                    if (!_urls.TryAdd(url, true))
                    {
                        return null;
                    }

                    if (!ShouldContinue(url))
                    {
                        return null;
                    }

                    #endregion // Validation

                    // Awaiting releases the worker thread while the request is in flight.
                    HttpResponseMessage response = await client.GetAsync(url);
                    if (!response.IsSuccessStatusCode)
                    {
                        WriteToConsole("Fail to download html: [{0}] \r\n\tStatus Code = {1}", ConsoleColor.Red, url, response.StatusCode);
                        response.Dispose();
                        return null;
                    }

                    HttpContent content = response.Content;
                    var contentType = content == null ? null : content.Headers.ContentType;

                    #region Validation

                    // Check for a missing Content-Type BEFORE reading MediaType;
                    // the header is optional and dereferencing it first would throw.
                    if (contentType == null)
                    {
                        WriteToConsole("Unknown content type [{0}]: {1}", ConsoleColor.Gray, string.Empty, url);
                        response.Dispose();
                        return null;
                    }

                    #endregion // Validation

                    string mediaType = contentType.MediaType;
                    WriteToConsole("Downloaded [{0}]: {1}", ConsoleColor.White, mediaType, url);

                    var info = new HttpContentInfo(url, content);
                    if (!IsMediaType(info, "text/html"))
                    {
                        Trace.WriteLine("Downloaded [" + mediaType + "]: " + url);
                    }

                    return info;
                }
                #region Exception Handling
                catch (UriFormatException ex)
                {
                    WriteToConsole("invalid URL", ConsoleColor.Red, ex.Message);
                }
                catch (WebException ex)
                {
                    WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, ex.Message);
                }
                catch (AggregateException ex)
                {
                    foreach (var exc in ex.Flatten().InnerExceptions)
                    {
                        WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, exc.Message);
                    }
                }
                catch (Exception ex)
                {
                    WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
                }
                #endregion // Exception Handling

                return null;
            },
            downloaderOptions);

        #endregion // Downloader

        #region Parser

        // Extracts every match of _linkRegex from the downloaded text.
        var parser = new TransformManyBlock<HttpContentInfo, string>(
            async contentInfo =>
            {
                var output = new List<string>();
                try
                {
                    HttpContent content = contentInfo.Content;

                    // Reading the body is inside the try: an unreadable body must
                    // not fault the block, because a fault would travel through
                    // the completion-propagating links and stop the whole network.
                    string html = await content.ReadAsStringAsync();

                    var links = _linkRegex.Matches(html);
                    foreach (Match item in links)
                    {
                        output.Add(item.Value);
                    }
                }
                #region Exception Handling
                catch (Exception ex)
                {
                    WriteToConsole("Error {0}", ConsoleColor.Red, ex.Message);
                }
                #endregion // Exception Handling

                return output;
            },
            transformerOptions);

        #endregion // Parser

        #region Writer

        // Saves a downloaded image to disk unless a file with the same name was
        // already handled or the image is smaller than MIN_SIZE.
        var writer = new ActionBlock<HttpContentInfo>(
            async contentInfo =>
            {
                try
                {
                    HttpContent content = contentInfo.Content;

                    using (Stream source = await content.ReadAsStreamAsync())
                    using (var image = Image.FromStream(source))
                    {
                        string fileName = Path.GetFileName(contentInfo.Url);

                        #region Validation

                        if (!_images.TryAdd(fileName, true))
                        {
                            return;
                        }

                        if (image.Width < MIN_SIZE.Width || image.Height < MIN_SIZE.Height)
                        {
                            return;
                        }

                        #endregion // Validation

                        string name = @"Images\" + fileName;
                        using (Stream dest = OpenWriteAsync(name))
                        {
                            // Image.FromStream consumed the stream; rewind before copying.
                            source.Position = 0;
                            await source.CopyToAsync(dest);
                            WriteToConsole("{0}: Width:{1}, Height:{2}", ConsoleColor.Yellow, fileName, image.Width, image.Height);
                        }
                    }
                }
                #region Exception Handling
                catch (WebException ex)
                {
                    // The format has two placeholders, so the URL must be supplied too.
                    WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, contentInfo.Url, ex.Message);
                }
                catch (Exception ex)
                {
                    WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
                }
                #endregion // Exception Handling
            },
            writerOptions);

        #endregion // Writer

        var garbageContent = DataflowBlock.NullTarget<HttpContentInfo>();
        var garbageUrl = DataflowBlock.NullTarget<string>();

        #region LinkTo

        ////////////////////////////////////////////////////////
        //                                                    //
        //   garbage <------- downloader <--------------      //
        //                     /    \                  |      //
        //                writer    parser -------------      //
        //                                                    //
        ////////////////////////////////////////////////////////

        // Links are evaluated in the order they are added.
        downloader.LinkTo(writer, linkOption, _isImage);
        downloader.LinkTo(parser, linkOption, info => info != null);
        // Fallback: without it, messages rejected by both predicates would stay
        // in the downloader's output buffer and the block would never complete.
        downloader.LinkTo(garbageContent, linkOption);
        parser.LinkTo(downloader, linkOption, url => !string.IsNullOrEmpty(url));
        parser.LinkTo(garbageUrl, linkOption);

        #endregion // LinkTo

        downloader.Post(URL_CRAWL_TARGET);
        Console.WriteLine("Crawling");

        // Deliberately blocks the calling thread for the crawl window (see remarks).
        Thread.Sleep(COMPLETE_AFTER_SEC * 1000);

        #region Complete

        downloader.Complete();

        ConsoleColor color = ConsoleColor.Yellow;
        WriteToConsole(
            @"Try to Complete (items in the buffer = downloader: is completed = {0}, input={1} , output={2} writer: is completed = {3}, input ={4} parser: is completed = {5}, input={6} , output={7}",
            color,
            downloader.Completion.IsCompleted, downloader.InputCount, downloader.OutputCount,
            writer.Completion.IsCompleted, writer.InputCount,
            parser.Completion.IsCompleted, parser.InputCount, parser.OutputCount);

        Task completeAll = Task.WhenAll(
            downloader.Completion,
            parser.Completion,
            writer.Completion);

        await Task.Run(async () =>
        {
            while (!completeAll.IsCompleted)
            {
                await Task.Delay(2000);

                // Alternate the colour so consecutive status reports are easy to
                // tell apart (the previous test against Magenta never matched).
                color = color == ConsoleColor.Yellow ? ConsoleColor.White : ConsoleColor.Yellow;
                WriteToConsole(
                    @"Complete Status (items in the buffer = downloader: is completed = {0}, input={1} , output={2} writer: is completed = {3}, input ={4} parser: is completed = {5}, input={6} , output={7}",
                    color,
                    downloader.Completion.IsCompleted, downloader.InputCount, downloader.OutputCount,
                    writer.Completion.IsCompleted, writer.InputCount,
                    parser.Completion.IsCompleted, parser.InputCount, parser.OutputCount);
            }
        });

        WriteToConsole("Complete (items in the writer input buffer = {0})", ConsoleColor.Green, writer.InputCount);

        #endregion // Complete
    }
    catch (Exception ex)
    {
        WriteToConsole("EXCEPTION: {0}", ConsoleColor.DarkRed, ex);
    }
    finally
    {
        if (client != null)
        {
            client.Dispose();
        }
    }
}