/// <summary>
/// Walks the resources in <paramref name="packet"/> and forwards each part the
/// consumer has opted into (request, response headers and stream, exception)
/// to <c>Consumer</c>.
/// </summary>
/// <remarks>
/// A failure while handling a single resource is logged and the loop moves on
/// to the next resource. Any exception that escapes the loop, including the
/// spin guard below, is reported through <c>Consumer.ReadingException</c>.
/// <c>Consumer.PacketClosed</c> is always called last.
/// </remarks>
/// <param name="packet">
/// Packet reader to consume. The resource it is currently on is processed
/// before <c>NextResource()</c> is called for the first time.
/// </param>
/// <returns>A task that completes once the packet has been walked.</returns>
public async Task Process(WebDataPacketReader packet)
{
    try
    {
        Consumer.PacketOpened();

        do
        {
            try
            {
                Consumer.NewResource();

                if (Consumer.ProcessesRequest)
                {
                    string request = packet.GetRequestString();
                    await Consumer.ProcessRequest(request).ConfigureAwait(false);
                }

                if (Consumer.ProcessesResponse)
                {
                    string headers = packet.GetResponseHeaders();
                    await Consumer.ProcessResponseHeaders(headers).ConfigureAwait(false);

                    using (var body = packet.GetResponseStream())
                    {
                        await Consumer.ProcessResponseStream(body).ConfigureAwait(false);
                    }
                }

                if (Consumer.ProcessesException)
                {
                    string exceptionText = packet.GetException();
                    await Consumer.ProcessException(exceptionText).ConfigureAwait(false);
                }
            }
            catch (Exception resourceError)
            {
                // One bad resource must not abort the rest of the packet.
                log.Error(resourceError);
            }

            // Counted whether or not the resource was handled successfully.
            ResourcesProcessedCount++;

            // Guard against a reader that never runs out of resources.
            if (packet.ResourceCountSeen > WebDataPacketReader.MaxResourcesInAFile)
            {
                throw new FetchoException("Something wrong with packet - it keeps spinning");
            }
        }
        while (packet.NextResource());
    }
    catch (Exception readError)
    {
        Consumer.ReadingException(readError);
    }
    finally
    {
        Consumer.PacketClosed();
    }
}
/// <summary>
/// Updates the response-header statistics for the current resource: the
/// running header size, the current content type, and the per-value counters
/// for the <c>content-encoding</c> and <c>content-language</c> headers.
/// </summary>
/// <remarks>
/// Header values are lower-cased with the invariant culture so that the same
/// token always lands in the same counter regardless of the machine's current
/// culture (e.g. "GZIP" must not become "gzıp" under a Turkish culture).
/// A missing header is counted under "(not specified)".
/// </remarks>
/// <param name="responseHeaders">Raw response header text. Must not be null.</param>
/// <returns>A task that has already finished when it is returned; the method contains no awaits.</returns>
public override async Task ProcessResponseHeaders(string responseHeaders)
{
    // NOTE: Length is the number of UTF-16 chars in the string, which is what
    // this counter has always accumulated.
    CountOfHeaderBytes += (ulong)responseHeaders.Length;
    ContentType = WebDataPacketReader.GetContentTypeFromResponseHeaders(responseHeaders);

    var headers = WebDataPacketReader.GetHeaders(responseHeaders);

    string encoding;
    if (headers.TryGetValue("content-encoding", out encoding))
    {
        Increment(ContentEncoding, encoding.ToLowerInvariant());
    }
    else
    {
        Increment(ContentEncoding, "(not specified)");
    }

    string language;
    if (headers.TryGetValue("content-language", out language))
    {
        Increment(ContentLanguage, language.ToLowerInvariant());
    }
    else
    {
        Increment(ContentLanguage, "(not specified)");
    }
}
/// <summary>
/// Records an exception entry for the current resource: bumps the overall
/// exception count and the counter for the entry's classification.
/// </summary>
/// <param name="exception">
/// Exception text read from the packet. Ignored when
/// <c>WebDataPacketReader.IsException</c> rejects it.
/// </param>
/// <returns>A task that has already finished when it is returned; the method contains no awaits.</returns>
public override async Task ProcessException(string exception)
{
    if (!WebDataPacketReader.IsException(exception))
    {
        return;
    }

    ExceptionCount++;

    string bucket = ExceptionClassifier.Classify(exception).ToString();
    Increment(ExceptionCounts, bucket);
}
/// <summary>
/// Downloads and parses the robots file for the host of <paramref name="anyUri"/>.
/// </summary>
/// <param name="anyUri">
/// Any URI on the target site; the robots URI is derived from it with
/// <c>MakeRobotsUri</c>.
/// </param>
/// <param name="lastFetched">
/// Passed through unchanged to <c>HttpResourceFetcher.Fetch</c>
/// (presumably the time of the previous fetch - TODO confirm).
/// </param>
/// <returns>
/// The parsed <c>RobotsFile</c>; an empty <c>RobotsFile</c> when the fetched
/// packet has no response stream; or <c>null</c> when fetching or parsing
/// threw (the exception is logged, not rethrown).
/// </returns>
public static async Task<RobotsFile> DownloadRobots(Uri anyUri, DateTime? lastFetched)
{
    RobotsFile robots = null;
    var robotsUri = MakeRobotsUri(anyUri);

    try
    {
        // The address is only consumed by the disabled throttle below, but the
        // lookup is kept because a failure here still aborts the download.
        var ip = await Utility.GetHostIPAddress(robotsUri).ConfigureAwait(false);

        /* Disabled host-cache throttle:
         * while (!await FetchoConfiguration.Current.HostCache.WaitToFetch(ip, 60000))
         *     Utility.LogInfo("IP Congestion {0}", ip);
         */

        var bb = new BufferBlock<IWebResourceWriter>();

        using (var ms = new MemoryStream())
        {
            using (var packet = new WebDataPacketWriter(ms))
            {
                // this is annoying, I shouldn't have to create a buffer block to get a robots file
                // or we should put robots into the standard flow of things
                await bb.SendAsync(packet).ConfigureAwait(false);
                await (new HttpResourceFetcher()).Fetch(null, robotsUri, null, lastFetched, bb).ConfigureAwait(false);
            }

            // The writer is disposed above; rewind and read back what it wrote.
            ms.Seek(0, SeekOrigin.Begin);

            using (var packet = new WebDataPacketReader(CreateXmlReader(ms)))
            using (var stream = packet.GetResponseStream())
            {
                robots = stream == null
                    ? new RobotsFile()
                    : new RobotsFile(robotsUri, stream);
            }
        }
    }
    catch (Exception ex)
    {
        Utility.LogInfo("Fetching {0}:", robotsUri);
        Utility.LogException(ex);
    }

    return robots;
}
/// <summary>
/// Updates the request statistics for the current resource: resource count,
/// running request size, per-TLD and per-host counters, and the accumulated
/// response time taken from the <c>responsetime</c> header when present.
/// </summary>
/// <remarks>
/// The resource and size counters are updated even when no URI can be
/// extracted from the request; the remaining statistics are skipped in that case.
/// <c>TimeSpan.Parse</c> throws on a malformed <c>responsetime</c> value and
/// that exception is not handled here.
/// </remarks>
/// <param name="request">Raw request text. Must not be null.</param>
/// <returns>A task that has already finished when it is returned; the method contains no awaits.</returns>
public override async Task ProcessRequest(string request)
{
    CurrentUri = WebDataPacketReader.GetUriFromRequestString(request);
    ResourceCount++;
    CountOfRequestBytes += (ulong)request.Length;

    if (CurrentUri == null)
    {
        return;
    }

    // CurrentUri is known to be non-null from here on.
    string host = CurrentUri.Host;

    var parsedDomain = domainParser.Get(host);
    Increment(TLDCounts, parsedDomain != null ? parsedDomain.TLD : "(blank)");
    Increment(HostCounts, host);

    var requestHeaders = WebDataPacketReader.GetHeaders(request);
    if (!requestHeaders.ContainsKey("responsetime"))
    {
        return;
    }

    TimeSpan elapsed = TimeSpan.Parse(requestHeaders["responsetime"]);
    ResponseTimeMilliseconds += elapsed.TotalMilliseconds;
}