Beispiel #1
0
 public void OutputStartResource(QueueItem item)
 {
     Writer.WriteStartElement("resource");
 }
 /// <summary>
 /// Returns true if the source and target hosts are equal
 /// </summary>
 /// <param name="item"></param>
 /// <returns></returns>
 bool HostsAreEqual(QueueItem item) => item.SourceUri == null || item.SourceUri.Host == item.TargetUri.Host;
        /// <summary>
        /// Fetches a copy of a HTTP resource
        /// </summary>
        /// <param name="uri"></param>
        /// <param name="writeStream"></param>
        /// <param name="lastFetchedDate">Date we last fetched the resource - helps in optimising resources</param>
        /// <returns></returns>
        public override async Task Fetch(
            QueueItem queueItem,
            Uri uri,
            Uri refererUri,
            DateTime?lastFetchedDate,
            BufferBlock <IWebResourceWriter> writerPool
            )
        {
            // the process is
            // 1. update the DB to say we're fetching
            // 2. create a request
            // 3. wait for the response
            // 4. download the response into memory buffers
            // 5. aquire a write lock
            // 6. write to the XML
            // 7. clean up

            HttpWebResponse response  = null;
            HttpWebRequest  request   = null;
            DateTime        startTime = DateTime.UtcNow;
            Exception       exception = null;
            bool            wroteOk   = false;

            try
            {
                base.BeginRequest();

                request = CreateRequest(queueItem, refererUri, uri, lastFetchedDate);

                var netTask = request.GetResponseAsync();

                await Task.WhenAny(netTask, Task.Delay(request.Timeout)).ConfigureAwait(false);

                if (netTask.Status != TaskStatus.RanToCompletion)
                {
                    if (netTask.Exception != null)
                    {
                        throw netTask.Exception;
                    }
                    else
                    {
                        throw new TimeoutException(string.Format("Request timed out: {0}", request.RequestUri));
                    }
                }

                response = await netTask as HttpWebResponse;

                if (FetchoConfiguration.Current.BlockProvider.IsBlocked(request, response, out string block_reason))
                {
                    response.Close();
                    throw new FetchoResourceBlockedException(block_reason);
                }
                else
                {
                    bool firstTime = true;
                    var  rspw      = WriteOutResponse(queueItem, writerPool, request, response, startTime);

                    while (rspw.Status != TaskStatus.RanToCompletion && rspw.Status != TaskStatus.Faulted)
                    {
                        if (!firstTime)
                        {
                            throw new TimeoutException("WriteOutResponse timed out");
                        }
                        var wait = Task.Delay(queueItem == null ? FetchoConfiguration.Current.ResponseReadTimeoutInMilliseconds : queueItem.ReadTimeoutInMilliseconds);
                        await Task.WhenAny(wait, rspw);

                        if (!firstTime && ActiveFetches < 5)
                        {
                            log.DebugFormat("Been waiting a while for {0}", request.RequestUri);
                        }
                        firstTime = false;
                    }

                    wroteOk = await rspw.ConfigureAwait(false);

                    //await DoBookkeeping(queueItem, request);
                }
            }
            catch (Exception ex)
            {
                await ErrorHandler(ex, request, queueItem);

                exception = ex;
            }
            finally
            {
                try
                {
                    if (!wroteOk)
                    {
                        var packet = await writerPool.ReceiveAsync();

                        packet.OutputStartResource(queueItem);
                        packet.OutputRequest(request, startTime);
                        packet.OutputException(exception);
                        packet.OutputEndResource();
                        await writerPool.SendAsync(packet);
                    }
                }
                catch (Exception ex)
                {
                    log.ErrorFormat("Barfing because we got an error in the error handling code: {0}", ex);
                    Environment.Exit(1);
                }
                response?.Dispose();
                response = null;
                base.EndRequest();
            }
        }
        private async Task ErrorHandler(Exception ex, HttpWebRequest request, QueueItem queueItem)
        {
            IncFetchExceptions();

            if (ex is AggregateException aggex)
            {
                ex = aggex.InnerException;
            }

            if (ex is FetchoResourceBlockedException)
            {
                // do nothing ignore it
            }
            else if (ex is WebException webex)
            {
                if (webex.InnerException is SocketException)
                {
                    if (queueItem == null)
                    {
                        await RecordNetworkIssues(await Utility.GetHostIPAddress(request?.RequestUri));
                    }
                    else
                    {
                        await RecordNetworkIssues(queueItem);
                    }
                    TerseExceptionOutput(request?.RequestUri, ex);
                }
                else if (webex.Response is HttpWebResponse resp)
                {
                    if (resp.StatusCode == (HttpStatusCode)429) // too fast - increase our wait time
                    {
                        if (queueItem == null)
                        {
                            await IncreaseFetchTimeoutForHost(await Utility.GetHostIPAddress(request?.RequestUri));
                        }
                        else
                        {
                            await IncreaseFetchTimeoutForHost(queueItem.TargetIP);
                        }
                        TerseExceptionOutput(request?.RequestUri, ex);
                    }
                    else if (resp.StatusCode == HttpStatusCode.NotFound)
                    {
                        IncNotFound();
                    }
                    else if (resp.StatusCode == HttpStatusCode.Forbidden || resp.StatusCode == HttpStatusCode.Unauthorized)
                    {
                        IncForbidden();
                    }
                    else
                    {
                        TerseExceptionOutput(request?.RequestUri, ex);
                    }
                }
            }
            else if (ex is TimeoutException timeout)
            {
                if (queueItem == null)
                {
                    await RecordNetworkIssues(await Utility.GetHostIPAddress(request?.RequestUri));
                }
                else
                {
                    await RecordNetworkIssues(queueItem);
                }
                //TerseExceptionOutput(request?.RequestUri, ex);
            }
            else
            {
                VerboseExceptionOutput(request?.RequestUri, ex);
            }

            // In memorandum:
            // The line here was originally if ( !OutputInUse) await OutputSync.WaitAsync();
            // OutputInUse was defined as "OutputSync.CurrentCount == 0"
            // This contains a very subtle race condition it took about 300-400gb of
            // downloading before it finally reared its ugly head and corrupted the
            // Xml file.
        }
        /// <summary>
        /// </summary>
        /// <param name="writers"></param>
        /// <param name="request"></param>
        /// <param name="response"></param>
        /// <param name="startTime"></param>
        /// <returns></returns>
        private async Task <bool> WriteOutResponse(
            QueueItem queueItem,
            BufferBlock <IWebResourceWriter> writers,
            HttpWebRequest request,
            HttpWebResponse response,
            DateTime startTime
            )
        {
            IWebResourceWriter packet    = null;
            Exception          exception = null;
            bool wroteOk = false;

            // this has a potential to cause memory issues if theres lots of waiting
            byte[] buffer    = new byte[FetchoConfiguration.Current.MaxFileDownloadLengthInBytes];
            int    bytesread = 0;

            // Read as much into memory as possible up to the max limit

            Stream readStream = null;

            try
            {
                // this is down without using() as it may run slow and the cleanup of the object may happen before we get to cleanup
                // ie. race conditions. By doing it after we handle exceptions we control when the disposal occurs ourselves.
                readStream = response.GetResponseStream();

                int l = 0;
                do
                {
                    l = await readStream.ReadAsync(buffer, bytesread, buffer.Length - bytesread); // dont configureawait - disposed?

                    bytesread += l;
                }while (l > 0 && bytesread < buffer.Length); // read up to the buffer limit and ditch the rest
            }
            catch (Exception ex)
            {
                log.Error(ex);
                exception = ex;
                bytesread = 0;
            }
            finally
            {
                readStream.Dispose();
            }

            try
            {
                // once we've got plenty of bytes go find a lock
                IncWaitingToWrite();
                packet = await writers.ReceiveAsync().ConfigureAwait(false);

                DecWaitingToWrite();

                packet.OutputStartResource(queueItem);
                packet.OutputRequest(request, startTime);
                if (bytesread > 0)
                {
                    packet.OutputResponse(response, buffer, bytesread);
                }
            }
            catch (Exception ex)
            {
                await ErrorHandler(ex, request, queueItem);

                exception = ex;
            }
            finally
            {
                packet.OutputException(exception);
                packet.OutputEndResource();
                await writers.SendAsync(packet).ConfigureAwait(false);

                wroteOk = true;
            }
            return(wroteOk);
        }