Beispiel #1
0
        /// <summary>
        ///     Reads CrawlRequests.txt from the application directory and submits each non-comment
        ///     line to the crawler.  Expected CSV format per line:
        ///     AbsoluteUri,Depth,RestrictCrawlTo,RestrictDiscoveriesTo,Priority,RenderType,RenderTypeForChildren
        ///     where the two restriction fields are '|'-delimited UriClassificationType names.
        ///     Any failure is recorded via ArachnodeDAO.InsertException.
        /// </summary>
        private void ParseCrawlRequests()
        {
            try
            {
                lock (_crawlRequestsLock)
                {
                    string[] crawlRequests = File.ReadAllLines(Path.Combine(Path.GetDirectoryName(Environment.CommandLine.Replace("\"", string.Empty)), "CrawlRequests.txt"));

                    foreach (string crawlRequest in crawlRequests)
                    {
                        //test for blank lines first so Trim()/StartsWith never runs against whitespace-only input...
                        if (string.IsNullOrWhiteSpace(crawlRequest) || crawlRequest.Trim().StartsWith("//"))
                        {
                            continue;
                        }

                        string[] crawlRequestSplit = crawlRequest.Split(",".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

                        UriClassificationType restrictCrawlTo = ParseUriClassificationTypes(crawlRequestSplit[2]);
                        UriClassificationType restrictDiscoveriesTo = ParseUriClassificationTypes(crawlRequestSplit[3]);

                        CrawlRequest <ArachnodeDAO> crawlRequest2 = new CrawlRequest <ArachnodeDAO>(new Discovery <ArachnodeDAO>(crawlRequestSplit[0]), int.Parse(crawlRequestSplit[1]), restrictCrawlTo, restrictDiscoveriesTo, double.Parse(crawlRequestSplit[4]), (RenderType)Enum.Parse(typeof(RenderType), crawlRequestSplit[5]), (RenderType)Enum.Parse(typeof(RenderType), crawlRequestSplit[6]));

                        _crawler.Crawl(crawlRequest2);
                    }
                }
            }
            catch (Exception exception)
            {
                new ArachnodeDAO(_applicationSettings.ConnectionString, _applicationSettings, _webSettings, false, false).InsertException(null, null, exception, true);
            }
        }

        /// <summary>
        ///     Parses a '|'-delimited list of UriClassificationType names into a combined flags value.
        /// </summary>
        /// <param name = "uriClassificationTypes">The '|'-delimited enum names.</param>
        /// <returns>The bitwise-OR of the parsed values, or UriClassificationType.None for an empty list.</returns>
        private static UriClassificationType ParseUriClassificationTypes(string uriClassificationTypes)
        {
            UriClassificationType result = UriClassificationType.None;

            foreach (string uriClassificationType in uriClassificationTypes.Split("|".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
            {
                result |= (UriClassificationType)Enum.Parse(typeof(UriClassificationType), uriClassificationType);
            }

            return result;
        }
Beispiel #2
0
        /// <summary>
        ///     Extracts the links from an HTML response.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request; used by BuildUri to resolve each href.</param>
        /// <param name = "content">The HTTP response content.</param>
        /// <returns>The distinct, fragment-free Uris found, excluding schemes listed in exludedSchemas; empty when the content is not text/html.</returns>
        public async Task <IReadOnlyCollection <Uri> > ExtractLinks(CrawlRequest crawlRequest, HttpContent content)
        {
            //the Content-Type header may be absent from the response; treat a missing header as non-HTML instead of throwing a NullReferenceException...
            var mediaType = content.Headers.ContentType == null ? null : content.Headers.ContentType.MediaType;

            if ("text/html".Equals(mediaType, StringComparison.OrdinalIgnoreCase) == false)
            {
                return(new Uri[0]);
            }

            using (var contentStream = await content.ReadAsStreamAsync())
            {
                var parser   = new HtmlParser();
                var document = await parser.ParseAsync(contentStream);

                return(document.Links
                       .Select(l => l.GetAttribute(AttributeNames.Href))
                       .Select(href => BuildUri(crawlRequest, href))
                       .Where(uri => exludedSchemas.Contains(uri.Scheme, StringComparer.OrdinalIgnoreCase) == false)
                       .Select(this.RemoveFragment)
                       .Distinct()
                       .ToList());
            }
        }
Beispiel #3
0
        /// <summary>
        ///     Inserts a CrawlRequest into the database.  File/image requests are given an
        ///     artificially early Created date and a Priority boost so they are processed first.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request to persist.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        private void InsertCrawlRequestIntoDatabase(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
//ANOEDT: Could be improved... images from msn should be processed before images from joescrabshack.com.  TEST TEST TEST!!
            if (_applicationSettings.InsertCrawlRequests)
            {
                //the Originator is null for seed CrawlRequests...
                string originatorAbsoluteUri = crawlRequest.Originator != null ? crawlRequest.Originator.Uri.AbsoluteUri + _applicationSettings.UniqueIdentifier : null;

                DateTime created;
                double priority;

                if (crawlRequest.Discovery.ExpectFileOrImage)
                {
                    //files and images jump the queue: synthesize the earliest possible Created value and boost the Priority...
                    created = SqlDateTime.MinValue.Value.AddSeconds(_databaseCrawlRequestCreatedHelper);
                    priority = crawlRequest.Priority + 1000000;
                }
                else
                {
                    created = crawlRequest.Created;
                    priority = crawlRequest.Priority;
                }

                arachnodeDAO.InsertCrawlRequest(created, originatorAbsoluteUri, crawlRequest.Parent.Uri.AbsoluteUri + _applicationSettings.UniqueIdentifier, crawlRequest.Discovery.Uri.AbsoluteUri + _applicationSettings.UniqueIdentifier, crawlRequest.CurrentDepth, crawlRequest.MaximumDepth, crawlRequest.RestrictCrawlTo, crawlRequest.RestrictDiscoveriesTo, priority, (byte)crawlRequest.RenderType, (byte)crawlRequest.RenderTypeForChildren);
            }

            //advance the helper unconditionally, matching the original behavior even when no insert occurs...
            _databaseCrawlRequestCreatedHelper += 1;
        }
        /// <summary>
        ///     Handles a discovered WebPage: reports it to the console and the counters, hands it to
        ///     the WebPageManager, then routes Discovery processing either to an asynchronous
        ///     DiscoveryProcessor or inline on the current thread.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "webPageManager">The web page manager.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        protected override void ProcessWebPage(CrawlRequest <TArachnodeDAO> crawlRequest, WebPageManager <TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
        {
            _consoleManager.OutputWebPageDiscovered(crawlRequest.Crawl.CrawlInfo.ThreadNumber, crawlRequest);
            Counters.GetInstance().WebPagesDiscovered(1);

            webPageManager.ManageWebPage(crawlRequest);

            //the Crawler may(/will) be null if PostProcessing...
            bool processAsynchronously = ApplicationSettings.ProcessDiscoveriesAsynchronously && !crawlRequest.Crawl.IsProcessingDiscoveriesAsynchronously && crawlRequest.Crawl.Crawler != null;

            if (!processAsynchronously)
            {
                ProcessDiscoveries(crawlRequest, arachnodeDAO);
            }
            else
            {
                crawlRequest.Crawl.Crawler.Engine.DiscoveryProcessors[crawlRequest.Crawl.CrawlInfo.ThreadNumber].AddCrawlRequestToBeProcessed(crawlRequest);
            }
        }

        /// <summary>
        ///     Processes the Discoveries for a crawled page: EmailAddresses, then HyperLinks,
        ///     then Files and Images, each via its dedicated handler.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        public override void ProcessDiscoveries(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            ProcessEmailAddresses(crawlRequest, arachnodeDAO);

            ProcessHyperLinks(crawlRequest, arachnodeDAO);

            ProcessFilesAndImages(crawlRequest, arachnodeDAO);
        }
Beispiel #5
0
        /// <summary>
        ///     Begins a Crawl.  This method bypasses the Cache, and is experimental/for advanced users.
        ///     This method does not function with the DEMO version.
        /// </summary>
        /// <param name = "crawlRequest">The seed CrawlRequest; subsequent requests are dequeued from UncrawledCrawlRequests.</param>
        /// <param name = "obeyCrawlRules">if set to <c>true</c>, CrawlRules are obeyed while crawling.</param>
        /// <param name = "executeCrawlActions">if set to <c>true</c>, CrawlActions are executed while crawling.</param>
        /// <param name = "processDiscoveriesAsynchronously">if set to <c>true</c>, Discoveries are processed asynchronously.</param>
        public void BeginCrawl(CrawlRequest <TArachnodeDAO> crawlRequest, bool obeyCrawlRules, bool executeCrawlActions, bool processDiscoveriesAsynchronously)
        {
#if DEMO
            return;
#endif
            //-1 marks this Crawl as running outside the Engine-managed crawl threads...
            _crawlInfo.ThreadNumber = -1;

            do
            {
                crawlRequest.Crawl = this;

                //NOTE(review): the flag receives the NEGATION of the parameter — looks inverted; confirm against the intended semantics of IsProcessingDiscoveriesAsynchronously before changing.
                crawlRequest.Crawl.IsProcessingDiscoveriesAsynchronously = !processDiscoveriesAsynchronously;
                crawlRequest.CurrentDepth = crawlRequest.MaximumDepth;

                lock (_beginCrawlLock)
                {
                    ProcessCrawlRequest(crawlRequest, obeyCrawlRules, executeCrawlActions);

                    //presumably Dequeue returns null when the queue is empty, terminating the loop — TODO confirm against the queue implementation.
                    crawlRequest = UncrawledCrawlRequests.Dequeue();
                }
            } while (crawlRequest != null);
        }
Beispiel #6
0
        /// <summary>
        ///     Determines whether the specified crawl request is disallowed, based on its source html.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        /// <returns>
        ///     <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            //only WebPage Discoveries carry source html to inspect; everything else is allowed...
            if (crawlRequest.DataType.DiscoveryType != DiscoveryType.WebPage)
            {
                return false;
            }

            crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;
            crawlRequest.IsDisallowedReason       = "Disallowed by Source.";

            bool disallowed = false;

            if (UserDefinedFunctions.IsDisallowedForSource(crawlRequest.DecodedHtml, false))
            {
                disallowed = true;
            }

            //optionally invert the verdict...
            return _negateIsDisallowed ? !disallowed : disallowed;
        }
        /// <summary>
        ///     Thread entry point: while the Engine is in the Start, Pause or None state, drains
        ///     UnprocessedCrawlRequests and processes each request's Discoveries.
        /// </summary>
        /// <param name = "o">Unused thread-start state object.</param>
        internal void BeginDiscoveryProcessor(object o)
        {
            while (_crawler.Engine.State == EngineState.Start || _crawler.Engine.State == EngineState.Pause || _crawler.Engine.State == EngineState.None)
            {
                if (_crawler.Engine.State == EngineState.Start)
                {
                    //presumably blocks while the Engine is paused — TODO confirm StateControl's wait semantics.
                    _crawler.Engine.StateControl.WaitOne();

                    lock (_crawlRequestLock)
                    {
                        IsProcessingDiscoveries = true;

                        while (UnprocessedCrawlRequests.Count != 0)
                        {
                            CrawlRequest <TArachnodeDAO> crawlRequest = UnprocessedCrawlRequests.Dequeue();

                            crawlRequest.Crawl.IsProcessingDiscoveriesAsynchronously = true;

                            try
                            {
                                _crawlRequestManager.ProcessDiscoveries(crawlRequest, _arachnodeDAO);
                            }
                            catch (Exception exception)
                            {
                                //log and continue: one failed CrawlRequest must not stop the processor thread...
                                _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                            }

                            crawlRequest.Crawl.IsProcessingDiscoveriesAsynchronously = false;
                        }

                        IsProcessingDiscoveries = false;
                    }
                }

                //brief yield between polls so the loop does not spin at full CPU...
                Thread.Sleep(5);
            }

            IsProcessingDiscoveries = false;
        }
        /// <summary>
        ///     Re-submits a CrawlRequest whose HttpWebRequest failed, re-queueing it for another
        ///     attempt while retries remain (or indefinitely when requested); otherwise cancels
        ///     the CrawlRequest and removes it.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request to resubmit.</param>
        /// <param name = "retryIndefinitely">if set to <c>true</c>, retries forever and does not decrement the remaining retry count.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        public override void ResubmitCrawlRequest(CrawlRequest <TArachnodeDAO> crawlRequest, bool retryIndefinitely, IArachnodeDAO arachnodeDAO)
        {
            //brief back-off before re-queueing — NOTE(review): purpose undocumented; confirm this delay is still required.
            Thread.Sleep(10);

            if (crawlRequest.Discovery.HttpWebRequestRetriesRemaining != 0 || retryIndefinitely)
            {
                //resetting the DiscoveryState to allow the CrawlRequest to (attempt to) be re-crawled...
                crawlRequest.Discovery.DiscoveryState = DiscoveryState.Undiscovered;

                //removed because it will be re-added...
                Counters.GetInstance().CrawlRequestRemoved();

                //NOTE(review): offsetting a positive Priority by double.MinValue presumably pushes retries to one end of the priority queue — confirm the queue's ordering direction.
                if (crawlRequest.Priority > 0)
                {
                    crawlRequest.Priority = double.MinValue + crawlRequest.Priority;
                }
                _cache.UncrawledCrawlRequests.Enqueue(crawlRequest, crawlRequest.Priority);

                if (!retryIndefinitely)
                {
                    crawlRequest.Discovery.HttpWebRequestRetriesRemaining--;
                }
            }
            else
            {
                //retries exhausted: notify listeners, remove the database row and update the counters...
                crawlRequest.Crawl.Crawler.Engine.OnCrawlRequestCanceled(crawlRequest);

                if (crawlRequest.IsFromDatabase)
                {
                    arachnodeDAO.DeleteCrawlRequest(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri);
                }

                Counters.GetInstance().ReportCurrentDepth(crawlRequest.CurrentDepth);

                Counters.GetInstance().CrawlRequestRemoved();

                Counters.GetInstance().CrawlRequestProcessed();
            }
        }
Beispiel #9
0
        /// <summary>
        ///     Handles an incoming CrawlRequest packet: acknowledge it when the local node is the
        ///     target, otherwise relay it toward the target through a known proxy connection.
        /// </summary>
        /// <param name = "packet">The received G2 packet carrying the CrawlRequest.</param>
        public void Receive_CrawlRequest(G2ReceivedPacket packet)
        {
            CrawlRequest request = CrawlRequest.Decode(packet);

            if (!Local.Equals(request.Target))
            {
                //forward to the appropriate node, when a proxy connection for it is known...
                TcpConnect proxy = TcpControl.GetProxy(request.Target);

                if (proxy == null)
                {
                    return;
                }

                //record the origin so the receiving host knows where to send the response...
                request.FromAddress = packet.Source;

                proxy.SendPacket(request);
            }
            else
            {
                Send_CrawlAck(request, packet);
            }
        }
Beispiel #10
0
        /// <summary>
        ///     Constructs the Crawler and wires up its processor chain for the requested host.
        /// </summary>
        /// <param name = "request">The CrawlRequest supplying the Id and Host.</param>
        /// <param name = "log">The logger handed to each ClientWrapper.</param>
        public Crawler(CrawlRequest request, ILogger log)
        {
            _id   = request.Id;
            _host = request.Host;

            //links to these domains are treated as internal rather than external...
            var internalDomains = new List <string>
            {
                _host,
                _githubDomain
            };

            //chain-of-responsibility: each processor handles its own class of URL and defers the rest
            //down the chain, ending in UnknownProcessor.
            //NOTE(review): ExternalPageProcessor has two consecutive .Next(...) calls — whether both
            //attach to it depends on what Next returns; confirm against the processor base class.
            _processor = new CloudflareCgiProcesser()
                         .Next(new LegacyProcessor()
                               .Next(new ImageProcessor(new ClientWrapper(log))
                                     .Next(new ContentProcessor(new ClientWrapper(log))
                                           .Next(new KnownPageProcessor()
                                                 .Next(new EmailProcessor()
                                                       .Next(new ExternalPageProcessor(internalDomains)
                                                             .Next(new PodcastRoadmapProcessor(new ClientWrapper(log)))
                                                             .Next(new PageProcessor(_gistDomain, new ClientWrapper(log), null)
                                                                   .Next(new PageProcessor(_githubDomain, new ClientWrapper(log), null)
                                                                         .Next(new PageProcessor(_host, new ClientWrapper(log), new ContentLinksExtractor(_host))
                                                                               .Next(new UnknownProcessor()))))))))));
        }
Beispiel #11
0
        /// <summary>
        ///     Saves the crawl requests to database: drains UncrawledCrawlRequests, inserting each
        ///     allowed request and recording each disallowed one.
        /// </summary>
        internal void SaveCrawlRequestsToDatabase()
        {
            while (UncrawledCrawlRequests.Count != 0)
            {
                _consoleManager.OutputString("Saving Crawl.UncrawledCrawlRequests: " + _crawlInfo.ThreadNumber + " : " + UncrawledCrawlRequests.Count + " CrawlRequests remaining to be inserted.", ConsoleColor.Gray, ConsoleColor.Gray);

                CrawlRequest <TArachnodeDAO> crawlRequest = UncrawledCrawlRequests.Dequeue();

                if (!_ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PreRequest, _arachnodeDAO))
                {
                    if (_applicationSettings.InsertCrawlRequests)
                    {
                        //the Originator is null for seed CrawlRequests...
                        string originatorAbsoluteUri = crawlRequest.Originator != null ? crawlRequest.Originator.Uri.AbsoluteUri : null;

                        _arachnodeDAO.InsertCrawlRequest(crawlRequest.Created, originatorAbsoluteUri, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.CurrentDepth, crawlRequest.MaximumDepth, crawlRequest.RestrictCrawlTo, crawlRequest.RestrictDiscoveriesTo, crawlRequest.Priority, (byte)crawlRequest.RenderType, (byte)crawlRequest.RenderTypeForChildren);
                    }
                }
                else
                {
                    if (_applicationSettings.InsertDisallowedAbsoluteUris)
                    {
                        _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.IsDisallowedReason, _applicationSettings.ClassifyAbsoluteUris);
                    }
                }

                Counters.GetInstance().CrawlRequestRemoved();
            }
        }
        /// <summary>
        ///     Exercises Templater.PerformAction against a single synchronously-crawled page.
        /// </summary>
        public void PerformActionTest()
        {
            var applicationSettings = new ApplicationSettings();
            var webSettings = new WebSettings();

            var dao = new ArachnodeDAO(applicationSettings.ConnectionString, applicationSettings, webSettings, true, true);

            var crawler = new Crawler <ArachnodeDAO>(applicationSettings, webSettings, CrawlMode.BreadthFirstByPriority, false);

            var request = new CrawlRequest <ArachnodeDAO>(new Discovery <ArachnodeDAO>("http://trycatchfail.com/blog/post/2008/11/12/Deep-web-crawling-with-NET-Getting-Started.aspx"), 1, UriClassificationType.Host, UriClassificationType.Host, 1, RenderType.None, RenderType.None);

            var crawl = new Crawl <ArachnodeDAO>(applicationSettings, webSettings, crawler, crawler.ActionManager, crawler.ConsoleManager, crawler.CookieManager, crawler.CrawlRequestManager, crawler.DataTypeManager, crawler.DiscoveryManager, crawler.EncodingManager, crawler.HtmlManager, crawler.PolitenessManager, crawler.ProxyManager, crawler.RuleManager, true);

            applicationSettings.MaximumNumberOfCrawlThreads = 0;

            UserDefinedFunctions.ConnectionString = "Data Source=.;Initial Catalog=arachnode.net;Integrated Security=True;Connection Timeout=3600;";
            crawler.Engine.Start();

            //crawl synchronously so the page is present before templating...
            crawl.BeginCrawl(request, false, false, false);

            var templater = new Templater <ArachnodeDAO>(applicationSettings, webSettings);

            templater.PerformAction(request, dao);
        }
Beispiel #13
0
        /// <summary>
        ///     Crawls the given url looking for files with the given extension, recording the outcome
        ///     in the supplied StringBuilders and emailing any matches.
        /// </summary>
        /// <param name = "url">The url to scan.</param>
        /// <param name = "sb">Receives the human-readable status output.</param>
        /// <param name = "fileExtension">The file extension to search for.</param>
        /// <param name = "linkBuilder">Optionally receives the matching links, one per line.</param>
        /// <returns>A ScannerResult whose Success flag and Results list reflect the findings.</returns>
        private ScannerResult CheckForFileType(string url, StringBuilder sb, string fileExtension, StringBuilder linkBuilder = null)
        {
            ScannerResult result = new ScannerResult();

            try
            {
                CrawlRequest request = new CrawlRequest();
                request.FileType = fileExtension;
                request.Address  = url.Trim('/').Replace("https://", "").Replace("http://", "");
                request.Limit    = 50;
                request.FindAll  = true;
                List <string> info = Crawler.SearchFileType(request, true);

                if (info.Count != 0)
                {
                    result.Success = true;
                    //report the match count; concatenating the List directly printed its type name, not its contents...
                    sb.Append("\t" + fileExtension + " Files Found! " + info.Count + "! Email sent." + Environment.NewLine);
                    SendEmail("\t" + fileExtension + " Files Found ", url + " appears to have " + fileExtension + " files: " + Environment.NewLine + String.Join(Environment.NewLine, info.ToArray()));
                    result.Results.AddRange(info);
                    if (linkBuilder != null)
                    {
                        linkBuilder.Append(String.Join(Environment.NewLine, info.ToArray()) + Environment.NewLine);
                    }
                }
                else
                {
                    sb.Append("\tNo " + fileExtension + " files found." + Environment.NewLine);
                }
            }
            catch (Exception ex)
            {
                //wrap with the original as InnerException so the stack trace is preserved...
                throw new Exception("File finder exception: " + ex.Message, ex);
            }

            return(result);
        }
        /// <summary>
        ///     Manages the image.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "imageID">The image ID.</param>
        /// <param name = "absoluteUri">The absolute URI.</param>
        /// <param name = "source">The raw image bytes.</param>
        /// <param name = "fullTextIndexType">Full type of the text index.</param>
        /// <param name = "extractImageMetaData">if set to <c>true</c>, EXIF metadata is extracted into ManagedImage.EXIFData.</param>
        /// <param name = "insertImageMetaData">if set to <c>true</c>, the extracted metadata is inserted into the database.</param>
        /// <param name = "saveImageToDisk">if set to <c>true</c>, the image is saved beneath DownloadedImagesDirectory.</param>
        /// <returns>The ManagedImage, or null when the image could not be processed.</returns>
        public override ManagedImage ManageImage(CrawlRequest <TArachnodeDAO> crawlRequest, long imageID, string absoluteUri, byte[] source, string fullTextIndexType, bool extractImageMetaData, bool insertImageMetaData, bool saveImageToDisk)
        {
            try
            {
                //GDI+ requires the stream behind Image.FromStream to remain open for the lifetime of the
                //Image (see the Image.FromStream documentation); disposing it while the returned Image was
                //still in use caused the "Parameter is not valid" errors noted below, so the MemoryStream
                //is deliberately not wrapped in a using block (disposing a MemoryStream frees no unmanaged resources).
                MemoryStream memoryStream = new MemoryStream(source, true);

                ManagedImage managedImage = new ManagedImage();

                managedImage.Image = Image.FromStream(memoryStream);

                if (extractImageMetaData)
                {
                    XmlDocument xmlDocument = new XmlDocument();
                    XmlElement  xmlElement;

                    xmlDocument.AppendChild(xmlDocument.CreateNode(XmlNodeType.XmlDeclaration, "", ""));
                    xmlDocument.AppendChild(xmlDocument.CreateElement("", "EXIFData", ""));

                    Dictionary <string, string> dictionary = new Dictionary <string, string>();

                    foreach (Pair pair in new EXIFExtractor(managedImage.Image, "", ""))
                    {
                        //use the indexer (last value wins) so a duplicate EXIF tag cannot throw an ArgumentException...
                        dictionary[pair.First.ToString()] = pair.Second.ToString();
                    }

                    foreach (KeyValuePair <string, string> keyValuePair in dictionary)
                    {
                        xmlElement = xmlDocument.CreateElement("", keyValuePair.Key.Replace(" ", "_"), "");

                        string value = UserDefinedFunctions.ExtractAlphaNumericCharacters(keyValuePair.Value).Value ?? string.Empty;

                        xmlElement.AppendChild(xmlDocument.CreateTextNode(value));

                        xmlDocument.ChildNodes.Item(1).AppendChild(xmlElement);
                    }

                    managedImage.EXIFData = xmlDocument;

                    if (insertImageMetaData)
                    {
                        _arachnodeDAO.InsertImageMetaData(absoluteUri, imageID, xmlDocument.InnerXml, managedImage.Image.Flags, managedImage.Image.Height, managedImage.Image.HorizontalResolution, managedImage.Image.VerticalResolution, managedImage.Image.Width);
                    }
                }

                if (saveImageToDisk)
                {
                    managedImage.DiscoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedImagesDirectory, absoluteUri, fullTextIndexType);

                    managedImage.Image.Save(managedImage.DiscoveryPath);
                }

                return(managedImage);
            }
            catch (Exception exception)
            {
                //ANODET: Images of 7 bytes (Generic GDI Error)...
#if !DEMO
                if (crawlRequest != null)
                {
                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, absoluteUri, exception, false);
                }
                else
                {
                    _arachnodeDAO.InsertException(null, absoluteUri, exception, false);
                }
#endif
            }

            return(null);
        }
Beispiel #15
0
 /// <summary>
 ///     Manages the supplied file content for the given crawl request: implementations may
 ///     extract metadata, insert metadata into the database and/or save the file to disk,
 ///     depending on the boolean switches, and return the resulting ManagedFile.
 /// </summary>
 /// <param name = "crawlRequest">The crawl request that produced the file.</param>
 /// <param name = "fileID">The database ID of the file.</param>
 /// <param name = "absoluteUri">The absolute URI the file was downloaded from.</param>
 /// <param name = "source">The raw bytes of the file.</param>
 /// <param name = "fullTextIndexType">The full-text index type (typically the file extension) used to build the discovery path.</param>
 /// <param name = "extractFileMetaData">if set to <c>true</c>, extract metadata from the file.</param>
 /// <param name = "insertFileMetaData">if set to <c>true</c>, insert the extracted metadata into the database.</param>
 /// <param name = "saveFileToDisk">if set to <c>true</c>, save the file to disk.</param>
 /// <returns>The managed file. (Exact contents are implementation-defined — abstract member.)</returns>
 public abstract ManagedFile ManageFile(CrawlRequest <TArachnodeDAO> crawlRequest, long fileID, string absoluteUri, byte[] source, string fullTextIndexType, bool extractFileMetaData, bool insertFileMetaData, bool saveFileToDisk);
Beispiel #16
0
 /// <summary>
 ///     Manages the file associated with the supplied crawl request.  Convenience overload of
 ///     the parameterized ManageFile; implementations derive all inputs from the request itself.
 /// </summary>
 /// <param name = "crawlRequest">The crawl request whose file should be managed.</param>
 public abstract void ManageFile(CrawlRequest <TArachnodeDAO> crawlRequest);
Beispiel #17
0
        /// <summary>
        ///     Determines whether the specified crawl request is disallowed by any enabled crawl
        ///     rule registered for the given rule type (PreRequest, PreGet or PostRequest).
        /// </summary>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "crawlRuleType">Type of the rule.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        /// <returns>
        ///     <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, CrawlRuleType crawlRuleType, IArachnodeDAO arachnodeDAO)
        {
            //the three rule phases share identical evaluation logic; only the rule collection differs,
            //so the duplicated loops were extracted into EvaluateCrawlRules.
            switch (crawlRuleType)
            {
            case CrawlRuleType.PreRequest:
                return(EvaluateCrawlRules(_preRequestCrawlRules.Values, crawlRequest, arachnodeDAO));

            case CrawlRuleType.PreGet:
                return(EvaluateCrawlRules(_preGetCrawlRules.Values, crawlRequest, arachnodeDAO));

            case CrawlRuleType.PostRequest:
                return(EvaluateCrawlRules(_postRequestCrawlRules.Values, crawlRequest, arachnodeDAO));
            }

            return(false);
        }

        /// <summary>
        ///     Evaluates the supplied rule lists against the crawl request.  Returns <c>true</c> as
        ///     soon as an enabled rule disallows the request (flagging the request and its Discovery),
        ///     or when a rule throws — the exception is logged and the request is treated as
        ///     disallowed WITHOUT setting the IsDisallowed flags, preserving the original behavior.
        /// </summary>
        /// <param name = "crawlRuleLists">The rule lists to evaluate, in order.</param>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO used to log rule exceptions.</param>
        /// <returns><c>true</c> if disallowed (or a rule faulted); otherwise, <c>false</c>.</returns>
        private bool EvaluateCrawlRules(IEnumerable <List <ACrawlRule <TArachnodeDAO> > > crawlRuleLists, CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            foreach (List <ACrawlRule <TArachnodeDAO> > crawlRules in crawlRuleLists)
            {
                foreach (ACrawlRule <TArachnodeDAO> crawlRule in crawlRules)
                {
                    try
                    {
                        if (crawlRule.IsEnabled && crawlRule.IsDisallowed(crawlRequest, arachnodeDAO))
                        {
                            //propagate the disallowed state to the Discovery so downstream consumers see it.
                            crawlRequest.IsDisallowed                 = true;
                            crawlRequest.Discovery.IsDisallowed       = true;
                            crawlRequest.Discovery.IsDisallowedReason = crawlRequest.IsDisallowedReason;

                            return(true);
                        }
                    }
                    catch (Exception exception)
                    {
                        //a faulting rule must not crash the crawl; log it and stop processing this request.
                        arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);

                        return(true);
                    }
                }
            }

            return(false);
        }
Beispiel #18
0
        /// <summary>
        ///     Issues a conditional GET against <paramref name="uri"/> and wraps the JSON response
        ///     body (parsed as a JToken) in a CrawlResponse.
        /// </summary>
        /// <param name = "uri">The absolute URI to request.</param>
        /// <param name = "userAgent">The User-Agent header value to send.</param>
        /// <param name = "reqBody">The originating crawl request; its ETag (if any) is sent as If-None-Match.</param>
        /// <returns>A CrawlResponse whose Features property holds the parsed JSON.</returns>
        public static async Task <CrawlResponse> DownloadJson(Uri uri, string userAgent, CrawlRequest reqBody)
        {
            var httpRequest = (HttpWebRequest)WebRequest.Create(uri);

            httpRequest.Method    = "GET";
            httpRequest.UserAgent = userAgent;

            var etag = reqBody.ETag;

            if (!string.IsNullOrEmpty(etag))
            {
                //send If-None-Match so an unchanged resource can short-circuit with 304.
                httpRequest.Headers.Add(HttpRequestHeader.IfNoneMatch, etag);
            }

            using (var httpResponse = (HttpWebResponse)await httpRequest.GetResponseAsync())
            using (var responseStream = httpResponse.GetResponseStream())
            using (var responseReader = new StreamReader(responseStream))
            {
                var json = await responseReader.ReadToEndAsync();

                return(new CrawlResponse
                {
                    Features = JToken.Parse(json)
                });
            }
        }
Beispiel #19
0
        /// <summary>
        ///     HTTP-triggered entry point: deserializes a CrawlRequest from the request body,
        ///     downloads the target URL via Download(), and returns the serialized CrawlResponse.
        ///     Exceptions are tracked in telemetry (with request identifiers when available) and rethrown.
        /// </summary>
        /// <param name = "req">The HTTP request whose content is a JSON-encoded CrawlRequest.</param>
        /// <param name = "log">The trace writer for diagnostic output.</param>
        /// <returns>An HTTP 200 response containing the CrawlResponse as JSON.</returns>
        public static async Task <HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log)
        {
            CrawlRequest crawlRequest = null;
            string       reqBodyStr   = null;

            try
            {
                using (var operation = Services.TelemetryClient.StartOperation <DependencyTelemetry>("Crawl.HTML"))
                {
                    reqBodyStr = await req.Content.ReadAsStringAsync();

                    //BUGFIX: assign the outer local (previously only a separate 'reqBody' local was
                    //assigned, so the catch block below could never report Url/AppId/ActionId and
                    //always fell back to attaching the raw JSON).
                    crawlRequest = JsonConvert.DeserializeObject <CrawlRequest>(reqBodyStr);

                    operation.Telemetry.Properties.Add("AppId", crawlRequest.Site);
                    operation.Telemetry.Properties.Add("ActionId", crawlRequest.Id);
                    operation.Telemetry.Properties.Add("Url", crawlRequest.Url);

                    log.Info($"Crawl AppId={crawlRequest.Site} Id={crawlRequest.Id} Url={crawlRequest.Url}");

                    var crawlResponse = await Download(crawlRequest);

                    // always return a valid object so that downstream workflows can continue
                    if (crawlResponse == null)
                    {
                        crawlResponse = new CrawlResponse();
                    }

                    crawlResponse.Url  = crawlRequest.Url;
                    crawlResponse.Site = crawlRequest.Site;
                    crawlResponse.Id   = crawlRequest.Id;

                    var json = JsonConvert.SerializeObject(crawlResponse, new JsonSerializerSettings
                    {
                        Formatting           = Formatting.None,
                        StringEscapeHandling = StringEscapeHandling.EscapeNonAscii
                    });

                    return(new HttpResponseMessage(HttpStatusCode.OK)
                    {
                        Content = new StringContent(
                            json,
                            new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
                            "application/json")
                    });
                }
            }
            catch (Exception ex)
            {
                var props = new Dictionary <string, string>
                {
                    { "Service", req.RequestUri.ToString() }
                };

                if (crawlRequest == null)
                {
                    //deserialization itself failed: attach the raw body for diagnosis.
                    props.Add("JSON", reqBodyStr);
                }
                else
                {
                    props.Add("Url", crawlRequest.Url);
                    props.Add("AppId", crawlRequest.Site);
                    props.Add("ActionId", crawlRequest.Id);
                }

                Services.TelemetryClient.TrackException(ex, props);

                //BUGFIX: 'throw;' preserves the original stack trace; 'throw ex;' resets it.
                throw;
            }
        }
 /// <summary>
 ///     Assigns the file and image Discoveries for the supplied crawl request.
 ///     (Abstract member — implementations define how Discoveries are populated.)
 /// </summary>
 /// <param name = "crawlRequest">The crawl request whose file and image discoveries should be assigned.</param>
 /// <param name = "arachnodeDAO">The arachnode DAO used during assignment.</param>
 public abstract void AssignFileAndImageDiscoveries(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO);
        /// <summary>
        ///     Processes the hyperlink Discoveries of a crawl request: optionally assigns them,
        ///     then for each allowed, undiscovered link submits a child CrawlRequest (depth and
        ///     restriction settings inherited from the parent) and records hyperlinks/discoveries
        ///     in the database per the ApplicationSettings switches.  Disallowed links are
        ///     recorded as disallowed AbsoluteUris/discoveries instead.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        public override void ProcessHyperLinks(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            if (ApplicationSettings.AssignHyperLinkDiscoveries)
            {
                _discoveryManager.AssignHyperLinkDiscoveries(crawlRequest, arachnodeDAO);
            }

            foreach (Discovery <TArachnodeDAO> hyperLinkDiscovery in crawlRequest.Discoveries.HyperLinks.Values)
            {
                if (!hyperLinkDiscovery.IsDisallowed)
                {
                    if (hyperLinkDiscovery.DiscoveryState == DiscoveryState.Undiscovered)
                    {
                        //only descend while below the maximum depth and the crawl is not restricted
                        //to a narrower URI scope.
                        if (crawlRequest.CurrentDepth < crawlRequest.MaximumDepth)
                        {
                            if (!_discoveryManager.IsCrawlRestricted(crawlRequest, hyperLinkDiscovery.Uri.AbsoluteUri))
                            {
                                //child request inherits the parent's restrictions; priority is boosted
                                //by the discovery's PriorityBoost, and RenderTypeForChildren is used for
                                //both the child's render type and its children's.
                                _cache.AddCrawlRequestToBeCrawled(new CrawlRequest <TArachnodeDAO>(crawlRequest, hyperLinkDiscovery, crawlRequest.CurrentDepth + 1, crawlRequest.MaximumDepth, crawlRequest.RestrictCrawlTo, crawlRequest.RestrictDiscoveriesTo, crawlRequest.Priority + hyperLinkDiscovery.PriorityBoost, crawlRequest.RenderTypeForChildren, crawlRequest.RenderTypeForChildren), false, false, arachnodeDAO);
                            }
                        }

                        if (ApplicationSettings.InsertHyperLinks && hyperLinkDiscovery.IsStorable)
                        {
                            arachnodeDAO.InsertHyperLink(crawlRequest.Discovery.Uri.AbsoluteUri, hyperLinkDiscovery.Uri.AbsoluteUri, ApplicationSettings.ClassifyAbsoluteUris);
                        }

                        _consoleManager.OutputHyperLinkDiscovered(crawlRequest.Crawl.CrawlInfo.ThreadNumber, crawlRequest, hyperLinkDiscovery);
                    }
                    else
                    {
                        //already discovered: record only the discovery relationship (cache hit).
                        if (ApplicationSettings.InsertHyperLinkDiscoveries && hyperLinkDiscovery.IsStorable)
                        {
                            arachnodeDAO.InsertHyperLinkDiscovery(crawlRequest.Discovery.Uri.AbsoluteUri, hyperLinkDiscovery.Uri.AbsoluteUri);
                        }

                        _consoleManager.OutputCacheHit(crawlRequest.Crawl.CrawlInfo, crawlRequest, hyperLinkDiscovery);
                    }
                }
                else
                {
                    if (ApplicationSettings.InsertDisallowedAbsoluteUris)
                    {
                        if (hyperLinkDiscovery.DiscoveryState == DiscoveryState.Undiscovered)
                        {
                            arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Discovery.Uri.AbsoluteUri, hyperLinkDiscovery.Uri.AbsoluteUri, hyperLinkDiscovery.IsDisallowedReason, ApplicationSettings.ClassifyAbsoluteUris);
                        }
                        else
                        {
                            if (ApplicationSettings.InsertDisallowedAbsoluteUriDiscoveries)
                            {
                                arachnodeDAO.InsertDisallowedAbsoluteUriDiscovery(crawlRequest.Discovery.Uri.AbsoluteUri, hyperLinkDiscovery.Uri.AbsoluteUri);
                            }
                        }
                    }

                    _consoleManager.OutputIsDisallowedReason(crawlRequest.Crawl.CrawlInfo, crawlRequest, hyperLinkDiscovery);
                }
            }

            Counters.GetInstance().HyperLinksDiscovered(crawlRequest.Discoveries.HyperLinks.Count);
        }
 /// <summary>
 ///     Determines whether crawling the supplied absolute URI is restricted for the
 ///     specified crawl request.  (Abstract member — restriction logic is implementation-defined.)
 /// </summary>
 /// <param name = "crawlRequest">The crawl request.</param>
 /// <param name = "absoluteUri">The absolute URI to test against the request's restrictions.</param>
 /// <returns>
 ///     <c>true</c> if the specified crawl request is restricted; otherwise, <c>false</c>.
 /// </returns>
 public abstract bool IsCrawlRestricted(CrawlRequest <TArachnodeDAO> crawlRequest, string absoluteUri);
Beispiel #23
0
        /// <summary>
        ///     Processes a crawl request end-to-end: politeness gating, pre-request rules/actions,
        ///     the HTTP request itself (via _dataManager), encoding, post-request rules, data
        ///     processing, extensive exception logging, and final discovery/counter bookkeeping.
        ///     The ordering of manager calls and the exception-handling paths are load-bearing.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "obeyCrawlRules">if set to <c>true</c> [obey crawl rules].</param>
        /// <param name = "executeCrawlActions">if set to <c>true</c> [execute crawl actions].</param>
        public void ProcessCrawlRequest(CrawlRequest <TArachnodeDAO> crawlRequest, bool obeyCrawlRules, bool executeCrawlActions)
        {
            //HACK:!!!  Solve this!!!
//#if DEMO
//            return;
//#endif

            bool wasACacheHit = false;

            try
            {
                crawlRequest.WebClient = WebClient;

                //only undiscovered Discoveries are actually requested; anything else is treated
                //as a cache hit in the else branch below.
                if (crawlRequest.Discovery.DiscoveryState == DiscoveryState.Undiscovered)
                {
                    //the politeness manager may refuse (throttle) the request entirely.
                    if (!_politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestRequested, _arachnodeDAO))
                    {
                        Crawler.Engine.OnCrawlRequestThrottled(crawlRequest);

                        return;
                    }

                    _consoleManager.OutputProcessCrawlRequest(_crawlInfo.ThreadNumber, crawlRequest);

                    _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.PreRequest, _arachnodeDAO);

                    if (obeyCrawlRules)
                    {
                        _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PreRequest, _arachnodeDAO);
                    }

                    if (executeCrawlActions)
                    {
                        _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PreRequest, _arachnodeDAO);
                    }

                    if (!crawlRequest.IsDisallowed)
                    {
                        //time the HTTP request/response round trip.
                        _stopwatch.Reset();
                        _stopwatch.Start();

                        try
                        {
                            _dataManager.ProcessCrawlRequest(crawlRequest, obeyCrawlRules, executeCrawlActions);
                        }
                        catch (Exception exception2)
                        {
                            //wrap so the outer catch can distinguish via InnerException.
                            throw new Exception(exception2.Message, exception2);
                        }
                        finally
                        {
                            //always stop the clock and release politeness state, even on failure.
                            _stopwatch.Stop();

                            _crawlInfo.TotalHttpWebResponseTime += _stopwatch.Elapsed;
                            crawlRequest.HttpWebResponseTime     = _stopwatch.Elapsed;

                            _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);
                        }

                        Counters.GetInstance().TotalBytesDiscovered(crawlRequest.Data.LongLength);

                        _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.PostRequest, _arachnodeDAO);

                        _encodingManager.ProcessCrawlRequest(crawlRequest, _arachnodeDAO);

                        if (obeyCrawlRules)
                        {
                            _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PostRequest, _arachnodeDAO);
                        }

                        //the CrawlRequest could be Disallowed by a PreGet CrawlRule - specifically DataType.cs.
                        if (!crawlRequest.IsDisallowed)
                        {
                            if (_processData)
                            {
                                _crawlRequestManager.ProcessCrawlRequest(crawlRequest, _fileManager, _imageManager, _webPageManager, _arachnodeDAO);
                            }
                        }
                        else
                        {
                            //disallowed after the request: determine the DataType if unset, then record it.
                            if (crawlRequest.DataType.ContentType == null)
                            {
                                crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);
                            }

                            if (_applicationSettings.InsertDisallowedAbsoluteUris)
                            {
                                _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.IsDisallowedReason, _applicationSettings.ClassifyAbsoluteUris);
                            }

                            _consoleManager.OutputIsDisallowedReason(_crawlInfo, crawlRequest);
                        }
                    }
                    else
                    {
                        //disallowed before the request was made: release politeness state and record it.
                        _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);

                        if (crawlRequest.DataType.ContentType == null)
                        {
                            crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);
                        }

                        if (_applicationSettings.InsertDisallowedAbsoluteUris)
                        {
                            _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.IsDisallowedReason, _applicationSettings.ClassifyAbsoluteUris);
                        }

                        _consoleManager.OutputIsDisallowedReason(_crawlInfo, crawlRequest);
                    }
                }
                else
                {
                    wasACacheHit = true;

                    //this should only occur when you submit a CR from a rule, or action...
                    _consoleManager.OutputCacheHit(_crawlInfo, crawlRequest, crawlRequest.Discovery);
                }
            }
            catch (Exception exception)
            {
                _stopwatch.Stop();

                if (Crawler.Engine.State != EngineState.Start)
                {
                    //the request was aborted as it was long running and Engine was requested to Stop.
                    if ((crawlRequest.WebClient.WebException != null && crawlRequest.WebClient.WebException.Status == WebExceptionStatus.RequestCanceled) || (exception.InnerException != null && exception.InnerException.Message == "The request was aborted: The request was canceled."))
                    {
                        return;
                    }
                }

                //connection failures are resubmitted while retries remain, rather than recorded as errors.
                if (crawlRequest.WebClient.WebException != null && crawlRequest.Discovery.HttpWebRequestRetriesRemaining != 0 && crawlRequest.WebClient.WebException.Message.StartsWith("Unable to connect to the remote server"))
                {
                    _politenessManager.ResubmitCrawlRequest(crawlRequest, false, _arachnodeDAO);

                    _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCanceled, _arachnodeDAO);

                    return;
                }

                try
                {
                    _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);
                }
                catch (Exception exception2)
                {
                    //politeness cleanup itself failed; report that exception instead of the original.
                    exception = exception2;
                }

                //prefer the InnerException (see the wrapping 'throw' above) when one is present.
                if (exception.InnerException == null)
                {
                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                }
                else
                {
                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception.InnerException, false);
                }

                crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);

                if (_applicationSettings.InsertDisallowedAbsoluteUris)
                {
                    if (crawlRequest.Discovery.DiscoveryState == DiscoveryState.Undiscovered)
                    {
                        _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception.Message, _applicationSettings.ClassifyAbsoluteUris);
                    }
                    else
                    {
                        if (_applicationSettings.InsertDisallowedAbsoluteUriDiscoveries)
                        {
                            _arachnodeDAO.InsertDisallowedAbsoluteUriDiscovery(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri);
                        }
                    }
                }

                _consoleManager.OutputException(_crawlInfo.ThreadNumber, crawlRequest, _arachnodeDAO.LastExceptionID, _arachnodeDAO.LastExceptionMessage);
            }

            //finalization runs for success, cache hit, and handled-exception paths alike.
            if (crawlRequest.IsFromDatabase)
            {
                _arachnodeDAO.DeleteCrawlRequest(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri);
            }

            _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.Discovered, _arachnodeDAO);

            if (!wasACacheHit)
            {
                if (executeCrawlActions)
                {
                    _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, _arachnodeDAO);
                }

                Crawler.Engine.OnCrawlRequestCompleted(crawlRequest);
            }

            _consoleManager.OutputProcessCrawlRequest(_crawlInfo.ThreadNumber, crawlRequest);

            Counters.GetInstance().ReportCurrentDepth(crawlRequest.CurrentDepth);

            Counters.GetInstance().CrawlRequestRemoved();

            Counters.GetInstance().CrawlRequestProcessed();

            _crawlInfo.TotalCrawlRequestsProcessed++;
        }
Beispiel #24
0
 /// <summary>
 ///     Handles the Engine's CrawlRequestCompleted event.  Intentionally empty: this
 ///     subscriber takes no action when a crawl request completes.
 /// </summary>
 /// <param name = "crawlRequest">The crawl request that completed.</param>
 private void Engine_CrawlRequestCompleted(CrawlRequest <ArachnodeDAO> crawlRequest)
 {
 }
Beispiel #25
0
 /// <summary>
 ///     Determines whether the specified crawl request is disallowed by delegating to the
 ///     Uri-based overload, using the request's own Discovery Uri.
 /// </summary>
 /// <param name = "crawlRequest">The crawl request.</param>
 /// <param name = "arachnodeDAO">The arachnode DAO.</param>
 /// <returns>
 ///     <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
 /// </returns>
 public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
 {
     Uri discoveryUri = crawlRequest.Discovery.Uri;

     return(IsDisallowed(crawlRequest, discoveryUri));
 }
Beispiel #26
0
        /// <summary>
        ///     Searches the Wayback Machine CDX index for archived URLs on the request's host that
        ///     end with the requested file type, paging via the CDX resume key while
        ///     request.FindAll is set.  Optionally expands results whose filename matches a known
        ///     attack file into concrete attack URLs.
        /// </summary>
        /// <param name = "request">The crawl request (Address, FileType, Limit, FindAll).</param>
        /// <param name = "filterKnownAttackFiles">if set to <c>true</c>, replace the result list with known-attack URL variants.</param>
        /// <returns>The list of found (or attack-expanded) URLs; empty when nothing matched.</returns>
        public static List <string> SearchFileType(CrawlRequest request, bool filterKnownAttackFiles)//, CrawlerContext context)
        {
            //lazy one-time load of the known-attack-file table.
            if (_knownAttackFiles.Count == 0)
            {
                Initilize();
            }

            List <string> foundFiles   = new List <string>();
            string        resumeKey    = "";
            bool          continueLoop = true;

            do
            {
                WebPageRequest webRequest = new WebPageRequest();
                // webRequest.Address = "https://web.archive.org/cdx/search?url=" + request.Address + "&matchType=domain&collapse=urlkey&output=text&fl=original&filter=urlkey:.*"+request.FileType+"&limit=10&page=1";
                webRequest.Address = "https://web.archive.org/cdx/search?url=" + request.Address + "/&matchType=host" +
                                     "&collapse=urlkey" +
                                     "&output=text" +
                                     "&fl=original" +
                                     @"&filter=original:.*\." + request.FileType + "$" +
                                     "&filter=statuscode:200" +
                                     "&limit=" + request.Limit +
                                     "&showResumeKey=" + request.FindAll.ToString().ToLower() +
                                     "&resumeKey=" + resumeKey;
                WebPageLoader.Load(webRequest);

                if (!String.IsNullOrEmpty(webRequest.Response.Body))
                {
                    //  return webRequest.Response.Body;
                    List <string> foundStrings = webRequest.Response.Body.Split(new string[] { "\n" }, StringSplitOptions.RemoveEmptyEntries).ToList();
                    if (foundStrings.Count <= request.Limit)
                    {
                        //a short page means there are no further results: stop paging.
                        foundFiles.AddRange(foundStrings);
                        continueLoop = false;
                    }
                    else
                    {
                        //assumes the CDX resume key is returned as the trailing extra line when
                        //showResumeKey is enabled — TODO confirm against the CDX API contract.
                        foundFiles.AddRange(foundStrings.Take(request.Limit));
                        resumeKey = foundStrings.LastOrDefault();
                        if (resumeKey == null)
                        {
                            continueLoop = false;
                        }
                    }
                }
                else
                {
                    //empty response body: nothing (more) to fetch.
                    continueLoop = false;
                }
            } while (request.FindAll && continueLoop);



            if (filterKnownAttackFiles && foundFiles.Count != 0)
            {
                //replace the plain URL list with URLs expanded by known attack suffixes.
                List <string> dangerzone = new List <string>();

                foreach (string url in foundFiles)
                {
                    //match on the final path segment (the filename) only.
                    string file = url.Split('/').LastOrDefault();
                    if (file == null)
                    {
                        continue;
                    }

                    foreach (AttackFile attack in _knownAttackFiles)
                    {
                        if (file.Equals(attack.File, StringComparison.InvariantCultureIgnoreCase))
                        {
                            foreach (string attackString in attack.Attacks)
                            {
                                dangerzone.Add(url + attackString);
                            }
                        }
                    }
                }

                foundFiles = dangerzone;
            }

            return(foundFiles);
        }
Beispiel #27
0
        /// <summary>
        ///     Issues a conditional GET against <paramref name="uri"/> and parses the HTML response
        ///     body into a CrawlResponse via HtmlExtractor.
        /// </summary>
        /// <param name = "uri">The absolute URI to request.</param>
        /// <param name = "userAgent">The User-Agent header value to send.</param>
        /// <param name = "reqBody">The originating crawl request; its ETag (if any) is sent as If-None-Match, and its Url is used as the parse base.</param>
        /// <returns>The CrawlResponse produced by HtmlExtractor.Parse.</returns>
        public static async Task <CrawlResponse> DownloadHtml(Uri uri, string userAgent, CrawlRequest reqBody)
        {
            var httpRequest = (HttpWebRequest)WebRequest.Create(uri);

            httpRequest.Method    = "GET";
            httpRequest.UserAgent = userAgent;

            var etag = reqBody.ETag;

            if (!string.IsNullOrEmpty(etag))
            {
                //send If-None-Match so an unchanged resource can short-circuit with 304.
                httpRequest.Headers.Add(HttpRequestHeader.IfNoneMatch, etag);
            }

            using (var httpResponse = (HttpWebResponse)await httpRequest.GetResponseAsync())
            using (var responseStream = httpResponse.GetResponseStream())
            using (var responseReader = new StreamReader(responseStream))
            {
                // TODO: look for schema.org
                var html = await responseReader.ReadToEndAsync();

                // TODO: support microsoft:ds_id
                return(HtmlExtractor.Parse(html, new Uri(reqBody.Url)));
            }
        }
Beispiel #28
0
        /// <summary>
        ///     Replies to a crawl request with a CrawlAck describing the local source, version,
        ///     uptime and proxy servers/clients, sent over the same transport the request arrived on.
        /// </summary>
        /// <param name = "req">The crawl request being acknowledged.</param>
        /// <param name = "packet">The received packet carrying the request; determines the reply transport.</param>
        public void Send_CrawlAck(CrawlRequest req, G2ReceivedPacket packet)
        {
            var ack = new CrawlAck();

            ack.Source  = GetLocalSource();
            ack.Version = Core.Context.LocalSeqVersion.ToString();
            ack.Uptime  = (Core.TimeNow - Core.StartTime).Seconds;

            //advertise both proxy directions as DHT contacts.
            foreach (TcpConnect proxyServer in TcpControl.ProxyServers)
            {
                ack.ProxyServers.Add(new DhtContact(proxyServer, proxyServer.RemoteIP));
            }

            foreach (TcpConnect proxyClient in TcpControl.ProxyClients)
            {
                ack.ProxyClients.Add(new DhtContact(proxyClient, proxyClient.RemoteIP));
            }

            if (packet.ReceivedTcp)
            {
                //reply directly over the originating TCP connection.
                ack.ToAddress = packet.Source;
                packet.Tcp.SendPacket(ack);
            }
            else
            {
                SendPacket(packet.Source, ack);
            }
        }
Beispiel #29
0
        public static async Task <CrawlResponse> Download(CrawlRequest reqBody)
        {
            Uri uri;

            if (!Uri.TryCreate(reqBody.Url, UriKind.Absolute, out uri))
            {
                return(null);
            }

            foreach (var userAgent in UserAgents)
            {
                var headRequest = (HttpWebRequest)WebRequest.Create(uri);
                headRequest.Method    = "HEAD";
                headRequest.UserAgent = userAgent;

                try
                {
                    // make sure we only crawl HTML
                    using (var response = (HttpWebResponse)await headRequest.GetResponseAsync())
                    {
                        var contentType = response.GetResponseHeader("Content-Type");

                        CrawlResponse result = null;

                        if (string.IsNullOrWhiteSpace(contentType) || contentType.StartsWith("text/html"))
                        {
                            result = await DownloadHtml(uri, userAgent, reqBody);
                        }

                        if (contentType.StartsWith("application/json"))
                        {
                            result = await DownloadJson(uri, userAgent, reqBody);
                        }

                        if (contentType.StartsWith("video/") || contentType.StartsWith("audio/"))
                        {
                            result = new CrawlResponse {
                                Video = reqBody.Url
                            }
                        }
                        ;

                        if (contentType.StartsWith("image/"))
                        {
                            result = new CrawlResponse {
                                Image = reqBody.Url
                            }
                        }
                        ;

                        return(result);
                    }
                }
                catch (WebException we)
                {
                    HttpWebResponse httpResponse = we.Response as HttpWebResponse;
                    if (we.Status == WebExceptionStatus.ServerProtocolViolation)
                    {
                        // Get a little more telemetry about what is going on here, though most cases don't
                        // have a Response object.
                        IDictionary <string, string> traceData = new Dictionary <string, string>()
                        {
                            { "HasResponse", (we.Response != null).ToString() }
                        };

                        if (we.Response != null)
                        {
                            traceData["Response.SupportsHeaders"] = we.Response.SupportsHeaders.ToString();

                            if (we.Response.SupportsHeaders)
                            {
                                for (int i = 0; i < we.Response.Headers.Count; i++)
                                {
                                    string headerName  = we.Response.Headers.GetKey(i);
                                    string headerValue = we.Response.Headers.Get(i);
                                    traceData[$"Response.Headers.{headerName}"] = headerValue;
                                }
                            }

                            if (httpResponse != null)
                            {
                                traceData["HttpResponse.StatusCode"] = httpResponse.StatusCode.ToString();
                            }
                        }

                        Services.TelemetryClient.TrackTrace($"Download target ({uri}) ServerProtocolViolation", SeverityLevel.Error, traceData);

                        // Ignore known cases where crawl fails due to error on the crawl-target side - these should not
                        // cause a hard failure on our end.
                        continue;
                    }

                    if (httpResponse != null)
                    {
                        // Ignore known cases where crawl fails due to error on the crawl-target side - these should not
                        // cause a hard failure on our end.
                        if (httpResponse.StatusCode == HttpStatusCode.Forbidden ||
                            httpResponse.StatusCode == HttpStatusCode.NotFound ||
                            httpResponse.StatusCode == HttpStatusCode.ServiceUnavailable)
                        {
                            continue;
                        }
                    }

                    throw;
                }
            }

            throw new UnauthorizedAccessException("Unable to access HTTP endpoint");
        }
Beispiel #30
0
        public void Send_CrawlRequest(DhtAddress address, DhtClient target)
        {
            // Build a crawl request aimed at the given client and dispatch it
            // to the supplied DHT address.
            CrawlRequest request = new CrawlRequest { Target = target };

            SendPacket(address, request);
        }
        public override void ProcessCrawlRequest(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            //Rendering determines the Encoding: when a renderer is in use the
            //rendered output supplies the text, so decoding only happens here
            //for non-rendered WebPage requests.
            if (crawlRequest.RenderType == RenderType.None)
            {
                if (crawlRequest.DataType.DiscoveryType == DiscoveryType.WebPage)
                {
                    //Extract the charset token from the 'Content-Type' response header,
                    //e.g. "text/html; charset=utf-8" -> "utf-8".  Exactly one '=' is
                    //expected; any other shape leaves contentType null and falls
                    //through to DetermineEncoding below.
                    string contentType = null;
                    if (crawlRequest.WebClient.HttpWebResponse.Headers["Content-Type"] != null)
                    {
                        string[] contentTypeHeader = crawlRequest.WebClient.HttpWebResponse.Headers["Content-Type"].Split('=');

                        if (contentTypeHeader.Length == 2)
                        {
                            //some servers emit the non-standard "utf8"; normalize it to
                            //the IANA name "utf-8" so Encoding.GetEncoding accepts it.
                            contentType = contentTypeHeader[1].Replace("utf8", "utf-8");
                        }
                    }

                    Encoding encoding    = null;
                    string   decodedHtml = null;

                    try
                    {
                        //first, try and get the Encoding from the 'Content-Type'...
                        if (!string.IsNullOrEmpty(contentType))
                        {
                            encoding = Encoding.GetEncoding(contentType);
                        }
                        else
                        {
                            decodedHtml = DetermineEncoding(crawlRequest, out encoding);
                        }
                    }
                    catch (Exception exception)
                    {
                        try
                        {
                            //if there is an error, try and get the Encoding from the 'Charset'...
                            decodedHtml = DetermineEncoding(crawlRequest, out encoding);
                        }
                        catch (Exception exception2)
                        {
                            //if there is an error, default to UTF8.  Both failures are
                            //recorded so the misbehaving charset can be diagnosed later.
                            arachnodeDAO.InsertException(crawlRequest.Discovery.Uri.AbsoluteUri, null, exception, false);
                            arachnodeDAO.InsertException(crawlRequest.Discovery.Uri.AbsoluteUri, null, exception2, false);

                            encoding = Encoding.UTF8;
                        }
                    }

                    crawlRequest.Encoding = encoding;

                    //NOTE(review): '==' on Encoding is reference equality; this branch is
                    //only taken when encoding is the Encoding.UTF8 singleton AND
                    //DetermineEncoding already produced the decoded text - confirm that
                    //Encoding.GetEncoding never needs to hit it.
                    if (encoding == Encoding.UTF8 && decodedHtml != null)
                    {
                        crawlRequest.DecodedHtml = HttpUtility.HtmlDecode(decodedHtml);
                        crawlRequest.Html        = decodedHtml;
                    }
                    else
                    {
                        //decode the raw bytes ONCE - the original decoded the entire
                        //payload twice (once per property), doubling the work.
                        string html = encoding.GetString(crawlRequest.Data);

                        crawlRequest.DecodedHtml = HttpUtility.HtmlDecode(html);
                        crawlRequest.Html        = html;
                    }
                }
            }
        }
Beispiel #32
0
 // Intentionally a no-op: this implementation does not send crawl-request
 // response messages back to crawler peers.  NOTE(review): presumably the
 // abstract base class forces this override - confirm callers tolerate the
 // silent drop rather than expecting an acknowledgement.
 public override void SendCrawlRequestResponseMessageToCrawlerPeer(CrawlerPeer crawlerPeer, CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
 {
 }