/// <summary>
/// Loads CrawlRequests.txt from the executable's directory and submits one CrawlRequest per non-comment line.
/// Line format: AbsoluteUri,MaximumDepth,RestrictCrawlTo,RestrictDiscoveriesTo,Priority,RenderType,RenderTypeForChildren.
/// Any exception is swallowed and logged to the Exceptions table.
/// </summary>
private void ParseCrawlRequests()
{
    try
    {
        lock (_crawlRequestsLock)
        {
            // NOTE(review): deriving the directory from Environment.CommandLine is fragile when
            // the process is launched with arguments - confirm against how this host is started.
            string[] crawlRequests = File.ReadAllLines(Path.Combine(Path.GetDirectoryName(Environment.CommandLine.Replace("\"", string.Empty)), "CrawlRequests.txt"));

            foreach (string crawlRequest in crawlRequests)
            {
                // Skip "//" comment lines and blank lines.
                if (crawlRequest.Trim().StartsWith("//") || string.IsNullOrWhiteSpace(crawlRequest))
                {
                    continue;
                }

                string[] crawlRequestSplit = crawlRequest.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);

                // "|"-delimited flag names are OR'd into a single flags value.
                UriClassificationType restrictCrawlTo = UriClassificationType.None;

                foreach (string uriClassificationType in crawlRequestSplit[2].Split(new[] { '|' }, StringSplitOptions.RemoveEmptyEntries))
                {
                    restrictCrawlTo |= (UriClassificationType)Enum.Parse(typeof(UriClassificationType), uriClassificationType);
                }

                UriClassificationType restrictDiscoveriesTo = UriClassificationType.None;

                foreach (string uriClassificationType in crawlRequestSplit[3].Split(new[] { '|' }, StringSplitOptions.RemoveEmptyEntries))
                {
                    restrictDiscoveriesTo |= (UriClassificationType)Enum.Parse(typeof(UriClassificationType), uriClassificationType);
                }

                // BUG FIX (CA1305): parse numerics with the invariant culture so the file format is
                // locale-independent - under cultures that use ',' as the decimal separator,
                // double.Parse("1.5") would otherwise yield 15.
                CrawlRequest<ArachnodeDAO> crawlRequest2 = new CrawlRequest<ArachnodeDAO>(
                    new Discovery<ArachnodeDAO>(crawlRequestSplit[0]),
                    int.Parse(crawlRequestSplit[1], System.Globalization.CultureInfo.InvariantCulture),
                    restrictCrawlTo,
                    restrictDiscoveriesTo,
                    double.Parse(crawlRequestSplit[4], System.Globalization.CultureInfo.InvariantCulture),
                    (RenderType)Enum.Parse(typeof(RenderType), crawlRequestSplit[5]),
                    (RenderType)Enum.Parse(typeof(RenderType), crawlRequestSplit[6]));

                _crawler.Crawl(crawlRequest2);
            }
        }
    }
    catch (Exception exception)
    {
        new ArachnodeDAO(_applicationSettings.ConnectionString, _applicationSettings, _webSettings, false, false).InsertException(null, null, exception, true);
    }
}
/// <summary>
/// Extracts the distinct, fragment-free link Uris from an HTML response body.
/// Non-HTML responses yield an empty collection.
/// </summary>
/// <param name="crawlRequest">The crawl request the content belongs to (used to resolve relative hrefs).</param>
/// <param name="content">The HTTP response content to parse.</param>
/// <returns>The de-duplicated link Uris, excluding excluded schemes.</returns>
public async Task<IReadOnlyCollection<Uri>> ExtractLinks(CrawlRequest crawlRequest, HttpContent content)
{
    // BUG FIX: ContentType is null when the server omits the Content-Type header;
    // the original dereferenced it unconditionally and threw NullReferenceException.
    var mediaType = content.Headers.ContentType?.MediaType;

    if (!"text/html".Equals(mediaType, StringComparison.OrdinalIgnoreCase))
    {
        return Array.Empty<Uri>();
    }

    using (var contentStream = await content.ReadAsStreamAsync())
    {
        var parser = new HtmlParser();
        var document = await parser.ParseAsync(contentStream);

        // href -> absolute Uri; drop excluded schemes (case-insensitive);
        // strip fragments so "page#a" and "page#b" de-duplicate to one Uri.
        return document.Links
            .Select(l => l.GetAttribute(AttributeNames.Href))
            .Select(href => BuildUri(crawlRequest, href))
            .Where(uri => exludedSchemas.Contains(uri.Scheme, StringComparer.OrdinalIgnoreCase) == false)
            .Select(this.RemoveFragment)
            .Distinct()
            .ToList();
    }
}
/// <summary>
/// Persists a CrawlRequest. File/image requests are back-dated to SqlDateTime.MinValue
/// (offset by a monotonically increasing helper for stable ordering) and given a
/// +1000000 priority boost so they are crawled ahead of web pages.
/// </summary>
/// <param name="crawlRequest">The crawl request to persist.</param>
/// <param name="arachnodeDAO">The arachnode DAO used for the insert.</param>
private void InsertCrawlRequestIntoDatabase(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    //ANOEDT: Could be improved... images from msn should be processed before images from joescrabshack.com. TEST TEST TEST!!
    if (_applicationSettings.InsertCrawlRequests)
    {
        // The original four near-identical branches differed only in created/priority
        // (ExpectFileOrImage) and the originator argument (null for root requests).
        DateTime created;
        double priority;

        if (crawlRequest.Discovery.ExpectFileOrImage)
        {
            created = SqlDateTime.MinValue.Value.AddSeconds(_databaseCrawlRequestCreatedHelper);
            priority = crawlRequest.Priority + 1000000;
        }
        else
        {
            created = crawlRequest.Created;
            priority = crawlRequest.Priority;
        }

        string originatorAbsoluteUri = crawlRequest.Originator != null ? crawlRequest.Originator.Uri.AbsoluteUri + _applicationSettings.UniqueIdentifier : null;

        arachnodeDAO.InsertCrawlRequest(created, originatorAbsoluteUri, crawlRequest.Parent.Uri.AbsoluteUri + _applicationSettings.UniqueIdentifier, crawlRequest.Discovery.Uri.AbsoluteUri + _applicationSettings.UniqueIdentifier, crawlRequest.CurrentDepth, crawlRequest.MaximumDepth, crawlRequest.RestrictCrawlTo, crawlRequest.RestrictDiscoveriesTo, priority, (byte)crawlRequest.RenderType, (byte)crawlRequest.RenderTypeForChildren);
    }

    // Advances even when inserts are disabled, matching the original behavior.
    _databaseCrawlRequestCreatedHelper += 1;
}
/// <summary>
/// Handles a discovered web page: reports and counts the discovery, hands the page
/// to the WebPageManager, then routes Discovery processing either to the
/// asynchronous DiscoveryProcessor or inline.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "webPageManager">The web page manager.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
protected override void ProcessWebPage(CrawlRequest<TArachnodeDAO> crawlRequest, WebPageManager<TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
{
    _consoleManager.OutputWebPageDiscovered(crawlRequest.Crawl.CrawlInfo.ThreadNumber, crawlRequest);

    Counters.GetInstance().WebPagesDiscovered(1);

    webPageManager.ManageWebPage(crawlRequest);

    //the Crawler may(/will) be null if PostProcessing...
    bool routeToAsynchronousProcessor =
        ApplicationSettings.ProcessDiscoveriesAsynchronously &&
        !crawlRequest.Crawl.IsProcessingDiscoveriesAsynchronously &&
        crawlRequest.Crawl.Crawler != null;

    if (routeToAsynchronousProcessor)
    {
        crawlRequest.Crawl.Crawler.Engine.DiscoveryProcessors[crawlRequest.Crawl.CrawlInfo.ThreadNumber].AddCrawlRequestToBeProcessed(crawlRequest);
    }
    else
    {
        ProcessDiscoveries(crawlRequest, arachnodeDAO);
    }
}

/// <summary>
/// Processes every Discovery category extracted from a crawled page, in order:
/// email addresses, then hyperlinks, then files and images.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
public override void ProcessDiscoveries(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    ProcessEmailAddresses(crawlRequest, arachnodeDAO);

    ProcessHyperLinks(crawlRequest, arachnodeDAO);

    ProcessFilesAndImages(crawlRequest, arachnodeDAO);
}
/// <summary>
/// Begins a Crawl. This method bypasses the Cache, and is experimental/for advanced users.
/// This method does not function with the DEMO version.
/// Processes the seed request, then keeps dequeuing and processing uncrawled requests
/// until the queue yields null.
/// </summary>
/// <param name = "crawlRequest">The seed crawl request.</param>
/// <param name = "obeyCrawlRules">Whether CrawlRules are evaluated for each request.</param>
/// <param name = "executeCrawlActions">Whether CrawlActions are executed for each request.</param>
/// <param name = "processDiscoveriesAsynchronously">See NOTE(review) below regarding the inverted assignment.</param>
public void BeginCrawl(CrawlRequest<TArachnodeDAO> crawlRequest, bool obeyCrawlRules, bool executeCrawlActions, bool processDiscoveriesAsynchronously)
{
#if DEMO
    return;
#endif
    //-1 marks this crawl as running outside the Engine's numbered crawl threads.
    _crawlInfo.ThreadNumber = -1;

    do
    {
        crawlRequest.Crawl = this;
        //NOTE(review): the NEGATED parameter is assigned here - confirm whether
        //processDiscoveriesAsynchronously == true should really set
        //IsProcessingDiscoveriesAsynchronously to false.
        crawlRequest.Crawl.IsProcessingDiscoveriesAsynchronously = !processDiscoveriesAsynchronously;
        //NOTE(review): forcing CurrentDepth to MaximumDepth appears to prevent
        //deeper discovery from this entry point - verify this is intentional.
        crawlRequest.CurrentDepth = crawlRequest.MaximumDepth;

        lock (_beginCrawlLock)
        {
            ProcessCrawlRequest(crawlRequest, obeyCrawlRules, executeCrawlActions);

            //presumably this queue returns null when empty (a BCL Queue would throw) - confirm.
            crawlRequest = UncrawledCrawlRequests.Dequeue();
        }
    } while (crawlRequest != null);
}
/// <summary>
/// Determines whether the specified crawl request is disallowed, based on the
/// downloaded page source. Only WebPage discoveries are evaluated; the verdict is
/// optionally negated via the configured flag.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
/// <returns>
/// <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    //only web pages carry decoded HTML to inspect.
    if (crawlRequest.DataType.DiscoveryType != DiscoveryType.WebPage)
    {
        return false;
    }

    crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;
    crawlRequest.IsDisallowedReason = "Disallowed by Source.";

    bool disallowed = UserDefinedFunctions.IsDisallowedForSource(crawlRequest.DecodedHtml, false);

    return _negateIsDisallowed ? !disallowed : disallowed;
}
/// <summary>
/// Thread entry point: drains UnprocessedCrawlRequests while the Engine is in the
/// Start/Pause/None states, processing each request's Discoveries. Exceptions from a
/// single request are logged and do not stop the loop.
/// </summary>
/// <param name="o">Unused thread-start argument.</param>
internal void BeginDiscoveryProcessor(object o)
{
    while (_crawler.Engine.State == EngineState.Start || _crawler.Engine.State == EngineState.Pause || _crawler.Engine.State == EngineState.None)
    {
        if (_crawler.Engine.State == EngineState.Start)
        {
            //blocks while the Engine is paused/stopped via its state gate.
            _crawler.Engine.StateControl.WaitOne();

            lock (_crawlRequestLock)
            {
                IsProcessingDiscoveries = true;

                while (UnprocessedCrawlRequests.Count != 0)
                {
                    CrawlRequest<TArachnodeDAO> crawlRequest = UnprocessedCrawlRequests.Dequeue();

                    //flag the owning Crawl so it does not re-enter discovery processing inline.
                    crawlRequest.Crawl.IsProcessingDiscoveriesAsynchronously = true;

                    try
                    {
                        _crawlRequestManager.ProcessDiscoveries(crawlRequest, _arachnodeDAO);
                    }
                    catch (Exception exception)
                    {
                        //best-effort: record the failure and continue with the next request.
                        _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                    }

                    crawlRequest.Crawl.IsProcessingDiscoveriesAsynchronously = false;
                }

                IsProcessingDiscoveries = false;
            }
        }

        //brief idle before polling the engine state again.
        Thread.Sleep(5);
    }

    IsProcessingDiscoveries = false;
}
/// <summary>
/// Re-queues a failed CrawlRequest while retries remain (or indefinitely), otherwise
/// cancels it, removes any persisted copy, and updates the counters.
/// </summary>
/// <param name="crawlRequest">The crawl request to resubmit or cancel.</param>
/// <param name="retryIndefinitely">When true, retries without decrementing the retry budget.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
public override void ResubmitCrawlRequest(CrawlRequest<TArachnodeDAO> crawlRequest, bool retryIndefinitely, IArachnodeDAO arachnodeDAO)
{
    //brief back-off before the request is re-queued.
    Thread.Sleep(10);

    if (crawlRequest.Discovery.HttpWebRequestRetriesRemaining != 0 || retryIndefinitely)
    {
        //resetting the DiscoveryState to allow the CrawlRequest to (attempt to) be re-crawled...
        crawlRequest.Discovery.DiscoveryState = DiscoveryState.Undiscovered;

        //removed because it will be re-added...
        Counters.GetInstance().CrawlRequestRemoved();

        if (crawlRequest.Priority > 0)
        {
            //NOTE(review): double.MinValue + Priority collapses any ordinary positive
            //priority to approximately double.MinValue, demoting the retry behind all
            //other requests - confirm this extreme demotion is intended.
            crawlRequest.Priority = double.MinValue + crawlRequest.Priority;
        }

        _cache.UncrawledCrawlRequests.Enqueue(crawlRequest, crawlRequest.Priority);

        if (!retryIndefinitely)
        {
            crawlRequest.Discovery.HttpWebRequestRetriesRemaining--;
        }
    }
    else
    {
        //retries exhausted: signal cancellation, clean up any database copy, and
        //account for the request as removed + processed.
        crawlRequest.Crawl.Crawler.Engine.OnCrawlRequestCanceled(crawlRequest);

        if (crawlRequest.IsFromDatabase)
        {
            arachnodeDAO.DeleteCrawlRequest(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri);
        }

        Counters.GetInstance().ReportCurrentDepth(crawlRequest.CurrentDepth);
        Counters.GetInstance().CrawlRequestRemoved();
        Counters.GetInstance().CrawlRequestProcessed();
    }
}
/// <summary>
/// Handles an incoming CrawlRequest packet: acknowledges it when this node is the
/// target, otherwise forwards it to the target's proxy connection (if any).
/// </summary>
/// <param name="packet">The received G2 packet carrying the crawl request.</param>
public void Receive_CrawlRequest(G2ReceivedPacket packet)
{
    CrawlRequest request = CrawlRequest.Decode(packet);

    //we are the target: acknowledge directly.
    if (Local.Equals(request.Target))
    {
        Send_CrawlAck(request, packet);
        return;
    }

    // Forward to appropriate node
    TcpConnect client = TcpControl.GetProxy(request.Target);

    if (client == null)
    {
        return;
    }

    request.FromAddress = packet.Source; // add so receiving host knows where to send response too
    client.SendPacket(request);
}
/// <summary>
/// Builds the crawler's processor chain. The .Next(...) naming and the terminal
/// UnknownProcessor suggest a chain-of-responsibility over link types - confirm
/// against the processor base class.
/// </summary>
/// <param name="request">Supplies the crawl id and the host treated as internal.</param>
/// <param name="log">Logger handed to each ClientWrapper.</param>
public Crawler(CrawlRequest request, ILogger log)
{
    _id = request.Id;
    _host = request.Host;

    //domains that ExternalPageProcessor is told to treat as internal.
    var internalDomains = new List<string>
    {
        _host,
        _githubDomain
    };

    //NOTE(review): the PodcastRoadmapProcessor .Next(...) call is closed immediately,
    //so the following .Next(new PageProcessor(_gistDomain, ...)) chains off the
    //ExternalPageProcessor expression rather than nesting like the other links -
    //confirm this asymmetry is the intended chain ordering.
    _processor = new CloudflareCgiProcesser()
        .Next(new LegacyProcessor()
        .Next(new ImageProcessor(new ClientWrapper(log))
        .Next(new ContentProcessor(new ClientWrapper(log))
        .Next(new KnownPageProcessor()
        .Next(new EmailProcessor()
        .Next(new ExternalPageProcessor(internalDomains)
        .Next(new PodcastRoadmapProcessor(new ClientWrapper(log)))
        .Next(new PageProcessor(_gistDomain, new ClientWrapper(log), null)
        .Next(new PageProcessor(_githubDomain, new ClientWrapper(log), null)
        .Next(new PageProcessor(_host, new ClientWrapper(log), new ContentLinksExtractor(_host))
        .Next(new UnknownProcessor()))))))))));
}
/// <summary>
/// Drains this Crawl's UncrawledCrawlRequests queue into the database (typically at
/// shutdown): allowed requests are inserted as CrawlRequests, disallowed ones as
/// DisallowedAbsoluteUris.
/// </summary>
internal void SaveCrawlRequestsToDatabase()
{
    while (UncrawledCrawlRequests.Count != 0)
    {
        _consoleManager.OutputString("Saving Crawl.UncrawledCrawlRequests: " + _crawlInfo.ThreadNumber + " : " + UncrawledCrawlRequests.Count + " CrawlRequests remaining to be inserted.", ConsoleColor.Gray, ConsoleColor.Gray);

        CrawlRequest<TArachnodeDAO> crawlRequest = UncrawledCrawlRequests.Dequeue();

        if (!_ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PreRequest, _arachnodeDAO))
        {
            if (_applicationSettings.InsertCrawlRequests)
            {
                // The two original branches were identical except for the originator
                // argument, which is null for root requests.
                string originatorAbsoluteUri = crawlRequest.Originator != null ? crawlRequest.Originator.Uri.AbsoluteUri : null;

                _arachnodeDAO.InsertCrawlRequest(crawlRequest.Created, originatorAbsoluteUri, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.CurrentDepth, crawlRequest.MaximumDepth, crawlRequest.RestrictCrawlTo, crawlRequest.RestrictDiscoveriesTo, crawlRequest.Priority, (byte)crawlRequest.RenderType, (byte)crawlRequest.RenderTypeForChildren);
            }
        }
        else
        {
            if (_applicationSettings.InsertDisallowedAbsoluteUris)
            {
                _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.IsDisallowedReason, _applicationSettings.ClassifyAbsoluteUris);
            }
        }

        Counters.GetInstance().CrawlRequestRemoved();
    }
}
/// <summary>
/// Exercises Templater.PerformAction against a single synchronously crawled page.
/// </summary>
public void PerformActionTest()
{
    // Arrange: stand up the crawler infrastructure the Templater depends upon.
    ApplicationSettings settings = new ApplicationSettings();
    WebSettings web = new WebSettings();

    ArachnodeDAO dao = new ArachnodeDAO(settings.ConnectionString, settings, web, true, true);

    Crawler<ArachnodeDAO> crawler = new Crawler<ArachnodeDAO>(settings, web, CrawlMode.BreadthFirstByPriority, false);

    CrawlRequest<ArachnodeDAO> request = new CrawlRequest<ArachnodeDAO>(new Discovery<ArachnodeDAO>("http://trycatchfail.com/blog/post/2008/11/12/Deep-web-crawling-with-NET-Getting-Started.aspx"), 1, UriClassificationType.Host, UriClassificationType.Host, 1, RenderType.None, RenderType.None);

    Crawl<ArachnodeDAO> crawl = new Crawl<ArachnodeDAO>(settings, web, crawler, crawler.ActionManager, crawler.ConsoleManager, crawler.CookieManager, crawler.CrawlRequestManager, crawler.DataTypeManager, crawler.DiscoveryManager, crawler.EncodingManager, crawler.HtmlManager, crawler.PolitenessManager, crawler.ProxyManager, crawler.RuleManager, true);

    //zero crawl threads: the Engine is started but the crawl itself runs inline below.
    settings.MaximumNumberOfCrawlThreads = 0;

    UserDefinedFunctions.ConnectionString = "Data Source=.;Initial Catalog=arachnode.net;Integrated Security=True;Connection Timeout=3600;";

    // Act: crawl the single page, then run the Templater action against it.
    crawler.Engine.Start();
    crawl.BeginCrawl(request, false, false, false);

    Templater<ArachnodeDAO> target = new Templater<ArachnodeDAO>(settings, web);
    target.PerformAction(request, dao);
}
/// <summary>
/// Searches the Wayback Machine for files of the given extension under a URL,
/// appending progress to <paramref name="sb"/>, emailing matches, and optionally
/// collecting the matched links into <paramref name="linkBuilder"/>.
/// </summary>
/// <param name="url">The site URL to scan (scheme is stripped before the search).</param>
/// <param name="sb">Receives human-readable progress/result lines.</param>
/// <param name="fileExtension">The file extension to search for.</param>
/// <param name="linkBuilder">Optional: receives the raw matched links, one per line.</param>
/// <returns>A ScannerResult whose Success flag and Results reflect the matches found.</returns>
private ScannerResult CheckForFileType(string url, StringBuilder sb, string fileExtension, StringBuilder linkBuilder = null)
{
    ScannerResult result = new ScannerResult();

    try
    {
        CrawlRequest request = new CrawlRequest();
        request.FileType = fileExtension;
        request.Address = url.Trim('/').Replace("https://", "").Replace("http://", "");
        request.Limit = 50;
        request.FindAll = true;

        List<string> info = Crawler.SearchFileType(request, true);

        if (info.Count != 0)
        {
            result.Success = true;

            // BUG FIX: the original concatenated the List<string> itself, which renders
            // as the type name ("System.Collections.Generic.List`1[...]"); report the
            // match count instead.
            sb.Append("\t" + fileExtension + " Files Found! " + info.Count + "! Email sent." + Environment.NewLine);

            SendEmail("\t" + fileExtension + " Files Found ", url + " appears to have " + fileExtension + " files: " + Environment.NewLine + String.Join(Environment.NewLine, info.ToArray()));

            result.Results.AddRange(info);

            if (linkBuilder != null)
            {
                linkBuilder.Append(String.Join(Environment.NewLine, info.ToArray()) + Environment.NewLine);
            }
        }
        else
        {
            sb.Append("\tNo " + fileExtension + " files found." + Environment.NewLine);
        }
    }
    catch (Exception ex)
    {
        // BUG FIX: pass the original exception as InnerException so its stack trace
        // is preserved (the original discarded it).
        throw new Exception("File finder exception: " + ex.Message, ex);
    }

    return result;
}
/// <summary>
/// Manages the image: decodes the downloaded bytes into a GDI+ Image, optionally
/// extracts EXIF metadata into an XML document (and inserts it), and optionally
/// saves the image to disk. Returns null when any step throws.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "imageID">The image ID.</param>
/// <param name = "absoluteUri">The absolute URI.</param>
/// <param name = "source">The source.</param>
/// <param name = "fullTextIndexType">Full type of the text index.</param>
/// <param name = "extractImageMetaData">if set to <c>true</c> [extract image meta data].</param>
/// <param name = "insertImageMetaData">if set to <c>true</c> [insert image meta data].</param>
/// <param name = "saveImageToDisk">if set to <c>true</c> [save image to disk].</param>
/// <returns>The managed image, or null on failure.</returns>
public override ManagedImage ManageImage(CrawlRequest<TArachnodeDAO> crawlRequest, long imageID, string absoluteUri, byte[] source, string fullTextIndexType, bool extractImageMetaData, bool insertImageMetaData, bool saveImageToDisk)
{
    try
    {
        using (MemoryStream memoryStream = new MemoryStream(source, true))
        {
            ManagedImage managedImage = new ManagedImage();

            managedImage.Image = Image.FromStream(memoryStream);

            if (extractImageMetaData)
            {
                //build <EXIFData><TagName>value</TagName>...</EXIFData>.
                XmlDocument xmlDocument = new XmlDocument();
                XmlElement xmlElement;

                xmlDocument.AppendChild(xmlDocument.CreateNode(XmlNodeType.XmlDeclaration, "", ""));
                xmlDocument.AppendChild(xmlDocument.CreateElement("", "EXIFData", ""));

                //dictionary also de-duplicates repeated EXIF tag names (Add throws on
                //duplicates, which the outer catch absorbs).
                Dictionary<string, string> dictionary = new Dictionary<string, string>();

                foreach (Pair pair in new EXIFExtractor(managedImage.Image, "", ""))
                {
                    dictionary.Add(pair.First.ToString(), pair.Second.ToString());
                }

                foreach (KeyValuePair<string, string> keyValuePair in dictionary)
                {
                    //EXIF tag names may contain spaces, which are invalid in XML element names.
                    xmlElement = xmlDocument.CreateElement("", keyValuePair.Key.Replace(" ", "_"), "");

                    string value = UserDefinedFunctions.ExtractAlphaNumericCharacters(keyValuePair.Value).Value ?? string.Empty;

                    xmlElement.AppendChild(xmlDocument.CreateTextNode(value));

                    //ChildNodes.Item(1) is the EXIFData element (0 is the XML declaration).
                    xmlDocument.ChildNodes.Item(1).AppendChild(xmlElement);
                }

                managedImage.EXIFData = xmlDocument;

                if (insertImageMetaData)
                {
                    _arachnodeDAO.InsertImageMetaData(absoluteUri, imageID, xmlDocument.InnerXml, managedImage.Image.Flags, managedImage.Image.Height, managedImage.Image.HorizontalResolution, managedImage.Image.VerticalResolution, managedImage.Image.Width);
                }
            }

            if (saveImageToDisk)
            {
                managedImage.DiscoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedImagesDirectory, absoluteUri, fullTextIndexType);

                managedImage.Image.Save(managedImage.DiscoveryPath);
            }

            return managedImage;
        }

        //ANODET: Parameter is not valid in the exception handler...
    }
    catch (Exception exception)
    {
        //ANODET: Images of 7 bytes (Generic GDI Error)...
#if !DEMO
        if (crawlRequest != null)
        {
            _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, absoluteUri, exception, false);
        }
        else
        {
            _arachnodeDAO.InsertException(null, absoluteUri, exception, false);
        }
#endif
    }

    return null;
}
/// <summary>
/// Manages the file: implementations handle metadata extraction/insertion and
/// saving the downloaded bytes to disk, per the flags supplied.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "fileID">The file ID.</param>
/// <param name = "absoluteUri">The absolute URI.</param>
/// <param name = "source">The source.</param>
/// <param name = "fullTextIndexType">Full type of the text index.</param>
/// <param name = "extractFileMetaData">if set to <c>true</c> [extract file meta data].</param>
/// <param name = "insertFileMetaData">if set to <c>true</c> [insert file meta data].</param>
/// <param name = "saveFileToDisk">if set to <c>true</c> [save file to disk].</param>
/// <returns>The managed file.</returns>
public abstract ManagedFile ManageFile(CrawlRequest<TArachnodeDAO> crawlRequest, long fileID, string absoluteUri, byte[] source, string fullTextIndexType, bool extractFileMetaData, bool insertFileMetaData, bool saveFileToDisk);
/// <summary>
/// Manages the file carried by the crawl request; implementations define the
/// extraction/persistence behavior.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
public abstract void ManageFile(CrawlRequest<TArachnodeDAO> crawlRequest);
/// <summary>
/// Determines whether the specified crawl request is disallowed by any enabled
/// CrawlRule registered for the given phase (PreRequest, PreGet or PostRequest).
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "crawlRuleType">Type of the rule.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
/// <returns>
/// <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, CrawlRuleType crawlRuleType, IArachnodeDAO arachnodeDAO)
{
    // The three phases ran byte-identical loops in the original; the shared logic
    // now lives in IsDisallowedByAnyRule.
    switch (crawlRuleType)
    {
        case CrawlRuleType.PreRequest:
            return IsDisallowedByAnyRule(_preRequestCrawlRules.Values, crawlRequest, arachnodeDAO);
        case CrawlRuleType.PreGet:
            return IsDisallowedByAnyRule(_preGetCrawlRules.Values, crawlRequest, arachnodeDAO);
        case CrawlRuleType.PostRequest:
            return IsDisallowedByAnyRule(_postRequestCrawlRules.Values, crawlRequest, arachnodeDAO);
    }

    return false;
}

/// <summary>
/// Runs every enabled rule in the supplied rule lists against the crawl request.
/// The first disallowing rule marks the request and its Discovery as disallowed.
/// A rule that throws is treated as disallowing (fail closed) after logging.
/// </summary>
/// <param name="crawlRuleLists">The per-key rule lists for one rule phase.</param>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
/// <returns><c>true</c> if any rule disallows (or throws); otherwise, <c>false</c>.</returns>
private bool IsDisallowedByAnyRule(IEnumerable<List<ACrawlRule<TArachnodeDAO>>> crawlRuleLists, CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    foreach (List<ACrawlRule<TArachnodeDAO>> crawlRules in crawlRuleLists)
    {
        foreach (ACrawlRule<TArachnodeDAO> crawlRule in crawlRules)
        {
            try
            {
                if (crawlRule.IsEnabled && crawlRule.IsDisallowed(crawlRequest, arachnodeDAO))
                {
                    crawlRequest.IsDisallowed = true;
                    crawlRequest.Discovery.IsDisallowed = true;
                    crawlRequest.Discovery.IsDisallowedReason = crawlRequest.IsDisallowedReason;

                    return true;
                }
            }
            catch (Exception exception)
            {
                arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);

                return true;
            }
        }
    }

    return false;
}
/// <summary>
/// Issues a conditional GET for a JSON resource and parses the body into a
/// CrawlResponse. When the request carries an ETag, If-None-Match is sent.
/// </summary>
/// <param name="uri">The resource to download.</param>
/// <param name="userAgent">The User-Agent header value.</param>
/// <param name="reqBody">Supplies the cached ETag, if any.</param>
/// <returns>A CrawlResponse whose Features holds the parsed JSON.</returns>
public static async Task<CrawlResponse> DownloadJson(Uri uri, string userAgent, CrawlRequest reqBody)
{
    var request = (HttpWebRequest)WebRequest.Create(uri);
    request.Method = "GET";
    request.UserAgent = userAgent;

    //conditional GET: only fetch when the resource changed since the cached ETag.
    if (!string.IsNullOrEmpty(reqBody.ETag))
    {
        //NOTE(review): when the ETag matches, the server answers 304 Not Modified,
        //which GetResponseAsync surfaces as a WebException that this method does not
        //handle - confirm callers expect the exception.
        request.Headers.Add(HttpRequestHeader.IfNoneMatch, reqBody.ETag);
    }

    using (var response = (HttpWebResponse)await request.GetResponseAsync())
    {
        using (var stream = response.GetResponseStream())
            using (var reader = new StreamReader(stream))
            {
                //NOTE(review): the response's new ETag is not captured here - verify
                //whether callers are expected to refresh it elsewhere.
                return (new CrawlResponse
                {
                    Features = JToken.Parse(await reader.ReadToEndAsync())
                });
            }
    }
}
/// <summary>
/// Azure Function entry point: deserializes a CrawlRequest from the request body,
/// downloads/crawls the target, and returns the CrawlResponse as JSON. Failures are
/// tracked in telemetry (with the request's identifiers when available) and rethrown.
/// </summary>
/// <param name="req">The incoming HTTP request carrying a CrawlRequest JSON body.</param>
/// <param name="log">The function's trace writer.</param>
/// <returns>An HTTP 200 response whose body is the serialized CrawlResponse.</returns>
public static async Task<HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log)
{
    CrawlRequest crawlRequest = null;
    string reqBodyStr = null;

    try
    {
        using (var operation = Services.TelemetryClient.StartOperation<DependencyTelemetry>("Crawl.HTML"))
        {
            reqBodyStr = await req.Content.ReadAsStringAsync();

            // BUG FIX: assign the deserialized request to the outer variable; the
            // original stored it in a new local, so the catch block below always saw
            // crawlRequest == null and never attached Url/AppId/ActionId to telemetry.
            crawlRequest = JsonConvert.DeserializeObject<CrawlRequest>(reqBodyStr);

            operation.Telemetry.Properties.Add("AppId", crawlRequest.Site);
            operation.Telemetry.Properties.Add("ActionId", crawlRequest.Id);
            operation.Telemetry.Properties.Add("Url", crawlRequest.Url);

            log.Info($"Crawl AppId={crawlRequest.Site} Id={crawlRequest.Id} Url={crawlRequest.Url}");

            var crawlResponse = await Download(crawlRequest);

            // always return a valid object so that downstream workflows can continue
            if (crawlResponse == null)
            {
                crawlResponse = new CrawlResponse();
            }

            crawlResponse.Url = crawlRequest.Url;
            crawlResponse.Site = crawlRequest.Site;
            crawlResponse.Id = crawlRequest.Id;

            var json = JsonConvert.SerializeObject(crawlResponse, new JsonSerializerSettings
            {
                Formatting = Formatting.None,
                StringEscapeHandling = StringEscapeHandling.EscapeNonAscii
            });

            return new HttpResponseMessage(HttpStatusCode.OK)
            {
                Content = new StringContent(
                    json,
                    new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
                    "application/json")
            };
        }
    }
    catch (Exception ex)
    {
        var props = new Dictionary<string, string>
        {
            { "Service", req.RequestUri.ToString() }
        };

        // Fall back to the raw body when deserialization never succeeded.
        if (crawlRequest == null)
        {
            props.Add("JSON", reqBodyStr);
        }
        else
        {
            props.Add("Url", crawlRequest.Url);
            props.Add("AppId", crawlRequest.Site);
            props.Add("ActionId", crawlRequest.Id);
        }

        Services.TelemetryClient.TrackException(ex, props);

        // BUG FIX (CA2200): rethrow with "throw;" so the original stack trace is preserved.
        throw;
    }
}
/// <summary>
/// Assigns the file and image discoveries for the crawl request; implementations
/// define how discoveries are located and recorded.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
public abstract void AssignFileAndImageDiscoveries(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO);
/// <summary>
/// Processes the hyper links discovered on a crawled page: undiscovered, allowed
/// links within depth and crawl restrictions are queued as new CrawlRequests;
/// link relationships and disallowed links are optionally persisted.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
public override void ProcessHyperLinks(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    if (ApplicationSettings.AssignHyperLinkDiscoveries)
    {
        _discoveryManager.AssignHyperLinkDiscoveries(crawlRequest, arachnodeDAO);
    }

    foreach (Discovery<TArachnodeDAO> hyperLinkDiscovery in crawlRequest.Discoveries.HyperLinks.Values)
    {
        if (!hyperLinkDiscovery.IsDisallowed)
        {
            if (hyperLinkDiscovery.DiscoveryState == DiscoveryState.Undiscovered)
            {
                //only follow links while below the maximum depth.
                if (crawlRequest.CurrentDepth < crawlRequest.MaximumDepth)
                {
                    if (!_discoveryManager.IsCrawlRestricted(crawlRequest, hyperLinkDiscovery.Uri.AbsoluteUri))
                    {
                        //child request inherits restrictions; RenderTypeForChildren is
                        //used for both the child's RenderType and its children's.
                        _cache.AddCrawlRequestToBeCrawled(new CrawlRequest<TArachnodeDAO>(crawlRequest, hyperLinkDiscovery, crawlRequest.CurrentDepth + 1, crawlRequest.MaximumDepth, crawlRequest.RestrictCrawlTo, crawlRequest.RestrictDiscoveriesTo, crawlRequest.Priority + hyperLinkDiscovery.PriorityBoost, crawlRequest.RenderTypeForChildren, crawlRequest.RenderTypeForChildren), false, false, arachnodeDAO);
                    }
                }

                if (ApplicationSettings.InsertHyperLinks && hyperLinkDiscovery.IsStorable)
                {
                    arachnodeDAO.InsertHyperLink(crawlRequest.Discovery.Uri.AbsoluteUri, hyperLinkDiscovery.Uri.AbsoluteUri, ApplicationSettings.ClassifyAbsoluteUris);
                }

                _consoleManager.OutputHyperLinkDiscovered(crawlRequest.Crawl.CrawlInfo.ThreadNumber, crawlRequest, hyperLinkDiscovery);
            }
            else
            {
                //already discovered: optionally record the page -> link relationship.
                if (ApplicationSettings.InsertHyperLinkDiscoveries && hyperLinkDiscovery.IsStorable)
                {
                    arachnodeDAO.InsertHyperLinkDiscovery(crawlRequest.Discovery.Uri.AbsoluteUri, hyperLinkDiscovery.Uri.AbsoluteUri);
                }

                _consoleManager.OutputCacheHit(crawlRequest.Crawl.CrawlInfo, crawlRequest, hyperLinkDiscovery);
            }
        }
        else
        {
            //disallowed link: optionally record it (first time) or its re-discovery.
            if (ApplicationSettings.InsertDisallowedAbsoluteUris)
            {
                if (hyperLinkDiscovery.DiscoveryState == DiscoveryState.Undiscovered)
                {
                    arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Discovery.Uri.AbsoluteUri, hyperLinkDiscovery.Uri.AbsoluteUri, hyperLinkDiscovery.IsDisallowedReason, ApplicationSettings.ClassifyAbsoluteUris);
                }
                else
                {
                    if (ApplicationSettings.InsertDisallowedAbsoluteUriDiscoveries)
                    {
                        arachnodeDAO.InsertDisallowedAbsoluteUriDiscovery(crawlRequest.Discovery.Uri.AbsoluteUri, hyperLinkDiscovery.Uri.AbsoluteUri);
                    }
                }
            }

            _consoleManager.OutputIsDisallowedReason(crawlRequest.Crawl.CrawlInfo, crawlRequest, hyperLinkDiscovery);
        }
    }

    Counters.GetInstance().HyperLinksDiscovered(crawlRequest.Discoveries.HyperLinks.Count);
}
/// <summary>
/// Determines whether crawling the given absolute URI is restricted for the
/// specified crawl request (e.g. by its RestrictCrawlTo classification).
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "absoluteUri">The absolute URI.</param>
/// <returns>
/// <c>true</c> if the specified crawl request is restricted; otherwise, <c>false</c>.
/// </returns>
public abstract bool IsCrawlRestricted(CrawlRequest<TArachnodeDAO> crawlRequest, string absoluteUri);
/// <summary>
/// Processes the crawl request end-to-end: politeness gating, pre-request
/// rules/actions, the HTTP request (timed), encoding + data processing,
/// disallowed/exception bookkeeping, and post-request actions/counters.
/// Requests whose Discovery is already discovered are treated as cache hits.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "obeyCrawlRules">if set to <c>true</c> [obey crawl rules].</param>
/// <param name = "executeCrawlActions">if set to <c>true</c> [execute crawl actions].</param>
public void ProcessCrawlRequest(CrawlRequest<TArachnodeDAO> crawlRequest, bool obeyCrawlRules, bool executeCrawlActions)
{
    //HACK:!!! Solve this!!!
    //#if DEMO
    //    return;
    //#endif
    bool wasACacheHit = false;

    try
    {
        crawlRequest.WebClient = WebClient;

        if (crawlRequest.Discovery.DiscoveryState == DiscoveryState.Undiscovered)
        {
            //politeness may refuse (throttle) the request entirely.
            if (!_politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestRequested, _arachnodeDAO))
            {
                Crawler.Engine.OnCrawlRequestThrottled(crawlRequest);

                return;
            }

            _consoleManager.OutputProcessCrawlRequest(_crawlInfo.ThreadNumber, crawlRequest);

            _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.PreRequest, _arachnodeDAO);

            //rules/actions may set crawlRequest.IsDisallowed before the HTTP request is made.
            if (obeyCrawlRules)
            {
                _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PreRequest, _arachnodeDAO);
            }

            if (executeCrawlActions)
            {
                _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PreRequest, _arachnodeDAO);
            }

            if (!crawlRequest.IsDisallowed)
            {
                //time the HTTP request; politeness is always notified of completion via finally.
                _stopwatch.Reset();
                _stopwatch.Start();

                try
                {
                    _dataManager.ProcessCrawlRequest(crawlRequest, obeyCrawlRules, executeCrawlActions);
                }
                catch (Exception exception2)
                {
                    //wrapped so the outer handler can prefer InnerException when logging.
                    throw new Exception(exception2.Message, exception2);
                }
                finally
                {
                    _stopwatch.Stop();

                    _crawlInfo.TotalHttpWebResponseTime += _stopwatch.Elapsed;
                    crawlRequest.HttpWebResponseTime = _stopwatch.Elapsed;

                    _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);
                }

                Counters.GetInstance().TotalBytesDiscovered(crawlRequest.Data.LongLength);

                _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.PostRequest, _arachnodeDAO);

                _encodingManager.ProcessCrawlRequest(crawlRequest, _arachnodeDAO);

                if (obeyCrawlRules)
                {
                    _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PostRequest, _arachnodeDAO);
                }

                //the CrawlRequest could be Disallowed by a PreGet CrawlRule - specifically DataType.cs.
                if (!crawlRequest.IsDisallowed)
                {
                    if (_processData)
                    {
                        _crawlRequestManager.ProcessCrawlRequest(crawlRequest, _fileManager, _imageManager, _webPageManager, _arachnodeDAO);
                    }
                }
                else
                {
                    //disallowed after download: classify and optionally persist the refusal.
                    if (crawlRequest.DataType.ContentType == null)
                    {
                        crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);
                    }

                    if (_applicationSettings.InsertDisallowedAbsoluteUris)
                    {
                        _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.IsDisallowedReason, _applicationSettings.ClassifyAbsoluteUris);
                    }

                    _consoleManager.OutputIsDisallowedReason(_crawlInfo, crawlRequest);
                }
            }
            else
            {
                //disallowed before the request: release politeness, then record the refusal.
                _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);

                if (crawlRequest.DataType.ContentType == null)
                {
                    crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);
                }

                if (_applicationSettings.InsertDisallowedAbsoluteUris)
                {
                    _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.IsDisallowedReason, _applicationSettings.ClassifyAbsoluteUris);
                }

                _consoleManager.OutputIsDisallowedReason(_crawlInfo, crawlRequest);
            }
        }
        else
        {
            wasACacheHit = true;

            //this should only occur when you submit a CR from a rule, or action...
            _consoleManager.OutputCacheHit(_crawlInfo, crawlRequest, crawlRequest.Discovery);
        }
    }
    catch (Exception exception)
    {
        _stopwatch.Stop();

        if (Crawler.Engine.State != EngineState.Start)
        {
            //the request was aborted as it was long running and Engine was requested to Stop.
            if ((crawlRequest.WebClient.WebException != null && crawlRequest.WebClient.WebException.Status == WebExceptionStatus.RequestCanceled) || (exception.InnerException != null && exception.InnerException.Message == "The request was aborted: The request was canceled."))
            {
                return;
            }
        }

        //connection failures are retried (ResubmitCrawlRequest) while retries remain.
        if (crawlRequest.WebClient.WebException != null && crawlRequest.Discovery.HttpWebRequestRetriesRemaining != 0 && crawlRequest.WebClient.WebException.Message.StartsWith("Unable to connect to the remote server"))
        {
            _politenessManager.ResubmitCrawlRequest(crawlRequest, false, _arachnodeDAO);

            _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCanceled, _arachnodeDAO);

            return;
        }

        try
        {
            _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);
        }
        catch (Exception exception2)
        {
            //politeness cleanup failed; log that failure instead.
            exception = exception2;
        }

        //log the most specific exception available.
        if (exception.InnerException == null)
        {
            _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
        }
        else
        {
            _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception.InnerException, false);
        }

        crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);

        if (_applicationSettings.InsertDisallowedAbsoluteUris)
        {
            if (crawlRequest.Discovery.DiscoveryState == DiscoveryState.Undiscovered)
            {
                _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception.Message, _applicationSettings.ClassifyAbsoluteUris);
            }
            else
            {
                if (_applicationSettings.InsertDisallowedAbsoluteUriDiscoveries)
                {
                    _arachnodeDAO.InsertDisallowedAbsoluteUriDiscovery(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri);
                }
            }
        }

        _consoleManager.OutputException(_crawlInfo.ThreadNumber, crawlRequest, _arachnodeDAO.LastExceptionID, _arachnodeDAO.LastExceptionMessage);
    }

    //common epilogue (runs for success and handled-exception paths alike).
    if (crawlRequest.IsFromDatabase)
    {
        _arachnodeDAO.DeleteCrawlRequest(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri);
    }

    _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.Discovered, _arachnodeDAO);

    if (!wasACacheHit)
    {
        if (executeCrawlActions)
        {
            _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, _arachnodeDAO);
        }

        Crawler.Engine.OnCrawlRequestCompleted(crawlRequest);
    }

    _consoleManager.OutputProcessCrawlRequest(_crawlInfo.ThreadNumber, crawlRequest);

    Counters.GetInstance().ReportCurrentDepth(crawlRequest.CurrentDepth);
    Counters.GetInstance().CrawlRequestRemoved();
    Counters.GetInstance().CrawlRequestProcessed();

    _crawlInfo.TotalCrawlRequestsProcessed++;
}
/// <summary>
/// Handler for the engine's CrawlRequestCompleted event. Intentionally a no-op:
/// completion bookkeeping appears to be performed during request processing itself,
/// so this handler presumably exists only to satisfy the event subscription —
/// TODO(review): confirm the subscription is still required.
/// </summary>
/// <param name="crawlRequest">The crawl request that finished processing.</param>
private void Engine_CrawlRequestCompleted(CrawlRequest<ArachnodeDAO> crawlRequest)
{
}
/// <summary>
/// Determines whether the specified crawl request is disallowed by delegating to
/// the URI-based overload, using the request's own discovery URI as the target.
/// </summary>
/// <param name="crawlRequest">The crawl request to evaluate.</param>
/// <param name="arachnodeDAO">The arachnode DAO (unused by this overload).</param>
/// <returns><c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.</returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    Uri discoveryUri = crawlRequest.Discovery.Uri;

    return IsDisallowed(crawlRequest, discoveryUri);
}
/// <summary>
/// Queries the Internet Archive's Wayback CDX API for archived URLs on the requested
/// host whose path ends with the requested file extension, optionally paging through
/// results via the CDX resume-key mechanism, and optionally expanding matches against
/// a known-attack-file list into concrete attack URLs.
/// </summary>
/// <param name="request">Carries Address (host), FileType (extension), Limit (page size) and FindAll (page through all results).</param>
/// <param name="filterKnownAttackFiles">When true, the raw result list is replaced by attack-string expansions of known attack files.</param>
/// <returns>Found archive URLs, or expanded attack URLs when filtering is enabled.</returns>
public static List <string> SearchFileType(CrawlRequest request, bool filterKnownAttackFiles)//, CrawlerContext context)
{
    // Lazily populate the known-attack-file list on first use.
    // NOTE(review): not thread-safe if called concurrently — confirm single-threaded use.
    if (_knownAttackFiles.Count == 0)
    {
        Initilize();
    }

    List <string> foundFiles = new List <string>();
    string resumeKey = "";
    bool continueLoop = true;

    do
    {
        WebPageRequest webRequest = new WebPageRequest();

        // webRequest.Address = "https://web.archive.org/cdx/search?url=" + request.Address + "&matchType=domain&collapse=urlkey&output=text&fl=original&filter=urlkey:.*"+request.FileType+"&limit=10&page=1";

        // CDX query: one original-URL per line, deduplicated by urlkey, restricted to
        // HTTP-200 captures whose URL ends in ".<FileType>". showResumeKey is only
        // requested when paging (FindAll), so single-page calls get plain results.
        webRequest.Address = "https://web.archive.org/cdx/search?url=" + request.Address + "/&matchType=host" + "&collapse=urlkey" + "&output=text" + "&fl=original" + @"&filter=original:.*\." + request.FileType + "$" + "&filter=statuscode:200" + "&limit=" + request.Limit + "&showResumeKey=" + request.FindAll.ToString().ToLower() + "&resumeKey=" + resumeKey;

        WebPageLoader.Load(webRequest);

        if (!String.IsNullOrEmpty(webRequest.Response.Body))
        {
            // return webRequest.Response.Body;

            List <string> foundStrings = webRequest.Response.Body.Split(new string[] { "\n" }, StringSplitOptions.RemoveEmptyEntries).ToList();

            if (foundStrings.Count <= request.Limit)
            {
                // Final (or only) page: everything in the body is data.
                foundFiles.AddRange(foundStrings);
                continueLoop = false;
            }
            else
            {
                // More lines than the limit: the CDX API appends the resume key after
                // the data rows (blank separator removed by RemoveEmptyEntries), so the
                // first Limit lines are data and the last line is the next resume key.
                // NOTE(review): assumes exactly one trailing resume-key line — verify
                // against the CDX response format.
                foundFiles.AddRange(foundStrings.Take(request.Limit));

                resumeKey = foundStrings.LastOrDefault();

                if (resumeKey == null)
                {
                    continueLoop = false;
                }
            }
        }
        else
        {
            // Empty body: no (more) results.
            continueLoop = false;
        }
    } while (request.FindAll && continueLoop);

    if (filterKnownAttackFiles && foundFiles.Count != 0)
    {
        // Replace the result list with attack-string expansions: for each URL whose
        // file name matches a known attack file, emit one URL per attack suffix.
        List <string> dangerzone = new List <string>();

        foreach (string url in foundFiles)
        {
            string file = url.Split('/').LastOrDefault();

            if (file == null)
            {
                continue;
            }

            foreach (AttackFile attack in _knownAttackFiles)
            {
                if (file.Equals(attack.File, StringComparison.InvariantCultureIgnoreCase))
                {
                    foreach (string attackString in attack.Attacks)
                    {
                        dangerzone.Add(url + attackString);
                    }
                }
            }
        }

        // Note: when filtering is on, URLs that match no attack file are dropped.
        foundFiles = dangerzone;
    }

    return(foundFiles);
}
/// <summary>
/// Issues a GET against <paramref name="uri"/> with the supplied user agent, reads the
/// response body as text, and parses it into a crawl response. When the request body
/// carries an ETag, an If-None-Match header is sent so the server may short-circuit
/// unchanged content.
/// </summary>
/// <param name="uri">The resolved absolute URI to fetch.</param>
/// <param name="userAgent">The User-Agent header value to present.</param>
/// <param name="reqBody">The originating crawl request (supplies ETag and base URL).</param>
/// <returns>The parsed crawl response for the downloaded HTML.</returns>
public static async Task <CrawlResponse> DownloadHtml(Uri uri, string userAgent, CrawlRequest reqBody)
{
    HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(uri);
    httpRequest.Method = "GET";
    httpRequest.UserAgent = userAgent;

    bool hasETag = !string.IsNullOrEmpty(reqBody.ETag);
    if (hasETag)
    {
        httpRequest.Headers.Add(HttpRequestHeader.IfNoneMatch, reqBody.ETag);
    }

    using (HttpWebResponse httpResponse = (HttpWebResponse)await httpRequest.GetResponseAsync())
    using (Stream responseStream = httpResponse.GetResponseStream())
    using (StreamReader streamReader = new StreamReader(responseStream))
    {
        // TODO: look for schema.org
        string pageHtml = await streamReader.ReadToEndAsync();

        // TODO: support microsoft:ds_id
        return HtmlExtractor.Parse(pageHtml, new Uri(reqBody.Url));
    }
}
/// <summary>
/// Answers a crawl request with a CrawlAck describing this node: its local source,
/// sequence version, uptime in seconds, and the proxy servers/clients it currently
/// holds TCP connections to. The ack is sent back over the originating TCP link when
/// the request arrived via TCP; otherwise it is sent to the packet's source address.
/// </summary>
/// <param name="req">The crawl request being acknowledged.</param>
/// <param name="packet">The received packet carrying transport and source details.</param>
public void Send_CrawlAck(CrawlRequest req, G2ReceivedPacket packet)
{
    CrawlAck ack = new CrawlAck();

    ack.Source = GetLocalSource();
    ack.Version = Core.Context.LocalSeqVersion.ToString();
    ack.Uptime = (Core.TimeNow - Core.StartTime).Seconds;

    // Advertise the proxies this node currently knows about.
    foreach (TcpConnect proxy in TcpControl.ProxyServers)
    {
        ack.ProxyServers.Add(new DhtContact(proxy, proxy.RemoteIP));
    }

    foreach (TcpConnect proxy in TcpControl.ProxyClients)
    {
        ack.ProxyClients.Add(new DhtContact(proxy, proxy.RemoteIP));
    }

    if (!packet.ReceivedTcp)
    {
        SendPacket(packet.Source, ack);
        return;
    }

    ack.ToAddress = packet.Source;
    packet.Tcp.SendPacket(ack);
}
/// <summary>
/// Downloads the resource named by <paramref name="reqBody"/>.Url, trying each
/// configured user agent in turn. A cheap HEAD request is issued first so the
/// download can be dispatched on Content-Type: HTML and JSON bodies are fetched
/// and parsed; video/audio/image targets are returned as link-only responses.
/// Known crawl-target-side failures (protocol violations, 403/404/503) cause the
/// next user agent to be tried instead of failing hard.
/// </summary>
/// <param name="reqBody">The crawl request; its Url must be an absolute URI.</param>
/// <returns>The crawl response, or <c>null</c> when the URL is not absolute or the
/// content type is unrecognized.</returns>
/// <exception cref="UnauthorizedAccessException">Thrown when every user agent fails.</exception>
public static async Task <CrawlResponse> Download(CrawlRequest reqBody)
{
    Uri uri;
    if (!Uri.TryCreate(reqBody.Url, UriKind.Absolute, out uri))
    {
        return null;
    }

    foreach (var userAgent in UserAgents)
    {
        var headRequest = (HttpWebRequest)WebRequest.Create(uri);
        headRequest.Method = "HEAD";
        headRequest.UserAgent = userAgent;

        try
        {
            // make sure we only crawl HTML (or other explicitly supported types)
            using (var response = (HttpWebResponse)await headRequest.GetResponseAsync())
            {
                // GetResponseHeader normally returns string.Empty when the header is
                // absent; coalesce defensively so the StartsWith checks cannot throw.
                var contentType = response.GetResponseHeader("Content-Type") ?? string.Empty;

                CrawlResponse result = null;

                // Media types are case-insensitive (RFC 9110), so compare ordinally
                // ignoring case; the branches are mutually exclusive, hence else-if.
                if (string.IsNullOrWhiteSpace(contentType) || contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
                {
                    result = await DownloadHtml(uri, userAgent, reqBody);
                }
                else if (contentType.StartsWith("application/json", StringComparison.OrdinalIgnoreCase))
                {
                    result = await DownloadJson(uri, userAgent, reqBody);
                }
                else if (contentType.StartsWith("video/", StringComparison.OrdinalIgnoreCase) || contentType.StartsWith("audio/", StringComparison.OrdinalIgnoreCase))
                {
                    // Media targets are not downloaded; return a link-only response.
                    result = new CrawlResponse { Video = reqBody.Url };
                }
                else if (contentType.StartsWith("image/", StringComparison.OrdinalIgnoreCase))
                {
                    result = new CrawlResponse { Image = reqBody.Url };
                }

                return result;
            }
        }
        catch (WebException we)
        {
            HttpWebResponse httpResponse = we.Response as HttpWebResponse;

            if (we.Status == WebExceptionStatus.ServerProtocolViolation)
            {
                // Get a little more telemetry about what is going on here, though most
                // cases don't have a Response object.
                IDictionary <string, string> traceData = new Dictionary <string, string>()
                {
                    { "HasResponse", (we.Response != null).ToString() }
                };

                if (we.Response != null)
                {
                    traceData["Response.SupportsHeaders"] = we.Response.SupportsHeaders.ToString();

                    if (we.Response.SupportsHeaders)
                    {
                        for (int i = 0; i < we.Response.Headers.Count; i++)
                        {
                            string headerName = we.Response.Headers.GetKey(i);
                            string headerValue = we.Response.Headers.Get(i);
                            traceData[$"Response.Headers.{headerName}"] = headerValue;
                        }
                    }

                    if (httpResponse != null)
                    {
                        traceData["HttpResponse.StatusCode"] = httpResponse.StatusCode.ToString();
                    }
                }

                Services.TelemetryClient.TrackTrace($"Download target ({uri}) ServerProtocolViolation", SeverityLevel.Error, traceData);

                // Ignore known cases where crawl fails due to error on the crawl-target
                // side - these should not cause a hard failure on our end.
                continue;
            }

            if (httpResponse != null)
            {
                // Ignore known cases where crawl fails due to error on the crawl-target
                // side - these should not cause a hard failure on our end.
                if (httpResponse.StatusCode == HttpStatusCode.Forbidden ||
                    httpResponse.StatusCode == HttpStatusCode.NotFound ||
                    httpResponse.StatusCode == HttpStatusCode.ServiceUnavailable)
                {
                    continue;
                }
            }

            throw;
        }
    }

    throw new UnauthorizedAccessException("Unable to access HTTP endpoint");
}
/// <summary>
/// Builds a crawl request aimed at <paramref name="target"/> and transmits it to the
/// given DHT address.
/// </summary>
/// <param name="address">The DHT address to send the request to.</param>
/// <param name="target">The DHT client the crawl request targets.</param>
public void Send_CrawlRequest(DhtAddress address, DhtClient target)
{
    CrawlRequest crawl = new CrawlRequest { Target = target };

    SendPacket(address, crawl);
}
/// <summary>
/// Determines the character encoding of a fetched web page (when no rendering was
/// requested) and populates the crawl request's Encoding, Html and DecodedHtml.
/// Resolution order: charset from the 'Content-Type' response header, then a
/// content-based detection via DetermineEncoding, then a UTF-8 fallback (with both
/// failures recorded via the DAO).
/// </summary>
/// <param name="crawlRequest">The crawl request whose downloaded data is to be decoded.</param>
/// <param name="arachnodeDAO">DAO used to persist encoding-detection exceptions.</param>
public override void ProcessCrawlRequest(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    //Rendering determines the Encoding...
    if (crawlRequest.RenderType == RenderType.None)
    {
        if (crawlRequest.DataType.DiscoveryType == DiscoveryType.WebPage)
        {
            // Extract the charset token from the Content-Type header.
            // NOTE(review): splitting on '=' assumes the header has exactly one
            // 'charset=<name>' parameter (e.g. "text/html; charset=utf-8") — a header
            // with additional '='-bearing parameters would leave contentType null.
            string contentType = null;

            if (crawlRequest.WebClient.HttpWebResponse.Headers["Content-Type"] != null)
            {
                string[] contentTypeHeader = crawlRequest.WebClient.HttpWebResponse.Headers["Content-Type"].Split('=');

                if (contentTypeHeader.Length == 2)
                {
                    // Normalize the non-IANA spelling "utf8" to "utf-8" for GetEncoding.
                    contentType = contentTypeHeader[1].Replace("utf8", "utf-8");
                }
            }

            Encoding encoding = null;
            string decodedHtml = null;

            try
            {
                //first, try and get the Encoding from the 'Content-Type'...
                if (!string.IsNullOrEmpty(contentType))
                {
                    encoding = Encoding.GetEncoding(contentType);
                }
                else
                {
                    decodedHtml = DetermineEncoding(crawlRequest, out encoding);
                }
            }
            catch (Exception exception)
            {
                try
                {
                    //if there is an error, try and get the Encoding from the 'Charset'...
                    decodedHtml = DetermineEncoding(crawlRequest, out encoding);
                }
                catch (Exception exception2)
                {
                    //if there is an error, default to UTF8.
                    arachnodeDAO.InsertException(crawlRequest.Discovery.Uri.AbsoluteUri, null, exception, false);
                    arachnodeDAO.InsertException(crawlRequest.Discovery.Uri.AbsoluteUri, null, exception2, false);

                    encoding = Encoding.UTF8;
                }
            }

            crawlRequest.Encoding = encoding;

            // NOTE(review): this is a reference comparison against the Encoding.UTF8
            // singleton; an equivalent UTF-8 encoding returned by Encoding.GetEncoding
            // may be a different instance and fall through to re-decoding from Data —
            // confirm DetermineEncoding returns the Encoding.UTF8 instance.
            if (encoding == Encoding.UTF8 && decodedHtml != null)
            {
                // Reuse the already-decoded text from DetermineEncoding.
                crawlRequest.DecodedHtml = HttpUtility.HtmlDecode(decodedHtml);
                crawlRequest.Html = decodedHtml;
            }
            else
            {
                // Decode the raw response bytes with the resolved encoding.
                crawlRequest.DecodedHtml = HttpUtility.HtmlDecode(encoding.GetString(crawlRequest.Data));
                crawlRequest.Html = encoding.GetString(crawlRequest.Data);
            }
        }
    }
}
/// <summary>
/// Intentionally empty override: this implementation does not send crawl-request
/// response messages to crawler peers. NOTE(review): presumably peer-to-peer
/// responses are unsupported in this configuration — confirm against the base class
/// contract before relying on this being a safe no-op.
/// </summary>
/// <param name="crawlerPeer">The peer that would receive the response.</param>
/// <param name="crawlRequest">The crawl request the response would describe.</param>
/// <param name="arachnodeDAO">The arachnode DAO (unused).</param>
public override void SendCrawlRequestResponseMessageToCrawlerPeer(CrawlerPeer crawlerPeer, CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
}