/// <summary>
/// Converts the current page into a <see cref="WebList"/>.
/// </summary>
/// <returns>
/// The value produced by <c>PageFragment.ToWebList()</c> for a fragment built
/// from <c>Driver</c> and the element returned by <c>GetPageBody()</c>.
/// </returns>
public WebList ToWebList()
{
    // The body element is fetched first, then wrapped in a fragment that does the conversion.
    var pageBody = GetPageBody();
    return new PageFragment(Driver, pageBody).ToWebList();
}
/// <summary>
/// Converts the current page into a <see cref="WebList"/>.
/// </summary>
/// <returns>
/// The value produced by <c>PageFragment.ToWebList()</c> for a fragment built
/// from <c>Driver</c> and the element the driver finds for the tag name "body".
/// </returns>
public WebList ToWebList()
{
    // Look up the <body> element through the driver, then let the fragment do the conversion.
    var bodyElement = Driver.FindElementByTagName("body");
    return new PageFragment(Driver, bodyElement).ToWebList();
}
/// <summary>
/// Pushes the current crawl status to <c>MainForm</c> and bumps the problem
/// counters for every limit the given page fragment violates.
/// </summary>
/// <param name="pf">Page fragment whose titles, descriptions, address, headings, size and content type are inspected.</param>
private void UpdateCountersList(ref PageFragment pf)
{
    // Status is reported before visitedPages is incremented at the end of this method.
    MainForm.UpdateSemaphoresStatus(semaphore.CurrentCount);
    MainForm.UpdateVisitedPagesStatus(visitedPages, pagesTovisit);

    foreach (var pageTitle in pf.Titles)
    {
        bool titleCharsOutOfRange = pageTitle.TitleLength > TitleCharMax || pageTitle.TitleLength < TitleCharMin;
        if (titleCharsOutOfRange)
        {
            MainForm.IncreaseTitleCharProblemsCounter();
        }

        bool titlePixelsOutOfRange = pageTitle.TitlePixelWidth > TitlePixMax || pageTitle.TitlePixelWidth < TitlePixMin;
        if (titlePixelsOutOfRange)
        {
            MainForm.IncreaseTitlePixelProblemsCounter();
        }
    }

    // NOTE(review): descriptions are checked against the Title* limits, exactly as before — confirm this is intended.
    foreach (var description in pf.MetaDescriptions)
    {
        bool descCharsOutOfRange = description.MetaDescriptionLength > TitleCharMax || description.MetaDescriptionLength < TitleCharMin;
        if (descCharsOutOfRange)
        {
            MainForm.IncreaseDescCharProblemsCounter();
        }

        bool descPixelsOutOfRange = description.MetaDescriptionPixelWidth > TitlePixMax || description.MetaDescriptionPixelWidth < TitlePixMin;
        if (descPixelsOutOfRange)
        {
            MainForm.IncreaseDescPixelProblemsCounter();
        }
    }

    if (pf.Address.Length > UrlCharMax)
    {
        MainForm.IncreaseUrlProblemsCounter();
    }

    foreach (var headingOne in pf.HeadingsOne)
    {
        if (headingOne.HeadingOneLength > H1CharMax)
        {
            MainForm.IncreaseHeadOneProblemsCounter();
        }
    }

    foreach (var headingTwo in pf.HeadingsTwo)
    {
        if (headingTwo.HeadingTwoLength > H2CharMax)
        {
            MainForm.IncreaseHeadTwoProblemsCounter();
        }
    }

    // The content type is only inspected once the size limit is exceeded.
    if (pf.Size > ImgSizeMax && pf.ContentType.Contains("image"))
    {
        MainForm.IncreaseImgProblemsCounter();
        Debug.WriteLine("Imageee: " + pf.Address);
    }

    visitedPages++;
}
/// <summary>
/// Hands the <c>href</c> value of every anchor element in the document to
/// <c>TryCrawlingNextPage</c>.
/// </summary>
/// <param name="htmlDocument">Parsed document whose <c>a</c> descendants are scanned.</param>
/// <param name="pf">Page fragment of the page being scanned; passed through to <c>TryCrawlingNextPage</c>.</param>
private void CrawlThroughAnchors(HtmlDocument htmlDocument, ref PageFragment pf)
{
    // The node list is materialised before any crawl is started.
    foreach (HtmlNode anchorNode in htmlDocument.DocumentNode.Descendants("a").ToList())
    {
        // A missing href yields an empty string.
        string href = anchorNode.GetAttributeValue("href", String.Empty);
        TryCrawlingNextPage(href, ref pf);
    }
}
/// <summary>
/// Hands the <c>src</c> value of every iframe element in the document to
/// <c>TryCrawlingNextPage</c>.
/// </summary>
/// <param name="htmlDocument">Parsed document whose <c>iframe</c> descendants are scanned.</param>
/// <param name="pf">Page fragment of the page being scanned; passed through to <c>TryCrawlingNextPage</c>.</param>
private void CrawlThroughIframes(HtmlDocument htmlDocument, ref PageFragment pf)
{
    // The node list is materialised before any crawl is started.
    foreach (HtmlNode frameNode in htmlDocument.DocumentNode.Descendants("iframe").ToList())
    {
        // A missing src yields an empty string.
        string source = frameNode.GetAttributeValue("src", String.Empty);
        TryCrawlingNextPage(source, ref pf);
    }
}
/// <summary>
/// Records a link found on the page described by <paramref name="pf"/> and,
/// when the target has not been crawled yet and the crawl limit allows it,
/// starts crawling the target.
/// </summary>
/// <remarks>
/// A link that cannot be parsed as an absolute URI after normalization is
/// ignored. Previously <c>new Uri(address)</c> threw
/// <see cref="UriFormatException"/> for such a link, which unwound through
/// the caller and aborted processing of the whole referring page.
/// </remarks>
/// <param name="address">Raw link value; normalized in place by <c>NormalizeAddress</c>.</param>
/// <param name="pf">Page fragment of the referring page; its out-link counters and lists are updated.</param>
private void TryCrawlingNextPage(string address, ref PageFragment pf)
{
    // Rewrite the address into a full absolute URL
    NormalizeAddress(BaseUrl, ref address, pf.Address);

    // The address is null when normalization did not produce a usable value
    if (address == null)
    {
        return;
    }

    // Parse once and reuse; a malformed link is skipped instead of throwing
    if (!Uri.TryCreate(address, UriKind.Absolute, out Uri target))
    {
        return;
    }

    bool sameHost = Uri.Compare(BaseUrl, target, UriComponents.Host, UriFormat.SafeUnescaped, StringComparison.CurrentCulture) == 0;
    if (sameHost)
    {
        pf.OutLinks++;
        if (!pf.OutLinksAdresses.Contains(address))
        {
            pf.OutLinksAdresses.Add(address);
            pf.UniqueOutLinks++;
        }

        // Single lookup instead of ContainsKey followed by repeated indexer calls
        if (!inLinksData.TryGetValue(address, out InLinksCounter inbound))
        {
            inbound = new InLinksCounter();
            inLinksData.Add(address, inbound);
        }

        inbound.InLinksCount++;
        inbound.UniqueInLinks.Add(pf.Address);
    }
    else
    {
        pf.ExternalOutLinks++;
        if (!pf.ExternalOutLinksAdresses.Contains(address))
        {
            pf.ExternalOutLinksAdresses.Add(address);
            pf.UniqueExternalOutLinks++;
        }
    }

    // Skip targets that were already handed to the crawler
    if (crawledPages.Contains(target))
    {
        return;
    }

    // Stop scheduling once the total crawl limit is reached
    if (crawledPages.Count >= Utils.TotalCrawlLimit)
    {
        return;
    }

    pagesTovisit++;

    // Fire and forget: the crawl task is intentionally not awaited here
    _ = StartCrawlingPage(target, cts.Token);
}
/// <summary>
/// Fills a page fragment with the data recorded for a page that is not
/// analyzed as internal: address, content type, status and URL depth.
/// </summary>
/// <param name="pf">Fragment to populate.</param>
/// <param name="response">HTTP response received for <paramref name="page"/>.</param>
/// <param name="page">Address of the page the response belongs to.</param>
private static void ManagePageFragmentIfExternal(ref PageFragment pf, ref HttpResponseMessage response, Uri page)
{
    pf.Address = page.AbsoluteUri;
    pf.IsInternal = false;

    // The Content-Type header is optional; fall back to an empty string
    // instead of dereferencing a null header value.
    pf.ContentType = response.Content.Headers.ContentType?.MediaType ?? String.Empty;

    pf.StatusCode = ((int)response.StatusCode).ToString();
    pf.Status = response.StatusCode.ToString();

    // Same value the previous per-segment counting loop produced (start at -1, +1 per segment).
    pf.UrlDepth = page.Segments.Length - 1;
}
/// <summary>
/// Hands the <c>src</c> value of every script element whose <c>type</c>
/// attribute equals "text/javascript" to <c>TryCrawlingNextPage</c>.
/// </summary>
/// <param name="htmlDocument">Parsed document whose <c>script</c> descendants are scanned.</param>
/// <param name="pf">Page fragment of the page being scanned; passed through to <c>TryCrawlingNextPage</c>.</param>
private void CrawlThroughScripts(HtmlDocument htmlDocument, ref PageFragment pf)
{
    foreach (HtmlNode scriptNode in htmlDocument.DocumentNode.Descendants("script").ToList())
    {
        // Script elements without the exact type value (including those with no type attribute) are skipped.
        string scriptType = scriptNode.GetAttributeValue("type", "null");
        if (scriptType != "text/javascript")
        {
            continue;
        }

        string source = scriptNode.GetAttributeValue("src", String.Empty);
        TryCrawlingNextPage(source, ref pf);
    }
}
/// <summary>
/// Hands the <c>src</c> value of every image element, with anything from the
/// first "?" onwards removed, to <c>TryCrawlingNextPage</c>.
/// </summary>
/// <param name="htmlDocument">Parsed document whose <c>img</c> descendants are scanned.</param>
/// <param name="pf">Page fragment of the page being scanned; passed through to <c>TryCrawlingNextPage</c>.</param>
private void CrawlThroughImages(HtmlDocument htmlDocument, ref PageFragment pf)
{
    foreach (HtmlNode imageNode in htmlDocument.DocumentNode.Descendants("img").ToList())
    {
        string source = imageNode.GetAttributeValue("src", String.Empty);

        // Cut the query string off so only the part before "?" is crawled.
        if (source.Contains("?"))
        {
            source = source.Substring(0, source.IndexOf("?"));
        }

        TryCrawlingNextPage(source, ref pf);
    }
}
/// <summary>
/// Collects the <c>h2</c> elements of the document into
/// <c>pf.HeadingsTwo</c>, storing each heading's inner text and its length.
/// </summary>
/// <param name="pf">Fragment whose <c>HeadingsTwo</c> list receives the entries.</param>
/// <param name="htmlDocument">Parsed document to scan.</param>
private static void AnalyzeHeadingsTwo(ref PageFragment pf, ref HtmlDocument htmlDocument)
{
    foreach (HtmlNode node in htmlDocument.DocumentNode.Descendants("h2").ToList())
    {
        var entry = new HeadingTwo { HeadingTwoText = node.InnerText };
        entry.HeadingTwoLength = entry.HeadingTwoText.Length;
        pf.HeadingsTwo.Add(entry);

        // Collection stops as soon as the list size equals the configured maximum.
        if (pf.HeadingsTwo.Count == Utils.MaxHeadsTwo)
        {
            break;
        }
    }
}
/// <summary>
/// Collects the <c>h1</c> elements of the document into
/// <c>pf.HeadingsOne</c>, storing each heading's inner text and its length.
/// </summary>
/// <param name="pf">Fragment whose <c>HeadingsOne</c> list receives the entries.</param>
/// <param name="htmlDocument">Parsed document to scan.</param>
private static void AnalyzeHeadingsOne(ref PageFragment pf, ref HtmlDocument htmlDocument)
{
    foreach (HtmlNode node in htmlDocument.DocumentNode.Descendants("h1").ToList())
    {
        var entry = new HeadingOne { HeadingOneText = node.InnerText };
        entry.HeadingOneLength = entry.HeadingOneText.Length;
        pf.HeadingsOne.Add(entry);

        // Collection stops as soon as the list size equals the configured maximum.
        if (pf.HeadingsOne.Count == Utils.MaxHeadsOne)
        {
            break;
        }
    }
}
/// <summary>
/// Scans the document's <c>meta</c> elements and records robots
/// indexability, meta descriptions and meta keywords on the page fragment.
/// </summary>
/// <remarks>
/// The font used to measure a description's pixel width is now disposed
/// after each measurement; previously one <see cref="Font"/> per description
/// was created and never released.
/// </remarks>
/// <param name="pf">Fragment that receives the extracted data.</param>
/// <param name="htmlDocument">Parsed document to scan.</param>
private static void AnalyzeMetas(ref PageFragment pf, ref HtmlDocument htmlDocument)
{
    List<HtmlNode> metas = htmlDocument.DocumentNode.Descendants("meta").ToList();
    foreach (HtmlNode meta in metas)
    {
        // Read the name attribute once instead of once per branch.
        string metaName = meta.GetAttributeValue("name", "null");

        if (metaName == "robots")
        {
            // NOTE(review): only a content value of exactly "noindex" matches;
            // a combined value such as "noindex, nofollow" does not.
            if (meta.GetAttributeValue("content", "null") == "noindex")
            {
                pf.Indexability = "Non-Indexable";
                pf.IndexabilityStatus = "Noindex";
            }
        }
        else if (metaName == "description")
        {
            // Descriptions found once the list size equals the limit are ignored.
            if (pf.MetaDescriptions.Count == Utils.MaxDescs)
            {
                continue;
            }

            MetaDescription metaDesc = new MetaDescription();
            metaDesc.MetaDescriptionText = meta.GetAttributeValue("content", "");
            metaDesc.MetaDescriptionLength = metaDesc.MetaDescriptionText.Length;

            // Font wraps a native GDI handle, so release it deterministically.
            using (Font measureFont = new Font("Arial", 13.0F))
            {
                metaDesc.MetaDescriptionPixelWidth = System.Windows.Forms.TextRenderer
                    .MeasureText(metaDesc.MetaDescriptionText, measureFont).Width;
            }

            pf.MetaDescriptions.Add(metaDesc);
        }
        else if (metaName == "keywords")
        {
            // Keywords found once the list size equals the limit are ignored.
            if (pf.MetaKeywords.Count == Utils.MaxKeywords)
            {
                continue;
            }

            MetaKeywords metaKey = new MetaKeywords();
            metaKey.MetaKeywordsText = meta.GetAttributeValue("content", "");
            metaKey.MetaKeywordsLength = metaKey.MetaKeywordsText.Length;
            pf.MetaKeywords.Add(metaKey);
        }
    }
}
/// <summary>
/// Hands the <c>href</c> value of every link element whose <c>rel</c>
/// attribute equals "stylesheet", with anything from the first "?" onwards
/// removed, to <c>TryCrawlingNextPage</c>.
/// </summary>
/// <param name="htmlDocument">Parsed document whose <c>link</c> descendants are scanned.</param>
/// <param name="pf">Page fragment of the page being scanned; passed through to <c>TryCrawlingNextPage</c>.</param>
private void CrawlThroughLinks(HtmlDocument htmlDocument, ref PageFragment pf)
{
    foreach (HtmlNode linkNode in htmlDocument.DocumentNode.Descendants("link").ToList())
    {
        // Only stylesheet links are of interest.
        string relation = linkNode.GetAttributeValue("rel", "null");
        if (relation != "stylesheet")
        {
            continue;
        }

        string target = linkNode.GetAttributeValue("href", String.Empty);

        // Cut the query string off so only the part before "?" is crawled.
        if (target.Contains("?"))
        {
            target = target.Substring(0, target.IndexOf("?"));
        }

        TryCrawlingNextPage(target, ref pf);
    }
}
/// <summary>
/// Follows the links of a page: anchors always, then stylesheets, scripts,
/// iframes and images, each only when its <c>Utils.Crawl*</c> switch is on.
/// </summary>
/// <param name="htmlDocument">Parsed document of the page being crawled.</param>
/// <param name="pf">Page fragment of that page; passed through to each crawl step.</param>
private void CrawlFurther(HtmlDocument htmlDocument, ref PageFragment pf)
{
    // Anchors are crawled unconditionally.
    CrawlThroughAnchors(htmlDocument, ref pf);

    // Optional resource kinds, in a fixed order.
    if (Utils.CrawlCss)
    {
        CrawlThroughLinks(htmlDocument, ref pf);
    }

    if (Utils.CrawlJavaScript)
    {
        CrawlThroughScripts(htmlDocument, ref pf);
    }

    if (Utils.CrawlIframes)
    {
        CrawlThroughIframes(htmlDocument, ref pf);
    }

    if (Utils.CrawlImages)
    {
        CrawlThroughImages(htmlDocument, ref pf);
    }
}
/// <summary>
/// Collects the <c>title</c> elements of the document into
/// <c>pf.Titles</c>, storing each title's text, character length and pixel
/// width as measured with Arial 16.
/// </summary>
/// <remarks>
/// A single measuring font is created for the whole loop and disposed
/// afterwards; previously one <see cref="Font"/> per title was created and
/// never released.
/// </remarks>
/// <param name="pf">Fragment whose <c>Titles</c> list receives the entries.</param>
/// <param name="htmlDocument">Parsed document to scan.</param>
private static void AnalyzeTitles(ref PageFragment pf, ref HtmlDocument htmlDocument)
{
    List<HtmlNode> htmlTitles = htmlDocument.DocumentNode.Descendants("title").ToList();

    // Nothing to measure, so do not allocate a font at all.
    if (htmlTitles.Count == 0)
    {
        return;
    }

    // Font wraps a native GDI handle, so release it deterministically.
    using (Font measureFont = new Font("Arial", 16.0F))
    {
        foreach (HtmlNode htmlTitle in htmlTitles)
        {
            Title title = new Title();
            title.TitleText = htmlTitle.InnerText;
            title.TitleLength = title.TitleText.Length;
            title.TitlePixelWidth = System.Windows.Forms.TextRenderer
                .MeasureText(title.TitleText, measureFont).Width;
            pf.Titles.Add(title);

            // Collection stops as soon as the list size equals the configured maximum.
            if (pf.Titles.Count == Utils.MaxTitles)
            {
                break;
            }
        }
    }
}
/// <summary>
/// Default implementation: writes nothing and returns
/// <c>TypeConstants.FalseTask</c>. Overrides may handle specific fragment types.
/// </summary>
/// <param name="scope">Current script scope (unused here).</param>
/// <param name="fragment">Fragment to write (unused here).</param>
/// <param name="token">Cancellation token (unused here).</param>
/// <returns><c>TypeConstants.FalseTask</c>.</returns>
public virtual Task<bool> WritePageFragmentAsync(ScriptScopeContext scope, PageFragment fragment, CancellationToken token)
{
    return TypeConstants.FalseTask;
}
/// <summary>
/// Writes a string, variable or block fragment to the scope's output.
/// </summary>
/// <param name="scope">Scope providing the output stream, scoped params and page result.</param>
/// <param name="fragment">Fragment to write.</param>
/// <param name="token">Cancellation token passed to every write.</param>
/// <returns><c>true</c> when the fragment is one of the three handled types; otherwise <c>false</c>.</returns>
/// <exception cref="NotSupportedException">
/// The page binding is written while <c>PageResult.PageProcessed</c> is already set.
/// </exception>
public override async Task<bool> WritePageFragmentAsync(ScriptScopeContext scope, PageFragment fragment, CancellationToken token)
{
    switch (fragment)
    {
        case PageStringFragment stringFragment:
            await scope.OutputStream.WriteAsync(stringFragment.ValueUtf8, token);
            return true;

        case PageVariableFragment variableFragment:
            // The page binding is special-cased unless the scope carries the partial argument.
            bool rendersPage = variableFragment.Binding?.Equals(ScriptConstants.Page) == true
                               && !scope.ScopedParams.ContainsKey(ScriptConstants.PartialArg);
            if (!rendersPage)
            {
                await scope.PageResult.WriteVarAsync(scope, variableFragment, token);
                return true;
            }

            if (scope.PageResult.PageProcessed)
            {
                throw new NotSupportedException("{{page}} can only be called once per render, in the Layout page.");
            }

            scope.PageResult.PageProcessed = true;
            await scope.PageResult.WritePageAsync(scope.PageResult.Page, scope.PageResult.CodePage, scope, token);

            // A halt raised while writing the page is cleared so the layout keeps evaluating.
            if (scope.PageResult.HaltExecution)
            {
                scope.PageResult.HaltExecution = false;
            }

            return true;

        case PageBlockFragment blockFragment:
            var block = scope.PageResult.GetBlock(blockFragment.Name);
            await block.WriteAsync(scope, blockFragment, token);
            return true;

        default:
            return false;
    }
}
/// <summary>
/// Writes the statements of a JS block statement fragment; other fragment
/// types are not handled here.
/// </summary>
/// <param name="scope">Scope to write with; replaced by a scope targeting <c>Stream.Null</c> when the fragment is quiet.</param>
/// <param name="fragment">Fragment to write.</param>
/// <param name="token">Cancellation token passed to the statement writer.</param>
/// <returns><c>true</c> when the fragment is a <c>PageJsBlockStatementFragment</c>; otherwise <c>false</c>.</returns>
public override async Task<bool> WritePageFragmentAsync(ScriptScopeContext scope, PageFragment fragment, CancellationToken token)
{
    // Taken from the incoming scope, before it is possibly swapped for a silent one.
    var pageResult = scope.PageResult;

    if (!(fragment is PageJsBlockStatementFragment jsBlock))
    {
        return false;
    }

    var statements = jsBlock.Block.Statements;

    // Quiet blocks are redirected to Stream.Null unless the scope already writes there.
    var writeScope = jsBlock.Quiet && scope.OutputStream != Stream.Null
        ? scope.ScopeWithStream(Stream.Null)
        : scope;

    await pageResult.WriteStatementsAsync(writeScope, statements, token);
    return true;
}
/// <summary>
/// Appends one row describing the given page fragment to the data table
/// <c>dt</c>. Optional columns are written only when the matching
/// <c>Utils.Extract*</c> switch is on or the value is positive.
/// </summary>
/// <param name="pf">Page fragment to turn into a table row.</param>
private void UpdateDataTable(PageFragment pf)
{
    DataRow newRow = dt.NewRow();

    // Columns that are always written.
    newRow[ADDRESS_COL] = pf.Address;
    newRow[CONTET_TYPE_COL] = pf.ContentType;
    newRow[STATUS_CODE_COL] = pf.StatusCode;
    newRow[STATUS_COL] = pf.Status;

    if (Utils.ExtractIndexability)
    {
        newRow[INDEXABILITY_COL] = pf.Indexability;
        newRow[INDEXABILITY_STATUS_COL] = pf.IndexabilityStatus;
    }

    newRow[ISINTERNAL_COL] = pf.IsInternal;

    if (Utils.ExtractPageSize)
    {
        newRow[SIZE_COL] = pf.Size;
    }

    if (Utils.ExtractHash)
    {
        newRow[ISDUPLICATE_COL] = pf.IsDuplicate;
    }

    if (pf.WordCount > 0 && Utils.ExtractWordCount)
    {
        newRow[WORD_COUNT_COL] = pf.WordCount;
    }

    if (pf.TextRatio > 0 && Utils.ExtractTxtCodeRatio)
    {
        newRow[TEXT_RATIO_COL] = pf.TextRatio.ToString("F");
    }

    newRow[URL_DEPTH_COL] = pf.UrlDepth;

    // Multi-valued data is written by dedicated helpers.
    HandleTitles(ref newRow, pf.Titles);
    HandleDesc(ref newRow, pf.MetaDescriptions);
    HandleKeywords(ref newRow, pf.MetaKeywords);

    if (Utils.ExtractH1)
    {
        HandleHeadsOne(ref newRow, pf.HeadingsOne);
    }

    if (Utils.ExtractH2)
    {
        HandleHeadsTwo(ref newRow, pf.HeadingsTwo);
    }

    // Link counters are left empty when there is nothing to report.
    if (pf.OutLinks > 0)
    {
        newRow[OUTLINKS_COL] = pf.OutLinks;
        newRow[UNIQUE_OUTLINKS_COL] = pf.UniqueOutLinks;
    }

    if (pf.ExternalOutLinks > 0)
    {
        newRow[EXTERNAL_OUTLIKNS_COL] = pf.ExternalOutLinks;
        newRow[UNIQUE_EXTERNAL_OUTLIKNS_COL] = pf.UniqueExternalOutLinks;
    }

    if (inLinksData.ContainsKey(pf.Address))
    {
        var inbound = inLinksData[pf.Address];
        newRow[INLINKS_COL] = inbound.InLinksCount;
        newRow[UNIQUE_INLINKS_COL] = inbound.UniqueInLinks.Count;

        // Unique in-links as a percentage of all in-links.
        float uniqueShare = ((float)inbound.UniqueInLinks.Count / (float)inbound.InLinksCount) * 100;
        newRow[UNIQUE_INLINKS_OF_TOTAL_COL] = uniqueShare.ToString("F");
    }

    if (Utils.ExtractHash)
    {
        newRow[HASH_VALUE_COL] = pf.Hash;
    }

    dt.Rows.Add(newRow);
}
// One HttpClient shared by all crawl tasks. Creating a client per request,
// as was done before, never releases its connections and can exhaust sockets.
private static readonly HttpClient sharedHttpClient = new HttpClient();

/// <summary>
/// Downloads one page, analyzes it, adds it to the data table and updates
/// the counters. Concurrency is bounded by <c>semaphore</c>.
/// </summary>
/// <remarks>
/// Changes compared with the previous version:
/// the HTTP client is shared instead of created per call; the response is
/// disposed; the semaphore slot is released in a <c>finally</c> block; the
/// <paramref name="ctsToken"/> parameter is now the token used while waiting
/// for the semaphore; and the log message that was split across two physical
/// lines is a single literal with a <c>\n</c> escape.
/// </remarks>
/// <param name="page">Absolute address of the page to crawl.</param>
/// <param name="ctsToken">Token that cancels the wait for a free semaphore slot.</param>
/// <returns>A task that completes when the page has been processed or skipped.</returns>
private async Task StartCrawlingPage(Uri page, CancellationToken ctsToken)
{
    MainForm.UpdateSemaphoresStatus(semaphore.CurrentCount);

    // Mark the page as taken before waiting, so it is not scheduled twice.
    crawledPages.Add(page);

    // Wait for a free slot. Kept outside the try/finally below so that
    // Release is only called after a successful acquire.
    await this.semaphore.WaitAsync(ctsToken);
    try
    {
        // Skip the work entirely when the crawl has been cancelled
        if (!cts.IsCancellationRequested)
        {
            PageFragment pf = new PageFragment {
                Address = page.AbsoluteUri
            };

            // Download page. The response is passed by ref further down, so it
            // is disposed in a finally block rather than a using statement.
            HttpResponseMessage response = await sharedHttpClient.GetAsync(page);
            try
            {
                // Check whether page is internal or external (same host as BaseUrl)
                if (Uri.Compare(BaseUrl, page, UriComponents.Host, UriFormat.SafeUnescaped, StringComparison.CurrentCulture) == 0)
                {
                    // Get page source
                    string sourceHtml = await response.Content.ReadAsStringAsync();
                    HtmlDocument htmlDocument = new HtmlDocument();
                    htmlDocument.LoadHtml(sourceHtml);

                    // Only follow links while the url depth is within the configured limit
                    if (page.Segments.Length - 1 <= Utils.CrawlDepthLimit)
                    {
                        // Starts crawl tasks for the urls found on this page
                        CrawlFurther(htmlDocument, ref pf);
                    }

                    // Fill PageFragment with data
                    ManagePageFragment(ref pf, ref response, ref htmlDocument, page);

                    // Update data source
                    UpdateDataTable(pf);
                }
                else
                {
                    // Fill PageFragment with data
                    ManagePageFragmentIfExternal(ref pf, ref response, page);

                    // Update data source
                    UpdateDataTable(pf);
                }
            }
            catch (UriFormatException ex)
            {
                Debug.WriteLine(" Podstrona: " + page + " ma niepoprawnie sformatowany url. \nMessage: " + ex.Message);
            }
            catch (WebException ex) when ((ex.Response as HttpWebResponse)?.StatusCode == HttpStatusCode.NotFound)
            {
                pf.StatusCode = "404";
                Debug.WriteLine(" strona " + page + " jest niedostepna -> 404 NotFound");
            }
            catch (WebException ex)
            {
                try
                {
                    string status = (ex.Response as HttpWebResponse)?.StatusCode.ToString();
                    pf.StatusCode = status;
                    Debug.WriteLine(" strona " + page + " WebEx: " + status);
                }
                catch (NullReferenceException e)
                {
                    pf.StatusCode = "Undefined";
                    Debug.WriteLine(" strona " + page + " WebEx: Undefined. Message: " + e);
                }
            }
            catch (Exception ex)
            {
                Debug.WriteLine(" strona " + page + " spotkala niezdefiniowany (nieobsłużony indywidualnie) wyjątek: " + ex.Message);
            }
            finally
            {
                response.Dispose();
            }

            UpdateCountersList(ref pf);
        }
    }
    catch (OperationCanceledException)
    {
        Debug.WriteLine("Task anulowany");
    }
    catch (Exception)
    {
        Debug.WriteLine("Task sie nie powiodl");
    }
    finally
    {
        // Always hand the slot back, whatever happened above.
        this.semaphore.Release();
    }
}
/// <summary>
/// Fills a page fragment for an internal page: response metadata, optional
/// size, hash/duplicate flag, word count and text ratio, URL depth,
/// indexability derived from the status code, and the results of the meta,
/// title and heading analyzers.
/// </summary>
/// <remarks>
/// A response without a <c>Content-Type</c> header now yields an empty
/// content type instead of a <see cref="NullReferenceException"/>.
/// </remarks>
/// <param name="pf">Fragment to populate.</param>
/// <param name="response">HTTP response received for <paramref name="page"/>.</param>
/// <param name="htmlDocument">Parsed source of the page.</param>
/// <param name="page">Address of the page.</param>
private void ManagePageFragment(ref PageFragment pf, ref HttpResponseMessage response, ref HtmlDocument htmlDocument, Uri page)
{
    pf.Address = page.AbsoluteUri;
    pf.IsInternal = true;

    // The Content-Type header is optional; fall back to an empty string.
    pf.ContentType = response.Content.Headers.ContentType?.MediaType ?? String.Empty;

    pf.StatusCode = ((int)response.StatusCode).ToString();
    pf.Status = response.StatusCode.ToString();

    if (Utils.ExtractPageSize)
    {
        // A missing Content-Length is recorded as 0.
        pf.Size = response.Content.Headers.ContentLength.GetValueOrDefault();
    }

    if (Utils.ExtractHash)
    {
        pf.Hash = htmlDocument.Text.GetHashCode();
        string hash = pf.Hash.GetHashCode().ToString();

        // First already-stored row with the same hash value, if any.
        DataRow duplicateRow = dt.Rows.OfType<DataRow>()
            .FirstOrDefault(row => row[HASH_VALUE_COL].ToString() == hash);
        if (duplicateRow != null)
        {
            // Flag both the earlier row and the current page.
            duplicateRow[ISDUPLICATE_COL] = true;
            pf.IsDuplicate = true;
        }
    }

    if (Utils.ExtractWordCount || Utils.ExtractTxtCodeRatio)
    {
        List<HtmlNode> bodies = htmlDocument.DocumentNode.Descendants("body").ToList();
        int nonHtmlCharsCount = 0;
        foreach (HtmlNode body in bodies)
        {
            pf.WordCount += Utils.CountWords(body.InnerText);
            nonHtmlCharsCount += body.InnerText.Length;
        }

        if (Utils.ExtractTxtCodeRatio)
        {
            // Body text length relative to the full document length; 0 for an empty document.
            if (htmlDocument.Text.Length != 0)
            {
                pf.TextRatio = (float)nonHtmlCharsCount / (float)htmlDocument.Text.Length;
            }
            else
            {
                pf.TextRatio = 0;
            }
        }
    }

    pf.UrlDepth = page.Segments.Length - 1;

    if (pf.StatusCode != "200")
    {
        pf.Indexability = "Non-indexable";

        // Other non-200 codes (1xx, 2xx) leave IndexabilityStatus untouched.
        if (pf.StatusCode.StartsWith("3"))
        {
            pf.IndexabilityStatus = "Redirect";
        }
        else if (pf.StatusCode.StartsWith("4"))
        {
            pf.IndexabilityStatus = "Client Error";
        }
        else if (pf.StatusCode.StartsWith("5"))
        {
            pf.IndexabilityStatus = "Server Error";
        }
    }

    // Tag analysis: metas (description, keywords, robots), titles and headings.
    AnalyzeMetas(ref pf, ref htmlDocument);
    AnalyzeTitles(ref pf, ref htmlDocument);
    AnalyzeHeadingsOne(ref pf, ref htmlDocument);
    AnalyzeHeadingsTwo(ref pf, ref htmlDocument);
}
/// <summary>
/// Writes a string, variable or block fragment to the scope's output.
/// </summary>
/// <param name="scope">Scope providing the output stream and page result.</param>
/// <param name="fragment">Fragment to write.</param>
/// <param name="token">Cancellation token passed to every write.</param>
/// <returns><c>true</c> when the fragment is one of the three handled types; otherwise <c>false</c>.</returns>
public override async Task<bool> WritePageFragmentAsync(ScriptScopeContext scope, PageFragment fragment, CancellationToken token)
{
    switch (fragment)
    {
        case PageStringFragment stringFragment:
            await scope.OutputStream.WriteAsync(stringFragment.ValueUtf8, token);
            return true;

        case PageVariableFragment variableFragment:
            // Any variable other than the page binding is written as an ordinary variable.
            if (variableFragment.Binding?.Equals(ScriptConstants.Page) != true)
            {
                await scope.PageResult.WriteVarAsync(scope, variableFragment, token);
                return true;
            }

            await scope.PageResult.WritePageAsync(scope.PageResult.Page, scope.PageResult.CodePage, scope, token);

            // A halt raised while writing the page is cleared so the layout keeps evaluating.
            if (scope.PageResult.HaltExecution)
            {
                scope.PageResult.HaltExecution = false;
            }

            return true;

        case PageBlockFragment blockFragment:
            var block = scope.PageResult.GetBlock(blockFragment.Name);
            await block.WriteAsync(scope, blockFragment, token);
            return true;

        default:
            return false;
    }
}