Example #1
0
        /// <summary>
        /// Converts the whole page into a <see cref="WebList"/> by wrapping the element
        /// returned by <c>GetPageBody()</c> in a <see cref="PageFragment"/> and delegating
        /// to that fragment's own <c>ToWebList</c>.
        /// </summary>
        /// <returns>The list produced by the page-wide fragment.</returns>
        public WebList ToWebList()
        {
            var pageBody = GetPageBody();

            return new PageFragment(Driver, pageBody).ToWebList();
        }
Example #2
0
        /// <summary>
        /// Converts the whole page into a <see cref="WebList"/>: looks up the
        /// <c>body</c> element through the driver, wraps it in a
        /// <see cref="PageFragment"/> and delegates to that fragment's <c>ToWebList</c>.
        /// </summary>
        /// <returns>The list produced by the page-wide fragment.</returns>
        public WebList ToWebList()
        {
            var bodyElement = Driver.FindElementByTagName("body");

            return new PageFragment(Driver, bodyElement).ToWebList();
        }
Example #3
0
        /// <summary>
        /// Pushes the current semaphore and visited-page figures to the main form, then
        /// raises the matching problem counter for every title, meta description, URL,
        /// heading and image in <paramref name="pf"/> that is outside the configured
        /// limits. Finally counts the page as visited.
        /// </summary>
        /// <param name="pf">Page data to inspect; it is only read by this method.</param>
        private void UpdateCountersList(ref PageFragment pf)
        {
            MainForm.UpdateSemaphoresStatus(semaphore.CurrentCount);
            MainForm.UpdateVisitedPagesStatus(visitedPages, pagesTovisit);

            // Titles: character length and rendered pixel width are checked independently.
            foreach (var pageTitle in pf.Titles)
            {
                var charCount = pageTitle.TitleLength;
                if (charCount < TitleCharMin || charCount > TitleCharMax)
                {
                    MainForm.IncreaseTitleCharProblemsCounter();
                }

                var pixelWidth = pageTitle.TitlePixelWidth;
                if (pixelWidth < TitlePixMin || pixelWidth > TitlePixMax)
                {
                    MainForm.IncreaseTitlePixelProblemsCounter();
                }
            }

            // NOTE(review): descriptions are validated against the Title* limits, exactly as
            // before - confirm whether description-specific limits were intended.
            foreach (var description in pf.MetaDescriptions)
            {
                var charCount = description.MetaDescriptionLength;
                if (charCount < TitleCharMin || charCount > TitleCharMax)
                {
                    MainForm.IncreaseDescCharProblemsCounter();
                }

                var pixelWidth = description.MetaDescriptionPixelWidth;
                if (pixelWidth < TitlePixMin || pixelWidth > TitlePixMax)
                {
                    MainForm.IncreaseDescPixelProblemsCounter();
                }
            }

            bool urlTooLong = pf.Address.Length > UrlCharMax;
            if (urlTooLong)
            {
                MainForm.IncreaseUrlProblemsCounter();
            }

            // Headings only have an upper length bound.
            foreach (var h1 in pf.HeadingsOne)
            {
                if (h1.HeadingOneLength > H1CharMax)
                {
                    MainForm.IncreaseHeadOneProblemsCounter();
                }
            }

            foreach (var h2 in pf.HeadingsTwo)
            {
                if (h2.HeadingTwoLength > H2CharMax)
                {
                    MainForm.IncreaseHeadTwoProblemsCounter();
                }
            }

            // The size test comes first so ContentType is only touched for oversized resources.
            if (pf.Size > ImgSizeMax && pf.ContentType.Contains("image"))
            {
                MainForm.IncreaseImgProblemsCounter();
                Debug.WriteLine("Imageee: " + pf.Address);
            }

            visitedPages++;
        }
Example #4
0
        /// <summary>
        /// Reads the <c>href</c> of every anchor tag (&lt;a/&gt;) in the document and hands
        /// each one to <c>TryCrawlingNextPage</c>.
        /// </summary>
        /// <param name="htmlDocument">Parsed page whose anchors are scanned.</param>
        /// <param name="pf">Page fragment passed through to <c>TryCrawlingNextPage</c>.</param>
        private void CrawlThroughAnchors(HtmlDocument htmlDocument, ref PageFragment pf)
        {
            // A missing href yields String.Empty; the value is forwarded unchanged.
            List<string> hrefs = htmlDocument.DocumentNode
                                 .Descendants("a")
                                 .Select(a => a.GetAttributeValue("href", String.Empty))
                                 .ToList();

            foreach (string href in hrefs)
            {
                TryCrawlingNextPage(href, ref pf);
            }
        }
Example #5
0
        /// <summary>
        /// Reads the <c>src</c> of every iframe tag (&lt;iframe/&gt;) in the document and
        /// hands each one to <c>TryCrawlingNextPage</c>.
        /// </summary>
        /// <param name="htmlDocument">Parsed page whose iframes are scanned.</param>
        /// <param name="pf">Page fragment passed through to <c>TryCrawlingNextPage</c>.</param>
        private void CrawlThroughIframes(HtmlDocument htmlDocument, ref PageFragment pf)
        {
            var frameNodes = htmlDocument.DocumentNode.Descendants("iframe").ToList();

            for (int i = 0; i < frameNodes.Count; i++)
            {
                // A missing src yields String.Empty; the value is forwarded unchanged.
                string frameSource = frameNodes[i].GetAttributeValue("src", String.Empty);
                TryCrawlingNextPage(frameSource, ref pf);
            }
        }
Example #6
0
        /// <summary>
        /// Registers a link found on the page described by <paramref name="pf"/> and, when it
        /// has not been crawled yet and the total crawl limit allows it, starts crawling it.
        /// </summary>
        /// <remarks>
        /// Links on the same host as <c>BaseUrl</c> update the page's out-link statistics and
        /// the global in-link statistics of the target; every other link only updates the
        /// page's external out-link statistics. A link that cannot be parsed as an absolute
        /// URI is skipped instead of throwing, so one bad link does not interrupt processing
        /// of the remaining links.
        /// </remarks>
        /// <param name="address">Raw attribute value; normalized in place before use.</param>
        /// <param name="pf">Page fragment of the page the link was found on.</param>
        private void TryCrawlingNextPage(string address, ref PageFragment pf)
        {
            // Refactor address to make it a full absolute URL
            NormalizeAddress(BaseUrl, ref address, pf.Address);
            // A null address after normalization is treated as "not a usable link"
            if (address == null)
            {
                return;
            }

            // Parse once and reuse; previously the same string was parsed up to three times
            // and a malformed value raised UriFormatException.
            if (!Uri.TryCreate(address, UriKind.Absolute, out Uri target))
            {
                Debug.WriteLine("Skipping malformed address: " + address);
                return;
            }

            bool sameHost = Uri.Compare(BaseUrl, target, UriComponents.Host,
                                        UriFormat.SafeUnescaped, StringComparison.CurrentCulture) == 0;
            if (sameHost)
            {
                pf.OutLinks++;
                if (!pf.OutLinksAdresses.Contains(address))
                {
                    pf.OutLinksAdresses.Add(address);
                    pf.UniqueOutLinks++;
                }

                // Single lookup instead of ContainsKey followed by repeated indexer access
                if (!inLinksData.TryGetValue(address, out InLinksCounter inLinks))
                {
                    inLinks = new InLinksCounter();
                    inLinksData.Add(address, inLinks);
                }

                inLinks.InLinksCount++;
                inLinks.UniqueInLinks.Add(pf.Address);
            }
            else
            {
                pf.ExternalOutLinks++;
                if (!pf.ExternalOutLinksAdresses.Contains(address))
                {
                    pf.ExternalOutLinksAdresses.Add(address);
                    pf.UniqueExternalOutLinks++;
                }
            }

            // Check whether address had been crawled before or not
            if (crawledPages.Contains(target))
            {
                return;
            }
            // Check whether total crawl limit has been exceeded
            if (crawledPages.Count >= Utils.TotalCrawlLimit)
            {
                return;
            }

            pagesTovisit++;
            // Fire and forget: the crawl task is intentionally not awaited here
            _ = StartCrawlingPage(target, cts.Token);
        }
        /// <summary>
        /// Fills <paramref name="pf"/> with the data recorded for an external page: address,
        /// content type, status code/text and URL depth. The page is marked as not internal.
        /// </summary>
        /// <param name="pf">Page fragment to populate.</param>
        /// <param name="response">HTTP response received for <paramref name="page"/>.</param>
        /// <param name="page">Address of the external page.</param>
        private static void ManagePageFragmentIfExternal(ref PageFragment pf, ref HttpResponseMessage response, Uri page)
        {
            pf.Address    = page.AbsoluteUri;
            pf.IsInternal = false;

            // ContentType is null when the server sends no Content-Type header;
            // fall back to an empty string instead of failing with a NullReferenceException.
            pf.ContentType = response.Content?.Headers.ContentType?.MediaType ?? string.Empty;
            pf.StatusCode  = ((int)response.StatusCode).ToString();
            pf.Status      = response.StatusCode.ToString();

            // Same value the former per-segment counting loop produced (it started at -1).
            pf.UrlDepth = page.Segments.Length - 1;
        }
Example #8
0
        /// <summary>
        /// Hands the <c>src</c> of every script tag (&lt;script/&gt;) whose <c>type</c>
        /// attribute is exactly <c>text/javascript</c> to <c>TryCrawlingNextPage</c>.
        /// </summary>
        /// <param name="htmlDocument">Parsed page whose script tags are scanned.</param>
        /// <param name="pf">Page fragment passed through to <c>TryCrawlingNextPage</c>.</param>
        private void CrawlThroughScripts(HtmlDocument htmlDocument, ref PageFragment pf)
        {
            // Script tags without an explicit type attribute do not match and are skipped.
            var javaScriptTags = htmlDocument.DocumentNode
                                 .Descendants("script")
                                 .ToList()
                                 .Where(tag => tag.GetAttributeValue("type", "null") == "text/javascript");

            foreach (var tag in javaScriptTags)
            {
                string scriptSource = tag.GetAttributeValue("src", String.Empty);
                TryCrawlingNextPage(scriptSource, ref pf);
            }
        }
Example #9
0
        /// <summary>
        /// Hands the <c>src</c> of every image tag (&lt;img/&gt;) to
        /// <c>TryCrawlingNextPage</c>, with everything from the first <c>?</c> onwards removed.
        /// </summary>
        /// <param name="htmlDocument">Parsed page whose image tags are scanned.</param>
        /// <param name="pf">Page fragment passed through to <c>TryCrawlingNextPage</c>.</param>
        private void CrawlThroughImages(HtmlDocument htmlDocument, ref PageFragment pf)
        {
            foreach (var imgNode in htmlDocument.DocumentNode.Descendants("img").ToList())
            {
                string source = imgNode.GetAttributeValue("src", String.Empty);

                // Drop the query string, if there is one
                string withoutQuery = source.Contains("?")
                                      ? source.Remove(source.IndexOf("?"))
                                      : source;

                TryCrawlingNextPage(withoutQuery, ref pf);
            }
        }
        /// <summary>
        /// Collects the text and text length of the document's <c>h2</c> elements into
        /// <c>pf.HeadingsTwo</c>, stopping once the collection holds
        /// <c>Utils.MaxHeadsTwo</c> entries.
        /// </summary>
        /// <param name="pf">Page fragment receiving the headings.</param>
        /// <param name="htmlDocument">Parsed page to read the headings from.</param>
        private static void AnalyzeHeadingsTwo(ref PageFragment pf, ref HtmlDocument htmlDocument)
        {
            foreach (HtmlNode h2Node in htmlDocument.DocumentNode.Descendants("h2").ToList())
            {
                var entry = new HeadingTwo { HeadingTwoText = h2Node.InnerText };
                entry.HeadingTwoLength = entry.HeadingTwoText.Length;

                pf.HeadingsTwo.Add(entry);

                // The limit is checked after adding, so at least one heading is always kept.
                bool limitReached = pf.HeadingsTwo.Count == Utils.MaxHeadsTwo;
                if (limitReached)
                {
                    break;
                }
            }
        }
        /// <summary>
        /// Collects the text and text length of the document's <c>h1</c> elements into
        /// <c>pf.HeadingsOne</c>, stopping once the collection holds
        /// <c>Utils.MaxHeadsOne</c> entries.
        /// </summary>
        /// <param name="pf">Page fragment receiving the headings.</param>
        /// <param name="htmlDocument">Parsed page to read the headings from.</param>
        private static void AnalyzeHeadingsOne(ref PageFragment pf, ref HtmlDocument htmlDocument)
        {
            List<HtmlNode> h1Nodes = htmlDocument.DocumentNode.Descendants("h1").ToList();

            for (int index = 0; index < h1Nodes.Count; index++)
            {
                HeadingOne current = new HeadingOne();
                current.HeadingOneText   = h1Nodes[index].InnerText;
                current.HeadingOneLength = current.HeadingOneText.Length;
                pf.HeadingsOne.Add(current);

                // The limit is checked after adding, so at least one heading is always kept.
                if (pf.HeadingsOne.Count == Utils.MaxHeadsOne)
                {
                    break;
                }
            }
        }
        /// <summary>
        /// Scans the document's <c>meta</c> tags and records on <paramref name="pf"/>:
        /// a <c>robots</c> tag whose content is exactly <c>noindex</c> (marks the page as
        /// non-indexable), <c>description</c> tags (text, length and rendered pixel width,
        /// up to <c>Utils.MaxDescs</c>) and <c>keywords</c> tags (text and length, up to
        /// <c>Utils.MaxKeywords</c>).
        /// </summary>
        /// <param name="pf">Page fragment receiving the extracted data.</param>
        /// <param name="htmlDocument">Parsed page to read the meta tags from.</param>
        private static void AnalyzeMetas(ref PageFragment pf, ref HtmlDocument htmlDocument)
        {
            List<HtmlNode> metas = htmlDocument.DocumentNode.Descendants("meta").ToList();

            foreach (HtmlNode meta in metas)
            {
                // Read the name once instead of once per branch.
                string metaName = meta.GetAttributeValue("name", "null");

                if (metaName == "robots")
                {
                    // NOTE(review): only the exact value "noindex" matches; combined
                    // directives such as "noindex, nofollow" are not recognized.
                    if (meta.GetAttributeValue("content", "null") == "noindex")
                    {
                        pf.Indexability       = "Non-Indexable";
                        pf.IndexabilityStatus = "Noindex";
                    }
                }
                else if (metaName == "description")
                {
                    if (pf.MetaDescriptions.Count == Utils.MaxDescs)
                    {
                        continue;
                    }

                    MetaDescription metaDesc = new MetaDescription();
                    metaDesc.MetaDescriptionText   = meta.GetAttributeValue("content", "");
                    metaDesc.MetaDescriptionLength = metaDesc.MetaDescriptionText.Length;

                    // Font wraps a native GDI handle; dispose it as soon as the text is measured.
                    using (Font measuringFont = new Font("Arial", 13.0F))
                    {
                        metaDesc.MetaDescriptionPixelWidth = System.Windows.Forms.TextRenderer
                                                             .MeasureText(metaDesc.MetaDescriptionText, measuringFont).Width;
                    }

                    pf.MetaDescriptions.Add(metaDesc);
                }
                else if (metaName == "keywords")
                {
                    if (pf.MetaKeywords.Count == Utils.MaxKeywords)
                    {
                        continue;
                    }

                    MetaKeywords metaKey = new MetaKeywords();
                    metaKey.MetaKeywordsText   = meta.GetAttributeValue("content", "");
                    metaKey.MetaKeywordsLength = metaKey.MetaKeywordsText.Length;

                    pf.MetaKeywords.Add(metaKey);
                }
            }
        }
Example #13
0
        /// <summary>
        /// Hands the <c>href</c> of every link tag (&lt;link/&gt;) whose <c>rel</c> attribute
        /// is exactly <c>stylesheet</c> to <c>TryCrawlingNextPage</c>, with everything from
        /// the first <c>?</c> onwards removed.
        /// </summary>
        /// <param name="htmlDocument">Parsed page whose link tags are scanned.</param>
        /// <param name="pf">Page fragment passed through to <c>TryCrawlingNextPage</c>.</param>
        private void CrawlThroughLinks(HtmlDocument htmlDocument, ref PageFragment pf)
        {
            var linkNodes = htmlDocument.DocumentNode.Descendants("link").ToList();

            foreach (var linkNode in linkNodes)
            {
                // Only stylesheet references are of interest
                if (link​Node.GetAttributeValue("rel", "null") != "stylesheet")
                {
                    continue;
                }

                string href = linkNode.GetAttributeValue("href", String.Empty);
                if (href.Contains("?"))
                {
                    // Keep only the part in front of the query string
                    href = href.Substring(0, href.IndexOf("?"));
                }

                TryCrawlingNextPage(href, ref pf);
            }
        }
Example #14
0
 /// <summary>
 /// Follows the URLs referenced by the page: anchors always, and stylesheets, scripts,
 /// iframes and images only when the matching <c>Utils</c> switch is enabled.
 /// </summary>
 /// <param name="htmlDocument">Parsed page to scan for references.</param>
 /// <param name="pf">Page fragment passed through to the individual crawl steps.</param>
 private void CrawlFurther(HtmlDocument htmlDocument, ref PageFragment pf)
 {
     // Anchors are crawled unconditionally; the remaining resource kinds are opt-in.
     CrawlThroughAnchors(htmlDocument, ref pf);

     if (Utils.CrawlCss)        { CrawlThroughLinks(htmlDocument, ref pf); }
     if (Utils.CrawlJavaScript) { CrawlThroughScripts(htmlDocument, ref pf); }
     if (Utils.CrawlIframes)    { CrawlThroughIframes(htmlDocument, ref pf); }
     if (Utils.CrawlImages)     { CrawlThroughImages(htmlDocument, ref pf); }
 }
        /// <summary>
        /// Collects the text, text length and rendered pixel width (Arial, 16pt) of the
        /// document's <c>title</c> elements into <c>pf.Titles</c>, stopping once the
        /// collection holds <c>Utils.MaxTitles</c> entries.
        /// </summary>
        /// <param name="pf">Page fragment receiving the titles.</param>
        /// <param name="htmlDocument">Parsed page to read the titles from.</param>
        private static void AnalyzeTitles(ref PageFragment pf, ref HtmlDocument htmlDocument)
        {
            List<HtmlNode> htmlTitles = htmlDocument.DocumentNode.Descendants("title").ToList();

            // One font for the whole loop, disposed afterwards: Font wraps a native GDI
            // handle and was previously created per title and never released.
            using (Font measuringFont = new Font("Arial", 16.0F))
            {
                foreach (HtmlNode htmlTitle in htmlTitles)
                {
                    Title title = new Title();
                    title.TitleText   = htmlTitle.InnerText;
                    title.TitleLength = title.TitleText.Length;

                    title.TitlePixelWidth = System.Windows.Forms.TextRenderer
                                            .MeasureText(title.TitleText, measuringFont).Width;

                    pf.Titles.Add(title);
                    // The limit is checked after adding, so at least one title is always kept.
                    if (pf.Titles.Count == Utils.MaxTitles)
                    {
                        break;
                    }
                }
            }
        }
Example #16
0
 /// <summary>
 /// Default implementation that writes nothing and returns
 /// <c>TypeConstants.FalseTask</c> (presumably a completed task yielding <c>false</c>,
 /// i.e. "fragment not handled"). Derived classes override this to render the
 /// fragment types they support.
 /// </summary>
 /// <param name="scope">Current script scope (unused here).</param>
 /// <param name="fragment">Fragment to write (unused here).</param>
 /// <param name="token">Cancellation token (unused here).</param>
 public virtual Task <bool> WritePageFragmentAsync(ScriptScopeContext scope, PageFragment fragment, CancellationToken token)
 {
     return TypeConstants.FalseTask;
 }
Example #17
0
        /// <summary>
        /// Writes a string, variable or block fragment to the scope's output.
        /// </summary>
        /// <remarks>
        /// A variable fragment bound to <c>ScriptConstants.Page</c> outside of a partial
        /// (no <c>ScriptConstants.PartialArg</c> in the scoped params) renders the page
        /// itself; this is allowed only once per render.
        /// </remarks>
        /// <returns><c>true</c> when the fragment type was handled, otherwise <c>false</c>.</returns>
        /// <exception cref="NotSupportedException">The page was already rendered once.</exception>
        public override async Task <bool> WritePageFragmentAsync(ScriptScopeContext scope, PageFragment fragment, CancellationToken token)
        {
            switch (fragment)
            {
                case PageStringFragment text:
                {
                    await scope.OutputStream.WriteAsync(text.ValueUtf8, token);
                    return true;
                }

                case PageVariableFragment variable:
                {
                    bool rendersPageBody = variable.Binding?.Equals(ScriptConstants.Page) == true
                                           && !scope.ScopedParams.ContainsKey(ScriptConstants.PartialArg);
                    if (!rendersPageBody)
                    {
                        await scope.PageResult.WriteVarAsync(scope, variable, token);
                        return true;
                    }

                    if (scope.PageResult.PageProcessed)
                    {
                        throw new NotSupportedException("{{page}} can only be called once per render, in the Layout page.");
                    }
                    scope.PageResult.PageProcessed = true;

                    await scope.PageResult.WritePageAsync(scope.PageResult.Page, scope.PageResult.CodePage, scope, token);

                    // A halt raised inside the page must not stop the surrounding layout.
                    if (scope.PageResult.HaltExecution)
                    {
                        scope.PageResult.HaltExecution = false;
                    }
                    return true;
                }

                case PageBlockFragment blockFragment:
                {
                    var scriptBlock = scope.PageResult.GetBlock(blockFragment.Name);
                    await scriptBlock.WriteAsync(scope, blockFragment, token);
                    return true;
                }

                default:
                    return false;
            }
        }
        /// <summary>
        /// Executes the statements of a <c>PageJsBlockStatementFragment</c>. When the
        /// fragment is marked quiet, output is redirected to <c>Stream.Null</c> for the
        /// duration of the block.
        /// </summary>
        /// <returns>
        /// <c>true</c> when the fragment was a JS block statement fragment and was executed,
        /// otherwise <c>false</c>.
        /// </returns>
        public override async Task <bool> WritePageFragmentAsync(ScriptScopeContext scope, PageFragment fragment, CancellationToken token)
        {
            // Captured before the scope is possibly swapped for a quiet one below.
            var pageResult = scope.PageResult;

            if (!(fragment is PageJsBlockStatementFragment jsBlock))
            {
                return false;
            }

            var statements = jsBlock.Block.Statements;

            bool redirectToNull = jsBlock.Quiet && scope.OutputStream != Stream.Null;
            if (redirectToNull)
            {
                scope = scope.ScopeWithStream(Stream.Null);
            }

            await pageResult.WriteStatementsAsync(scope, statements, token);
            return true;
        }
Example #19
0
        /// <summary>
        /// Appends one row describing <paramref name="pf"/> to the data table. Optional
        /// columns are filled only when the matching <c>Utils.Extract*</c> switch is on
        /// and, for counts and ratios, only when the value is greater than zero.
        /// </summary>
        /// <param name="pf">Fully populated page fragment to store.</param>
        private void UpdateDataTable(PageFragment pf)
        {
            DataRow newRow = dt.NewRow();

            // Response basics
            newRow[ADDRESS_COL]     = pf.Address;
            newRow[CONTET_TYPE_COL] = pf.ContentType;
            newRow[STATUS_CODE_COL] = pf.StatusCode;
            newRow[STATUS_COL]      = pf.Status;
            newRow[ISINTERNAL_COL]  = pf.IsInternal;
            newRow[URL_DEPTH_COL]   = pf.UrlDepth;

            // Optional extractions
            if (Utils.ExtractIndexability)
            {
                newRow[INDEXABILITY_COL]        = pf.Indexability;
                newRow[INDEXABILITY_STATUS_COL] = pf.IndexabilityStatus;
            }
            if (Utils.ExtractPageSize)
            {
                newRow[SIZE_COL] = pf.Size;
            }
            if (Utils.ExtractHash)
            {
                newRow[ISDUPLICATE_COL] = pf.IsDuplicate;
                newRow[HASH_VALUE_COL]  = pf.Hash;
            }
            if (Utils.ExtractWordCount && pf.WordCount > 0)
            {
                newRow[WORD_COUNT_COL] = pf.WordCount;
            }
            if (Utils.ExtractTxtCodeRatio && pf.TextRatio > 0)
            {
                newRow[TEXT_RATIO_COL] = pf.TextRatio.ToString("F");
            }

            // On-page elements
            HandleTitles(ref newRow, pf.Titles);
            HandleDesc(ref newRow, pf.MetaDescriptions);
            HandleKeywords(ref newRow, pf.MetaKeywords);
            if (Utils.ExtractH1)
            {
                HandleHeadsOne(ref newRow, pf.HeadingsOne);
            }
            if (Utils.ExtractH2)
            {
                HandleHeadsTwo(ref newRow, pf.HeadingsTwo);
            }

            // Link statistics
            if (pf.OutLinks > 0)
            {
                newRow[OUTLINKS_COL]        = pf.OutLinks;
                newRow[UNIQUE_OUTLINKS_COL] = pf.UniqueOutLinks;
            }
            if (pf.ExternalOutLinks > 0)
            {
                newRow[EXTERNAL_OUTLIKNS_COL]        = pf.ExternalOutLinks;
                newRow[UNIQUE_EXTERNAL_OUTLIKNS_COL] = pf.UniqueExternalOutLinks;
            }

            if (inLinksData.TryGetValue(pf.Address, out var inLinks))
            {
                newRow[INLINKS_COL]        = inLinks.InLinksCount;
                newRow[UNIQUE_INLINKS_COL] = inLinks.UniqueInLinks.Count;

                // Share of unique in-links among all in-links, as a percentage
                float uniqueShare = ((float)inLinks.UniqueInLinks.Count / (float)inLinks.InLinksCount) * 100;
                newRow[UNIQUE_INLINKS_OF_TOTAL_COL] = uniqueShare.ToString("F");
            }

            dt.Rows.Add(newRow);
        }
Example #20
0
        // Shared by all crawl tasks: HttpClient is meant to be reused, and creating one
        // per request without disposing it can exhaust sockets on large crawls.
        private static readonly HttpClient s_crawlerHttpClient = new HttpClient();

        /// <summary>
        /// Downloads <paramref name="page"/> once a semaphore slot is free, analyzes it and
        /// stores the result in the data table. Pages on the same host as <c>BaseUrl</c>
        /// are parsed and, while within <c>Utils.CrawlDepthLimit</c>, their links are
        /// crawled further; other pages only get their response metadata recorded.
        /// </summary>
        /// <param name="page">Absolute address of the page to crawl.</param>
        /// <param name="ctsToken">
        /// Token that cancels waiting for the semaphore and the download (stop button).
        /// </param>
        /// <returns>A task that completes when the page has been processed or skipped.</returns>
        private async Task StartCrawlingPage(Uri page, CancellationToken ctsToken)
        {
            MainForm.UpdateSemaphoresStatus(semaphore.CurrentCount);
            crawledPages.Add(page);

            // Wait for semaphore
            try
            {
                await this.semaphore.WaitAsync(ctsToken);
            }
            catch (OperationCanceledException)
            {
                // Cancelled before a slot was acquired, so there is nothing to release.
                Debug.WriteLine("Task anulowany");
                return;
            }

            // Kept as a plain local (not a using variable) because it is passed by ref below.
            HttpResponseMessage response = null;

            try
            {
                // Checking cancelation token (checking whether stop button has been pressed)
                if (!ctsToken.IsCancellationRequested)
                {
                    PageFragment pf = new PageFragment {
                        Address = page.AbsoluteUri
                    };

                    try
                    {
                        // Download page. Done inside the handler so that a failed download
                        // is logged and the page is still counted as visited.
                        response = await s_crawlerHttpClient.GetAsync(page, ctsToken);

                        // Check whether page is internal or external
                        if (Uri.Compare(BaseUrl, page, UriComponents.Host,
                                        UriFormat.SafeUnescaped, StringComparison.CurrentCulture) == 0)
                        {
                            // Get page source
                            string sourceHtml = await response.Content.ReadAsStringAsync();

                            HtmlDocument htmlDocument = new HtmlDocument();
                            htmlDocument.LoadHtml(sourceHtml);

                            // Check whether url depth is in acceptable range if not don't crawl further
                            if (page.Segments.Length - 1 <= Utils.CrawlDepthLimit)
                            {
                                // Crawl deeper through urls found on this page
                                // (each discovered url is started as its own task)
                                CrawlFurther(htmlDocument, ref pf);
                            }

                            // Fulfill PageFragment with data
                            ManagePageFragment(ref pf, ref response, ref htmlDocument, page);

                            // Update data source
                            UpdateDataTable(pf);
                        }
                        else
                        {
                            // Fulfill PageFragment with data
                            ManagePageFragmentIfExternal(ref pf, ref response, page);

                            // Update data source
                            UpdateDataTable(pf);
                        }
                    }
                    catch (UriFormatException ex)
                    {
                        Debug.WriteLine(" Podstrona: " + page + " ma niepoprawnie sformatowany url. Message: " + ex.Message);
                    }
                    catch (WebException ex) when((ex.Response as HttpWebResponse)?.StatusCode == HttpStatusCode.NotFound)
                    {
                        pf.StatusCode = "404";
                        Debug.WriteLine(" strona " + page + " jest niedostepna -> 404 NotFound");
                    }
                    catch (WebException ex)
                    {
                        // The null-conditional yields null when there is no HTTP response;
                        // map that to "Undefined" (the former NullReferenceException handler
                        // could never run).
                        string status = (ex.Response as HttpWebResponse)?.StatusCode.ToString() ?? "Undefined";
                        pf.StatusCode = status;
                        Debug.WriteLine(" strona " + page + " WebEx: " + status);
                    }
                    catch (Exception ex) when(!(ex is OperationCanceledException))
                    {
                        // Cancellation is excluded so it reaches the outer handler below.
                        Debug.WriteLine(" strona " + page + " spotkala niezdefiniowany (nieobsłużony indywidualnie) wyjątek: " + ex.Message);
                    }

                    UpdateCountersList(ref pf);
                }
            }
            catch (OperationCanceledException)
            {
                Debug.WriteLine("Task anulowany");
            }
            catch (Exception)
            {
                Debug.WriteLine("Task sie nie powiodl");
            }
            finally
            {
                response?.Dispose();
                // The slot was acquired above, so it is always handed back exactly once.
                this.semaphore.Release();
            }
        }
        /// <summary>
        /// Fills <paramref name="pf"/> with the data recorded for an internal page: response
        /// metadata, optional size, content hash with duplicate detection, word count and
        /// text-to-code ratio, URL depth, indexability derived from the status code, and
        /// finally the meta tags, titles and headings found in the document.
        /// </summary>
        /// <param name="pf">Page fragment to populate.</param>
        /// <param name="response">HTTP response received for <paramref name="page"/>.</param>
        /// <param name="htmlDocument">Parsed body of the response.</param>
        /// <param name="page">Address of the page.</param>
        private void ManagePageFragment(ref PageFragment pf, ref HttpResponseMessage response, ref HtmlDocument htmlDocument, Uri page)
        {
            pf.Address     = page.AbsoluteUri;
            pf.IsInternal  = true;
            // ContentType is null when the server sends no Content-Type header;
            // fall back to an empty string instead of failing with a NullReferenceException.
            pf.ContentType = response.Content.Headers.ContentType?.MediaType ?? string.Empty;
            pf.StatusCode  = ((int)response.StatusCode).ToString();
            pf.Status      = response.StatusCode.ToString();
            if (Utils.ExtractPageSize)
            {
                pf.Size = response.Content.Headers.ContentLength.GetValueOrDefault();
            }

            if (Utils.ExtractHash)
            {
                // NOTE(review): string.GetHashCode is not a content fingerprint - different
                // pages can share a value, so "duplicate" here means "same hash code".
                pf.Hash = htmlDocument.Text.GetHashCode();

                // Compare against the stored hash value itself rather than its hash code.
                string hash = pf.Hash.ToString();
                DataRow duplicate = dt.Rows.OfType<DataRow>()
                                    .FirstOrDefault(row => row[HASH_VALUE_COL].ToString() == hash);
                if (duplicate != null)
                {
                    duplicate[ISDUPLICATE_COL] = true;
                    pf.IsDuplicate             = true;
                }
            }

            if (Utils.ExtractWordCount || Utils.ExtractTxtCodeRatio)
            {
                List<HtmlNode> bodies            = htmlDocument.DocumentNode.Descendants("body").ToList();
                int            nonHtmlCharsCount = 0;
                foreach (HtmlNode body in bodies)
                {
                    string bodyText = body.InnerText;
                    pf.WordCount      += Utils.CountWords(bodyText);
                    nonHtmlCharsCount += bodyText.Length;
                }

                if (Utils.ExtractTxtCodeRatio)
                {
                    // Guard against dividing by the length of an empty document
                    int sourceLength = htmlDocument.Text.Length;
                    pf.TextRatio = sourceLength != 0
                                   ? (float)nonHtmlCharsCount / (float)sourceLength
                                   : 0;
                }
            }

            pf.UrlDepth = page.Segments.Length - 1;

            if (pf.StatusCode != "200")
            {
                pf.Indexability = "Non-indexable";
                // Status codes are plain digits, so compare ordinally (culture-independent)
                if (pf.StatusCode.StartsWith("3", StringComparison.Ordinal))
                {
                    pf.IndexabilityStatus = "Redirect";
                }
                else if (pf.StatusCode.StartsWith("4", StringComparison.Ordinal))
                {
                    pf.IndexabilityStatus = "Client Error";
                }
                else if (pf.StatusCode.StartsWith("5", StringComparison.Ordinal))
                {
                    pf.IndexabilityStatus = "Server Error";
                }
            }

            // Analyze tags: description, keywords and the robots meta tag (which can mark
            // the page as non-indexable), then titles and headings
            AnalyzeMetas(ref pf, ref htmlDocument);
            AnalyzeTitles(ref pf, ref htmlDocument);
            AnalyzeHeadingsOne(ref pf, ref htmlDocument);
            AnalyzeHeadingsTwo(ref pf, ref htmlDocument);
        }
        /// <summary>
        /// Writes a string, variable or block fragment to the scope's output. A variable
        /// fragment bound to <c>ScriptConstants.Page</c> renders the page itself; any
        /// other variable fragment is written through <c>WriteVarAsync</c>.
        /// </summary>
        /// <returns><c>true</c> when the fragment type was handled, otherwise <c>false</c>.</returns>
        public override async Task <bool> WritePageFragmentAsync(ScriptScopeContext scope, PageFragment fragment, CancellationToken token)
        {
            if (fragment is PageStringFragment literal)
            {
                await scope.OutputStream.WriteAsync(literal.ValueUtf8, token);
                return true;
            }

            if (fragment is PageVariableFragment variable)
            {
                bool isPageBinding = variable.Binding?.Equals(ScriptConstants.Page) == true;
                if (!isPageBinding)
                {
                    await scope.PageResult.WriteVarAsync(scope, variable, token);
                    return true;
                }

                await scope.PageResult.WritePageAsync(scope.PageResult.Page, scope.PageResult.CodePage, scope, token);

                // A halt raised inside the page must not stop the surrounding layout.
                if (scope.PageResult.HaltExecution)
                {
                    scope.PageResult.HaltExecution = false;
                }
                return true;
            }

            if (fragment is PageBlockFragment namedBlock)
            {
                var scriptBlock = scope.PageResult.GetBlock(namedBlock.Name);
                await scriptBlock.WriteAsync(scope, namedBlock, token);
                return true;
            }

            return false;
        }