public void TestLinksInTextDocs()
        {
            // Verifies that ProcessPureTextOutlinks() turns the URLs found in
            // this.TextDoc into outlinks, that every outlink is one of the URLs
            // listed in this.TextLinks, and that exactly five are produced.
            const string Url = @"https://nazuke.github.io/dummy.txt";

            MacroscopeJobMaster Master = new MacroscopeJobMaster(
                JobRunTimeMode: MacroscopeConstants.RunTimeMode.LIVE,
                TaskController: this
                );
            MacroscopeDocumentCollection Documents = new MacroscopeDocumentCollection(JobMaster: Master);
            MacroscopeDocument           msDoc     = Documents.CreateDocument(Url: Url);

            Assert.IsNotNull(msDoc, string.Format("FAIL: {0}", Url));

            // Act: extract the outlinks from the fixture text.
            msDoc.ProcessPureTextOutlinks(
                TextDoc: this.TextDoc,
                LinkType: MacroscopeConstants.InOutLinkType.PURETEXT
                );

            // Each extracted target must be one of the expected URLs...
            foreach (MacroscopeLink Outlink in msDoc.IterateOutlinks())
            {
                string FoundUrl = Outlink.GetTargetUrl();

                Assert.Contains(FoundUrl, this.TextLinks);
            }

            // ...and the total number of outlinks must be exactly five.
            Assert.AreEqual(5, msDoc.CountOutlinks());
        }
        /**************************************************************************/

        /// <summary>
        /// Builds a two-level map: sitemap URL -> (outlink target URL -> false).
        /// The sitemap documents come from <c>FindSitemaps</c>; every target URL is
        /// entered once per sitemap with an initial flag value of <c>false</c>.
        /// </summary>
        /// <param name="DocCollection">Collection handed to <c>FindSitemaps</c>.</param>
        /// <returns>
        /// The map built so far. Any exception raised while building is logged via
        /// <c>DebugMsg</c> and the partially filled map is returned.
        /// </returns>
        private Dictionary <string, Dictionary <string, bool> > BuildSitemapUrlList(MacroscopeDocumentCollection DocCollection)
        {
            var Result = new Dictionary <string, Dictionary <string, bool> >();

            try
            {
                MacroscopeDocumentList Sitemaps = this.FindSitemaps(DocCollection: DocCollection);

                foreach (MacroscopeDocument SitemapDoc in Sitemaps.IterateDocuments())
                {
                    string SitemapUrl = SitemapDoc.GetUrl();
                    Dictionary <string, bool> Targets;

                    // Fetch the per-sitemap bucket, creating it on first sight.
                    if (!Result.TryGetValue(SitemapUrl, out Targets))
                    {
                        Targets = new Dictionary <string, bool>();
                        Result.Add(SitemapUrl, Targets);
                    }

                    foreach (MacroscopeLink Outlink in SitemapDoc.IterateOutlinks())
                    {
                        string TargetUrl = Outlink.GetTargetUrl();

                        // Keep the first entry; duplicates within a sitemap are ignored.
                        if (!Targets.ContainsKey(TargetUrl))
                        {
                            Targets.Add(TargetUrl, false);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                this.DebugMsg(string.Format("BuildSitemapUrlList: {0}", ex.Message));
            }

            return Result;
        }
Beispiel #3
0
        /** Robots.txt Out Links **************************************************/

        /// <summary>
        /// Scans the lines of a robots.txt document for <c>Sitemap:</c> directives and
        /// registers each declared sitemap URL as a followable ROBOTSTEXT outlink.
        /// Does nothing when this document is external.
        /// </summary>
        /// <param name="TextDoc">The robots.txt content, one entry per line.</param>
        private void ProcessRobotsTextOutlinks(List <string> TextDoc)
        {
            if (this.GetIsExternal())
            {
                return;
            }

            foreach (string Line in TextDoc)
            {
                // Only "Sitemap: <something>" lines are of interest.
                if (!Regex.IsMatch(Line, @"^Sitemap:\s*[^\s]+", RegexOptions.IgnoreCase))
                {
                    continue;
                }

                // Strip the directive prefix and surrounding whitespace.
                string Candidate = Regex.Replace(Line, @"^(Sitemap:\s*)", "", RegexOptions.IgnoreCase).Trim();

                if (string.IsNullOrEmpty(Candidate))
                {
                    continue;
                }

                // The Uri constructor is used purely as a validator: it throws for
                // text that is not a well-formed absolute URI.
                bool IsUsable = false;

                try
                {
                    Uri Parsed = new Uri(Candidate);
                    IsUsable = (Parsed != null);
                }
                catch (Exception ex)
                {
                    DebugMsg(string.Format("ProcessRobotsTextOutlinks: {0}", ex.Message));
                }

                if (!IsUsable)
                {
                    continue;
                }

                MacroscopeLink Outlink = this.AddDocumentOutlink(
                    AbsoluteUrl: Candidate,
                    LinkType: MacroscopeConstants.InOutLinkType.ROBOTSTEXT,
                    Follow: true
                    );

                if (Outlink != null)
                {
                    Outlink.SetRawTargetUrl(Candidate);
                }
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Writes the sitemap-errors sheet to a CSV writer: a header record followed
        /// by one record for every outlink of an internal SITEMAPXML document whose
        /// linked document is internal and either has a 4xx/5xx status code or is
        /// not allowed by robots.
        /// </summary>
        /// <param name="JobMaster">Supplies the document collection to report on.</param>
        /// <param name="ws">CSV writer that receives the header and data records.</param>
        private void BuildWorksheetSitemapErrors(
            MacroscopeJobMaster JobMaster,
            CsvWriter ws
            )
        {
            MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

            // Header record.
            ws.WriteField("Sitemap URL");
            ws.WriteField("Status Code");
            ws.WriteField("Robots");
            ws.WriteField("URL");
            ws.NextRecord();

            foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
            {
                if (!(msDoc.GetIsInternal() && msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML)))
                {
                    continue;
                }

                foreach (MacroscopeLink Outlink in msDoc.IterateOutlinks())
                {
                    string             TargetUrl   = Outlink.GetTargetUrl();
                    MacroscopeDocument msDocLinked = DocCollection.GetDocumentByUrl(Url: TargetUrl);

                    // Guard: a URL listed in the sitemap may have no document in the
                    // collection (presumably GetDocumentByUrl yields null then - confirm).
                    // Without this check one such URL would abort the whole report.
                    if (msDocLinked == null)
                    {
                        continue;
                    }

                    bool InsertRow = false;

                    if (msDocLinked.GetIsInternal())
                    {
                        int StatusCode = (int)msDocLinked.GetStatusCode();

                        // Client and server error range.
                        if ((StatusCode >= 400) && (StatusCode <= 599))
                        {
                            InsertRow = true;
                        }

                        if (!msDocLinked.GetAllowedByRobots())
                        {
                            InsertRow = true;
                        }
                    }

                    if (InsertRow)
                    {
                        // NOTE(review): the status-code and robots cells are filled from
                        // the sitemap document (msDoc), not from the linked document that
                        // triggered the row - confirm this is intended.
                        this.InsertAndFormatUrlCell(ws, msDoc);

                        this.InsertAndFormatStatusCodeCell(ws, msDoc);

                        this.InsertAndFormatRobotsCell(ws, msDoc);

                        this.InsertAndFormatUrlCell(ws, TargetUrl);

                        ws.NextRecord();
                    }
                }
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Queues the followable, not-yet-seen outlinks of a document for crawling.
        /// In the LISTFILE, LISTTEXT and SITEMAP run-time modes nothing is queued
        /// unless the "scan sites in list" preference is enabled. Once the page
        /// limit (when one is set, i.e. greater than -1) has been reached, remaining
        /// outlinks are logged and skipped rather than queued.
        /// </summary>
        /// <param name="msDoc">Document whose outlinks are examined.</param>
        private void ProcessOutlinks(MacroscopeDocument msDoc)
        {
            var Mode = this.JobMaster.GetRunTimeMode();

            bool IsListDriven =
                (Mode == MacroscopeConstants.RunTimeMode.LISTFILE) ||
                (Mode == MacroscopeConstants.RunTimeMode.LISTTEXT) ||
                (Mode == MacroscopeConstants.RunTimeMode.SITEMAP);

            if (IsListDriven && !MacroscopePreferencesManager.GetScanSitesInList())
            {
                return;
            }

            foreach (MacroscopeLink Outlink in msDoc.IterateOutlinks())
            {
                if (!Outlink.GetDoFollow())
                {
                    continue;
                }

                string TargetUrl = Outlink.GetTargetUrl();

                if (TargetUrl == null)
                {
                    continue;
                }

                if (this.JobMaster.GetJobHistory().SeenHistoryItem(TargetUrl))
                {
                    continue;
                }

                // A page limit of -1 (or lower) means "no limit".
                bool LimitActive = this.JobMaster.GetPageLimit() > -1;

                if (LimitActive && (this.JobMaster.GetPageLimitCount() >= this.JobMaster.GetPageLimit()))
                {
                    this.DebugMsg(
                        string.Format(
                            "PAGE LIMIT REACHED: {0} :: {1}",
                            this.JobMaster.GetPageLimit(),
                            this.JobMaster.GetPageLimitCount()
                            )
                        );
                    continue;
                }

                this.JobMaster.AddUrlQueueItem(
                    Url: TargetUrl,
                    Check: true
                    );
            }
        }
Beispiel #6
0
        /** Text Sitemap Out Links ************************************************/

        /// <summary>
        /// Treats every non-empty line of a plain-text sitemap as a URL and registers
        /// each one that parses as a <see cref="Uri"/> as a followable SITEMAPTEXT
        /// outlink. Does nothing when this document is external.
        /// </summary>
        /// <param name="TextDoc">The text sitemap content, one entry per line.</param>
        private void ProcessSitemapTextOutlinks(List <string> TextDoc)
        {
            if (this.GetIsExternal())
            {
                return;
            }

            foreach (string Line in TextDoc)
            {
                // All whitespace is removed, including any inside the line.
                string Candidate = Regex.Replace(Line, @"\s+", "");

                if (string.IsNullOrEmpty(Candidate))
                {
                    continue;
                }

                // The Uri constructor is used purely as a validator: it throws for
                // text that is not a well-formed absolute URI.
                bool IsUsable = false;

                try
                {
                    Uri Parsed = new Uri(Candidate);
                    IsUsable = (Parsed != null);
                }
                catch (Exception ex)
                {
                    DebugMsg(string.Format("ProcessSitemapTextOutlinks: {0}", ex.Message));
                }

                if (!IsUsable)
                {
                    continue;
                }

                MacroscopeLink Outlink = this.AddSitemapTextOutlink(
                    AbsoluteUrl: Candidate,
                    LinkType: MacroscopeConstants.InOutLinkType.SITEMAPTEXT,
                    Follow: true
                    );

                if (Outlink != null)
                {
                    Outlink.SetRawTargetUrl(Candidate);
                }
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Registers every <c>loc</c> element (in the sitemap XML namespace) of an XML
        /// sitemap as a followable SITEMAPXML outlink. The element text is made
        /// absolute against this document's URL; the unmodified text is stored as the
        /// outlink's raw target URL.
        /// </summary>
        /// <param name="XmlDoc">Parsed sitemap; a null document is logged and ignored.</param>
        private void ProcessSitemapXmlOutlinks(XmlDocument XmlDoc)
        {
            if (XmlDoc == null)
            {
                DebugMsg("ProcessSitemapXmlOutlinks: XmlDoc is null");
                return;
            }

            XmlNodeList OutlinksList = XmlDoc.GetElementsByTagName("loc", MacroscopeConstants.SitemapXmlNamespace);

            // Check the list BEFORE touching it; the original read Count first,
            // which made its later null check ineffective.
            if (OutlinksList == null)
            {
                return;
            }

            DebugMsg(string.Format("ProcessSitemapXmlOutlinks nlOutlinks: {0}", OutlinksList.Count));

            foreach (XmlNode LinkNode in OutlinksList)
            {
                string LinkUrl = null;

                try
                {
                    LinkUrl = LinkNode.InnerText;
                    DebugMsg(string.Format("ProcessSitemapXmlOutlinks sLinkUrl: {0}", LinkUrl));
                }
                catch (Exception ex)
                {
                    DebugMsg(string.Format("ProcessSitemapXmlOutlinks: {0}", ex.Message));
                }

                if (LinkUrl == null)
                {
                    continue;
                }

                string LinkUrlAbs = MacroscopeHttpUrlUtils.MakeUrlAbsolute(BaseUrl: this.GetUrl(), Url: LinkUrl);

                MacroscopeLink Outlink = this.AddDocumentOutlink(
                    AbsoluteUrl: LinkUrlAbs,
                    LinkType: MacroscopeConstants.InOutLinkType.SITEMAPXML,
                    Follow: true
                    );

                if (Outlink != null)
                {
                    // Keep the text exactly as it appeared in the sitemap.
                    Outlink.SetRawTargetUrl(LinkUrl);
                }
            }
        }
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Scans free text for http/https URLs and registers each one that parses as
        /// a <see cref="Uri"/> as a followable outlink of this document.
        /// </summary>
        /// <param name="TextDoc">Text to scan; null or empty text produces no outlinks.</param>
        /// <param name="LinkType">Link type passed through to <c>AddDocumentOutlink</c>.</param>
        public void ProcessPureTextOutlinks(string TextDoc, MacroscopeConstants.InOutLinkType LinkType)
        {
            // Regex.Match() throws on null input; there is nothing to scan anyway.
            if (string.IsNullOrEmpty(TextDoc))
            {
                return;
            }

            // The pattern runs up to the next whitespace, so punctuation that merely
            // surrounds a URL in prose ends up inside the match and is trimmed below.
            Regex UrlRegex = new Regex(
                @"(https?://[^/]+/[^\s]*)",
                RegexOptions.IgnoreCase
                );

            // Trimmed as ONE set so the result does not depend on the order in which
            // the characters appear (e.g. both "url)." and "url.)" reduce to "url").
            char[] SurroundingPunctuation = { ',', '.', '(', ')', '"', '\'' };

            // The for-loop header advances to the next match on every path,
            // including "continue", so the scan always terminates.
            for (Match UrlMatch = UrlRegex.Match(TextDoc); UrlMatch.Success; UrlMatch = UrlMatch.NextMatch())
            {
                string UrlProcessing = UrlMatch.Value.Trim().Trim(SurroundingPunctuation);

                if (string.IsNullOrEmpty(UrlProcessing))
                {
                    continue;
                }

                // The Uri constructor is used purely as a validator: it throws for
                // text that is not a well-formed absolute URI.
                string UrlCleaned = null;

                try
                {
                    Uri PureTextUri = new Uri(UrlProcessing);
                    if (PureTextUri != null)
                    {
                        UrlCleaned = UrlProcessing;
                    }
                }
                catch (Exception ex)
                {
                    this.DebugMsg(string.Format("ProcessPureTextOutlinks: {0}", ex.Message));
                }

                if (UrlCleaned == null)
                {
                    continue;
                }

                MacroscopeLink Outlink = this.AddDocumentOutlink(
                    AbsoluteUrl: UrlCleaned,
                    LinkType: LinkType,
                    Follow: true
                    );

                if (Outlink != null)
                {
                    Outlink.SetRawTargetUrl(TargetUrl: UrlCleaned);
                }
            }
        }
Beispiel #9
0
        /**************************************************************************/

        /// <summary>
        /// Adds a worksheet listing sitemap errors: a header row followed by one row
        /// for every outlink of an internal SITEMAPXML document whose linked document
        /// is internal and either has a 4xx/5xx status code or is not allowed by
        /// robots. URL cells are coloured green for internal URLs and gray otherwise.
        /// The filled range is finally converted into a table.
        /// </summary>
        /// <param name="JobMaster">Supplies the document collection and allowed hosts.</param>
        /// <param name="wb">Workbook that receives the new worksheet.</param>
        /// <param name="WorksheetLabel">Name of the worksheet to add.</param>
        private void BuildWorksheetSitemapXmlErrors(
            MacroscopeJobMaster JobMaster,
            XLWorkbook wb,
            string WorksheetLabel
            )
        {
            var ws = wb.Worksheets.Add(WorksheetLabel);

            int iRow    = 1;
            int iCol    = 1;
            int iColMax = 1;

            MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
            MacroscopeAllowedHosts       AllowedHosts  = JobMaster.GetAllowedHosts();

            // Header row.
            {
                ws.Cell(iRow, iCol).Value = "Sitemap URL";
                iCol++;

                ws.Cell(iRow, iCol).Value = "Status Code";
                iCol++;

                ws.Cell(iRow, iCol).Value = "Robots";
                iCol++;

                ws.Cell(iRow, iCol).Value = "URL";
            }

            // Remember the right-most column for the table range built at the end.
            iColMax = iCol;

            iRow++;

            foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
            {
                if (msDoc.GetIsInternal() && msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML))
                {
                    foreach (MacroscopeLink Outlink in msDoc.IterateOutlinks())
                    {
                        string             TargetUrl   = Outlink.GetTargetUrl();
                        MacroscopeDocument msDocLinked = DocCollection.GetDocumentByUrl(Url: TargetUrl);

                        // Guard: a URL listed in the sitemap may have no document in the
                        // collection (presumably GetDocumentByUrl yields null then - confirm).
                        // Without this check one such URL would abort the whole report.
                        if (msDocLinked == null)
                        {
                            continue;
                        }

                        bool InsertRow = false;

                        if (msDocLinked.GetIsInternal())
                        {
                            int StatusCode = (int)msDocLinked.GetStatusCode();

                            // Client and server error range.
                            if ((StatusCode >= 400) && (StatusCode <= 599))
                            {
                                InsertRow = true;
                            }

                            if (!msDocLinked.GetAllowedByRobots())
                            {
                                InsertRow = true;
                            }
                        }

                        if (InsertRow)
                        {
                            iCol = 1;

                            // NOTE(review): the status-code and robots cells are filled from
                            // the sitemap document (msDoc), not from the linked document that
                            // triggered the row - confirm this is intended.
                            this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);

                            if (AllowedHosts.IsInternalUrl(Url: msDoc.GetUrl()))
                            {
                                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
                            }
                            else
                            {
                                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
                            }

                            iCol++;

                            this.InsertAndFormatStatusCodeCell(ws, iRow, iCol, msDoc);

                            iCol++;

                            this.InsertAndFormatRobotsCell(ws, iRow, iCol, msDoc);

                            iCol++;

                            this.InsertAndFormatUrlCell(ws, iRow, iCol, TargetUrl);

                            if (AllowedHosts.IsInternalUrl(Url: TargetUrl))
                            {
                                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
                            }
                            else
                            {
                                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
                            }

                            iRow++;
                        }
                    }
                }
            }

            // Convert everything written (header plus data rows) into a table.
            {
                var rangeData = ws.Range(1, 1, iRow - 1, iColMax);
                rangeData.CreateTable();
            }
        }