/**************************************************************************/

        /// <summary>
        /// Builds a lookup from each sitemap document's URL to the set of URLs that
        /// sitemap links out to.
        /// </summary>
        /// <param name="DocCollection">Collection that is scanned for sitemap documents.</param>
        /// <returns>
        /// A dictionary keyed by sitemap URL. Each value is a dictionary keyed by outlink
        /// target URL whose flag is always initialised to <c>false</c>. If an exception is
        /// raised part-way through, it is reported via <c>DebugMsg</c> and whatever was
        /// collected up to that point is returned.
        /// </returns>
        private Dictionary<string, Dictionary<string, bool>> BuildSitemapUrlList(MacroscopeDocumentCollection DocCollection)
        {
            var SitemapIndex = new Dictionary<string, Dictionary<string, bool>>();

            try
            {
                MacroscopeDocumentList Sitemaps = this.FindSitemaps(DocCollection: DocCollection);

                foreach (MacroscopeDocument SitemapDoc in Sitemaps.IterateDocuments())
                {
                    string SitemapUrl = SitemapDoc.GetUrl();
                    Dictionary<string, bool> Targets;

                    // Resolve the per-sitemap bucket once, creating it on first sight.
                    if (!SitemapIndex.TryGetValue(SitemapUrl, out Targets))
                    {
                        Targets = new Dictionary<string, bool>();
                        SitemapIndex.Add(SitemapUrl, Targets);
                    }

                    foreach (MacroscopeLink Outlink in SitemapDoc.IterateOutlinks())
                    {
                        string TargetUrl = Outlink.GetTargetUrl();

                        // Only the first occurrence of a target is recorded.
                        if (!Targets.ContainsKey(TargetUrl))
                        {
                            Targets.Add(TargetUrl, false);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the problem and fall through with the partial map.
                this.DebugMsg(string.Format("BuildSitemapUrlList: {0}", ex.Message));
            }

            return SitemapIndex;
        }
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Extracts keyword terms (or phrases) from the given text and registers the
        /// document against every extracted term in <c>this.DocList</c>.
        /// </summary>
        /// <param name="msDoc">Document that is added to the list of each extracted term.</param>
        /// <param name="Text">Source text; it is lower-cased before being analysed.</param>
        /// <param name="Terms">Term dictionary passed through to the term/phrase analyser.</param>
        /// <param name="Words">
        /// Number of words per term: 1 uses <c>AnalyzeTerm</c>, values above 1 use
        /// <c>AnalyzePhrase</c>, anything else performs no analysis.
        /// </param>
        public void Analyze(
            MacroscopeDocument msDoc,
            string Text,
            Dictionary<string, int> Terms,
            int Words
            )
        {
            Dictionary<string, int> Extracted;

            if (Words == 1)
            {
                Extracted = this.AnalyzeTerm(Text: Text.ToLower(), Terms: Terms);
            }
            else if (Words > 1)
            {
                Extracted = this.AnalyzePhrase(Text: Text.ToLower(), Terms: Terms, Words: Words);
            }
            else
            {
                // Zero or negative word counts are ignored.
                Extracted = null;
            }

            if ((this.DocList == null) || (Extracted == null))
            {
                return;
            }

            // The per-term index is updated under a lock on the index itself.
            lock (this.DocList)
            {
                foreach (string KeywordTerm in Extracted.Keys)
                {
                    // Make sure a document list exists for the term, then append to it.
                    if (!this.DocList.ContainsKey(KeywordTerm))
                    {
                        this.DocList.Add(KeywordTerm, new MacroscopeDocumentList());
                    }

                    this.DocList[KeywordTerm].AddDocument(msDoc);
                }
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Splits the internal HTML documents of a collection into those that are listed
        /// in at least one sitemap and those that are not.
        /// </summary>
        /// <param name="DocCollection">Collection whose documents are classified.</param>
        /// <returns>
        /// A two-element list: index 0 holds the documents not found in any sitemap,
        /// index 1 holds the documents found in a sitemap. Each document in the second
        /// list carries a note with the URL of the last sitemap (in map enumeration
        /// order) that contains it.
        /// </returns>
        public List<MacroscopeDocumentList> AnalyzeInSitemaps(MacroscopeDocumentCollection DocCollection)
        {
            Dictionary<string, Dictionary<string, bool>> SitemapIndex = this.BuildSitemapUrlList(DocCollection: DocCollection);
            MacroscopeDocumentList Listed = new MacroscopeDocumentList();
            MacroscopeDocumentList Unlisted = new MacroscopeDocumentList();
            List<MacroscopeDocumentList> Result = new List<MacroscopeDocumentList>(2) { Unlisted, Listed };

            foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
            {
                string Url = msDoc.GetUrl();
                string MatchedSitemap = null;

                // Only internal HTML documents take part in the audit.
                if (msDoc.GetIsExternal() || !msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
                {
                    continue;
                }

                // Deliberately no early exit: a later matching sitemap overrides an earlier one.
                foreach (KeyValuePair<string, Dictionary<string, bool>> Entry in SitemapIndex)
                {
                    if (Entry.Value.ContainsKey(Url))
                    {
                        MatchedSitemap = Entry.Key;
                    }
                }

                if (MatchedSitemap == null)
                {
                    Unlisted.AddDocument(msDoc: msDoc);
                    continue;
                }

                Listed.AddDocument(msDoc: msDoc);
                Listed.AddDocumentNote(msDoc: msDoc, Note: MatchedSitemap);
            }

            return Result;
        }
        /**************************************************************************/

        /// <summary>
        /// Collects every document in the collection whose type is a text sitemap or
        /// an XML sitemap.
        /// </summary>
        /// <param name="DocCollection">Collection to scan.</param>
        /// <returns>A new document list containing only the sitemap documents.</returns>
        private MacroscopeDocumentList FindSitemaps(MacroscopeDocumentCollection DocCollection)
        {
            MacroscopeDocumentList Sitemaps = new MacroscopeDocumentList();

            foreach (MacroscopeDocument Candidate in DocCollection.IterateDocuments())
            {
                bool IsSitemap =
                    Candidate.IsDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPTEXT)
                    || Candidate.IsDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML);

                if (!IsSitemap)
                {
                    continue;
                }

                Sitemaps.AddDocument(msDoc: Candidate);
            }

            return Sitemaps;
        }
        /**************************************************************************/

        /// <summary>
        /// Renders the sitemaps audit for a collection in two passes: first the
        /// documents that are not in any sitemap, then the documents that are.
        /// </summary>
        /// <param name="DocCollection">Collection supplying both document lists.</param>
        private void RenderListViewSitemapsAudit(MacroscopeDocumentCollection DocCollection)
        {
            // Both lists are fetched up front, before anything is rendered.
            MacroscopeDocumentList Missing = DocCollection.GetDocumentsNotInSitemaps();
            MacroscopeDocumentList Present = DocCollection.GetDocumentsInSitemaps();

            this._RenderListViewSitemapsAudit(DocCollection: DocCollection, DocumentList: Missing, InOut: false);
            this._RenderListViewSitemapsAudit(DocCollection: DocCollection, DocumentList: Present, InOut: true);
        }
Exemple #6
0
        /**************************************************************************/

        /// <summary>
        /// Clears the view and repopulates it with one entry per orphaned document
        /// in the collection.
        /// </summary>
        /// <param name="DocCollection">Collection whose orphaned document list is displayed.</param>
        public new void RenderListView(MacroscopeDocumentCollection DocCollection)
        {
            List<ListViewItem> PendingItems = new List<ListViewItem>(1);
            MacroscopeDocumentList Orphans = DocCollection.GetOrphanedDocumentList();

            this.ClearData();

            // A missing orphan list simply results in an empty view.
            if (Orphans != null)
            {
                foreach (MacroscopeDocument Orphan in Orphans.IterateDocuments())
                {
                    this.RenderListView(
                        ListViewItems: PendingItems,
                        DocCollection: DocCollection,
                        msDoc: Orphan,
                        Url: Orphan.GetUrl()
                        );
                }
            }

            ListViewItem[] Batch = PendingItems.ToArray();
            this.DisplayListView.Items.AddRange(Batch);
        }
Exemple #7
0
        /**************************************************************************/

        /// <summary>
        /// Writes the sitemap report workbook (XML errors plus the "missing" and
        /// "present" sitemap audits) to an Excel file.
        /// </summary>
        /// <param name="JobMaster">Job whose document collection supplies the report data.</param>
        /// <param name="OutputFilename">Path of the Excel file to write.</param>
        /// <exception cref="MacroscopeSaveExcelFileException">
        /// Thrown when saving the workbook fails with an <see cref="IOException"/>.
        /// </exception>
        public void WriteXslx(MacroscopeJobMaster JobMaster, string OutputFilename)
        {
            MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
            MacroscopeDocumentList DocumentsNotInSitemaps = DocCollection.GetDocumentsNotInSitemaps();
            MacroscopeDocumentList DocumentsInSitemaps = DocCollection.GetDocumentsInSitemaps();

            // The workbook is disposable; release it deterministically on every
            // path, including when a worksheet builder or the save throws.
            using (XLWorkbook Workbook = new XLWorkbook())
            {
                this.BuildWorksheetSitemapXmlErrors(JobMaster, Workbook, "Sitemap XML Errors");

                this.BuildWorksheetSitemapsAudit(JobMaster, Workbook, "Sitemaps Audit - Missing", DocumentsNotInSitemaps, false);
                this.BuildWorksheetSitemapsAudit(JobMaster, Workbook, "Sitemaps Audit - Present", DocumentsInSitemaps, true);

                try
                {
                    Workbook.SaveAs(OutputFilename);
                }
                catch (IOException)
                {
                    // Translate low-level I/O failures into the application's own exception type.
                    throw new MacroscopeSaveExcelFileException(
                        string.Format("Cannot write to Excel file at {0}", OutputFilename)
                        );
                }
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Writes the orphaned-pages report as CSV: a header record followed by one
        /// record (URL, numeric status code, status name) per orphaned document.
        /// </summary>
        /// <param name="JobMaster">Job whose document collection supplies the orphaned document list.</param>
        /// <param name="ws">CSV writer that receives the records.</param>
        private void BuildWorksheetPageOrphanedPages(
            MacroscopeJobMaster JobMaster,
            CsvWriter ws
            )
        {
            MacroscopeDocumentList OrphanedDocumentList = JobMaster.GetDocCollection().GetOrphanedDocumentList();

            ws.WriteField("URL");
            ws.WriteField("Status Code");
            ws.WriteField("Status");
            ws.NextRecord();

            // With no orphan list available only the header record is emitted.
            if (OrphanedDocumentList == null)
            {
                return;
            }

            foreach (MacroscopeDocument msDoc in OrphanedDocumentList.IterateDocuments())
            {
                // The previously computed URL, checksum and checksum count were never
                // written anywhere, so those per-row lookups have been dropped.
                string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
                string Status = msDoc.GetStatusCode().ToString();

                this.InsertAndFormatUrlCell(ws, msDoc);
                this.InsertAndFormatContentCell(ws, StatusCode);
                this.InsertAndFormatContentCell(ws, Status);

                ws.NextRecord();
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Adds a worksheet listing, for every keyword term, one row per document in
        /// that term's document list (occurrence count, term, document URL), reports
        /// progress while doing so, and finally turns the written range into a table.
        /// </summary>
        /// <param name="JobMaster">Job whose document collection supplies the per-term document lists.</param>
        /// <param name="wb">Workbook that receives the new worksheet.</param>
        /// <param name="WorksheetLabel">Name of the worksheet to add.</param>
        /// <param name="DicTerms">Terms mapped to their occurrence counts.</param>
        private void BuildWorksheetKeywordTerms(
            MacroscopeJobMaster JobMaster,
            XLWorkbook wb,
            string WorksheetLabel,
            Dictionary<string, int> DicTerms
            )
        {
            var ws = wb.Worksheets.Add(WorksheetLabel);
            decimal TermTotal = DicTerms.Count;
            decimal TermCount = 0;

            int iRow = 1;
            int iCol = 1;
            int iColMax = 1;

            MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

            // Header row.
            ws.Cell(iRow, iCol).Value = "Occurrences";
            iCol++;

            ws.Cell(iRow, iCol).Value = "Term";
            iCol++;

            ws.Cell(iRow, iCol).Value = "URL";

            iColMax = iCol;

            iRow++;

            // Enumerate key/value pairs so the count does not need a second lookup per row.
            foreach (KeyValuePair<string, int> TermEntry in DicTerms)
            {
                string Term = TermEntry.Key;
                string Occurrences = TermEntry.Value.ToString();

                MacroscopeDocumentList DocumentList = DocCollection.GetDeepKeywordAnalysDocumentList(Term);

                decimal DocTotal = (decimal)DocumentList.CountDocuments();
                decimal DocCount = 0;
                TermCount++;

                if (TermTotal > 0)
                {
                    // -1 / null arguments are passed for the progress slots this call does not set.
                    this.ProgressForm.UpdatePercentages(
                        Title: null,
                        Message: null,
                        MajorPercentage: -1,
                        ProgressLabelMajor: null,
                        MinorPercentage: ((decimal)100 / TermTotal) * TermCount,
                        ProgressLabelMinor: "Keywords Processed",
                        SubMinorPercentage: -1,
                        ProgressLabelSubMinor: null
                        );
                }

                foreach (MacroscopeDocument msDoc in DocumentList.IterateDocuments())
                {
                    DocCount++;

                    if (DocTotal > 0)
                    {
                        this.ProgressForm.UpdatePercentages(
                            Title: null,
                            Message: null,
                            MajorPercentage: -1,
                            ProgressLabelMajor: null,
                            MinorPercentage: -1,
                            ProgressLabelMinor: null,
                            SubMinorPercentage: ((decimal)100 / DocTotal) * DocCount,
                            ProgressLabelSubMinor: "Documents Processed"
                            );
                    }

                    iCol = 1;

                    this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(Occurrences));

                    iCol++;

                    this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(Term));

                    iCol++;

                    this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc.GetUrl());

                    iRow++;
                }
            }

            // Convert everything written (header included) into a table.
            ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
        }
Exemple #10
0
        /**************************************************************************/

        /// <summary>
        /// Finds the valid documents in a collection that no other valid document
        /// links to, and adds or removes the ORPHAN1/ORPHAN2 remarks accordingly.
        /// </summary>
        /// <param name="DocCollection">Collection to analyse; every document is compared against every other.</param>
        /// <returns>A new list containing the documents judged to be orphaned.</returns>
        public MacroscopeDocumentList AnalyzeOrphanedDocumentsInCollection(MacroscopeDocumentCollection DocCollection)
        {
            MacroscopeDocumentList Orphans = new MacroscopeDocumentList();

            foreach (MacroscopeDocument Candidate in DocCollection.IterateDocuments())
            {
                string CandidateUrl = Candidate.GetUrl();

                if (!this.IsValidDocument(msDoc: Candidate))
                {
                    continue;
                }

                bool LinkedFromElsewhere = false;

                foreach (MacroscopeDocument Referrer in DocCollection.IterateDocuments())
                {
                    // A document linking to itself does not count as an inbound link.
                    if (MacroscopeHttpUrlUtils.CompareUrls(UrlLeft: CandidateUrl, UrlRight: Referrer.GetUrl()))
                    {
                        continue;
                    }

                    if (!this.IsValidDocument(msDoc: Referrer))
                    {
                        continue;
                    }

                    foreach (MacroscopeHyperlinkOut HyperlinkOut in Referrer.IterateHyperlinksOut())
                    {
                        string TargetUrl = HyperlinkOut.GetTargetUrl();
                        string RawTargetUrl = HyperlinkOut.GetRawTargetUrl();

                        // Match on the target URL first, then on the raw target URL.
                        if (MacroscopeHttpUrlUtils.CompareUrls(UrlLeft: CandidateUrl, UrlRight: TargetUrl)
                            || MacroscopeHttpUrlUtils.CompareUrls(UrlLeft: CandidateUrl, UrlRight: RawTargetUrl))
                        {
                            LinkedFromElsewhere = true;
                            break;
                        }
                    }

                    // One inbound link is enough; stop scanning further referrers.
                    if (LinkedFromElsewhere)
                    {
                        break;
                    }
                }

                if (LinkedFromElsewhere)
                {
                    Candidate.RemoveRemark("ORPHAN1");
                    Candidate.RemoveRemark("ORPHAN2");
                }
                else
                {
                    Orphans.AddDocument(msDoc: Candidate);
                    Candidate.AddRemark("ORPHAN1", "This appears to be an orphaned page, not linked to from any other HTML page in this collection.");
                    Candidate.AddRemark("ORPHAN2", "This page appears to only be referenced from one or more sitemaps.");
                }
            }

            return Orphans;
        }
        /**************************************************************************/

        /// <summary>
        /// Adds a sitemaps-audit worksheet with one row per internal HTML document in
        /// the given list (URL, in-sitemap flag, status code, redirect flag, robots
        /// state, sitemap URL) and turns the written range into a table.
        /// </summary>
        /// <param name="JobMaster">Job supplying the allowed-hosts lookup used to colour the sitemap URL.</param>
        /// <param name="wb">Workbook that receives the new worksheet.</param>
        /// <param name="WorksheetLabel">Name of the worksheet to add.</param>
        /// <param name="DocumentList">Documents to list; also the source of each document's sitemap note.</param>
        /// <param name="InOut">Value written to the "In Sitemap" column for every row.</param>
        private void BuildWorksheetSitemapsAudit(
            MacroscopeJobMaster JobMaster,
            XLWorkbook wb,
            string WorksheetLabel,
            MacroscopeDocumentList DocumentList,
            bool InOut
            )
        {
            var ws = wb.Worksheets.Add(WorksheetLabel);

            int iRow = 1;
            int iCol = 1;
            int iColMax = 1;

            MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

            // Header row.
            ws.Cell(iRow, iCol).Value = "URL";
            iCol++;

            ws.Cell(iRow, iCol).Value = "In Sitemap";
            iCol++;

            ws.Cell(iRow, iCol).Value = "Status Code";
            iCol++;

            ws.Cell(iRow, iCol).Value = "Is Redirect";
            iCol++;

            ws.Cell(iRow, iCol).Value = "Robots";
            iCol++;

            ws.Cell(iRow, iCol).Value = "Sitemap";

            iColMax = iCol;

            iRow++;

            foreach (MacroscopeDocument msDoc in DocumentList.IterateDocuments())
            {
                // Only internal HTML documents are reported.
                if (!msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
                {
                    continue;
                }

                if (msDoc.GetIsExternal())
                {
                    continue;
                }

                // NOTE(review): the note may be absent for documents in the "missing"
                // list; confirm the URL cell helper and IsInternalUrl tolerate that.
                string SitemapUrl = DocumentList.GetDocumentNote(msDoc: msDoc);

                iCol = 1;

                this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(msDoc.GetIsInternal() ? XLColor.Green : XLColor.Gray);

                iCol++;

                this.InsertAndFormatContentCell(ws, iRow, iCol, InOut.ToString());
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(InOut ? XLColor.Green : XLColor.Red);

                iCol++;

                this.InsertAndFormatStatusCodeCell(ws, iRow, iCol, msDoc);

                iCol++;

                this.InsertAndFormatRedirectCell(ws, iRow, iCol, msDoc);

                iCol++;

                this.InsertAndFormatRobotsCell(ws, iRow, iCol, msDoc);

                iCol++;

                this.InsertAndFormatUrlCell(ws, iRow, iCol, SitemapUrl);
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(
                    AllowedHosts.IsInternalUrl(Url: SitemapUrl) ? XLColor.Green : XLColor.Gray
                    );

                iRow++;
            }

            // Convert everything written (header included) into a table.
            ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
        }
        /**************************************************************************/

        /// <summary>
        /// Adds an orphaned-pages worksheet: a header row followed by one row
        /// (URL, numeric status code, status name) per orphaned document. The written
        /// range is turned into a table only when an orphan list is available.
        /// </summary>
        /// <param name="JobMaster">Job supplying the document collection and the allowed-hosts lookup.</param>
        /// <param name="wb">Workbook that receives the new worksheet.</param>
        /// <param name="WorksheetLabel">Name of the worksheet to add.</param>
        private void BuildWorksheetPageOrphanedPages(
            MacroscopeJobMaster JobMaster,
            XLWorkbook wb,
            string WorksheetLabel
            )
        {
            var Sheet = wb.Worksheets.Add(WorksheetLabel);

            MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
            MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();
            MacroscopeDocumentList Orphans = DocCollection.GetOrphanedDocumentList();

            // Header row: columns 1..3.
            Sheet.Cell(1, 1).Value = "URL";
            Sheet.Cell(1, 2).Value = "Status Code";
            Sheet.Cell(1, 3).Value = "Status";

            const int LastColumn = 3;
            int NextRow = 2;

            if (Orphans == null)
            {
                // No orphan list: leave the sheet with just the header and no table.
                return;
            }

            foreach (MacroscopeDocument Orphan in Orphans.IterateDocuments())
            {
                string Url = Orphan.GetUrl();
                string StatusCode = ((int)Orphan.GetStatusCode()).ToString();
                string Status = Orphan.GetStatusCode().ToString();

                this.InsertAndFormatUrlCell(Sheet, NextRow, 1, Orphan);
                Sheet.Cell(NextRow, 1).Style.Font.SetFontColor(
                    AllowedHosts.IsInternalUrl(Url: Url) ? XLColor.Green : XLColor.Gray
                    );

                this.InsertAndFormatContentCell(Sheet, NextRow, 2, StatusCode);
                this.InsertAndFormatContentCell(Sheet, NextRow, 3, Status);

                NextRow++;
            }

            // Convert everything written (header included) into a table.
            Sheet.Range(1, 1, NextRow - 1, LastColumn).CreateTable();
        }
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Renders one list-view row per internal HTML document in the given list,
        /// updating an existing row when one with the same key is already displayed
        /// and queuing a new row otherwise, then colours the row's cells.
        /// </summary>
        /// <param name="DocCollection">Collection the documents belong to (not read by this method).</param>
        /// <param name="DocumentList">Documents to render; also the source of each document's sitemap note.</param>
        /// <param name="InOut">Value shown in the "in sitemap" column for every row.</param>
        private void _RenderListViewSitemapsAudit(
            MacroscopeDocumentCollection DocCollection,
            MacroscopeDocumentList DocumentList,
            bool InOut
            )
        {
            List<ListViewItem> ListViewItems = new List<ListViewItem>(1);

            foreach (MacroscopeDocument msDoc in DocumentList.IterateDocuments())
            {
                ListViewItem lvItem = null;

                // Only internal HTML documents are shown.
                if (!msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
                {
                    continue;
                }

                if (msDoc.GetIsExternal())
                {
                    continue;
                }

                string Url = msDoc.GetUrl();
                int StatusCode = (int)msDoc.GetStatusCode();
                string Robots = msDoc.GetAllowedByRobotsAsString();
                string SitemapUrl = DocumentList.GetDocumentNote(msDoc: msDoc);

                // The row key is built from the URL alone.
                string PairKey = string.Join("::::::::", Url);

                if (this.DisplayListView.Items.ContainsKey(PairKey))
                {
                    // Row already displayed: refresh its cell texts in place.
                    try
                    {
                        lvItem = this.DisplayListView.Items[PairKey];
                        lvItem.SubItems[COL_URL].Text = Url;
                        lvItem.SubItems[COL_IN_SITEMAP].Text = InOut.ToString();
                        lvItem.SubItems[COL_STATUS_CODE].Text = msDoc.GetStatusCode().ToString();
                        lvItem.SubItems[COL_IS_REDIRECT].Text = msDoc.GetIsRedirect().ToString();
                        lvItem.SubItems[COL_ROBOTS].Text = Robots;
                        lvItem.SubItems[COL_SITEMAP].Text = SitemapUrl;
                    }
                    catch (Exception ex)
                    {
                        DebugMsg(string.Format("_RenderListViewSitemapsAudit 1: {0}", ex.Message));
                    }
                }
                else
                {
                    // New row: build it and queue it for a single bulk insert at the end.
                    try
                    {
                        lvItem = new ListViewItem(PairKey);
                        lvItem.UseItemStyleForSubItems = false;
                        lvItem.Name = PairKey;

                        lvItem.SubItems[COL_URL].Text = Url;
                        lvItem.SubItems.Add(InOut.ToString());
                        lvItem.SubItems.Add(msDoc.GetStatusCode().ToString());
                        lvItem.SubItems.Add(msDoc.GetIsRedirect().ToString());
                        lvItem.SubItems.Add(Robots);
                        lvItem.SubItems.Add(SitemapUrl);

                        ListViewItems.Add(lvItem);
                    }
                    catch (Exception ex)
                    {
                        DebugMsg(string.Format("_RenderListViewSitemapsAudit 2: {0}", ex.Message));
                    }
                }

                // No row could be obtained (the failure was already reported above).
                // The previous code dereferenced the null item here, which always
                // raised and logged a second, misleading NullReferenceException.
                if (lvItem == null)
                {
                    continue;
                }

                try
                {
                    lvItem.ForeColor = Color.Blue;

                    lvItem.SubItems[COL_URL].ForeColor = msDoc.GetIsInternal() ? Color.Green : Color.Gray;

                    lvItem.SubItems[COL_IN_SITEMAP].ForeColor = InOut ? Color.Green : Color.Red;

                    // Status code colour by class; 410 gets its own colour.
                    Color StatusColor;

                    if (StatusCode == 410)
                    {
                        StatusColor = Color.Purple;
                    }
                    else if ((StatusCode >= 200) && (StatusCode <= 299))
                    {
                        StatusColor = Color.Green;
                    }
                    else if ((StatusCode >= 300) && (StatusCode <= 399))
                    {
                        StatusColor = Color.Goldenrod;
                    }
                    else if ((StatusCode >= 400) && (StatusCode <= 599))
                    {
                        StatusColor = Color.Red;
                    }
                    else
                    {
                        StatusColor = Color.Blue;
                    }

                    lvItem.SubItems[COL_STATUS_CODE].ForeColor = StatusColor;

                    lvItem.SubItems[COL_IS_REDIRECT].ForeColor = msDoc.GetIsRedirect() ? Color.Goldenrod : Color.Gray;

                    lvItem.SubItems[COL_ROBOTS].ForeColor = msDoc.GetAllowedByRobots() ? Color.Green : Color.Red;

                    // NOTE(review): the sitemap cell is coloured by the document's own
                    // internal flag, not by the sitemap URL - confirm this is intended.
                    lvItem.SubItems[COL_SITEMAP].ForeColor = msDoc.GetIsInternal() ? Color.Green : Color.Gray;
                }
                catch (Exception ex)
                {
                    DebugMsg(string.Format("_RenderListViewSitemapsAudit 3: {0}", ex.Message));
                }
            }

            this.DisplayListView.Items.AddRange(ListViewItems.ToArray());
        }