// Verifies that two URLs expected to serve the same page (the site root and its
        // index.html alias) are reported by the Levenshtein analysis with a distance of 0.
        // The test fetches both documents over the network (LIVE run-time mode), registers
        // them in the document collection, then runs the analysis 100 times against the
        // same cross-check list and asserts on every document each run reports.
        public async Task TestDuplicate()
        {
            const string StartUrl = "https://nazuke.github.io/SEOMacroscope/";
            const string DupeUrl  = "https://nazuke.github.io/SEOMacroscope/index.html";

            MacroscopeJobMaster JobMaster = new MacroscopeJobMaster(
                JobRunTimeMode: MacroscopeConstants.RunTimeMode.LIVE,
                TaskController: this
                );

            MacroscopeDocumentCollection DocCollection = new MacroscopeDocumentCollection(JobMaster: JobMaster);

            Dictionary <string, bool> CrossCheckList = MacroscopeLevenshteinAnalysis.GetCrossCheckList(
                Capacity: DocCollection.CountDocuments()
                );

            MacroscopeDocument msDoc          = DocCollection.CreateDocument(StartUrl);
            MacroscopeDocument msDocDifferent = DocCollection.CreateDocument(DupeUrl);

            await msDoc.Execute();
            await msDocDifferent.Execute();

            // AnalyzeDocCollection() is handed the collection, so the fetched documents have to be
            // registered in it; otherwise the loop below has nothing to inspect and the assertions
            // never run.
            // NOTE(review): assumes CreateDocument() does not already register the document - confirm.
            DocCollection.AddDocument(msDoc);
            DocCollection.AddDocument(msDocDifferent);

            DebugMsg(string.Format("msDoc: {0}", msDoc.GetStatusCode()));
            DebugMsg(string.Format("msDocDifferent: {0}", msDocDifferent.GetStatusCode()));

            for (int i = 1; i <= 100; i++)
            {
                // A fresh analyzer is built on every pass, but all passes share CrossCheckList.
                MacroscopeLevenshteinAnalysis LevenshteinAnalysis = new MacroscopeLevenshteinAnalysis(
                    msDoc: msDoc,
                    SizeDifference: 64,
                    Threshold: 16,
                    CrossCheckList: CrossCheckList
                    );

                Dictionary <MacroscopeDocument, int> DocList = LevenshteinAnalysis.AnalyzeDocCollection(
                    DocCollection: DocCollection
                    );

                DebugMsg(string.Format("DocList: {0}", DocList.Count));

                // Enumerate the pairs directly rather than looking each key up again.
                foreach (KeyValuePair <MacroscopeDocument, int> Analyzed in DocList)
                {
                    int    Distance    = Analyzed.Value;
                    string AnalyzedUrl = Analyzed.Key.GetUrl();

                    DebugMsg(string.Format("msDocAnalyzed: {0} => {1}", Distance, AnalyzedUrl));

                    // Expected value goes first so a failure reads "expected 0 but was <distance>".
                    Assert.AreEqual(
                        0,
                        Distance,
                        string.Format("FAIL: {0} => {1}", Distance, AnalyzedUrl)
                        );
                }
            }
        }
        /// <summary>
        /// Verifies that distinct pages are never reported with a Levenshtein distance of 0.
        /// The start page and three other pages of the same site are fetched over the network
        /// (LIVE run-time mode) and added to the collection; the analysis is then run ten times
        /// for the start page, asserting on every document each run reports.
        /// </summary>
        /// <returns>A task that completes when all fetches and assertions have finished.</returns>
        public async Task TestDifferent()
        {
            const string StartUrl = "https://nazuke.github.io/SEOMacroscope/";

            MacroscopeJobMaster JobMaster = new MacroscopeJobMaster(
                JobRunTimeMode: MacroscopeConstants.RunTimeMode.LIVE,
                TaskController: this
                );

            MacroscopeDocumentCollection DocCollection = new MacroscopeDocumentCollection(JobMaster: JobMaster);

            Dictionary <string, bool> CrossCheckList = MacroscopeLevenshteinAnalysis.GetCrossCheckList(
                Capacity: DocCollection.CountDocuments()
                );

            MacroscopeDocument msDoc = DocCollection.CreateDocument(StartUrl);

            // Execute() is awaitable; it must complete before the document is registered and
            // analyzed, otherwise the analysis can run against a document that is not loaded yet.
            await msDoc.Execute();
            DocCollection.AddDocument(msDoc);

            DebugMsg(string.Format("msDoc: {0}", msDoc.GetStatusCode()));

            MacroscopeLevenshteinAnalysis LevenshteinAnalysis = new MacroscopeLevenshteinAnalysis(
                msDoc: msDoc,
                SizeDifference: 64,
                Threshold: 16,
                CrossCheckList: CrossCheckList
                );

            List <string> TargetUrls = new List <string> ()
            {
                "https://nazuke.github.io/SEOMacroscope/blog/",
                "https://nazuke.github.io/SEOMacroscope/downloads/",
                "https://nazuke.github.io/SEOMacroscope/manual/"
            };

            foreach (string TargetUrl in TargetUrls)
            {
                MacroscopeDocument msDocTarget = DocCollection.CreateDocument(TargetUrl);

                // Fetch each comparison page fully before it becomes visible to the analysis.
                await msDocTarget.Execute();
                DocCollection.AddDocument(msDocTarget);

                DebugMsg(string.Format("msDocTarget: {0}", msDocTarget.GetStatusCode()));
            }

            for (int i = 1; i <= 10; i++)
            {
                Dictionary <MacroscopeDocument, int> DocList = LevenshteinAnalysis.AnalyzeDocCollection(
                    DocCollection: DocCollection
                    );

                DebugMsg(string.Format("DocList: {0}", DocList.Count));

                // Enumerate the pairs directly rather than looking each key up again.
                foreach (KeyValuePair <MacroscopeDocument, int> Analyzed in DocList)
                {
                    int    Distance    = Analyzed.Value;
                    string AnalyzedUrl = Analyzed.Key.GetUrl();

                    DebugMsg(string.Format("msDocAnalyzed: {0} => {1}", Distance, AnalyzedUrl));

                    // The value that must NOT occur goes first, matching the (notExpected, actual) order.
                    Assert.AreNotEqual(
                        0,
                        Distance,
                        string.Format("FAIL: {0} => {1}", Distance, AnalyzedUrl)
                        );
                }
            }
        }
Example #3
0
        /**************************************************************************/

        /// <summary>
        /// Adds a worksheet to <paramref name="wb"/> that lists, for each internal HTML document in
        /// the job's document collection, the documents the Levenshtein analysis returns for it.
        /// Each row holds the origin document's status code and status, the origin URL, the
        /// distance, and the similar document's URL. Progress is reported through
        /// <c>this.ProgressForm</c>, and the work stops early once the form reports cancellation.
        /// </summary>
        /// <param name="JobMaster">Supplies the document collection and the allowed-hosts list.</param>
        /// <param name="wb">Workbook that receives the new worksheet.</param>
        /// <param name="WorksheetLabel">Name given to the new worksheet.</param>
        private void BuildWorksheetPageDuplicatePages(
            MacroscopeJobMaster JobMaster,
            XLWorkbook wb,
            string WorksheetLabel
            )
        {
            // Fixed column layout; the header labels below are listed in the same order.
            const int ColStatusCode = 1;
            const int ColStatus     = 2;
            const int ColOriginUrl  = 3;
            const int ColDistance   = 4;
            const int ColSimilarUrl = 5;

            string[] HeaderLabels = { "Status Code", "Status", "Origin URL", "Distance", "Similar URL" };

            var Sheet = wb.Worksheets.Add(WorksheetLabel);

            MacroscopeDocumentCollection Documents = JobMaster.GetDocCollection();
            MacroscopeAllowedHosts       Hosts     = JobMaster.GetAllowedHosts();

            // One cross-check list is shared by every analyzer created in the loop below.
            Dictionary <string, bool> CrossCheckList = MacroscopeLevenshteinAnalysis.GetCrossCheckList(
                Capacity: Documents.CountDocuments()
                );

            decimal TotalDocs = ( decimal )Documents.CountDocuments();
            decimal DocsSeen  = 0;
            int     RowIndex  = 1;

            // Header row.
            for (int HeaderIndex = 0; HeaderIndex < HeaderLabels.Length; HeaderIndex++)
            {
                Sheet.Cell(RowIndex, HeaderIndex + 1).Value = HeaderLabels[HeaderIndex];
            }

            RowIndex++;

            foreach (string OriginUrl in Documents.DocumentUrls())
            {
                MacroscopeDocument OriginDoc = Documents.GetDocumentByUrl(Url: OriginUrl);
                decimal            PairsSeen = 0;

                DocsSeen++;

                if (TotalDocs > 0)
                {
                    decimal OuterPercent = (100m / TotalDocs) * DocsSeen;

                    this.ProgressForm.UpdatePercentages(
                        Title: null,
                        Message: null,
                        MajorPercentage: -1,
                        ProgressLabelMajor: string.Format("Documents Processed: {0}", DocsSeen),
                        MinorPercentage: OuterPercent,
                        ProgressLabelMinor: OriginUrl,
                        SubMinorPercentage: 0,
                        ProgressLabelSubMinor: ""
                        );
                }

                // Only internal HTML documents are analyzed.
                if (OriginDoc.GetIsExternal() || !OriginDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
                {
                    continue;
                }

                MacroscopeLevenshteinAnalysis Analyzer = new MacroscopeLevenshteinAnalysis(
                    msDoc: OriginDoc,
                    SizeDifference: MacroscopePreferencesManager.GetMaxLevenshteinSizeDifference(),
                    Threshold: MacroscopePreferencesManager.GetMaxLevenshteinDistance(),
                    CrossCheckList: CrossCheckList,
                    IPercentageDone: this
                    );

                Dictionary <MacroscopeDocument, int> SimilarDocs = Analyzer.AnalyzeDocCollection(
                    DocCollection: Documents
                    );

                decimal PairTotal = ( decimal )SimilarDocs.Count;

                foreach (KeyValuePair <MacroscopeDocument, int> Pair in SimilarDocs)
                {
                    int            StatusCode = ( int )OriginDoc.GetStatusCode();
                    HttpStatusCode Status     = OriginDoc.GetStatusCode();
                    string         SimilarUrl = Pair.Key.GetUrl();
                    int            Distance   = Pair.Value;

                    PairsSeen++;

                    if (TotalDocs > 0)
                    {
                        decimal OuterPercent = (100m / TotalDocs) * DocsSeen;
                        // PairTotal is at least 1 here because this loop body only runs for a non-empty result.
                        decimal InnerPercent = (100m / PairTotal) * PairsSeen;

                        this.ProgressForm.UpdatePercentages(
                            Title: null,
                            Message: null,
                            MajorPercentage: -1,
                            ProgressLabelMajor: string.Format("Documents Processed: {0}", DocsSeen),
                            MinorPercentage: OuterPercent,
                            ProgressLabelMinor: OriginUrl,
                            SubMinorPercentage: InnerPercent,
                            ProgressLabelSubMinor: SimilarUrl
                            );
                    }

                    this.InsertAndFormatStatusCodeCell(Sheet, RowIndex, ColStatusCode, StatusCode);
                    this.InsertAndFormatStatusCodeCell(Sheet, RowIndex, ColStatus, Status);

                    // Origin URL: green when the allowed-hosts list treats it as internal, gray otherwise.
                    this.InsertAndFormatUrlCell(Sheet, RowIndex, ColOriginUrl, OriginUrl);
                    XLColor OriginColor = Hosts.IsInternalUrl(Url: OriginUrl) ? XLColor.Green : XLColor.Gray;
                    Sheet.Cell(RowIndex, ColOriginUrl).Style.Font.SetFontColor(OriginColor);

                    // Distance: red when it is within the configured maximum distance, green otherwise.
                    this.InsertAndFormatContentCell(Sheet, RowIndex, ColDistance, Distance.ToString());
                    XLColor DistanceColor = Distance <= MacroscopePreferencesManager.GetMaxLevenshteinDistance()
                        ? XLColor.Red
                        : XLColor.Green;
                    Sheet.Cell(RowIndex, ColDistance).Style.Font.SetFontColor(DistanceColor);

                    // Similar URL: same internal/external coloring as the origin URL.
                    this.InsertAndFormatUrlCell(Sheet, RowIndex, ColSimilarUrl, SimilarUrl);
                    XLColor SimilarColor = Hosts.IsInternalUrl(Url: SimilarUrl) ? XLColor.Green : XLColor.Gray;
                    Sheet.Cell(RowIndex, ColSimilarUrl).Style.Font.SetFontColor(SimilarColor);

                    RowIndex++;

                    if (this.ProgressForm.Cancelled())
                    {
                        break;
                    }
                }

                // Checked again here so a cancellation also ends the outer loop.
                if (this.ProgressForm.Cancelled())
                {
                    break;
                }

                //Thread.Yield();
            }

            // Turn everything written so far (header plus data rows) into a table.
            Sheet.Range(1, 1, RowIndex - 1, ColSimilarUrl).CreateTable();
        }
Example #4
0
        /**************************************************************************/

        /// <summary>
        /// Writes a CSV report that lists, for each internal HTML document in the job's document
        /// collection, the documents the Levenshtein analysis returns for it. Each record holds the
        /// origin document's status code and status, the origin URL, the distance, and the similar
        /// document's URL. Progress is reported through <c>this.ProgressForm</c>, and the work stops
        /// early once the form reports cancellation.
        /// </summary>
        /// <param name="JobMaster">Supplies the document collection to analyze.</param>
        /// <param name="ws">CSV writer that receives the header record and the data records.</param>
        private void BuildWorksheetPageDuplicatePages(
            MacroscopeJobMaster JobMaster,
            CsvWriter ws
            )
        {
            string[] HeaderLabels = { "Status Code", "Status", "Origin URL", "Distance", "Similar URL" };

            MacroscopeDocumentCollection Documents = JobMaster.GetDocCollection();

            // The result is not used below; the call is kept so the method still queries the job master for it.
            MacroscopeAllowedHosts Hosts = JobMaster.GetAllowedHosts();

            // One cross-check list is shared by every analyzer created in the loop below.
            Dictionary <string, bool> CrossCheckList = MacroscopeLevenshteinAnalysis.GetCrossCheckList(
                Capacity: Documents.CountDocuments()
                );

            decimal TotalDocs = ( decimal )Documents.CountDocuments();
            decimal DocsSeen  = 0;

            // Header record.
            foreach (string HeaderLabel in HeaderLabels)
            {
                ws.WriteField(HeaderLabel);
            }

            ws.NextRecord();

            foreach (string OriginUrl in Documents.DocumentUrls())
            {
                MacroscopeDocument OriginDoc = Documents.GetDocumentByUrl(Url: OriginUrl);
                decimal            PairsSeen = 0;

                DocsSeen++;

                if (TotalDocs > 0)
                {
                    decimal OuterPercent = (100m / TotalDocs) * DocsSeen;

                    this.ProgressForm.UpdatePercentages(
                        Title: null,
                        Message: null,
                        MajorPercentage: -1,
                        ProgressLabelMajor: string.Format("Documents Processed: {0}", DocsSeen),
                        MinorPercentage: OuterPercent,
                        ProgressLabelMinor: OriginUrl,
                        SubMinorPercentage: 0,
                        ProgressLabelSubMinor: ""
                        );
                }

                // Only internal HTML documents are analyzed.
                if (OriginDoc.GetIsExternal() || !OriginDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
                {
                    continue;
                }

                MacroscopeLevenshteinAnalysis Analyzer = new MacroscopeLevenshteinAnalysis(
                    msDoc: OriginDoc,
                    SizeDifference: MacroscopePreferencesManager.GetMaxLevenshteinSizeDifference(),
                    Threshold: MacroscopePreferencesManager.GetMaxLevenshteinDistance(),
                    CrossCheckList: CrossCheckList,
                    IPercentageDone: this
                    );

                Dictionary <MacroscopeDocument, int> SimilarDocs = Analyzer.AnalyzeDocCollection(
                    DocCollection: Documents
                    );

                decimal PairTotal = ( decimal )SimilarDocs.Count;

                foreach (KeyValuePair <MacroscopeDocument, int> Pair in SimilarDocs)
                {
                    int            StatusCode = ( int )OriginDoc.GetStatusCode();
                    HttpStatusCode Status     = OriginDoc.GetStatusCode();
                    string         SimilarUrl = Pair.Key.GetUrl();
                    int            Distance   = Pair.Value;

                    PairsSeen++;

                    if (TotalDocs > 0)
                    {
                        decimal OuterPercent = (100m / TotalDocs) * DocsSeen;
                        // PairTotal is at least 1 here because this loop body only runs for a non-empty result.
                        decimal InnerPercent = (100m / PairTotal) * PairsSeen;

                        this.ProgressForm.UpdatePercentages(
                            Title: null,
                            Message: null,
                            MajorPercentage: -1,
                            ProgressLabelMajor: string.Format("Documents Processed: {0}", DocsSeen),
                            MinorPercentage: OuterPercent,
                            ProgressLabelMinor: OriginUrl,
                            SubMinorPercentage: InnerPercent,
                            ProgressLabelSubMinor: SimilarUrl
                            );
                    }

                    // Fields are written in the same order as the header labels.
                    this.InsertAndFormatStatusCodeCell(ws, StatusCode);
                    this.InsertAndFormatStatusCodeCell(ws, Status);
                    this.InsertAndFormatUrlCell(ws, OriginUrl);
                    this.InsertAndFormatContentCell(ws, Distance.ToString());
                    this.InsertAndFormatUrlCell(ws, SimilarUrl);

                    ws.NextRecord();

                    if (this.ProgressForm.Cancelled())
                    {
                        break;
                    }
                }

                // Checked again here so a cancellation also ends the outer loop.
                if (this.ProgressForm.Cancelled())
                {
                    break;
                }

                //Thread.Yield();
            }
        }