/// <summary>
/// Executes the documents created for <c>StartUrl</c> and <c>DupeUrl</c>, adds
/// both to a fresh document collection, then runs the Levenshtein analysis
/// 100 times and asserts that every document it reports has a distance of 0.
/// </summary>
/// <returns>A task that completes when the test body has finished.</returns>
public async Task TestDuplicate()
{
    const string StartUrl = "https://nazuke.github.io/SEOMacroscope/";
    const string DupeUrl = "https://nazuke.github.io/SEOMacroscope/index.html";

    MacroscopeJobMaster JobMaster = new MacroscopeJobMaster(
        JobRunTimeMode: MacroscopeConstants.RunTimeMode.LIVE,
        TaskController: this
    );
    MacroscopeDocumentCollection DocCollection = new MacroscopeDocumentCollection(JobMaster: JobMaster);

    // The cross-check list is shared by every analysis instance created below.
    Dictionary<string, bool> CrossCheckList = MacroscopeLevenshteinAnalysis.GetCrossCheckList(
        Capacity: DocCollection.CountDocuments()
    );

    MacroscopeDocument msDoc = DocCollection.CreateDocument(StartUrl);
    MacroscopeDocument msDocDifferent = DocCollection.CreateDocument(DupeUrl);

    await msDoc.Execute();
    await msDocDifferent.Execute();

    // AnalyzeDocCollection() is handed DocCollection, so the executed documents
    // must be registered in it; otherwise the assertions below may never run.
    // NOTE(review): assumes CreateDocument() does not register the document
    // itself, as TestDifferent's explicit AddDocument() calls suggest - confirm.
    DocCollection.AddDocument(msDoc);
    DocCollection.AddDocument(msDocDifferent);

    DebugMsg(string.Format("msDoc: {0}", msDoc.GetStatusCode()));
    DebugMsg(string.Format("msDocDifferent: {0}", msDocDifferent.GetStatusCode()));

    for (int i = 1; i <= 100; i++)
    {
        MacroscopeLevenshteinAnalysis LevenshteinAnalysis = new MacroscopeLevenshteinAnalysis(
            msDoc: msDoc,
            SizeDifference: 64,
            Threshold: 16,
            CrossCheckList: CrossCheckList
        );

        Dictionary<MacroscopeDocument, int> DocList = LevenshteinAnalysis.AnalyzeDocCollection(
            DocCollection: DocCollection
        );

        DebugMsg(string.Format("DocList: {0}", DocList.Count));

        foreach (KeyValuePair<MacroscopeDocument, int> Entry in DocList)
        {
            int Distance = Entry.Value;
            string AnalyzedUrl = Entry.Key.GetUrl();

            DebugMsg(string.Format("msDocAnalyzed: {0} => {1}", Distance, AnalyzedUrl));

            // Expected value first, so a failure reports "expected 0 but was N".
            Assert.AreEqual(
                0,
                Distance,
                string.Format("FAIL: {0} => {1}", Distance, AnalyzedUrl)
            );
        }
    }
}
/// <summary>
/// Executes the document for <c>StartUrl</c> plus three further target URLs,
/// adds them all to one document collection, then runs the Levenshtein
/// analysis 10 times and asserts that no reported document has a distance of 0.
/// </summary>
/// <returns>A task that completes when the test body has finished.</returns>
public async Task TestDifferent()
{
    const string StartUrl = "https://nazuke.github.io/SEOMacroscope/";

    MacroscopeJobMaster JobMaster = new MacroscopeJobMaster(
        JobRunTimeMode: MacroscopeConstants.RunTimeMode.LIVE,
        TaskController: this
    );
    MacroscopeDocumentCollection DocCollection = new MacroscopeDocumentCollection(JobMaster: JobMaster);
    Dictionary<string, bool> CrossCheckList = MacroscopeLevenshteinAnalysis.GetCrossCheckList(
        Capacity: DocCollection.CountDocuments()
    );

    MacroscopeDocument msDoc = DocCollection.CreateDocument(StartUrl);

    // Execute() is awaitable; await it so it has completed (and any exception
    // has surfaced) before the document is added to the collection and analysed.
    await msDoc.Execute();
    DocCollection.AddDocument(msDoc);

    DebugMsg(string.Format("msDoc: {0}", msDoc.GetStatusCode()));

    MacroscopeLevenshteinAnalysis LevenshteinAnalysis = new MacroscopeLevenshteinAnalysis(
        msDoc: msDoc,
        SizeDifference: 64,
        Threshold: 16,
        CrossCheckList: CrossCheckList
    );

    List<string> TargetUrls = new List<string>()
    {
        "https://nazuke.github.io/SEOMacroscope/blog/",
        "https://nazuke.github.io/SEOMacroscope/downloads/",
        "https://nazuke.github.io/SEOMacroscope/manual/"
    };

    foreach (string TargetUrl in TargetUrls)
    {
        MacroscopeDocument msDocTarget = DocCollection.CreateDocument(TargetUrl);

        await msDocTarget.Execute();
        DocCollection.AddDocument(msDocTarget);

        DebugMsg(string.Format("msDocTarget: {0}", msDocTarget.GetStatusCode()));
    }

    for (int i = 1; i <= 10; i++)
    {
        Dictionary<MacroscopeDocument, int> DocList = LevenshteinAnalysis.AnalyzeDocCollection(
            DocCollection: DocCollection
        );

        DebugMsg(string.Format("DocList: {0}", DocList.Count));

        foreach (KeyValuePair<MacroscopeDocument, int> Entry in DocList)
        {
            int Distance = Entry.Value;
            string AnalyzedUrl = Entry.Key.GetUrl();

            DebugMsg(string.Format("msDocAnalyzed: {0} => {1}", Distance, AnalyzedUrl));

            // The value that must NOT occur goes first, the observed value second.
            Assert.AreNotEqual(
                0,
                Distance,
                string.Format("FAIL: {0} => {1}", Distance, AnalyzedUrl)
            );
        }
    }
}
/**************************************************************************/

/// <summary>
/// Adds a worksheet named <paramref name="WorksheetLabel"/> to
/// <paramref name="wb"/> and writes one row per (origin document, analysed
/// document) pair returned by the Levenshtein analysis. Only documents that are
/// not external and are of type HTML are used as origins. Progress is reported
/// through <c>ProgressForm</c>, and both loops stop once it reports cancellation.
/// </summary>
/// <param name="JobMaster">Supplies the document collection and the allowed hosts.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
private void BuildWorksheetPageDuplicatePages(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
)
{
    // Fixed 1-based column positions of the report.
    const int ColStatusCode = 1;
    const int ColStatus = 2;
    const int ColOriginUrl = 3;
    const int ColDistance = 4;
    const int ColSimilarUrl = 5;

    string[] HeaderLabels = { "Status Code", "Status", "Origin URL", "Distance", "Similar URL" };

    var ws = wb.Worksheets.Add(WorksheetLabel);

    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
    MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

    Dictionary<string, bool> CrossCheckList = MacroscopeLevenshteinAnalysis.GetCrossCheckList(
        Capacity: DocCollection.CountDocuments()
    );

    decimal TotalDocs = (decimal)DocCollection.CountDocuments();
    decimal OuterDone = 0;

    // Header row.
    for (int Index = 0; Index < HeaderLabels.Length; Index++)
    {
        ws.Cell(1, Index + 1).Value = HeaderLabels[Index];
    }

    int LastColumn = HeaderLabels.Length;
    int RowIndex = 2;

    foreach (string UrlLeft in DocCollection.DocumentUrls())
    {
        MacroscopeDocument msDocLeft = DocCollection.GetDocumentByUrl(Url: UrlLeft);

        OuterDone++;

        if (TotalDocs > 0)
        {
            this.ProgressForm.UpdatePercentages(
                Title: null,
                Message: null,
                MajorPercentage: -1,
                ProgressLabelMajor: string.Format("Documents Processed: {0}", OuterDone),
                MinorPercentage: ((decimal)100 / TotalDocs) * OuterDone,
                ProgressLabelMinor: UrlLeft,
                SubMinorPercentage: 0,
                ProgressLabelSubMinor: ""
            );
        }

        // Only non-external HTML documents are analysed.
        if (msDocLeft.GetIsExternal())
        {
            continue;
        }

        if (!msDocLeft.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
        {
            continue;
        }

        MacroscopeLevenshteinAnalysis Analysis = new MacroscopeLevenshteinAnalysis(
            msDoc: msDocLeft,
            SizeDifference: MacroscopePreferencesManager.GetMaxLevenshteinSizeDifference(),
            Threshold: MacroscopePreferencesManager.GetMaxLevenshteinDistance(),
            CrossCheckList: CrossCheckList,
            IPercentageDone: this
        );

        Dictionary<MacroscopeDocument, int> Matches = Analysis.AnalyzeDocCollection(
            DocCollection: DocCollection
        );

        decimal TotalMatches = (decimal)Matches.Count;
        decimal InnerDone = 0;

        foreach (KeyValuePair<MacroscopeDocument, int> Match in Matches)
        {
            // The status columns describe the origin document, not the match.
            int StatusCode = (int)msDocLeft.GetStatusCode();
            HttpStatusCode Status = msDocLeft.GetStatusCode();
            string UrlDuplicate = Match.Key.GetUrl();
            int Distance = Match.Value;

            InnerDone++;

            if (TotalDocs > 0)
            {
                this.ProgressForm.UpdatePercentages(
                    Title: null,
                    Message: null,
                    MajorPercentage: -1,
                    ProgressLabelMajor: string.Format("Documents Processed: {0}", OuterDone),
                    MinorPercentage: ((decimal)100 / TotalDocs) * OuterDone,
                    ProgressLabelMinor: UrlLeft,
                    SubMinorPercentage: ((decimal)100 / TotalMatches) * InnerDone,
                    ProgressLabelSubMinor: UrlDuplicate
                );
            }

            this.InsertAndFormatStatusCodeCell(ws, RowIndex, ColStatusCode, StatusCode);

            this.InsertAndFormatStatusCodeCell(ws, RowIndex, ColStatus, Status);

            // Origin URL: green when internal, gray otherwise.
            this.InsertAndFormatUrlCell(ws, RowIndex, ColOriginUrl, UrlLeft);
            XLColor OriginColor = AllowedHosts.IsInternalUrl(Url: UrlLeft) ? XLColor.Green : XLColor.Gray;
            ws.Cell(RowIndex, ColOriginUrl).Style.Font.SetFontColor(OriginColor);

            // Distance: red when at or below the configured maximum, green otherwise.
            this.InsertAndFormatContentCell(ws, RowIndex, ColDistance, Distance.ToString());
            XLColor DistanceColor = (Distance <= MacroscopePreferencesManager.GetMaxLevenshteinDistance())
                ? XLColor.Red
                : XLColor.Green;
            ws.Cell(RowIndex, ColDistance).Style.Font.SetFontColor(DistanceColor);

            // Similar URL: green when internal, gray otherwise.
            this.InsertAndFormatUrlCell(ws, RowIndex, ColSimilarUrl, UrlDuplicate);
            XLColor SimilarColor = AllowedHosts.IsInternalUrl(Url: UrlDuplicate) ? XLColor.Green : XLColor.Gray;
            ws.Cell(RowIndex, ColSimilarUrl).Style.Font.SetFontColor(SimilarColor);

            RowIndex++;

            if (this.ProgressForm.Cancelled())
            {
                break;
            }
        }

        // Checked again here so a cancellation also leaves the outer loop.
        if (this.ProgressForm.Cancelled())
        {
            break;
        }
    }

    // Turn everything written so far (header included) into a table.
    ws.Range(1, 1, RowIndex - 1, LastColumn).CreateTable();
}
/**************************************************************************/

/// <summary>
/// Writes a header record and then one CSV record per (origin document,
/// analysed document) pair returned by the Levenshtein analysis. Only documents
/// that are not external and are of type HTML are used as origins. Progress is
/// reported through <c>ProgressForm</c>, and both loops stop once it reports
/// cancellation.
/// </summary>
/// <param name="JobMaster">Supplies the document collection to analyse.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetPageDuplicatePages(
    MacroscopeJobMaster JobMaster,
    CsvWriter ws
)
{
    string[] HeaderLabels = { "Status Code", "Status", "Origin URL", "Distance", "Similar URL" };

    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Retrieved as in the original implementation; not otherwise used here.
    MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

    Dictionary<string, bool> CrossCheckList = MacroscopeLevenshteinAnalysis.GetCrossCheckList(
        Capacity: DocCollection.CountDocuments()
    );

    decimal TotalDocs = (decimal)DocCollection.CountDocuments();
    decimal OuterDone = 0;

    // Header record.
    foreach (string Label in HeaderLabels)
    {
        ws.WriteField(Label);
    }

    ws.NextRecord();

    foreach (string UrlLeft in DocCollection.DocumentUrls())
    {
        MacroscopeDocument msDocLeft = DocCollection.GetDocumentByUrl(Url: UrlLeft);

        OuterDone++;

        if (TotalDocs > 0)
        {
            this.ProgressForm.UpdatePercentages(
                Title: null,
                Message: null,
                MajorPercentage: -1,
                ProgressLabelMajor: string.Format("Documents Processed: {0}", OuterDone),
                MinorPercentage: ((decimal)100 / TotalDocs) * OuterDone,
                ProgressLabelMinor: UrlLeft,
                SubMinorPercentage: 0,
                ProgressLabelSubMinor: ""
            );
        }

        // Only non-external HTML documents are analysed.
        if (msDocLeft.GetIsExternal())
        {
            continue;
        }

        if (!msDocLeft.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
        {
            continue;
        }

        MacroscopeLevenshteinAnalysis Analysis = new MacroscopeLevenshteinAnalysis(
            msDoc: msDocLeft,
            SizeDifference: MacroscopePreferencesManager.GetMaxLevenshteinSizeDifference(),
            Threshold: MacroscopePreferencesManager.GetMaxLevenshteinDistance(),
            CrossCheckList: CrossCheckList,
            IPercentageDone: this
        );

        Dictionary<MacroscopeDocument, int> Matches = Analysis.AnalyzeDocCollection(
            DocCollection: DocCollection
        );

        decimal TotalMatches = (decimal)Matches.Count;
        decimal InnerDone = 0;

        foreach (KeyValuePair<MacroscopeDocument, int> Match in Matches)
        {
            // The status columns describe the origin document, not the match.
            int StatusCode = (int)msDocLeft.GetStatusCode();
            HttpStatusCode Status = msDocLeft.GetStatusCode();
            string UrlDuplicate = Match.Key.GetUrl();
            int Distance = Match.Value;

            InnerDone++;

            if (TotalDocs > 0)
            {
                this.ProgressForm.UpdatePercentages(
                    Title: null,
                    Message: null,
                    MajorPercentage: -1,
                    ProgressLabelMajor: string.Format("Documents Processed: {0}", OuterDone),
                    MinorPercentage: ((decimal)100 / TotalDocs) * OuterDone,
                    ProgressLabelMinor: UrlLeft,
                    SubMinorPercentage: ((decimal)100 / TotalMatches) * InnerDone,
                    ProgressLabelSubMinor: UrlDuplicate
                );
            }

            this.InsertAndFormatStatusCodeCell(ws, StatusCode);
            this.InsertAndFormatStatusCodeCell(ws, Status);
            this.InsertAndFormatUrlCell(ws, UrlLeft);
            this.InsertAndFormatContentCell(ws, Distance.ToString());
            this.InsertAndFormatUrlCell(ws, UrlDuplicate);

            ws.NextRecord();

            if (this.ProgressForm.Cancelled())
            {
                break;
            }
        }

        // Checked again here so a cancellation also leaves the outer loop.
        if (this.ProgressForm.Cancelled())
        {
            break;
        }
    }
}