/**************************************************************************/

/// <summary>
/// Reports whether the checksum pairing of the original document and
/// <paramref name="msDocCompare"/> is already present in the CrossCheck table,
/// and records the pairing (in both orders) when it is not.
/// </summary>
/// <param name="msDocCompare">Document being paired with the original document.</param>
/// <returns>
/// true if either ordering of the checksum pair was recorded before this call;
/// false if this call is the first to record it.
/// </returns>
private bool CrossCheckDocuments(MacroscopeDocument msDocCompare)
{
    string OriginalChecksum = this.msDocOriginal.GetChecksum();
    string CompareChecksum = msDocCompare.GetChecksum();
    string Key1 = string.Join(":", OriginalChecksum, CompareChecksum);
    string Key2 = string.Join(":", CompareChecksum, OriginalChecksum);
    bool CrossChecked;

    lock (this.CrossCheck)
    {
        // Sample both keys BEFORE inserting anything. When the two checksums
        // are equal, Key1 and Key2 are the same string; inserting Key1 first
        // and then probing Key2 would wrongly report a brand-new pairing as
        // already checked.
        bool HasKey1 = this.CrossCheck.ContainsKey(Key1);
        bool HasKey2 = this.CrossCheck.ContainsKey(Key2);

        CrossChecked = HasKey1 || HasKey2;

        if (!HasKey1)
        {
            this.CrossCheck.Add(Key1, true);
        }

        // Guard against adding the same key twice when Key1 == Key2.
        if (!HasKey2 && (Key2 != Key1))
        {
            this.CrossCheck.Add(Key2, true);
        }
    }

    return CrossChecked;
}
/**************************************************************************/

/// <summary>
/// Writes the URI analysis page to a CSV writer: one heading record, then one
/// record per document key in the job's document collection.
/// </summary>
/// <param name="JobMaster">Job master supplying the document collection.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetPageUriAnalysis(
    MacroscopeJobMaster JobMaster,
    CsvWriter ws
)
{
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Retrieved as before, although nothing below reads it.
    MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

    string[] ColumnHeadings = { "URL", "Status Code", "Status", "Occurrences", "Checksum" };

    foreach (string Heading in ColumnHeadings)
    {
        ws.WriteField(Heading);
    }

    ws.NextRecord();

    foreach (string Url in DocCollection.DocumentKeys())
    {
        MacroscopeDocument msDoc = DocCollection.GetDocument(Url: Url);

        // Numeric form first, then the enum member name.
        string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
        string Status = msDoc.GetStatusCode().ToString();

        string Checksum = msDoc.GetChecksum();
        string Occurrences = DocCollection.GetStatsChecksumCount(Checksum: Checksum).ToString();

        this.InsertAndFormatUrlCell(ws, msDoc);
        this.InsertAndFormatContentCell(ws, StatusCode);
        this.InsertAndFormatContentCell(ws, Status);
        this.InsertAndFormatContentCell(ws, Occurrences);
        this.InsertAndFormatContentCell(ws, Checksum);

        ws.NextRecord();
    }
}
/**************************************************************************/

/// <summary>
/// Creates or refreshes the list view row for one document, showing its URL,
/// numeric status code, status name, checksum occurrence count and checksum,
/// then colours the cells.
/// </summary>
/// <param name="ListViewItems">Receives the row when it has to be newly created.</param>
/// <param name="DocCollection">
/// Not read by this method; the occurrence count is taken from the document
/// collection of the main form's job master.
/// </param>
/// <param name="msDoc">Document whose status and checksum are displayed.</param>
/// <param name="Url">URL shown in the first column and used as the row key.</param>
protected override void RenderListView(
    List<ListViewItem> ListViewItems,
    MacroscopeDocumentCollection DocCollection,
    MacroscopeDocument msDoc,
    string Url
)
{
    string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
    string Status = msDoc.GetStatusCode().ToString();
    string Checksum = msDoc.GetChecksum();
    int Count = this.MainForm.GetJobMaster().GetDocCollection().GetStatsChecksumCount(Checksum: Checksum);
    string PairKey = string.Join("", Url);
    ListViewItem lvItem = null;

    if (this.DisplayListView.Items.ContainsKey(PairKey))
    {
        // Row already displayed: refresh its text in place.
        try
        {
            lvItem = this.DisplayListView.Items[PairKey];
            lvItem.SubItems[0].Text = Url;
            lvItem.SubItems[1].Text = StatusCode;
            lvItem.SubItems[2].Text = Status;
            lvItem.SubItems[3].Text = Count.ToString();
            lvItem.SubItems[4].Text = Checksum;
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("MacroscopeDisplayUriAnalysis 1: {0}", ex.Message));
        }
    }
    else
    {
        // New row: build it and hand it back through ListViewItems.
        try
        {
            lvItem = new ListViewItem(PairKey);
            lvItem.UseItemStyleForSubItems = false;
            lvItem.Name = PairKey;
            lvItem.SubItems[0].Text = Url;
            lvItem.SubItems.Add(StatusCode);
            lvItem.SubItems.Add(Status);
            lvItem.SubItems.Add(Count.ToString());
            lvItem.SubItems.Add(Checksum);
            ListViewItems.Add(lvItem);
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("MacroscopeDisplayUriAnalysis 2: {0}", ex.Message));
        }
    }

    if (lvItem == null)
    {
        return;
    }

    lvItem.ForeColor = Color.Blue;

    // URL column: green for internal documents, gray otherwise.
    lvItem.SubItems[0].ForeColor = msDoc.GetIsInternal() ? Color.Green : Color.Gray;

    // Status columns: coloured by the leading digit of the numeric status
    // code. StatusCode comes from int.ToString(), so it is never empty.
    Color StatusColor;

    switch (StatusCode[0])
    {
        case '2':
            StatusColor = Color.Green;
            break;
        case '3':
            StatusColor = Color.Goldenrod;
            break;
        case '4':
        case '5':
            StatusColor = Color.Red;
            break;
        default:
            StatusColor = Color.Blue;
            break;
    }

    lvItem.SubItems[1].ForeColor = StatusColor;
    lvItem.SubItems[2].ForeColor = StatusColor;

    // Occurrences and Checksum columns: red when the checksum is shared by
    // more than one document. The Status column (index 2) is deliberately
    // left out here; colouring it by count would always overwrite the
    // status colour assigned just above.
    Color OccurrenceColor = (Count > 1) ? Color.Red : Color.Blue;

    lvItem.SubItems[3].ForeColor = OccurrenceColor;
    lvItem.SubItems[4].ForeColor = OccurrenceColor;
}
/**************************************************************************/

/// <summary>
/// Adds a worksheet named <paramref name="WorksheetLabel"/> to the workbook and
/// fills it with one row per document key: URL, status code, status, checksum
/// occurrence count and checksum. The filled range is then turned into a table.
/// </summary>
/// <param name="JobMaster">Job master supplying the document collection and allowed hosts.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name given to the new worksheet.</param>
private void BuildWorksheetPageUriAnalysis(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
)
{
    var ws = wb.Worksheets.Add(WorksheetLabel);
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
    MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

    string[] ColumnHeadings = { "URL", "Status Code", "Status", "Occurrences", "Checksum" };
    int iRow = 1;

    // Heading row: one cell per column, starting at column 1.
    for (int iHeading = 0; iHeading < ColumnHeadings.Length; iHeading++)
    {
        ws.Cell(iRow, iHeading + 1).Value = ColumnHeadings[iHeading];
    }

    int iColMax = ColumnHeadings.Length;

    iRow++;

    foreach (string Url in DocCollection.DocumentKeys())
    {
        MacroscopeDocument msDoc = DocCollection.GetDocument(Url);
        string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
        string Status = msDoc.GetStatusCode().ToString();
        string Checksum = msDoc.GetChecksum();
        int Count = DocCollection.GetStatsChecksumCount(Checksum: Checksum);
        int iCol = 1;

        // URL: green when the allowed hosts treat it as internal, gray otherwise.
        this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);
        XLColor UrlColor = AllowedHosts.IsInternalUrl(Url: Url) ? XLColor.Green : XLColor.Gray;
        ws.Cell(iRow, iCol).Style.Font.SetFontColor(UrlColor);

        iCol++;
        this.InsertAndFormatContentCell(ws, iRow, iCol, StatusCode);

        iCol++;
        this.InsertAndFormatContentCell(ws, iRow, iCol, Status);

        // Occurrences and checksum share one colour: red for a checksum seen
        // more than once, blue otherwise.
        XLColor DuplicateColor = (Count > 1) ? XLColor.Red : XLColor.Blue;

        iCol++;
        this.InsertAndFormatContentCell(ws, iRow, iCol, Count);
        ws.Cell(iRow, iCol).Style.Font.SetFontColor(DuplicateColor);

        iCol++;
        this.InsertAndFormatContentCell(ws, iRow, iCol, Checksum);
        ws.Cell(iRow, iCol).Style.Font.SetFontColor(DuplicateColor);

        iRow++;
    }

    // iRow now points one past the last written row.
    ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
}
/**************************************************************************/

/// <summary>
/// Compares every document in <paramref name="DocCollection"/> against the
/// original document and collects those considered similar.
/// </summary>
/// <remarks>
/// A candidate is skipped when its checksum pairing was already cross-checked,
/// when it is external, a redirect, of a disallowed type, the original document
/// itself, or has no fingerprint. A candidate whose checksum equals the
/// original's is recorded with distance 0. Otherwise the fingerprints are
/// compared only if their lengths differ by at most ComparisonSizeDifference,
/// and the candidate is kept when the distance is within ComparisonThreshold.
/// At analysis level 2 the lowercased raw document texts must also be within
/// ComparisonThreshold, and that text distance is the value recorded.
/// </remarks>
/// <param name="DocCollection">Documents to compare against the original document.</param>
/// <returns>Matching documents mapped to their computed distance.</returns>
/// <exception cref="InvalidOperationException">
/// The fingerprint analyzer is missing or of an unexpected type, or the
/// configured analysis level is neither 1 nor 2.
/// </exception>
public Dictionary<MacroscopeDocument, int> AnalyzeDocCollection(MacroscopeDocumentCollection DocCollection)
{
    // Check for null explicitly: calling GetType() on a missing analyzer would
    // raise NullReferenceException instead of the intended error.
    if ((this.AnalyzerFingerprint == null) || (this.AnalyzerFingerprint.GetType() != typeof(Levenshtein)))
    {
        throw new InvalidOperationException("MacroscopeLevenshteinAnalysis not initialized");
    }

    int DocumentTotal = DocCollection.CountDocuments();
    Dictionary<MacroscopeDocument, int> DocList = new Dictionary<MacroscopeDocument, int>(DocumentTotal);
    decimal DocListCount = (decimal)DocumentTotal;
    decimal Count = 0;

    // Level-2 analyzer over the original document's text. Built on first use
    // and then reused, since the original text is the same for every candidate.
    Levenshtein AnalyzerText = null;

    foreach (MacroscopeDocument msDocCompare in DocCollection.IterateDocuments())
    {
        string CompareFingerprint = msDocCompare.GetLevenshteinFingerprint();

        Count++;

        if ((this.PercentageDone != null) && (DocListCount > 0))
        {
            this.PercentageDone.PercentageDone((((decimal)100 / DocListCount) * Count), msDocCompare.GetUrl());
        }

        if (this.CrossCheckDocuments(msDocCompare: msDocCompare))
        {
            continue;
        }

        if (msDocCompare.GetIsExternal())
        {
            continue;
        }

        if (msDocCompare.GetIsRedirect())
        {
            continue;
        }

        if (!this.AllowedDocType(msDoc: msDocCompare))
        {
            continue;
        }

        if (msDocCompare.GetUrl() == this.msDocOriginal.GetUrl())
        {
            continue;
        }

        // A missing fingerprint is treated like an empty one.
        if (string.IsNullOrEmpty(CompareFingerprint))
        {
            continue;
        }

        if (msDocOriginal.GetChecksum() == msDocCompare.GetChecksum())
        {
            DocList.Add(msDocCompare, 0);
            continue;
        }

        // Only run the distance calculation when the fingerprints are close
        // enough in length.
        int SizeDifference = Math.Abs(CompareFingerprint.Length - this.Fingerprint.Length);

        if (SizeDifference <= this.ComparisonSizeDifference)
        {
            int DistanceFingerprint = this.AnalyzerFingerprint.DistanceFrom(value: CompareFingerprint);

            if (DistanceFingerprint <= this.ComparisonThreshold)
            {
                switch (MacroscopePreferencesManager.GetLevenshteinAnalysisLevel())
                {
                    case 1:
                        DocList.Add(msDocCompare, DistanceFingerprint);
                        break;

                    case 2:
                    {
                        if (AnalyzerText == null)
                        {
                            AnalyzerText = new Levenshtein(value: this.msDocOriginal.GetDocumentTextRaw().ToLower());
                        }

                        string CompareDocumentText = msDocCompare.GetDocumentTextRaw().ToLower();
                        int DistanceDocumentText = AnalyzerText.DistanceFrom(value: CompareDocumentText);

                        if (DistanceDocumentText <= this.ComparisonThreshold)
                        {
                            DocList.Add(msDocCompare, DistanceDocumentText);
                        }

                        break;
                    }

                    default:
                        throw new InvalidOperationException("Invalid Levenshtein Analysis Level");
                }
            }
        }

        Thread.Yield();
    }

    return DocList;
}
/**************************************************************************/

/// <summary>
/// Adds a worksheet named <paramref name="WorksheetLabel"/> listing every
/// document whose checksum occurs more than once: status code, status,
/// occurrence count, checksum and URL. The filled range is then turned into a
/// table.
/// </summary>
/// <param name="JobMaster">Job master supplying the document collection.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name given to the new worksheet.</param>
private void BuildWorksheetPageDuplicateChecksums(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
)
{
    var ws = wb.Worksheets.Add(WorksheetLabel);
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Retrieved as before, although nothing below reads it.
    MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

    Dictionary<string, int> DuplicatesList = new Dictionary<string, int>(DocCollection.CountDocuments());
    Dictionary<string, MacroscopeDocument> DuplicatesDocList = new Dictionary<string, MacroscopeDocument>(DocCollection.CountDocuments());
    decimal DocCount = (decimal)DocCollection.CountDocuments();

    // Pass 1: index documents that have a checksum by URL, and count how
    // often each checksum occurs.
    foreach (string Url in DocCollection.DocumentKeys())
    {
        MacroscopeDocument msDoc = DocCollection.GetDocument(Url);
        string Checksum = msDoc.GetChecksum();

        if (string.IsNullOrEmpty(Checksum))
        {
            continue;
        }

        if (!DuplicatesDocList.ContainsKey(Url))
        {
            DuplicatesDocList.Add(Url, msDoc);
        }

        // Seen stays 0 when the checksum is new, so the first sighting stores 1.
        int Seen;
        DuplicatesList.TryGetValue(Checksum, out Seen);
        DuplicatesList[Checksum] = Seen + 1;
    }

    string[] ColumnHeadings = { "Status Code", "Status", "Occurrences", "Checksum", "URL" };
    int iRow = 1;

    for (int iHeading = 0; iHeading < ColumnHeadings.Length; iHeading++)
    {
        ws.Cell(iRow, iHeading + 1).Value = ColumnHeadings[iHeading];
    }

    int iColMax = ColumnHeadings.Length;

    iRow++;

    // Pass 2: for each repeated checksum, scan all indexed documents and
    // write a row for every one that carries it.
    decimal CountOuter = 0;

    foreach (KeyValuePair<string, int> Entry in DuplicatesList)
    {
        string Checksum = Entry.Key;
        int Occurrences = Entry.Value;

        CountOuter++;

        if (Occurrences <= 1)
        {
            continue;
        }

        decimal CountInner = 0;

        foreach (MacroscopeDocument msDoc in DuplicatesDocList.Values)
        {
            CountInner++;

            if (DocCount > 0)
            {
                this.ProgressForm.UpdatePercentages(
                    Title: null,
                    Message: null,
                    MajorPercentage: -1,
                    ProgressLabelMajor: string.Format("Documents Processed: {0}", CountOuter),
                    MinorPercentage: ((decimal)100 / DocCount) * CountOuter,
                    ProgressLabelMinor: Checksum,
                    SubMinorPercentage: ((decimal)100 / DocCount) * CountInner,
                    ProgressLabelSubMinor: msDoc.GetUrl()
                );
            }

            if (msDoc.GetChecksum() != Checksum)
            {
                continue;
            }

            int iCol = 1;
            int StatusCode = (int)msDoc.GetStatusCode();
            HttpStatusCode Status = msDoc.GetStatusCode();

            this.InsertAndFormatStatusCodeCell(ws, iRow, iCol, StatusCode);

            iCol++;
            this.InsertAndFormatStatusCodeCell(ws, iRow, iCol, Status);

            iCol++;
            this.InsertAndFormatContentCell(ws, iRow, iCol, Occurrences);

            iCol++;
            this.InsertAndFormatContentCell(ws, iRow, iCol, msDoc.GetChecksum());

            iCol++;
            this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);

            iRow++;
        }
    }

    // iRow now points one past the last written row.
    ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
}
/**************************************************************************/

/// <summary>
/// Writes the duplicate checksums page to a CSV writer: one heading record,
/// then one record for every document whose checksum occurs more than once
/// (status code, status, occurrence count, checksum, URL).
/// </summary>
/// <param name="JobMaster">Job master supplying the document collection.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetPageDuplicateChecksums(
    MacroscopeJobMaster JobMaster,
    CsvWriter ws
)
{
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Retrieved as before, although nothing below reads it.
    MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

    Dictionary<string, int> DuplicatesList = new Dictionary<string, int>(DocCollection.CountDocuments());
    Dictionary<string, MacroscopeDocument> DuplicatesDocList = new Dictionary<string, MacroscopeDocument>(DocCollection.CountDocuments());
    decimal DocCount = (decimal)DocCollection.CountDocuments();

    // Pass 1: index documents that have a checksum by URL, and count how
    // often each checksum occurs.
    foreach (string Url in DocCollection.DocumentKeys())
    {
        MacroscopeDocument msDoc = DocCollection.GetDocument(Url);
        string Checksum = msDoc.GetChecksum();

        if (string.IsNullOrEmpty(Checksum))
        {
            continue;
        }

        if (!DuplicatesDocList.ContainsKey(Url))
        {
            DuplicatesDocList.Add(Url, msDoc);
        }

        // Seen stays 0 when the checksum is new, so the first sighting stores 1.
        int Seen;
        DuplicatesList.TryGetValue(Checksum, out Seen);
        DuplicatesList[Checksum] = Seen + 1;
    }

    string[] ColumnHeadings = { "Status Code", "Status", "Occurrences", "Checksum", "URL" };

    foreach (string Heading in ColumnHeadings)
    {
        ws.WriteField(Heading);
    }

    ws.NextRecord();

    // Pass 2: for each repeated checksum, scan all indexed documents and
    // write a record for every one that carries it.
    decimal CountOuter = 0;

    foreach (KeyValuePair<string, int> Entry in DuplicatesList)
    {
        string Checksum = Entry.Key;
        int Occurrences = Entry.Value;

        CountOuter++;

        if (Occurrences <= 1)
        {
            continue;
        }

        decimal CountInner = 0;

        foreach (MacroscopeDocument msDoc in DuplicatesDocList.Values)
        {
            CountInner++;

            if (DocCount > 0)
            {
                this.ProgressForm.UpdatePercentages(
                    Title: null,
                    Message: null,
                    MajorPercentage: -1,
                    ProgressLabelMajor: string.Format("Documents Processed: {0}", CountOuter),
                    MinorPercentage: ((decimal)100 / DocCount) * CountOuter,
                    ProgressLabelMinor: Checksum,
                    SubMinorPercentage: ((decimal)100 / DocCount) * CountInner,
                    ProgressLabelSubMinor: msDoc.GetUrl()
                );
            }

            if (msDoc.GetChecksum() != Checksum)
            {
                continue;
            }

            int StatusCode = (int)msDoc.GetStatusCode();
            HttpStatusCode Status = msDoc.GetStatusCode();

            this.InsertAndFormatStatusCodeCell(ws, StatusCode);
            this.InsertAndFormatStatusCodeCell(ws, Status);
            this.InsertAndFormatContentCell(ws, Occurrences);
            this.InsertAndFormatContentCell(ws, msDoc.GetChecksum());
            this.InsertAndFormatUrlCell(ws, msDoc);

            ws.NextRecord();
        }
    }
}
/**************************************************************************/

/// <summary>
/// Compares the lowercased raw text of every document in
/// <paramref name="DocCollection"/> against the original document's text and
/// collects those considered similar.
/// </summary>
/// <remarks>
/// The comparison only runs if MemoryGate accepts the memory estimate; if it
/// returns false or throws MacroscopeInsufficientMemoryException, an empty
/// result is returned. A candidate is skipped when its checksum pairing was
/// already cross-checked, when it is external, a redirect, not HTML, the
/// original document itself, or has no text. A candidate whose checksum equals
/// the original's is recorded with distance 0. Otherwise the texts are compared
/// only if their lengths differ by at most ComparisonSizeDifference, and the
/// candidate is kept when the distance is within ComparisonThreshold.
/// </remarks>
/// <param name="DocCollection">Documents to compare against the original document.</param>
/// <returns>Matching documents mapped to their computed distance.</returns>
/// <exception cref="InvalidOperationException">
/// The analyzer is missing or of an unexpected type.
/// </exception>
public Dictionary<MacroscopeDocument, int> AnalyzeDocCollection(
    MacroscopeDocumentCollection DocCollection
)
{
    // Check for null explicitly: calling GetType() on a missing analyzer would
    // raise NullReferenceException instead of the intended error.
    if ((this.Monster == null) || (this.Monster.GetType() != typeof(Levenshtein)))
    {
        throw new InvalidOperationException("MacroscopeLevenshteinAnalysis not initialized");
    }

    Dictionary<MacroscopeDocument, int> DocList = new Dictionary<MacroscopeDocument, int>(DocCollection.CountDocuments());
    decimal DocListCount = (decimal)DocCollection.CountDocuments();
    decimal Count = 0;
    bool Proceed = false;

    try
    {
        long DocumentCount = 0;

        foreach (MacroscopeDocument msDocCheck in DocCollection.IterateDocuments())
        {
            if ((!msDocCheck.GetIsExternal()) && (!msDocCheck.GetIsRedirect()))
            {
                DocumentCount++;
            }
        }

        // NOTE(review): 512 per document divided by 1024 is passed on as
        // "megabytes"; the intended units are not evident from this method.
        // The arithmetic is left unchanged - confirm against MemoryGate.
        long MemoryEstimateBytes = 512 * DocumentCount;
        int RequiredMegabytes = (int)(MemoryEstimateBytes / (long)1024);

        Proceed = this.MemoryGate(RequiredMegabytes: RequiredMegabytes);
    }
    catch (MacroscopeInsufficientMemoryException ex)
    {
        DebugMsg(string.Format("MacroscopeInsufficientMemoryException: {0}", ex.Message));
        GC.Collect();
        Thread.Yield();
    }

    if (!Proceed)
    {
        return DocList;
    }

    foreach (MacroscopeDocument msDocCompare in DocCollection.IterateDocuments())
    {
        Count++;

        if ((this.PercentageDone != null) && (DocListCount > 0))
        {
            this.PercentageDone.PercentageDone((((decimal)100 / DocListCount) * Count), msDocCompare.GetUrl());
        }

        if (CrossCheckDocuments(msDocCompare: msDocCompare))
        {
            continue;
        }

        if (msDocCompare.GetIsExternal())
        {
            continue;
        }

        if (msDocCompare.GetIsRedirect())
        {
            continue;
        }

        if (!msDocCompare.GetIsHtml())
        {
            continue;
        }

        if (msDocCompare.GetUrl() == this.msDocOriginal.GetUrl())
        {
            continue;
        }

        // Fetch and lowercase the text only after the cheap filters above, so
        // documents that are skipped anyway never pay for it.
        string BodyText = msDocCompare.GetDocumentTextRaw().ToLower();

        if (BodyText.Length == 0)
        {
            continue;
        }

        if (msDocOriginal.GetChecksum() == msDocCompare.GetChecksum())
        {
            DocList.Add(msDocCompare, 0);
            continue;
        }

        // Only run the distance calculation when the texts are close enough
        // in length.
        int SizeDifference = Math.Abs(BodyText.Length - this.MonstrousText.Length);

        if (SizeDifference <= this.ComparisonSizeDifference)
        {
            int Distance = this.Monster.Distance(BodyText);

            if (Distance <= this.ComparisonThreshold)
            {
                DocList.Add(msDocCompare, Distance);
            }
        }

        Thread.Yield();
    }

    return DocList;
}