Beispiel #1
0
        /**************************************************************************/

        /// <summary>
        /// Registers the pairing of the original document with <paramref name="msDocCompare"/>
        /// and reports whether that pairing had been registered before.
        /// </summary>
        /// <remarks>
        /// The pairing is stored in <c>this.CrossCheck</c> under both orderings of the two
        /// checksums, joined by ":". Any key that is missing is added while the lock is held.
        /// When both documents have the same checksum the two keys are identical, so the
        /// second lookup finds the key that was just added and the method returns <c>true</c>.
        /// </remarks>
        /// <param name="msDocCompare">Document being paired with <c>this.msDocOriginal</c>.</param>
        /// <returns><c>true</c> if either key was found in <c>this.CrossCheck</c>; otherwise <c>false</c>.</returns>
        private bool CrossCheckDocuments(MacroscopeDocument msDocCompare)
        {
            string OriginalChecksum = this.msDocOriginal.GetChecksum();
            string CompareChecksum  = msDocCompare.GetChecksum();

            // Both orderings, so that A:B and B:A are treated as the same pairing.
            string[] PairKeys =
            {
                string.Join(":", OriginalChecksum, CompareChecksum),
                string.Join(":", CompareChecksum, OriginalChecksum)
            };

            bool AlreadySeen = false;

            lock (this.CrossCheck)
            {
                foreach (string PairKey in PairKeys)
                {
                    if (this.CrossCheck.ContainsKey(PairKey))
                    {
                        AlreadySeen = true;
                    }
                    else
                    {
                        this.CrossCheck.Add(PairKey, true);
                    }
                }
            }

            return AlreadySeen;
        }
        /**************************************************************************/

        /// <summary>
        /// Writes the URI analysis report to a CSV writer: one header record followed by one
        /// record per document key in the job's document collection.
        /// </summary>
        /// <param name="JobMaster">Job whose document collection is exported.</param>
        /// <param name="ws">CSV writer that receives the fields and records.</param>
        private void BuildWorksheetPageUriAnalysis(
            MacroscopeJobMaster JobMaster,
            CsvWriter ws
            )
        {
            MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

            // Header record; the order must match the cells written per document below.
            string[] ColumnTitles = { "URL", "Status Code", "Status", "Occurrences", "Checksum" };

            foreach (string ColumnTitle in ColumnTitles)
            {
                ws.WriteField(ColumnTitle);
            }

            ws.NextRecord();

            foreach (string Url in DocCollection.DocumentKeys())
            {
                MacroscopeDocument msDoc = DocCollection.GetDocument(Url: Url);

                // Numeric code and the status value's own string form are exported as separate columns.
                string StatusCode = (( int )msDoc.GetStatusCode()).ToString();
                string Status     = msDoc.GetStatusCode().ToString();
                string Checksum   = msDoc.GetChecksum();
                int    Count      = DocCollection.GetStatsChecksumCount(Checksum: Checksum);

                this.InsertAndFormatUrlCell(ws, msDoc);
                this.InsertAndFormatContentCell(ws, StatusCode);
                this.InsertAndFormatContentCell(ws, Status);
                this.InsertAndFormatContentCell(ws, Count.ToString());
                this.InsertAndFormatContentCell(ws, Checksum);

                ws.NextRecord();
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Creates or refreshes the list view row for one document and colours its cells.
        /// </summary>
        /// <remarks>
        /// Columns are: 0 URL, 1 status code, 2 status, 3 occurrences of the checksum,
        /// 4 checksum. If a row named after the URL already exists in
        /// <c>this.DisplayListView</c> its texts are updated in place; otherwise a new row is
        /// built and appended to <paramref name="ListViewItems"/>.
        /// The status columns (1 and 2) are coloured by the leading digit of the status code;
        /// the occurrences and checksum columns (3 and 4) are coloured red when the checksum
        /// occurs more than once. The count colouring deliberately leaves column 2 alone so
        /// that it does not overwrite the status colour.
        /// </remarks>
        /// <param name="ListViewItems">Receives the row when a new one has to be created.</param>
        /// <param name="DocCollection">Not read by this method; the occurrence count is taken from the main form's job master.</param>
        /// <param name="msDoc">Document rendered in the row.</param>
        /// <param name="Url">URL shown in the first column; also used as the row key.</param>
        protected override void RenderListView(
            List <ListViewItem> ListViewItems,
            MacroscopeDocumentCollection DocCollection,
            MacroscopeDocument msDoc,
            string Url
            )
        {
            string       StatusCode = (( int )msDoc.GetStatusCode()).ToString();
            string       Status     = msDoc.GetStatusCode().ToString();
            string       Checksum   = msDoc.GetChecksum();
            int          Count      = this.MainForm.GetJobMaster().GetDocCollection().GetStatsChecksumCount(Checksum: Checksum);
            string       PairKey    = string.Join("", Url);
            ListViewItem lvItem     = null;

            if (this.DisplayListView.Items.ContainsKey(PairKey))
            {
                // Row already exists: refresh its texts in place.
                try
                {
                    lvItem = this.DisplayListView.Items[PairKey];
                    lvItem.SubItems[0].Text = Url;
                    lvItem.SubItems[1].Text = StatusCode;
                    lvItem.SubItems[2].Text = Status;
                    lvItem.SubItems[3].Text = Count.ToString();
                    lvItem.SubItems[4].Text = Checksum;
                }
                catch (Exception ex)
                {
                    DebugMsg(string.Format("MacroscopeDisplayUriAnalysis 1: {0}", ex.Message));
                }
            }
            else
            {
                // New row: build it and hand it to the caller's batch.
                try
                {
                    lvItem = new ListViewItem(PairKey);
                    lvItem.UseItemStyleForSubItems = false; // allow per-cell colours
                    lvItem.Name = PairKey;

                    lvItem.SubItems[0].Text = Url;
                    lvItem.SubItems.Add(StatusCode);
                    lvItem.SubItems.Add(Status);
                    lvItem.SubItems.Add(Count.ToString());
                    lvItem.SubItems.Add(Checksum);

                    ListViewItems.Add(lvItem);
                }
                catch (Exception ex)
                {
                    DebugMsg(string.Format("MacroscopeDisplayUriAnalysis 2: {0}", ex.Message));
                }
            }

            if (lvItem == null)
            {
                return;
            }

            lvItem.ForeColor = Color.Blue;

            lvItem.SubItems[0].ForeColor = msDoc.GetIsInternal() ? Color.Green : Color.Gray;

            // Status colour is chosen by the first character of the status code text.
            Color StatusColor;

            switch (StatusCode.Length > 0 ? StatusCode[0] : '\0')
            {
            case '2':
                StatusColor = Color.Green;
                break;

            case '3':
                StatusColor = Color.Goldenrod;
                break;

            case '4':
            case '5':
                StatusColor = Color.Red;
                break;

            default:
                StatusColor = Color.Blue;
                break;
            }

            lvItem.SubItems[1].ForeColor = StatusColor;
            lvItem.SubItems[2].ForeColor = StatusColor;

            // Duplicate-checksum colouring applies to the occurrences and checksum cells only.
            Color CountColor = (Count > 1) ? Color.Red : Color.Blue;

            lvItem.SubItems[3].ForeColor = CountColor;
            lvItem.SubItems[4].ForeColor = CountColor;
        }
        /**************************************************************************/

        /// <summary>
        /// Adds a worksheet holding the URI analysis report: a header row, one row per
        /// document key in the job's document collection, and a table over the written range.
        /// </summary>
        /// <param name="JobMaster">Job supplying the document collection and the allowed hosts.</param>
        /// <param name="wb">Workbook that receives the new worksheet.</param>
        /// <param name="WorksheetLabel">Name given to the new worksheet.</param>
        private void BuildWorksheetPageUriAnalysis(
            MacroscopeJobMaster JobMaster,
            XLWorkbook wb,
            string WorksheetLabel
            )
        {
            // Column positions, in the same order as the header titles below.
            const int UrlColumn         = 1;
            const int StatusCodeColumn  = 2;
            const int StatusColumn      = 3;
            const int OccurrencesColumn = 4;
            const int ChecksumColumn    = 5;

            var ws = wb.Worksheets.Add(WorksheetLabel);

            MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
            MacroscopeAllowedHosts       AllowedHosts  = JobMaster.GetAllowedHosts();

            string[] ColumnTitles = { "URL", "Status Code", "Status", "Occurrences", "Checksum" };

            for (int TitleIndex = 0; TitleIndex < ColumnTitles.Length; TitleIndex++)
            {
                ws.Cell(1, TitleIndex + 1).Value = ColumnTitles[TitleIndex];
            }

            int LastColumn = ColumnTitles.Length;
            int Row        = 2; // first data row, directly below the header

            foreach (string Url in DocCollection.DocumentKeys())
            {
                MacroscopeDocument msDoc = DocCollection.GetDocument(Url);

                string StatusCode = (( int )msDoc.GetStatusCode()).ToString();
                string Status     = msDoc.GetStatusCode().ToString();
                string Checksum   = msDoc.GetChecksum();
                int    Count      = DocCollection.GetStatsChecksumCount(Checksum: Checksum);

                this.InsertAndFormatUrlCell(ws, Row, UrlColumn, msDoc);

                // Green for URLs the allowed-hosts list reports as internal, grey otherwise.
                XLColor UrlColor = AllowedHosts.IsInternalUrl(Url: Url) ? XLColor.Green : XLColor.Gray;
                ws.Cell(Row, UrlColumn).Style.Font.SetFontColor(UrlColor);

                this.InsertAndFormatContentCell(ws, Row, StatusCodeColumn, StatusCode);

                this.InsertAndFormatContentCell(ws, Row, StatusColumn, Status);

                // Occurrences and checksum share one colour: red when the checksum is repeated.
                XLColor CountColor = (Count > 1) ? XLColor.Red : XLColor.Blue;

                this.InsertAndFormatContentCell(ws, Row, OccurrencesColumn, Count);
                ws.Cell(Row, OccurrencesColumn).Style.Font.SetFontColor(CountColor);

                this.InsertAndFormatContentCell(ws, Row, ChecksumColumn, Checksum);
                ws.Cell(Row, ChecksumColumn).Style.Font.SetFontColor(CountColor);

                Row++;
            }

            // Row now points one past the last written row.
            ws.Range(1, 1, Row - 1, LastColumn).CreateTable();
        }
Beispiel #5
0
        /**************************************************************************/

        /// <summary>
        /// Compares the Levenshtein fingerprint of every document in
        /// <paramref name="DocCollection"/> with the original document's fingerprint and
        /// collects the documents that fall within the comparison threshold.
        /// </summary>
        /// <remarks>
        /// For each document the progress callback (if any) is invoked first, then the pairing
        /// is registered via <c>CrossCheckDocuments</c>. Documents are skipped when the pairing
        /// was already registered, when they are external or redirects, when their type is not
        /// allowed, when they have the original's URL, or when their fingerprint is empty.
        /// A document with the same checksum as the original is recorded with distance 0.
        /// Otherwise the fingerprint distance is only computed when the two fingerprint
        /// lengths differ by at most <c>ComparisonSizeDifference</c>. At analysis level 2 the
        /// lower-cased raw document texts are compared as well and that distance is recorded.
        /// </remarks>
        /// <param name="DocCollection">Documents to compare against the original document.</param>
        /// <returns>Matching documents mapped to the distance that was recorded for them.</returns>
        /// <exception cref="Exception">
        /// The fingerprint analyzer is not a <c>Levenshtein</c> instance, or the configured
        /// analysis level is neither 1 nor 2.
        /// </exception>
        public Dictionary <MacroscopeDocument, int> AnalyzeDocCollection(MacroscopeDocumentCollection DocCollection)
        {
            if (this.AnalyzerFingerprint.GetType() != typeof(Levenshtein))
            {
                throw new Exception("MacroscopeLevenshteinAnalysis not initialized");
            }

            int TotalDocuments = DocCollection.CountDocuments();

            Dictionary <MacroscopeDocument, int> Matches = new Dictionary <MacroscopeDocument, int>(TotalDocuments);
            decimal DocListCount = (decimal)TotalDocuments;
            decimal Processed    = 0;

            foreach (MacroscopeDocument msDocCompare in DocCollection.IterateDocuments())
            {
                string CompareFingerprint = msDocCompare.GetLevenshteinFingerprint();

                Processed++;

                if ((this.PercentageDone != null) && (DocListCount > 0))
                {
                    this.PercentageDone.PercentageDone(((100m / DocListCount) * Processed), msDocCompare.GetUrl());
                }

                // Registers the pairing as a side effect, so it must run before the other filters.
                if (this.CrossCheckDocuments(msDocCompare: msDocCompare))
                {
                    continue;
                }

                if (msDocCompare.GetIsExternal() || msDocCompare.GetIsRedirect())
                {
                    continue;
                }

                if (!this.AllowedDocType(msDoc: msDocCompare))
                {
                    continue;
                }

                if (msDocCompare.GetUrl() == this.msDocOriginal.GetUrl())
                {
                    continue;
                }

                if (CompareFingerprint.Length == 0)
                {
                    continue;
                }

                if (msDocOriginal.GetChecksum() == msDocCompare.GetChecksum())
                {
                    Matches.Add(msDocCompare, 0);
                    continue;
                }

                // Only run the distance computation when the fingerprints are of similar size.
                int SizeDifference = Math.Abs(CompareFingerprint.Length - this.Fingerprint.Length);

                if (SizeDifference <= this.ComparisonSizeDifference)
                {
                    int DistanceFingerprint = this.AnalyzerFingerprint.DistanceFrom(value: CompareFingerprint);

                    if (DistanceFingerprint <= this.ComparisonThreshold)
                    {
                        switch (MacroscopePreferencesManager.GetLevenshteinAnalysisLevel())
                        {
                        case 1:
                            // Level 1: the fingerprint distance alone decides.
                            Matches.Add(msDocCompare, DistanceFingerprint);
                            break;

                        case 2:
                            // Level 2: confirm with the distance between the lower-cased raw texts.
                            string      OriginalText = this.msDocOriginal.GetDocumentTextRaw().ToLower();
                            string      CompareText  = msDocCompare.GetDocumentTextRaw().ToLower();
                            Levenshtein TextAnalyzer = new Levenshtein(value: OriginalText);
                            int         TextDistance = TextAnalyzer.DistanceFrom(value: CompareText);

                            if (TextDistance <= this.ComparisonThreshold)
                            {
                                Matches.Add(msDocCompare, TextDistance);
                            }
                            break;

                        default:
                            throw new Exception("Invalid Levenshtein Analysis Level");
                        }
                    }
                }

                Thread.Yield();
            }

            return Matches;
        }
        /**************************************************************************/

        /// <summary>
        /// Adds a worksheet listing every document whose checksum occurs more than once in
        /// the job's document collection, and creates a table over the written range.
        /// </summary>
        /// <remarks>
        /// A first pass counts how often each non-empty checksum occurs and remembers the
        /// documents that have one. A second pass walks the checksums and, for each one that
        /// occurs more than once, writes a row per document carrying that checksum. Progress
        /// is reported through <c>this.ProgressForm</c> for every document visited.
        /// </remarks>
        /// <param name="JobMaster">Job whose document collection is analysed.</param>
        /// <param name="wb">Workbook that receives the new worksheet.</param>
        /// <param name="WorksheetLabel">Name given to the new worksheet.</param>
        private void BuildWorksheetPageDuplicateChecksums(
            MacroscopeJobMaster JobMaster,
            XLWorkbook wb,
            string WorksheetLabel
            )
        {
            var ws = wb.Worksheets.Add(WorksheetLabel);

            int iRow    = 1;
            int iCol    = 1;
            int iColMax = 1;

            decimal CountOuter = 0;
            decimal CountInner = 0;

            MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

            int     DocTotal = DocCollection.CountDocuments();
            decimal DocCount = ( decimal )DocTotal;

            Dictionary <string, int> DuplicatesList = new Dictionary <string, int> (DocTotal);
            Dictionary <string, MacroscopeDocument> DuplicatesDocList = new Dictionary <string, MacroscopeDocument> (DocTotal);

            // Pass 1: tally checksum occurrences and collect the documents that have a checksum.
            foreach (string Url in DocCollection.DocumentKeys())
            {
                MacroscopeDocument msDoc    = DocCollection.GetDocument(Url);
                string             Checksum = msDoc.GetChecksum();

                if ((Checksum == null) || (Checksum.Length == 0))
                {
                    continue;
                }

                if (!DuplicatesDocList.ContainsKey(Url))
                {
                    DuplicatesDocList.Add(Url, msDoc);
                }

                // TryGetValue leaves Seen at 0 for a checksum not tallied yet.
                int Seen;
                DuplicatesList.TryGetValue(Checksum, out Seen);
                DuplicatesList[Checksum] = Seen + 1;
            }

            // Header row; the order must match the cells written per document below.
            string[] ColumnTitles = { "Status Code", "Status", "Occurrences", "Checksum", "URL" };

            for (iCol = 1; iCol <= ColumnTitles.Length; iCol++)
            {
                ws.Cell(iRow, iCol).Value = ColumnTitles[iCol - 1];
            }

            iColMax = ColumnTitles.Length;

            iRow++;

            // Pass 2: one row per document for every checksum that occurs more than once.
            foreach (KeyValuePair <string, int> Tally in DuplicatesList)
            {
                string Checksum    = Tally.Key;
                int    Occurrences = Tally.Value;

                CountOuter++;
                CountInner = 0;

                if (Occurrences <= 1)
                {
                    continue;
                }

                foreach (MacroscopeDocument msDoc in DuplicatesDocList.Values)
                {
                    CountInner++;

                    if (DocCount > 0)
                    {
                        this.ProgressForm.UpdatePercentages(
                            Title: null,
                            Message: null,
                            MajorPercentage: -1,
                            ProgressLabelMajor: string.Format("Documents Processed: {0}", CountOuter),
                            MinorPercentage: (( decimal )100 / DocCount) * CountOuter,
                            ProgressLabelMinor: Checksum,
                            SubMinorPercentage: (( decimal )100 / DocCount) * CountInner,
                            ProgressLabelSubMinor: msDoc.GetUrl()
                            );
                    }

                    if (msDoc.GetChecksum() != Checksum)
                    {
                        continue;
                    }

                    iCol = 1;

                    int            StatusCode = ( int )msDoc.GetStatusCode();
                    HttpStatusCode Status     = msDoc.GetStatusCode();

                    this.InsertAndFormatStatusCodeCell(ws, iRow, iCol, StatusCode);
                    iCol++;

                    this.InsertAndFormatStatusCodeCell(ws, iRow, iCol, Status);
                    iCol++;

                    this.InsertAndFormatContentCell(ws, iRow, iCol, Occurrences);
                    iCol++;

                    this.InsertAndFormatContentCell(ws, iRow, iCol, Checksum);
                    iCol++;

                    this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);

                    iRow++;
                }
            }

            // iRow now points one past the last written row.
            ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
        }
Beispiel #7
0
        /**************************************************************************/

        /// <summary>
        /// Writes to a CSV writer every document whose checksum occurs more than once in the
        /// job's document collection, preceded by a header record.
        /// </summary>
        /// <remarks>
        /// A first pass counts how often each non-empty checksum occurs and remembers the
        /// documents that have one. A second pass walks the checksums and, for each one that
        /// occurs more than once, writes a record per document carrying that checksum.
        /// Progress is reported through <c>this.ProgressForm</c> for every document visited.
        /// </remarks>
        /// <param name="JobMaster">Job whose document collection is analysed.</param>
        /// <param name="ws">CSV writer that receives the fields and records.</param>
        private void BuildWorksheetPageDuplicateChecksums(
            MacroscopeJobMaster JobMaster,
            CsvWriter ws
            )
        {
            decimal CountOuter = 0;
            decimal CountInner = 0;

            MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

            int     DocTotal = DocCollection.CountDocuments();
            decimal DocCount = ( decimal )DocTotal;

            Dictionary <string, int> DuplicatesList = new Dictionary <string, int> (DocTotal);
            Dictionary <string, MacroscopeDocument> DuplicatesDocList = new Dictionary <string, MacroscopeDocument> (DocTotal);

            // Pass 1: tally checksum occurrences and collect the documents that have a checksum.
            foreach (string Url in DocCollection.DocumentKeys())
            {
                MacroscopeDocument msDoc    = DocCollection.GetDocument(Url);
                string             Checksum = msDoc.GetChecksum();

                if ((Checksum == null) || (Checksum.Length == 0))
                {
                    continue;
                }

                if (!DuplicatesDocList.ContainsKey(Url))
                {
                    DuplicatesDocList.Add(Url, msDoc);
                }

                // TryGetValue leaves Seen at 0 for a checksum not tallied yet.
                int Seen;
                DuplicatesList.TryGetValue(Checksum, out Seen);
                DuplicatesList[Checksum] = Seen + 1;
            }

            // Header record; the order must match the fields written per document below.
            string[] ColumnTitles = { "Status Code", "Status", "Occurrences", "Checksum", "URL" };

            foreach (string ColumnTitle in ColumnTitles)
            {
                ws.WriteField(ColumnTitle);
            }

            ws.NextRecord();

            // Pass 2: one record per document for every checksum that occurs more than once.
            foreach (KeyValuePair <string, int> Tally in DuplicatesList)
            {
                string Checksum    = Tally.Key;
                int    Occurrences = Tally.Value;

                CountOuter++;
                CountInner = 0;

                if (Occurrences <= 1)
                {
                    continue;
                }

                foreach (MacroscopeDocument msDoc in DuplicatesDocList.Values)
                {
                    CountInner++;

                    if (DocCount > 0)
                    {
                        this.ProgressForm.UpdatePercentages(
                            Title: null,
                            Message: null,
                            MajorPercentage: -1,
                            ProgressLabelMajor: string.Format("Documents Processed: {0}", CountOuter),
                            MinorPercentage: (( decimal )100 / DocCount) * CountOuter,
                            ProgressLabelMinor: Checksum,
                            SubMinorPercentage: (( decimal )100 / DocCount) * CountInner,
                            ProgressLabelSubMinor: msDoc.GetUrl()
                            );
                    }

                    if (msDoc.GetChecksum() != Checksum)
                    {
                        continue;
                    }

                    int            StatusCode = ( int )msDoc.GetStatusCode();
                    HttpStatusCode Status     = msDoc.GetStatusCode();

                    this.InsertAndFormatStatusCodeCell(ws, StatusCode);
                    this.InsertAndFormatStatusCodeCell(ws, Status);
                    this.InsertAndFormatContentCell(ws, Occurrences);
                    this.InsertAndFormatContentCell(ws, Checksum);
                    this.InsertAndFormatUrlCell(ws, msDoc);

                    ws.NextRecord();
                }
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Compares the lower-cased raw text of every document in
        /// <paramref name="DocCollection"/> with the original document's text and collects
        /// the documents whose distance is within the comparison threshold.
        /// </summary>
        /// <remarks>
        /// Before comparing, the method counts the documents that are neither external nor
        /// redirects, derives a memory figure from that count and asks <c>MemoryGate</c>
        /// whether to proceed. If the gate declines, or throws
        /// <c>MacroscopeInsufficientMemoryException</c>, an empty result is returned.
        /// In the main loop a document is skipped when its pairing was already registered by
        /// <c>CrossCheckDocuments</c>, when it is external, a redirect or not HTML, when it has
        /// the original's URL, or when its text is empty. A document with the same checksum as
        /// the original is recorded with distance 0. Otherwise the distance is only computed
        /// when the two text lengths differ by at most <c>ComparisonSizeDifference</c>.
        /// </remarks>
        /// <param name="DocCollection">Documents to compare against the original document.</param>
        /// <returns>Matching documents mapped to their distance; empty when the memory gate declines.</returns>
        /// <exception cref="Exception"><c>this.Monster</c> is not a <c>Levenshtein</c> instance.</exception>
        public Dictionary <MacroscopeDocument, int> AnalyzeDocCollection(
            MacroscopeDocumentCollection DocCollection
            )
        {
            if (this.Monster.GetType() != typeof(Levenshtein))
            {
                throw new Exception("MacroscopeLevenshteinAnalysis not initialized");
            }

            int TotalDocuments = DocCollection.CountDocuments();

            Dictionary <MacroscopeDocument, int> Matches = new Dictionary <MacroscopeDocument, int> (TotalDocuments);
            decimal DocListCount = ( decimal )TotalDocuments;
            decimal Processed    = 0;
            Boolean Proceed      = false;

            try
            {
                long CandidateCount = 0;

                foreach (MacroscopeDocument msDocCheck in DocCollection.IterateDocuments())
                {
                    if (!(msDocCheck.GetIsExternal() || msDocCheck.GetIsRedirect()))
                    {
                        CandidateCount++;
                    }
                }

                // NOTE(review): a byte count divided by 1024 is a kilobyte figure, yet it is
                // passed on as RequiredMegabytes - confirm which unit MemoryGate expects.
                long MemoryEstimateBytes = 512 * CandidateCount;
                int  RequiredMegabytes   = ( int )(MemoryEstimateBytes / 1024L);

                Proceed = this.MemoryGate(RequiredMegabytes: RequiredMegabytes);
            }
            catch (MacroscopeInsufficientMemoryException ex)
            {
                // Proceed stays false, so the empty result is returned below.
                DebugMsg(string.Format("MacroscopeInsufficientMemoryException: {0}", ex.Message));
                GC.Collect();
                Thread.Yield();
            }

            if (!Proceed)
            {
                return Matches;
            }

            foreach (MacroscopeDocument msDocCompare in DocCollection.IterateDocuments())
            {
                string BodyText = msDocCompare.GetDocumentTextRaw().ToLower();

                Processed++;

                if ((this.PercentageDone != null) && (DocListCount > 0))
                {
                    this.PercentageDone.PercentageDone(((100m / DocListCount) * Processed), msDocCompare.GetUrl());
                }

                // Registers the pairing as a side effect, so it must run before the other filters.
                if (CrossCheckDocuments(msDocCompare: msDocCompare))
                {
                    continue;
                }

                if (msDocCompare.GetIsExternal() || msDocCompare.GetIsRedirect())
                {
                    continue;
                }

                if (!msDocCompare.GetIsHtml())
                {
                    continue;
                }

                if (msDocCompare.GetUrl() == this.msDocOriginal.GetUrl())
                {
                    continue;
                }

                if (BodyText.Length == 0)
                {
                    continue;
                }

                if (msDocOriginal.GetChecksum() == msDocCompare.GetChecksum())
                {
                    Matches.Add(msDocCompare, 0);
                    continue;
                }

                // Only run the distance computation when the texts are of similar size.
                int SizeDifference = Math.Abs(BodyText.Length - this.MonstrousText.Length);

                if (SizeDifference <= this.ComparisonSizeDifference)
                {
                    int Distance = this.Monster.Distance(BodyText);

                    if (Distance <= this.ComparisonThreshold)
                    {
                        Matches.Add(msDocCompare, Distance);
                    }
                }

                Thread.Yield();
            }

            return Matches;
        }