コード例 #1
0
        /**************************************************************************/


        /// <summary>
        /// Builds or refreshes one list view row for every outgoing hyperlink of
        /// <paramref name="msDoc"/> whose target URL contains <paramref name="UrlFragment"/>.
        /// </summary>
        /// <remarks>
        /// Rows already present in <c>DisplayListView</c> (found by a key derived from the
        /// source/target URL pair) are updated in place. Rows that do not exist yet are
        /// created and appended to <paramref name="ListViewItems"/>; this method does not
        /// add them to the control itself.
        /// </remarks>
        /// <param name="ListViewItems">Receives the newly created rows.</param>
        /// <param name="msDoc">Document whose outgoing hyperlinks are rendered.</param>
        /// <param name="Url">Source URL; written to the first column and used in the row key.</param>
        /// <param name="UrlFragment">
        /// Substring a link's target URL must contain (culture-sensitive comparison) for
        /// the link to be rendered.
        /// </param>
        private void RenderListViewSearchTargetUrls(
            List <ListViewItem> ListViewItems,
            MacroscopeDocument msDoc,
            string Url,
            string UrlFragment
            )
        {
            MacroscopeAllowedHosts       AllowedHosts  = this.MainForm.GetJobMaster().GetAllowedHosts();
            MacroscopeHyperlinksOut      HyperlinksOut = msDoc.GetHyperlinksOut();
            MacroscopeDocumentCollection DocCollection = this.MainForm.GetJobMaster().GetDocCollection();

            foreach (MacroscopeHyperlinkOut HyperlinkOut in HyperlinksOut.IterateLinks())
            {
                string UrlTarget = HyperlinkOut.GetTargetUrl();

                // Apply the search filter before doing any other work: links without a
                // target, or whose target does not contain the fragment, are never shown,
                // so there is no point digesting the URL or querying the collection.
                if (
                    (UrlTarget == null) ||
                    (UrlTarget.IndexOf(UrlFragment, StringComparison.CurrentCulture) < 0))
                {
                    continue;
                }

                string StatusCodeText = "Not crawled";
                string StatusText     = "Not crawled";
                string PairKey        = string.Join(":", UrlToDigest(Url: Url), UrlToDigest(Url: UrlTarget));
                string LinkTarget     = HyperlinkOut.GetLinkTarget();
                string LinkText       = HyperlinkOut.GetAnchorText();
                string LinkTitle      = HyperlinkOut.GetTitle();
                string AltText        = HyperlinkOut.GetAltText();
                bool   IsFollowed     = HyperlinkOut.GetDoFollow();

                // Null and empty are both treated as "missing" so that a link lacking
                // one of these attributes cannot raise a NullReferenceException.
                bool HasLinkText  = !string.IsNullOrEmpty(LinkText);
                bool HasLinkTitle = !string.IsNullOrEmpty(LinkTitle);
                bool HasAltText   = !string.IsNullOrEmpty(AltText);

                string LinkTextLabel  = HasLinkText ? LinkText : "MISSING";
                string LinkTitleLabel = HasLinkTitle ? LinkTitle : "MISSING";
                string AltTextLabel   = HasAltText ? AltText : "MISSING";
                string DoFollow       = IsFollowed ? "Follow" : "No Follow";

                // Report the status of the target only if it is in the collection;
                // otherwise the "Not crawled" placeholders stay in effect.
                try
                {
                    if (DocCollection.ContainsDocument(Url: UrlTarget))
                    {
                        HttpStatusCode StatusCode = DocCollection.GetDocumentByUrl(Url: UrlTarget).GetStatusCode();
                        StatusCodeText = ((int)StatusCode).ToString();
                        StatusText     = StatusCode.ToString();
                    }
                    else
                    {
                        DebugMsg("Not in DocCollection");
                    }
                }
                catch (Exception ex)
                {
                    this.DebugMsg(ex.Message);
                }

                ListViewItem lvItem = null;

                if (this.DisplayListView.Items.ContainsKey(PairKey))
                {
                    // Existing row: overwrite every column in place.
                    try
                    {
                        lvItem = this.DisplayListView.Items[PairKey];

                        lvItem.SubItems[ColUrl].Text                 = Url;
                        lvItem.SubItems[ColUrlTarget].Text           = UrlTarget;
                        lvItem.SubItems[ColStatusCode].Text          = StatusCodeText;
                        lvItem.SubItems[ColStatus].Text              = StatusText;
                        lvItem.SubItems[ColDoFollow].Text            = DoFollow;
                        lvItem.SubItems[ColLinkTarget].Text          = LinkTarget;
                        lvItem.SubItems[ColLinkAnchorTextLabel].Text = LinkTextLabel;
                        lvItem.SubItems[ColLinkTitleLabel].Text      = LinkTitleLabel;
                        lvItem.SubItems[ColAltTextLabel].Text        = AltTextLabel;
                    }
                    catch (Exception ex)
                    {
                        this.DebugMsg(string.Format("MacroscopeDisplayLinks 1: {0}", ex.Message));
                    }
                }
                else
                {
                    // New row: the sub items are appended in column order.
                    try
                    {
                        lvItem = new ListViewItem(PairKey);
                        lvItem.UseItemStyleForSubItems = false;
                        lvItem.Name = PairKey;

                        lvItem.SubItems[ColUrl].Text = Url;
                        lvItem.SubItems.Add(UrlTarget);
                        lvItem.SubItems.Add(StatusCodeText);
                        lvItem.SubItems.Add(StatusText);
                        lvItem.SubItems.Add(DoFollow);
                        lvItem.SubItems.Add(LinkTarget);
                        lvItem.SubItems.Add(LinkTextLabel);
                        lvItem.SubItems.Add(LinkTitleLabel);
                        lvItem.SubItems.Add(AltTextLabel);

                        ListViewItems.Add(lvItem);
                    }
                    catch (Exception ex)
                    {
                        this.DebugMsg(string.Format("MacroscopeDisplayLinks 2: {0}", ex.Message));
                    }
                }

                if (lvItem == null)
                {
                    continue;
                }

                // Default colour for every cell; specific columns are overridden below.
                for (int i = 0; i < lvItem.SubItems.Count; i++)
                {
                    lvItem.SubItems[i].ForeColor = Color.Blue;
                }

                bool SourceAllowed = AllowedHosts.IsAllowedFromUrl(Url);
                bool TargetAllowed = AllowedHosts.IsAllowedFromUrl(UrlTarget);

                lvItem.SubItems[ColUrl].ForeColor       = SourceAllowed ? Color.Green : Color.Gray;
                lvItem.SubItems[ColUrlTarget].ForeColor = TargetAllowed ? Color.Green : Color.Gray;

                // NOTE(review): the follow column is keyed on the *source* URL being
                // allowed; confirm whether the target URL was intended here.
                if (SourceAllowed)
                {
                    lvItem.SubItems[ColDoFollow].ForeColor = IsFollowed ? Color.Green : Color.Red;
                }
                else
                {
                    lvItem.SubItems[ColDoFollow].ForeColor = Color.Gray;
                }

                if (!HasLinkText)
                {
                    lvItem.SubItems[ColLinkAnchorTextLabel].ForeColor = Color.Gray;
                }

                if (!HasLinkTitle)
                {
                    lvItem.SubItems[ColLinkTitleLabel].ForeColor = Color.Gray;
                }

                if (!HasAltText)
                {
                    lvItem.SubItems[ColAltTextLabel].ForeColor = Color.Gray;
                }

                // A link with no anchor text, no title and no alt text at all is
                // highlighted in red rather than the gray used for a single gap.
                if (!HasLinkText && !HasLinkTitle && !HasAltText)
                {
                    lvItem.SubItems[ColLinkAnchorTextLabel].ForeColor = Color.Red;
                    lvItem.SubItems[ColLinkTitleLabel].ForeColor      = Color.Red;
                    lvItem.SubItems[ColAltTextLabel].ForeColor        = Color.Red;
                }
            }
        }
コード例 #2
0
        /**************************************************************************/

        /// <summary>
        /// Renders one list view row per entry of <paramref name="History"/> for which
        /// <paramref name="DocCollection"/> can supply a document, showing the URL,
        /// whether it was visited and whether it is in the document collection.
        /// </summary>
        /// <remarks>
        /// Existing rows (keyed by the document key rendered as text) are updated in
        /// place; new rows are collected and added to <c>DisplayListView</c> in a single
        /// <c>AddRange</c> call. A progress form is updated along the way when the
        /// progress-dialogue preference is enabled, and is always closed and disposed,
        /// even if rendering throws.
        /// </remarks>
        /// <param name="History">Document key mapped to a flag that is true once the document was visited.</param>
        /// <param name="DocCollection">Collection used to resolve document keys to documents.</param>
        private void RenderListView(Dictionary <ulong, bool> History, MacroscopeDocumentCollection DocCollection)
        {
            if (History.Count == 0)
            {
                return;
            }

            // Every history entry yields at most one new row.
            List <ListViewItem> ListViewItems = new List <ListViewItem>(History.Count);

            MacroscopeAllowedHosts AllowedHosts = this.MainForm.GetJobMaster().GetAllowedHosts();
            MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);

            // Read the preference once so that a form which was updated is also the
            // form that gets closed.
            bool ShowProgress = MacroscopePreferencesManager.GetShowProgressDialogues();

            try
            {
                decimal Count     = 0;
                decimal TotalDocs = (decimal)History.Count; // non-zero: guarded above

                if (ShowProgress)
                {
                    ProgressForm.UpdatePercentages(
                        Title: "Preparing Display",
                        Message: "Processing document collection for display:",
                        MajorPercentage: ((decimal)100 / TotalDocs) * Count,
                        ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
                        );
                }

                // Enumerate key/value pairs so the visited flag needs no second lookup.
                foreach (KeyValuePair <ulong, bool> HistoryEntry in History)
                {
                    ulong              DocKey     = HistoryEntry.Key;
                    bool               WasVisited = HistoryEntry.Value;
                    ListViewItem       lvItem     = null;
                    MacroscopeDocument msDoc      = DocCollection.GetDocumentByDocKey(DocKey: DocKey);
                    string             PairKey    = DocKey.ToString();

                    if (msDoc != null)
                    {
                        string Url             = msDoc.GetUrl();
                        string Visited         = WasVisited ? "Yes" : "No";
                        string InDocCollection = DocCollection.ContainsDocument(Url: Url) ? "Yes" : "No";

                        if (this.DisplayListView.Items.ContainsKey(PairKey))
                        {
                            // Existing row: refresh its columns.
                            try
                            {
                                lvItem = this.DisplayListView.Items[PairKey];
                                lvItem.SubItems[ColUrl].Text             = Url;
                                lvItem.SubItems[ColVisited].Text         = Visited;
                                lvItem.SubItems[ColInDocCollection].Text = InDocCollection;
                            }
                            catch (Exception ex)
                            {
                                DebugMsg(string.Format("RenderListView 1: {0}", ex.Message));
                            }
                        }
                        else
                        {
                            // New row: sub items are appended in column order.
                            try
                            {
                                lvItem = new ListViewItem(PairKey);
                                lvItem.UseItemStyleForSubItems = false;

                                lvItem.Name = PairKey;

                                lvItem.SubItems[0].Text = Url;
                                lvItem.SubItems.Add(Visited);
                                lvItem.SubItems.Add(InDocCollection);

                                ListViewItems.Add(lvItem);
                            }
                            catch (Exception ex)
                            {
                                DebugMsg(string.Format("RenderListView 2: {0}", ex.Message));
                            }
                        }

                        if (lvItem != null)
                        {
                            lvItem.ForeColor = Color.Blue;

                            if (AllowedHosts.IsInternalUrl(Url))
                            {
                                lvItem.SubItems[ColUrl].ForeColor             = Color.Green;
                                lvItem.SubItems[ColVisited].ForeColor         = WasVisited ? Color.Green : Color.Red;
                                lvItem.SubItems[ColInDocCollection].ForeColor = Color.Blue;
                            }
                            else
                            {
                                // Non-internal URLs are shown entirely in gray.
                                lvItem.SubItems[ColUrl].ForeColor             = Color.Gray;
                                lvItem.SubItems[ColVisited].ForeColor         = Color.Gray;
                                lvItem.SubItems[ColInDocCollection].ForeColor = Color.Gray;
                            }
                        }
                    }

                    if (ShowProgress)
                    {
                        Count++;

                        ProgressForm.UpdatePercentages(
                            Title: null,
                            Message: null,
                            MajorPercentage: ((decimal)100 / TotalDocs) * Count,
                            ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
                            );
                    }
                }

                this.DisplayListView.Items.AddRange(ListViewItems.ToArray());
            }
            finally
            {
                // Runs on success and on failure so the progress form is never leaked.
                if (ShowProgress)
                {
                    ProgressForm.DoClose();
                }

                ProgressForm.Dispose();
            }
        }
コード例 #3
0
        /**************************************************************************/

        /// <summary>
        /// Processes a single URL: resolves or creates its document in the collection,
        /// records hostname, robots and external-host information on it, executes the
        /// document, and queues follow-up work (credential requests, redirect targets,
        /// hreflang languages and outlinks).
        /// </summary>
        /// <param name="Url">URL to fetch.</param>
        /// <param name="RedirectedFromUrl">
        /// When not null or empty, recorded on the document as the URL it was redirected from.
        /// </param>
        /// <returns>
        /// <c>VOID</c> when the page limit has been reached, <c>ALREADY_SEEN</c> when the
        /// collection holds a non-dirty document for the URL, <c>SKIPPED</c> when the URL
        /// is deeper than the configured depth, and <c>SUCCESS</c> in every other case.
        /// NOTE(review): the NETWORK_ERROR, ROBOTS_DISALLOWED and ERROR values assigned
        /// along the way are overwritten by SUCCESS before the final return; confirm
        /// whether that is intended.
        /// </returns>
        private async Task <MacroscopeConstants.FetchStatus> Fetch(string Url, string RedirectedFromUrl = null)
        {
            MacroscopeConstants.FetchStatus Outcome = MacroscopeConstants.FetchStatus.VOID;
            MacroscopeDocument Doc;

            /** Page limit ------------------------------------------------------- **/

            if (MacroscopePreferencesManager.GetPageLimit() > -1)
            {
                int FoundSoFar = this.JobMaster.GetPagesFound();
                int Limit      = MacroscopePreferencesManager.GetPageLimit();

                if (FoundSoFar >= Limit)
                {
                    this.DebugMsg(string.Format("PAGE LIMIT REACHED: {0} :: {1}", Limit, FoundSoFar));
                    return Outcome;
                }
            }

            /** Resolve or create the document ----------------------------------- **/

            if (!this.DocCollection.ContainsDocument(Url: Url))
            {
                Doc = this.DocCollection.CreateDocument(Url: Url);
            }
            else
            {
                Doc = this.DocCollection.GetDocumentByUrl(Url: Url);

                bool UsesBasicAuth =
                    (Doc.GetAuthenticationRealm() != null) &&
                    (Doc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC);

                if (UsesBasicAuth)
                {
                    MacroscopeCredential StoredCredential = this.JobMaster.GetCredentialsHttp().GetCredential(
                        Doc.GetHostAndPort(),
                        Doc.GetAuthenticationRealm()
                        );

                    // A stored credential for this host/realm replaces the existing
                    // document with one created using that credential.
                    if (StoredCredential != null)
                    {
                        Doc = this.DocCollection.CreateDocument(
                            Credential: StoredCredential,
                            Url: Url
                            );
                    }
                }
            }

            if (!string.IsNullOrEmpty(RedirectedFromUrl))
            {
                Doc.SetUrlRedirectFrom(Url: RedirectedFromUrl);
            }

            Doc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK);

            /** Hostname check --------------------------------------------------- **/

            bool HostnameIsValid = MacroscopeDnsTools.CheckValidHostname(Url: Url);

            if (!HostnameIsValid)
            {
                this.DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK"));
                Doc.SetStatusCode(HttpStatusCode.BadGateway);
                Outcome = MacroscopeConstants.FetchStatus.NETWORK_ERROR;
                Doc.SetFetchStatus(Outcome);
            }

            /** Robots rules ----------------------------------------------------- **/

            Doc.SetAllowedByRobots(await this.JobMaster.GetRobots().CheckRobotRule(Url: Url));

            // A false result from ApplyRobotRule is what takes the "disallowed" path.
            bool PassedRobotsRule = await this.JobMaster.GetRobots().ApplyRobotRule(Url: Url);

            if (PassedRobotsRule)
            {
                this.JobMaster.RemoveFromBlockedByRobots(Url);
            }
            else
            {
                this.DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url));

                this.JobMaster.AddToBlockedByRobots(Url);

                Outcome = MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED;
                Doc.SetFetchStatus(Outcome);

                JobHistory.VisitedHistoryItem(Url: Doc.GetUrl());

                // NOTE(review): processing continues past this point for a disallowed
                // URL; confirm that Execute() is meant to run in that case.
            }

            /** External host flag ----------------------------------------------- **/

            if (this.AllowedHosts.IsExternalUrl(Url: Url))
            {
                this.DebugMsg(string.Format("IsExternalUrl: {0}", Url));
                Doc.SetIsExternal(State: true);
            }

            /** Skip conditions -------------------------------------------------- **/

            if (
                this.DocCollection.ContainsDocument(Url: Url) &&
                !this.DocCollection.GetDocumentByUrl(Url: Url).GetIsDirty())
            {
                return MacroscopeConstants.FetchStatus.ALREADY_SEEN;
            }

            if (MacroscopePreferencesManager.GetDepth() >= 0)
            {
                int UrlDepth = MacroscopeHttpUrlUtils.FindUrlDepth(Url: Url);

                if (UrlDepth > MacroscopePreferencesManager.GetDepth())
                {
                    this.DebugMsg(string.Format("URL Too Deep: {0}", UrlDepth));
                    return MacroscopeConstants.FetchStatus.SKIPPED;
                }
            }

            /** Execute ---------------------------------------------------------- **/

            bool Executed = await Doc.Execute();

            if (!Executed)
            {
                this.DebugMsg(string.Format("EXECUTE FAILED: {0}", Url));
                Outcome = MacroscopeConstants.FetchStatus.ERROR;
            }

            /** Post-processing (runs whether or not Execute succeeded) ---------- **/

            bool NeedsBasicCredential =
                (Doc.GetStatusCode() == HttpStatusCode.Unauthorized) &&
                (Doc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC);

            if (NeedsBasicCredential)
            {
                MacroscopeCredentialsHttp CredentialStore = this.JobMaster.GetCredentialsHttp();

                CredentialStore.EnqueueCredentialRequest(
                    Domain: Doc.GetHostAndPort(),
                    Realm: Doc.GetAuthenticationRealm(),
                    Url: Doc.GetUrl()
                    );

                // Put the URL back on the queue after requesting credentials.
                this.JobMaster.AddUrlQueueItem(Url: Doc.GetUrl());
            }

            if (!Doc.GetIsRedirect())
            {
                this.ProcessHrefLangLanguages(Doc);         // Process Languages from HrefLang

                this.JobMaster.ProcessOutlinks(msDoc: Doc); // Process Outlinks from document
            }
            else
            {
                this.DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", Doc.GetUrl()));
                this.DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", Doc.GetUrlRedirectFrom()));

                if (MacroscopePreferencesManager.GetCheckRedirects())
                {
                    // SourceHost and FromHost are computed but not used further below.
                    string SourceHost     = Doc.GetHostAndPort();
                    string FromHost       = MacroscopeAllowedHosts.ParseHostnameFromUrl(Doc.GetUrlRedirectFrom());
                    string RedirectTarget = Doc.GetUrlRedirectTo();
                    string TargetHost     = MacroscopeAllowedHosts.ParseHostnameFromUrl(RedirectTarget);

                    this.DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", RedirectTarget));
                    this.DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", TargetHost));

                    if (MacroscopePreferencesManager.GetFollowRedirects())
                    {
                        // Register the redirect target's host when external links are
                        // being checked, or when the target is an internal URL anyway.
                        if (
                            MacroscopePreferencesManager.GetCheckExternalLinks() ||
                            this.AllowedHosts.IsInternalUrl(Url: RedirectTarget))
                        {
                            this.AllowedHosts.AddFromUrl(Url: RedirectTarget);
                        }
                    }
                }

                this.JobMaster.AddUrlQueueItem(Url: Doc.GetUrlRedirectTo());
            }

            // Unconditional: replaces any status assigned earlier in this method.
            Outcome = MacroscopeConstants.FetchStatus.SUCCESS;

            /** History ---------------------------------------------------------- **/

            if (DocCollection.ContainsDocument(msDoc: Doc))
            {
                JobHistory.VisitedHistoryItem(Url: Url);
            }
            else
            {
                this.DebugMsg(string.Format("OOPS: {0}", Url));
            }

            return Outcome;
        }
コード例 #4
0
        /**************************************************************************/

        /// <summary>
        /// Renders one list view row per outlink of <paramref name="msDoc"/>, showing the
        /// link type, source and target URL, the target's status, the follow state, the
        /// alt text and the raw source/target URLs.
        /// </summary>
        /// <remarks>
        /// Rows already present in <c>DisplayListView</c> (found by a key derived from the
        /// source/target URL pair) are updated in place; rows that do not exist yet are
        /// created and appended to <paramref name="ListViewItems"/>.
        /// </remarks>
        /// <param name="ListViewItems">Receives the newly created rows.</param>
        /// <param name="DocCollection">Collection queried for the status of each link target.</param>
        /// <param name="msDoc">Document whose outlinks are rendered.</param>
        /// <param name="Url">Source URL; shown in the URL column and used in the row key.</param>
        protected override void RenderListView(
            List <ListViewItem> ListViewItems,
            MacroscopeDocumentCollection DocCollection,
            MacroscopeDocument msDoc,
            string Url
            )
        {
            MacroscopeAllowedHosts HostRules = this.MainForm.GetJobMaster().GetAllowedHosts();

            foreach (MacroscopeLink Outlink in msDoc.IterateOutlinks())
            {
                string TypeText    = Outlink.GetLinkType().ToString();
                string TargetUrl   = Outlink.GetTargetUrl();
                string RowKey      = string.Join(":", UrlToDigest(Url: Url), UrlToDigest(Url: TargetUrl));
                string AltLabel    = Outlink.GetAltText();
                string RawSource   = Outlink.GetRawSourceUrl();
                string RawTarget   = Outlink.GetRawTargetUrl();
                string CodeText    = "Not crawled";
                string StatusLabel = "Not crawled";

                // Look up the status of the target; the placeholders above remain when
                // the target is not in the collection or the lookup fails.
                try
                {
                    if (DocCollection.ContainsDocument(Url: Outlink.GetTargetUrl()))
                    {
                        HttpStatusCode TargetStatus = DocCollection.GetDocumentByUrl(Url: Outlink.GetTargetUrl()).GetStatusCode();
                        CodeText    = ((int)TargetStatus).ToString();
                        StatusLabel = TargetStatus.ToString();
                    }
                }
                catch (Exception LookupError)
                {
                    this.DebugMsg(LookupError.Message);
                }

                string FollowText = Outlink.GetDoFollow() ? "Follow" : "No Follow";

                // Normalise null to the empty string for display.
                AltLabel  = string.IsNullOrEmpty(AltLabel) ? "" : AltLabel;
                RawSource = string.IsNullOrEmpty(RawSource) ? "" : RawSource;
                RawTarget = string.IsNullOrEmpty(RawTarget) ? "" : RawTarget;

                ListViewItem Row = null;

                if (!this.DisplayListView.Items.ContainsKey(RowKey))
                {
                    // No row yet: create one, appending the sub items in column order.
                    try
                    {
                        Row = new ListViewItem(RowKey);
                        Row.UseItemStyleForSubItems = false;
                        Row.Name = RowKey;

                        Row.SubItems[ColType].Text = TypeText;
                        Row.SubItems.Add(Url);
                        Row.SubItems.Add(TargetUrl);
                        Row.SubItems.Add(CodeText);
                        Row.SubItems.Add(StatusLabel);
                        Row.SubItems.Add(FollowText);
                        Row.SubItems.Add(AltLabel);
                        Row.SubItems.Add(RawSource);
                        Row.SubItems.Add(RawTarget);

                        ListViewItems.Add(Row);
                    }
                    catch (Exception CreateError)
                    {
                        this.DebugMsg(string.Format("MacroscopeDisplayLinks 2: {0}", CreateError.Message));
                    }
                }
                else
                {
                    // Row exists: overwrite every column.
                    try
                    {
                        Row = this.DisplayListView.Items[RowKey];

                        Row.SubItems[ColType].Text         = TypeText;
                        Row.SubItems[ColUrl].Text          = Url;
                        Row.SubItems[ColUrlTarget].Text    = TargetUrl;
                        Row.SubItems[ColStatusCode].Text   = CodeText;
                        Row.SubItems[ColStatus].Text       = StatusLabel;
                        Row.SubItems[ColDoFollow].Text     = FollowText;
                        Row.SubItems[ColAltTextLabel].Text = AltLabel;
                        Row.SubItems[ColRawSourceUrl].Text = RawSource;
                        Row.SubItems[ColRawTargetUrl].Text = RawTarget;
                    }
                    catch (Exception UpdateError)
                    {
                        this.DebugMsg(string.Format("MacroscopeDisplayLinks 1: {0}", UpdateError.Message));
                    }
                }

                if (Row == null)
                {
                    continue;
                }

                // Blue is the default for every cell; selected columns are recoloured below.
                for (int Column = 0; Column < Row.SubItems.Count; Column++)
                {
                    Row.SubItems[Column].ForeColor = Color.Blue;
                }

                Color SourceColor = HostRules.IsAllowedFromUrl(Url) ? Color.Green : Color.Gray;
                Row.SubItems[ColUrl].ForeColor = SourceColor;

                Color TargetColor = HostRules.IsAllowedFromUrl(TargetUrl) ? Color.Green : Color.Gray;
                Row.SubItems[ColUrlTarget].ForeColor = TargetColor;

                // The follow column is gray when the target is not allowed; otherwise
                // green for followed links and red for no-follow links.
                Color FollowColor = Color.Gray;

                if (HostRules.IsAllowedFromUrl(TargetUrl))
                {
                    FollowColor = Outlink.GetDoFollow() ? Color.Green : Color.Red;
                }

                Row.SubItems[ColDoFollow].ForeColor = FollowColor;
            }
        }