Exemplo n.º 1
0
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Normalizes <paramref name="Url"/> according to the query-string and
        /// hash-fragment preferences, queues it on the URL list queue if the job
        /// history has not seen it yet, and then reports it to the progress tracker.
        /// </summary>
        /// <param name="Url">The URL to be queued.</param>
        public void AddUrlQueueItem(string Url)
        {
            string NormalizedUrl = Url;

            // Query string is stripped first, then the hash fragment.
            if (MacroscopePreferencesManager.GetIgnoreQueries())
            {
                NormalizedUrl = MacroscopeUrlUtils.StripQueryString(Url: NormalizedUrl);
            }

            if (MacroscopePreferencesManager.GetIgnoreHashFragments())
            {
                NormalizedUrl = MacroscopeUrlUtils.StripHashFragment(Url: NormalizedUrl);
            }

            if (!this.JobHistory.SeenHistoryItem(Url: NormalizedUrl))
            {
                try
                {
                    MacroscopeJobItem QueueEntry = new MacroscopeJobItem(Url: NormalizedUrl);

                    this.NamedQueueJobItems.AddToNamedQueue(
                        Name: MacroscopeConstants.NamedQueueUrlList,
                        Item: QueueEntry
                        );
                }
                catch (MacroscopeNamedQueueException ex)
                {
                    // Queue failures are logged only; the progress update below still happens.
                    this.DebugMsg(string.Format("AddUrlQueueItem: {0}", ex.Message));
                }
            }

            // Progress is updated whether or not the URL was newly queued.
            this.AddToProgress(Url: NormalizedUrl);
        }
Exemplo n.º 2
0
        public void TestMakeUrlAbsoluteUrls()
        {
            // Resolves each relative URL (key) against the page URL and checks that
            // the result equals the expected absolute URL (value).
            const string PageUrl = "http://www.host.com/path/to/page/" + "index.html";

            Dictionary <string, string> ExpectedByRelativeUrl = new Dictionary <string, string>
            {
                {
                    @"path/to/images/picture.gif",
                    @"http://www.host.com/path/to/page/path/to/images/picture.gif"
                },
                {
                    @"../path/to/images/picture.gif",
                    @"http://www.host.com/path/to/path/to/images/picture.gif"
                },
                {
                    @"../../path/to/images/picture.gif",
                    @"http://www.host.com/path/path/to/images/picture.gif"
                }
            };

            foreach (KeyValuePair <string, string> Expectation in ExpectedByRelativeUrl)
            {
                string Resolved = MacroscopeUrlUtils.MakeUrlAbsolute(PageUrl, Expectation.Key);
                Assert.AreEqual(Expectation.Value, Resolved, "DO NOT MATCH");
            }
        }
Exemplo n.º 3
0
        /**************************************************************************/

        /// <summary>
        /// Extracts the URL from a CSS background-image style value and resolves it
        /// against this document's URL.
        /// </summary>
        /// <param name="BackgroundImageUrl">The raw CSS value to be cleaned.</param>
        /// <returns>
        /// The absolute URL, or <c>null</c> when no URL could be extracted or the
        /// resolution raised a <c>MacroscopeUriFormatException</c>.
        /// </returns>
        private string ProcessCssBackImageUrl(string BackgroundImageUrl)
        {
            string CleanedUrl = MacroscopeUrlUtils.CleanUrlCss(BackgroundImageUrl);

            // Nothing usable in the CSS value: nothing to resolve or log.
            if (CleanedUrl == null)
            {
                return(null);
            }

            string ResolvedUrl = null;

            try
            {
                ResolvedUrl = MacroscopeUrlUtils.MakeUrlAbsolute(
                    BaseUrl: this.DocUrl,
                    Url: CleanedUrl
                    );
            }
            catch (MacroscopeUriFormatException ex)
            {
                // Resolution failure is logged; the result stays null.
                DebugMsg(string.Format("ProcessCssBackImageUrl: {0}", ex.Message));
            }

            DebugMsg(string.Format("ProcessCssBackImageUrl: {0}", CleanedUrl));
            DebugMsg(string.Format("ProcessCssBackImageUrl: this.DocUrl: {0}", this.DocUrl));
            DebugMsg(string.Format("ProcessCssBackImageUrl: LinkUrlAbs: {0}", ResolvedUrl));

            return(ResolvedUrl);
        }
Exemplo n.º 4
0
        public void TestMakeUrlAbsoluteUrlsWithBaseHref()
        {
            /*
             * Each test case holds, in order:
             *  [0] Base HREF
             *  [1] Base URL
             *  [2] Page URL
             *  [3] Expected absolute URL
             */

            string[][] TestCases = new string[][]
            {
                new string[]
                {
                    "http://www.host.com/BASEHREF/index.html",
                    "http://www.host.com/path/to/page/",
                    "http://www.host.com/path/to/page/to/pages/index.html",
                    "http://www.host.com/path/to/page/to/pages/index.html"
                },
                new string[]
                {
                    "http://www.host.com/BASEHREF/index.html",
                    "http://www.host.com/path/to/page/",
                    "path/to/pages/index.html",
                    "http://www.host.com/BASEHREF/path/to/pages/index.html"
                },
                new string[]
                {
                    "http://www.host.com/BASEHREF/index.html",
                    "http://www.host.com/path/to/page/",
                    "../path/to/pages/index.html",
                    "http://www.host.com/path/to/pages/index.html"
                },
                new string[]
                {
                    "http://www.host.com/BASEHREF/index.html",
                    "http://www.host.com/path/to/page/",
                    "../../path/to/pages/index.html",
                    "http://www.host.com/path/to/pages/index.html"
                }
            };

            foreach (string[] TestCase in TestCases)
            {
                string ExpectedUrl = TestCase[3];

                string ActualUrl = MacroscopeUrlUtils.MakeUrlAbsolute(
                    BaseHref: TestCase[0],
                    BaseUrl: TestCase[1],
                    Url: TestCase[2]
                    );

                Assert.AreEqual(ExpectedUrl, ActualUrl, "DO NOT MATCH");
            }
        }
Exemplo n.º 5
0
        /**************************************************************************/

        /// <summary>
        /// Reports whether the MIME type looked up for <paramref name="Url"/> is
        /// <c>application/pdf</c> (compared case-insensitively).
        /// </summary>
        /// <param name="Url">The URL whose MIME type is looked up.</param>
        /// <returns>
        /// <c>true</c> when the MIME type matches; <c>false</c> when it does not
        /// match or is null or empty.
        /// </returns>
        public Boolean IsPdfUrl(string Url)
        {
            string DetectedType = MacroscopeUrlUtils.GetMimeTypeOfUrl(Url: Url);

            if (string.IsNullOrEmpty(DetectedType))
            {
                return(false);
            }

            return(Regex.IsMatch(DetectedType, "^application/pdf$", RegexOptions.IgnoreCase));
        }
Exemplo n.º 6
0
        /**************************************************************************/

        /// <summary>
        /// Sends a HEAD request to <c>this.Url</c> and reports whether it answered
        /// with HTTP 200 (OK). On a 200 response the response headers are passed to
        /// <c>ProcessResponseHttpHeaders</c>.
        /// </summary>
        /// <returns>
        /// <c>true</c> when the response status was <c>HttpStatusCode.OK</c>;
        /// otherwise <c>false</c>. <c>UriFormatException</c> and
        /// <c>WebException</c> are logged rather than propagated.
        /// </returns>
        private Boolean Check()
        {
            // TODO: Increase level of detail here.

            Boolean IsAvailableCheck = false;

            try
            {
                HttpWebRequest req = WebRequest.CreateHttp(this.Url);

                req.Method    = "HEAD";
                req.Timeout   = 10000; // milliseconds
                req.KeepAlive = false;
                req.Host      = MacroscopeUrlUtils.GetHostnameAndPortFromUrl(this.Url);
                req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                MacroscopePreferencesManager.EnableHttpProxy(req);

                // The using statement owns the response: disposing it closes the
                // response, so no explicit Close() is needed and no disposed
                // reference remains in scope afterwards.
                using (HttpWebResponse res = ( HttpWebResponse )req.GetResponse())
                {
                    DebugMsg(string.Format("MacroscopeHrefLang Status: {0}", res.StatusCode));

                    if (res.StatusCode == HttpStatusCode.OK)
                    {
                        IsAvailableCheck = true;

                        this.ProcessResponseHttpHeaders(req: req, res: res);
                    }
                }
            }
            catch (UriFormatException ex)
            {
                DebugMsg(string.Format("MacroscopeHrefLang UriFormatException: {0}", ex.Message));
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("MacroscopeHrefLang WebException: {0}", ex.Message));
            }

            return(IsAvailableCheck);
        }
Exemplo n.º 7
0
        /** Execute Job ***********************************************************/

        /// <summary>
        /// Runs the job for <c>this.StartUrl</c>: sanitizes the start URL, prepares
        /// the document collection and allowed hosts, seeds the URL queue when
        /// needed, probes robots and crawl-delay settings, spawns the workers, and
        /// finally invokes the task controller's scan-complete callback.
        /// </summary>
        /// <returns>Always <c>true</c>.</returns>
        public Boolean Execute()
        {
            DebugMsg(string.Format("Start URL: {0}", this.StartUrl));

            //this.LogEntry( string.Format( "Executing with Start URL: {0}", this.StartUrl ) );

            // From here on, the sanitized form of the start URL is used everywhere.
            this.StartUrl = MacroscopeUrlUtils.SanitizeUrl(Url: this.StartUrl);

            this.DocCollection.SetStartUrl(Url: this.StartUrl);

            this.DetermineStartingDirectory();

            this.SetThreadsStop(Stopped: false);

            this.AllowedHosts.AddFromUrl(Url: this.StartUrl);

            // Seed the queue only when PeekUrlQueue() reports false.
            // NOTE(review): presumably false means the queue is empty - confirm.
            if (!this.PeekUrlQueue())
            {
                string RobotsUrl = MacroscopeRobots.GenerateRobotUrl(Url: this.StartUrl);

                // The robots URL is queued ahead of the start URL, when one could be generated.
                if (!string.IsNullOrEmpty(RobotsUrl))
                {
                    this.AddUrlQueueItem(Url: RobotsUrl);
                }

                this.IncludeExcludeUrls.AddExplicitIncludeUrl(Url: this.StartUrl);

                this.AddUrlQueueItem(Url: this.StartUrl);
            }

            this.ProbeRobotsFile(Url: this.StartUrl);

            this.SetCrawlDelay(Url: this.StartUrl);

            // NOTE(review): the code below assumes SpawnWorkers() returns only once
            // the workers have finished - confirm.
            this.SpawnWorkers();

            DebugMsg(string.Format("Pages Found: {0}", this.GetPagesFound()));

            // The completion callback is optional; no controller means no notification.
            if (this.TaskController != null)
            {
                this.TaskController.ICallbackScanComplete();
            }

            this.AddUpdateDisplayQueue(Url: this.StartUrl);

            return(true);
        }
Exemplo n.º 8
0
        public void TestValidateUrls()
        {
            // Maps each candidate URL to the validity verdict that ValidateUrl is
            // expected to return for it.
            Dictionary <string, Boolean> ExpectedValidity = new Dictionary <string, Boolean>
            {
                { "http://www.host.com/", true },
                { "http://www.host.com/index.html", true },
                { "http://www.host.com/path/path/to/images/picture.gif", true },
                { "http://www.host.com/??", true },
                { "http://www.host.com/ ", true },
                { "http://   www.host.com/", false }
            };

            foreach (KeyValuePair <string, Boolean> Expectation in ExpectedValidity)
            {
                string  Candidate = Expectation.Key;
                Boolean Verdict   = MacroscopeUrlUtils.ValidateUrl(Candidate);

                Assert.AreEqual(Expectation.Value, Verdict, string.Format("NOT VALID: {0}", Candidate));
            }
        }
Exemplo n.º 9
0
        public void TestStripHashFragment()
        {
            // Maps each input URL to the URL expected once its hash fragment is removed.
            Dictionary <string, string> ExpectedByInput = new Dictionary <string, string>
            {
                {
                    "http://www.host.com/#aberdeen-angus",
                    "http://www.host.com/"
                },
                {
                    "http://www.host.com/product/list/#boris",
                    "http://www.host.com/product/list/"
                },
                {
                    "http://www.host.com/product/list/index.html#boris",
                    "http://www.host.com/product/list/index.html"
                },
                {
                    "http://www.host.com/?key1=value1&key2=value2&key3=value3",
                    "http://www.host.com/?key1=value1&key2=value2&key3=value3"
                },
                {
                    "http://www.host.com/?key1=value1&key2=value2&key3=value3#gonzo",
                    "http://www.host.com/?key1=value1&key2=value2&key3=value3"
                },
                {
                    "http://www.host.com/index.html?key1=value1&key2=value2&key3=value3#gonzo",
                    "http://www.host.com/index.html?key1=value1&key2=value2&key3=value3"
                }
            };

            foreach (KeyValuePair <string, string> Expectation in ExpectedByInput)
            {
                string Input    = Expectation.Key;
                string Stripped = MacroscopeUrlUtils.StripHashFragment(Input);

                Assert.AreEqual(Expectation.Value, Stripped, string.Format("NOT VALID: {0}", Input));
            }
        }
Exemplo n.º 10
0
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Normalizes <paramref name="Url"/> according to the query-string and
        /// hash-fragment preferences and asks the URL list queue to forget the
        /// corresponding job item.
        /// </summary>
        /// <param name="Url">The URL whose queue entry is to be forgotten.</param>
        public void ForgetUrlQueueItem(string Url)
        {
            string NormalizedUrl = Url;

            // Same normalization order as used when queueing: query string, then fragment.
            if (MacroscopePreferencesManager.GetIgnoreQueries())
            {
                NormalizedUrl = MacroscopeUrlUtils.StripQueryString(Url: NormalizedUrl);
            }

            if (MacroscopePreferencesManager.GetIgnoreHashFragments())
            {
                NormalizedUrl = MacroscopeUrlUtils.StripHashFragment(Url: NormalizedUrl);
            }

            MacroscopeJobItem QueueEntry = new MacroscopeJobItem(Url: NormalizedUrl);

            this.NamedQueueJobItems.ForgetNamedQueueItem(
                Name: MacroscopeConstants.NamedQueueUrlList,
                Item: QueueEntry
                );
        }
Exemplo n.º 11
0
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Appends to <paramref name="SitemapText"/> the target URL of every
        /// outgoing hyperlink of <paramref name="msDoc"/> that has not been seen
        /// before, whose path ends in ".pdf", that passes the allowed-hosts check,
        /// and that is on the same host as the document.
        /// </summary>
        /// <param name="msDoc">The document whose outgoing hyperlinks are examined.</param>
        /// <param name="SitemapText">The list receiving the accepted PDF URLs.</param>
        /// <param name="Dedupe">
        /// URLs already processed; every well-formed URL examined here is added to
        /// it, whether or not it ends up in the sitemap.
        /// </param>
        private void GenerateTextSitemapPdfEntries(
            MacroscopeDocument msDoc,
            List <string> SitemapText,
            Dictionary <string, Boolean> Dedupe
            )
        {
            foreach (MacroscopeHyperlinkOut HyperlinkOut in msDoc.IterateHyperlinksOut())
            {
                string Url = HyperlinkOut.GetTargetUrl();
                Uri    UrlParsed;

                // One null, malformed or relative link must not abort the whole
                // sitemap; such links are skipped instead of letting the Uri
                // constructor throw.
                if (!Uri.TryCreate(Url, UriKind.Absolute, out UrlParsed))
                {
                    continue;
                }

                if (Dedupe.ContainsKey(Url))
                {
                    continue;
                }

                Dedupe.Add(Url, true);

                // A file extension is not linguistic text: compare ordinally and
                // case-insensitively, with no intermediate lower-cased copy.
                if (!UrlParsed.AbsolutePath.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                if (!this.DocCollection.GetAllowedHosts().IsAllowedFromUrl(Url: Url))
                {
                    continue;
                }

                if (!MacroscopeUrlUtils.VerifySameHost(BaseUrl: msDoc.GetUrl(), Url: Url))
                {
                    continue;
                }

                SitemapText.Add(Url);
            }
        }
Exemplo n.º 12
0
        /**************************************************************************/

        /*
         *
         * Reference: https://www.w3.org/TR/html5/document-metadata.html#the-base-element
         *
         */

        /// <summary>
        /// Resolves <paramref name="Url"/> to an absolute URL, honouring an
        /// optional base HREF. When <paramref name="BaseHref"/> is null or empty,
        /// <paramref name="Url"/> is resolved directly against
        /// <paramref name="BaseUrl"/>; otherwise the base HREF is first resolved
        /// against <paramref name="BaseUrl"/> and <paramref name="Url"/> is then
        /// resolved against that result.
        /// </summary>
        /// <param name="BaseHref">The document's base HREF; may be null or empty.</param>
        /// <param name="BaseUrl">The URL of the document.</param>
        /// <param name="Url">The URL to be made absolute.</param>
        /// <returns>The value produced by the two-argument <c>MakeUrlAbsolute</c> overload.</returns>
        public static string MakeUrlAbsolute(
            string BaseHref,
            string BaseUrl,
            string Url
            )
        {
            // No base HREF: plain resolution against the document URL.
            if (string.IsNullOrEmpty(value: BaseHref))
            {
                return(MacroscopeUrlUtils.MakeUrlAbsolute(
                           BaseUrl: BaseUrl,
                           Url: Url
                           ));
            }

            string ResolvedBaseHref = MacroscopeUrlUtils.MakeUrlAbsolute(
                BaseUrl: BaseUrl,
                Url: BaseHref
                );

            DebugMsg(string.Format("BASEHREF: {0}", BaseHref), true);
            DebugMsg(string.Format("ABSOLUTEBASEHREF: {0}", ResolvedBaseHref), true);

            string ResolvedUrl = MacroscopeUrlUtils.MakeUrlAbsolute(
                BaseUrl: ResolvedBaseHref,
                Url: Url
                );

            DebugMsg(string.Format("URL: {0}", Url), true);
            DebugMsg(string.Format("URLFIXED: {0}", ResolvedUrl), true);

            return(ResolvedUrl);
        }
Exemplo n.º 13
0
        public void TestCleanUrlCss()
        {
            // Maps each CSS declaration to the URL expected from CleanUrlCss;
            // a null value means that no URL is expected.
            Dictionary <string, string> ExpectedByDeclaration = new Dictionary <string, string>
            {
                {
                    "background-image:none;",
                    null
                },
                {
                    "background: #0b7bee url(none) no-repeat center center/cover;",
                    null
                },
                {
                    "background: #0b7bee url(images/video-bg.jpg) no-repeat center center/cover;",
                    "images/video-bg.jpg"
                },
                {
                    "background: #0b7bee url(\"images/video-bg.jpg\") no-repeat center center/cover;",
                    "images/video-bg.jpg"
                },
                {
                    "src: url(\"fonts/company/latin-e-bold-eot.eot\");",
                    "fonts/company/latin-e-bold-eot.eot"
                },
                {
                    "src: url(\"fonts/company/latin-e-bold-eot.eot?#iefix\") format(\"embedded-opentype\"),url(\"fonts/company/latin-e-bold-woff.woff\") format(\"woff\"),url(\"fonts/company/latin-e-bold-ttf.ttf\") format(\"truetype\");",
                    "fonts/company/latin-e-bold-eot.eot?#iefix"
                },
                {
                    "background: #ffffff url(images/services/features-background.png) no-repeat left bottom;",
                    "images/services/features-background.png"
                },
                {
                    "background: transparent url(\"images/home/mouse.png\") no-repeat 90% top;",
                    "images/home/mouse.png"
                },
                {
                    "background: #0b7bee url(images/services/features-background_hover.png) no-repeat left bottom;",
                    "images/services/features-background_hover.png"
                },
                {
                    "background-image: url(\"images/global/page-head-trans.png\");",
                    "images/global/page-head-trans.png"
                },
                {
                    "background-image: url(\"images/heroes/hero.jpg\");",
                    "images/heroes/hero.jpg"
                }
            };

            foreach (KeyValuePair <string, string> Expectation in ExpectedByDeclaration)
            {
                string Extracted = MacroscopeUrlUtils.CleanUrlCss(Expectation.Key);

                // The failure message reports the extracted value, not the input declaration.
                Assert.AreEqual(Expectation.Value, Extracted, string.Format("NOT VALID: {0}", Extracted));
            }
        }
Exemplo n.º 14
0
        /**************************************************************************/

        /// <summary>
        /// Fetches the document for <paramref name="Url"/>: obtains or creates its
        /// <c>MacroscopeDocument</c>, applies hostname, robots, duplicate and depth
        /// checks, executes the document, and then queues follow-up work
        /// (credential requests, redirect targets, hreflang links and outlinks).
        /// </summary>
        /// <param name="Url">The URL to fetch.</param>
        /// <returns>
        /// <c>ALREADY_SEEN</c> when the collection already holds a non-dirty
        /// document for the URL; <c>SKIPPED</c> when the URL is deeper than the
        /// configured depth; <c>SUCCESS</c> when <c>msDoc.Execute()</c> returns
        /// true; otherwise <c>ERROR</c>.
        /// NOTE(review): <c>NETWORK_ERROR</c> and <c>ROBOTS_DISALLOWED</c> are
        /// assigned to the local status but never returned, because every return
        /// path below overwrites it; they survive only via
        /// <c>msDoc.SetFetchStatus</c>. Confirm that this is intended.
        /// </returns>
        private MacroscopeConstants.FetchStatus Fetch(string Url)
        {
            MacroscopeDocument msDoc = this.DocCollection.GetDocument(Url);

            MacroscopeConstants.FetchStatus FetchStatus = MacroscopeConstants.FetchStatus.VOID;

            if (msDoc != null)
            {
                // Existing document that reported a BASIC authentication realm:
                // when a stored credential is available, replace the document with
                // a new one created with that credential.
                if (msDoc.GetAuthenticationRealm() != null)
                {
                    if (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC)
                    {
                        MacroscopeCredential Credential;

                        Credential = this.JobMaster.GetCredentialsHttp().GetCredential(
                            msDoc.GetHostAndPort(),
                            msDoc.GetAuthenticationRealm()
                            );

                        if (Credential != null)
                        {
                            msDoc = this.DocCollection.CreateDocument(
                                Credential: Credential,
                                Url: Url
                                );
                        }
                    }
                }
            }
            else
            {
                // No document known for this URL yet.
                msDoc = this.DocCollection.CreateDocument(Url);
            }

            // Start from OK; the checks below may downgrade the document's status.
            msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK);

            if (!MacroscopeDnsTools.CheckValidHostname(Url: Url))
            {
                DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK"));

                msDoc.SetStatusCode(HttpStatusCode.BadGateway);

                FetchStatus = MacroscopeConstants.FetchStatus.NETWORK_ERROR;

                msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.NETWORK_ERROR);

                // NOTE(review): processing continues after a failed hostname check;
                // there is no early return here.
            }

            if (!this.JobMaster.GetRobots().ApplyRobotRule(Url))
            {
                DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url));

                this.JobMaster.AddToBlockedByRobots(Url);

                FetchStatus = MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED;

                msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED);

                this.JobMaster.GetJobHistory().VisitedHistoryItem(msDoc.GetUrl());

                // NOTE(review): as above, there is no early return; msDoc.Execute()
                // is still reached for a disallowed URL. Presumably Execute()
                // honours the fetch status set on the document - confirm.
            }
            else
            {
                this.JobMaster.RemoveFromBlockedByRobots(Url);
            }

            this.JobMaster.GetJobHistory().AddHistoryItem(Url);

            if (this.AllowedHosts.IsExternalUrl(Url: Url))
            {
                DebugMsg(string.Format("IsExternalUrl: {0}", Url));
                msDoc.SetIsExternal(State: true);
            }

            // A document already in the collection is fetched again only when it is dirty.
            if (this.DocCollection.ContainsDocument(Url))
            {
                if (!this.DocCollection.GetDocument(Url).GetIsDirty())
                {
                    FetchStatus = MacroscopeConstants.FetchStatus.ALREADY_SEEN;
                    return(FetchStatus);
                }
            }

            // The depth limit applies only when the configured depth is greater than zero.
            if (this.JobMaster.GetDepth() > 0)
            {
                int Depth = MacroscopeUrlUtils.FindUrlDepth(Url);
                if (Depth > this.JobMaster.GetDepth())
                {
                    DebugMsg(string.Format("TOO DEEP: {0}", Depth));
                    FetchStatus = MacroscopeConstants.FetchStatus.SKIPPED;
                    return(FetchStatus);
                }
            }

            if (msDoc.Execute())
            {
                this.DocCollection.AddDocument(Url, msDoc);

                // 401 with BASIC authentication: request credentials and queue the
                // URL again.
                if (msDoc.GetStatusCode() == HttpStatusCode.Unauthorized)
                {
                    if (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC)
                    {
                        MacroscopeCredentialsHttp CredentialsHttp = this.JobMaster.GetCredentialsHttp();

                        CredentialsHttp.EnqueueCredentialRequest(
                            Domain: msDoc.GetHostAndPort(),
                            Realm: msDoc.GetAuthenticationRealm(),
                            Url: msDoc.GetUrl()
                            );

                        this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrl());
                    }
                }

                this.JobMaster.GetJobHistory().VisitedHistoryItem(msDoc.GetUrl());

                this.JobMaster.IncPageLimitCount();

                if (msDoc.GetIsRedirect())
                {
                    DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", msDoc.GetUrl()));
                    DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", msDoc.GetUrlRedirectFrom()));

                    // This branch only logs.
                    // NOTE(review): Hostname and HostnameFrom are assigned but never read.
                    if (MacroscopePreferencesManager.GetFollowRedirects())
                    {
                        string Hostname      = msDoc.GetHostAndPort();
                        string HostnameFrom  = MacroscopeAllowedHosts.ParseHostnameFromUrl(msDoc.GetUrlRedirectFrom());
                        string UrlRedirectTo = msDoc.GetUrlRedirectTo();
                        string HostnameTo    = MacroscopeAllowedHosts.ParseHostnameFromUrl(UrlRedirectTo);

                        DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", UrlRedirectTo));
                        DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", HostnameTo));
                    }

                    // NOTE(review): the redirect target is queued regardless of the
                    // GetFollowRedirects() preference checked above - confirm intent.
                    this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrlRedirectTo());
                }
                else
                {
                    this.ProcessHrefLangLanguages(msDoc); // Process Languages from HrefLang

                    this.ProcessOutlinks(msDoc);          // Process Outlinks from document
                }

                FetchStatus = MacroscopeConstants.FetchStatus.SUCCESS;
            }
            else
            {
                DebugMsg(string.Format("EXECUTE FAILED: {0}", Url));
                FetchStatus = MacroscopeConstants.FetchStatus.ERROR;
            }

            return(FetchStatus);
        }
Exemplo n.º 15
0
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Appends a sitemap <c>url</c> element (with <c>loc</c>,
        /// <c>changefreq</c> = "daily" and <c>priority</c> = "1.0" children) to
        /// <paramref name="UrlSetNode"/> for every outgoing hyperlink of
        /// <paramref name="msDoc"/> that has not been seen before, whose path ends
        /// in ".pdf", that passes the allowed-hosts check, and that is on the same
        /// host as the document.
        /// </summary>
        /// <param name="msDoc">The document whose outgoing hyperlinks are examined.</param>
        /// <param name="SitemapXml">The XML document used to create the new nodes.</param>
        /// <param name="UrlSetNode">The element receiving the new <c>url</c> elements.</param>
        /// <param name="Dedupe">
        /// URLs already processed; every well-formed URL examined here is added to
        /// it, whether or not it ends up in the sitemap.
        /// </param>
        private void GenerateXmlSitemapPdfEntries(
            MacroscopeDocument msDoc,
            XmlDocument SitemapXml,
            XmlElement UrlSetNode,
            Dictionary <string, Boolean> Dedupe
            )
        {
            foreach (MacroscopeHyperlinkOut HyperlinkOut in msDoc.IterateHyperlinksOut())
            {
                string Url = HyperlinkOut.GetTargetUrl();
                Uri    UrlParsed;

                // One null, malformed or relative link must not abort the whole
                // sitemap; such links are skipped instead of letting the Uri
                // constructor throw.
                if (!Uri.TryCreate(Url, UriKind.Absolute, out UrlParsed))
                {
                    continue;
                }

                if (Dedupe.ContainsKey(Url))
                {
                    continue;
                }

                Dedupe.Add(Url, true);

                // A file extension is not linguistic text: compare ordinally and
                // case-insensitively, with no intermediate lower-cased copy.
                if (!UrlParsed.AbsolutePath.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                if (!this.DocCollection.GetAllowedHosts().IsAllowedFromUrl(Url: Url))
                {
                    continue;
                }

                if (!MacroscopeUrlUtils.VerifySameHost(BaseUrl: msDoc.GetUrl(), Url: Url))
                {
                    continue;
                }

                XmlElement UrlNode = SitemapXml.CreateElement(string.Empty, "url", MacroscopeSitemapGenerator.XmlNamespace);
                UrlSetNode.AppendChild(UrlNode);

                // <loc>: the PDF's URL.
                {
                    XmlElement EntryNode = SitemapXml.CreateElement(string.Empty, "loc", MacroscopeSitemapGenerator.XmlNamespace);
                    XmlText    TextNode  = SitemapXml.CreateTextNode(Url);
                    UrlNode.AppendChild(EntryNode);
                    EntryNode.AppendChild(TextNode);
                }

                // <changefreq>: fixed value.
                {
                    XmlElement EntryNode = SitemapXml.CreateElement(string.Empty, "changefreq", MacroscopeSitemapGenerator.XmlNamespace);
                    XmlText    TextNode  = SitemapXml.CreateTextNode("daily");
                    UrlNode.AppendChild(EntryNode);
                    EntryNode.AppendChild(TextNode);
                }

                // <priority>: fixed value.
                {
                    XmlElement EntryNode = SitemapXml.CreateElement(string.Empty, "priority", MacroscopeSitemapGenerator.XmlNamespace);
                    XmlText    TextNode  = SitemapXml.CreateTextNode("1.0");
                    UrlNode.AppendChild(EntryNode);
                    EntryNode.AppendChild(TextNode);
                }
            }
        }