コード例 #1
0
        /**************************************************************************/

        /// <summary>
        /// Issues a HEAD request against <c>this.Url</c> and reports whether the
        /// server answered with HTTP 200 (OK).
        /// </summary>
        /// <remarks>
        /// When the status is OK, the request/response pair is passed to
        /// <c>ProcessResponseHttpHeaders</c>. A <see cref="UriFormatException"/> or
        /// <see cref="WebException"/> is logged through <c>DebugMsg</c> and is not
        /// rethrown.
        /// </remarks>
        /// <returns>
        /// <c>true</c> if a response with status OK was received; otherwise <c>false</c>.
        /// </returns>
        private Boolean Check()
        {
            // TODO: Increase level of detail here.

            Boolean available = false;

            try
            {
                HttpWebRequest request = WebRequest.CreateHttp(this.Url);

                request.Method                 = "HEAD";
                request.Timeout                = 10000; // milliseconds
                request.KeepAlive              = false;
                request.Host                   = MacroscopeUrlUtils.GetHostnameAndPortFromUrl(this.Url);
                request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                MacroscopePreferencesManager.EnableHttpProxy(request);

                using (HttpWebResponse response = ( HttpWebResponse )request.GetResponse())
                {
                    DebugMsg(string.Format("MacroscopeHrefLang Status: {0}", response.StatusCode));

                    // The flag is set before header processing, so it stays true even
                    // if header processing subsequently raises a WebException.
                    available = (response.StatusCode == HttpStatusCode.OK);

                    if (available)
                    {
                        this.ProcessResponseHttpHeaders(req: request, res: response);
                    }

                    response.Close();
                }
            }
            catch (UriFormatException ex)
            {
                DebugMsg(string.Format("MacroscopeHrefLang UriFormatException: {0}", ex.Message));
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("MacroscopeHrefLang WebException: {0}", ex.Message));
            }

            return(available);
        }
コード例 #2
0
        /**************************************************************************/

        /// <summary>
        /// Sends a HEAD request to <paramref name="Url"/> and returns the value of the
        /// Content-Type response header.
        /// </summary>
        /// <remarks>
        /// Automatic redirects are disabled, so the header is read from the first
        /// response received. <see cref="UriFormatException"/>,
        /// <see cref="TimeoutException"/> and <see cref="WebException"/> are logged
        /// through <c>DebugMsg</c> and are not rethrown.
        /// </remarks>
        /// <param name="Url">URL to probe.</param>
        /// <returns>
        /// The raw Content-Type header value, or <c>null</c> when the request failed
        /// or no value was available for that header.
        /// </returns>
        public static string GetMimeTypeOfUrl(string Url)
        {
            string MimeType = null;

            try
            {
                HttpWebRequest req = WebRequest.CreateHttp(Url);

                req.Method                 = "HEAD";
                req.Timeout                = MacroscopePreferencesManager.GetRequestTimeout() * 1000;
                req.KeepAlive              = false;
                req.AllowAutoRedirect      = false;
                req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                MacroscopePreferencesManager.EnableHttpProxy(req);

                // 'using' releases the connection on every exit path, not only on
                // the straight-line path.
                using (HttpWebResponse res = ( HttpWebResponse )req.GetResponse())
                {
                    MimeType = res.Headers[HttpResponseHeader.ContentType];
                }
            }
            catch (UriFormatException ex)
            {
                DebugMsg(string.Format("ExecuteHeadRequest :: UriFormatException: {0}", ex.Message), true);
            }
            catch (TimeoutException ex)
            {
                DebugMsg(string.Format("ExecuteHeadRequest :: TimeoutException: {0}", ex.Message), true);
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("ExecuteHeadRequest :: WebException: {0}", ex.Message), true);
            }

            return(MimeType);
        }
コード例 #3
0
        /**************************************************************************/

        /// <summary>
        /// Downloads the body of <paramref name="Url"/> with a GET request and returns
        /// it as an in-memory stream.
        /// </summary>
        /// <remarks>
        /// A <see cref="UriFormatException"/> or <see cref="WebException"/> raised while
        /// issuing the request, and a <see cref="WebException"/> raised while reading
        /// the body, are logged through <c>DebugMsg</c> and result in a <c>null</c>
        /// return. Other exceptions raised while reading the body propagate to the
        /// caller; the response is released in either case.
        /// </remarks>
        /// <param name="Url">URL to download.</param>
        /// <returns>
        /// A <see cref="MemoryStream"/> over the downloaded bytes, or <c>null</c> when
        /// the request failed or the body was empty.
        /// </returns>
        public MemoryStream LoadMemoryStreamFromUrl(string Url)
        {
            HttpWebResponse res      = null;
            MemoryStream    msStream = null;

            try
            {
                HttpWebRequest req = WebRequest.CreateHttp(Url);

                req.Method    = "GET";
                req.Timeout   = 1000; // milliseconds
                req.KeepAlive = false;
                req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                MacroscopePreferencesManager.EnableHttpProxy(req);

                res = ( HttpWebResponse )req.GetResponse();
            }
            catch (UriFormatException ex)
            {
                DebugMsg(string.Format("LoadFromUrl :: UriFormatException: {0}", ex.Message));
            }
            catch (WebException ex)
            {
                this.DebugMsg(string.Format("LoadFromUrl :: WebException: {0}", ex.Message));
                this.DebugMsg(string.Format("LoadFromUrl :: WebException: {0}", Url));
            }

            if (res != null)
            {
                // Dispose the response on every exit path, including exceptions
                // other than WebException thrown while reading the body.
                using (res)
                {
                    try
                    {
                        using (Stream sStream = res.GetResponseStream())
                        {
                            using (MemoryStream msBuffer = new MemoryStream())
                            {
                                // Buffered copy instead of a byte-at-a-time read loop.
                                sStream.CopyTo(msBuffer);

                                // An empty body yields null rather than an empty stream.
                                if (msBuffer.Length > 0)
                                {
                                    msStream = new MemoryStream(msBuffer.ToArray());
                                }
                            }
                        }
                    }
                    catch (WebException ex)
                    {
                        this.DebugMsg(string.Format("LoadFromUrl :: WebException: {0}", ex.Message));
                    }
                }
            }

            return(msStream);
        }
コード例 #4
0
        /**************************************************************************/

        /// <summary>
        /// Fetches <c>this.DocUrl</c> with a GET request and processes it as a
        /// JavaScript document: response headers, body text, content length and
        /// checksum, optional custom filters and data extractors, and a title taken
        /// from the text after the last "/" of the URL.
        /// </summary>
        /// <remarks>
        /// A failure while issuing the request is logged and then passed to
        /// <c>ProcessErrorCondition</c>. A failure while reading the body is logged,
        /// recorded through <c>SetStatusCode</c>, and leaves the body empty. The
        /// response is held in a <c>using</c> block so that it is released even when
        /// one of the processing steps throws.
        /// </remarks>
        private void ProcessJavascriptPage()
        {
            HttpWebRequest  req = null;
            HttpWebResponse res = null;
            string          ResponseErrorCondition = null;
            Boolean         IsAuthenticating       = false;

            try
            {
                req           = WebRequest.CreateHttp(this.DocUrl);
                req.Method    = "GET";
                req.Timeout   = this.Timeout;
                req.KeepAlive = false;
                req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                this.PrepareRequestHttpHeaders(req: req);

                IsAuthenticating = this.AuthenticateRequest(req);

                MacroscopePreferencesManager.EnableHttpProxy(req);

                res = ( HttpWebResponse )req.GetResponse();
            }
            catch (UriFormatException ex)
            {
                DebugMsg(string.Format("ProcessJavascriptPage :: UriFormatException: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
            }
            catch (TimeoutException ex)
            {
                DebugMsg(string.Format("ProcessJavascriptPage :: TimeoutException: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("ProcessJavascriptPage :: WebException: {0}", ex.Message));
                DebugMsg(string.Format("ProcessJavascriptPage :: WebException: {0}", ex.Status));
                DebugMsg(string.Format("ProcessJavascriptPage :: WebException: {0}", ( int )ex.Status));

                ResponseErrorCondition = ex.Status.ToString();
            }

            if (res != null)
            {
                // Guarantees the response is released even if processing throws.
                using (res)
                {
                    string RawData = "";

                    this.ProcessResponseHttpHeaders(req, res);

                    /** ---------------------------------------------------------------- **/

                    if (IsAuthenticating)
                    {
                        this.VerifyOrPurgeCredential();
                    }

                    /** Get Response Body ---------------------------------------------- **/

                    try
                    {
                        DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

                        Encoding encUseEncoding = Encoding.UTF8;

                        // Prefer the already-determined encoding; otherwise sniff it.
                        if (this.GetCharacterEncoding() != null)
                        {
                            encUseEncoding = this.GetCharacterEncoding();
                        }
                        else
                        {
                            encUseEncoding = this.JavascriptSniffCharset();
                        }

                        using (Stream ResponseStream = res.GetResponseStream())
                        {
                            using (StreamReader ResponseStreamReader = new StreamReader(ResponseStream, encUseEncoding))
                            {
                                RawData = ResponseStreamReader.ReadToEnd();
                            }
                        }

                        this.ContentLength = RawData.Length; // May need to find bytes length
                        this.SetChecksum(RawData);
                    }
                    catch (WebException ex)
                    {
                        DebugMsg(string.Format("WebException: {0}", ex.Message));

                        if (ex.Response != null)
                        {
                            this.SetStatusCode((( HttpWebResponse )ex.Response).StatusCode);
                        }
                        else
                        {
                            this.SetStatusCode(( HttpStatusCode )ex.Status);
                        }

                        RawData            = "";
                        this.ContentLength = 0;
                    }
                    catch (Exception ex)
                    {
                        DebugMsg(string.Format("Exception: {0}", ex.Message));
                        this.SetStatusCode(HttpStatusCode.BadRequest);
                        RawData            = "";
                        this.ContentLength = 0;
                    }

                    /** Custom Filters ------------------------------------------------- **/

                    if (!string.IsNullOrEmpty(RawData))
                    {
                        if (
                            MacroscopePreferencesManager.GetCustomFiltersEnable() &&
                            MacroscopePreferencesManager.GetCustomFiltersApplyToJavascripts())
                        {
                            MacroscopeCustomFilters CustomFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

                            if ((CustomFilter != null) && (CustomFilter.IsEnabled()))
                            {
                                this.ProcessGenericCustomFiltered(
                                    CustomFilter: CustomFilter,
                                    GenericText: RawData
                                    );
                            }
                        }
                    }

                    /** Data Extractors ------------------------------------------------ **/

                    if (!string.IsNullOrEmpty(RawData))
                    {
                        if (
                            MacroscopePreferencesManager.GetDataExtractorsEnable() &&
                            MacroscopePreferencesManager.GetDataExtractorsApplyToJavascripts())
                        {
                            this.ProcessGenericDataExtractors(GenericText: RawData);
                        }
                    }

                    /** Title ---------------------------------------------------------- **/

                    {
                        // The title is the text following the final "/" of the URL.
                        MatchCollection reMatches     = Regex.Matches(this.DocUrl, "/([^/]+)$");
                        string          DocumentTitle = null;
                        foreach (Match match in reMatches)
                        {
                            if (match.Groups[1].Value.Length > 0)
                            {
                                DocumentTitle = match.Groups[1].Value.ToString();
                                break;
                            }
                        }
                        if (DocumentTitle != null)
                        {
                            this.SetTitle(DocumentTitle, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                            DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
                        }
                        else
                        {
                            DebugMsg(string.Format("TITLE: {0}", "MISSING"));
                        }
                    }
                }
            }

            if (ResponseErrorCondition != null)
            {
                this.ProcessErrorCondition(ResponseErrorCondition);
            }
        }
コード例 #5
0
        /**************************************************************************/

        /// <summary>
        /// Issues a HEAD request for <c>this.DocUrl</c>, records the response headers,
        /// and sets the document title from the text after the last "/" of the URL.
        /// </summary>
        /// <remarks>
        /// A <see cref="UriFormatException"/> or <see cref="WebException"/> raised while
        /// issuing the request is logged and stored in <c>this.ErrorCondition</c>. The
        /// response is held in a <c>using</c> block so that it is released even when
        /// header or title processing throws.
        /// </remarks>
        private void ProcessImagePage()
        {
            HttpWebRequest  req = null;
            HttpWebResponse res = null;
            string          ResponseErrorCondition = null;
            Boolean         IsAuthenticating       = false;

            try
            {
                req           = WebRequest.CreateHttp(this.DocUrl);
                req.Method    = "HEAD";
                req.Timeout   = this.Timeout;
                req.KeepAlive = false;
                req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                this.PrepareRequestHttpHeaders(req: req);

                IsAuthenticating = this.AuthenticateRequest(req);

                MacroscopePreferencesManager.EnableHttpProxy(req);

                res = ( HttpWebResponse )req.GetResponse();
            }
            catch (UriFormatException ex)
            {
                DebugMsg(string.Format("ProcessImagePage :: UriFormatException: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("ProcessImagePage :: WebException: {0}", ex.Message));
                DebugMsg(string.Format("ProcessImagePage :: WebException: {0}", ex.Status));
                DebugMsg(string.Format("ProcessImagePage :: WebException: {0}", ( int )ex.Status));

                ResponseErrorCondition = ex.Status.ToString();
            }

            if (res != null)
            {
                // Guarantees the response is released even if processing throws.
                using (res)
                {
                    this.ProcessResponseHttpHeaders(req, res);

                    if (IsAuthenticating)
                    {
                        this.VerifyOrPurgeCredential();
                    }

                    { // Title: the text following the final "/" of the URL.
                        MatchCollection reMatches     = Regex.Matches(this.DocUrl, "/([^/]+)$");
                        string          DocumentTitle = null;

                        foreach (Match match in reMatches)
                        {
                            if (match.Groups[1].Value.Length > 0)
                            {
                                DocumentTitle = match.Groups[1].Value.ToString();
                                break;
                            }
                        }

                        if (DocumentTitle != null)
                        {
                            this.SetTitle(DocumentTitle, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                            DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
                        }
                        else
                        {
                            DebugMsg(string.Format("TITLE: {0}", "MISSING"));
                        }
                    }
                }
            }

            if (ResponseErrorCondition != null)
            {
                this.ErrorCondition = ResponseErrorCondition;
            }
        }
コード例 #6
0
        /**************************************************************************/

        /// <summary>
        /// Fetches <c>this.DocUrl</c> with a GET request and processes it as a plain
        /// text document: response headers, body (decoded as UTF-8), content length
        /// and checksum, optional custom filters and data extractors, and
        /// robots.txt / text-sitemap handling.
        /// </summary>
        /// <remarks>
        /// A failure while issuing the request is logged and then passed to
        /// <c>ProcessErrorCondition</c>. A failure while reading the body is logged,
        /// recorded through <c>SetStatusCode</c>, and leaves the body empty. The
        /// response is held in a <c>using</c> block so that it is released even when
        /// one of the processing steps throws.
        /// </remarks>
        private void ProcessTextPage()
        {
            List <string>   TextDoc = new List <string> ();
            HttpWebRequest  req     = null;
            HttpWebResponse res     = null;
            string          ResponseErrorCondition = null;
            Boolean         IsAuthenticating       = false;

            try
            {
                req           = WebRequest.CreateHttp(this.DocUrl);
                req.Method    = "GET";
                req.Timeout   = this.Timeout;
                req.KeepAlive = false;
                req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                this.PrepareRequestHttpHeaders(req: req);

                IsAuthenticating = this.AuthenticateRequest(req);

                MacroscopePreferencesManager.EnableHttpProxy(req);

                res = ( HttpWebResponse )req.GetResponse();
            }
            catch (UriFormatException ex)
            {
                DebugMsg(string.Format("ProcessTextPage :: UriFormatException: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("ProcessTextPage :: WebException: {0}", ex.Message));
                DebugMsg(string.Format("ProcessTextPage :: WebException: {0}", this.DocUrl));
                DebugMsg(string.Format("ProcessTextPage :: WebExceptionStatus: {0}", ex.Status));
                ResponseErrorCondition = ex.Status.ToString();
            }

            if (res != null)
            {
                // Guarantees the response is released even if processing throws.
                using (res)
                {
                    string RawData = "";

                    this.ProcessResponseHttpHeaders(req, res);

                    if (IsAuthenticating)
                    {
                        this.VerifyOrPurgeCredential();
                    }

                    // Get Response Body
                    try
                    {
                        DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

                        using (Stream ResponseStream = res.GetResponseStream())
                        {
                            // Assume UTF-8
                            using (StreamReader ResponseStreamReader = new StreamReader(ResponseStream, Encoding.UTF8))
                            {
                                RawData = ResponseStreamReader.ReadToEnd();
                            }
                        }

                        this.ContentLength = RawData.Length; // May need to find bytes length

                        this.SetWasDownloaded(true);

                        this.SetChecksum(RawData);
                    }
                    catch (WebException ex)
                    {
                        DebugMsg(string.Format("WebException: {0}", ex.Message));

                        if (ex.Response != null)
                        {
                            this.SetStatusCode((( HttpWebResponse )ex.Response).StatusCode);
                        }
                        else
                        {
                            this.SetStatusCode(( HttpStatusCode )ex.Status);
                        }

                        RawData            = "";
                        this.ContentLength = 0;
                    }
                    catch (Exception ex)
                    {
                        DebugMsg(string.Format("Exception: {0}", ex.Message));
                        this.SetStatusCode(HttpStatusCode.BadRequest);
                        RawData            = "";
                        this.ContentLength = 0;
                    }

                    /** Split Into Lines ----------------------------------------------- **/

                    if (!string.IsNullOrEmpty(RawData))
                    {
                        // Runs of CR/LF are treated as a single line separator.
                        string [] Lines = Regex.Split(RawData, @"[\r\n]+");
                        TextDoc = Lines.ToList();

                        DebugMsg(string.Format("TextDoc: {0}", TextDoc.Count));
                    }
                    else
                    {
                        DebugMsg(string.Format("RawData: {0}", "EMPTY"));
                    }

                    /** Custom Filters ------------------------------------------------- **/

                    if (!string.IsNullOrEmpty(RawData))
                    {
                        if (
                            MacroscopePreferencesManager.GetCustomFiltersEnable() &&
                            MacroscopePreferencesManager.GetCustomFiltersApplyToText())
                        {
                            MacroscopeCustomFilters CustomFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

                            if ((CustomFilter != null) && (CustomFilter.IsEnabled()))
                            {
                                this.ProcessGenericCustomFiltered(
                                    CustomFilter: CustomFilter,
                                    GenericText: RawData
                                    );
                            }
                        }
                    }

                    /** Data Extractors ------------------------------------------------ **/

                    if (!string.IsNullOrEmpty(RawData))
                    {
                        if (
                            MacroscopePreferencesManager.GetDataExtractorsEnable() &&
                            MacroscopePreferencesManager.GetDataExtractorsApplyToText())
                        {
                            this.ProcessGenericDataExtractors(GenericText: RawData);
                        }
                    }

                    /** Process Text Document ------------------------------------------ **/

                    if ((TextDoc != null) && (TextDoc.Count > 0))
                    {
                        if (this.GetPath().EndsWith("robots.txt", StringComparison.InvariantCultureIgnoreCase))
                        {
                            this.ProcessRobotsTextOutlinks(TextDoc: TextDoc);
                        }

                        if (this.DetectSitemapTextDocument(TextDoc: TextDoc))
                        {
                            DebugMsg(string.Format("ProcessTextPage: {0} :: {1}", "SITEMAP DETECTED", this.GetUrl()));
                            this.SetIsSitemapText();
                            this.ProcessSitemapTextOutlinks(TextDoc);
                        }
                    }
                }
            }

            if (ResponseErrorCondition != null)
            {
                this.ProcessErrorCondition(ResponseErrorCondition);
            }
        }
コード例 #7
0
        /**************************************************************************/

        /// <summary>
        /// Fetches <c>this.DocUrl</c> with a GET request and processes it as a PDF
        /// document: response headers, a placeholder locale and canonical URL, the raw
        /// body bytes (handed to <c>MacroscopePdfTools</c>), and the title reported by
        /// that tool.
        /// </summary>
        /// <remarks>
        /// A failure while issuing the request is logged and then passed to
        /// <c>ProcessErrorCondition</c>. A failure while reading or parsing the body is
        /// logged, recorded through <c>SetStatusCode</c>, and skips title extraction.
        /// The response is held in a <c>using</c> block so that it is released even
        /// when one of the processing steps throws.
        /// </remarks>
        private void ProcessPdfPage()
        {
            HttpWebRequest  req = null;
            HttpWebResponse res = null;
            string          ResponseErrorCondition = null;
            Boolean         Authenticating         = false;

            try
            {
                req           = WebRequest.CreateHttp(this.DocUrl);
                req.Method    = "GET";
                req.Timeout   = this.Timeout;
                req.KeepAlive = false;
                req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                this.PrepareRequestHttpHeaders(req: req);

                Authenticating = this.AuthenticateRequest(req);

                MacroscopePreferencesManager.EnableHttpProxy(req);

                res = ( HttpWebResponse )req.GetResponse();
            }
            catch (UriFormatException ex)
            {
                DebugMsg(string.Format("ProcessPdfPage :: UriFormatException: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("ProcessPdfPage :: WebException: {0}", ex.Message));
                DebugMsg(string.Format("ProcessPdfPage :: WebException: {0}", ex.Status));
                DebugMsg(string.Format("ProcessPdfPage :: WebException: {0}", ( int )ex.Status));

                ResponseErrorCondition = ex.Status.ToString();
            }

            if (res != null)
            {
                // Guarantees the response is released even if processing throws.
                using (res)
                {
                    MacroscopePdfTools pdfTools = null;

                    this.ProcessResponseHttpHeaders(req, res);

                    if (Authenticating)
                    {
                        this.VerifyOrPurgeCredential();
                    }

                    { // Probe Locale
                        // TODO: implement locale probing; "x-default" is a placeholder.
                        this.Locale = "x-default";
                        this.SetHreflang(HrefLangLocale: this.Locale, Url: this.DocUrl);
                    }

                    { // Canonical
                        this.Canonical = this.DocUrl;
                        DebugMsg(string.Format("CANONICAL: {0}", this.Canonical));
                    }

                    { // Get Response Body
                        try
                        {
                            byte [] RawData;

                            using (Stream ResponseStream = res.GetResponseStream())
                            {
                                using (MemoryStream RawDataBuffer = new MemoryStream())
                                {
                                    // Buffered copy instead of a byte-at-a-time read loop.
                                    ResponseStream.CopyTo(RawDataBuffer);
                                    RawData = RawDataBuffer.ToArray();
                                }
                            }

                            this.ContentLength = RawData.Length;

                            pdfTools = new MacroscopePdfTools(RawData);

                            if (pdfTools.GetHasError())
                            {
                                this.AddRemark(Observation: pdfTools.GetErrorMessage());
                            }

                            this.SetWasDownloaded(true);
                        }
                        catch (WebException ex)
                        {
                            DebugMsg(string.Format("WebException: {0}", ex.Message));

                            if (ex.Response != null)
                            {
                                this.SetStatusCode((( HttpWebResponse )ex.Response).StatusCode);
                            }
                            else
                            {
                                this.SetStatusCode(( HttpStatusCode )ex.Status);
                            }

                            pdfTools           = null;
                            this.ContentLength = 0;
                        }
                        catch (Exception ex)
                        {
                            DebugMsg(string.Format("Exception: {0}", ex.Message));
                            this.SetStatusCode(HttpStatusCode.BadRequest);
                            pdfTools           = null;
                            this.ContentLength = 0;
                        }
                    }

                    /** Title ---------------------------------------------------------- **/

                    {
                        // pdfTools is null when the body could not be read or parsed.
                        if (pdfTools != null)
                        {
                            string DocumentTitle = pdfTools.GetTitle();
                            if (DocumentTitle != null)
                            {
                                this.SetTitle(DocumentTitle, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                                DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
                            }
                            else
                            {
                                DebugMsg(string.Format("TITLE: {0}", "MISSING"));
                            }
                        }
                    }
                }
            }

            if (ResponseErrorCondition != null)
            {
                this.ProcessErrorCondition(ResponseErrorCondition);
            }
        }
コード例 #8
0
        /** Fetch Robots Text *****************************************************/

        /// <summary>
        /// Downloads the robots.txt document at <paramref name="RobotsUri"/> and
        /// returns its text.
        /// </summary>
        /// <remarks>
        /// Nothing is fetched when <c>MacroscopeDnsTools.CheckValidHostname</c> rejects
        /// the URI. When the request itself fails, the URI is added to
        /// <c>this.BadRobots</c> (under a lock on that collection). All exceptions are
        /// logged through <c>DebugMsg</c> and are not rethrown.
        /// </remarks>
        /// <param name="RobotsUri">Absolute URI of the robots.txt file.</param>
        /// <returns>
        /// The response body, or an empty string when the hostname check fails, the
        /// request fails, or the body is empty or unreadable.
        /// </returns>
        private string FetchRobotTextFile(Uri RobotsUri)
        {
            if (!MacroscopeDnsTools.CheckValidHostname(Url: RobotsUri.ToString()))
            {
                DebugMsg(string.Format("FetchRobotTextFile :: CheckValidHostname: {0}", "NOT OK"));
                return("");
            }

            HttpWebResponse response    = null;
            Boolean         gotResponse = false;

            try
            {
                HttpWebRequest request = WebRequest.CreateHttp(RobotsUri);

                request.Method                 = "GET";
                request.Timeout                = MacroscopePreferencesManager.GetRequestTimeout() * 1000;
                request.KeepAlive              = false;
                request.UserAgent              = this.UserAgent();
                request.Host                   = RobotsUri.Host;
                request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                MacroscopePreferencesManager.EnableHttpProxy(request);

                response    = ( HttpWebResponse )request.GetResponse();
                gotResponse = true;
            }
            catch (UriFormatException ex)
            {
                DebugMsg(string.Format("UriFormatException: {0}", ex.Message));
                DebugMsg(string.Format("Exception: {0}", RobotsUri.ToString()));
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("WebException: {0}", ex.Message));
                DebugMsg(string.Format("WebException: {0}", RobotsUri.ToString()));
                DebugMsg(string.Format("WebExceptionStatus: {0}", ex.Status));
            }
            catch (NotSupportedException ex)
            {
                DebugMsg(string.Format("NotSupportedException: {0}", ex.Message));
                DebugMsg(string.Format("NotSupportedException: {0}", RobotsUri.ToString()));
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("Exception: {0}", ex.Message));
                DebugMsg(string.Format("Exception: {0}", RobotsUri.ToString()));
            }

            // No usable response: record the URI in BadRobots and give up.
            if (!gotResponse || (response == null))
            {
                lock (this.BadRobots)
                {
                    if (!this.BadRobots.ContainsKey(RobotsUri))
                    {
                        this.BadRobots.Add(RobotsUri, true);
                    }
                }

                return("");
            }

            string body = "";

            try
            {
                Stream       bodyStream = response.GetResponseStream();
                StreamReader bodyReader = new StreamReader(bodyStream);

                body = bodyReader.ReadToEnd();
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("FetchRobotTextFile: WebException: {0}", ex.Message));
                body = "";
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("FetchRobotTextFile: Exception: {0}", ex.Message));
                body = "";
            }

            response.Close();

            response.Dispose();

            return(string.IsNullOrEmpty(body) ? "" : body);
        }