Esempio n. 1
0
        /// <summary>
        /// Requests every URL in a fixed list through MacroscopeHttpTwoClient and
        /// asserts that each response has status 200 and a non-empty body.
        /// </summary>
        /// <remarks>
        /// NOTE(review): presumably issues a live HTTPS request to the listed
        /// host, so the test needs network access - confirm.
        /// </remarks>
        public async Task TestHttpTwoClientGet()
        {
            MacroscopeHttpTwoClient HttpTwoClient = new MacroscopeHttpTwoClient();
            List <Uri> TargetUris = new List <Uri>
            {
                new Uri("https://nazuke.github.io/robots.txt")
            };

            foreach (Uri TargetUri in TargetUris)
            {
                this.DebugMsg(string.Format("Url: {0}", TargetUri));

                MacroscopeHttpTwoClientResponse Fetched = await HttpTwoClient.Get(
                    TargetUri,
                    this.PreProcessHeadRequestHeadersCallback,
                    this.PostProcessRequestHttpHeadersCallback
                    );

                HttpResponseMessage HttpResponse = Fetched.GetResponse();

                this.DebugMsg(string.Format("Response.Version: {0}", HttpResponse.Version));

                // The status must be exactly 200.
                int StatusNumber = (int)HttpResponse.StatusCode;
                Assert.AreEqual(200, StatusNumber);

                // The body must contain at least one character.
                string Body = Fetched.GetContentAsString();
                Assert.Greater(Body.Length, 0);
            }
        }
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Fetches this document's URI as XML, records content length and
        /// checksum, applies the optional custom filters and data extractors to
        /// the raw text and, when the parsed document is detected as a sitemap,
        /// sets the document type and processes the sitemap outlinks.
        /// </summary>
        /// <remarks>
        /// Exceptions thrown by the fetch are caught here: the status code is set
        /// to BadRequest, a remark is added, and the exception message is passed
        /// to ProcessErrorCondition at the end of the method.
        /// </remarks>
        private async Task _ProcessXmlPage()
        {
            XmlDocument                     XmlDoc   = null;
            MacroscopeHttpTwoClient         Client   = this.DocCollection.GetJobMaster().GetHttpClient();
            MacroscopeHttpTwoClientResponse Response = null;
            string ResponseErrorCondition            = null;

            try
            {
                Response = await Client.Get(
                    this.GetUri(),
                    this.ConfigureXmlPageRequestHeadersCallback,
                    this.PostProcessRequestHttpHeadersCallback
                    );
            }
            catch (MacroscopeDocumentException ex)
            {
                this.DebugMsg(string.Format("_ProcessXmlPage :: MacroscopeDocumentException: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
                this.SetStatusCode(HttpStatusCode.BadRequest);
                this.AddRemark("_ProcessXmlPage", ex.Message);
            }
            catch (Exception ex)
            {
                this.DebugMsg(string.Format("_ProcessXmlPage :: Exception: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
                this.SetStatusCode(HttpStatusCode.BadRequest);
                this.AddRemark("_ProcessXmlPage", ex.Message);
            }

            if (Response != null)
            {
                string RawData = "";

                this.ProcessResponseHttpHeaders(Response: Response);

                // Get Response Body
                try
                {
                    DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

                    RawData = Response.GetContentAsString();

                    // Length is the character count of the decoded string, not the byte count.
                    this.SetContentLength(Length: RawData.Length); // May need to find bytes length

                    this.SetWasDownloaded(true);

                    this.SetChecksum(RawData);
                }
                catch (Exception ex)
                {
                    // Any failure while reading the body is recorded as an empty, bad response.
                    DebugMsg(string.Format("Exception: {0}", ex.Message));
                    this.SetStatusCode(HttpStatusCode.BadRequest);
                    RawData = "";
                    this.SetContentLength(Length: 0);
                }

                if (!string.IsNullOrEmpty(RawData))
                {
                    XmlDoc = new XmlDocument();

                    // The XML was fetched from a remote server, so it is untrusted.
                    // Disable resolution of external DTDs/entities so a hostile document
                    // cannot make the parser read local files or fetch other URLs (XXE).
                    // Setting this explicitly keeps older framework targets safe as well.
                    XmlDoc.XmlResolver = null;

                    try
                    {
                        XmlDoc.LoadXml(RawData);
                    }
                    catch (XmlException ex)
                    {
                        DebugMsg(string.Format("XmlException: {0}", ex.Message));
                    }
                    catch (Exception ex)
                    {
                        DebugMsg(string.Format("Exception: {0}", ex.Message));
                    }

                    // After a failed load XmlDoc is still non-null; the DocumentElement
                    // check further down is what guards the sitemap processing.
                    DebugMsg(string.Format("XmlDoc: {0}", XmlDoc));
                }
                else
                {
                    DebugMsg(string.Format("RawData: {0}", "EMPTY"));
                }

                /** Custom Filters ------------------------------------------------- **/

                if (!string.IsNullOrEmpty(RawData))
                {
                    if (
                        MacroscopePreferencesManager.GetCustomFiltersEnable() &&
                        MacroscopePreferencesManager.GetCustomFiltersApplyToXml())
                    {
                        MacroscopeCustomFilters CustomFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

                        if ((CustomFilter != null) && (CustomFilter.IsEnabled()))
                        {
                            this.ProcessGenericCustomFiltered(
                                CustomFilter: CustomFilter,
                                GenericText: RawData
                                );
                        }
                    }
                }

                /** Data Extractors ------------------------------------------------ **/

                if (!string.IsNullOrEmpty(RawData))
                {
                    if (
                        MacroscopePreferencesManager.GetDataExtractorsEnable() &&
                        MacroscopePreferencesManager.GetDataExtractorsApplyToXml())
                    {
                        this.ProcessGenericDataExtractors(GenericText: RawData);
                    }
                }

                /** Sitemap Detection ---------------------------------------------- **/

                if ((XmlDoc != null) && (XmlDoc.DocumentElement != null))
                {
                    if (this.DetectSitemapXmlDocument(XmlDoc))
                    {
                        DebugMsg(string.Format("ProcessXmlPage: {0} :: {1}", "SITEMAP DETECTED", this.GetUrl()));
                        this.SetDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML);
                        this.ProcessSitemapXmlOutlinks(XmlDoc);
                    }
                }

                /** Document Text -------------------------------------------------- **/

                if (RawData != null)
                {
                    this.SetDocumentText(Text: RawData);
                }

                /** ---------------------------------------------------------------- **/
            }

            // Report a fetch failure only after the rest of the method has run.
            if (ResponseErrorCondition != null)
            {
                this.ProcessErrorCondition(ResponseErrorCondition);
            }
        }
        /** Fetch Robots Text *****************************************************/

        /// <summary>
        /// Downloads the robots file at <paramref name="RobotsUri"/> and returns
        /// its text.
        /// </summary>
        /// <param name="RobotsUri">Address of the robots file to download.</param>
        /// <returns>
        /// The downloaded text, or an empty string when the hostname check fails,
        /// the request yields no response, or the body is empty or unreadable.
        /// </returns>
        /// <remarks>
        /// When no response is obtained, the URI is added to BadRobots (under a
        /// lock on that collection) if it is not already present.
        /// </remarks>
        private async Task <string> FetchRobotTextFile(Uri RobotsUri)
        {
            if (!MacroscopeDnsTools.CheckValidHostname(Url: RobotsUri.ToString()))
            {
                DebugMsg(string.Format("FetchRobotTextFile :: CheckValidHostname: {0}", "NOT OK"));
                return("");
            }

            MacroscopeHttpTwoClientResponse Fetched = null;

            try
            {
                Fetched = await this.Client.Get(
                    RobotsUri,
                    this.ConfigureHeadRequestHeadersCallback,
                    this.PostProcessRequestHttpHeadersCallback
                    );
            }
            catch (Exception ex)
            {
                // Both exception kinds log the message, then the URI, under their own label.
                string LogTemplate = (ex is MacroscopeDocumentException)
                    ? "MacroscopeDocumentException: {0}"
                    : "Exception: {0}";

                DebugMsg(string.Format(LogTemplate, ex.Message));
                DebugMsg(string.Format(LogTemplate, RobotsUri.ToString()));
            }

            if (Fetched == null)
            {
                // No usable response: record this URI as a bad robots location.
                lock (this.BadRobots)
                {
                    if (!this.BadRobots.ContainsKey(RobotsUri))
                    {
                        this.BadRobots.Add(RobotsUri, true);
                    }
                }

                return("");
            }

            string Body = "";

            try
            {
                Body = Fetched.GetContentAsString();
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("FetchRobotTextFile: Exception: {0}", ex.Message));
                Body = "";
            }

            // A null body is normalised to the empty string.
            return(string.IsNullOrEmpty(Body) ? "" : Body);
        }
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Fetches this document's URI as a JavaScript resource, records content
        /// length and checksum, applies the optional custom filters and data
        /// extractors, and sets the title from the last path segment of DocUrl.
        /// </summary>
        /// <remarks>
        /// Exceptions thrown by the fetch are caught: the status is set to
        /// BadRequest, a remark is added, and the message is passed to
        /// ProcessErrorCondition.
        /// </remarks>
        private async Task _ProcessJavascriptPage()
        {
            MacroscopeHttpTwoClient         HttpTwoClient = this.DocCollection.GetJobMaster().GetHttpClient();
            MacroscopeHttpTwoClientResponse Fetched       = null;
            string FetchFailure                           = null;

            try
            {
                Fetched = await HttpTwoClient.Get(
                    this.GetUri(),
                    this.ConfigureJavascriptPageRequestHeadersCallback,
                    this.PostProcessRequestHttpHeadersCallback
                    );
            }
            catch (Exception ex)
            {
                // Only the log label differs between the two exception kinds.
                string LogTemplate = (ex is MacroscopeDocumentException)
                    ? "_ProcessJavascriptPage :: MacroscopeDocumentException: {0}"
                    : "_ProcessJavascriptPage :: Exception: {0}";

                this.DebugMsg(string.Format(LogTemplate, ex.Message));
                FetchFailure = ex.Message;
                this.SetStatusCode(HttpStatusCode.BadRequest);
                this.AddRemark("_ProcessJavascriptPage", ex.Message);
            }

            // A failed fetch never leaves a response behind, so report it and stop.
            if (FetchFailure != null)
            {
                this.ProcessErrorCondition(FetchFailure);
                return;
            }

            if (Fetched == null)
            {
                return;
            }

            this.ProcessResponseHttpHeaders(Response: Fetched);

            /** Get Response Body -------------------------------------------------- **/

            string Body = "";

            try
            {
                DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

                // Charset selection via GetCharacterEncoding() / JavascriptSniffCharset()
                // used to be sketched here but is disabled; the response object decodes
                // the body itself.
                // NOTE(review): the XML, text and CSS handlers in this file also call
                // SetWasDownloaded(true) at this point; this one does not - confirm
                // whether that is intentional.
                Body = Fetched.GetContentAsString();
                this.SetContentLength(Length: Body.Length); // May need to find bytes length
                this.SetChecksum(Body);
            }
            catch (Exception ex)
            {
                // NOTE(review): status here is Ambiguous, whereas the XML and text
                // handlers use BadRequest for the same failure - confirm.
                DebugMsg(string.Format("Exception: {0}", ex.Message));
                this.SetStatusCode(HttpStatusCode.Ambiguous);
                Body = "";
                this.SetContentLength(Length: 0);
            }

            if (!string.IsNullOrEmpty(Body))
            {
                /** Custom Filters ------------------------------------------------- **/

                bool FiltersWanted =
                    MacroscopePreferencesManager.GetCustomFiltersEnable() &&
                    MacroscopePreferencesManager.GetCustomFiltersApplyToJavascripts();

                if (FiltersWanted)
                {
                    MacroscopeCustomFilters Filters = this.DocCollection.GetJobMaster().GetCustomFilter();

                    if ((Filters != null) && (Filters.IsEnabled()))
                    {
                        this.ProcessGenericCustomFiltered(
                            CustomFilter: Filters,
                            GenericText: Body
                            );
                    }
                }

                /** Data Extractors ------------------------------------------------ **/

                bool ExtractorsWanted =
                    MacroscopePreferencesManager.GetDataExtractorsEnable() &&
                    MacroscopePreferencesManager.GetDataExtractorsApplyToJavascripts();

                if (ExtractorsWanted)
                {
                    this.ProcessGenericDataExtractors(GenericText: Body);
                }
            }

            /** Title -------------------------------------------------------------- **/

            // The title is whatever follows the final '/' in the document URL.
            Match  LastSegment = Regex.Match(this.DocUrl, "/([^/]+)$");
            string SegmentText = LastSegment.Success ? LastSegment.Groups[1].Value : null;

            if (!string.IsNullOrEmpty(SegmentText))
            {
                this.SetTitle(SegmentText, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
            }
            else
            {
                DebugMsg(string.Format("TITLE: {0}", "MISSING"));
            }
        }
Esempio n. 5
0
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Fetches this document's URI as plain text, records content length and
        /// checksum, applies the optional custom filters and data extractors, and
        /// processes the resulting lines either as a robots.txt file or, for
        /// internal documents, as pure-text outlinks.
        /// </summary>
        /// <remarks>
        /// Exceptions thrown by the fetch are caught: the status is set to
        /// BadRequest, a remark is added, and the message is passed to
        /// ProcessErrorCondition.
        /// </remarks>
        private async Task _ProcessTextPage()
        {
            MacroscopeHttpTwoClient         HttpTwoClient = this.DocCollection.GetJobMaster().GetHttpClient();
            MacroscopeHttpTwoClientResponse Fetched       = null;
            string FetchFailure                           = null;

            try
            {
                Fetched = await HttpTwoClient.Get(
                    this.GetUri(),
                    this.ConfigureTextPageRequestHeadersCallback,
                    this.PostProcessRequestHttpHeadersCallback
                    );
            }
            catch (Exception ex)
            {
                // Only the log label differs between the two exception kinds.
                string LogTemplate = (ex is MacroscopeDocumentException)
                    ? "_ProcessTextPage :: MacroscopeDocumentException: {0}"
                    : "_ProcessTextPage :: Exception: {0}";

                this.DebugMsg(string.Format(LogTemplate, ex.Message));
                FetchFailure = ex.Message;
                this.SetStatusCode(HttpStatusCode.BadRequest);
                this.AddRemark("_ProcessTextPage", ex.Message);
            }

            // A failed fetch never leaves a response behind, so report it and stop.
            if (FetchFailure != null)
            {
                this.ProcessErrorCondition(FetchFailure);
                return;
            }

            if (Fetched == null)
            {
                return;
            }

            this.ProcessResponseHttpHeaders(Response: Fetched);

            /** Get Response Body -------------------------------------------------- **/

            string Body = "";

            try
            {
                DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

                Body = Fetched.GetContentAsString();

                this.SetContentLength(Length: Body.Length); // May need to find bytes length
                this.SetWasDownloaded(true);
                this.SetChecksum(Body);
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("Exception: {0}", ex.Message));
                this.SetStatusCode(HttpStatusCode.BadRequest);
                Body = "";
                this.SetContentLength(Length: 0);
            }

            /** Split Into Lines --------------------------------------------------- **/

            List <string> TextLines = new List <string>();

            if (string.IsNullOrEmpty(Body))
            {
                DebugMsg(string.Format("RawData: {0}", "EMPTY"));
            }
            else
            {
                // Runs of CR/LF act as a single separator, so blank lines are dropped.
                TextLines = new List <string>(Regex.Split(Body, @"[\r\n]+"));

                DebugMsg(string.Format("TextDoc: {0}", TextLines.Count));

                /** Custom Filters ------------------------------------------------- **/

                bool FiltersWanted =
                    MacroscopePreferencesManager.GetCustomFiltersEnable() &&
                    MacroscopePreferencesManager.GetCustomFiltersApplyToText();

                if (FiltersWanted)
                {
                    MacroscopeCustomFilters Filters = this.DocCollection.GetJobMaster().GetCustomFilter();

                    if ((Filters != null) && (Filters.IsEnabled()))
                    {
                        this.ProcessGenericCustomFiltered(
                            CustomFilter: Filters,
                            GenericText: Body
                            );
                    }
                }

                /** Data Extractors ------------------------------------------------ **/

                bool ExtractorsWanted =
                    MacroscopePreferencesManager.GetDataExtractorsEnable() &&
                    MacroscopePreferencesManager.GetDataExtractorsApplyToText();

                if (ExtractorsWanted)
                {
                    this.ProcessGenericDataExtractors(GenericText: Body);
                }
            }

            /** Process Text Document ---------------------------------------------- **/

            if (TextLines.Count == 0)
            {
                this.SetDocumentText(Text: "");
                return;
            }

            this.SetDocumentText(Text: string.Join(Environment.NewLine, TextLines));

            bool IsRobotsFile = this.GetPath().EndsWith("robots.txt", StringComparison.InvariantCultureIgnoreCase);

            if (!IsRobotsFile)
            {
                // Other text documents are only scanned for outlinks when internal.
                if (this.GetIsInternal())
                {
                    this.ProcessPureTextOutlinks(TextDoc: TextLines, LinkType: MacroscopeConstants.InOutLinkType.PURETEXT);
                }

                return;
            }

            // Read the recorded length before the outlink processing below runs.
            long?RecordedLength = this.GetContentLength();
            long?RobotsSizeCap  = 1024 * 512;

            this.ProcessRobotsTextOutlinks(TextDoc: TextLines);

            if (this.DetectSitemapTextDocument(TextDoc: TextLines))
            {
                DebugMsg(string.Format("ProcessTextPage: {0} :: {1}", "SITEMAP DETECTED", this.GetUrl()));
                this.SetDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPTEXT);
                this.ProcessSitemapTextOutlinks(TextDoc: TextLines);
            }

            if (RecordedLength > RobotsSizeCap)
            {
                this.AddRemark("ROBOTS_TOO_BIG", "Robots.txt is larger than 512KB");
            }
        }
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Fetches this document's URI as a stylesheet, records the content
        /// length, parses the CSS to process its outlinks, applies the optional
        /// custom filters and data extractors, and sets the title from the last
        /// path segment of DocUrl.
        /// </summary>
        /// <remarks>
        /// Exceptions thrown by the fetch are caught: the status is set to
        /// BadRequest, a remark is added, and the message is passed to
        /// ProcessErrorCondition at the end of the method.
        /// </remarks>
        private async Task _ProcessCssPage()
        {
            MacroscopeHttpTwoClient         Client   = this.DocCollection.GetJobMaster().GetHttpClient();
            MacroscopeHttpTwoClientResponse Response = null;
            string ResponseErrorCondition            = null;

            DebugMsg(string.Format("ProcessCssPage: {0}", ""));

            try
            {
                Response = await Client.Get(
                    this.GetUri(),
                    this.ConfigureCssPageRequestHeadersCallback,
                    this.PostProcessRequestHttpHeadersCallback
                    );
            }
            catch (MacroscopeDocumentException ex)
            {
                this.DebugMsg(string.Format("_ProcessCssPage :: MacroscopeDocumentException: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
                this.SetStatusCode(HttpStatusCode.BadRequest);
                this.AddRemark("_ProcessCssPage", ex.Message);
            }
            catch (Exception ex)
            {
                this.DebugMsg(string.Format("_ProcessCssPage :: Exception: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
                this.SetStatusCode(HttpStatusCode.BadRequest);
                this.AddRemark("_ProcessCssPage", ex.Message);
            }

            if (Response != null)
            {
                string RawData = "";

                this.ProcessResponseHttpHeaders(Response: Response);

                /** Get Response Body ---------------------------------------------- **/

                try
                {
                    DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

                    RawData = Response.GetContentAsString();

                    // Length is the character count of the decoded string, not the byte count.
                    this.SetContentLength(Length: RawData.Length); // May need to find bytes length

                    this.SetWasDownloaded(true);
                }
                catch (Exception ex)
                {
                    DebugMsg(string.Format("Exception: {0}", ex.Message));
                    this.SetStatusCode(HttpStatusCode.Ambiguous);

                    // Discard any body read before the failure so that the recorded state
                    // (length 0) and the processing below agree, as in the XML,
                    // JavaScript and text handlers.
                    RawData = "";
                    this.SetContentLength(Length: 0);
                }

                if (!string.IsNullOrEmpty(RawData))
                {
                    try
                    {
                        StylesheetParser CssParser     = new StylesheetParser();
                        Stylesheet       CssStylesheet = CssParser.Parse(RawData);
                        this.ProcessCssOutlinks(CssStylesheet: CssStylesheet);
                    }
                    catch (Exception ex)
                    {
                        // NOTE(review): the "ProcessHtmlAttributeCssLinks" label does not match
                        // this method's name and looks copy-pasted; left unchanged because
                        // the remark key may be consumed elsewhere - confirm before renaming.
                        this.DebugMsg(string.Format("ProcessHtmlAttributeCssLinks: {0}", ex.Message));
                        this.AddRemark("ProcessHtmlAttributeCssLinks", ex.Message);
                    }
                }
                else
                {
                    DebugMsg(string.Format("ProcessCssPage: ERROR: {0}", this.GetUrl()));
                }

                /** Custom Filters ------------------------------------------------- **/

                if (!string.IsNullOrEmpty(RawData))
                {
                    if (
                        MacroscopePreferencesManager.GetCustomFiltersEnable() &&
                        MacroscopePreferencesManager.GetCustomFiltersApplyToCss())
                    {
                        MacroscopeCustomFilters CustomFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

                        if ((CustomFilter != null) && (CustomFilter.IsEnabled()))
                        {
                            this.ProcessGenericCustomFiltered(
                                CustomFilter: CustomFilter,
                                GenericText: RawData
                                );
                        }
                    }
                }

                /** Data Extractors ------------------------------------------------ **/

                if (!string.IsNullOrEmpty(RawData))
                {
                    if (
                        MacroscopePreferencesManager.GetDataExtractorsEnable() &&
                        MacroscopePreferencesManager.GetDataExtractorsApplyToCss())
                    {
                        this.ProcessGenericDataExtractors(GenericText: RawData);
                    }
                }

                /** Title ---------------------------------------------------------- **/

                {
                    // The title is whatever follows the final '/' in the document URL.
                    MatchCollection reMatches     = Regex.Matches(this.DocUrl, "/([^/]+)$");
                    string          DocumentTitle = null;
                    foreach (Match match in reMatches)
                    {
                        if (match.Groups[1].Value.Length > 0)
                        {
                            DocumentTitle = match.Groups[1].Value.ToString();
                            break;
                        }
                    }
                    if (DocumentTitle != null)
                    {
                        this.SetTitle(DocumentTitle, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                        DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
                    }
                    else
                    {
                        DebugMsg(string.Format("TITLE: {0}", "MISSING"));
                    }
                }
            }

            // Report a fetch failure only after the rest of the method has run.
            if (ResponseErrorCondition != null)
            {
                this.ProcessErrorCondition(ResponseErrorCondition);
            }
        }