Ejemplo n.º 1
0
        /// <summary>
        /// Issues an HTTP GET against <paramref name="uri"/>, reads the whole response
        /// body as text and hands it to <c>HtmlExtractor.Parse</c> together with a
        /// <see cref="Uri"/> built from <c>reqBody.Url</c>.
        /// </summary>
        /// <param name="uri">Address the GET request is sent to.</param>
        /// <param name="userAgent">Value assigned to the request's User-Agent.</param>
        /// <param name="reqBody">
        /// Crawl request; its <c>ETag</c> (when not null or empty) is sent as an
        /// If-None-Match header, and its <c>Url</c> is passed on to the parser.
        /// </param>
        /// <returns>The value returned by <c>HtmlExtractor.Parse</c>.</returns>
        public static async Task <CrawlResponse> DownloadHtml(Uri uri, string userAgent, CrawlRequest reqBody)
        {
            var webRequest = (HttpWebRequest)WebRequest.Create(uri);
            webRequest.Method    = "GET";
            webRequest.UserAgent = userAgent;

            // Only make the request conditional when the caller supplied an ETag.
            var hasETag = !string.IsNullOrEmpty(reqBody.ETag);
            if (hasETag)
            {
                webRequest.Headers.Add(HttpRequestHeader.IfNoneMatch, reqBody.ETag);
            }

            using (var webResponse = (HttpWebResponse)await webRequest.GetResponseAsync())
            {
                using (var body = webResponse.GetResponseStream())
                {
                    using (var textReader = new StreamReader(body))
                    {
                        // TODO: look for schema.org
                        var markup = await textReader.ReadToEndAsync();

                        // TODO: support microsoft:ds_id
                        var parsedFrom = new Uri(reqBody.Url);
                        return HtmlExtractor.Parse(markup, parsedFrom);
                    }
                }
            }
        }
Ejemplo n.º 2
0
        // <meta property="microsoft:ds_id" content="some-id">
        //private static Regex MetaMicrosoftDsIdRegex = new Regex(@"<meta[^>]+property\s*=\s*[""']microsoft:ds_id[""'][^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
        //private static Regex MetaContentRegex = new Regex(@"content\s*=\s*[""']([^""']+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        /// <summary>
        /// Handles a crawl request: deserializes the request body into a
        /// <c>CrawlRequest</c>, downloads the page at its <c>Url</c> with an HTTP GET,
        /// runs the body through <c>HtmlExtractor.Parse</c> and returns the result
        /// serialized as JSON.
        /// </summary>
        /// <param name="req">Incoming HTTP request whose content is the JSON crawl request.</param>
        /// <param name="log">Trace writer that receives an informational line per crawl.</param>
        /// <returns>
        /// A 200 OK response with the parse result as <c>application/json</c>
        /// (non-ASCII characters escaped, no UTF-8 byte order mark).
        /// </returns>
        /// <remarks>
        /// Any exception is reported through <c>Services.TelemetryClient.TrackException</c>
        /// and then rethrown to the caller with its original stack trace.
        /// </remarks>
        public static async Task <HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log)
        {
            CrawlRequest crawlRequest = null;
            string       reqBodyStr   = null;

            try
            {
                using (var operation = Services.TelemetryClient.StartOperation <DependencyTelemetry>("Crawl.HTML"))
                {
                    reqBodyStr = await req.Content.ReadAsStringAsync();

                    // Deserialize into the variable declared outside the try block so the
                    // catch handler can report Url/AppId/ActionId once parsing succeeded.
                    crawlRequest = JsonConvert.DeserializeObject <CrawlRequest>(reqBodyStr);

                    operation.Telemetry.Properties.Add("AppId", crawlRequest.Site);
                    operation.Telemetry.Properties.Add("ActionId", crawlRequest.Id);
                    operation.Telemetry.Properties.Add("Url", crawlRequest.Url);

                    log.Info($"Crawl AppId={crawlRequest.Site} Id={crawlRequest.Id} Url={crawlRequest.Url}");

                    var request = (HttpWebRequest)WebRequest.Create(crawlRequest.Url);

                    // Make the request conditional only when the caller supplied an ETag.
                    if (!string.IsNullOrEmpty(crawlRequest.ETag))
                    {
                        request.Headers.Add(HttpRequestHeader.IfNoneMatch, crawlRequest.ETag);
                    }

                    request.Method    = "GET";
                    request.KeepAlive = true;
                    request.UserAgent = "DSbot/1.0 (+https://ds.microsoft.com/bot.htm)";

                    using (var response = (HttpWebResponse)await request.GetResponseAsync())
                    {
                        operation.Telemetry.ResultCode = response.StatusCode.ToString();

                        using (var stream = response.GetResponseStream())
                        using (var reader = new StreamReader(stream))
                        {
                            // TODO: allow direct JSON
                            // TODO: look for schema.org
                            var html = await reader.ReadToEndAsync();

                            // TODO: support microsoft:ds_id
                            var result = HtmlExtractor.Parse(html, new Uri(crawlRequest.Url));

                            // Echo the request's identifying fields back on the result.
                            result.Url  = crawlRequest.Url;
                            result.Site = crawlRequest.Site;
                            result.Id   = crawlRequest.Id;

                            var serializerSettings = new JsonSerializerSettings
                            {
                                Formatting           = Formatting.None,
                                StringEscapeHandling = StringEscapeHandling.EscapeNonAscii
                            };

                            return new HttpResponseMessage(HttpStatusCode.OK)
                            {
                                Content = new StringContent(
                                    JsonConvert.SerializeObject(result, serializerSettings),
                                    new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
                                    "application/json")
                            };
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                var props = new Dictionary <string, string>
                {
                    { "Service", req.RequestUri.ToString() }
                };

                if (crawlRequest == null)
                {
                    // The body was never parsed into a request; attach the raw payload instead.
                    props.Add("JSON", reqBodyStr);
                }
                else
                {
                    props.Add("Url", crawlRequest.Url);
                    props.Add("AppId", crawlRequest.Site);
                    props.Add("ActionId", crawlRequest.Id);
                }

                Services.TelemetryClient.TrackException(ex, props);

                // Rethrow with "throw;" so the original stack trace is preserved.
                throw;
            }
        }