private static async Task<BlobContent> GetVideoIndexerBreakdownAsync(CrawlResponse reqBody, VideoIndexerSettings settings, TraceWriter log, CancellationToken cancellationToken)
{
    // Looks up an existing Video Indexer breakdown for the request's external id.
    // Returns null when the search call fails or no breakdown exists yet;
    // otherwise fetches the breakdown itself through the caching RequestAsync path.
    using (var operation = Services.TelemetryClient.StartOperation<DependencyTelemetry>("Crawl.VideoIndexer.GetBreakdown"))
    {
        var cogService = GetCognitiveService(settings);
        var httpClient = await cogService.GetHttpClientAsync();

        // Search for a breakdown keyed by our external id.
        var encodedId = HttpUtility.UrlEncode(reqBody.Id);
        var searchResponse = await httpClient.GetAsync($"/Breakdowns/Api/Partner/Breakdowns/Search?externalId={encodedId}", cancellationToken);
        if (!searchResponse.IsSuccessStatusCode)
        {
            return null;
        }

        var searchJson = await searchResponse.Content.ReadAsStringAsync();
        var searchResult = JsonConvert.DeserializeObject<VideoIndexerSearchResult>(searchJson);
        var breakdownId = searchResult.Results?.FirstOrDefault()?.Id;
        if (breakdownId == null)
        {
            // No breakdown has been created for this id yet.
            return null;
        }

        // Fetch the breakdown content (cached/refreshable via RequestAsync).
        return await cogService.RequestAsync(
            log,
            reqBody.Site,
            reqBody.Id,
            $"/Breakdowns/Api/Partner/Breakdowns/{breakdownId}",
            reqBody.ForceRefresh,
            isPost: false,
            cancellationToken: cancellationToken);
    }
}
private static async Task IndexVideo(CrawlResponse reqBody, VideoIndexerSettings settings)
{
    // Enqueues the request's video with the Video Indexer breakdown service.
    // No-ops when there is no request or the video URL is missing/not absolute.
    if (reqBody == null || string.IsNullOrEmpty(reqBody.Video) || !Uri.TryCreate(reqBody.Video, UriKind.Absolute, out Uri unused))
    {
        return;
    }

    using (var operation = Services.TelemetryClient.StartOperation<DependencyTelemetry>("Crawl.VideoIndexer.Enqueue"))
    {
        operation.Telemetry.Properties.Add("url", reqBody.Video);
        operation.Telemetry.Properties.Add("id", reqBody.Id);

        // https://videobreakdown.azure-api.net/Breakdowns/Api/Partner/Breakdowns[?name][&privacy][&videoUrl][&language][&externalId][&metadata][&description][&partition][&callbackUrl][&indexingPreset][&streamingPreset]
        var url = HttpUtility.UrlEncode(reqBody.Video);
        var id = HttpUtility.UrlEncode(reqBody.Id);
        var query = "Breakdowns/Api/Partner/Breakdowns" +
                    $"?externalId={id}" +
                    $"&videoUrl={url}" +
                    "&privacy=private&searchable=true";

        // BUGFIX: name and description were previously appended without URL
        // encoding, corrupting the query string whenever they contained reserved
        // characters ('&', '=', spaces, non-ASCII, ...). The fallback id is
        // already encoded above.
        string name = string.IsNullOrEmpty(reqBody.Title) ? id : HttpUtility.UrlEncode(reqBody.Title);
        query += "&name=" + name;
        if (!string.IsNullOrEmpty(reqBody.Description))
        {
            query += "&description=" + HttpUtility.UrlEncode(reqBody.Description);
        }

        if (reqBody.Categories != null && reqBody.Categories.Count > 0)
        {
            query += "&metadata=" + HttpUtility.UrlEncode(string.Join(" ", reqBody.Categories));
        }

        var localCogService = GetCognitiveService(settings);
        var client = await localCogService.GetHttpClientAsync();

        // The service expects a POST with an (empty) multipart form body.
        var httpResponse = await client.PostAsync(client.BaseAddress + query, new MultipartFormDataContent());
        operation.Telemetry.Success = httpResponse.IsSuccessStatusCode;
        operation.Telemetry.ResultCode = httpResponse.StatusCode.ToString();
    }
}
public async Task<HttpResponseMessage> InvokeAsync(HttpRequestMessage req, TraceWriter log, Func<CrawlResponse, object> requestBodyFunc, Action<CrawlResponse, BlobContent> responseAction, CancellationToken cancellationToken)
{
    // Deserializes the incoming crawl payload, POSTs the derived request body to
    // this service's endpoint (with blob caching via PostAsync), invokes
    // responseAction on a successful result, and returns the blob content as the
    // HTTP response. The whole round trip is tracked as "Crawl.<containerName>".
    log.Info("Crawl." + this.containerName);
    await this.InitializeAsync();

    string reqBodyStr = null;
    CrawlResponse reqBody = null;
    BlobContent blobContent = null;
    try
    {
        using (var operation = Services.TelemetryClient.StartOperation<DependencyTelemetry>("Crawl." + this.containerName))
        {
            reqBodyStr = await req.Content.ReadAsStringAsync();
            reqBody = JsonConvert.DeserializeObject<CrawlResponse>(reqBodyStr);
            operation.Telemetry.Target = this.endpoint;
            operation.Telemetry.Properties.Add("AppId", reqBody.Site);
            operation.Telemetry.Properties.Add("ActionId", reqBody.Id);
            blobContent = await this.PostAsync(
                log,
                reqBody.Site,
                reqBody.Id,
                requestBodyFunc(reqBody),
                reqBody.ForceRefresh,
                cancellationToken);
            if (blobContent != null)
            {
                operation.Telemetry.Properties.Add("Expires", blobContent.Expires.ToString(CultureInfo.InvariantCulture));
                if (blobContent.Value != null)
                {
                    responseAction(reqBody, blobContent);
                    operation.Telemetry.ResultCode = "OK";
                }
            }

            return req.CreateResponse(blobContent);
        }
    }
    catch (Exception ex)
    {
        Services.TrackException(ex, req, log, reqBodyStr, reqBody, blobContent);
        // BUGFIX: "throw ex;" reset the stack trace; bare rethrow preserves it.
        throw;
    }
}
private void ParsePage(string title, string url, PageElement pageElement = null)
{
    // Downloads the given page (via Gecko or plain HTTP, per the selected radio
    // button), runs the article auto-analyzer on it, and fills the form fields
    // with the extracted properties. Returns silently when analysis fails.
    if (pageElement == null)
    {
        pageElement = new PageElement { Title = title, Url = url };
    }

    var xpath = new ItemPageXPaths();
    List<SubItemElement> subList;
    DateTime startTime = DateTime.Now;
    PageElement result;
    if (GeckoDownRd.Checked)
    {
        CrawlResponse resp = GeckoRequestProcessor.DoRequest(BuildFakeRequest(url), BuildFakeSiteEntity(), null, null, null, true, 1000);
        string content = resp.Content;
        result = PageAutoAnalyzer.AnalyzeContent(content, pageElement, DeterminedMode(), new IdentityContentElement(), ref xpath, out subList, 86400, ExcludeTxt.Text);
    }
    else if (HttpdownRd.Checked)
    {
        string content = WebRequestProcessor.DownloadHTTPString(url, 30);
        result = PageAutoAnalyzer.AnalyzeContent(content, pageElement, DeterminedMode(), new IdentityContentElement(), ref xpath, out subList, 86400, ExcludeTxt.Text);
    }
    else
    {
        // CONSISTENCY: use NotSupportedException (as ParseListBtn_Click does)
        // instead of the base Exception type; message text unchanged.
        throw new NotSupportedException("不支持该方式分析正文");
    }

    TimeSpan usedTime = DateTime.Now - startTime;
    if (result == null)
    {
        return;
    }

    // Publish extracted fields into the UI.
    PageUrlTxt.Text = HtmlUtility.ExpandRelativePath(url, result.Url);
    TitleTxt.Text = result.Title;
    ContentTxt.Text = result.Content;
    ViewTxt.Text = result.View.ToString();
    ReplyTxt.Text = result.Reply.ToString();
    PubdateTxt.Text = result.Pubdate == null ? "" : result.Pubdate.ToString();
    AuthorTxt.Text = result.Author;
    MediaTxt.Text = result.MediaName;
    ElementXPathTxt.Text = result.ElementXPath;
    ElementBlockTxt.Text = result.ElementBlock;
    NextpageXPathTxt.Text = result.NextPageXPath;
}
private void ParseListBtn_Click(object sender, EventArgs e)
{
    // Downloads the list page with the selected downloader, auto-analyzes it
    // into article entries, resolves each entry's URL to an absolute one, and
    // binds the entries to the grid.
    string listUrl = InputUrlTxt.Text;
    RecogniseMode recogniseMode = DeterminedMode();
    var listXPaths = new ListPageXPaths();

    PageElement[] entries;
    if (GeckoDownRd.Checked)
    {
        CrawlResponse crawlResp = GeckoRequestProcessor.DoRequest(BuildFakeRequest(listUrl), BuildFakeSiteEntity(), null, null, null, true, 1000);
        string pageContent = crawlResp.Content;
        var analyzed = PageAutoAnalyzer.AnalyzeArticleList(crawlResp.Url, pageContent, recogniseMode, new IdentityPageElement(), ref listXPaths, 86400);
        entries = analyzed == null ? null : analyzed.List;
    }
    else if (HttpdownRd.Checked)
    {
        string pageContent = WebRequestProcessor.DownloadHTTPString(listUrl, 30);
        var analyzed = PageAutoAnalyzer.AnalyzeArticleList(listUrl, pageContent, recogniseMode, new IdentityPageElement(), ref listXPaths, 86400);
        entries = analyzed == null ? null : analyzed.List;
    }
    else
    {
        throw new NotSupportedException("不支持当前项抓取");
    }

    if (entries == null)
    {
        MessageBox.Show("解析不出数据");
        return;
    }

    foreach (var entry in entries)
    {
        entry.Url = HtmlUtility.ExpandRelativePath(listUrl, entry.Url);
    }

    ListGridView.DataSource = entries;
}
public static void TrackException(Exception ex, HttpRequestMessage req, TraceWriter log, string reqBodyStr, CrawlResponse reqBody, BlobContent blobContent = null)
{
    // Reports the exception to Application Insights with whatever request
    // context is available, then writes a matching error entry to the log.
    var properties = new Dictionary<string, string>();
    properties["Service"] = req.RequestUri.ToString();
    properties["Request"] = reqBodyStr;
    if (reqBody != null)
    {
        properties["AppId"] = reqBody.Site;
        properties["ActionId"] = reqBody.Id;
    }

    if (blobContent != null)
    {
        properties["Response"] = blobContent.Value;
    }

    TelemetryClient.TrackException(ex, properties);
    log.Error($"Request for AppId={reqBody?.Site} ActionId={reqBody?.Id} failed", ex);
}
public static async Task<CrawlResponse> Download(CrawlRequest reqBody)
{
    // Issues a HEAD request (rotating through known user agents) to discover the
    // target's content type, then dispatches to the matching downloader.
    // Returns null for an unparseable URL or an unrecognized content type;
    // returns a minimal CrawlResponse (Video/Image set) for media types.
    // Throws UnauthorizedAccessException when every user agent fails.
    if (!Uri.TryCreate(reqBody.Url, UriKind.Absolute, out Uri uri))
    {
        return null;
    }

    foreach (var userAgent in UserAgents)
    {
        var headRequest = (HttpWebRequest)WebRequest.Create(uri);
        headRequest.Method = "HEAD";
        headRequest.UserAgent = userAgent;
        try
        {
            // Make sure we only crawl content types we understand.
            using (var response = (HttpWebResponse)await headRequest.GetResponseAsync())
            {
                // BUGFIX: the original called StartsWith on the header value even
                // after the null/whitespace branch, risking an NRE; guard with
                // string.Empty and use an else-if chain. MIME type comparison is
                // case-insensitive per the media-type spec.
                var contentType = response.GetResponseHeader("Content-Type") ?? string.Empty;
                CrawlResponse result = null;
                if (string.IsNullOrWhiteSpace(contentType) || contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
                {
                    result = await DownloadHtml(uri, userAgent, reqBody);
                }
                else if (contentType.StartsWith("application/json", StringComparison.OrdinalIgnoreCase))
                {
                    result = await DownloadJson(uri, userAgent, reqBody);
                }
                else if (contentType.StartsWith("video/", StringComparison.OrdinalIgnoreCase) || contentType.StartsWith("audio/", StringComparison.OrdinalIgnoreCase))
                {
                    result = new CrawlResponse { Video = reqBody.Url };
                }
                else if (contentType.StartsWith("image/", StringComparison.OrdinalIgnoreCase))
                {
                    result = new CrawlResponse { Image = reqBody.Url };
                }

                return result;
            }
        }
        catch (WebException we)
        {
            HttpWebResponse httpResponse = we.Response as HttpWebResponse;
            if (we.Status == WebExceptionStatus.ServerProtocolViolation)
            {
                // Get a little more telemetry about what is going on here, though
                // most cases don't have a Response object.
                IDictionary<string, string> traceData = new Dictionary<string, string>()
                {
                    { "HasResponse", (we.Response != null).ToString() }
                };
                if (we.Response != null)
                {
                    traceData["Response.SupportsHeaders"] = we.Response.SupportsHeaders.ToString();
                    if (we.Response.SupportsHeaders)
                    {
                        for (int i = 0; i < we.Response.Headers.Count; i++)
                        {
                            string headerName = we.Response.Headers.GetKey(i);
                            string headerValue = we.Response.Headers.Get(i);
                            traceData[$"Response.Headers.{headerName}"] = headerValue;
                        }
                    }

                    if (httpResponse != null)
                    {
                        traceData["HttpResponse.StatusCode"] = httpResponse.StatusCode.ToString();
                    }
                }

                Services.TelemetryClient.TrackTrace($"Download target ({uri}) ServerProtocolViolation", SeverityLevel.Error, traceData);

                // Ignore known cases where crawl fails due to error on the
                // crawl-target side - these should not cause a hard failure on
                // our end.
                continue;
            }

            if (httpResponse != null)
            {
                // Ignore known cases where crawl fails due to error on the
                // crawl-target side - these should not cause a hard failure on
                // our end.
                if (httpResponse.StatusCode == HttpStatusCode.Forbidden ||
                    httpResponse.StatusCode == HttpStatusCode.NotFound ||
                    httpResponse.StatusCode == HttpStatusCode.ServiceUnavailable)
                {
                    continue;
                }
            }

            throw;
        }
    }

    throw new UnauthorizedAccessException("Unable to access HTTP endpoint");
}
public static async Task<HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log)
{
    // HTTP entry point for "Crawl.HTML": deserializes a CrawlRequest, downloads
    // and parses the target, and always returns a valid CrawlResponse JSON body
    // (possibly empty) so downstream workflows can continue.
    CrawlRequest crawlRequest = null;
    string reqBodyStr = null;
    try
    {
        using (var operation = Services.TelemetryClient.StartOperation<DependencyTelemetry>("Crawl.HTML"))
        {
            reqBodyStr = await req.Content.ReadAsStringAsync();
            // BUGFIX: the request was previously deserialized into a separate
            // local, leaving crawlRequest forever null - so the exception
            // handler below always lost the Url/AppId/ActionId context.
            crawlRequest = JsonConvert.DeserializeObject<CrawlRequest>(reqBodyStr);
            operation.Telemetry.Properties.Add("AppId", crawlRequest.Site);
            operation.Telemetry.Properties.Add("ActionId", crawlRequest.Id);
            operation.Telemetry.Properties.Add("Url", crawlRequest.Url);
            log.Info($"Crawl AppId={crawlRequest.Site} Id={crawlRequest.Id} Url={crawlRequest.Url}");

            var crawlResponse = await Download(crawlRequest);

            // always return a valid object so that downstream workflows can continue
            if (crawlResponse == null)
            {
                crawlResponse = new CrawlResponse();
            }

            crawlResponse.Url = crawlRequest.Url;
            crawlResponse.Site = crawlRequest.Site;
            crawlResponse.Id = crawlRequest.Id;
            var json = JsonConvert.SerializeObject(crawlResponse, new JsonSerializerSettings
            {
                Formatting = Formatting.None,
                StringEscapeHandling = StringEscapeHandling.EscapeNonAscii
            });
            return new HttpResponseMessage(HttpStatusCode.OK)
            {
                Content = new StringContent(
                    json,
                    new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
                    "application/json")
            };
        }
    }
    catch (Exception ex)
    {
        var props = new Dictionary<string, string> { { "Service", req.RequestUri.ToString() } };
        if (crawlRequest == null)
        {
            // Deserialization itself failed; record the raw payload instead.
            props.Add("JSON", reqBodyStr);
        }
        else
        {
            props.Add("Url", crawlRequest.Url);
            props.Add("AppId", crawlRequest.Site);
            props.Add("ActionId", crawlRequest.Id);
        }

        Services.TelemetryClient.TrackException(ex, props);
        // BUGFIX: "throw ex;" reset the stack trace; bare rethrow preserves it.
        throw;
    }
}
public static CrawlResponse Parse(string html, Uri sourceUrl)
{
    // Extracts Open Graph / Twitter-card metadata and the longest <article>
    // body text from an HTML document into a CrawlResponse.
    var response = new CrawlResponse();
    var doc = new HtmlDocument();
    doc.LoadHtml(html);
    var head = doc.DocumentNode.SelectSingleNode("html/head");
    if (head == null)
    {
        return response;
    }

    // BUGFIX: several predicates used "name='...'" (an XPath child-element
    // comparison, always false for <meta>) where "@name='...'" (an attribute
    // comparison) was intended; fixed throughout.
    response.Title = FindMeta(head, "meta[@property='og:title' or @name='og:title' or @property='twitter:title' or @name='twitter:title']");
    if (string.IsNullOrEmpty(response.Title))
    {
        response.Title = FindValue(head, "title");
    }

    if (!string.IsNullOrEmpty(response.Title))
    {
        response.Title = WebUtility.HtmlDecode(response.Title.Trim());
    }

    response.Description = FindMeta(head, "meta[@property='og:description' or @name='og:description' or @property='twitter:description' or @name='twitter:description' or @name='description']");
    // BUGFIX: the original re-assigned response.Title here (a copy-paste of the
    // Title fallback), clobbering the already-computed title whenever the page
    // had no description. There is no further fallback source for Description,
    // so the block is simply removed.
    if (response.Description != null)
    {
        response.Description = WebUtility.HtmlDecode(response.Description.Trim());
    }

    response.Type = FindMeta(head, "meta[@property='og:type' or @name='og:type']");
    var categories = FindAll(head, "meta[@property='article:tag' or @name='article:tag']").ToList();
    if (categories.Count > 0)
    {
        response.Categories = categories;
    }

    // TODO: get the better resolution
    var img = FindMeta(head, "meta[@property='og:image' or @name='og:image' or @property='twitter:image' or @name='twitter:image']");
    if (img != null)
    {
        // Protocol-relative URL: inherit the scheme of the page we crawled.
        if (img.StartsWith("//"))
        {
            img = sourceUrl.Scheme + ":" + img;
        }

        // TODO: support relative URLs too
        response.Image = img;
    }

    // Build the article text: pick the longest <article> body on the page.
    var articleText = new StringBuilder();
    var articles = doc.DocumentNode.SelectNodes("//article");
    if (articles != null)
    {
        string text = null;
        foreach (var art in articles)
        {
            var newText = StripTags(art);
            if (text == null || text.Length < newText.Length)
            {
                text = newText;
            }
        }

        if (!string.IsNullOrEmpty(text))
        {
            articleText.AppendLine(text);
        }
    }

    response.Article = WebUtility.HtmlDecode(articleText.ToString());

    // <meta property="microsoft:ds_id" content="255308" data-react-helmet="true">
    var dsId = FindMeta(head, "meta[@property='microsoft:ds_id' or @name='microsoft:ds_id']");
    response.PassThroughDetails = WebUtility.HtmlDecode(dsId);
    return response;
}
private UserTweet CrawlTask(string entryUrl)
{
    // Crawls a Weibo user's profile page and their tweet stream (via the JSON
    // scroll endpoint), writing results incrementally to an Excel workbook under
    // D:/output/<name>/. Resumes a previous partial run when the workbook already
    // exists. NOTE(review): the empty catch around the initial request means
    // Response can still be null when dereferenced below - confirm callers
    // tolerate the resulting NullReferenceException.
    _currentUrl = entryUrl;
    var Site = SiteBusiness.GetBySiteID("Weibo");
    var Request = BuildRequest(entryUrl, RegexContent);
    Site.TimeoutSecs = 60;
    CrawlResponse Response = null;
    try { Response = GeckoRequestProcessor.DoRequest(Request, Site, null, null); AggrSum(); } catch { }
    if (Response.Status != Enums.CrawlResult.Succ) { Logger.Info("访问页面错误:Url = " + Response.Url); }
    var content = Response.Content;

    // First page: extract profile info; bail out with what we have on failure.
    UserTweet result = new UserTweet();
    result.Url = entryUrl;
    try { FillUserInfo(content, result); } catch { return(result); }

    // Paging markers used by the JSON scroll endpoint.
    var endId = DeterminedMid(content, MidType.EndId);
    var maxId = DeterminedMid(content, MidType.MaxId);
    //var name = Regex.Match(content,)
    int currentPage = 1;
    int maxPage = 50;
    string rootPath = @"D:/output/" + result.Name + "/";
    Workbook outputBook = new Workbook();
    Worksheet sheet = null;
    int currentLine = 4;   // first data row; rows 0-3 hold profile info and headers
    int pos = 1;           // scroll position within a page (3 loads per page): 0 = first, 1 = second, 2 = third
    bool isContinue = false;
    if (!Directory.Exists(rootPath)) { Directory.CreateDirectory(rootPath); }
    if (File.Exists(rootPath + result.Name + ".xls"))
    {
        // Resume: reopen the workbook, find the first empty data row, and
        // estimate the page to restart from (~45 tweets per page).
        outputBook.Open(rootPath + result.Name + ".xls");
        sheet = outputBook.Worksheets[result.Name];
        int endrow = currentLine;
        while (!string.IsNullOrEmpty(sheet.Cells[endrow, 0].StringValue)) { endrow++; }
        currentLine = endrow;
        currentPage = (int)(currentLine / 45d) + 1;
        isContinue = true;
    }
    else
    {
        sheet = outputBook.Worksheets.Add(result.Name);
    }

    // Save to excel - initialize the profile and tweet-table header rows.
    sheet.Cells[0, 0].PutValue("姓名");
    sheet.Cells[0, 1].PutValue("网址");
    sheet.Cells[0, 2].PutValue("粉丝数");
    sheet.Cells[0, 3].PutValue("关注数");
    sheet.Cells[0, 4].PutValue("微博数");
    sheet.Cells[1, 0].PutValue(result.Name);
    sheet.Cells[1, 1].PutValue(result.Url);
    sheet.Cells[1, 2].PutValue(result.Follower);
    sheet.Cells[1, 3].PutValue(result.Follow);
    sheet.Cells[1, 4].PutValue(result.TweetNum);
    sheet.Cells[3, 0].PutValue("微博内容");
    sheet.Cells[3, 1].PutValue("发布时间");
    sheet.Cells[3, 2].PutValue("转发数");
    sheet.Cells[3, 3].PutValue("评论数");
    sheet.Cells[3, 4].PutValue("原帖地址");
    sheet.Cells[3, 5].PutValue("来源");
    sheet.Cells[3, 6].PutValue("具体评论");
    if (isContinue)
    {
        // Resume path: re-fetch the current scroll chunk and align currentLine
        // with the first tweet URL already present in the sheet to avoid
        // writing duplicates.
        pos = 0;
        var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
        Request = BuildRequest(url);
        Response = GeckoRequestProcessor.DoRequest(Request, Site, null, null);
        AggrSum();
        // The endpoint wraps its JSON in a <pre> tag; strip it before parsing.
        JsonResponse tmpResult = JsonConvert.DeserializeObject<JsonResponse>(Response.Content.Trim("</pre>".ToArray()));
        Response.Content = HttpUtility.HtmlDecode(tmpResult.data);
        pos++;
        var firstTweet = FillUserTweet(result, Response.Content);
        var firstUrl = firstTweet.FirstOrDefault().Url;
        result.Tweets.Clear();
        int endrow = currentLine - 1;
        while (endrow > 3)
        {
            if (sheet.Cells[endrow, 4].StringValue == firstUrl) { currentLine = endrow; break; }
            endrow--;
        }
    }

    // Crawl with json: keep paging while the payload still matches tweet content.
    while (Regex.IsMatch(Response.Content.Trim(), RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase) && Response.Status == Enums.CrawlResult.Succ)
    {
        content = Response.Content.Trim();
        var currentTweet = FillUserTweet(result, content);
        foreach (Tweet tweet in currentTweet)
        {
            string fileName = tweet.Mid + ".xls";
            // Skip comments already saved by a previous (failed) run.
            if (NeedCrawlComment)
            {
                if (!File.Exists(rootPath + fileName))
                {
                    FillTweetComment(tweet, Site);
                    if (tweet.Comments.Count > 0) { SaveComment(rootPath, tweet, fileName); }
                }
            }
            sheet.Cells[currentLine, 0].PutValue(tweet.Content);
            sheet.Cells[currentLine, 1].PutValue(tweet.PubDate.ToString("yyyy-MM-dd HH:mm:ss"));
            sheet.Cells[currentLine, 2].PutValue(tweet.Forward);
            sheet.Cells[currentLine, 3].PutValue(tweet.Comment);
            sheet.Cells[currentLine, 4].PutValue(tweet.Url);
            sheet.Cells[currentLine, 5].PutValue(tweet.Source);
            // Link the per-tweet comment workbook, if one was saved.
            if (File.Exists(rootPath + fileName))
            {
                sheet.Cells[currentLine, 6].PutValue("点击查看");
                //string linkPath = result.Name + "/" + fileName;
                string linkPath = fileName;
                sheet.Hyperlinks.Add(currentLine, 6, 1, 1, linkPath);
            }
            // Save after every row so a crash loses at most one tweet.
            outputBook.Save(rootPath + result.Name + ".xls");
            StatusLbl.Text = string.Format("正在读取名人:{0}的第{1}条微博", result.Name, currentLine - 3);
            Application.DoEvents();
            currentLine++;
        }
        var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
        Request = BuildRequest(url);
        // Retry the request up to 5 times; stop early on the first success.
        for (int i = 0; i < 5; i++)
        {
            try { Response = GeckoRequestProcessor.DoRequest(Request, Site, null, null); AggrSum(); } catch { }
            if (Response.Status != Enums.CrawlResult.Succ) { Logger.Info("访问页面错误:Url = " + Response.Url); } else { break; }
        }
        // The payload may be a tweet chunk or a comment chunk; try both shapes.
        try
        {
            JsonResponse tmpResult = JsonConvert.DeserializeObject<JsonResponse>(Response.Content.Trim("</pre>".ToArray()));
            Response.Content = HttpUtility.HtmlDecode(tmpResult.data);
        }
        catch
        {
            try
            {
                CommentJsonResponse tmpResult = JsonConvert.DeserializeObject<CommentJsonResponse>(Response.Content.Trim("</pre>".ToArray()));
                Response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);
            }
            catch { }
        }
        // Advance the scroll position; a full page is 3 scroll chunks.
        pos = (pos + 1) % 3;
        if (pos == 0) { currentPage++; }
        maxId = result.Tweets.Last().Mid;
    }
    return(result);
}
private void FillTweetComment(Tweet tweet, SiteEntity site)
{
    // Crawls all comment pages for one tweet and appends parsed Comment entries
    // to tweet.Comments. NOTE(review): the while(true) loop has no normal exit
    // besides the page-number check; the outer catch-all also swallows any real
    // failure (parse errors, null Response) silently - confirm that best-effort
    // behavior is intended.
    if (tweet.Comment == 0) { return; }
    int currentPage = 1;
    string mid = tweet.Mid;
    try
    {
        while (true)
        {
            string url = string.Format(CommentUrlFormat, mid, currentPage);
            var request = BuildRequest(url);
            CrawlResponse response = null;
            // Retry the request up to 5 times; stop early on the first success.
            for (int i = 0; i < 5; i++)
            {
                try { response = GeckoRequestProcessor.DoRequest(request, site, null, null); AggrSum(); } catch {}
                if (response.Status != Enums.CrawlResult.Succ) { Logger.Info("访问页面错误:Url = " + response.Url); } else { break; }
            }
            // The endpoint wraps its JSON in a <pre> tag; strip it before parsing.
            CommentJsonResponse tmpResult = JsonConvert.DeserializeObject<CommentJsonResponse>(response.Content.Trim("</pre>".ToArray()));
            response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);
            var pageMatch = Regex.Match(response.Content, RegexCommentPage, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            // Stop when the pager no longer reports the page we asked for,
            // i.e. we have walked past the last comment page.
            if (currentPage != 1 && (!pageMatch.Success || pageMatch.Groups["CurrentPageNum"].Value != currentPage.ToString(CultureInfo.InvariantCulture))) { return; }
            // Fill Tweet: parse each comment block on this page into a Comment.
            var matches = Regex.Matches(response.Content, RegexComment, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            foreach (Match match in matches)
            {
                Comment comment = new Comment();
                comment.Author = match.Groups["Author"].Value;
                comment.AuthorUrl = RegexParser.AbsoluteUrl(match.Groups["AuthorUrl"].Value, tweet.Url, true);
                comment.Content = TextCleaner.FullClean(match.Groups["Content"].Value);
                comment.PubDate = DateTimeParser.Parser(match.Groups["PubDate"].Value) ?? DateTime.MinValue;
                tweet.Comments.Add(comment);
            }
            currentPage++;
        }
    }
    catch { }
}
public async Task <HttpResponseMessage> InvokeAsync(HttpRequestMessage req, TraceWriter log, Func <CrawlResponse, object> requestBodyFunc, Action <CrawlResponse, BlobContent> responseAction, bool isPost, CancellationToken cancellationToken)
{
    // Deserializes the incoming crawl payload, derives the service request body
    // via requestBodyFunc (an empty JSON response is returned when it yields
    // null), sends it via RequestAsync (GET or POST per isPost), and returns the
    // resulting blob content. Tracked as "Crawl.<containerName>".
    log.Info("Crawl." + this.containerName);
    await this.InitializeAsync();

    string reqBodyStr = null;
    CrawlResponse reqBody = null;
    BlobContent blobContent = null;
    try
    {
        using (var operation = Services.TelemetryClient.StartOperation<DependencyTelemetry>("Crawl." + this.containerName))
        {
            reqBodyStr = await req.Content.ReadAsStringAsync();
            reqBody = JsonConvert.DeserializeObject<CrawlResponse>(reqBodyStr);
            operation.Telemetry.Target = this.endpoint;
            operation.Telemetry.Properties.Add("AppId", reqBody.Site);
            operation.Telemetry.Properties.Add("ActionId", reqBody.Id);
            var serviceRequestBody = requestBodyFunc(reqBody);
            if (serviceRequestBody == null)
            {
                // Nothing to send to the service; answer with an empty JSON body.
                return new HttpResponseMessage(System.Net.HttpStatusCode.OK)
                {
                    Content = new StringContent(
                        string.Empty,
                        new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
                        "application/json")
                };
            }

            blobContent = await this.RequestAsync(
                log,
                reqBody.Site,
                reqBody.Id,
                serviceRequestBody,
                reqBody.ForceRefresh,
                isPost,
                cancellationToken);
            if (blobContent != null)
            {
                operation.Telemetry.Properties.Add("Expires", blobContent.Expires.ToString(CultureInfo.InvariantCulture));
                if (blobContent.Value != null)
                {
                    responseAction(reqBody, blobContent);
                    operation.Telemetry.ResultCode = "OK";
                }
            }

            return Services.CreateResponse(blobContent);
        }
    }
    catch (Exception ex)
    {
        Services.TrackException(ex, req, log, reqBodyStr, reqBody, blobContent);
        // BUGFIX: "throw ex;" reset the stack trace; bare rethrow preserves it.
        throw;
    }
}
public static async Task<HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log, CancellationToken cancellationToken)
{
    // Orchestrates Video Indexer crawling for one item: look up an existing
    // breakdown; if absent or not yet processed, enqueue indexing and tell the
    // caller to poll again in 5 minutes; otherwise featurize and return it.
    string reqBodyStr = null;
    CrawlResponse reqBody = null;
    try
    {
        using (var operation = Services.TelemetryClient.StartOperation<DependencyTelemetry>("Crawl.VideoIndexer"))
        {
            // TODO: if the id is not parsable, just ignore - make sure the others do too
            reqBodyStr = await req.Content.ReadAsStringAsync();
            reqBody = JsonConvert.DeserializeObject<CrawlResponse>(reqBodyStr);
            operation.Telemetry.Properties.Add("AppId", reqBody.Site);
            operation.Telemetry.Properties.Add("ActionId", reqBody.Id);
            if (string.IsNullOrEmpty(reqBody.Id))
            {
                return Services.CreateResponse(new BlobContent { Expires = DateTime.UtcNow + TimeSpan.FromMinutes(5) });
            }

            var settings = await GetVideoIndexerSettings(reqBody.Site);

            // Find an existing breakdown. GetVideoIndexerBreakdownAsync can, if the
            // underlying GetBreakdown call fails while SearchBreakdown succeeds,
            // return a BlobContent with an empty Value and a short "expires";
            // treat that the same as getting no VideoIndexer response at all.
            var breakdownContent = await GetVideoIndexerBreakdownAsync(reqBody, settings, log, cancellationToken);
            if (breakdownContent == null || string.IsNullOrWhiteSpace(breakdownContent.Value))
            {
                if (string.IsNullOrEmpty(reqBody.Video))
                {
                    // Try to resolve the video (and any missing metadata) through Ooyala.
                    var ooyalaVideo = Ooyala.GetOoyalaVideo(reqBody.Id, settings);
                    if (ooyalaVideo != null)
                    {
                        reqBody.Video = ooyalaVideo.Url;
                        if (string.IsNullOrEmpty(reqBody.Title)) { reqBody.Title = ooyalaVideo.Title; }
                        if (string.IsNullOrEmpty(reqBody.Description)) { reqBody.Description = ooyalaVideo.Description; }
                        if (reqBody.Categories == null || reqBody.Categories.Count == 0) { reqBody.Categories = ooyalaVideo.Keywords; }
                    }
                }

                // Enqueue breakdown indexing, then have the caller come back in 5min.
                await IndexVideo(reqBody, settings);
                return Services.CreateResponse(new BlobContent { Expires = DateTime.UtcNow + TimeSpan.FromMinutes(5) });
            }

            var result = JsonConvert.DeserializeObject<VideoBreakdownResult>(breakdownContent.Value);
            if (result.State != "Processed")
            {
                // Still indexing - make sure the caller comes back in 5min.
                return Services.CreateResponse(new BlobContent { Expires = DateTime.UtcNow + TimeSpan.FromMinutes(5) });
            }

            // Featurize the finished breakdown.
            breakdownContent.Output = VideoIndexerFeaturizer.FeaturizeVideoIndexerBreakdown(result);
            return Services.CreateResponse(breakdownContent);
        }
    }
    catch (Exception ex)
    {
        Services.TrackException(ex, req, log, reqBodyStr, reqBody);
        // BUGFIX: "throw ex;" reset the stack trace; bare rethrow preserves it.
        throw;
    }
}