Пример #1
0
        /// <summary>
        /// Searches Video Indexer for a breakdown whose external id equals <c>reqBody.Id</c> and,
        /// when one is found, requests that breakdown through the cognitive service.
        /// </summary>
        /// <param name="reqBody">Crawl response supplying the Id, Site and ForceRefresh values.</param>
        /// <param name="settings">Settings passed to <c>GetCognitiveService</c>.</param>
        /// <param name="log">Trace writer forwarded to the breakdown request.</param>
        /// <param name="cancellationToken">Token forwarded to the search call and the breakdown request.</param>
        /// <returns>
        /// The result of the breakdown request, or <c>null</c> when the search call is unsuccessful,
        /// the search body cannot be deserialized into a result, or no breakdown id is present.
        /// </returns>
        private static async Task<BlobContent> GetVideoIndexerBreakdownAsync(CrawlResponse reqBody, VideoIndexerSettings settings, TraceWriter log, CancellationToken cancellationToken)
        {
            using (var operation = Services.TelemetryClient.StartOperation<DependencyTelemetry>("Crawl.VideoIndexer.GetBreakdown"))
            {
                var localCogService = GetCognitiveService(settings);

                var client = await localCogService.GetHttpClientAsync();

                var id = HttpUtility.UrlEncode(reqBody.Id);

                // Dispose the search response once we are done with it so the underlying
                // connection/content buffers are released deterministically.
                using (var responseMessage = await client.GetAsync($"/Breakdowns/Api/Partner/Breakdowns/Search?externalId={id}", cancellationToken))
                {
                    if (!responseMessage.IsSuccessStatusCode)
                    {
                        return null;
                    }

                    var videoIndexerResponseStr = await responseMessage.Content.ReadAsStringAsync();

                    // DeserializeObject yields null for an empty or literal "null" body, so the
                    // result object itself has to be null-checked before reading Results.
                    var videoIndexerResponse = JsonConvert.DeserializeObject<VideoIndexerSearchResult>(videoIndexerResponseStr);
                    var breakdownId = videoIndexerResponse?.Results?.FirstOrDefault()?.Id;
                    if (breakdownId == null)
                    {
                        return null;
                    }

                    return await localCogService.RequestAsync(
                        log,
                        reqBody.Site,
                        reqBody.Id,
                        $"/Breakdowns/Api/Partner/Breakdowns/{breakdownId}",
                        reqBody.ForceRefresh,
                        isPost: false,
                        cancellationToken: cancellationToken);
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Submits the video referenced by <c>reqBody.Video</c> to Video Indexer by POSTing to the
        /// partner Breakdowns endpoint, and records the outcome on a dependency telemetry operation.
        /// </summary>
        /// <param name="reqBody">
        /// Crawl response describing the video. Nothing is sent when it is <c>null</c> or when
        /// its Video value is empty or not an absolute URI.
        /// </param>
        /// <param name="settings">Settings passed to <c>GetCognitiveService</c>.</param>
        private static async Task IndexVideo(CrawlResponse reqBody, VideoIndexerSettings settings)
        {
            if (reqBody == null || string.IsNullOrEmpty(reqBody.Video) || !Uri.TryCreate(reqBody.Video, UriKind.Absolute, out Uri unused))
            {
                return;
            }

            using (var operation = Services.TelemetryClient.StartOperation<DependencyTelemetry>("Crawl.VideoIndexer.Enqueue"))
            {
                operation.Telemetry.Properties.Add("url", reqBody.Video);
                operation.Telemetry.Properties.Add("id", reqBody.Id);

                // https://videobreakdown.azure-api.net/Breakdowns/Api/Partner/Breakdowns[?name][&privacy][&videoUrl][&language][&externalId][&metadata][&description][&partition][&callbackUrl][&indexingPreset][&streamingPreset]
                var url = HttpUtility.UrlEncode(reqBody.Video);
                var id = HttpUtility.UrlEncode(reqBody.Id);
                var query =
                    "Breakdowns/Api/Partner/Breakdowns" +
                    $"?externalId={id}" +
                    $"&videoUrl={url}" +
                    "&privacy=private&searchable=true";

                // Every free-text value must be URL-encoded; a raw '&', '#' or '=' in the title
                // or description would otherwise truncate or corrupt the query string.
                // The fallback name reuses the id, which is already encoded above.
                var name = string.IsNullOrEmpty(reqBody.Title)
                    ? id
                    : HttpUtility.UrlEncode(reqBody.Title);

                query += "&name=" + name;

                if (!string.IsNullOrEmpty(reqBody.Description))
                {
                    query += "&description=" + HttpUtility.UrlEncode(reqBody.Description);
                }

                if (reqBody.Categories != null && reqBody.Categories.Count > 0)
                {
                    query += "&metadata=" + HttpUtility.UrlEncode(string.Join(" ", reqBody.Categories));
                }

                var localCogService = GetCognitiveService(settings);

                var client = await localCogService.GetHttpClientAsync();

                // The endpoint takes all of its arguments from the query string; the multipart
                // body is intentionally empty. Both disposables are released after the call.
                using (var emptyBody = new MultipartFormDataContent())
                using (var httpResponse = await client.PostAsync(client.BaseAddress + query, emptyBody))
                {
                    operation.Telemetry.Success = httpResponse.IsSuccessStatusCode;
                    operation.Telemetry.ResultCode = httpResponse.StatusCode.ToString();
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Reads a <c>CrawlResponse</c> from the request body, posts the payload produced by
        /// <paramref name="requestBodyFunc"/> via <c>PostAsync</c>, and returns the resulting
        /// blob content as the HTTP response.
        /// </summary>
        /// <param name="req">Incoming HTTP request whose content is the JSON crawl response.</param>
        /// <param name="log">Trace writer used for informational and error logging.</param>
        /// <param name="requestBodyFunc">Builds the object to post from the parsed crawl response.</param>
        /// <param name="responseAction">
        /// Invoked with the crawl response and blob content when the blob content has a non-null Value.
        /// </param>
        /// <param name="cancellationToken">Token forwarded to <c>PostAsync</c>.</param>
        /// <returns>The response created from the blob content (which may be <c>null</c>).</returns>
        /// <remarks>
        /// Any exception is reported through <c>Services.TrackException</c> and then rethrown
        /// with its original stack trace.
        /// </remarks>
        public async Task<HttpResponseMessage> InvokeAsync(HttpRequestMessage req, TraceWriter log,
                                                           Func<CrawlResponse, object> requestBodyFunc,
                                                           Action<CrawlResponse, BlobContent> responseAction,
                                                           CancellationToken cancellationToken)
        {
            log.Info("Crawl." + this.containerName);

            await this.InitializeAsync();

            // Declared outside the try block so the catch handler can report whatever
            // had been gathered before the failure.
            string reqBodyStr = null;
            CrawlResponse reqBody = null;
            BlobContent blobContent = null;

            try
            {
                using (var operation = Services.TelemetryClient.StartOperation<DependencyTelemetry>("Crawl." + this.containerName))
                {
                    reqBodyStr = await req.Content.ReadAsStringAsync();

                    reqBody = JsonConvert.DeserializeObject<CrawlResponse>(reqBodyStr);

                    operation.Telemetry.Target = this.endpoint;
                    operation.Telemetry.Properties.Add("AppId", reqBody.Site);
                    operation.Telemetry.Properties.Add("ActionId", reqBody.Id);

                    blobContent = await this.PostAsync(
                        log,
                        reqBody.Site,
                        reqBody.Id,
                        requestBodyFunc(reqBody),
                        reqBody.ForceRefresh,
                        cancellationToken);

                    if (blobContent != null)
                    {
                        operation.Telemetry.Properties.Add("Expires", blobContent.Expires.ToString(CultureInfo.InvariantCulture));

                        if (blobContent.Value != null)
                        {
                            responseAction(reqBody, blobContent);

                            operation.Telemetry.ResultCode = "OK";
                        }
                    }

                    return req.CreateResponse(blobContent);
                }
            }
            catch (Exception ex)
            {
                Services.TrackException(ex, req, log, reqBodyStr, reqBody, blobContent);

                // "throw;" (not "throw ex;") keeps the original stack trace intact.
                throw;
            }
        }
Пример #4
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> with the download method selected in the UI,
        /// runs the automatic content analyzer on it and shows the extracted fields in the form.
        /// </summary>
        /// <param name="title">Title used to seed a new page element when none is supplied.</param>
        /// <param name="url">Address of the page to download and analyze.</param>
        /// <param name="pageElement">
        /// Existing element to analyze into; when <c>null</c> a new one is created from
        /// <paramref name="title"/> and <paramref name="url"/>.
        /// </param>
        /// <exception cref="NotSupportedException">Neither download option is checked.</exception>
        private void ParsePage(string title, string url, PageElement pageElement = null)
        {
            if (pageElement == null)
            {
                pageElement = new PageElement
                {
                    Title = title,
                    Url = url
                };
            }

            var xpath = new ItemPageXPaths();
            List<SubItemElement> subList;

            // Only the way the HTML is obtained differs between the two modes;
            // the analysis step below is shared.
            string content;
            if (GeckoDownRd.Checked)
            {
                CrawlResponse resp = GeckoRequestProcessor.DoRequest(BuildFakeRequest(url), BuildFakeSiteEntity(), null, null, null, true, 1000);
                content = resp.Content;
            }
            else if (HttpdownRd.Checked)
            {
                content = WebRequestProcessor.DownloadHTTPString(url, 30);
            }
            else
            {
                // NotSupportedException matches what ParseListBtn_Click throws for the same
                // condition and is still caught by any handler for Exception.
                throw new NotSupportedException("不支持该方式分析正文");
            }

            PageElement result = PageAutoAnalyzer.AnalyzeContent(content, pageElement,
                                                                 DeterminedMode(), new IdentityContentElement(), ref xpath,
                                                                 out subList, 86400, ExcludeTxt.Text);

            if (result == null)
            {
                return;
            }

            PageUrlTxt.Text = HtmlUtility.ExpandRelativePath(url, result.Url);
            TitleTxt.Text = result.Title;
            ContentTxt.Text = result.Content;
            ViewTxt.Text = result.View.ToString();
            ReplyTxt.Text = result.Reply.ToString();
            PubdateTxt.Text = result.Pubdate == null ? "" : result.Pubdate.ToString();
            AuthorTxt.Text = result.Author;
            MediaTxt.Text = result.MediaName;
            ElementXPathTxt.Text = result.ElementXPath;
            ElementBlockTxt.Text = result.ElementBlock;
            NextpageXPathTxt.Text = result.NextPageXPath;
        }
Пример #5
0
        /// <summary>
        /// Click handler: downloads the URL typed into the input box using the selected download
        /// method, extracts the article list from it and binds the list to the grid.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        /// <exception cref="NotSupportedException">Neither download option is checked.</exception>
        private void ParseListBtn_Click(object sender, EventArgs e)
        {
            string inputUrl = InputUrlTxt.Text;
            RecogniseMode recogniseMode = DeterminedMode();
            var listXPaths = new ListPageXPaths();

            // Step 1: obtain the HTML and the URL the analyzer should treat as the page address.
            string html;
            string analyzedUrl;
            if (GeckoDownRd.Checked)
            {
                CrawlResponse geckoResponse = GeckoRequestProcessor.DoRequest(BuildFakeRequest(inputUrl), BuildFakeSiteEntity(), null, null, null, true, 1000);
                html = geckoResponse.Content;
                analyzedUrl = geckoResponse.Url;
            }
            else
            {
                if (!HttpdownRd.Checked)
                {
                    throw new NotSupportedException("不支持当前项抓取");
                }

                html = WebRequestProcessor.DownloadHTTPString(inputUrl, 30);
                analyzedUrl = inputUrl;
            }

            // Step 2: run the list analyzer and unwrap its result.
            var analysis = PageAutoAnalyzer.AnalyzeArticleList(analyzedUrl, html, recogniseMode, new IdentityPageElement(), ref listXPaths, 86400);
            PageElement[] elements = analysis == null ? null : analysis.List;

            if (elements == null)
            {
                MessageBox.Show("解析不出数据");
                return;
            }

            // Step 3: make every link absolute relative to the URL the user entered.
            for (int index = 0; index < elements.Length; index++)
            {
                PageElement element = elements[index];
                element.Url = HtmlUtility.ExpandRelativePath(inputUrl, element.Url);
            }

            ListGridView.DataSource = elements;
        }
Пример #6
0
        /// <summary>
        /// Sends an exception to telemetry together with request context, and writes an
        /// error entry to the trace log.
        /// </summary>
        /// <param name="ex">The exception being reported.</param>
        /// <param name="req">Request whose URI is recorded under the "Service" property.</param>
        /// <param name="log">Trace writer that receives the error message.</param>
        /// <param name="reqBodyStr">Raw request body, recorded under "Request".</param>
        /// <param name="reqBody">Parsed request; when non-null its Site and Id are recorded.</param>
        /// <param name="blobContent">Optional response; when non-null its Value is recorded under "Response".</param>
        public static void TrackException(Exception ex, HttpRequestMessage req, TraceWriter log, string reqBodyStr, CrawlResponse reqBody, BlobContent blobContent = null)
        {
            var properties = new Dictionary<string, string>();
            properties.Add("Service", req.RequestUri.ToString());
            properties.Add("Request", reqBodyStr);

            // Identification of the crawled item is only available once the body was parsed.
            string appId = null;
            string actionId = null;
            if (reqBody != null)
            {
                appId = reqBody.Site;
                actionId = reqBody.Id;

                properties.Add("AppId", appId);
                properties.Add("ActionId", actionId);
            }

            if (blobContent != null)
            {
                properties.Add("Response", blobContent.Value);
            }

            TelemetryClient.TrackException(ex, properties);

            string message = $"Request for AppId={appId} ActionId={actionId} failed";
            log.Error(message, ex);
        }
Пример #7
0
        /// <summary>
        /// Probes <c>reqBody.Url</c> with a HEAD request, trying each entry of <c>UserAgents</c>
        /// in turn, and dispatches on the reported Content-Type.
        /// </summary>
        /// <param name="reqBody">Crawl request carrying the URL to download.</param>
        /// <returns>
        /// <c>null</c> when the URL is not an absolute URI or the content type is not handled;
        /// the result of <c>DownloadHtml</c> for a missing or <c>text/html</c> type; the result of
        /// <c>DownloadJson</c> for <c>application/json</c>; otherwise a response whose Video or
        /// Image is set to the URL for <c>video/*</c>, <c>audio/*</c> or <c>image/*</c> types.
        /// </returns>
        /// <exception cref="UnauthorizedAccessException">
        /// Every user agent failed with an ignorable error (protocol violation, 403, 404 or 503).
        /// </exception>
        /// <exception cref="WebException">The HEAD request failed with any other error.</exception>
        public static async Task<CrawlResponse> Download(CrawlRequest reqBody)
        {
            Uri uri;

            if (!Uri.TryCreate(reqBody.Url, UriKind.Absolute, out uri))
            {
                return null;
            }

            foreach (var userAgent in UserAgents)
            {
                var headRequest = (HttpWebRequest)WebRequest.Create(uri);
                headRequest.Method = "HEAD";
                headRequest.UserAgent = userAgent;

                try
                {
                    // make sure we only crawl content types we know how to handle
                    using (var response = (HttpWebResponse)await headRequest.GetResponseAsync())
                    {
                        var contentType = response.GetResponseHeader("Content-Type");

                        // Media types are case-insensitive and not linguistic text, so compare
                        // ordinally while ignoring case. The branches are mutually exclusive.
                        if (string.IsNullOrWhiteSpace(contentType) ||
                            contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
                        {
                            return await DownloadHtml(uri, userAgent, reqBody);
                        }

                        if (contentType.StartsWith("application/json", StringComparison.OrdinalIgnoreCase))
                        {
                            return await DownloadJson(uri, userAgent, reqBody);
                        }

                        if (contentType.StartsWith("video/", StringComparison.OrdinalIgnoreCase) ||
                            contentType.StartsWith("audio/", StringComparison.OrdinalIgnoreCase))
                        {
                            return new CrawlResponse
                            {
                                Video = reqBody.Url
                            };
                        }

                        if (contentType.StartsWith("image/", StringComparison.OrdinalIgnoreCase))
                        {
                            return new CrawlResponse
                            {
                                Image = reqBody.Url
                            };
                        }

                        // Unhandled content type.
                        return null;
                    }
                }
                catch (WebException we)
                {
                    HttpWebResponse httpResponse = we.Response as HttpWebResponse;
                    if (we.Status == WebExceptionStatus.ServerProtocolViolation)
                    {
                        // Get a little more telemetry about what is going on here, though most cases don't
                        // have a Response object.
                        IDictionary<string, string> traceData = new Dictionary<string, string>()
                        {
                            { "HasResponse", (we.Response != null).ToString() }
                        };

                        if (we.Response != null)
                        {
                            traceData["Response.SupportsHeaders"] = we.Response.SupportsHeaders.ToString();

                            if (we.Response.SupportsHeaders)
                            {
                                for (int i = 0; i < we.Response.Headers.Count; i++)
                                {
                                    string headerName = we.Response.Headers.GetKey(i);
                                    string headerValue = we.Response.Headers.Get(i);
                                    traceData[$"Response.Headers.{headerName}"] = headerValue;
                                }
                            }

                            if (httpResponse != null)
                            {
                                traceData["HttpResponse.StatusCode"] = httpResponse.StatusCode.ToString();
                            }
                        }

                        Services.TelemetryClient.TrackTrace($"Download target ({uri}) ServerProtocolViolation", SeverityLevel.Error, traceData);

                        // Ignore known cases where crawl fails due to error on the crawl-target side - these should not
                        // cause a hard failure on our end. Move on to the next user agent.
                        continue;
                    }

                    if (httpResponse != null)
                    {
                        // Ignore known cases where crawl fails due to error on the crawl-target side - these should not
                        // cause a hard failure on our end. Move on to the next user agent.
                        if (httpResponse.StatusCode == HttpStatusCode.Forbidden ||
                            httpResponse.StatusCode == HttpStatusCode.NotFound ||
                            httpResponse.StatusCode == HttpStatusCode.ServiceUnavailable)
                        {
                            continue;
                        }
                    }

                    throw;
                }
            }

            // Reached only when every user agent hit one of the ignorable failures above.
            throw new UnauthorizedAccessException("Unable to access HTTP endpoint");
        }
Пример #8
0
        /// <summary>
        /// Function entry point: parses a <c>CrawlRequest</c> from the request body, downloads
        /// the target via <c>Download</c> and returns the crawl response as ASCII-escaped JSON.
        /// </summary>
        /// <param name="req">Incoming HTTP request whose content is the JSON crawl request.</param>
        /// <param name="log">Trace writer used for informational logging.</param>
        /// <returns>
        /// A 200 response whose body is the serialized crawl response. An empty response object
        /// is used when <c>Download</c> returns <c>null</c>; Url, Site and Id are always copied
        /// from the request.
        /// </returns>
        /// <remarks>
        /// Exceptions are tracked in telemetry and rethrown with their original stack trace.
        /// The tracked properties contain the parsed Url/AppId/ActionId when the body was
        /// deserialized, otherwise the raw JSON body.
        /// </remarks>
        public static async Task<HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log)
        {
            // Both are declared outside the try block so the catch handler can report them.
            CrawlRequest crawlRequest = null;
            string reqBodyStr = null;

            try
            {
                using (var operation = Services.TelemetryClient.StartOperation<DependencyTelemetry>("Crawl.HTML"))
                {
                    reqBodyStr = await req.Content.ReadAsStringAsync();

                    // Assign to the outer variable (not a new local) so that a later failure
                    // is reported with the parsed request details.
                    crawlRequest = JsonConvert.DeserializeObject<CrawlRequest>(reqBodyStr);

                    operation.Telemetry.Properties.Add("AppId", crawlRequest.Site);
                    operation.Telemetry.Properties.Add("ActionId", crawlRequest.Id);
                    operation.Telemetry.Properties.Add("Url", crawlRequest.Url);

                    log.Info($"Crawl AppId={crawlRequest.Site} Id={crawlRequest.Id} Url={crawlRequest.Url}");

                    var crawlResponse = await Download(crawlRequest);

                    // always return a valid object so that downstream workflows can continue
                    if (crawlResponse == null)
                    {
                        crawlResponse = new CrawlResponse();
                    }

                    crawlResponse.Url = crawlRequest.Url;
                    crawlResponse.Site = crawlRequest.Site;
                    crawlResponse.Id = crawlRequest.Id;

                    var json = JsonConvert.SerializeObject(crawlResponse, new JsonSerializerSettings
                    {
                        Formatting = Formatting.None,
                        StringEscapeHandling = StringEscapeHandling.EscapeNonAscii
                    });

                    return new HttpResponseMessage(HttpStatusCode.OK)
                    {
                        Content = new StringContent(
                            json,
                            new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
                            "application/json")
                    };
                }
            }
            catch (Exception ex)
            {
                var props = new Dictionary<string, string>
                {
                    { "Service", req.RequestUri.ToString() }
                };

                if (crawlRequest == null)
                {
                    // Body could not be read or parsed: fall back to the raw text.
                    props.Add("JSON", reqBodyStr);
                }
                else
                {
                    props.Add("Url", crawlRequest.Url);
                    props.Add("AppId", crawlRequest.Site);
                    props.Add("ActionId", crawlRequest.Id);
                }

                Services.TelemetryClient.TrackException(ex, props);

                // "throw;" (not "throw ex;") keeps the original stack trace intact.
                throw;
            }
        }
Пример #9
0
        /// <summary>
        /// Extracts title, description, type, categories, image, article text and the
        /// <c>microsoft:ds_id</c> pass-through value from an HTML document.
        /// </summary>
        /// <param name="html">HTML markup to parse.</param>
        /// <param name="sourceUrl">
        /// URL the document was loaded from; its scheme is used to complete protocol-relative
        /// ("//host/path") image URLs. May be <c>null</c>, in which case such URLs are left as-is.
        /// </param>
        /// <returns>
        /// A response populated from the document; an empty response when the document has no
        /// <c>html/head</c> node.
        /// </returns>
        public static CrawlResponse Parse(string html, Uri sourceUrl)
        {
            var response = new CrawlResponse();

            var doc = new HtmlDocument();

            doc.LoadHtml(html);

            var head = doc.DocumentNode.SelectSingleNode("html/head");

            if (head == null)
            {
                return response;
            }

            // NOTE: in XPath an attribute test needs the '@' prefix; a bare "name" selects a
            // child element called <name>, which a <meta> tag never has.
            response.Title = FindMeta(head, "meta[@property='og:title' or @name='og:title' or @property='twitter:title' or @name='twitter:title']");

            if (string.IsNullOrEmpty(response.Title))
            {
                response.Title = FindValue(head, "title");
            }

            if (!string.IsNullOrEmpty(response.Title))
            {
                response.Title = WebUtility.HtmlDecode(response.Title.Trim());
            }

            // A missing description must not touch the title: the previous code re-read the raw
            // <title> here and thereby discarded the decoded og:/twitter: title found above.
            response.Description = FindMeta(head, "meta[@property='og:description' or @name='og:description' or @property='twitter:description' or @name='twitter:description' or @name='description']");

            if (response.Description != null)
            {
                response.Description = WebUtility.HtmlDecode(response.Description.Trim());
            }

            response.Type = FindMeta(head, "meta[@property='og:type' or @name='og:type']");
            var categories = FindAll(head, "meta[@property='article:tag' or @name='article:tag']").ToList();

            if (categories.Count > 0)
            {
                response.Categories = categories;
            }

            // TODO: get the better resolution
            var img = FindMeta(head, "meta[@property='og:image' or @name='og:image' or @property='twitter:image' or @name='twitter:image']");

            if (img != null)
            {
                // Protocol-relative URL: borrow the scheme of the page it came from.
                if (sourceUrl != null && img.StartsWith("//", StringComparison.Ordinal))
                {
                    img = sourceUrl.Scheme + ":" + img;
                }

                // TODO: support relative URLs too
                response.Image = img;
            }

            // build article
            var articleText = new StringBuilder();

            var articles = doc.DocumentNode.SelectNodes("//article");

            if (articles != null)
            {
                // find the longest article text
                string text = null;
                foreach (var art in articles)
                {
                    var newText = StripTags(art);
                    if (text == null || text.Length < newText.Length)
                    {
                        text = newText;
                    }
                }

                if (!string.IsNullOrEmpty(text))
                {
                    articleText.AppendLine(text);
                }
            }

            response.Article = WebUtility.HtmlDecode(articleText.ToString());

            // <meta property="microsoft:ds_id" content="255308" data-react-helmet="true">
            var dsId = FindMeta(head, "meta[@property='microsoft:ds_id' or @name='microsoft:ds_id']");

            response.PassThroughDetails = WebUtility.HtmlDecode(dsId);

            return response;
        }
Пример #10
0
        /// <summary>
        /// Crawls the Weibo user page at <paramref name="entryUrl"/>, then pages through the
        /// user's tweets and writes them row by row into an Excel workbook under
        /// <c>D:/output/&lt;name&gt;/</c>, optionally crawling each tweet's comments.
        /// </summary>
        /// <param name="entryUrl">Address of the user's entry page.</param>
        /// <returns>
        /// The populated <c>UserTweet</c>; when <c>FillUserInfo</c> throws, the result is
        /// returned as populated up to that point.
        /// </returns>
        private UserTweet CrawlTask(string entryUrl)
        {
            _currentUrl = entryUrl;
            var Site    = SiteBusiness.GetBySiteID("Weibo");
            var Request = BuildRequest(entryUrl, RegexContent);

            Site.TimeoutSecs = 60;
            CrawlResponse Response = null;

            // Initial page request; any exception is swallowed here.
            try
            {
                Response = GeckoRequestProcessor.DoRequest(Request, Site, null, null);
                AggrSum();
            }
            catch
            {
            }

            // NOTE(review): if DoRequest threw above, Response is still null and the next
            // line dereferences it - confirm whether a null guard is needed.

            if (Response.Status != Enums.CrawlResult.Succ)
            {
                Logger.Info("访问页面错误:Url = " + Response.Url);
            }
            var content = Response.Content;
            //First page
            UserTweet result = new UserTweet();

            result.Url = entryUrl;
            try
            {
                FillUserInfo(content, result);
            }
            catch
            {
                // Without user info there is nothing to name the output after; give up early.
                return(result);
            }

            var endId = DeterminedMid(content, MidType.EndId);
            var maxId = DeterminedMid(content, MidType.MaxId);
            //var name = Regex.Match(content,)
            int       currentPage = 1;
            int       maxPage     = 50;
            // Output location is hard-coded; one folder and one workbook per user name.
            string    rootPath    = @"D:/output/" + result.Name + "/";
            Workbook  outputBook  = new Workbook();
            Worksheet sheet       = null;
            // Tweet rows start at row index 4 (rows 0-3 hold the headers written below).
            int       currentLine = 4;
            int       pos         = 1; // index of the scroll-load within a page (3 per page): 0 = first, 1 = second, 2 = third
            bool      isContinue  = false;

            if (!Directory.Exists(rootPath))
            {
                Directory.CreateDirectory(rootPath);
            }
            if (File.Exists(rootPath + result.Name + ".xls"))
            {
                // Resume an earlier run: reopen the workbook and find the first empty row in column 0.
                outputBook.Open(rootPath + result.Name + ".xls");
                sheet = outputBook.Worksheets[result.Name];
                int endrow = currentLine;
                while (!string.IsNullOrEmpty(sheet.Cells[endrow, 0].StringValue))
                {
                    endrow++;
                }
                currentLine = endrow;
                // Page number is derived from the row index at 45 rows per page.
                currentPage = (int)(currentLine / 45d) + 1;
                isContinue  = true;
            }
            else
            {
                sheet = outputBook.Worksheets.Add(result.Name);
            }
            //Save to excel

            //Initialize column
            // Row 0: user summary headers (name, URL, followers, following, tweet count).
            sheet.Cells[0, 0].PutValue("姓名");
            sheet.Cells[0, 1].PutValue("网址");
            sheet.Cells[0, 2].PutValue("粉丝数");
            sheet.Cells[0, 3].PutValue("关注数");
            sheet.Cells[0, 4].PutValue("微博数");

            // Row 1: user summary values.
            sheet.Cells[1, 0].PutValue(result.Name);
            sheet.Cells[1, 1].PutValue(result.Url);
            sheet.Cells[1, 2].PutValue(result.Follower);
            sheet.Cells[1, 3].PutValue(result.Follow);
            sheet.Cells[1, 4].PutValue(result.TweetNum);

            // Row 3: tweet table headers (content, publish time, forwards, comments, URL, source, comment link).
            sheet.Cells[3, 0].PutValue("微博内容");
            sheet.Cells[3, 1].PutValue("发布时间");
            sheet.Cells[3, 2].PutValue("转发数");
            sheet.Cells[3, 3].PutValue("评论数");
            sheet.Cells[3, 4].PutValue("原帖地址");
            sheet.Cells[3, 5].PutValue("来源");
            sheet.Cells[3, 6].PutValue("具体评论");



            if (isContinue)
            {
                // Resuming: fetch the first chunk of the computed page and locate its first
                // tweet in the existing sheet so writing restarts from that row.
                pos = 0;
                var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
                Request  = BuildRequest(url);
                Response = GeckoRequestProcessor.DoRequest(Request, Site, null, null);
                AggrSum();
                // Trim(char[]) strips any of the characters '<', '/', 'p', 'r', 'e', '>' from
                // both ends of the content - it is not a substring removal.
                JsonResponse tmpResult =
                    JsonConvert.DeserializeObject <JsonResponse>(Response.Content.Trim("</pre>".ToArray()));
                Response.Content = HttpUtility.HtmlDecode(tmpResult.data);
                pos++;
                var firstTweet = FillUserTweet(result, Response.Content);
                // NOTE(review): FirstOrDefault() may yield null when no tweet was parsed - confirm.
                var firstUrl   = firstTweet.FirstOrDefault().Url;
                result.Tweets.Clear();
                // Scan upwards through the tweet rows for the row holding the same tweet URL (column 4).
                int endrow = currentLine - 1;
                while (endrow > 3)
                {
                    if (sheet.Cells[endrow, 4].StringValue == firstUrl)
                    {
                        currentLine = endrow;
                        break;
                    }
                    endrow--;
                }
            }
            //Crawl with json
            // Keep going while the latest response succeeded and its content still matches RegexContent.
            while (Regex.IsMatch(Response.Content.Trim(), RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase) && Response.Status == Enums.CrawlResult.Succ)
            {
                content = Response.Content.Trim();
                var currentTweet = FillUserTweet(result, content);
                foreach (Tweet tweet in currentTweet)
                {
                    string fileName = tweet.Mid + ".xls";
                    // Skip the comment crawl when a comment file already exists (e.g. left by an earlier failed run).
                    if (NeedCrawlComment)
                    {
                        if (!File.Exists(rootPath + fileName))
                        {
                            FillTweetComment(tweet, Site);
                            if (tweet.Comments.Count > 0)
                            {
                                SaveComment(rootPath, tweet, fileName);
                            }
                        }
                    }



                    sheet.Cells[currentLine, 0].PutValue(tweet.Content);
                    sheet.Cells[currentLine, 1].PutValue(tweet.PubDate.ToString("yyyy-MM-dd HH:mm:ss"));
                    sheet.Cells[currentLine, 2].PutValue(tweet.Forward);
                    sheet.Cells[currentLine, 3].PutValue(tweet.Comment);
                    sheet.Cells[currentLine, 4].PutValue(tweet.Url);
                    sheet.Cells[currentLine, 5].PutValue(tweet.Source);

                    //link comment
                    if (File.Exists(rootPath + fileName))
                    {
                        sheet.Cells[currentLine, 6].PutValue("点击查看");
                        //string linkPath = result.Name + "/" + fileName;
                        // The link is relative: the comment file sits next to the workbook.
                        string linkPath = fileName;
                        sheet.Hyperlinks.Add(currentLine, 6, 1, 1, linkPath);
                    }
                    // The workbook is saved after every row so progress survives a crash.
                    outputBook.Save(rootPath + result.Name + ".xls");
                    StatusLbl.Text = string.Format("正在读取名人:{0}的第{1}条微博", result.Name, currentLine - 3);
                    // Pump the UI message loop so the status label repaints during the crawl.
                    Application.DoEvents();
                    currentLine++;
                }



                // Request the next chunk, making up to 5 attempts.
                var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
                Request = BuildRequest(url);
                for (int i = 0; i < 5; i++)
                {
                    try
                    {
                        Response = GeckoRequestProcessor.DoRequest(Request, Site, null, null);
                        AggrSum();
                    }
                    catch
                    {
                    }
                    // NOTE(review): when DoRequest throws, Response still holds the previous
                    // (possibly successful) response, so the check below can accept stale
                    // content - confirm whether this is intended.
                    if (Response.Status != Enums.CrawlResult.Succ)
                    {
                        Logger.Info("访问页面错误:Url = " + Response.Url);
                    }
                    else
                    {
                        break;
                    }
                }

                // The payload is tried first as JsonResponse, then as CommentJsonResponse; if
                // both fail the content is left unchanged and the failure is swallowed.
                try
                {
                    JsonResponse tmpResult =
                        JsonConvert.DeserializeObject <JsonResponse>(Response.Content.Trim("</pre>".ToArray()));
                    Response.Content = HttpUtility.HtmlDecode(tmpResult.data);
                }
                catch
                {
                    try
                    {
                        CommentJsonResponse tmpResult =
                            JsonConvert.DeserializeObject <CommentJsonResponse>(Response.Content.Trim("</pre>".ToArray()));
                        Response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);
                    }
                    catch
                    {
                    }
                }

                // Advance through the three chunks of a page; wrap to the next page after the third.
                pos = (pos + 1) % 3;
                if (pos == 0)
                {
                    currentPage++;
                }
                // NOTE(review): Last() throws when result.Tweets is empty - confirm that
                // FillUserTweet always adds at least one tweet here.
                maxId = result.Tweets.Last().Mid;
            }



            return(result);
        }
Пример #11
0
        /// <summary>
        /// Downloads the comment pages of a tweet one after another and appends every parsed
        /// comment to <c>tweet.Comments</c>.
        /// </summary>
        /// <param name="tweet">Tweet whose comments are fetched; skipped when its Comment count is 0.</param>
        /// <param name="site">Site entity passed to the request processor.</param>
        /// <remarks>
        /// Best effort: the method never throws. Paging stops when a page after the first does
        /// not report the expected page number, when no response could be obtained after five
        /// attempts, or when parsing fails; failures are written to the log.
        /// </remarks>
        private void FillTweetComment(Tweet tweet, SiteEntity site)
        {
            if (tweet.Comment == 0)
            {
                return;
            }

            int currentPage = 1;
            string mid = tweet.Mid;

            // Character set stripped from both ends of the raw response before JSON parsing
            // (Trim removes any of these characters, not the literal "</pre>" substring).
            char[] wrapperChars = "</pre>".ToCharArray();

            try
            {
                while (true)
                {
                    string url = string.Format(CommentUrlFormat, mid, currentPage);

                    var request = BuildRequest(url);

                    CrawlResponse response = null;
                    for (int i = 0; i < 5; i++)
                    {
                        try
                        {
                            response = GeckoRequestProcessor.DoRequest(request, site, null, null);
                            AggrSum();
                        }
                        catch (Exception ex)
                        {
                            // Keep the attempt best-effort, but leave a trace of what went wrong.
                            Logger.Info("访问页面错误:Url = " + url + ", " + ex.Message);
                        }

                        // No response at all yet (the request threw): retry instead of
                        // dereferencing null, which previously aborted the whole method.
                        if (response == null)
                        {
                            continue;
                        }

                        if (response.Status != Enums.CrawlResult.Succ)
                        {
                            Logger.Info("访问页面错误:Url = " + response.Url);
                        }
                        else
                        {
                            break;
                        }
                    }

                    if (response == null)
                    {
                        // Every attempt failed before producing a response; nothing to parse.
                        return;
                    }

                    CommentJsonResponse tmpResult =
                        JsonConvert.DeserializeObject<CommentJsonResponse>(response.Content.Trim(wrapperChars));
                    response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);
                    var pageMatch = Regex.Match(response.Content, RegexCommentPage,
                                                RegexOptions.IgnoreCase | RegexOptions.Multiline);

                    // From the second page on, the page must identify itself with the number we
                    // asked for; otherwise we have run past the last page.
                    if (currentPage != 1 &&
                        (!pageMatch.Success ||
                         pageMatch.Groups["CurrentPageNum"].Value != currentPage.ToString(CultureInfo.InvariantCulture)))
                    {
                        return;
                    }

                    //Fill Tweet
                    var matches = Regex.Matches(response.Content, RegexComment,
                                                RegexOptions.IgnoreCase | RegexOptions.Multiline);

                    foreach (Match match in matches)
                    {
                        Comment comment = new Comment();
                        comment.Author = match.Groups["Author"].Value;
                        comment.AuthorUrl = RegexParser.AbsoluteUrl(match.Groups["AuthorUrl"].Value, tweet.Url, true);
                        comment.Content = TextCleaner.FullClean(match.Groups["Content"].Value);
                        comment.PubDate = DateTimeParser.Parser(match.Groups["PubDate"].Value) ?? DateTime.MinValue;
                        tweet.Comments.Add(comment);
                    }

                    currentPage++;
                }
            }
            catch (Exception ex)
            {
                // Comment crawling stays best-effort (comments gathered so far are kept),
                // but the reason for stopping is logged rather than silently discarded.
                Logger.Info("读取评论失败:Mid = " + mid + ", Page = " + currentPage.ToString(CultureInfo.InvariantCulture) + ", " + ex.Message);
            }
        }
Пример #12
0
        /// <summary>
        /// Handles one crawl request: deserializes the request body into a <c>CrawlResponse</c>,
        /// builds the service request through <paramref name="requestBodyFunc"/>, calls
        /// <c>RequestAsync</c> and returns the resulting blob content as an HTTP response.
        /// </summary>
        /// <param name="req">Incoming HTTP request whose content is the JSON-serialized <c>CrawlResponse</c>.</param>
        /// <param name="log">Trace writer used for logging and exception tracking.</param>
        /// <param name="requestBodyFunc">
        /// Produces the body sent to the service; when it returns <c>null</c> an empty
        /// <c>application/json</c> 200 response is returned without calling the service.
        /// </param>
        /// <param name="responseAction">Invoked with the request and the blob content when the content has a non-null value.</param>
        /// <param name="isPost">Passed through to <c>RequestAsync</c>.</param>
        /// <param name="cancellationToken">Passed through to <c>RequestAsync</c>.</param>
        /// <returns>The response created from the blob content, or the empty 200 response described above.</returns>
        /// <exception cref="Exception">Any failure is tracked via <c>Services.TrackException</c> and rethrown unchanged.</exception>
        public async Task <HttpResponseMessage> InvokeAsync(HttpRequestMessage req, TraceWriter log,
                                                            Func <CrawlResponse, object> requestBodyFunc,
                                                            Action <CrawlResponse, BlobContent> responseAction,
                                                            bool isPost,
                                                            CancellationToken cancellationToken)
        {
            log.Info("Crawl." + this.containerName);

            await this.InitializeAsync();

            // Declared outside the try block so the catch handler can report whatever was read so far.
            string        reqBodyStr  = null;
            CrawlResponse reqBody     = null;
            BlobContent   blobContent = null;

            try
            {
                using (var operation = Services.TelemetryClient.StartOperation <DependencyTelemetry>("Crawl." + this.containerName))
                {
                    reqBodyStr = await req.Content.ReadAsStringAsync();

                    reqBody = JsonConvert.DeserializeObject <CrawlResponse>(reqBodyStr);

                    operation.Telemetry.Target = this.endpoint;
                    operation.Telemetry.Properties.Add("AppId", reqBody.Site);
                    operation.Telemetry.Properties.Add("ActionId", reqBody.Id);

                    var serviceRequestBody = requestBodyFunc(reqBody);

                    if (serviceRequestBody == null)
                    {
                        // Nothing to send to the service: answer with an empty JSON body.
                        return(new HttpResponseMessage(System.Net.HttpStatusCode.OK)
                        {
                            Content = new StringContent(
                                string.Empty,
                                new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
                                "application/json")
                        });
                    }

                    blobContent = await this.RequestAsync(
                        log,
                        reqBody.Site,
                        reqBody.Id,
                        serviceRequestBody,
                        reqBody.ForceRefresh,
                        isPost,
                        cancellationToken);

                    if (blobContent != null)
                    {
                        operation.Telemetry.Properties.Add("Expires", blobContent.Expires.ToString(CultureInfo.InvariantCulture));

                        if (blobContent.Value != null)
                        {
                            responseAction(reqBody, blobContent);

                            operation.Telemetry.ResultCode = "OK";
                        }
                    }

                    return(Services.CreateResponse(blobContent));
                }
            }
            catch (Exception ex)
            {
                Services.TrackException(ex, req, log, reqBodyStr, reqBody, blobContent);

                // Bare rethrow keeps the original stack trace; "throw ex" would reset it to this frame.
                throw;
            }
        }
Пример #13
0
        /// <summary>
        /// Entry point of the VideoIndexer crawl: looks up an existing breakdown for the requested
        /// item, enqueues indexing when none is available, and returns the featurized breakdown once
        /// its state is "Processed".
        /// </summary>
        /// <param name="req">Incoming HTTP request whose content is the JSON-serialized <c>CrawlResponse</c>.</param>
        /// <param name="log">Trace writer used for logging and exception tracking.</param>
        /// <param name="cancellationToken">Passed through to the breakdown lookup.</param>
        /// <returns>
        /// The featurized breakdown, or an empty blob content expiring in five minutes when the
        /// request has no id, no breakdown is available yet, or the breakdown is not processed.
        /// </returns>
        /// <exception cref="Exception">Any failure is tracked via <c>Services.TrackException</c> and rethrown unchanged.</exception>
        public static async Task <HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log, CancellationToken cancellationToken)
        {
            // Declared outside the try block so the catch handler can report whatever was read so far.
            string        reqBodyStr = null;
            CrawlResponse reqBody    = null;

            // Empty response with a short expiry, so the caller comes back in 5 minutes.
            // The expiry is computed at the moment the response is created.
            Func <HttpResponseMessage> retryInFiveMinutes = () => Services.CreateResponse(new BlobContent {
                Expires = DateTime.UtcNow + TimeSpan.FromMinutes(5)
            });

            try
            {
                using (var operation = Services.TelemetryClient.StartOperation <DependencyTelemetry>("Crawl.VideoIndexer"))
                {
                    // TODO: if the id is not parsable, just ignore - make sure the others do too

                    reqBodyStr = await req.Content.ReadAsStringAsync();

                    reqBody = JsonConvert.DeserializeObject <CrawlResponse>(reqBodyStr);

                    operation.Telemetry.Properties.Add("AppId", reqBody.Site);
                    operation.Telemetry.Properties.Add("ActionId", reqBody.Id);

                    if (string.IsNullOrEmpty(reqBody.Id))
                    {
                        return(retryInFiveMinutes());
                    }

                    var settings = await GetVideoIndexerSettings(reqBody.Site);

                    // find existing breakdown
                    var breakdownContent = await GetVideoIndexerBreakdownAsync(reqBody, settings, log, cancellationToken);

                    // The GetVideoIndexerBreakdownAsync call can, if the underlying call to VideoIndexer GetBreakdown fails - but the
                    // call to VideoIndexer SearchBreakdown succeeds, return a BlobContent with an empty Value, and a short "expires".
                    // Treat that as the same as not getting a VideoIndexer response.
                    if (breakdownContent == null || string.IsNullOrWhiteSpace(breakdownContent.Value))
                    {
                        if (string.IsNullOrEmpty(reqBody.Video))
                        {
                            // try to resolve video through Ooyala
                            var ooyalaVideo = Ooyala.GetOoyalaVideo(reqBody.Id, settings);

                            if (ooyalaVideo != null)
                            {
                                reqBody.Video = ooyalaVideo.Url;

                                // Only fill in metadata the request did not already provide.
                                if (string.IsNullOrEmpty(reqBody.Title))
                                {
                                    reqBody.Title = ooyalaVideo.Title;
                                }

                                if (string.IsNullOrEmpty(reqBody.Description))
                                {
                                    reqBody.Description = ooyalaVideo.Description;
                                }

                                if (reqBody.Categories == null || reqBody.Categories.Count == 0)
                                {
                                    reqBody.Categories = ooyalaVideo.Keywords;
                                }
                            }
                        }

                        // enqueue break down indexing
                        await IndexVideo(reqBody, settings);

                        // make sure caller comes back in 5min
                        return(retryInFiveMinutes());
                    }

                    var result = JsonConvert.DeserializeObject <VideoBreakdownResult>(breakdownContent.Value);
                    if (result.State != "Processed")
                    {
                        // make sure caller comes back in 5min
                        return(retryInFiveMinutes());
                    }

                    // featurize
                    breakdownContent.Output = VideoIndexerFeaturizer.FeaturizeVideoIndexerBreakdown(result);

                    return(Services.CreateResponse(breakdownContent));
                }
            }
            catch (Exception ex)
            {
                Services.TrackException(ex, req, log, reqBodyStr, reqBody);

                // Bare rethrow keeps the original stack trace; "throw ex" would reset it to this frame.
                throw;
            }
        }