Beispiel #1
0
        /// <summary>
        /// Posts <paramref name="json"/> to <paramref name="url"/> through a freshly created
        /// <c>WebRequestProcessor</c> and returns the boolean produced by its <c>WebRequestPost</c> call.
        /// </summary>
        /// <param name="url">Address the request is sent to.</param>
        /// <param name="json">Payload handed to <c>WebRequestPost</c>.</param>
        /// <returns>The value yielded by <c>WebRequestPost</c>.</returns>
        private async Task<bool> PostRepositories(string url, string json)
        {
            // A new processor is created for every call; nothing is cached between requests.
            return await new WebRequestProcessor().WebRequestPost(url, json);
        }
Beispiel #2
0
        /// <summary>
        /// Issues a GET for <paramref name="url"/> through a freshly created
        /// <c>WebRequestProcessor</c> and returns the string produced by its <c>WebRequestGet</c> call.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <returns>The value yielded by <c>WebRequestGet</c>.</returns>
        private async Task<string> RetrieveRepositories(string url)
        {
            // A new processor is created for every call; nothing is cached between requests.
            return await new WebRequestProcessor().WebRequestGet(url);
        }
Beispiel #3
0
 /// <summary>
 /// Stores <c>target</c> cast to <c>WebRequestProcessor</c> and collects the names of all
 /// bundles in the runtime settings whose type is not <c>BundleType.Static</c>.
 /// </summary>
 // NOTE(review): looks like an editor/inspector enable callback - confirm against the enclosing class.
 void OnEnable()
 {
     _target = target as WebRequestProcessor;

     // Query-expression form of a Where/Select pipeline over the configured bundles.
     var selectableNames =
         from bundle in Settings.instance.runtimeSettings.bundles
         where bundle.type != BundleType.Static
         select bundle.name;

     _bundles = selectableNames.ToArray();
 }
Beispiel #4
0
        /// <summary>
        /// Downloads every URL in <paramref name="categoryUrls"/>, matches the downloaded text against
        /// <c>baiduRegex</c> and writes one worksheet row for each match whose "PubDate" group contains "前".
        /// The workbook is saved once after all URLs have been processed.
        /// </summary>
        /// <param name="dailyWorksheet">Worksheet that receives the generated rows.</param>
        /// <param name="dailybook">Workbook saved at the end of the call.</param>
        /// <param name="dailyStartRow">Index of the next row to fill; advanced by one per row written.</param>
        /// <param name="categoryName">Label written to column 2 of the first row produced by this call.</param>
        /// <param name="categoryUrls">Pages to download and scan.</param>
        private void CrawlDailyReport(Worksheet dailyWorksheet, Workbook dailybook, ref int dailyStartRow, string categoryName,
                                      string[] categoryUrls)
        {
            const RegexOptions matchOptions = RegexOptions.IgnoreCase | RegexOptions.Multiline;

            // True until the category label has been written next to the first qualifying hit.
            bool categoryPending = true;

            for (int urlIndex = 0; urlIndex < categoryUrls.Length; urlIndex++)
            {
                var pageText = WebRequestProcessor.DownloadHTTPString(categoryUrls[urlIndex]);

                // Fixed two second pause after each download.
                Thread.Sleep(2000);

                foreach (Match hit in Regex.Matches(pageText, baiduRegex, matchOptions))
                {
                    // Only hits whose publication date text contains "前" are reported.
                    // NOTE(review): presumably these are relative timestamps ("... ago") - confirm.
                    bool qualifies = hit.Groups["PubDate"].Value.Contains("前");
                    if (!qualifies)
                    {
                        continue;
                    }

                    int row = dailyStartRow;

                    if (categoryPending)
                    {
                        dailyWorksheet.Cells[row, 2].PutValue(categoryName);
                        categoryPending = false;
                    }

                    var link = hit.Groups["Url"].Value;
                    try
                    {
                        // Match the media name: column 1 receives the domain derived from the link host.
                        var host = new Uri(link).Host;
                        dailyWorksheet.Cells[row, 1].PutValue(GetUrlDomain(host));
                    }
                    catch (Exception)
                    {
                        // Best effort: any failure here leaves the domain cell unchanged.
                    }

                    var cleanedTitle = TextCleaner.FullClean(hit.Groups["Title"].Value);
                    var cleanedText  = TextCleaner.FullClean(hit.Groups["Text"].Value);
                    var summary      = cleanedTitle + Environment.NewLine + cleanedText;

                    var summaryStyle = dailyWorksheet.Cells[row, 6].GetDisplayStyle();
                    summaryStyle.Font.Color = Color.Blue;

                    dailyWorksheet.Cells[row, 0].PutValue(link);
                    // The formula refers to the sheet row as row + 1.
                    dailyWorksheet.Cells[row, 5].Formula = "=VLOOKUP(B" + (row + 1) + ",Sheet2!A:B,2,FALSE)";

                    dailyWorksheet.Cells[row, 6].SetStyle(summaryStyle);
                    dailyWorksheet.Cells[row, 6].PutValue(summary);

                    dailyWorksheet.Hyperlinks.Add(row, 6, 1, 1, link);
                    dailyWorksheet.Cells[row, 7].PutValue(DateTime.Now.ToString("yyyy-MM-dd"));
                    dailyWorksheet.Cells[row, 8].PutValue("负面舆情");

                    dailyStartRow = row + 1;
                }
            }

            dailybook.Save(@"D:\dailyreport\日报.xlsx");
        }
Beispiel #5
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> with the download mode selected in the form,
        /// runs <c>PageAutoAnalyzer.AnalyzeContent</c> over the content and copies the extracted
        /// fields into the form's text boxes. Nothing is displayed when the analyzer returns null.
        /// </summary>
        /// <param name="title">Title used to seed a new <c>PageElement</c> when <paramref name="pageElement"/> is null.</param>
        /// <param name="url">Page to download; also the base for expanding the resulting URL.</param>
        /// <param name="pageElement">Optional element handed to the analyzer; created from title and url when null.</param>
        /// <exception cref="NotSupportedException">Neither download-mode radio button is checked.</exception>
        private void ParsePage(string title, string url, PageElement pageElement = null)
        {
            if (pageElement == null)
            {
                pageElement = new PageElement {
                    Title = title, Url = url
                };
            }

            var xpath = new ItemPageXPaths();

            // The two modes differ only in how the raw content is fetched.
            string content;
            if (GeckoDownRd.Checked)
            {
                CrawlResponse resp = GeckoRequestProcessor.DoRequest(BuildFakeRequest(url), BuildFakeSiteEntity(), null, null, null, true, 1000);
                content = resp.Content;
            }
            else if (HttpdownRd.Checked)
            {
                content = WebRequestProcessor.DownloadHTTPString(url, 30);
            }
            else
            {
                // Same exception type as ParseListBtn_Click uses for an unsupported mode;
                // callers catching Exception are unaffected.
                throw new NotSupportedException("不支持该方式分析正文");
            }

            // subList is required by the analyzer's signature but is not used here.
            List<SubItemElement> subList;
            PageElement result = PageAutoAnalyzer.AnalyzeContent(content, pageElement,
                                                                 DeterminedMode(), new IdentityContentElement(), ref xpath,
                                                                 out subList, 86400, ExcludeTxt.Text);

            if (result == null)
            {
                return;
            }

            PageUrlTxt.Text       = HtmlUtility.ExpandRelativePath(url, result.Url);
            TitleTxt.Text         = result.Title;
            ContentTxt.Text       = result.Content;
            ViewTxt.Text          = result.View.ToString();
            ReplyTxt.Text         = result.Reply.ToString();
            PubdateTxt.Text       = result.Pubdate == null ? "" : result.Pubdate.ToString();
            AuthorTxt.Text        = result.Author;
            MediaTxt.Text         = result.MediaName;
            ElementXPathTxt.Text  = result.ElementXPath;
            ElementBlockTxt.Text  = result.ElementBlock;
            NextpageXPathTxt.Text = result.NextPageXPath;
        }
Beispiel #6
0
        /// <summary>
        /// Click handler: downloads the URL typed into <c>InputUrlTxt</c> with the selected download mode,
        /// runs <c>PageAutoAnalyzer.AnalyzeArticleList</c> over it and binds the resulting elements to
        /// <c>ListGridView</c>. Shows a message box when no list could be extracted.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        /// <exception cref="NotSupportedException">Neither download-mode radio button is checked.</exception>
        private void ParseListBtn_Click(object sender, EventArgs e)
        {
            string        inputUrl = InputUrlTxt.Text;
            RecogniseMode mode     = DeterminedMode();
            var           xpath    = new ListPageXPaths();

            // Each mode yields the page content plus the URL handed to the analyzer.
            string pageContent;
            string analyzedUrl;
            if (GeckoDownRd.Checked)
            {
                CrawlResponse response = GeckoRequestProcessor.DoRequest(BuildFakeRequest(inputUrl), BuildFakeSiteEntity(), null, null, null, true, 1000);
                pageContent = response.Content;
                // The Gecko path analyzes against the URL reported by the response, not the typed one.
                analyzedUrl = response.Url;
            }
            else if (HttpdownRd.Checked)
            {
                pageContent = WebRequestProcessor.DownloadHTTPString(inputUrl, 30);
                analyzedUrl = inputUrl;
            }
            else
            {
                throw new NotSupportedException("不支持当前项抓取");
            }

            var analysis = PageAutoAnalyzer.AnalyzeArticleList(analyzedUrl, pageContent, mode, new IdentityPageElement(), ref xpath, 86400);

            PageElement[] items = null;
            if (analysis != null)
            {
                items = analysis.List;
            }

            if (items == null)
            {
                MessageBox.Show("解析不出数据");
                return;
            }

            // Item URLs are always expanded relative to the URL the user typed.
            for (int i = 0; i < items.Length; i++)
            {
                items[i].Url = HtmlUtility.ExpandRelativePath(inputUrl, items[i].Url);
            }

            ListGridView.DataSource = items;
        }
Beispiel #7
0
        /// <summary>
        /// Click handler that builds two reports. First it downloads <c>DsgUrl</c>, matches it against
        /// <c>baiduRegex</c> and fills the DSG workbook with one row per match whose "PubDate" group
        /// contains "前". Then it opens the daily workbook and calls <c>CrawlDailyReport</c> once per
        /// vehicle category, sharing a single running row index across all categories.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void CrawlBtn_Click(object sender, EventArgs e)
        {
            const string dsgReportPath   = @"D:\dailyreport\DSG.xlsx";
            const string dailyReportPath = @"D:\dailyreport\日报.xlsx";

            // ---- DSG report ----
            var dsgPage = WebRequestProcessor.DownloadHTTPString(DsgUrl);
            var dsgHits = Regex.Matches(dsgPage, baiduRegex, RegexOptions.Multiline | RegexOptions.IgnoreCase);

            Workbook dsgBook = new Workbook();
            dsgBook.Open(dsgReportPath);
            var dsgSheet = dsgBook.Worksheets[0];
            int nextDsgRow = 7;

            // First pass: insert one row at the start position for every qualifying hit.
            foreach (Match hit in dsgHits)
            {
                bool qualifies = hit.Groups["PubDate"].Value.Contains("前");
                if (qualifies)
                {
                    dsgSheet.Cells.InsertRow(nextDsgRow);
                }
            }

            // Second pass: fill the rows, one per qualifying hit.
            foreach (Match hit in dsgHits)
            {
                bool qualifies = hit.Groups["PubDate"].Value.Contains("前");
                if (!qualifies)
                {
                    continue;
                }

                var link = hit.Groups["Url"].Value;
                try
                {
                    // Match the media name: column 1 receives the domain derived from the link host.
                    var host = new Uri(link).Host;
                    dsgSheet.Cells[nextDsgRow, 1].PutValue(GetUrlDomain(host));
                }
                catch (Exception)
                {
                    // Best effort: any failure here leaves the domain cell unchanged.
                }

                var cleanedTitle = TextCleaner.FullClean(hit.Groups["Title"].Value);
                var cleanedText  = TextCleaner.FullClean(hit.Groups["Text"].Value);
                var summary      = cleanedTitle + Environment.NewLine + cleanedText;

                dsgSheet.Cells[nextDsgRow, 0].PutValue(link);
                // The formula refers to the sheet row as row index + 1.
                dsgSheet.Cells[nextDsgRow, 5].Formula = "=VLOOKUP(B" + (nextDsgRow + 1) + ",Sheet2!A:B,2,FALSE)";
                dsgSheet.Cells[nextDsgRow, 6].PutValue(summary);

                dsgSheet.Hyperlinks.Add(nextDsgRow, 6, 1, 1, link);
                dsgSheet.Cells[nextDsgRow, 7].PutValue(DateTime.Now.ToString("yyyy-MM-dd"));
                dsgSheet.Cells[nextDsgRow, 8].PutValue("负面舆情");
                nextDsgRow++;
            }

            dsgBook.Save(dsgReportPath);

            // ---- Daily report: one crawl per category, rows continue where the previous one stopped ----
            Workbook dailyBook = new Workbook();
            dailyBook.Open(dailyReportPath);
            var dailySheet = dailyBook.Worksheets[0];
            int nextDailyRow = 6;

            CrawlDailyReport(dailySheet, dailyBook, ref nextDailyRow, "大众-POLO", poloUrls);
            CrawlDailyReport(dailySheet, dailyBook, ref nextDailyRow, "大众-朗逸", langyiUrls);
            CrawlDailyReport(dailySheet, dailyBook, ref nextDailyRow, "大众-途安", turanUrls);
            CrawlDailyReport(dailySheet, dailyBook, ref nextDailyRow, "大众-帕萨特", pasateUrls);
            CrawlDailyReport(dailySheet, dailyBook, ref nextDailyRow, "大众-桑塔纳", santanaUrls);
            CrawlDailyReport(dailySheet, dailyBook, ref nextDailyRow, "大众-途观", tuguanUrls);

            MessageBox.Show("抓取完成");
        }