/// <summary>
/// Builds a status snapshot: host data plus one <c>CrawlData</c> entry per crawler pipeline,
/// enriched with the name, URL and last crawl time of the matching <c>Crawl</c> database row.
/// </summary>
/// <returns>
/// The populated <c>CrawlStatusData</c>. Entries whose crawl row is missing, or whose lookup
/// fails, keep only the values taken from the pipeline info.
/// </returns>
public CrawlStatusData GetCurrentCrawlStatus()
{
    var infos = CrawlerManager.CrawlerFactory.Pipelines.Select(model => model.Info).ToArray();
    var result = new CrawlStatusData() { HostData = HostBasicData.GetHostData() };

    // Materialize the projection exactly once. Enumerating a deferred query in the loop and
    // then calling ToArray() on it again would create a second, un-enriched set of objects
    // and silently discard everything assigned below.
    var data = infos.Select(item => new CrawlData
    {
        // The pipeline's current job name is used as the crawl ID for the database lookup.
        CrawlID = item.CurrentJob_Name,
        Message = item.Message,
        JobCount = item.JobCount
    }).ToArray();

    foreach (var crawlData in data)
    {
        using (PalasDB db = new PalasDB())
        {
            try
            {
                var crawl = db.Crawl.FirstOrDefault(model => model.CrawlID == crawlData.CrawlID);
                if (crawl == null)
                {
                    // No crawl row for this job: leave the database-backed fields unset.
                    continue;
                }

                crawlData.LastCrawlTime = crawl.LastCrawlTime;
                crawlData.Name = crawl.Name;
                crawlData.Url = crawl.Url;
            }
            catch
            {
                // Best effort: a failed lookup must not prevent the remaining entries
                // (or the status call itself) from being reported.
            }
        }
    }

    result.CrawlData = data;
    return result;
}
/// <summary>
/// Writes one row per media entry that has no parent media (domain in column 0, media name in
/// column 1) into the worksheet at index 1 of the daily report workbook, then saves the
/// workbook back to the same path.
/// </summary>
/// <remarks>
/// An entry whose processing throws (for example because its URL cannot be parsed) is skipped
/// silently and does not consume a row.
/// </remarks>
private void ImportMedia()
{
    const string reportPath = @"D:\dailyreport\日报.xlsx";

    Workbook book = new Workbook();
    book.Open(reportPath);
    var sheet = book.Worksheets[1];

    int row = 0;
    using (PalasDB db = new PalasDB())
    {
        var topLevelMedia = db.Media.Where(model => model.ParentMediaID == null || model.ParentMediaID == "");
        foreach (var media in topLevelMedia)
        {
            try
            {
                var host = GetUrlDomain(new Uri(media.Url).Host);
                sheet.Cells[row, 0].PutValue(host);
                sheet.Cells[row, 1].PutValue(media.MediaName);

                // Only advance once both cells were written.
                row++;
            }
            catch (Exception)
            {
                // Deliberately ignored: the entry is skipped.
            }
        }
    }

    book.Save(reportPath);
}
/// <summary>
/// Crawls the search result pages of one site for one keyword and collects the items
/// published on or after <paramref name="startDate"/>.
/// </summary>
/// <param name="siteID">ID of the site; the first <c>Crawl</c> row with this <c>SiteID</c> is used.</param>
/// <param name="keyword">Keyword passed to the crawler and copied into every result.</param>
/// <param name="startDate">Items with a <c>PubDate</c> earlier than this are dropped and end the crawl.</param>
/// <param name="maxCnt">
/// Drives the page count: <c>maxCnt / ListItemCountPerPage + 1</c> pages are requested at most.
/// </param>
/// <param name="CallBack">
/// Optional progress callback, invoked after each page that did not end the crawl, with
/// (site name, keyword, <c>pageNum + 1</c>, <c>pageNum * ListItemCountPerPage</c>).
/// </param>
/// <returns>The collected results; never null.</returns>
/// <exception cref="InvalidOperationException">
/// No crawl (or no site) is configured for <paramref name="siteID"/>, or the site's
/// <c>ListItemCountPerPage</c> is not positive.
/// </exception>
private List<ResultResponse> CrawlOneKeyword(string siteID, string keyword, DateTime startDate, int maxCnt, Action<string, string, int, int> CallBack)
{
    Crawler.Core.Crawler.SimpleCrawler oneJob = new Core.Crawler.SimpleCrawler("test", new Scheduler("nothing", new SchedulerSetting()));

    string crawlid;
    int pageCnt;
    string siteName;
    using (PalasDB db = new PalasDB())
    {
        var crawl = db.Crawl.FirstOrDefault(model => model.SiteID == siteID);
        if (crawl == null || crawl.Site == null)
        {
            // Fail with context instead of a bare NullReferenceException.
            throw new InvalidOperationException("No crawl with a site is configured for site ID '" + siteID + "'.");
        }

        crawlid = crawl.CrawlID;
        pageCnt = crawl.Site.ListItemCountPerPage;
        siteName = crawl.Site.Name;
    }

    if (pageCnt <= 0)
    {
        // Guards the division below (previously a DivideByZeroException for 0).
        throw new InvalidOperationException("Site '" + siteName + "' has a non-positive ListItemCountPerPage (" + pageCnt + ").");
    }

    List<ResultResponse> resultList = new List<ResultResponse>();
    var needPage = maxCnt / pageCnt + 1;
    for (int pageNum = 0; pageNum < needPage; pageNum++)
    {
        var result = oneJob.CrawlOnePageList(crawlid, keyword, pageNum, "", true);
        if (result != null && result.Items != null)
        {
            // Materialize once; entries without data are ignored.
            var pageItems = result.Items
                .Select(item => new ResultResponse
                {
                    Count = item.DataItemCount,
                    Data = item.DataItem,
                    SiteName = siteName,
                    Keyword = keyword
                })
                .Where(model => model.Data != null)
                .ToArray();

            // As soon as a page contains an item older than startDate, keep only the
            // items that are still in range and stop crawling further pages.
            if (pageItems.Any(model => model.Data.PubDate < startDate))
            {
                resultList.AddRange(pageItems.Where(model => model.Data.PubDate >= startDate));
                return resultList;
            }

            resultList.AddRange(pageItems);
        }

        if (CallBack != null)
        {
            CallBack(siteName, keyword, pageNum + 1, pageNum * pageCnt);
        }
    }

    return resultList;
}
/// <summary>
/// Resolves the province-level region ID for a region text. The first two characters of
/// <paramref name="region"/> are matched against <c>Region.Province</c> among the rows whose
/// <c>City</c> is null.
/// </summary>
/// <param name="region">Region text; may be null or empty.</param>
/// <returns>
/// The matching <c>RegionID</c>, or null when <paramref name="region"/> is null, empty,
/// shorter than two characters, or matches no row.
/// </returns>
private string GetRegionID(string region)
{
    // Validate before touching the database. The lookup key is a two-character prefix, so a
    // shorter text cannot match and would make Substring throw ArgumentOutOfRangeException.
    if (string.IsNullOrEmpty(region) || region.Length < 2)
    {
        return null;
    }

    string province = region.Substring(0, 2);
    using (PalasDB db = new PalasDB())
    {
        var regionEntity = db.Region.FirstOrDefault(model => model.Province == province && model.City == null);
        return regionEntity == null ? null : regionEntity.RegionID;
    }
}
/// <summary>
/// Prepares the form: binds the search-forum sites to both site lists, switches off automatic
/// column generation on both grids and presets both start-date pickers to one month ago.
/// </summary>
private void Initailize()
{
    // Bind the search lists.
    using (PalasDB db = new PalasDB())
    {
        var searchForumSites =
            (from site in db.Site
             where site.Media.MediaType == (sbyte)Enums.MediaType.SearchForum
             select site).ToArray();

        // NOTE(review): both controls receive the very same array instance as data source;
        // confirm that any coupling between the two lists that results from this is intended.
        SearchForumList.DataSource = searchForumSites;
        SearchForumList.DisplayMember = "Name";
        SearchForumList.ValueMember = "SiteID";

        SearchChkList.DataSource = searchForumSites;
        SearchChkList.DisplayMember = "Name";
        SearchChkList.ValueMember = "SiteID";
    }

    // Initialize the grids: columns are defined explicitly, not generated.
    ResultDataGridView.AutoGenerateColumns = false;
    SearchResultDataGridView.AutoGenerateColumns = false;

    // Initialize the time limit: default to the last month.
    startDateTime.Value = DateTime.Now.AddMonths(-1);
    BatchStartDateTime.Value = DateTime.Now.AddMonths(-1);
}
/// <summary>
/// Imports crawl sources from the worksheet "输入源" of <c>D:\output\input.xlsx</c>.
/// Rows are read from row index 1 until the first row whose column 0 is empty. For each row
/// the matching Department / Organization (depending on the row type), Media and Site are
/// created when missing, and the Crawl identified by the row's URL is created or updated.
/// </summary>
/// <remarks>
/// Columns read: 0 media channel, 1 crawl URL, 2 type, 3 sub type, 4 name, 6 web URL,
/// 7 region, 8 industry, 9 organization level (columns 5 and 10 are not used).
/// A failing row is logged and skipped; the import continues with the next row.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void ImportBtn_Click(object sender, EventArgs e)
{
    Workbook workbook = new Workbook();
    workbook.Open(@"D:\output\input.xlsx");
    Worksheet sheet = workbook.Worksheets["输入源"];

    int currentLine = 1;
    while (!string.IsNullOrEmpty(sheet.Cells[currentLine, 0].StringValue))
    {
        try
        {
            string mediaChannel = sheet.Cells[currentLine, 0].StringValue;
            string url = ProcessUrl(sheet.Cells[currentLine, 1].StringValue);
            string type = sheet.Cells[currentLine, 2].StringValue;
            string subType = sheet.Cells[currentLine, 3].StringValue;
            string name = sheet.Cells[currentLine, 4].StringValue;
            string webUrl = ProcessUrl(sheet.Cells[currentLine, 6].StringValue);
            string region = sheet.Cells[currentLine, 7].StringValue;
            string industry = sheet.Cells[currentLine, 8].StringValue;
            string orgLevel = sheet.Cells[currentLine, 9].StringValue;

            int mediaType = 3;
            const string economyIndustryID = "J";
            string departmentID = null;
            string orgID = null;
            string industryID = null;
            string regionID = GetRegionID(region);

            switch (type)
            {
                // Industry information is required.
                case "行业协会":
                case "行业信息":
                {
                    using (PalasDB db = new PalasDB())
                    {
                        // The industry ID is the first character of the industry cell.
                        industryID = string.IsNullOrEmpty(industry) ? null : industry.Substring(0, 1);
                        if (subType == "行业协会")
                        {
                            // Add the organization if it does not exist yet.
                            Organization org = db.Organization.FirstOrDefault(model => model.FullName == name);
                            if (org == null)
                            {
                                org = db.Organization.CreateObject();
                                org.OrganizationID = CrawlDTO.NewGuid;
                                org.FullName = name;
                                org.ShortName = name;
                                org.OrgLevel = (sbyte)Enums.OrgLevel.Province; // province / ministry
                                org.RegionID = regionID ?? "10000";
                                org.RegionLevel = (sbyte)Enums.RegionLevel.Province; // provincial
                                org.SocialSystem = (sbyte)Enums.SocialSystem.CompanyNational; // state-owned enterprise
                                org.DepartmentID = null;
                                org.IndustryID = industryID;
                                org.StockListing = (sbyte)Enums.StockListing.NonIPO;
                                db.Organization.AddObject(org);
                                db.SaveChanges(SaveOptions.AcceptAllChangesAfterSave);
                            }
                            orgID = org.OrganizationID;
                        }
                    }
                    break;
                }

                // Department information is required.
                case "机构":
                case "上市公司":
                {
                    industryID = economyIndustryID;
                    using (PalasDB db = new PalasDB())
                    {
                        departmentID = GetOrCreateDepartmentID(db, subType);

                        var org = db.Organization.FirstOrDefault(model => model.FullName == name);
                        if (org == null)
                        {
                            org = db.Organization.CreateObject();
                            org.OrganizationID = CrawlDTO.NewGuid;
                            org.FullName = name;
                            org.ShortName = name;
                            org.OrgLevel = (sbyte)Enums.OrgLevel.Province; // province / ministry
                            org.RegionID = regionID ?? "10000";
                            org.RegionLevel = (sbyte)Enums.RegionLevel.Province; // provincial
                            org.SocialSystem = (sbyte)Enums.SocialSystem.CompanyNational; // state-owned enterprise
                            org.DepartmentID = departmentID;
                            org.IndustryID = industryID;
                            if (type == "上市公司")
                            {
                                org.StockListing = (sbyte)Enums.StockListing.Shanghai_A;
                            }
                            else
                            {
                                org.StockListing = (sbyte)Enums.StockListing.NonIPO;
                            }
                            db.Organization.AddObject(org);
                            db.SaveChanges(SaveOptions.AcceptAllChangesAfterSave);
                        }
                        orgID = org.OrganizationID;
                    }
                    break;
                }

                // The organization level is required.
                case "政府机构":
                {
                    industryID = null;
                    using (PalasDB db = new PalasDB())
                    {
                        departmentID = GetOrCreateDepartmentID(db, subType);

                        var org = db.Organization.FirstOrDefault(model => model.FullName == name);
                        if (org == null)
                        {
                            org = db.Organization.CreateObject();
                            org.OrganizationID = CrawlDTO.NewGuid;
                            org.FullName = name;
                            org.ShortName = name;
                            // The level is the leading digit of the organization-level cell.
                            org.OrgLevel = sbyte.Parse(orgLevel.Substring(0, 1));
                            org.RegionID = regionID ?? "10000";
                            // NOTE(review): the compared literal ends with a space; confirm the
                            // sheet values really carry it.
                            org.RegionLevel = (sbyte)(region == "全国 " ? Enums.RegionLevel.National : Enums.RegionLevel.Province);
                            org.SocialSystem = (sbyte)Enums.SocialSystem.Government; // government
                            org.IndustryID = industryID;
                            org.StockListing = (sbyte)Enums.StockListing.NonIPO;
                            org.DepartmentID = departmentID;
                            db.Organization.AddObject(org);
                            db.SaveChanges(SaveOptions.AcceptAllChangesAfterSave);
                        }
                        orgID = org.OrganizationID;
                    }
                    break;
                }

                // Ordinary media: no organization or department.
                default:
                {
                    // NOTE(review): the compared literal ends with a space; confirm the
                    // sheet values really carry it.
                    if (subType == "专业财经 ")
                    {
                        industryID = economyIndustryID;
                    }
                    // Determine the MediaType from the row type.
                    mediaType = (int)DeterminedMediaType(type);
                    break;
                }
            }

            using (PalasDB db = new PalasDB())
            {
                // Add the Media (matched by name) if it does not exist yet.
                var media = db.Media.FirstOrDefault(model => model.MediaName == name);
                if (media == null)
                {
                    media = db.Media.CreateObject();
                    media.MediaID = CrawlDTO.NewGuid;
                    media.MediaName = name;
                    media.Url = webUrl;
                    media.Channel = mediaChannel;
                    media.MediaType = (sbyte)mediaType;
                    media.MediaTendency = 0;
                    media.MediaOrganType = 0;
                    media.MediaWeight = 0;
                    media.MediaStyle = 1;
                    media.RegionType = 1;
                    media.IndustryIDs = industryID;
                    media.OrganizationIDs = orgID;
                    media.DepartmentIDs = departmentID;
                    media.ProxyZone = 0;
                    media.CreateTime = DateTime.Now;
                    db.Media.AddObject(media);
                    db.SaveChanges(SaveOptions.AcceptAllChangesAfterSave);
                }
                var mediaID = media.MediaID;

                // Add the Site (matched by name) if it does not exist yet.
                var site = db.Site.FirstOrDefault(model => model.Name == name);
                if (site == null)
                {
                    site = db.Site.CreateObject();
                    site.SiteID = CrawlDTO.NewGuid;
                    site.MediaID = mediaID;
                    site.UrlEncoding = "UTF-8";
                    site.ParallelWithOtherCrawler = false;
                    site.TimeoutSecs = 10;
                    site.EncodingResponse = "UTF-8";
                    site.LoginUseWebBrowser = false;
                    site.IsVisible = true;
                    site.ListItemCountPerPage = 20;
                    site.Name = media.MediaName;
                    site.ParseMethod = 1; // XPath crawling
                    site.CreateTime = DateTime.Now;
                    site.ListPattern = "";
                    site.ItemPattern = "";
                    site.ContentDetailLevel = 2;
                    db.Site.AddObject(site);
                    db.SaveChanges(SaveOptions.AcceptAllChangesAfterSave);
                }
                var siteID = site.SiteID;

                // Add or update the Crawl (matched by URL).
                var crawl = db.Crawl.FirstOrDefault(model => model.Url == url);
                bool isNewCrawl = crawl == null;
                if (isNewCrawl)
                {
                    crawl = db.Crawl.CreateObject();
                    crawl.CrawlID = CrawlDTO.NewGuid;
                }
                crawl.SiteID = siteID;
                crawl.Name = name;
                crawl.IssueID = "ECO";
                crawl.ListDrillMethod = 1;
                crawl.CrawlType = 1; // crawl list and items
                crawl.Url = url;
                crawl.IntervalMins = 1440;
                crawl.IntervalStrategy = 1;
                crawl.ExistItemStrategy = 2;
                crawl.MaxRetriveDays = 180;
                crawl.RequiredCount = 20;
                crawl.InitRequiredCount = 20;
                // NOTE(review): CreateTime is also reset when an existing crawl is updated;
                // confirm that this is wanted.
                crawl.CreateTime = DateTime.Now;
                crawl.NextCrawlTime = DateTime.Now;
                crawl.MediaType = media.MediaType;
                crawl.MediaRegionType = media.RegionType;
                crawl.MediaID = mediaID;
                crawl.LastCrawlNewCount = 0;
                crawl.FollowIntervalMins = 60;
                crawl.FollowMinReplyLen = 6;
                crawl.MediaMapToChannel = false;
                crawl.MediaRecordNew = true;
                crawl.SaveSummary = true;
                crawl.SaveHtml = true;
                crawl.SaveContent = true;
                crawl.ReleaseAutoFormat = true;
                crawl.OrganizationIDs = orgID;
                crawl.DepartmentIDs = departmentID;
                crawl.IndustryIDs = industryID;
                if (isNewCrawl)
                {
                    // Only a newly created entity may be added. A crawl loaded by the query
                    // above is already tracked by the context, and adding it again makes the
                    // context throw, which used to fail every re-imported row.
                    db.Crawl.AddObject(crawl);
                }
                db.SaveChanges(SaveOptions.AcceptAllChangesAfterSave);
            }
        }
        catch (Exception ex)
        {
            // A broken row must not abort the whole import.
            Logger.Error("行" + currentLine + "错误", ex);
        }

        Statuslbl.Text = "当前正导入第" + currentLine + "条记录";
        Application.DoEvents(); // let the status label repaint during the long-running loop
        currentLine++;
    }
}

/// <summary>
/// Returns the ID of the department with the given name, creating and saving it first when
/// it does not exist yet.
/// </summary>
private static string GetOrCreateDepartmentID(PalasDB db, string departmentName)
{
    var department = db.Department.FirstOrDefault(model => model.DepartmentName == departmentName);
    if (department == null)
    {
        department = db.Department.CreateObject();
        department.DepartmentName = departmentName;
        department.Order = 0;
        department.DepartmentID = CrawlDTO.NewGuid;
        db.Department.AddObject(department);
        db.SaveChanges(SaveOptions.AcceptAllChangesAfterSave);
    }
    return department.DepartmentID;
}
/// <summary>
/// Collects the count history of the items of every crawl whose ID starts with "Weibo"
/// (except "WeiboSubscribe") and flattens it into one <c>DetailResult</c> per history entry.
/// </summary>
/// <param name="startDate">Lower bound (inclusive) applied to the item's <c>FetchTime</c>.</param>
/// <param name="endDate">Upper bound (inclusive) applied to the item's <c>FetchTime</c>.</param>
/// <param name="leaderInfoMapping">Looked up by author name to flag and attach opinion-leader info.</param>
/// <param name="focusEventMapping">Looked up by crawl name to set the <c>GovProcess</c> flag.</param>
/// <param name="startCnt">First <c>CountHistory</c> size queried (inclusive).</param>
/// <param name="endCnt">End of the queried <c>CountHistory</c> sizes (exclusive).</param>
/// <returns>One entry per (item, history entry) pair; empty when nothing matches.</returns>
private DetailResult[] GetHistoryData(DateTime startDate, DateTime endDate, Dictionary<string, LeaderInfo> leaderInfoMapping, Dictionary<string, FocusEvent> focusEventMapping, int startCnt = 10, int endCnt = 30)
{
    List<DetailResult> list = new List<DetailResult>();

    // One context for the whole export instead of a new context per item.
    using (PalasDB db = new PalasDB())
    {
        string[] crawlIDs = db.Crawl.Where(model => model.CrawlID.StartsWith("Weibo")).Select(model => model.CrawlID).ToArray();
        crawlIDs = crawlIDs.Where(model => model != "WeiboSubscribe").ToArray();

        foreach (var crawlID in crawlIDs)
        {
            // Every item fetched below is selected by CrawlID == crawlID, so the crawl row is
            // the same for all of them: load it once instead of once per item.
            string currentCrawlID = crawlID;
            var crawl = db.Crawl.FirstOrDefault(model => model.CrawlID == currentCrawlID);

            for (int i = startCnt; i < endCnt; i++)
            {
                var query = Query.And(
                    Query.EQ("CrawlID", crawlID),
                    Query.GTE("FetchTime", startDate),
                    Query.LTE("FetchTime", endDate),
                    Query.Size("CountHistory", i));

                Item[] result = null;
                try
                {
                    result = MongoItemAccess.Items.Find(query)
                        .SetFields("ItemID", "CountHistory", "CrawlID", "Url", "CleanTitle", "MediaID", "PubDate", "ParentItemID", "CleanText", "AuthorName", "AuthorID", "AuthorCertificated", "Source", "AttachUrl")
                        .ToArray();
                }
                catch (Exception)
                {
                    // Best effort: a failing query only skips this history size.
                    continue;
                }

                foreach (var item in result)
                {
                    // Copy the item's own fields.
                    DetailResult dest = new DetailResult();
                    item.CopyTo(ref dest);

                    // Copy the media fields.
                    var media = db.Media.FirstOrDefault(model => model.MediaID == item.MediaID);
                    if (media != null)
                    {
                        media.CopyTo(ref dest);
                    }

                    if (crawl != null)
                    {
                        dest.KeywordQuery = crawl.KeywordQuery;
                        dest.CrawlName = crawl.Name;
                    }

                    // Match opinion leaders by author name.
                    LeaderInfo info;
                    bool isPublicLeader = leaderInfoMapping.TryGetValue(dest.AuthorName ?? "", out info);
                    dest.IsPublicLeader = isPublicLeader ? 1 : 0;
                    if (isPublicLeader)
                    {
                        dest.LeaderInfo = info;
                    }

                    // Match events by crawl name; only the presence of a match is recorded.
                    FocusEvent focus;
                    bool isGovProcess = focusEventMapping.TryGetValue(dest.CrawlName ?? "", out focus);
                    dest.GovProcess = isGovProcess ? 1 : 0;
                    dest.HasAttachUrl = !string.IsNullOrEmpty(dest.AttachUrl) ? 1 : 0;

                    // Emit one clone per history entry, carrying that entry's counters.
                    foreach (var itemCountData in item.CountHistory)
                    {
                        var clonedDest = dest.SwallowClone();
                        clonedDest.CurrentHistoryFetchTime = itemCountData.FetchTime;
                        clonedDest.ViewCount = itemCountData.ViewCount;
                        clonedDest.ReplyCount = itemCountData.ReplyCount;
                        clonedDest.ForwardCount = itemCountData.ForwardCount;
                        list.Add(clonedDest);
                    }
                }
            }
        }
    }

    return list.ToArray();
}
/// <summary>
/// Exports the history data of the selected date range. For every crawl two CSV files are
/// written next to the executable: "&lt;CrawlID&gt;.csv" with the item details and
/// "&lt;CrawlID&gt;search.csv" with the crawl log (print count and crawl time) of the same range.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void ExportBtn_Click(object sender, EventArgs e)
{
    var leaders = GetLeaderInfo();
    var focusEvents = GetEventInfo();
    var details = GetHistoryData(StartDatePicker.Value, EndDatePicker.Value, leaders, focusEvents, (int)MinUpdown.Value, (int)MaxUpdown.Value);
    CalcField(details);

    var outputDirectory = Path.GetDirectoryName(Application.ExecutablePath);

    foreach (var crawlGroup in details.GroupBy(detail => detail.CrawlID))
    {
        // Detail file: the original item IDs are replaced by a running number (starting at 1
        // within each crawl) that is shared by all rows of the same item.
        var detailPath = Path.Combine(outputDirectory, crawlGroup.Key + ".csv");
        var detailBuilder = new FilterFieldCsvOutputFormatBuilder(detailPath, ",");
        int sequence = 1;
        foreach (var rowsOfOneItem in crawlGroup.GroupBy(detail => detail.ItemID))
        {
            foreach (var detail in rowsOfOneItem)
            {
                detail.ItemID = sequence.ToString();
                BuildOneItem(detail, detailBuilder);
            }
            sequence++;
        }
        detailBuilder.Output();

        // Crawl-log file for the same crawl and date range, ordered by crawl time.
        using (PalasDB db = new PalasDB())
        {
            var logPath = Path.Combine(outputDirectory, crawlGroup.Key + "search.csv");
            var logBuilder = new CsvOutputFormatBuilder(logPath, ",");

            string crawlID = crawlGroup.Key;
            var crawlLogs = db.CrawlLog
                .Where(model => model.CrawlID == crawlID
                                && model.CrawlTime >= StartDatePicker.Value
                                && model.CrawlTime <= EndDatePicker.Value)
                .OrderBy(model => model.CrawlTime);

            foreach (var crawlLog in crawlLogs)
            {
                logBuilder.Build("PrintCount", crawlLog.PrintCount);
                logBuilder.Build("CrawlTime", crawlLog.CrawlTime.ToString("yyyyMMddHHmmss"));
                logBuilder.NewLine();
            }
            logBuilder.Output();
        }
    }

    MessageBox.Show("导出成功");
}