/// <summary>
/// Builds a status snapshot: host data plus one <c>CrawlData</c> entry per crawler pipeline,
/// each enriched with the last crawl time, name and URL of the matching <c>Crawl</c> row when one exists.
/// </summary>
/// <returns>The host data together with the per-pipeline crawl entries.</returns>
public CrawlStatusData GetCurrentCrawlStatus()
        {
            var infos  = CrawlerManager.CrawlerFactory.Pipelines.Select(model => model.Info).ToArray();
            var result = new CrawlStatusData()
            {
                HostData = HostBasicData.GetHostData()
            };

            // Materialize the projection exactly once. A deferred query would build brand-new
            // CrawlData objects on every enumeration, discarding the values filled in below.
            var data = infos.Select(item => new CrawlData
            {
                CrawlID  = item.CurrentJob_Name,
                Message  = item.Message,
                JobCount = item.JobCount
            }).ToArray();

            using (PalasDB db = new PalasDB())
            {
                foreach (var crawlData in data)
                {
                    try
                    {
                        var crawlID = crawlData.CrawlID;
                        var crawl   = db.Crawl.FirstOrDefault(model => model.CrawlID == crawlID);
                        if (crawl == null)
                        {
                            // No matching crawl row: keep the entry with the pipeline data only.
                            continue;
                        }
                        crawlData.LastCrawlTime = crawl.LastCrawlTime;
                        crawlData.Name          = crawl.Name;
                        crawlData.Url           = crawl.Url;
                    }
                    catch (System.Exception ex)
                    {
                        // Enrichment is best effort: a failed lookup must not break the status report,
                        // but it should leave a trace instead of vanishing silently.
                        System.Diagnostics.Trace.TraceWarning("GetCurrentCrawlStatus: lookup failed for crawl '" + crawlData.CrawlID + "': " + ex);
                    }
                }
            }
            result.CrawlData = data;
            return(result);
        }
Esempio n. 2
0
        /// <summary>
        /// Writes one row per top-level media entry (no parent media ID) into the worksheet at index 1
        /// of the daily report workbook: column 0 receives the domain derived from the media URL,
        /// column 1 the media name. The workbook is then saved back to the same path.
        /// </summary>
        private void ImportMedia()
        {
            const string reportPath = @"D:\dailyreport\日报.xlsx";

            Workbook dailybook = new Workbook();

            dailybook.Open(reportPath);
            var dailyWorksheet = dailybook.Worksheets[1];
            int dailyStartRow  = 0;

            using (PalasDB db = new PalasDB())
            {
                var result = db.Media.Where(model => model.ParentMediaID == null || model.ParentMediaID == "");
                foreach (var item in result)
                {
                    // A missing or malformed URL is an expected condition: skip the entry
                    // without going through an exception.
                    Uri uri;
                    if (!Uri.TryCreate(item.Url, UriKind.Absolute, out uri))
                    {
                        continue;
                    }

                    try
                    {
                        var host = GetUrlDomain(uri.Host);
                        dailyWorksheet.Cells[dailyStartRow, 0].PutValue(host);
                        dailyWorksheet.Cells[dailyStartRow, 1].PutValue(item.MediaName);
                        dailyStartRow++;
                    }
                    catch (Exception ex)
                    {
                        // Still best effort (the entry is skipped), but no longer silent.
                        System.Diagnostics.Trace.TraceWarning("ImportMedia: skipped media '" + item.MediaName + "': " + ex);
                    }
                }
            }
            dailybook.Save(reportPath);
        }
Esempio n. 3
0
        /// <summary>
        /// Crawls the list pages of one site for one keyword and collects the items that carry data.
        /// </summary>
        /// <param name="siteID">ID of the site whose crawl configuration is used.</param>
        /// <param name="keyword">Keyword passed to the crawler and copied into every result.</param>
        /// <param name="startDate">Items published before this date are dropped; the first page containing such an item ends the crawl.</param>
        /// <param name="maxCnt">Requested item count; with the site's items-per-page value it determines how many pages are requested.</param>
        /// <param name="CallBack">Optional progress callback, invoked after each fully processed page with
        /// (site name, keyword, pageNum + 1, pageNum * items per page).</param>
        /// <returns>The collected results; an empty list when no crawl is configured for <paramref name="siteID"/>.</returns>
        private List <ResultResponse> CrawlOneKeyword(string siteID, string keyword, DateTime startDate, int maxCnt, Action <string, string, int, int> CallBack)
        {
            Crawler.Core.Crawler.SimpleCrawler oneJob = new Core.Crawler.SimpleCrawler("test",
                                                                                       new Scheduler("nothing", new SchedulerSetting()));

            List <ResultResponse> resultList = new List <ResultResponse>();

            string crawlid;
            int    pageCnt;
            string siteName;

            using (PalasDB db = new PalasDB())
            {
                var crawl = db.Crawl.FirstOrDefault(model => model.SiteID == siteID);
                if (crawl == null || crawl.Site == null)
                {
                    // Unknown site (or crawl without site data): nothing can be crawled.
                    return(resultList);
                }
                crawlid  = crawl.CrawlID;
                pageCnt  = crawl.Site.ListItemCountPerPage;
                siteName = crawl.Site.Name;
            }

            // Without a positive page size the page count cannot be derived (the division would
            // throw); fall back to requesting a single page.
            var needPage = pageCnt > 0 ? (maxCnt / pageCnt) + 1 : 1;

            for (int pageNum = 0; pageNum < needPage; pageNum++)
            {
                var result = oneJob.CrawlOnePageList(crawlid, keyword, pageNum, "", true);
                if (result != null && result.Items != null)
                {
                    var pageItems = result.Items
                                    .Select(item => new ResultResponse
                    {
                        Count = item.DataItemCount, Data = item.DataItem, SiteName = siteName, Keyword = keyword
                    })
                                    .Where(model => model.Data != null)
                                    .ToArray();

                    // Once a page contains an item older than startDate, keep only the items
                    // that are still in range and stop crawling further pages.
                    if (pageItems.Any(model => model.Data.PubDate < startDate))
                    {
                        resultList.AddRange(pageItems.Where(model => model.Data.PubDate >= startDate));
                        return(resultList);
                    }
                    resultList.AddRange(pageItems);
                }
                if (CallBack != null)
                {
                    CallBack(siteName, keyword, pageNum + 1, pageNum * pageCnt);
                }
            }

            return(resultList);
        }
Esempio n. 4
0
 /// <summary>
 /// Looks up the region whose <c>Province</c> equals the first two characters of
 /// <paramref name="region"/> and whose <c>City</c> is null.
 /// </summary>
 /// <param name="region">Region text; only its first two characters are used.</param>
 /// <returns>The matching region ID, or null when the text is null, shorter than two characters,
 /// or no such region exists.</returns>
 private string GetRegionID(string region)
 {
     // Validate before touching the database; a one-character value would make Substring throw.
     if (string.IsNullOrEmpty(region) || region.Length < 2)
     {
         return(null);
     }

     string province = region.Substring(0, 2);
     using (PalasDB db = new PalasDB())
     {
         var regionEntity = db.Region.FirstOrDefault(model => model.Province == province && model.City == null);
         if (regionEntity == null)
         {
             return(null);
         }
         return(regionEntity.RegionID);
     }
 }
Esempio n. 5
0
        /// <summary>
        /// Prepares the form: binds the sites whose media type is SearchForum to both list controls,
        /// disables automatic column generation on the two grids and presets both start-date
        /// pickers to one month before now.
        /// </summary>
        private void Initailize()
        {
            // Bind the search lists; both controls share the same site array.
            using (var context = new PalasDB())
            {
                var searchForumSites = context.Site
                                       .Where(site => site.Media.MediaType == (sbyte)Enums.MediaType.SearchForum)
                                       .ToArray();

                SearchForumList.DataSource    = searchForumSites;
                SearchForumList.DisplayMember = "Name";
                SearchForumList.ValueMember   = "SiteID";

                SearchChkList.DataSource    = searchForumSites;
                SearchChkList.DisplayMember = "Name";
                SearchChkList.ValueMember   = "SiteID";
            }

            // Grids use their designer-defined columns only.
            ResultDataGridView.AutoGenerateColumns       = false;
            SearchResultDataGridView.AutoGenerateColumns = false;

            // Default time limit: the last month.
            startDateTime.Value      = DateTime.Now.AddMonths(-1);
            BatchStartDateTime.Value = DateTime.Now.AddMonths(-1);

            // The second tab page stays in place for now; its removal is disabled.
        }
Esempio n. 6
0
        /// <summary>
        /// Imports crawl sources from the "输入源" worksheet of D:\output\input.xlsx.
        /// Rows are read from row index 1 until the first row whose first column is empty. A failing
        /// row is logged and skipped; the status label is refreshed after every row.
        /// </summary>
        private void ImportBtn_Click(object sender, EventArgs e)
        {
            Workbook workbook = new Workbook();

            workbook.Open(@"D:\output\input.xlsx");
            Worksheet sheet       = workbook.Worksheets["输入源"];
            int       currentLine = 1;

            while (!string.IsNullOrEmpty(sheet.Cells[currentLine, 0].StringValue))
            {
                try
                {
                    ImportSourceRow(sheet, currentLine);
                }
                catch (Exception ex)
                {
                    Logger.Error("行" + currentLine + "错误", ex);
                }

                Statuslbl.Text = "当前正导入第" + currentLine + "条记录";
                // Let the UI repaint the status label while the import loop keeps running.
                Application.DoEvents();
                currentLine++;
            }
        }

        /// <summary>
        /// Imports one worksheet row: resolves industry / department / organization according to the
        /// row's type column, then stores the media, site and crawl records.
        /// Column layout: 0 channel, 1 crawl URL, 2 type, 3 sub type, 4 name, 6 site URL, 7 region,
        /// 8 industry, 9 organization level. Columns 5 and 10 are not used by the import.
        /// </summary>
        private void ImportSourceRow(Worksheet sheet, int row)
        {
            string mediaChannel = sheet.Cells[row, 0].StringValue;
            string url          = ProcessUrl(sheet.Cells[row, 1].StringValue);
            string type         = sheet.Cells[row, 2].StringValue;
            string subType      = sheet.Cells[row, 3].StringValue;
            string name         = sheet.Cells[row, 4].StringValue;
            string webUrl       = ProcessUrl(sheet.Cells[row, 6].StringValue);
            string region       = sheet.Cells[row, 7].StringValue;
            string industry     = sheet.Cells[row, 8].StringValue;
            string orgLevel     = sheet.Cells[row, 9].StringValue;

            const string economyIndustryID = "J";
            int          mediaType         = 3;
            string       departmentID      = null;
            string       orgID             = null;
            string       industryID        = null;
            string       regionID          = GetRegionID(region);

            switch (type)
            {
            // Industry information is required.
            case "行业协会":
            case "行业信息":
            {
                industryID = string.IsNullOrEmpty(industry) ? null : industry.Substring(0, 1);
                if (subType == "行业协会")
                {
                    using (PalasDB db = new PalasDB())
                    {
                        orgID = FindOrCreateOrganization(db, name, regionID, null, industryID, org =>
                        {
                            org.OrgLevel     = (sbyte)Enums.OrgLevel.Province;
                            org.RegionLevel  = (sbyte)Enums.RegionLevel.Province;
                            org.SocialSystem = (sbyte)Enums.SocialSystem.CompanyNational;
                            org.StockListing = (sbyte)Enums.StockListing.NonIPO;
                        });
                    }
                }
                break;
            }

            // Department information is required.
            case "机构":
            case "上市公司":
            {
                industryID = economyIndustryID;
                using (PalasDB db = new PalasDB())
                {
                    departmentID = FindOrCreateDepartment(db, subType);
                    orgID        = FindOrCreateOrganization(db, name, regionID, departmentID, industryID, org =>
                    {
                        org.OrgLevel     = (sbyte)Enums.OrgLevel.Province;
                        org.RegionLevel  = (sbyte)Enums.RegionLevel.Province;
                        org.SocialSystem = (sbyte)Enums.SocialSystem.CompanyNational;
                        if (type == "上市公司")
                        {
                            org.StockListing = (sbyte)Enums.StockListing.Shanghai_A;
                        }
                        else
                        {
                            org.StockListing = (sbyte)Enums.StockListing.NonIPO;
                        }
                    });
                }
                break;
            }

            // Organization level is required.
            case "政府机构":
            {
                industryID = null;
                using (PalasDB db = new PalasDB())
                {
                    departmentID = FindOrCreateDepartment(db, subType);
                    orgID        = FindOrCreateOrganization(db, name, regionID, departmentID, industryID, org =>
                    {
                        // Parsed only when the organization has to be created, so an unparsable
                        // level never fails a row whose organization already exists.
                        org.OrgLevel     = sbyte.Parse(orgLevel.Substring(0, 1));
                        // NOTE(review): the literal ends with a space — confirm it matches the sheet data.
                        org.RegionLevel  = (sbyte)(region == "全国 " ? Enums.RegionLevel.National : Enums.RegionLevel.Province);
                        org.SocialSystem = (sbyte)Enums.SocialSystem.Government;
                        org.StockListing = (sbyte)Enums.StockListing.NonIPO;
                    });
                }
                break;
            }

            // Ordinary media.
            default:
            {
                // NOTE(review): the literal ends with a space — confirm it matches the sheet data.
                if (subType == "专业财经 ")
                {
                    industryID = economyIndustryID;
                }
                mediaType = (int)DeterminedMediaType(type);
                break;
            }
            }

            SaveMediaSiteAndCrawl(name, url, webUrl, mediaChannel, mediaType, industryID, orgID, departmentID);
        }

        /// <summary>
        /// Returns the ID of the department with the given name, creating and saving it first when
        /// no such department exists.
        /// </summary>
        private static string FindOrCreateDepartment(PalasDB db, string departmentName)
        {
            var department = db.Department.FirstOrDefault(model => model.DepartmentName == departmentName);
            if (department == null)
            {
                department = db.Department.CreateObject();
                department.DepartmentName = departmentName;
                department.Order          = 0;
                department.DepartmentID   = CrawlDTO.NewGuid;
                db.Department.AddObject(department);
                db.SaveChanges(SaveOptions.AcceptAllChangesAfterSave);
            }
            return(department.DepartmentID);
        }

        /// <summary>
        /// Returns the ID of the organization with the given full name, creating and saving it first
        /// when it does not exist. <paramref name="configure"/> sets the type-specific fields and is
        /// invoked only for a newly created organization.
        /// </summary>
        private static string FindOrCreateOrganization(PalasDB db, string fullName, string regionID, string departmentID, string industryID, Action <Organization> configure)
        {
            Organization org = db.Organization.FirstOrDefault(model => model.FullName == fullName);
            if (org == null)
            {
                org = db.Organization.CreateObject();
                org.OrganizationID = CrawlDTO.NewGuid;
                org.FullName       = fullName;
                org.ShortName      = fullName;
                org.RegionID       = regionID ?? "10000";
                org.DepartmentID   = departmentID;
                org.IndustryID     = industryID;
                configure(org);
                db.Organization.AddObject(org);
                db.SaveChanges(SaveOptions.AcceptAllChangesAfterSave);
            }
            return(org.OrganizationID);
        }

        /// <summary>
        /// Creates the media (matched by name) and the site (matched by name) when missing, then
        /// creates or updates the crawl matched by URL and saves it.
        /// </summary>
        private void SaveMediaSiteAndCrawl(string name, string url, string webUrl, string mediaChannel, int mediaType, string industryID, string orgID, string departmentID)
        {
            using (PalasDB db = new PalasDB())
            {
                // Media
                var media = db.Media.FirstOrDefault(model => model.MediaName == name);
                if (media == null)
                {
                    media                 = db.Media.CreateObject();
                    media.MediaID         = CrawlDTO.NewGuid;
                    media.MediaName       = name;
                    media.Url             = webUrl;
                    media.Channel         = mediaChannel;
                    media.MediaType       = (sbyte)mediaType;
                    media.MediaTendency   = 0;
                    media.MediaOrganType  = 0;
                    media.MediaWeight     = 0;
                    media.MediaStyle      = 1;
                    media.RegionType      = 1;
                    media.IndustryIDs     = industryID;
                    media.OrganizationIDs = orgID;
                    media.DepartmentIDs   = departmentID;
                    media.ProxyZone       = 0;
                    media.CreateTime      = DateTime.Now;
                    db.Media.AddObject(media);
                    db.SaveChanges(SaveOptions.AcceptAllChangesAfterSave);
                }
                var mediaID = media.MediaID;

                // Site
                var site = db.Site.FirstOrDefault(model => model.Name == name);
                if (site == null)
                {
                    site             = db.Site.CreateObject();
                    site.SiteID      = CrawlDTO.NewGuid;
                    site.MediaID     = mediaID;
                    site.UrlEncoding = "UTF-8";
                    site.ParallelWithOtherCrawler = false;
                    site.TimeoutSecs          = 10;
                    site.EncodingResponse     = "UTF-8";
                    site.LoginUseWebBrowser   = false;
                    site.IsVisible            = true;
                    site.ListItemCountPerPage = 20;
                    site.Name               = media.MediaName;
                    site.ParseMethod        = 1; // XPath parsing
                    site.CreateTime         = DateTime.Now;
                    site.ListPattern        = "";
                    site.ItemPattern        = "";
                    site.ContentDetailLevel = 2;
                    db.Site.AddObject(site);
                    db.SaveChanges(SaveOptions.AcceptAllChangesAfterSave);
                }
                var siteID = site.SiteID;

                // Crawl
                var  crawl      = db.Crawl.FirstOrDefault(model => model.Url == url);
                bool isNewCrawl = crawl == null;
                if (isNewCrawl)
                {
                    crawl         = db.Crawl.CreateObject();
                    crawl.CrawlID = CrawlDTO.NewGuid;
                }
                crawl.SiteID = siteID;

                crawl.Name = name;

                crawl.IssueID         = "ECO";
                crawl.ListDrillMethod = 1;

                crawl.CrawlType = 1; // Crawl list and items

                crawl.Url               = url;
                crawl.IntervalMins      = 1440;
                crawl.IntervalStrategy  = 1;
                crawl.ExistItemStrategy = 2;
                crawl.MaxRetriveDays    = 180;

                crawl.RequiredCount      = 20;
                crawl.InitRequiredCount  = 20;
                crawl.CreateTime         = DateTime.Now;
                crawl.NextCrawlTime      = DateTime.Now;
                crawl.MediaType          = media.MediaType;
                crawl.MediaRegionType    = media.RegionType;
                crawl.MediaID            = mediaID;
                crawl.LastCrawlNewCount  = 0;
                crawl.FollowIntervalMins = 60;
                crawl.FollowMinReplyLen  = 6;
                crawl.MediaMapToChannel  = false;
                crawl.MediaRecordNew     = true;
                crawl.SaveSummary        = true;
                crawl.SaveHtml           = true;
                crawl.SaveContent        = true;
                crawl.ReleaseAutoFormat  = true;
                crawl.OrganizationIDs    = orgID;
                crawl.DepartmentIDs      = departmentID;
                crawl.IndustryIDs        = industryID;

                // Only a newly created crawl is added to the set. An existing crawl was loaded
                // through this context, so SaveChanges persists its modified fields directly.
                // NOTE(review): assumes PalasDB is an Entity Framework ObjectContext, which rejects
                // AddObject for an entity it already tracks as unchanged/modified — confirm.
                if (isNewCrawl)
                {
                    db.Crawl.AddObject(crawl);
                }
                db.SaveChanges(SaveOptions.AcceptAllChangesAfterSave);
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Collects detail rows for every crawl whose ID starts with "Weibo" (except "WeiboSubscribe").
        /// For each crawl and each CountHistory size in [startCnt, endCnt) the matching Mongo items with
        /// FetchTime between <paramref name="startDate"/> and <paramref name="endDate"/> are loaded,
        /// enriched with media / crawl / leader / event data, and expanded into one row per
        /// CountHistory entry.
        /// </summary>
        /// <param name="startDate">Lower FetchTime bound (inclusive).</param>
        /// <param name="endDate">Upper FetchTime bound (inclusive).</param>
        /// <param name="leaderInfoMapping">Leader info looked up by author name.</param>
        /// <param name="focusEventMapping">Focus events looked up by crawl name.</param>
        /// <param name="startCnt">First CountHistory size queried (inclusive).</param>
        /// <param name="endCnt">Last CountHistory size queried (exclusive).</param>
        /// <returns>One row per item and CountHistory entry.</returns>
        private DetailResult[] GetHistoryData(DateTime startDate, DateTime endDate, Dictionary <string, LeaderInfo> leaderInfoMapping, Dictionary <string, FocusEvent> focusEventMapping, int startCnt = 10, int endCnt = 30)
        {
            string[] weiboCrawlIDs;
            using (PalasDB db = new PalasDB())
            {
                weiboCrawlIDs = db.Crawl.Where(model => model.CrawlID.StartsWith("Weibo")).Select(model => model.CrawlID).ToArray();
            }

            var rows = new List <DetailResult>();

            foreach (var crawlID in weiboCrawlIDs)
            {
                // The subscribe crawl is excluded in memory, after the database query.
                if (crawlID == "WeiboSubscribe")
                {
                    continue;
                }

                for (int historySize = startCnt; historySize < endCnt; historySize++)
                {
                    var query = Query.And(
                        Query.EQ("CrawlID", crawlID),
                        Query.GTE("FetchTime", startDate),
                        Query.LTE("FetchTime", endDate),
                        Query.Size("CountHistory", historySize));

                    Item[] items;
                    try
                    {
                        items = MongoItemAccess.Items
                                .Find(query)
                                .SetFields("ItemID", "CountHistory", "CrawlID", "Url", "CleanTitle", "MediaID",
                                           "PubDate", "ParentItemID", "CleanText", "AuthorName", "AuthorID", "AuthorCertificated", "Source", "AttachUrl")
                                .ToArray();
                    }
                    catch (Exception)
                    {
                        // A failed query skips this history size and moves on to the next one.
                        continue;
                    }

                    foreach (var item in items)
                    {
                        DetailResult dest = new DetailResult();
                        item.CopyTo(ref dest);  // copy the item's own fields

                        // Media and crawl fields
                        using (PalasDB db = new PalasDB())
                        {
                            var media = db.Media.FirstOrDefault(model => model.MediaID == item.MediaID);
                            if (media != null)
                            {
                                media.CopyTo(ref dest);
                            }
                            var crawl = db.Crawl.FirstOrDefault(model => model.CrawlID == item.CrawlID);
                            if (crawl != null)
                            {
                                dest.KeywordQuery = crawl.KeywordQuery;
                                dest.CrawlName    = crawl.Name;
                            }
                        }

                        // Opinion-leader match by author name
                        LeaderInfo leader;
                        if (leaderInfoMapping.TryGetValue(dest.AuthorName ?? "", out leader))
                        {
                            dest.IsPublicLeader = 1;
                            dest.LeaderInfo     = leader;
                        }
                        else
                        {
                            dest.IsPublicLeader = 0;
                        }

                        // Event match by crawl name; only the presence of an entry matters
                        if (focusEventMapping.ContainsKey(dest.CrawlName ?? ""))
                        {
                            dest.GovProcess = 1;
                        }
                        else
                        {
                            dest.GovProcess = 0;
                        }

                        dest.HasAttachUrl = string.IsNullOrEmpty(dest.AttachUrl) ? 0 : 1;

                        // One output row per history entry, each a clone carrying that entry's counters
                        foreach (var snapshot in item.CountHistory)
                        {
                            var row = dest.SwallowClone();
                            row.CurrentHistoryFetchTime = snapshot.FetchTime;
                            row.ViewCount    = snapshot.ViewCount;
                            row.ReplyCount   = snapshot.ReplyCount;
                            row.ForwardCount = snapshot.ForwardCount;
                            rows.Add(row);
                        }
                    }
                }
            }
            return(rows.ToArray());
        }
Esempio n. 8
0
        /// <summary>
        /// Exports the history data of the selected date range. For every crawl ID two CSV files are
        /// written next to the executable: "&lt;CrawlID&gt;.csv" with the detail rows (item IDs replaced by a
        /// running number per distinct item) and "&lt;CrawlID&gt;search.csv" with the PrintCount / CrawlTime
        /// of the crawl-log entries in that range, ordered by crawl time. A message box confirms completion.
        /// </summary>
        private void ExportBtn_Click(object sender, EventArgs e)
        {
            var leaders = GetLeaderInfo();
            var events  = GetEventInfo();
            var details = GetHistoryData(StartDatePicker.Value, EndDatePicker.Value, leaders, events, (int)MinUpdown.Value, (int)MaxUpdown.Value);

            CalcField(details);

            // A disabled earlier variant wrote all rows through an ExcelOutputFormatBuilder
            // into a single "时点数.xlsx" file; the export is now one CSV pair per crawl.
            foreach (var crawlGroup in details.GroupBy(detail => detail.CrawlID))
            {
                string outputDir = Path.GetDirectoryName(Application.ExecutablePath);

                var detailPath    = Path.Combine(outputDir, crawlGroup.Key + ".csv");
                var detailBuilder = new FilterFieldCsvOutputFormatBuilder(detailPath, ",");

                // Rows of the same item share one running number, starting at 1.
                int sequence = 0;
                foreach (var sameItemRows in crawlGroup.GroupBy(detail => detail.ItemID))
                {
                    sequence++;
                    string sequenceText = sequence.ToString();
                    foreach (var row in sameItemRows)
                    {
                        row.ItemID = sequenceText;
                        BuildOneItem(row, detailBuilder);
                    }
                }
                detailBuilder.Output();

                using (PalasDB db = new PalasDB())
                {
                    var logPath    = Path.Combine(outputDir, crawlGroup.Key + "search.csv");
                    var logBuilder = new CsvOutputFormatBuilder(logPath, ",");

                    // Plain locals keep the database query free of loop and control references.
                    string crawlID    = crawlGroup.Key;
                    var    rangeStart = StartDatePicker.Value;
                    var    rangeEnd   = EndDatePicker.Value;

                    var crawlLogs = db.CrawlLog
                                    .Where(model => model.CrawlID == crawlID && model.CrawlTime >= rangeStart && model.CrawlTime <= rangeEnd)
                                    .OrderBy(model => model.CrawlTime);
                    foreach (var crawlLog in crawlLogs)
                    {
                        logBuilder.Build("PrintCount", crawlLog.PrintCount);
                        logBuilder.Build("CrawlTime", crawlLog.CrawlTime.ToString("yyyyMMddHHmmss"));
                        logBuilder.NewLine();
                    }
                    logBuilder.Output();
                }
            }

            MessageBox.Show("导出成功");
        }