コード例 #1
0
        /// <summary>
        /// Crawls the site whose start URL is carried by <paramref name="ci"/>.
        /// Crawl depth and the include/exclude filters are looked up from
        /// <c>CrawlArgs</c> by the recognized site type; the thread count is fixed at 5.
        /// </summary>
        /// <param name="ci">
        /// Crawl job description. Its <c>url.Url</c> string is the start address, and the
        /// object itself is handed to the <c>MyPipelineStep</c> that processes each page.
        /// </param>
        private void DO(CrawlerInfo ci)
        {
            var uri      = new Uri(ci.url.Url);
            var siteType = HtmlParse.RecogSite(uri);

            // The crawler is disposable (the NCrawler samples wrap it in a using block),
            // so release it deterministically once the crawl is over.
            using (var c = new NCrawler.Crawler(uri, new HtmlDocumentProcessor(),
                                                new MyPipelineStep(ci))
            {
                MaximumCrawlDepth  = CrawlArgs.CrawlDepth(siteType),
                MaximumThreadCount = 5,
                IncludeFilter      = CrawlArgs.IncludeFilter(siteType),
                ExcludeFilter      = CrawlArgs.ExcludeFilter(siteType),
            })
            {
                c.Crawl();
            }
        }
コード例 #2
0
        /// <summary>
        /// Parses the crawled page held in <paramref name="propertyBag"/> and stores every
        /// record found via <c>DAL.Data.Add</c>, incrementing <c>ci.Count</c> once per record.
        /// Pages for which <c>Parse</c> returns <c>null</c> are ignored.
        /// </summary>
        /// <param name="crawler">The crawler invoking this step; not used by this method.</param>
        /// <param name="propertyBag">Crawl result supplying the response, the HTML and the response URI.</param>
        public void Process(NCrawler.Crawler crawler, PropertyBag propertyBag)
        {
            // NOTE(review): the value is never read; the call is kept only so that
            // behavior stays unchanged — confirm it can be dropped.
            var rsp = propertyBag.GetResponse();

            try
            {
                HtmlDocument htmlDoc  = HtmlParse.LoadFromHtml(propertyBag);
                var          siteType = HtmlParse.RecogSite(propertyBag.ResponseUri);
                var          records  = Parse(htmlDoc, siteType);
                if (records == null)
                {
                    return;
                }
                foreach (var record in records)
                {
                    DAL.Data.Add(record);
                    ++ci.Count;
                }
            }
            catch (NullReferenceException ex)
            {
                // Best-effort crawl: a page that cannot be parsed is skipped (records
                // stored before the failure stay stored), but leave a diagnostic
                // instead of dropping the page without any trace.
                System.Diagnostics.Trace.TraceWarning(
                    "Skipped page {0}: {1}", propertyBag.ResponseUri, ex);
            }
        }
コード例 #3
0
        /// <summary>
        /// Parses an HTML table column by column, using the first full-width row as the header.
        /// The widest row (by <c>td</c> count) defines the column count. For every column the
        /// first cell becomes the dictionary key and the remaining cells become its values,
        /// with runs of whitespace collapsed to a single space.
        /// Rows with fewer cells than the column count are treated as caption rows: their text
        /// is passed to <c>HtmlParse.ParseProjectName</c> and any non-null result is appended
        /// to the list stored under the key <c>"Title"</c>.
        /// </summary>
        /// <param name="tableRoot">The <c>table</c> node; its <c>tbody</c> child is used when present.</param>
        /// <returns>
        /// Column values keyed by header text. The <c>"Title"</c> entry always exists (possibly
        /// empty). When two columns share a header, only the first one is kept.
        /// </returns>
        public static Dictionary <string, List <string> > ParseTable(HtmlNode tableRoot)
        {
            var content = new Dictionary <string, List <string> >();

            content.Add("Title", new List <string>());
            var tbody = tableRoot.SelectSingleNode("./tbody") ?? tableRoot;

            // SelectNodes yields null rather than an empty collection when nothing matches,
            // so a table without rows must be handled before iterating.
            var rows = tbody.SelectNodes("./tr");
            if (rows == null)
            {
                return(content);
            }

            // Pass 1: the widest row determines how many columns the table has.
            var columnCnt = 0;
            foreach (var tr in rows)
            {
                var tds = tr.SelectNodes("./td");
                if (tds != null && tds.Count > columnCnt)
                {
                    columnCnt = tds.Count;
                }
            }
            if (columnCnt <= 0)
            {
                return(content);
            }

            var tbodyContent = new List <List <string> >(columnCnt);
            for (var i = 0; i < columnCnt; i++)
            {
                tbodyContent.Add(new List <string>());
            }

            // Pass 2: distribute the cells of full-width rows into their columns.
            foreach (var tr in rows)
            {
                var tds   = tr.SelectNodes("./td");
                // A row without any td (e.g. th-only) counts as zero cells instead of crashing.
                var count = tds == null ? 0 : tds.Count;
                if (count < columnCnt)
                {
                    var product = HtmlParse.ParseProjectName(tr.InnerText);
                    if (product != null)
                    {
                        content["Title"].Add(product);
                    }
                    continue;
                }
                for (var i = 0; i < count; i++)
                {
                    tbodyContent[i].Add(Regex.Replace(tds[i].InnerText, "\\s+", " "));
                }
            }

            // The first cell of each column is its header; the rest are the values.
            foreach (var items in tbodyContent)
            {
                if (items.Count == 0)
                {
                    continue;
                }
                string key = items[0];
                if (content.ContainsKey(key))
                {
                    // Duplicate header: keep the column that was registered first.
                    continue;
                }
                items.RemoveAt(0);
                content.Add(key, items);
            }
            return(content);
        }
コード例 #4
0
        /// <summary>
        /// Extracts award records from an announcement page.
        /// The page title must be at least 8 characters long, contain one of
        /// 公告 / 公示 / 中标 and contain neither 流标 nor 废标; otherwise the page is rejected.
        /// Depending on the site, the content is read either from a table (via
        /// <c>HtmlParse.ParseTable</c>) or from free text (via <c>HtmlParse.ParseFacilities</c>
        /// and <c>HtmlParse.ParseMoney</c>).
        /// </summary>
        /// <param name="doc">The loaded HTML document.</param>
        /// <param name="siteType">Site the document came from; selects the content node and parsing mode.</param>
        /// <returns>
        /// <c>null</c> when the title is rejected, the site type is not supported, or no project
        /// name could be determined; an empty list when the expected content node is missing;
        /// otherwise one record per winner entry. Entries shorter than 5 characters (after
        /// removing <c>&amp;nbsp;</c>) are reported as the literal <c>"NULL"</c>.
        /// </returns>
        public static List <Model.Data> Parse(HtmlDocument doc, SiteType siteType)
        {
            var result   = new List <Model.Data>();
            var title    = HtmlParse.ParseTitle(doc, siteType);
            var rootNode = doc.DocumentNode;

            // A missing title is rejected the same way as an unsuitable one.
            if (title == null || title.Length < 8 ||
                !Regex.IsMatch(title, "(公告|公示|中标)") || Regex.IsMatch(title, "(流标|废标)"))
            {
                return(null);
            }

            var      contentType = ContentType.Unrecognized;
            HtmlNode contentNode = null;

            switch (siteType)
            {
            case SiteType.NanChang:
            case SiteType.GanZhou:
                contentNode = rootNode.SelectSingleNode("//table[@class='MsoNormalTable']");
                contentType = ContentType.Table;
                break;

            case SiteType.JingDeZhen:
                contentNode = rootNode.SelectSingleNode("//*[@id='MyContent']");
                contentType = ContentType.Paragraph;
                break;

            case SiteType.JiangXi:
                contentNode = rootNode.SelectSingleNode("//*[@id='TDContent']//*[@class='MsoNormalTable']");
                contentType = ContentType.Table;
                break;

            case SiteType.PinXiang:
                // The original expression lacked a closing ']' and misspelled "table",
                // which made the XPath invalid.
                contentNode = rootNode.SelectSingleNode("//table[@align='center']//table[@align='center']");
                contentType = ContentType.Table;
                break;

            case SiteType.FuZhou:
                contentNode = rootNode.SelectSingleNode("//body");
                contentType = ContentType.Paragraph;
                break;

            default:
                return(null);
            }

            var product    = HtmlParse.ParseProjectName(title);
            var facilities = new List <string>();
            var money      = 0;
            var date       = HtmlParse.ParseDate(doc, siteType);

            // Both parsing modes need the content node; without it there is nothing to report.
            if (contentNode == null)
            {
                return(result);
            }

            if (contentType == ContentType.Table)
            {
                // Table-style page: columns are matched by their header text.
                var content = HtmlParse.ParseTable(contentNode);
                List <string> titles;
                if (content.TryGetValue("Title", out titles) && titles.Count > 0)
                {
                    // The page title wins; the table caption is only a fallback.
                    product = product ?? titles[0];
                }
                foreach (var item in content)
                {
                    if (Regex.IsMatch(item.Key, "商|单位|公司"))
                    {
                        facilities = item.Value;
                    }
                    else if (Regex.IsMatch(item.Key, "金额"))
                    {
                        money = item.Value.Count > 0 ? HtmlParse.ParseMoney(item.Value[0]) : 0;
                    }
                }
            }
            else if (contentType == ContentType.Paragraph)
            {
                // Text-style page: winners and amount are extracted from the plain text.
                var text = contentNode.InnerText;
                facilities = HtmlParse.ParseFacilities(text) ?? new List <string>();
                money      = HtmlParse.ParseMoney(text);
            }

            if (product == null)
            {
                return(null);
            }

            var projectName = product.Trim();
            foreach (var facility in facilities)
            {
                // Strip the non-breaking-space entity first; the length check applies to
                // the stripped (not yet trimmed) text.
                var cleaned = (facility ?? "").Replace("&nbsp;", "");
                var winCom  = cleaned.Length < 5 ? "NULL" : cleaned.Trim();
                try
                {
                    result.Add(new Model.Data()
                    {
                        ProjectName = projectName,
                        WinCom      = winCom,
                        Money       = money,
                        Time        = date,
                    });
                }
                catch (FormatException)
                {
                    // NOTE(review): presumably guards a Model.Data setter that can throw;
                    // the entry is skipped as before — confirm whether this is still needed.
                }
            }
            return(result);
        }