Пример #1
0
        /// <summary>
        /// Crawls the listing pages of one subway line, one page at a time, and
        /// collects the nodes produced from every page.
        /// </summary>
        /// <param name="shellLineNumber">
        /// Line identifier handed to <c>GetTotalShellNumberForSubwayType</c> and <c>BuildSubwayUrl</c>.
        /// </param>
        /// <param name="shellLineName">
        /// Line name used in the progress log and forwarded to <c>ShellModelToNodes</c>.
        /// </param>
        /// <returns>The nodes of all crawled pages, in page order.</returns>
        public IEnumerable<ShellNode> StartCrawlForSubwayRoom(string shellLineNumber, string shellLineName)
        {
            // NOTE(review): 30 is presumably the number of listings per result page - confirm against the site.
            const int pageSize = 30;
            // At most 100 pages are crawled (original note: "Only show 100 pages").
            const int maxPages = 100;

            List<ShellNode> shellNodes = new List<ShellNode>();
            int totalNumber = GetTotalShellNumberForSubwayType(shellLineNumber);

            // Round up so a partially filled last page is still visited.
            int pages = totalNumber / pageSize;
            if (totalNumber % pageSize > 0)
            {
                pages += 1;
            }

            // The uncapped page count is logged on purpose; the cap is applied afterwards.
            Console.WriteLine($"{DateTime.Now}: Total page: {pages}.");
            pages = Math.Min(pages, maxPages);

            for (int page = 1; page <= pages; page++)
            {
                string pageUrl = BuildSubwayUrl(page, shellLineNumber);

                // Log before fetching, so the message really marks the start of the
                // crawl and a failing page can be identified from the output.
                Console.WriteLine($"{DateTime.Now}: <{shellLineName}> start to crawl page: {pageUrl}.");

                ShellPageModel pageModel = CrawlOnePageInfo(pageUrl);
                shellNodes.AddRange(ShellModelToNodes(pageModel, shellLineName));

                // Pause one second between page requests.
                Thread.Sleep(1000);
            }

            return shellNodes;
        }
Пример #2
0
        /// <summary>
        /// Turns the parallel per-field lists of a page model into one
        /// <c>ShellNode</c> per index.
        /// </summary>
        /// <param name="shellPageModel">Page model whose lists are read by position.</param>
        /// <param name="villageName">Value written to <c>VillageName</c> of every node.</param>
        /// <returns>One node per entry of <c>Floors</c>, in list order.</returns>
        private IEnumerable<ShellNode> ShellModelToNodes(ShellPageModel shellPageModel, string villageName)
        {
            // Floors alone decides how many nodes are built; every other list is
            // read at the same index.
            int total = shellPageModel.Floors.Count();

            return Enumerable.Range(0, total)
                   .Select(index => new ShellNode
                   {
                       VillageName  = villageName,
                       Position     = shellPageModel.Positions[index],
                       Floor        = shellPageModel.Floors[index],
                       Orientation  = shellPageModel.Orientations[index],
                       Price        = shellPageModel.Prices[index],
                       Title        = shellPageModel.Titles[index],
                       AreaNumber   = shellPageModel.AreaNumbers[index],
                       AreaStr      = shellPageModel.AreaStrings[index],
                       FollowDay    = shellPageModel.FollowDays[index],
                       FollowNumber = shellPageModel.FollowNumbers[index],
                       UnitPrice    = shellPageModel.UnitPrices[index],
                       YearInfo     = shellPageModel.YearInfos[index],
                       CrawlDate    = DateTime.Now,
                       LinkUrl      = shellPageModel.LinkUrls[index],
                   })
                   .ToList();
        }
Пример #3
0
        /// <summary>
        /// Crawls the listing pages of one village, one page at a time, and
        /// collects the nodes produced from every page.
        /// </summary>
        /// <param name="villageName">
        /// Village name handed to <c>GetTotalSellNumber</c>, <c>BuildCrawlerUrl</c>
        /// and <c>ShellModelToNodes</c>.
        /// </param>
        /// <returns>The nodes of all crawled pages, in page order.</returns>
        public IEnumerable<ShellNode> StartCrawlForOneVillage(string villageName)
        {
            // NOTE(review): 30 is presumably the number of listings per result page - confirm against the site.
            const int pageSize = 30;

            List<ShellNode> shellNodes = new List<ShellNode>();
            int totalNumber = GetTotalSellNumber(villageName);

            // Round up so a partially filled last page is still visited.
            int pages = totalNumber / pageSize;
            if (totalNumber % pageSize > 0)
            {
                pages += 1;
            }

            Console.WriteLine($"{DateTime.Now}: Total page: {pages}");

            for (int page = 1; page <= pages; page++)
            {
                string pageUrl = BuildCrawlerUrl(page, villageName);

                // Log before fetching, so the message really marks the start of the
                // crawl and a failing page can be identified from the output.
                Console.WriteLine($"{DateTime.Now}: start to crawl page: {pageUrl}.");

                ShellPageModel pageModel = CrawlOnePageInfo(pageUrl);
                shellNodes.AddRange(ShellModelToNodes(pageModel, villageName));

                // Pause one second between page requests.
                Thread.Sleep(1000);
            }

            return shellNodes;
        }
Пример #4
0
        /// <summary>
        /// Loads one listing page through <c>_web</c> and extracts its fields into
        /// the parallel lists of a <c>ShellPageModel</c>.
        /// </summary>
        /// <param name="pageUrl">URL of the page to load.</param>
        /// <returns>
        /// A model holding one list entry per matched element. A list is empty when
        /// its XPath matches nothing on the page.
        /// </returns>
        private ShellPageModel CrawlOnePageInfo(string pageUrl)
        {
            var doc  = _web.Load(pageUrl);
            var root = doc.DocumentNode;

            // Title and link are attributes of the same anchor, so the query runs once.
            // A missing attribute yields a null entry rather than an exception, which
            // keeps the two lists the same length.
            List<HtmlNode> detailLinks = SelectNodesOrEmpty(root, "//a[@class='img VIEWDATA CLICKDATA maidian-detail']").ToList();
            List<string>   titles      = detailLinks.Select(node => node.Attributes["title"]?.Value).ToList();
            List<string>   linkUrls    = detailLinks.Select(node => node.Attributes["href"]?.Value).ToList();

            List<string> positions = SelectNodesOrEmpty(root, "//div[@class='positionInfo']")
                                     .Select(node => node.InnerText.Trim())
                                     .ToList();

            List<string> floor       = new List<string>();
            List<string> yearInfo    = new List<string>();
            List<string> areaStr     = new List<string>();
            List<string> areaNumber  = new List<string>();
            List<string> orientation = new List<string>();

            foreach (HtmlNode node in SelectNodesOrEmpty(root, "//div[@class='houseInfo']"))
            {
                // The text is split on '|' and must give exactly five parts; any
                // other shape is stored as nulls so the lists stay index-aligned.
                string[] parts    = node.InnerText.Split('|');
                bool     expected = parts.Length == 5;

                floor.Add(expected ? parts[0].Trim() : null);
                yearInfo.Add(expected ? parts[1].Trim() : null);
                areaStr.Add(expected ? parts[2].Trim() : null);
                areaNumber.Add(expected ? parts[3].Trim() : null);
                orientation.Add(expected ? parts[4].Trim() : null);
            }

            List<string> followNumber = new List<string>();
            List<string> followDay    = new List<string>();

            foreach (HtmlNode node in SelectNodesOrEmpty(root, "//div[@class='followInfo']"))
            {
                // The first two '/'-separated parts are used. Text without a '/'
                // is stored as nulls, mirroring the houseInfo handling above.
                string[] parts    = node.InnerText.Split('/');
                bool     expected = parts.Length >= 2;

                followNumber.Add(expected ? parts[0].Trim() : null);
                followDay.Add(expected ? parts[1].Trim() : null);
            }

            List<string> totalPrices = SelectNodesOrEmpty(root, "//div[@class='totalPrice']")
                                       .Select(node => node.InnerText.Trim())
                                       .ToList();
            List<string> unitPrices = SelectNodesOrEmpty(root, "//div[@class='unitPrice']")
                                      .Select(node => node.InnerText.Trim())
                                      .ToList();

            ShellPageModel pageInfoModel = new ShellPageModel();
            pageInfoModel.Titles        = titles;
            pageInfoModel.LinkUrls      = linkUrls;
            pageInfoModel.Positions     = positions;
            pageInfoModel.Prices        = totalPrices;
            pageInfoModel.UnitPrices    = unitPrices;
            pageInfoModel.Floors        = floor;
            pageInfoModel.YearInfos     = yearInfo;
            pageInfoModel.AreaStrings   = areaStr;
            pageInfoModel.AreaNumbers   = areaNumber;
            pageInfoModel.Orientations  = orientation;
            pageInfoModel.FollowNumbers = followNumber;
            pageInfoModel.FollowDays    = followDay;
            return pageInfoModel;
        }

        /// <summary>
        /// Runs an XPath query below <paramref name="root"/> and returns an empty
        /// sequence instead of null when nothing matches.
        /// </summary>
        private static IEnumerable<HtmlNode> SelectNodesOrEmpty(HtmlNode root, string xpath)
        {
            // SelectNodes returns null, not an empty collection, on no match.
            IEnumerable<HtmlNode> found = root.SelectNodes(xpath);
            return found ?? Enumerable.Empty<HtmlNode>();
        }