/// <summary>
/// Crawls the 58.com "pinpaigongyu" listing pages for every enabled
/// PinPaiGongYu crawler configuration and saves the extracted data.
/// </summary>
/// <remarks>
/// Each configuration is processed through <c>LogHelper.RunActionNotThrowEx</c>
/// (presumably so that a failure in one configuration does not abort the others;
/// confirm against LogHelper). The configuration value is JSON that is read for
/// its <c>pagecount</c> and <c>shortcutname</c> members.
/// </remarks>
public static void CapturPinPaiHouseInfo()
{
    var enabledConfigurations = dataContent.CrawlerConfigurations
        .Where(c => c.ConfigurationName == ConstConfigurationName.PinPaiGongYu && c.IsEnabled)
        .ToList();

    foreach (var crawlerConfiguration in enabledConfigurations)
    {
        LogHelper.RunActionNotThrowEx(() =>
        {
            var confInfo = Newtonsoft.Json.JsonConvert.DeserializeObject<dynamic>(crawlerConfiguration.ConfigurationValue);
            for (var index = 0; index < confInfo.pagecount.Value; index++)
            {
                var url = $"http://{confInfo.shortcutname.Value}.58.com/pinpaigongyu/pn/{index}";
                var htmlResult = HTTPHelper.GetHTMLByURL(url);
                var page = new HtmlParser().Parse(htmlResult);

                // A page is only worth processing when it has at least one <li logr="...">.
                // Any() stops at the first match instead of counting every element.
                var hasListings = page.QuerySelectorAll("li").Any(element => element.HasAttribute("logr"));
                if (!hasListings)
                {
                    continue;
                }

                GetDataOnPageDoc(confInfo, page);
                // Persist after every page so earlier pages survive a later failure.
                dataContent.SaveChanges();
            }
        }, "CapturPinPaiHouseInfo", crawlerConfiguration);
    }
}
/// <summary>
/// Analyzes up to 100 Douban house records that are not yet marked as analyzed,
/// filling in their price and, when it has to be fetched, their post text.
/// </summary>
/// <remarks>
/// The price is first extracted from the stored <c>HouseText</c>. Only when that
/// yields 0 is the online post downloaded and its first paragraph under
/// <c>div.topic-content</c> used as the text to analyze. Records whose page cannot
/// be downloaded or parsed are skipped and stay un-analyzed. Any exception is
/// logged and ends the run without being rethrown.
/// </remarks>
public static void AnalyzeDoubanHouseContent()
{
    LogHelper.Info("AnalyzeDoubanHouseContent Start...");
    int index = 0;
    try
    {
        var lstHouse = dataContent.HouseInfos
            .Where(h => h.Source == ConstConfigurationName.Douban && h.IsAnalyzed == false)
            .Take(100)
            .ToList();

        foreach (var houseInfo in lstHouse)
        {
            var housePrice = JiebaTools.GetHousePrice(houseInfo.HouseText);
            string houseTextContent = string.Empty;

            if (housePrice == 0)
            {
                var htmlResult = HTTPHelper.GetHTML(houseInfo.HouseOnlineURL);
                if (string.IsNullOrEmpty(htmlResult))
                {
                    continue;
                }

                var page = htmlParser.Parse(htmlResult);
                var topicContent = page.QuerySelector("div.topic-content");
                if (topicContent == null)
                {
                    continue;
                }

                var houseDescription = topicContent.QuerySelector("p");
                if (houseDescription == null)
                {
                    continue;
                }

                houseTextContent = houseDescription.TextContent;
                housePrice = JiebaTools.GetHousePrice(houseTextContent);
            }

            if (housePrice != 0 || !string.IsNullOrEmpty(houseTextContent))
            {
                index++;
                houseInfo.IsAnalyzed = true;
            }

            // Only replace the stored text when new content was actually fetched.
            // Otherwise a record whose price was parsed from its existing text
            // would have that text overwritten with an empty string.
            if (!string.IsNullOrEmpty(houseTextContent))
            {
                houseInfo.HouseText = houseTextContent;
            }

            houseInfo.HousePrice = housePrice;
        }

        dataContent.SaveChanges();
    }
    catch (Exception ex)
    {
        LogHelper.Error("AnalyzeDoubanHouseContent Exception", ex);
    }

    LogHelper.Info("AnalyzeDoubanHouseContent Finish,Update Count:" + index);
}
/// <summary>
/// Downloads one page of a Douban group's discussion list and returns the
/// topics whose URL is not already stored as a Douban house record.
/// </summary>
/// <param name="groupID">Douban group identifier placed in the request URL.</param>
/// <param name="cityName">City name copied to each result's <c>LocationCityName</c>.</param>
/// <param name="pageIndex">Zero-based page index; the request uses <c>start = pageIndex * 25</c>.</param>
/// <returns>
/// The new topics found on the page; an empty list when the page could not be
/// downloaded or contains no <c>table.olt</c>.
/// </returns>
private static List<BizHouseInfo> GetDataFromOnlineWeb(string groupID, string cityName, int pageIndex)
{
    // URLs already stored; used to skip topics that were crawled before.
    var hsDoubanHouseURL = new HashSet<string>(
        dataContent.HouseInfos
            .Where(h => h.Source == ConstConfigurationName.Douban)
            .Select(h => h.HouseOnlineURL)
            .Distinct()
            .ToList());

    var lstHouseInfo = new List<BizHouseInfo>();
    var url = $"https://www.douban.com/group/{groupID}/discussion?start={pageIndex * 25}";
    var htmlResult = HTTPHelper.GetHTML(url);
    if (string.IsNullOrEmpty(htmlResult))
    {
        return lstHouseInfo;
    }

    var page = htmlParser.Parse(htmlResult);
    var topicTable = page.QuerySelector("table.olt");
    if (topicTable == null)
    {
        // Unexpected markup (for example an error page): nothing to extract.
        return lstHouseInfo;
    }

    foreach (var trItem in topicTable.QuerySelectorAll("tr"))
    {
        var linkItem = trItem.QuerySelector("td.title")?.QuerySelector("a");
        if (linkItem == null)
        {
            continue;
        }

        var houseURL = linkItem.GetAttribute("href");
        // Add() returns false for URLs already stored or already seen on this page.
        if (!hsDoubanHouseURL.Add(houseURL))
        {
            continue;
        }

        var houseTitle = linkItem.GetAttribute("title");
        var now = DateTime.Now;

        // The time cell is a sibling of the title cell, so it must be looked up on
        // the row; searching inside td.title never finds it. The cell text is
        // prefixed with the current year before parsing, and an unparsable value
        // falls back to the current time.
        var pubTime = now;
        var timeItem = trItem.QuerySelector("td.time");
        DateTime parsedTime;
        if (timeItem != null && DateTime.TryParse(now.ToString("yyyy-") + timeItem.InnerHtml, out parsedTime))
        {
            pubTime = parsedTime;
        }

        lstHouseInfo.Add(new BizHouseInfo()
        {
            HouseTitle = houseTitle,
            HouseOnlineURL = houseURL,
            HouseLocation = houseTitle,
            HouseText = houseTitle,
            DataCreateTime = now,
            PubTime = pubTime,
            DisPlayPrice = "",
            Source = ConstConfigurationName.Douban,
            HousePrice = 0,
            LocationCityName = cityName
        });
    }

    return lstHouseInfo;
}
/// <summary>
/// Requests one page of the huzhumaifang.com house-list API and stores every
/// house whose detail URL is not already known.
/// </summary>
/// <param name="pageNum">Page number sent to the API as <c>pageNum</c>.</param>
/// <param name="hsHouseOnlineUrl">Detail URLs that must be skipped; the set is only read.</param>
/// <remarks>
/// Returns without saving when the API response is empty or carries no
/// <c>houseList</c> member.
/// </remarks>
private static void GetDataByWebAPI(int pageNum, HashSet<string> hsHouseOnlineUrl)
{
    var dicParameter = new JObject()
    {
        { "uid", "" },
        { "pageNum", $"{pageNum}" },
        { "sortType", "1" },
        { "sellRentType", "2" },
        { "searchCondition", "{}" }
    };
    var postHouseUrl = $"http://www.huzhumaifang.com:8080/hzmf-integration/getHouseList.action?content={JsonConvert.SerializeObject(dicParameter)}";

    var resultJson = HTTPHelper.GetJsonResultByURL(postHouseUrl);
    if (string.IsNullOrEmpty(resultJson))
    {
        return;
    }

    var houseList = JsonConvert.DeserializeObject<JObject>(resultJson)?["houseList"];
    if (houseList == null)
    {
        return;
    }

    foreach (var houseInfo in houseList)
    {
        var houseUrl = $"http://www.huzhumaifang.com/Renting/house_detail/id/{houseInfo["houseId"].ToObject<Int32>()}.html";
        if (hsHouseOnlineUrl.Contains(houseUrl))
        {
            continue;
        }

        // The emoji is stripped from the description before it is stored;
        // a missing description is stored as an empty string.
        var desc = houseInfo["houseDescript"]?.ToObject<string>()?.Replace("😄", "") ?? string.Empty;
        var rentPrice = houseInfo["houseRentPrice"];

        DataContent.MutualHouseInfos.Add(new MutualHouseInfo()
        {
            HouseOnlineURL = houseUrl,
            HouseLocation = desc,
            HousePrice = rentPrice.ToObject<Int32>(),
            HouseText = desc,
            DataCreateTime = DateTime.Now,
            HouseTitle = desc,
            DisPlayPrice = rentPrice.ToString(),
            LocationCityName = "上海",
            PubTime = houseInfo["houseCreateTime"].ToObject<DateTime>(),
            Source = ConstConfigurationName.HuZhuZuFang,
        });
    }

    DataContent.SaveChanges();
}
/// <summary>
/// Filters out invalid city configurations: every PinPaiGongYu configuration
/// whose first 58.com listing page contains no <c>li</c> element carrying a
/// <c>logr</c> attribute is disabled, and the changes are then saved.
/// </summary>
public static void FilterInvalidCityConfig()
{
    var cityConfigurations = DataContent.CrawlerConfigurations
        .Where(c => c.ConfigurationName == ConstConfigurationName.PinPaiGongYu)
        .ToList();

    foreach (var cityConfiguration in cityConfigurations)
    {
        var settings = JsonConvert.DeserializeObject<dynamic>(cityConfiguration.ConfigurationValue);
        var firstPageUrl = $"http://{settings.shortcutname.Value}.58.com/pinpaigongyu/pn/0";
        var firstPageHtml = HTTPHelper.GetHTMLByURL(firstPageUrl);
        var document = new HtmlParser().Parse(firstPageHtml);

        // A city counts as valid when its first page has at least one listing item.
        var hasListing = document.QuerySelectorAll("li").Any(item => item.HasAttribute("logr"));
        if (hasListing)
        {
            continue;
        }

        cityConfiguration.IsEnabled = false;
    }

    DataContent.SaveChanges();
}
/// <summary>
/// Downloads one page of a Douban group's discussion list and stores every
/// topic row found on it as a house record.
/// </summary>
/// <param name="groupID">Douban group identifier placed in the request URL.</param>
/// <param name="index">Zero-based page index; the request uses <c>start = index * 25</c>.</param>
/// <param name="cityName">City name copied to each record's <c>LocationCityName</c>.</param>
/// <remarks>
/// Returns without saving when the page could not be downloaded or contains no
/// <c>table.olt</c>.
/// </remarks>
public static void GetDataFromOnlineWeb(string groupID, int index, string cityName)
{
    var url = $"https://www.douban.com/group/{groupID}/discussion?start={index * 25}";
    var htmlResult = HTTPHelper.GetHTML(url);
    if (string.IsNullOrEmpty(htmlResult))
    {
        return;
    }

    var page = htmlParser.Parse(htmlResult);
    var topicTable = page.QuerySelector("table.olt");
    if (topicTable == null)
    {
        // Unexpected markup (for example an error page): nothing to extract.
        return;
    }

    foreach (var trItem in topicTable.QuerySelectorAll("tr"))
    {
        var linkItem = trItem.QuerySelector("td.title")?.QuerySelector("a");
        if (linkItem == null)
        {
            continue;
        }

        var houseTitle = linkItem.GetAttribute("title");
        var now = DateTime.Now;

        // The time cell is a sibling of the title cell, so it must be looked up on
        // the row; searching inside td.title never finds it. The cell text is
        // prefixed with the current year before parsing, and an unparsable value
        // falls back to the current time.
        var pubTime = now;
        var timeItem = trItem.QuerySelector("td.time");
        DateTime parsedTime;
        if (timeItem != null && DateTime.TryParse(now.ToString("yyyy-") + timeItem.InnerHtml, out parsedTime))
        {
            pubTime = parsedTime;
        }

        dataContent.Add(new BizHouseInfo()
        {
            HouseTitle = houseTitle,
            HouseOnlineURL = linkItem.GetAttribute("href"),
            HouseLocation = houseTitle,
            HouseText = houseTitle,
            DataCreateTime = now,
            PubTime = pubTime,
            DisPlayPrice = "",
            SoureceDaminURL = "www.douban.com",
            HousePrice = 0,
            LocationCityName = cityName
        });
    }

    dataContent.SaveChanges();
}
/// <summary>
/// Downloads the online page of a house record and extracts its post text and
/// price, recording the outcome in <c>houseInfo.Status</c>.
/// </summary>
/// <param name="houseInfo">Record to analyze; its status, price and text fields are updated.</param>
/// <param name="housePrice">Receives the price extracted from the post text; untouched when no text is found.</param>
/// <param name="houseTextContent">Receives the post text; untouched when no text is found.</param>
/// <remarks>
/// Status values written here: 2 when no page content was returned (treated as a
/// 404 page), 3 when the page has no post content, and 1 when a price or a
/// non-empty text was extracted.
/// </remarks>
private static void AnalyzeFromWebPage(Web.Model.DBHouseInfo houseInfo, ref decimal housePrice, ref string houseTextContent)
{
    var htmlResult = HTTPHelper.GetHTML(houseInfo.HouseOnlineURL);

    // No page information: treated as a 404 page.
    if (string.IsNullOrEmpty(htmlResult))
    {
        houseInfo.Status = 2;
        return;
    }

    var page = HtmlParser.Parse(htmlResult);

    // The post body is the first <p> inside div.topic-content; query it once.
    var postParagraph = page.QuerySelector("div.topic-content")?.QuerySelector("p");

    // No post content.
    if (postParagraph == null)
    {
        houseInfo.Status = 3;
        return;
    }

    // Read the post content, then extract the price from it.
    houseTextContent = postParagraph.TextContent;
    housePrice = JiebaTools.GetHousePrice(houseTextContent);

    if (housePrice != 0 || !string.IsNullOrEmpty(houseTextContent))
    {
        houseInfo.Status = 1;
    }

    houseInfo.DisPlayPrice = housePrice.ToString(CultureInfo.InvariantCulture);
    houseInfo.HousePrice = housePrice;
    houseInfo.HouseText = houseTextContent;
}