/// <summary>
/// Processes up to 100 Douban house records that are not yet marked as analyzed: tries to
/// extract a price from the stored text and, when that yields 0, downloads the post page
/// and retries on the first paragraph of its <c>div.topic-content</c> element.
/// </summary>
/// <remarks>
/// All modifications are persisted by a single <c>SaveChanges</c> call after the loop.
/// Any exception is logged through <c>LogHelper.Error</c> and not rethrown.
/// </remarks>
public static void AnalyzeDoubanHouseContent()
{
    LogHelper.Info("AnalyzeDoubanHouseContent Start...");
    int index = 0;
    try
    {
        var lstHouse = dataContent.HouseInfos
            .Where(h => h.Source == ConstConfigurationName.Douban && h.IsAnalyzed == false)
            .Take(100)
            .ToList();

        foreach (var houseInfo in lstHouse)
        {
            var housePrice = JiebaTools.GetHousePrice(houseInfo.HouseText);
            string houseTextContent = string.Empty;
            bool textReadFromPage = false;

            if (housePrice == 0)
            {
                // The stored text gave no price: fall back to the online post body.
                var htmlResult = HTTPHelper.GetHTML(houseInfo.HouseOnlineURL);
                if (string.IsNullOrEmpty(htmlResult))
                {
                    // NOTE(review): the record stays un-analyzed and untouched, so it will be
                    // selected again on the next run.
                    continue;
                }

                var page = htmlParser.Parse(htmlResult);
                var topicContent = page.QuerySelector("div.topic-content");
                if (topicContent == null)
                {
                    continue;
                }

                var houseDescription = topicContent.QuerySelector("p");
                if (houseDescription == null)
                {
                    continue;
                }

                houseTextContent = houseDescription.TextContent;
                housePrice = JiebaTools.GetHousePrice(houseTextContent);
                textReadFromPage = true;
            }

            if (housePrice != 0 || !string.IsNullOrEmpty(houseTextContent))
            {
                index++;
                houseInfo.IsAnalyzed = true;
            }

            // Only replace the stored text when new text was actually read from the page.
            // Previously the text was overwritten unconditionally, which blanked it whenever
            // the price had been found in the existing text (houseTextContent was still empty).
            if (textReadFromPage)
            {
                houseInfo.HouseText = houseTextContent;
            }

            houseInfo.HousePrice = housePrice;
        }

        dataContent.SaveChanges();
    }
    catch (Exception ex)
    {
        LogHelper.Error("AnalyzeDoubanHouseContent Exception", ex);
    }

    LogHelper.Info("AnalyzeDoubanHouseContent Finish,Update Count:" + index);
}
/// <summary>
/// Downloads one page of a Douban group's discussion list and converts every row whose link
/// is not already stored for the Douban source into a <c>BizHouseInfo</c>.
/// </summary>
/// <param name="groupID">Douban group identifier inserted into the discussion URL.</param>
/// <param name="cityName">Value copied to <c>LocationCityName</c> of every created item.</param>
/// <param name="pageIndex">Zero-based page number; the URL's <c>start</c> offset is <c>pageIndex * 25</c>.</param>
/// <returns>
/// The newly discovered items. The list is empty when the page could not be downloaded or
/// contains no <c>table.olt</c> element.
/// </returns>
private static List<BizHouseInfo> GetDataFromOnlineWeb(string groupID, string cityName, int pageIndex)
{
    // URLs already stored for the Douban source; the set itself removes duplicates.
    var hsDoubanHouseURL = new HashSet<string>(
        dataContent.HouseInfos
            .Where(h => h.Source == ConstConfigurationName.Douban)
            .Select(h => h.HouseOnlineURL)
            .Distinct());

    List<BizHouseInfo> lstHouseInfo = new List<BizHouseInfo>();
    var url = $"https://www.douban.com/group/{groupID}/discussion?start={pageIndex * 25}";
    var htmlResult = HTTPHelper.GetHTML(url);
    if (string.IsNullOrEmpty(htmlResult))
    {
        return lstHouseInfo;
    }

    var page = htmlParser.Parse(htmlResult);
    var topicTable = page.QuerySelector("table.olt");
    if (topicTable == null)
    {
        // Unexpected page layout: report "nothing found" instead of throwing a
        // NullReferenceException.
        return lstHouseInfo;
    }

    foreach (var trItem in topicTable.QuerySelectorAll("tr"))
    {
        var titleItem = trItem.QuerySelector("td.title");
        if (titleItem == null)
        {
            continue;
        }

        var linkItem = titleItem.QuerySelector("a");
        if (linkItem == null)
        {
            // A title cell without a link carries neither a URL nor a title; skip it.
            continue;
        }

        var houseURL = linkItem.GetAttribute("href");
        if (hsDoubanHouseURL.Contains(houseURL))
        {
            continue;
        }

        var houseTitle = linkItem.GetAttribute("title");

        // The time cell is looked up on the row: the previous lookup inside the title cell
        // could only match a td nested in that cell, so PubTime effectively always fell back
        // to "now".
        // NOTE(review): assumes the cell holds a date without a year, which is why the
        // current year is prefixed - confirm against the live page. Unparseable values fall
        // back to the current time.
        var timeItem = trItem.QuerySelector("td.time");
        DateTime pubTime;
        if (timeItem == null
            || !DateTime.TryParse(DateTime.Now.ToString("yyyy-") + timeItem.InnerHtml.Trim(), out pubTime))
        {
            pubTime = DateTime.Now;
        }

        var houseInfo = new BizHouseInfo()
        {
            HouseTitle = houseTitle,
            HouseOnlineURL = houseURL,
            HouseLocation = houseTitle,
            HouseText = houseTitle,
            DataCreateTime = DateTime.Now,
            PubTime = pubTime,
            DisPlayPrice = "",
            Source = ConstConfigurationName.Douban,
            HousePrice = 0,
            LocationCityName = cityName
        };
        lstHouseInfo.Add(houseInfo);
    }

    return lstHouseInfo;
}
/// <summary>
/// Downloads one page of a Douban group's discussion list, adds a <c>BizHouseInfo</c> to
/// <c>dataContent</c> for every row that has a linked title cell, and saves the changes.
/// </summary>
/// <param name="groupID">Douban group identifier inserted into the discussion URL.</param>
/// <param name="index">Zero-based page number; the URL's <c>start</c> offset is <c>index * 25</c>.</param>
/// <param name="cityName">Value copied to <c>LocationCityName</c> of every created item.</param>
/// <remarks>
/// Returns without saving when the page could not be downloaded or contains no
/// <c>table.olt</c> element. No check against already stored URLs is made here.
/// </remarks>
public static void GetDataFromOnlineWeb(string groupID, int index, string cityName)
{
    var url = $"https://www.douban.com/group/{groupID}/discussion?start={index * 25}";
    var htmlResult = HTTPHelper.GetHTML(url);
    if (string.IsNullOrEmpty(htmlResult))
    {
        return;
    }

    var page = htmlParser.Parse(htmlResult);
    var topicTable = page.QuerySelector("table.olt");
    if (topicTable == null)
    {
        // Unexpected page layout: nothing to import (previously a NullReferenceException).
        return;
    }

    foreach (var trItem in topicTable.QuerySelectorAll("tr"))
    {
        var titleItem = trItem.QuerySelector("td.title");
        if (titleItem == null)
        {
            continue;
        }

        var linkItem = titleItem.QuerySelector("a");
        if (linkItem == null)
        {
            // A title cell without a link carries neither a URL nor a title; skip it.
            continue;
        }

        var houseTitle = linkItem.GetAttribute("title");

        // The time cell is looked up on the row: the previous lookup inside the title cell
        // could only match a td nested in that cell, so PubTime effectively always fell back
        // to "now".
        // NOTE(review): assumes the cell holds a date without a year, which is why the
        // current year is prefixed - confirm against the live page. Unparseable values fall
        // back to the current time.
        var timeItem = trItem.QuerySelector("td.time");
        DateTime pubTime;
        if (timeItem == null
            || !DateTime.TryParse(DateTime.Now.ToString("yyyy-") + timeItem.InnerHtml.Trim(), out pubTime))
        {
            pubTime = DateTime.Now;
        }

        var houseInfo = new BizHouseInfo()
        {
            HouseTitle = houseTitle,
            HouseOnlineURL = linkItem.GetAttribute("href"),
            HouseLocation = houseTitle,
            HouseText = houseTitle,
            DataCreateTime = DateTime.Now,
            PubTime = pubTime,
            DisPlayPrice = "",
            SoureceDaminURL = "www.douban.com",
            HousePrice = 0,
            LocationCityName = cityName
        };
        dataContent.Add(houseInfo);
    }

    dataContent.SaveChanges();
}
/// <summary>
/// Downloads the post page of <paramref name="houseInfo"/>, reads the first paragraph of its
/// <c>div.topic-content</c> element and extracts a price from that text.
/// </summary>
/// <param name="houseInfo">
/// Record to analyze. Its <c>Status</c> is set to 2 when no HTML was returned, to 3 when the
/// post body or its paragraph is missing, and to 1 when a non-zero price or non-empty text
/// was obtained. <c>DisPlayPrice</c>, <c>HousePrice</c> and <c>HouseText</c> are updated
/// whenever the paragraph exists.
/// </param>
/// <param name="housePrice">Receives the extracted price; left untouched on status 2 or 3.</param>
/// <param name="houseTextContent">Receives the paragraph text; left untouched on status 2 or 3.</param>
private static void AnalyzeFromWebPage(Web.Model.DBHouseInfo houseInfo, ref decimal housePrice, ref string houseTextContent)
{
    var htmlResult = HTTPHelper.GetHTML(houseInfo.HouseOnlineURL);
    if (string.IsNullOrEmpty(htmlResult))
    {
        // No page content came back; the original comment treats this as a 404 page.
        houseInfo.Status = 2;
        return;
    }

    var page = HtmlParser.Parse(htmlResult);
    var topicContent = page.QuerySelector("div.topic-content");

    // Query the paragraph once; the previous code repeated the same null check twice in one
    // condition and then ran the selector a third time to read the text.
    var firstParagraph = topicContent == null ? null : topicContent.QuerySelector("p");
    if (firstParagraph == null)
    {
        // The post has no body text.
        houseInfo.Status = 3;
        return;
    }

    // Post body text, then the price derived from it.
    houseTextContent = firstParagraph.TextContent;
    housePrice = JiebaTools.GetHousePrice(houseTextContent);

    if (housePrice != 0 || !string.IsNullOrEmpty(houseTextContent))
    {
        houseInfo.Status = 1;
    }

    houseInfo.DisPlayPrice = housePrice.ToString(CultureInfo.InvariantCulture);
    houseInfo.HousePrice = housePrice;
    houseInfo.HouseText = houseTextContent;
}