/// <summary>
/// Resolves the URL of the shop's first asynchronous product-list page.
/// </summary>
/// <remarks>
/// Downloads the shop home page to read its title into <c>_shopName</c>, then downloads
/// <c>{url}/search.htm</c> and reads the <c>value</c> attribute of the
/// <c>J_ShopAsynSearchURL</c> input. Also assigns <c>_shopUrl</c> and passes
/// <c>_cookies</c> by reference to both downloads.
/// </remarks>
/// <param name="url">Shop address; "https://" is prepended when it does not contain "http".</param>
/// <returns>
/// The shop URL followed by the list path, or <see cref="string.Empty"/> when the page title
/// is the "shop not found" title.
/// </returns>
/// <exception cref="NotSupportedException">The page title identifies an Alitrip shop.</exception>
/// <exception cref="InvalidOperationException">
/// The search page has no <c>J_ShopAsynSearchURL</c> input with a <c>value</c> attribute.
/// </exception>
private string InitFirstUrlPart(string url)
{
    // Ordinal, case-insensitive test instead of a culture-sensitive ToLower().
    if (url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
    {
        url = $"https://{url}";
    }

    var mainHtml = GetMainWebContent(_shopUrl = url, null, ref _cookies, null);
    _shopName = Regex.Match(mainHtml, @"(?<=<title>)[\s\S]*?(?=</title>)").Value.Trim();

    if (_shopName.Contains("阿里旅行·去啊Alitrip.com"))
    {
        // Still an Exception for existing catch blocks, but a specific one.
        throw new NotSupportedException("阿里旅行不支持");
    }

    if (_shopName.Equals("店铺浏览-淘宝网"))
    {
        // Shop does not exist: log and signal with an empty result instead of throwing.
        SendLog("店铺不存在!");
        return string.Empty;
    }

    // Keep only the text between the first and the last '-' of the title.
    _shopName = Regex.Match(_shopName, "(?<=-).*(?=-)").Value.Trim();

    var categoryUrl = $"{url}/search.htm";
    var html = GetMainWebContent(categoryUrl, null, ref _cookies, null);
    var documentNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(html);

    // SelectSingleNode returns null when nothing matches; fail with a clear message
    // rather than an anonymous NullReferenceException.
    var searchInput = documentNode.SelectSingleNode("//input[@id='J_ShopAsynSearchURL']");
    var listUrl = searchInput?.Attributes["value"]?.Value;
    if (listUrl == null)
    {
        throw new InvalidOperationException($"J_ShopAsynSearchURL input not found on {categoryUrl}");
    }

    return $"{url}{listUrl}";
}
/// <summary>
/// Collects the first- and second-level categories from the Amazon.cn site directory
/// and stores each of them through <c>Insert</c>.
/// </summary>
/// <param name="url">
/// Not used as given: the value is overwritten with the fixed site-directory address.
/// </param>
public static void AmazonSpider(string url)
{
    // The original code always replaces the argument with this address; kept for compatibility.
    url = "http://www.amazon.cn/gp/site-directory/ref=sa_menu_fullstore";

    WebBrowerManager.Instance.Setup(new cEXWB());
    var html = WebBrowerManager.Instance.Run(url);
    // Decode HTML entities before parsing.
    html = HttpUtility.HtmlDecode(html);
    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);

    var firstCategoryContainer = htmlDocument.DocumentNode.SelectNodes("//div[@class='popover-grouping']");
    if (firstCategoryContainer == null || !firstCategoryContainer.Any())
    {
        return;
    }

    foreach (HtmlNode htmlNode in firstCategoryContainer)
    {
        // ".//" keeps the query relative to this grouping. A leading "//" is evaluated from
        // the document root and would return the same first heading for every grouping.
        var firstCategoryNode = htmlNode.SelectSingleNode(".//div[@class='popover-category-name']/h2");
        if (firstCategoryNode == null)
        {
            continue;
        }

        // First-level category.
        var firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
        Insert(firstKey.ToString(), firstCategoryNode.InnerText, string.Empty, "0", "8");

        // Second-level categories: direct child divs that carry no class attribute.
        var secondCategoryNodes = htmlNode.SelectNodes("div");
        if (secondCategoryNodes == null || !secondCategoryNodes.Any())
        {
            continue;
        }

        foreach (HtmlNode node in secondCategoryNodes)
        {
            if (node.Attributes["class"] != null)
            {
                continue;
            }

            var secondCategoryNode = node.SelectSingleNode("a");
            if (secondCategoryNode == null)
            {
                continue;
            }

            var hrefAttribute = secondCategoryNode.Attributes["href"];
            if (hrefAttribute == null || string.IsNullOrEmpty(hrefAttribute.Value))
            {
                continue;
            }

            var categoryUrl = "http://www.amazon.cn" + HttpUtility.UrlDecode(hrefAttribute.Value);
            var categoryName = secondCategoryNode.InnerText;
            var secondKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
            // Link the child to its first-level key, as the other spiders in this file do with
            // the fourth Insert argument (previously a constant "0").
            // NOTE(review): confirm the fourth argument is the parent id.
            Insert(secondKey.ToString(), categoryName, categoryUrl, firstKey.ToString(), "8");
        }
    }
}
/// <summary>
/// Creates the crawler service and loads its settings from the search section of the
/// CMS configuration: web site URL, private-page indexing flag, authorization mode and
/// page fetch timeout.
/// </summary>
/// <param name="cmsConfiguration">Configuration whose <c>Search</c> values are read.</param>
public DefaultWebCrawlerService(ICmsConfiguration cmsConfiguration)
{
    this.cmsConfiguration = cmsConfiguration;

    var configuredSiteUrl = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneWebSiteUrl);
    webServer = configuredSiteUrl ?? string.Empty;

    // An unparsable value leaves the flag at false (TryParse result is ignored on purpose).
    var privatePagesSetting = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneIndexPrivatePages);
    bool.TryParse(privatePagesSetting, out indexPrivatePages);

    if (indexPrivatePages)
    {
        var configuredMode = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneAuthorizationMode);
        if (!string.IsNullOrWhiteSpace(configuredMode))
        {
            // "windows" selects Windows authentication; every other non-blank value means Forms.
            var normalizedMode = configuredMode.ToLower().Trim();
            authMode = normalizedMode == "windows" ? AuthMode.Windows : AuthMode.Forms;
        }
    }

    HtmlAgilityPackHelper.FixMissingTagClosings();

    // Only a parsable, strictly positive timeout replaces the current fetch timeout.
    var timeoutSetting = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneIndexerPageFetchTimeout);
    TimeSpan parsedTimeout;
    var hasPositiveTimeout = TimeSpan.TryParse(timeoutSetting, out parsedTimeout)
                             && parsedTimeout > TimeSpan.FromSeconds(0);
    if (hasPositiveTimeout)
    {
        fetchTimeout = parsedTimeout;
    }
}
public void GetMetaNodesCollection_ValidHtml_ReturnHtmlNodeCollection()
{
    // Arrange: retrieve a document from the valid URL.
    var document = HtmlAgilityPackHelper.RetrieveHtml(ValidUrl);

    // Act
    var metaNodes = HtmlAgilityPackHelper.GetMetaNodesCollection(document);

    // Assert: a collection is returned.
    Assert.IsNotNull(metaNodes);
}
public void GetMetaNodesCollection_InvalidHtml_ReturnNull()
{
    // Arrange: a document that was never loaded with any HTML.
    var emptyDocument = new HtmlDocument();

    // Act
    var metaNodes = HtmlAgilityPackHelper.GetMetaNodesCollection(emptyDocument);

    // Assert: no collection is returned.
    Assert.IsNull(metaNodes);
}
/// <summary>
/// Walks the Amazon product-list pages of the category named by the <c>node</c> query
/// parameter of <paramref name="url"/>.
/// </summary>
/// <remarks>
/// The page count is read from the <c>pagnDisabled</c> span of the first page. For every
/// product the image and title anchors are located, but nothing is done with them yet.
/// </remarks>
/// <param name="url">Category URL; nothing happens when it is null/empty or has no <c>node</c> value.</param>
public static void AmazonProductList(string url)
{
    //http://www.amazon.cn/gp/site-directory/ref=sa_menu_fullstore
    if (string.IsNullOrEmpty(url))
    {
        return;
    }

    NameValueCollection query = UrlHelper.GetQueryString(new Uri(url).Query);

    // The category id is extracted from the URL.
    string node = query["node"];
    if (string.IsNullOrEmpty(node))
    {
        return;
    }

    const string urlTemplate = "http://www.amazon.cn/s/ref=?rh=n:{0}&page={1}";

    var firstClient = new HttpClient(string.Format(urlTemplate, node, 1)) { Timeout = 30000 };
    var pageHtml = HttpUtility.HtmlDecode(firstClient.Request());
    var pageDocument = HtmlAgilityPackHelper.GetHtmlDocument(pageHtml);

    var maxPageNode = pageDocument.DocumentNode.SelectSingleNode("//span[@class='pagnDisabled']");
    var pageCount = 0;
    if (maxPageNode == null || !int.TryParse(maxPageNode.InnerText, out pageCount))
    {
        return;
    }

    for (int page = 1; page <= pageCount; page++)
    {
        // Page 1 is already loaded; later pages are fetched on demand.
        if (page > 1)
        {
            var pageClient = new HttpClient(string.Format(urlTemplate, node, page)) { Timeout = 30000 };
            pageHtml = HttpUtility.HtmlDecode(pageClient.Request());
            pageDocument = HtmlAgilityPackHelper.GetHtmlDocument(pageHtml);
        }

        var products = pageDocument.DocumentNode.SelectNodes(
            "//div[@class='result product'] | //div[@class='result lastRow product']");
        if (products == null)
        {
            // A page without products ends the whole walk.
            return;
        }

        foreach (HtmlNode product in products)
        {
            var imageNode = product.SelectSingleNode("div[@class='image']/a");
            var titleNode = product.SelectSingleNode("div[@class='data']/h3[@class='title']/a");
        }
    }
}
/// <summary>
/// Creates the crawler service and reads the web site URL and the private-page indexing
/// flag from the search section of the CMS configuration.
/// </summary>
/// <param name="cmsConfiguration">Configuration whose <c>Search</c> values are read.</param>
public DefaultWebCrawlerService(ICmsConfiguration cmsConfiguration)
{
    this.cmsConfiguration = cmsConfiguration;

    var configuredSiteUrl = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneWebSiteUrl);
    webServer = configuredSiteUrl ?? string.Empty;

    // An unparsable value leaves the flag at false.
    var privatePagesSetting = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneIndexPrivatePages);
    bool.TryParse(privatePagesSetting, out indexPrivatePages);

    HtmlAgilityPackHelper.FixMissingTagClosings();
}
/// <summary>
/// Loads the category page in the embedded browser and parses it.
/// Product extraction is not implemented yet, so the result is always an empty list.
/// </summary>
/// <param name="spiderCategory">Category whose <c>CategoryUrl</c> is opened.</param>
/// <returns>A new, empty product list.</returns>
public List<ProductInfo> SpiderProductList(SpiderCategoryInfo spiderCategory)
{
    WebBrowerManager.Instance.Setup(new cEXWB());
    WebBrowerManager.Instance.TimeOut = 15;

    var pageSource = WebBrowerManager.Instance.Run(spiderCategory.CategoryUrl);
    var document = HtmlAgilityPackHelper.GetHtmlDocument(pageSource);

    // The result container is looked up, but its content is not processed yet.
    document.GetElementbyId("search_result");

    var products = new List<ProductInfo>();
    return products;
}
/// <summary>
/// Counts how often <paramref name="keyword"/> occurs in the text of the page at <paramref name="url"/>.
/// </summary>
/// <param name="keyword">Keyword to count.</param>
/// <param name="url">Page address; checked with <c>IsUrlValid</c> first.</param>
/// <returns>The occurrence count, or 0 when the URL is not valid.</returns>
private int GetKeywordOccurrencesFromHtmlDocument(string keyword, string url)
{
    if (IsUrlValid(url))
    {
        var document = HtmlAgilityPackHelper.RetrieveHtml(url);
        return document.Text.OccurrencesOf(keyword);
    }

    return 0;
}
/// <summary>
/// Builds the list of index-page URLs of the sge.sh listing.
/// </summary>
/// <remarks>
/// The highest page number is read from the <c>href</c> of the fourth pager link on the first
/// index page, which is expected to look like <c>index{N}.htm</c>.
/// </remarks>
/// <returns>
/// <c>index.htm</c> followed by <c>index1.htm</c> … <c>index{N}.htm</c>; an empty list when the
/// first page cannot be loaded or no positive page number can be determined.
/// </returns>
public static List<string> GetUrls()
{
    const string firstIndexUrl = "http://www.sge.sh/publish/sge/xqzx/jyxq/index.htm";
    const string marker = "index";

    var indexUrls = new List<string>();

    var firstPage = WebBrowerManager.Instance.Brower(firstIndexUrl);
    if (string.IsNullOrEmpty(firstPage.HtmlSource))
    {
        return indexUrls;
    }

    var firstDocument = HtmlAgilityPackHelper.GetHtmlDocument(firstPage.HtmlSource);
    var lastPageNode = firstDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[1]/div[3]/div[1]/div[4]/a[4]");

    // SelectSingleNode returns null when the pager is missing or laid out differently.
    if (lastPageNode == null)
    {
        return indexUrls;
    }

    var maxPageNumber = 0;
    if (lastPageNode.HasAttributes)
    {
        var attr = lastPageNode.Attributes["href"];
        if (attr != null && !string.IsNullOrEmpty(attr.Value))
        {
            var href = attr.Value;
            var index = href.IndexOf(marker, System.StringComparison.Ordinal);
            if (index >= 0)
            {
                // Look for the dot AFTER "index": an earlier dot (e.g. "../" or a host name)
                // would otherwise produce a negative length or hide the page number.
                var numberStart = index + marker.Length;
                var pointIndex = href.IndexOf('.', numberStart);
                if (pointIndex > numberStart)
                {
                    int.TryParse(href.Substring(numberStart, pointIndex - numberStart), out maxPageNumber);
                }
            }
        }
    }

    if (maxPageNumber > 0)
    {
        for (int i = 0; i <= maxPageNumber; i++)
        {
            // Page 0 is the unnumbered first index page.
            indexUrls.Add(i == 0
                ? firstIndexUrl
                : string.Format("http://www.sge.sh/publish/sge/xqzx/jyxq/index{0}.htm", i));
        }
    }

    return indexUrls;
}
public void RetrieveHtml_InvalidUrl_ThrowsException()
{
    // Calling Assert.Fail inside the try block does not work here: the assertion failure is
    // itself an exception and would be swallowed by catch (Exception), so the test could
    // never fail. Record the outcome and assert after the try/catch instead.
    var exceptionThrown = false;

    try
    {
        HtmlAgilityPackHelper.RetrieveHtml(InvalidUrl);
    }
    catch (Exception)
    {
        exceptionThrown = true;
    }

    Assert.IsTrue(exceptionThrown, "An exception should have been thrown");
}
/// <summary>
/// Reads the keywords declared in the meta nodes of the page at <paramref name="url"/>.
/// </summary>
/// <param name="url">Page address; checked with <c>IsUrlValid</c> first.</param>
/// <returns>
/// The result of <c>GetKeywordsFromMetaNodes</c> for the page, or <c>null</c> when the URL is not valid.
/// </returns>
private List<string> GetKeywordsFromUrl(string url)
{
    if (IsUrlValid(url))
    {
        var document = HtmlAgilityPackHelper.RetrieveHtml(url);
        var metaNodeCollection = HtmlAgilityPackHelper.GetMetaNodesCollection(document);
        return GetKeywordsFromMetaNodes(metaNodeCollection);
    }

    return null;
}
/// <summary>
/// Collects the category tree from the DangDang category page and stores every category
/// through <c>Insert</c>.
/// </summary>
/// <remarks>
/// Within each list item, the anchor with class <c>title</c> is stored as a first-level
/// category; the other anchors are stored as its children.
/// </remarks>
/// <param name="url">Category page address, e.g. http://category.dangdang.com/.</param>
public static void DangDangSpider(string url)
{
    WebBrowerManager.Instance.Setup(new cEXWB());
    var html = WebBrowerManager.Instance.Run(url);
    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);
    var container = htmlDocument.DocumentNode.SelectNodes("/html[1]/body[1]/div[2]/div[1]/div[2]/div[1]/div[4]/div");
    if (container == null || !container.Any())
    {
        return;
    }

    // Characters of "&nbsp;". Trim removes ANY of these from both ends of the title.
    var chars = new char[6] { '&', 'n', 'b', 's', 'p', ';' };

    foreach (HtmlNode htmlNode in container)
    {
        if (!htmlNode.HasAttributes || htmlNode.Attributes["Id"] == null)
        {
            continue;
        }

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var liNodes = htmlNode.SelectNodes("div[2]/ul[1]/li");
        if (liNodes == null)
        {
            continue;
        }

        foreach (HtmlNode liNode in liNodes)
        {
            var anchors = liNode.SelectNodes("a");
            if (anchors == null)
            {
                continue;
            }

            // Declared once per list item so the children that follow a title anchor see its
            // key. Previously it was reset to 0 for every anchor, so every child got parent "0".
            var firstKey = 0;

            foreach (HtmlNode node in anchors)
            {
                var hrefAttribute = node.Attributes["href"];
                var classAttribute = node.HasAttributes ? node.Attributes["class"] : null;

                if (classAttribute != null && classAttribute.Value == "title")
                {
                    // First-level category; kept even without a link because children refer to it.
                    firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                    Insert(firstKey.ToString(), node.InnerText.Trim(chars),
                        hrefAttribute != null ? hrefAttribute.Value : string.Empty, "0", "5");
                }
                else if (hrefAttribute != null)
                {
                    // Second-level category; anchors without a link are skipped.
                    var secondKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                    Insert(secondKey.ToString(), node.InnerText, hrefAttribute.Value, firstKey.ToString(), "5");
                }
            }
        }
    }
}
/// <summary>
/// Posts <paramref name="postDataString"/> to <paramref name="url"/>, extracts the case link
/// from the <c>dataStore</c> array in the response, downloads the case page from
/// www.zjcredit.gov.cn and returns its fields as a dictionary.
/// </summary>
/// <param name="url">List page address that receives the POST.</param>
/// <param name="postDataString">Body of the POST request.</param>
/// <returns>One dictionary per extracted case; each one is also written to the console.</returns>
private List<Dictionary<string, string>> GetInfo(string url, string postDataString)
{
    Func<string, string> stripSpaces = s => s.Replace(" ", "");
    var records = new List<Dictionary<string, string>>();
    var client = new HttpHelper { Timeout = 5 * 60 * 1000, HttpEncoding = _httpEncoding };

    var pageHtml = client.GetHtmlByPost(url, postDataString);

    // Narrow the response step by step: the dataStore array, its first quoted entry,
    // and finally the text after the last '$' of that entry.
    var dataStoreText = Regex.Match(pageHtml, @"dataStore[\s]*=[\s]*\[.*?\]").Value;
    var firstEntry = Regex.Match(dataStoreText, "(?<=\")[^,]+?(?=\")").Value;
    var caseLinks = Regex.Matches(firstEntry, @"(?<=\$)[^\$]*$");

    foreach (Match caseLink in caseLinks)
    {
        pageHtml = client.GetHtmlByGet($"http://www.zjcredit.gov.cn{caseLink.Value}");
        var root = HtmlAgilityPackHelper.GetDocumentNodeByHtml(pageHtml);

        var penalty = new AdministrativePenaltyInfo
        {
            CaseName = root.SelectSingleNode("//td[@class='listf2']").InnerText,
            CaseId = root.SelectSingleNode("//table[2]//tr[1]/td[@class='xzcf_xx']").InnerText,
            PenaltyObject = stripSpaces(root.SelectSingleNode("//table[2]//tr[2]/td[@class='xzcf_xx']/text()").InnerText),
            LegalRepresentative = Regex.Match(root.SelectSingleNode("//table[2]//span[@class='xzcf_mc']").InnerText, "(?<=:).*").Value.Trim(),
            Department = root.SelectSingleNode("//table[2]//tr[3]/td[@class='xzcf_xx']").InnerText,
            PenaltyDate = root.SelectSingleNode("//table[2]//tr[4]/td[@class='xzcf_xx']").InnerText,
            PenalyText = root.SelectSingleNode("//table[4]//td[@class='xzcf_jds']").InnerText
        };

        var record = new Dictionary<string, string>
        {
            { "CaseName", penalty.CaseName },
            { "CaseId", penalty.CaseId },
            { "PenaltyObject", penalty.PenaltyObject },
            { "LegalRepresentative", penalty.LegalRepresentative },
            { "Department", penalty.Department },
            { "PenaltyDate", penalty.PenaltyDate },
            { "PenalyText", penalty.PenalyText },
            { "PostUrl", url },
            { "Url", caseLink.Value },
            { "ThreadId", Thread.CurrentThread.ManagedThreadId.ToString() }
        };

        foreach (var field in record)
        {
            Console.WriteLine($"{field.Key}:{field.Value}");
        }

        records.Add(record);
    }

    return records;
}
/// <summary>
/// Collects the three-level category tree from the Yihaodian "all categories" page and
/// stores every category through <c>Insert</c>.
/// </summary>
/// <param name="url">Address of the all-categories page, e.g. http://www.yihaodian.com/product/listAll.do.</param>
public static void YiHaoDianSpider(string url)
{
    HttpClient hc = new HttpClient(url);
    hc.Timeout = 30000;
    var allSortHtml = hc.Request();
    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(allSortHtml);

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var firstCategoryContainer = htmlDocument.DocumentNode.SelectNodes("//div[@class='alonesort']");
    if (firstCategoryContainer == null)
    {
        return;
    }

    foreach (HtmlNode firstCategoryNode in firstCategoryContainer)
    {
        // Resolve the heading anchor once instead of re-running the selector for every use.
        var firstAnchors = firstCategoryNode.CssSelect(".mt>h3>a");
        var firstAnchor = firstAnchors == null ? null : firstAnchors.FirstOrDefault();
        if (firstAnchor == null)
        {
            continue;
        }

        // First-level category; a missing link is stored as an empty URL.
        var firstHref = firstAnchor.Attributes["href"];
        var firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
        Insert(firstKey.ToString(), firstAnchor.InnerText,
            firstHref != null ? firstHref.Value : string.Empty, "0", "2");

        var secondCategoryContainer = firstCategoryNode.CssSelect(".mc>.fore");
        foreach (HtmlNode htmlNode in secondCategoryContainer)
        {
            // Second-level category. The key is always drawn so that third-level entries
            // keep a parent key even when the second-level anchor or its link is missing.
            var secondCategoryNode = htmlNode.CssSelect("dt>a").FirstOrDefault();
            var secondKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
            if (secondCategoryNode != null && secondCategoryNode.Attributes["href"] != null)
            {
                Insert(secondKey.ToString(), secondCategoryNode.InnerText,
                    secondCategoryNode.Attributes["href"].Value, firstKey.ToString(), "2");
            }

            // Third-level categories.
            var threeCategoryNodes = htmlNode.CssSelect("dd>em>span>a");
            foreach (HtmlNode threeCategoryNode in threeCategoryNodes)
            {
                var thirdKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                if (threeCategoryNode.Attributes["href"] != null)
                {
                    Insert(thirdKey.ToString(), threeCategoryNode.InnerText,
                        threeCategoryNode.Attributes["href"].Value, secondKey.ToString(), "2");
                }
            }
        }
    }
}
/// <summary>
/// Collects the three-level category tree from the Suning category page and stores every
/// category through <c>Insert</c>.
/// </summary>
/// <param name="url">
/// Category page address, e.g.
/// www.suning.com/emall/SNProductCatgroupView?storeId=10052&amp;catalogId=10051&amp;flag=1.
/// </param>
public static void SuNingSpider(string url)
{
    WebBrowerManager.Instance.Setup(new cEXWB());
    var html = WebBrowerManager.Instance.Run(url);
    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);
    var container = htmlDocument.DocumentNode.CssSelect("div.sFloor.clearfix");

    foreach (HtmlNode htmlNode in container)
    {
        // A floor without a heading anchor cannot be stored; skip it instead of crashing.
        var firstNode = htmlNode.CssSelect("h3>a").FirstOrDefault();
        if (firstNode == null)
        {
            continue;
        }

        // First-level category; a missing link is stored as an empty URL.
        var firstHref = firstNode.Attributes["href"];
        var firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
        Insert(firstKey.ToString(), firstNode.InnerText,
            firstHref != null ? firstHref.Value : string.Empty, "0", "4");

        // Each selector is evaluated once; adding an empty sequence is a no-op.
        var temp = new List<HtmlNode>();
        var leftLists = htmlNode.CssSelect(".listLeft>dl");
        if (leftLists != null)
        {
            temp.AddRange(leftLists);
        }

        var rightLists = htmlNode.CssSelect(".listRight>dl");
        if (rightLists != null)
        {
            temp.AddRange(rightLists);
        }

        foreach (HtmlNode node in temp)
        {
            // Second-level category (stored without a URL, as before).
            var secondNode = node.CssSelect("dt>a").FirstOrDefault();
            var secondKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
            if (secondNode != null)
            {
                Insert(secondKey.ToString(), secondNode.InnerText, "", firstKey.ToString(), "4");
            }

            var thridNodes = node.CssSelect("dd>span>a");
            if (thridNodes == null)
            {
                continue;
            }

            foreach (HtmlNode thridNode in thridNodes)
            {
                // Third-level category; anchors without a link are skipped.
                var thirdHref = thridNode.Attributes["href"];
                if (thirdHref == null)
                {
                    continue;
                }

                var thirdKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                Insert(thirdKey.ToString(), thridNode.InnerText, thirdHref.Value, secondKey.ToString(), "4");
            }
        }
    }
}
/// <summary>
/// Returns the next URL to visit.
/// </summary>
/// <remarks>
/// When <c>_isInHtml</c> is set, the "next" pager link of the current <c>HtmlSource</c> is
/// queued first, prefixed with <c>https:</c>.
/// NOTE(review): a link that already contains "http" is not queued at all — confirm that
/// dropping absolute URLs is intended.
/// </remarks>
/// <returns>The dequeued URL, or <c>null</c> when the queue is empty.</returns>
protected override string ParseNextUrl()
{
    if (_isInHtml)
    {
        var root = HtmlAgilityPackHelper.GetDocumentNodeByHtml(HtmlSource);
        var nextLink = root.SelectSingleNode("//a[@class='J_SearchAsync next']");
        var href = nextLink?.Attributes["href"]?.Value;

        if (href != null && !href.Contains("http"))
        {
            _urlQueue.Enqueue($"https:{href}");
        }
    }

    if (_urlQueue.Count == 0)
    {
        return null;
    }

    return _urlQueue.Dequeue();
}
/// <summary>
/// Reads the total page count from the first list page and stores it in <c>_totalPage</c>.
/// </summary>
/// <param name="listHtmlFirst">HTML of the first list page.</param>
/// <returns>
/// The number following "/" in the pager text; 0 when no pager node is found or the number
/// cannot be parsed (in the "not found" case <c>_totalPage</c> is left untouched).
/// </returns>
private int InitTotalPage(string listHtmlFirst)
{
    var root = HtmlAgilityPackHelper.GetDocumentNodeByHtml(listHtmlFirst);

    // Verbatim strings: "" stands for one double quote, so the class value being matched
    // literally contains \" on both sides.
    // NOTE(review): presumably the list HTML arrives with backslash-escaped quotes — confirm.
    var pagerNode = root.SelectSingleNode(@"//span[@class='\""page-info\""']");
    if (pagerNode == null)
    {
        pagerNode = root.SelectSingleNode(@"//b[@class='\""ui-page-s-len\""']");
    }

    if (pagerNode == null)
    {
        return 0;
    }

    var totalText = Regex.Match(pagerNode.InnerText, @"(?<=/)\d+").Value;

    int parsedTotal;
    var total = int.TryParse(totalText, out parsedTotal) ? parsedTotal : 0;
    _totalPage = total;
    return total;
}
/// <summary>
/// Handles a left-button release in the embedded browser: finds the tree node that matches
/// the clicked element, highlights and selects it, and appends its XPath and the XPath
/// result text to the rich text box.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Mouse event data; nothing happens when <c>SrcElement</c> is null.</param>
void WebBrower_WBLButtonUp(object sender, csExWB.HTMLMouseEventArgs e)
{
    if (e.SrcElement == null)
    {
        return;
    }

    // Search each root of the tree until one of them contains a matching node.
    TreeNodeEx match = null;
    foreach (var root in HtmlTree.Nodes)
    {
        var clicked = new SelectedElement
        {
            tagName = e.SrcElement.tagName.ToLower(),
            innerText = e.SrcElement.innerText
        };

        match = this.FindNodeExt(root as TreeNodeEx, clicked);
        if (match != null)
        {
            break;
        }
    }

    if (match == null)
    {
        return;
    }

    match.ForeColor = Color.Red;
    match.Expand();
    HtmlTree.SelectedNode = match;

    var report = new StringBuilder()
        .AppendLine("xpath:" + match.HtmlNode.XPath)
        .AppendLine(HtmlAgilityPackHelper.GetStringByXPath(Html, match.HtmlNode.XPath, "|"));
    richTextBox.Text += report.ToString();
}
/// <summary>
/// Reads the total page count from the first list page and stores it in <c>_totalPage</c>.
/// </summary>
/// <param name="listHtmlFirst">HTML of the first list page.</param>
/// <returns>
/// The number following "/" in the pager text; 0 when no pager node is found or the number
/// cannot be parsed (in the "not found" case <c>_totalPage</c> is left untouched).
/// </returns>
private int InitTotalPage(string listHtmlFirst)
{
    var root = HtmlAgilityPackHelper.GetDocumentNodeByHtml(listHtmlFirst);

    // Verbatim strings: "" stands for one double quote, so the class value being matched
    // literally contains \" on both sides.
    // NOTE(review): presumably the list HTML arrives with backslash-escaped quotes — confirm.
    var pagerNode = root.SelectSingleNode(@"//span[@class='\""page-info\""']");
    if (pagerNode == null)
    {
        pagerNode = root.SelectSingleNode(@"//b[@class='\""ui-page-s-len\""']");
    }

    if (pagerNode == null)
    {
        return 0;
    }

    var totalText = Regex.Match(pagerNode.InnerText, @"(?<=/)\d+").Value;

    int parsedTotal;
    var total = int.TryParse(totalText, out parsedTotal) ? parsedTotal : 0;
    _totalPage = total;
    return total;
}
/// <summary>
/// Counts, for every keyword, how often it occurs in the text of the page at <paramref name="url"/>.
/// </summary>
/// <param name="keywords">Keywords to count.</param>
/// <param name="url">Page address; checked with <c>IsUrlValid</c> first.</param>
/// <returns>
/// One <c>KeywordDto</c> per keyword, in input order, or <c>null</c> when the URL is not valid.
/// </returns>
private List<KeywordDto> GetKeywordsOccurrencesFromHtmlDocument(IEnumerable<string> keywords, string url)
{
    if (!IsUrlValid(url))
    {
        return null;
    }

    var document = HtmlAgilityPackHelper.RetrieveHtml(url);

    return keywords
        .Select(keyword => new KeywordDto
        {
            Keyword = keyword,
            OccurenceCount = document.Text.OccurrencesOf(keyword)
        })
        .ToList();
}
/// <summary>
/// Resolves the URL of the shop's first asynchronous product-list page.
/// </summary>
/// <remarks>
/// Downloads the shop home page to read its title into <c>_shopName</c>, then downloads
/// <c>{url}/search.htm</c> and reads the <c>value</c> attribute of the
/// <c>J_ShopAsynSearchURL</c> input (for example
/// <c>/i/asynSearch.htm?mid=w-1901851942-0&amp;wid=1901851942&amp;path=/search.htm&amp;search=y</c>).
/// Also assigns <c>_shopUrl</c> and passes <c>_cookies</c> by reference to both downloads.
/// </remarks>
/// <param name="url">Shop address; "https://" is prepended when it does not contain "http".</param>
/// <returns>
/// The shop URL followed by the list path, or <see cref="string.Empty"/> when the page title
/// is the "shop not found" title.
/// </returns>
/// <exception cref="NotSupportedException">The page title identifies an Alitrip shop.</exception>
/// <exception cref="InvalidOperationException">
/// The search page has no <c>J_ShopAsynSearchURL</c> input with a <c>value</c> attribute.
/// </exception>
private string InitFirstUrlPart(string url)
{
    // Ordinal, case-insensitive test instead of a culture-sensitive ToLower().
    if (url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
    {
        url = $"https://{url}";
    }

    var mainHtml = GetMainWebContent(_shopUrl = url, null, ref _cookies, null);
    _shopName = Regex.Match(mainHtml, @"(?<=<title>)[\s\S]*?(?=</title>)").Value.Trim();

    if (_shopName.Contains("阿里旅行·去啊Alitrip.com"))
    {
        // Still an Exception for existing catch blocks, but a specific one.
        throw new NotSupportedException("阿里旅行不支持");
    }

    if (_shopName.Equals("店铺浏览-淘宝网"))
    {
        // Shop does not exist: log and signal with an empty result instead of throwing.
        SendLog("店铺不存在!");
        return string.Empty;
    }

    // Keep only the text between the first and the last '-' of the title.
    _shopName = Regex.Match(_shopName, "(?<=-).*(?=-)").Value.Trim();

    var categoryUrl = $"{url}/search.htm";
    var html = GetMainWebContent(categoryUrl, null, ref _cookies, null);
    var documentNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(html);

    // SelectSingleNode returns null when nothing matches; fail with a clear message
    // rather than an anonymous NullReferenceException.
    var searchInput = documentNode.SelectSingleNode("//input[@id='J_ShopAsynSearchURL']");
    var listUrl = searchInput?.Attributes["value"]?.Value;
    if (listUrl == null)
    {
        throw new InvalidOperationException($"J_ShopAsynSearchURL input not found on {categoryUrl}");
    }

    return $"{url}{listUrl}";
}
/// <summary>
/// Parses the announcements listed on the current page (<c>HtmlSource</c>) and downloads
/// the detail page of each one.
/// </summary>
/// <remarks>
/// Parsing stops at the first row without a link or date. It also stops, and clears
/// <c>_urlQueue</c>, at the first row older than <c>_gatherDays</c> days.
/// </remarks>
/// <returns>
/// One result per announcement with the keys <c>url</c>, <c>title</c>, <c>content</c>,
/// <c>publisher</c> and <c>publishTime</c>; an empty array when the page has no rows.
/// </returns>
protected override IResut[] ParseCurrentItems()
{
    List<IResut> resultList = new List<IResut>();
    HtmlNode htmlNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(HtmlSource);

    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection htmlNodeCollection = htmlNode.SelectNodes("//td[@class='Font9']");
    if (htmlNodeCollection == null)
    {
        return resultList.ToArray();
    }

    foreach (HtmlNode node in htmlNodeCollection)
    {
        string url = node.SelectSingleNode("./a[@class='five']")?.Attributes["href"]?.Value;
        string dateTimeString = Regex.Match(node.InnerText, @"\d+-\d+-\d+").Value;
        if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(dateTimeString))
        {
            break;
        }

        url = $"http://www.ccgp-shandong.gov.cn{url}";
        DateTime dateTime = Convert.ToDateTime(dateTimeString);
        int days = (DateTime.Now - dateTime).Days;
        if (days > _gatherDays)
        {
            // Everything from here on is too old: drop the pending pages as well.
            _urlQueue.Clear();
            break;
        }

        string html = _httpHelper.GetHtmlByGet(url);
        HtmlNode htmlNode2 = HtmlAgilityPackHelper.GetDocumentNodeByHtml(html);
        string title = htmlNode2.SelectSingleNode("//div[@align='center']")?.InnerText;
        string publisher = Regex.Match(html, "(?<=发布人[::]).*(?=</td>)").Value;
        string publishTime = Regex.Match(html, "(?<=发布时间[::]).*(?=</td>)").Value;

        // Normalise the publish time when it parses; keep the raw text otherwise instead of
        // aborting the whole page with a FormatException.
        DateTime parsedPublishTime;
        if (DateTime.TryParse(publishTime, out parsedPublishTime))
        {
            publishTime = parsedPublishTime.ToString(CultureInfo.CurrentCulture);
        }

        // A detail page without the expected table yields null content, like a missing title.
        string content = htmlNode2.SelectSingleNode("//table//tr[2]/td[2]/table")?.OuterHtml;

        Resut resut = new Resut()
        {
            ["url"] = url,
            ["title"] = title,
            ["content"] = content,
            ["publisher"] = publisher,
            ["publishTime"] = publishTime
        };
        resultList.Add(resut);
    }

    return resultList.ToArray();
}
/// <summary>
/// Registers the assemblies of the known module descriptors, creates the container and
/// invokes <c>HtmlAgilityPackHelper.FixMissingTagClosings</c> once for all tests.
/// </summary>
static TestBase()
{
    KnownAssemblies = new List<Assembly>
    {
        typeof(RootModuleDescriptor).Assembly,
        typeof(PagesModuleDescriptor).Assembly,
        typeof(BlogModuleDescriptor).Assembly,
        typeof(NewsletterModuleDescriptor).Assembly,
        typeof(MediaManagerModuleDescriptor).Assembly,
        typeof(UsersModuleDescriptor).Assembly,
        typeof(ApiModuleDescriptor).Assembly,
        typeof(UsersApiModuleDescriptor).Assembly,
        typeof(ImagesGalleryModuleDescriptor).Assembly
    };

    CreateContainer();

    HtmlAgilityPackHelper.FixMissingTagClosings();
}
/// <summary>
/// Creates the crawler service and reads the web site URL, the private-page indexing flag
/// and the page fetch timeout from the search section of the CMS configuration.
/// </summary>
/// <param name="cmsConfiguration">Configuration whose <c>Search</c> values are read.</param>
public DefaultWebCrawlerService(ICmsConfiguration cmsConfiguration)
{
    this.cmsConfiguration = cmsConfiguration;

    var configuredSiteUrl = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneWebSiteUrl);
    webServer = configuredSiteUrl ?? string.Empty;

    // An unparsable value leaves the flag at false.
    var privatePagesSetting = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneIndexPrivatePages);
    bool.TryParse(privatePagesSetting, out indexPrivatePages);

    HtmlAgilityPackHelper.FixMissingTagClosings();

    // Only a parsable, strictly positive timeout replaces the current fetch timeout.
    var timeoutSetting = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneIndexerPageFetchTimeout);
    TimeSpan parsedTimeout;
    var hasPositiveTimeout = TimeSpan.TryParse(timeoutSetting, out parsedTimeout)
                             && parsedTimeout > TimeSpan.FromSeconds(0);
    if (hasPositiveTimeout)
    {
        fetchTimeout = parsedTimeout;
    }
}
/// <summary>
/// Extracts the price of a product from its already downloaded detail page.
/// </summary>
/// <remarks>
/// The price is recognised from the price image via <c>ImageProcess.Recognize</c>. The
/// textual price inside the page script is parsed as well, but that value is not used yet.
/// Promotion information is loaded via ajax and is not part of the page.
/// </remarks>
/// <param name="spiderProduct">Product whose <c>HtmlSource</c> is parsed.</param>
/// <returns>
/// A <c>ProductInfo</c> carrying the source, id and URL of <paramref name="spiderProduct"/>;
/// <c>Price</c> is 0 when no price image is found or it cannot be recognised.
/// </returns>
public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
{
    const string pricePrefix = "京东价:¥";

    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(spiderProduct.HtmlSource);
    // Title.
    var title = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[6]/div[1]/div[1]/h1[1]");
    // Price image.
    var price = htmlDocument.DocumentNode.SelectSingleNode("//div[@class='p-price']/img");
    // Textual price.
    var priceText = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[6]/div[1]/div[2]/ul[1]/li[2]/script[1]");

    decimal realPrice = 0;

    // The image node may be missing: check it before touching its attributes.
    var priceSource = price != null ? price.Attributes["src"] : null;
    if (priceSource != null && !string.IsNullOrEmpty(priceSource.Value))
    {
        decimal.TryParse(ImageProcess.Recognize(priceSource.Value), out realPrice);
    }

    if (title != null && price != null && priceText != null)
    {
        var scriptText = priceText.InnerText;

        // Both markers must be present; a -1 index would make IndexOf/Substring throw.
        var beginIndex = scriptText.IndexOf(pricePrefix, System.StringComparison.Ordinal);
        var endIndex = beginIndex < 0
            ? -1
            : scriptText.IndexOf("。", beginIndex + pricePrefix.Length, System.StringComparison.Ordinal);
        if (endIndex >= 0)
        {
            var valueStart = beginIndex + pricePrefix.Length;
            var readPrice = scriptText.Substring(valueStart, endIndex - valueStart);

            // The parsed value is not consumed yet: the update call that used it is disabled.
            decimal decimalRealPrice = 0;
            decimal.TryParse(readPrice, out decimalRealPrice);
        }
    }

    return new ProductInfo()
    {
        Source = spiderProduct.HtmlSource,
        ProductId = spiderProduct.ProductId,
        Url = spiderProduct.Url,
        Price = realPrice
    };
}
/// <summary>
/// Requests one page of the search list and passes the response text to
/// <c>HtmlAgilityPackHelper.GetTable</c>.
/// </summary>
/// <param name="index">Value formatted into <c>SearchUrl</c> (presumably the page index — confirm).</param>
/// <param name="directoryPath">Passed through to <c>HtmlAgilityPackHelper.GetTable</c>.</param>
/// <returns>
/// The models returned by <c>GetTable</c>; an empty list when the request or the parsing fails.
/// </returns>
public static List<DouYinModel> DownLoad(int index, string directoryPath)
{
    List<DouYinModel> result = new List<DouYinModel>();
    try
    {
        string url = string.Format(SearchUrl, index);

        // Request the search list.
        var list_response = HttpHelper.HttpGetRequest(url, null, null, null, keepAlive: true);

        string list_respStr;
        try
        {
            list_respStr = HttpHelper.GetResponseStreamToStr(list_response);
        }
        finally
        {
            // Release the response even when reading it fails.
            if (list_response != null)
            {
                list_response.Close();
            }
        }

        result = HtmlAgilityPackHelper.GetTable(list_respStr, directoryPath);
    }
    catch (Exception)
    {
        // Best effort, as before: any failure yields the empty list.
        // NOTE(review): no logger is visible from this block; consider logging the exception.
    }

    return result;
}
/// <summary>
/// Handles the browser's document-complete event.
/// </summary>
/// <remarks>
/// For the top-level document the source is stored in <c>Html</c> and parsed into
/// <c>htmlDocument</c>, the DOM tree is rebuilt with <c>BuildTree3</c> and the build time is
/// appended to the rich text box. For a frame inside an already loaded main document only
/// <c>IsDocumentFinish</c> is set.
/// </remarks>
/// <param name="sender">The browser control; ignored when it is not a <c>cEXWB</c>.</param>
/// <param name="e">Event data providing <c>url</c> and <c>istoplevel</c>.</param>
private void WebBrower_DocumentComplete(object sender, DocumentCompleteEventArgs e)
{
    // Ordinal, case-insensitive comparison; also tolerates a null url.
    if (string.Equals(e.url, "about:blank", System.StringComparison.OrdinalIgnoreCase))
    {
        return;
    }

    // Every branch below needs the browser; without it there is nothing to do
    // (previously the top-level branch dereferenced it unchecked).
    cEXWB pWB = sender as cEXWB;
    if (pWB == null)
    {
        return;
    }

    if (e.istoplevel)
    {
        IsDocumentFinish = true;

        // Read the document source once and reuse it.
        var documentSource = pWB.DocumentSource;
        Html = documentSource;
        htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(documentSource);

        Stopwatch sw1 = new Stopwatch();
        sw1.Start();
        BuildTree3();
        sw1.Stop();
        richTextBox.Text += string.Format("DOM树构建时间{0}毫秒", sw1.ElapsedMilliseconds) + "\n";
    }
    else if (pWB.MainDocumentFullyLoaded)
    {
        // A frame navigation within a frameset.
        IsDocumentFinish = true;
    }
}
/// <summary>
/// Visits every index page and collects the absolute URLs of the detail pages linked from
/// its list items.
/// </summary>
/// <param name="urls">Index page addresses to visit.</param>
/// <returns>
/// The detail page URLs in the order found; list items without an anchor or with an empty
/// <c>href</c> are skipped.
/// </returns>
public static List<string> Spider(List<string> urls)
{
    var pageUrls = new List<string>();

    foreach (string indexUrl in urls)
    {
        var source = WebBrowerManager.Instance.Brower(indexUrl).HtmlSource;
        var listItems = HtmlAgilityPackHelper.GetHtmlDocument(source)
            .DocumentNode
            .SelectNodes("/html[1]/body[1]/div[1]/div[3]/div[1]/div[3]/ul[1]/li");
        if (listItems == null || !listItems.Any())
        {
            continue;
        }

        foreach (HtmlNode item in listItems)
        {
            var link = item.SelectSingleNode("a");
            if (link == null || !link.HasAttributes)
            {
                continue;
            }

            var href = link.Attributes["href"];
            if (href == null || string.IsNullOrEmpty(href.Value))
            {
                continue;
            }

            // The links are relative to the listing directory.
            pageUrls.Add(string.Format("http://www.sge.sh/publish/sge/xqzx/jyxq/{0}", href.Value));
        }
    }

    return pageUrls;
}
public void RetrieveHtml_ValidUrl_ReturnHtmlDocument()
{
    // Act: retrieve the document for the valid URL.
    var document = HtmlAgilityPackHelper.RetrieveHtml(ValidUrl);

    // Assert: a document is returned.
    Assert.IsNotNull(document);
}