/// <summary>
/// Gathers registration details for a domain by scraping several public lookup pages:
/// the alexa.cn index page (organisation name and ICP licence), the alexa.cn whois pages,
/// the chinaz SEO page (ICP licence fallback) and a Baidu "@v" search result.
/// </summary>
/// <param name="domain">Domain to look up, e.g. "bookzx.org".</param>
/// <returns>
/// A <c>website</c> carrying every field that could be extracted. A blank
/// <paramref name="domain"/> yields an unpopulated instance without any request being sent.
/// </returns>
/// <remarks>
/// To try it manually, paste into a unit test:
/// <code>
/// WhoisQuery s = new WhoisQuery();
/// s.GetWhois("bookzx.org");
/// </code>
/// </remarks>
public website GetWhois(string domain)
{
    website webs = new website();

    // Nothing sensible can be looked up for a blank domain; skip the outbound requests.
    if (string.IsNullOrWhiteSpace(domain))
    {
        return webs;
    }

    // ---- ICP filing data (alexa.cn) ----
    // Page views / IP page:  http://www.alexa.cn/index.php?url=bookzx.org
    // Script used by it:     http://www.alexa.cn/jquery_alexa_new_beta.js
    //   data = "url=" + str + "&sig=" + sig + "&keyt=" + keyt
    //   e.g. bookzx.org,835c4d4506618f02994adada985f41f3,1421651144
    //   http://alexa.cn/api0523.php?url=bookzx.org&sig=835c4d4506618f02994adada985f41f3&keyt=1421651144
    string icpUrl = "http://www.alexa.cn/index.php?url={0}".FormatStr(domain);
    var icpPage = web.GetHtml(icpUrl, null, "utf-8");

    var companyName = icpPage.SubstringAfter("主办单位名称").SubstringBefore("</font>").SubstringAfter("<font>");
    companyName = BaiduQuery.RemoveInivalidChar(companyName);
    if (!string.IsNullOrEmpty(companyName))
    {
        webs.CompanyName = companyName;
    }

    var licenceNo = icpPage.SubstringAfter("网站备案/许可证号").SubstringAfter("<font>").SubstringBefore("</font");
    if (!string.IsNullOrEmpty(licenceNo) && !licenceNo.Contains("无备案信息"))
    {
        // The licence number sits inside an anchor; strip the tag and control characters.
        licenceNo = licenceNo.SubstringBefore("</a>").SubstringAfter("\">");
        licenceNo = licenceNo.Replace("\r", "").Replace("\n", "").Replace("\t", "").Replace("\b", "");
        if (!string.IsNullOrEmpty(licenceNo) && licenceNo != "无备案信息")
        {
            webs.ICPLicense = licenceNo;
        }
    }

    // ---- Whois (alexa.cn) ----
    // Endpoints tried previously (kept for reference, no longer requested):
    //   http://whois.www.net.cn/whois/domain/bookzx.org?spm=5334.WHbookzxor.5.1
    //   http://whois.www.net.cn/whois/api_whois?host=bookzx.org
    //   http://whois.www.net.cn/whois/api_whois_full?host=baidu.com&web_server=whois.markmonitor.com
    // The net.cn JSON API answered {"code":"405","msg":"限制访问","success":false}
    // unless the HTML page had been requested first.
    string whoisIndexUrl = "http://whois.alexa.cn/whois.php?u={0}".FormatStr(domain);
    var html = web.GetHtml(whoisIndexUrl, null, "utf-8");
    if (html.IsContains("域名服务器:"))
    {
        // The first page names the whois server; the second request returns the record itself.
        string whoisServer = html.SubstringAfter("域名服务器:").SubstringBefore("<br").Trim();
        string whoisRecordUrl = "http://whois.alexa.cn/whois.php?server={0}&who={1}".FormatStr(whoisServer, domain);
        html = web.GetHtml(whoisRecordUrl, null, "utf-8");
        if (!string.IsNullOrEmpty(html))
        {
            string registrantName = html.SubstringAfter("Registrant Name:").SubstringBefore("<br").GetTrimed();
            if (!string.IsNullOrEmpty(registrantName))
            {
                webs.RegistrantName = registrantName;
            }

            string registrantEmail = html.SubstringAfter("Registrant Email:").SubstringBefore("<br").GetTrimed();
            if (!string.IsNullOrEmpty(registrantEmail))
            {
                webs.RegistrantEmail = registrantEmail;
            }

            string registrar = html.SubstringAfter("Registrar:").SubstringBefore("<br").GetTrimed();
            if (!string.IsNullOrEmpty(registrar))
            {
                webs.SponsoringRegistrar = registrar;
            }

            DateTime? createdOn = html.SubstringAfter("Creation Date:").SubstringBefore("<br").ToDateTime();
            if (createdOn.HasValue)
            {
                webs.RegistrationDate = createdOn;
            }

            DateTime? expiresOn = html.SubstringAfter("Registrar Registration Expiration Date:").SubstringBefore("<br").ToDateTime();
            if (expiresOn.HasValue)
            {
                webs.ExpirationDate = expiresOn;
            }

            string nameServer = html.SubstringAfter("Name Server:").SubstringBefore("<br").GetTrimed();
            if (!string.IsNullOrEmpty(nameServer))
            {
                webs.DNS = nameServer;
            }

            string registrantPhone = html.SubstringAfter("Registrant Phone:").SubstringBefore("<br").GetTrimed();
            if (!string.IsNullOrEmpty(registrantPhone))
            {
                webs.RegistrantPhone = registrantPhone;
            }

            string registrantStreet = html.SubstringAfter("Registrant Street:").SubstringBefore("<br").GetTrimed();
            if (!string.IsNullOrEmpty(registrantStreet))
            {
                webs.RegistrantAddress = registrantStreet;
            }

            string adminEmail = html.SubstringAfter("Admin Email:").SubstringBefore("<br").GetTrimed();
            if (!string.IsNullOrEmpty(adminEmail))
            {
                webs.AdminEmail = adminEmail;
            }

            string adminPhone = html.SubstringAfter("Admin Phone:").SubstringBefore("<br").GetTrimed();
            if (!string.IsNullOrEmpty(adminPhone))
            {
                webs.AdminPhone = adminPhone;
            }
        }
    }

    // ---- ICP licence fallback (chinaz) ----
    if (string.IsNullOrEmpty(webs.ICPLicense))
    {
        string seoUrl = "http://seo.chinaz.com/?host={0}".FormatStr(domain);
        string seoHtml = web.GetHtml(seoUrl, null, "utf-8");
        if (seoHtml.IsContains("获取不到Seo数据,可能是网站无法访问造成"))
        {
            Console.WriteLine("获取不到Seo数据,可能是网站无法访问造成");
        }

        if (!string.IsNullOrEmpty(seoHtml))
        {
            string license = seoHtml.SubstringAfter("备案号:").SubstringAfter("/font>").SubstringBefore("<font").GetTrimed();
            if (license.IsContain(" "))
            {
                // Keep only the part before the first space.
                license = license.SubstringBefore(" ");
            }

            if (!string.IsNullOrEmpty(license))
            {
                webs.ICPLicense = license;
            }
        }
        else
        {
            Console.WriteLine("备案号Html没有提取到");
        }
    }

    // ---- Baidu "@v" result ----
    string baiduUrl = "http://www.baidu.com/s?wd={0}%40v".FormatStr(domain);
    var baiduHtml = web.GetHtml(baiduUrl, null, "utf-8");
    if (!string.IsNullOrEmpty(baiduHtml))
    {
        if (string.IsNullOrEmpty(webs.BDV))
        {
            webs.BDV = baiduHtml.SubLastStringAfter("主体识别码:").SubstringBefore("</span>").SubLastStringAfter(">");
        }

        if (string.IsNullOrEmpty(webs.ICPLicense))
        {
            webs.ICPLicense = baiduHtml.SubLastStringAfter("备案编号:").SubstringBefore("</td>").SubLastStringAfter(">");
        }

        if (string.IsNullOrEmpty(webs.Whois_txt))
        {
            webs.Whois_txt = BaiduQuery.RemoveInivalidChar(
                baiduHtml.SubLastStringAfter("经营范围:").SubstringBefore("</div>").SubstringAfter("data-origin=\"").SubstringBefore("\">"));
        }

        // Try the first label, then fall back to the second one.
        var siteType = baiduHtml.SubLastStringAfter("商家类型:").SubstringBefore("</td>").SubLastStringAfter(">");
        if (string.IsNullOrEmpty(siteType))
        {
            siteType = baiduHtml.SubLastStringAfter("机构类型:").SubstringBefore("</td>").SubLastStringAfter(">");
        }

        if (!string.IsNullOrEmpty(siteType))
        {
            webs.WebsiteType = siteType;
        }

        // Normalise a missing value to the empty string once the Baidu page was retrieved.
        if (string.IsNullOrEmpty(webs.BDV))
        {
            webs.BDV = "";
        }
    }

    return webs;
}
/// <summary>
/// Walks search-result pages starting at <paramref name="link"/>, downloads each result's
/// target page, scores it against the business keywords and saves one <c>level1link</c>
/// per accepted result through <c>BotMng.save_level1_links</c>.
/// </summary>
/// <param name="link">URL of the first result page; the loop ends when it becomes null or empty.</param>
/// <param name="tsk">Search task; supplies the keyword, user id, task id.</param>
/// <param name="businessKeyword">Business keyword recorded on the link and used for the match flags.</param>
/// <param name="businessKeywords">Business keywords the target page is scored against.</param>
/// <param name="excludedKeywords">Passed through unchanged to <c>save_level1_links</c>.</param>
void GetLinks(string link, searchkeyword tsk, keyword businessKeyword, List<keyword> businessKeywords, List<keyword> excludedKeywords)
{
    BotMng botmng = BotMng.Instance;
    string[] searchKeywords = tsk.Keyword.GetLower().RemoveSpace().Split(';');
    List<KeywordScore> patterns = businessKeywords
        .Select(x => new KeywordScore { Keyword = x.Txt, Score = x.Score, BizType = x.BizType })
        .ToList();
    string[] bizPatterns = businessKeywords.Select(x => x.Txt).ToArray();
    // The search keyword itself also counts as a pattern, with a fixed score of 50.
    patterns.Add(new KeywordScore { Keyword = tsk.Keyword, Score = 50, BizType = 0 });

    int nohist_pages = 0;
    int quried_pages = 0;

    // Page cap: the loop runs while quried_pages <= 60, i.e. up to 61 pages.
    while (!string.IsNullOrEmpty(link) && quried_pages <= 60)
    {
        log(link);
        var html = get_html(link);
        if (html == null)
        {
            break;
        }

        var tags = html.SubAfter("content_left").SplitWith("c-container");
        if (tags == null || tags.Length == 0)
        {
            log("BLOCKED " + tsk.Keyword);
            break;
        }

        bool nohit = true;
        foreach (var tag in tags)
        {
            var anchor = tag.SubAfter("h3").SubAfter("a");
            string title = RemoveInivalidChar(
                anchor.RemoveSpace().GetLower().SubBefore("</h3>").GetTxtFromHtml2().RemoveSpace().GetLower());
            string href = anchor.GetFirstHref2();
            string abs = RemoveInivalidChar(tag.SubAfter("abstract").SubBefore("</div").GetTxtFromHtml2().RemoveSpace().GetLower());
            string domain = tag.SubLastStringAfter("\"f13").SubBefore("</span").GetTxtFromHtml2();
            domain = GetDomain(domain);
            int maxScore = 0;

            // Skip entries that carry neither a title nor an abstract.
            string txt = "{0}{1}".FormatStr(title, abs);
            if (string.IsNullOrEmpty(txt))
            {
                continue;
            }

            string realUrl = null, detailHtml = null, abstracts = null;
            byte appType = 0;

            if (!string.IsNullOrWhiteSpace(href))
            {
                // Item1 is used as the resolved URL, Item2 as the page HTML.
                var fetched = get_htmlUrl(href);
                if (fetched != null && !string.IsNullOrEmpty(fetched.Item1))
                {
                    realUrl = fetched.Item1;
                }
                if (fetched != null && !string.IsNullOrEmpty(fetched.Item2))
                {
                    detailHtml = fetched.Item2;
                }
                if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                {
                    domain = GetDomain(realUrl);
                }
            }

            // Some targets answer with a script-driven redirect page; follow its first link once.
            if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
            {
                var gourl = detailHtml.GetFirstHref2();
                if (!string.IsNullOrEmpty(gourl))
                {
                    var fetched = get_htmlUrl(gourl);
                    if (fetched != null && !string.IsNullOrEmpty(fetched.Item1))
                    {
                        realUrl = fetched.Item1;
                    }
                    if (fetched != null && !string.IsNullOrEmpty(fetched.Item2))
                    {
                        detailHtml = fetched.Item2;
                    }
                    if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                    {
                        domain = GetDomain(realUrl);
                    }
                }
            }

            if (string.IsNullOrEmpty(realUrl))
            {
                realUrl = href;
            }

            // Without the target page there is nothing to score.
            if (string.IsNullOrEmpty(detailHtml))
            {
                continue;
            }

            // The page must mention the search keyword and at least one business keyword.
            if (!detailHtml.Contains(tsk.Keyword) || !detailHtml.IsContains2(bizPatterns))
            {
                continue;
            }

            List<KeywordScore> matchpatterns = new List<KeywordScore>();
            StringBuilder sentence = new StringBuilder();
            HashSet<string> seenSentences = new HashSet<string>();
            StringBuilder collected = new StringBuilder();

            foreach (KeywordScore pattern in patterns)
            {
                string[] pieces = detailHtml.SplitWith(pattern.Keyword);
                if (pieces.Length > 1)
                {
                    matchpatterns.Add(pattern);
                }

                // One sentence per keyword occurrence: text before it + keyword + text after it.
                for (int i = 0; i < pieces.Length - 1; i++)
                {
                    string before = pieces[i];
                    // NOTE(review): the last occurrence gets no trailing context ("" instead of
                    // pieces[i + 1]); this looks like an off-by-one but is kept as in the original.
                    string after = i < pieces.Length - 2 ? pieces[i + 1] : "";

                    // Scan backwards to the nearest leading delimiter that is not preceded
                    // by a character from split_num_commas.
                    int start = 0;
                    for (int j = before.Length - 1; j >= 0; j--)
                    {
                        if (split_bef_commas.Contains(before[j]) && j - 1 >= 0 && !split_num_commas.Contains(before[j - 1]))
                        {
                            start = j + 1;
                            break;
                        }
                    }

                    // Scan forwards to the nearest trailing delimiter that is not followed
                    // by a character from split_num_commas.
                    int end = after.Length;
                    for (int j = 0; j < after.Length; j++)
                    {
                        if (split_aft_commas.Contains(after[j]) && j + 1 < after.Length && !split_num_commas.Contains(after[j + 1]))
                        {
                            end = j;
                            break;
                        }
                    }

                    sentence.Clear();
                    sentence.Append(before, start, before.Length - start);
                    sentence.Append(pattern.Keyword);
                    sentence.Append(after, 0, end);

                    string cleaned = BaiduQuery.RemoveInivalidChar(sentence.ToString().GetTxtFromHtml2().RemoveSpace().GetLower());
                    // Add returns false for a sentence already collected, so duplicates are skipped.
                    if (seenSentences.Add(cleaned))
                    {
                        collected.Append(cleaned).Append(" ");
                    }
                }
            }

            abstracts = collected.ToString();
            if (!string.IsNullOrEmpty(abstracts) && matchpatterns.Count > 0)
            {
                maxScore = ScoreMatchedPatterns(matchpatterns, out appType);
            }

            // Fallback: nothing was extracted from the page, so score the result-page abstract.
            if (string.IsNullOrEmpty(abstracts) && !string.IsNullOrEmpty(abs))
            {
                matchpatterns = patterns.Where(x => x.Keyword != null && abs.Contains(x.Keyword)).ToList();
                // The helper tolerates an empty list; calling Max on it directly would throw.
                maxScore = ScoreMatchedPatterns(matchpatterns, out appType);
            }

            if (maxScore > 100)
            {
                maxScore = 100;
            }

            bool is_bus_matched = txt.IsContains2(businessKeyword.Txt);
            bool is_title_matched = title.GetLower().IsContains2(searchKeywords);
            bool is_abstr_matched = abs.IsContains2(searchKeywords);
            BaiduItemPart part = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract
                : is_title_matched ? BaiduItemPart.Title
                : is_abstr_matched ? BaiduItemPart.Abstract
                : BaiduItemPart.None;
            bool is_itm_title_matched = txt.GetLower().IsContains2(searchKeywords);

            level1link l1 = new level1link
            {
                UsrId = tsk.UsrId,
                Domain = domain,
                TopDomain = GetLevel1Domain(domain),
                Keywords = string.Format("{0} + {1}", tsk.Keyword, businessKeyword.Txt),
                LinkUrl = realUrl,
                MatchAt = (byte)part,
                Html = detailHtml,
                // Bit 0: business keyword found; bit 1: search keyword found in title+abstract.
                MatchType = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                AppType = appType,
                BizId = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(realUrl, tsk.UsrId, tsk.Keyword)),
                SearchkeywordId = tsk._id.ToString(),
                // UTC shifted by +8 hours.
                CreatedAt = DateTime.UtcNow.AddHours(8),
                Description = abs,
                Title = title,
                Score = maxScore,
                Abstract = abstracts
            };

            botmng.save_level1_links(new List<level1link> { l1 }, tsk, excludedKeywords);
            nohit = false;
            nohist_pages = 0;
        }

        if (nohit)
        {
            nohist_pages++;
        }

        // Give up once more than 3 consecutive pages produced no saved result.
        if (nohist_pages > 3)
        {
            break;
        }

        quried_pages++;
        pages++;

        // Follow the "next page" link; relative links are resolved against www.baidu.com.
        link = html.SubAfter("fkfk_cur").SubBefore("下一页").GetLastHref2();
        if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
        {
            if (link.IsStartWith("/"))
            {
                link = link.SubAfter("/");
            }
            link = "http://www.baidu.com/".GetContact(link);
        }
    }
}

// Combines the scores of the matched patterns: the highest score plus a tenth of every other one.
// Also reports the BizType of the highest-scored pattern whose BizType is above zero.
// Returns 0 (and appType 0) when nothing matched.
private static int ScoreMatchedPatterns(List<KeywordScore> matched, out byte appType)
{
    appType = 0;
    if (matched == null || matched.Count == 0)
    {
        return 0;
    }

    // A pattern without a score counts as 50.
    int score = matched.Max(x => x.Score ?? 50);
    appType = matched.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
    score += matched.Sum(x => (x.Score ?? 50) / 10);
    // The top pattern is already counted in full, so take its tenth back out.
    score -= matched.Max(x => (x.Score ?? 50) / 10);
    return score;
}