/// <summary>
        /// Collects registration details for a domain by scraping several public lookup pages.
        /// </summary>
        /// <remarks>
        /// Sources are queried in this order, each one filling only the fields it can extract:
        /// 1. www.alexa.cn index page: organiser (company) name and ICP licence number.
        /// 2. whois.alexa.cn: registrant, registrar, dates, name server and admin contact.
        /// 3. seo.chinaz.com: ICP licence number, only when it is still missing.
        /// 4. Baidu search for "{domain}@v": BDV code, ICP licence, business scope, website type.
        /// </remarks>
        /// <param name="domain">Domain name to look up, e.g. "bookzx.org". It is inserted into the request URLs as-is.</param>
        /// <returns>A new <c>website</c> instance; properties for which nothing was extracted are left as constructed.</returns>
        public website GetWhois(string domain)
        {
            website webs = new website();

            // Manual test: copy the code below into UnitTest1.
            //[TestMethod]
            //public void whois()
            //{
            //    WhoisQuery s = new WhoisQuery();
            //    s.GetWhois("bookzx.org");
            //}

            FillFromAlexaIndex(webs, domain);
            FillFromAlexaWhois(webs, domain);

            // chinaz is only a fallback for the ICP licence number.
            if (string.IsNullOrEmpty(webs.ICPLicense))
            {
                FillIcpFromChinaz(webs, domain);
            }

            FillFromBaiduV(webs, domain);

            return webs;
        }

        // Reads the organiser name and the ICP licence number from the alexa.cn index page.
        private void FillFromAlexaIndex(website webs, string domain)
        {
            // PV / IP figures are served by the same site:
            //   page:   http://www.alexa.cn/index.php?url=bookzx.org
            //   script: http://www.alexa.cn/jquery_alexa_new_beta.js
            //   data = "url=" + str + "&sig=" + sig + "&keyt=" + keyt
            //   e.g. http://alexa.cn/api0523.php?url=bookzx.org&sig=835c4d4506618f02994adada985f41f3&keyt=1421651144
            string indexUrl = "http://www.alexa.cn/index.php?url={0}".FormatStr(domain);
            var    page     = web.GetHtml(indexUrl, null, "utf-8");

            var companyName = page.SubstringAfter("主办单位名称").SubstringBefore("</font>").SubstringAfter("<font>");
            companyName = BaiduQuery.RemoveInivalidChar(companyName);
            if (!string.IsNullOrEmpty(companyName))
            {
                webs.CompanyName = companyName;
            }

            var icp = page.SubstringAfter("网站备案/许可证号").SubstringAfter("<font>").SubstringBefore("</font");
            if (string.IsNullOrEmpty(icp) || icp.Contains("无备案信息"))
            {
                // Nothing extracted, or the page reports "no filing information".
                return;
            }

            // The licence number is the text of the link inside the <font> element.
            icp = icp.SubstringBefore("</a>").SubstringAfter("\">");
            icp = icp.Replace("\r", "").Replace("\n", "").Replace("\t", "").Replace("\b", "");
            if (!string.IsNullOrEmpty(icp) && icp != "无备案信息")
            {
                webs.ICPLicense = icp;
            }
        }

        // Resolves the whois server through whois.alexa.cn and copies the record's fields onto webs.
        private void FillFromAlexaWhois(website webs, string domain)
        {
            // Other endpoints seen for the same data (not requested here):
            //   http://whois.www.net.cn/whois/domain/bookzx.org?spm=5334.WHbookzxor.5.1
            //   http://whois.www.net.cn/whois/api_whois?host=bookzx.org
            //   http://whois.www.net.cn/whois/api_whois_full?host=baidu.com&web_server=whois.markmonitor.com
            // Historical note: that JSON API answered {"code":"405","msg":"限制访问","success":false}
            // ("access restricted") unless the regular whois/domain page had been requested first.
            string lookupUrl = "http://whois.alexa.cn/whois.php?u={0}".FormatStr(domain);
            var    html      = web.GetHtml(lookupUrl, null, "utf-8");

            if (!html.IsContains("域名服务器:"))
            {
                return;
            }

            // The first page only names the whois server; the record itself needs a second request.
            string whoisServer = html.SubstringAfter("域名服务器:").SubstringBefore("<br").Trim();
            string recordUrl   = "http://whois.alexa.cn/whois.php?server={0}&who={1}".FormatStr(whoisServer, domain);
            html = web.GetHtml(recordUrl, null, "utf-8");
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            string registrantName = ReadWhoisField(html, "Registrant Name:");
            if (!string.IsNullOrEmpty(registrantName))
            {
                webs.RegistrantName = registrantName;
            }

            string email = ReadWhoisField(html, "Registrant Email:");
            if (!string.IsNullOrEmpty(email))
            {
                webs.RegistrantEmail = email;
            }

            string sponsoringRegistrar = ReadWhoisField(html, "Registrar:");
            if (!string.IsNullOrEmpty(sponsoringRegistrar))
            {
                webs.SponsoringRegistrar = sponsoringRegistrar;
            }

            // Dates are parsed from the raw (untrimmed) field text, as before.
            DateTime? registeredOn = html.SubstringAfter("Creation Date:").SubstringBefore("<br").ToDateTime();
            if (registeredOn.HasValue)
            {
                webs.RegistrationDate = registeredOn;
            }

            DateTime? expiresOn = html.SubstringAfter("Registrar Registration Expiration Date:").SubstringBefore("<br").ToDateTime();
            if (expiresOn.HasValue)
            {
                webs.ExpirationDate = expiresOn;
            }

            string dns = ReadWhoisField(html, "Name Server:");
            if (!string.IsNullOrEmpty(dns))
            {
                webs.DNS = dns;
            }

            string phone = ReadWhoisField(html, "Registrant Phone:");
            if (!string.IsNullOrEmpty(phone))
            {
                webs.RegistrantPhone = phone;
            }

            string address = ReadWhoisField(html, "Registrant Street:");
            if (!string.IsNullOrEmpty(address))
            {
                webs.RegistrantAddress = address;
            }

            string adminEmail = ReadWhoisField(html, "Admin Email:");
            if (!string.IsNullOrEmpty(adminEmail))
            {
                webs.AdminEmail = adminEmail;
            }

            string adminPhone = ReadWhoisField(html, "Admin Phone:");
            if (!string.IsNullOrEmpty(adminPhone))
            {
                webs.AdminPhone = adminPhone;
            }
        }

        // Returns the trimmed text between the given label and the next "<br" in a whois record page.
        private static string ReadWhoisField(string html, string label)
        {
            return html.SubstringAfter(label).SubstringBefore("<br").GetTrimed();
        }

        // Fallback source for the ICP licence number: the seo.chinaz.com report page.
        private void FillIcpFromChinaz(website webs, string domain)
        {
            string reportUrl = "http://seo.chinaz.com/?host={0}".FormatStr(domain);
            string report    = web.GetHtml(reportUrl, null, "utf-8");

            if (string.IsNullOrEmpty(report))
            {
                Console.WriteLine("备案号Html没有提取到");
                return;
            }

            // The page was fetched but chinaz itself could not analyse the site; extraction is still attempted.
            if (report.IsContains("获取不到Seo数据,可能是网站无法访问造成"))
            {
                Console.WriteLine("获取不到Seo数据,可能是网站无法访问造成");
            }

            string license = report.SubstringAfter("备案号:").SubstringAfter("/font>").SubstringBefore("<font").GetTrimed();
            if (license.IsContain("&nbsp"))
            {
                license = license.SubstringBefore("&nbsp");
            }
            if (!string.IsNullOrEmpty(license))
            {
                webs.ICPLicense = license;
            }
        }

        // Fills BDV, ICP licence, business scope and website type from the Baidu "{domain}@v" result page.
        private void FillFromBaiduV(website webs, string domain)
        {
            // "%40" is the URL-encoded "@", so the query sent is "{domain}@v".
            string searchUrl = "http://www.baidu.com/s?wd={0}%40v".FormatStr(domain);
            var    page      = web.GetHtml(searchUrl, null, "utf-8");

            if (string.IsNullOrEmpty(page))
            {
                return;
            }

            if (string.IsNullOrEmpty(webs.BDV))
            {
                webs.BDV = page.SubLastStringAfter("主体识别码:").SubstringBefore("</span>").SubLastStringAfter(">");
            }
            if (string.IsNullOrEmpty(webs.ICPLicense))
            {
                webs.ICPLicense = page.SubLastStringAfter("备案编号:").SubstringBefore("</td>").SubLastStringAfter(">");
            }
            if (string.IsNullOrEmpty(webs.Whois_txt))
            {
                webs.Whois_txt = BaiduQuery.RemoveInivalidChar(page.SubLastStringAfter("经营范围:").SubstringBefore("</div>").SubstringAfter("data-origin=\"").SubstringBefore("\">"));
            }

            // The type is listed under "商家类型:" or, failing that, "机构类型:".
            var siteType = page.SubLastStringAfter("商家类型:").SubstringBefore("</td>").SubLastStringAfter(">");
            if (string.IsNullOrEmpty(siteType))
            {
                siteType = page.SubLastStringAfter("机构类型:").SubstringBefore("</td>").SubLastStringAfter(">");
            }
            if (!string.IsNullOrEmpty(siteType))
            {
                webs.WebsiteType = siteType;
            }

            // Once the Baidu page has been read, BDV is normalised to an empty string rather than null.
            if (string.IsNullOrEmpty(webs.BDV))
            {
                webs.BDV = "";
            }
        }
Exemple #2
0
        /// <summary>
        /// Walks Baidu result pages starting at <paramref name="link"/>, fetches every result's target page,
        /// scores it against the keyword patterns and saves one <c>level1link</c> per accepted result.
        /// </summary>
        /// <remarks>
        /// A result is skipped when it has no title/abstract text, when its target page cannot be fetched,
        /// or when the target page does not contain both the search keyword and a business keyword.
        /// Paging stops when there is no next link, a page cannot be fetched, a page has no result
        /// containers (logged as "BLOCKED"), the page counter exceeds 60, or more than three consecutive
        /// pages saved nothing.
        /// </remarks>
        /// <param name="link">URL of the first result page.</param>
        /// <param name="tsk">Search task; its Keyword, UsrId and _id are read here.</param>
        /// <param name="businessKeyword">Business keyword recorded on each link and used for the match-type flag.</param>
        /// <param name="businessKeywords">Business keywords (text, score, business type) used as scoring patterns.</param>
        /// <param name="excludedKeywords">Passed through unchanged to <c>save_level1_links</c>.</param>
        void GetLinks(string link, searchkeyword tsk, keyword businessKeyword, List<keyword> businessKeywords, List<keyword> excludedKeywords)
        {
            BotMng botmng = BotMng.Instance;

            string[] searchKeywords = tsk.Keyword.GetLower().RemoveSpace().Split(';');

            List<KeywordScore> patterns = businessKeywords.Select(x => new KeywordScore {
                Keyword = x.Txt, Score = x.Score, BizType = x.BizType
            }).ToList();

            string[] bizPatterns = businessKeywords.Select(x => x.Txt).ToArray();

            // The search keyword itself is always a pattern, with score 50 and no business type.
            patterns.Add(new KeywordScore {
                Keyword = tsk.Keyword, Score = 50, BizType = 0
            });

            int nohist_pages = 0;
            int quried_pages = 0;

            // NOTE(review): the original comment said "search at most 60 pages"; with "<=" the loop
            // can fetch 61 pages. Confirm which limit is intended.
            while (!string.IsNullOrEmpty(link) && quried_pages <= 60)
            {
                log(link);
                var html = get_html(link);
                if (html == null)
                {
                    break;
                }
                var tags = html.SubAfter("content_left").SplitWith("c-container");

                if (tags == null || tags.Length == 0)
                {
                    log("BLOCKED " + tsk.Keyword);
                    break;
                }

                bool nohit = true;
                foreach (var tag in tags)
                {
                    var    a     = tag.SubAfter("h3").SubAfter("a");
                    string title = RemoveInivalidChar(
                        a.RemoveSpace().GetLower().SubBefore("</h3>").GetTxtFromHtml2().RemoveSpace().GetLower());
                    string href = a.GetFirstHref2();

                    string abs    = RemoveInivalidChar(tag.SubAfter("abstract").SubBefore("</div").GetTxtFromHtml2().RemoveSpace().GetLower());
                    string domain = GetDomain(tag.SubLastStringAfter("\"f13").SubBefore("</span").GetTxtFromHtml2());

                    // Results without any title/abstract text are dropped.
                    string txt = "{0}{1}".FormatStr(title, abs);
                    if (string.IsNullOrEmpty(txt))
                    {
                        continue;
                    }

                    string realUrl = null, detailHtml = null;

                    if (!string.IsNullOrWhiteSpace(href))
                    {
                        FollowResultLink(href, ref realUrl, ref detailHtml, ref domain);
                    }

                    // Some targets are script-redirect stubs that click an element with id "link";
                    // follow the first href of such a stub once more.
                    if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
                    {
                        var gourl = detailHtml.GetFirstHref2();
                        if (!string.IsNullOrEmpty(gourl))
                        {
                            FollowResultLink(gourl, ref realUrl, ref detailHtml, ref domain);
                        }
                    }
                    if (string.IsNullOrEmpty(realUrl))
                    {
                        realUrl = href;
                    }

                    if (string.IsNullOrEmpty(detailHtml))
                    {
                        continue;
                    }
                    if (!detailHtml.Contains(tsk.Keyword) || !detailHtml.IsContains2(bizPatterns))
                    {
                        continue;
                    }

                    List<KeywordScore> matchpatterns = new List<KeywordScore>();
                    string abstracts    = BuildKeywordAbstracts(detailHtml, patterns, matchpatterns);
                    bool   hasAbstracts = !string.IsNullOrEmpty(abstracts);

                    // No excerpt from the page: fall back to the patterns found in the result snippet.
                    if (!hasAbstracts && !string.IsNullOrEmpty(abs))
                    {
                        matchpatterns = patterns.Where(x => abs.Contains(x.Keyword)).ToList();
                    }

                    int  maxScore = 0;
                    byte appType  = 0;

                    // Max() throws on an empty sequence, so score only when at least one pattern matched;
                    // otherwise the link is saved with score 0 and app type 0.
                    if (matchpatterns.Count > 0 && (hasAbstracts || !string.IsNullOrEmpty(abs)))
                    {
                        maxScore = ScoreMatches(matchpatterns);
                        appType  = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
                    }
                    if (maxScore > 100)
                    {
                        maxScore = 100;
                    }

                    bool is_bus_matched = txt.IsContains2(businessKeyword.Txt);

                    bool          is_title_matched = title.GetLower().IsContains2(searchKeywords);
                    bool          is_abstr_matched = abs.IsContains2(searchKeywords);
                    BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                                     is_title_matched ? BaiduItemPart.Title :
                                                     is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;
                    bool is_itm_title_matched = txt.GetLower().IsContains2(searchKeywords);

                    level1link l1 = new level1link
                    {
                        UsrId           = tsk.UsrId,
                        Domain          = domain,
                        TopDomain       = GetLevel1Domain(domain),
                        Keywords        = string.Format("{0} + {1}", tsk.Keyword, businessKeyword.Txt),
                        LinkUrl         = realUrl,
                        MatchAt         = (byte)part,
                        Html            = detailHtml,
                        // Bit 0: business keyword found in title+abstract; bit 1: search keyword found there.
                        MatchType       = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                        AppType         = appType,
                        BizId           = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(realUrl, tsk.UsrId, tsk.Keyword)),
                        SearchkeywordId = tsk._id.ToString(),
                        // UTC shifted by +8 hours.
                        CreatedAt       = DateTime.UtcNow.AddHours(8),
                        Description     = abs,
                        Title           = title,
                        Score           = maxScore,
                        Abstract        = abstracts
                    };

                    botmng.save_level1_links(new List<level1link> {
                        l1
                    }, tsk, excludedKeywords);
                    nohit        = false;
                    nohist_pages = 0;
                }

                if (nohit)
                {
                    nohist_pages++;
                }
                // NOTE(review): the original comment said "stop after 3 consecutive pages without
                // results"; with ">" the loop stops on the fourth such page. Confirm the intended limit.
                if (nohist_pages > 3)
                {
                    break;
                }

                quried_pages++;
                pages++;

                // The next-page link is the last href between the current-page marker and the "下一页" label.
                link = html.SubAfter("fkfk_cur").SubBefore("下一页").GetLastHref2();
                if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
                {
                    if (link.IsStartWith("/"))
                    {
                        link = link.SubAfter("/");
                    }
                    link = "http://www.baidu.com/".GetContact(link);
                }
            }
        }

        // Fetches url and overwrites realUrl / detailHtml with any non-empty values returned; derives domain from realUrl when domain is still empty.
        private void FollowResultLink(string url, ref string realUrl, ref string detailHtml, ref string domain)
        {
            var fetched = get_htmlUrl(url);
            if (fetched != null)
            {
                if (!string.IsNullOrEmpty(fetched.Item1))
                {
                    realUrl = fetched.Item1;
                }
                if (!string.IsNullOrEmpty(fetched.Item2))
                {
                    detailHtml = fetched.Item2;
                }
            }
            if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
            {
                domain = GetDomain(realUrl);
            }
        }

        // Builds a space-separated list of distinct excerpts around each pattern occurrence in detailHtml and adds every pattern that occurs to matched.
        private string BuildKeywordAbstracts(string detailHtml, List<KeywordScore> patterns, List<KeywordScore> matched)
        {
            HashSet<string> seen    = new HashSet<string>();
            StringBuilder   joined  = new StringBuilder();
            StringBuilder   excerpt = new StringBuilder();

            foreach (KeywordScore pattern in patterns)
            {
                string[] pieces = detailHtml.SplitWith(pattern.Keyword);
                if (pieces.Length > 1)
                {
                    matched.Add(pattern);
                }

                // Each gap between two consecutive pieces is one occurrence of the keyword.
                for (int i = 0; i < pieces.Length - 1; i++)
                {
                    string before = pieces[i];
                    // NOTE(review): for the last occurrence the text after the keyword is deliberately
                    // left empty here, although pieces[i + 1] exists — confirm this is intended.
                    string after = i < pieces.Length - 2 ? pieces[i + 1] : "";

                    // Leading context starts just after the closest preceding delimiter, unless the
                    // character in front of that delimiter is in split_num_commas. Index 0 never cuts.
                    int start = 0;
                    for (int j = before.Length - 1; j >= 1; j--)
                    {
                        if (split_bef_commas.Contains(before[j]) && !split_num_commas.Contains(before[j - 1]))
                        {
                            start = j + 1;
                            break;
                        }
                    }

                    // Trailing context ends just before the first delimiter, unless the character
                    // after that delimiter is in split_num_commas. The last character never cuts.
                    int end = after.Length;
                    for (int j = 0; j < after.Length - 1; j++)
                    {
                        if (split_aft_commas.Contains(after[j]) && !split_num_commas.Contains(after[j + 1]))
                        {
                            end = j;
                            break;
                        }
                    }

                    excerpt.Append(before, start, before.Length - start);
                    excerpt.Append(pattern.Keyword);
                    excerpt.Append(after, 0, end);

                    string text = BaiduQuery.RemoveInivalidChar(excerpt.ToString().GetTxtFromHtml2().RemoveSpace().GetLower());
                    excerpt.Clear();

                    // Keep each distinct excerpt once, in first-seen order.
                    if (seen.Add(text))
                    {
                        joined.Append(text).Append(" ");
                    }
                }
            }
            return joined.ToString();
        }

        // Highest pattern score (missing scores count as 50) plus a tenth of every other match's score; matches must not be empty.
        private static int ScoreMatches(List<KeywordScore> matches)
        {
            int score = matches.Max(x => x.Score ?? 50);
            score += matches.Sum(x => (x.Score ?? 50) / 10);
            // The sum above also included one maximal tenth; take it back out.
            score -= matches.Max(x => (x.Score ?? 50) / 10);
            return score;
        }