示例#1
0
        /// <summary>
        /// Downloads the article behind <paramref name="link"/> and converts it into a <see cref="Page"/>.
        /// </summary>
        /// <remarks>
        /// The page text is the optional lead paragraph (<c>div.perex</c>) followed by every
        /// <c>p</c> directly under <c>div.clanek</c>, excluding the author paragraph
        /// (<c>p.clanek-autor</c>). Authors are taken from the author paragraph: everything after the
        /// first ':' is split on commas and trimmed.
        /// </remarks>
        /// <param name="link">Address of the article to download.</param>
        /// <returns>The populated page (text, title, keywords, categories, authors, publish date).</returns>
        /// <exception cref="NullReferenceException">The document contains no article paragraphs.</exception>
        override public Page DownloadPage(Uri link)
        {
            HtmlDocument doc = this.GetHtmlDocumentFromLink(link);
            KeyValuePair <string, List <string> > baseInfo = GetTitleAndKeywords(doc);
            List <string>      authors    = new List <string>();
            HtmlNode           pretexNode = doc.DocumentNode.SelectSingleNode("//div[@class='clanek']/div[@class='perex']");
            HtmlNodeCollection paragraphs = doc.DocumentNode.SelectNodes("//div[@class='clanek']/p");

            // SelectNodes returns null (not an empty collection) when nothing matches.
            // The exception type is kept as-is because callers may already catch it.
            if (paragraphs == null)
            {
                throw new NullReferenceException(link + " doesn't contain paragraphs!!!");
            }

            HtmlNode authorNode = doc.DocumentNode.SelectSingleNode("//div[@class='clanek']/p[@class='clanek-autor']");

            if (authorNode != null)
            {
                // The author paragraph matches the paragraph query too; drop it so it is not part of the text.
                paragraphs.Remove(authorNode);

                // IndexOf returns -1 when there is no ':', in which case the whole text is used.
                string authorsString = authorNode.InnerText;
                authorsString = authorsString.Substring(authorsString.IndexOf(':') + 1).Trim();
                authors.AddRange(authorsString.Split(new string[] { "," }, StringSplitOptions.RemoveEmptyEntries).Select(p => p.Trim()));
            }

            StringBuilder textBuilder = new StringBuilder();

            // The lead paragraph is optional: test for it instead of catching a NullReferenceException.
            if (pretexNode != null)
            {
                textBuilder.AppendLine(pretexNode.InnerText);
            }

            foreach (HtmlNode node in paragraphs)
            {
                textBuilder.AppendLine(node.InnerText);
            }

            Page page = new Page(link.AbsoluteUri, textBuilder.ToString(), baseInfo.Key);

            page.Keywords    = baseInfo.Value;
            page.Categories  = GetCategory(doc);
            page.Author      = authors;
            page.PublishDate = GetPublishDate(doc);

            return(page);
        }
示例#2
0
        /// <summary>
        /// Fetches the Yahoo Taiwan quote page for <paramref name="stockId"/> and renders the quote
        /// table as text, one "header:value" pair per line.
        /// </summary>
        /// <remarks>
        /// The first column is written without a header prefix. The page is decoded as Big5.
        /// A missing quote table still surfaces as an exception, as before.
        /// </remarks>
        /// <param name="stockId">Numeric stock code appended to the query string.</param>
        /// <returns>The HTML-decoded text built from the table's header row and value row.</returns>
        private string GetStock(int stockId)
        {
            HtmlDocument doc = new HtmlDocument();

            // Specify the source page. Both the client and the buffer are disposed even when the
            // download or the parse throws.
            using (WebClient url = new WebClient())
            using (MemoryStream ms = new MemoryStream(url.DownloadData("http://tw.stock.yahoo.com/q/q?s=" + stockId)))
            {
                doc.Load(ms, Encoding.GetEncoding("big5"));
            }

            // Re-parse only the quote table so the relative XPath queries below start at its rows.
            HtmlDocument hdc = new HtmlDocument();

            hdc.LoadHtml(
                doc.DocumentNode.SelectSingleNode("/html[1]/body[1]/center[1]/table[2]/tr[1]/td[1]/table[1]")
                .InnerHtml);

            // Get the stock header cells.
            HtmlNodeCollection htnode = hdc.DocumentNode.SelectNodes("./tr[1]/th");

            // The caption column has no matching value; remove it only when it is actually present.
            // Passing the null from FirstOrDefault to Remove would throw ArgumentOutOfRangeException.
            HtmlNode captionHeader = htnode.FirstOrDefault(item => item.InnerText == "個股資料");
            if (captionHeader != null)
            {
                htnode.Remove(captionHeader);
            }

            // Get the stock values: one line of text per column once the portfolio link text is stripped.
            string[] txt = hdc.DocumentNode.SelectSingleNode("./tr[2]").InnerText.Replace("加到投資組合", string.Empty)
                           .Trim().Split('\n');
            int i      = 0;
            var result = new StringBuilder();

            foreach (HtmlNode nodeHeader in htnode)
            {
                // The first column is emitted without a "header:" prefix.
                var title = i == 0 ? string.Empty : nodeHeader.InnerText + ":";
                result.Append($"{title}{txt[i]} \n");
                i++;
            }

            return(HttpUtility.HtmlDecode(result.ToString()));
        }
示例#3
0
        /// <summary>
        /// Loads the HTML document and binds its top-level element nodes to the tree view, and adds
        /// one list row per non-null black/white list entry of the web page.
        /// </summary>
        /// <remarks>
        /// Nothing is changed when <c>LoadHtmlDocumentAsync</c> reports failure. When
        /// <c>htmlDoc</c> is null the tree is bound to a null source.
        /// </remarks>
        private async Task LoadHtmlAsync()
        {
            bool documentLoaded = await LoadHtmlDocumentAsync();

            if (documentLoaded == false)
            {
                return;
            }

            HtmlNodeCollection topLevelNodes = null;

            if (htmlDoc != null)
            {
                topLevelNodes = htmlDoc.DocumentNode.ChildNodes;

                // Walk a snapshot so the live collection can be modified while filtering.
                var snapshot = topLevelNodes.ToArray();
                for (int index = 0; index < snapshot.Length; index++)
                {
                    if (snapshot[index].NodeType == HtmlNodeType.Element)
                    {
                        continue;
                    }
                    topLevelNodes.Remove(snapshot[index]);
                }
            }

            if (WebPage.BlackWhiteList != null)
            {
                foreach (var entry in WebPage.BlackWhiteList)
                {
                    if (entry != null)
                    {
                        // Each row works on its own copy of the entry.
                        var row = new HtmlBlackWhiteListItemLine(entry.Clone());
                        row.Deleted += Line_Deleted;
                        stkIdentifies.Children.Add(row);
                    }
                }
            }

            tree.ItemTemplate = tree.Resources["htmlTemplate"] as HierarchicalDataTemplate;

            HtmlNodes        = topLevelNodes;
            tree.ItemsSource = HtmlNodes;
        }
示例#4
0
        /// <summary>
        /// Scrapes biznesradar.pl for the quarterly income-statement reports of the company
        /// <paramref name="name"/> and enriches them with share price and number of shares.
        /// </summary>
        /// <remarks>
        /// One <see cref="Raport"/> is created per quarter header cell. Each remaining table row
        /// supplies value number <c>i</c> for every quarter. Prices and share counts come from rows 1
        /// and 2 of the market-value indicator table and are aligned from the newest quarter backwards.
        /// NOTE(review): numeric parsing uses the current culture, as in the original - confirm it
        /// matches the site's number format.
        /// </remarks>
        /// <param name="name">Company identifier as used in the biznesradar.pl URLs.</param>
        /// <returns>
        /// The reports with the first five quarters skipped, or <c>null</c> when the table has more
        /// value rows than a <see cref="Raport"/> holds (<c>NbElement</c>).
        /// </returns>
        private static List <Raport> GetCompanyRaports(string name)
        {
            List <Raport> CompanyRaports = new List <Raport>();

            HtmlDocument doc = LoadRaportDocument("https://www.biznesradar.pl/raporty-finansowe-rachunek-zyskow-i-strat/" + name + ",Q");

            HtmlNodeCollection r      = doc.DocumentNode.SelectNodes("//table[@class='report-table']");
            HtmlNode           raport = r[0];

            // Quarter names.
            HtmlNodeCollection rh = raport.SelectNodes("//th[@class='thq h'] | //th[@class='thq h newest']");

            foreach (HtmlNode element in rh)
            {
                string namen = element.InnerHtml;
                namen = Regex.Replace(namen, @"\s", "");

                CompanyRaports.Add(new Raport(namen));
            }

            HtmlNodeCollection tr = raport.SelectNodes(".//tr ");

            // The first row holds the quarter headers that were consumed above.
            tr.Remove(0);

            int i = 0;

            foreach (HtmlNode element in tr)
            {
                HtmlNodeCollection rSpan = element.SelectNodes(".//span[@class='value']/span/span");

                // More rows than a report can store: give up, as the original did.
                if (i == CompanyRaports[0].NbElement)
                {
                    return(null);
                }

                int j = 0;

                foreach (HtmlNode value in rSpan)
                {
                    string v = value.InnerHtml;
                    v = Regex.Replace(v, @"\s", "");

                    CompanyRaports[j].Set(i, Convert.ToInt64(v));
                    j++;
                }

                i++;
            }

            doc = LoadRaportDocument("https://www.biznesradar.pl/wskazniki-wartosci-rynkowej/" + name + ",0");

            // Both the price row and the share-count row come from the same row list, so query it once.
            r = doc.DocumentNode.SelectNodes("//table[@class='report-table']//tr");

            // Share price.
            List <double> prices = new List <double>();

            foreach (var p in r[1].SelectNodes(".//td"))
            {
                string price = p.InnerText.Trim();
                if (price != "")
                {
                    try
                    {
                        double pp = Convert.ToDouble(price);
                        prices.Add(pp);
                    }
                    catch (FormatException) { }   // non-numeric cells (labels) are skipped on purpose
                }
            }

            // Align from the newest quarter backwards. Never walk past the first report when the
            // indicator table has more columns than the income statement.
            int pricedReports = Math.Min(prices.Count, CompanyRaports.Count);

            for (int j = 0; j < pricedReports; j++)
            {
                CompanyRaports[CompanyRaports.Count - j - 1].Price = prices[prices.Count - j - 1];
            }

            // Number of shares.
            List <long> numbers = new List <long>();

            foreach (var n in r[2].SelectNodes(".//td"))
            {
                string number = n.InnerText.Trim();
                if (number != "")
                {
                    try
                    {
                        number = number.Replace(" ", "");
                        long nn = Convert.ToInt64(number);
                        numbers.Add(nn);
                    }
                    catch (FormatException) { }   // non-numeric cells (labels) are skipped on purpose
                }
            }

            int countedReports = Math.Min(numbers.Count, CompanyRaports.Count);

            for (int j = 0; j < countedReports; j++)
            {
                CompanyRaports[CompanyRaports.Count - j - 1].NumberShares = numbers[numbers.Count - j - 1];
            }

            return(CompanyRaports.GetRange(5, CompanyRaports.Count - 5));
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and parses it into an HTML document, releasing the
        /// response and its stream once parsing is complete (or has failed).
        /// </summary>
        private static HtmlDocument LoadRaportDocument(string url)
        {
            HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
            HtmlDocument   doc     = new HtmlDocument();

            using (WebResponse response = request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            {
                doc.Load(stream);
            }

            return(doc);
        }
示例#5
0
        //------------------------------------------------------------------------

        /// <summary>
        /// Requests a giveaway listing page and returns the giveaway rows whose cost lies within
        /// the configured <c>From</c>..<c>To</c> range.
        /// </summary>
        /// <remarks>
        /// Rows located in one specific container (matched by the XPath of the row's grandparent)
        /// and rows without a cost heading are dropped. When a page yields no usable rows but the
        /// site still reports results and shows pagination, the next page is requested recursively.
        /// </remarks>
        /// <param name="request">Relative request URL; an empty string means the default search listing.</param>
        /// <param name="page">1-based page number; pages above 1 are appended to the request.</param>
        /// <returns>
        /// The filtered rows (possibly empty), or <c>null</c> when the HTTP request failed and the
        /// error message has been shown.
        /// </returns>
        async private Task <HtmlNodeCollection> GetNodes(string request, int page = 1)
        {
            CookieContainer cookies = new CookieContainer();

            cookies.Add(Program.BaseAddress, new Cookie("PHPSESSID", Program.settings.PHPSESSID));

            string request_temp = request;

            if (page > 1)
            {
                if (request_temp == "")
                {
                    request_temp += "giveaways/search?page=" + page;
                }
                else
                {
                    request_temp += "&page=" + page;
                }
            }

            string responseString;

            using (var handler = new HttpClientHandler()
            {
                CookieContainer = cookies
            })
                using (var client = new HttpClient(handler)
                {
                    BaseAddress = Program.BaseAddress
                })
                {
                    HttpResponseMessage result;

                    try
                    {
                        result = await client.GetAsync(request_temp);

                        result.EnsureSuccessStatusCode();
                    }
                    catch
                    {
                        MainForm.ShowLoadErrorMessage("Filed to get giveaways");
                        return(null);
                    }

                    // Release the response once its body has been read.
                    using (result)
                    {
                        responseString = await result.Content.ReadAsStringAsync();
                    }
                }

            HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
            document.LoadHtml(responseString);

            // SelectNodes returns null when nothing matches. Treat that as an empty page so the
            // "no results / next page" logic below still runs instead of throwing.
            HtmlNodeCollection html_node_collection =
                document.DocumentNode.SelectNodes("//*[@class=\"giveaway__row-inner-wrap\"]")
                ?? new HtmlNodeCollection(null);

            // Collect first, remove afterwards: the collection must not change while it is enumerated.
            List <HtmlNode> remove_list = new List <HtmlNode>();

            foreach (var single_node in html_node_collection)
            {
                // NOTE(review): this hard-coded container looks like a block of rows that should
                // never be listed (e.g. pinned entries) - confirm against the live page.
                if (single_node.ParentNode.ParentNode.XPath == "/html[1]/body[1]/div[4]/div[1]/div[1]/div[2]/div[1]/div[1]")
                {
                    remove_list.Add(single_node);
                    continue;
                }

                HtmlNode cost_node = single_node.SelectSingleNode(".//*[@class=\"giveaway__heading__thin\"]");
                if (CheckNode(cost_node) == false)
                {
                    remove_list.Add(single_node);
                    continue;
                }

                // The first run of digits in the heading is the cost; rows without digits are kept.
                Match m = Regex.Match(cost_node.InnerText, @"\d+");

                if (m.Success)
                {
                    decimal game_cost = Convert.ToDecimal(m.Value);

                    if (game_cost < Program.settings.From || game_cost > Program.settings.To)
                    {
                        remove_list.Add(single_node);
                    }
                }
            }

            foreach (var remove in remove_list)
            {
                html_node_collection.Remove(remove);
            }

            if (html_node_collection.Count == 0)
            {
                // Re-parse before querying again. Presumably needed because removing nodes from the
                // result collection also rewires their links inside the document.
                // One reload is enough: the queries below do not modify the tree.
                document.LoadHtml(responseString);

                HtmlNode result_node = document.DocumentNode.SelectSingleNode("//*[@class=\"pagination__results\"]");
                if (CheckNode(result_node) == false || result_node.InnerText == "No results were found.")
                {
                    return(html_node_collection);
                }

                HtmlNode pages_node = document.DocumentNode.SelectSingleNode("//*[@class=\"pagination__navigation\"]");
                if (CheckNode(pages_node) == false)
                {
                    return(html_node_collection);
                }

                // Everything on this page was filtered out but more pages exist: try the next one.
                return(await GetNodes(request, page + 1));
            }

            return(html_node_collection);
        }
示例#6
0
        //private static List<BarInfo> Barinfo_list { get; set; } = new List<BarInfo>();

        /// <summary>
        /// Crawls the gdebar.ru bar catalogue page by page and collects one <see cref="BarInfo"/>
        /// per bar. For bars that publish a menu, every dish is also appended to
        /// <c>Menuitems_list</c>.
        /// </summary>
        /// <remarks>
        /// Crawling stops at the first catalogue page that shows the "nothing found" message, or
        /// that has no catalogue section at all. Bars whose menu page shows an error alert are
        /// skipped. Progress is written to the console, and the method sleeps two seconds between
        /// catalogue pages.
        /// </remarks>
        /// <param name="barinfo_list_list">Receives the collected bars; always a new list.</param>
        public static void Do(out List <BarInfo> barinfo_list_list)
        {
            barinfo_list_list = new List <BarInfo>();

            bool EndOfPages   = false;
            int  CountOfPages = 1;

            while (EndOfPages == false)
            {
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(Program.getRequest(@"http://gdebar.ru/bars?mainType[0]=3&withFilter=1&p=" + CountOfPages.ToString() + "&fromUrl=/bars"));

                // A missing catalogue section means the sentinel message can never be detected, so
                // it ends the crawl as well.
                HtmlNode catalog = doc.DocumentNode.SelectSingleNode("//section[@class = 'catalog__list']");
                if (catalog == null || catalog.InnerText.Contains("По данному запросу заведений не найдено :("))
                {
                    EndOfPages = true;
                    Console.WriteLine("stop");
                    continue;
                }
                Console.WriteLine("continue");

                // Keep only links whose text contains a line break. Building a new list avoids
                // removing from the collection while indexing forward through it, which skipped
                // the element that followed every removed one.
                List <HtmlNode>    bars          = new List <HtmlNode>();
                HtmlNodeCollection BarCollection = doc.DocumentNode.SelectNodes("//div[@class = 'place-card__specif']/a");

                if (BarCollection != null)
                {
                    foreach (HtmlNode candidate in BarCollection)
                    {
                        if (candidate.InnerText.Contains("\r\n"))
                        {
                            bars.Add(candidate);
                        }
                    }
                }

                foreach (HtmlNode item in bars)     // test output
                {
                    Console.WriteLine(item.InnerText);
                }
                Console.WriteLine(bars.Count);

                foreach (HtmlNode item in bars)
                {
                    HtmlDocument doc_2 = new HtmlDocument();
                    doc_2.LoadHtml(Program.getRequest("http://gdebar.ru" + item.Attributes["href"].Value + "/menu"));

                    // The site renders an error alert instead of the bar page; nothing to collect.
                    if (doc_2.DocumentNode.SelectNodes("//div[@class = 'text-center alert alert-danger h1']") != null)
                    {
                        continue;
                    }

                    HtmlNodeCollection Menu = doc_2.DocumentNode.SelectNodes("//div[@class = 'menu__dish d-flex align-items-center justify-content-between p-2']");

                    if (Menu == null)
                    {
                        // Bar without a published menu: general details only.
                        BarInfo info = new BarInfo();

                        info.NearSubway = ReadNearSubway(doc_2);
                        FillBarDetails(info, doc_2, item, "время работы неизвестно");
                        info.HasMenu = false;
                        barinfo_list_list.Add(info);

                        continue;
                    }

                    BarInfo barinfo = new BarInfo();

                    Console.WriteLine(item.InnerText.Trim());
                    barinfo.NearSubway = ReadNearSubway(doc_2);

                    foreach (var item_2 in Menu)
                    {
                        MenuItems menuitems = new MenuItems();

                        // The menu is nested: the dish sits three levels below its optional
                        // third-level heading, which sits two levels below the second-level
                        // heading, which sits two levels below the top-level section.
                        HtmlNode subtitle_path_2 = item_2.ParentNode.ParentNode.ParentNode; // e.g. "sweet water" - optional third level
                        HtmlNode subtitle_path   = subtitle_path_2.ParentNode.ParentNode;   // e.g. "water"
                        HtmlNode title_path      = subtitle_path.ParentNode.ParentNode;     // e.g. "bar"

                        // Replace returns a new string; the result has to be assigned.
                        menuitems.BarName = item.InnerText.Replace("\r", "").Replace("\n", "").Trim().Replace("ё", "е");

                        if (title_path.GetAttributeValue("class", "") == "parent")
                        {
                            string titleText = title_path.ChildNodes[0].InnerText;
                            menuitems.Title = StripCountSuffix(titleText);              // top-level category
                            Console.WriteLine(titleText);
                        }

                        string subtitleText  = subtitle_path.ChildNodes[0].InnerText;
                        string subtitle2Text = subtitle_path_2.ChildNodes[0].ChildNodes[0].InnerText;

                        menuitems.Subtitle   = StripCountSuffix(subtitleText);          // second-level category
                        menuitems.Subtitle_2 = StripCountSuffix(subtitle2Text);         // third-level category, if it exists

                        Console.WriteLine(subtitleText);
                        Console.WriteLine(subtitle2Text);

                        HtmlNodeCollection childrens = item_2.ChildNodes;
                        menuitems.Dish  = childrens[0].ChildNodes[0].InnerText;
                        menuitems.Price = Convert.ToInt32(childrens[1].InnerText.Split(" ")[0]);

                        Console.WriteLine("блюдо - " + childrens[0].ChildNodes[0].InnerText);
                        Console.WriteLine("цена - " + childrens[1].InnerText);

                        Menuitems_list.Add(menuitems);
                    }

                    Console.WriteLine("-");

                    FillBarDetails(barinfo, doc_2, item, "отсутсвует");
                    Console.WriteLine("dddddddddddddddd - " + barinfo.Lat);

                    barinfo.HasMenu = true;
                    barinfo_list_list.Add(barinfo);
                }

                Console.WriteLine(CountOfPages);
                CountOfPages += 1;
                Thread.Sleep(2000);     // be polite to the site between catalogue pages
            }

            // Writing the results to the database is left to the caller.
            Console.WriteLine("--------------------------------------------------------------------");
        }

        /// <summary>
        /// Returns the text before the first " (" in a menu heading, or the whole text when it has none.
        /// </summary>
        private static string StripCountSuffix(string heading)
        {
            int cut = heading.IndexOf(" (", StringComparison.Ordinal);
            return cut >= 0 ? heading.Substring(0, cut) : heading;
        }

        /// <summary>
        /// Reads the nearby subway stations from a bar page: every entry of the tooltip dropdown when
        /// present, otherwise the single inline station, otherwise the "отсутствует" placeholder.
        /// </summary>
        private static string[] ReadNearSubway(HtmlDocument barDoc)
        {
            HtmlNodeCollection stations = barDoc.DocumentNode.SelectNodes("//div[@class = 'dropdown-menu dropdown-menu--tooltip']/a");

            if (stations != null)
            {
                List <string> names = new List <string>();
                foreach (HtmlNode station in stations)
                {
                    // The station name is the text before the first double space.
                    names.Add(station.InnerText.Replace("\n", "").Trim().Split("  ")[0]);
                }
                return names.ToArray();
            }

            HtmlNode single = barDoc.DocumentNode.SelectSingleNode("//div[@class = 'metro d-flex align-items-start pl-4 mt-2 flex-wrap']");

            if (single != null)
            {
                return new string[] { single.InnerText.Replace("\r\n", "").Trim().Split("  ")[0] };
            }

            return new string[] { "отсутствует" };
        }

        /// <summary>
        /// Reads the opening hours from a bar page, or returns <paramref name="fallback"/> when the
        /// page has no opening-hours link or its text cannot be split on "работает ".
        /// </summary>
        private static string ReadWorkTime(HtmlDocument barDoc, string fallback)
        {
            HtmlNode hours = barDoc.DocumentNode.SelectSingleNode("//a[@class = 'fancybox3']");

            if (hours == null || !hours.InnerText.ToLower().Contains("работает"))
            {
                return fallback;
            }

            // The presence check above is case-insensitive but the split is not, so the marker
            // may be absent in this exact casing.
            string[] parts = hours.InnerText.Replace("\r\n", "").Split("работает ");
            return parts.Length > 1 ? parts[1].Replace("   ", "") : fallback;
        }

        /// <summary>
        /// Geocodes the bar's address through Yandex and stores latitude/longitude on
        /// <paramref name="info"/>. Leaves both untouched when the page has no address or the
        /// geocoder returns nothing usable.
        /// </summary>
        private static void FillCoordinates(BarInfo info, HtmlDocument barDoc)
        {
            HtmlNode addressNode = barDoc.DocumentNode.SelectSingleNode("//span[@class = 'font-weight-light mr-0']");
            if (addressNode == null)
            {
                return;
            }

            List <string> pos = Yandex.Yandex.GetPos(Apikey, addressNode.InnerText.Trim());
            if (pos == null || pos.Count == 0)
            {
                return;
            }

            // The first geocoder entry is used as "<longitude> <latitude>", as in the original code.
            string[] lngLat = pos[0].Split(" ");
            if (lngLat.Length < 2)
            {
                return;
            }

            Console.WriteLine("------------------------------------------------------------------------------------------------------------------------------------------------------");
            Console.WriteLine(lngLat[1]);
            Console.WriteLine("---------------------------------------------------");

            // Parse culture-independently. Swapping '.' for ',' only works on machines whose
            // current culture uses a decimal comma.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
            info.Lat = double.Parse(lngLat[1].Replace(",", "."), invariant);       // latitude
            info.Lng = double.Parse(lngLat[0].Replace(",", "."), invariant);       // longitude
        }

        /// <summary>
        /// Fills the details shared by bars with and without a menu: gallery links, opening hours,
        /// phone, coordinates and the normalised bar name.
        /// </summary>
        /// <param name="info">Bar record to populate.</param>
        /// <param name="barDoc">Parsed bar page.</param>
        /// <param name="barLink">Catalogue link of the bar; its text is the bar name.</param>
        /// <param name="unknownWorkTime">Value stored when the opening hours cannot be read.</param>
        private static void FillBarDetails(BarInfo info, HtmlDocument barDoc, HtmlNode barLink, string unknownWorkTime)
        {
            // Gallery links. A single null entry marks "no pictures", in both branches now.
            HtmlNodeCollection pictures = barDoc.DocumentNode.SelectNodes("//div[@id = 'bar-gallery-main']/div/a");
            if (pictures != null)
            {
                foreach (HtmlNode picture in pictures)
                {
                    info.PictureLinks.Add(picture.Attributes["href"].Value);
                }
            }
            else
            {
                info.PictureLinks.Add(null);
            }

            info.WorkTime = ReadWorkTime(barDoc, unknownWorkTime);

            // Prefer the tracked phone link and fall back to the plain phone line. The phone
            // keeps its default value when the page shows neither.
            HtmlNode phone = barDoc.DocumentNode.SelectSingleNode("//a[@class = 'roistat-phone']")
                             ?? barDoc.DocumentNode.SelectSingleNode("//div[@class = 'phone bar__main--info__line d-flex align-items-center justify-content-start mb-4 w-100 flex-nowrap']");
            if (phone != null)
            {
                info.Phone = phone.InnerText.Trim();
            }

            FillCoordinates(info, barDoc);

            // Strings are immutable: the result of Replace has to be assigned to take effect.
            info.BarName = barLink.InnerText.Trim().Replace("ё", "е");
        }
示例#7
0
        /// <summary>
        /// Downloads the article behind <paramref name="link"/> and converts it into a <see cref="Page"/>.
        /// </summary>
        /// <remarks>
        /// The article root is <c>&lt;article&gt;</c>, falling back to <c>div[class='article ']</c>.
        /// The text is the optional intro section followed by every non-blank paragraph of the
        /// content section. Authors are read only when the author paragraph is one of the content
        /// paragraphs (it is then removed from the text); this mirrors the original behaviour.
        /// </remarks>
        /// <param name="link">Address of the article to download.</param>
        /// <returns>The populated page (text, title, keywords, categories, authors, publish date).</returns>
        /// <exception cref="NullReferenceException">
        /// The document has no article root or no content paragraphs.
        /// </exception>
        override public Page DownloadPage(Uri link)
        {
            HtmlDocument doc = this.GetHtmlDocumentFromLink(link);
            KeyValuePair <string, List <string> > baseInfo = GetTitleAndKeywords(doc);
            List <string> authors  = new List <string>();
            HtmlNode      mainnode = doc.DocumentNode.SelectSingleNode("//article");

            if (mainnode == null)
            {
                mainnode = doc.DocumentNode.SelectSingleNode("//div[@class='article ']");
            }

            // Previously this surfaced as a message-less NullReferenceException on the next line.
            // The exception type is kept so existing handlers keep working.
            if (mainnode == null)
            {
                throw new NullReferenceException(link + " doesn't contain an article node!!!");
            }

            HtmlNode pretextNode = mainnode.SelectSingleNode("//section[@class='detailViewIntro']");

            HtmlNodeCollection paragraphs = mainnode.SelectNodes("//section[@class='detailViewContent']/p");

            if (paragraphs == null)
            {
                throw new NullReferenceException(link + " doesn't contain paragraph!!!");
            }

            HtmlNode authorNode = mainnode.SelectSingleNode("//p[@class='contentAuthor']");

            // Removing a node that is not in the collection throws ArgumentOutOfRangeException.
            // The original caught that and skipped author extraction; test membership up front
            // instead of using the exception as control flow.
            if (authorNode != null && paragraphs.Contains(authorNode))
            {
                paragraphs.Remove(authorNode);

                // NOTE(review): the leading "//" makes this query document-wide rather than
                // relative to the author paragraph - confirm that is intended.
                HtmlNodeCollection nameNodes = authorNode.SelectNodes("//span[@itemprop='name']");

                // SelectNodes returns null when nothing matches.
                if (nameNodes != null)
                {
                    foreach (HtmlNode n in nameNodes)
                    {
                        authors.Add(n.InnerText.Trim());
                    }
                }
            }

            StringBuilder textBuilder = new StringBuilder();

            if (pretextNode != null)
            {
                textBuilder.AppendLine(pretextNode.InnerText.Trim());
            }

            foreach (HtmlNode p in paragraphs)
            {
                string paragraphText = p.InnerText.Trim();
                if (paragraphText != "")
                {
                    textBuilder.AppendLine(paragraphText);
                }
            }

            Page page = new Page(link.AbsoluteUri, textBuilder.ToString(), baseInfo.Key);

            page.Keywords    = baseInfo.Value;
            page.Categories  = GetCategory(doc);
            page.Author      = authors;
            page.PublishDate = GetPublishDate(doc);

            return(page);
        }
示例#8
0
        /// <summary>
        /// Fetches <c>bdpage</c> Baidu result pages for <paramref name="key"/>, extracts the
        /// left-hand and/or right-hand ranked entries (depending on <c>leftrank</c> /
        /// <c>rightrank</c>) and appends one row per accepted entry to <c>dt1</c>.
        /// When all pages are processed it sleeps two seconds, increments <c>y</c> and posts
        /// <c>OutTextNew(key, "Baidu")</c> to the dispatcher.
        /// </summary>
        /// <remarks>
        /// Scraping is best-effort: a missing container or a malformed entry is skipped, and the
        /// left and right sections are processed independently so a failure in one does not
        /// discard the other. If Baidu answers with its "request cannot be served" page, the user
        /// is prompted and the same page is re-fetched after 30 seconds until it succeeds.
        /// </remarks>
        /// <param name="key">Search keyword. It is URL-escaped for the request and stored unescaped in the rows.</param>
        private void getBaiduCreative(string key)
        {
            // Escape the keyword so characters such as '&', '#' or '+' cannot corrupt the query string.
            string encodedKey = System.Uri.EscapeDataString(key ?? "");

            for (int pageIndex = 0; pageIndex < bdpage; pageIndex++)
            {
                string pageUrl   = "http://www.baidu.com/s?wd=" + encodedKey + "&pn=" + (pageIndex * 10).ToString() + "&ie=utf-8&usm=4";
                string bd_source = GetWebPageSource(pageUrl);

                // Blocked by Baidu: ask the user to lift the block, wait, then retry the same page.
                while (bd_source != null && bd_source.Contains("很抱歉,您的请求暂时无法响应!"))
                {
                    MessageBox.Show("对不起!在点击确定之前解除百度屏蔽!");
                    Thread.Sleep(30000);
                    bd_source = GetWebPageSource(pageUrl);
                }

                // Nothing to parse for this page.
                if (string.IsNullOrEmpty(bd_source))
                {
                    continue;
                }

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(bd_source);

                // Collect the left-hand ranking.
                if (leftrank)
                {
                    try
                    {
                        CollectBaiduLeftRank(doc, key);
                    }
                    catch (System.Exception ex)
                    {
                        // Deliberately best-effort: keep going, but leave a trace instead of hiding the failure.
                        System.Diagnostics.Debug.WriteLine("getBaiduCreative: left rank failed for " + pageUrl + ": " + ex);
                    }
                }

                // Collect the right-hand ranking.
                if (rightrank)
                {
                    try
                    {
                        CollectBaiduRightRank(doc, key);
                    }
                    catch (System.Exception ex)
                    {
                        // Deliberately best-effort: keep going, but leave a trace instead of hiding the failure.
                        System.Diagnostics.Debug.WriteLine("getBaiduCreative: right rank failed for " + pageUrl + ": " + ex);
                    }
                }
            }

            Thread.Sleep(2000);

            y++;
            OutDelegateNew outdelegate = new OutDelegateNew(OutTextNew);

            this.Dispatcher.BeginInvoke(outdelegate, new object[] { key, "Baidu" });
        }

        /// <summary>
        /// Parses an HTML fragment wrapped in a minimal page shell so that its top-level
        /// elements can be addressed with absolute XPath under <c>/html/body</c>.
        /// </summary>
        private static HtmlDocument LoadBaiduFragment(string innerHtml)
        {
            HtmlDocument fragment = new HtmlDocument();
            fragment.LoadHtml("<!doctype html><html><head><title>baidu</title></head><body>" + innerHtml + "</body></html>");
            return fragment;
        }

        /// <summary>
        /// Returns true when an entry should be recorded: either no rule is set, or the rule
        /// matches the field selected by <c>mode</c> ("NameMode" checks <paramref name="hospital"/>,
        /// "DomainMode" checks <paramref name="biddomain"/>).
        /// </summary>
        private bool MatchesBaiduFilter(string hospital, string biddomain)
        {
            if (rule.Trim() == "")
            {
                return true;
            }

            return (mode == "NameMode" && MatchRule(hospital))
                || (mode == "DomainMode" && MatchRule(biddomain));
        }

        /// <summary>
        /// Adds one row per accepted entry found under the <c>content_left</c> element of
        /// <paramref name="doc"/>. Only entries made of exactly three or four top-level divs are
        /// handled; the four-div form carries a second description line.
        /// </summary>
        private void CollectBaiduLeftRank(HtmlDocument doc, string key)
        {
            HtmlNode leftContainer = doc.GetElementbyId("content_left");
            if (leftContainer == null)
            {
                return;
            }

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection leftNodes = LoadBaiduFragment(leftContainer.InnerHtml).DocumentNode.SelectNodes("/html/body/div");
            if (leftNodes == null)
            {
                return;
            }

            int bdbig = 1;
            foreach (HtmlNode node in leftNodes)
            {
                if (isContains(node.OuterHtml))
                {
                    continue;
                }

                HtmlDocument       entry = LoadBaiduFragment(node.InnerHtml);
                HtmlNodeCollection parts = entry.DocumentNode.SelectNodes("/html/body/div");
                if (parts == null || (parts.Count != 3 && parts.Count != 4))
                {
                    continue;
                }

                // The domain span lives in the last div (third or fourth, depending on the layout).
                HtmlNode domainNode = entry.DocumentNode.SelectSingleNode("/html/body/div[" + parts.Count + "]/span");
                if (domainNode == null)
                {
                    continue;
                }

                string title     = parts[0].InnerText;
                string desc1     = parts[1].InnerText.Replace("&nbsp;", " ");
                string biddomain = domainNode.InnerText;
                string hospital  = Common.MatchURL(node.InnerHtml, "data-renzheng=\"{title:'", ":'");

                if (!MatchesBaiduFilter(hospital, biddomain))
                {
                    continue;
                }

                // The three-div layout has no second description; it is stored as an empty string.
                object desc2 = "";
                if (parts.Count == 4)
                {
                    desc2 = format(parts[2].InnerText.Replace("&nbsp;", " "));
                }

                dt1.Rows.Add(new object[10] {
                    id++, key, rule, "Baidu", "左" + (bdbig++).ToString(), format(title), format(desc1), desc2, biddomain, hospital
                });
            }
        }

        /// <summary>
        /// Adds one row per accepted entry found under the <c>ec_im_container</c> element of
        /// <paramref name="doc"/>. Each entry is expected to expose a title anchor, a second
        /// anchor holding the description, and the domain in that anchor's second font element.
        /// </summary>
        private void CollectBaiduRightRank(HtmlDocument doc, string key)
        {
            HtmlNode rightContainer = doc.GetElementbyId("ec_im_container");
            if (rightContainer == null)
            {
                return;
            }

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection rightNodes = LoadBaiduFragment(rightContainer.InnerHtml).DocumentNode.SelectNodes("/html/body/div");
            if (rightNodes == null)
            {
                return;
            }

            // The first div is dropped before iterating.
            // NOTE(review): presumably a header rather than an entry - confirm against the live markup.
            rightNodes.Remove(0);

            int bdbig = 1;
            foreach (HtmlNode node in rightNodes)
            {
                HtmlNode root       = LoadBaiduFragment(node.InnerHtml).DocumentNode;
                HtmlNode titleNode  = root.SelectSingleNode("/html/body/a");
                HtmlNode descNode   = root.SelectSingleNode("/html/body/a[2]");
                HtmlNode domainNode = root.SelectSingleNode("/html/body/a[2]/font[2]");
                if (titleNode == null || descNode == null || domainNode == null)
                {
                    continue;
                }

                string title     = titleNode.InnerText;
                string desc1     = descNode.InnerText;
                string biddomain = domainNode.InnerText;
                string hospital  = Common.MatchURL(node.InnerHtml, "data-renzheng=\"{title:'", ":'");

                if (!MatchesBaiduFilter(hospital, biddomain))
                {
                    continue;
                }

                // The description anchor also contains the domain text; strip it.
                // string.Replace throws on an empty search string, so guard that case.
                string description = biddomain.Length == 0 ? desc1 : desc1.Replace(biddomain, "");

                dt1.Rows.Add(new object[10] {
                    id++, key, rule, "Baidu", "右" + (bdbig++).ToString(), format(title), format(description), "", biddomain, hospital
                });
            }
        }