コード例 #1
0
        /// <summary>
        /// Loads the page at <paramref name="url"/>, visits its <c>&lt;a href&gt;</c> links in
        /// shuffled order, records every link that passes the filters and recurses into
        /// each recorded link until <c>MaxDepth</c> is reached.
        /// </summary>
        /// <param name="url">Address of the page whose links are harvested.</param>
        /// <param name="deep">Depth of this call; recursion continues while it differs from <c>MaxDepth</c>.</param>
        /// <remarks>
        /// Any exception raised while processing the page is logged and swallowed, so a
        /// failure abandons the remaining links of this page only, not the caller's loop.
        /// </remarks>
        private void RetrievePages(string url, int deep)
        {
            try
            {
                HtmlWeb hw = new HtmlWeb();
                HtmlAgilityPack.HtmlDocument doc = hw.Load(url);

                // SelectNodes yields null (not an empty collection) when nothing matches;
                // a page without links is a normal outcome, not an error.
                HtmlNodeCollection htmlNodes = doc.DocumentNode.SelectNodes("//a[@href]");
                if (htmlNodes == null)
                {
                    return;
                }

                htmlNodes.ProperQuickShuffle(random);

                // Visit every node, including the last one.
                for (int i = 0; i < htmlNodes.Count; i++)
                {
                    // Stop working on this page as soon as cancellation is requested.
                    if (bw.CancellationPending)
                    {
                        break;
                    }

                    // NOTE(review): presumably a pause gate (wait handle) - confirm against its declaration.
                    Mre.WaitOne();

                    string href = htmlNodes[i]?.Attributes["href"]?.Value;
                    if (href == null)
                    {
                        continue;
                    }

                    // Only absolute http(s) links longer than 12 characters are considered.
                    if (href.Length <= 12 ||
                        !href.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    if (EndsWithForbiddenExtension(href) || ContainsForbiden(href))
                    {
                        continue;
                    }

                    // Skip links whose trailing (up to 20) characters match an already recorded URL.
                    string tail = href.Substring(href.Length - Math.Min(href.Length, 20));
                    if (UrlList.EndWithPart(tail))
                    {
                        continue;
                    }

                    UrlList.Add(href);
                    UrlListToProcess.Add(href);
                    NoOfLinks++;
                    bw.ReportProgress(NoOfLinks);
                    Log.Information(NoOfLinks + ": " + href);

                    // Randomise the order in which the two follow-up actions run.
                    if (random.Next(i, htmlNodes.Count) % 2 == 0)
                    {
                        LoadPage(href);
                        RetrieveNextPages(href, deep);
                    }
                    else
                    {
                        RetrieveNextPages(href, deep);
                        LoadPage(href);
                    }

                    if (MaxDepth != deep)
                    {
                        RetrievePages(href, deep + 1);
                    }
                }
            }
            catch (Exception ex)
            {
                Log.Error($"Error: {ex}", ex);
            }
        }