Ejemplo n.º 1
0
        /// <summary>
        /// Fetches the page at <paramref name="link"/>, records its title, indexes the title's
        /// words when the page carries a <c>pubdate</c> meta tag, and queues the links found on the page.
        /// </summary>
        /// <param name="link">URL of the page to download and parse.</param>
        /// <remarks>
        /// A <see cref="WebException"/> thrown inside the method is caught and reported through
        /// <c>sm.AddErrorToPerformance</c>; any other exception type propagates to the caller.
        /// </remarks>
        public void ParseHTML(string link)
        {
            var web = new HtmlWeb()
            {
                PreRequest = request =>
                {
                    // Ask the server for compressed responses and let the request decompress them.
                    request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
                    return true;
                }
            };

            try
            {
                // The downloaded text itself is not used. The call is kept because it can raise a
                // WebException for a failed request, which the catch block below reports.
                // NOTE(review): presumably this is a reachability probe - confirm whether the
                // second fetch through HtmlWeb could replace it.
                using (var client = new WebClient())
                {
                    client.DownloadString(link);
                }

                var htmlDoc   = web.Load(link);
                var titleNode = htmlDoc.DocumentNode.Descendants("title").FirstOrDefault();
                string title  = titleNode != null ? titleNode.InnerText : "No Title";

                if (title.Equals("Error"))
                {
                    sm.AddErrorToPerformance(link, title);
                    return;
                }

                // Title words are only indexed when the page exposes a pubdate meta tag.
                HtmlNode mdnode = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='pubdate']");
                if (mdnode != null)
                {
                    // A pubdate tag without a content attribute is treated as "no date"
                    // instead of dereferencing a null attribute.
                    HtmlAttribute desc = mdnode.Attributes["content"];
                    string date = desc != null ? (desc.Value ?? string.Empty) : string.Empty;

                    // Values of 8 characters or fewer are not parsed as dates.
                    bool hasDate = date.Length > 8;
                    DateTime published = hasDate ? this.StringToDateTime(date) : default(DateTime);

                    // Drop the empty tokens produced by repeated, leading or trailing spaces so
                    // that no empty keyword is written to storage.
                    string[] wordsInTitle = title.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    foreach (string word in wordsInTitle)
                    {
                        // Keywords are stored lower-cased; the invariant culture keeps the keys
                        // independent of the machine's locale.
                        string toAdd = word.ToLowerInvariant();
                        if (hasDate)
                        {
                            sm.AddLinkToTableStorage(link, toAdd, title, published);
                        }
                        else
                        {
                            sm.AddLinkToTableStorage(link, toAdd, title);
                        }
                    }
                }

                sm.AddLinkToPerformance(link, title);

                // Collect the links on this page and enqueue the ones selected by ExamineLinksOnPage.
                List<string> linksOnPage   = this.ExtractAllAHrefTags(htmlDoc);
                List<string> addToURLQueue = this.ExamineLinksOnPage(linksOnPage);
                this.AddToURLQueue(addToURLQueue);
            }
            catch (WebException e)
            {
                // NOTE(review): the other AddErrorToPerformance call passes the link as the first
                // argument; here the exception text is passed instead - confirm this is intended.
                sm.AddErrorToPerformance(e.ToString(), "Catched Error");
            }
        }