/// <summary>
/// Downloads the page addressed by <paramref name="url"/>, parses it with HtmlAgilityPack and,
/// when a body element is found, extracts the words and hyperlinks of the page and passes the
/// words on to <c>MakeTermList</c>.
/// </summary>
/// <param name="url">Stamp supplying the address (<c>Url</c>) and the crawl depth (<c>Depth</c>).</param>
/// <remarks>
/// The constructor is best-effort: download problems (<see cref="WebException"/>), malformed
/// addresses (<see cref="UriFormatException"/>) and unsupported URI schemes
/// (<see cref="NotSupportedException"/>) are written to the console and leave the parser with
/// an empty word list instead of aborting the caller.
/// </remarks>
public HTMLParser(UrlStamp url)
{
    body = null;
    _wordsOnPage = new List<String>();

    HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
    // There are various options, set as needed.
    htmlDoc.OptionFixNestedTags = true;

    // The character set could be read from the charset meta tag in the header; for now the
    // encoding below is hard-coded.
    // NOTE(review): pages served in a different charset will be decoded incorrectly.

    // Online mode: the page is fetched with a plain HttpWebRequest instead of HtmlWeb.Load.
    // See http://www.webr2.com/htmlagilitypack-webget-load-gives-error-object-reference-not-set-to-an-instance-of-an-object/
    try
    {
        var temp = new Uri(url.Url);
        var request = (HttpWebRequest)WebRequest.Create(temp);
        request.Method = "GET";
        using (var response = (HttpWebResponse)request.GetResponse())
        using (var stream = response.GetResponseStream())
        {
            htmlDoc.Load(stream, Encoding.GetEncoding("iso-8859-9"));
        }
    }
    catch (WebException ex)
    {
        Console.WriteLine(ex.Message);
    }
    catch (UriFormatException ex)
    {
        // A malformed address is treated like any other failed download.
        Console.WriteLine(ex.Message);
    }
    catch (NotSupportedException ex)
    {
        // WebRequest.Create rejects URI schemes that have no registered handler.
        Console.WriteLine(ex.Message);
    }

    // Offline alternative: htmlDoc.Load(url.Url, Encoding.ASCII, true);

    // htmlDoc.ParseErrors holds any errors from the Load call; they are currently ignored.

    // We're only interested in the body tag for now; keywords come later.
    if (htmlDoc.DocumentNode != null)
    {
        body = htmlDoc.DocumentNode.SelectSingleNode("//body");
    }

    if (body != null)
    {
        try
        {
            _wordsOnPage = ReadParagraphs(body);
            _aHref = ReadHyperLinkReferences(body, url.Depth);
            MakeTermList(_wordsOnPage);
        }
        catch (Exception ex)
        {
            Console.WriteLine("HTMLParser.cs " + ex);
        }
    }
}
/// <summary>
/// Creates the page record for <paramref name="url"/>: runs the HTML parser on it, builds the
/// <c>Shingles</c> object from the parser and records visit time, depth and identifier.
/// </summary>
/// <param name="url">Stamp of the page; its <c>Depth</c> is copied into the page.</param>
/// <param name="ID">Identifier stored for this page.</param>
public Page(UrlStamp url, int ID)
{
    _url = url;

    // Parsing happens inside the HTMLParser constructor.
    parser = new HTMLParser(_url);
    Console.WriteLine("\tparser finished!");

    // NOTE(review): the meaning of 4, 84 and 12 is defined by the Shingles constructor,
    // which is not visible here.
    shingles = new Shingles(parser, 4, 84, 12);
    Console.WriteLine("\tshingles finished!");

    // The timestamp is taken only after parsing and shingling have completed.
    DateTime visitedAt = DateTime.Now;
    _dateVisited = visitedAt;

    _depth = url.Depth;
    _ID = ID;
}
/// <summary>
/// Initializes a page from its URL stamp and identifier. Construction parses the page,
/// creates its <c>Shingles</c> and then stores the visit time, the depth and the ID.
/// </summary>
/// <param name="url">URL stamp of the page; also the source of the stored depth.</param>
/// <param name="ID">Identifier assigned to the page.</param>
public Page(UrlStamp url, int ID)
{
    // Step 1: keep the stamp and parse the page it refers to.
    _url = url;
    parser = new HTMLParser(_url);
    Console.WriteLine("\tparser finished!");

    // Step 2: build the shingles from the parser.
    // NOTE(review): confirm what the arguments 4, 84 and 12 mean against the Shingles constructor.
    shingles = new Shingles(
        parser,
        4,
        84,
        12);
    Console.WriteLine("\tshingles finished!");

    // Step 3: bookkeeping, recorded once the work above is done.
    _dateVisited = DateTime.Now;
    var crawlDepth = url.Depth;
    _depth = crawlDepth;
    _ID = ID;
}
/// <summary>
/// Fetches the page behind <paramref name="url"/> over HTTP, loads it into an HtmlAgilityPack
/// document and, if the document has a body element, reads its paragraphs and hyperlinks and
/// hands the resulting words to <c>MakeTermList</c>.
/// </summary>
/// <param name="url">Stamp carrying the page address (<c>Url</c>) and its crawl depth (<c>Depth</c>).</param>
/// <remarks>
/// Failures while fetching are logged to the console and do not propagate: this covers
/// <see cref="WebException"/>, <see cref="UriFormatException"/> (malformed address) and
/// <see cref="NotSupportedException"/> (URI scheme without a registered handler). In those
/// cases the word list stays empty.
/// </remarks>
public HTMLParser(UrlStamp url)
{
    body = null;
    _wordsOnPage = new List<String>();

    HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
    // There are various options, set as needed.
    htmlDoc.OptionFixNestedTags = true;

    // The character set could be taken from the charset meta tag in the header; the encoding
    // used below is hard-coded instead.
    // NOTE(review): content in another charset will not decode correctly.

    // Online mode. HtmlWeb.Load is not used; the request is made by hand, see
    // http://www.webr2.com/htmlagilitypack-webget-load-gives-error-object-reference-not-set-to-an-instance-of-an-object/
    try
    {
        var temp = new Uri(url.Url);
        var request = (HttpWebRequest)WebRequest.Create(temp);
        request.Method = "GET";
        using (var response = (HttpWebResponse)request.GetResponse())
        {
            using (var stream = response.GetResponseStream())
            {
                htmlDoc.Load(stream, Encoding.GetEncoding("iso-8859-9"));
            }
        }
    }
    catch (WebException ex)
    {
        Console.WriteLine(ex.Message);
    }
    catch (UriFormatException ex)
    {
        // new Uri(...) rejects malformed addresses; handle it like a failed download.
        Console.WriteLine(ex.Message);
    }
    catch (NotSupportedException ex)
    {
        // WebRequest.Create throws this for schemes it has no handler for.
        Console.WriteLine(ex.Message);
    }

    // Offline alternative: htmlDoc.Load(url.Url, Encoding.ASCII, true);

    // Any errors from Load are available in htmlDoc.ParseErrors; they are not inspected yet.

    // We're only interested in the body tag for now; keywords come later.
    if (htmlDoc.DocumentNode != null)
    {
        body = htmlDoc.DocumentNode.SelectSingleNode("//body");
    }

    if (body == null)
    {
        // Nothing to extract: no body element was found (or the download failed).
        return;
    }

    try
    {
        _wordsOnPage = ReadParagraphs(body);
        _aHref = ReadHyperLinkReferences(body, url.Depth);
        MakeTermList(_wordsOnPage);
    }
    catch (Exception ex)
    {
        Console.WriteLine("HTMLParser.cs " + ex);
    }
}