/// <summary>
        /// Downloads the page at <paramref name="url"/> over HTTP(S), parses it with HtmlAgilityPack and,
        /// when a &lt;body&gt; element is found, passes it to ReadParagraphs, ReadHyperLinkReferences
        /// and MakeTermList.
        /// </summary>
        /// <remarks>
        /// Fetching is best-effort: an unusable URL, a failed request or a broken response stream is
        /// reported on the console and leaves the parser with an empty word list. Errors raised while
        /// extracting content from the body are likewise logged and swallowed.
        /// </remarks>
        /// <param name="url">Address of the page to parse; its Depth is forwarded to the link extraction.</param>
        public HTMLParser(UrlStamp url)
        {
            body = null;
            _wordsOnPage = new List<String>();

            HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.OptionFixNestedTags = true;

            // Only absolute http/https addresses can be served by HttpWebRequest. Anything else
            // (malformed or relative urls, mailto:, ftp:, file:, ...) previously escaped the
            // WebException handler as UriFormatException / NotSupportedException / InvalidCastException.
            Uri pageUri;
            bool isHttpUrl = Uri.TryCreate(url.Url, UriKind.Absolute, out pageUri)
                             && (pageUri.Scheme == Uri.UriSchemeHttp || pageUri.Scheme == Uri.UriSchemeHttps);

            if (!isHttpUrl)
            {
                Console.WriteLine("HTMLParser.cs cannot request '" + url.Url + "': not an absolute http(s) url");
            }
            else
            {
                // Manual request instead of HtmlWeb.Load, see
                //http://www.webr2.com/htmlagilitypack-webget-load-gives-error-object-reference-not-set-to-an-instance-of-an-object/
                try
                {
                    var request = (HttpWebRequest)WebRequest.Create(pageUri);
                    request.Method = "GET";
                    using (var response = (HttpWebResponse)request.GetResponse())
                    using (var stream = response.GetResponseStream())
                    {
                        // NOTE(review): the encoding is hard-coded; the page's declared charset
                        // (response header / meta tag) is not consulted.
                        htmlDoc.Load(stream, Encoding.GetEncoding("iso-8859-9"));
                    }
                }
                catch (WebException ex)
                {
                    Console.WriteLine(ex.Message);
                }
                catch (System.IO.IOException ex)
                {
                    // The connection can fail while the response body is still being read.
                    Console.WriteLine(ex.Message);
                }
            }

            // htmlDoc.ParseErrors is intentionally not inspected: malformed markup is tolerated.

            //We're only interested in the body tag for now. Later comes keywords
            if (htmlDoc.DocumentNode != null)
            {
                body = htmlDoc.DocumentNode.SelectSingleNode("//body");
            }

            if (body == null)
            {
                return;
            }

            try
            {
                _wordsOnPage = ReadParagraphs(body);
                _aHref = ReadHyperLinkReferences(body, url.Depth);
                MakeTermList(_wordsOnPage);
            }
            catch (Exception ex)
            {
                // Best-effort extraction: log the full exception and keep whatever was collected.
                Console.WriteLine("HTMLParser.cs " + ex);
            }
        }
Example #2
0
        /// <summary>
        /// Creates the page record for <paramref name="url"/>: runs an HTMLParser on it, builds the
        /// Shingles from that parser and stores the visit time, depth and identifier.
        /// </summary>
        /// <param name="url">Address of the page; its Depth is copied into the page.</param>
        /// <param name="ID">Identifier stored for this page.</param>
        public Page(UrlStamp url, int ID)
        {
            this._url = url;

            // All parsing work happens inside the HTMLParser constructor.
            this.parser = new HTMLParser(this._url);
            Console.WriteLine("\tparser finished!");

            // 4, 84 and 12 are handed to Shingles unchanged; their meaning is defined by that class.
            this.shingles = new Shingles(this.parser, 4, 84, 12);
            Console.WriteLine("\tshingles finished!");

            // Timestamp is taken only after parsing and shingling have completed.
            this._dateVisited = DateTime.Now;
            this._depth = url.Depth;
            this._ID = ID;
        }
Example #3
0
        /// <summary>
        /// Initializes a page from <paramref name="url"/> by parsing it, computing its shingles and
        /// recording when it was visited, at which depth, and under which identifier.
        /// </summary>
        /// <param name="url">Address of the page; also supplies the depth value.</param>
        /// <param name="ID">Identifier assigned to this page.</param>
        public Page(UrlStamp url, int ID)
        {
            this._url = url;

            // Step 1: parse the page (done by the HTMLParser constructor itself).
            this.parser = new HTMLParser(this._url);
            Console.WriteLine("\tparser finished!");

            // Step 2: build shingles from the parser; the numeric arguments are passed through as-is.
            this.shingles = new Shingles(this.parser, 4, 84, 12);
            Console.WriteLine("\tshingles finished!");

            // Step 3: bookkeeping, stamped after the work above has finished.
            this._dateVisited = DateTime.Now;
            this._depth = url.Depth;
            this._ID = ID;
        }
        /// <summary>
        /// Downloads the page at <paramref name="url"/> over HTTP(S), parses it with HtmlAgilityPack and,
        /// when a &lt;body&gt; element is found, passes it to ReadParagraphs, ReadHyperLinkReferences
        /// and MakeTermList.
        /// </summary>
        /// <remarks>
        /// Fetching is best-effort: an unusable URL, a failed request or a broken response stream is
        /// reported on the console and leaves the parser with an empty word list. Errors raised while
        /// extracting content from the body are likewise logged and swallowed.
        /// </remarks>
        /// <param name="url">Address of the page to parse; its Depth is forwarded to the link extraction.</param>
        public HTMLParser(UrlStamp url)
        {
            body         = null;
            _wordsOnPage = new List <String>();

            HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.OptionFixNestedTags = true;

            // Only absolute http/https addresses can be served by HttpWebRequest. Anything else
            // (malformed or relative urls, mailto:, ftp:, file:, ...) previously escaped the
            // WebException handler as UriFormatException / NotSupportedException / InvalidCastException.
            Uri  pageUri;
            bool isHttpUrl = Uri.TryCreate(url.Url, UriKind.Absolute, out pageUri) &&
                             (pageUri.Scheme == Uri.UriSchemeHttp || pageUri.Scheme == Uri.UriSchemeHttps);

            if (!isHttpUrl)
            {
                Console.WriteLine("HTMLParser.cs cannot request '" + url.Url + "': not an absolute http(s) url");
            }
            else
            {
                // Manual request instead of HtmlWeb.Load, see
                //http://www.webr2.com/htmlagilitypack-webget-load-gives-error-object-reference-not-set-to-an-instance-of-an-object/
                try
                {
                    var request = (HttpWebRequest)WebRequest.Create(pageUri);
                    request.Method = "GET";
                    using (var response = (HttpWebResponse)request.GetResponse())
                    using (var stream = response.GetResponseStream())
                    {
                        // NOTE(review): the encoding is hard-coded; the page's declared charset
                        // (response header / meta tag) is not consulted.
                        htmlDoc.Load(stream, Encoding.GetEncoding("iso-8859-9"));
                    }
                }
                catch (WebException ex)
                {
                    Console.WriteLine(ex.Message);
                }
                catch (System.IO.IOException ex)
                {
                    // The connection can fail while the response body is still being read.
                    Console.WriteLine(ex.Message);
                }
            }

            // htmlDoc.ParseErrors is intentionally not inspected: malformed markup is tolerated.

            //We're only interested in the body tag for now. Later comes keywords
            if (htmlDoc.DocumentNode != null)
            {
                body = htmlDoc.DocumentNode.SelectSingleNode("//body");
            }

            if (body == null)
            {
                return;
            }

            try
            {
                _wordsOnPage = ReadParagraphs(body);
                _aHref       = ReadHyperLinkReferences(body, url.Depth);
                MakeTermList(_wordsOnPage);
            }
            catch (Exception ex)
            {
                // Best-effort extraction: log the full exception and keep whatever was collected.
                Console.WriteLine("HTMLParser.cs " + ex);
            }
        }