ParseHTML, HeatonResearch.Spider.HTML C# (CSharp)のコード例

コード例 #1

0

ファイルを表示

ファイル: ExtractLinks.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// Downloads the page at the specified URL and reports every hyperlink
        /// found on it.  For each closing anchor tag, ProcessOption is called
        /// with the text read since the opening anchor and the absolute form
        /// of the link target.
        /// </summary>
        /// <param name="url">The URL to process; relative links are resolved against it.</param>
        /// <param name="optionList">Which option list to process (not read by this method).</param>
        public void Process(Uri url, int optionList)
        {
            String value = "";
            StringBuilder buffer = new StringBuilder();
            WebRequest http = HttpWebRequest.Create(url);

            // Release the connection and stream even if parsing throws.
            using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
            using (Stream istream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(istream);

                int ch;
                while ((ch = parse.Read()) != -1)
                {
                    // A zero from Read() marks a tag, available through parse.Tag.
                    if (ch == 0)
                    {
                        HTMLTag tag = parse.Tag;
                        if (String.Compare(tag.Name, "a", true) == 0)
                        {
                            // An anchor may have no href (e.g. a named anchor);
                            // presumably the indexer yields null then, so guard
                            // instead of dereferencing it.
                            String href = tag["href"];
                            if (href == null)
                            {
                                value = null;
                            }
                            else
                            {
                                value = new Uri(url, href).ToString();
                            }
                            buffer.Length = 0;
                        }
                        else if (String.Compare(tag.Name, "/a", true) == 0)
                        {
                            // Skip anchors that had no link target.
                            if (value != null)
                            {
                                ProcessOption(buffer.ToString(), value);
                            }
                        }
                    }
                    else
                    {
                        buffer.Append((char)ch);
                    }
                }
            }
        }

コード例 #2

0

ファイルを表示

ファイル: ExtractImages.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// Process the specified URL and download the images it references.
        /// Every img tag with a src attribute is resolved against the page
        /// URL and saved into the target directory.
        /// </summary>
        /// <param name="url">The URL to process.</param>
        /// <param name="saveTo">A directory to save the images to.</param>
        public void Process(Uri url, String saveTo)
        {
            WebRequest http = HttpWebRequest.Create(url);

            // Release the page connection even if parsing or a download throws.
            using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
            using (Stream istream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(istream);

                int ch;
                while ((ch = parse.Read()) != -1)
                {
                    // A zero from Read() marks a tag, available through parse.Tag.
                    if (ch == 0)
                    {
                        HTMLTag tag = parse.Tag;
                        if (String.Compare(tag.Name, "img", true) == 0)
                        {
                            String src = tag["src"];
                            // An img tag without a src has nothing to download.
                            if (src == null)
                            {
                                continue;
                            }

                            Uri u = new Uri(url, src);
                            String filename = ExtractFile(u);
                            String saveFile = Path.Combine(saveTo, filename);
                            WebRequest http2 = HttpWebRequest.Create(u);

                            // The using block closes the image response even
                            // when DownloadBinaryFile fails part-way.
                            using (HttpWebResponse response2 = (HttpWebResponse)http2.GetResponse())
                            {
                                this.DownloadBinaryFile(response2, saveFile);
                            }
                        }
                    }
                }
            }
        }

コード例 #3

0

ファイルを表示

ファイル: FindRSS.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// This method looks for a link tag at the specified URL.  If a link
        /// tag is found whose type attribute mentions RSS, the feed it points
        /// to is handed to ProcessRSS; otherwise a message is written to the
        /// console.  When several link tags match, the last one is used.
        /// </summary>
        /// <param name="url">The URL of the web site.</param>
        public void Process(Uri url)
        {
            String href = null;
            WebRequest http = HttpWebRequest.Create(url);
            http.Timeout = 30000;

            // Release the connection before the feed itself is processed.
            using (WebResponse response = http.GetResponse())
            using (Stream stream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(stream);

                int ch;
                do
                {
                    ch = parse.Read();
                    // A zero from Read() marks a tag, available through parse.Tag.
                    if (ch == 0)
                    {
                        HTMLTag tag = parse.Tag;
                        if (String.Compare(tag.Name, "link", true) == 0)
                        {
                            String type = tag["type"];
                            // MIME types are case-insensitive, so match "rss"
                            // ordinally and without regard to case.
                            if (type != null
                                && type.IndexOf("rss", StringComparison.OrdinalIgnoreCase) != -1)
                            {
                                href = tag["href"];
                            }
                        }
                    }
                } while (ch != -1);
            }

            if (href == null)
            {
                Console.WriteLine("No RSS link found.");
            }
            else
            {
                // Feed links are often relative ("/feed.xml"); resolve them
                // against the page URL.  An absolute href is left unchanged.
                ProcessRSS(new Uri(url, href));
            }
        }

コード例 #4

0

ファイルを表示

ファイル: Analizador.cs プロジェクト: johandebruin/Bot-Library

 /// <summary>
 /// Requests the given address over HTTP and sets up an HTML parser on the
 /// response body.  The response, its stream and the parser are stored in
 /// the respuesta, istream and html members.
 /// </summary>
 /// <param name="url">Address of the page to analyze.</param>
 public Analizador(string url)
 {
     HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

     // Identify as a desktop Firefox browser and allow a long wait (in ms).
     // A proxy or AllowAutoRedirect could also be configured here if needed.
     request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3";
     request.Timeout = 200000;

     respuesta = (HttpWebResponse)request.GetResponse();
     istream = respuesta.GetResponseStream();
     html = new ParseHTML(istream);
 }

コード例 #5

0

ファイルを表示

ファイル: MiembrosEstaticos.cs プロジェクト: johandebruin/Bot-Library

 /// <summary>
 /// Advances the parser until it reaches a tag with the given name whose
 /// named attribute has exactly the given value.
 /// </summary>
 /// <param name="analizador">The HTML parser to advance.</param>
 /// <param name="etiqueta">The tag name to stop at.</param>
 /// <param name="nombreAtributo">The attribute name, such as "src".</param>
 /// <param name="atributo">The attribute value to match, such as "/imagenes/...".</param>
 /// <returns>True if a matching tag was reached; false if the input ended first.</returns>
 public static bool AvanzarA(ParseHTML analizador, String etiqueta,String nombreAtributo, String atributo)
 {
     for (int c = analizador.Read(); c != -1; c = analizador.Read())
     {
         // Only a zero from Read() marks a tag; skip ordinary characters.
         if (c != 0)
         {
             continue;
         }

         bool nameMatches = analizador.Tag.Name == etiqueta;
         if (nameMatches && analizador.Tag[nombreAtributo] == atributo)
         {
             return true;
         }
     }

     return false;
 }

コード例 #6

0

ファイルを表示

ファイル: ParseTable.cs プロジェクト: Clever-Boy/jeffheaton-book-code

 /// <summary>
 /// Reads forward through the parser until the requested tag has been
 /// seen the requested number of times.  Tag names are compared without
 /// regard to case.
 /// </summary>
 /// <param name="parse">The HTML parse object to read from.</param>
 /// <param name="tag">The name of the HTML tag to look for.</param>
 /// <param name="count">How many occurrences of the tag to find.</param>
 /// <returns>True if found, false if the input ended first.</returns>
 private bool Advance(ParseHTML parse, String tag, int count)
 {
     int remaining = count;

     for (int c = parse.Read(); c != -1; c = parse.Read())
     {
         // Only a zero from Read() marks a tag; skip ordinary characters.
         if (c != 0)
         {
             continue;
         }

         if (String.Compare(parse.Tag.Name, tag, true) != 0)
         {
             continue;
         }

         remaining--;
         if (remaining <= 0)
         {
             return true;
         }
     }

     return false;
 }

コード例 #7

0

ファイルを表示

ファイル: ParseList.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// Called to extract a list from the specified URL.  The text of each
        /// list item found is passed to ProcessItem.
        /// </summary>
        /// <param name="url">The URL to extract the list from.</param>
        /// <param name="listType">What type of list, given as the name of its beginning tag (e.g. "ul").</param>
        /// <param name="optionList">Which list to search, zero for first.</param>
        /// <exception cref="IOException">Thrown if an IO exception occurs.</exception>
        public void Process(Uri url, String listType, int optionList)
        {
            // The parser names a closing tag with a leading slash (compare
            // "/li" below), so the end of the list is "/" + listType.  The
            // previous listType + "/" never matched, so parsing ran past the
            // end of the list.
            String listTypeEnd = "/" + listType;
            StringBuilder buffer = new StringBuilder();
            bool capture = false;
            WebRequest http = HttpWebRequest.Create(url);

            // Release the connection and stream even if parsing throws.
            using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
            using (Stream istream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(istream);

                Advance(parse, listType, optionList);

                int ch;
                while ((ch = parse.Read()) != -1)
                {
                    // A zero from Read() marks a tag, available through parse.Tag.
                    if (ch == 0)
                    {
                        HTMLTag tag = parse.Tag;
                        if (String.Compare(tag.Name, "li", true) == 0)
                        {
                            // A new item started before the previous one was
                            // closed: flush what was collected so far.
                            if (buffer.Length > 0)
                            {
                                ProcessItem(buffer.ToString());
                            }
                            buffer.Length = 0;
                            capture = true;
                        }
                        else if (String.Compare(tag.Name, "/li", true) == 0)
                        {
                            Console.WriteLine(buffer.ToString());
                            ProcessItem(buffer.ToString());
                            buffer.Length = 0;
                            capture = false;
                        }
                        else if (String.Compare(tag.Name, listTypeEnd, true) == 0)
                        {
                            break;
                        }
                    }
                    else
                    {
                        // Only text inside a list item is collected.
                        if (capture)
                        {
                            buffer.Append((char)ch);
                        }
                    }
                }
            }
        }

コード例 #8

0

ファイルを表示

ファイル: Peliculon.cs プロジェクト: johandebruin/Johan-Bot

 /// <summary>
 /// Returns the text found between the next "a" tag and the "/a" tag that
 /// follows it.  Text read before the opening tag is ignored.
 /// </summary>
 /// <param name="analizador">The HTML parser to read from.</param>
 /// <returns>The collected text, or a fallback message if the input ends first.</returns>
 static string Categoria(ParseHTML analizador)
 {
     StringBuilder text = new StringBuilder();
     bool insideLink = false;

     for (int c = analizador.Read(); c != -1; c = analizador.Read())
     {
         if (c != 0)
         {
             // Ordinary character: keep it only once the anchor has opened.
             if (insideLink)
             {
                 text.Append((char)c);
             }
             continue;
         }

         // A zero from Read() marks a tag, available through the Tag property.
         if (analizador.Tag.Name == "a")
         {
             insideLink = true;
         }
         else if (analizador.Tag.Name == "/a")
         {
             return text.ToString();
         }
     }

     return "no se encontro la categoria :S";
 }

コード例 #9

0

ファイルを表示

ファイル: ParseChoiceList.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// Process the specified URL and extract the option list there.  For
        /// each closing option tag, ProcessOption is called with the text
        /// collected since the opening option tag and that tag's value
        /// attribute.
        /// </summary>
        /// <param name="url">The URL to process.</param>
        /// <param name="optionList">Which option list to process, zero for first.</param>
        public void Process(Uri url, int optionList)
        {
            String value = "";
            StringBuilder buffer = new StringBuilder();
            WebRequest http = HttpWebRequest.Create(url);

            // Release the connection and stream even if parsing throws.
            using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
            using (Stream istream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(istream);

                Advance(parse, "select", optionList);

                int ch;
                while ((ch = parse.Read()) != -1)
                {
                    // A zero from Read() marks a tag, available through parse.Tag.
                    if (ch == 0)
                    {
                        HTMLTag tag = parse.Tag;
                        // Compare without regard to case, as Advance does for
                        // "select", so that upper-case markup is also handled.
                        if (String.Compare(tag.Name, "option", true) == 0)
                        {
                            value = tag["value"];
                            buffer.Length = 0;
                        }
                        else if (String.Compare(tag.Name, "/option", true) == 0)
                        {
                            ProcessOption(buffer.ToString(), value);
                        }
                        else if (String.Compare(tag.Name, "/select", true) == 0)
                        {
                            // The list was opened by a select tag, so it ends
                            // at "/select".  The former "/choice" is not an
                            // HTML tag and never stopped the loop.
                            break;
                        }
                    }
                    else
                    {
                        buffer.Append((char)ch);
                    }
                }
            }
        }

コード例 #10

0

ファイルを表示

ファイル: UseCookie.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// Use the cookie to search for the specified state or capital.  The search
        /// method can be called multiple times per login.
        /// </summary>
        /// <param name="search">The search string to use.</param>
        /// <param name="type">What to search for(s=state,c=capital).</param>
        /// <returns>A list of states or capitals.</returns>
        public List<String> Search(String search, String type)
        {
            String listType = "ul";
            String listTypeEnd = "/ul";
            StringBuilder buffer = new StringBuilder();
            bool capture = false;
            List<String> result = new List<String>();

            // build the request
            Uri url = new Uri("http://www.httprecipes.com/1/8/menuc.php");

            HttpWebRequest http = (HttpWebRequest)HttpWebRequest.Create(url);
            http.CookieContainer = cookies;
            http.Timeout = 30000;
            http.ContentType = "application/x-www-form-urlencoded";
            http.Method = "POST";

            // perform the post; the using block closes the request stream
            // even if writing the form fails
            using (Stream ostream = http.GetRequestStream())
            {
                FormUtility form = new FormUtility(ostream, null);
                form.Add("search", search);
                form.Add("type", type);
                form.Add("action", "Search");
                form.Complete();
            }

            // read the results, releasing the connection when done
            using (WebResponse response = http.GetResponse())
            using (Stream istream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(istream);

                // skip ahead to the result list
                Advance(parse, listType, 0);

                int ch;
                while ((ch = parse.Read()) != -1)
                {
                    // A zero from Read() marks a tag, available through parse.Tag.
                    if (ch == 0)
                    {
                        HTMLTag tag = parse.Tag;
                        if (String.Compare(tag.Name, "li", true) == 0)
                        {
                            // A new item started before the previous one was
                            // closed: flush what was collected so far.
                            if (buffer.Length > 0)
                            {
                                result.Add(buffer.ToString());
                            }
                            buffer.Length = 0;
                            capture = true;
                        }
                        else if (String.Compare(tag.Name, "/li", true) == 0)
                        {
                            result.Add(buffer.ToString());
                            buffer.Length = 0;
                            capture = false;
                        }
                        else if (String.Compare(tag.Name, listTypeEnd, true) == 0)
                        {
                            // Flush a last item that was never closed.  When
                            // the last item was closed the buffer is already
                            // empty, and adding it would append a bogus empty
                            // entry to the result.
                            if (buffer.Length > 0)
                            {
                                result.Add(buffer.ToString());
                            }
                            break;
                        }
                    }
                    else
                    {
                        // Only text inside a list item is collected.
                        if (capture)
                        {
                            buffer.Append((char)ch);
                        }
                    }
                }
            }

            return result;
        }

コード例 #11

0

ファイルを表示

ファイル: Peliculon.cs プロジェクト: johandebruin/Johan-Bot

 /// <summary>
 /// Collects characters from the parser up to the next tag and returns
 /// them as the title text.
 /// </summary>
 /// <param name="analizador">The HTML parser to read from.</param>
 /// <returns>The text read before the next tag, or a fallback message if the input ends first.</returns>
 static string Titulo(ParseHTML analizador)
 {
     StringBuilder text = new StringBuilder();

     for (;;)
     {
         int c = analizador.Read();
         if (c == -1)
         {
             // Input ended before any tag was reached.
             break;
         }

         if (c <= 0)
         {
             // Not a character (a zero from Read() marks a tag): stop here.
             return text.ToString();
         }

         text.Append((char)c);
     }

     return "No se encontro el titulo :S";
 }

コード例 #12

0

ファイルを表示

ファイル: Peliculon.cs プロジェクト: johandebruin/Johan-Bot

 /// <summary>
 /// Advances to the next "img" tag and returns its src attribute.
 /// </summary>
 /// <param name="analizador">The HTML parser to read from.</param>
 /// <returns>The src attribute of the next img tag, or a fallback message if the input ends first.</returns>
 static string Imagen(ParseHTML analizador)
 {
     int c;
     do
     {
         c = analizador.Read();

         // A zero from Read() marks a tag, available through the Tag property.
         if (c == 0 && analizador.Tag.Name == "img")
         {
             return analizador.Tag["src"];
         }
     } while (c != -1);

     return "no se encontro una imagen :S";
 }

コード例 #13

0

ファイルを表示

ファイル: Peliculon.cs プロジェクト: johandebruin/Johan-Bot

 /// <summary>
 /// Advances to the next "a" tag and returns its href attribute.
 /// </summary>
 /// <param name="analizador">The HTML parser to read from.</param>
 /// <returns>The href attribute of the next anchor, or a fallback message if the input ends first.</returns>
 static string Enlace(ParseHTML analizador)
 {
     for (int c = analizador.Read(); c != -1; c = analizador.Read())
     {
         // Only a zero from Read() marks a tag; skip ordinary characters.
         if (c != 0)
         {
             continue;
         }

         if (analizador.Tag.Name == "a")
         {
             return analizador.Tag["href"];
         }
     }

     return "no se encontro enlaces :S";
 }

コード例 #14

0

ファイルを表示

ファイル: Peliculon.cs プロジェクト: johandebruin/Johan-Bot

 /// <summary>
 /// Reads characters from the parser until Read() returns a value that is
 /// not positive (a tag or the end of input) and returns the text read.
 /// </summary>
 /// <param name="analizador">The HTML parser to read from.</param>
 /// <returns>The text collected; empty if nothing precedes the stop point.</returns>
 static string Descripcion(ParseHTML analizador)
 {
     StringBuilder text = new StringBuilder();

     for (int c = analizador.Read(); c > 0; c = analizador.Read())
     {
         text.Append((char)c);
     }

     return text.ToString();
 }

コード例 #15

0

ファイルを表示

ファイル: ExtractSubPage.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// Process the specified URL and extract data from all of the subpages
        /// that this page links to.  Every anchor with an href attribute is
        /// resolved against the page URL and passed to ProcessSubPage.
        /// </summary>
        /// <param name="url">The URL to process.</param>
        public void Process(Uri url)
        {
            WebRequest http = HttpWebRequest.Create(url);

            // Release the connection and stream even if a subpage fails.
            using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
            using (Stream istream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(istream);

                int ch;
                while ((ch = parse.Read()) != -1)
                {
                    // A zero from Read() marks a tag, available through parse.Tag.
                    if (ch == 0)
                    {
                        HTMLTag tag = parse.Tag;
                        if (String.Compare(tag.Name, "a", true) == 0)
                        {
                            // An anchor may have no href (e.g. a named
                            // anchor); there is no subpage to visit then.
                            String href = tag["href"];
                            if (href != null)
                            {
                                ProcessSubPage(new Uri(url, href));
                            }
                        }
                    }
                }
            }
        }

コード例 #16

0

ファイルを表示

ファイル: JavaScriptForms.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// This method will download an amortization table for the 
        /// specified parameters.  Each table row that contains cells is
        /// passed to ProcessTableRow as a list of cell texts.
        /// </summary>
        /// <param name="interest">The interest rate for the loan.</param>
        /// <param name="term">The term(in months) of the loan.</param>
        /// <param name="principle">The principle amount of the loan.</param>
        public void process(double interest, int term, int principle)
        {
            Uri url = new Uri("http://www.httprecipes.com/1/9/loan.php");
            WebRequest http = HttpWebRequest.Create(url);
            http.Timeout = 30000;
            http.ContentType = "application/x-www-form-urlencoded";
            http.Method = "POST";

            // Format numbers with the invariant culture: the values are sent
            // to a web form, so the decimal separator must not depend on the
            // locale of the machine (e.g. "7,5" instead of "7.5").
            System.Globalization.CultureInfo invariant =
                System.Globalization.CultureInfo.InvariantCulture;

            // The using block closes the request stream even if writing fails.
            using (Stream ostream = http.GetRequestStream())
            {
                FormUtility form = new FormUtility(ostream, null);
                form.Add("interest", interest.ToString(invariant));
                form.Add("term", term.ToString(invariant));
                form.Add("principle", principle.ToString(invariant));
                form.Complete();
            }

            StringBuilder buffer = new StringBuilder();
            List<String> list = new List<String>();
            bool capture = false;

            // Release the connection and stream even if parsing throws.
            using (WebResponse response = http.GetResponse())
            using (Stream istream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(istream);

                Advance(parse, "table", 3);

                int ch;
                while ((ch = parse.Read()) != -1)
                {
                    // A zero from Read() marks a tag, available through parse.Tag.
                    if (ch == 0)
                    {
                        HTMLTag tag = parse.Tag;
                        if (String.Compare(tag.Name, "tr", true) == 0)
                        {
                            list.Clear();
                            capture = false;
                            buffer.Length = 0;
                        }
                        else if (String.Compare(tag.Name, "/tr", true) == 0)
                        {
                            if (list.Count > 0)
                            {
                                ProcessTableRow(list);
                                list.Clear();
                            }
                        }
                        else if (String.Compare(tag.Name, "td", true) == 0)
                        {
                            // A new cell started before the previous one was
                            // closed: keep what was collected so far.
                            if (buffer.Length > 0)
                            {
                                list.Add(buffer.ToString());
                            }
                            buffer.Length = 0;
                            capture = true;
                        }
                        else if (String.Compare(tag.Name, "/td", true) == 0)
                        {
                            list.Add(buffer.ToString());
                            buffer.Length = 0;
                            capture = false;
                        }
                        else if (String.Compare(tag.Name, "/table", true) == 0)
                        {
                            break;
                        }
                    }
                    else
                    {
                        // Only text inside a table cell is collected.
                        if (capture)
                        {
                            buffer.Append((char)ch);
                        }
                    }
                }
            }
        }

コード例 #17

0

ファイルを表示

ファイル: WhenBorn.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// Check the specified URL for a birth year.  The page text is split
        /// into sentences at each period; every sentence is passed to
        /// ExtractBirth, and a result greater than 1 and less than 3000 is
        /// reported and recorded through IncreaseYear.  Network and IO
        /// failures are ignored.
        /// </summary>
        /// <param name="url">The URL to check.</param>
        public void CheckURL(Uri url)
        {
            int ch;
            StringBuilder sentence = new StringBuilder();

            try
            {
                WebRequest http = HttpWebRequest.Create(url);

                // Release the connection and stream even if reading fails.
                using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
                using (Stream istream = response.GetResponseStream())
                {
                    ParseHTML html = new ParseHTML(istream);
                    do
                    {
                        ch = html.Read();
                        // Skip the end marker (-1) and tags (0); only text
                        // characters contribute to a sentence.
                        if ((ch != -1) && (ch != 0))
                        {
                            if (ch == '.')
                            {
                                String str = sentence.ToString();
                                int year = ExtractBirth(str);
                                if ((year > 1) && (year < 3000))
                                {
                                    Console.WriteLine("URL supports year: " + year);
                                    IncreaseYear(year);
                                }
                                sentence.Length = 0;
                            }
                            else
                            {
                                sentence.Append((char)ch);
                            }
                        }
                    } while (ch != -1);
                }
            }
            catch (WebException)
            {
                // Deliberately ignored: a URL that cannot be fetched is skipped.
            }
            catch (IOException)
            {
                // Deliberately ignored: a page that cannot be read is skipped.
            }
        }

コード例 #18

0

ファイルを表示

ファイル: FormPOST.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// Access the website and perform a search for either states or capitals.
        /// The text of each list item in the result is passed to ProcessItem.
        /// </summary>
        /// <param name="search">A search string.</param>
        /// <param name="type">What to search for(s=state, c=capital)</param>
        /// <exception cref="IOException">Thrown if an IO exception occurs.</exception>
        public void Process(String search, String type)
        {
            String listType = "ul";
            String listTypeEnd = "/ul";
            StringBuilder buffer = new StringBuilder();
            bool capture = false;

            // Build the URL and POST.
            Uri url = new Uri("http://www.httprecipes.com/1/7/post.php");
            WebRequest http = HttpWebRequest.Create(url);
            http.Timeout = 30000;
            http.ContentType = "application/x-www-form-urlencoded";
            http.Method = "POST";

            // The using block closes the request stream even if writing fails.
            using (Stream ostream = http.GetRequestStream())
            {
                FormUtility form = new FormUtility(ostream, null);
                form.Add("search", search);
                form.Add("type", type);
                form.Add("action", "Search");
                form.Complete();
            }

            // read the results, releasing the connection when done
            using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
            using (Stream istream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(istream);

                // skip ahead to the result list
                Advance(parse, listType, 0);

                int ch;
                while ((ch = parse.Read()) != -1)
                {
                    // A zero from Read() marks a tag, available through parse.Tag.
                    if (ch == 0)
                    {
                        HTMLTag tag = parse.Tag;
                        if (String.Compare(tag.Name, "li", true) == 0)
                        {
                            // A new item started before the previous one was
                            // closed: flush what was collected so far.
                            if (buffer.Length > 0)
                            {
                                ProcessItem(buffer.ToString());
                            }
                            buffer.Length = 0;
                            capture = true;
                        }
                        else if (String.Compare(tag.Name, "/li", true) == 0)
                        {
                            ProcessItem(buffer.ToString());
                            buffer.Length = 0;
                            capture = false;
                        }
                        else if (String.Compare(tag.Name, listTypeEnd, true) == 0)
                        {
                            // Flush a last item that was never closed.  When
                            // the last item was closed the buffer is already
                            // empty and must not be reported as an item.
                            if (buffer.Length > 0)
                            {
                                ProcessItem(buffer.ToString());
                            }
                            break;
                        }
                    }
                    else
                    {
                        // Only text inside a list item is collected.
                        if (capture)
                        {
                            buffer.Append((char)ch);
                        }
                    }
                }
            }
        }

コード例 #19

0

ファイルを表示

ファイル: ExtractPartial.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// Called to process each partial page.  Anchors that contain an
        /// image are passed to ProcessItem (except the first such anchor on
        /// the page), and the anchor whose text is "[Next 5]" supplies the
        /// address of the next partial page.
        /// </summary>
        /// <param name="url">The URL of the partial page.</param>
        /// <returns>Returns the next partial page, or null if no more.</returns>
        public Uri Process(Uri url)
        {
            Uri result = null;
            StringBuilder buffer = new StringBuilder();
            String value = "";
            String src = "";
            bool first = true;

            WebRequest http = HttpWebRequest.Create(url);

            // Release the connection and stream even if parsing throws.
            using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
            using (Stream istream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(istream);

                int ch;
                while ((ch = parse.Read()) != -1)
                {
                    // A zero from Read() marks a tag, available through parse.Tag.
                    if (ch == 0)
                    {
                        HTMLTag tag = parse.Tag;
                        if (String.Compare(tag.Name, "a", true) == 0)
                        {
                            buffer.Length = 0;
                            // An anchor may have no href (e.g. a named
                            // anchor); remember that as a null target rather
                            // than dereferencing a null attribute value.
                            String href = tag["href"];
                            if (href == null)
                            {
                                value = null;
                            }
                            else
                            {
                                value = new Uri(url, href).ToString();
                            }
                            src = null;
                        }
                        else if (String.Compare(tag.Name, "img", true) == 0)
                        {
                            src = tag["src"];
                        }
                        else if (String.Compare(tag.Name, "/a", true) == 0)
                        {
                            if (value == null)
                            {
                                // The anchor had no target: nothing to follow.
                            }
                            else if (String.Compare(buffer.ToString(), "[Next 5]", true) == 0)
                            {
                                result = new Uri(url, value);
                            }
                            else if (src != null)
                            {
                                // The first image link on the page is skipped.
                                if (!first)
                                {
                                    Uri urlOfficial = new Uri(url, value);
                                    Uri urlFlag = new Uri(url, src);
                                    ProcessItem(urlOfficial, urlFlag);
                                }
                                else
                                {
                                    first = false;
                                }
                            }
                        }
                    }
                    else
                    {
                        buffer.Append((char)ch);
                    }
                }
            }

            return result;
        }

コード例 #20

0

ファイルを表示

ファイル: FormGET.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// Access the website and perform a search for either states or capitals.
        /// The form data is sent as the query string of a GET request, and the
        /// text of each list item in the result is passed to ProcessItem.
        /// </summary>
        /// <param name="search">A search string.</param>
        /// <param name="type">What to search for(s=state, c=capital)</param>
        public void Process(String search, String type)
        {
            String listType = "ul";
            String listTypeEnd = "/ul";
            StringBuilder buffer = new StringBuilder();
            bool capture = false;

            // Build the URL.
            MemoryStream mstream = new MemoryStream();
            FormUtility form = new FormUtility(mstream, null);
            form.Add("search", search);
            form.Add("type", type);
            form.Add("action", "Search");
            form.Complete();

            System.Text.ASCIIEncoding enc = new System.Text.ASCIIEncoding();

            // ToArray() returns only the bytes actually written.  GetBuffer()
            // returns the whole backing array, whose unused tail would end up
            // in the query string as NUL characters.
            String str = enc.GetString(mstream.ToArray());
            String surl = "http://www.httprecipes.com/1/7/get.php?" + str;
            Uri url = new Uri(surl);
            WebRequest http = HttpWebRequest.Create(url);

            // Release the connection and stream even if parsing throws.
            using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
            using (Stream istream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(istream);

                // Skip ahead to the result list.
                Advance(parse, listType, 0);

                int ch;
                while ((ch = parse.Read()) != -1)
                {
                    // A zero from Read() marks a tag, available through parse.Tag.
                    if (ch == 0)
                    {
                        HTMLTag tag = parse.Tag;
                        if (String.Compare(tag.Name, "li", true) == 0)
                        {
                            // A new item started before the previous one was
                            // closed: flush what was collected so far.
                            if (buffer.Length > 0)
                            {
                                ProcessItem(buffer.ToString());
                            }
                            buffer.Length = 0;
                            capture = true;
                        }
                        else if (String.Compare(tag.Name, "/li", true) == 0)
                        {
                            ProcessItem(buffer.ToString());
                            buffer.Length = 0;
                            capture = false;
                        }
                        else if (String.Compare(tag.Name, listTypeEnd, true) == 0)
                        {
                            // Flush a last item that was never closed.  When
                            // the last item was closed the buffer is already
                            // empty and must not be reported as an item.
                            if (buffer.Length > 0)
                            {
                                ProcessItem(buffer.ToString());
                            }
                            break;
                        }
                    }
                    else
                    {
                        // Only text inside a list item is collected.
                        if (capture)
                        {
                            buffer.Append((char)ch);
                        }
                    }
                }
            }
        }

コード例 #21

0

ファイルを表示

ファイル: Text.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// Check the specified URL for a birth year. This will occur if one sentence
        /// is found that has the word born, and a numeric value less than 3000.
        /// </summary>
        /// <param name="report">Object to report to.</param>
        /// <param name="url">The url.</param>
        /// <param name="desiredYear">The desired year.</param>
        public static void CheckURL(ScanReportable report, Uri url,
                 int desiredYear)
        {
            int ch;
            StringBuilder sentence = new StringBuilder();
            String ignoreUntil = null;

            WebRequest http = HttpWebRequest.Create(url);
            http.Timeout = 10000;
            HttpWebResponse response = (HttpWebResponse)http.GetResponse();            
            Stream istream = response.GetResponseStream();
            ParseHTML html = new ParseHTML(istream);


            do
            {
                ch = html.Read();
                if ((ch != -1) && (ch != 0) && (ignoreUntil == null))
                {
                    if (".?!".IndexOf((char)ch) != -1)
                    {
                        String str = sentence.ToString();
                        int year = Text.ExtractYear(str);

                        if (desiredYear == -1)
                        {
                            // looking for any year
                            if (year != -1)
                            {
                                report.ReceiveGoodSentence(str);
                            }
                        }
                        else
                        {
                            // looking for a specific year
                            if (year == desiredYear)
                            {
                                report.ReceiveGoodSentence(str);
                            }
                            else if (year != -1)
                            {
                                report.ReceiveBadSentence(str);
                            }
                        }
                        sentence.Length = 0;
                    }
                    else if (ch == ' ')
                    {
                        string str = sentence.ToString();
                        if ((sentence.Length > 0)
                                && (str[str.Length - 1] != ' '))
                        {
                            sentence.Append(' ');

                        }
                    }
                    else if ((ch != '\n') && (ch != '\t') && (ch != '\r'))
                    {
                        if ((ch) < 128)
                        {
                            sentence.Append((char)ch);
                        }
                    }
                }
                else if (ch == 0)
                {
                    // clear anything before a body tag
                    if (html.Tag.Name.Equals("body", StringComparison.CurrentCultureIgnoreCase)
                            || html.Tag.Name.Equals("br", StringComparison.CurrentCultureIgnoreCase)
                            || html.Tag.Name.Equals("li", StringComparison.CurrentCultureIgnoreCase)
                            || html.Tag.Name.Equals("p", StringComparison.CurrentCultureIgnoreCase)
                            || html.Tag.Name.Equals("h1", StringComparison.CurrentCultureIgnoreCase)
                            || html.Tag.Name.Equals("h2", StringComparison.CurrentCultureIgnoreCase)
                            || html.Tag.Name.Equals("h3", StringComparison.CurrentCultureIgnoreCase)
                            || html.Tag.Name.Equals("td", StringComparison.CurrentCultureIgnoreCase)
                            || html.Tag.Name.Equals("th", StringComparison.CurrentCultureIgnoreCase))
                    {
                        sentence.Length = 0;
                    }
                    // ignore everything between script and style tags
                    if (ignoreUntil == null)
                    {
                        if (html.Tag.Name.Equals("script", StringComparison.CurrentCultureIgnoreCase))
                        {
                            ignoreUntil = "/script";
                        }
                        else if (html.Tag.Name
                                .Equals("style", StringComparison.CurrentCultureIgnoreCase))
                        {
                            ignoreUntil = "/style";
                        }
                    }
                    else
                    {
                        if (html.Tag.Name.Equals(ignoreUntil, StringComparison.CurrentCultureIgnoreCase))
                        {
                            ignoreUntil = null;
                        }
                    }

                    // add a space after the tag
                    if (sentence.Length > 0)
                    {
                        string str = sentence.ToString();
                        if (str[str.Length - 1] != ' ')
                        {
                            sentence.Append(' ');
                        }
                    }
                }
            } while (ch != -1);

        }

コード例 #22

0

ファイルを表示

ファイル: Includes.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// Called to download the text from a page.  If any JavaScript
        /// include is found, the text from that page is read too and
        /// placed inline between script tags.  The assembled text is
        /// written to the console.
        /// </summary>
        public void Process()
        {
            Uri url = new Uri("http://www.httprecipes.com/1/9/includes.php");
            WebRequest http = HttpWebRequest.Create(url);
            http.Timeout = 30000;
            StringBuilder buffer = new StringBuilder();

            // Dispose the response and its stream even when parsing throws,
            // so the underlying connection is always released.
            using (WebResponse response = http.GetResponse())
            using (Stream stream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(stream);

                int ch;
                while ((ch = parse.Read()) != -1)
                {
                    if (ch == 0)
                    {
                        // A zero from Read() signals that a tag is available in parse.Tag.
                        HTMLTag tag = parse.Tag;

                        // Ordinal comparison: tag names are identifiers, and a
                        // culture-sensitive compare mismatches "SCRIPT" under
                        // cultures with special casing rules for 'i' (e.g. Turkish).
                        if (String.Equals(tag.Name, "script", StringComparison.OrdinalIgnoreCase)
                            && tag["src"] != null)
                        {
                            // External script: resolve it against the page URL,
                            // download it, and embed its text inline.
                            String src = tag["src"];
                            Uri u = new Uri(url, src);
                            String include = DownloadPage(u);
                            buffer.Append("<script>");
                            buffer.Append(include);
                            buffer.Append("</script>");
                        }
                        else
                        {
                            // Any other tag is copied through unchanged.
                            buffer.Append(tag.ToString());
                        }
                    }
                    else
                    {
                        buffer.Append((char)ch);
                    }
                }
            }

            Console.WriteLine(buffer.ToString());
        }

コード例 #23

0

ファイルを表示

ファイル: ParseTable.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// Called to parse a table.  The table number at the specified URL
        /// will be parsed.  Each completed row that contains at least one
        /// cell is passed to ProcessTableRow.
        /// </summary>
        /// <param name="url">The URL of the HTML page that contains the table.</param>
        /// <param name="tableNum">The table number to parse, zero for the first.</param>
        public void Process(Uri url, int tableNum)
        {
            WebRequest http = HttpWebRequest.Create(url);

            // Dispose the response and its stream even when parsing throws,
            // so the underlying connection is always released.
            using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
            using (Stream istream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(istream);
                StringBuilder buffer = new StringBuilder();
                List<String> list = new List<String>();
                bool capture = false;

                Advance(parse, "table", tableNum);

                int ch;
                while ((ch = parse.Read()) != -1)
                {
                    if (ch == 0)
                    {
                        // A zero from Read() signals that a tag is available in parse.Tag.
                        // Tag names are compared ordinally: they are identifiers,
                        // not linguistic text, so the current culture must not matter.
                        HTMLTag tag = parse.Tag;
                        if (String.Equals(tag.Name, "tr", StringComparison.OrdinalIgnoreCase))
                        {
                            // A new row starts: discard any partial state.
                            list.Clear();
                            capture = false;
                            buffer.Length = 0;
                        }
                        else if (String.Equals(tag.Name, "/tr", StringComparison.OrdinalIgnoreCase))
                        {
                            if (list.Count > 0)
                            {
                                ProcessTableRow(list);
                                list.Clear();
                            }
                        }
                        else if (String.Equals(tag.Name, "td", StringComparison.OrdinalIgnoreCase))
                        {
                            // Text still buffered here means the previous cell
                            // was never closed with /td; keep it as a cell.
                            if (buffer.Length > 0)
                            {
                                list.Add(buffer.ToString());
                            }
                            buffer.Length = 0;
                            capture = true;
                        }
                        else if (String.Equals(tag.Name, "/td", StringComparison.OrdinalIgnoreCase))
                        {
                            list.Add(buffer.ToString());
                            buffer.Length = 0;
                            capture = false;
                        }
                        else if (String.Equals(tag.Name, "/table", StringComparison.OrdinalIgnoreCase))
                        {
                            // End of the requested table: stop reading the page.
                            break;
                        }
                    }
                    else if (capture)
                    {
                        // Only text inside a cell is collected.
                        buffer.Append((char)ch);
                    }
                }
            }
        }

コード例 #24

0

ファイルを表示

ファイル: DownloadArticle.cs プロジェクト: Clever-Boy/jeffheaton-book-code

        /// <summary>
        /// This method looks for each of the &lt;option&gt; tags that contain
        /// a link to each of the pages.  For each page found the
        /// DownloadArticlePage method is called and its result is written
        /// to the console.
        /// </summary>
        public void Process()
        {
            Uri url = new Uri("http://www.httprecipes.com/1/9/article.php");
            WebRequest http = HttpWebRequest.Create(url);
            http.Timeout = 30000;

            // Dispose the response and its stream even when parsing throws,
            // so the underlying connection is always released.
            using (WebResponse response = http.GetResponse())
            using (Stream stream = response.GetResponseStream())
            {
                ParseHTML parse = new ParseHTML(stream);

                int ch;
                while ((ch = parse.Read()) != -1)
                {
                    // Only tags matter here (a zero from Read() signals that
                    // a tag is available in parse.Tag); page text is ignored.
                    if (ch != 0)
                    {
                        continue;
                    }

                    HTMLTag tag = parse.Tag;

                    // Ordinal comparison: a culture-sensitive compare mismatches
                    // "OPTION" under cultures with special casing rules for 'i'.
                    if (!String.Equals(tag.Name, "option", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    // An option without a value carries no page link to follow.
                    String str = tag["value"];
                    if (String.IsNullOrEmpty(str))
                    {
                        continue;
                    }

                    Uri u = new Uri(url, str);
                    Console.WriteLine(DownloadArticlePage(u));
                }
            }
        }

C# (CSharp) HeatonResearch.Spider.HTML ParseHTMLの例