/// <summary>
/// Downloads the page at the specified URL and reports every hyperlink on it.
/// For each closing anchor tag, <c>ProcessOption</c> is called with the text
/// collected since the opening anchor and the absolute form of its target.
/// </summary>
/// <param name="url">The URL to process; relative links are resolved against it.</param>
/// <param name="optionList">Which option list to process. Not used by this method.</param>
public void Process(Uri url, int optionList)
{
    String value = "";
    WebRequest http = HttpWebRequest.Create(url);

    // Dispose the response and its stream even if parsing throws, so the
    // underlying connection is always released.
    using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
    using (Stream istream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(istream);
        StringBuilder buffer = new StringBuilder();

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // A non-zero value is a text character; zero means a tag was read.
            if (ch != 0)
            {
                buffer.Append((char)ch);
                continue;
            }

            HTMLTag tag = parse.Tag;
            if (String.Equals(tag.Name, "a", StringComparison.OrdinalIgnoreCase))
            {
                // Anchors without an href (e.g. named anchors) have no target.
                String href = tag["href"];
                value = (href == null) ? null : new Uri(url, href).ToString();
                buffer.Length = 0;
            }
            else if (String.Equals(tag.Name, "/a", StringComparison.OrdinalIgnoreCase))
            {
                if (value != null)
                {
                    ProcessOption(buffer.ToString(), value);
                }
            }
        }
    }
}
/// <summary>
/// Downloads the page at the specified URL and saves every image it references.
/// Each <c>img</c> tag's <c>src</c> is resolved against <paramref name="url"/>,
/// fetched, and written to <paramref name="saveTo"/> under the name returned by
/// <c>ExtractFile</c>.
/// </summary>
/// <param name="url">The URL to process.</param>
/// <param name="saveTo">A directory to save the images to.</param>
public void Process(Uri url, String saveTo)
{
    WebRequest http = HttpWebRequest.Create(url);

    // Dispose the page response and its stream even if a download fails.
    using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
    using (Stream istream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(istream);

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // Only tags (signalled by zero) are of interest here.
            if (ch != 0)
            {
                continue;
            }

            HTMLTag tag = parse.Tag;
            if (!String.Equals(tag.Name, "img", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // An image tag without a usable src has nothing to download.
            String src = tag["src"];
            if (String.IsNullOrEmpty(src))
            {
                continue;
            }

            Uri u = new Uri(url, src);
            String filename = ExtractFile(u);
            String saveFile = Path.Combine(saveTo, filename);

            WebRequest http2 = HttpWebRequest.Create(u);
            // The image response is closed even when DownloadBinaryFile throws.
            using (HttpWebResponse response2 = (HttpWebResponse)http2.GetResponse())
            {
                this.DownloadBinaryFile(response2, saveFile);
            }
        }
    }
}
/// <summary>
/// Looks for a <c>link</c> tag at the specified URL whose <c>type</c> attribute
/// mentions "rss". If one is found, the feed it points to is handed to
/// <c>ProcessRSS</c>; otherwise a message is written to the console.
/// When several matching tags exist, the last one wins.
/// </summary>
/// <param name="url">The URL of the web site.</param>
public void Process(Uri url)
{
    String href = null;

    WebRequest http = HttpWebRequest.Create(url);
    http.Timeout = 30000;

    // The page is fully scanned and its connection released before the feed
    // itself is requested.
    using (WebResponse response = http.GetResponse())
    using (Stream stream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(stream);

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // Zero signals that a tag was read; text characters are skipped.
            if (ch != 0)
            {
                continue;
            }

            HTMLTag tag = parse.Tag;
            if (!String.Equals(tag.Name, "link", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            String type = tag["type"];
            if (type != null && type.IndexOf("rss", StringComparison.OrdinalIgnoreCase) != -1)
            {
                href = tag["href"];
            }
        }
    }

    if (href == null)
    {
        Console.WriteLine("No RSS link found.");
    }
    else
    {
        // Feed links are frequently relative; resolve them against the page
        // URL. An absolute href is returned unchanged by this constructor.
        ProcessRSS(new Uri(url, href));
    }
}
/// <summary>
/// Creates an analyzer for the page at the given address: issues the HTTP
/// request with a browser-like user agent, then stores the response, its
/// stream and a <c>ParseHTML</c> instance over that stream in the
/// <c>respuesta</c>, <c>istream</c> and <c>html</c> members.
/// </summary>
/// <param name="url">Address of the page to request.</param>
public Analizador(string url)
{
    const string userAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3";

    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);

    // Proxy and AllowAutoRedirect are left at their defaults.
    request.UserAgent = userAgent;
    request.Timeout = 200000;

    respuesta = (HttpWebResponse)request.GetResponse();
    istream = respuesta.GetResponseStream();
    html = new ParseHTML(istream);
}
/// <summary>
/// Advances the parser until it reaches a tag with the given name whose
/// attribute has exactly the given value.
/// </summary>
/// <param name="analizador">The HTML parser to advance.</param>
/// <param name="etiqueta">The tag name to reach; matched without regard to case.</param>
/// <param name="nombreAtributo">The name of the attribute to inspect, such as "src".</param>
/// <param name="atributo">The attribute value required; matched exactly.</param>
/// <returns>True if such a tag was reached; false if the input ended first.</returns>
public static bool AvanzarA(ParseHTML analizador, String etiqueta, String nombreAtributo, String atributo)
{
    int ch;
    while ((ch = analizador.Read()) != -1)
    {
        // Zero signals that a tag was read; text characters are skipped.
        if (ch != 0)
        {
            continue;
        }

        // HTML tag names are case-insensitive, so "IMG" must match "img".
        if (String.Equals(analizador.Tag.Name, etiqueta, StringComparison.OrdinalIgnoreCase)
            && analizador.Tag[nombreAtributo] == atributo)
        {
            return true;
        }
    }
    return false;
}
/// <summary>
/// Advance to the specified HTML tag.
/// </summary>
/// <param name="parse">The HTML parse object to use.</param>
/// <param name="tag">The HTML tag name, matched without regard to case.</param>
/// <param name="count">
/// How many tags like this to find. The counter is decremented on every match
/// and the method returns once it is no longer positive, so the values 0 and 1
/// both stop at the first matching tag.
/// </param>
/// <returns>True if found, false if the input ended first.</returns>
private bool Advance(ParseHTML parse, String tag, int count)
{
    int ch;
    while ((ch = parse.Read()) != -1)
    {
        // Zero signals that a tag was read; text characters are skipped.
        if (ch != 0)
        {
            continue;
        }

        // Ordinal comparison keeps tag matching independent of the current
        // culture (e.g. "LI" must equal "li" under Turkish casing rules too).
        if (String.Equals(parse.Tag.Name, tag, StringComparison.OrdinalIgnoreCase))
        {
            count--;
            if (count <= 0)
            {
                return true;
            }
        }
    }
    return false;
}
/// <summary>
/// Called to extract a list from the specified URL. Every list item found is
/// written to the console and/or handed to <c>ProcessItem</c>; parsing stops at
/// the list's closing tag.
/// </summary>
/// <param name="url">The URL to extract the list from.</param>
/// <param name="listType">What type of list, given as its opening tag name (for example "ul").</param>
/// <param name="optionList">Which list to search; passed to <c>Advance</c> as the match count.</param>
public void Process(Uri url, String listType, int optionList)
{
    // Closing tags are reported with a leading slash ("/ul"), not a trailing one.
    String listTypeEnd = "/" + listType;

    WebRequest http = HttpWebRequest.Create(url);

    // Dispose the response and its stream even if parsing throws.
    using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
    using (Stream istream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(istream);
        StringBuilder buffer = new StringBuilder();
        bool capture = false;

        Advance(parse, listType, optionList);

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // A non-zero value is a text character; zero means a tag was read.
            if (ch != 0)
            {
                if (capture)
                {
                    buffer.Append((char)ch);
                }
                continue;
            }

            String name = parse.Tag.Name;
            if (String.Equals(name, "li", StringComparison.OrdinalIgnoreCase))
            {
                // A new item starts; flush a previous item that was never closed.
                if (buffer.Length > 0)
                {
                    ProcessItem(buffer.ToString());
                }
                buffer.Length = 0;
                capture = true;
            }
            else if (String.Equals(name, "/li", StringComparison.OrdinalIgnoreCase))
            {
                Console.WriteLine(buffer.ToString());
                ProcessItem(buffer.ToString());
                buffer.Length = 0;
                capture = false;
            }
            else if (String.Equals(name, listTypeEnd, StringComparison.OrdinalIgnoreCase))
            {
                break;
            }
        }
    }
}
/// <summary>
/// Reads the text of the next anchor element: text is collected once an
/// <c>a</c> tag has been seen and returned when a closing <c>/a</c> tag is read.
/// </summary>
/// <param name="analizador">The HTML parser to read from.</param>
/// <returns>
/// The text collected up to the first closing anchor tag, or a fixed
/// "not found" message if the input ends before one is read.
/// </returns>
static string Categoria(ParseHTML analizador)
{
    int ch;
    bool leer = false;
    StringBuilder buffer = new StringBuilder();

    while ((ch = analizador.Read()) != -1)
    {
        if (ch == 0)
        {
            // HTML tag names are case-insensitive, so "A" must match "a".
            if (String.Equals(analizador.Tag.Name, "a", StringComparison.OrdinalIgnoreCase))
            {
                leer = true;
            }
            else if (String.Equals(analizador.Tag.Name, "/a", StringComparison.OrdinalIgnoreCase))
            {
                return buffer.ToString();
            }
        }
        else if (leer)
        {
            buffer.Append((char)ch);
        }
    }
    return "no se encontro la categoria :S";
}
/// <summary>
/// Process the specified URL and extract the option list there. For every
/// closing <c>option</c> tag, <c>ProcessOption</c> receives the option's text
/// and its <c>value</c> attribute; parsing stops at the closing <c>select</c> tag.
/// </summary>
/// <param name="url">The URL to process.</param>
/// <param name="optionList">Which option list to process; passed to <c>Advance</c> as the match count.</param>
public void Process(Uri url, int optionList)
{
    String value = "";
    WebRequest http = HttpWebRequest.Create(url);

    // Dispose the response and its stream even if parsing throws.
    using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
    using (Stream istream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(istream);
        StringBuilder buffer = new StringBuilder();

        Advance(parse, "select", optionList);

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // A non-zero value is a text character; zero means a tag was read.
            if (ch != 0)
            {
                buffer.Append((char)ch);
                continue;
            }

            HTMLTag tag = parse.Tag;
            if (String.Equals(tag.Name, "option", StringComparison.OrdinalIgnoreCase))
            {
                value = tag["value"];
                buffer.Length = 0;
            }
            else if (String.Equals(tag.Name, "/option", StringComparison.OrdinalIgnoreCase))
            {
                ProcessOption(buffer.ToString(), value);
            }
            else if (String.Equals(tag.Name, "/select", StringComparison.OrdinalIgnoreCase))
            {
                // End of the list that Advance positioned us in; do not run on
                // into later select elements on the page.
                break;
            }
        }
    }
}
/// <summary>
/// Use the cookie to search for the specified state or capital. The search
/// method can be called multiple times per login. The form is POSTed with the
/// <c>cookies</c> container attached and the items of the first list in the
/// response are returned.
/// </summary>
/// <param name="search">The search string to use.</param>
/// <param name="type">What to search for(s=state,c=capital).</param>
/// <returns>A list of states or capitals.</returns>
public List<String> Search(String search, String type)
{
    String listType = "ul";
    String listTypeEnd = "/ul";
    StringBuilder buffer = new StringBuilder();
    bool capture = false;
    List<String> result = new List<String>();

    // build the request
    Uri url = new Uri("http://www.httprecipes.com/1/8/menuc.php");
    HttpWebRequest http = (HttpWebRequest)HttpWebRequest.Create(url);
    http.CookieContainer = cookies;
    http.Timeout = 30000;
    http.ContentType = "application/x-www-form-urlencoded";
    http.Method = "POST";

    // perform the post; the request stream is closed even if writing fails
    using (Stream ostream = http.GetRequestStream())
    {
        FormUtility form = new FormUtility(ostream, null);
        form.Add("search", search);
        form.Add("type", type);
        form.Add("action", "Search");
        form.Complete();
    }

    // read the results
    using (WebResponse response = http.GetResponse())
    using (Stream istream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(istream);

        // parse from the URL
        Advance(parse, listType, 0);

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // A non-zero value is a text character; zero means a tag was read.
            if (ch != 0)
            {
                if (capture)
                {
                    buffer.Append((char)ch);
                }
                continue;
            }

            String name = parse.Tag.Name;
            if (String.Equals(name, "li", StringComparison.OrdinalIgnoreCase))
            {
                // A new item starts; flush a previous item that was never closed.
                if (buffer.Length > 0)
                {
                    result.Add(buffer.ToString());
                }
                buffer.Length = 0;
                capture = true;
            }
            else if (String.Equals(name, "/li", StringComparison.OrdinalIgnoreCase))
            {
                result.Add(buffer.ToString());
                buffer.Length = 0;
                capture = false;
            }
            else if (String.Equals(name, listTypeEnd, StringComparison.OrdinalIgnoreCase))
            {
                // Only flush a pending, unclosed item; otherwise an empty
                // entry would be appended after the last closed item.
                if (buffer.Length > 0)
                {
                    result.Add(buffer.ToString());
                }
                break;
            }
        }
    }

    return result;
}
/// <summary>
/// Collects the text characters read from the parser up to the next tag.
/// </summary>
/// <param name="analizador">The HTML parser to read from.</param>
/// <returns>
/// The text read before the first tag, or a fixed "not found" message when
/// the input ends before any tag is read.
/// </returns>
static string Titulo(ParseHTML analizador)
{
    StringBuilder text = new StringBuilder();

    for (int c = analizador.Read(); c != -1; c = analizador.Read())
    {
        // Anything that is not a positive character code ends the title.
        if (c <= 0)
        {
            return text.ToString();
        }
        text.Append((char)c);
    }

    return "No se encontro el titulo :S";
}
/// <summary>
/// Advances the parser to the next <c>img</c> tag and returns its <c>src</c>
/// attribute.
/// </summary>
/// <param name="analizador">The HTML parser to read from.</param>
/// <returns>
/// The <c>src</c> attribute of the first image tag read, or a fixed
/// "not found" message if the input ends before one is read.
/// </returns>
static string Imagen(ParseHTML analizador)
{
    int ch;
    while ((ch = analizador.Read()) != -1)
    {
        // Zero signals that a tag was read. HTML tag names are
        // case-insensitive, so "IMG" must match "img".
        if (ch == 0
            && String.Equals(analizador.Tag.Name, "img", StringComparison.OrdinalIgnoreCase))
        {
            return analizador.Tag["src"];
        }
    }
    return "no se encontro una imagen :S";
}
/// <summary>
/// Advances the parser to the next <c>a</c> tag and returns its <c>href</c>
/// attribute.
/// </summary>
/// <param name="analizador">The HTML parser to read from.</param>
/// <returns>
/// The <c>href</c> attribute of the first anchor tag read, or a fixed
/// "not found" message if the input ends before one is read.
/// </returns>
static string Enlace(ParseHTML analizador)
{
    int ch;
    while ((ch = analizador.Read()) != -1)
    {
        // Zero signals that a tag was read. HTML tag names are
        // case-insensitive, so "A" must match "a".
        if (ch == 0
            && String.Equals(analizador.Tag.Name, "a", StringComparison.OrdinalIgnoreCase))
        {
            return analizador.Tag["href"];
        }
    }
    return "no se encontro enlaces :S";
}
/// <summary>
/// Collects text characters from the parser until a tag or the end of the
/// input is reached.
/// </summary>
/// <param name="analizador">The HTML parser to read from.</param>
/// <returns>The text read; empty when a tag or the end of input comes first.</returns>
static string Descripcion(ParseHTML analizador)
{
    StringBuilder text = new StringBuilder();

    for (;;)
    {
        int c = analizador.Read();

        // Zero (a tag) and negative values (end of input) both stop the scan.
        if (c <= 0)
        {
            break;
        }
        text.Append((char)c);
    }

    return text.ToString();
}
/// <summary>
/// Process the specified URL and extract data from all of the subpages
/// that this page links to. Every anchor's <c>href</c> is resolved against
/// <paramref name="url"/> and passed to <c>ProcessSubPage</c>.
/// </summary>
/// <param name="url">The URL to process.</param>
public void Process(Uri url)
{
    WebRequest http = HttpWebRequest.Create(url);

    // Dispose the response and its stream even if a sub-page fails.
    using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
    using (Stream istream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(istream);

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // Zero signals that a tag was read; text characters are skipped.
            if (ch != 0)
            {
                continue;
            }

            HTMLTag tag = parse.Tag;
            if (!String.Equals(tag.Name, "a", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // Anchors without an href (e.g. named anchors) link to nothing.
            String href = tag["href"];
            if (href == null)
            {
                continue;
            }

            ProcessSubPage(new Uri(url, href));
        }
    }
}
/// <summary>
/// This method will download an amortization table for the specified
/// parameters. The values are POSTed as a form, then each table row read from
/// the response is passed to <c>ProcessTableRow</c> as a list of cell texts.
/// </summary>
/// <param name="interest">The interest rate for the loan.</param>
/// <param name="term">The term(in months) of the loan.</param>
/// <param name="principle">The principle amount of the loan.</param>
public void process(double interest, int term, int principle)
{
    Uri url = new Uri("http://www.httprecipes.com/1/9/loan.php");
    WebRequest http = HttpWebRequest.Create(url);
    http.Timeout = 30000;
    http.ContentType = "application/x-www-form-urlencoded";
    http.Method = "POST";

    // Numbers are formatted with the invariant culture so the server always
    // receives "7.5", never a locale-specific form such as "7,5".
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    // The request stream is closed even if writing the form fails.
    using (Stream ostream = http.GetRequestStream())
    {
        FormUtility form = new FormUtility(ostream, null);
        form.Add("interest", interest.ToString(invariant));
        form.Add("term", term.ToString(invariant));
        form.Add("principle", principle.ToString(invariant));
        form.Complete();
    }

    using (WebResponse response = http.GetResponse())
    using (Stream istream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(istream);
        StringBuilder buffer = new StringBuilder();
        List<String> list = new List<String>();
        bool capture = false;

        Advance(parse, "table", 3);

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // A non-zero value is a text character; zero means a tag was read.
            if (ch != 0)
            {
                if (capture)
                {
                    buffer.Append((char)ch);
                }
                continue;
            }

            String name = parse.Tag.Name;
            if (String.Equals(name, "tr", StringComparison.OrdinalIgnoreCase))
            {
                list.Clear();
                capture = false;
                buffer.Length = 0;
            }
            else if (String.Equals(name, "/tr", StringComparison.OrdinalIgnoreCase))
            {
                if (list.Count > 0)
                {
                    ProcessTableRow(list);
                    list.Clear();
                }
            }
            else if (String.Equals(name, "td", StringComparison.OrdinalIgnoreCase))
            {
                // A new cell starts; flush a previous cell that was never closed.
                if (buffer.Length > 0)
                {
                    list.Add(buffer.ToString());
                }
                buffer.Length = 0;
                capture = true;
            }
            else if (String.Equals(name, "/td", StringComparison.OrdinalIgnoreCase))
            {
                list.Add(buffer.ToString());
                buffer.Length = 0;
                capture = false;
            }
            else if (String.Equals(name, "/table", StringComparison.OrdinalIgnoreCase))
            {
                break;
            }
        }
    }
}
/// <summary>
/// Check the specified URL for a birth year. The page text is split into
/// sentences at each '.', and every sentence is passed to <c>ExtractBirth</c>;
/// a result greater than 1 and less than 3000 is reported on the console and
/// recorded through <c>IncreaseYear</c>. Network and I/O failures are ignored.
/// </summary>
/// <param name="url">The URL to check.</param>
public void CheckURL(Uri url)
{
    StringBuilder sentence = new StringBuilder();

    try
    {
        WebRequest http = HttpWebRequest.Create(url);

        // Dispose the response and its stream so each checked URL releases
        // its connection, even when reading fails part-way.
        using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
        using (Stream istream = response.GetResponseStream())
        {
            ParseHTML html = new ParseHTML(istream);

            int ch;
            while ((ch = html.Read()) != -1)
            {
                // Zero signals a tag; tags do not contribute to the sentence.
                if (ch == 0)
                {
                    continue;
                }

                if (ch != '.')
                {
                    sentence.Append((char)ch);
                    continue;
                }

                // A period ends the current sentence.
                int year = ExtractBirth(sentence.ToString());
                if ((year > 1) && (year < 3000))
                {
                    Console.WriteLine("URL supports year: " + year);
                    IncreaseYear(year);
                }
                sentence.Length = 0;
            }
        }
    }
    catch (WebException)
    {
        // Best effort: a URL that cannot be fetched is simply skipped.
    }
    catch (IOException)
    {
        // Best effort: a URL that cannot be read completely is simply skipped.
    }
}
/// <summary>
/// Access the website and perform a search for either states or capitals.
/// The search form is sent with HTTP POST and every item of the first list in
/// the response is passed to <c>ProcessItem</c>.
/// </summary>
/// <param name="search">A search string.</param>
/// <param name="type">What to search for(s=state, c=capital)</param>
public void Process(String search, String type)
{
    String listType = "ul";
    String listTypeEnd = "/ul";
    StringBuilder buffer = new StringBuilder();
    bool capture = false;

    // Build the URL and POST.
    Uri url = new Uri("http://www.httprecipes.com/1/7/post.php");
    WebRequest http = HttpWebRequest.Create(url);
    http.Timeout = 30000;
    http.ContentType = "application/x-www-form-urlencoded";
    http.Method = "POST";

    // The request stream is closed even if writing the form fails.
    using (Stream ostream = http.GetRequestStream())
    {
        FormUtility form = new FormUtility(ostream, null);
        form.Add("search", search);
        form.Add("type", type);
        form.Add("action", "Search");
        form.Complete();
    }

    // read the results
    using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
    using (Stream istream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(istream);

        // parse from the URL
        Advance(parse, listType, 0);

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // A non-zero value is a text character; zero means a tag was read.
            if (ch != 0)
            {
                if (capture)
                {
                    buffer.Append((char)ch);
                }
                continue;
            }

            String name = parse.Tag.Name;
            if (String.Equals(name, "li", StringComparison.OrdinalIgnoreCase))
            {
                // A new item starts; flush a previous item that was never closed.
                if (buffer.Length > 0)
                {
                    ProcessItem(buffer.ToString());
                }
                buffer.Length = 0;
                capture = true;
            }
            else if (String.Equals(name, "/li", StringComparison.OrdinalIgnoreCase))
            {
                ProcessItem(buffer.ToString());
                buffer.Length = 0;
                capture = false;
            }
            else if (String.Equals(name, listTypeEnd, StringComparison.OrdinalIgnoreCase))
            {
                // Only flush a pending, unclosed item; otherwise an empty
                // item would be reported after the last closed one.
                if (buffer.Length > 0)
                {
                    ProcessItem(buffer.ToString());
                }
                break;
            }
        }
    }
}
/// <summary>
/// Called to process each partial page. Every anchor that contains an image
/// is passed to <c>ProcessItem</c> (target URL and image URL), except the first
/// such anchor, which is skipped. The anchor whose text is "[Next 5]" supplies
/// the address of the next partial page.
/// </summary>
/// <param name="url">The URL of the partial page.</param>
/// <returns>Returns the next partial page, or null if no more.</returns>
public Uri Process(Uri url)
{
    Uri result = null;
    StringBuilder buffer = new StringBuilder();
    String value = "";
    String src = "";

    WebRequest http = HttpWebRequest.Create(url);

    // Dispose the response and its stream even if processing an item throws.
    using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
    using (Stream istream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(istream);
        bool first = true;

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // A non-zero value is a text character; zero means a tag was read.
            if (ch != 0)
            {
                buffer.Append((char)ch);
                continue;
            }

            HTMLTag tag = parse.Tag;
            if (String.Equals(tag.Name, "a", StringComparison.OrdinalIgnoreCase))
            {
                buffer.Length = 0;
                src = null;

                // Anchors without an href (e.g. named anchors) have no target.
                String href = tag["href"];
                value = (href == null) ? null : new Uri(url, href).ToString();
            }
            else if (String.Equals(tag.Name, "img", StringComparison.OrdinalIgnoreCase))
            {
                src = tag["src"];
            }
            else if (String.Equals(tag.Name, "/a", StringComparison.OrdinalIgnoreCase))
            {
                if (value == null)
                {
                    // Nothing to link to; ignore this anchor.
                }
                else if (String.Equals(buffer.ToString(), "[Next 5]", StringComparison.OrdinalIgnoreCase))
                {
                    result = new Uri(url, value);
                }
                else if (src != null)
                {
                    if (!first)
                    {
                        Uri urlOfficial = new Uri(url, value);
                        Uri urlFlag = new Uri(url, src);
                        ProcessItem(urlOfficial, urlFlag);
                    }
                    else
                    {
                        // The first image link on the page is not an item.
                        first = false;
                    }
                }
            }
        }
    }

    return result;
}
/// <summary>
/// Access the website and perform a search for either states or capitals.
/// The form fields are encoded into the query string of a GET request and
/// every item of the first list in the response is passed to <c>ProcessItem</c>.
/// </summary>
/// <param name="search">A search string.</param>
/// <param name="type">What to search for(s=state, c=capital)</param>
public void Process(String search, String type)
{
    String listType = "ul";
    String listTypeEnd = "/ul";
    StringBuilder buffer = new StringBuilder();
    bool capture = false;

    // Build the URL.
    String query;
    using (MemoryStream mstream = new MemoryStream())
    {
        FormUtility form = new FormUtility(mstream, null);
        form.Add("search", search);
        form.Add("type", type);
        form.Add("action", "Search");
        form.Complete();

        // ToArray returns only the bytes written. GetBuffer would expose the
        // stream's unused capacity and append NUL characters to the URL.
        query = System.Text.Encoding.ASCII.GetString(mstream.ToArray());
    }

    Uri url = new Uri("http://www.httprecipes.com/1/7/get.php?" + query);
    WebRequest http = HttpWebRequest.Create(url);

    using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
    using (Stream istream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(istream);

        // Parse from the URL.
        Advance(parse, listType, 0);

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // A non-zero value is a text character; zero means a tag was read.
            if (ch != 0)
            {
                if (capture)
                {
                    buffer.Append((char)ch);
                }
                continue;
            }

            String name = parse.Tag.Name;
            if (String.Equals(name, "li", StringComparison.OrdinalIgnoreCase))
            {
                // A new item starts; flush a previous item that was never closed.
                if (buffer.Length > 0)
                {
                    ProcessItem(buffer.ToString());
                }
                buffer.Length = 0;
                capture = true;
            }
            else if (String.Equals(name, "/li", StringComparison.OrdinalIgnoreCase))
            {
                ProcessItem(buffer.ToString());
                buffer.Length = 0;
                capture = false;
            }
            else if (String.Equals(name, listTypeEnd, StringComparison.OrdinalIgnoreCase))
            {
                // Only flush a pending, unclosed item; otherwise an empty
                // item would be reported after the last closed one.
                if (buffer.Length > 0)
                {
                    ProcessItem(buffer.ToString());
                }
                break;
            }
        }
    }
}
/// <summary>
/// Scans the text of the page at the specified URL sentence by sentence and
/// reports sentences that contain a year, as determined by
/// <c>Text.ExtractYear</c>. Text inside <c>script</c> and <c>style</c>
/// elements is ignored, and only characters below 128 are kept.
/// </summary>
/// <param name="report">Object to report to.</param>
/// <param name="url">The url.</param>
/// <param name="desiredYear">
/// The desired year. With -1 every sentence containing a year is reported as
/// good; otherwise sentences with that year are good and sentences with any
/// other year are bad.
/// </param>
public static void CheckURL(ScanReportable report, Uri url, int desiredYear)
{
    StringBuilder sentence = new StringBuilder();
    String ignoreUntil = null;

    WebRequest http = HttpWebRequest.Create(url);
    http.Timeout = 10000;

    // Dispose the response and its stream even if reading or reporting throws.
    using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
    using (Stream istream = response.GetResponseStream())
    {
        ParseHTML html = new ParseHTML(istream);

        int ch;
        while ((ch = html.Read()) != -1)
        {
            if (ch == 0)
            {
                // Zero signals that a tag was read.
                String name = html.Tag.Name;

                // Block-level tags start a fresh sentence.
                if (IsSentenceResetTag(name))
                {
                    sentence.Length = 0;
                }

                // ignore everything between script and style tags
                if (ignoreUntil == null)
                {
                    if (String.Equals(name, "script", StringComparison.OrdinalIgnoreCase))
                    {
                        ignoreUntil = "/script";
                    }
                    else if (String.Equals(name, "style", StringComparison.OrdinalIgnoreCase))
                    {
                        ignoreUntil = "/style";
                    }
                }
                else if (String.Equals(name, ignoreUntil, StringComparison.OrdinalIgnoreCase))
                {
                    ignoreUntil = null;
                }

                // add a space after the tag
                AppendSpaceIfNeeded(sentence);
            }
            else if (ignoreUntil == null)
            {
                if (".?!".IndexOf((char)ch) != -1)
                {
                    // End of sentence: classify it by the year it contains.
                    String str = sentence.ToString();
                    int year = Text.ExtractYear(str);

                    if (desiredYear == -1)
                    {
                        // looking for any year
                        if (year != -1)
                        {
                            report.ReceiveGoodSentence(str);
                        }
                    }
                    else if (year == desiredYear)
                    {
                        // looking for a specific year
                        report.ReceiveGoodSentence(str);
                    }
                    else if (year != -1)
                    {
                        report.ReceiveBadSentence(str);
                    }
                    sentence.Length = 0;
                }
                else if (ch == ' ')
                {
                    // Collapse runs of spaces and drop leading ones.
                    AppendSpaceIfNeeded(sentence);
                }
                else if ((ch != '\n') && (ch != '\t') && (ch != '\r') && (ch < 128))
                {
                    sentence.Append((char)ch);
                }
            }
        }
    }
}

/// <summary>
/// Tells whether a tag name is one of those that discard the sentence
/// collected so far (body, br, li, p, h1-h3, td, th).
/// </summary>
private static bool IsSentenceResetTag(String name)
{
    return String.Equals(name, "body", StringComparison.OrdinalIgnoreCase)
        || String.Equals(name, "br", StringComparison.OrdinalIgnoreCase)
        || String.Equals(name, "li", StringComparison.OrdinalIgnoreCase)
        || String.Equals(name, "p", StringComparison.OrdinalIgnoreCase)
        || String.Equals(name, "h1", StringComparison.OrdinalIgnoreCase)
        || String.Equals(name, "h2", StringComparison.OrdinalIgnoreCase)
        || String.Equals(name, "h3", StringComparison.OrdinalIgnoreCase)
        || String.Equals(name, "td", StringComparison.OrdinalIgnoreCase)
        || String.Equals(name, "th", StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// Appends a single space unless the text is empty or already ends with one.
/// </summary>
private static void AppendSpaceIfNeeded(StringBuilder sentence)
{
    if ((sentence.Length > 0) && (sentence[sentence.Length - 1] != ' '))
    {
        sentence.Append(' ');
    }
}
/// <summary>
/// Called to download the text from a page. If any JavaScript include is
/// found, the text from that page is read too: the referenced script is
/// fetched with <c>DownloadPage</c> and inlined between script tags. The
/// resulting text is written to the console.
/// </summary>
public void Process()
{
    Uri url = new Uri("http://www.httprecipes.com/1/9/includes.php");
    WebRequest http = HttpWebRequest.Create(url);
    http.Timeout = 30000;

    StringBuilder buffer = new StringBuilder();

    // Dispose the response and its stream even if an include fails to download.
    using (WebResponse response = http.GetResponse())
    using (Stream stream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(stream);

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // A non-zero value is a text character; zero means a tag was read.
            if (ch != 0)
            {
                buffer.Append((char)ch);
                continue;
            }

            HTMLTag tag = parse.Tag;

            // Only script tags that reference an external file are expanded.
            String src = String.Equals(tag.Name, "script", StringComparison.OrdinalIgnoreCase)
                ? tag["src"]
                : null;

            if (src != null)
            {
                Uri u = new Uri(url, src);
                String include = DownloadPage(u);
                buffer.Append("<script>");
                buffer.Append(include);
                buffer.Append("</script>");
            }
            else
            {
                buffer.Append(tag.ToString());
            }
        }
    }

    Console.WriteLine(buffer.ToString());
}
/// <summary>
/// Called to parse a table. The table number at the specified URL will be
/// parsed, and each row is passed to <c>ProcessTableRow</c> as a list of cell
/// texts. Parsing stops at the table's closing tag.
/// </summary>
/// <param name="url">The URL of the HTML page that contains the table.</param>
/// <param name="tableNum">The table number to parse; passed to <c>Advance</c> as the match count.</param>
public void Process(Uri url, int tableNum)
{
    WebRequest http = HttpWebRequest.Create(url);

    // Dispose the response and its stream even if row processing throws.
    using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
    using (Stream istream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(istream);
        StringBuilder buffer = new StringBuilder();
        List<String> list = new List<String>();
        bool capture = false;

        Advance(parse, "table", tableNum);

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // A non-zero value is a text character; zero means a tag was read.
            if (ch != 0)
            {
                if (capture)
                {
                    buffer.Append((char)ch);
                }
                continue;
            }

            String name = parse.Tag.Name;
            if (String.Equals(name, "tr", StringComparison.OrdinalIgnoreCase))
            {
                list.Clear();
                capture = false;
                buffer.Length = 0;
            }
            else if (String.Equals(name, "/tr", StringComparison.OrdinalIgnoreCase))
            {
                if (list.Count > 0)
                {
                    ProcessTableRow(list);
                    list.Clear();
                }
            }
            else if (String.Equals(name, "td", StringComparison.OrdinalIgnoreCase))
            {
                // A new cell starts; flush a previous cell that was never closed.
                if (buffer.Length > 0)
                {
                    list.Add(buffer.ToString());
                }
                buffer.Length = 0;
                capture = true;
            }
            else if (String.Equals(name, "/td", StringComparison.OrdinalIgnoreCase))
            {
                list.Add(buffer.ToString());
                buffer.Length = 0;
                capture = false;
            }
            else if (String.Equals(name, "/table", StringComparison.OrdinalIgnoreCase))
            {
                break;
            }
        }
    }
}
/// <summary>
/// This method looks for each of the &lt;option&gt; tags that contain a link
/// to each of the pages. For each page found, <c>DownloadArticlePage</c> is
/// called and its result is written to the console.
/// </summary>
public void Process()
{
    Uri url = new Uri("http://www.httprecipes.com/1/9/article.php");
    WebRequest http = HttpWebRequest.Create(url);
    http.Timeout = 30000;

    // Dispose the response and its stream even if a page download fails.
    using (WebResponse response = http.GetResponse())
    using (Stream stream = response.GetResponseStream())
    {
        ParseHTML parse = new ParseHTML(stream);

        int ch;
        while ((ch = parse.Read()) != -1)
        {
            // Zero signals that a tag was read; text characters are skipped.
            if (ch != 0)
            {
                continue;
            }

            HTMLTag tag = parse.Tag;
            if (!String.Equals(tag.Name, "option", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // An option without a value attribute does not point to a page.
            String str = tag["value"];
            if (str == null)
            {
                continue;
            }

            Uri u = new Uri(url, str);
            Console.WriteLine(DownloadArticlePage(u));
        }
    }
}