/// <summary>
/// Creates the crawler for <c>Host</c> (port 80) and wires its events so that a
/// title search result page is reported through <c>AddEntry</c>, one call per
/// result row, together with a running zero-based entry index.
/// </summary>
/// <remarks>
/// Two response shapes are handled:
/// a redirect (a location was announced through <c>LocationReceived</c>), which is
/// reported as a single entry built from the redirect target, and a result page,
/// where the first available section (popular, then exact, then approximate
/// matches) is parsed row by row.
/// Rows or pages that do not have the expected markup are skipped instead of
/// raising an exception from inside the event handler.
/// </remarks>
public BasicIMDBAliasSearch()
{
    // CoralEnabled was disabled on purpose: it did not seem to respond at 2009.03.18
    var c = new BasicWebCrawler(Host, 80);

    this.Crawler = c;

    // Prototypes that give the anonymous result types a shape.
    var DefaultLink = new { Link = "", Title = "", Text = "" };
    var DefaultImage = new { Source = "", Alt = "", Title = "", width = "", height = "" };

    // Reads href, title and the inner content of an element.
    var ParseLink = DefaultLink.ToAnonymousConstructor(
        (string element) =>
        {
            var Link = "";
            var Title = "";
            var Text = "";

            element.
                ParseAttribute("href", value => Link = value).
                ParseAttribute("title", value => Title = value).
                ParseContent(value => Text = value).
                Parse();

            return new { Link, Title, Text };
        }
    );

    // Reads the attributes of an image element; the content is not collected.
    var ParseImage = DefaultImage.ToAnonymousConstructor(
        (string element) =>
        {
            var Source = "";
            var Alt = "";
            var Title = "";
            var width = "";
            var height = "";

            element.
                ParseAttribute("src", value => Source = value).
                ParseAttribute("alt", value => Alt = value).
                ParseAttribute("title", value => Title = value).
                ParseAttribute("width", value => width = value).
                ParseAttribute("height", value => height = value).
                ParseContent(null).
                Parse();

            return new { Source, Alt, Title, width, height };
        }
    );

    // Shared by the redirect path and the row path; incremented before each report.
    var EntryIndex = -1;

    #region AddItem
    Action<string, string> AddItem =
        (ImageElement, Content) =>
        {
            var ImageSource = "";

            // The thumbnail cell is expected to be a link wrapping an image.
            if (ImageElement.StartsWith("<a"))
            {
                var ImageLink = ParseLink(ImageElement);
                var Image = ParseImage(ImageLink.Text);

                ImageSource = Image.Source;
            }

            /* Sample content cell:
             *
             * <img src="/images/b.gif" width="1" height="6"><br>
             * <a href="/title/tt0397892/" onclick="...">Bolt</a> (2008) <br>
             *  aka <em>"Bolt - Supercão"</em> - Brazil<br>
             *  aka <em>"Bolt - Ein Hund für alle Fälle"</em> - Germany
             */

            var ContentLink_start = Content.IndexOf("<a");

            // A row without a title link (for example a row with fewer cells than
            // expected) carries nothing to report; the unguarded Substring calls
            // would throw for it.
            if (ContentLink_start < 0)
                return;

            // Search for the closing tag after the opening one so the computed
            // length can never be negative.
            var ContentLink_end = Content.IndexOf("</a>", ContentLink_start);

            if (ContentLink_end < 0)
                return;

            var ContentLink = ParseLink(
                Content.Substring(ContentLink_start, ContentLink_end - ContentLink_start + 4)
            );

            // Everything after the link: release date first, then the aliases.
            var Details = Content.Substring(ContentLink_end + 4);

            var ReleaseDate = "";
            var Alias = default(AlsoKnownAs);

            Details.Split("<br>",
                (text, index) =>
                {
                    if (index == 0)
                    {
                        ReleaseDate = text;
                        return;
                    }

                    // Each alias links back to the one collected before it.
                    Alias = new AlsoKnownAs { Text = text, Alias = Alias };
                }
            );

            EntryIndex++;

            if (this.AddEntry != null)
            {
                this.AddEntry(
                    new Entry("http://" + Host + ContentLink.Link)
                    {
                        OptionalAlias = Alias,
                        OptionalReleaseDate = ReleaseDate,
                        OptionalTitle = ContentLink.Text,
                        OptionalImage = ImageSource
                    },
                    EntryIndex
                );
            }
        };
    #endregion

    // http://www.imdb.com/find?s=tt;site=aka;q=The%20Dark%20Knight

    // Set when the response announces a location, e.g.
    // Location: http://www.imdb.com/title/tt1129442/
    string Redirect = null;

    c.LocationReceived +=
        href =>
        {
            Redirect = href;
        };

    c.DataReceivedWithTimeSpan +=
        (document, elapsed) =>
        {
            #region redirect
            if (!string.IsNullOrEmpty(Redirect))
            {
                // Consume the redirect so that a later crawl on the same crawler
                // is not answered with this stale location.
                var RedirectTarget = Redirect;
                Redirect = null;

                EntryIndex++;

                if (this.AddEntry != null)
                {
                    this.AddEntry(new Entry(RedirectTarget), EntryIndex);
                }

                return;
            }
            #endregion

            var approx_section = document.IndexOf("<b>Titles (Approx Matches)</b>");
            var exact_section = document.IndexOf("<b>Titles (Exact Matches)</b>");
            var popular_section = document.IndexOf("<b>Popular Titles</b>");

            // Preference order: popular, exact, approximate.
            var first_section = popular_section;

            if (first_section < 0)
                first_section = exact_section;

            if (first_section < 0)
                first_section = approx_section;

            if (first_section < 0)
                return;

            var section_start = document.IndexOf("<table>", first_section);

            // IndexOf with a negative start index throws, so stop when the
            // section has no table.
            if (section_start < 0)
                return;

            var section_end = document.IndexOf("</table>", section_start);

            if (section_end < 0)
                return;

            // + 8 keeps the closing </table> tag inside the section.
            var section = document.Substring(section_start, section_end - section_start + 8);

            BasicElementParser.Parse(section, "tr",
                (tr, tr_index) =>
                {
                    /* Sample row:
                     *
                     * <td valign="top"><a href="/title/tt0397892/" onClick="..."><img src="http://ia.media-imdb.com/images/M/....jpg" width="23" height="32" border="0"></a>&nbsp;</td>
                     * <td align="right" valign="top"><img src="/images/b.gif" width="1" height="6"><br>1.</td>
                     * <td valign="top"><img src="/images/b.gif" width="1" height="6"><br><a href="/title/tt0397892/" onclick="...">Bolt</a> (2008) <br>&nbsp;aka <em>"..."</em> - Germany</td>
                     */

                    var Image = "";
                    var Content = "";

                    // Cell 0 holds the thumbnail, cell 2 the title and aliases.
                    BasicElementParser.Parse(tr, "td",
                        (td, td_index) =>
                        {
                            if (td_index == 0)
                            {
                                Image = td;
                            }

                            if (td_index == 2)
                            {
                                Content = td;
                            }
                        }
                    );

                    AddItem(Image, Content);
                }
            );
        };

    // Usage examples:
    //c.Crawl("/find?s=tt;site=aka;q=" + "The Dark Knight".URLEncode());
    //c.Crawl("/find?s=tt;site=aka;q=" + "Bolt".URLEncode());
}
/// <summary>
/// Creates a crawler for www.imdb.com (port 80, <c>CoralEnabled</c> set) and wires
/// it so that the image source found inside the <c>principal</c> table of a
/// received document is reported through <c>AddEntry</c>.
/// </summary>
/// <remarks>
/// When a response announces a location, the crawler is sent to that location's
/// path and query instead of parsing the document. Documents that do not contain
/// the expected table are ignored instead of raising an exception from inside the
/// event handler.
/// </remarks>
public BasicIMDBPosterSearch()
{
    this.Crawler = new Library.BasicWebCrawler("www.imdb.com", 80)
    {
        CoralEnabled = true
    };

    // Prototype that gives the anonymous result type a shape.
    var DefaultImage = new { Source = "", Alt = "", Title = "" };

    // Reads the attributes of an image element; the content is not collected.
    var ParseImage = DefaultImage.ToAnonymousConstructor(
        (string element) =>
        {
            var Source = "";
            var Alt = "";
            var Title = "";

            element.
                ParseAttribute("src", value => Source = value).
                ParseAttribute("alt", value => Alt = value).
                ParseAttribute("title", value => Title = value).
                ParseContent(null).
                Parse();

            return new { Source, Alt, Title };
        }
    );

    string location = null;

    // Forget the location of the previous response whenever a request goes out.
    this.Crawler.AllHeadersSent +=
        () =>
        {
            location = null;
        };

    this.Crawler.LocationReceived +=
        value =>
        {
            location = value;
        };

    this.Crawler.DataReceived +=
        document =>
        {
            // Follow the announced location on the same host.
            if (!string.IsNullOrEmpty(location))
            {
                var u = new Uri(location);

                this.Crawler.Crawl(u.PathAndQuery);
                return;
            }

            var poster_tag = "<table id=\"principal\">";
            var poster_i = document.IndexOf(poster_tag);

            // No poster table: IndexOf with a negative start index would throw.
            if (poster_i < 0)
                return;

            var poster_close_tag = "</table>";
            var poster_close_i = document.IndexOf(poster_close_tag, poster_i);

            // Unterminated table: the Substring length would be invalid.
            if (poster_close_i < 0)
                return;

            // The table including its closing tag.
            var poster_table = document.Substring(
                poster_i,
                poster_close_i + poster_close_tag.Length - poster_i
            );

            var poster = ParseImage(
                BasicElementParser.GetContent(poster_table, "td")
            );

            if (this.AddEntry != null)
            {
                this.AddEntry(poster.Source);
            }
        };
}
/// <summary>
/// Creates a crawler for www.imdb.com (port 80) and wires it so that every
/// received document is turned into an <c>Entry</c> (title, year, poster, user
/// rating, genres, runtime, tagline) that is reported through <c>AddEntry</c>.
/// </summary>
/// <remarks>
/// Every field is optional: when the markup for a field is missing or is not
/// terminated as expected, the field is left empty rather than aborting the whole
/// entry with an exception.
/// </remarks>
public BasicIMDBCrawler()
{
    // CoralEnabled is intentionally left off.
    this.Crawler = new Library.BasicWebCrawler("www.imdb.com", 80);

    // Prototypes that give the anonymous result types a shape.
    var DefaultLink = new { Link = "", Title = "", Text = "" };
    var DefaultImage = new { Source = "", Alt = "", Title = "" };

    // Reads href, title and the inner content of an element.
    var ParseLink = DefaultLink.ToAnonymousConstructor(
        (string element) =>
        {
            var Link = "";
            var Title = "";
            var Text = "";

            element.
                ParseAttribute("href", value => Link = value).
                ParseAttribute("title", value => Title = value).
                ParseContent(value => Text = value).
                Parse();

            return new { Link, Title, Text };
        }
    );

    // Reads the attributes of an image element; the content is not collected.
    var ParseImage = DefaultImage.ToAnonymousConstructor(
        (string element) =>
        {
            var Source = "";
            var Alt = "";
            var Title = "";

            element.
                ParseAttribute("src", value => Source = value).
                ParseAttribute("alt", value => Alt = value).
                ParseAttribute("title", value => Title = value).
                ParseContent(null).
                Parse();

            return new { Source, Alt, Title };
        }
    );

    this.Crawler.DataReceived +=
        document =>
        {
            // Returns the text between 'begin' and the next 'terminator',
            // or null when the terminator does not occur.
            Func<int, string, string> TextUntil =
                (begin, terminator) =>
                {
                    var end = document.IndexOf(terminator, begin);

                    if (end < 0)
                        return null;

                    return document.Substring(begin, end - begin);
                };

            var entry = new Entry();

            #region Title and Year
            // The page title is expected to look like: Name (Year)
            var title = BasicElementParser.GetContent(document, "title");
            var title_i = title.IndexOf("(");

            if (title_i < 0)
            {
                // No year part at all: the whole text is the title.
                entry.Title = title.Trim();
                entry.Year = "";
            }
            else
            {
                entry.Title = title.Substring(0, title_i).Trim();

                var year_i = title_i + 1;
                var year_end = title.IndexOf(")", year_i);

                entry.Year = year_end < 0 ? "" : title.Substring(year_i, year_end - year_i);
            }

            // Remove quotes from the title.
            // NOTE(review): the literal in the received source was syntactically
            // broken (three consecutive double quotes), which looks like an
            // entity-decoded "&quot;" - confirm against version history.
            entry.Title = entry.Title.Replace("&quot;", "");
            #endregion

            #region Poster
            var poster_i = document.IndexOf("name=\"poster\"");

            // no poster - the poster may be found on other services
            if (poster_i >= 0)
            {
                // The marker sits inside the link; find the link's boundaries.
                var poster_j = document.Substring(0, poster_i).LastIndexOf("<a");
                var poster_q = document.IndexOf("</a>", poster_i);

                if (poster_j >= 0 && poster_q >= 0)
                {
                    // + 4 keeps the closing </a> tag.
                    var poster = ParseLink(document.Substring(poster_j, poster_q - poster_j + 4));
                    var poster_image = ParseImage(poster.Text);

                    entry.MediumPosterImageProvider = "imdb";
                    entry.MediumPosterImage = poster_image.Source;
                    entry.MediumPosterImagePage = poster.Link;
                    entry.MediumPosterTitle = poster.Title;
                }
            }
            #endregion

            #region UserRating
            var meta_tag = "<div class=\"meta\">";
            var meta_i = document.IndexOf(meta_tag);
            var meta = meta_i < 0 ? null : TextUntil(meta_i + meta_tag.Length, "</div>");

            entry.UserRating = meta == null ? "" : BasicElementParser.GetContent(meta, "b");
            #endregion

            #region Genres
            var genre_tag = "<h5>Genre:</h5>";
            var genre_i = document.IndexOf(genre_tag);
            var genres = new List<string>();
            var genre = genre_i < 0 ? null : TextUntil(genre_i + genre_tag.Length, "</div>");

            if (genre != null)
            {
                BasicElementParser.Parse(genre, "a",
                    (text, index) =>
                    {
                        // The trailing "more" link is not a genre.
                        if (text == "more")
                            return;

                        genres.Add(text);
                    }
                );
            }

            entry.Genres = genres.ToArray();
            #endregion

            // Runtime and tagline are searched from the genre section onwards,
            // or from the start of the document when there is no genre section.
            if (genre_i < 0)
                genre_i = 0;

            #region Runtime
            var runtime_tag = "<h5>Runtime:</h5>";
            var runtime_i = document.IndexOf(runtime_tag, genre_i);
            var runtime = runtime_i < 0 ? null : TextUntil(runtime_i + runtime_tag.Length, "</div>");

            entry.Runtime = runtime == null ? "" : runtime.Trim();
            #endregion

            #region Tagline
            var Tagline_tag = "<h5>Tagline:</h5>";
            var Tagline_i = document.IndexOf(Tagline_tag, genre_i);

            // The tagline is the plain text up to the next tag.
            var Tagline = Tagline_i < 0 ? null : TextUntil(Tagline_i + Tagline_tag.Length, "<");

            entry.Tagline = Tagline == null ? "" : Tagline.Trim();
            #endregion

            if (AddEntry != null)
            {
                AddEntry(entry);
            }
        };
}
/// <summary>
/// Creates a crawler for thepiratebay.org (port 80) and wires it so that every
/// received document raises <c>Loaded</c> with an enumerator over the rows of the
/// <c>searchResult</c> table.
/// </summary>
/// <remarks>
/// The enumerator passed to <c>Loaded</c> takes a callback and invokes it once per
/// result row with the parsed <c>Entry</c> and a running zero-based index.
/// A document without the result table (or without its header end) yields no rows
/// instead of raising an exception from inside the event handler.
/// </remarks>
public BasicPirateBaySearch()
{
    this.Crawler = new BasicWebCrawler("thepiratebay.org", 80);

    this.Crawler.DataReceived +=
        document =>
        {
            // Each lookup depends on the previous one; IndexOf with a negative
            // start index throws, so a miss is propagated as -1 instead.
            var results = document.IndexOf("<table id=\"searchResult\">");
            var headend = results < 0 ? -1 : document.IndexOf("</thead>", results);
            var results_end = headend < 0 ? -1 : document.IndexOf("</table>", headend);

            // NOTE(review): the index lives outside ForEachEntry, so it keeps
            // counting if a consumer enumerates more than once - confirm intent.
            int entryindex = -1;

            Action<Action<Entry, int>> ForEachEntry =
                AddEntry =>
                {
                    // Nothing to enumerate when the table or its header is missing.
                    if (headend < 0)
                        return;

                    #region ScanSingleResultOrReturn
                    // Parses the row that starts at or after 'offset' and returns
                    // the offset just behind it; returns 'offset' unchanged when
                    // there is no further complete row inside the table.
                    Func<int, int> ScanSingleResultOrReturn =
                        offset =>
                        {
                            var itemstart = document.IndexOf("<tr>", offset);

                            if (itemstart < 0)
                                return offset;

                            if (itemstart > results_end)
                                return offset;

                            var itemend = document.IndexOf("</tr>", itemstart);

                            if (itemend < 0)
                                return offset;

                            if (itemend > results_end)
                                return offset;

                            var itemdata = document.Substring(itemstart, itemend - itemstart);

                            // Sample row:
                            //<tr>
                            //<td class="vertTh"><a href="/browse/205" title="More from this category">Video > TV shows</a></td>
                            //<td><a href="/torrent/4727946/Heroes.S03E16.HDTV.XviD-XOR.avi" class="detLink" title="Details for ...">Heroes.S03E16.HDTV.XviD-XOR.avi</a></td>
                            //<td>Today 04:55</td>
                            //<td><a href="http://torrents.thepiratebay.org/4727946/....torrent" title="Download this torrent"><img ... /></a><img ... /></td>
                            //<td align="right">348.97 MiB</td>
                            //<td align="right">47773</td>
                            //<td align="right">60267</td>

                            // Cell order: type, name, uploaded, links, size, se, le
                            var Fields = new BasicPirateBaySearch.Entry();

                            // Curried setter: every call stores one cell value and
                            // re-targets SetField at the handler for the next cell.
                            // The seventh call builds the entry; any further cells
                            // are ignored.
                            Action<string> SetField = null;

                            SetField =
                                Type => SetField =
                                Name => SetField =
                                Time => SetField =
                                Links => SetField =
                                Size => SetField =
                                Seeders => SetField =
                                Leechers =>
                                {
                                    Fields = new BasicPirateBaySearch.Entry
                                    {
                                        Type = Type,
                                        Name = Name,
                                        Time = Time,
                                        Links = Links,
                                        Size = Size,
                                        Seeders = Seeders,
                                        Leechers = Leechers
                                    };

                                    SetField = delegate { };
                                };

                            var ep = new BasicElementParser();

                            ep.AddContent +=
                                (value, index) =>
                                {
                                    SetField(value);
                                };

                            ep.Parse(itemdata, "td");

                            entryindex++;

                            if (AddEntry != null)
                            {
                                AddEntry(Fields, entryindex);
                            }

                            // 5 == "</tr>".Length
                            return itemend + 5;
                        };
                    #endregion

                    // Keep scanning while the returned offset advances.
                    ScanSingleResultOrReturn.ToChainedFunc((x, y) => y > x)(headend);
                };

            if (this.Loaded != null)
            {
                this.Loaded(ForEachEntry);
            }
        };
}
/// <summary>
/// Searches www.movieposterdb.com for movies matching <paramref name="title"/> and
/// calls <paramref name="handler"/> once for every result row found.
/// </summary>
/// <param name="title">The movie title to search for; it is URL-encoded and appended to the search query.</param>
/// <param name="handler">Receives an <c>AliasEntry</c> (link, title, year) per result row.</param>
/// <remarks>
/// A response that does not contain the "Movies" results heading produces no
/// callbacks instead of raising an exception from inside the event handler.
/// </remarks>
public static void Search(string title, Action<AliasEntry> handler)
{
    // The title is appended to this query.
    var t = new Uri("http://www.movieposterdb.com/browse/search?search_type=movies&title=");

    var c = new BasicWebCrawler(t.Host, 80);

    // Prototypes that give the anonymous result types a shape.
    var DefaultLink = new { Link = "", Title = "", Text = "" };
    var DefaultSpan = new { Text = "", Title = "" };

    // Reads title and inner content of the "span" element.
    var ParseSpan = DefaultSpan.ToAnonymousConstructor(
        (string element) =>
        {
            var Text = "";
            var Title = "";

            element.
                ParseAttribute("title", value => Title = value).
                ParseContent(value => Text = value).
                Parse("span");

            return new { Text, Title };
        }
    );

    // Reads href, title and inner content of the "a" element.
    var ParseLink = DefaultLink.ToAnonymousConstructor(
        (string element) =>
        {
            var Link = "";
            var Title = "";
            var Text = "";

            element.
                ParseAttribute("href", value => Link = value).
                ParseAttribute("title", value => Title = value).
                ParseContent(value => Text = value).
                Parse("a");

            return new { Link, Title, Text };
        }
    );

    c.DataReceived +=
        document =>
        {
            // The results table follows the "Movies" heading.
            var trigger = "Movies</h3>";
            var trigger_i = document.IndexOf(trigger);

            // No heading, no results: Substring(-1) would throw.
            if (trigger_i < 0)
                return;

            var data = BasicElementParser.GetContent(document.Substring(trigger_i), "table");

            BasicElementParser.Parse(data, "tr",
                (element, index) =>
                {
                    /* Sample row:
                     *
                     * <td valign="middle" style="...">
                     *   <img src="http://www.movieposterdb.com/posters/08_09/2008/1179855/m_1179855_4fb9999f.jpg" style="..." />
                     * </td>
                     * <td valign="middle" style="...">
                     *   <b><a class="bbg" href="http://www.movieposterdb.com/movie/1179855/Go-Fast.html">Go Fast</a><br /><span style="color: #8C8C8C;">2008</span></b>
                     * </td>
                     * <td style="..."></td>
                     */

                    BasicElementParser.Parse(element, "td",
                        (tdelement, tdindex) =>
                        {
                            // Only the second cell carries the link and the year.
                            if (tdindex != 1)
                                return;

                            var _title = ParseLink(tdelement);
                            var _year = ParseSpan(tdelement);

                            handler(
                                new AliasEntry
                                {
                                    Link = _title.Link,
                                    Title = _title.Text,
                                    Year = _year.Text
                                }
                            );
                        }
                    );
                }
            );
        };

    c.Crawl(t.PathAndQuery + title.URLEncode());
}
/// <summary>
/// Creates a crawler for tinyurl.com (port 80) and wires it so that every received
/// document is turned into an <c>Entry</c> that is reported through <c>AddEntry</c>.
/// </summary>
/// <remarks>
/// With <c>APIMode</c> set, the whole document is taken as the alias. Otherwise the
/// "TinyURL was created!" page is parsed: the first blockquote gives the original
/// URL, the second one the alias. A page without the expected markers is ignored
/// and nothing is reported.
/// </remarks>
public BasicTinyURLCrawler()
{
    // CoralEnabled is intentionally left off.
    this.Crawler = new Library.BasicWebCrawler("tinyurl.com", 80);

    this.Crawler.DataReceived +=
        document =>
        {
            var entry = new Entry();

            if (APIMode)
            {
                entry.Alias = document;
            }
            else
            {
                var trigger = "<h1>TinyURL was created!</h1>";
                var trigger_i = document.IndexOf(trigger);

                if (trigger_i < 0)
                    return;

                // we are still in the business...

                // Sample page fragment:
                //<h1>TinyURL was created!</h1>
                //<p>The following URL:
                //<blockquote><b>http://thepiratebay.org<br />
                //</b></blockquote>
                //has a length of 23 characters and resulted in the following TinyURL which has a length of 24 characters:
                //<blockquote><b>http://tinyurl.com/5umsn</b><br><small>[<a href="http://tinyurl.com/5umsn" target="_blank">Open in new window</a>]</small></blockquote>
                //Or, give your recipients confidence with a preview TinyURL:
                //<blockquote><b>http://preview.tinyurl.com/5umsn</b><br><small>[<a href="http://preview.tinyurl.com/5umsn" target="_blank">Open in new window</a>]</small>
                //</blockquote>
                //</p>

                var start_tag = "<p>";
                var start_i = document.IndexOf(start_tag, trigger_i);

                if (start_i < 0)
                    return;

                var end_tag = "</p>";
                var end_i = document.IndexOf(end_tag, start_i);

                // An unterminated paragraph cannot be sliced.
                if (end_i < 0)
                    return;

                // The paragraph content only: from behind "<p>" up to "</p>".
                // The length subtracts the opening tag; adding it instead reads
                // past the closing tag and can run over the end of the document.
                var data_i = start_i + start_tag.Length;
                var data = document.Substring(data_i, end_i - data_i);

                BasicElementParser.Parse(data, "blockquote",
                    (value, index) =>
                    {
                        if (index == 0)
                        {
                            // Original URL; drop the trailing line break markup.
                            entry.URL = BasicElementParser.GetContent(value, "b");

                            var br_tag = "<br />";
                            var br_i = entry.URL.IndexOf(br_tag);

                            if (br_i >= 0)
                            {
                                entry.URL = entry.URL.Substring(0, br_i);
                            }

                            return;
                        }

                        if (index == 1)
                        {
                            entry.Alias = BasicElementParser.GetContent(value, "b");
                            return;
                        }
                    }
                );
            }

            if (this.AddEntry != null)
            {
                this.AddEntry(entry);
            }
        };
}