Пример #1
0
        /// <summary>
        /// Creates the crawler for <c>Host</c> (port 80) and subscribes the handlers that convert an
        /// IMDB title-search response into <c>AddEntry</c> callbacks.
        /// </summary>
        /// <remarks>
        /// No request is started here. When <c>LocationReceived</c> has supplied a target, the response
        /// is reported as a single entry pointing at that target. Otherwise the first table following
        /// "Popular Titles", "Titles (Exact Matches)" or "Titles (Approx Matches)" (in that order of
        /// preference) is parsed row by row. Pages or rows that lack the expected markup are skipped
        /// instead of letting <c>Substring</c>/<c>IndexOf</c> throw on a -1 index.
        /// </remarks>
        public BasicIMDBAliasSearch()
        {
            // CoralEnabled is intentionally left off: "doesnt seem to respond at 2009.03.18".
            var c = new BasicWebCrawler(Host, 80);

            this.Crawler = c;

            // The anonymous instances only serve as type templates for ToAnonymousConstructor.
            var DefaultLink  = new { Link = "", Title = "", Text = "" };
            var DefaultImage = new { Source = "", Alt = "", Title = "", width = "", height = "" };

            // Extracts href, title and inner content of an <a> element.
            var ParseLink = DefaultLink.ToAnonymousConstructor(
                (string element) =>
                {
                    var Link  = "";
                    var Title = "";
                    var Text  = "";

                    element.
                    ParseAttribute("href", value => Link = value).
                    ParseAttribute("title", value => Title = value).
                    ParseContent(value => Text = value).
                    Parse();

                    return new { Link, Title, Text };
                }
            );

            // Extracts the attributes of an <img> element; its content is ignored.
            var ParseImage = DefaultImage.ToAnonymousConstructor(
                (string element) =>
                {
                    var Source = "";
                    var Alt    = "";
                    var Title  = "";
                    var width  = "";
                    var height = "";

                    element.
                    ParseAttribute("src", value => Source = value).
                    ParseAttribute("alt", value => Alt = value).
                    ParseAttribute("title", value => Title = value).
                    ParseAttribute("width", value => width = value).
                    ParseAttribute("height", value => height = value).
                    ParseContent(null).
                    Parse();

                    return new { Source, Alt, Title, width, height };
                }
            );

            // Running index handed to AddEntry; shared by the redirect path and the table path.
            var EntryIndex = -1;

            #region AddItem
            // Converts one result row (thumbnail cell + content cell) into an Entry.
            Action<string, string> AddItem =
                (ImageElement, Content) =>
                {
                    var ImageSource = "";

                    // The thumbnail cell is only used when it starts with a link wrapping the <img>.
                    if (ImageElement.StartsWith("<a"))
                    {
                        var ImageLink = ParseLink(ImageElement);
                        var Image     = ParseImage(ImageLink.Text);

                        ImageSource = Image.Source;
                    }

                    /*
                     * Expected content cell, e.g.:
                     * <img src="/images/b.gif" width="1" height="6"><br>
                     * <a href="/title/tt0397892/" onclick="...">Bolt</a> (2008)    <br>
                     * &#160;aka <em>"Bolt - Un perro fuera de serie"</em> - Argentina, Mexico<br>
                     * &#160;aka <em>"Bolt - Ein Hund f&#252;r alle F&#228;lle"</em> - Germany
                     */

                    var ContentLink_start = Content.IndexOf("<a");

                    if (ContentLink_start < 0)
                    {
                        // No title link in this row (e.g. a row with fewer than three cells).
                        return;
                    }

                    // Search for the closing tag after the opening one so the slice length cannot be negative.
                    var ContentLink_end = Content.IndexOf("</a>", ContentLink_start);

                    if (ContentLink_end < 0)
                    {
                        return;
                    }

                    var ContentLink = ParseLink(Content.Substring(ContentLink_start, ContentLink_end - ContentLink_start + 4));

                    // Everything after the link: release date first, then one "aka" line per <br>.
                    var Details = Content.Substring(ContentLink_end + 4);

                    var ReleaseDate = "";
                    var Alias       = default(AlsoKnownAs);

                    Details.Split("<br>",
                        (text, index) =>
                        {
                            if (index == 0)
                            {
                                ReleaseDate = text;
                                return;
                            }

                            // Each alias links to the previously collected one, so the last alias ends up as the head.
                            Alias = new AlsoKnownAs
                            {
                                Text  = text,
                                Alias = Alias
                            };
                        }
                    );

                    EntryIndex++;

                    if (this.AddEntry != null)
                    {
                        this.AddEntry(
                            new Entry("http://" + Host + ContentLink.Link)
                            {
                                OptionalAlias       = Alias,
                                OptionalReleaseDate = ReleaseDate,
                                OptionalTitle       = ContentLink.Text,
                                OptionalImage       = ImageSource
                            },
                            EntryIndex
                        );
                    }
                };
            #endregion

            // Example query: http://www.imdb.com/find?s=tt;site=aka;q=The%20Dark%20Knight

            // NOTE(review): Redirect is only ever assigned, never cleared, inside this constructor;
            // confirm whether the crawler instance is reused for more than one request.
            string Redirect = null;

            // Example header: "Location: http://www.imdb.com/title/tt1129442/"
            c.LocationReceived +=
                href =>
                {
                    Redirect = href;
                };

            c.DataReceivedWithTimeSpan +=
                (document, elapsed) =>
                {
                    #region redirect
                    if (!string.IsNullOrEmpty(Redirect))
                    {
                        // The search resolved straight to a title page: report just that address.
                        EntryIndex++;

                        if (this.AddEntry != null)
                        {
                            this.AddEntry(new Entry(Redirect), EntryIndex);
                        }

                        return;
                    }
                    #endregion

                    var approx_section  = document.IndexOf("<b>Titles (Approx Matches)</b>");
                    var exact_section   = document.IndexOf("<b>Titles (Exact Matches)</b>");
                    var popular_section = document.IndexOf("<b>Popular Titles</b>");

                    // Preference order: popular, then exact, then approximate matches.
                    var first_section = popular_section;

                    if (first_section < 0)
                    {
                        first_section = exact_section;
                    }

                    if (first_section < 0)
                    {
                        first_section = approx_section;
                    }

                    if (first_section < 0)
                    {
                        return;
                    }

                    var section_start = document.IndexOf("<table>", first_section);

                    if (section_start < 0)
                    {
                        // Heading present but no table follows it.
                        return;
                    }

                    var section_end = document.IndexOf("</table>", section_start);

                    if (section_end < 0)
                    {
                        // Truncated document: the table is never closed.
                        return;
                    }

                    // + 8 keeps the closing "</table>" inside the slice.
                    var section = document.Substring(section_start, section_end - section_start + 8);

                    BasicElementParser.Parse(section, "tr",
                        (tr, tr_index) =>
                        {
                            /*
                             * Expected row layout:
                             * td #0: <a href="/title/tt0397892/" ...><img src="http://ia.media-imdb.com/..." width="23" height="32" border="0"></a>&nbsp;
                             * td #1: <img src="/images/b.gif" width="1" height="6"><br>1.
                             * td #2: <img src="/images/b.gif" ...><br><a href="/title/tt0397892/" ...>Bolt</a> (2008) <br>&#160;aka ...
                             */

                            var Image   = "";
                            var Content = "";

                            BasicElementParser.Parse(tr, "td",
                                (td, td_index) =>
                                {
                                    if (td_index == 0)
                                    {
                                        Image = td;
                                    }

                                    if (td_index == 2)
                                    {
                                        Content = td;
                                    }
                                }
                            );

                            AddItem(Image, Content);
                        }
                    );
                };

            //c.Crawl("/find?s=tt;site=aka;q=" + "The Dark Knight".URLEncode());
            //c.Crawl("/find?s=tt;site=aka;q=" + "Bolt".URLEncode());
        }
Пример #2
0
        /// <summary>
        /// Creates a crawler for www.imdb.com (port 80, <c>CoralEnabled</c> set) and subscribes the
        /// handlers that report the poster image source of a received page through <c>AddEntry</c>.
        /// </summary>
        /// <remarks>
        /// When a Location value was received for the current response, the handler re-crawls the
        /// path and query of that location instead of parsing. Documents without a complete
        /// <c>&lt;table id="principal"&gt;</c> block are ignored rather than letting
        /// <c>IndexOf</c>/<c>Substring</c> throw on a -1 index.
        /// </remarks>
        public BasicIMDBPosterSearch()
        {
            this.Crawler =
                new Library.BasicWebCrawler("www.imdb.com", 80)
                {
                    CoralEnabled = true
                };

            // The anonymous instance only serves as a type template for ToAnonymousConstructor.
            var DefaultImage = new { Source = "", Alt = "", Title = "" };

            // Extracts src, alt and title of an <img> element; its content is ignored.
            var ParseImage = DefaultImage.ToAnonymousConstructor(
                (string element) =>
                {
                    var Source = "";
                    var Alt    = "";
                    var Title  = "";

                    element.
                    ParseAttribute("src", value => Source = value).
                    ParseAttribute("alt", value => Alt = value).
                    ParseAttribute("title", value => Title = value).
                    ParseContent(null).
                    Parse();

                    return new { Source, Alt, Title };
                }
            );

            // Redirect target of the response currently being received, if any.
            string location = null;

            // Forget the previous redirect target whenever AllHeadersSent fires.
            this.Crawler.AllHeadersSent +=
                () =>
                {
                    location = null;
                };

            this.Crawler.LocationReceived +=
                value =>
                {
                    location = value;
                };

            this.Crawler.DataReceived +=
                document =>
                {
                    if (!string.IsNullOrEmpty(location))
                    {
                        // Follow the redirect on the same crawler; only path and query are passed on.
                        var u = new Uri(location);

                        this.Crawler.Crawl(u.PathAndQuery);

                        return;
                    }

                    var poster_tag = "<table id=\"principal\">";
                    var poster_i   = document.IndexOf(poster_tag);

                    if (poster_i < 0)
                    {
                        // Page has no poster table.
                        return;
                    }

                    var poster_close_tag = "</table>";
                    var poster_close_i   = document.IndexOf(poster_close_tag, poster_i);

                    if (poster_close_i < 0)
                    {
                        // Truncated document: the table is never closed.
                        return;
                    }

                    // Slice the whole table including its closing tag, then parse its "td" content as an image.
                    var poster_table = document.Substring(poster_i, poster_close_i + poster_close_tag.Length - poster_i);

                    var poster = ParseImage(BasicElementParser.GetContent(poster_table, "td"));

                    if (this.AddEntry != null)
                    {
                        this.AddEntry(poster.Source);
                    }
                };
        }
        /// <summary>
        /// Creates a crawler for www.imdb.com (port 80) and subscribes a handler that scrapes title,
        /// year, poster, user rating, genres, runtime and tagline from a received page and passes the
        /// resulting <c>Entry</c> to <c>AddEntry</c>.
        /// </summary>
        /// <remarks>
        /// Every section is optional: when a marker or its terminator is missing the corresponding
        /// field is left empty instead of letting <c>Substring</c> throw on a -1 index.
        /// </remarks>
        public BasicIMDBCrawler()
        {
            // CoralEnabled is intentionally not set here.
            this.Crawler = new Library.BasicWebCrawler("www.imdb.com", 80);

            // The anonymous instances only serve as type templates for ToAnonymousConstructor.
            var DefaultLink  = new { Link = "", Title = "", Text = "" };
            var DefaultImage = new { Source = "", Alt = "", Title = "" };

            // Extracts href, title and inner content of an <a> element.
            var ParseLink = DefaultLink.ToAnonymousConstructor(
                (string element) =>
                {
                    var Link  = "";
                    var Title = "";
                    var Text  = "";

                    element.
                    ParseAttribute("href", value => Link = value).
                    ParseAttribute("title", value => Title = value).
                    ParseContent(value => Text = value).
                    Parse();

                    return new { Link, Title, Text };
                }
            );

            // Extracts src, alt and title of an <img> element; its content is ignored.
            var ParseImage = DefaultImage.ToAnonymousConstructor(
                (string element) =>
                {
                    var Source = "";
                    var Alt    = "";
                    var Title  = "";

                    element.
                    ParseAttribute("src", value => Source = value).
                    ParseAttribute("alt", value => Alt = value).
                    ParseAttribute("title", value => Title = value).
                    ParseContent(null).
                    Parse();

                    return new { Source, Alt, Title };
                }
            );

            this.Crawler.DataReceived +=
                document =>
                {
                    var entry = new Entry();

                    #region Title and Year
                    // The <title> content is expected to look like "Name (Year)".
                    var title   = BasicElementParser.GetContent(document, "title");
                    var title_i = title.IndexOf("(");

                    if (title_i < 0)
                    {
                        // No year in parentheses: the whole text is the title.
                        entry.Title = title.Trim();
                        entry.Year  = "";
                    }
                    else
                    {
                        entry.Title = title.Substring(0, title_i).Trim();

                        var year_start = title_i + 1;
                        var year_end   = title.IndexOf(")", year_start);

                        // An unclosed parenthesis yields everything after "(".
                        entry.Year = year_end < 0
                            ? title.Substring(year_start)
                            : title.Substring(year_start, year_end - year_start);
                    }

                    // remove quotes from the title
                    entry.Title = entry.Title.Replace("&#34;", "");
                    #endregion

                    #region Poster
                    var poster_i = document.IndexOf("name=\"poster\"");

                    // no poster - the poster may be found on other services
                    if (poster_i >= 0)
                    {
                        // The anchor opens before the name attribute and closes after it.
                        var poster_j = document.Substring(0, poster_i).LastIndexOf("<a");
                        var poster_q = document.IndexOf("</a>", poster_i);

                        if (poster_j >= 0 && poster_q >= 0)
                        {
                            // + 4 keeps the closing "</a>" inside the slice.
                            var poster       = ParseLink(document.Substring(poster_j, poster_q - poster_j + 4));
                            var poster_image = ParseImage(poster.Text);

                            entry.MediumPosterImageProvider = "imdb";
                            entry.MediumPosterImage         = poster_image.Source;
                            entry.MediumPosterImagePage     = poster.Link;
                            entry.MediumPosterTitle         = poster.Title;
                        }
                    }
                    #endregion

                    #region UserRating
                    var meta_tag = "<div class=\"meta\">";
                    var meta_i   = document.IndexOf(meta_tag);

                    entry.UserRating = "";

                    if (meta_i >= 0)
                    {
                        var meta_start = meta_i + meta_tag.Length;
                        var meta_end   = document.IndexOf("</div>", meta_i);

                        if (meta_end >= 0)
                        {
                            var meta = document.Substring(meta_start, meta_end - meta_start);

                            entry.UserRating = BasicElementParser.GetContent(meta, "b");
                        }
                    }
                    #endregion

                    #region Genres
                    var genre_tag = "<h5>Genre:</h5>";
                    var genre_i   = document.IndexOf(genre_tag);
                    var genres    = new List<string>();

                    if (genre_i >= 0)
                    {
                        var genre_start = genre_i + genre_tag.Length;
                        var genre_end   = document.IndexOf("</div>", genre_i);

                        if (genre_end >= 0)
                        {
                            var genre = document.Substring(genre_start, genre_end - genre_start);

                            BasicElementParser.Parse(genre, "a",
                                (text, index) =>
                                {
                                    // The "more" link is not a genre.
                                    if (text == "more")
                                    {
                                        return;
                                    }

                                    genres.Add(text);
                                }
                            );
                        }
                    }

                    entry.Genres = genres.ToArray();
                    #endregion

                    #region Runtime
                    var runtime_tag = "<h5>Runtime:</h5>";

                    // Runtime and tagline are searched from the genre heading onwards, or from the
                    // start of the document when there is no genre heading.
                    if (genre_i < 0)
                    {
                        genre_i = 0;
                    }

                    var runtime_i = document.IndexOf(runtime_tag, genre_i);

                    entry.Runtime = "";

                    if (runtime_i >= 0)
                    {
                        var runtime_start = runtime_i + runtime_tag.Length;
                        var runtime_end   = document.IndexOf("</div>", runtime_i);

                        if (runtime_end >= 0)
                        {
                            entry.Runtime = document.Substring(runtime_start, runtime_end - runtime_start).Trim();
                        }
                    }
                    #endregion

                    #region Tagline
                    var Tagline_tag = "<h5>Tagline:</h5>";
                    var Tagline_i   = document.IndexOf(Tagline_tag, genre_i);

                    entry.Tagline = "";

                    if (Tagline_i >= 0)
                    {
                        // The tagline is the plain text up to the next tag.
                        var Tagline_start = Tagline_i + Tagline_tag.Length;
                        var Tagline_end   = document.IndexOf("<", Tagline_start);

                        if (Tagline_end >= 0)
                        {
                            entry.Tagline = document.Substring(Tagline_start, Tagline_end - Tagline_start).Trim();
                        }
                    }
                    #endregion

                    if (AddEntry != null)
                    {
                        AddEntry(entry);
                    }
                };
        }
Пример #4
0
        /// <summary>
        /// Creates a crawler for thepiratebay.org (port 80) and subscribes a handler that locates the
        /// search result table of a received page and hands an enumerator over its rows to
        /// <c>Loaded</c>.
        /// </summary>
        /// <remarks>
        /// Rows are not parsed eagerly: <c>Loaded</c> receives a <c>ForEachEntry</c> delegate and the
        /// rows are scanned only when the subscriber invokes it with a per-entry callback. Documents
        /// without a complete <c>searchResult</c> table are ignored (and <c>Loaded</c> is not raised)
        /// rather than letting <c>IndexOf</c> throw on a -1 start index.
        /// </remarks>
        public BasicPirateBaySearch()
        {
            this.Crawler = new BasicWebCrawler("thepiratebay.org", 80);

            this.Crawler.DataReceived +=
                document =>
                {
                    var results = document.IndexOf("<table id=\"searchResult\">");

                    if (results < 0)
                    {
                        // Not a search result page.
                        return;
                    }

                    var headend = document.IndexOf("</thead>", results);

                    if (headend < 0)
                    {
                        return;
                    }

                    var results_end = document.IndexOf("</table>", headend);

                    if (results_end < 0)
                    {
                        // Truncated document: the table is never closed.
                        return;
                    }

                    // Declared outside ForEachEntry, so numbering continues if ForEachEntry is invoked again.
                    int entryindex = -1;

                    Action<Action<Entry, int>> ForEachEntry =
                        AddEntry =>
                        {
                            #region ScanSingleResultOrReturn
                            // Parses the <tr> that starts at or after offset and returns the position just
                            // past its </tr>. Returns offset unchanged when no further complete row lies
                            // inside the result table; the chained driver below uses that to stop.
                            Func<int, int> ScanSingleResultOrReturn =
                                offset =>
                                {
                                    var itemstart = document.IndexOf("<tr>", offset);

                                    if (itemstart < 0 || itemstart > results_end)
                                    {
                                        return offset;
                                    }

                                    var itemend = document.IndexOf("</tr>", itemstart);

                                    if (itemend < 0 || itemend > results_end)
                                    {
                                        return offset;
                                    }

                                    var itemdata = document.Substring(itemstart, itemend - itemstart);

                                    // Sample row:
                                    //<tr>
                                    //<td class="vertTh"><a href="/browse/205" title="More from this category">Video &gt; TV shows</a></td>
                                    //<td><a href="/torrent/4727946/Heroes.S03E16.HDTV.XviD-XOR.avi" class="detLink" title="Details for ...">Heroes.S03E16.HDTV.XviD-XOR.avi</a></td>
                                    //<td>Today&nbsp;04:55</td>
                                    //<td><a href="http://torrents.thepiratebay.org/4727946/....torrent" title="Download this torrent"><img ... /></a>...</td>
                                    //<td align="right">348.97&nbsp;MiB</td>
                                    //<td align="right">47773</td>
                                    //<td align="right">60267</td>

                                    // Cell order: type, name, uploaded, links, size, se, le

                                    // Stays an empty Entry when the row has fewer than seven cells.
                                    var Fields = new BasicPirateBaySearch.Entry();

                                    Action<string> SetField = null;

                                    // Curried collector: each call captures one cell value and rebinds SetField
                                    // to the next stage. The seventh call builds the Entry from all captured
                                    // values and rebinds SetField to a no-op so extra cells are ignored.
                                    SetField = Type =>
                                        SetField = Name =>
                                            SetField = Time =>
                                                SetField = Links =>
                                                    SetField = Size =>
                                                        SetField = Seeders =>
                                                            SetField = Leechers =>
                                                            {
                                                                Fields = new BasicPirateBaySearch.Entry
                                                                {
                                                                    Type     = Type,
                                                                    Name     = Name,
                                                                    Time     = Time,
                                                                    Links    = Links,
                                                                    Size     = Size,
                                                                    Seeders  = Seeders,
                                                                    Leechers = Leechers
                                                                };

                                                                SetField = delegate { };
                                                            };

                                    var ep = new BasicElementParser();

                                    // SetField is read at call time, so every cell reaches the current stage.
                                    ep.AddContent +=
                                        (value, index) =>
                                        {
                                            SetField(value);
                                        };

                                    ep.Parse(itemdata, "td");

                                    entryindex++;

                                    if (AddEntry != null)
                                    {
                                        AddEntry(Fields, entryindex);
                                    }

                                    // 5 == "</tr>".Length
                                    return itemend + 5;
                                };
                            #endregion

                            // Repeats the scan from the end of the table header for as long as the
                            // predicate reports progress (y > x).
                            // NOTE(review): exact ToChainedFunc semantics are defined elsewhere - confirm.
                            ScanSingleResultOrReturn.ToChainedFunc((x, y) => y > x)(headend);
                        };

                    if (this.Loaded != null)
                    {
                        this.Loaded(ForEachEntry);
                    }
                };
        }
Пример #5
0
        /// <summary>
        /// Searches www.movieposterdb.com for movies matching <paramref name="title"/> and reports
        /// each result row to <paramref name="handler"/>.
        /// </summary>
        /// <param name="title">Title to search for; it is URL-encoded and appended to the query.</param>
        /// <param name="handler">
        /// Callback invoked once per result row with the link, title text and year text found in the
        /// second cell of the row.
        /// </param>
        /// <remarks>
        /// A response that does not contain the "Movies&lt;/h3&gt;" heading is ignored instead of
        /// letting <c>Substring</c> throw on a -1 index.
        /// </remarks>
        public static void Search(string title, Action<AliasEntry> handler)
        {
            var t = new Uri("http://www.movieposterdb.com/browse/search?search_type=movies&title=");
            var c = new BasicWebCrawler(t.Host, 80);

            // The anonymous instances only serve as type templates for ToAnonymousConstructor.
            var DefaultLink = new { Link = "", Title = "", Text = "" };
            var DefaultSpan = new { Text = "", Title = "" };

            // Extracts title attribute and content of the "span" element inside the given markup.
            var ParseSpan = DefaultSpan.ToAnonymousConstructor(
                (string element) =>
                {
                    var Text  = "";
                    var Title = "";

                    element.
                    ParseAttribute("title", value => Title = value).
                    ParseContent(value => Text = value).
                    Parse("span");

                    return new { Text, Title };
                }
            );

            // Extracts href, title attribute and content of the "a" element inside the given markup.
            var ParseLink = DefaultLink.ToAnonymousConstructor(
                (string element) =>
                {
                    var Link  = "";
                    var Title = "";
                    var Text  = "";

                    element.
                    ParseAttribute("href", value => Link = value).
                    ParseAttribute("title", value => Title = value).
                    ParseContent(value => Text = value).
                    Parse("a");

                    return new { Link, Title, Text };
                }
            );

            c.DataReceived +=
                document =>
                {
                    var trigger   = "Movies</h3>";
                    var trigger_i = document.IndexOf(trigger);

                    if (trigger_i < 0)
                    {
                        // No "Movies" section in the response: nothing to report.
                        return;
                    }

                    // The result table is the first table after the heading.
                    var data = BasicElementParser.GetContent(document.Substring(trigger_i), "table");

                    BasicElementParser.Parse(data, "tr",
                        (element, index) =>
                        {
                            /*
                             * Expected row layout:
                             * <td valign="middle" style="...">
                             *  <img src="http://www.movieposterdb.com/posters/08_09/2008/1179855/m_1179855_4fb9999f.jpg" style="..." />
                             * </td>
                             * <td valign="middle" style="...">
                             *  <b><a class="bbg" href="http://www.movieposterdb.com/movie/1179855/Go-Fast.html">Go Fast</a><br /><span style="color: #8C8C8C;">2008</span></b>
                             * </td>
                             * <td style="...">
                             * </td>
                             */

                            BasicElementParser.Parse(element, "td",
                                (tdelement, tdindex) =>
                                {
                                    // Only the second cell carries the title link and the year span.
                                    if (tdindex != 1)
                                    {
                                        return;
                                    }

                                    var _title = ParseLink(tdelement);
                                    var _year  = ParseSpan(tdelement);

                                    handler(
                                        new AliasEntry
                                        {
                                            Link  = _title.Link,
                                            Title = _title.Text,
                                            Year  = _year.Text
                                        }
                                    );
                                }
                            );
                        }
                    );
                };

            c.Crawl(t.PathAndQuery + title.URLEncode());
        }
Пример #6
0
        /// <summary>
        /// Creates a crawler for tinyurl.com (port 80) and subscribes a handler that turns a received
        /// response into an <c>Entry</c> passed to <c>AddEntry</c>.
        /// </summary>
        /// <remarks>
        /// In <c>APIMode</c> the whole response body is taken as the alias. Otherwise the HTML page is
        /// scraped: the first blockquote after "TinyURL was created!" supplies the original URL and
        /// the second supplies the alias. Pages without the expected heading or without a complete
        /// <c>&lt;p&gt;...&lt;/p&gt;</c> block after it are ignored.
        /// </remarks>
        public BasicTinyURLCrawler()
        {
            // CoralEnabled is intentionally not set here.
            this.Crawler = new Library.BasicWebCrawler("tinyurl.com", 80);

            this.Crawler.DataReceived +=
                document =>
                {
                    var entry = new Entry();

                    if (APIMode)
                    {
                        entry.Alias = document;
                    }
                    else
                    {
                        var trigger   = "<h1>TinyURL was created!</h1>";
                        var trigger_i = document.IndexOf(trigger);

                        if (trigger_i < 0)
                        {
                            return;
                        }

                        // we are still in the business...

                        // Expected markup:
                        //<h1>TinyURL was created!</h1>
                        //<p>The following URL:
                        //<blockquote><b>http://thepiratebay.org<br />
                        //</b></blockquote>
                        //has a length of 23 characters and resulted in the following TinyURL which has a length of 24 characters:
                        //<blockquote><b>http://tinyurl.com/5umsn</b><br><small>[<a href="http://tinyurl.com/5umsn" target="_blank">Open in new window</a>]</small></blockquote>
                        //Or, give your recipients confidence with a preview TinyURL:
                        //<blockquote><b>http://preview.tinyurl.com/5umsn</b><br><small>[<a href="http://preview.tinyurl.com/5umsn" target="_blank">Open in new window</a>]</small>
                        //</blockquote>
                        //</p>

                        var start_tag = "<p>";
                        var start_i   = document.IndexOf(start_tag, trigger_i);

                        if (start_i < 0)
                        {
                            return;
                        }

                        var end_tag = "</p>";
                        var end_i   = document.IndexOf(end_tag, start_i);

                        if (end_i < 0)
                        {
                            // Truncated document: the paragraph is never closed.
                            return;
                        }

                        // Content strictly between "<p>" and "</p>". The length must subtract the
                        // opening tag; adding it would run past "</p>" and could exceed the document.
                        var data_start = start_i + start_tag.Length;
                        var data       = document.Substring(data_start, end_i - data_start);

                        BasicElementParser.Parse(data, "blockquote",
                            (value, index) =>
                            {
                                if (index == 0)
                                {
                                    // First blockquote: the original URL, possibly followed by "<br />".
                                    entry.URL = BasicElementParser.GetContent(value, "b");

                                    var br_tag = "<br />";
                                    var br_i   = entry.URL.IndexOf(br_tag);

                                    if (br_i >= 0)
                                    {
                                        entry.URL = entry.URL.Substring(0, br_i);
                                    }

                                    return;
                                }

                                if (index == 1)
                                {
                                    // Second blockquote: the generated short URL.
                                    entry.Alias = BasicElementParser.GetContent(value, "b");

                                    return;
                                }
                            }
                        );
                    }

                    if (this.AddEntry != null)
                    {
                        this.AddEntry(entry);
                    }
                };
        }