Пример #1
0
 /// <summary>
 /// Parses the HTML of an IMDb title page and stores the extracted values in this object's members.
 /// Nothing is assigned besides <c>Id</c> when the canonical title link (tt + 7 digits) is not found.
 /// </summary>
 /// <param name="html">Raw HTML of the title page.</param>
 /// <param name="GetExtraInfo">When true, ReleaseDates and MediaImages are also loaded through their getters.</param>
 private void parseIMDbPage(string html, bool GetExtraInfo)
 {
     Id = match(@"<link rel=""canonical"" href=""http://www.imdb.com/title/(tt\d{7})/"" />", html);
     if (!string.IsNullOrEmpty(Id))
     {
         DoEvents();
         status        = true;
         Title         = match(@"<title>(IMDb \- )*(.*?) \(.*?</title>", html, 2);
         OriginalTitle = match(@"title-extra"">(.*?)<", html);
         Year          = match(@"<title>.*?\(.*?(\d{4}).*?\).*?</title>", html);
         Rating        = match(@"ratingValue"">(\d.\d)<", html);

         // List members: first isolate the relevant section of the page, then collect the
         // text of every anchor inside it. (The lists are assigned directly from matchAll;
         // no placeholder ArrayList is allocated beforehand.)
         Genres        = matchAll(@"<a.*?>(.*?)</a>", match(@"Genres:</h4>(.*?)</div>", html));
         Directors     = matchAll(@"<a.*?>(.*?)</a>", match(@"Directors?:[\n\r\s]*</h4>(.*?)(</div>|>.?and )", html));
         Writers       = matchAll(@"<a.*?>(.*?)</a>", match(@"Writers?:[\n\r\s]*</h4>(.*?)(</div>|>.?and )", html));
         Stars         = matchAll(@"<a.*?>(.*?)</a>", match(@"Stars?:(.*?)</div>", html));
         Cast          = matchAll(@"class=""name"">[\n\r\s]*<a.*?>(.*?)</a>", html);
         Plot          = match(@"<p itemprop=""description"">(.*?)</p>", html);
         ReleaseDate   = match(@"Release Date:</h4>.*?(\d{1,2} (January|February|March|April|May|June|July|August|September|October|November|December) (19|20)\d{2}).*(\(|<span)", html);
         Runtime       = match(@"Runtime:</h4>[\s]*.*?(\d{1,4}) min[\s]*.*?\<\/div\>", html);
         if (string.IsNullOrEmpty(Runtime))
         {
             // Fall back to the runtime shown in the info bar when the details section has none.
             Runtime = match(@"infobar.*?([0-9]+) min.*?</div>", html);
         }
         Top250      = match(@"Top 250 #(\d{1,3})<", html);
         Oscars      = match(@"Won (\d{1,2}) Oscars\.", html);
         Awards      = match(@"(\d{1,4}) wins", html);
         Nominations = match(@"(\d{1,4}) nominations", html);
         Storyline   = match(@"Storyline</h2>[\s]*<p>(.*?)[\s]*(<em|</p>)", html);
         Tagline     = match(@"Taglines?:</h4>(.*?)(<span|</div)", html);
         MpaaRating  = match(@"infobar"">.*?<img.*?alt=""(.*?)"" src="".*?certificates.*?"".*?>", html);
         Votes       = match(@"ratingCount"">(\d+,?\d*)</span>", html);
         Languages   = matchAll(@"<a.*?>(.*?)</a>", match(@"Language.?:(.*?)(</div>|>.?and )", html));
         Countries   = matchAll(@"<a.*?>(.*?)</a>", match(@"Country:(.*?)(</div>|>.?and )", html));
         Poster      = match(@"img_primary"">[\n\r\s]*?<a.*?><img src=""(.*?)"".*?</td>", html);
         if (!string.IsNullOrEmpty(Poster) && Poster.IndexOf("nopicture") < 0)
         {
             // Derive the other poster sizes by rewriting the size suffix that follows "_V1.".
             PosterSmall = Regex.Replace(Poster, @"_V1\..*?.jpg", "_V1._SY150.jpg");
             PosterLarge = Regex.Replace(Poster, @"_V1\..*?.jpg", "_V1._SY500.jpg");
             PosterFull  = Regex.Replace(Poster, @"_V1\..*?.jpg", "_V1._SY0.jpg");
         }
         else
         {
             // No poster, or the "nopicture" placeholder: clear every poster URL.
             Poster      = string.Empty;
             PosterSmall = string.Empty;
             PosterLarge = string.Empty;
             PosterFull  = string.Empty;
         }
         ImdbURL = "http://www.imdb.com/title/" + Id + "/";
         if (GetExtraInfo)
         {
             ReleaseDates = getReleaseDates();
             MediaImages  = getMediaImages();
         }
     }
 }
Пример #2
0
        /// <summary>
        /// Downloads the "combined" view of an IMDb title and stores the extracted values in this
        /// object's members. Nothing besides <c>Id</c> is assigned when the canonical title link
        /// is not found in the downloaded page.
        /// </summary>
        /// <param name="imdbUrl">
        /// Base URL of the title. "combined" and "plotsummary" are appended to it as-is, so it is
        /// expected to already end with a slash.
        /// </param>
        /// <param name="GetExtraInfo">
        /// When true, the plot summary page is also downloaded and Storyline, ReleaseDates,
        /// MediaImages and RecommendedTitles are filled.
        /// </param>
        private void parseIMDbPage(string imdbUrl, bool GetExtraInfo)
        {
            string html = getUrlData(imdbUrl + "combined");

            Id = match(@"<link rel=""canonical"" href=""http://www.imdb.com/title/(tt\d{7})/combined"" />", html);
            if (!string.IsNullOrEmpty(Id))
            {
                status = true;
                Title = match(@"tn15title\""><h1>(.*?)<", html);

                OriginalTitle = match(@"title-extra"">(.*?)<", html);

                if (!IsSeries)
                {
                    // Not yet flagged as a series: look for a "(Season N, Episode M)" marker.
                    string season  = match(@"\(Season (\d*), Episode \d*\)", html);
                    string episode = match(@"\(Season \d*, Episode (\d*)\)", html);

                    // Both captures use \d*, so either one can be empty. TryParse keeps the three
                    // series members consistent: either both numbers are valid and stored, or the
                    // title is treated as a non-series and everything is reset.
                    int seasonNumber;
                    int episodeNumber;
                    if (int.TryParse(season, out seasonNumber) && int.TryParse(episode, out episodeNumber))
                    {
                        IsSeries      = true;
                        SeriesSeason  = seasonNumber;
                        SeriesEpisode = episodeNumber;
                    }
                    else
                    {
                        IsSeries      = false;
                        SeriesSeason  = 0;
                        SeriesEpisode = 0;
                    }
                }

                if (IsSeries)
                {
                    // Episode name and the text following it inside the page heading.
                    string episodeName  = match(@"tn15title\""><h1>[^<]*?<span><em>(.*?)<", html);
                    string episodeExtra = match(@"tn15title\""><h1>[^<]*?<span><em>[^<]*?</em>(.*?)<", html);
                    SeriesSubtitle = episodeName + " " + episodeExtra + "\r\n" + "Season " + SeriesSeason + ", Episode " + SeriesEpisode;
                }
                else
                {
                    SeriesSubtitle = "";
                }

                if (string.IsNullOrEmpty(OriginalTitle))
                {
                    // No "title-extra" element: fall back to the international English title.
                    OriginalTitle = match(@">\s*\""([^<]*)\"" -.*?International <em>\(English title\)", html, 1);
                }

                Year             = match(@"<title>(IMDb \- )*(.*?) \([^)]*?(\d{4}).*?\)?.*?</title>", html, 3);
                Rating           = match(@"<b>(\d.\d)/10</b>", html);
                Genres           = matchAll(@"<a.*?>(.*?)</a>", match(@"Genre.?:(.*?)(</div>|See more)", html));

                // Crew lists: isolate the credits table for each role, then collect the linked names.
                Directors        = matchAll(@"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>", match(@"Directed by</a></h5>(.*?)</table>", html));
                Writers          = matchAll(@"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>", match(@"Writing credits</a></h5>(.*?)</table>", html));
                Producers        = matchAll(@"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>", match(@"Produced by</a></h5>(.*?)</table>", html));
                Musicians        = matchAll(@"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>", match(@"Original Music by</a></h5>(.*?)</table>", html));
                Cinematographers = matchAll(@"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>", match(@"Cinematography by</a></h5>(.*?)</table>", html));
                Editors          = matchAll(@"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>", match(@"Film Editing by</a></h5>(.*?)</table>", html));
                Cast             = matchAll(@"<td class=""nm""><a.*?href=""/name/.*?/"".*?>(.*?)</a>", match(@"<h3>Cast</h3>(.*?)</table>", html));
                Plot             = match(@"Plot:</h5>.*?<div class=""info-content"">(.*?)(<a|</div)", html);
                PlotKeywords     = matchAll(@"<a.*?>(.*?)</a>", match(@"Plot Keywords:</h5>.*?<div class=""info-content"">(.*?)</div", html));
                ReleaseDate      = match(@"Release Date:</h5>.*?<div class=""info-content"">.*?(\d{1,2} (January|February|March|April|May|June|July|August|September|October|November|December) (19|20)\d{2})", html);
                Runtime          = match(@"Runtime:</h5><div class=""info-content"">(\d{1,4}) min[\s]*.*?</div>", html);
                Top250           = match(@"Top 250: #(\d{1,3})<", html);
                Oscars           = match(@"Won (\d+) Oscars?\.", html);
                if (string.IsNullOrEmpty(Oscars) && "Won Oscar.".Equals(match(@"(Won Oscar\.)", html)))
                {
                    // "Won Oscar." carries no number; it stands for exactly one award.
                    Oscars = "1";
                }
                Awards      = match(@"(\d{1,4}) wins", html);
                Nominations = match(@"(\d{1,4}) nominations", html);
                Tagline     = match(@"Tagline:</h5>.*?<div class=""info-content"">(.*?)(<a|</div)", html);
                MpaaRating  = match(@"MPAA</a>:</h5><div class=""info-content"">Rated (G|PG|PG-13|PG-14|R|NC-17|X) ", html);
                Votes       = match(@">(\d+,?\d*) votes<", html);
                Languages   = matchAll(@"<a.*?>(.*?)</a>", match(@"Language.?:(.*?)(</div>|>.?and )", html));
                Countries   = matchAll(@"<a.*?>(.*?)</a>", match(@"Country:(.*?)(</div>|>.?and )", html));
                Poster      = match(@"<div class=""photo"">.*?<a name=""poster"".*?><img.*?src=""(.*?)"".*?</div>", html);
                if (!string.IsNullOrEmpty(Poster) && Poster.IndexOf("media-imdb.com") > 0)
                {
                    // Rewrite the size suffix after "_V1" to obtain the three poster sizes.
                    // The pattern still matches the already rewritten Poster URL, so the later
                    // replacements can safely start from it.
                    Poster      = Regex.Replace(Poster, @"_V1.*?.jpg", "_V1._SY200.jpg");
                    PosterLarge = Regex.Replace(Poster, @"_V1.*?.jpg", "_V1._SY500.jpg");
                    PosterFull  = Regex.Replace(Poster, @"_V1.*?.jpg", "_V1._SY0.jpg");
                }
                else
                {
                    Poster      = string.Empty;
                    PosterLarge = string.Empty;
                    PosterFull  = string.Empty;
                }
                ImdbURL = "http://www.imdb.com/title/" + Id + "/";
                if (GetExtraInfo)
                {
                    string plotHtml = getUrlData(imdbUrl + "plotsummary");
                    Storyline         = match(@"<p class=""plotpar"">(.*?)(<i>|</p>)", plotHtml);
                    ReleaseDates      = getReleaseDates();
                    MediaImages       = getMediaImages();
                    RecommendedTitles = getRecommendedTitles();
                }
            }
        }
Пример #3
0
        //Get IMDb URL from search results
        //static public ArrayList getIMDbUrl(string MovieName, string searchEngine = "google")
        //{
        //    string url = GoogleSearch + MovieName; //default to Google search
        //    if (searchEngine.ToLower().Equals("bing")) url = BingSearch + MovieName;
        //    if (searchEngine.ToLower().Equals("ask")) url = AskSearch + MovieName;
        //    string html = getUrlData(url);
        //    ArrayList imdbUrls = matchAll(@"<a href=""(http://www.imdb.com/title/tt\d{7}/)"".*?>.*?</a>", html);
        //    if (imdbUrls.Count > 0)
        //        return imdbUrls; //return first IMDb result
        //    else if (searchEngine.ToLower().Equals("google")) //if Google search fails
        //        return getIMDbUrl(MovieName, "bing"); //search using Bing
        //    else if (searchEngine.ToLower().Equals("bing")) //if Bing search fails
        //        return getIMDbUrl(MovieName, "ask"); //search using Ask
        //    else //search fails
        //        return new ArrayList {string.Empty};
        //}

        /// <summary>
        /// Extracts title details from the HTML of an IMDb "combined" page into this object's
        /// members, then downloads the regular title page to read the Stars list.
        /// Returns right after assigning <c>Id</c> when no canonical title link is found.
        /// </summary>
        /// <param name="html">Raw HTML of the "combined" page.</param>
        /// <param name="getExtraInfo">When true, the plot summary page is downloaded for Storyline and MediaImages is loaded.</param>
        private void ParseImdbPage(string html, bool getExtraInfo)
        {
            Id = match(@"<link rel=""canonical"" href=""http://www.imdb.com/title/(tt\d{7})/combined"" />", html);
            if (string.IsNullOrEmpty(Id))
            {
                return;
            }

            // Patterns shared by several of the list extractions below.
            const string anchorText = @"<a.*?>(.*?)</a>";
            const string creditName = @"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>";
            const string sizeSuffix = @"_V1.*?.jpg";

            Status        = true;
            Title         = match(@"<title>(IMDb \- )*(.*?) \(.*?</title>", html, 2);
            OriginalTitle = match(@"title-extra"">(.*?)<", html);
            Year          = match(@"<title>.*?\(.*?(\d{4}).*?\).*?</title>", html);
            Rating        = match(@"<b>(\d.\d)/10</b>", html);
            Genres        = matchAll(anchorText, match(@"Genre.?:(.*?)(</div>|See more)", html));

            // One credits table per role; the linked names inside it form the list.
            Directors        = matchAll(creditName, match(@"Directed by</a></h5>(.*?)</table>", html));
            Writers          = matchAll(creditName, match(@"Writing credits</a></h5>(.*?)</table>", html));
            Producers        = matchAll(creditName, match(@"Produced by</a></h5>(.*?)</table>", html));
            Musicians        = matchAll(creditName, match(@"Original Music by</a></h5>(.*?)</table>", html));
            Cinematographers = matchAll(creditName, match(@"Cinematography by</a></h5>(.*?)</table>", html));
            Editors          = matchAll(creditName, match(@"Film Editing by</a></h5>(.*?)</table>", html));
            Cast             = matchAll(@"<td class=""nm""><a.*?href=""/name/.*?/"".*?>(.*?)</a>", match(@"<h3>Cast</h3>(.*?)</table>", html));

            Plot         = match(@"Plot:</h5>.*?<div class=""info-content"">(.*?)(<a|</div)", html);
            PlotKeywords = matchAll(anchorText, match(@"Plot Keywords:</h5>.*?<div class=""info-content"">(.*?)</div", html));
            ReleaseDate  = match(@"Release Date:</h5>.*?<div class=""info-content"">.*?(\d{1,2} (January|February|March|April|May|June|July|August|September|October|November|December) (19|20)\d{2})", html);
            Runtime      = match(@"Runtime:</h5><div class=""info-content"">(\d{1,4}) min[\s]*.*?</div>", html);
            Top250       = match(@"Top 250: #(\d{1,3})<", html);

            Oscars = match(@"Won (\d+) Oscars?\.", html);
            if (string.IsNullOrEmpty(Oscars))
            {
                // The singular phrase has no number in it and stands for one award.
                if ("Won Oscar.".Equals(match(@"(Won Oscar\.)", html)))
                {
                    Oscars = "1";
                }
            }

            Awards      = match(@"(\d{1,4}) wins", html);
            Nominations = match(@"(\d{1,4}) nominations", html);
            Tagline     = match(@"Tagline:</h5>.*?<div class=""info-content"">(.*?)(<a|</div)", html);
            MpaaRating  = match(@"MPAA</a>:</h5><div class=""info-content"">Rated (G|PG|PG-13|PG-14|R|NC-17|X) ", html);
            Votes       = match(@">(\d+,?\d*) votes<", html);
            Languages   = matchAll(anchorText, match(@"Language.?:(.*?)(</div>|>.?and )", html));
            Countries   = matchAll(anchorText, match(@"Country:(.*?)(</div>|>.?and )", html));

            Poster = match(@"<div class=""photo"">.*?<a name=""poster"".*?><img.*?src=""(.*?)"".*?</div>", html);
            bool hasImdbPoster = !string.IsNullOrEmpty(Poster) && Poster.IndexOf("media-imdb.com") > 0;
            if (!hasImdbPoster)
            {
                Poster      = string.Empty;
                PosterLarge = string.Empty;
                PosterFull  = string.Empty;
            }
            else
            {
                // Each size is produced by rewriting the suffix after "_V1"; the larger sizes
                // are derived from the already rewritten Poster value.
                Poster      = Regex.Replace(Poster, sizeSuffix, "_V1._SY200.jpg");
                PosterLarge = Regex.Replace(Poster, sizeSuffix, "_V1._SY500.jpg");
                PosterFull  = Regex.Replace(Poster, sizeSuffix, "_V1._SY0.jpg");
            }

            ImdbUrl = "http://www.imdb.com/title/" + Id + "/";

            // Stars are read from the regular title page, which is downloaded separately.
            var titlePageHtml = GetUrlData(ImdbUrl);
            Stars = matchAll(@"<a.*?><span class=""itemprop"" itemprop=""name"">(.*?)</span></a>", match(@"Stars?:(.*?)</div>", titlePageHtml));

            if (!getExtraInfo)
            {
                return;
            }

            // Extra info: storyline from the plot summary page plus media images.
            // Release dates and recommended titles are not loaded by this method.
            string summaryHtml = GetUrlData(ImdbUrl + "plotsummary");
            Storyline   = StripHtml(match(@"<p class=""plotSummary"">(.*?)(<i>|</p>)", summaryHtml));
            MediaImages = GetMediaImages();
        }
Пример #4
0
        /// <summary>
        /// Downloads the "combined" view of an IMDb title and stores the extracted values in this
        /// object's members.
        /// </summary>
        /// <param name="imdbUrl">
        /// Base URL of the title. "combined" and "plotsummary" are appended to it as-is, so it is
        /// expected to already end with a slash.
        /// </param>
        /// <param name="GetExtraInfo">
        /// When true, the plot summary page is also downloaded and Storyline, ReleaseDates,
        /// MediaImages and RecommendedTitles are filled.
        /// </param>
        /// <returns>
        /// true when the download returned any content; false (with <c>Id</c> set to null) when it
        /// returned nothing. Note that <c>status</c> is set and true is returned even if no
        /// canonical title id could be extracted from the page.
        /// </returns>
        private bool parseIMDbPage(string imdbUrl, bool GetExtraInfo)
        {
            string html = getUrlData(imdbUrl + "combined");

            if (string.IsNullOrEmpty(html))
            {
                Id = null;
                return false;
            }

            status = true;
            Id     = match(@"<link rel=""canonical"" href=""http://www.imdb.com/title/(tt\d{7})/combined"" />", html);

            // Text members are HTML-decoded twice, which also resolves doubly encoded entities
            // (e.g. "&amp;amp;").
            Title         = match(@"<title>(IMDb \- )*(.*?) \(.*?</title>", html, 2);
            Title         = System.Web.HttpUtility.HtmlDecode(Title);
            Title         = System.Web.HttpUtility.HtmlDecode(Title);
            OriginalTitle = System.Web.HttpUtility.HtmlDecode(match(@"title-extra"">(.*?)<", html));
            OriginalTitle = System.Web.HttpUtility.HtmlDecode(OriginalTitle);

            // The year is searched only inside the <title> element. A page without a complete
            // <title>...</title> pair yields an empty search text instead of an
            // ArgumentOutOfRangeException from Substring.
            const string titleOpen  = "<title>";
            const string titleClose = "</title>";
            int titleStart = html.IndexOf(titleOpen, System.StringComparison.Ordinal);
            int titleEnd   = titleStart < 0 ? -1 : html.IndexOf(titleClose, titleStart, System.StringComparison.Ordinal);
            string titleElement = titleEnd < 0
                ? string.Empty
                : html.Substring(titleStart, titleEnd + titleClose.Length - titleStart);
            Year = match(@"<title>.*?\(.*?(\d{4}).*?\).*?</title>", titleElement);

            Rating = match(@"<b>(\d.\d)/10</b>", html);
            Genres = matchAll(@"<a.*?>(.*?)</a>", match(@"Genre.?:(.*?)(</div>|See more)", html));

            // Crew lists: isolate the credits table for each role, then collect the linked names.
            Directors        = matchAll(@"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>", match(@"Directed by</a></h5>(.*?)</table>", html));
            Writers          = matchAll(@"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>", match(@"Writing credits</a></h5>(.*?)</table>", html));
            Producers        = matchAll(@"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>", match(@"Produced by</a></h5>(.*?)</table>", html));
            Musicians        = matchAll(@"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>", match(@"Original Music by</a></h5>(.*?)</table>", html));
            Cinematographers = matchAll(@"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>", match(@"Cinematography by</a></h5>(.*?)</table>", html));
            Editors          = matchAll(@"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>", match(@"Film Editing by</a></h5>(.*?)</table>", html));
            Cast             = matchAll(@"<td class=""nm""><a.*?href=""/name/.*?/"".*?>(.*?)</a>", match(@"<h3>Cast</h3>(.*?)</table>", html));

            Plot = System.Web.HttpUtility.HtmlDecode(match(@"Plot:</h5>.*?<div class=""info-content"">(.*?)(<a|</div)", html));
            Plot = System.Web.HttpUtility.HtmlDecode(Plot);
            Plot = Plot.Replace("|", "");   // drop the "|" separator captured with the plot text
            Plot = Plot.Trim();

            PlotKeywords = matchAll(@"<a.*?>(.*?)</a>", match(@"Plot Keywords:</h5>.*?<div class=""info-content"">(.*?)</div", html));
            ReleaseDate  = match(@"Release Date:</h5>.*?<div class=""info-content"">.*?(\d{1,2} (January|February|March|April|May|June|July|August|September|October|November|December) (19|20)\d{2})", html);
            Runtime      = match(@"Runtime:</h5><div class=""info-content"">(\d{1,4}) min[\s]*.*?</div>", html);
            Top250       = match(@"Top 250: #(\d{1,3})<", html);
            Oscars       = match(@"Won (\d+) Oscars?\.", html);
            if (string.IsNullOrEmpty(Oscars) && "Won Oscar.".Equals(match(@"(Won Oscar\.)", html)))
            {
                // "Won Oscar." carries no number; it stands for exactly one award.
                Oscars = "1";
            }
            Awards      = match(@"(\d{1,4}) wins", html);
            Nominations = match(@"(\d{1,4}) nominations", html);
            Tagline     = System.Web.HttpUtility.HtmlDecode(match(@"Tagline:</h5>.*?<div class=""info-content"">(.*?)(<a|</div)", html));
            Tagline     = System.Web.HttpUtility.HtmlDecode(Tagline);

            MpaaRating = match(@"MPAA</a>:</h5><div class=""info-content"">Rated (G|PG|PG-13|PG-14|R|NC-17|X) ", html);
            if (string.IsNullOrEmpty(MpaaRating))
            {
                // No MPAA line: fall back to the USA entry of the Certification list.
                // The certificate code in the link is accepted whatever its value (it used to be
                // pinned to "us:g", which limited the fallback to G-rated titles). Longer rating
                // names come first and the lookahead requires the rating to end there, so
                // "PG-13" is not cut down to "PG".
                MpaaRating = match(@"Certification:</h5><div class=""info-content""><a href=""/search/title\?certificates=us:[^""]*"">USA:(PG-13|PG-14|NC-17|PG|G|R|X)(?![\w-])", html);
            }

            Votes     = match(@">(\d+,?\d*) votes<", html);
            Languages = matchAll(@"<a.*?>(.*?)</a>", match(@"Language.?:(.*?)(</div>|>.?and )", html));
            Countries = matchAll(@"<a.*?>(.*?)</a>", match(@"Country:(.*?)(</div>|>.?and )", html));
            Poster    = match(@"<div class=""photo"">.*?<a name=""poster"".*?><img.*?src=""(.*?)"".*?</div>", html);
            if (!string.IsNullOrEmpty(Poster) && Poster.IndexOf("media-imdb.com") > 0)
            {
                // Rewrite the size suffix after "_V1" to obtain the three poster sizes.
                // The pattern still matches the already rewritten Poster URL, so the later
                // replacements can safely start from it.
                Poster      = Regex.Replace(Poster, @"_V1.*?.jpg", "_V1._SY200.jpg");
                PosterLarge = Regex.Replace(Poster, @"_V1.*?.jpg", "_V1._SY500.jpg");
                PosterFull  = Regex.Replace(Poster, @"_V1.*?.jpg", "_V1._SY0.jpg");
            }
            else
            {
                Poster      = string.Empty;
                PosterLarge = string.Empty;
                PosterFull  = string.Empty;
            }

            ImdbURL = "http://www.imdb.com/title/" + Id + "/";
            if (GetExtraInfo)
            {
                string plotHtml = getUrlData(imdbUrl + "plotsummary");
                Storyline         = match(@"<p class=""plotpar"">(.*?)(<i>|</p>)", plotHtml);
                ReleaseDates      = getReleaseDates();
                MediaImages       = getMediaImages();
                RecommendedTitles = getRecommendedTitles();
            }
            return true;
        }