/// <summary>
/// Parses the markup of an IMDb title page and stores every extracted value in this object's members.
/// </summary>
/// <param name="html">Markup of the title page to scan.</param>
/// <param name="GetExtraInfo">
/// When true, ReleaseDates and MediaImages are additionally filled from getReleaseDates() and getMediaImages().
/// </param>
/// <remarks>
/// Only Id is assigned when the canonical link carrying the title id cannot be found; status and all
/// other members are left as they were.
/// </remarks>
private void parseIMDbPage(string html, bool GetExtraInfo)
{
    // Captures the text of every anchor inside an already isolated fragment.
    const string anchorText = @"<a.*?>(.*?)</a>";
    // Size suffix of a poster URL that gets swapped for the wanted height.
    const string posterSize = @"_V1\..*?.jpg";

    Id = match(@"<link rel=""canonical"" href=""http://www.imdb.com/title/(tt\d{7})/"" />", html);
    if (string.IsNullOrEmpty(Id))
    {
        return;
    }

    DoEvents();
    status = true;

    Title = match(@"<title>(IMDb \- )*(.*?) \(.*?</title>", html, 2);
    OriginalTitle = match(@"title-extra"">(.*?)<", html);
    Year = match(@"<title>.*?\(.*?(\d{4}).*?\).*?</title>", html);
    Rating = match(@"ratingValue"">(\d.\d)<", html);

    // Each list is taken from the fragment that follows its heading; matchAll supplies the list itself.
    Genres = matchAll(anchorText, match(@"Genres:</h4>(.*?)</div>", html));
    Directors = matchAll(anchorText, match(@"Directors?:[\n\r\s]*</h4>(.*?)(</div>|>.?and )", html));
    Writers = matchAll(anchorText, match(@"Writers?:[\n\r\s]*</h4>(.*?)(</div>|>.?and )", html));
    Stars = matchAll(anchorText, match(@"Stars?:(.*?)</div>", html));
    Cast = matchAll(@"class=""name"">[\n\r\s]*<a.*?>(.*?)</a>", html);

    Plot = match(@"<p itemprop=""description"">(.*?)</p>", html);
    ReleaseDate = match(@"Release Date:</h4>.*?(\d{1,2} (January|February|March|April|May|June|July|August|September|October|November|December) (19|20)\d{2}).*(\(|<span)", html);

    Runtime = match(@"Runtime:</h4>[\s]*.*?(\d{1,4}) min[\s]*.*?\<\/div\>", html);
    if (String.IsNullOrEmpty(Runtime))
    {
        // Fall back to the duration shown in the info bar.
        Runtime = match(@"infobar.*?([0-9]+) min.*?</div>", html);
    }

    Top250 = match(@"Top 250 #(\d{1,3})<", html);
    // "Oscars?" also accepts the singular form, as the other page parsers in this file do.
    Oscars = match(@"Won (\d{1,2}) Oscars?\.", html);
    Awards = match(@"(\d{1,4}) wins", html);
    Nominations = match(@"(\d{1,4}) nominations", html);
    Storyline = match(@"Storyline</h2>[\s]*<p>(.*?)[\s]*(<em|</p>)", html);
    Tagline = match(@"Taglines?:</h4>(.*?)(<span|</div)", html);
    MpaaRating = match(@"infobar"">.*?<img.*?alt=""(.*?)"" src="".*?certificates.*?"".*?>", html);
    // [\d,]* allows any number of thousands separators (e.g. 1,234,567), not just one.
    Votes = match(@"ratingCount"">(\d[\d,]*)</span>", html);

    Languages = matchAll(anchorText, match(@"Language.?:(.*?)(</div>|>.?and )", html));
    Countries = matchAll(anchorText, match(@"Country:(.*?)(</div>|>.?and )", html));

    Poster = match(@"img_primary"">[\n\r\s]*?<a.*?><img src=""(.*?)"".*?</td>", html);
    bool hasPoster = !string.IsNullOrEmpty(Poster)
        && Poster.IndexOf("nopicture", StringComparison.Ordinal) < 0;
    if (hasPoster)
    {
        PosterSmall = Regex.Replace(Poster, posterSize, "_V1._SY150.jpg");
        PosterLarge = Regex.Replace(Poster, posterSize, "_V1._SY500.jpg");
        PosterFull = Regex.Replace(Poster, posterSize, "_V1._SY0.jpg");
    }
    else
    {
        // Missing poster or the "nopicture" placeholder: expose no poster at all.
        Poster = string.Empty;
        PosterSmall = string.Empty;
        PosterLarge = string.Empty;
        PosterFull = string.Empty;
    }

    ImdbURL = "http://www.imdb.com/title/" + Id + "/";

    if (GetExtraInfo)
    {
        ReleaseDates = getReleaseDates();
        MediaImages = getMediaImages();
    }
}
/// <summary>
/// Downloads the "combined" view of an IMDb title and stores every extracted value in this object's
/// members, including season/episode details when the page describes a series episode.
/// </summary>
/// <param name="imdbUrl">
/// Title URL; "combined" and "plotsummary" are appended to it directly, so it presumably has to end
/// with a slash (verify against callers).
/// </param>
/// <param name="GetExtraInfo">
/// When true, the plot summary page is downloaded for Storyline and ReleaseDates, MediaImages and
/// RecommendedTitles are filled from their respective getters.
/// </param>
/// <remarks>
/// Only Id is assigned when the canonical link carrying the title id cannot be found.
/// If IsSeries is already true on entry, the existing SeriesSeason and SeriesEpisode are kept and only
/// SeriesSubtitle is rebuilt. Otherwise the page is probed for a "(Season N, Episode M)" marker; when
/// either number is missing or not a valid integer the title is treated as a non-series and the
/// series members are reset.
/// </remarks>
private void parseIMDbPage(string imdbUrl, bool GetExtraInfo)
{
    // Captures the text of every anchor inside an already isolated fragment.
    const string anchorText = @"<a.*?>(.*?)</a>";
    // Captures the person name in one row of a crew credit table.
    const string creditName = @"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>";
    // Size suffix of a poster URL that gets swapped for the wanted height.
    const string posterSize = @"_V1.*?.jpg";

    string html = getUrlData(imdbUrl + "combined");
    Id = match(@"<link rel=""canonical"" href=""http://www.imdb.com/title/(tt\d{7})/combined"" />", html);
    if (string.IsNullOrEmpty(Id))
    {
        return;
    }

    status = true;

    // The title is read from the page heading; earlier patterns that used the title element:
    //Title = match(@"<title>(IMDb \- )*(.*?) \(.*?</title>", html, 2);
    //match(@"<title>(IMDb \- )*(.*?) \([^)]*?(\d{4}).*?\)?.*?</title>", html, 2);
    Title = match(@"tn15title\""><h1>(.*?)<", html);
    OriginalTitle = match(@"title-extra"">(.*?)<", html);

    // Episode name and the text following it, both taken from the heading.
    string episodeName = match(@"tn15title\""><h1>[^<]*?<span><em>(.*?)<", html);
    string episodeExtra = match(@"tn15title\""><h1>[^<]*?<span><em>[^<]*?</em>(.*?)<", html);

    if (!IsSeries)
    {
        string seasonText = match(@"\(Season (\d*), Episode \d*\)", html);
        string episodeText = match(@"\(Season \d*, Episode (\d*)\)", html);

        int seasonNumber;
        int episodeNumber;
        // TryParse rejects empty or malformed numbers without throwing, so no catch-all is needed
        // and the series members can never be left half-updated.
        if (int.TryParse(seasonText, out seasonNumber) && int.TryParse(episodeText, out episodeNumber))
        {
            IsSeries = true;
            SeriesSeason = seasonNumber;
            SeriesEpisode = episodeNumber;
        }
        else
        {
            IsSeries = false;
            SeriesSeason = 0;
            SeriesEpisode = 0;
        }
    }

    SeriesSubtitle = IsSeries
        ? episodeName + " " + episodeExtra + "\r\n" + "Season " + SeriesSeason + ", Episode " + SeriesEpisode
        : "";

    if (string.IsNullOrEmpty(OriginalTitle))
    {
        // No dedicated original title: use the entry flagged as the international English title.
        OriginalTitle = match(@">\s*\""([^<]*)\"" -.*?International <em>\(English title\)", html, 1);
    }

    //Year = match(@"<title>.*?\(.*?(\d{4}).*?\).*?</title>", html);
    Year = match(@"<title>(IMDb \- )*(.*?) \([^)]*?(\d{4}).*?\)?.*?</title>", html, 3);
    Rating = match(@"<b>(\d.\d)/10</b>", html);
    Genres = matchAll(anchorText, match(@"Genre.?:(.*?)(</div>|See more)", html));

    Directors = matchAll(creditName, match(@"Directed by</a></h5>(.*?)</table>", html));
    Writers = matchAll(creditName, match(@"Writing credits</a></h5>(.*?)</table>", html));
    Producers = matchAll(creditName, match(@"Produced by</a></h5>(.*?)</table>", html));
    Musicians = matchAll(creditName, match(@"Original Music by</a></h5>(.*?)</table>", html));
    Cinematographers = matchAll(creditName, match(@"Cinematography by</a></h5>(.*?)</table>", html));
    Editors = matchAll(creditName, match(@"Film Editing by</a></h5>(.*?)</table>", html));
    Cast = matchAll(@"<td class=""nm""><a.*?href=""/name/.*?/"".*?>(.*?)</a>", match(@"<h3>Cast</h3>(.*?)</table>", html));

    Plot = match(@"Plot:</h5>.*?<div class=""info-content"">(.*?)(<a|</div)", html);
    PlotKeywords = matchAll(anchorText, match(@"Plot Keywords:</h5>.*?<div class=""info-content"">(.*?)</div", html));
    ReleaseDate = match(@"Release Date:</h5>.*?<div class=""info-content"">.*?(\d{1,2} (January|February|March|April|May|June|July|August|September|October|November|December) (19|20)\d{2})", html);
    Runtime = match(@"Runtime:</h5><div class=""info-content"">(\d{1,4}) min[\s]*.*?</div>", html);
    Top250 = match(@"Top 250: #(\d{1,3})<", html);

    Oscars = match(@"Won (\d+) Oscars?\.", html);
    if (string.IsNullOrEmpty(Oscars) && "Won Oscar.".Equals(match(@"(Won Oscar\.)", html)))
    {
        // "Won Oscar." carries no number and stands for a single award.
        Oscars = "1";
    }

    Awards = match(@"(\d{1,4}) wins", html);
    Nominations = match(@"(\d{1,4}) nominations", html);
    Tagline = match(@"Tagline:</h5>.*?<div class=""info-content"">(.*?)(<a|</div)", html);
    MpaaRating = match(@"MPAA</a>:</h5><div class=""info-content"">Rated (G|PG|PG-13|PG-14|R|NC-17|X) ", html);
    // [\d,]* allows any number of thousands separators (e.g. 1,234,567), not just one.
    Votes = match(@">(\d[\d,]*) votes<", html);
    Languages = matchAll(anchorText, match(@"Language.?:(.*?)(</div>|>.?and )", html));
    Countries = matchAll(anchorText, match(@"Country:(.*?)(</div>|>.?and )", html));

    Poster = match(@"<div class=""photo"">.*?<a name=""poster"".*?><img.*?src=""(.*?)"".*?</div>", html);
    bool hasPoster = !string.IsNullOrEmpty(Poster)
        && Poster.IndexOf("media-imdb.com", StringComparison.Ordinal) > 0;
    if (hasPoster)
    {
        // Poster is rewritten first; the later replacements run on the rewritten URL, which still
        // carries a size suffix for the pattern to find.
        Poster = Regex.Replace(Poster, posterSize, "_V1._SY200.jpg");
        PosterLarge = Regex.Replace(Poster, posterSize, "_V1._SY500.jpg");
        PosterFull = Regex.Replace(Poster, posterSize, "_V1._SY0.jpg");
    }
    else
    {
        Poster = string.Empty;
        PosterLarge = string.Empty;
        PosterFull = string.Empty;
    }

    ImdbURL = "http://www.imdb.com/title/" + Id + "/";

    if (GetExtraInfo)
    {
        string plotHtml = getUrlData(imdbUrl + "plotsummary");
        Storyline = match(@"<p class=""plotpar"">(.*?)(<i>|</p>)", plotHtml);
        ReleaseDates = getReleaseDates();
        MediaImages = getMediaImages();
        RecommendedTitles = getRecommendedTitles();
    }
}
//Get IMDb URL from search results
//static public ArrayList getIMDbUrl(string MovieName, string searchEngine = "google")
//{
//    string url = GoogleSearch + MovieName; //default to Google search
//    if (searchEngine.ToLower().Equals("bing")) url = BingSearch + MovieName;
//    if (searchEngine.ToLower().Equals("ask")) url = AskSearch + MovieName;
//    string html = getUrlData(url);
//    ArrayList imdbUrls = matchAll(@"<a href=""(http://www.imdb.com/title/tt\d{7}/)"".*?>.*?</a>", html);
//    if (imdbUrls.Count > 0)
//        return imdbUrls; //return first IMDb result
//    else if (searchEngine.ToLower().Equals("google")) //if Google search fails
//        return getIMDbUrl(MovieName, "bing"); //search using Bing
//    else if (searchEngine.ToLower().Equals("bing")) //if Bing search fails
//        return getIMDbUrl(MovieName, "ask"); //search using Ask
//    else //search fails
//        return new ArrayList {string.Empty};
//}

/// <summary>
/// Parses the markup of an IMDb "combined" title page and stores every extracted value in this
/// object's members.
/// </summary>
/// <param name="html">Markup of the combined page to scan.</param>
/// <param name="getExtraInfo">
/// When true, the plot summary page is downloaded for Storyline and MediaImages is filled from
/// GetMediaImages().
/// </param>
/// <remarks>
/// Only Id is assigned when the canonical link carrying the title id cannot be found.
/// The method always performs one extra download of the main title page (ImdbUrl) because Stars is
/// read from that page rather than from the combined view.
/// </remarks>
private void ParseImdbPage(string html, bool getExtraInfo)
{
    // Captures the text of every anchor inside an already isolated fragment.
    const string anchorText = @"<a.*?>(.*?)</a>";
    // Captures the person name in one row of a crew credit table.
    const string creditName = @"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>";
    // Size suffix of a poster URL that gets swapped for the wanted height.
    const string posterSize = @"_V1.*?.jpg";

    Id = match(@"<link rel=""canonical"" href=""http://www.imdb.com/title/(tt\d{7})/combined"" />", html);
    if (string.IsNullOrEmpty(Id))
    {
        return;
    }

    Status = true;

    Title = match(@"<title>(IMDb \- )*(.*?) \(.*?</title>", html, 2);
    OriginalTitle = match(@"title-extra"">(.*?)<", html);
    Year = match(@"<title>.*?\(.*?(\d{4}).*?\).*?</title>", html);
    Rating = match(@"<b>(\d.\d)/10</b>", html);
    Genres = matchAll(anchorText, match(@"Genre.?:(.*?)(</div>|See more)", html));

    Directors = matchAll(creditName, match(@"Directed by</a></h5>(.*?)</table>", html));
    Writers = matchAll(creditName, match(@"Writing credits</a></h5>(.*?)</table>", html));
    Producers = matchAll(creditName, match(@"Produced by</a></h5>(.*?)</table>", html));
    Musicians = matchAll(creditName, match(@"Original Music by</a></h5>(.*?)</table>", html));
    Cinematographers = matchAll(creditName, match(@"Cinematography by</a></h5>(.*?)</table>", html));
    Editors = matchAll(creditName, match(@"Film Editing by</a></h5>(.*?)</table>", html));
    Cast = matchAll(@"<td class=""nm""><a.*?href=""/name/.*?/"".*?>(.*?)</a>", match(@"<h3>Cast</h3>(.*?)</table>", html));

    Plot = match(@"Plot:</h5>.*?<div class=""info-content"">(.*?)(<a|</div)", html);
    PlotKeywords = matchAll(anchorText, match(@"Plot Keywords:</h5>.*?<div class=""info-content"">(.*?)</div", html));
    ReleaseDate = match(@"Release Date:</h5>.*?<div class=""info-content"">.*?(\d{1,2} (January|February|March|April|May|June|July|August|September|October|November|December) (19|20)\d{2})", html);
    Runtime = match(@"Runtime:</h5><div class=""info-content"">(\d{1,4}) min[\s]*.*?</div>", html);
    Top250 = match(@"Top 250: #(\d{1,3})<", html);

    Oscars = match(@"Won (\d+) Oscars?\.", html);
    if (string.IsNullOrEmpty(Oscars) && "Won Oscar.".Equals(match(@"(Won Oscar\.)", html)))
    {
        // "Won Oscar." carries no number and stands for a single award.
        Oscars = "1";
    }

    Awards = match(@"(\d{1,4}) wins", html);
    Nominations = match(@"(\d{1,4}) nominations", html);
    Tagline = match(@"Tagline:</h5>.*?<div class=""info-content"">(.*?)(<a|</div)", html);
    MpaaRating = match(@"MPAA</a>:</h5><div class=""info-content"">Rated (G|PG|PG-13|PG-14|R|NC-17|X) ", html);
    // [\d,]* allows any number of thousands separators (e.g. 1,234,567), not just one.
    Votes = match(@">(\d[\d,]*) votes<", html);
    Languages = matchAll(anchorText, match(@"Language.?:(.*?)(</div>|>.?and )", html));
    Countries = matchAll(anchorText, match(@"Country:(.*?)(</div>|>.?and )", html));

    Poster = match(@"<div class=""photo"">.*?<a name=""poster"".*?><img.*?src=""(.*?)"".*?</div>", html);
    bool hasPoster = !string.IsNullOrEmpty(Poster)
        && Poster.IndexOf("media-imdb.com", System.StringComparison.Ordinal) > 0;
    if (hasPoster)
    {
        // Poster is rewritten first; the later replacements run on the rewritten URL, which still
        // carries a size suffix for the pattern to find.
        Poster = Regex.Replace(Poster, posterSize, "_V1._SY200.jpg");
        PosterLarge = Regex.Replace(Poster, posterSize, "_V1._SY500.jpg");
        PosterFull = Regex.Replace(Poster, posterSize, "_V1._SY0.jpg");
    }
    else
    {
        Poster = string.Empty;
        PosterLarge = string.Empty;
        PosterFull = string.Empty;
    }

    ImdbUrl = "http://www.imdb.com/title/" + Id + "/";

    //Get Stars
    var mainPageHtml = GetUrlData(ImdbUrl);
    Stars = matchAll(@"<a.*?><span class=""itemprop"" itemprop=""name"">(.*?)</span></a>", match(@"Stars?:(.*?)</div>", mainPageHtml));

    if (getExtraInfo)
    {
        string plotHtml = GetUrlData(ImdbUrl + "plotsummary");
        Storyline = StripHtml(match(@"<p class=""plotSummary"">(.*?)(<i>|</p>)", plotHtml));
        //ReleaseDates = GetReleaseDates();
        MediaImages = GetMediaImages();
        //RecommendedTitles = GetRecommendedTitles();
    }
}
/// <summary>
/// Downloads the "combined" view of an IMDb title and stores every extracted value in this object's
/// members.
/// </summary>
/// <param name="imdbUrl">
/// Title URL; "combined" and "plotsummary" are appended to it directly, so it presumably has to end
/// with a slash (verify against callers).
/// </param>
/// <param name="GetExtraInfo">
/// When true, the plot summary page is downloaded for Storyline and ReleaseDates, MediaImages and
/// RecommendedTitles are filled from their respective getters.
/// </param>
/// <returns>
/// true when the download returned any markup; false (with Id set to null) when it returned null or
/// an empty string.
/// </returns>
private bool parseIMDbPage(string imdbUrl, bool GetExtraInfo)
{
    // Captures the text of every anchor inside an already isolated fragment.
    const string anchorText = @"<a.*?>(.*?)</a>";
    // Captures the person name in one row of a crew credit table.
    const string creditName = @"<td valign=""top""><a.*?href=""/name/.*?/"">(.*?)</a>";
    // Size suffix of a poster URL that gets swapped for the wanted height.
    const string posterSize = @"_V1.*?.jpg";
    const string titleOpen = "<title>";
    const string titleClose = "</title>";

    string html = getUrlData(imdbUrl + "combined");
    if (string.IsNullOrEmpty(html))
    {
        Id = null;
        return false;
    }

    // NOTE(review): status is raised as soon as any markup comes back, before the id is looked up,
    // so it stays true even when no id is found. Confirm this is intended.
    status = true;
    Id = match(@"<link rel=""canonical"" href=""http://www.imdb.com/title/(tt\d{7})/combined"" />", html);

    // Text fields are HTML-decoded twice; presumably the page double-encodes entities (TODO confirm).
    Title = System.Web.HttpUtility.HtmlDecode(
        System.Web.HttpUtility.HtmlDecode(match(@"<title>(IMDb \- )*(.*?) \(.*?</title>", html, 2)));
    OriginalTitle = System.Web.HttpUtility.HtmlDecode(
        System.Web.HttpUtility.HtmlDecode(match(@"title-extra"">(.*?)<", html)));

    // The year is searched only inside the title element. When either tag is missing the pattern is
    // run against an empty string instead of letting Substring throw.
    int titleStart = html.IndexOf(titleOpen, System.StringComparison.Ordinal);
    int titleEnd = titleStart < 0
        ? -1
        : html.IndexOf(titleClose, titleStart, System.StringComparison.Ordinal);
    string titleElement = titleEnd < 0
        ? string.Empty
        : html.Substring(titleStart, titleEnd + titleClose.Length - titleStart);
    Year = match(@"<title>.*?\(.*?(\d{4}).*?\).*?</title>", titleElement);

    Rating = match(@"<b>(\d.\d)/10</b>", html);
    Genres = matchAll(anchorText, match(@"Genre.?:(.*?)(</div>|See more)", html));

    Directors = matchAll(creditName, match(@"Directed by</a></h5>(.*?)</table>", html));
    Writers = matchAll(creditName, match(@"Writing credits</a></h5>(.*?)</table>", html));
    Producers = matchAll(creditName, match(@"Produced by</a></h5>(.*?)</table>", html));
    Musicians = matchAll(creditName, match(@"Original Music by</a></h5>(.*?)</table>", html));
    Cinematographers = matchAll(creditName, match(@"Cinematography by</a></h5>(.*?)</table>", html));
    Editors = matchAll(creditName, match(@"Film Editing by</a></h5>(.*?)</table>", html));
    Cast = matchAll(@"<td class=""nm""><a.*?href=""/name/.*?/"".*?>(.*?)</a>", match(@"<h3>Cast</h3>(.*?)</table>", html));

    string plotText = System.Web.HttpUtility.HtmlDecode(
        System.Web.HttpUtility.HtmlDecode(match(@"Plot:</h5>.*?<div class=""info-content"">(.*?)(<a|</div)", html)));
    // HtmlDecode passes null through, so guard before dereferencing; then drop the "|" separators
    // and surrounding whitespace.
    Plot = (plotText ?? string.Empty).Replace("|", "").Trim();

    PlotKeywords = matchAll(anchorText, match(@"Plot Keywords:</h5>.*?<div class=""info-content"">(.*?)</div", html));
    ReleaseDate = match(@"Release Date:</h5>.*?<div class=""info-content"">.*?(\d{1,2} (January|February|March|April|May|June|July|August|September|October|November|December) (19|20)\d{2})", html);
    Runtime = match(@"Runtime:</h5><div class=""info-content"">(\d{1,4}) min[\s]*.*?</div>", html);
    Top250 = match(@"Top 250: #(\d{1,3})<", html);

    Oscars = match(@"Won (\d+) Oscars?\.", html);
    if (string.IsNullOrEmpty(Oscars) && "Won Oscar.".Equals(match(@"(Won Oscar\.)", html)))
    {
        // "Won Oscar." carries no number and stands for a single award.
        Oscars = "1";
    }

    Awards = match(@"(\d{1,4}) wins", html);
    Nominations = match(@"(\d{1,4}) nominations", html);
    Tagline = System.Web.HttpUtility.HtmlDecode(
        System.Web.HttpUtility.HtmlDecode(match(@"Tagline:</h5>.*?<div class=""info-content"">(.*?)(<a|</div)", html)));

    MpaaRating = match(@"MPAA</a>:</h5><div class=""info-content"">Rated (G|PG|PG-13|PG-14|R|NC-17|X) ", html);
    if (string.IsNullOrEmpty(MpaaRating))
    {
        // If MPAA rating is not set check the Certification.
        // The link target is matched loosely (any us:... value) so every US rating can be found, and
        // longer alternatives come first so that "PG-13" is not cut short to "PG".
        MpaaRating = match(@"Certification:</h5><div class=""info-content""><a href=""/search/title\?certificates=us:[^""]*"">USA:(PG-13|PG-14|NC-17|PG|G|R|X)", html);
    }

    // [\d,]* allows any number of thousands separators (e.g. 1,234,567), not just one.
    Votes = match(@">(\d[\d,]*) votes<", html);
    Languages = matchAll(anchorText, match(@"Language.?:(.*?)(</div>|>.?and )", html));
    Countries = matchAll(anchorText, match(@"Country:(.*?)(</div>|>.?and )", html));

    Poster = match(@"<div class=""photo"">.*?<a name=""poster"".*?><img.*?src=""(.*?)"".*?</div>", html);
    bool hasPoster = !string.IsNullOrEmpty(Poster)
        && Poster.IndexOf("media-imdb.com", System.StringComparison.Ordinal) > 0;
    if (hasPoster)
    {
        // Poster is rewritten first; the later replacements run on the rewritten URL, which still
        // carries a size suffix for the pattern to find.
        Poster = Regex.Replace(Poster, posterSize, "_V1._SY200.jpg");
        PosterLarge = Regex.Replace(Poster, posterSize, "_V1._SY500.jpg");
        PosterFull = Regex.Replace(Poster, posterSize, "_V1._SY0.jpg");
    }
    else
    {
        Poster = string.Empty;
        PosterLarge = string.Empty;
        PosterFull = string.Empty;
    }

    ImdbURL = "http://www.imdb.com/title/" + Id + "/";

    if (GetExtraInfo)
    {
        string plotHtml = getUrlData(imdbUrl + "plotsummary");
        Storyline = match(@"<p class=""plotpar"">(.*?)(<i>|</p>)", plotHtml);
        ReleaseDates = getReleaseDates();
        MediaImages = getMediaImages();
        RecommendedTitles = getRecommendedTitles();
    }

    return true;
}