/// <summary>
/// Parses the HTML of a single title page (held in <c>page</c>) into an <see cref="IMDbTitle"/>.
/// </summary>
/// <param name="fields">
/// Flags selecting what to parse, by index: 0 title, 1 year, 2 cover URL, 3 user rating,
/// 4 directors/creators, 5 seasons (TV series only), 6 genres, 7 tagline, 8 plot, 9 actors, 10 runtime.
/// </param>
/// <param name="media">0 to parse a movie, 1 to parse a TV series.</param>
/// <param name="actorN">Maximum number of actors to parse, or -1 for all of them.</param>
/// <param name="sSeas">Number of the first season to parse, or -1 to start at season 1.</param>
/// <param name="eSeas">Number of the last season to parse; forwarded unchanged to <c>parseSeason</c>.</param>
/// <returns>
/// The parsed title. When the page kind does not match <paramref name="media"/>, only <c>Media</c> is set.
/// Returns <c>null</c> when an exception was raised; the exception is reported through <c>parentErrorCaller</c>.
/// </returns>
public IMDbTitle parseTitlePage(bool[] fields, int media, int actorN, int sSeas, int eSeas)
{
    try
    {
        sB = new StringBuilder(page);

        // Materialise the page text once instead of calling sB.ToString() for every lookup.
        // NOTE(review): assumes cleanText and the progress callback do not modify sB - confirm.
        string html = sB.ToString();

        IMDbTitle titl = new IMDbTitle();
        titl.Media = media;

        Match match = Regex.Match(html, @"<title>(.*?)\((\d{4}).*?\)</title>");
        string title = match.Groups[1].Value;
        string year = match.Groups[2].Value;

        // The page heading says whether this is a TV series; bail out when it is not the requested kind.
        string type = Regex.Match(html, @"<h1>(.*?)</h1>").Groups[1].Value;
        bool isSeries = type.Contains("TV series");
        if ((media == 0 && isSeries) || (media == 1 && !isSeries))
        {
            return titl;
        }

        string id = Regex.Match(html, @";id=(tt\d{7});").Groups[1].Value;
        string link = "http://www.imdb.com/title/" + id + "/";
        titl.URL = link;
        titl.ID = id;

        if (fields[0]) // Title
        {
            titl.Title = cleanText(title);
        }
        if (fields[1]) // Year
        {
            titl.Year = cleanText(year);
        }
        parentProgressCaller.DynamicInvoke(new object[] { 10 });

        if (fields[2]) // Cover link
        {
            // Pages containing this image URL get no cover URL at all.
            if (!html.Contains("http://ia.media-imdb.com/media/imdb/01/I/37/89/15/10.gif"))
            {
                titl.CoverURL = Regex.Match(html, @"<a name=""poster"".*?src=""(.*?)""").Groups[1].Value;
            }
        }
        parentProgressCaller.DynamicInvoke(new object[] { 5 });

        if (fields[3]) // User rating
        {
            // "(...)?" instead of the former "(...+)*": same matches and captures, but without the
            // nested quantifier that could backtrack exponentially on a long numeric run.
            string rating = Regex.Match(html, @"<b>([0-9/\.]+)?.?</b>").Groups[1].Value;
            titl.Rating = cleanText(rating);
        }
        parentProgressCaller.DynamicInvoke(new object[] { 5 });

        if (fields[4] && (media == 0 || media == 1)) // Directors (movie) or creators (TV series)
        {
            string sectionPattern;
            RegexOptions linkOptions;
            if (media == 0)
            {
                sectionPattern = @"<h5>Director.*?\n(<a href=.*?</a>)<br/>\n{1,2}</div>";
                linkOptions = RegexOptions.None;
            }
            else
            {
                sectionPattern = @"<h5>Creator.*?\n(<a href=.*?</a>)<br/>\n{1,2}<a class";
                linkOptions = RegexOptions.Singleline;
            }

            match = Regex.Match(html, sectionPattern, RegexOptions.Singleline);
            if (match.Success)
            {
                List<IMDbDirCrea> people = new List<IMDbDirCrea>();
                foreach (Match m in Regex.Matches(match.Groups[1].Value, @"<a href=""(.{16})"">(.*?)</a>", linkOptions))
                {
                    IMDbDirCrea person = new IMDbDirCrea();
                    // NOTE(review): Type is 0 for creators as well as directors - confirm this is intended.
                    person.Type = 0;
                    person.Name = cleanText(m.Groups[2].Value);
                    person.URL = "http://www.imdb.com" + m.Groups[1].Value;
                    people.Add(person);
                }
                // Creators are stored in the Directors property too.
                titl.Directors = people;
            }
        }
        parentProgressCaller.DynamicInvoke(new object[] { 15 });

        if (media == 1 && fields[5]) // Seasons of a TV series
        {
            match = Regex.Match(html, @"<h5>Seasons.*?(<a href=.*?)</a>\n{1,2}<a class", RegexOptions.Singleline);
            if (match.Success)
            {
                string startSeas = "episodes#season-" + (sSeas == -1 ? 1 : sSeas);

                // The anchor contains no regex metacharacters, so a plain substring test is equivalent
                // to the regex match used before.
                if (match.Groups[1].Value.Contains(startSeas))
                {
                    parseSeason(link + startSeas, eSeas, titl);

                    // parseSeason is defined elsewhere and may touch the shared buffer; re-read it so the
                    // remaining lookups see exactly what they would have seen before.
                    html = sB.ToString();
                }
            }
        }
        parentProgressCaller.DynamicInvoke(new object[] { 25 });

        if (fields[6]) // Genres
        {
            match = Regex.Match(html, @"<h5>Genre.*?\n(<a href=.*?)<a class=");
            if (match.Success)
            {
                List<string> genres = new List<string>();
                foreach (Match m in Regex.Matches(match.Groups[1].Value, @""">(.*?)</a>"))
                {
                    genres.Add(cleanText(m.Groups[1].Value));
                }
                titl.Genres = genres;
            }
        }
        parentProgressCaller.DynamicInvoke(new object[] { 5 });

        if (fields[7]) // Tagline
        {
            titl.Tagline = cleanText(Regex.Match(html, @"<h5>Tagline.*?\n(.*?)\n?<").Groups[1].Value.Trim());
        }
        if (fields[8]) // Plot
        {
            titl.Plot = cleanText(Regex.Match(html, @"<h5>Plot.*?\n(.*?)\n?<").Groups[1].Value.Trim());
        }

        if (fields[9]) // Actors
        {
            match = Regex.Match(html, @"<h3>Cast.*?(<a href=.*?)<a class=");
            if (match.Success)
            {
                List<IMDbActor> actors = new List<IMDbActor>();
                MatchCollection rows = Regex.Matches(
                    match.Groups[1].Value,
                    @"<a href="".*?<img src=""(.*?)"".*?<a href=""(.*?)"">(.*?)</a>.*?(<td class=""char"">(.*?))?</td></tr>");

                foreach (Match m in rows)
                {
                    // Stop scanning the cast table as soon as the requested number of actors is reached.
                    if (actorN != -1 && actors.Count >= actorN)
                    {
                        break;
                    }

                    IMDbActor actor = new IMDbActor();
                    actor.Name = cleanText(m.Groups[3].Value);

                    // The character cell is either plain text or a link to a character page.
                    string caract = m.Groups[5].Value;
                    if (caract.Contains("<a href="))
                    {
                        caract = Regex.Match(caract, @"href=""/character/.*?"">(.*?)</a>").Groups[1].Value;
                    }
                    actor.Character = cleanText(caract);

                    // This particular image URL is skipped, leaving PhotoURL unset.
                    if (m.Groups[1].Value != "http://i.media-imdb.com/images/tn15/addtiny.gif")
                    {
                        actor.PhotoURL = m.Groups[1].Value;
                    }
                    actor.URL = "http://www.imdb.com" + m.Groups[2].Value;
                    actors.Add(actor);
                }
                titl.Actors = actors;
            }
        }
        parentProgressCaller.DynamicInvoke(new object[] { 10 });

        if (fields[10]) // Runtime
        {
            match = Regex.Match(html, @"<h5>Runtime.*?\n(\d+ min)");
            if (match.Success)
            {
                titl.Runtime = match.Groups[1].Value;
            }
        }
        parentProgressCaller.DynamicInvoke(new object[] { 5 });

        return titl;
    }
    catch (Exception ex)
    {
        // Any failure (including a too-short fields array) is handed to the error callback.
        parentErrorCaller.DynamicInvoke(new object[] { ex });
    }
    return null;
}
/// <summary>
/// Parses the HTML of a single title page (held in <c>page</c>) into an <see cref="IMDbTitle"/>.
/// </summary>
/// <param name="fields">
/// Flags selecting what to parse, by index: 0 title, 1 year, 2 cover URL, 3 user rating,
/// 4 directors/creators, 5 seasons (TV series only), 6 genres, 7 tagline, 8 plot, 9 actors, 10 runtime.
/// </param>
/// <param name="media">0 to parse a movie, 1 to parse a TV series.</param>
/// <param name="actorN">Maximum number of actors to parse, or -1 for all of them.</param>
/// <param name="sSeas">Number of the first season to parse, or -1 to start at season 1.</param>
/// <param name="eSeas">Number of the last season to parse; forwarded unchanged to <c>parseSeason</c>.</param>
/// <returns>
/// The parsed title. When the page kind does not match <paramref name="media"/>, only <c>Media</c> is set.
/// Returns <c>null</c> when an exception was raised; the exception is reported through <c>parentErrorCaller</c>.
/// </returns>
public IMDbTitle parseTitlePage(bool[] fields, int media, int actorN, int sSeas, int eSeas)
{
    try
    {
        sB = new StringBuilder(page);

        // Materialise the page text once instead of calling sB.ToString() for every lookup.
        // NOTE(review): assumes cleanText and the progress callback do not modify sB - confirm.
        string html = sB.ToString();

        IMDbTitle titl = new IMDbTitle();
        titl.Media = media;

        Match match = Regex.Match(html, @"<title>(.*?)\((\d{4}).*?\)</title>");
        string title = match.Groups[1].Value;
        string year = match.Groups[2].Value;

        // The page heading says whether this is a TV series; bail out when it is not the requested kind.
        string type = Regex.Match(html, @"<h1>(.*?)</h1>").Groups[1].Value;
        bool isSeries = type.Contains("TV series");
        if ((media == 0 && isSeries) || (media == 1 && !isSeries))
        {
            return titl;
        }

        string id = Regex.Match(html, @";id=(tt\d{7});").Groups[1].Value;
        string link = "http://www.imdb.com/title/" + id + "/";
        titl.URL = link;
        titl.ID = id;

        if (fields[0]) // Title
        {
            titl.Title = cleanText(title);
        }
        if (fields[1]) // Year
        {
            titl.Year = cleanText(year);
        }
        parentProgressCaller.DynamicInvoke(new object[] { 10 });

        if (fields[2]) // Cover link
        {
            // Pages containing this image URL get no cover URL at all.
            if (!html.Contains("http://ia.media-imdb.com/media/imdb/01/I/37/89/15/10.gif"))
            {
                titl.CoverURL = Regex.Match(html, @"<a name=""poster"".*?src=""(.*?)""").Groups[1].Value;
            }
        }
        parentProgressCaller.DynamicInvoke(new object[] { 5 });

        if (fields[3]) // User rating
        {
            // "(...)?" instead of the former "(...+)*": same matches and captures, but without the
            // nested quantifier that could backtrack exponentially on a long numeric run.
            string rating = Regex.Match(html, @"<b>([0-9/\.]+)?.?</b>").Groups[1].Value;
            titl.Rating = cleanText(rating);
        }
        parentProgressCaller.DynamicInvoke(new object[] { 5 });

        if (fields[4] && (media == 0 || media == 1)) // Directors (movie) or creators (TV series)
        {
            string sectionPattern;
            RegexOptions linkOptions;
            if (media == 0)
            {
                sectionPattern = @"<h5>Director.*?\n(<a href=.*?</a>)<br/>\n{1,2}</div>";
                linkOptions = RegexOptions.None;
            }
            else
            {
                sectionPattern = @"<h5>Creator.*?\n(<a href=.*?</a>)<br/>\n{1,2}<a class";
                linkOptions = RegexOptions.Singleline;
            }

            match = Regex.Match(html, sectionPattern, RegexOptions.Singleline);
            if (match.Success)
            {
                List<IMDbDirCrea> people = new List<IMDbDirCrea>();
                foreach (Match m in Regex.Matches(match.Groups[1].Value, @"<a href=""(.{16})"">(.*?)</a>", linkOptions))
                {
                    IMDbDirCrea person = new IMDbDirCrea();
                    // NOTE(review): Type is 0 for creators as well as directors - confirm this is intended.
                    person.Type = 0;
                    person.Name = cleanText(m.Groups[2].Value);
                    person.URL = "http://www.imdb.com" + m.Groups[1].Value;
                    people.Add(person);
                }
                // Creators are stored in the Directors property too.
                titl.Directors = people;
            }
        }
        parentProgressCaller.DynamicInvoke(new object[] { 15 });

        if (media == 1 && fields[5]) // Seasons of a TV series
        {
            match = Regex.Match(html, @"<h5>Seasons.*?(<a href=.*?)</a>\n{1,2}<a class", RegexOptions.Singleline);
            if (match.Success)
            {
                string startSeas = "episodes#season-" + (sSeas == -1 ? 1 : sSeas);

                // The anchor contains no regex metacharacters, so a plain substring test is equivalent
                // to the regex match used before.
                if (match.Groups[1].Value.Contains(startSeas))
                {
                    parseSeason(link + startSeas, eSeas, titl);

                    // parseSeason is defined elsewhere and may touch the shared buffer; re-read it so the
                    // remaining lookups see exactly what they would have seen before.
                    html = sB.ToString();
                }
            }
        }
        parentProgressCaller.DynamicInvoke(new object[] { 25 });

        if (fields[6]) // Genres
        {
            match = Regex.Match(html, @"<h5>Genre.*?\n(<a href=.*?)<a class=");
            if (match.Success)
            {
                List<string> genres = new List<string>();
                foreach (Match m in Regex.Matches(match.Groups[1].Value, @""">(.*?)</a>"))
                {
                    genres.Add(cleanText(m.Groups[1].Value));
                }
                titl.Genres = genres;
            }
        }
        parentProgressCaller.DynamicInvoke(new object[] { 5 });

        if (fields[7]) // Tagline
        {
            titl.Tagline = cleanText(Regex.Match(html, @"<h5>Tagline.*?\n(.*?)\n?<").Groups[1].Value.Trim());
        }
        if (fields[8]) // Plot
        {
            titl.Plot = cleanText(Regex.Match(html, @"<h5>Plot.*?\n(.*?)\n?<").Groups[1].Value.Trim());
        }

        if (fields[9]) // Actors
        {
            match = Regex.Match(html, @"<h3>Cast.*?(<a href=.*?)<a class=");
            if (match.Success)
            {
                List<IMDbActor> actors = new List<IMDbActor>();
                MatchCollection rows = Regex.Matches(
                    match.Groups[1].Value,
                    @"<a href="".*?<img src=""(.*?)"".*?<a href=""(.*?)"">(.*?)</a>.*?(<td class=""char"">(.*?))?</td></tr>");

                foreach (Match m in rows)
                {
                    // Stop scanning the cast table as soon as the requested number of actors is reached.
                    if (actorN != -1 && actors.Count >= actorN)
                    {
                        break;
                    }

                    IMDbActor actor = new IMDbActor();
                    actor.Name = cleanText(m.Groups[3].Value);

                    // The character cell is either plain text or a link to a character page.
                    string caract = m.Groups[5].Value;
                    if (caract.Contains("<a href="))
                    {
                        caract = Regex.Match(caract, @"href=""/character/.*?"">(.*?)</a>").Groups[1].Value;
                    }
                    actor.Character = cleanText(caract);

                    // This particular image URL is skipped, leaving PhotoURL unset.
                    if (m.Groups[1].Value != "http://i.media-imdb.com/images/tn15/addtiny.gif")
                    {
                        actor.PhotoURL = m.Groups[1].Value;
                    }
                    actor.URL = "http://www.imdb.com" + m.Groups[2].Value;
                    actors.Add(actor);
                }
                titl.Actors = actors;
            }
        }
        parentProgressCaller.DynamicInvoke(new object[] { 10 });

        if (fields[10]) // Runtime
        {
            match = Regex.Match(html, @"<h5>Runtime.*?\n(\d+ min)");
            if (match.Success)
            {
                titl.Runtime = match.Groups[1].Value;
            }
        }
        parentProgressCaller.DynamicInvoke(new object[] { 5 });

        return titl;
    }
    catch (Exception ex)
    {
        // Any failure (including a too-short fields array) is handed to the error callback.
        parentErrorCaller.DynamicInvoke(new object[] { ex });
    }
    return null;
}