Exemple #1
0
        /// <summary>
        /// Parses the HTML held in the <c>page</c> member (one IMDb title page) into an <see cref="IMDbTitle"/>.
        /// </summary>
        /// <remarks>
        /// The page text is snapshotted once at the start and every regex runs against that snapshot,
        /// so the result does not depend on what <c>parseSeason</c> may do to the shared <c>sB</c> member.
        /// <c>parentProgressCaller</c> is invoked with 10, 5, 5, 15, 25, 5, 10 and 5 (in that order) as the
        /// sections complete; presumably these are percentage increments - confirm against the delegate's consumer.
        /// No progress is reported when the page type does not match <paramref name="media"/>.
        /// </remarks>
        /// <param name="fields">
        /// Switches selecting what to parse, by index: 0 title, 1 year, 2 cover URL, 3 user rating,
        /// 4 directors/creators, 5 seasons (read only when <paramref name="media"/> is 1), 6 genres,
        /// 7 tagline, 8 plot, 9 actors, 10 runtime. Indexes 0-10 are read, so at least 11 entries are needed.
        /// </param>
        /// <param name="media">
        /// 0 to accept only pages whose heading does not contain "TV series" (movies),
        /// 1 to accept only pages whose heading does contain it (TV series).
        /// </param>
        /// <param name="actorN">Maximum number of actors to collect; -1 collects every actor found.</param>
        /// <param name="sSeas">Number of the first season to parse; -1 means season 1.</param>
        /// <param name="eSeas">Number of the last season to parse; forwarded unchanged to <c>parseSeason</c>.</param>
        /// <returns>
        /// The parsed title. When the page type does not match <paramref name="media"/>, a title with only
        /// <c>Media</c> set. <c>null</c> when any exception was caught and handed to <c>parentErrorCaller</c>.
        /// </returns>
        public IMDbTitle parseTitlePage(bool[] fields, int media, int actorN, int sSeas, int eSeas)
        {
            try
            {
                // Refresh the shared builder once (the original did it twice) and take a single
                // string snapshot instead of re-copying the whole page before every regex.
                sB = new StringBuilder(page);
                string html = sB.ToString();

                IMDbTitle titl = new IMDbTitle();
                titl.Media = media;

                // "<title>Name (2008...)</title>" -> group 1 is the name, group 2 the four-digit year.
                Match match = Regex.Match(html, @"<title>(.*?)\((\d{4}).*?\)</title>");
                string title = match.Groups[1].Value;
                string year  = match.Groups[2].Value;

                // The first <h1> tells whether the page describes a TV series.
                string type = Regex.Match(html, @"<h1>(.*?)</h1>").Groups[1].Value;
                bool isSeriesPage = type.Contains("TV series");
                if ((media == 0 && isSeriesPage) || (media == 1 && !isSeriesPage))
                {
                    // Wrong kind of page for the requested media: hand back the bare title.
                    return titl;
                }

                // Match the title id once and reuse it for both the URL and the ID.
                string id   = Regex.Match(html, @";id=(tt\d{7});").Groups[1].Value;
                string link = "http://www.imdb.com/title/" + id + "/";
                titl.URL = link;
                titl.ID  = id;

                if (fields[0]) //Parse the title's title
                {
                    titl.Title = cleanText(title);
                }
                if (fields[1]) //Parse the title's year
                {
                    titl.Year = cleanText(year);
                }

                parentProgressCaller.DynamicInvoke(new object[] { 10 });

                if (fields[2]) //Parse the title's cover link
                {
                    // The cover is skipped entirely when this specific image URL occurs anywhere in the page.
                    if (!html.Contains("http://ia.media-imdb.com/media/imdb/01/I/37/89/15/10.gif"))
                    {
                        titl.CoverURL = Regex.Match(html, @"<a name=""poster"".*?src=""(.*?)""").Groups[1].Value;
                    }
                }

                parentProgressCaller.DynamicInvoke(new object[] { 5 });

                if (fields[3]) //Parse the title's user rating
                {
                    // The first <b>...</b> accepted by this pattern is taken as the rating.
                    string rating = Regex.Match(html, @"<b>([0-9/\.]+)*.?</b>").Groups[1].Value;
                    titl.Rating = cleanText(rating);
                }

                parentProgressCaller.DynamicInvoke(new object[] { 5 });

                if (fields[4]) //Parse the Creators/Directors
                {
                    // Movies list directors and series list creators; only the section pattern and the
                    // options used for the inner link pattern differ between the two.
                    string       sectionPattern = null;
                    RegexOptions linkOptions    = RegexOptions.None;
                    if (media == 0) //directors
                    {
                        sectionPattern = @"<h5>Director.*?\n(<a href=.*?</a>)<br/>\n{1,2}</div>";
                    }
                    else if (media == 1) //creators
                    {
                        sectionPattern = @"<h5>Creator.*?\n(<a href=.*?</a>)<br/>\n{1,2}<a class";
                        linkOptions    = RegexOptions.Singleline;
                    }

                    if (sectionPattern != null)
                    {
                        match = Regex.Match(html, sectionPattern, RegexOptions.Singleline);
                        if (match.Success)
                        {
                            List<IMDbDirCrea> people = new List<IMDbDirCrea>();
                            // Group 1 is the 16-character relative href, group 2 the displayed name.
                            foreach (Match m in Regex.Matches(match.Groups[1].Value, @"<a href=""(.{16})"">(.*?)</a>", linkOptions))
                            {
                                IMDbDirCrea person = new IMDbDirCrea();
                                // NOTE(review): creators get Type 0 exactly like directors - confirm this is intended.
                                person.Type = 0;
                                person.Name = cleanText(m.Groups[2].Value);
                                person.URL  = "http://www.imdb.com" + m.Groups[1].Value;
                                people.Add(person);
                            }
                            titl.Directors = people;
                        }
                    }
                }

                parentProgressCaller.DynamicInvoke(new object[] { 15 });

                if (media == 1 && fields[5]) // Parse series' seasons
                {
                    match = Regex.Match(html, @"<h5>Seasons.*?(<a href=.*?)</a>\n{1,2}<a class", RegexOptions.Singleline);
                    if (match.Success)
                    {
                        string startSeas = "episodes#season-" + (sSeas == -1 ? "1" : sSeas.ToString());
                        // Seasons are parsed only if the page links to the starting season. This is a
                        // literal substring search, so the text is not compiled as a regex pattern.
                        if (match.Groups[1].Value.Contains(startSeas))
                        {
                            parseSeason(link + startSeas, eSeas, titl);
                        }
                    }
                }

                parentProgressCaller.DynamicInvoke(new object[] { 25 });

                if (fields[6]) //Parse Genres
                {
                    match = Regex.Match(html, @"<h5>Genre.*?\n(<a href=.*?)<a class=");
                    if (match.Success)
                    {
                        List<string> genres = new List<string>();
                        foreach (Match m in Regex.Matches(match.Groups[1].Value, @""">(.*?)</a>"))
                        {
                            genres.Add(cleanText(m.Groups[1].Value));
                        }
                        titl.Genres = genres;
                    }
                }

                parentProgressCaller.DynamicInvoke(new object[] { 5 });

                if (fields[7]) //Parse the Tagline
                {
                    titl.Tagline = cleanText(Regex.Match(html, @"<h5>Tagline.*?\n(.*?)\n?<").Groups[1].Value.Trim());
                }

                if (fields[8]) //Parse the Plot
                {
                    titl.Plot = cleanText(Regex.Match(html, @"<h5>Plot.*?\n(.*?)\n?<").Groups[1].Value.Trim());
                }

                if (fields[9]) //Parse the Actors
                {
                    match = Regex.Match(html, @"<h3>Cast.*?(<a href=.*?)<a class=");
                    if (match.Success)
                    {
                        List<IMDbActor> actors = new List<IMDbActor>();
                        // Groups: 1 photo src, 2 relative actor href, 3 actor name, 5 character cell content (optional).
                        MatchCollection rows = Regex.Matches(
                            match.Groups[1].Value,
                            @"<a href="".*?<img src=""(.*?)"".*?<a href=""(.*?)"">(.*?)</a>.*?(<td class=""char"">(.*?))?</td></tr>");
                        int count = 0;
                        foreach (Match m in rows)
                        {
                            // Stop as soon as the requested number of actors is reached instead of
                            // enumerating (and regex-matching) the remaining rows for nothing.
                            if (actorN != -1 && count >= actorN)
                            {
                                break;
                            }

                            IMDbActor actor = new IMDbActor();
                            actor.Name = cleanText(m.Groups[3].Value);

                            // Group.Value is never null; an unmatched group yields "".
                            string caract = m.Groups[5].Value;
                            if (caract.Contains("<a href="))
                            {
                                // The character cell holds a link: keep only the link text.
                                caract = Regex.Match(caract, @"href=""/character/.*?"">(.*?)</a>").Groups[1].Value;
                            }
                            actor.Character = cleanText(caract);

                            // This particular image URL is never stored as the actor's photo.
                            if (m.Groups[1].Value != "http://i.media-imdb.com/images/tn15/addtiny.gif")
                            {
                                actor.PhotoURL = m.Groups[1].Value;
                            }

                            actor.URL = "http://www.imdb.com" + m.Groups[2].Value;
                            actors.Add(actor);
                            count++;
                        }
                        titl.Actors = actors;
                    }
                }

                parentProgressCaller.DynamicInvoke(new object[] { 10 });

                if (fields[10]) //Parse the Runtime
                {
                    match = Regex.Match(html, @"<h5>Runtime.*?\n(\d+ min)");
                    if (match.Success)
                    {
                        titl.Runtime = match.Groups[1].Value;
                    }
                }

                parentProgressCaller.DynamicInvoke(new object[] { 5 });

                return titl;
            }
            catch (Exception ex)
            {
                // Every failure is reported through the error callback; the caller then receives null.
                parentErrorCaller.DynamicInvoke(new object[] { ex });
            }
            return null;
        }
Exemple #2
0
        /// <summary>
        /// Parses the HTML held in the <c>page</c> member (one IMDb title page) into an <see cref="IMDbTitle"/>.
        /// </summary>
        /// <remarks>
        /// The page text is snapshotted once at the start and every regex runs against that snapshot,
        /// so the result does not depend on what <c>parseSeason</c> may do to the shared <c>sB</c> member.
        /// <c>parentProgressCaller</c> is invoked with 10, 5, 5, 15, 25, 5, 10 and 5 (in that order) as the
        /// sections complete; presumably these are percentage increments - confirm against the delegate's consumer.
        /// No progress is reported when the page type does not match <paramref name="media"/>.
        /// </remarks>
        /// <param name="fields">
        /// Switches selecting what to parse, by index: 0 title, 1 year, 2 cover URL, 3 user rating,
        /// 4 directors/creators, 5 seasons (read only when <paramref name="media"/> is 1), 6 genres,
        /// 7 tagline, 8 plot, 9 actors, 10 runtime. Indexes 0-10 are read, so at least 11 entries are needed.
        /// </param>
        /// <param name="media">
        /// 0 to accept only pages whose heading does not contain "TV series" (movies),
        /// 1 to accept only pages whose heading does contain it (TV series).
        /// </param>
        /// <param name="actorN">Maximum number of actors to collect; -1 collects every actor found.</param>
        /// <param name="sSeas">Number of the first season to parse; -1 means season 1.</param>
        /// <param name="eSeas">Number of the last season to parse; forwarded unchanged to <c>parseSeason</c>.</param>
        /// <returns>
        /// The parsed title. When the page type does not match <paramref name="media"/>, a title with only
        /// <c>Media</c> set. <c>null</c> when any exception was caught and handed to <c>parentErrorCaller</c>.
        /// </returns>
        public IMDbTitle parseTitlePage(bool[] fields, int media, int actorN, int sSeas, int eSeas)
        {
            try
            {
                // Refresh the shared builder once (the original did it twice) and take a single
                // string snapshot instead of re-copying the whole page before every regex.
                sB = new StringBuilder(page);
                string html = sB.ToString();

                IMDbTitle titl = new IMDbTitle();
                titl.Media = media;

                // "<title>Name (2008...)</title>" -> group 1 is the name, group 2 the four-digit year.
                Match match = Regex.Match(html, @"<title>(.*?)\((\d{4}).*?\)</title>");
                string title = match.Groups[1].Value;
                string year = match.Groups[2].Value;

                // The first <h1> tells whether the page describes a TV series.
                string type = Regex.Match(html, @"<h1>(.*?)</h1>").Groups[1].Value;
                bool isSeriesPage = type.Contains("TV series");
                if ((media == 0 && isSeriesPage) || (media == 1 && !isSeriesPage))
                {
                    // Wrong kind of page for the requested media: hand back the bare title.
                    return titl;
                }

                // Match the title id once and reuse it for both the URL and the ID.
                string id = Regex.Match(html, @";id=(tt\d{7});").Groups[1].Value;
                string link = "http://www.imdb.com/title/" + id + "/";
                titl.URL = link;
                titl.ID = id;

                if (fields[0]) //Parse the title's title
                {
                    titl.Title = cleanText(title);
                }
                if (fields[1]) //Parse the title's year
                {
                    titl.Year = cleanText(year);
                }

                parentProgressCaller.DynamicInvoke(new object[] { 10 });

                if (fields[2]) //Parse the title's cover link
                {
                    // The cover is skipped entirely when this specific image URL occurs anywhere in the page.
                    if (!html.Contains("http://ia.media-imdb.com/media/imdb/01/I/37/89/15/10.gif"))
                    {
                        titl.CoverURL = Regex.Match(html, @"<a name=""poster"".*?src=""(.*?)""").Groups[1].Value;
                    }
                }

                parentProgressCaller.DynamicInvoke(new object[] { 5 });

                if (fields[3]) //Parse the title's user rating
                {
                    // The first <b>...</b> accepted by this pattern is taken as the rating.
                    string rating = Regex.Match(html, @"<b>([0-9/\.]+)*.?</b>").Groups[1].Value;
                    titl.Rating = cleanText(rating);
                }

                parentProgressCaller.DynamicInvoke(new object[] { 5 });

                if (fields[4]) //Parse the Creators/Directors
                {
                    // Movies list directors and series list creators; only the section pattern and the
                    // options used for the inner link pattern differ between the two.
                    string sectionPattern = null;
                    RegexOptions linkOptions = RegexOptions.None;
                    if (media == 0) //directors
                    {
                        sectionPattern = @"<h5>Director.*?\n(<a href=.*?</a>)<br/>\n{1,2}</div>";
                    }
                    else if (media == 1) //creators
                    {
                        sectionPattern = @"<h5>Creator.*?\n(<a href=.*?</a>)<br/>\n{1,2}<a class";
                        linkOptions = RegexOptions.Singleline;
                    }

                    if (sectionPattern != null)
                    {
                        match = Regex.Match(html, sectionPattern, RegexOptions.Singleline);
                        if (match.Success)
                        {
                            List<IMDbDirCrea> people = new List<IMDbDirCrea>();
                            // Group 1 is the 16-character relative href, group 2 the displayed name.
                            foreach (Match m in Regex.Matches(match.Groups[1].Value, @"<a href=""(.{16})"">(.*?)</a>", linkOptions))
                            {
                                IMDbDirCrea person = new IMDbDirCrea();
                                // NOTE(review): creators get Type 0 exactly like directors - confirm this is intended.
                                person.Type = 0;
                                person.Name = cleanText(m.Groups[2].Value);
                                person.URL = "http://www.imdb.com" + m.Groups[1].Value;
                                people.Add(person);
                            }
                            titl.Directors = people;
                        }
                    }
                }

                parentProgressCaller.DynamicInvoke(new object[] { 15 });

                if (media == 1 && fields[5]) // Parse series' seasons
                {
                    match = Regex.Match(html, @"<h5>Seasons.*?(<a href=.*?)</a>\n{1,2}<a class", RegexOptions.Singleline);
                    if (match.Success)
                    {
                        string startSeas = "episodes#season-" + (sSeas == -1 ? "1" : sSeas.ToString());
                        // Seasons are parsed only if the page links to the starting season. This is a
                        // literal substring search, so the text is not compiled as a regex pattern.
                        if (match.Groups[1].Value.Contains(startSeas))
                        {
                            parseSeason(link + startSeas, eSeas, titl);
                        }
                    }
                }

                parentProgressCaller.DynamicInvoke(new object[] { 25 });

                if (fields[6]) //Parse Genres
                {
                    match = Regex.Match(html, @"<h5>Genre.*?\n(<a href=.*?)<a class=");
                    if (match.Success)
                    {
                        List<string> genres = new List<string>();
                        foreach (Match m in Regex.Matches(match.Groups[1].Value, @""">(.*?)</a>"))
                        {
                            genres.Add(cleanText(m.Groups[1].Value));
                        }
                        titl.Genres = genres;
                    }
                }

                parentProgressCaller.DynamicInvoke(new object[] { 5 });

                if (fields[7]) //Parse the Tagline
                {
                    titl.Tagline = cleanText(Regex.Match(html, @"<h5>Tagline.*?\n(.*?)\n?<").Groups[1].Value.Trim());
                }

                if (fields[8]) //Parse the Plot
                {
                    titl.Plot = cleanText(Regex.Match(html, @"<h5>Plot.*?\n(.*?)\n?<").Groups[1].Value.Trim());
                }

                if (fields[9]) //Parse the Actors
                {
                    match = Regex.Match(html, @"<h3>Cast.*?(<a href=.*?)<a class=");
                    if (match.Success)
                    {
                        List<IMDbActor> actors = new List<IMDbActor>();
                        // Groups: 1 photo src, 2 relative actor href, 3 actor name, 5 character cell content (optional).
                        MatchCollection rows = Regex.Matches(
                            match.Groups[1].Value,
                            @"<a href="".*?<img src=""(.*?)"".*?<a href=""(.*?)"">(.*?)</a>.*?(<td class=""char"">(.*?))?</td></tr>");
                        int count = 0;
                        foreach (Match m in rows)
                        {
                            // Stop as soon as the requested number of actors is reached instead of
                            // enumerating (and regex-matching) the remaining rows for nothing.
                            if (actorN != -1 && count >= actorN)
                            {
                                break;
                            }

                            IMDbActor actor = new IMDbActor();
                            actor.Name = cleanText(m.Groups[3].Value);

                            // Group.Value is never null; an unmatched group yields "".
                            string caract = m.Groups[5].Value;
                            if (caract.Contains("<a href="))
                            {
                                // The character cell holds a link: keep only the link text.
                                caract = Regex.Match(caract, @"href=""/character/.*?"">(.*?)</a>").Groups[1].Value;
                            }
                            actor.Character = cleanText(caract);

                            // This particular image URL is never stored as the actor's photo.
                            if (m.Groups[1].Value != "http://i.media-imdb.com/images/tn15/addtiny.gif")
                            {
                                actor.PhotoURL = m.Groups[1].Value;
                            }

                            actor.URL = "http://www.imdb.com" + m.Groups[2].Value;
                            actors.Add(actor);
                            count++;
                        }
                        titl.Actors = actors;
                    }
                }

                parentProgressCaller.DynamicInvoke(new object[] { 10 });

                if (fields[10]) //Parse the Runtime
                {
                    match = Regex.Match(html, @"<h5>Runtime.*?\n(\d+ min)");
                    if (match.Success)
                    {
                        titl.Runtime = match.Groups[1].Value;
                    }
                }

                parentProgressCaller.DynamicInvoke(new object[] { 5 });

                return titl;
            }
            catch (Exception ex)
            {
                // Every failure is reported through the error callback; the caller then receives null.
                parentErrorCaller.DynamicInvoke(new object[] { ex });
            }
            return null;
        }