/// <summary>
        /// Removes all HTML comments from a string.
        /// </summary>
        /// <remarks>
        /// Each segment the parser extracts ahead of a <c>&lt;!--</c> marker is kept and
        /// the parser is then moved past the matching <c>--&gt;</c>. When a comment
        /// is opened but its end marker cannot be found, the original string is
        /// returned unchanged instead of a truncated one.
        /// </remarks>
        /// <param name="strString">The string.</param>
        /// <returns>Comment-free version of string.</returns>
        public static string removeComments
            (string strString)
        {
            string     strSegment = "";
            HTMLParser parser     = new HTMLParser(strString);

            // Collect the kept segments in a builder; concatenating strings in
            // the loop re-copies the whole result once per comment.
            System.Text.StringBuilder commentFree = new System.Text.StringBuilder();

            while (parser.extractTo("<!--", ref strSegment))
            {
                commentFree.Append(strSegment);
                if (!parser.skipToEndOf("-->"))
                {
                    // Unterminated comment: give the caller the input back as-is
                    return(strString);
                }
            }

            // Whatever is left after the last comment
            parser.extractToEnd(ref strSegment);
            commentFree.Append(strSegment);
            return(commentFree.ToString());
        }
        /// <summary>
        /// Removes all scripts from a string.
        /// </summary>
        /// <remarks>
        /// The opening <c>&lt;script</c> and closing <c>&lt;/script&gt;</c> markers are
        /// located case-insensitively. When a script is opened but its closing
        /// tag cannot be found, the original string is returned unchanged.
        /// </remarks>
        /// <param name="strString">The string.</param>
        /// <returns>Version of string without any scripts.</returns>
        public static string removeScripts
            (string strString)
        {
            string     strSegment = "";
            HTMLParser parser     = new HTMLParser(strString);

            // Collect the kept segments in a builder; concatenating strings in
            // the loop re-copies the whole result once per script.
            System.Text.StringBuilder sansScripts = new System.Text.StringBuilder();

            while (parser.extractToNoCase("<script", ref strSegment))
            {
                sansScripts.Append(strSegment);
                if (!parser.skipToEndOfNoCase("</script>"))
                {
                    // Unterminated script: give the caller the input back as-is.
                    // (The parser is local to this call, so there is nothing to
                    // gain from updating its content before returning.)
                    return(strString);
                }
            }

            // Whatever is left after the last script
            parser.extractToEnd(ref strSegment);
            sansScripts.Append(strSegment);
            return(sansScripts.ToString());
        }
Esempio n. 3
0
    /// <summary>
    /// Filmography and bio: downloads the person page behind <paramref name="url"/>
    /// and fills an <see cref="IMDBActor"/> with the id, name, photo, birth/death
    /// data, biography and filmography that can be located in it.
    /// </summary>
    /// <remarks>
    /// Every HTML marker is taken from <c>VdbParserStringActorDetails()</c>; the
    /// trailing comment next to each <c>vdbParserStr[n]</c> shows the marker that
    /// index is expected to hold. The parser is position based, so the order of
    /// the skip/extract calls below matters.
    /// </remarks>
    /// <param name="url">Source of the page address (<c>URL</c>) and of the fallback name (<c>Title</c>).</param>
    /// <param name="actor">Always assigned a new instance; filled as far as parsing got.</param>
    /// <returns>
    /// <c>true</c> when the page was fetched and processed; <c>false</c> when the
    /// parser strings are missing or not exactly 46 entries, the page is null or
    /// empty, or an exception occurred (it is logged).
    /// </returns>
    public bool GetActorDetails(IMDBUrl url, out IMDBActor actor)
    {
      actor = new IMDBActor();

      string[] vdbParserStr = VdbParserStringActorDetails();

      // Fixed indexes are used below, so only the expected 46 strings are usable
      if (vdbParserStr == null || vdbParserStr.Length != 46)
      {
        return false;
      }

      try
      {
        string absoluteUri;
        string strBody = GetPage(url.URL, "utf-8", out absoluteUri);

        if (string.IsNullOrEmpty(strBody))
        {
          return false;
        }

        #region Actor imdb id

        // IMDBActorID: the nine characters starting at the last "nm" in the URL,
        // with any "/" inside that window removed. The bounds are checked up
        // front rather than swallowing the exception a missing or truncated id
        // would raise; in that case the id is simply left unset.
        string actorUrl = url.URL;

        if (actorUrl != null)
        {
          int pos = actorUrl.LastIndexOf("nm", StringComparison.Ordinal);

          if (pos >= 0 && pos + 9 <= actorUrl.Length)
          {
            actor.IMDBActorID = actorUrl.Substring(pos, 9).Replace("/", string.Empty);
          }
        }

        #endregion

        HTMLParser parser = new HTMLParser(strBody);
        string strThumb = string.Empty;
        string value = string.Empty;
        string value2 = string.Empty;

        #region Actor name

        // Actor name
        if ((parser.skipToEndOf(vdbParserStr[0])) &&        // <title>
            (parser.extractTo(vdbParserStr[1], ref value))) // - IMDb</title>
        {
          value = new HTMLUtil().ConvertHTMLToAnsi(value);
          value = Util.Utils.RemoveParenthesis(value).Trim();
          actor.Name = HttpUtility.HtmlDecode(value.Trim());
        }

        // Fall back to the search result title when the page gave no name
        if (string.IsNullOrEmpty(actor.Name))
        {
          actor.Name = url.Title;
        }

        #endregion

        // Photo
        string parserTxt = parser.Content;
        string photoBlock = string.Empty;

        #region Actor photo

        if (parser.skipToStartOf(vdbParserStr[2]) &&              // <td id="img_primary"
            (parser.extractTo(vdbParserStr[3], ref photoBlock)))  // </td>
        {
          // Search only inside the photo cell, then restore the full page
          parser.Content = photoBlock;

          if ((parser.skipToEndOf(vdbParserStr[4])) &&            // <img src="
              (parser.extractTo(vdbParserStr[5], ref strThumb)))  // "
          {
            actor.ThumbnailUrl = strThumb;
          }
          parser.Content = parserTxt;
        }

        #endregion

        #region Actor birth date

        // Birth date
        if ((parser.skipToEndOf(vdbParserStr[6])) &&          // >Born:</h4>
            (parser.skipToEndOf(vdbParserStr[7])) &&          // birth_monthday=
            (parser.skipToEndOf(vdbParserStr[8])) &&          // >
            (parser.extractTo(vdbParserStr[9], ref value)) && // <
            (parser.skipToEndOf(vdbParserStr[10])) &&         // year=
            (parser.extractTo(vdbParserStr[11], ref value2))) // "

        {
          actor.DateOfBirth = value + " " + value2;
        }

        #endregion

        #region Actor death date

        // Death date
        if ((parser.skipToEndOf(vdbParserStr[12])) &&           // >Died:</h4>
            (parser.skipToEndOf(vdbParserStr[13])) &&           // death_monthday="
            (parser.skipToEndOf(vdbParserStr[14])) &&           // >
            (parser.extractTo(vdbParserStr[15], ref value)) &&  // <
            (parser.skipToEndOf(vdbParserStr[16])) &&           // death_date="
            (parser.extractTo(vdbParserStr[17], ref value2)))   // "
        {
          actor.DateOfDeath = value + " " + value2;
        }

        #endregion

        parser.resetPosition();

        #region Actor birth place

        // Birth place
        if ((parser.skipToEndOf(vdbParserStr[18])) &&         // birth_place=
            (parser.skipToEndOf(vdbParserStr[19])) &&         // >
            (parser.extractTo(vdbParserStr[20], ref value)))  // <
        {
          actor.PlaceOfBirth = HttpUtility.HtmlDecode(value);
        }

        #endregion

        #region Actor death place

        // Death place
        if ((parser.skipToEndOf(vdbParserStr[21])) &&         // death_place=
            (parser.skipToEndOf(vdbParserStr[22])) &&         // >
            (parser.extractTo(vdbParserStr[23], ref value)))  // <
        {
          actor.PlaceOfDeath = HttpUtility.HtmlDecode(value);
        }

        #endregion

        //Mini Biography
        parser.resetPosition();

        #region Actor biography

        if ((parser.skipToEndOf(vdbParserStr[24])) &&         // <td id="overview-top">
            (parser.skipToEndOf(vdbParserStr[25])) &&         // <p>
            (parser.extractTo(vdbParserStr[26], ref value)))  // See full bio</a>
        {
          value = new HTMLUtil().ConvertHTMLToAnsi(value);
          actor.MiniBiography = Util.Utils.stripHTMLtags(value);
          actor.MiniBiography = actor.MiniBiography.Replace(vdbParserStr[45], string.Empty).Trim(); // See full bio »
          actor.MiniBiography = HttpUtility.HtmlDecode(actor.MiniBiography); // Remove HTML entities like &#189;

          if (actor.MiniBiography != string.Empty)
          {
            // get complete biography from the "bio" sub page of the address
            // the first request ended up at
            string bioURL = absoluteUri;

            if (!bioURL.EndsWith(vdbParserStr[27])) // /
            {
              bioURL += vdbParserStr[28];           // /bio
            }
            else
            {
              bioURL += vdbParserStr[29];           // bio
            }

            string strBioBody = GetPage(bioURL, "utf-8", out absoluteUri);

            if (!string.IsNullOrEmpty(strBioBody))
            {
              HTMLParser parser1 = new HTMLParser(strBioBody);

              if (parser1.skipToEndOf(vdbParserStr[30]) &&        // <h5>Mini Biography</h5>
                  parser1.skipToEndOf(vdbParserStr[31]) &&        // <div class="wikipedia_bio">
                  parser1.extractTo(vdbParserStr[32], ref value)) // </div>
              {
                value = new HTMLUtil().ConvertHTMLToAnsi(value);
                // Turn the <h5> sub headings into plain-text line breaks before
                // the remaining tags are stripped
                value = Regex.Replace(value, @"</h5>\s<h5>", "\n\r");
                value = Regex.Replace(value, @"<h5>", "\n\r\n\r");
                value = Regex.Replace(value, @"</h5>", ":\n\r");
                actor.Biography = Util.Utils.stripHTMLtags(value).Trim();
                actor.Biography = HttpUtility.HtmlDecode(actor.Biography);
              }
              else
              {
                // Second layout: biography is the first paragraph after the heading
                parser1.resetPosition();

                if (parser1.skipToEndOf(vdbParserStr[33]) &&      // <h5>Mini Biography</h5>
                  parser1.extractTo(vdbParserStr[34], ref value)) // </p>
                {
                  value = new HTMLUtil().ConvertHTMLToAnsi(value);
                  actor.Biography = Util.Utils.stripHTMLtags(value).Trim();
                  actor.Biography = HttpUtility.HtmlDecode(actor.Biography);
                }
              }
            }
          }
        }

        #endregion

        // Person is movie director or an actor/actress
        bool isActorPass = false;
        bool isDirectorPass = false;
        bool isWriterPass = false;

        parser.resetPosition();

        HTMLParser dirParser = new HTMLParser(); // HTML body for Director
        HTMLParser wriParser = new HTMLParser(); // HTML body for Writers

        #region Check person role in movie (actor, director or writer)

        if ((parser.skipToEndOf(vdbParserStr[35])) && // name="Director">Director</a>
            (parser.skipToEndOf(vdbParserStr[36])))   // </div>
        {
          isDirectorPass = true;
          dirParser.Content = parser.Content;
        }

        parser.resetPosition();

        if ((parser.skipToEndOf(vdbParserStr[37])) && // name="Writer">Writer</a>
            (parser.skipToEndOf(vdbParserStr[38])))   // </div>
        {
          isWriterPass = true;
          wriParser.Content = parser.Content;
        }

        parser.resetPosition();

        // Leaves the main parser positioned after the acting heading, which is
        // where GetActorMovies starts reading for the actor pass below
        if (parser.skipToEndOf(vdbParserStr[39]) || // name="Actress">Actress</a>
          parser.skipToEndOf(vdbParserStr[40]))     // name="Actor">Actor</a>
        {
          isActorPass = true;
        }

        #endregion

        #region Get movies for every role

        // Get filmography Actor
        if (isActorPass)
        {
          GetActorMovies(actor, parser, false, false);
        }

        // Get filmography for writers
        if (isWriterPass)
        {
          parser = wriParser;
          parser.resetPosition();

          if ((parser.skipToEndOf(vdbParserStr[41])) && // name="Writer">Writer</a>
            (parser.skipToEndOf(vdbParserStr[42])))     // </div>
          {
            GetActorMovies(actor, parser, false, true);
          }
        }

        // Get filmography Director
        if (isDirectorPass)
        {
          parser = dirParser;
          parser.resetPosition();

          if (parser.skipToEndOf(vdbParserStr[43]) && // name="Director">Director</a>
              parser.skipToEndOf(vdbParserStr[44]))   // </div>
          {
            GetActorMovies(actor, parser, true, false);
          }
        }

        #endregion

        // Add filmography
        if (actor.Count > 0)
        {
          actor.SortActorMoviesByYear();
        }

        return true;
      }
      catch (Exception ex)
      {
        Log.Error("IMDB.GetActorDetails({0} exception:{1} {2} {3}", url.URL, ex.Message, ex.Source, ex.StackTrace);
      }
      return false;
    }
Esempio n. 4
0
    /// <summary>
    /// Fetches an IMDb name-search page and adds one <c>IMDBUrl</c> per person
    /// found to <c>_elements</c>.
    /// </summary>
    /// <remarks>
    /// If the page title is not the search-page title, the request is taken to
    /// have landed directly on a person page and that single page is added.
    /// Otherwise the "popular names" table and then the "exact matches" table
    /// (or, failing that, the "approx matches" table) are walked row by row.
    /// HTML markers come from <c>VdbParserStringActor()</c>; the trailing comment
    /// next to each <c>vdbParserStr[n]</c> shows the marker expected at that index.
    /// Exceptions are logged, not propagated.
    /// </remarks>
    /// <param name="strURL">Address of the search page to fetch.</param>
    private void FindIMDBActor(string strURL)
    {
      string[] vdbParserStr = VdbParserStringActor();

      // Exactly 29 strings are expected, i.e. valid indexes are 0..28
      if (vdbParserStr == null || vdbParserStr.Length != 29)
      {
        return;
      }

      try
      {
        string absoluteUri;
        // NOTE(review): an earlier comment here claimed the default IMDB encoding
        // is used because of UTF-8 problems, but the page is requested as "utf-8".
        string strBody = GetPage(strURL, "utf-8", out absoluteUri);
        string value = string.Empty;
        HTMLParser parser = new HTMLParser(strBody);

        // Invariant lower-casing keeps the comparison independent of the
        // current culture (e.g. the Turkish dotless i)
        if ((parser.skipToEndOf(vdbParserStr[0])) &&              // <title>
            (parser.extractTo(vdbParserStr[1], ref value)) &&     // </title>
            !value.ToLowerInvariant().Equals(vdbParserStr[2]))    // imdb name search
        {
          value = new HTMLUtil().ConvertHTMLToAnsi(value);
          value = Util.Utils.RemoveParenthesis(value).Trim();
          IMDBUrl oneUrl = new IMDBUrl(absoluteUri, value, "IMDB");
          _elements.Add(oneUrl);
          return;
        }

        parser.resetPosition();

        string popularBody = string.Empty;
        string exactBody = string.Empty;
        string url = string.Empty;
        string name = string.Empty;
        string role = string.Empty;

        if (parser.skipToStartOfNoCase(vdbParserStr[3]))      // Popular names
        {
          parser.skipToEndOf(vdbParserStr[4]);                // <table>
          parser.extractTo(vdbParserStr[5], ref popularBody); // </table>

          // Continue inside the extracted table only
          parser = new HTMLParser(popularBody);

          while (parser.skipToStartOf(vdbParserStr[6]))       // href="/name/
          {
            parser.skipToEndOf(vdbParserStr[7]);              // href="
            parser.extractTo(vdbParserStr[8], ref url);       // "
            parser.skipToEndOf(vdbParserStr[9]);              // Image()).src='/rg/find-name-
            parser.skipToEndOf(vdbParserStr[10]);             // ';">
            parser.extractTo(vdbParserStr[11], ref name);     // </a>
            parser.skipToEndOf(vdbParserStr[12]);             // <small>(
            parser.extractTo(vdbParserStr[13], ref role);     // ,

            if (role != string.Empty)
            {
              name += " - " + role;
            }

            name = new HTMLUtil().ConvertHTMLToAnsi(name);
            name = Util.Utils.RemoveParenthesis(name).Trim();
            IMDBUrl newUrl = new IMDBUrl("http://www.imdb.com" + url, name, "IMDB");
            _elements.Add(newUrl);
            parser.skipToEndOf(vdbParserStr[14]); // </tr>
          }
        }

        // Back to the full page for the second table
        parser = new HTMLParser(strBody);

        if (parser.skipToStartOfNoCase(vdbParserStr[15]))       // Exact Matches
        {
          parser.skipToEndOf(vdbParserStr[16]);                 // <table>
          parser.extractTo(vdbParserStr[17], ref exactBody);    // </table>
        }
        else if (parser.skipToStartOfNoCase(vdbParserStr[18]))  // Approx Matches
        {
          parser.skipToEndOf(vdbParserStr[19]);                 // <table>
          parser.extractTo(vdbParserStr[20], ref exactBody);    // </table>
        }
        else
        {
          return;
        }

        parser = new HTMLParser(exactBody);
        url = string.Empty;
        name = string.Empty;
        role = string.Empty;

        while (parser.skipToStartOf(vdbParserStr[21]))  // href="/name/
        {
          parser.skipToEndOf(vdbParserStr[22]);         // href="
          parser.extractTo(vdbParserStr[23], ref url);  // "
          parser.skipToEndOf(vdbParserStr[24]);         // Image()).src='/rg/find-name-
          parser.skipToEndOf(vdbParserStr[25]);         // ';">
          parser.extractTo(vdbParserStr[26], ref name); // </a>
          parser.skipToEndOf(vdbParserStr[27]);         // <small>(
          parser.extractTo(vdbParserStr[28], ref role); // ,

          if (role != string.Empty)
          {
            name += " - " + role;
          }

          name = new HTMLUtil().ConvertHTMLToAnsi(name);
          name = Util.Utils.RemoveParenthesis(name).Trim();
          IMDBUrl newUrl = new IMDBUrl("http://www.imdb.com" + url, name, "IMDB");
          _elements.Add(newUrl);

          // This used to read index 29, which cannot exist in the 29-entry array
          // validated above: the resulting IndexOutOfRangeException ended the
          // loop after the first match. The row terminator at index 14 is the
          // one the popular-names table already uses.
          // NOTE(review): confirm both tables share the same row terminator.
          parser.skipToEndOf(vdbParserStr[14]); // </tr>
        }
      }
      catch (Exception ex)
      {
        Log.Error("exception for imdb lookup of {0} err:{1} stack:{2}", strURL, ex.Message, ex.StackTrace);
      }
    }
Esempio n. 5
0
    /// <summary>
    /// Walks the filmography block the parser is currently positioned at and
    /// records every credit on <paramref name="actor"/>.
    /// </summary>
    /// <remarks>
    /// HTML markers come from <c>VdbParserStringActorMovies()</c> (exactly 19 are
    /// required); the trailing comment next to each <c>markers[n]</c> shows the
    /// marker expected at that index. For director and writer passes a film the
    /// actor already has (same IMDb id) is not added again; the new role is put
    /// in front of the existing one instead.
    /// </remarks>
    /// <param name="actor">Receives the credits.</param>
    /// <param name="parser">Parser positioned at the start of the filmography to read.</param>
    /// <param name="director"><c>true</c> when the block lists directing credits.</param>
    /// <param name="writer"><c>true</c> when the block lists writing credits.</param>
    private void GetActorMovies(IMDBActor actor, HTMLParser parser, bool director, bool writer)
    {
      string[] markers = VdbParserStringActorMovies();

      if (markers == null || markers.Length != 19)
      {
        return;
      }

      // Restrict the parser to the films-and-roles block when its end is found
      string creditsBlock = string.Empty;

      if (parser.extractTo(markers[0], ref creditsBlock)) // <div id
      {
        parser.Content = creditsBlock;
      }

      // One iteration per film: year, title, imdb id and role
      while (parser.skipToStartOf(markers[1])) // <span class="year_column"
      {
        string entryHtml = string.Empty;

        if (!parser.extractTo(markers[2], ref entryHtml)) // <div class
        {
          continue;
        }

        HTMLParser entryParser = new HTMLParser(entryHtml + markers[3]); // </li>
        string movieId = string.Empty;
        string movieTitle = string.Empty;
        string yearText = string.Empty;
        string creditRole = string.Empty;

        // IMDb id
        entryParser.skipToEndOf(markers[4]);            // title/
        entryParser.extractTo(markers[5], ref movieId); // /

        // Title
        entryParser.resetPosition();
        entryParser.skipToEndOf(markers[6]);               // <a
        entryParser.skipToEndOf(markers[7]);               // >
        entryParser.extractTo(markers[8], ref movieTitle); // <br/>
        movieTitle = CleanCrlf(movieTitle);

        if (SkipNoMovies(movieTitle))
        {
          continue;
        }

        // Year: try the first year marker, then start over with the second
        entryParser.resetPosition();
        bool firstYearMarkerHit = entryParser.skipToStartOf(markers[9]) && // year_column">20
                                  entryParser.skipToEndOf(markers[10]);    // >

        if (firstYearMarkerHit)
        {
          entryParser.extractTo(markers[11], ref yearText); // <
        }
        else
        {
          entryParser.resetPosition();

          if (entryParser.skipToStartOf(markers[12]) && // year_column">19
              entryParser.skipToEndOf(markers[13]))     // >
          {
            entryParser.extractTo(markers[14], ref yearText); // <
          }
        }

        yearText = yearText.Trim();

        if (yearText.Length > 4)
        {
          yearText = yearText.Substring(0, 4);
        }

        if (director)
        {
          creditRole = GUILocalizeStrings.Get(199).Replace(":", string.Empty);
        }
        else if (writer)
        {
          if (movieTitle != null)
          {
            // For titles like "(movie type)(role)" only the last group is the role
            MatchCollection groups = Regex.Matches(movieTitle, @"\([^)]+\)");

            if (groups.Count == 0)
            {
              // No parenthesised text in the title: this entry is skipped
              continue;
            }

            string writerRole = groups[groups.Count - 1].Value;

            if (string.IsNullOrEmpty(writerRole))
            {
              creditRole = GUILocalizeStrings.Get(200).Replace(":", string.Empty);
            }
            else
            {
              // Drop the parentheses, keep what was inside them
              writerRole = Regex.Replace(writerRole, "([(]|[)])", string.Empty);
              creditRole = GUILocalizeStrings.Get(200) + " " + writerRole;
            }
          }
        }
        else if (entryParser.skipToEndOf(markers[15])) // <br/>
        {
          // Acting role, case 1: no character link
          entryParser.extractTo(markers[16], ref creditRole); // <
          creditRole = CleanCrlf(creditRole);

          if (creditRole == string.Empty)
          {
            // Acting role, case 2: with character link
            entryParser.resetPosition();
            entryParser.skipToEndOf(markers[17]);               // <br/>
            entryParser.extractTo(markers[18], ref creditRole); // </a>
            creditRole = CleanCrlf(creditRole);
          }
        }

        // Entries without a usable year are dated three years ahead of today
        // (they are almost always future projects)
        int releaseYear = 0;

        if (!Int32.TryParse(yearText, out releaseYear))
        {
          releaseYear = DateTime.Today.Year + 3;
        }

        movieTitle = Util.Utils.RemoveParenthesis(movieTitle).Trim();
        creditRole = Util.Utils.RemoveParenthesis(creditRole).Trim();

        IMDBActor.IMDBActorMovie credit = new IMDBActor.IMDBActorMovie();
        credit.MovieTitle = movieTitle;
        credit.Role = creditRole;
        credit.Year = releaseYear;
        credit.MovieImdbID = movieId;

        // Writer/director credit for a film that is already listed: prepend the
        // role to the existing entry instead of adding a duplicate. One pass is
        // made for each of the two flags that is set.
        bool alreadyListed = false;
        int mergePasses = (writer ? 1 : 0) + (director ? 1 : 0);

        for (int pass = 0; pass < mergePasses; pass++)
        {
          for (int index = 0; index < actor.Count; index++)
          {
            if (actor[index].MovieImdbID != movieId)
            {
              continue;
            }

            if (actor[index].Role != string.Empty)
            {
              actor[index].Role = creditRole + ", " + actor[index].Role;
            }
            else
            {
              actor[index].Role = creditRole;
            }

            alreadyListed = true;
            break;
          }
        }

        if (!alreadyListed)
        {
          actor.Add(credit);
        }
      }
    }
Esempio n. 6
0
    /// <summary>
    /// Downloads the person page behind <paramref name="url"/> and fills an
    /// <see cref="IMDBActor"/> with the id, name, photo, birth/death data,
    /// biography and either the directing or the acting filmography.
    /// </summary>
    /// <remarks>
    /// Changed - parsing all actor DB fields through HTML (IMDB changed HTML code).
    /// The parser is position based, so the order of the skip/extract calls
    /// below matters.
    /// </remarks>
    /// <param name="url">Source of the page address (<c>URL</c>) and of the fallback name (<c>Title</c>).</param>
    /// <param name="director"><c>true</c> to read the director filmography, <c>false</c> to read the actor/actress one.</param>
    /// <param name="actor">Always assigned a new instance; filled as far as parsing got.</param>
    /// <returns>
    /// <c>true</c> when the page was fetched and processed; <c>false</c> when the
    /// page is null or empty, or an exception occurred (it is logged).
    /// </returns>
    public bool GetActorDetails(IMDBUrl url, bool director, out IMDBActor actor)
    {
      actor = new IMDBActor();
      try
      {
        string absoluteUri;
        string strBody = GetPage(url.URL, "utf-8", out absoluteUri);
        if (string.IsNullOrEmpty(strBody))
        {
          return false;
        }

        // IMDBActorID: the nine characters starting at the last "nm" in the URL,
        // with any "/" inside that window removed. The bounds are checked up
        // front rather than swallowing the exception a missing or truncated id
        // would raise; in that case the id is simply left unset.
        string actorUrl = url.URL;
        if (actorUrl != null)
        {
          int pos = actorUrl.LastIndexOf("nm", StringComparison.Ordinal);
          if (pos >= 0 && pos + 9 <= actorUrl.Length)
          {
            actor.IMDBActorID = actorUrl.Substring(pos, 9).Replace("/", string.Empty);
          }
        }

        HTMLParser parser = new HTMLParser(strBody);
        string strThumb = string.Empty;
        string value = string.Empty;
        string value2 = string.Empty;
        // Actor name
        if ((parser.skipToEndOf("<title>")) &&
            (parser.extractTo("- IMDb</title>", ref value)))
        {
          value = new HTMLUtil().ConvertHTMLToAnsi(value);
          value = Util.Utils.RemoveParenthesis(value).Trim();
          actor.Name = HttpUtility.HtmlDecode(value.Trim());
        }
        // Fall back to the search result title when the page gave no name
        if (string.IsNullOrEmpty(actor.Name))
        {
          actor.Name = url.Title;
        }
        // Photo
        string parserTxt = parser.Content;
        string photoBlock = string.Empty;
        if (parser.skipToStartOf("<td id=\"img_primary\"") &&
            (parser.extractTo("</td>", ref photoBlock)))
        {
          // Search only inside the photo cell, then restore the full page
          parser.Content = photoBlock;
          if ((parser.skipToEndOf("<img src=\"")) &&
              (parser.extractTo("\"", ref strThumb)))
          {
            actor.ThumbnailUrl = strThumb;
          }
          parser.Content = parserTxt;
        }
        // Birth date
        if ((parser.skipToEndOf("Born:")) &&
            (parser.skipToEndOf("birth_monthday=")) &&
            (parser.skipToEndOf(">")) &&
            (parser.extractTo("<", ref value)) &&
            (parser.skipToEndOf("year=")) &&
            (parser.extractTo("\"", ref value2)))

        {
          actor.DateOfBirth = value + " " + value2;
        }
        // Death date: appended to the birth date as "<birth> ~ <death>"
        if ((parser.skipToEndOf(">Died:</h4>")) &&
            (parser.skipToEndOf("deaths\">")) &&
            (parser.extractTo("<", ref value)) &&
            (parser.skipToEndOf("death_date=")) &&
            (parser.extractTo("\"", ref value2)))
        {
          if (actor.DateOfBirth == string.Empty)
          {
            actor.DateOfBirth = "?";
          }
          actor.DateOfBirth += " ~ " + value + " " + value2;
        }

        parser.resetPosition();
        // Birth place
        if ((parser.skipToEndOf("birth_place=")) &&
            (parser.skipToEndOf(">")) &&
            (parser.extractTo("<", ref value)))
        {
          actor.PlaceOfBirth = HttpUtility.HtmlDecode(value);
        }
        //Mini Biography
        parser.resetPosition();
        if ((parser.skipToEndOf("<td id=\"overview-top\">")) &&
            (parser.skipToEndOf("<p>")) &&
            (parser.extractTo("See full bio</a>", ref value)))
        {
          value = new HTMLUtil().ConvertHTMLToAnsi(value);
          actor.MiniBiography = Util.Utils.stripHTMLtags(value);
          actor.MiniBiography = actor.MiniBiography.Replace("See full bio »", string.Empty).Trim();
          actor.MiniBiography = HttpUtility.HtmlDecode(actor.MiniBiography); // Remove HTML entities like &#189;
          if (actor.MiniBiography != string.Empty)
          {
            // get complete biography from the "bio" sub page of the address
            // the first request ended up at
            string bioURL = absoluteUri;
            if (!bioURL.EndsWith("/"))
            {
              bioURL += "/bio";
            }
            else
            {
              bioURL += "bio";
            }
            string strBioBody = GetPage(bioURL, "utf-8", out absoluteUri);
            if (!string.IsNullOrEmpty(strBioBody))
            {
              HTMLParser parser1 = new HTMLParser(strBioBody);
              if (parser1.skipToEndOf("<h5>Mini Biography</h5>") &&
                  parser1.extractTo("</p>", ref value))
              {
                value = new HTMLUtil().ConvertHTMLToAnsi(value);
                actor.Biography = Util.Utils.stripHTMLtags(value).Trim();
                actor.Biography = HttpUtility.HtmlDecode(actor.Biography); // Remove HTML entities like &#189;
              }
            }
          }
        }
        // Person is movie director or an actor/actress
        bool isActorPass = false;
        bool isDirectorPass = false;
        parser.resetPosition();

        if (director)
        {
          if ((parser.skipToEndOf("name=\"Director\">Director</a>")) &&
              (parser.skipToEndOf("</div>")))
          {
            isDirectorPass = true;
          }
        }
        else
        {
          if (parser.skipToEndOf("name=\"Actress\">Actress</a>") || parser.skipToEndOf("name=\"Actor\">Actor</a>"))
          {
            isActorPass = true;
          }
        }
        // Get filmography
        if (isDirectorPass || isActorPass)
        {
          string movies = string.Empty;
          // Get films and roles block
          if (parser.extractTo("<div id", ref movies))
          {
            parser.Content = movies;
          }
          // Parse block for every film and get year, title and its imdbID and role
          while (parser.skipToStartOf("<span class=\"year_column\""))
          {
            string movie = string.Empty;
            if (parser.extractTo("<div class", ref movie))
            {
              movie += "</li>";
              HTMLParser movieParser = new HTMLParser(movie);
              string title = string.Empty;
              string strYear = string.Empty;
              string role = string.Empty;
              string imdbID = string.Empty;
              // IMDBid
              movieParser.skipToEndOf("title/");
              movieParser.extractTo("/", ref imdbID);
              // Title
              movieParser.resetPosition();
              movieParser.skipToEndOf("<a");
              movieParser.skipToEndOf(">");
              movieParser.extractTo("<br/>", ref title);
              title = Util.Utils.stripHTMLtags(title);
              title = title.Replace("\n", " ").Replace("\r", string.Empty);
              title = HttpUtility.HtmlDecode(title.Trim()); // Remove HTML entities like &#189;
              // Year
              movieParser.resetPosition();
              if (movieParser.skipToStartOf(">20") &&
                  movieParser.skipToEndOf(">"))
              {
                movieParser.extractTo("<", ref strYear);
              }
              else if (movieParser.skipToStartOf(">19") &&
                       movieParser.skipToEndOf(">"))
              {
                movieParser.extractTo("<", ref strYear);
              }
              // Roles
              if ((director == false) && (movieParser.skipToEndOf("<br/>"))) // Role case 1, no character link
              {
                movieParser.extractTo("<", ref role);
                role = Util.Utils.stripHTMLtags(role).Trim();
                role = HttpUtility.HtmlDecode(role.Replace("\n", " ")
                                                .Replace("\r", string.Empty).Trim());
                if (role == string.Empty) // Role case 2, with character link
                {
                  movieParser.resetPosition();
                  movieParser.skipToEndOf("<br/>");
                  movieParser.extractTo("</a>", ref role);
                  role = Util.Utils.stripHTMLtags(role).Trim();
                  role = HttpUtility.HtmlDecode(role.Replace("\n", " ")
                                                  .Replace("\r", string.Empty).Trim());
                }
              }
              else
              {
                // Just director
                if (director)
                {
                  role = "Director";
                }
              }

              // The year is the first four characters of the extracted text.
              // Missing, short or non-numeric text falls back to 1900 without
              // raising (and catching) an exception for every such film.
              int year = 0;
              if (strYear == null || strYear.Length < 4 ||
                  !Int32.TryParse(strYear.Substring(0, 4), out year))
              {
                year = 1900;
              }
              IMDBActor.IMDBActorMovie actorMovie = new IMDBActor.IMDBActorMovie();
              actorMovie.MovieTitle = title;
              actorMovie.Role = role;
              actorMovie.Year = year;
              actorMovie.imdbID = imdbID;
              actor.Add(actorMovie);
            }
          }
        }
        return true;
      }
      catch (Exception ex)
      {
        Log.Error("IMDB.GetActorDetails({0} exception:{1} {2} {3}", url.URL, ex.Message, ex.Source, ex.StackTrace);
      }
      return false;
    }
Esempio n. 7
0
 /// <summary>
 /// Fetches an IMDb name-search page and adds one <c>IMDBUrl</c> per person
 /// found to <c>_elements</c>.
 /// </summary>
 /// <remarks>
 /// Changed - IMDB changed HTML code.
 /// If the page title is anything other than "imdb name search", the request
 /// is taken to have landed directly on a person page and only that page is
 /// added. Otherwise the first person link after every "Exact Matches"
 /// heading, and then after every "Popular Names" heading, is added.
 /// Exceptions are logged, not propagated.
 /// </remarks>
 /// <param name="strURL">Address of the search page to fetch.</param>
 private void FindIMDBActor(string strURL)
 {
   try
   {
     string absoluteUri;
     string pageHtml = GetPage(strURL, "utf-8", out absoluteUri);
     HTMLParser parser = new HTMLParser(pageHtml);
     string pageTitle = string.Empty;

     bool titleFound = parser.skipToEndOf("<title>") &&
                       parser.extractTo("</title>", ref pageTitle);

     if (titleFound && !pageTitle.ToLower().Equals("imdb name search"))
     {
       // Redirected straight to a single person page
       pageTitle = new HTMLUtil().ConvertHTMLToAnsi(pageTitle);
       pageTitle = Util.Utils.RemoveParenthesis(pageTitle).Trim();
       _elements.Add(new IMDBUrl(absoluteUri, pageTitle, "IMDB"));
       return;
     }

     // Exact hits first, then maybe more actors with a similar name; every
     // section is searched from the top of the page.
     string[] sectionHeadings = { "Exact Matches", "Popular Names" };

     foreach (string heading in sectionHeadings)
     {
       parser.resetPosition();

       while (parser.skipToEndOfNoCase(heading))
       {
         string personUrl = string.Empty;
         string personName = string.Empty;

         // Expected link shape:
         //<a href="/name/nm0000246/" onclick="set_args('nm0000246', 1)">Bruce Willis</a>
         if (!parser.skipToStartOf("href=\"/name/"))
         {
           parser.skipToEndOfNoCase("</a>");
           continue;
         }

         parser.skipToEndOf("href=\"");
         parser.extractTo("\"", ref personUrl);
         parser.skipToEndOf("<br><a");
         parser.skipToEndOf(">");
         parser.extractTo("</a>", ref personName);

         personName = new HTMLUtil().ConvertHTMLToAnsi(personName);
         personName = Util.Utils.RemoveParenthesis(personName).Trim();
         _elements.Add(new IMDBUrl("http://akas.imdb.com" + personUrl, personName, "IMDB"));
       }
     }
   }
   catch (Exception ex)
   {
     Log.Error("exception for imdb lookup of {0} err:{1} stack:{2}", strURL, ex.Message, ex.StackTrace);
   }
 }
Esempio n. 8
0
    /////////////////
    // Static methods

    /// <summary>
    /// Retrieves the collection of HTML links in a string.
    /// </summary>
    /// <remarks>
    /// Comments and scripts are stripped first and single quotes are turned into double quotes,
    /// so only <c>href="..."</c> and <c>src="..."</c> attributes are recognised. Targets that do
    /// not start with <c>http://</c>, <c>https://</c> or <c>ftp://</c> are resolved against the
    /// root url on a best-guess basis. <c>mailto:</c> targets are left out of
    /// <paramref name="documents" />. Links already present in a list are not added again.
    /// </remarks>
    /// <param name="strString">The string.</param>
    /// <param name="strRootUrl">Root url (may be null).</param>
    /// <param name="documents">Collection of document link strings; created when null.</param>
    /// <param name="images">Collection of image link strings; created when null.</param>
    public static void getLinks
      (string strString,
       string strRootUrl,
       ref ArrayList documents,
       ref ArrayList images)
    {
      // The lists are passed by reference, so a missing list can be supplied here instead of
      // failing on the first link that is found.
      if (documents == null)
        documents = new ArrayList();
      if (images == null)
        images = new ArrayList();

      // Remove comments and JavaScript and fix links
      strString = HTMLParser.removeComments(strString);
      strString = HTMLParser.removeScripts(strString);
      HTMLParser parser = new HTMLParser(strString);
      parser.replaceEvery("\'", "\"");

      // Set root url (always ends with a slash when it is not empty)
      string rootUrl = "";
      if (strRootUrl != null)
        rootUrl = strRootUrl.Trim();
      if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/", StringComparison.Ordinal))
        rootUrl += "/";

      // Extract HREF targets
      string strUrl = "";
      parser.resetPosition();
      while (parser.skipToEndOfNoCase("href=\""))
      {
        if (!parser.extractTo("\"", ref strUrl))
          continue;

        strUrl = strUrl.Trim();
        if (strUrl.Length == 0)
          continue;

        // E-mail links are not documents
        if (strUrl.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1)
          continue;

        strUrl = makeAbsoluteUrl(strUrl, rootUrl);

        // Add url to document list if not already present
        if (!documents.Contains(strUrl))
          documents.Add(strUrl);
      }

      // Extract SRC targets
      parser.resetPosition();
      while (parser.skipToEndOfNoCase("src=\""))
      {
        if (!parser.extractTo("\"", ref strUrl))
          continue;

        strUrl = strUrl.Trim();
        if (strUrl.Length == 0)
          continue;

        strUrl = makeAbsoluteUrl(strUrl, rootUrl);

        // Add url to images list if not already present
        if (!images.Contains(strUrl))
          images.Add(strUrl);
      }
    }

    /// <summary>
    /// Returns a fully qualified version of a link target (best guess).
    /// </summary>
    /// <param name="strUrl">Trimmed, non-empty link target.</param>
    /// <param name="rootUrl">Root url, possibly empty.</param>
    /// <returns>
    /// The target itself when it already starts with http://, https:// or ftp://; otherwise the
    /// root url with its path replaced by the target, or "http://" + target when that fails.
    /// </returns>
    private static string makeAbsoluteUrl(string strUrl, string rootUrl)
    {
      // Schemes are matched case-insensitively, and https:// counts as absolute so that
      // secure links are not rewritten as a path below the root url.
      if (strUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
          strUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
          strUrl.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase))
        return strUrl;

      try
      {
        UriBuilder uriBuilder = new UriBuilder(rootUrl);
        uriBuilder.Path = strUrl;
        return uriBuilder.Uri.ToString();
      }
      catch (Exception)
      {
        // Deliberate best effort: the root url could not be used, so assume a bare host/path.
        return "http://" + strUrl;
      }
    }
Esempio n. 9
0
    /// <summary>
    /// Removes all scripts from a string.
    /// </summary>
    /// <remarks>
    /// Everything from each <c>&lt;script</c> up to and including the next <c>&lt;/script&gt;</c>
    /// is dropped; both markers are located with the parser's "NoCase" search methods. When an
    /// opening marker has no closing marker, the input is returned unchanged.
    /// </remarks>
    /// <param name="strString">The string.</param>
    /// <returns>Version of string without any scripts.</returns>
    public static string removeScripts
      (string strString)
    {
      // A builder keeps the accumulation linear; repeated string concatenation recopies the
      // whole result for every script found.
      System.Text.StringBuilder scriptFree = new System.Text.StringBuilder();
      string strSegment = "";
      HTMLParser parser = new HTMLParser(strString);

      while (parser.extractToNoCase("<script", ref strSegment))
      {
        scriptFree.Append(strSegment);

        // Unterminated script: give up and hand back the original text.
        if (!parser.skipToEndOfNoCase("</script>"))
          return strString;
      }

      // Text after the last script
      parser.extractToEnd(ref strSegment);
      scriptFree.Append(strSegment);
      return scriptFree.ToString();
    }
Esempio n. 10
0
    /// <summary>
    /// Returns a version of a string without any HTML tags.
    /// </summary>
    /// <remarks>
    /// Steps, in order: decode a fixed set of entities, put a space in front of every closing
    /// tag, turn <c>&lt;br&gt;</c> / <c>&lt;p&gt;</c> variants into a space, strip everything
    /// between <c>&lt;</c> and <c>&gt;</c>, collapse double spaces and trim. An unterminated
    /// <c>&lt;</c> is dropped and the text after it is kept.
    /// NOTE(review): the accented-letter entities map to empty strings, so those letters are
    /// removed rather than decoded. The replacement characters may have been lost in an
    /// encoding conversion - confirm before changing.
    /// </remarks>
    /// <param name="strString">The string.</param>
    /// <returns>Version of string without HTML tags.</returns>
    public static string removeHtml
      (string strString)
    {
      HTMLParser parser = new HTMLParser(strString);

      // Do some common case-sensitive replacements. A fixed array (instead of a Hashtable)
      // makes the order deterministic.
      string[,] entities =
      {
        { "&nbsp;", " " },
        { "&aring;", "" },
        { "&auml;", "" },
        { "&eacute;", "" },
        { "&iacute;", "" },
        { "&igrave;", "" },
        { "&ograve;", "" },
        { "&ouml;", "" },
        { "&quot;", "\"" },
        { "&szlig;", "" }
      };
      for (int i = 0; i < entities.GetLength(0); i++)
      {
        string key = entities[i, 0];
        if (parser.Content.IndexOf(key, StringComparison.Ordinal) != -1)
          parser.replaceEveryExact(key, entities[i, 1]);
      }

      // Do some sequential replacements
      parser.replaceEveryExact("&#0", "&#");
      parser.replaceEveryExact("&#39;", "'");

      // "&amp;" is decoded after every other entity so that escaped text such as
      // "&amp;nbsp;" is decoded exactly once.
      if (parser.Content.IndexOf("&amp;", StringComparison.Ordinal) != -1)
        parser.replaceEveryExact("&amp;", "&");

      // Net effect of the next two calls: a space in front of every closing tag
      parser.replaceEveryExact("</", " <~/");
      parser.replaceEveryExact("<~/", "</");

      // Case-insensitive replacements. The presence check looks at the current content and
      // ignores case; checking the original input case-sensitively skipped tags like <BR>.
      string[] breakTags = { "<br>", "<br />", "<br/>", "<p>", "<p />", "<p/>" };
      foreach (string tag in breakTags)
      {
        if (parser.Content.IndexOf(tag, StringComparison.OrdinalIgnoreCase) != -1)
          parser.replaceEvery(tag, " ");
      }
      string content = parser.Content;

      // Remove all tags
      System.Text.StringBuilder clean = new System.Text.StringBuilder(content.Length);
      int nIndex = 0;
      int nStartTag;
      while ((nStartTag = content.IndexOf('<', nIndex)) != -1)
      {
        // Extract to start of tag
        clean.Append(content, nIndex, nStartTag - nIndex);
        nIndex = nStartTag + 1;

        // Skip over tag
        int nEndTag = content.IndexOf('>', nIndex);
        if (nEndTag == -1)
          break;
        nIndex = nEndTag + 1;
      }

      // Gather remaining text
      if (nIndex < content.Length)
        clean.Append(content, nIndex, content.Length - nIndex);

      // Finally, reduce spaces
      parser.Content = clean.ToString();
      parser.replaceEveryExact("  ", " ");

      // Return the de-HTMLized string
      return parser.Content.Trim();
    }
Esempio n. 11
0
    /// <summary>
    /// Removes all HTML comments from a string.
    /// </summary>
    /// <remarks>
    /// Everything from each <c>&lt;!--</c> up to and including the next <c>--&gt;</c> is dropped.
    /// When a comment is opened but never closed, the input is returned unchanged.
    /// </remarks>
    /// <param name="strString">The string.</param>
    /// <returns>Comment-free version of string.</returns>
    public static string removeComments
      (string strString)
    {
      // A builder keeps the accumulation linear; repeated string concatenation recopies the
      // whole result for every comment found.
      System.Text.StringBuilder commentFree = new System.Text.StringBuilder();
      string strSegment = "";
      HTMLParser parser = new HTMLParser(strString);

      while (parser.extractTo("<!--", ref strSegment))
      {
        commentFree.Append(strSegment);

        // Unterminated comment: give up and hand back the original text.
        if (!parser.skipToEndOf("-->"))
          return strString;
      }

      // Text after the last comment
      parser.extractToEnd(ref strSegment);
      commentFree.Append(strSegment);
      return commentFree.ToString();
    }
Esempio n. 12
0
        /////////////////
        // Static methods

        /// <summary>
        /// Retrieves the collection of HTML links in a string.
        /// </summary>
        /// <remarks>
        /// Comments and scripts are stripped first and single quotes are turned into double
        /// quotes, so only <c>href="..."</c> and <c>src="..."</c> attributes are recognised.
        /// Targets that do not start with <c>http://</c>, <c>https://</c> or <c>ftp://</c> are
        /// resolved against the root url on a best-guess basis. <c>mailto:</c> targets are left
        /// out of <paramref name="documents" />. Links already present in a list are not added
        /// again.
        /// </remarks>
        /// <param name="strString">The string.</param>
        /// <param name="strRootUrl">Root url (may be null).</param>
        /// <param name="documents">Collection of document link strings; created when null.</param>
        /// <param name="images">Collection of image link strings; created when null.</param>
        public static void getLinks
            (string strString,
            string strRootUrl,
            ref ArrayList documents,
            ref ArrayList images)
        {
            // The lists are passed by reference, so a missing list can be supplied here
            // instead of failing on the first link that is found.
            if (documents == null)
            {
                documents = new ArrayList();
            }
            if (images == null)
            {
                images = new ArrayList();
            }

            // Remove comments and JavaScript and fix links
            strString = HTMLParser.removeComments(strString);
            strString = HTMLParser.removeScripts(strString);
            HTMLParser parser = new HTMLParser(strString);

            parser.replaceEvery("\'", "\"");

            // Set root url (always ends with a slash when it is not empty)
            string rootUrl = "";

            if (strRootUrl != null)
            {
                rootUrl = strRootUrl.Trim();
            }
            if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/", StringComparison.Ordinal))
            {
                rootUrl += "/";
            }

            // Extract HREF targets
            string strUrl = "";

            parser.resetPosition();
            while (parser.skipToEndOfNoCase("href=\""))
            {
                if (!parser.extractTo("\"", ref strUrl))
                {
                    continue;
                }

                strUrl = strUrl.Trim();
                if (strUrl.Length == 0)
                {
                    continue;
                }

                // E-mail links are not documents
                if (strUrl.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1)
                {
                    continue;
                }

                strUrl = makeAbsoluteUrl(strUrl, rootUrl);

                // Add url to document list if not already present
                if (!documents.Contains(strUrl))
                {
                    documents.Add(strUrl);
                }
            }

            // Extract SRC targets
            parser.resetPosition();
            while (parser.skipToEndOfNoCase("src=\""))
            {
                if (!parser.extractTo("\"", ref strUrl))
                {
                    continue;
                }

                strUrl = strUrl.Trim();
                if (strUrl.Length == 0)
                {
                    continue;
                }

                strUrl = makeAbsoluteUrl(strUrl, rootUrl);

                // Add url to images list if not already present
                if (!images.Contains(strUrl))
                {
                    images.Add(strUrl);
                }
            }
        }

        /// <summary>
        /// Returns a fully qualified version of a link target (best guess).
        /// </summary>
        /// <param name="strUrl">Trimmed, non-empty link target.</param>
        /// <param name="rootUrl">Root url, possibly empty.</param>
        /// <returns>
        /// The target itself when it already starts with http://, https:// or ftp://; otherwise
        /// the root url with its path replaced by the target, or "http://" + target when that
        /// fails.
        /// </returns>
        private static string makeAbsoluteUrl(string strUrl, string rootUrl)
        {
            // Schemes are matched case-insensitively, and https:// counts as absolute so that
            // secure links are not rewritten as a path below the root url.
            if (strUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                strUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
                strUrl.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase))
            {
                return(strUrl);
            }

            try
            {
                UriBuilder uriBuilder = new UriBuilder(rootUrl);
                uriBuilder.Path = strUrl;
                return(uriBuilder.Uri.ToString());
            }
            catch (Exception)
            {
                // Deliberate best effort: the root url could not be used, so assume a bare
                // host/path.
                return("http://" + strUrl);
            }
        }
Esempio n. 13
0
        /// <summary>
        /// Returns a version of a string without any HTML tags.
        /// </summary>
        /// <remarks>
        /// Steps, in order: decode a fixed set of entities, put a space in front of every
        /// closing tag, turn <c>&lt;br&gt;</c> / <c>&lt;p&gt;</c> variants into a space, strip
        /// everything between <c>&lt;</c> and <c>&gt;</c>, collapse double spaces and trim. An
        /// unterminated <c>&lt;</c> is dropped and the text after it is kept.
        /// NOTE(review): the accented-letter entities map to empty strings, so those letters
        /// are removed rather than decoded. The replacement characters may have been lost in
        /// an encoding conversion - confirm before changing.
        /// </remarks>
        /// <param name="strString">The string.</param>
        /// <returns>Version of string without HTML tags.</returns>
        public static string removeHtml
            (string strString)
        {
            HTMLParser parser = new HTMLParser(strString);

            // Do some common case-sensitive replacements. A fixed array (instead of a
            // Hashtable) makes the order deterministic.
            string[,] entities =
            {
                { "&nbsp;", " " },
                { "&aring;", "" },
                { "&auml;", "" },
                { "&eacute;", "" },
                { "&iacute;", "" },
                { "&igrave;", "" },
                { "&ograve;", "" },
                { "&ouml;", "" },
                { "&quot;", "\"" },
                { "&szlig;", "" }
            };

            for (int i = 0; i < entities.GetLength(0); i++)
            {
                string key = entities[i, 0];
                if (parser.Content.IndexOf(key, StringComparison.Ordinal) != -1)
                {
                    parser.replaceEveryExact(key, entities[i, 1]);
                }
            }

            // Do some sequential replacements
            parser.replaceEveryExact("&#0", "&#");
            parser.replaceEveryExact("&#39;", "'");

            // "&amp;" is decoded after every other entity so that escaped text such as
            // "&amp;nbsp;" is decoded exactly once.
            if (parser.Content.IndexOf("&amp;", StringComparison.Ordinal) != -1)
            {
                parser.replaceEveryExact("&amp;", "&");
            }

            // Net effect of the next two calls: a space in front of every closing tag
            parser.replaceEveryExact("</", " <~/");
            parser.replaceEveryExact("<~/", "</");

            // Case-insensitive replacements. The presence check looks at the current content
            // and ignores case; checking the original input case-sensitively skipped tags
            // like <BR>.
            string[] breakTags = { "<br>", "<br />", "<br/>", "<p>", "<p />", "<p/>" };

            foreach (string tag in breakTags)
            {
                if (parser.Content.IndexOf(tag, StringComparison.OrdinalIgnoreCase) != -1)
                {
                    parser.replaceEvery(tag, " ");
                }
            }
            string content = parser.Content;

            // Remove all tags
            System.Text.StringBuilder clean = new System.Text.StringBuilder(content.Length);
            int nIndex = 0;
            int nStartTag;

            while ((nStartTag = content.IndexOf('<', nIndex)) != -1)
            {
                // Extract to start of tag
                clean.Append(content, nIndex, nStartTag - nIndex);
                nIndex = nStartTag + 1;

                // Skip over tag
                int nEndTag = content.IndexOf('>', nIndex);
                if (nEndTag == -1)
                {
                    break;
                }
                nIndex = nEndTag + 1;
            }

            // Gather remaining text
            if (nIndex < content.Length)
            {
                clean.Append(content, nIndex, content.Length - nIndex);
            }

            // Finally, reduce spaces
            parser.Content = clean.ToString();
            parser.replaceEveryExact("  ", " ");

            // Return the de-HTMLized string
            return(parser.Content.Trim());
        }