/// <summary>
/// Walks every listing in <c>RootLinks</c> (optionally filtered by year), visits each movie post found
/// under "h2.title a" and collects the video links exposed on the post.
/// </summary>
/// <param name="skipUrls">Link URLs that must not be reported; checked before and after sanitizing.</param>
/// <param name="years">Optional year filter; <c>null</c> or empty means every entry in <c>RootLinks</c>.</param>
/// <returns>
/// The shared <c>allMovies</c> list. A movie is appended as soon as its post is discovered, so the list
/// also holds posts that failed later on or produced no usable link.
/// </returns>
public override List<ScrapedMovie> ScrapeMovies(List<string> skipUrls, List<int> years = null)
{
    if (years == null)
        years = new List<int>();

    foreach (var entry in RootLinks)
    {
        try
        {
            int year = entry.Value;
            if (years.Count > 0 && !years.Contains(year))
                continue;

            var listing = GotoUrl(entry.Key);

            // Split the "last" pager href at its second-to-last '/': the text after it (minus the
            // final character) is parsed as the page count, the text before it is the page URL prefix.
            var lastPage = SelectItem(listing, "a.last").Attributes["href"];
            int ix1 = lastPage.LastIndexOf('/');
            int ix2 = ix1 > 0 ? lastPage.LastIndexOf('/', ix1 - 1) : -1;
            var count = Convert.ToInt32(lastPage.Substring(ix2 + 1, lastPage.Length - ix2 - 2));
            var pagePrefix = lastPage.Substring(0, ix2 + 1);

            // Highest page number first; the root listing itself is visited last.
            var pageUrls = new List<string>();
            for (var i = count; i >= 2; i--)
                pageUrls.Add(pagePrefix + i.ToString() + "/");
            pageUrls.Add(entry.Key);

            foreach (var pageUrl in pageUrls)
            {
                try
                {
                    var page = GotoUrl(pageUrl);
                    var postBoxes = SelectItems(page, "h2.title a");
                    for (var j = postBoxes.Count - 1; j >= 0; j--)
                    {
                        try
                        {
                            var movie = new ScrapedMovie(this);
                            allMovies.Add(movie);
                            movie.PageUrl = ReadAttribute(postBoxes[j], "href");
                            OnNotify(new NotificationEventArgs("Processing " + movie.PageUrl + ". Year: " + year.ToString()));
                            movie.LangCode = "te";
                            movie.ReleasedDate = new DateTime(year, 1, 1);

                            // Keep the post in its own variable: the redirect pages fetched below must
                            // not replace it, because the post's iframes are read after the anchor loop.
                            var moviePage = GotoUrl(movie.PageUrl);
                            movie.Name = FixTitle(SelectItem(moviePage, "h2.title").InnerText);
                            movie.ImageUrl = SelectItem(moviePage, "img.wp-post-image").Attributes["src"];
                            try
                            {
                                movie.Description = SelectItems(moviePage, "div.entry p span")[1].InnerText;
                            }
                            catch
                            {
                                // Best effort: the description is optional.
                            }

                            // Candidate link URL -> label taken from the page.
                            var links = new Dictionary<string, string>();
                            foreach (var anchor in SelectItems(moviePage, "a"))
                            {
                                try
                                {
                                    var url = anchor.Attributes["href"];
                                    if (GetScrapper(url) != null && !links.ContainsKey(url))
                                        links.Add(url, anchor.InnerText.Replace(" ", ""));

                                    if (url.Contains("http://www.power4link.us"))
                                    {
                                        // The real player sits in an iframe on the redirect page.
                                        var redirectPage = GotoUrl(url);
                                        var frameSrc = SelectItem(redirectPage, "div.entry-content iframe").Attributes["src"];
                                        if (!String.IsNullOrEmpty(frameSrc) && !links.ContainsKey(frameSrc))
                                            links.Add(frameSrc, "Watch Online");
                                    }
                                }
                                catch
                                {
                                    // Best effort: one broken anchor must not abort the movie.
                                }
                            }

                            // Players embedded directly in the post. Guard against duplicate or
                            // missing src values so a repeated iframe cannot discard the whole movie.
                            foreach (var iframe in SelectItems(moviePage, "div.entry iframe"))
                            {
                                var src = iframe.Attributes["src"];
                                if (!String.IsNullOrEmpty(src) && !links.ContainsKey(src))
                                    links.Add(src, "Watch Online");
                            }

                            foreach (var l in links)
                            {
                                try
                                {
                                    var linkUrl = l.Key;
                                    if (IgnoreLink(linkUrl))
                                        continue;

                                    var host = GetScrapper(linkUrl);
                                    if (host == null)
                                    {
                                        OnScraperNotFound(new ScraperNotFound(linkUrl, movie.PageUrl));
                                        continue;
                                    }

                                    if (skipUrls.Contains(linkUrl))
                                        continue;
                                    linkUrl = host.SanitizeUrl(linkUrl);
                                    if (skipUrls.Contains(linkUrl))
                                        continue;

                                    // A failing validation call is treated as "probably fine".
                                    var result = MovieTube.Client.Scraper.ScraperResult.Success;
                                    try
                                    {
                                        result = MovieTube.Client.Scraper.VideoScraperBase.ValidateUrl(linkUrl);
                                    }
                                    catch
                                    {
                                    }
                                    if (result == MovieTube.Client.Scraper.ScraperResult.VideoDoesNotExist)
                                        continue;

                                    // Labels mentioning "part" keep their part suffix; all others are normalised.
                                    var name = l.Value;
                                    var partIndex = name.IndexOf("part", StringComparison.InvariantCultureIgnoreCase);
                                    name = partIndex >= 0 ? "Watch " + name.Substring(partIndex) : "Watch Online";
                                    movie.Links.Add(new ScrapedMovieLink(linkUrl, host.ID, name));
                                }
                                catch
                                {
                                    // Best effort: skip a link that cannot be processed.
                                }
                            }

                            if (movie.Links.Count > 0)
                                OnMovieFound(new MovieFoundEventArgs(movie));

                            if (this.stop)
                                return allMovies;
                        }
                        catch
                        {
                            // Best effort: a broken post is skipped (it stays in allMovies).
                        }
                    }
                }
                catch
                {
                    // Best effort: a listing page that fails to load is skipped.
                }
            }
        }
        catch
        {
            // Best effort: a root listing that fails (e.g. no pager) is skipped.
        }
    }

    return allMovies;
}
/// <summary>
/// Processes the movie anchors (".itemList a") of a listing page from last to first: loads each movie
/// page, fills a <c>ScrapedMovie</c> and attaches every link that resolves to a known host.
/// </summary>
/// <param name="year">Year used for the release date (January 1st) and the progress message.</param>
/// <param name="dom">Listing page whose ".itemList a" anchors are processed.</param>
/// <param name="skipUrls">Link URLs to leave out; compared before and after sanitizing.</param>
/// <returns><c>false</c> if the stop flag was set after a movie was handled; otherwise <c>true</c>.</returns>
private bool GetMovies(int year, CQ dom, List<string> skipUrls)
{
    var entries = SelectItems(dom, ".itemList a");
    Debug.Assert(entries.Count != 0, "No movies");

    var index = entries.Count;
    while (--index >= 0)
    {
        var movieUrl = RootUrl + ReadAttribute(entries[index], "href");
        OnNotify(new NotificationEventArgs("Processing " + movieUrl + ". Year: " + year.ToString()));

        // Pages already present in the result list are not visited again.
        if (allMovies.Any(known => known.PageUrl == movieUrl))
            continue;

        CQ page;
        try
        {
            page = GotoUrl(movieUrl);
        }
        catch
        {
            continue;
        }

        // Try the link containers in order; the first one that yields anything wins.
        var candidates = SelectItems(page, ".itemIntroText table a");
        foreach (var fallback in new[] { "div.itemFullText a", ".avPlayerBlock iframe" })
        {
            if (candidates.Count != 0)
                break;
            candidates = SelectItems(page, fallback);
        }
        if (candidates.Count == 0)
            continue;

        var movie = new ScrapedMovie(this)
        {
            ReleasedDate = new DateTime(year, 1, 1),
            LangCode = "ml",
            Language = "Malayalam",
            Name = ReadText(SelectItems(page, ".itemTitle")[0]).Replace("\n", "").Replace("\t", "")
        };

        // Description: first intro paragraph, else the whole intro block, else the joined text-field spans.
        try
        {
            try
            {
                movie.Description = ReadText(SelectItems(page, ".itemIntroText p")[0]);
            }
            catch
            {
                try
                {
                    movie.Description = ReadText(SelectItems(page, ".itemIntroText")[0]);
                }
                catch
                {
                    var pieces = SelectItems(page, ".typeTextfield span");
                    if (pieces.Count > 0)
                    {
                        movie.Description = String.Empty;
                        foreach (var piece in pieces)
                            movie.Description = movie.Description + piece.InnerText;
                    }
                }
            }

            if (movie.Description != null)
                movie.Description = movie.Description.Replace("\n", "").Replace("\t", "");
        }
        catch
        {
        }

        // Poster: the first selector that matches an <img> is used.
        try
        {
            var image = SelectItems(page, ".itemIntroText p img").FirstOrDefault()
                        ?? SelectItems(page, ".itemImage a img").FirstOrDefault()
                        ?? SelectItems(page, ".itemIntroText img").FirstOrDefault()
                        ?? SelectItems(page, ".itemIntroText span img").FirstOrDefault();
            if (image != null)
                movie.ImageUrl = RootUrl + ReadAttribute(image, "src");
        }
        catch
        {
        }

        movie.PageUrl = movieUrl;
        allMovies.Add(movie);

        foreach (var candidate in candidates)
        {
            // Anchors carry the target in href; src is only tried when reading href throws.
            string linkUrl = null;
            try
            {
                linkUrl = ReadAttribute(candidate, "href");
            }
            catch
            {
                try
                {
                    linkUrl = ReadAttribute(candidate, "src");
                }
                catch
                {
                }
            }

            if (IgnoreLink(linkUrl))
                continue;

            var host = GetScrapper(linkUrl);
            if (host == null)
            {
                OnScraperNotFound(new ScraperNotFound(linkUrl, movie.PageUrl));
                continue;
            }

            if (skipUrls.Any(skip => skip == linkUrl))
                continue;
            try
            {
                linkUrl = host.SanitizeUrl(linkUrl);
            }
            catch
            {
                continue;
            }
            if (skipUrls.Any(skip => skip == linkUrl))
                continue;

            // A validation call that throws leaves the verdict at Success.
            var verdict = MovieTube.Client.Scraper.ScraperResult.Success;
            try
            {
                verdict = MovieTube.Client.Scraper.VideoScraperBase.ValidateUrl(linkUrl);
            }
            catch
            {
            }
            if (verdict == MovieTube.Client.Scraper.ScraperResult.VideoDoesNotExist)
                continue;

            movie.Links.Add(new ScrapedMovieLink(linkUrl, host.ID, candidate.InnerText));
        }

        if (movie.Links.Count > 0)
            OnMovieFound(new MovieFoundEventArgs(movie));

        if (this.stop)
            return false;
    }

    return true;
}
/// <summary>
/// For every listing in <c>RootLinks</c> (optionally filtered by year) visits the pager pages and the
/// listing itself, then collects the embed/iframe sources from each ".boxentry a" post.
/// </summary>
/// <param name="skipUrls">Link URLs to leave out; compared before and after sanitizing.</param>
/// <param name="years">Optional year filter; <c>null</c> or empty means no filtering.</param>
/// <returns>
/// The shared <c>allMovies</c> list; posts are appended on discovery, so entries skipped as "dubbed"
/// or without links are included as well.
/// </returns>
public override List<ScrapedMovie> ScrapeMovies(List<string> skipUrls, List<int> years = null)
{
    years = years ?? new List<int>();

    foreach (var entry in RootLinks)
    {
        try
        {
            var yearWanted = years.Count == 0 || years.Any(y => y == entry.Value);
            if (!yearWanted)
                continue;

            int year = entry.Value;
            var listing = GotoUrl(entry.Key);

            // Pager anchors whose class attribute is exactly "page larger" or "page smaller".
            var pagerLinks = SelectItems(listing, ".wp-pagenavi a")
                .Where(a =>
                {
                    var cls = a.Attributes["class"];
                    return cls == "page larger" || cls == "page smaller";
                })
                .ToList();

            // Pager pages in reverse document order, then the listing itself.
            var pageUrls = Enumerable.Reverse(pagerLinks)
                .Select(a => new Uri(new Uri(entry.Key), ReadAttribute(a, "href")).AbsoluteUri)
                .ToList();
            pageUrls.Add(entry.Key);

            foreach (var pageUrl in pageUrls)
            {
                try
                {
                    var page = GotoUrl(pageUrl);
                    var posts = SelectItems(page, ".boxentry a");
                    var idx = posts.Count;
                    while (--idx >= 0)
                    {
                        try
                        {
                            var post = posts[idx];
                            var movie = new ScrapedMovie(this);
                            allMovies.Add(movie);
                            movie.PageUrl = ReadAttribute(post, "href");
                            OnNotify(new NotificationEventArgs("Processing " + movie.PageUrl + ". Year: " + year.ToString()));
                            movie.LangCode = "ta";
                            movie.ReleasedDate = new DateTime(year, 1, 1);

                            // Posts whose title mentions "dubbed" are not processed any further.
                            var title = ReadAttribute(post, "title");
                            if (title.ToLower().Contains("dubbed"))
                                continue;

                            movie.Name = FixTitle(title);
                            movie.ImageUrl = post.FirstElementChild.Attributes["src"];

                            var moviePage = GotoUrl(movie.PageUrl);
                            var embeds = SelectItems(moviePage, ".videosection embed");
                            var frames = SelectItems(moviePage, ".videosection iframe");

                            // Embed sources first, iframe sources second.
                            var sources = new List<string>();
                            sources.AddRange(embeds.Select(x => x.Attributes["src"]));
                            sources.AddRange(frames.Select(x => x.Attributes["src"]));

                            foreach (var source in sources)
                            {
                                try
                                {
                                    var linkUrl = source;
                                    if (IgnoreLink(linkUrl))
                                        continue;

                                    var host = GetScrapper(linkUrl);
                                    if (host == null)
                                    {
                                        OnScraperNotFound(new ScraperNotFound(linkUrl, movie.PageUrl));
                                        continue;
                                    }

                                    if (skipUrls.Any(skip => skip == linkUrl))
                                        continue;
                                    linkUrl = host.SanitizeUrl(linkUrl);
                                    if (skipUrls.Any(skip => skip == linkUrl))
                                        continue;

                                    // A validation call that throws leaves the verdict at Success.
                                    var verdict = MovieTube.Client.Scraper.ScraperResult.Success;
                                    try
                                    {
                                        verdict = MovieTube.Client.Scraper.VideoScraperBase.ValidateUrl(linkUrl);
                                    }
                                    catch
                                    {
                                    }

                                    if (verdict != MovieTube.Client.Scraper.ScraperResult.VideoDoesNotExist)
                                        movie.Links.Add(new ScrapedMovieLink(linkUrl, host.ID, "Watch Full Movie"));
                                }
                                catch
                                {
                                }
                            }

                            if (movie.Links.Count > 0)
                                OnMovieFound(new MovieFoundEventArgs(movie));

                            if (this.stop)
                                return allMovies;
                        }
                        catch
                        {
                        }
                    }
                }
                catch
                {
                }
            }
        }
        catch
        {
        }
    }

    return allMovies;
}
/// <summary>
/// Movie-found handler: pushes the movie to the UI and, when the stop flag is set, asks the
/// raising scraper to stop.
/// </summary>
/// <param name="sender">The scraper that raised the event; cast to <c>MovieDetailsScraperBase</c> when stopping.</param>
/// <param name="e">Event data carrying the scraped movie.</param>
void scraper_MovieFound(object sender, MovieFoundEventArgs e)
{
    UpdateUI(e.Movie);

    if (!this.stop)
        return;

    var scraper = (MovieDetailsScraperBase)sender;
    scraper.Stop();
}
/// <summary>
/// Visits each listing URL, reads every ".movie" tile on it and collects the video links from the
/// movie's detail page (rows of ".table.table-bordered tbody tr").
/// </summary>
/// <param name="links">Listing page URLs to visit, in order.</param>
/// <param name="langCode">Language code stamped on every movie.</param>
/// <param name="skipUrls">Link URLs to leave out; compared before and after sanitizing.</param>
/// <param name="years">
/// NOTE(review): normalised to an empty list when <c>null</c> but never consulted afterwards, so no
/// year filtering happens here — confirm whether that is intended.
/// </param>
/// <returns>The shared <c>allMovies</c> list; tiles are appended on discovery, before any link is resolved.</returns>
private List<ScrapedMovie> DoScrapeMovies(List<string> links, string langCode, List<string> skipUrls, List<int> years = null)
{
    years = years ?? new List<int>();

    foreach (var listingUrl in links)
    {
        // A listing that fails to load is not caught here; the exception reaches the caller.
        var listing = GotoUrl(listingUrl);
        var tiles = SelectItems(listing, ".movie");

        foreach (var tile in tiles)
        {
            try
            {
                var movie = new ScrapedMovie(this);
                allMovies.Add(movie);
                movie.PageUrl = "http://apnaview.com" + tile.FirstElementChild.Attributes["href"];

                // Children of the tile's first element: [0] is read for src, [1] for the name, [2] for the year.
                var parts = tile.FirstElementChild.ChildElements.ToList();
                movie.ReleasedDate = new DateTime(Convert.ToInt32(parts[2].InnerText), 1, 1);
                OnNotify(new NotificationEventArgs("Processing " + movie.PageUrl + ". Year: " + movie.ReleasedDate.Year.ToString()));
                movie.LangCode = langCode;
                movie.Name = parts[1].InnerText;
                if (parts[0].Attributes["src"].Contains("/img"))
                    movie.ImageUrl = "http://apnaview.com" + parts[0].Attributes["src"];

                var detailPage = GotoUrl(movie.PageUrl);
                var rows = SelectItems(detailPage, ".table.table-bordered tbody tr");
                foreach (var row in rows)
                {
                    try
                    {
                        // The anchors live in the row's second child element.
                        var anchors = row.ChildElements.ToList()[1].ChildElements.ToList();
                        foreach (var anchor in anchors)
                        {
                            var linkUrl = anchor.Attributes["href"];
                            if (GetScrapper(linkUrl) == null)
                            {
                                // Not a known host: open the intermediate page and look for a player.
                                linkUrl = String.Empty;
                                var playerPage = GotoUrl(anchor.Attributes["href"]);
                                try
                                {
                                    linkUrl = SelectItem(playerPage, ".videoplayer iframe").Attributes["src"];
                                }
                                catch
                                {
                                }
                                try
                                {
                                    if (String.IsNullOrWhiteSpace(linkUrl))
                                        linkUrl = SelectItem(playerPage, ".videoplayer embed").Attributes["src"];
                                }
                                catch
                                {
                                }
                            }

                            if (IgnoreLink(linkUrl))
                                continue;

                            try
                            {
                                var host = GetScrapper(linkUrl);
                                if (host == null)
                                {
                                    OnScraperNotFound(new ScraperNotFound(linkUrl, movie.PageUrl));
                                    continue;
                                }

                                if (skipUrls.Any(skip => skip == linkUrl))
                                    continue;
                                linkUrl = host.SanitizeUrl(linkUrl);
                                if (skipUrls.Any(skip => skip == linkUrl))
                                    continue;

                                // A validation call that throws leaves the verdict at Success.
                                var verdict = MovieTube.Client.Scraper.ScraperResult.Success;
                                try
                                {
                                    verdict = MovieTube.Client.Scraper.VideoScraperBase.ValidateUrl(linkUrl);
                                }
                                catch
                                {
                                }

                                if (verdict != MovieTube.Client.Scraper.ScraperResult.VideoDoesNotExist)
                                {
                                    var label = anchor.InnerText.Trim();
                                    movie.Links.Add(new ScrapedMovieLink(linkUrl, host.ID, label));
                                }
                            }
                            catch
                            {
                            }
                        }
                    }
                    catch
                    {
                    }
                }

                if (movie.Links.Count > 0)
                    OnMovieFound(new MovieFoundEventArgs(movie));

                if (this.stop)
                    return allMovies;
            }
            catch
            {
            }
        }
    }

    return allMovies;
}
/// <summary>
/// Scrapes one listing: derives the page URLs from the pager's "last" link, visits them from the
/// highest page number down to <paramref name="startUrl"/>, and for every "a.clip-link" post collects
/// the external links that resolve to a known host.
/// </summary>
/// <param name="dom">Already loaded listing page; only its ".wp-pagenavi &gt; .last" link is read from it.</param>
/// <param name="year">Year used for the release date (January 1st) and the progress message.</param>
/// <param name="startUrl">URL of the first listing page; it is visited after all numbered pages.</param>
/// <param name="skipUrls">Link URLs to leave out; checked before and after sanitizing.</param>
/// <remarks>
/// Failures while reading the pager are not caught and reach the caller. Failures on a page, post or
/// link are reported through <c>OnScraperNotFound</c> and processing continues.
/// </remarks>
private void ScrapThread(CQ dom, int year, string startUrl, List<string> skipUrls)
{
    // The "last" href is split at its final '/': the tail is parsed as the page count and the head
    // is the prefix for the numbered page URLs.
    var last = SelectItem(dom, ".wp-pagenavi > .last");
    var lasthRef = ReadAttribute(last, "href");
    var pos = lasthRef.LastIndexOf('/');
    var lastIndex = Int32.Parse(lasthRef.Substring(pos + 1));
    var urlTemplate = lasthRef.Substring(0, pos);

    var urls = new List<string>();
    for (var i = lastIndex; i > 1; i--)
        urls.Add(String.Format("{0}/{1}", urlTemplate, i));
    urls.Add(startUrl);

    foreach (var url in urls)
    {
        try
        {
            var listing = GotoUrl(url);
            var elems = SelectItems(listing, "a.clip-link");
            for (var i = elems.Count - 1; i >= 0; i--)
            {
                try
                {
                    var movie = new ScrapedMovie(this);
                    movie.PageUrl = ReadAttribute(elems[i], "href");
                    OnNotify(new NotificationEventArgs("Processing " + movie.PageUrl + ". Year: " + year.ToString()));

                    // Skip posts whose URL contains "-in-hindi" or "-hindi.". Ordinal comparison keeps
                    // the match independent of the current culture's casing rules.
                    if (movie.PageUrl.IndexOf("-in-hindi", StringComparison.OrdinalIgnoreCase) >= 0
                        || movie.PageUrl.IndexOf("-hindi.", StringComparison.OrdinalIgnoreCase) >= 0)
                        continue;

                    var moviePage = GotoUrl(movie.PageUrl);
                    movie.ReleasedDate = new DateTime(year, 1, 1);
                    movie.LangCode = "hi";
                    movie.Language = "Hindi";
                    movie.Description = String.Empty;
                    movie.Name = ReadText(SelectItem(moviePage, ".entry-title")).Replace("\n", "").Replace("\t", "");

                    try
                    {
                        // The description is every leading paragraph up to the first one containing <strong>.
                        foreach (var p in SelectItems(moviePage, ".entry-content p"))
                        {
                            if (p.InnerHTML.Contains("<strong>"))
                                break;
                            movie.Description += ReadText(p) + Environment.NewLine;
                        }

                        if (String.IsNullOrWhiteSpace(movie.Description))
                            movie.Description = String.Empty;
                    }
                    catch
                    {
                        // Best effort: keep whatever description text was gathered.
                    }

                    var imgElems = SelectItems(moviePage, "#thumb img");
                    if (imgElems.Count == 0)
                        Debug.WriteLine("No Image: " + movie.PageUrl);
                    else
                        movie.ImageUrl = ReadAttribute(imgElems[0], "src");

                    allMovies.Add(movie);

                    foreach (var l in SelectItems(moviePage, ".entry-content p a.external"))
                    {
                        var pageUrl = ReadAttribute(l, "href");

                        // filmshowonline.net and veoh download links are skipped outright. The former
                        // resolver for filmshowonline pages sat behind an unconditional continue and
                        // could never run, so it has been removed.
                        if (pageUrl.Contains("filmshowonline.net") || pageUrl.Contains("www.veoh.com/download"))
                            continue;

                        var linkUrl = pageUrl;
                        if (IgnoreLink(linkUrl))
                            continue;

                        try
                        {
                            var host = GetScrapper(linkUrl);
                            if (host == null)
                            {
                                OnScraperNotFound(new ScraperNotFound(linkUrl, pageUrl));
                                continue;
                            }

                            if (skipUrls.Contains(linkUrl))
                                continue;
                            linkUrl = host.SanitizeUrl(linkUrl);
                            if (skipUrls.Contains(linkUrl))
                                continue;

                            // The same target may be linked more than once on a post.
                            if (movie.Links.Any(x => String.Equals(x.DownloadUrl, linkUrl, StringComparison.OrdinalIgnoreCase)))
                                continue;

                            // A failing validation call is treated as "probably fine".
                            var result = MovieTube.Client.Scraper.ScraperResult.Success;
                            try
                            {
                                result = MovieTube.Client.Scraper.VideoScraperBase.ValidateUrl(linkUrl);
                            }
                            catch
                            {
                            }

                            if (result != MovieTube.Client.Scraper.ScraperResult.VideoDoesNotExist)
                                movie.Links.Add(new ScrapedMovieLink(linkUrl, host.ID, l.InnerText));
                        }
                        catch (Exception ex)
                        {
                            OnScraperNotFound(new ScraperNotFound("Exception", ex.Message));
                        }
                    }

                    if (movie.Links.Count > 0)
                        OnMovieFound(new MovieFoundEventArgs(movie));

                    if (this.stop)
                        return;
                }
                catch (Exception ex)
                {
                    OnScraperNotFound(new ScraperNotFound("Exception", ex.Message));
                }
            }
        }
        catch (Exception ex)
        {
            OnScraperNotFound(new ScraperNotFound("Exception", ex.Message));
        }
    }
}
/// <summary>
/// Reads the year links from the root page ("#HTML5.widget div.widget-content a"), gathers every post
/// URL of each selected year by following the "older posts" pager, then visits the posts and collects
/// their video links.
/// </summary>
/// <param name="skipUrls">
/// Link URLs to leave out; checked before and after sanitizing. Every sanitized link that passes the
/// check is appended to this list, so the caller's list is modified.
/// </param>
/// <param name="years">Optional year filter; <c>null</c> or empty means every year found on the root page.</param>
/// <returns>The shared <c>allMovies</c> list; it is returned even when scraping aborted early.</returns>
public override List<ScrapedMovie> ScrapeMovies(List<string> skipUrls, List<int> years = null)
{
    try
    {
        var home = GotoUrl(RootUrl);
        if (years == null)
            years = new List<int>();

        // Year links are processed from the last one on the page to the first.
        var elems = SelectItems(home, "#HTML5.widget div.widget-content a");
        for (var i = elems.Count - 1; i >= 0; i--)
        {
            try
            {
                var elem = elems[i];
                int year = Convert.ToInt32(elem.InnerText.Trim());
                if (years.Count > 0 && !years.Contains(year))
                    continue;

                // Collect the post URLs of the whole year first; they are popped in reverse order below.
                var urls = new Stack<string>();
                try
                {
                    var listing = GotoUrl(ReadAttribute(elem, "href"));
                    while (true)
                    {
                        // The first four anchors inside div.blog-posts are skipped.
                        var mUrls = SelectItems(listing, "div.blog-posts a").Skip(4).ToList();
                        foreach (var mu in mUrls)
                        {
                            var h = ReadAttribute(mu, "href");
                            if (!urls.Contains(h))
                                urls.Push(h);
                        }

                        if (mUrls.Count == 0)
                            break;

                        // A page without an "older posts" link is the last one: stop paging and keep
                        // the URLs gathered so far instead of abandoning the whole year.
                        string olderHref = null;
                        try
                        {
                            olderHref = ReadAttribute(SelectItem(listing, "#blog-pager-older-link a"), "href");
                        }
                        catch
                        {
                        }
                        if (String.IsNullOrEmpty(olderHref))
                            break;

                        listing = GotoUrl(olderHref);
                    }
                }
                catch
                {
                    // A listing page failed to load: skip this year.
                    continue;
                }

                while (urls.Count > 0)
                {
                    try
                    {
                        var u = urls.Pop();
                        var moviePage = GotoUrl(u);
                        var title = SelectItem(moviePage, ".post-title.entry-title a").InnerText;

                        // Titles look like "<name>-<description>"; without a dash the whole title is the name.
                        var hasSeparator = title.Contains("-");
                        var titleParts = title.Split('-');

                        // Any failure while building the movie (e.g. no image in the post) skips the
                        // post through the enclosing catch, rather than via a null dereference later.
                        var movie = new ScrapedMovie(this)
                        {
                            PageUrl = u,
                            Description = hasSeparator ? titleParts[1] : String.Empty,
                            Name = FixTitle(hasSeparator ? titleParts[0] : title),
                            LangCode = "ta",
                            ReleasedDate = new DateTime(year, 1, 1),
                            ImageUrl = ReadAttribute(SelectItem(moviePage, "div.post-body.entry-content img"), "src")
                        };

                        OnNotify(new NotificationEventArgs("Processing " + movie.PageUrl + ". Year: " + year.ToString()));
                        allMovies.Add(movie);

                        foreach (var item in SelectItems(moviePage, ".fullpost a"))
                        {
                            try
                            {
                                var linkUrl = ReadAttribute(item, "href");
                                if (linkUrl.Contains("links2sites"))
                                {
                                    // Intermediate page: the player is an embed, or failing that an iframe.
                                    var redirectPage = GotoUrl(linkUrl);
                                    try
                                    {
                                        linkUrl = ReadAttribute(SelectItem(redirectPage, ".post-body.entry-content embed"), "src");
                                    }
                                    catch
                                    {
                                        try
                                        {
                                            linkUrl = ReadAttribute(SelectItem(redirectPage, ".post-body.entry-content iframe"), "src");
                                        }
                                        catch
                                        {
                                        }
                                    }
                                }

                                if (IgnoreLink(linkUrl))
                                    continue;

                                var host = GetScrapper(linkUrl);
                                if (host == null)
                                {
                                    OnScraperNotFound(new ScraperNotFound(linkUrl, movie.PageUrl));
                                    continue;
                                }

                                if (skipUrls.Contains(linkUrl))
                                    continue;
                                linkUrl = host.SanitizeUrl(linkUrl);
                                if (skipUrls.Contains(linkUrl))
                                    continue;

                                // Remember the link so the same target is not reported again later.
                                skipUrls.Add(linkUrl);

                                // A failing validation call is treated as "probably fine".
                                var result = MovieTube.Client.Scraper.ScraperResult.Success;
                                try
                                {
                                    result = MovieTube.Client.Scraper.VideoScraperBase.ValidateUrl(linkUrl);
                                }
                                catch
                                {
                                }

                                if (result != MovieTube.Client.Scraper.ScraperResult.VideoDoesNotExist)
                                    movie.Links.Add(new ScrapedMovieLink(linkUrl, host.ID, "Watch Full Movie"));
                            }
                            catch
                            {
                                // Best effort: skip a link that cannot be processed.
                            }
                        }

                        if (movie.Links.Count > 0)
                            OnMovieFound(new MovieFoundEventArgs(movie));

                        if (this.stop)
                            return allMovies;
                    }
                    catch
                    {
                        // Best effort: a post that cannot be read is skipped.
                    }
                }
            }
            catch
            {
                // Best effort: a year link that cannot be parsed or processed is skipped.
            }
        }
    }
    catch
    {
        // Best effort: return whatever was collected before the failure.
    }

    return allMovies;
}
/// <summary>
/// For every root listing in <c>RootLinks</c>, walks the year links from last to first, then each
/// ".numerical-nav a" page of a year, and emits one movie per ".video-object-wrapper" card with a
/// single link pointing at the movie's own page.
/// </summary>
/// <param name="skipUrls">Movie page URLs to leave out.</param>
/// <param name="years">Optional year filter; <c>null</c> or empty means no filtering.</param>
/// <returns>The shared <c>allMovies</c> list; it is returned even when scraping aborted early.</returns>
public override List<ScrapedMovie> ScrapeMovies(List<string> skipUrls, List<int> years = null)
{
    years = years ?? new List<int>();

    try
    {
        foreach (var entry in RootLinks)
        {
            var rootPage = GotoUrl(entry.Key, 3);
            var yearLinks = SelectItems(rootPage, ".video-organizer-element-wrapper a");

            var idx = yearLinks.Count;
            while (--idx >= 0)
            {
                try
                {
                    var yearLink = yearLinks[idx];

                    // First filter on the link text; a failed parse leaves year at 0.
                    int year;
                    Int32.TryParse(yearLink.InnerText, out year);
                    if (years.Count > 0 && !years.Any(y => y == year))
                        continue;

                    var yearPage = GotoUrl(new Uri(new Uri(entry.Key), ReadAttribute(yearLink, "href")).AbsoluteUri, 3);

                    // Re-read the year from the page's selected filter and require four characters.
                    var selectedText = ReadText(SelectItems(yearPage, ".filter-selected").First());
                    var digits = Regex.Replace(selectedText, "[^0-9.]", "");
                    if (!Int32.TryParse(digits, out year))
                        continue;
                    if (year.ToString().Length != 4)
                        continue;
                    if (years.Count > 0 && !years.Any(y => y == year))
                        continue;

                    foreach (var navLink in SelectItems(yearPage, ".numerical-nav a"))
                    {
                        try
                        {
                            var resultPage = GotoUrl(new Uri(new Uri(entry.Key), ReadAttribute(navLink, "href")).AbsoluteUri, 3);
                            foreach (var wrapper in SelectItems(resultPage, ".video-object-wrapper"))
                            {
                                try
                                {
                                    // Scope the remaining selectors to this card.
                                    var card = CQ.Create(wrapper);
                                    var movie = new ScrapedMovie(this);
                                    var titleLink = card.Select(".movie-title").Elements.First();

                                    var relativeHref = ReadAttribute(titleLink, "href").Replace("..", "");
                                    movie.PageUrl = new Uri(new Uri(RootUrl), relativeHref).AbsoluteUri;
                                    OnNotify(new NotificationEventArgs("Processing " + movie.PageUrl + ". Year: " + year.ToString()));

                                    if (skipUrls.Any(skip => skip == movie.PageUrl))
                                        continue;

                                    movie.ImageUrl = ReadAttribute(card.Select(".video-object-thumb img").Elements.First(), "src");
                                    movie.ReleasedDate = new DateTime(year, 1, 1);
                                    movie.LangCode = entry.Value;
                                    movie.Description = ReadText(card.Select(".desc_body").Elements.First()).Replace("-", "");

                                    // Strip line breaks/tabs, then any (...), [...] or {...} group with its leading whitespace.
                                    movie.Name = ReadText(titleLink).Replace("\n", "").Replace("\t", "");
                                    movie.Name = Regex.Replace(movie.Name, @"\s*?(?:\(.*?\)|\[.*?\]|\{.*?\})", String.Empty);

                                    movie.Links.Add(new ScrapedMovieLink(movie.PageUrl, "einthusan.com", "With Subtitles"));
                                    allMovies.Add(movie);
                                    OnMovieFound(new MovieFoundEventArgs(movie));

                                    if (this.stop)
                                        return allMovies;
                                }
                                catch
                                {
                                }
                            }
                        }
                        catch
                        {
                        }
                    }
                }
                catch
                {
                }
            }
        }
    }
    catch
    {
    }

    return allMovies;
}