/// <summary>
/// Builds the header page for a loaded document: one Ebookdz_PostHeader per
/// div of class "collapse" found under the div with id "vba_news4".
/// </summary>
/// <param name="loadDataFromWeb">Load result providing the parsed document, the request (url) and the load date.</param>
/// <returns>The Ebookdz_HeaderPage cast to the paging interface; UrlNextPage is always null.</returns>
protected override IEnumDataPages_v2<int, IHeaderData_v2> GetData(LoadDataFromWeb_v4 loadDataFromWeb)
{
    XXElement root = new XXElement(loadDataFromWeb.Http.zGetXDocument().Root);
    string sourceUrl = loadDataFromWeb.WebRequest.HttpRequest.Url;

    Ebookdz_HeaderPage page = new Ebookdz_HeaderPage
    {
        SourceUrl = sourceUrl,
        LoadFromWebDate = loadDataFromWeb.LoadFromWebDate,
        Id = Ebookdz_LoadHeaderPagesManager.GetHeaderPageKey(loadDataFromWeb.WebRequest.HttpRequest),
        // no next-page link is extracted for this page
        UrlNextPage = null
    };

    // <div id="vba_news4">
    List<Ebookdz_PostHeader> postHeaders = new List<Ebookdz_PostHeader>();
    foreach (XXElement block in root.XPathElements("//div[@id='vba_news4']//div[@class='collapse']"))
    {
        // the title link is the second <a> under the "blockhead" heading
        XXElement link = block.XPathElement(".//h2[@class='blockhead']//a[2]");

        postHeaders.Add(new Ebookdz_PostHeader
        {
            SourceUrl = sourceUrl,
            LoadFromWebDate = loadDataFromWeb.LoadFromWebDate,
            Title = link.XPathValue(".//text()"),
            UrlDetail = link.XPathValue("./@href")
        });
    }

    page.PostHeaders = postHeaders.ToArray();
    return (IEnumDataPages_v2<int, IHeaderData_v2>)page;
}
/// <summary>
/// Loads the document for <paramref name="webRequest"/>, going through the url cache when one is configured.
/// </summary>
/// <remarks>
/// With a cache: the request is downloaded to its cache file when a reload is requested or the
/// file does not exist, then the document is read from that file and the file's last write time
/// becomes the load date. Without a cache the request is loaded directly and the load date is
/// the current local time.
/// </remarks>
/// <param name="webRequest">Request to load; its HttpRequest and ReloadFromWeb members are read.</param>
/// <returns>
/// A result object that always carries <paramref name="webRequest"/>. LoadResult is set to true and
/// LoadFromWebDate is filled only when a document was obtained; when the download to the cache
/// file fails the result is returned without them.
/// </returns>
public LoadDataFromWeb_v4 Load(RequestFromWeb_v4 webRequest)
{
    LoadDataFromWeb_v4 loadDataFromWeb = new LoadDataFromWeb_v4 { WebRequest = webRequest };
    DateTime loadFromWebDate;
    HttpRequest httpRequest = webRequest.HttpRequest;

    if (_urlCache != null)
    {
        string urlPath = _urlCache.GetUrlPath(httpRequest);
        if (webRequest.ReloadFromWeb || !zFile.Exists(urlPath))
        {
            // Protocol prefix check: ordinal comparison, not culture-sensitive.
            // NOTE(review): "https://" urls never trigger InitLoadFromWeb() - confirm this is intended.
            if (_firstLoadFromWeb && httpRequest.Url.StartsWith("http://", StringComparison.Ordinal))
            {
                InitLoadFromWeb();
                _firstLoadFromWeb = false;
            }

            if (!HttpManager.CurrentHttpManager.LoadToFile(httpRequest, urlPath, _exportRequest, GetHttpRequestParameters()))
            {
                return loadDataFromWeb;
            }
        }

        // from here on the document is read from the cache file
        httpRequest = new HttpRequest { Url = urlPath };

        // use last write time as load date: creation time is modified when the file is copied
        loadFromWebDate = zFile.CreateFileInfo(urlPath).LastWriteTime;
    }
    else
    {
        loadFromWebDate = DateTime.Now;
    }

    // Only reached with an "http://" url when there is no cache (otherwise the url is now a file path).
    if (_firstLoadFromWeb && httpRequest.Url.StartsWith("http://", StringComparison.Ordinal))
    {
        InitLoadFromWeb();
        _firstLoadFromWeb = false;
    }

    loadDataFromWeb.Http = HttpManager.CurrentHttpManager.Load(httpRequest, GetHttpRequestParameters());
    if (loadDataFromWeb.Http != null)
    {
        loadDataFromWeb.LoadResult = true;
        loadDataFromWeb.LoadFromWebDate = loadFromWebDate;
    }

    return loadDataFromWeb;
}
/// <summary>
/// Loads a forum page and lazily enumerates its categories and their child sub-forums.
/// </summary>
/// <param name="url">Url of the forum page to load.</param>
/// <param name="forum">Forum name copied into every returned item.</param>
/// <param name="filter">Optional predicate applied to sub-forum names only; category entries are never filtered.</param>
/// <param name="reload">Passed to the request as its reload flag.</param>
/// <returns>
/// For each list item under the "forumbits" div: one entry for the category (no Name), followed by
/// one entry per child sub-forum accepted by <paramref name="filter"/>. Empty when the load fails.
/// </returns>
public IEnumerable<Ebookdz_Forum> LoadSubForum(string url, string forum, Predicate<string> filter = null, bool reload = false)
{
    LoadDataFromWeb_v4 loaded = Load(new RequestFromWeb_v4(new HttpRequest { Url = url }, reload: reload));
    if (!loaded.LoadResult)
    {
        yield break;
    }

    XXElement root = new XXElement(loaded.Http.zGetXDocument().Root);
    string baseUrl = loaded.WebRequest.HttpRequest.Url;

    // page layout:
    // <div class="body_bd">
    //   <div id="forumbits" class="forumbits">
    //     <ol>
    //       <li id="forum10" class="forumbit_post new L1">
    //         <div class="forumrow">
    //         <ol id="childforum_for_161" class="childsubforum">
    //           <div class="titleline">
    foreach (XXElement categoryItem in root.XPathElements("//div[@id='forumbits']/ol/li"))
    {
        XXElement categoryLink = categoryItem.XPathElement(".//div[@class='forumrow']//a");
        string category = categoryLink.XPathValue(".//text()");
        string categoryUrl = Ebookdz.GetUrl(zurl.GetUrl(baseUrl, categoryLink.XPathValue("@href")));
        yield return new Ebookdz_Forum { Forum = forum, Category = category, Url = categoryUrl };

        foreach (XXElement subForumLink in categoryItem.XPathElements(".//ol[@class='childsubforum']/li//div[@class='titleline']//a"))
        {
            string name = subForumLink.XPathValue(".//text()");
            if (filter == null || filter(name))
            {
                string subForumUrl = Ebookdz.GetUrl(zurl.GetUrl(baseUrl, subForumLink.XPathValue("@href")));
                yield return new Ebookdz_Forum { Forum = forum, Category = category, Name = name, Url = subForumUrl };
            }
        }
    }
}
/// <summary>
/// Loads the main forum page (__urlForum) and lazily enumerates its top-level forums.
/// </summary>
/// <param name="filter">Optional predicate on the forum name; forums it rejects are skipped.</param>
/// <param name="reload">Passed to the request as its reload flag.</param>
/// <returns>One Ebookdz_Forum (Forum and Url set) per accepted list item; empty when the load fails.</returns>
public IEnumerable<Ebookdz_Forum> LoadMainForum(Predicate<string> filter = null, bool reload = false)
{
    LoadDataFromWeb_v4 loaded = Load(new RequestFromWeb_v4(new HttpRequest { Url = __urlForum }, reload: reload));
    if (!loaded.LoadResult)
    {
        yield break;
    }

    XXElement root = new XXElement(loaded.Http.zGetXDocument().Root);
    string baseUrl = loaded.WebRequest.HttpRequest.Url;

    // <ol id="forums" class="floatcontainer">
    // sample forum names: Accueil de la Board, Forum de l'entraide, Journaux, MAGAZINES, Les Livres
    // sample link: http://www.ebookdz.com/forum/forumdisplay.php?f=1&s=1fdf76d35a57d09aa11e75ff6f0d9985
    foreach (XXElement forumItem in root.XPathElements("//ol[@id='forums']/li"))
    {
        XXElement forumLink = forumItem.XPathElement(".//a");
        string forumName = forumLink.XPathValue(".//text()");
        if (filter == null || filter(forumName))
        {
            string forumUrl = Ebookdz.GetUrl(zurl.GetUrl(baseUrl, forumLink.XPathValue("@href")));
            yield return new Ebookdz_Forum { Forum = forumName, Url = forumUrl };
        }
    }
}
/// <summary>
/// Builds an Ebookdz_PostDetail from a loaded thread page: title, category (from the breadcrumb),
/// print type, creation date, author, images, text values and download links.
/// </summary>
/// <remarks>
/// A missing title or author node no longer raises a NullReferenceException: the value is left
/// null, and the title-based breadcrumb filter is skipped when there is no title.
/// </remarks>
/// <param name="loadDataFromWeb">Load result providing the parsed document, the request and the load date.</param>
/// <returns>The populated Ebookdz_PostDetail.</returns>
protected override IPost GetData(LoadDataFromWeb_v4 loadDataFromWeb)
{
    XXElement xeSource = new XXElement(loadDataFromWeb.Http.zGetXDocument().Root);

    Ebookdz_PostDetail data = new Ebookdz_PostDetail();
    data.SourceUrl = loadDataFromWeb.WebRequest.HttpRequest.Url;
    data.LoadFromWebDate = loadDataFromWeb.LoadFromWebDate;
    data.Id = GetPostDetailKey(loadDataFromWeb.WebRequest.HttpRequest);

    // <div class="body_bd">
    XXElement xePost = xeSource.XPathElement("//div[@class='body_bd']");

    // sample title: Le Monde + Magazine + 2 suppléments du samedi 03 janvier 2015
    // trim only when a title was found, the checks below already expect a possibly null title
    string rawTitle = xePost.XPathValue(".//div[@id='pagetitle']//a//text()");
    data.Title = rawTitle != null ? rawTitle.Trim(DownloadPrint.TrimChars) : null;
    if (data.Title != null)
    {
        PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.Title);
        if (titleInfos.foundInfo)
        {
            data.OriginalTitle = data.Title;
            data.Title = titleInfos.title;
            data.Infos.SetValues(titleInfos.infos);
        }
    }

    // sample breadcrumb: Forum / Journaux / Presse quotidienne / Le Monde / Journal Le Monde + Magazine + 2 suppléments du samedi 03 janvier 2015
    string lowerTitle = null;
    if (data.Title != null)
    {
        lowerTitle = data.Title.ToLowerInvariant();
    }

    // category = breadcrumb texts without the "forum" root and without entries ending with the post title
    data.Category = xePost.XPathElements(".//div[@id='breadcrumb']//a")
        .DescendantTexts()
        .Where(text =>
        {
            text = text.ToLowerInvariant();
            // no title: nothing to compare against (EndsWith(null) would throw)
            return text != "forum" && (lowerTitle == null || !text.EndsWith(lowerTitle));
        })
        .Select(DownloadPrint.Trim)
        .zToStringValues("/");

    string category = data.Category.ToLowerInvariant();
    data.PrintType = GetPrintType(category);

    // <div id="postlist" class="postlist restrain">
    XXElement xe = xePost.XPathElement(".//div[@id='postlist']");

    // sample dates: Aujourd'hui, 07h32 - Aujourd'hui, 10h51 - Hier, 12h55 - 22/02/2014, 21h09
    // date text = texts of the post head, skipping <a> nodes
    XXElement xe2 = xe.XPathElement(".//div[@class='posthead']");
    string date = xe2.DescendantTexts(node => node.zGetName() != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode).zToStringValues("");
    // replace non-breaking spaces before parsing
    date = date.Replace('\xA0', ' ');
    data.PostCreationDate = zdate.ParseDateTimeLikeToday(date, loadDataFromWeb.LoadFromWebDate, @"d/M/yyyy, HH\hmm", @"d-M-yyyy, HH\hmm");
    if (data.PostCreationDate == null)
    {
        pb.Trace.WriteLine("unknow post creation date \"{0}\"", date);
    }
    if (__trace)
    {
        pb.Trace.WriteLine("post creation date {0} - \"{1}\"", data.PostCreationDate, date);
    }

    // author is left null when the user info link is missing
    string rawAuthor = xe.XPathValue(".//div[@class='userinfo']//a//text()");
    data.PostAuthor = rawAuthor != null ? rawAuthor.Trim(DownloadPrint.TrimChars) : null;

    // <div class="postbody">
    xe = xePost.XPathElement(".//div[@class='postbody']//div[@class='content']//blockquote/div");

    data.Images = xe.DescendantNodes(node => XmlDescendant.ImageFilter(node))
        .Select(xeImg => new WebImage(zurl.GetUrl(data.SourceUrl, xeImg.zAttribValue("src"))))
        .ToArray();

    // force load image to get image width and height
    if (loadDataFromWeb.WebRequest.LoadImage)
    {
        data.Images = DownloadPrint.LoadImages(data.Images).ToArray();
    }

    // get infos, description, language, size, nbPages from the post texts, skipping <a> elements
    PrintTextValues_v1 textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_v1(
        xe.DescendantTexts(node => !(node is XElement) || ((XElement)node).Name != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode),
        data.Title);
    data.Description = textValues.description;
    data.Language = textValues.language;
    data.Size = textValues.size;
    data.NbPages = textValues.nbPages;
    data.Infos.SetValues(textValues.infos);

    data.DownloadLinks = xe.XPathValues(".//a/@href").ToArray();

    if (__trace)
    {
        pb.Trace.WriteLine(data.zToJson());
    }

    return data;
}
/// <summary>
/// Builds an Ebookdz_HeaderPage from a loaded forum thread-list page: the next-page url and
/// one Ebookdz_PostHeader (title and detail url) per thread list item.
/// </summary>
/// <param name="loadDataFromWeb">Load result providing the parsed document, the request url and the load date.</param>
/// <returns>The header page cast to IKeyData of int; its Id is not set here.</returns>
public static IKeyData<int> GetForumHeaderPageData(LoadDataFromWeb_v4 loadDataFromWeb)
{
    XXElement xeSource = new XXElement(loadDataFromWeb.Http.zGetXDocument().Root);
    string url = loadDataFromWeb.WebRequest.HttpRequest.Url;

    Ebookdz_HeaderPage data = new Ebookdz_HeaderPage();
    data.SourceUrl = url;
    data.LoadFromWebDate = loadDataFromWeb.LoadFromWebDate;

    // next page link:
    // <div id="above_threadlist" class="above_threadlist">
    //   <div class="threadpagenav">
    //     <span class="prev_next">
    //       <a rel="next" href="forumdisplay.php?f=74&page=2&s=4807e931448c05da34dd54fbd0308479" title="Page suivante - Résultats de 21 à 40 sur 66">
    data.UrlNextPage = GetUrl(zurl.GetUrl(url, xeSource.XPathValue("//div[@id='above_threadlist']//span[@class='prev_next']//a[@rel='next']/@href")));

    // thread list:
    // <div id="threadlist" class="threadlist">
    //   <ol id="threads" class="threads">
    IEnumerable<XXElement> xeHeaders = xeSource.XPathElements("//div[@id='threadlist']//ol[@id='threads']/li");
    List<Ebookdz_PostHeader> headers = new List<Ebookdz_PostHeader>();
    foreach (XXElement xeHeader in xeHeaders)
    {
        Ebookdz_PostHeader header = new Ebookdz_PostHeader();
        header.SourceUrl = url;
        header.LoadFromWebDate = loadDataFromWeb.LoadFromWebDate;

        // <div class="threadinfo" title="">
        //   <div class="inner">
        //     <a title="" class="title" href="showthread.php?t=111210&s=4807e931448c05da34dd54fbd0308479" id="thread_title_111210">L'OPINION du mardi 20 janvier 2015</a>
        XXElement xe = xeHeader.XPathElement(".//div[@class='threadinfo']//a[@class='title']");
        header.Title = xe.XPathValue(".//text()");
        // url is the request url captured above
        header.UrlDetail = GetUrl(zurl.GetUrl(url, xe.XPathValue("@href")));

        headers.Add(header);
    }

    data.PostHeaders = headers.ToArray();
    return (IKeyData<int>)data;
}
/// <summary>
/// Builds an Ebookdz_HeaderPage from a loaded forum thread-list page: the next-page url and
/// one Ebookdz_PostHeader (title and detail url) per thread list item.
/// </summary>
/// <param name="loadDataFromWeb">Load result providing the parsed document, the request url and the load date.</param>
/// <returns>The header page cast to IKeyData of int; its Id is not set here.</returns>
public static IKeyData<int> GetForumHeaderPageData(LoadDataFromWeb_v4 loadDataFromWeb)
{
    XXElement xeSource = new XXElement(loadDataFromWeb.Http.zGetXDocument().Root);
    string url = loadDataFromWeb.WebRequest.HttpRequest.Url;

    Ebookdz_HeaderPage data = new Ebookdz_HeaderPage();
    data.SourceUrl = url;
    data.LoadFromWebDate = loadDataFromWeb.LoadFromWebDate;

    // next page link:
    // <div id="above_threadlist" class="above_threadlist">
    //   <div class="threadpagenav">
    //     <span class="prev_next">
    //       <a rel="next" href="forumdisplay.php?f=74&page=2&s=4807e931448c05da34dd54fbd0308479" title="Page suivante - Résultats de 21 à 40 sur 66">
    data.UrlNextPage = GetUrl(zurl.GetUrl(url, xeSource.XPathValue("//div[@id='above_threadlist']//span[@class='prev_next']//a[@rel='next']/@href")));

    // thread list:
    // <div id="threadlist" class="threadlist">
    //   <ol id="threads" class="threads">
    IEnumerable<XXElement> xeHeaders = xeSource.XPathElements("//div[@id='threadlist']//ol[@id='threads']/li");
    List<Ebookdz_PostHeader> headers = new List<Ebookdz_PostHeader>();
    foreach (XXElement xeHeader in xeHeaders)
    {
        Ebookdz_PostHeader header = new Ebookdz_PostHeader();
        header.SourceUrl = url;
        header.LoadFromWebDate = loadDataFromWeb.LoadFromWebDate;

        // <div class="threadinfo" title="">
        //   <div class="inner">
        //     <a title="" class="title" href="showthread.php?t=111210&s=4807e931448c05da34dd54fbd0308479" id="thread_title_111210">L'OPINION du mardi 20 janvier 2015</a>
        XXElement xe = xeHeader.XPathElement(".//div[@class='threadinfo']//a[@class='title']");
        header.Title = xe.XPathValue(".//text()");
        // url is the request url captured above
        header.UrlDetail = GetUrl(zurl.GetUrl(url, xe.XPathValue("@href")));

        headers.Add(header);
    }

    data.PostHeaders = headers.ToArray();
    return (IKeyData<int>)data;
}