/// <summary>
/// Manual smoke test: loads the page at a hard-coded local path through
/// <c>pb.old.Http_v2.LoadUrl</c>, locates the "maincont" div inside the
/// "lcolomn mainside" div and traces every descendant text of that element.
/// </summary>
public static void Test_XXElement_DescendantTextList_01()
{
    string pagePath = @"c:\pib\dev_data\exe\runsource\download\sites\rapide-ddl\cache\detail\39000\ebooks_magazine_39023-multi-lautomobile-no821-octobre-2014.html";
    pb.old.Http_v2.LoadUrl(pagePath);

    // Walk down to the content block step by step instead of one long call chain.
    XXElement documentRoot = new XXElement(pb.old.Http_v2.HtmlReader.XDocument.Root);
    XXElement mainColumn = documentRoot.XPathElement("//div[@class='lcolomn mainside']");
    XXElement mainContent = mainColumn.XPathElement(".//div[@class='maincont']");

    foreach (string text in mainContent.DescendantTexts())
    {
        Trace.WriteLine(text);
    }
}
/// <summary>
/// Advances <c>_xmlEnum</c> by one element and parses that element into a new
/// <c>Gesat_HeaderCompany</c> stored in <c>_header</c>.
/// </summary>
/// <returns>
/// <c>true</c> when another element was available and has been parsed;
/// <c>false</c> when <c>_xmlEnum</c> is exhausted (in which case <c>_header</c> is left untouched).
/// </returns>
public bool MoveNext()
{
    // The original "while" loop returned unconditionally on its first iteration,
    // so a guard clause expresses the same control flow.
    if (!_xmlEnum.MoveNext())
    {
        return(false);
    }

    XXElement xeHeader = _xmlEnum.Current;
    _header = new Gesat_HeaderCompany();
    _header.sourceUrl = _url;
    _header.loadFromWebDate = DateTime.Now;

    // Sample markup:
    // <span class="NOM"><a title="ESAT BETTY LAUNAY-MOULIN VERT" href="/Gesat/Hauts-de-Seine,92/Bois-Colombes,35494/esat-betty-launay-moulin-vert-competences-et-handicap-92,e1837/">ESAT BETTY LAUNAY-MOULIN VERT</a></span>
    XXElement xe = xeHeader.XPathElement(".//span[@class='NOM']//a");
    if (xe != null)
    {
        _header.url = GetUrl(xe.ExplicitXPathValue("@href"));
        _header.name = _trimFunc1(xe.ExplicitXPathValue(".//text()"));
    }

    // Sample markup: <span class="VILLE">E.S.A.T.<br />Bois-Colombes (92)</span>
    // First text is stored as the type, second text as the location.
    xe = xeHeader.XPathElement(".//span[@class='VILLE']");
    if (xe != null)
    {
        // IEnumerator<T> is IDisposable: dispose it deterministically instead of leaking it.
        using (IEnumerator<string> texts = xe.DescendantTexts().GetEnumerator())
        {
            if (texts.MoveNext())
            {
                _header.type = texts.Current.Trim();
            }
            else
            {
                Trace.CurrentTrace.WriteLine("error companyType not found");
            }

            if (texts.MoveNext())
            {
                _header.location = texts.Current.Trim();
            }
            else
            {
                Trace.CurrentTrace.WriteLine("error companyLocation not found");
            }
        }
    }

    // Sample markup: <span class="TELEPHONE">01 47 86 11 48</span>
    _header.phone = _trimFunc1(xeHeader.ExplicitXPathValue(".//span[@class='TELEPHONE']//text()"));

    // Sample markup:
    // <img info_bulle="Signataire de la charte Ethique et Valeurs" border="0" alt="/images/bullesGesat/pictoCharte.png" src="/images/bullesGesat/pictoCharte.png" style=" border: 0;" />
    // <img info_bulle="Lauréat des trophées HandiResponsables 2013" border="0" alt="/images/bullesGesat/LAURIERS-OR-2013.png" src="/images/bullesGesat/LAURIERS-OR-2013.png" style=" border: 0;" />
    _header.infos = xeHeader.XPathValues(".//img/@info_bulle").Select(_trimFunc1).ToArray();

    return(true);
}
/// <summary>
/// Builds a <c>RapideDdl_HeaderPage</c> from the XML document of <paramref name="request"/>:
/// the next-page URL plus one <c>RapideDdl_PostHeader</c> per "base shortstory" div.
/// </summary>
/// <param name="request">Request whose XML document and URL are read.</param>
/// <returns>The populated header page.</returns>
public static RapideDdl_HeaderPage LoadHeaderPageFromWeb(pb.Web.v1.RequestFromWeb_v2 request)
{
    XXElement pageRoot = new XXElement(request.GetXmlDocument().Root);
    string pageUrl = request.Url;

    RapideDdl_HeaderPage page = new RapideDdl_HeaderPage();
    page.urlNextPage = zurl.GetUrl(pageUrl, pageRoot.XPathValue("//div[@class='basenavi']//span[@class='nnext']//a/@href"));

    List<RapideDdl_PostHeader> postHeaders = new List<RapideDdl_PostHeader>();
    foreach (XXElement postElement in pageRoot.XPathElements("//div[@class='base shortstory']"))
    {
        RapideDdl_PostHeader post = new RapideDdl_PostHeader
        {
            sourceUrl = pageUrl,
            loadFromWebDate = DateTime.Now
        };

        // Link to the detail page (title extraction is intentionally not done here).
        XXElement titleLink = postElement.XPathElement(".//*[@class='shd']//a");
        post.urlDetail = zurl.GetUrl(pageUrl, titleLink.XPathValue("@href"));

        // Author (creation date extraction is intentionally not done here).
        XXElement infoBlock = postElement.XPathElement(".//div[@class='shdinf']");
        post.postAuthor = infoBlock.XPathValue(".//span[@class='arg']//a//text()");

        // Images and free text both come from the main content block.
        XXElement contentBlock = postElement.XPathElement(".//div[@class='maincont']");
        post.images = contentBlock
            .DescendantNodes(node => XmlDescendant.ImageFilter(node))
            .Select(imageNode => new pb.old.ImageHtml((XElement)imageNode, pageUrl))
            .ToList();
        post.SetTextValues(contentBlock.DescendantTexts());

        // Category: texts of the links in the "more" block, minus the comments link, joined with "/".
        XXElement moreBlock = postElement.XPathElement(".//div[@class='morelink']//span[@class='arg']");
        post.category = moreBlock
            .XPathElements(".//a")
            .DescendantTexts()
            .Select(Download.Print.RapideDdl.RapideDdl.TrimFunc1)
            .Where(label => !label.StartsWith("Commentaires"))
            .zToStringValues("/");

        postHeaders.Add(post);
    }

    page.postHeaders = postHeaders.ToArray();
    return(page);
}
/// <summary>
/// Parses a Gesat establishment list page into <c>_data</c>: the next-page URL and one
/// <c>Gesat_HeaderCompany</c> per div whose class starts with "ETABLISSEMENT STAR-".
/// </summary>
/// <param name="xelement">Root element of the page to parse.</param>
protected override void SetXml(XElement xelement)
{
    XXElement xeSource = new XXElement(xelement);
    _data = new Gesat_HeaderPage();

    // Sample markup:
    // <div class="PAGENAVIGLIST">
    //   <a href="/Gesat/EtablissementList-10-10.html" title="page suivante">></a>
    _data.urlNextPage = GetUrl(xeSource.XPathValue("//div[@class='PAGENAVIGLIST']//a[@title='page suivante']/@href"));

    // Sample markup: <div class="ETABLISSEMENT STAR-1 ODD"> <div class="ETABLISSEMENT STAR-0 ODD"> <div class="ETABLISSEMENT STAR-1 EVEN">
    IEnumerable<XXElement> xeHeaders = xeSource.XPathElements("//div[starts-with(@class, 'ETABLISSEMENT STAR-')]");
    List<Gesat_HeaderCompany> headers = new List<Gesat_HeaderCompany>();
    foreach (XXElement xeHeader in xeHeaders)
    {
        Gesat_HeaderCompany header = new Gesat_HeaderCompany();
        header.sourceUrl = _url;
        header.loadFromWebDate = DateTime.Now;

        // Sample markup:
        // <span class="NOM"><a title="ESAT BETTY LAUNAY-MOULIN VERT" href="/Gesat/Hauts-de-Seine,92/Bois-Colombes,35494/esat-betty-launay-moulin-vert-competences-et-handicap-92,e1837/">ESAT BETTY LAUNAY-MOULIN VERT</a></span>
        XXElement xe = xeHeader.XPathElement(".//span[@class='NOM']//a");
        if (xe != null)
        {
            header.url = GetUrl(xe.ExplicitXPathValue("@href"));
            header.name = _trimFunc1(xe.ExplicitXPathValue(".//text()"));
        }

        // Sample markup: <span class="VILLE">E.S.A.T.<br />Bois-Colombes (92)</span>
        // First text is stored as the type, second text as the location.
        xe = xeHeader.XPathElement(".//span[@class='VILLE']");
        if (xe != null)
        {
            // IEnumerator<T> is IDisposable: dispose it deterministically instead of leaking it.
            using (IEnumerator<string> texts = xe.DescendantTexts().GetEnumerator())
            {
                if (texts.MoveNext())
                {
                    header.type = texts.Current.Trim();
                }
                else
                {
                    Trace.CurrentTrace.WriteLine("error companyType not found");
                }

                if (texts.MoveNext())
                {
                    header.location = texts.Current.Trim();
                }
                else
                {
                    Trace.CurrentTrace.WriteLine("error companyLocation not found");
                }
            }
        }

        // Sample markup: <span class="TELEPHONE">01 47 86 11 48</span>
        header.phone = _trimFunc1(xeHeader.ExplicitXPathValue(".//span[@class='TELEPHONE']//text()"));

        // Sample markup:
        // <img info_bulle="Signataire de la charte Ethique et Valeurs" border="0" alt="/images/bullesGesat/pictoCharte.png" src="/images/bullesGesat/pictoCharte.png" style=" border: 0;" />
        // <img info_bulle="Lauréat des trophées HandiResponsables 2013" border="0" alt="/images/bullesGesat/LAURIERS-OR-2013.png" src="/images/bullesGesat/LAURIERS-OR-2013.png" style=" border: 0;" />
        header.infos = xeHeader.XPathValues(".//img/@info_bulle").Select(_trimFunc1).ToArray();

        headers.Add(header);
    }

    _data.headerCompanies = headers.ToArray();
}
/// <summary>
/// Builds a <c>RapideDdl_HeaderPage</c> from the XML document held by
/// <paramref name="loadDataFromWeb"/>: page identity, next-page URL and one
/// <c>RapideDdl_PostHeader</c> per "base shortstory" div.
/// </summary>
/// <param name="loadDataFromWeb">Source of the XML document, request URL and load date.</param>
/// <returns>The populated header page.</returns>
protected override RapideDdl_HeaderPage GetDataFromWeb(LoadDataFromWeb_v3 loadDataFromWeb)
{
    XXElement pageRoot = new XXElement(loadDataFromWeb.GetXmlDocument().Root);
    string pageUrl = loadDataFromWeb.request.Url;

    RapideDdl_HeaderPage page = new RapideDdl_HeaderPage
    {
        sourceUrl = pageUrl,
        loadFromWebDate = loadDataFromWeb.loadFromWebDate,
        id = RapideDdl_LoadHeaderPagesManager.GetHeaderPageKey(pageUrl),
        urlNextPage = zurl.GetUrl(pageUrl, pageRoot.XPathValue("//div[@class='basenavi']//span[@class='nnext']//a/@href"))
    };

    List<RapideDdl_PostHeader> postHeaders = new List<RapideDdl_PostHeader>();
    foreach (XXElement postElement in pageRoot.XPathElements("//div[@class='base shortstory']"))
    {
        RapideDdl_PostHeader post = new RapideDdl_PostHeader
        {
            sourceUrl = pageUrl,
            loadFromWebDate = loadDataFromWeb.loadFromWebDate
        };

        // Detail link and title; infos embedded in the title are split out when found.
        XXElement titleLink = postElement.XPathElement(".//*[@class='shd']//a");
        post.urlDetail = zurl.GetUrl(pageUrl, titleLink.XPathValue("@href"));
        post.title = titleLink.XPathValue(".//text()").Trim(DownloadPrint.TrimChars);
        PrintTitleInfos parsedTitle = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(post.title);
        if (parsedTitle.foundInfo)
        {
            post.title = parsedTitle.title;
            post.infos.SetValues(parsedTitle.infos);
        }

        // Author and creation date, e.g. "Aujourd'hui, 17:13".
        XXElement infoBlock = postElement.XPathElement(".//div[@class='shdinfo']");
        post.postAuthor = infoBlock.XPathValue(".//span[@class='arg']//a//text()");
        string rawDate = infoBlock.XPathValue(".//span[@class='date']//text()");
        post.creationDate = zdate.ParseDateTimeLikeToday(rawDate, loadDataFromWeb.loadFromWebDate, "d-M-yyyy, HH:mm", "d M yyyy", "d MMMM yyyy");
        if (post.creationDate == null)
        {
            pb.Trace.WriteLine("unknow date time \"{0}\"", rawDate);
        }
        if (__trace)
        {
            pb.Trace.WriteLine("creationDate {0} - \"{1}\"", post.creationDate, rawDate);
        }

        // Images of the main content block.
        XXElement contentBlock = postElement.XPathElement(".//div[@class='maincont']");
        post.images = contentBlock
            .DescendantNodes(node => XmlDescendant.ImageFilter(node))
            .Select(imageNode => new WebImage(zurl.GetUrl(pageUrl, imageNode.zAttribValue("src"))))
            .ToArray();

        // Description, language, size, page count and infos come from the content texts;
        // "a" elements are filtered with SkipNode, everything else with SelectNode.
        PrintTextValues_v1 parsedTexts = DownloadPrint.PrintTextValuesManager.GetTextValues_v1(
            contentBlock.DescendantTexts(
                node =>
                {
                    XElement element = node as XElement;
                    bool isAnchor = element != null && element.Name == "a";
                    return isAnchor ? XNodeFilter.SkipNode : XNodeFilter.SelectNode;
                }),
            post.title);
        post.description = parsedTexts.description;
        post.language = parsedTexts.language;
        post.size = parsedTexts.size;
        post.nbPages = parsedTexts.nbPages;
        post.infos.SetValues(parsedTexts.infos);

        // Category: texts of the links in the "more" block, minus the comments link, joined with "/".
        XXElement moreBlock = postElement.XPathElement(".//div[@class='morelink']//span[@class='arg']");
        post.category = moreBlock
            .XPathElements(".//a")
            .DescendantTexts()
            .Select(DownloadPrint.Trim)
            .Where(label => !label.StartsWith("Commentaires"))
            .zToStringValues("/");

        postHeaders.Add(post);
    }

    page.postHeaders = postHeaders.ToArray();
    return(page);
}
/// <summary>
/// Parses every "ctn_result" div of the current document into a <c>Unea_HeaderCompany</c>:
/// detail URLs and name from the result header, then labelled values
/// (activities, location, phone, fax, e-mail) from the result content.
/// </summary>
/// <returns>One entry per "ctn_result" div, in document order.</returns>
protected override Unea_HeaderCompany[] GetData()
{
    XXElement documentRoot = new XXElement(GetXmlDocument().Root);
    string pageUrl = Url;
    StringComparison ignoreCase = StringComparison.InvariantCultureIgnoreCase;

    List<Unea_HeaderCompany> companies = new List<Unea_HeaderCompany>();

    // Sample markup: <div class="ctn_result">
    foreach (XXElement resultElement in documentRoot.XPathElements("//div[@class = 'ctn_result']"))
    {
        Unea_HeaderCompany company = new Unea_HeaderCompany
        {
            sourceUrl = pageUrl,
            loadFromWebDate = DateTime.Now
        };

        // Sample markup: <div class="ctn_result-header">
        XXElement headerElement = resultElement.XPathElement(".//div[@class='ctn_result-header']");

        // Sample markup: <div class="lien"><a href="http://www.unea.fr/.../ACCAA TAKTIM.htm" target="_blank"><strong>></strong> Voir la fiche détaillée</a></div>
        company.urlDetail2 = zurl.GetUrl(pageUrl, headerElement.ExplicitXPathValue(".//div[@class = 'lien']//a/@href"));

        // Sample markup: <iframe src="detail.asp?id=4583" width="420" height="800" ...></iframe>
        company.urlDetail1 = zurl.GetUrl(pageUrl, headerElement.ExplicitXPathValue(".//iframe/@src"));

        // Sample markup: <h4><a href="..." target="_blank"> </a><span>|</span> ACCAA TAKTIM</h4>
        // The name is the last trimmed text of the header.
        company.name = headerElement.DescendantTexts().Select(__trimFunc2).LastOrDefault();

        // Sample markup of the content block:
        // <div class="ctn_result-content clearfix">
        //   <p>
        //     <strong>Activités:</strong> TRAVAUX PAYSAGERS<br>PROPRETE<br>...<br />
        //     <strong>Région - Département:</strong> Alsace - HAUT RHIN (68)<br />
        //     <strong>Téléphone:</strong> 0389570210
        //     <strong>Fax:</strong> 0389571761
        //     <strong>Adresse e-mail:</strong> <a href="mailto:...">...</a>
        //   </p>
        // </div>
        IEnumerable<string> contentTexts = resultElement
            .XPathElements(".//div[@class = 'ctn_result-content clearfix']")
            .DescendantTexts()
            .Select(__trimFunc2);

        // A label text selects which field the following text(s) belong to.
        Unea_TextType pending = Unea_TextType.unknow;
        foreach (string text in contentTexts)
        {
            if (text.Equals("Activités", ignoreCase))
            {
                pending = Unea_TextType.activity;
                continue;
            }
            if (text.Equals("Région - Département", ignoreCase))
            {
                pending = Unea_TextType.location;
                continue;
            }
            if (text.Equals("Téléphone", ignoreCase))
            {
                pending = Unea_TextType.phone;
                continue;
            }
            if (text.Equals("Fax", ignoreCase))
            {
                pending = Unea_TextType.fax;
                continue;
            }
            if (text.Equals("Adresse e-mail", ignoreCase))
            {
                pending = Unea_TextType.email;
                continue;
            }

            // Not a label: store the value under the pending field.
            switch (pending)
            {
                case Unea_TextType.activity:
                    // Activities accumulate until the next label; duplicates are ignored.
                    if (!company.activities.ContainsKey(text))
                    {
                        company.activities.Add(text, null);
                    }
                    break;

                case Unea_TextType.location:
                    company.location = text;
                    pending = Unea_TextType.unknow;
                    break;

                case Unea_TextType.phone:
                    company.phone = text;
                    pending = Unea_TextType.unknow;
                    break;

                case Unea_TextType.fax:
                    company.fax = text;
                    pending = Unea_TextType.unknow;
                    break;

                case Unea_TextType.email:
                    company.email = text;
                    pending = Unea_TextType.unknow;
                    break;

                default:
                    company.unknowInfos.Add(text);
                    break;
            }
        }

        companies.Add(company);
    }

    return(companies.ToArray());
}
/// <summary>
/// Parses an Ebookdz post detail page into an <c>Ebookdz_PostDetail</c>: title, category,
/// print type, creation date, author, images, text values and download links.
/// </summary>
/// <param name="loadDataFromWeb">Source of the XML document, HTTP request and load date.</param>
/// <returns>The populated post detail.</returns>
protected override IPost GetData(LoadDataFromWeb_v4 loadDataFromWeb)
{
    XXElement xeSource = new XXElement(loadDataFromWeb.Http.zGetXDocument().Root);

    Ebookdz_PostDetail data = new Ebookdz_PostDetail();
    data.SourceUrl = loadDataFromWeb.WebRequest.HttpRequest.Url;
    data.LoadFromWebDate = loadDataFromWeb.LoadFromWebDate;
    data.Id = GetPostDetailKey(loadDataFromWeb.WebRequest.HttpRequest);

    // Sample markup: <div class="body_bd">
    XXElement xePost = xeSource.XPathElement("//div[@class='body_bd']");

    // Sample title: Le Monde + Magazine + 2 suppléments du samedi 03 janvier 2015
    data.Title = xePost.XPathValue(".//div[@id='pagetitle']//a//text()").Trim(DownloadPrint.TrimChars);
    PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.Title);
    if (titleInfos.foundInfo)
    {
        data.OriginalTitle = data.Title;
        data.Title = titleInfos.title;
        data.Infos.SetValues(titleInfos.infos);
    }

    // Sample breadcrumb: Forum / Journaux / Presse quotidienne / Le Monde / Journal Le Monde + Magazine + 2 suppléments du samedi 03 janvier 2015
    // The category is the breadcrumb without "forum" and without the crumb that repeats the title.
    string lowerTitle = null;
    if (data.Title != null)
    {
        lowerTitle = data.Title.ToLowerInvariant();
    }
    // Fix: string.EndsWith(null) throws ArgumentNullException and EndsWith("") is true for
    // every string, so the "repeats the title" test is only applied when there is a title.
    bool hasTitle = !string.IsNullOrEmpty(lowerTitle);
    data.Category = xePost.XPathElements(".//div[@id='breadcrumb']//a").DescendantTexts()
        .Where(text =>
        {
            string lowerText = text.ToLowerInvariant();
            return(lowerText != "forum" && (!hasTitle || !lowerText.EndsWith(lowerTitle)));
        })
        .Select(DownloadPrint.Trim)
        .zToStringValues("/");
    string category = data.Category.ToLowerInvariant();
    data.PrintType = GetPrintType(category);

    // Sample markup: <div id="postlist" class="postlist restrain">
    XXElement xe = xePost.XPathElement(".//div[@id='postlist']");

    // Sample dates: Aujourd'hui, 07h32 - Aujourd'hui, 10h51 - Hier, 12h55 - 22/02/2014, 21h09
    // Texts under "a" nodes are filtered with SkipNode; the remaining texts are concatenated.
    XXElement xe2 = xe.XPathElement(".//div[@class='posthead']");
    string date = xe2.DescendantTexts(node => node.zGetName() != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode).zToStringValues("");
    // Replace non-breaking spaces so the date formats below can match.
    date = date.Replace('\xA0', ' ');
    data.PostCreationDate = zdate.ParseDateTimeLikeToday(date, loadDataFromWeb.LoadFromWebDate, @"d/M/yyyy, HH\hmm", @"d-M-yyyy, HH\hmm");
    if (data.PostCreationDate == null)
    {
        pb.Trace.WriteLine("unknow post creation date \"{0}\"", date);
    }
    if (__trace)
    {
        pb.Trace.WriteLine("post creation date {0} - \"{1}\"", data.PostCreationDate, date);
    }

    data.PostAuthor = xe.XPathValue(".//div[@class='userinfo']//a//text()").Trim(DownloadPrint.TrimChars);

    // Sample markup: <div class="postbody">
    xe = xePost.XPathElement(".//div[@class='postbody']//div[@class='content']//blockquote/div");

    data.Images = xe.DescendantNodes(node => XmlDescendant.ImageFilter(node)).Select(xeImg => new WebImage(zurl.GetUrl(data.SourceUrl, xeImg.zAttribValue("src")))).ToArray();

    // Force image loading to get image width and height.
    if (loadDataFromWeb.WebRequest.LoadImage)
    {
        data.Images = DownloadPrint.LoadImages(data.Images).ToArray();
    }

    // Get infos, description, language, size, nbPages from the post texts ("a" elements filtered with SkipNode).
    PrintTextValues_v1 textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_v1(
        xe.DescendantTexts(node => !(node is XElement) || ((XElement)node).Name != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode),
        data.Title);
    data.Description = textValues.description;
    data.Language = textValues.language;
    data.Size = textValues.size;
    data.NbPages = textValues.nbPages;
    data.Infos.SetValues(textValues.infos);

    data.DownloadLinks = xe.XPathValues(".//a/@href").ToArray();

    if (__trace)
    {
        pb.Trace.WriteLine(data.zToJson());
    }

    return(data);
}
/// <summary>
/// Parses a telecharger-magazine detail page into a <c>TelechargerMagazine_PostDetail</c>:
/// category, print type, title, first image, description, infos and download links.
/// </summary>
/// <param name="webResult">Source of the XML document, HTTP request and load date.</param>
/// <returns>The populated post detail.</returns>
protected override TelechargerMagazine_PostDetail GetDetailData(WebResult webResult)
{
    XXElement xeSource = webResult.Http.zGetXDocument().zXXElement();

    TelechargerMagazine_PostDetail data = new TelechargerMagazine_PostDetail();
    data.SourceUrl = webResult.WebRequest.HttpRequest.Url;
    data.LoadFromWebDate = webResult.LoadFromWebDate;
    data.Id = GetDetailKey(webResult.WebRequest.HttpRequest);

    // The only date on the page is today's date (calendar widget), so no date is extracted:
    // <div id="calendar-layer">
    //   <table id="calendar" cellpadding="3" class="calendar">
    //     ...
    //     <td class="day-active-v day-current" ><a class="day-active-v" href="http://www.telecharger-magazine.com/2015/07/17/" title="Article posté dans 17 Juillet 2015">17</a></td>
    //     ...
    //   </table>
    // </div>
    //
    // Post markup:
    // <div id='dle-content'>
    //   ...
    //   <div class="right-full">
    //     <div class="cat_name">
    //       Posted in:
    //       <a href="http://www.telecharger-magazine.com/journaux/">Journaux</a>
    //     </div>
    //     <h2 class="title">
    //       <img src="/templates/MStarter/images/title.png" alt="" class="img" />
    //       Journaux Français Du 17 Juillet 2015
    //     </h2>
    //     <div class="contenttext">
    XXElement xePost = xeSource.XPathElement("//div[@id='dle-content']//div[@class='right-full']");

    // Sample category: Journaux
    data.Category = xePost.XPathValues(".//div[@class='cat_name']//a/text()").Select(DownloadPrint.Trim).zToStringValues("/");
    data.PrintType = GetPrintType(data.Category);

    data.Title = xePost.XPathValue(".//h2[@class='title']//text()").zFunc(DownloadPrint.ReplaceChars).zFunc(DownloadPrint.Trim);
    PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.Title);
    if (titleInfos.foundInfo)
    {
        data.OriginalTitle = data.Title;
        data.Title = titleInfos.title;
        data.Infos.SetValues(titleInfos.infos);
    }

    XXElement xeContent = xePost.XPathElement(".//div[@class='contenttext']");

    // Only the first image of the content block is kept.
    data.Images = new WebImage[] { new WebImage(zurl.GetUrl(data.SourceUrl, xeContent.XPathValue(".//img/@src"))) };

    // Get infos and description from the content texts.
    PrintTextValues textValues = DownloadPrint.PrintTextValuesManager.GetTextValues(
        xeContent.DescendantTexts(
            node =>
            {
                // A text node that is just the "description" heading is not selected.
                if (node is XText)
                {
                    string text = ((XText)node).Value.Trim();
                    if (text.ToLowerInvariant() == "description")
                    {
                        return(XNodeFilter.DontSelectNode);
                    }
                }
                // The first "a" element yields XNodeFilter.Stop
                // (presumably ends the traversal - confirm against XNodeFilter).
                if (node is XElement)
                {
                    XElement xe = (XElement)node;
                    if (xe.Name == "a")
                    {
                        return(XNodeFilter.Stop);
                    }
                }
                return(XNodeFilter.SelectNode);
            }
        ).Select(DownloadPrint.ReplaceChars).Select(DownloadPrint.TrimWithoutColon),
        data.Title,
        extractValuesFromText: false);
    data.Description = textValues.description;
    data.Infos.SetValues(textValues.infos);

    // Fix: the original first computed DownloadLinks with a DescendantNodes walk that stopped at
    // <p class="submeta"> and read Attribute("href").Value (NullReferenceException on an "a"
    // without href), then immediately overwrote the result with the XPath query below.
    // Only the effective assignment is kept.
    // NOTE(review): if links after <p class="submeta"> must be excluded, that filter has to be
    // reintroduced here - confirm the intended behaviour.
    data.DownloadLinks = xeContent.XPathValues(".//a/@href").ToArray();

    if (__trace)
    {
        pb.Trace.WriteLine(data.zToJson());
    }

    return(data);
}
/// <summary>
/// Parses a Unea company detail page into a <c>Unea_DetailCompany2</c>: name and presentation
/// from the leading texts of the article, labelled values and downloadable documents from the
/// tables, and photo URLs from the table images.
/// </summary>
/// <returns>The populated company detail.</returns>
protected override Unea_DetailCompany2 GetData()
{
    XXElement xeSource = new XXElement(GetXmlDocument().Root);

    Unea_DetailCompany2 data = new Unea_DetailCompany2();
    data.sourceUrl = Url;
    data.loadFromWebDate = DateTime.Now;

    // Sample markup: <div class='ctn_content-article'>
    XXElement xeContent = xeSource.XPathElement(".//div[@class='ctn_content-article']");

    // Leading texts of the article; "script" and "table" elements are filtered with SkipNode.
    // IEnumerator<T> is IDisposable: dispose it deterministically instead of leaking it.
    using (IEnumerator<string> texts = xeContent.DescendantTexts(
        node => !(node is XElement) || (((XElement)node).Name != "script" && ((XElement)node).Name != "table")
            ? XNodeFilter.SelectNode
            : XNodeFilter.SkipNode).Select(__trimFunc2).GetEnumerator())
    {
        // Sample markup:
        // <h1>
        //   <img src="http://unea.griotte.biz/BaseDocumentaire/Docs/Public/4017/LOGOAmpouleC.JPG" style='border-width:2px;border-color:#5593C9;' height='60px' />
        //   <span>Entreprise Adaptée</span><br />
        //   ALSACE ENTREPRISE ADAPTEE
        // </h1>
        // The first text is skipped; the second one is the name.
        if (texts.MoveNext() && texts.MoveNext())
        {
            data.name = texts.Current;
        }

        // Sample markup: <h2>ALSACE ENTREPRISE ADAPTEE est implantée sur les sites de Colmar et Mulhouse ...</h2>
        if (texts.MoveNext())
        {
            data.presentation = texts.Current;
        }
    }

    // A label text selects which field the following text(s) belong to.
    // NOTE(review): Unea_TextType.novalue has no case in the switch below, so texts that follow
    // "NOUS CONTACTER" / "QUI SOMMES NOUS" fall to "default" and are stored in unknowInfos,
    // unlike Unea_TextType.novalues which discards them - confirm this is intended.
    Unea_TextType textType = Unea_TextType.unknow;
    foreach (XText xtext in xeContent.XPathElements(".//table").DescendantTextNodes())
    {
        string text = __trimFunc2(xtext.Value);
        if (text == "")
        {
            continue;
        }

        if (text.Equals("NOS ACTIVITES", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.activity;
        }
        else if (text.Equals("FILIERES METIER UNEA", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.sector;
        }
        else if (text.Equals("DOCUMENTS TÉLÉCHARGEABLES", StringComparison.InvariantCultureIgnoreCase))
        {
            // Documents are the links of the list(s) following the label's parent element.
            foreach (XXElement xe2 in new XXElement(xtext.Parent).XPathElements("following-sibling::ul//a"))
            {
                string url = xe2.XPathValue("@href");
                string name = __trimFunc2(xe2.XPathValue(".//text()"));
                if (!data.downloadDocuments.ContainsKey(url))
                {
                    data.downloadDocuments.Add(url, new Unea_Document() { name = name, url = url });
                }
                else
                {
                    Trace.CurrentTrace.WriteLine("warning download document already exists \"{0}\" \"{1}\"", name, url);
                }
            }
            // textType = novalues so that the document names (e.g. Plaquette_AEA.pdf) do not end up in unknowInfos.
            textType = Unea_TextType.novalues;
        }
        else if (text.Equals("NOUS CONTACTER", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.novalue;
        }
        else if (text.Equals("ADRESSE", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.address;
        }
        else if (text.Equals("TELEPHONE", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.phone;
        }
        else if (text.Equals("FAX", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.fax;
        }
        else if (text.Equals("EMAIL", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.email;
        }
        else if (text.Equals("SITE", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.webSite;
        }
        else if (text.Equals("QUI SOMMES NOUS", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.novalue;
        }
        else if (text.Equals("DIRIGEANT", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.leader;
        }
        else if (text.Equals("NOMBRE DE SALARIÉS", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.employeNumber;
        }
        else if (text.Equals("CHIFFRE D'AFFAIRE DE L'ANNÉE ÉCOULÉE", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.lastYearRevenue;
        }
        else if (text.Equals("NUMÉRO SIRET", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.siret;
        }
        else if (text.Equals("CERTIFICATION", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.certification;
        }
        else if (text.Equals("PRINCIPAUX CLIENTS", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.client;
        }
        else
        {
            // Not a label: store the value under the current field.
            switch (textType)
            {
                case Unea_TextType.activity:
                    if (!data.activities.ContainsKey(text))
                    {
                        data.activities.Add(text, null);
                    }
                    else
                    {
                        Trace.CurrentTrace.WriteLine("warning activity already exists \"{0}\"", text);
                    }
                    break;

                case Unea_TextType.sector:
                    if (!data.sectors.ContainsKey(text))
                    {
                        data.sectors.Add(text, null);
                    }
                    else
                    {
                        Trace.CurrentTrace.WriteLine("warning sector already exists \"{0}\"", text);
                    }
                    break;

                case Unea_TextType.address:
                    // The address may span several texts; they are joined with a space.
                    if (data.address == null)
                    {
                        data.address = text;
                    }
                    else
                    {
                        data.address += " " + text;
                    }
                    break;

                case Unea_TextType.phone:
                    data.phone = text;
                    textType = Unea_TextType.unknow;
                    break;

                case Unea_TextType.fax:
                    data.fax = text;
                    textType = Unea_TextType.unknow;
                    break;

                case Unea_TextType.email:
                    data.email = text;
                    textType = Unea_TextType.unknow;
                    break;

                case Unea_TextType.webSite:
                    data.webSite = text;
                    textType = Unea_TextType.unknow;
                    break;

                case Unea_TextType.leader:
                    data.leader = text;
                    textType = Unea_TextType.unknow;
                    break;

                case Unea_TextType.employeNumber:
                    int employeNumber;
                    if (int.TryParse(text, out employeNumber))
                    {
                        data.employeNumber = employeNumber;
                    }
                    else
                    {
                        Trace.CurrentTrace.WriteLine("error unknow employe number \"{0}\"", text);
                    }
                    textType = Unea_TextType.unknow;
                    break;

                case Unea_TextType.lastYearRevenue:
                    // A lone currency sign means no revenue was given.
                    if (text != "€")
                    {
                        data.lastYearRevenue = text;
                    }
                    textType = Unea_TextType.unknow;
                    break;

                case Unea_TextType.siret:
                    data.siret = text;
                    textType = Unea_TextType.unknow;
                    break;

                case Unea_TextType.certification:
                    data.certification = text;
                    textType = Unea_TextType.unknow;
                    break;

                case Unea_TextType.client:
                    data.clients = text;
                    textType = Unea_TextType.unknow;
                    break;

                case Unea_TextType.novalues:
                    // Values are deliberately discarded.
                    break;

                default:
                    data.unknowInfos.Add(text);
                    break;
            }
        }
    }

    // Photos: images wrapped in a link inside a table cell, de-duplicated by URL.
    foreach (XXElement xe in xeContent.XPathElements(".//table//td/a/img"))
    {
        string url = xe.XPathValue("@src");
        if (!data.photos.ContainsKey(url))
        {
            data.photos.Add(url, null);
        }
        else
        {
            Trace.CurrentTrace.WriteLine("warning photo already exists \"{0}\"", url);
        }
    }

    return(data);
}
/// <summary>
/// Builds a <c>RapideDdl_PostDetail</c> from the post page carried by <paramref name="loadDataFromWeb"/>.
/// Fills category, print type, title (plus infos extracted from it), creation date, author,
/// images, text values (description, language, size, page count, infos) and download links.
/// </summary>
/// <param name="loadDataFromWeb">Loaded page: provides the XML document, the request (Url, LoadImage) and the load date.</param>
/// <returns>The populated post detail.</returns>
protected override RapideDdl_PostDetail GetDataFromWeb(LoadDataFromWeb_v3 loadDataFromWeb)
{
    // The document is obtained first, before any other member of loadDataFromWeb is read.
    XXElement document = new XXElement(loadDataFromWeb.GetXmlDocument().Root);

    RapideDdl_PostDetail data = new RapideDdl_PostDetail();
    data.sourceUrl = loadDataFromWeb.request.Url;
    data.loadFromWebDate = loadDataFromWeb.loadFromWebDate;
    data.id = GetPostDetailKey(data.sourceUrl);

    XXElement postElement = document.XPathElement("//div[@class='lcolomn mainside']");

    // Category: texts of the links of the 'spbar' div joined with "/",
    // leaving out empty texts and the home link (both spellings "acceuil" / "accueil").
    data.category = postElement.XPathElements(".//div[@class='spbar']//a")
        .DescendantTexts()
        .Select(DownloadPrint.Trim)
        .Where(text =>
        {
            string lowered = text.ToLowerInvariant();
            return !(lowered == "" || lowered.Contains("acceuil") || lowered.Contains("accueil"));
        })
        .zToStringValues("/");
    data.printType = GetPostType(data.category.ToLowerInvariant());

    // Title: last text found in the 'spbar' div.
    data.title = postElement.XPathElements(".//div[@class='spbar']")
        .DescendantTexts()
        .Select(DownloadPrint.Trim)
        .LastOrDefault();
    PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.title);
    if (titleInfos.foundInfo)
    {
        // keep the raw title and replace it with the cleaned one
        data.originalTitle = data.title;
        data.title = titleInfos.title;
        data.infos.SetValues(titleInfos.infos);
    }

    // Creation date and author come from the 'shdinfo' div.
    XXElement infoElement = postElement.XPathElement(".//div[@class='shdinfo']");
    string date = infoElement.XPathValue(".//span[@class='date']//text()");
    data.creationDate = zdate.ParseDateTimeLikeToday(date, loadDataFromWeb.loadFromWebDate, "d-M-yyyy, HH:mm", "d M yyyy", "d MMMM yyyy");
    if (data.creationDate == null)
    {
        pb.Trace.WriteLine("unknow date time \"{0}\"", date);
    }
    if (__trace)
    {
        pb.Trace.WriteLine("creationDate {0} - \"{1}\"", data.creationDate, date);
    }
    data.postAuthor = infoElement.XPathValue(".//span[@class='arg']//a//text()");

    // Images, text values and download links come from the 'maincont' div.
    XXElement contentElement = postElement.XPathElement(".//div[@class='maincont']");
    data.images = contentElement
        .DescendantNodes(node => XmlDescendant.ImageFilter(node))
        .Select(imageNode => new WebImage(zurl.GetUrl(loadDataFromWeb.request.Url, imageNode.zAttribValue("src"))))
        .ToArray();
    if (loadDataFromWeb.request.LoadImage)
    {
        data.images = DownloadPrint.LoadImages(data.images).ToArray();
    }

    // Text values are read from every descendant text except those located under an <a> element.
    PrintTextValues_v1 textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_v1(
        contentElement.DescendantTexts(node => (node is XElement && ((XElement)node).Name == "a") ? XNodeFilter.SkipNode : XNodeFilter.SelectNode),
        data.title);
    data.description = textValues.description;
    data.language = textValues.language;
    data.size = textValues.size;
    data.nbPages = textValues.nbPages;
    data.infos.SetValues(textValues.infos);

    // Download links: every href under "div/div", except links to image hosts or image files, for example
    //   http://prezup.eu  http://pixhst.com/avaxhome/27/36/002e3627.jpeg  http://www.zupmage.eu/i/R1UgqdXn4F.jpg
    //   http://i.imgur.com/Gu7hagN.jpg  http://img11.hostingpics.net/pics/591623liens.png  http://www.hapshack.com/images/jUfTZ.gif
    //   http://pixhst.com/pictures/3029467
    data.downloadLinks = contentElement.XPathElements("div/div")
        .SelectMany(linkBlock => linkBlock.XPathValues(".//a/@href"))
        .Where(link => !(link.StartsWith("http://prezup.eu")
            || link.StartsWith("http://pixhst.com")
            || link.EndsWith(".jpg")
            || link.EndsWith("jpeg")
            || link.EndsWith("png")
            || link.EndsWith("gif")))
        .ToArray();

    return data;
}
/// <summary>
/// Builds a <c>Telechargementz_PostDetail</c> from the post page carried by <paramref name="loadDataFromWeb"/>.
/// Fills author, creation date, title (plus infos extracted from it), first image, download links,
/// text values (description, language, size, page count, infos) and the print type.
/// </summary>
/// <param name="loadDataFromWeb">Loaded page: provides the XML document, the request (Url, LoadImage) and the load date.</param>
/// <returns>The populated post detail.</returns>
protected override IPost GetDataFromWeb(LoadDataFromWeb_v3 loadDataFromWeb)
{
    XXElement xeSource = new XXElement(loadDataFromWeb.GetXmlDocument().Root);

    Telechargementz_PostDetail data = new Telechargementz_PostDetail();
    data.SourceUrl = loadDataFromWeb.request.Url;
    data.LoadFromWebDate = loadDataFromWeb.loadFromWebDate;
    data.Id = GetPostDetailKey(data.SourceUrl);

    XXElement xePost = xeSource.XPathElement("//div[@id='dle-content']");

    data.PostAuthor = xePost.XPathValue(".//div[@class='title-info']//a//text()");

    // The text node following the author link holds the date, for example ", 26.12.14".
    string date = xePost.XPathValue(".//div[@class='title-info']//a/following-sibling::text()");
    if (date != null)
    {
        // strip the leading ", " before parsing
        data.PostCreationDate = zdate.ParseDateTimeLikeToday(date.Trim(' ', ','), loadDataFromWeb.loadFromWebDate, "dd.MM.yy");
        if (data.PostCreationDate == null)
        {
            pb.Trace.WriteLine("unknow date time \"{0}\"", date);
        }
        if (__trace)
        {
            pb.Trace.WriteLine("creationDate {0} - \"{1}\"", data.PostCreationDate, date);
        }
    }
    else
    {
        pb.Trace.WriteLine("creationDate not found \"{0}\"", data.SourceUrl);
    }

    // Title: first text found in the 'post-title' div.
    data.Title = xePost.XPathElement(".//div[@class='post-title']").DescendantTexts().Select(DownloadPrint.Trim).FirstOrDefault();
    PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.Title);
    if (titleInfos.foundInfo)
    {
        // keep the raw title and replace it with the cleaned one
        data.OriginalTitle = data.Title;
        data.Title = titleInfos.title;
        data.infos.SetValues(titleInfos.infos);
    }

    // Post body. A missing body is only traced; processing continues with the returned wrapper.
    XXElement xe = xePost.XPathElement(".//div[starts-with(@id, 'news-id-')]");
    if (xe.XElement == null)
    {
        pb.Trace.WriteLine("element not found \".//div[starts-with(@id, 'news-id-')]\"");
    }

    // Only the first image of the post body is kept.
    WebImage image = xe.DescendantNodes(node => XmlDescendant.ImageFilter(node))
        .Select(xeImg => new WebImage(zurl.GetUrl(loadDataFromWeb.request.Url, xeImg.zAttribValue("src"))))
        .FirstOrDefault();
    if (image != null)
    {
        // fixed: the assignment was missing its statement terminator inside the block
        data.Images = new WebImage[] { image };
    }

    // force load image to get image width and height
    // NOTE(review): when no image was found data.Images keeps its default value and is passed
    // to LoadImages as is - confirm that LoadImages accepts that value.
    if (loadDataFromWeb.request.LoadImage)
    {
        data.Images = DownloadPrint.LoadImages(data.Images).ToArray();
    }

    data.DownloadLinks = xe.XPathValues(".//a/@href").ToArray();

    // get infos, description, language, size, nbPages
    // node filter: every descendant text except those located under an <a> element
    PrintTextValues_v1 textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_v1(
        xe.DescendantTexts(node => !(node is XElement) || ((XElement)node).Name != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode),
        data.Title);
    data.description = textValues.description;
    data.language = textValues.language;
    data.size = textValues.size;
    data.nbPages = textValues.nbPages;
    data.infos.SetValues(textValues.infos);

    // Print type is deduced from the extracted infos; first matching rule wins.
    data.PrintType = PrintType.UnknowEBook;
    if (data.infos.ContainsKey("Bd") || data.infos.ContainsKey("bd") || data.infos.ContainsKey("BD"))
    {
        data.PrintType = PrintType.Comics;
    }
    // Editeur : Presse fr
    else if (data.infos.ContainsKey("editeur") && data.infos["editeur"] is ZString && ((string)data.infos["editeur"]).ToLowerInvariant() == "presse fr")
    {
        data.PrintType = PrintType.Print;
    }
    else if (data.infos.ContainsKey("isbn"))
    {
        data.PrintType = PrintType.Book;
    }

    if (__trace)
    {
        pb.Trace.WriteLine(data.zToJson());
    }

    return data;
}
/// <summary>
/// Header get data, from WebHeaderDetailMongoManagerBase_v2&lt;THeaderData, TDetailData&gt;.
/// Reads one Gesat establishment list page: the url of the next page and one
/// <c>Gesat_Header_v2</c> per establishment block (detail url, name, type, location, phone, infos).
/// </summary>
/// <param name="httpResult">Http result of the list page; provides the document, the request and the request time.</param>
/// <returns>The page data holding the headers and the next page url.</returns>
protected override IEnumDataPages<Gesat_Header_v2> GetHeaderPageData(HttpResult<string> httpResult)
{
    XXElement xeSource = httpResult.zGetXDocument().zXXElement();
    string url = httpResult.Http.HttpRequest.Url;

    Gesat_HeaderDataPages data = new Gesat_HeaderDataPages();
    data.SourceUrl = url;
    data.LoadFromWebDate = httpResult.Http.RequestTime;
    data.Id = GetPageKey(httpResult.Http.HttpRequest);

    // <div class="PAGENAVIGLIST">
    //   <a href="/Gesat/EtablissementList-10-10.html" title="page suivante">></a>
    data.UrlNextPage = zurl.GetUrl(url, xeSource.XPathValue("//div[@class='PAGENAVIGLIST']//a[@title='page suivante']/@href"));

    // <div class="ETABLISSEMENT STAR-1 ODD"> <div class="ETABLISSEMENT STAR-0 ODD"> <div class="ETABLISSEMENT STAR-1 EVEN">
    IEnumerable<XXElement> xeHeaders = xeSource.XPathElements("//div[starts-with(@class, 'ETABLISSEMENT STAR-')]");
    List<Gesat_Header_v2> headers = new List<Gesat_Header_v2>();
    foreach (XXElement xeHeader in xeHeaders)
    {
        Gesat_Header_v2 header = new Gesat_Header_v2();
        header.SourceUrl = url;
        header.LoadFromWebDate = DateTime.Now;

        // <span class="NOM"><a title="ESAT BETTY LAUNAY-MOULIN VERT" href="/Gesat/Hauts-de-Seine,92/Bois-Colombes,35494/esat-betty-launay-moulin-vert-competences-et-handicap-92,e1837/">ESAT BETTY LAUNAY-MOULIN VERT</a></span>
        XXElement xe = xeHeader.XPathElement(".//span[@class='NOM']//a");
        if (xe != null)
        {
            header.UrlDetail = zurl.GetUrl(url, xe.ExplicitXPathValue("@href"));
            header.Name = _trimFunc(xe.ExplicitXPathValue(".//text()"));
        }

        // <span class="VILLE">E.S.A.T.<br />Bois-Colombes (92)</span>
        // first text is the establishment type, second text is its location
        xe = xeHeader.XPathElement(".//span[@class='VILLE']");
        if (xe != null)
        {
            // IEnumerator<T> is IDisposable: release the enumerator once both texts have been read
            using (IEnumerator<string> texts = xe.DescendantTexts().GetEnumerator())
            {
                if (texts.MoveNext())
                {
                    header.Type = texts.Current.Trim();
                }
                else
                {
                    Trace.WriteLine("error companyType not found");
                }

                if (texts.MoveNext())
                {
                    header.Location = texts.Current.Trim();
                }
                else
                {
                    Trace.WriteLine("error companyLocation not found");
                }
            }
        }

        // <span class="TELEPHONE">01 47 86 11 48</span>
        header.Phone = _trimFunc(xeHeader.ExplicitXPathValue(".//span[@class='TELEPHONE']//text()"));

        // <img info_bulle="Signataire de la charte Ethique et Valeurs" border="0" alt="/images/bullesGesat/pictoCharte.png" src="/images/bullesGesat/pictoCharte.png" style=" border: 0;" />
        // <img info_bulle="Lauréat des trophées HandiResponsables 2013" border="0" alt="/images/bullesGesat/LAURIERS-OR-2013.png" src="/images/bullesGesat/LAURIERS-OR-2013.png" style=" border: 0;" />
        header.Infos = xeHeader.XPathValues(".//img/@info_bulle").Select(_trimFunc).ToArray();

        headers.Add(header);
    }

    data.Data = headers.ToArray();
    return data;
}
/// <summary>
/// Builds a <c>GoldenDdl_PostDetail</c> from the post page carried by <paramref name="loadDataFromWeb"/>.
/// Fills category, print type, title (plus infos extracted from it), creation date, author,
/// images, text values (description, language, size, page count, infos) and download links.
/// </summary>
/// <param name="loadDataFromWeb">Loaded page: provides the XML document, the request (Url, LoadImage) and the load date.</param>
/// <returns>The populated post detail.</returns>
protected override IPost GetDataFromWeb(LoadDataFromWeb_v3 loadDataFromWeb)
{
    XXElement xeSource = new XXElement(loadDataFromWeb.GetXmlDocument().Root);

    GoldenDdl_PostDetail data = new GoldenDdl_PostDetail();
    data.sourceUrl = loadDataFromWeb.request.Url;
    data.loadFromWebDate = loadDataFromWeb.loadFromWebDate;
    data.id = GetPostDetailKey(data.sourceUrl);

    XXElement xePost = xeSource.XPathElement("//div[@id='dle-content']");

    // Category: texts of the links of the 'hdiin' div joined with "/".
    data.category = xePost.XPathElements(".//div[@class='hdiin']//a").DescendantTexts().Select(DownloadPrint.Trim).zToStringValues("/");
    string category = data.category.ToLowerInvariant();
    data.printType = GetPrintType(category);

    // Title: trimmed only when a value was found, so a page without a 'bheading' text
    // no longer fails on a null dereference (presumably XPathValue yields null when nothing matches).
    string title = xePost.XPathValue(".//div[@class='bheading']//text()");
    data.title = title != null ? title.Trim(DownloadPrint.TrimChars) : null;
    PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.title);
    if (titleInfos.foundInfo)
    {
        // keep the raw title and replace it with the cleaned one
        data.originalTitle = data.title;
        data.title = titleInfos.title;
        data.infos.SetValues(titleInfos.infos);
    }

    string date = xePost.XPathValue(".//div[@class='datenews']//text()");
    data.creationDate = zdate.ParseDateTimeLikeToday(date, loadDataFromWeb.loadFromWebDate, "d-M-yyyy, HH:mm", "d M yyyy", "d MMMM yyyy");
    if (data.creationDate == null)
    {
        pb.Trace.WriteLine("unknow date time \"{0}\"", date);
    }
    if (__trace)
    {
        pb.Trace.WriteLine("creationDate {0} - \"{1}\"", data.creationDate, date);
    }

    data.postAuthor = xePost.XPathValue(".//div[@class='argr']//a//text()");

    // Images, text values and download links come from the 'maincont' div.
    XXElement xe = xePost.XPathElement(".//div[@class='maincont']");
    data.images = xe.DescendantNodes(node => XmlDescendant.ImageFilter(node))
        .Select(xeImg => new WebImage(zurl.GetUrl(loadDataFromWeb.request.Url, xeImg.zAttribValue("src"))))
        .ToArray();

    // force load image to get image width and height
    if (loadDataFromWeb.request.LoadImage)
    {
        data.images = DownloadPrint.LoadImages(data.images).ToArray();
    }

    // get infos, description, language, size, nbPages
    // node filter: every descendant text except those located under an <a> element
    PrintTextValues_v1 textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_v1(
        xe.DescendantTexts(node => !(node is XElement) || ((XElement)node).Name != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode),
        data.title);
    data.description = textValues.description;
    data.language = textValues.language;
    data.size = textValues.size;
    data.nbPages = textValues.nbPages;
    data.infos.SetValues(textValues.infos);

    List<string> downloadLinks = new List<string>();
    foreach (XXElement xe2 in xe.XPathElements("div/div"))
    {
        // Links to image hosts or image files are not download links, for example
        //   http://prezup.eu  http://pixhst.com/avaxhome/27/36/002e3627.jpeg  http://www.zupmage.eu/i/R1UgqdXn4F.jpg
        //   http://i.imgur.com/Gu7hagN.jpg  http://img11.hostingpics.net/pics/591623liens.png  http://www.hapshack.com/images/jUfTZ.gif
        //   http://pixhst.com/pictures/3029467
        downloadLinks.AddRange(xe2.XPathValues(".//a/@href").Where(url =>
            !url.StartsWith("http://prezup.eu")
            && !url.StartsWith("http://pixhst.com")
            && !url.EndsWith(".jpg")
            && !url.EndsWith("jpeg")
            && !url.EndsWith("png")
            && !url.EndsWith("gif")));
    }
    data.downloadLinks = downloadLinks.ToArray();

    if (__trace)
    {
        pb.Trace.WriteLine(data.zToJson());
    }

    return data;
}
//public static TelechargementPlus_HeaderPage Load_old(string url, HttpRequestParameters requestParameters = null, bool reload = false, bool loadImage = false)
//{
//    RequestFromWeb request = new RequestFromWeb(url, requestParameters, reload, loadImage);
//    return _loadHeaderPage_old.Load(request);
//}

/// <summary>
/// Reads one telechargement-plus post list page: the url of the next page and one
/// <c>TelechargementPlus_PostHeader</c> per post block of the page.
/// </summary>
/// <param name="request">Request of the list page; provides the XML document, the page url and the LoadImage flag.</param>
/// <returns>The header page holding the post headers and the next page url.</returns>
public static TelechargementPlus_HeaderPage LoadHeaderPageFromWeb(pb.Web.v1.RequestFromWeb_v2 request)
{
    XXElement document = new XXElement(request.GetXmlDocument().Root);
    string url = request.Url;

    TelechargementPlus_HeaderPage page = new TelechargementPlus_HeaderPage();

    // next page :
    //   <div class="navigation">
    //     <div align="center">
    //       <span>Prev.</span> <span>1</span>
    //       <a href="http://www.telechargement-plus.com/e-book-magazines/page/2/">2</a> ...
    //       <a href="http://www.telechargement-plus.com/e-book-magazines/page/2/">Next</a>
    //     </div>
    //   </div>
    page.urlNextPage = zurl.GetUrl(url, document.XPathValue("//div[@class='navigation']//a[text()='Next']/@href"));

    // post list : one <div class="base shortstory"> per post
    List<TelechargementPlus_PostHeader> postHeaders = new List<TelechargementPlus_PostHeader>();
    foreach (XXElement postElement in document.XPathElements("//div[@class='base shortstory']"))
    {
        TelechargementPlus_PostHeader postHeader = new TelechargementPlus_PostHeader
        {
            sourceUrl = url,
            loadFromWebDate = DateTime.Now
        };

        // title and detail url :
        //   <h1 class="shd">
        //     <a href="http://www.telechargement-plus.com/e-book-magazines/magazines/86236-multi-ici-paris-n3562-9-au-15-octobre-2013.html">
        //       [Multi] Ici Paris N°3562 - 9 au 15 Octobre 2013
        //     </a>
        //   </h1>
        XXElement titleLink = postElement.XPathElement(".//*[@class='shd']//a");
        postHeader.urlDetail = zurl.GetUrl(url, titleLink.XPathValue("@href"));
        string trimmedTitle = TelechargementPlus.TrimFunc1(titleLink.XPathValue(".//text()"));
        postHeader.title = TelechargementPlus.ExtractTextValues(postHeader.infos, trimmedTitle);

        // author, date and category :
        //   <div class="shdinf">
        //     <div class="shdinf">
        //       <span class="rcol">Auteur: <a href="http://www.telechargement-plus.com/user/bakafa/">bakafa</a></span>
        //       <span class="date"><b><a href="http://www.telechargement-plus.com/2013/10/09/">Aujourd'hui, 17:13</a></b></span>
        //       <span class="lcol">Catégorie:
        //         <a href="http://www.telechargement-plus.com/e-book-magazines/">E-Book / Magazines</a> »
        //         <a href="http://www.telechargement-plus.com/e-book-magazines/magazines/">Magazines</a>
        //       </span>
        //     </div>
        //   </div>
        XXElement infoBlock = postElement.XPathElement(".//div[@class='shdinf']/div[@class='shdinf']");
        postHeader.postAuthor = infoBlock.XPathValue(".//span[@class='rcol']//a//text()");
        postHeader.creationDate = TelechargementPlus.ParseDateTime(infoBlock.XPathValue(".//span[@class='date']//text()"));
        // category texts joined with "/", without the label, the top-level category and empty texts
        postHeader.category = infoBlock.XPathElements(".//span[@class='lcol']")
            .DescendantTexts()
            .Select(TelechargementPlus.TrimFunc1)
            .Where(text => !(text == "E-Book / Magazines" || text == "Catégorie:" || text == ""))
            .zToStringValues("/");

        // images and text values :
        //   <span id="post-img">
        //     <div id="news-id-86236" style="display: inline;">
        //       <div style="text-align: center;">
        //         <img src="http://zupimages.net/up/3/1515486591.jpeg" alt="[Multi] Ici Paris N°3562 - 9 au 15 Octobre 2013" ... />
        //         <b>Ici Paris N°3562 - 9 au 15 Octobre 2013<br />French | 52 pages | HQ PDF | 101 MB</b>
        //         Ici Paris vous fait partager la vie publique et privée de celles et ceux qui font l'actualité ...
        //       </div>
        //     </div>
        //   </span>
        XXElement contentBlock = postElement.XPathElement(".//span[@id='post-img']//div[starts-with(@id, 'news-id')]");
        // images listed in TelechargementPlus.ImagesToSkip are left out
        postHeader.images = contentBlock
            .DescendantNodes(node => XmlDescendant.ImageFilter(node))
            .Select(imageNode => new pb.old.ImageHtml((XElement)imageNode, url))
            .Where(imageHtml => !TelechargementPlus.ImagesToSkip.ContainsKey(imageHtml.Source))
            .ToList();
        if (request.LoadImage)
        {
            pb.old.Http_v2.LoadImageFromWeb(postHeader.images);
        }

        postHeader.SetTextValues(contentBlock.DescendantTexts());

        postHeaders.Add(postHeader);
    }

    page.postHeaders = postHeaders.ToArray();
    return page;
}