/// <summary>
/// Advances the underlying element enumerator and, when an element is available,
/// parses it into a new <c>Gesat_HeaderCompany</c> stored in <c>_header</c>.
/// </summary>
/// <returns>
/// <c>true</c> when an element was read and <c>_header</c> was filled;
/// <c>false</c> when <c>_xmlEnum</c> has no more elements (<c>_header</c> is left untouched).
/// </returns>
public bool MoveNext()
{
    if (!_xmlEnum.MoveNext())
    {
        return false;
    }

    XXElement xeHeader = _xmlEnum.Current;
    _header = new Gesat_HeaderCompany();
    _header.sourceUrl = _url;
    _header.loadFromWebDate = DateTime.Now;

    // <span class="NOM"><a title="ESAT BETTY LAUNAY-MOULIN VERT" href="/Gesat/Hauts-de-Seine,92/Bois-Colombes,35494/esat-betty-launay-moulin-vert-competences-et-handicap-92,e1837/">ESAT BETTY LAUNAY-MOULIN VERT</a></span>
    XXElement xe = xeHeader.XPathElement(".//span[@class='NOM']//a");
    if (xe != null)
    {
        _header.url = GetUrl(xe.ExplicitXPathValue("@href"));
        _header.name = _trimFunc1(xe.ExplicitXPathValue(".//text()"));
    }

    // <span class="VILLE">E.S.A.T.<br />Bois-Colombes (92)</span>
    // First text node is the company type, second one is the location.
    xe = xeHeader.XPathElement(".//span[@class='VILLE']");
    if (xe != null)
    {
        // IEnumerator<T> is IDisposable: release it deterministically once both values are read.
        using (IEnumerator<string> texts = xe.DescendantTexts().GetEnumerator())
        {
            if (texts.MoveNext())
            {
                _header.type = texts.Current.Trim();
            }
            else
            {
                Trace.CurrentTrace.WriteLine("error companyType not found");
            }

            if (texts.MoveNext())
            {
                _header.location = texts.Current.Trim();
            }
            else
            {
                Trace.CurrentTrace.WriteLine("error companyLocation not found");
            }
        }
    }

    // <span class="TELEPHONE">01 47 86 11 48</span>
    _header.phone = _trimFunc1(xeHeader.ExplicitXPathValue(".//span[@class='TELEPHONE']//text()"));

    // <img info_bulle="Signataire de la charte Ethique et Valeurs" ... src="/images/bullesGesat/pictoCharte.png" />
    // <img info_bulle="Lauréat des trophées HandiResponsables 2013" ... src="/images/bullesGesat/LAURIERS-OR-2013.png" />
    // Every info_bulle attribute of the images becomes one entry of infos.
    _header.infos = xeHeader.XPathValues(".//img/@info_bulle").Select(_trimFunc1).ToArray();

    return true;
}
/// <summary>
/// Resolves a protected link by issuing three HTTP requests in sequence (load the page,
/// post the captcha key to <c>__urlFormulaire</c>, post the validation form back to the page)
/// and returns the href values found under <c>div[@class='all_liens']</c> in the last response.
/// </summary>
/// <param name="protectLink">Url of the protected page.</param>
/// <returns>The href attribute values of the links found in the final response.</returns>
public override string[] UnprotectLink(string protectLink)
{
    string captchaKey = GetKey();

    // The same parameters object is reused and mutated between the three requests.
    HttpRequestParameters parameters = new HttpRequestParameters();

    // Step 1: plain GET of the protected page.
    parameters.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    parameters.Headers.Add("Accept-Language", "de, en-gb;q=0.9, en;q=0.8");
    HttpManager.CurrentHttpManager.Load(new HttpRequest { Url = protectLink }, parameters);

    // Step 2: ajax POST of the captcha key, e.g. "action=qaptcha&qaptcha_key=Wb6aEMQQ_xxQRMgkYX-XuWsdUyGHrQpZ".
    string captchaBody = string.Concat("action=qaptcha&qaptcha_key=", captchaKey);
    parameters.Accept = "application/json, text/javascript, */*; q=0.01";
    parameters.Headers.Add("X-Requested-With", "XMLHttpRequest");
    parameters.ContentType = "application/x-www-form-urlencoded";
    HttpManager.CurrentHttpManager.Load(
        new HttpRequest
        {
            Url = __urlFormulaire,
            Method = HttpRequestMethod.Post,
            Referer = protectLink,
            Content = captchaBody
        },
        parameters);

    // Step 3: POST of the validation form, e.g. "Wb6aEMQQ_xxQRMgkYX-XuWsdUyGHrQpZ=&submit_captcha=VALIDER".
    string submitBody = string.Concat(captchaKey, "=&submit_captcha=VALIDER");
    parameters.Accept = "text/html, application/xhtml+xml, */*";
    // NOTE(review): "X-Requested-With" is added a second time on the same parameters object;
    // confirm that Headers accepts a repeated Add for an existing name.
    parameters.Headers.Add("X-Requested-With", "XMLHttpRequest");
    parameters.ContentType = "application/x-www-form-urlencoded";
    Http response = HttpManager.CurrentHttpManager.Load(
        new HttpRequest
        {
            Url = protectLink,
            Method = HttpRequestMethod.Post,
            Referer = __urlFormulaire,
            Content = submitBody
        },
        parameters);

    XXElement root = response.zGetXDocument().zXXElement();
    string[] links = root.XPathValues(".//div[@class='all_liens']//a/@href").ToArray();
    return links;
}
/// <summary>
/// Fills <paramref name="data"/> (title, date, content, images) from the blog post found
/// under <c>div[@id='content']//div[@class='item_div']</c> of <paramref name="xeSource"/>.
/// </summary>
/// <param name="xeSource">Root element of the loaded page.</param>
/// <param name="data">Detail object to fill; its <c>SourceUrl</c> is used to resolve image urls.</param>
protected void _GetDetailData(XXElement xeSource, BlogDemoorDetailData data)
{
    // <div id="content">
    XXElement xe = xeSource.XPathElement("//div[@id='content']//div[@class='item_div']");

    data.Title = xe.XPathValue(".//h2//text()");

    string date = xe.XPathValue(".//div[@class='dateheader']/text()");
    Date d;
    if (Date.TryParseExact(date, "d MMMM yyyy", __cultureInfo, DateTimeStyles.None, out d))
    {
        data.Date = d;
    }
    else
    {
        Trace.WriteLine($"date not found \"{date}\"");
    }

    // <div class="articlebody" itemprop="articleBody">
    XXElement xeBody = xe.XPathElement(".//div[@class='articlebody']");
    if (xeBody.XElement != null)
    {
        data.Content = xeBody.XElement.ToString();
    }

    // Keep only links hosted on a canalblog.com sub-domain.
    // Uri.TryCreate is used instead of "new Uri(url)" so that a relative or malformed href
    // is skipped instead of throwing UriFormatException and aborting the whole page.
    data.Images = xeBody.XPathValues(".//a/@href")
        .Where(url =>
        {
            Uri uri;
            return Uri.TryCreate(url, UriKind.Absolute, out uri)
                && uri.Host.EndsWith(".canalblog.com", StringComparison.Ordinal);
        })
        .Select(url => new WebImage(zurl.GetUrl(data.SourceUrl, url)))
        .ToArray();
}
/// <summary>
/// Builds an <c>OnisepInstitution_Detail</c> from the institution page held by <paramref name="webResult"/>.
/// </summary>
/// <param name="webResult">Loaded page; provides the document, the request url and the load date.</param>
/// <returns>The detail object filled with the values found in the page.</returns>
private static OnisepInstitution_Detail GetData(WebResult webResult)
{
    const string uaiPrefix = "Code UAI :";
    const string faxPrefix = "Fax :";
    const string mailtoPrefix = "mailto:";

    XXElement xeSource = webResult.Http.zGetXDocument().zXXElement();

    OnisepInstitution_Detail data = new OnisepInstitution_Detail();
    data.SourceUrl = webResult.WebRequest.HttpRequest.Url;
    data.LoadFromWebDate = webResult.LoadFromWebDate;
    data.Id = GetKey(webResult.WebRequest.HttpRequest);

    XXElement xeData = xeSource.XPathElement("//div[@id='oni_content-page']//div[@class='oni_innerContent']//div[@id='oni_zoom-block']");

    data.Institution = OnisepInstitution.Trim(xeData.XPathValue(".//h1/text()"));

    // <span class="oni_span-title">Code UAI : 0062080D</span>
    string s = OnisepInstitution.Trim(xeData.XPathValue(".//span[@class='oni_span-title']/text()"));
    if (s != null && s.StartsWith(uaiPrefix, StringComparison.InvariantCultureIgnoreCase))
    {
        data.UAICode = OnisepInstitution.Trim(s.Substring(uaiPrefix.Length));
    }

    // Address    = text in <span class="street-address">
    // PostalCode = text in <span class="postal-code">
    // City       = text in <span class="locality">
    // Tel        = text in <span class="tel">
    // Fax        = text starting with "Fax :"
    // Mail       = @href starting with "mailto:" in <a class="email">
    XXElement xe = xeData.XPathElement(".//div[@class='oni_fiche-info-1']");
    data.Address = OnisepInstitution.Trim(xe.XPathValue(".//span[@class='street-address']/text()"));
    data.PostalCode = OnisepInstitution.Trim(xe.XPathValue(".//span[@class='postal-code']/text()"));
    data.City = OnisepInstitution.Trim(xe.XPathValue(".//span[@class='locality']/text()"));
    data.Tel = OnisepInstitution.Trim(xe.XPathValue(".//span[@class='tel']/text()"));

    // The trimmed text is null-checked elsewhere in this method, so it is guarded here too
    // before calling StartsWith on it.
    s = xe.XPathValues(".//p[@class='vcard']//text()")
        .Select(OnisepInstitution.Trim)
        .Where(t => t != null && t.StartsWith(faxPrefix, StringComparison.InvariantCultureIgnoreCase))
        .FirstOrDefault();
    if (s != null)
    {
        data.Fax = OnisepInstitution.Trim(s.Substring(faxPrefix.Length));
    }

    s = xe.XPathValue(".//a[@class='email']/@href");
    if (s != null && s.StartsWith(mailtoPrefix, StringComparison.InvariantCultureIgnoreCase))
    {
        s = s.Substring(mailtoPrefix.Length);
    }
    data.Mail = s;

    // The web site link is the first <a> following the text node "site :".
    data.WebSite = xe.DescendantTextNodes()
        .Where(xt => string.Equals(OnisepInstitution.Trim(xt.Value), "site :", StringComparison.InvariantCultureIgnoreCase))
        .FirstOrDefault()
        .zXPathValue(".//following::a/@href");

    // Each <li> holds a label followed by its value; entries without two texts are ignored.
    foreach (XXElement xe2 in xeData.XPathElements(".//div[@class='oni_fiche-info-2']//li"))
    {
        string[] values = xe2.DescendantTexts().Take(2).ToArray();
        if (values.Length != 2)
        {
            continue;
        }

        string label = OnisepInstitution.Trim(values[0]);
        if (label == null)
        {
            continue;
        }

        // Invariant lower-casing: the labels are fixed keys, they must not depend on the current culture.
        switch (label.ToLowerInvariant())
        {
            case "statut de l'établissement :":
                data.InstitutionStatus = OnisepInstitution.Trim(values[1]);
                break;
            case "hébergement :":
                data.Lodging = OnisepInstitution.Trim(values[1]);
                break;
            case "présence d'une ulis":
                data.Ulis = OnisepInstitution.Trim(values[1]);
                break;
        }
    }

    data.StudyLevels = xeData.XPathElements(".//div[@class='oni_nav-in']//ul[@class='oni_nav-in-ul']//li")
        .Select(li => li.DescendantTexts().zConcatStrings())
        .Where(txt => txt != null)
        .ToArray();
    data.BacLevel = GetBacLevel(data.StudyLevels);

    if (__trace)
    {
        pb.Trace.WriteLine(data.zToJson());
    }

    return data;
}
/// <summary>
/// Creates <c>_data</c> and fills it from the header (when available) and from the
/// <c>div[@id='content']</c> element of <c>_xeSource</c>.
/// When the name or phone found in the page differs from the header value, the header value is
/// kept in <c>headerName</c> / <c>headerPhone</c> and the page value replaces it.
/// </summary>
protected void InitXml()
{
    _data = new Gesat_Company();
    _data.url = _url;
    _data.loadFromWebDate = DateTime.Now;
    if (_header != null)
    {
        _data.name = _header.name;
        _data.type = _header.type;
        _data.location = _header.location;
        _data.phone = _header.phone;
        _data.infos = _header.infos;
    }

    // <div class="PAGES" id="content">
    XXElement xe = _xeSource.XPathElement(".//div[@id='content']");

    // <h1><span>ESAT BETTY LAUNAY-MOULIN VERT >></span><br />Coordonnées & activités</h1>
    string s = _trimFunc2(xe.XPathValue(".//h1//text()"));
    // Null guard: without it a page with no <h1> text would fail on s.Equals(...).
    // NOTE(review): assumes _trimFunc2 passes a null input through - confirm.
    if (s != null && !s.Equals(_data.name, StringComparison.InvariantCultureIgnoreCase))
    {
        _data.headerName = _data.name;
        _data.name = s;
    }

    // <div class="BLOC B100 ACCROCHE">
    //   <div class="CONTENU-BLOC">Cet E.S.A.T. est ouvert depuis 1989 et accueille 55 personnes reconnues travailleurs handicapés. Il est situé dans la ville de
    //     <a href="/Gesat/Hauts-de-Seine,92/Bois-Colombes,35494/" title="Bois-Colombes // Les ESAT et EA de la ville">Bois-Colombes</a> (
    //     <a href="/Gesat/Hauts-de-Seine,92/" title="Hauts-de-Seine // Les ESAT et EA du département">Hauts-de-Seine</a>)
    //   </div>
    // </div>
    _data.descryption = RemoveLineBreaksAndTabs(xe.XPathConcatText(".//div[@class='BLOC B100 ACCROCHE']//text()", resultFunc: _trimFunc1));
    _data.city = _trimFunc1(xe.XPathValue(".//div[@class='BLOC B100 ACCROCHE']//a[1]//text()"));
    _data.department = _trimFunc1(xe.XPathValue(".//div[@class='BLOC B100 ACCROCHE']//a[2]//text()"));

    // <div class="ADRESSE">78, RUE RASPAIL<br />92270 Bois-Colombes</div>
    _data.address = RemoveLineBreaksAndTabs(xe.XPathConcatText(".//div[@class='ADRESSE']//text()", " ", itemFunc: _trimFunc1));

    // <div class="TEL">01 47 86 11 48</div>
    s = _trimFunc1(xe.XPathValue(".//div[@class='TEL']//text()"));
    // Same null guard as for the name: keep the header phone when the page has none.
    if (s != null && !s.Equals(_data.phone, StringComparison.InvariantCultureIgnoreCase))
    {
        _data.headerPhone = _data.phone;
        _data.phone = s;
    }

    // <div class="FAX">01 47 82 42 64</div>
    _data.fax = _trimFunc1(xe.XPathValue(".//div[@class='FAX']//text()"));

    // <div class="EMAIL">production.launay<img border="0" alt="arobase.png" src="/images/bulles/arobase.png" style=" border: 0;" />lemoulinvert.org</div>
    // The "@" is rendered as an image, so the text parts are joined with "@".
    _data.email = xe.XPathConcatText(".//div[@class='EMAIL']//text()", "@", itemFunc: _trimFunc1);

    // <div class="WWW"><a href="http://www.esat-b-launay.com" target="_blank">www.esat-b-launay.com</a></div>
    _data.webSite = _trimFunc1(xe.XPathValue(".//div[@class='WWW']//a/@href"));

    // <div class="BLOC-FICHE BLOC-ACTIVITES">
    //   <dl><dt>Conditionnement, travaux à façon</dt></dl>
    //   <dl><dt>Assemblage, montage</dt></dl>
    //   <dl><dt>Mise sous pli, mailing, routage</dt></dl>
    //   ...
    // </div>
    _data.activities = xe.XPathValues(".//div[@class='BLOC-FICHE BLOC-ACTIVITES']//dl//text()").Select(_trimFunc1).ToArray();
}

/// <summary>
/// Removes every carriage return, line feed and tab from <paramref name="text"/>;
/// returns null when <paramref name="text"/> is null.
/// </summary>
private static string RemoveLineBreaksAndTabs(string text)
{
    if (text == null)
    {
        return null;
    }
    return text.Replace("\r", "").Replace("\n", "").Replace("\t", "");
}
/// <summary>
/// Builds an <c>ExtremeDown_PostDetail_v2</c> from the post page held by <paramref name="webResult"/>:
/// title and title infos, category, print type, author, creation date, images, description
/// and download links.
/// </summary>
/// <param name="webResult">Loaded page; provides the document, the request url, the load date and the LoadImage flag.</param>
/// <returns>The post detail filled with the values found in the page.</returns>
private static ExtremeDown_PostDetail_v2 GetData(WebResult webResult) {
    XXElement xeSource = webResult.Http.zGetXDocument().zXXElement();
    ExtremeDown_PostDetail_v2 data = new ExtremeDown_PostDetail_v2();
    data.SourceUrl = webResult.WebRequest.HttpRequest.Url;
    data.LoadFromWebDate = webResult.LoadFromWebDate;
    data.Id = GetPostDetailKey(webResult.WebRequest.HttpRequest);

    // Everything below is read from the post container.
    XXElement xePost = xeSource.XPathElement("//div[@id='dle-content']");

    //data.Title = xePost.XPathValue(".//h2[@class='blocktitle']//text()", DownloadPrint.Trim);
    data.Title = xePost.XPathValue(".//h2[@class='blocktitle']//text()").Trim(DownloadPrint.TrimChars);
    // When infos are found in the title, the raw title is kept in OriginalTitle and the cleaned one replaces it.
    PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.Title);
    if (titleInfos.foundInfo) { data.OriginalTitle = data.Title; data.Title = titleInfos.title; data.Infos.SetValues(titleInfos.infos); }

    // Header block: category, author and creation date.
    XXElement xeDiv = xePost.XPathElement(".//div[@class='blockheader']");
    data.Category = xeDiv.XPathValues(".//i[@class='icon-cats']/ancestor::span//a//text()").Select(DownloadPrint.Trim).zToStringValues("/");
    //string category = data.Category.ToLowerInvariant();
    data.PrintType = GetPrintType(data.Category);
    data.PostAuthor = xeDiv.XPathValue(".//span/i[@class='icon-user']/ancestor::span//a//text()");
    string date = xeDiv.XPathValue(".//span/i[@class='icon-date']/ancestor::span//a//text()");
    data.PostCreationDate = zdate.ParseDateTimeLikeToday(date, webResult.LoadFromWebDate, "d-M-yyyy, HH:mm", "d M yyyy", "d MMMM yyyy");
    // A null result means none of the formats above matched the date text.
    if (data.PostCreationDate == null) { pb.Trace.WriteLine("unknow date time \"{0}\"", date); }
    if (__trace) { pb.Trace.WriteLine("creationDate {0} - \"{1}\"", data.PostCreationDate, date); }

    // Content block: the description is collected from three places (release name, blockquote, upload infos table).
    xeDiv = xePost.XPathElement(".//div[@class='blockcontent']");
    List <string> description = new List <string>();
    description.AddRange(xeDiv.XPathValues(".//p[@class='release-name']//text()"));
    //data.Images = xeDiv.XPathElement(".//table//td[@class='image-block']").XPathImages(xeImg => new UrlImage(zurl.GetUrl(loadDataFromWeb.request.Url, xeImg.zAttribValue("src")))).ToArray();
    // Image urls are resolved against the page url.
    data.Images = xeDiv.XPathElement(".//table//td[@class='image-block']").DescendantNodes(node => XmlDescendant.ImageFilter(node)).Select(xeImg => new WebImage(zurl.GetUrl(data.SourceUrl, xeImg.zAttribValue("src")))).ToArray();
    // force load image to get image width and height
    if (webResult.WebRequest.LoadImage) { data.Images = DownloadPrint.LoadImages(data.Images).ToArray(); }
    description.AddRange(xeDiv.XPathValues(".//table//td//blockquote//text()"));
    //xeDiv = xePost.XPathElement(".//div[@class='clearfix']");
    xeDiv = xePost.XPathElement(".//div[@class='upload-infos clearfix']");
    description.AddRange(xeDiv.XPathValues(".//table//text()"));
    data.Description = description.ToArray();

    // Download links.
    // The filter below is stateful: an <h2> is not selected but its text is remembered in "title",
    // a <script> stops the scan, a <div> is selected, any other element is not selected.
    // NOTE(review): the loop reads "title" right after a <div> is returned, so this presumably relies on
    // zFilterElements calling the filter lazily, in document order - confirm.
    string title = null;
    // xePost.XPathElements(".//script/parent::div//following-sibling::h2")
    Func <XXElement, XNodeFilter> filter = xe => { if (xe.XElement.Name == "h2") { title = xe.XPathValue(".//text()"); } else if (xe.XElement.Name == "script") { return(XNodeFilter.Stop); } else if (xe.XElement.Name == "div") { return(XNodeFilter.SelectNode); } return(XNodeFilter.DontSelectNode); };
    foreach (XXElement xe in xePost.XPathElements(".//div[@class='prez_2']//following-sibling::*").zFilterElements(filter)) {
        //string s = xe.XPathValue(".//text()");
        //// Liens de téléchargement - Pack 1
        //if (s.StartsWith("Liens de téléchargement"))
        //{
        //    s = s.Substring(23).Trim(' ', '-');
        //    if (s == "")
        //        s = title;
        //    else if (title != null)
        //        s = title + " - " + s;
        //    title = null;
        // Each selected <div> starts a new item named after the last <h2> seen (null when there was none);
        // the title is then reset so it is not reused for the next <div>.
        data.DownloadLinks_new.AddItem(title);
        title = null;
        //foreach (XXElement xe2 in xe.XPathElements("following-sibling::div[1]//a"))
        foreach (XXElement xe2 in xe.XPathElements(".//a")) {
            //s = xe2.DescendantTextList().FirstOrDefault();
            // <strong class="hebergeur">
            string server = xe2.XPathValue(".//strong[@class='hebergeur']//text()");
            string link = xe2.XPathValue("@href");
            // Protected links are resolved through __extremeProtect only when __getLinksExtremeProtect is set;
            // in that case the protected link is recorded with the server and the resolved links are added.
            if (__getLinksExtremeProtect && __extremeProtect.IsLinkProtected(link)) { data.DownloadLinks_new.AddServer(server, link); data.DownloadLinks_new.AddLinks(__extremeProtect.UnprotectLink(link)); } else { data.DownloadLinks_new.AddServer(server); data.DownloadLinks_new.AddLink(link); }
        }
        //}
        //else if (s != null)
        //    title = s;
    }
    //xeDiv = xePost.XPathElement(".//div[@class='blockfooter links']");
    ////data.category = xeDiv.DescendantTextList(".//i[@class='icon-cats']/parent::span//a").Select(DownloadPrint.TrimFunc1).zToStringValues("/");
    //data.Category = xeDiv.XPathElements(".//i[@class='icon-cats']/parent::span//a").DescendantTexts().Select(DownloadPrint.Trim).zToStringValues("/");
    //string category = data.Category.ToLowerInvariant();
    //data.PrintType = GetPrintType(category);
    ////pb.Trace.WriteLine("category \"{0}\" printType {1}", category, data.printType);
    if (__trace) { pb.Trace.WriteLine(data.zToJson()); }
    return(data);
}
/// <summary>
/// Builds an <c>Ebookdz_PostDetail</c> from the forum post page held by <paramref name="loadDataFromWeb"/>:
/// title and title infos, category, print type, creation date, author, images, text values and download links.
/// </summary>
/// <param name="loadDataFromWeb">Loaded page; provides the document, the request url, the load date and the LoadImage flag.</param>
/// <returns>The post detail filled with the values found in the page.</returns>
protected override IPost GetData(LoadDataFromWeb_v4 loadDataFromWeb)
{
    XXElement xeSource = new XXElement(loadDataFromWeb.Http.zGetXDocument().Root);

    Ebookdz_PostDetail data = new Ebookdz_PostDetail();
    data.SourceUrl = loadDataFromWeb.WebRequest.HttpRequest.Url;
    data.LoadFromWebDate = loadDataFromWeb.LoadFromWebDate;
    data.Id = GetPostDetailKey(loadDataFromWeb.WebRequest.HttpRequest);

    // <div class="body_bd">
    XXElement xePost = xeSource.XPathElement("//div[@class='body_bd']");

    // Title example: "Le Monde + Magazine + 2 suppléments du samedi 03 janvier 2015"
    data.Title = xePost.XPathValue(".//div[@id='pagetitle']//a//text()").Trim(DownloadPrint.TrimChars);
    PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.Title);
    if (titleInfos.foundInfo)
    {
        data.OriginalTitle = data.Title;
        data.Title = titleInfos.title;
        data.Infos.SetValues(titleInfos.infos);
    }

    // Breadcrumb example: "Forum / Journaux / Presse quotidienne / Le Monde / Journal Le Monde + Magazine + 2 suppléments du samedi 03 janvier 2015"
    // The category is the breadcrumb without the "forum" entry and without the entry ending with the title.
    string lowerTitle = null;
    if (data.Title != null)
    {
        lowerTitle = data.Title.ToLowerInvariant();
    }
    data.Category = xePost.XPathElements(".//div[@id='breadcrumb']//a").DescendantTexts()
        .Where(text =>
        {
            text = text.ToLowerInvariant();
            // lowerTitle stays null when there is no title: EndsWith(null) throws ArgumentNullException,
            // so the title test is skipped in that case.
            return text != "forum" && (lowerTitle == null || !text.EndsWith(lowerTitle));
        })
        .Select(DownloadPrint.Trim)
        .zToStringValues("/");
    string category = data.Category.ToLowerInvariant();
    data.PrintType = GetPrintType(category);

    // <div id="postlist" class="postlist restrain">
    XXElement xe = xePost.XPathElement(".//div[@id='postlist']");

    // Date text examples: "Aujourd'hui, 07h32" - "Aujourd'hui, 10h51" - "Hier, 12h55" - "22/02/2014, 21h09"
    // The date is the text of the post head, <a> elements excluded.
    XXElement xe2 = xe.XPathElement(".//div[@class='posthead']");
    string date = xe2.DescendantTexts(node => node.zGetName() != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode).zToStringValues("");
    // Non-breaking spaces are replaced by plain spaces before parsing.
    date = date.Replace('\xA0', ' ');
    data.PostCreationDate = zdate.ParseDateTimeLikeToday(date, loadDataFromWeb.LoadFromWebDate, @"d/M/yyyy, HH\hmm", @"d-M-yyyy, HH\hmm");
    if (data.PostCreationDate == null)
    {
        pb.Trace.WriteLine("unknow post creation date \"{0}\"", date);
    }
    if (__trace)
    {
        pb.Trace.WriteLine("post creation date {0} - \"{1}\"", data.PostCreationDate, date);
    }

    data.PostAuthor = xe.XPathValue(".//div[@class='userinfo']//a//text()").Trim(DownloadPrint.TrimChars);

    // <div class="postbody">
    xe = xePost.XPathElement(".//div[@class='postbody']//div[@class='content']//blockquote/div");

    // Image urls are resolved against the page url.
    data.Images = xe.DescendantNodes(node => XmlDescendant.ImageFilter(node)).Select(xeImg => new WebImage(zurl.GetUrl(data.SourceUrl, xeImg.zAttribValue("src")))).ToArray();
    // force load image to get image width and height
    if (loadDataFromWeb.WebRequest.LoadImage)
    {
        data.Images = DownloadPrint.LoadImages(data.Images).ToArray();
    }

    // Get infos, description, language, size, nbPages from the post texts, <a> elements excluded.
    PrintTextValues_v1 textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_v1(
        xe.DescendantTexts(node => !(node is XElement) || ((XElement)node).Name != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode),
        data.Title);
    data.Description = textValues.description;
    data.Language = textValues.language;
    data.Size = textValues.size;
    data.NbPages = textValues.nbPages;
    data.Infos.SetValues(textValues.infos);

    data.DownloadLinks = xe.XPathValues(".//a/@href").ToArray();

    if (__trace)
    {
        pb.Trace.WriteLine(data.zToJson());
    }

    return data;
}
/// <summary>
/// Returns the text values found under the <c>td</c> elements of <c>_currentElement</c>.
/// In each value the matches of <c>__badCharacters</c> are replaced by a space, then the
/// result is passed through <c>__trimFunc1</c>.
/// </summary>
/// <returns>One cleaned string per text value, in document order.</returns>
public string[] GetTextValues()
{
    var cellTexts = _currentElement.XPathValues(".//td//text()");
    var cleanedTexts = cellTexts.Select(cellText => __trimFunc1(__badCharacters.Replace(cellText, " ")));
    return cleanedTexts.ToArray();
}
/// <summary>
/// Builds a <c>Handeco_DetailCompany</c> from the company page: last update date, company values,
/// activities and contacts. Each <c>table[@class='fiche organisation']</c> is treated as an activity
/// (id starting with "fiche_activite_"), a contact (id starting with "fiche_contact_") or company data;
/// every row with a <c>th</c> label is dispatched to the matching setter. Rows the setter does not
/// recognize are recorded in <c>unknowInfos</c>.
/// </summary>
/// <returns>The company detail filled with the values found in the page.</returns>
protected override Handeco_DetailCompany GetData()
{
    XXElement xeSource = new XXElement(GetXmlDocument().Root);

    Handeco_DetailCompany data = new Handeco_DetailCompany();
    data.sourceUrl = Url;
    data.loadFromWebDate = DateTime.Now;

    // <div style="text-align: right; font-size: 10px;">
    //   <em>Dernière mise à jour le 18-01-2013</em>
    // </div>
    string lastUpdate = __trimFunc1(xeSource.XPathValue("//em[starts-with(text(), 'Dernière mise à jour')]/text()"));
    if (lastUpdate != null)
    {
        Match match = __lastUpdateRegex.Match(lastUpdate);
        DateTime date;
        // The page date is a fixed numeric format: parse it with the invariant culture so the
        // result does not depend on the calendar of the machine's current culture.
        if (match.Success && DateTime.TryParseExact(match.Value, "dd-MM-yyyy", System.Globalization.CultureInfo.InvariantCulture, System.Globalization.DateTimeStyles.None, out date))
        {
            data.dernièreMiseàjour = date;
        }
        else
        {
            data.unknowInfos.Add(lastUpdate);
        }
    }
    else
    {
        Trace.WriteLine("error \"Dernière mise à jour\" not found");
    }

    // NOTRE OFFRE - activities - multiple
    // <select style="width: 200px; display: none;" onchange="change_activite(this.selectedIndex);" id="select_activites">
    //   <option>Sous-traitance industrielle - Autre</option>
    //   <option>Assemblage mécanique</option>
    //   <option>Energie renouvelable - Autre</option>
    // </select>
    string[] activityTypes = xeSource.XPathValues("//select[@id = 'select_activites']/option/text()").Select(__trimFunc1).ToArray();

    // CONTACTS - multiple
    // <select style="width: 200px; display: none;" onchange="change_contact(this.selectedIndex);" id="select_contacts">
    //   <option>Jacky STEINLE (Chef d'atelier)</option>
    // </select>
    string[] contactDescriptions = xeSource.XPathValues("//select[@id = 'select_contacts']/option/text()").Select(__trimFunc1).ToArray();

    // The n-th activity / contact table is paired with the n-th <option> of the matching <select>.
    int indexActivityType = 0;
    int indexContactDescription = 0;
    List<Activity> activities = new List<Activity>();
    List<Contact> contacts = new List<Contact>();

    foreach (XXElement xxe in xeSource.XPathElements("//table[@class = 'fiche organisation']"))
    {
        // A table without id must be handled as company data, not fail on a null id.
        string id = xxe.XPathValue("@id");
        if (id != null)
        {
            id = id.ToLower();
        }
        if (__trace)
        {
            Trace.WriteLine("table id = \"{0}\"", id);
        }

        Activity activity = null;
        Contact contact = null;
        if (id != null && id.StartsWith("fiche_activite_"))
        {
            activity = new Activity();
            activities.Add(activity);
            if (indexActivityType < activityTypes.Length)
            {
                activity.type = activityTypes[indexActivityType++];
            }
            else
            {
                Trace.WriteLine("warning miss an activity type in html (<select id='select_activites'>)");
            }
        }
        else if (id != null && id.StartsWith("fiche_contact_"))
        {
            contact = new Contact();
            contacts.Add(contact);
            if (indexContactDescription < contactDescriptions.Length)
            {
                contact.description = contactDescriptions[indexContactDescription++];
            }
            else
            {
                Trace.WriteLine("warning miss an activity type in html (<select id='select_contacts'>)");
            }
        }

        // Trace label and "unknown value" prefix depend only on the kind of table.
        string traceLabel;
        string unknownPrefix;
        if (activity != null)
        {
            traceLabel = "activité ";
            unknownPrefix = "valeur activité inconnu : ";
        }
        else if (contact != null)
        {
            traceLabel = "contact ";
            unknownPrefix = "valeur contact inconnu : ";
        }
        else
        {
            traceLabel = "société ";
            unknownPrefix = "valeur inconnu : ";
        }

        foreach (XXElement xxe2 in xxe.XPathElements(".//tr"))
        {
            string valueName = __trimFunc1(xxe2.XPathValue(".//th//text()"));
            // The setters and GetTextValue() are called without the row, so the current row
            // is published through _currentElement before any of them runs.
            _currentElement = xxe2;
            if (valueName == null)
            {
                continue;
            }

            if (__trace)
            {
                Trace.Write(traceLabel);
            }

            bool known;
            if (activity != null)
            {
                known = SetActivityValue(activity, valueName);
            }
            else if (contact != null)
            {
                known = SetContactValue(contact, valueName);
            }
            else
            {
                known = SetValue(data, valueName);
            }

            if (!known)
            {
                if (__trace)
                {
                    Trace.Write("error ");
                }
                data.unknowInfos.Add(unknownPrefix + valueName + " = " + GetTextValue());
            }
            else if (__trace)
            {
                Trace.Write(" ");
            }

            if (__trace)
            {
                Trace.WriteLine("\"{0}\" = \"{1}\"", valueName, GetTextValue());
            }
        }
    }

    data.activités = activities.ToArray();
    data.contacts = contacts.ToArray();
    return data;
}
// detail get data
/// <summary>
/// Builds a <c>TelechargerMagazine_PostDetail</c> from the post page held by <paramref name="webResult"/>:
/// category, print type, title and title infos, first image, description, infos and download links.
/// </summary>
/// <param name="webResult">Loaded page; provides the document, the request url and the load date.</param>
/// <returns>The post detail filled with the values found in the page.</returns>
protected override TelechargerMagazine_PostDetail GetDetailData(WebResult webResult)
{
    XXElement xeSource = webResult.Http.zGetXDocument().zXXElement();

    TelechargerMagazine_PostDetail data = new TelechargerMagazine_PostDetail();
    data.SourceUrl = webResult.WebRequest.HttpRequest.Url;
    data.LoadFromWebDate = webResult.LoadFromWebDate;
    data.Id = GetDetailKey(webResult.WebRequest.HttpRequest);

    // The only date in the page is the current day of the calendar, so no post date is read:
    // <div id="calendar-layer">
    //   <table id="calendar" cellpadding="3" class="calendar">
    //     <td class="day-active-v day-current" ><a class="day-active-v" href="http://www.telecharger-magazine.com/2015/07/17/" title="Article posté dans 17 Juillet 2015">17</a></td>
    //   </table>
    // </div>
    //
    // Post layout:
    // <div id='dle-content'>
    //   <div class="right-full">
    //     <div class="cat_name">
    //       Posted in:
    //       <a href="http://www.telecharger-magazine.com/journaux/">Journaux</a>
    //     </div>
    //     <h2 class="title">
    //       <img src="/templates/MStarter/images/title.png" alt="" class="img" />
    //       Journaux Français Du 17 Juillet 2015
    //     </h2>
    //     <div class="contenttext">
    XXElement xePost = xeSource.XPathElement("//div[@id='dle-content']//div[@class='right-full']");

    // Category example: "Journaux"
    data.Category = xePost.XPathValues(".//div[@class='cat_name']//a/text()").Select(DownloadPrint.Trim).zToStringValues("/");
    data.PrintType = GetPrintType(data.Category);

    data.Title = xePost.XPathValue(".//h2[@class='title']//text()").zFunc(DownloadPrint.ReplaceChars).zFunc(DownloadPrint.Trim);
    PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.Title);
    if (titleInfos.foundInfo)
    {
        data.OriginalTitle = data.Title;
        data.Title = titleInfos.title;
        data.Infos.SetValues(titleInfos.infos);
    }

    XXElement xeContent = xePost.XPathElement(".//div[@class='contenttext']");

    // Only the first image of the content is kept; its url is resolved against the page url.
    data.Images = new WebImage[] { new WebImage(zurl.GetUrl(data.SourceUrl, xeContent.XPathValue(".//img/@src"))) };

    // Get infos and description from the content texts: a text equal to "description" is not
    // selected and the scan stops at the first <a> element.
    PrintTextValues textValues = DownloadPrint.PrintTextValuesManager.GetTextValues(
        xeContent.DescendantTexts(
            node =>
            {
                if (node is XText)
                {
                    string text = ((XText)node).Value.Trim();
                    if (text.ToLowerInvariant() == "description")
                    {
                        return XNodeFilter.DontSelectNode;
                    }
                }
                if (node is XElement)
                {
                    XElement xe = (XElement)node;
                    if (xe.Name == "a")
                    {
                        return XNodeFilter.Stop;
                    }
                }
                return XNodeFilter.SelectNode;
            })
            .Select(DownloadPrint.ReplaceChars)
            .Select(DownloadPrint.TrimWithoutColon),
        data.Title,
        extractValuesFromText: false);
    data.Description = textValues.description;
    data.Infos.SetValues(textValues.infos);

    // Download links are every href of the content.
    // A previous pass collected the <a> elements up to <p class="submeta"> and was immediately
    // overwritten by this assignment; it has been removed because its result was never used and
    // it dereferenced Attribute("href") without a null check.
    data.DownloadLinks = xeContent.XPathValues(".//a/@href").ToArray();

    if (__trace)
    {
        pb.Trace.WriteLine(data.zToJson());
    }

    return data;
}
/// <summary>
/// Returns the text values found under the <c>td</c> elements of <paramref name="xe"/>.
/// In each value the matches of <c>_badCharacters</c> are replaced by a space, then the
/// result is passed through <c>Handeco.Trim</c>.
/// </summary>
/// <param name="xe">Element whose <c>td</c> texts are read.</param>
/// <returns>One cleaned string per text value, in document order.</returns>
private static string[] GetTextValues(XXElement xe)
{
    var cellTexts = xe.XPathValues(".//td//text()");
    var cleanedTexts = cellTexts.Select(cellText => Handeco.Trim(_badCharacters.Replace(cellText, " ")));
    return cleanedTexts.ToArray();
}
/// <summary>
/// Fills <paramref name="data"/> from a Handeco company detail page: last update date,
/// activities, contacts and company-level values read from the "fiche organisation" tables.
/// Values whose name is not recognised by the Set*Value helpers are appended to data.UnknowInfos.
/// </summary>
/// <param name="xeSource">Root element of the downloaded detail page.</param>
/// <param name="data">Detail object that receives the extracted values.</param>
protected void _GetDetailData(XXElement xeSource, Handeco_Detail_v2 data)
{
    // <div style="text-align: right; font-size: 10px;">
    //   <em>Dernière mise à jour le 18-01-2013</em>
    // </div>
    string lastUpdate = Handeco.Trim(xeSource.XPathValue("//em[starts-with(text(), 'Dernière mise à jour')]/text()"));
    if (lastUpdate != null)
    {
        Match match = _lastUpdateRegex.Match(lastUpdate);
        DateTime date;
        // The page uses a fixed dd-MM-yyyy format, so it is parsed with the invariant culture:
        // the current culture could apply a non-Gregorian calendar and shift the year.
        if (match.Success && DateTime.TryParseExact(match.Value, "dd-MM-yyyy", System.Globalization.CultureInfo.InvariantCulture, System.Globalization.DateTimeStyles.None, out date))
        {
            data.DernièreMiseàjour = date;
        }
        else
        {
            data.UnknowInfos.Add(lastUpdate);
        }
    }
    else
    {
        pb.Trace.WriteLine("error \"Dernière mise à jour\" not found");
    }

    // NOTRE OFFRE - activities - multiple
    // <select style="width: 200px; display: none;" onchange="change_activite(this.selectedIndex);" id="select_activites">
    //   <option>Sous-traitance industrielle - Autre</option>
    //   <option>Assemblage mécanique</option>
    //   <option>Energie renouvelable - Autre</option>
    // </select>
    string[] activityTypes = xeSource.XPathValues("//select[@id = 'select_activites']/option/text()").Select(Handeco.Trim).ToArray();

    // CONTACTS - multiple
    // <select style="width: 200px; display: none;" onchange="change_contact(this.selectedIndex);" id="select_contacts">
    //   <option>Jacky STEINLE (Chef d'atelier)</option>
    // </select>
    string[] contactDescriptions = xeSource.XPathValues("//select[@id = 'select_contacts']/option/text()").Select(Handeco.Trim).ToArray();

    int indexActivityType = 0;
    int indexContactDescription = 0;
    List<Activity> activities = new List<Activity>();
    List<Contact> contacts = new List<Contact>();

    foreach (XXElement table in xeSource.XPathElements("//table[@class = 'fiche organisation']"))
    {
        // The table id decides what the table describes. It is matched case-insensitively with
        // culture-independent rules so the result does not depend on the machine's locale.
        string id = table.XPathValue("@id");
        if (id != null)
        {
            id = id.ToLowerInvariant();
        }

        Activity activity = null;
        Contact contact = null;
        if (id != null && id.StartsWith("fiche_activite_", StringComparison.Ordinal))
        {
            activity = new Activity();
            activities.Add(activity);
            // The n-th activity table is paired with the n-th <option> of select_activites.
            if (indexActivityType < activityTypes.Length)
            {
                activity.Type = activityTypes[indexActivityType++];
            }
            else
            {
                pb.Trace.WriteLine("warning miss an activity type in html (<select id='select_activites'>)");
            }
        }
        else if (id != null && id.StartsWith("fiche_contact_", StringComparison.Ordinal))
        {
            contact = new Contact();
            contacts.Add(contact);
            // The n-th contact table is paired with the n-th <option> of select_contacts.
            if (indexContactDescription < contactDescriptions.Length)
            {
                contact.Description = contactDescriptions[indexContactDescription++];
            }
            else
            {
                pb.Trace.WriteLine("warning miss a contact description in html (<select id='select_contacts'>)");
            }
        }

        foreach (XXElement row in table.XPathElements(".//tr"))
        {
            // The <th> text is the value name; rows without one are skipped.
            string valueName = Handeco.Trim(row.XPathValue(".//th//text()"));
            if (valueName == null)
            {
                continue;
            }

            // Dispatch the row to the activity, the contact or the company itself.
            bool known;
            string unknownLabel;
            if (activity != null)
            {
                known = SetActivityValue(activity, valueName, row);
                unknownLabel = "valeur activité inconnu : ";
            }
            else if (contact != null)
            {
                known = SetContactValue(contact, valueName, row);
                unknownLabel = "valeur contact inconnu : ";
            }
            else
            {
                known = SetValue(data, valueName, row);
                unknownLabel = "valeur inconnu : ";
            }

            if (!known)
            {
                data.UnknowInfos.Add(unknownLabel + valueName + " = " + GetTextValue(row));
            }
        }
    }

    data.Activités = activities.ToArray();
    data.Contacts = contacts.ToArray();
}
/// <summary>
/// Reads a telechargement-plus.com post page and returns its detail: title (with the infos
/// extracted from it), creation date, category, images, text values and download links.
/// </summary>
/// <returns>The populated <see cref="TelechargementPlus_PostDetail"/>.</returns>
protected override TelechargementPlus_PostDetail GetData()
{
    // Page layout this method relies on (abridged sample):
    //   <div id="dle-content">
    //     <div class="base">
    //       <div class="heading">
    //         <div class="binner">
    //           <h1>post title [Lien Direct] Gratuit</h1>
    //           <div class="storeinfo">
    //             <a href="http://www.telechargement-plus.com/2013/10/14/">Aujourd'hui, 11:59</a>
    //             | Catégorie:
    //             <a href="http://www.telechargement-plus.com/e-book-magazines/">E-Book / Magazines</a>,
    //             <a href="http://www.telechargement-plus.com/e-book-magazines/journaux/">Journaux</a>,
    //             <a href="http://www.telechargement-plus.com/e-book-magazines/magazines/">Magazines</a>
    //           </div>
    //         </div>
    //       </div>
    //       <div class="maincont">
    //         <div class="binner">
    //           <div class="shortstory">
    //             <div class="story-text">
    //               <span id="post-img"> cover image, "<b>Editeur :</b> Presse Fr", description texts ... </span>
    //               <span id="post-img">
    //                 <div id="news-id-86887" style="display: inline;">
    //                   <a href="http://clz.to/q83zrwga" target="_blank"><img src="http://www.hapshack.com/images/0THnp.gif" /></a>
    //                   ... more download links ...
    //                 </div>
    //               </span>
    const string headingPath = ".//div[@class='heading']//div[@class='binner']";
    const string storyPath = ".//div[@class='maincont']//div[@class='binner']//div[@class='story-text']";
    const string postImgPath = ".//span[@id='post-img']";

    XXElement xeSource = new XXElement(GetXmlDocument().Root);

    TelechargementPlus_PostDetail data = new TelechargementPlus_PostDetail();
    data.sourceUrl = Url;
    data.loadFromWebDate = DateTime.Now;

    XXElement xePost = xeSource.XPathElement("//div[@id='dle-content']");

    // --- heading: title, creation date, category ---
    XXElement xeHeading = xePost.XPathElement(headingPath);

    string trimmedTitle = TelechargementPlus.TrimFunc1(xeHeading.XPathValue(".//text()"));
    data.title = TelechargementPlus.ExtractTextValues(data.infos, trimmedTitle);

    string creationDateText = xeHeading.XPathValue(".//a//text()");
    data.creationDate = TelechargementPlus.ParseDateTime(creationDateText);

    // The first two storeinfo texts are skipped, then the "E-Book / Magazines" root category,
    // the "Catégorie:" label and empty texts are dropped; what is left is joined with "/".
    var categoryParts = xeHeading.XPathElements(".//div[@class='storeinfo']")
        .DescendantTexts()
        .Skip(2)
        .Select(TelechargementPlus.TrimFunc1)
        .Where(text => !(text == "E-Book / Magazines" || text == "Catégorie:" || text == ""));
    data.category = categoryParts.zToStringValues("/");

    // --- story: images, text values, download links ---
    XXElement xeStory = xePost.XPathElement(storyPath);

    // Images of the post-img spans; <a> elements get XNodeFilter.SkipNode, and images listed in
    // TelechargementPlus.ImagesToSkip are removed.
    data.images = xeStory.XPathElements(postImgPath)
        .DescendantNodes(
            node => XmlDescendant.ImageFilter(
                node,
                candidate =>
                {
                    XElement element = candidate as XElement;
                    if (element != null && element.Name == "a")
                    {
                        return XNodeFilter.SkipNode;
                    }
                    return XNodeFilter.SelectNode;
                }))
        .Select(imageNode => new pb.old.ImageHtml((XElement)imageNode, Url))
        .Where(image => !TelechargementPlus.ImagesToSkip.ContainsKey(image.Source))
        .ToList();

    if (_loadImage)
    {
        pb.old.Http_v2.LoadImageFromWeb(data.images);
    }

    // Text values come from the same spans, again with <a> elements given XNodeFilter.SkipNode.
    data.SetTextValues(
        xeStory.XPathElements(postImgPath).DescendantTexts(
            node =>
            {
                XElement element = node as XElement;
                if (element != null && element.Name == "a")
                {
                    return XNodeFilter.SkipNode;
                }
                return XNodeFilter.SelectNode;
            }));

    data.downloadLinks.AddRange(xeStory.XPathValues(".//span[@id='post-img']//a/@href"));

    return data;
}
/// <summary>
/// Builds the post detail of a telechargementz page: author, creation date, title (and the infos
/// extracted from it), first image, download links, text values and the deduced print type.
/// </summary>
/// <param name="loadDataFromWeb">Loaded page; supplies the xml document, the request (url, LoadImage) and the load date.</param>
/// <returns>The populated <see cref="Telechargementz_PostDetail"/>.</returns>
protected override IPost GetDataFromWeb(LoadDataFromWeb_v3 loadDataFromWeb)
{
    XXElement xeSource = new XXElement(loadDataFromWeb.GetXmlDocument().Root);

    Telechargementz_PostDetail data = new Telechargementz_PostDetail();
    data.SourceUrl = loadDataFromWeb.request.Url;
    data.LoadFromWebDate = loadDataFromWeb.loadFromWebDate;
    data.Id = GetPostDetailKey(data.SourceUrl);

    XXElement xePost = xeSource.XPathElement("//div[@id='dle-content']");

    data.PostAuthor = xePost.XPathValue(".//div[@class='title-info']//a//text()");

    // Text following the author link, e.g. ", 26.12.14"
    string date = xePost.XPathValue(".//div[@class='title-info']//a/following-sibling::text()");
    if (date != null)
    {
        data.PostCreationDate = zdate.ParseDateTimeLikeToday(date.Trim(' ', ','), loadDataFromWeb.loadFromWebDate, "dd.MM.yy");
        if (data.PostCreationDate == null)
        {
            pb.Trace.WriteLine("unknow date time \"{0}\"", date);
        }
        if (__trace)
        {
            pb.Trace.WriteLine("creationDate {0} - \"{1}\"", data.PostCreationDate, date);
        }
    }
    else
    {
        pb.Trace.WriteLine("creationDate not found \"{0}\"", data.SourceUrl);
    }

    // Title is the first text found under the post-title block.
    data.Title = xePost.XPathElement(".//div[@class='post-title']").DescendantTexts().Select(DownloadPrint.Trim).FirstOrDefault();
    PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.Title);
    if (titleInfos.foundInfo)
    {
        // Keep the raw title and replace Title with the version stripped of the extracted infos.
        data.OriginalTitle = data.Title;
        data.Title = titleInfos.title;
        data.infos.SetValues(titleInfos.infos);
    }

    XXElement xe = xePost.XPathElement(".//div[starts-with(@id, 'news-id-')]");
    if (xe.XElement == null)
    {
        // Only traced; the wrapper is still used for the queries below.
        pb.Trace.WriteLine("element not found \".//div[starts-with(@id, 'news-id-')]\"");
    }

    // Only the first image of the post is kept; its url is resolved against the request url.
    WebImage image = xe.DescendantNodes(node => XmlDescendant.ImageFilter(node))
        .Select(xeImg => new WebImage(zurl.GetUrl(loadDataFromWeb.request.Url, xeImg.zAttribValue("src"))))
        .FirstOrDefault();
    if (image != null)
    {
        // The terminating ';' belongs inside the braces; it was misplaced after the closing brace,
        // which does not compile.
        data.Images = new WebImage[] { image };
    }

    // Force the image load to get the image width and height.
    // NOTE(review): when no image was found data.Images keeps whatever value the
    // Telechargementz_PostDetail constructor gave it - confirm LoadImages accepts that value.
    if (loadDataFromWeb.request.LoadImage)
    {
        data.Images = DownloadPrint.LoadImages(data.Images).ToArray();
    }

    data.DownloadLinks = xe.XPathValues(".//a/@href").ToArray();

    // Infos, description, language, size and page count come from the post texts;
    // <a> elements get XNodeFilter.SkipNode, every other node XNodeFilter.SelectNode.
    PrintTextValues_v1 textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_v1(
        xe.DescendantTexts(node => !(node is XElement) || ((XElement)node).Name != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode),
        data.Title);
    data.description = textValues.description;
    data.language = textValues.language;
    data.size = textValues.size;
    data.nbPages = textValues.nbPages;
    data.infos.SetValues(textValues.infos);

    // Print type deduced from the collected infos; first matching rule wins.
    data.PrintType = PrintType.UnknowEBook;
    if (data.infos.ContainsKey("Bd") || data.infos.ContainsKey("bd") || data.infos.ContainsKey("BD"))
    {
        data.PrintType = PrintType.Comics;
    }
    // Editeur : Presse fr
    else if (data.infos.ContainsKey("editeur") && data.infos["editeur"] is ZString && ((string)data.infos["editeur"]).ToLowerInvariant() == "presse fr")
    {
        data.PrintType = PrintType.Print;
    }
    else if (data.infos.ContainsKey("isbn"))
    {
        data.PrintType = PrintType.Book;
    }

    if (__trace)
    {
        pb.Trace.WriteLine(data.zToJson());
    }
    return data;
}
/// <summary>
/// Returns the text found under the td cells of <paramref name="xe"/> as a single string: each
/// text node has the matches of __badCharacters replaced with a space and goes through
/// Handeco.Trim, then the values are joined by zToStringValues with " ".
/// </summary>
/// <param name="xe">Element whose ".//td//text()" values are read.</param>
/// <returns>The joined, cleaned text.</returns>
private static string GetTextValue(XXElement xe)
{
    var cleanedTexts =
        from rawText in xe.XPathValues(".//td//text()")
        select Handeco.Trim(__badCharacters.Replace(rawText, " "));

    return cleanedTexts.zToStringValues(" ");
}
/// <summary>
/// Builds the detail of a Handeco company page: source url, load date, key, last update date,
/// activities, contacts and company-level values read from the "fiche organisation" tables.
/// Values whose name is not recognised by the Set*Value helpers are appended to data.UnknowInfos.
/// </summary>
/// <param name="webResult">Download result; supplies the parsed document, the request (url, key) and the load date.</param>
/// <returns>The populated <see cref="Handeco_Detail"/>.</returns>
private static Handeco_Detail GetData(WebResult webResult)
{
    XXElement xeSource = webResult.Http.zGetXDocument().zXXElement();

    Handeco_Detail data = new Handeco_Detail();
    data.SourceUrl = webResult.WebRequest.HttpRequest.Url;
    data.LoadFromWebDate = webResult.LoadFromWebDate;
    data.Id = GetKey(webResult.WebRequest.HttpRequest);

    // <div style="text-align: right; font-size: 10px;">
    //   <em>Dernière mise à jour le 18-01-2013</em>
    // </div>
    string lastUpdate = Handeco.Trim(xeSource.XPathValue("//em[starts-with(text(), 'Dernière mise à jour')]/text()"));
    if (lastUpdate != null)
    {
        Match match = __lastUpdateRegex.Match(lastUpdate);
        DateTime date;
        // The page uses a fixed dd-MM-yyyy format, so it is parsed with the invariant culture:
        // the current culture could apply a non-Gregorian calendar and shift the year.
        if (match.Success && DateTime.TryParseExact(match.Value, "dd-MM-yyyy", System.Globalization.CultureInfo.InvariantCulture, System.Globalization.DateTimeStyles.None, out date))
        {
            data.DernièreMiseàjour = date;
        }
        else
        {
            data.UnknowInfos.Add(lastUpdate);
        }
    }
    else
    {
        pb.Trace.WriteLine("error \"Dernière mise à jour\" not found");
    }

    // NOTRE OFFRE - activities - multiple
    // <select style="width: 200px; display: none;" onchange="change_activite(this.selectedIndex);" id="select_activites">
    //   <option>Sous-traitance industrielle - Autre</option>
    //   <option>Assemblage mécanique</option>
    //   <option>Energie renouvelable - Autre</option>
    // </select>
    string[] activityTypes = xeSource.XPathValues("//select[@id = 'select_activites']/option/text()").Select(Handeco.Trim).ToArray();

    // CONTACTS - multiple
    // <select style="width: 200px; display: none;" onchange="change_contact(this.selectedIndex);" id="select_contacts">
    //   <option>Jacky STEINLE (Chef d'atelier)</option>
    // </select>
    string[] contactDescriptions = xeSource.XPathValues("//select[@id = 'select_contacts']/option/text()").Select(Handeco.Trim).ToArray();

    int indexActivityType = 0;
    int indexContactDescription = 0;
    List<Activity> activities = new List<Activity>();
    List<Contact> contacts = new List<Contact>();

    foreach (XXElement table in xeSource.XPathElements("//table[@class = 'fiche organisation']"))
    {
        // The table id decides what the table describes. It is lower-cased and matched with
        // culture-independent rules so the result does not depend on the machine's locale.
        string id = table.XPathValue("@id");
        if (id != null)
        {
            id = id.ToLowerInvariant();
        }
        if (__trace)
        {
            pb.Trace.WriteLine("table id = \"{0}\"", id);
        }

        Activity activity = null;
        Contact contact = null;
        if (id != null && id.StartsWith("fiche_activite_", StringComparison.Ordinal))
        {
            activity = new Activity();
            activities.Add(activity);
            // The n-th activity table is paired with the n-th <option> of select_activites.
            if (indexActivityType < activityTypes.Length)
            {
                activity.Type = activityTypes[indexActivityType++];
            }
            else
            {
                pb.Trace.WriteLine("warning miss an activity type in html (<select id='select_activites'>)");
            }
        }
        else if (id != null && id.StartsWith("fiche_contact_", StringComparison.Ordinal))
        {
            contact = new Contact();
            contacts.Add(contact);
            // The n-th contact table is paired with the n-th <option> of select_contacts.
            if (indexContactDescription < contactDescriptions.Length)
            {
                contact.Description = contactDescriptions[indexContactDescription++];
            }
            else
            {
                pb.Trace.WriteLine("warning miss a contact description in html (<select id='select_contacts'>)");
            }
        }

        foreach (XXElement row in table.XPathElements(".//tr"))
        {
            // The <th> text is the value name; rows without one are skipped.
            string valueName = Handeco.Trim(row.XPathValue(".//th//text()"));
            if (valueName == null)
            {
                continue;
            }

            // Dispatch the row to the activity, the contact or the company itself.
            // The scope prefix is traced before the setter runs, as the trace line is built in pieces.
            bool known;
            string unknownLabel;
            if (activity != null)
            {
                if (__trace)
                {
                    pb.Trace.Write("activité ");
                }
                known = SetActivityValue(activity, valueName, row);
                unknownLabel = "valeur activité inconnu : ";
            }
            else if (contact != null)
            {
                if (__trace)
                {
                    pb.Trace.Write("contact ");
                }
                known = SetContactValue(contact, valueName, row);
                unknownLabel = "valeur contact inconnu : ";
            }
            else
            {
                if (__trace)
                {
                    pb.Trace.Write("société ");
                }
                known = SetValue(data, valueName, row);
                unknownLabel = "valeur inconnu : ";
            }

            if (!known)
            {
                if (__trace)
                {
                    pb.Trace.Write("error ");
                }
                data.UnknowInfos.Add(unknownLabel + valueName + " = " + GetTextValue(row));
            }
            else if (__trace)
            {
                pb.Trace.Write(" ");
            }
            if (__trace)
            {
                pb.Trace.WriteLine("\"{0}\" = \"{1}\"", valueName, GetTextValue(row));
            }
        }
    }

    data.Activités = activities.ToArray();
    data.Contacts = contacts.ToArray();

    if (__trace)
    {
        pb.Trace.WriteLine(data.zToJson());
    }
    return data;
}