/// <summary>
/// Builds a <see cref="Handeco_HeaderPage"/> from a downloaded result-list page:
/// the page identity, the link to the next page, and one <see cref="Handeco_Header"/>
/// per table row after the first row of each table.
/// </summary>
/// <param name="webResult">Downloaded page; its XML document, request url and load date are read.</param>
/// <returns>The populated header page.</returns>
private static Handeco_HeaderPage GetData(WebResult webResult)
{
    XXElement root = new XXElement(webResult.Http.zGetXDocument().Root);
    string pageUrl = webResult.WebRequest.HttpRequest.Url;

    Handeco_HeaderPage page = new Handeco_HeaderPage
    {
        SourceUrl = pageUrl,
        LoadFromWebDate = webResult.LoadFromWebDate,
        Id = GetPageKey(webResult.WebRequest.HttpRequest)
    };

    // Pagination markup, the "next" control is the element before the last one:
    //   <div class="paginationControl">
    //   page n    : <a href="/fournisseurs/rechercher/page/2#resultats">></a> |
    //   last page : <span class="disabled">></span> |
    // NOTE(review): on the last page there is no @href to read; this presumably relies on
    // zurl.GetUrl / zurl.RemoveFragment tolerating a missing value - confirm.
    string nextHref = root.XPathValue("//div[@class='paginationControl']//*[position()=last()-1]/@href");
    page.UrlNextPage = zurl.RemoveFragment(zurl.GetUrl(pageUrl, nextHref));

    // Column layout read below: 1 = name + detail link, 2 = type, 3 = groups, 4 = activities, 5 = postal code.
    // NOTE(review): each header is stamped with DateTime.Now while the page itself uses
    // webResult.LoadFromWebDate - confirm this difference is intended.
    page.Headers = root.XPathElements("//table//tr[position() > 1]")
        .Select(row => new Handeco_Header
        {
            SourceUrl = pageUrl,
            LoadFromWebDate = DateTime.Now,
            Name = Handeco.Trim(row.XPathValue(".//td[1]//text()")),
            UrlDetail = zurl.RemoveFragment(zurl.GetUrl(pageUrl, row.XPathValue(".//td[1]//a/@href"))),
            Type = Handeco.Trim(row.XPathValue(".//td[2]//text()")),
            Groupes = row.XPathValues(".//td[3]//text()").Select(Handeco.Trim).ToArray(),
            Activités = row.XPathValues(".//td[4]//text()").Select(Handeco.Trim).ToArray(),
            PostalCode = Handeco.Trim(row.XPathValue(".//td[5]//text()"))
        })
        .ToArray();

    return page;
}
/// <summary>
/// Collects every text node under the td cells of <paramref name="xe"/>, replaces the
/// matches of <c>__badCharacters</c> with a space, trims each piece with
/// <c>Handeco.Trim</c>, and combines the pieces through <c>zToStringValues(" ")</c>
/// (presumably a space-separated join - confirm against the helper).
/// </summary>
/// <param name="xe">Element (a table row in the visible callers) whose cells are read.</param>
/// <returns>The combined text of the cells.</returns>
private static string GetTextValue(XXElement xe)
{
    var cleanedTexts = xe.XPathValues(".//td//text()")
        .Select(text => Handeco.Trim(__badCharacters.Replace(text, " ")));
    return cleanedTexts.zToStringValues(" ");
}
/// <summary>
/// Returns, as an array, every text node found under the td cells of <paramref name="xe"/>,
/// each with the matches of <c>__badCharacters</c> replaced by a space and then passed
/// through <c>Handeco.Trim</c>.
/// </summary>
/// <param name="xe">Element whose td cells are read.</param>
/// <returns>One cleaned string per text node, in the order the XPath query yields them.</returns>
private static string[] GetTextValues(XXElement xe)
{
    var rawTexts = xe.XPathValues(".//td//text()");
    return rawTexts
        .Select(text => Handeco.Trim(__badCharacters.Replace(text, " ")))
        .ToArray();
}
/// <summary>
/// Builds a <see cref="Handeco_Detail"/> from a downloaded company detail page.
/// Reads the "Dernière mise à jour" stamp, then walks every
/// <c>table class="fiche organisation"</c>: tables whose id starts with
/// <c>fiche_activite_</c> become activities, tables whose id starts with
/// <c>fiche_contact_</c> become contacts, and any other table feeds the company record itself.
/// Rows whose label is not recognised by the matching setter are recorded in
/// <c>data.UnknowInfos</c> instead of being dropped.
/// </summary>
/// <param name="webResult">Downloaded page; its XML document, request url and load date are read.</param>
/// <returns>The populated detail record.</returns>
private static Handeco_Detail GetData(WebResult webResult)
{
    XXElement xeSource = webResult.Http.zGetXDocument().zXXElement();

    Handeco_Detail data = new Handeco_Detail();
    data.SourceUrl = webResult.WebRequest.HttpRequest.Url;
    data.LoadFromWebDate = webResult.LoadFromWebDate;
    data.Id = GetKey(webResult.WebRequest.HttpRequest);

    // Last-update stamp, e.g.:
    //   <div style="text-align: right; font-size: 10px;">
    //     <em>Dernière mise à jour le 18-01-2013</em>
    //   </div>
    string lastUpdate = Handeco.Trim(xeSource.XPathValue("//em[starts-with(text(), 'Dernière mise à jour')]/text()"));
    if (lastUpdate == null)
    {
        pb.Trace.WriteLine("error \"Dernière mise à jour\" not found");
    }
    else
    {
        Match match = __lastUpdateRegex.Match(lastUpdate);
        DateTime date;
        // The stamp is parsed against the fixed numeric pattern dd-MM-yyyy, so the invariant
        // culture is used: with the current culture the result would depend on the calendar
        // configured on the machine running the scraper.
        if (match.Success
            && DateTime.TryParseExact(match.Value, "dd-MM-yyyy", System.Globalization.CultureInfo.InvariantCulture, System.Globalization.DateTimeStyles.None, out date))
        {
            data.DernièreMiseàjour = date;
        }
        else
        {
            // Keep the raw text so an unparsable stamp is not silently lost.
            data.UnknowInfos.Add(lastUpdate);
        }
    }

    // NOTRE OFFRE - activity types (multiple), e.g.:
    //   <select style="width: 200px; display: none;" onchange="change_activite(this.selectedIndex);" id="select_activites">
    //     <option>Sous-traitance industrielle - Autre</option>
    //     <option>Assemblage mécanique</option>
    //     <option>Energie renouvelable - Autre</option>
    //   </select>
    string[] activityTypes = xeSource.XPathValues("//select[@id = 'select_activites']/option/text()").Select(Handeco.Trim).ToArray();

    // CONTACTS - contact descriptions (multiple), e.g.:
    //   <select style="width: 200px; display: none;" onchange="change_contact(this.selectedIndex);" id="select_contacts">
    //     <option>Jacky STEINLE (Chef d'atelier)</option>
    //   </select>
    string[] contactDescriptions = xeSource.XPathValues("//select[@id = 'select_contacts']/option/text()").Select(Handeco.Trim).ToArray();

    // The n-th option of each select is paired with the n-th matching table, in document order.
    int indexActivityType = 0;
    int indexContactDescription = 0;
    List<Activity> activities = new List<Activity>();
    List<Contact> contacts = new List<Contact>();

    foreach (XXElement xeTable in xeSource.XPathElements("//table[@class = 'fiche organisation']"))
    {
        string id = xeTable.XPathValue("@id");
        if (id != null)
        {
            // Invariant lower-casing + ordinal prefix tests: the id is a markup identifier,
            // so matching must not vary with the host culture (e.g. Turkish dotless i).
            id = id.ToLowerInvariant();
        }
        if (__trace)
        {
            pb.Trace.WriteLine("table id = \"{0}\"", id);
        }

        Activity activity = null;
        Contact contact = null;
        if (id != null && id.StartsWith("fiche_activite_", StringComparison.Ordinal))
        {
            activity = new Activity();
            activities.Add(activity);
            if (indexActivityType < activityTypes.Length)
            {
                activity.Type = activityTypes[indexActivityType++];
            }
            else
            {
                pb.Trace.WriteLine("warning miss an activity type in html (<select id='select_activites'>)");
            }
        }
        else if (id != null && id.StartsWith("fiche_contact_", StringComparison.Ordinal))
        {
            contact = new Contact();
            contacts.Add(contact);
            if (indexContactDescription < contactDescriptions.Length)
            {
                contact.Description = contactDescriptions[indexContactDescription++];
            }
            else
            {
                pb.Trace.WriteLine("warning miss a contact description in html (<select id='select_contacts'>)");
            }
        }

        foreach (XXElement xeRow in xeTable.XPathElements(".//tr"))
        {
            // The th text is the label of the row; rows without a label carry no value to store.
            string valueName = Handeco.Trim(xeRow.XPathValue(".//th//text()"));
            if (valueName == null)
            {
                continue;
            }

            // Dispatch the row to the record the enclosing table belongs to. The trace label is
            // written before the setter runs so that anything the setter traces follows it.
            bool known;
            string unknownPrefix;
            if (activity != null)
            {
                if (__trace)
                {
                    pb.Trace.Write("activité ");
                }
                known = SetActivityValue(activity, valueName, xeRow);
                unknownPrefix = "valeur activité inconnu : ";
            }
            else if (contact != null)
            {
                if (__trace)
                {
                    pb.Trace.Write("contact ");
                }
                known = SetContactValue(contact, valueName, xeRow);
                unknownPrefix = "valeur contact inconnu : ";
            }
            else
            {
                if (__trace)
                {
                    pb.Trace.Write("société ");
                }
                known = SetValue(data, valueName, xeRow);
                unknownPrefix = "valeur inconnu : ";
            }

            // The cell text is only needed for the unknown-value record and for tracing;
            // compute it once, and not at all when neither applies.
            string text = (!known || __trace) ? GetTextValue(xeRow) : null;

            if (!known)
            {
                if (__trace)
                {
                    pb.Trace.Write("error ");
                }
                data.UnknowInfos.Add(unknownPrefix + valueName + " = " + text);
            }
            else if (__trace)
            {
                pb.Trace.Write(" ");
            }

            if (__trace)
            {
                pb.Trace.WriteLine("\"{0}\" = \"{1}\"", valueName, text);
            }
        }
    }

    data.Activités = activities.ToArray();
    data.Contacts = contacts.ToArray();

    if (__trace)
    {
        pb.Trace.WriteLine(data.zToJson());
    }

    return data;
}