/// <summary>
/// Fills <paramref name="data"/> from the text nodes found under every
/// <c>div[@class='fiche_bloc']</c> element of <paramref name="xeFiche"/>.
/// </summary>
/// <remarks>
/// The text nodes are walked in document order. A text equal (ignoring case) to a
/// known label switches the current <see cref="Unea_TextType"/>; any other non-empty
/// text is stored in the field selected by the current type. Single-valued fields
/// reset the type to <c>unknow</c> after one value; activities, sectors and address
/// keep accumulating until the next label. While the type is <c>unknow</c>, the first
/// free text becomes <c>data.name</c> and the following ones go to <c>data.unknowInfos</c>.
/// </remarks>
/// <param name="data">Company record that receives the extracted values.</param>
/// <param name="xeFiche">Element that contains the <c>fiche_bloc</c> divs.</param>
private static void GetNewDataFicheBloc(Unea_DetailCompany1 data, XXElement xeFiche)
{
    // Sample header markup:
    // <div class="fiche_entete"><!-- <h3>Logo UNEA</h3> --><h1>ALSACE ENTREPRISE ADAPTEE</h1></div>
    bool firstText = true;
    Unea_TextType textType = Unea_TextType.unknow;

    foreach (XText xtext in xeFiche.XPathElements(".//div[@class='fiche_bloc']").DescendantTextNodes())
    {
        string text = __trimFunc2(xtext.Value);
        if (text == "")
        {
            continue;
        }

        // fiche_bloc no 1
        if (text.Equals("Activités", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.activity;
        }
        else if (text.Equals("Région - Département", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.location;
        }
        else if (text.Equals("Filières Métiers UNEA", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.sector;
        }
        // fiche_bloc no 2
        else if (text.Equals("Présentation de l'Entreprise Adaptée", StringComparison.InvariantCultureIgnoreCase))
        {
            // Section title only: texts that follow are handled by the default case.
            textType = Unea_TextType.unknow;
        }
        else if (text.Equals("Présentation", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.presentation;
        }
        else if (text.Equals("Principaux clients", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.client;
        }
        else if (text.Equals("Dirigeant", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.leader;
        }
        else if (text.Equals("Nombre de salariés", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.employeNumber;
        }
        else if (text.Equals("Chiffre d'affaire de l'année écoulée", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.lastYearRevenue;
        }
        else if (text.Equals("Certification", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.certification;
        }
        else if (text.Equals("Numéro SIRET", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.siret;
        }
        else if (text.Equals("Photos", StringComparison.InvariantCultureIgnoreCase))
        {
            // Images are looked up in the <p> siblings that follow the paragraph holding the label.
            foreach (XXElement xeImage in new XXElement(xtext.Parent).XPathElements("ancestor::p/following-sibling::p//img"))
            {
                string url = xeImage.XPathValue("@src");
                if (url == null)
                {
                    // Presumably XPathValue yields null when the attribute is missing (to be confirmed);
                    // a null key would make ContainsKey throw and abort the whole parse.
                    Trace.CurrentTrace.WriteLine("warning photo without src after \"{0}\"", text);
                    continue;
                }

                if (!data.photos.ContainsKey(url))
                {
                    data.photos.Add(url, null);
                }
                else
                {
                    Trace.CurrentTrace.WriteLine("warning photo already exists \"{0}\"", url);
                }
            }

            // novalues: ignore every following text until the next label.
            textType = Unea_TextType.novalues;
        }
        else if (text.Equals("Documents téléchargeables", StringComparison.InvariantCultureIgnoreCase))
        {
            // Walk the following <p> siblings, stopping at the first one whose value starts with "Photos".
            IEnumerable<XXElement> documentParagraphs = new XXElement(xtext.Parent)
                .XPathElements("ancestor::p/following-sibling::p")
                .TakeWhile(e => !e.XElement.Value.StartsWith("Photos", StringComparison.InvariantCultureIgnoreCase));

            foreach (XXElement xeParagraph in documentParagraphs)
            {
                XXElement xeLink = xeParagraph.XPathElement(".//a", writeError: false);
                if (xeLink.XElement == null)
                {
                    // Paragraph without a link: nothing to record.
                    continue;
                }

                string url = xeLink.XPathValue("@href");
                if (url == null)
                {
                    // Same guard as for photos: never use a null dictionary key.
                    Trace.CurrentTrace.WriteLine("warning download document without href after \"{0}\"", text);
                    continue;
                }

                string name = __trimFunc2(xeLink.XPathValue(".//text()"));
                if (!data.downloadDocuments.ContainsKey(url))
                {
                    data.downloadDocuments.Add(url, new Unea_Document() { name = name, url = url });
                }
                else
                {
                    Trace.CurrentTrace.WriteLine("warning download document already exists \"{0}\" \"{1}\"", name, url);
                }
            }

            // textType = novalues so that link texts such as Plaquette_AEA.pdf do not end up in unknowInfos
            textType = Unea_TextType.novalues;
        }
        // fiche_bloc no 3
        else if (text.Equals("Nous contacter", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.unknow;
        }
        else if (text.Equals("Adresse", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.address;
        }
        else if (text.Equals("Téléphone", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.phone;
        }
        else if (text.Equals("Fax", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.fax;
        }
        else if (text.Equals("Adresse e-mail", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.email;
        }
        else if (text.Equals("Site internet", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.webSite;
        }
        else
        {
            // Not a label: store the text according to the label seen last.
            switch (textType)
            {
                // fiche_bloc no 1
                case Unea_TextType.activity:
                    if (!data.activities.ContainsKey(text))
                    {
                        data.activities.Add(text, null);
                    }
                    else
                    {
                        Trace.CurrentTrace.WriteLine("warning activity already exists \"{0}\"", text);
                    }
                    break;
                case Unea_TextType.location:
                    data.location = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.sector:
                    if (!data.sectors.ContainsKey(text))
                    {
                        data.sectors.Add(text, null);
                    }
                    else
                    {
                        Trace.CurrentTrace.WriteLine("warning sector already exists \"{0}\"", text);
                    }
                    break;

                // fiche_bloc no 2
                case Unea_TextType.presentation:
                    data.presentation = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.client:
                    data.clients = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.leader:
                    data.leader = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.employeNumber:
                    int employeNumber;
                    if (int.TryParse(text, out employeNumber))
                    {
                        data.employeNumber = employeNumber;
                    }
                    else
                    {
                        Trace.CurrentTrace.WriteLine("error unknow employe number \"{0}\"", text);
                    }
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.lastYearRevenue:
                    data.lastYearRevenue = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.certification:
                    data.certification = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.siret:
                    data.siret = text;
                    textType = Unea_TextType.unknow;
                    break;

                // fiche_bloc no 3
                case Unea_TextType.address:
                    // An address may span several text nodes: join them with a space.
                    if (data.address == null)
                    {
                        data.address = text;
                    }
                    else
                    {
                        data.address += " " + text;
                    }
                    break;
                case Unea_TextType.phone:
                    data.phone = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.fax:
                    data.fax = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.email:
                    data.email = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.webSite:
                    data.webSite = text;
                    textType = Unea_TextType.unknow;
                    break;

                case Unea_TextType.novalue:
                    // Drop exactly one text, then go back to the default handling.
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.novalues:
                    // Drop every text until the next label.
                    break;
                default:
                    if (firstText)
                    {
                        data.name = text;
                        firstText = false;
                    }
                    else
                    {
                        data.unknowInfos.Add(text);
                    }
                    break;
            }
        }
    }
}
/// <summary>
/// Builds one <see cref="Unea_HeaderCompany"/> per <c>div[@class = 'ctn_result']</c>
/// element of the loaded document.
/// </summary>
/// <remarks>
/// For each result the detail urls and the name are read from the
/// <c>ctn_result-header</c> div; the texts of the <c>ctn_result-content clearfix</c>
/// div are then walked in order: a known label selects the target field and the
/// following text(s) are stored in it. Texts that follow no known label go to
/// <c>unknowInfos</c>.
/// </remarks>
/// <returns>The headers in document order; an empty array when no result div is found.</returns>
protected override Unea_HeaderCompany[] GetData()
{
    XXElement xeSource = new XXElement(GetXmlDocument().Root);
    string url = Url;

    // <div class="ctn_result">
    IEnumerable<XXElement> xeHeaders = xeSource.XPathElements("//div[@class = 'ctn_result']");
    List<Unea_HeaderCompany> headers = new List<Unea_HeaderCompany>();

    foreach (XXElement xeHeader in xeHeaders)
    {
        Unea_HeaderCompany header = new Unea_HeaderCompany();
        header.sourceUrl = url;
        header.loadFromWebDate = DateTime.Now;

        // <div class="ctn_result-header">
        XXElement xe = xeHeader.XPathElement(".//div[@class='ctn_result-header']");

        // <div class="lien"><a href="http://www.unea.fr/union-nationale-entreprises-adaptees/annuaire-unea/71/4583/ACCAA TAKTIM.htm" target="_blank"><strong>></strong> Voir la fiche détaillée</a></div>
        header.urlDetail2 = zurl.GetUrl(url, xe.ExplicitXPathValue(".//div[@class = 'lien']//a/@href"));

        // <iframe src="detail.asp?id=4583" width="420" height="800" frameborder="0" scrolling="auto" marginheight="0" marginwidth="0"></iframe>
        header.urlDetail1 = zurl.GetUrl(url, xe.ExplicitXPathValue(".//iframe/@src"));

        // <h4><a href="..." target="_blank"> </a><span>|</span> ACCAA TAKTIM</h4>
        // The name is the last trimmed text of the header div.
        header.name = xe.DescendantTexts().Select(__trimFunc2).LastOrDefault();

        // <div class="ctn_result-content clearfix">
        //   ...
        //   <p>
        //     <strong>Activités:</strong> TRAVAUX PAYSAGERS<br>PROPRETE<br>PRESTATION DE SERVICES<br>...<br />
        //     <strong>Région - Département:</strong> Alsace - HAUT RHIN (68)<br />
        //     <strong>Téléphone:</strong> 0389570210
        //     <strong>Fax:</strong> 0389571761
        //     <strong>Adresse e-mail:</strong>
        //     <a href="mailto:...">...</a>
        //   </p>
        // </div>
        Unea_TextType textType = Unea_TextType.unknow;
        foreach (string s in xeHeader.XPathElements(".//div[@class = 'ctn_result-content clearfix']").DescendantTexts().Select(__trimFunc2))
        {
            if (string.IsNullOrEmpty(s))
            {
                // Blank text carries no information. Without this guard an empty node that
                // follows a label would be stored as that label's value and reset the state,
                // so the real value would land in unknowInfos.
                continue;
            }

            if (s.Equals("Activités", StringComparison.InvariantCultureIgnoreCase))
            {
                textType = Unea_TextType.activity;
            }
            else if (s.Equals("Région - Département", StringComparison.InvariantCultureIgnoreCase))
            {
                textType = Unea_TextType.location;
            }
            else if (s.Equals("Téléphone", StringComparison.InvariantCultureIgnoreCase))
            {
                textType = Unea_TextType.phone;
            }
            else if (s.Equals("Fax", StringComparison.InvariantCultureIgnoreCase))
            {
                textType = Unea_TextType.fax;
            }
            else if (s.Equals("Adresse e-mail", StringComparison.InvariantCultureIgnoreCase))
            {
                textType = Unea_TextType.email;
            }
            else
            {
                // Not a label: store the text according to the label seen last.
                switch (textType)
                {
                    case Unea_TextType.activity:
                        // Several activities may follow the label; duplicates are silently ignored.
                        if (!header.activities.ContainsKey(s))
                        {
                            header.activities.Add(s, null);
                        }
                        break;
                    case Unea_TextType.location:
                        header.location = s;
                        textType = Unea_TextType.unknow;
                        break;
                    case Unea_TextType.phone:
                        header.phone = s;
                        textType = Unea_TextType.unknow;
                        break;
                    case Unea_TextType.fax:
                        header.fax = s;
                        textType = Unea_TextType.unknow;
                        break;
                    case Unea_TextType.email:
                        header.email = s;
                        textType = Unea_TextType.unknow;
                        break;
                    default:
                        header.unknowInfos.Add(s);
                        break;
                }
            }
        }

        headers.Add(header);
    }

    return headers.ToArray();
}
/// <summary>
/// Builds a <see cref="Unea_DetailCompany2"/> from the
/// <c>div[@class='ctn_content-article']</c> element of the loaded document.
/// </summary>
/// <remarks>
/// Name and presentation are the second and third texts found outside
/// <c>script</c> and <c>table</c> elements. The texts inside the tables are then
/// walked in order: an upper-case label selects the target field and the
/// following text(s) are stored in it. Photos are read last from
/// <c>.//table//td/a/img</c>.
/// </remarks>
/// <returns>The populated company record.</returns>
protected override Unea_DetailCompany2 GetData()
{
    XXElement xeSource = new XXElement(GetXmlDocument().Root);
    Unea_DetailCompany2 data = new Unea_DetailCompany2();
    data.sourceUrl = Url;
    data.loadFromWebDate = DateTime.Now;

    // <div class='ctn_content-article'>
    XXElement xeContent = xeSource.XPathElement(".//div[@class='ctn_content-article']");

    // Only the first three texts outside <script>/<table> are needed. Materialising them
    // with Take(3).ToList() reads the same items as stepping an enumerator by hand, and
    // also disposes the underlying enumerator.
    List<string> leadingTexts = xeContent
        .DescendantTexts(node => (!(node is XElement) || (((XElement)node).Name != "script" && ((XElement)node).Name != "table")) ? XNodeFilter.SelectNode : XNodeFilter.SkipNode)
        .Select(__trimFunc2)
        .Take(3)
        .ToList();

    // <h1>
    //   <img src="http://unea.griotte.biz/BaseDocumentaire/Docs/Public/4017/LOGOAmpouleC.JPG" style='border-width:2px;border-color:#5593C9;' height='60px' />
    //   <span>Entreprise Adaptée</span><br />
    //   ALSACE ENTREPRISE ADAPTEE
    // </h1>
    // The first text ("Entreprise Adaptée" in the sample) is skipped; the second one is the name.
    if (leadingTexts.Count >= 2)
    {
        data.name = leadingTexts[1];
    }

    // <h2>ALSACE ENTREPRISE ADAPTEE est implantée sur les sites de Colmar et Mulhouse ...</h2>
    if (leadingTexts.Count >= 3)
    {
        data.presentation = leadingTexts[2];
    }

    Unea_TextType textType = Unea_TextType.unknow;
    foreach (XText xtext in xeContent.XPathElements(".//table").DescendantTextNodes())
    {
        string text = __trimFunc2(xtext.Value);
        if (text == "")
        {
            continue;
        }

        if (text.Equals("NOS ACTIVITES", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.activity;
        }
        else if (text.Equals("FILIERES METIER UNEA", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.sector;
        }
        else if (text.Equals("DOCUMENTS TÉLÉCHARGEABLES", StringComparison.InvariantCultureIgnoreCase))
        {
            foreach (XXElement xeLink in new XXElement(xtext.Parent).XPathElements("following-sibling::ul//a"))
            {
                string url = xeLink.XPathValue("@href");
                if (url == null)
                {
                    // Presumably XPathValue yields null when the attribute is missing (to be confirmed);
                    // a null key would make ContainsKey throw and abort the whole parse.
                    Trace.CurrentTrace.WriteLine("warning download document without href after \"{0}\"", text);
                    continue;
                }

                string name = __trimFunc2(xeLink.XPathValue(".//text()"));
                if (!data.downloadDocuments.ContainsKey(url))
                {
                    data.downloadDocuments.Add(url, new Unea_Document() { name = name, url = url });
                }
                else
                {
                    Trace.CurrentTrace.WriteLine("warning download document already exists \"{0}\" \"{1}\"", name, url);
                }
            }

            // textType = novalues so that link texts such as Plaquette_AEA.pdf do not end up in unknowInfos
            textType = Unea_TextType.novalues;
        }
        else if (text.Equals("NOUS CONTACTER", StringComparison.InvariantCultureIgnoreCase))
        {
            // NOTE(review): novalue has no case in the switch below, so texts that follow
            // this title fall into the default case (unknowInfos) until the next label.
            textType = Unea_TextType.novalue;
        }
        else if (text.Equals("ADRESSE", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.address;
        }
        else if (text.Equals("TELEPHONE", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.phone;
        }
        else if (text.Equals("FAX", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.fax;
        }
        else if (text.Equals("EMAIL", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.email;
        }
        else if (text.Equals("SITE", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.webSite;
        }
        else if (text.Equals("QUI SOMMES NOUS", StringComparison.InvariantCultureIgnoreCase))
        {
            // Same remark as for "NOUS CONTACTER": handled by the default case.
            textType = Unea_TextType.novalue;
        }
        else if (text.Equals("DIRIGEANT", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.leader;
        }
        else if (text.Equals("NOMBRE DE SALARIÉS", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.employeNumber;
        }
        else if (text.Equals("CHIFFRE D'AFFAIRE DE L'ANNÉE ÉCOULÉE", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.lastYearRevenue;
        }
        else if (text.Equals("NUMÉRO SIRET", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.siret;
        }
        else if (text.Equals("CERTIFICATION", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.certification;
        }
        else if (text.Equals("PRINCIPAUX CLIENTS", StringComparison.InvariantCultureIgnoreCase))
        {
            textType = Unea_TextType.client;
        }
        else
        {
            // Not a label: store the text according to the label seen last.
            switch (textType)
            {
                case Unea_TextType.activity:
                    if (!data.activities.ContainsKey(text))
                    {
                        data.activities.Add(text, null);
                    }
                    else
                    {
                        Trace.CurrentTrace.WriteLine("warning activity already exists \"{0}\"", text);
                    }
                    break;
                case Unea_TextType.sector:
                    if (!data.sectors.ContainsKey(text))
                    {
                        data.sectors.Add(text, null);
                    }
                    else
                    {
                        Trace.CurrentTrace.WriteLine("warning sector already exists \"{0}\"", text);
                    }
                    break;
                case Unea_TextType.address:
                    // An address may span several text nodes: join them with a space.
                    if (data.address == null)
                    {
                        data.address = text;
                    }
                    else
                    {
                        data.address += " " + text;
                    }
                    break;
                case Unea_TextType.phone:
                    data.phone = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.fax:
                    data.fax = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.email:
                    data.email = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.webSite:
                    data.webSite = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.leader:
                    data.leader = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.employeNumber:
                    int employeNumber;
                    if (int.TryParse(text, out employeNumber))
                    {
                        data.employeNumber = employeNumber;
                    }
                    else
                    {
                        Trace.CurrentTrace.WriteLine("error unknow employe number \"{0}\"", text);
                    }
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.lastYearRevenue:
                    // A lone euro sign means no amount was given.
                    if (text != "€")
                    {
                        data.lastYearRevenue = text;
                    }
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.siret:
                    data.siret = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.certification:
                    data.certification = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.client:
                    data.clients = text;
                    textType = Unea_TextType.unknow;
                    break;
                case Unea_TextType.novalues:
                    // Drop every text until the next label.
                    break;
                default:
                    data.unknowInfos.Add(text);
                    break;
            }
        }
    }

    foreach (XXElement xeImage in xeContent.XPathElements(".//table//td/a/img"))
    {
        string url = xeImage.XPathValue("@src");
        if (url == null)
        {
            // Same guard as for documents: never use a null dictionary key.
            Trace.CurrentTrace.WriteLine("warning photo without src in \"{0}\"", data.sourceUrl);
            continue;
        }

        if (!data.photos.ContainsKey(url))
        {
            data.photos.Add(url, null);
        }
        else
        {
            Trace.CurrentTrace.WriteLine("warning photo already exists \"{0}\"", url);
        }
    }

    return data;
}