예제 #1
0
        /// <summary>
        /// Builds a <see cref="Handeco_HeaderPage"/> from a downloaded result-list page:
        /// the page identity, the url of the next page, and one <see cref="Handeco_Header"/>
        /// per table row.
        /// </summary>
        /// <param name="webResult">Downloaded page; its document, request url and load date are read.</param>
        /// <returns>The populated page, with <c>Headers</c> holding one entry per selected row.</returns>
        private static Handeco_HeaderPage GetData(WebResult webResult)
        {
            XXElement          xeSource = new XXElement(webResult.Http.zGetXDocument().Root);
            string             url      = webResult.WebRequest.HttpRequest.Url;
            Handeco_HeaderPage data     = new Handeco_HeaderPage();

            data.SourceUrl       = url;
            data.LoadFromWebDate = webResult.LoadFromWebDate;
            data.Id = GetPageKey(webResult.WebRequest.HttpRequest);

            // Pagination markup:
            // <div class="paginationControl">
            // page n    : <a href="/fournisseurs/rechercher/page/2#resultats">&gt;</a> |
            // last page : <span class="disabled">&gt;</span> |
            // The "next" control is the second-to-last element; on the last page it is a <span> without href.
            // NOTE(review): what XPathValue / zurl.GetUrl return when no href matches is not visible here - confirm.
            data.UrlNextPage = zurl.RemoveFragment(zurl.GetUrl(url, xeSource.XPathValue("//div[@class='paginationControl']//*[position()=last()-1]/@href")));

            // Every <tr> except the first one of its parent (presumably the column-title row).
            IEnumerable<XXElement> xeHeaders = xeSource.XPathElements("//table//tr[position() > 1]");
            List<Handeco_Header>   headers   = new List<Handeco_Header>();

            foreach (XXElement xeHeader in xeHeaders)
            {
                Handeco_Header header = new Handeco_Header();
                header.SourceUrl       = url;
                // Use the load date of the page the row was read from, like the page object itself,
                // instead of the parse-time clock: both stay consistent when the page is parsed later.
                header.LoadFromWebDate = webResult.LoadFromWebDate;
                // Column 1: name and link to the detail page (fragment removed).
                header.Name            = Handeco.Trim(xeHeader.XPathValue(".//td[1]//text()"));
                header.UrlDetail       = zurl.RemoveFragment(zurl.GetUrl(url, xeHeader.XPathValue(".//td[1]//a/@href")));
                // Columns 2 to 5: type, groups, activities, postal code.
                header.Type            = Handeco.Trim(xeHeader.XPathValue(".//td[2]//text()"));
                header.Groupes         = xeHeader.XPathValues(".//td[3]//text()").Select(Handeco.Trim).ToArray();
                header.Activités       = xeHeader.XPathValues(".//td[4]//text()").Select(Handeco.Trim).ToArray();
                header.PostalCode      = Handeco.Trim(xeHeader.XPathValue(".//td[5]//text()"));
                headers.Add(header);
            }

            data.Headers = headers.ToArray();
            return data;
        }
예제 #2
0
 /// <summary>
 /// Collects every text node found under the &lt;td&gt; descendants of <paramref name="xe"/>,
 /// replaces each <c>__badCharacters</c> match with a space, passes the result through
 /// <c>Handeco.Trim</c>, and combines the pieces with <c>zToStringValues(" ")</c>.
 /// </summary>
 /// <param name="xe">Element (a table row in the visible callers) whose cell texts are read.</param>
 /// <returns>The combined text; presumably space-separated - confirm against <c>zToStringValues</c>.</returns>
 private static string GetTextValue(XXElement xe)
 {
     var cleanedTexts = xe.XPathValues(".//td//text()")
                          .Select(rawText => Handeco.Trim(__badCharacters.Replace(rawText, " ")));

     return cleanedTexts.zToStringValues(" ");
 }
예제 #3
0
 /// <summary>
 /// Collects every text node found under the &lt;td&gt; descendants of <paramref name="xe"/>,
 /// replaces each <c>__badCharacters</c> match with a space and passes the result through
 /// <c>Handeco.Trim</c>.
 /// </summary>
 /// <param name="xe">Element whose cell texts are read.</param>
 /// <returns>One cleaned string per text node, in the order the XPath query yields them.</returns>
 private static string[] GetTextValues(XXElement xe)
 {
     var cleanedTexts =
         from rawText in xe.XPathValues(".//td//text()")
         select Handeco.Trim(__badCharacters.Replace(rawText, " "));

     return cleanedTexts.ToArray();
 }
예제 #4
0
        /// <summary>
        /// Builds a <see cref="Handeco_Detail"/> from a downloaded detail page.
        /// </summary>
        /// <remarks>
        /// Reads the "Dernière mise à jour" date, then walks every
        /// <c>table[@class = 'fiche organisation']</c>. A table whose id starts with
        /// "fiche_activite_" feeds a new <c>Activity</c>, one whose id starts with
        /// "fiche_contact_" feeds a new <c>Contact</c>, any other table feeds the detail itself.
        /// Each row with a &lt;th&gt; label is handed to the matching <c>Set*Value</c> helper;
        /// labels the helper rejects are recorded in <c>data.UnknowInfos</c>.
        /// </remarks>
        /// <param name="webResult">Downloaded page; its document, request and load date are read.</param>
        /// <returns>The populated detail, including its activities and contacts.</returns>
        private static Handeco_Detail GetData(WebResult webResult)
        {
            XXElement      xeSource = webResult.Http.zGetXDocument().zXXElement();
            Handeco_Detail data     = new Handeco_Detail();

            data.SourceUrl       = webResult.WebRequest.HttpRequest.Url;
            data.LoadFromWebDate = webResult.LoadFromWebDate;
            data.Id = GetKey(webResult.WebRequest.HttpRequest);

            //<div style="text-align: right; font-size: 10px;">
            //<em>Dernière mise à jour le 18-01-2013</em>
            //</div>
            string lastUpdate = Handeco.Trim(xeSource.XPathValue("//em[starts-with(text(), 'Dernière mise à jour')]/text()"));

            if (lastUpdate != null)
            {
                Match    match = __lastUpdateRegex.Match(lastUpdate);
                DateTime date;
                // The page uses a fixed numeric "dd-MM-yyyy" format, so parse it with the invariant
                // culture: the current culture may use a non-Gregorian calendar and misread the year.
                if (match.Success && DateTime.TryParseExact(match.Value, "dd-MM-yyyy", System.Globalization.CultureInfo.InvariantCulture, System.Globalization.DateTimeStyles.None, out date))
                {
                    data.DernièreMiseàjour = date;
                }
                else
                {
                    // Keep the unparsed text so the information is not lost.
                    data.UnknowInfos.Add(lastUpdate);
                }
            }
            else
            {
                pb.Trace.WriteLine("error \"Dernière mise à jour\" not found");
            }

            // NOTRE OFFRE - activities - multiple
            //<select style="width: 200px; display: none;" onchange="change_activite(this.selectedIndex);" id="select_activites">
            //    <option>Sous-traitance industrielle - Autre</option>
            //    <option>Assemblage mécanique</option>
            //    <option>Energie renouvelable - Autre</option>
            //</select>
            string[] activityTypes = xeSource.XPathValues("//select[@id = 'select_activites']/option/text()").Select(Handeco.Trim).ToArray();

            // CONTACTS - multiple
            //<select style="width: 200px; display: none;" onchange="change_contact(this.selectedIndex);" id="select_contacts">
            //    <option>Jacky STEINLE (Chef d'atelier)</option>
            //</select>
            string[] contactDescriptions = xeSource.XPathValues("//select[@id = 'select_contacts']/option/text()").Select(Handeco.Trim).ToArray();

            // The n-th activity/contact table is paired with the n-th <option> of its <select>.
            int             indexActivityType       = 0;
            int             indexContactDescription = 0;
            List <Activity> activities = new List <Activity>();
            List <Contact>  contacts   = new List <Contact>();

            foreach (XXElement xxe in xeSource.XPathElements("//table[@class = 'fiche organisation']"))
            {
                string id = xxe.XPathValue("@id");
                if (id != null)
                {
                    // Culture-independent lowering: the id is an identifier, not linguistic text.
                    id = id.ToLowerInvariant();
                }

                if (__trace)
                {
                    pb.Trace.WriteLine("table id = \"{0}\"", id);
                }

                Activity activity = null;
                Contact  contact  = null;
                // Labels used for tracing and for unknown-value messages; fixed for the whole table.
                string   traceLabel    = "société  ";
                string   unknownPrefix = "valeur inconnu : ";

                if (id != null && id.StartsWith("fiche_activite_", StringComparison.Ordinal))
                {
                    activity      = new Activity();
                    traceLabel    = "activité ";
                    unknownPrefix = "valeur activité inconnu : ";
                    activities.Add(activity);
                    if (indexActivityType < activityTypes.Length)
                    {
                        activity.Type = activityTypes[indexActivityType++];
                    }
                    else
                    {
                        pb.Trace.WriteLine("warning miss an activity type in html (<select id='select_activites'>)");
                    }
                }
                else if (id != null && id.StartsWith("fiche_contact_", StringComparison.Ordinal))
                {
                    contact       = new Contact();
                    traceLabel    = "contact  ";
                    unknownPrefix = "valeur contact inconnu : ";
                    contacts.Add(contact);
                    if (indexContactDescription < contactDescriptions.Length)
                    {
                        contact.Description = contactDescriptions[indexContactDescription++];
                    }
                    else
                    {
                        pb.Trace.WriteLine("warning miss a contact description in html (<select id='select_contacts'>)");
                    }
                }

                foreach (XXElement xeRow in xxe.XPathElements(".//tr"))
                {
                    string valueName = Handeco.Trim(xeRow.XPathValue(".//th//text()"));

                    // Rows without a <th> label carry no named value.
                    if (valueName == null)
                    {
                        continue;
                    }

                    // The label is written before the setter runs, so anything the setter traces follows it.
                    if (__trace)
                    {
                        pb.Trace.Write(traceLabel);
                    }

                    bool known;
                    if (activity != null)
                    {
                        known = SetActivityValue(activity, valueName, xeRow);
                    }
                    else if (contact != null)
                    {
                        known = SetContactValue(contact, valueName, xeRow);
                    }
                    else
                    {
                        known = SetValue(data, valueName, xeRow);
                    }

                    if (!known)
                    {
                        if (__trace)
                        {
                            pb.Trace.Write("error ");
                        }
                        data.UnknowInfos.Add(unknownPrefix + valueName + " = " + GetTextValue(xeRow));
                    }
                    else if (__trace)
                    {
                        // Same width as "error " so traced values stay aligned.
                        pb.Trace.Write("      ");
                    }

                    if (__trace)
                    {
                        pb.Trace.WriteLine("\"{0}\" =  \"{1}\"", valueName, GetTextValue(xeRow));
                    }
                }
            }

            data.Activités = activities.ToArray();
            data.Contacts  = contacts.ToArray();

            if (__trace)
            {
                pb.Trace.WriteLine(data.zToJson());
            }

            return data;
        }