Пример #1
0
        public static void Test_XXElement_DescendantTextList_01()
        {
            string url = @"c:\pib\dev_data\exe\runsource\download\sites\rapide-ddl\cache\detail\39000\ebooks_magazine_39023-multi-lautomobile-no821-octobre-2014.html";

            pb.old.Http_v2.LoadUrl(url);
            XXElement xe = new XXElement(pb.old.Http_v2.HtmlReader.XDocument.Root).XPathElement("//div[@class='lcolomn mainside']").XPathElement(".//div[@class='maincont']");

            //string xpath = ".//div";
            //foreach (string s in xe.DescendantTextList())
            foreach (string s in xe.DescendantTexts())
            {
                Trace.WriteLine(s);
            }
            //foreach (string s in from xe2 in xe.XElement.XPathSelectElements(xpath) from s in xe2.zDescendantTextList() select s)
            //{
            //    Trace.WriteLine(s);
            //}
            //foreach (XElement xe2 in xe.XElement.XPathSelectElements(xpath))
            //{
            //    Trace.WriteLine("XElement {0}", xe2.zGetPath());
            //    foreach (string s in xe2.zDescendantTextList())
            //    {
            //        Trace.WriteLine(s);
            //    }
            //}
        }
Пример #2
0
        public bool MoveNext()
        {
            while (_xmlEnum.MoveNext())
            {
                XXElement xeHeader = _xmlEnum.Current;
                _header                 = new Gesat_HeaderCompany();
                _header.sourceUrl       = _url;
                _header.loadFromWebDate = DateTime.Now;

                //<span class="NOM"><a title="ESAT BETTY LAUNAY-MOULIN VERT" href="/Gesat/Hauts-de-Seine,92/Bois-Colombes,35494/esat-betty-launay-moulin-vert-competences-et-handicap-92,e1837/">ESAT BETTY LAUNAY-MOULIN VERT</a></span>
                //_header.companyName = xeHeader.ExplicitXPathValue(".//span[@class='NOM']//a//text()");
                XXElement xe = xeHeader.XPathElement(".//span[@class='NOM']//a");
                if (xe != null)
                {
                    _header.url = GetUrl(xe.ExplicitXPathValue("@href"));
                    //_header.name = xe.ExplicitXPathValue(".//text()", _trimFunc1);
                    _header.name = _trimFunc1(xe.ExplicitXPathValue(".//text()"));
                }
                //<span class="VILLE">E.S.A.T.<br />Bois-Colombes (92)</span>
                xe = xeHeader.XPathElement(".//span[@class='VILLE']");
                if (xe != null)
                {
                    //IEnumerator<string> texts = xe.DescendantTextList().GetEnumerator();
                    IEnumerator <string> texts = xe.DescendantTexts().GetEnumerator();
                    if (texts.MoveNext())
                    {
                        _header.type = texts.Current.Trim();
                    }
                    else
                    {
                        Trace.CurrentTrace.WriteLine("error companyType not found");
                    }
                    if (texts.MoveNext())
                    {
                        _header.location = texts.Current.Trim();
                    }
                    else
                    {
                        Trace.CurrentTrace.WriteLine("error companyLocation not found");
                    }
                }
                // <span class="TELEPHONE">01 47 86 11 48</span>
                //_header.phone = xeHeader.ExplicitXPathValue(".//span[@class='TELEPHONE']//text()", _trimFunc1);
                _header.phone = _trimFunc1(xeHeader.ExplicitXPathValue(".//span[@class='TELEPHONE']//text()"));
                //<img info_bulle="Signataire de la charte Ethique et Valeurs" border="0" alt="/images/bullesGesat/pictoCharte.png" src="/images/bullesGesat/pictoCharte.png" style=" border: 0;" />
                //<img info_bulle="Lauréat des trophées HandiResponsables 2013" border="0" alt="/images/bullesGesat/LAURIERS-OR-2013.png" src="/images/bullesGesat/LAURIERS-OR-2013.png" style=" border: 0;" />
                //_header.infos = xeHeader.XPathValues(".//img/@info_bulle", _trimFunc1);
                _header.infos = xeHeader.XPathValues(".//img/@info_bulle").Select(_trimFunc1).ToArray();
                //_header.SetInfo(xeHeader.XPathValues(".//img/@info_bulle"));
                return(true);
            }
            return(false);
        }
Пример #3
0
        public static RapideDdl_HeaderPage LoadHeaderPageFromWeb(pb.Web.v1.RequestFromWeb_v2 request)
        {
            // loadDataFromWeb
            XXElement            xeSource = new XXElement(request.GetXmlDocument().Root);
            string               url      = request.Url;
            RapideDdl_HeaderPage data     = new RapideDdl_HeaderPage();

            //data.urlNextPage = zurl.GetUrl(url, xeSource.XPathValue("//div[@class='navigation']//a[text()='Next']/@href"));
            data.urlNextPage = zurl.GetUrl(url, xeSource.XPathValue("//div[@class='basenavi']//span[@class='nnext']//a/@href"));
            IEnumerable <XXElement>     xeHeaders = xeSource.XPathElements("//div[@class='base shortstory']");
            List <RapideDdl_PostHeader> headers   = new List <RapideDdl_PostHeader>();

            foreach (XXElement xeHeader in xeHeaders)
            {
                RapideDdl_PostHeader header = new RapideDdl_PostHeader();
                header.sourceUrl       = url;
                header.loadFromWebDate = DateTime.Now;

                XXElement xe = xeHeader.XPathElement(".//*[@class='shd']//a");
                header.urlDetail = zurl.GetUrl(url, xe.XPathValue("@href"));
                // xe.XPathValue(".//text()", Download.Print.RapideDdl.RapideDdl.TrimFunc1)
                /////////////////////////////////header.title = Download.Print.RapideDdl.RapideDdl.ExtractTextValues(header.infos, xe.XPathValue(".//text()").Trim(DownloadPrint.TrimChars));

                //xe = xeHeader.XPathElement(".//div[@class='shdinf']/div[@class='shdinf']");
                xe = xeHeader.XPathElement(".//div[@class='shdinf']");
                header.postAuthor = xe.XPathValue(".//span[@class='arg']//a//text()");
                // Aujourd'hui, 17:13
                ////////////////////////////////header.creationDate = Download.Print.RapideDdl.RapideDdl.ParseDateTime(xe.XPathValue(".//span[@class='date']//text()"), (DateTime)header.loadFromWebDate);

                //xe = xeHeader.XPathElement(".//span[@id='post-img']//div[starts-with(@id, 'news-id')]");
                xe = xeHeader.XPathElement(".//div[@class='maincont']");
                //header.images = xe.XPathImages(url, TelechargementPlus.ImagesToSkip);
                //header.images = xe.XPathImages(url);
                //header.images = xe.XPathImages(xeImg => new ImageHtml(xeImg, url)).ToList();
                header.images = xe.DescendantNodes(node => XmlDescendant.ImageFilter(node)).Select(xeImg => new pb.old.ImageHtml((XElement)xeImg, url)).ToList();

                //if (request.LoadImage)
                //    Http2.LoadImageFromWeb(header.images);

                //header.SetTextValues(xe.DescendantTextList());
                header.SetTextValues(xe.DescendantTexts());

                xe = xeHeader.XPathElement(".//div[@class='morelink']//span[@class='arg']");
                //header.category = xe.DescendantTextList(".//span[@class='lcol']").Select(RapideDdl.TrimFunc1).Where(s => s != "E-Book / Magazines" && s != "Catégorie:" && s != "").zToStringValues("/");
                //header.category = xe.DescendantTextList(".//a").Select(Download.Print.RapideDdl.RapideDdl.TrimFunc1).Where(s => !s.StartsWith("Commentaires")).zToStringValues("/");
                header.category = xe.XPathElements(".//a").DescendantTexts().Select(Download.Print.RapideDdl.RapideDdl.TrimFunc1).Where(s => !s.StartsWith("Commentaires")).zToStringValues("/");

                headers.Add(header);
            }
            data.postHeaders = headers.ToArray();
            return(data);
        }
Пример #4
0
        protected override void SetXml(XElement xelement)
        {
            XXElement xeSource = new XXElement(xelement);

            _data = new Gesat_HeaderPage();
            // <div class="PAGENAVIGLIST">
            // <a href="/Gesat/EtablissementList-10-10.html" title="page suivante">&gt;</a>&nbsp;
            _data.urlNextPage = GetUrl(xeSource.XPathValue("//div[@class='PAGENAVIGLIST']//a[@title='page suivante']/@href"));
            // <div class="ETABLISSEMENT STAR-1 ODD"> <div class="ETABLISSEMENT STAR-0 ODD"> <div class="ETABLISSEMENT STAR-1 EVEN">
            IEnumerable <XXElement>    xeHeaders = xeSource.XPathElements("//div[starts-with(@class, 'ETABLISSEMENT STAR-')]");
            List <Gesat_HeaderCompany> headers   = new List <Gesat_HeaderCompany>();

            foreach (XXElement xeHeader in xeHeaders)
            {
                Gesat_HeaderCompany header = new Gesat_HeaderCompany();
                header.sourceUrl       = _url;
                header.loadFromWebDate = DateTime.Now;

                //<span class="NOM"><a title="ESAT BETTY LAUNAY-MOULIN VERT" href="/Gesat/Hauts-de-Seine,92/Bois-Colombes,35494/esat-betty-launay-moulin-vert-competences-et-handicap-92,e1837/">ESAT BETTY LAUNAY-MOULIN VERT</a></span>
                //_header.companyName = xeHeader.ExplicitXPathValue(".//span[@class='NOM']//a//text()");
                XXElement xe = xeHeader.XPathElement(".//span[@class='NOM']//a");
                if (xe != null)
                {
                    header.url = GetUrl(xe.ExplicitXPathValue("@href"));
                    //header.name = xe.ExplicitXPathValue(".//text()", _trimFunc1);
                    header.name = _trimFunc1(xe.ExplicitXPathValue(".//text()"));
                }
                //<span class="VILLE">E.S.A.T.<br />Bois-Colombes (92)</span>
                xe = xeHeader.XPathElement(".//span[@class='VILLE']");
                if (xe != null)
                {
                    //IEnumerator<string> texts = xe.DescendantTextList().GetEnumerator();
                    IEnumerator <string> texts = xe.DescendantTexts().GetEnumerator();
                    if (texts.MoveNext())
                    {
                        header.type = texts.Current.Trim();
                    }
                    else
                    {
                        Trace.CurrentTrace.WriteLine("error companyType not found");
                    }
                    if (texts.MoveNext())
                    {
                        header.location = texts.Current.Trim();
                    }
                    else
                    {
                        Trace.CurrentTrace.WriteLine("error companyLocation not found");
                    }
                }
                // <span class="TELEPHONE">01 47 86 11 48</span>
                //header.phone = xeHeader.ExplicitXPathValue(".//span[@class='TELEPHONE']//text()", _trimFunc1);
                header.phone = _trimFunc1(xeHeader.ExplicitXPathValue(".//span[@class='TELEPHONE']//text()"));
                //<img info_bulle="Signataire de la charte Ethique et Valeurs" border="0" alt="/images/bullesGesat/pictoCharte.png" src="/images/bullesGesat/pictoCharte.png" style=" border: 0;" />
                //<img info_bulle="Lauréat des trophées HandiResponsables 2013" border="0" alt="/images/bullesGesat/LAURIERS-OR-2013.png" src="/images/bullesGesat/LAURIERS-OR-2013.png" style=" border: 0;" />
                //header.infos = xeHeader.XPathValues(".//img/@info_bulle", _trimFunc1);
                header.infos = xeHeader.XPathValues(".//img/@info_bulle").Select(_trimFunc1).ToArray();
                //_header.SetInfo(xeHeader.XPathValues(".//img/@info_bulle"));
                headers.Add(header);
            }
            _data.headerCompanies = headers.ToArray();
        }
Пример #5
0
        protected override RapideDdl_HeaderPage GetDataFromWeb(LoadDataFromWeb_v3 loadDataFromWeb)
        {
            XXElement            xeSource = new XXElement(loadDataFromWeb.GetXmlDocument().Root);
            string               url      = loadDataFromWeb.request.Url;
            RapideDdl_HeaderPage data     = new RapideDdl_HeaderPage();

            data.sourceUrl       = url;
            data.loadFromWebDate = loadDataFromWeb.loadFromWebDate;
            data.id = RapideDdl_LoadHeaderPagesManager.GetHeaderPageKey(url);

            data.urlNextPage = zurl.GetUrl(url, xeSource.XPathValue("//div[@class='basenavi']//span[@class='nnext']//a/@href"));
            IEnumerable <XXElement>     xeHeaders = xeSource.XPathElements("//div[@class='base shortstory']");
            List <RapideDdl_PostHeader> headers   = new List <RapideDdl_PostHeader>();

            foreach (XXElement xeHeader in xeHeaders)
            {
                RapideDdl_PostHeader header = new RapideDdl_PostHeader();
                header.sourceUrl       = url;
                header.loadFromWebDate = loadDataFromWeb.loadFromWebDate;

                XXElement xe = xeHeader.XPathElement(".//*[@class='shd']//a");
                header.urlDetail = zurl.GetUrl(url, xe.XPathValue("@href"));

                //header.title = RapideDdl.ExtractTextValues(header.infos, xe.XPathValue(".//text()", RapideDdl.TrimFunc1));
                //header.title = xe.XPathValue(".//text()", DownloadPrint.Trim);
                header.title = xe.XPathValue(".//text()").Trim(DownloadPrint.TrimChars);
                PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(header.title);
                if (titleInfos.foundInfo)
                {
                    //header.originalTitle = header.title;
                    header.title = titleInfos.title;
                    header.infos.SetValues(titleInfos.infos);
                }

                xe = xeHeader.XPathElement(".//div[@class='shdinfo']");
                header.postAuthor = xe.XPathValue(".//span[@class='arg']//a//text()");
                // Aujourd'hui, 17:13
                //header.creationDate = RapideDdl.ParseDateTime(xe.XPathValue(".//span[@class='date']//text()"), loadDataFromWeb.loadFromWebDate);
                string date = xe.XPathValue(".//span[@class='date']//text()");
                header.creationDate = zdate.ParseDateTimeLikeToday(date, loadDataFromWeb.loadFromWebDate, "d-M-yyyy, HH:mm", "d M yyyy", "d MMMM yyyy");
                if (header.creationDate == null)
                {
                    pb.Trace.WriteLine("unknow date time \"{0}\"", date);
                }
                if (__trace)
                {
                    pb.Trace.WriteLine("creationDate {0} - \"{1}\"", header.creationDate, date);
                }

                xe = xeHeader.XPathElement(".//div[@class='maincont']");
                //header.images = xe.XPathImages(xeImg => new UrlImage(zurl.GetUrl(url, xeImg.zAttribValue("src")))).ToArray();
                header.images = xe.DescendantNodes(node => XmlDescendant.ImageFilter(node)).Select(xeImg => new WebImage(zurl.GetUrl(url, xeImg.zAttribValue("src")))).ToArray();

                //if (request.LoadImage)
                //    Http2.LoadImageFromWeb(header.images);

                //RapideDdl.SetTextValues(header, xe.DescendantTextList());
                // get infos, description, language, size, nbPages
                // xe.DescendantTextList(nodeFilter: node => !(node is XElement) || ((XElement)node).Name != "a")
                PrintTextValues_v1 textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_v1(xe.DescendantTexts(node => !(node is XElement) || ((XElement)node).Name != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode), header.title);
                header.description = textValues.description;
                header.language    = textValues.language;
                header.size        = textValues.size;
                header.nbPages     = textValues.nbPages;
                header.infos.SetValues(textValues.infos);

                xe = xeHeader.XPathElement(".//div[@class='morelink']//span[@class='arg']");
                //header.category = xe.DescendantTextList(".//a").Select(DownloadPrint.TrimFunc1).Where(s => !s.StartsWith("Commentaires")).zToStringValues("/");
                header.category = xe.XPathElements(".//a").DescendantTexts().Select(DownloadPrint.Trim).Where(s => !s.StartsWith("Commentaires")).zToStringValues("/");

                headers.Add(header);
            }
            data.postHeaders = headers.ToArray();
            return(data);
        }
Пример #6
0
        protected override Unea_HeaderCompany[] GetData()
        {
            XXElement xeSource = new XXElement(GetXmlDocument().Root);
            string    url      = Url;
            // <div class="ctn_result">
            IEnumerable <XXElement>   xeHeaders = xeSource.XPathElements("//div[@class = 'ctn_result']");
            List <Unea_HeaderCompany> headers   = new List <Unea_HeaderCompany>();

            foreach (XXElement xeHeader in xeHeaders)
            {
                Unea_HeaderCompany header = new Unea_HeaderCompany();
                header.sourceUrl       = url;
                header.loadFromWebDate = DateTime.Now;

                // <div class="ctn_result-header">
                XXElement xe = xeHeader.XPathElement(".//div[@class='ctn_result-header']");

                // <div class="lien"><a href="http://www.unea.fr/union-nationale-entreprises-adaptees/annuaire-unea/71/4583/ACCAA TAKTIM.htm" target="_blank"><strong>></strong> Voir la fiche détaillée</a></div>
                header.urlDetail2 = zurl.GetUrl(url, xe.ExplicitXPathValue(".//div[@class = 'lien']//a/@href"));

                // <iframe src="detail.asp?id=4583" width="420" height="800" frameborder="0" scrolling="auto" marginheight="0" marginwidth="0"></iframe>
                header.urlDetail1 = zurl.GetUrl(url, xe.ExplicitXPathValue(".//iframe/@src"));

                // <h4><a href="http://www.unea.fr/union-nationale-entreprises-adaptees/annuaire-unea/71/4583/ACCAA TAKTIM.htm"  target="_blank">&nbsp;</a><span>|</span> ACCAA TAKTIM</h4>
                //header.name = xe.DescendantTextList(func: __trimFunc2).LastOrDefault();
                header.name = xe.DescendantTexts().Select(__trimFunc2).LastOrDefault();

                // <div class="ctn_result-content clearfix">
                // ...
                // <p>
                // <strong>Activités:</strong> TRAVAUX PAYSAGERS<br>PROPRETE<br>PRESTATION DE SERVICES<br>SOUS TRAITANCE INDUSTRIELLE<br>MECANIQUE<br>AUTOMOBILE<br>METALLURGIE<br />
                // <strong>Région - Département:</strong> Alsace - HAUT RHIN (68)<br />
                // <strong>Téléphone:</strong> 0389570210&nbsp;&nbsp;&nbsp;&nbsp;
                // <strong>Fax:</strong> 0389571761&nbsp;&nbsp;&nbsp;&nbsp;
                // <strong>Adresse e-mail:</strong>
                // <a href="mailto:[email protected]">[email protected]</a>
                // </p>
                // </div>
                Unea_TextType textType = Unea_TextType.unknow;
                //foreach (string s in xeHeader.DescendantTextList(".//div[@class = 'ctn_result-content clearfix']", func: __trimFunc2))
                foreach (string s in xeHeader.XPathElements(".//div[@class = 'ctn_result-content clearfix']").DescendantTexts().Select(__trimFunc2))
                {
                    if (s.Equals("Activités", StringComparison.InvariantCultureIgnoreCase))
                    {
                        textType = Unea_TextType.activity;
                    }
                    else if (s.Equals("Région - Département", StringComparison.InvariantCultureIgnoreCase))
                    {
                        textType = Unea_TextType.location;
                    }
                    else if (s.Equals("Téléphone", StringComparison.InvariantCultureIgnoreCase))
                    {
                        textType = Unea_TextType.phone;
                    }
                    else if (s.Equals("Fax", StringComparison.InvariantCultureIgnoreCase))
                    {
                        textType = Unea_TextType.fax;
                    }
                    else if (s.Equals("Adresse e-mail", StringComparison.InvariantCultureIgnoreCase))
                    {
                        textType = Unea_TextType.email;
                    }
                    else
                    {
                        switch (textType)
                        {
                        case Unea_TextType.activity:
                            if (!header.activities.ContainsKey(s))
                            {
                                header.activities.Add(s, null);
                            }
                            break;

                        case Unea_TextType.location:
                            header.location = s;
                            textType        = Unea_TextType.unknow;
                            break;

                        case Unea_TextType.phone:
                            header.phone = s;
                            textType     = Unea_TextType.unknow;
                            break;

                        case Unea_TextType.fax:
                            header.fax = s;
                            textType   = Unea_TextType.unknow;
                            break;

                        case Unea_TextType.email:
                            header.email = s;
                            textType     = Unea_TextType.unknow;
                            break;

                        default:
                            header.unknowInfos.Add(s);
                            break;
                        }
                    }
                }

                headers.Add(header);
            }
            return(headers.ToArray());
        }
Пример #7
0
        protected override IPost GetData(LoadDataFromWeb_v4 loadDataFromWeb)
        {
            XXElement          xeSource = new XXElement(loadDataFromWeb.Http.zGetXDocument().Root);
            Ebookdz_PostDetail data     = new Ebookdz_PostDetail();

            data.SourceUrl       = loadDataFromWeb.WebRequest.HttpRequest.Url;
            data.LoadFromWebDate = loadDataFromWeb.LoadFromWebDate;
            data.Id = GetPostDetailKey(loadDataFromWeb.WebRequest.HttpRequest);

            // <div class="body_bd">
            XXElement xePost = xeSource.XPathElement("//div[@class='body_bd']");

            // Le Monde + Magazine + 2 suppléments du samedi 03 janvier 2015
            //data.Title = xePost.XPathValue(".//div[@id='pagetitle']//a//text()", DownloadPrint.Trim);
            data.Title = xePost.XPathValue(".//div[@id='pagetitle']//a//text()").Trim(DownloadPrint.TrimChars);
            PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.Title);

            if (titleInfos.foundInfo)
            {
                data.OriginalTitle = data.Title;
                data.Title         = titleInfos.title;
                data.Infos.SetValues(titleInfos.infos);
            }

            // Forum / Journaux / Presse quotidienne / Le Monde / Journal Le Monde + Magazine + 2 suppléments du samedi 03 janvier 2015
            string lowerTitle = null;

            if (data.Title != null)
            {
                lowerTitle = data.Title.ToLowerInvariant();
            }
            //data.Category = xePost.DescendantTextList(".//div[@id='breadcrumb']//a").Where(text => { text = text.ToLowerInvariant(); return text != "forum" && !text.EndsWith(lowerTitle); }).Select(DownloadPrint.TrimFunc1).zToStringValues("/");
            data.Category = xePost.XPathElements(".//div[@id='breadcrumb']//a").DescendantTexts().Where(text => { text = text.ToLowerInvariant(); return(text != "forum" && !text.EndsWith(lowerTitle)); }).Select(DownloadPrint.Trim).zToStringValues("/");
            string category = data.Category.ToLowerInvariant();

            data.PrintType = GetPrintType(category);
            //Trace.WriteLine("category \"{0}\" printType {1}", category, data.printType);

            // <div id="postlist" class="postlist restrain">
            XXElement xe = xePost.XPathElement(".//div[@id='postlist']");

            // Aujourd'hui, 07h32 - Aujourd'hui, 10h51 - Hier, 12h55 - 22/02/2014, 21h09
            //string date = xe.DescendantTextList(".//div[@class='posthead']//text()", nodeFilter: node => node.zGetName() != "a").zToStringValues("");
            XXElement xe2 = xe.XPathElement(".//div[@class='posthead']");
            //string date = xe2.DescendantTextList(nodeFilter: node => node.zGetName() != "a").zToStringValues("");
            string date = xe2.DescendantTexts(node => node.zGetName() != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode).zToStringValues("");

            date = date.Replace('\xA0', ' ');
            data.PostCreationDate = zdate.ParseDateTimeLikeToday(date, loadDataFromWeb.LoadFromWebDate, @"d/M/yyyy, HH\hmm", @"d-M-yyyy, HH\hmm");
            if (data.PostCreationDate == null)
            {
                pb.Trace.WriteLine("unknow post creation date \"{0}\"", date);
            }
            if (__trace)
            {
                pb.Trace.WriteLine("post creation date {0} - \"{1}\"", data.PostCreationDate, date);
            }

            //data.PostAuthor = xe.XPathValue(".//div[@class='userinfo']//a//text()", DownloadPrint.Trim);
            data.PostAuthor = xe.XPathValue(".//div[@class='userinfo']//a//text()").Trim(DownloadPrint.TrimChars);

            // <div class="postbody">
            xe = xePost.XPathElement(".//div[@class='postbody']//div[@class='content']//blockquote/div");

            //data.Images = xe.XPathImages(xeImg => new UrlImage(zurl.GetUrl(data.SourceUrl, xeImg.zAttribValue("src")))).ToArray();
            data.Images = xe.DescendantNodes(node => XmlDescendant.ImageFilter(node)).Select(xeImg => new WebImage(zurl.GetUrl(data.SourceUrl, xeImg.zAttribValue("src")))).ToArray();

            // force load image to get image width and height
            if (loadDataFromWeb.WebRequest.LoadImage)
            {
                data.Images = DownloadPrint.LoadImages(data.Images).ToArray();
            }

            // get infos, description, language, size, nbPages
            // xe.DescendantTextList(nodeFilter: node => !(node is XElement) || ((XElement)node).Name != "a")
            PrintTextValues_v1 textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_v1(xe.DescendantTexts(node => !(node is XElement) || ((XElement)node).Name != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode), data.Title);

            data.Description = textValues.description;
            data.Language    = textValues.language;
            data.Size        = textValues.size;
            data.NbPages     = textValues.nbPages;
            data.Infos.SetValues(textValues.infos);

            data.DownloadLinks = xe.XPathValues(".//a/@href").ToArray();

            if (__trace)
            {
                pb.Trace.WriteLine(data.zToJson());
            }

            return(data);
        }
Пример #8
0
        // detail get data
        protected override TelechargerMagazine_PostDetail GetDetailData(WebResult webResult)
        {
            XXElement xeSource = webResult.Http.zGetXDocument().zXXElement();
            TelechargerMagazine_PostDetail data = new TelechargerMagazine_PostDetail();

            data.SourceUrl       = webResult.WebRequest.HttpRequest.Url;
            data.LoadFromWebDate = webResult.LoadFromWebDate;
            data.Id = GetDetailKey(webResult.WebRequest.HttpRequest);

            // la date est juste la date du jour
            // <div id="calendar-layer">
            // <table id="calendar" cellpadding="3" class="calendar">
            // ...
            // <tr>
            // ...
            // <td  class="day-active-v day-current" ><a class="day-active-v" href="http://www.telecharger-magazine.com/2015/07/17/" title="Article posté dans 17 Juillet 2015">17</a></td>
            // ...
            // </tr>
            // ...
            // </table>
            // </div>

            // <div id='dle-content'>
            // ...
            // <div class="right-full">
            //
            // <div class="cat_name">
            // Posted in:
            // <a href="http://www.telecharger-magazine.com/journaux/">Journaux</a>
            // </div>
            //
            // <h2 class="title">
            // <img src="/templates/MStarter/images/title.png" alt="" class="img" />
            // Journaux Français Du 17 Juillet 2015
            // </h2>
            //
            // <div class="contenttext">

            // la date est juste la date du jour
            // http://www.telecharger-magazine.com/2015/07/17/
            //xeSource.XPathValue("//div[@id='calendar-layer']//table[@id='calendar']//td[@class='day-active-v day-current']//a/@href");


            XXElement xePost = xeSource.XPathElement("//div[@id='dle-content']//div[@class='right-full']");

            // Journaux
            data.Category  = xePost.XPathValues(".//div[@class='cat_name']//a/text()").Select(DownloadPrint.Trim).zToStringValues("/");
            data.PrintType = GetPrintType(data.Category);
            //pb.Trace.WriteLine("category \"{0}\" printType {1}", category, data.printType);

            data.Title = xePost.XPathValue(".//h2[@class='title']//text()").zFunc(DownloadPrint.ReplaceChars).zFunc(DownloadPrint.Trim);
            PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.Title);

            if (titleInfos.foundInfo)
            {
                data.OriginalTitle = data.Title;
                data.Title         = titleInfos.title;
                data.Infos.SetValues(titleInfos.infos);
            }

            XXElement xeContent = xePost.XPathElement(".//div[@class='contenttext']");

            data.Images = new WebImage[] { new WebImage(zurl.GetUrl(data.SourceUrl, xeContent.XPathValue(".//img/@src"))) };

            // force load image to get image width and height
            //if (webResult.WebRequest.LoadImageFromWeb)
            //    data.Images = DownloadPrint.LoadImages(data.Images).ToArray();

            // get infos, description, language, size, nbPages
            PrintTextValues textValues = DownloadPrint.PrintTextValuesManager.GetTextValues(
                xeContent.DescendantTexts(
                    node =>
            {
                if (node is XText)
                {
                    string text = ((XText)node).Value.Trim();
                    if (text.ToLowerInvariant() == "description")
                    {
                        return(XNodeFilter.DontSelectNode);
                    }
                }
                if (node is XElement)
                {
                    XElement xe = (XElement)node;
                    if (xe.Name == "a")
                    {
                        return(XNodeFilter.Stop);
                    }
                }
                return(XNodeFilter.SelectNode);
            }
                    ).Select(DownloadPrint.ReplaceChars).Select(DownloadPrint.TrimWithoutColon), data.Title, extractValuesFromText: false);

            data.Description = textValues.description;
            data.Infos.SetValues(textValues.infos);

            data.DownloadLinks = xeContent.DescendantNodes(
                node =>
            {
                if (!(node is XElement))
                {
                    return(XNodeFilter.DontSelectNode);
                }
                XElement xe2 = (XElement)node;
                if (xe2.Name == "a")
                {
                    return(XNodeFilter.SelectNode);
                }
                if (xe2.Name != "p")
                {
                    return(XNodeFilter.DontSelectNode);
                }
                XAttribute xa = xe2.Attribute("class");
                if (xa == null)
                {
                    return(XNodeFilter.DontSelectNode);
                }
                if (xa.Value != "submeta")
                {
                    return(XNodeFilter.DontSelectNode);
                }
                //return XNodeFilter.SkipNode;
                return(XNodeFilter.Stop);
            })
                                 .Select(node => ((XElement)node).Attribute("href").Value).ToArray();
            data.DownloadLinks = xeContent.XPathValues(".//a/@href").ToArray();

            if (__trace)
            {
                pb.Trace.WriteLine(data.zToJson());
            }

            return(data);
        }
Пример #9
0
        protected override Unea_DetailCompany2 GetData()
        {
            XXElement           xeSource = new XXElement(GetXmlDocument().Root);
            Unea_DetailCompany2 data     = new Unea_DetailCompany2();

            data.sourceUrl       = Url;
            data.loadFromWebDate = DateTime.Now;

            // <div class='ctn_content-article'>
            XXElement xeContent = xeSource.XPathElement(".//div[@class='ctn_content-article']");

            //IEnumerator<string> texts = xeContent.DescendantTextList(nodeFilter: node => !(node is XElement) || (((XElement)node).Name != "script" && ((XElement)node).Name != "table"), func: __trimFunc2).GetEnumerator();
            IEnumerator <string> texts = xeContent.DescendantTexts(node => !(node is XElement) || (((XElement)node).Name != "script" && ((XElement)node).Name != "table") ? XNodeFilter.SelectNode : XNodeFilter.SkipNode).Select(__trimFunc2).GetEnumerator();

            // <h1>
            // <img src="http://unea.griotte.biz/BaseDocumentaire/Docs/Public/4017/LOGOAmpouleC.JPG" style='border-width:2px;border-color:#5593C9;' height='60px' />
            // <span>Entreprise Adapt&eacute;e</span><br />
            // ALSACE ENTREPRISE ADAPTEE
            // </h1>
            if (texts.MoveNext() && texts.MoveNext())
            {
                data.name = texts.Current;
            }

            // <h2>ALSACE ENTREPRISE ADAPTEE est implant&eacute;e sur les sites de Colmar et Mulhouse avec un effectif de 106 salari&eacute;s, avec les activit&eacute;s sous-traitance : assemblage de pi&egrave;ces, cintrage de tuyaux, montage complexe, ainsi qu'une activit&eacute; prestation de service en espaces verts, m&eacute;nage et transport.</h2>
            if (texts.MoveNext())
            {
                data.presentation = texts.Current;
            }

            Unea_TextType textType = Unea_TextType.unknow;

            //foreach (XText xtext in xeContent.DescendantTextNodeList(".//table"))
            foreach (XText xtext in xeContent.XPathElements(".//table").DescendantTextNodes())
            {
                string text = __trimFunc2(xtext.Value);
                if (text == "")
                {
                    continue;
                }
                if (text.Equals("NOS ACTIVITES", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.activity;
                }
                else if (text.Equals("FILIERES METIER UNEA", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.sector;
                }
                else if (text.Equals("DOCUMENTS TÉLÉCHARGEABLES", StringComparison.InvariantCultureIgnoreCase))
                {
                    foreach (XXElement xe2 in new XXElement(xtext.Parent).XPathElements("following-sibling::ul//a"))
                    {
                        string url = xe2.XPathValue("@href");
                        //string name = name = xe2.XPathValue(".//text()", __trimFunc2);
                        string name = __trimFunc2(xe2.XPathValue(".//text()"));
                        if (!data.downloadDocuments.ContainsKey(url))
                        {
                            data.downloadDocuments.Add(url, new Unea_Document()
                            {
                                name = name, url = url
                            });
                        }
                        else
                        {
                            Trace.CurrentTrace.WriteLine("warning download document already exists \"{0}\" \"{1}\"", name, url);
                        }
                    }
                    // textType = novalues pour ne pas avoir Plaquette_AEA.pdf dans unknowInfos
                    textType = Unea_TextType.novalues;
                }
                else if (text.Equals("NOUS CONTACTER", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.novalue;
                }
                else if (text.Equals("ADRESSE", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.address;
                }
                else if (text.Equals("TELEPHONE", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.phone;
                }
                else if (text.Equals("FAX", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.fax;
                }
                else if (text.Equals("EMAIL", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.email;
                }
                else if (text.Equals("SITE", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.webSite;
                }
                else if (text.Equals("QUI SOMMES NOUS", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.novalue;
                }
                else if (text.Equals("DIRIGEANT", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.leader;
                }
                else if (text.Equals("NOMBRE DE SALARIÉS", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.employeNumber;
                }
                else if (text.Equals("CHIFFRE D'AFFAIRE DE L'ANNÉE ÉCOULÉE", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.lastYearRevenue;
                }
                else if (text.Equals("NUMÉRO SIRET", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.siret;
                }
                else if (text.Equals("CERTIFICATION", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.certification;
                }
                else if (text.Equals("PRINCIPAUX CLIENTS", StringComparison.InvariantCultureIgnoreCase))
                {
                    textType = Unea_TextType.client;
                }
                else
                {
                    switch (textType)
                    {
                    case Unea_TextType.activity:
                        if (!data.activities.ContainsKey(text))
                        {
                            data.activities.Add(text, null);
                        }
                        else
                        {
                            Trace.CurrentTrace.WriteLine("warning activity already exists \"{0}\"", text);
                        }
                        break;

                    case Unea_TextType.sector:
                        //data.sectors.Add(text);
                        if (!data.sectors.ContainsKey(text))
                        {
                            data.sectors.Add(text, null);
                        }
                        else
                        {
                            Trace.CurrentTrace.WriteLine("warning sector already exists \"{0}\"", text);
                        }
                        break;

                    case Unea_TextType.address:
                        if (data.address == null)
                        {
                            data.address = text;
                        }
                        else
                        {
                            data.address += " " + text;
                        }
                        break;

                    case Unea_TextType.phone:
                        data.phone = text;
                        textType   = Unea_TextType.unknow;
                        break;

                    case Unea_TextType.fax:
                        data.fax = text;
                        textType = Unea_TextType.unknow;
                        break;

                    case Unea_TextType.email:
                        data.email = text;
                        textType   = Unea_TextType.unknow;
                        break;

                    case Unea_TextType.webSite:
                        data.webSite = text;
                        textType     = Unea_TextType.unknow;
                        break;

                    case Unea_TextType.leader:
                        data.leader = text;
                        textType    = Unea_TextType.unknow;
                        break;

                    case Unea_TextType.employeNumber:
                        int employeNumber;
                        if (int.TryParse(text, out employeNumber))
                        {
                            data.employeNumber = employeNumber;
                        }
                        else
                        {
                            Trace.CurrentTrace.WriteLine("error unknow employe number \"{0}\"", text);
                        }
                        textType = Unea_TextType.unknow;
                        break;

                    case Unea_TextType.lastYearRevenue:
                        if (text != "€")
                        {
                            data.lastYearRevenue = text;
                        }
                        textType = Unea_TextType.unknow;
                        break;

                    case Unea_TextType.siret:
                        data.siret = text;
                        textType   = Unea_TextType.unknow;
                        break;

                    case Unea_TextType.certification:
                        data.certification = text;
                        textType           = Unea_TextType.unknow;
                        break;

                    case Unea_TextType.client:
                        data.clients = text;
                        textType     = Unea_TextType.unknow;
                        break;

                    case Unea_TextType.novalues:
                        break;

                    default:
                        data.unknowInfos.Add(text);
                        break;
                    }
                }
            }

            foreach (XXElement xe in xeContent.XPathElements(".//table//td/a/img"))
            {
                string url = xe.XPathValue("@src");
                if (!data.photos.ContainsKey(url))
                {
                    data.photos.Add(url, null);
                }
                else
                {
                    Trace.CurrentTrace.WriteLine("warning photo already exists \"{0}\"", url);
                }
            }

            return(data);
        }
Пример #10
0
        protected override RapideDdl_PostDetail GetDataFromWeb(LoadDataFromWeb_v3 loadDataFromWeb)
        {
            XXElement            xeSource = new XXElement(loadDataFromWeb.GetXmlDocument().Root);
            RapideDdl_PostDetail data     = new RapideDdl_PostDetail();

            data.sourceUrl       = loadDataFromWeb.request.Url;
            data.loadFromWebDate = loadDataFromWeb.loadFromWebDate;
            data.id = GetPostDetailKey(data.sourceUrl);

            XXElement xePost = xeSource.XPathElement("//div[@class='lcolomn mainside']");

            //data.category = xePost.DescendantTextList(".//div[@class='spbar']//a").Select(DownloadPrint.TrimFunc1).Where(
            data.category = xePost.XPathElements(".//div[@class='spbar']//a").DescendantTexts().Select(DownloadPrint.Trim).Where(
                s =>
            {
                s = s.ToLowerInvariant();
                return(s != "" && !s.Contains("acceuil") && !s.Contains("accueil"));
            }
                ).zToStringValues("/");
            string category = data.category.ToLowerInvariant();

            data.printType = GetPostType(category);

            //data.title = xePost.DescendantTextList(".//div[@class='spbar']", func: DownloadPrint.TrimFunc1).LastOrDefault();
            data.title = xePost.XPathElements(".//div[@class='spbar']").DescendantTexts().Select(DownloadPrint.Trim).LastOrDefault();
            //ExtractTitleInfos(data);
            PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.title);

            if (titleInfos.foundInfo)
            {
                data.originalTitle = data.title;
                data.title         = titleInfos.title;
                data.infos.SetValues(titleInfos.infos);
            }

            XXElement xe   = xePost.XPathElement(".//div[@class='shdinfo']");
            string    date = xe.XPathValue(".//span[@class='date']//text()");

            //data.creationDate = Download.Print.RapideDdl.RapideDdl.ParseDateTime(date, loadDataFromWeb.loadFromWebDate);
            data.creationDate = zdate.ParseDateTimeLikeToday(date, loadDataFromWeb.loadFromWebDate, "d-M-yyyy, HH:mm", "d M yyyy", "d MMMM yyyy");
            if (data.creationDate == null)
            {
                pb.Trace.WriteLine("unknow date time \"{0}\"", date);
            }
            if (__trace)
            {
                pb.Trace.WriteLine("creationDate {0} - \"{1}\"", data.creationDate, date);
            }
            data.postAuthor = xe.XPathValue(".//span[@class='arg']//a//text()");

            xe = xePost.XPathElement(".//div[@class='maincont']");
            //data.images = xe.XPathImages(xeImg => new UrlImage(zurl.GetUrl(loadDataFromWeb.request.Url, xeImg.zAttribValue("src")))).ToArray();
            data.images = xe.DescendantNodes(node => XmlDescendant.ImageFilter(node)).Select(xeImg => new WebImage(zurl.GetUrl(loadDataFromWeb.request.Url, xeImg.zAttribValue("src")))).ToArray();

            if (loadDataFromWeb.request.LoadImage)
            {
                data.images = DownloadPrint.LoadImages(data.images).ToArray();
            }

            //RapideDdl.SetTextValues(data, xe.DescendantTextList(nodeFilter: node => !(node is XElement) || ((XElement)node).Name != "a" ));
            // xe.DescendantTextList(nodeFilter: node => !(node is XElement) || ((XElement)node).Name != "a")
            PrintTextValues_v1 textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_v1(xe.DescendantTexts(node => !(node is XElement) || ((XElement)node).Name != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode), data.title);

            data.description = textValues.description;
            data.language    = textValues.language;
            data.size        = textValues.size;
            data.nbPages     = textValues.nbPages;
            data.infos.SetValues(textValues.infos);

            List <string> downloadLinks = new List <string>();

            foreach (XXElement xe2 in xe.XPathElements("div/div"))
            {
                // http://prezup.eu http://pixhst.com/avaxhome/27/36/002e3627.jpeg http://www.zupmage.eu/i/R1UgqdXn4F.jpg
                // http://i.imgur.com/Gu7hagN.jpg http://img11.hostingpics.net/pics/591623liens.png http://www.hapshack.com/images/jUfTZ.gif
                // http://pixhst.com/pictures/3029467
                downloadLinks.AddRange(xe2.XPathValues(".//a/@href").Where(url => !url.StartsWith("http://prezup.eu") && !url.StartsWith("http://pixhst.com") &&
                                                                           !url.EndsWith(".jpg") && !url.EndsWith("jpeg") && !url.EndsWith("png") && !url.EndsWith("gif")));
            }
            data.downloadLinks = downloadLinks.ToArray();

            //if (__trace)
            //    RapideDdl_LoadPostDetail.Trace_RapideDdl_PostDetail(data);

            return(data);
        }
Пример #11
0
        //protected override Telechargementz_PostDetail GetDataFromWeb(LoadDataFromWeb loadDataFromWeb)
        protected override IPost GetDataFromWeb(LoadDataFromWeb_v3 loadDataFromWeb)
        {
            XXElement xeSource = new XXElement(loadDataFromWeb.GetXmlDocument().Root);
            Telechargementz_PostDetail data = new Telechargementz_PostDetail();

            data.SourceUrl       = loadDataFromWeb.request.Url;
            data.LoadFromWebDate = loadDataFromWeb.loadFromWebDate;
            data.Id = GetPostDetailKey(data.SourceUrl);

            XXElement xePost = xeSource.XPathElement("//div[@id='dle-content']");

            data.PostAuthor = xePost.XPathValue(".//div[@class='title-info']//a//text()");

            // , 26.12.14
            string date = xePost.XPathValue(".//div[@class='title-info']//a/following-sibling::text()");

            if (date != null)
            {
                data.PostCreationDate = zdate.ParseDateTimeLikeToday(date.Trim(' ', ','), loadDataFromWeb.loadFromWebDate, "dd.MM.yy");
                if (data.PostCreationDate == null)
                {
                    pb.Trace.WriteLine("unknow date time \"{0}\"", date);
                }
                if (__trace)
                {
                    pb.Trace.WriteLine("creationDate {0} - \"{1}\"", data.PostCreationDate, date);
                }
            }
            else
            {
                pb.Trace.WriteLine("creationDate not found \"{0}\"", data.SourceUrl);
            }

            //data.Title = xePost.XPathElement(".//div[@class='post-title']").DescendantTextList(func: DownloadPrint.TrimFunc1).FirstOrDefault();
            data.Title = xePost.XPathElement(".//div[@class='post-title']").DescendantTexts().Select(DownloadPrint.Trim).FirstOrDefault();
            PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.Title);

            if (titleInfos.foundInfo)
            {
                data.OriginalTitle = data.Title;
                data.Title         = titleInfos.title;
                data.infos.SetValues(titleInfos.infos);
            }

            XXElement xe = xePost.XPathElement(".//div[starts-with(@id, 'news-id-')]");

            if (xe.XElement == null)
            {
                pb.Trace.WriteLine("element not found \".//div[starts-with(@id, 'news-id-')]\"");
            }

            //data.Images = new List<UrlImage>();
            //data.Images.Add(xe.XPathImages(xeImg => new UrlImage(zurl.GetUrl(loadDataFromWeb.request.Url, xeImg.zAttribValue("src")))).FirstOrDefault());
            //data.Images = new UrlImage[] { xe.XPathImages(xeImg => new UrlImage(zurl.GetUrl(loadDataFromWeb.request.Url, xeImg.zAttribValue("src")))).FirstOrDefault() };
            WebImage image = xe.DescendantNodes(node => XmlDescendant.ImageFilter(node)).Select(xeImg => new WebImage(zurl.GetUrl(loadDataFromWeb.request.Url, xeImg.zAttribValue("src")))).FirstOrDefault();

            if (image != null)
            {
                data.Images = new WebImage[] { image }
            }
            ;

            // force load image to get image width and height
            if (loadDataFromWeb.request.LoadImage)
            {
                data.Images = DownloadPrint.LoadImages(data.Images).ToArray();
            }

            data.DownloadLinks = xe.XPathValues(".//a/@href").ToArray();

            //data.category = xePost.DescendantTextList(".//div[@class='hdiin']//a").Select(DownloadPrint.TrimFunc1).zToStringValues("/");
            //string category = data.category.ToLowerInvariant();
            //data.printType = GetPrintType(category);
            ////pb.Trace.WriteLine("category \"{0}\" printType {1}", category, data.printType);


            // get infos, description, language, size, nbPages
            // nodeFilter: not <a> and not <span>
            //   nodeFilter: node => !(node is XElement) || (((XElement)node).Name != "a" && ((XElement)node).Name != "span")
            // nodeFilter: not <a>
            //PrintTextValues_old textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_old(xe.DescendantTextList(nodeFilter: node => !(node is XElement) || ((XElement)node).Name != "a"), data.Title);
            PrintTextValues_v1 textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_v1(xe.DescendantTexts(node => !(node is XElement) || ((XElement)node).Name != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode), data.Title);

            data.description = textValues.description;
            data.language    = textValues.language;
            data.size        = textValues.size;
            data.nbPages     = textValues.nbPages;
            data.infos.SetValues(textValues.infos);

            data.PrintType = PrintType.UnknowEBook;
            if (data.infos.ContainsKey("Bd") || data.infos.ContainsKey("bd") || data.infos.ContainsKey("BD"))
            {
                data.PrintType = PrintType.Comics;
            }
            // Editeur : Presse fr
            else if (data.infos.ContainsKey("editeur") && data.infos["editeur"] is ZString && ((string)data.infos["editeur"]).ToLowerInvariant() == "presse fr")
            {
                data.PrintType = PrintType.Print;
            }
            else if (data.infos.ContainsKey("isbn"))
            {
                data.PrintType = PrintType.Book;
            }


            //pb.Trace.WriteLine(xe.DescendantNodes(returnNodeFilter: node => !(node is XElement) || ((XElement)node).Name != "a").Select(node => new { type = node.NodeType, name = node is XElement ? ((XElement)node).Name.LocalName : null, value = node is XText ? ((XText)node).Value : null }).zToJson());
            //pb.Trace.WriteLine(xe.DescendantNodes(nodeFilter: node => !(node is XElement) || ((XElement)node).Name != "a").Select(node => new { type = node.NodeType, name = node is XElement ? ((XElement)node).Name.LocalName : null, value = node is XText ? ((XText)node).Value : null }).zToJson());
            //pb.Trace.WriteLine(xe.DescendantNodes(returnNodeFilter: node => node is XText).Select(node => new { type = node.NodeType, name = node is XElement ? ((XElement)node).Name.LocalName : null, value = node is XText ? ((XText)node).Value : null }).zToJson());
            //pb.Trace.WriteLine(xe.DescendantNodes(nodeFilter: node => !(node is XElement) || (((XElement)node).Name != "a" && ((XElement)node).Name != "span"), returnNodeFilter: node => node is XText).Select(node => new { type = node.NodeType, name = node is XElement ? ((XElement)node).Name.LocalName : null, value = node is XText ? ((XText)node).Value : null }).zToJson());

            if (__trace)
            {
                pb.Trace.WriteLine(data.zToJson());
            }

            return(data);
        }
Пример #12
0
        // header get data, from WebHeaderDetailMongoManagerBase_v2<THeaderData, TDetailData>
        protected override IEnumDataPages <Gesat_Header_v2> GetHeaderPageData(HttpResult <string> httpResult)
        {
            XXElement             xeSource = httpResult.zGetXDocument().zXXElement();
            string                url      = httpResult.Http.HttpRequest.Url;
            Gesat_HeaderDataPages data     = new Gesat_HeaderDataPages();

            data.SourceUrl       = url;
            data.LoadFromWebDate = httpResult.Http.RequestTime;
            data.Id = GetPageKey(httpResult.Http.HttpRequest);

            // <div class="PAGENAVIGLIST">
            // <a href="/Gesat/EtablissementList-10-10.html" title="page suivante">&gt;</a>&nbsp;
            data.UrlNextPage = zurl.GetUrl(url, xeSource.XPathValue("//div[@class='PAGENAVIGLIST']//a[@title='page suivante']/@href"));

            // <div class="ETABLISSEMENT STAR-1 ODD"> <div class="ETABLISSEMENT STAR-0 ODD"> <div class="ETABLISSEMENT STAR-1 EVEN">
            IEnumerable <XXElement> xeHeaders = xeSource.XPathElements("//div[starts-with(@class, 'ETABLISSEMENT STAR-')]");

            List <Gesat_Header_v2> headers = new List <Gesat_Header_v2>();

            foreach (XXElement xeHeader in xeHeaders)
            {
                Gesat_Header_v2 header = new Gesat_Header_v2();
                header.SourceUrl       = url;
                header.LoadFromWebDate = DateTime.Now;

                //<span class="NOM"><a title="ESAT BETTY LAUNAY-MOULIN VERT" href="/Gesat/Hauts-de-Seine,92/Bois-Colombes,35494/esat-betty-launay-moulin-vert-competences-et-handicap-92,e1837/">ESAT BETTY LAUNAY-MOULIN VERT</a></span>
                XXElement xe = xeHeader.XPathElement(".//span[@class='NOM']//a");
                if (xe != null)
                {
                    header.UrlDetail = zurl.GetUrl(url, xe.ExplicitXPathValue("@href"));
                    header.Name      = _trimFunc(xe.ExplicitXPathValue(".//text()"));
                }
                //<span class="VILLE">E.S.A.T.<br />Bois-Colombes (92)</span>
                xe = xeHeader.XPathElement(".//span[@class='VILLE']");
                if (xe != null)
                {
                    IEnumerator <string> texts = xe.DescendantTexts().GetEnumerator();
                    if (texts.MoveNext())
                    {
                        header.Type = texts.Current.Trim();
                    }
                    else
                    {
                        Trace.WriteLine("error companyType not found");
                    }
                    if (texts.MoveNext())
                    {
                        header.Location = texts.Current.Trim();
                    }
                    else
                    {
                        Trace.WriteLine("error companyLocation not found");
                    }
                }
                // <span class="TELEPHONE">01 47 86 11 48</span>
                header.Phone = _trimFunc(xeHeader.ExplicitXPathValue(".//span[@class='TELEPHONE']//text()"));
                //<img info_bulle="Signataire de la charte Ethique et Valeurs" border="0" alt="/images/bullesGesat/pictoCharte.png" src="/images/bullesGesat/pictoCharte.png" style=" border: 0;" />
                //<img info_bulle="Lauréat des trophées HandiResponsables 2013" border="0" alt="/images/bullesGesat/LAURIERS-OR-2013.png" src="/images/bullesGesat/LAURIERS-OR-2013.png" style=" border: 0;" />
                header.Infos = xeHeader.XPathValues(".//img/@info_bulle").Select(_trimFunc).ToArray();
                //_header.SetInfo(xeHeader.XPathValues(".//img/@info_bulle"));

                headers.Add(header);
            }
            data.Data = headers.ToArray();
            return(data);
        }
Пример #13
0
        //protected override GoldenDdl_PostDetail GetDataFromWeb(LoadDataFromWeb loadDataFromWeb)
        protected override IPost GetDataFromWeb(LoadDataFromWeb_v3 loadDataFromWeb)
        {
            XXElement            xeSource = new XXElement(loadDataFromWeb.GetXmlDocument().Root);
            GoldenDdl_PostDetail data     = new GoldenDdl_PostDetail();

            data.sourceUrl       = loadDataFromWeb.request.Url;
            data.loadFromWebDate = loadDataFromWeb.loadFromWebDate;
            data.id = GetPostDetailKey(data.sourceUrl);

            XXElement xePost = xeSource.XPathElement("//div[@id='dle-content']");

            //data.category = xePost.DescendantTextList(".//div[@class='hdiin']//a").Select(DownloadPrint.TrimFunc1).zToStringValues("/");
            data.category = xePost.XPathElements(".//div[@class='hdiin']//a").DescendantTexts().Select(DownloadPrint.Trim).zToStringValues("/");
            string category = data.category.ToLowerInvariant();

            data.printType = GetPrintType(category);
            //pb.Trace.WriteLine("category \"{0}\" printType {1}", category, data.printType);

            //data.title = xePost.XPathValue(".//div[@class='bheading']//text()", DownloadPrint.Trim);
            data.title = xePost.XPathValue(".//div[@class='bheading']//text()").Trim(DownloadPrint.TrimChars);
            PrintTitleInfos titleInfos = DownloadPrint.PrintTextValuesManager.ExtractTitleInfos(data.title);

            if (titleInfos.foundInfo)
            {
                data.originalTitle = data.title;
                data.title         = titleInfos.title;
                data.infos.SetValues(titleInfos.infos);
            }

            string date = xePost.XPathValue(".//div[@class='datenews']//text()");

            data.creationDate = zdate.ParseDateTimeLikeToday(date, loadDataFromWeb.loadFromWebDate, "d-M-yyyy, HH:mm", "d M yyyy", "d MMMM yyyy");
            if (data.creationDate == null)
            {
                pb.Trace.WriteLine("unknow date time \"{0}\"", date);
            }
            if (__trace)
            {
                pb.Trace.WriteLine("creationDate {0} - \"{1}\"", data.creationDate, date);
            }

            data.postAuthor = xePost.XPathValue(".//div[@class='argr']//a//text()");

            XXElement xe = xePost.XPathElement(".//div[@class='maincont']");

            //data.images = xe.XPathImages(xeImg => new UrlImage(zurl.GetUrl(loadDataFromWeb.request.Url, xeImg.zAttribValue("src")))).ToArray();
            data.images = xe.DescendantNodes(node => XmlDescendant.ImageFilter(node)).Select(xeImg => new WebImage(zurl.GetUrl(loadDataFromWeb.request.Url, xeImg.zAttribValue("src")))).ToArray();


            // force load image to get image width and height
            if (loadDataFromWeb.request.LoadImage)
            {
                data.images = DownloadPrint.LoadImages(data.images).ToArray();
            }

            // get infos, description, language, size, nbPages
            //PrintTextValues_old textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_old(xe.DescendantTextList(nodeFilter: node => !(node is XElement) || ((XElement)node).Name != "a"), data.title);
            PrintTextValues_v1 textValues = DownloadPrint.PrintTextValuesManager.GetTextValues_v1(xe.DescendantTexts(node => !(node is XElement) || ((XElement)node).Name != "a" ? XNodeFilter.SelectNode : XNodeFilter.SkipNode), data.title);

            data.description = textValues.description;
            data.language    = textValues.language;
            data.size        = textValues.size;
            data.nbPages     = textValues.nbPages;
            data.infos.SetValues(textValues.infos);

            List <string> downloadLinks = new List <string>();

            foreach (XXElement xe2 in xe.XPathElements("div/div"))
            {
                // http://prezup.eu http://pixhst.com/avaxhome/27/36/002e3627.jpeg http://www.zupmage.eu/i/R1UgqdXn4F.jpg
                // http://i.imgur.com/Gu7hagN.jpg http://img11.hostingpics.net/pics/591623liens.png http://www.hapshack.com/images/jUfTZ.gif
                // http://pixhst.com/pictures/3029467
                downloadLinks.AddRange(xe2.XPathValues(".//a/@href").Where(url => !url.StartsWith("http://prezup.eu") && !url.StartsWith("http://pixhst.com") &&
                                                                           !url.EndsWith(".jpg") && !url.EndsWith("jpeg") && !url.EndsWith("png") && !url.EndsWith("gif")));
            }
            data.downloadLinks = downloadLinks.ToArray();

            if (__trace)
            {
                pb.Trace.WriteLine(data.zToJson());
            }

            return(data);
        }
Пример #14
0
        //public static TelechargementPlus_HeaderPage Load_old(string url, HttpRequestParameters requestParameters = null, bool reload = false, bool loadImage = false)
        //{
        //    RequestFromWeb request = new RequestFromWeb(url, requestParameters, reload, loadImage);
        //    return _loadHeaderPage_old.Load(request);
        //}

        public static TelechargementPlus_HeaderPage LoadHeaderPageFromWeb(pb.Web.v1.RequestFromWeb_v2 request)
        {
            // loadDataFromWeb
            XXElement xeSource = new XXElement(request.GetXmlDocument().Root);
            string    url      = request.Url;
            TelechargementPlus_HeaderPage data = new TelechargementPlus_HeaderPage();

            // post list :
            //   <div class="base shortstory">
            //   _hxr.ReadSelect("//div[@class='base shortstory']:.:EmptyRow", ".//text()");
            // next page :
            //   <div class="navigation">
            //     <div align="center">
            //       <span>Prev.</span>
            //       <span>1</span>
            //       <a href="http://www.telechargement-plus.com/e-book-magazines/page/2/">2</a>
            //       ...
            //       <a href="http://www.telechargement-plus.com/e-book-magazines/page/2/">Next</a>
            //     </div>
            //   </div>
            //   _hxr.ReadSelect("//div[@class='navigation']//a[text()='Next']:.:EmptyRow", "text()", "@href");
            data.urlNextPage = zurl.GetUrl(url, xeSource.XPathValue("//div[@class='navigation']//a[text()='Next']/@href"));
            IEnumerable <XXElement> xeHeaders            = xeSource.XPathElements("//div[@class='base shortstory']");
            List <TelechargementPlus_PostHeader> headers = new List <TelechargementPlus_PostHeader>();

            foreach (XXElement xeHeader in xeHeaders)
            {
                TelechargementPlus_PostHeader header = new TelechargementPlus_PostHeader();
                //_postHeader.sourceUrl = _sourceUrl;
                header.sourceUrl       = url;
                header.loadFromWebDate = DateTime.Now;

                //<h1 class="shd">
                //    <a href="http://www.telechargement-plus.com/e-book-magazines/magazines/86236-multi-ici-paris-n3562-9-au-15-octobre-2013.html">
                //        [Multi] Ici Paris N°3562 - 9 au 15 Octobre 2013
                //    </a>
                //</h1>
                XXElement xe = xeHeader.XPathElement(".//*[@class='shd']//a");
                header.urlDetail = zurl.GetUrl(url, xe.XPathValue("@href"));
                //header.title = TelechargementPlus.TrimString(TelechargementPlus.ExtractTextValues(header.infos, xe.XPathValue(".//text()")));
                // xe.XPathValue(".//text()", TelechargementPlus.TrimFunc1)
                header.title = TelechargementPlus.ExtractTextValues(header.infos, TelechargementPlus.TrimFunc1(xe.XPathValue(".//text()")));

                //<div class="shdinf">
                //    <div class="shdinf">
                //      <span class="rcol">Auteur:
                //          <a onclick="ShowProfile('bakafa', 'http://www.telechargement-plus.com/user/bakafa/', '0'); return false;" href="http://www.telechargement-plus.com/user/bakafa/">
                //              bakafa
                //          </a>
                //      </span>
                //      <span class="date">
                //          <b><a href="http://www.telechargement-plus.com/2013/10/09/">Aujourd'hui, 17:13</a></b>
                //      </span>
                //      <span class="lcol">Cat&eacute;gorie:
                //          <a href="http://www.telechargement-plus.com/e-book-magazines/">
                //              E-Book / Magazines
                //          </a> &raquo;
                //          <a href="http://www.telechargement-plus.com/e-book-magazines/magazines/">
                //              Magazines
                //          </a>
                //      </span>
                //    </div>
                //</div>
                xe = xeHeader.XPathElement(".//div[@class='shdinf']/div[@class='shdinf']");
                header.postAuthor = xe.XPathValue(".//span[@class='rcol']//a//text()");
                //string postDate = xe.XPathValue(".//span[@class='date']//text()");
                // Aujourd'hui, 17:13
                //if (postDate != null)
                //    _postHeader.infos.SetValue("postDate", new ZString(postDate));
                header.creationDate = TelechargementPlus.ParseDateTime(xe.XPathValue(".//span[@class='date']//text()"));
                //header.category = xe.DescendantTextList(".//span[@class='lcol']").Select(TelechargementPlus.TrimFunc1).Where(s => s != "E-Book / Magazines" && s != "Catégorie:" && s != "").zToStringValues("/");
                header.category = xe.XPathElements(".//span[@class='lcol']").DescendantTexts().Select(TelechargementPlus.TrimFunc1).Where(s => s != "E-Book / Magazines" && s != "Catégorie:" && s != "").zToStringValues("/");
                //Trace.CurrentTrace.WriteLine("post header category \"{0}\"", _postHeader.category);
                //.zForEach(s => s.Trim())

                //<span id="post-img">
                //    <div id="news-id-86236" style="display: inline;">
                //        <div style="text-align: center;">
                //            <!--dle_image_begin:http://zupimages.net/up/3/1515486591.jpeg|-->
                //            <img src="http://zupimages.net/up/3/1515486591.jpeg" alt="[Multi] Ici Paris N°3562 - 9 au 15 Octobre 2013"
                //                title="[Multi] Ici Paris N°3562 - 9 au 15 Octobre 2013" /><!--dle_image_end-->
                //            <br />
                //            <b>
                //                <br />
                //                Ici Paris N°3562 - 9 au 15 Octobre 2013<br />
                //                French | 52 pages | HQ PDF | 101 MB
                //            </b>
                //            <br />
                //            <br />
                //            Ici Paris vous fait partager la vie publique et privée de celles et ceux qui font
                //            l'actualité : exclusivités, interviews, enquêtes (la face cachée du showbiz, les
                //            coulisses de la télé) indiscrétions, potins.<br />
                //        </div>
                //    </div>
                //</span>
                xe = xeHeader.XPathElement(".//span[@id='post-img']//div[starts-with(@id, 'news-id')]");
                //_postHeader.images = xe.XPathImages(".//img", _url, TelechargementPlus.ImagesToSkip);
                //header.images = xe.XPathImages(url, TelechargementPlus.ImagesToSkip);
                //header.images = xe.XPathImages(url, imageHtml => !TelechargementPlus.ImagesToSkip.ContainsKey(imageHtml.Source));
                //header.images = xe.XPathImages(xeImg => new ImageHtml(xeImg, url), imageHtml => !TelechargementPlus.ImagesToSkip.ContainsKey(imageHtml.Source)).ToList();
                header.images = xe.DescendantNodes(node => XmlDescendant.ImageFilter(node)).Select(xeImg => new pb.old.ImageHtml((XElement)xeImg, url)).Where(imageHtml => !TelechargementPlus.ImagesToSkip.ContainsKey(imageHtml.Source)).ToList();

                if (request.LoadImage)
                {
                    pb.old.Http_v2.LoadImageFromWeb(header.images);
                }

                //header.SetTextValues(xe.DescendantTextList());
                header.SetTextValues(xe.DescendantTexts());

                headers.Add(header);
            }
            data.postHeaders = headers.ToArray();
            return(data);
        }