예제 #1
0
        /// <summary>
        /// Creates an instance of the type returned by <c>uri.ToGraphType()</c>, loads the HTML document
        /// behind <paramref name="uri"/> and assigns a value to every public property of that type that
        /// carries <see cref="GraphElementAttribute"/>.
        /// </summary>
        /// <param name="uri">Address of the page to load; also used to fill <c>Url</c> and <c>Site.Name</c>.</param>
        /// <returns>The populated instance, cast to <see cref="GraphObject"/>.</returns>
        public async Task<GraphObject> ReadAsync(Uri uri)
        {
            Type targetType = uri.ToGraphType();
            object instance = Activator.CreateInstance(targetType);

            // The property list is resolved before the document is requested.
            PropertyInfo[] graphProperties = targetType
                .GetProperties()
                .Where(candidate => Attribute.IsDefined(candidate, typeof(GraphElementAttribute)))
                .ToArray();

            HtmlDocument page = await HtmlClient.LoadDocumentAsync(uri);

            foreach (PropertyInfo graphProperty in graphProperties)
            {
                Type valueType = graphProperty.PropertyType;
                object value;

                // Class-typed properties whose declaring module name contains "Graphlax" are resolved
                // from their type; every other property is resolved from the property itself.
                if (valueType.IsClass && valueType.Module.Name.Contains("Graphlax"))
                {
                    value = GetSubPropValue(valueType, page);
                }
                else
                {
                    value = GetPropValue(graphProperty, page);
                }

                graphProperty.SetValue(instance, value);
            }

            // Cast only after the properties are filled, so a type that is not a GraphObject
            // fails at the same point it always did.
            GraphObject result = (GraphObject)instance;
            result.Url       = uri.ToString();
            result.Site.Name = uri.Host;
            result.Site.IP   = "127.0.0.1"; // hard-coded value, not derived from the uri
            return result;
        }
예제 #2
0
        /// <summary>
        /// Downloads <paramref name="url"/> through <c>HtmlClient.GetHtmlTextReaderAsync</c>, parses it into an
        /// <see cref="XmlDocument"/> and asserts that the text of the first <c>//title/text()</c> node equals
        /// <paramref name="expectedTitle"/>. The reader's encoding and encoding confidence are captured
        /// before and after parsing and written to the console.
        /// </summary>
        /// <param name="url">Address of the page to download; also used as the parser's <c>BaseUrl</c>.</param>
        /// <param name="expectedTitle">Title text the parsed document is expected to contain.</param>
        /// <param name="options">Client options to use; when null, <c>HtmlClient.Options</c> is used.</param>
        public async Task TestGetTextReaderAsync_ForEncoding(string url, string expectedTitle, ClientOptions options = null)
        {
            ClientOptions optionsToUse = options ?? HtmlClient.Options;
            XmlDocument   doc1         = new XmlDocument();

            System.Text.Encoding initialEncoding   = null;
            EncodingConfidence   initialConfidence = EncodingConfidence.Tentative;

            System.Text.Encoding finalEncoding   = null;
            EncodingConfidence   finalConfidence = EncodingConfidence.Tentative;

            // Get the Html asynchronously and Parse it into an Xml Document
            using (HtmlTextReader textReader = await HtmlClient.GetHtmlTextReaderAsync(url, optionsToUse))
            {
                // Snapshot taken before any parsing has happened.
                initialEncoding   = textReader.CurrentEncoding;
                initialConfidence = textReader.CurrentEncodingConfidence;

                HtmlParser.DefaultParser.Parse(doc1, textReader, new ParserOptions {
                    BaseUrl = url
                });

                // Snapshot taken after parsing, so both states can be compared in the log line.
                finalEncoding   = textReader.CurrentEncoding;
                finalConfidence = textReader.CurrentEncodingConfidence;
            }

            // SelectSingleNode returns null when nothing matches; fail with a readable assertion
            // message instead of a NullReferenceException on InnerText.
            XmlNode titleNode = doc1.SelectSingleNode("//title/text()");
            Assert.IsNotNull(titleNode, "No //title/text() node was found in the document parsed from " + url);

            string title1 = titleNode.InnerText;

            Console.WriteLine("Crawled: " + url + ", title: " + title1 + ", default: " + optionsToUse.DefaultEncoding.WebName + " (detect=" + optionsToUse.DetectEncoding + "), inital: " + initialEncoding.WebName + " (" + initialConfidence + "), final: " + finalEncoding.WebName + " (" + finalConfidence + ")");

            // The parsed title must match the expected one for the encoding to be considered picked up correctly
            Assert.AreEqual(expectedTitle, title1);
        }
예제 #3
0
        /// <summary>
        /// Loads the page generated for <c>CurrentCity</c> with page argument 1 and converts the inner HTML of
        /// the node selected by <c>TargetWebsite.PagesCountXPath</c> to an integer.
        /// </summary>
        /// <returns>The integer parsed from the selected node's inner HTML.</returns>
        protected virtual async Task<int> GetPagesCount()
        {
            var firstPageUrl = _urlGenerator.GeneratePageUrl(CurrentCity, 1);
            var firstPage    = await HtmlClient.LoadFromWebAsync(firstPageUrl);

            // NOTE(review): no guard here - if the XPath matches nothing and SelectSingleNode
            // returns null, the InnerHtml access below throws. Confirm that is acceptable.
            HtmlNode countNode = firstPage.DocumentNode.SelectSingleNode(TargetWebsite.PagesCountXPath);
            var rawCount = countNode.InnerHtml;

            return Convert.ToInt32(rawCount);
        }
예제 #4
0
        /// <summary>
        /// Fetches the HTML at <paramref name="url"/> and parses it with <c>this.Parser</c> into a new
        /// <see cref="XmlDocument"/>.
        /// </summary>
        /// <param name="url">Address to download; also passed to the parser as <c>BaseUrl</c>.</param>
        /// <returns>The document the parser wrote into.</returns>
        public async Task<XmlDocument> LoadXHtmlDocAsync(string url)
        {
            var parsedDocument = new XmlDocument();

            // The reader is disposed as soon as parsing finishes.
            using (HtmlTextReader source = await HtmlClient.GetHtmlTextReaderAsync(url))
            {
                this.Parser.Parse(parsedDocument, source, new ParserOptions { BaseUrl = url });
            }

            return parsedDocument;
        }
예제 #5
0
        /// <summary>
        /// Downloads <paramref name="url"/> and parses the HTML into <paramref name="doc"/> using an
        /// <c>HtmlStreamParser&lt;XmlNode&gt;</c> feeding an <c>XmlDomBuilder</c>.
        /// </summary>
        /// <param name="doc">Document handed to the <c>XmlDomBuilder</c> that receives the parsed nodes.</param>
        /// <param name="url">Address to download; used as <c>BaseUrl</c> when the options do not specify one.</param>
        /// <param name="options">Loader options; when null a new <c>LoaderOptions</c> instance is used.</param>
        internal static async Task LoadWebPageAsync(XmlDocument doc, string url, LoaderOptions options)
        {
            LoaderOptions effectiveOptions = options ?? new LoaderOptions();

            // NOTE(review): this writes BaseUrl into the caller-supplied options object, so a reused
            // options instance keeps the first url it was loaded with - confirm that is intended.
            var parserSettings = effectiveOptions.ParserOptions;
            var configuredBaseUrl = parserSettings.BaseUrl;
            parserSettings.BaseUrl = string.IsNullOrEmpty(configuredBaseUrl) ? url : configuredBaseUrl;

            var domBuilder   = new XmlDomBuilder(doc);
            var streamParser = new HtmlStreamParser<XmlNode>();

            // Fetch the HTML asynchronously and stream it through the parser into the DOM builder.
            using (HtmlTextReader source = await HtmlClient.GetHtmlTextReaderAsync(url, effectiveOptions))
            {
                streamParser.Parse(domBuilder, source, effectiveOptions.ParserOptions);
            }
        }
예제 #6
0
        /// <summary>
        /// Opens an <c>HtmlTextReader</c> for <paramref name="url"/> and reads its <c>BaseReader</c> one
        /// character at a time until <c>Read()</c> returns a negative value.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The number of characters read.</returns>
        public async Task<int> DownloadPageUsingGetAsTextReaderAsync(string url)
        {
            using (HtmlTextReader reader = await HtmlClient.GetHtmlTextReaderAsync(url))
            {
                int total = 0;

                // A negative result from Read() ends the loop.
                while (reader.BaseReader.Read() >= 0)
                {
                    total++;
                }

                return total;
            }
        }
예제 #7
0
        /// <summary>
        /// Clears the stored offers for <paramref name="city"/> and <c>TargetWebsite.Name</c>, then walks the
        /// listing pages, extracting offers from each page and storing them in the repository.
        /// </summary>
        /// <param name="city">City to collect offers for; also stored in <c>CurrentCity</c>.</param>
        /// <returns>A task that completes when all pages have been processed.</returns>
        public virtual async Task CollectData(City city)
        {
            // Pause between page requests, in milliseconds.
            const int delayBetweenPagesMs = 4000;

            CurrentCity = city;
            await _repository.ClearOffers(CurrentCity, TargetWebsite.Name);

            // Resolve the page count once. Awaiting GetPagesCount() in the loop condition
            // downloaded the count page again on every iteration.
            int pagesCount = await GetPagesCount();

            // NOTE(review): GetPagesCount requests page 1 while this loop runs from 0 to
            // pagesCount - 1 - confirm whether the url generator expects 0- or 1-based pages.
            for (int i = 0; i < pagesCount; i++)
            {
                var url     = _urlGenerator.GeneratePageUrl(city, i);
                var htmlDoc = await HtmlClient.LoadFromWebAsync(url);

                // NOTE(review): if SelectNodes returns null when nothing matches, ToList() throws
                // here - confirm whether an empty page should be skipped instead.
                var nodes = htmlDoc.DocumentNode
                            .SelectNodes(TargetWebsite.DocumentNodeXPath).ToList();

                var offers = GetDetails(nodes);
                await _repository.AddMany(offers);

                //todo find better anti-ban solution
                // Task.Delay yields the thread while waiting; Thread.Sleep would block it.
                await Task.Delay(delayBetweenPagesMs);
            }
        }
예제 #8
0
        /// <summary>
        /// Downloads the page at <c>teaser.Url</c> and fills the given offer with details from it: the
        /// private-offer flag, the description (cleaned body text followed by the contact list markup
        /// with script blocks removed) and the picture URLs. Finally sets <c>Teaser</c> to false.
        /// </summary>
        /// <param name="teaser">Offer to complete; it is modified in place.</param>
        /// <returns>The same <paramref name="teaser"/> instance.</returns>
        private Offer GetFullOffer(Offer teaser)
        {
            var client = new HtmlClient();
            var html   = client.GetHtml(teaser.Url);
            var dom    = new CQ(html);

            // The flag is only ever set, never cleared, based on the header text.
            var header = dom[".wspolny_naglowek_tytul"][0].InnerHTML;
            if (header.Contains("PRYWATNA"))
            {
                teaser.PrivateOffer = true;
            }

            // RenderSelection() is called on the whole selection, so one call is enough. The previous
            // loop recomputed the same value once per matched element. The existing description is
            // kept when nothing matches.
            var fullDescription = dom[".pokaz_ogloszenie_tresc"];
            if (fullDescription.Length > 0)
            {
                teaser.Description = TextHelper.CleanText(fullDescription.RenderSelection());
            }

            // Regex.Replace returns the input unchanged when nothing matches, so no prior Match is needed.
            // The greedy pattern with Singleline removes everything from the first "<script" to the last "script>".
            var contactHtml = dom["ul.pokaz_ogloszenie"][0].OuterHTML;
            contactHtml = Regex.Replace(contactHtml, "<script.+script>", "", RegexOptions.Singleline);

            teaser.Description += contactHtml;

            var pictureEls = dom["img.pokaz_ogloszenie_obrazek"];
            var pictures   = new List<string>();

            foreach (var picture in pictureEls)
            {
                // The link is read from the href attribute of the image's parent node.
                var pictureUrl = "http://ogloszenia.przemysl.pl/" + picture.ParentNode.Attributes["href"];
                pictures.Add(pictureUrl);
            }

            // Pictures is left untouched when the page has no matching images.
            if (pictures.Count > 0)
            {
                teaser.Pictures = pictures;
            }

            teaser.Teaser = false;

            return teaser;
        }
예제 #9
0
        /// <summary>
        /// Downloads the HTML at <paramref name="url"/>, wraps it in a <c>CQ</c> document and collects the
        /// results of <c>GetUrls</c> and <c>GetTeasers</c> for that document.
        /// </summary>
        /// <param name="url">Address of the listing page to download.</param>
        /// <returns>A result holding the teaser URLs and the teasers extracted from the page.</returns>
        private TeaserCrawrlResult GetTeasers(string url)
        {
            var crawlResult = new TeaserCrawrlResult();

            // Download and parse in one step; the client is only needed for this single request.
            var page = new CQ(new HtmlClient().GetHtml(url));

            crawlResult.TeaserUrls = GetUrls(page).ToList();
            crawlResult.Teasers    = GetTeasers(page);

            return crawlResult;
        }