Пример #1
1
        /// <summary>
        /// Loads the page at <paramref name="url"/> and returns its markup as an XML document.
        /// </summary>
        /// <param name="web">The <c>HtmlWeb</c> instance used to fetch the page.</param>
        /// <param name="url">The URL to load; also handed to the absolutizer when
        /// <paramref name="absolutizeLinks"/> is set.</param>
        /// <param name="format">When equal to <c>"html"</c> the page is first serialized through
        /// <c>HtmlWeb.LoadHtmlAsXml</c> and re-parsed; any other value loads the page directly.</param>
        /// <param name="absolutizeLinks">if set to <c>true</c>, runs
        /// <c>AttributeReferenceAbsolutizer.ExecuteDefaultAbsolutization</c> on the document
        /// before it is converted.</param>
        /// <returns>An <see cref="XmlDocument"/> loaded from the document node's outer markup.</returns>
        public static XmlDocument LoadHtmlAsXml(HtmlWeb web, string url, string format,
            bool absolutizeLinks)
        {
            HtmlDocument doc;

            if (format == "html")
            {
                string content;

                // The buffer is only needed on this path, and is released as soon as
                // the serialized markup has been read back.
                using (MemoryStream buffer = new MemoryStream())
                {
                    // The writer owns nothing except the buffer, which the enclosing
                    // using block disposes, so it is flushed rather than closed here.
                    XmlTextWriter writer = new XmlTextWriter(buffer, null);
                    web.LoadHtmlAsXml(url, writer);
                    writer.Flush();

                    // Rewind so the reader sees everything that was just written.
                    buffer.Position = 0;
                    using (StreamReader reader = new StreamReader(buffer))
                    {
                        content = reader.ReadToEnd();
                    }
                }

                doc = new HtmlDocument();
                doc.OptionOutputAsXml = true;
                doc.LoadHtml(content);
            }
            else
            {
                doc = web.Load(url);
                doc.OptionOutputAsXml = true;
            }

            if (absolutizeLinks)
            {
                AttributeReferenceAbsolutizer.ExecuteDefaultAbsolutization
                    (doc.DocumentNode, url);
            }

            XmlDocument xdoc = new XmlDocument();
            xdoc.LoadXml(doc.DocumentNode.OuterHtml);

            return xdoc;
        }
Пример #2
0
        static void Main(string[] args)
        {
            // Demonstration only: pages are served from an on-disk cache folder.
            string cacheDirectory = Path.GetFullPath(@".\cache");
            bool cacheMissing = !Directory.Exists(cacheDirectory);
            if (cacheMissing)
            {
                Directory.CreateDirectory(cacheDirectory);
            }

            // CacheOnly = true means the Internet is not used at all; this only works
            // when the requested page is already in the cache (useful for testing).
            HtmlWeb client = new HtmlWeb
            {
                CachePath  = cacheDirectory,
                UsingCache = true,
                CacheOnly  = true
            };

            // The page to scrape.
            // Note: check the terms of service, copyrights and other legal issues
            // before using this for anything other than personal work.
            string target = @"http://www.asp.net/Modules/MoreArticles.aspx?tabindex=0&mid=64";

            // The same job is done twice: first through XSLT, then through C# code.
            ElegantWay(client, target);
            ManualWay(client, target);
        }
Пример #3
0
        /// <summary>
        /// Fetches <paramref name="url"/> and writes it to <c>rss.xml</c> after applying
        /// the <c>www.asp.net.ToRss.xsl</c> stylesheet.
        /// </summary>
        /// <param name="hw">The <c>HtmlWeb</c> instance used to load and transform the page.</param>
        /// <param name="url">The address of the page to transform.</param>
        static void ElegantWay(HtmlWeb hw, string url)
        {
            string xslt = "www.asp.net.ToRss.xsl";

            // Copy the stylesheet so it exists next to the .exe.
            File.Copy(@"..\..\" + xslt, xslt, true);

            // The using block closes rss.xml even when loading or transforming throws.
            using (XmlTextWriter writer = new XmlTextWriter("rss.xml", System.Text.Encoding.UTF8))
            {
                // Get an Internet resource and write it as an XML file after an XSLT transformation.
                // If the site ever changes its HTML format, only the XSL file needs to change;
                // no recompilation is required.
                hw.LoadHtmlAsXml(url, xslt, null, writer);
                writer.Flush();
            }
        }
Пример #4
0
		static void Main(string[] args)
		{
			// Download the page and keep a local copy of it.
			string address = @"http://www.microsoft.com";
			HtmlWeb client = new HtmlWeb();
			HtmlDocument page = client.Load(address);
			page.Save("mshome.htm");

			DocumentWithLinks analysis = new DocumentWithLinks(page);

			// First list: every entry of the Links collection.
			Console.WriteLine("Linked urls:");
			int linkIndex = 0;
			while (linkIndex < analysis.Links.Count)
			{
				Console.WriteLine(analysis.Links[linkIndex]);
				linkIndex++;
			}

			// Second list: every entry of the References collection.
			Console.WriteLine("Referenced urls:");
			int referenceIndex = 0;
			while (referenceIndex < analysis.References.Count)
			{
				Console.WriteLine(analysis.References[referenceIndex]);
				referenceIndex++;
			}
		}
Пример #5
0
        static void Main(string[] args)
        {
            HtmlWeb webGet = new HtmlWeb();
            HtmlDocument document = webGet.Load(@"C:\Users\Bocian\Documents\Visual Studio 2013\Projects\Xpath first proj\bookshop.html");
            HtmlNodeCollection priceNodes = document.DocumentNode.SelectNodes("/html/body/div/div/div/div/div/ul/li/span");

            // Four values are printed below, so anything less (including a null
            // result from SelectNodes) is reported instead of crashing on the indexer.
            const int expectedCount = 4;
            int foundCount = priceNodes == null ? 0 : priceNodes.Count;
            if (foundCount < expectedCount)
            {
                Console.WriteLine("Expected {0} price nodes but found {1}.", expectedCount, foundCount);
                Console.ReadLine();
                return;
            }

            Console.WriteLine("{0}    {1}","ON",priceNodes[0].InnerHtml);
            Console.WriteLine("{0}  {1}", "PB95", priceNodes[1].InnerHtml);
            Console.WriteLine("{0}  {1}", "PB98", priceNodes[2].InnerHtml);
            Console.WriteLine("{0}   {1}", "LPG", priceNodes[3].InnerHtml);

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }
Пример #6
0
        /// <summary>
        /// Looks up <paramref name="word"/> on dex.dictoo.eu and collects the entries listed
        /// in the "Sinonime" section of the first result.
        /// </summary>
        /// <param name="word">The word to look up. It is URL-escaped before being placed in the query string.</param>
        /// <returns>
        /// The cleaned synonym entries. The list is empty when the page has no result container,
        /// when the site reports no results, when the first entry's heading does not match
        /// <paramref name="word"/>, or when the synonym section contains no links.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// The result markup has no <c>&lt;strong&gt;</c> entry or no "Sinonime" section.
        /// </exception>
        public static List <string> GetSinonimsFromDictooByWord(string word)
        {
            List <string> wordsFinal = new List <string>();
            HtmlWeb       web        = new HtmlWeb();

            // Escape the word so characters such as '&' or spaces cannot break the query string.
            string       query    = Uri.EscapeDataString(word ?? string.Empty);
            HtmlDocument document = web.Load("http://dex.dictoo.eu/index.php?cheie=" + query + "&m=0");

            // SelectNodes yields null when nothing matches, so check before indexing.
            HtmlNodeCollection containers = document.DocumentNode.SelectNodes("//div[@class='cell colspan6']");
            if (containers == null || containers.Count == 0)
            {
                return(wordsFinal);
            }

            string mainNodeHTML = containers[0].InnerHtml;
            if (mainNodeHTML.Contains("Nu există rezultate pentru termenul sau termenii căutați"))
            {
                return(wordsFinal);
            }

            // Drop embedded scripts before splitting the markup on textual markers.
            mainNodeHTML = RemoveDelimitedSpans(mainNodeHTML, "<script", "</script>");

            string[] mainDiv1 = mainNodeHTML.Split(new string[] { "<strong>" }, StringSplitOptions.None);
            if (mainDiv1.Length < 2)
            {
                throw new InvalidOperationException("The dictionary page does not contain a <strong> entry.");
            }

            string[] mainDiv2 = mainDiv1[1].Split(new string[] { "<span style=\"font-weight: bold;\">Sinonime</span>" }, StringSplitOptions.None);
            if (mainDiv2.Length < 2)
            {
                throw new InvalidOperationException("The dictionary page does not contain a 'Sinonime' section.");
            }

            // The heading must link to the requested word, either directly or followed by " (".
            // The case-insensitive search also covers the exact-case match.
            string heading      = mainDiv2[0];
            string anchorPrefix = "style=\"text-decoration:none;\">" + word;
            bool   headingMatches =
                heading.IndexOf(anchorPrefix + "</a>", StringComparison.OrdinalIgnoreCase) >= 0 ||
                heading.IndexOf(anchorPrefix + " (", StringComparison.OrdinalIgnoreCase) >= 0;
            if (!headingMatches)
            {
                return(wordsFinal);
            }

            // Parenthesised remarks are removed before the links are read.
            string mainDivRawWords = RemoveDelimitedSpans(mainDiv2[1], "(", ")");

            HtmlDocument htmlDocIn = new HtmlDocument();
            htmlDocIn.LoadHtml(mainDivRawWords);
            HtmlNodeCollection anchors = htmlDocIn.DocumentNode.SelectNodes("//a");
            if (anchors == null)
            {
                return(wordsFinal);
            }

            string combinedResults = string.Empty;
            foreach (HtmlNode anchor in anchors)
            {
                combinedResults = combinedResults + " " + anchor.InnerText;
            }
            combinedResults = combinedResults.Replace("\n", string.Empty).Replace("\r", string.Empty);

            string[] entries = combinedResults.Split(new char[] { ',', ';', '.' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string entry in entries)
            {
                wordsFinal.Add(entry.KeepOnlyAlphabetical().Trim());
            }

            return(wordsFinal);
        }

        /// <summary>
        /// Removes every span that starts at <paramref name="open"/> and ends with the next
        /// <paramref name="close"/> after it. An opener without a later closer is left in place.
        /// </summary>
        private static string RemoveDelimitedSpans(string text, string open, string close)
        {
            int start = text.IndexOf(open, StringComparison.Ordinal);
            while (start != -1)
            {
                // Only look for the closer after the opener, so a stray closer earlier
                // in the text cannot produce a negative span length.
                int end = text.IndexOf(close, start + open.Length, StringComparison.Ordinal);
                if (end == -1)
                {
                    break;
                }

                text  = text.Remove(start, end - start + close.Length);
                start = text.IndexOf(open, StringComparison.Ordinal);
            }

            return text;
        }
Пример #7
0
        /// <summary>
        /// Loads the search page at <c>pageLink</c>, follows its pagination, and passes every
        /// property list item found to <c>ParseHouse</c>.
        /// </summary>
        /// <remarks>
        /// Returns without touching <c>houseData</c> when the first page cannot be loaded or has
        /// no results. Otherwise <c>houseData</c> is created or cleared before parsing starts.
        /// </remarks>
        public void ParsePage()
        {
            HtmlWeb      web = new HtmlWeb();
            HtmlDocument doc = web.Load(pageLink);

            if (doc == null)
            {
                return;
            }

            HtmlNodeCollection houseCollection = SelectHouseNodes(doc);
            if (houseCollection == null)
            {
                return;
            }

            List <HtmlNode> houses = houseCollection.ToList();
            if (houses.Count < 1)
            {
                return;
            }

            if (houseData == null)
            {
                houseData = new List <HouseData>();
            }
            else
            {
                houseData.Clear();
            }

            HtmlNode pagination = doc.DocumentNode.SelectSingleNode("//ul[@class='pagination']");
            if (pagination != null)
            {
                // The page count is the last '-'-separated token of the "last" link's href.
                // A missing link or a non-numeric token is treated as a single page.
                HtmlNode      lastItem = pagination.SelectSingleNode("./li[@class='last']");
                HtmlNode      lastLink = lastItem == null ? null : lastItem.SelectSingleNode(".//a");
                HtmlAttribute href     = lastLink == null ? null : lastLink.Attributes["href"];

                int pageCount = 1;
                if (href != null)
                {
                    string[] splitLink = href.Value.Split('-');
                    int      parsedCount;
                    if (Int32.TryParse(splitLink[splitLink.Length - 1], out parsedCount))
                    {
                        pageCount = parsedCount;
                    }
                }

                // Page 1 is already loaded; fetch the remaining pages.
                for (int page = 2; page <= pageCount; page++)
                {
                    HtmlDocument doc2 = web.Load(pageLink + "/page-" + page);
                    if (doc2 == null)
                    {
                        continue;
                    }

                    HtmlNodeCollection houses2Collection = SelectHouseNodes(doc2);
                    if (houses2Collection == null)
                    {
                        continue;
                    }

                    houses.AddRange(houses2Collection);
                    Console.WriteLine("houses now contains " + houses.Count);
                }
            }

            foreach (HtmlNode house in houses)
            {
                ParseHouse(house);
            }
        }

        /// <summary>
        /// Returns the property list items of a search page, or null when the page has no
        /// results list or the list has no matching items.
        /// </summary>
        private static HtmlNodeCollection SelectHouseNodes(HtmlDocument page)
        {
            HtmlNode resultsList = page.DocumentNode.SelectSingleNode("//ul[@class='search-results-list']");
            if (resultsList == null)
            {
                return null;
            }

            // The trailing space inside the class value is part of the scraped markup.
            return resultsList.SelectNodes(".//li[@class='property-list-item-container ']");
        }
Пример #8
0
        /// <summary>
        /// Loads the document at <paramref name="link"/> with a fresh <c>HtmlWeb</c> instance.
        /// </summary>
        /// <param name="link">The address to load.</param>
        /// <returns>The document returned by <c>HtmlWeb.Load</c>.</returns>
        private static HtmlDocument OpenLink(string link)
        {
            HtmlWeb      client = new HtmlWeb();
            HtmlDocument page   = client.Load(link);

            return page;
        }
Пример #9
0
 /// <summary>
 /// Creates a <see cref="LimangoProcess"/> and stores the supplied collaborators.
 /// </summary>
 /// <param name="htmlWeb">The <c>HtmlWeb</c> instance, kept in <c>web</c>.</param>
 /// <param name="logger">The logger, kept in <c>Logger</c>.</param>
 /// <param name="websiteProcessor">The website processor, kept in <c>websiteProcessor</c>.</param>
 public LimangoProcess(HtmlWeb htmlWeb, ILogger <LimangoProcess> logger, ILimangoWebsiteProcessor websiteProcessor)
 {
     this.web              = htmlWeb;
     this.websiteProcessor = websiteProcessor;
     this.Logger           = logger;
 }
Пример #10
0
        /// <summary>
        /// Scrapes the NUFORC web portal and turns each report page into a <c>Report</c>.
        /// Pages are requested concurrently, so check <c>MaxInFlight</c>, <c>ThrottleTimeout</c>
        /// and <c>BatchSize</c> before starting: a careless configuration floods the site.
        /// </summary>
        /// <param name="currentIds">Report ids that are already known; matching reports are skipped.</param>
        /// <returns>The set of reports collected during this run.</returns>
        public IEnumerable <Report> GetReports(ISet <int> currentIds)
        {
            IsRunning = true;
            Halt      = false;
            var newReports = new HashSet <Report>();

            // HashSet is not safe for concurrent writers; every access from a
            // continuation goes through this lock.
            var reportsGate = new object();

            var web     = new HtmlWeb();
            var homeDoc = web.Load(BaseUrl + "webreports.html");

            // from front page, go to "by state"
            var stateIndexLink = BaseUrl + homeDoc.QuerySelector(Selectors.indexSelector).Attributes["href"].Value;
            var stateIndexDoc  = web.Load(stateIndexLink);

            // Number of outstanding requests. It is touched from several threads,
            // so it is only read and written through Volatile/Interlocked.
            int requestsInFlight = 0;

            // get list of links to state pages
            var stateLinkNodes = stateIndexDoc.QuerySelectorAll(Selectors.tableLinkSelector);

            // NOTE(review): this loop repeats the whole state scan until Halt is set and
            // nothing is in flight; with BatchSize == -1 nothing in this method sets Halt.
            // Confirm that an external stop is the intended way to end the run.
            do
            {
                foreach (var s in stateLinkNodes)
                {
                    web = new HtmlWeb();

                    var stateLink = BaseUrl + "webreports/" + s.Attributes["href"].Value;

                    // throttle requests when too many are in flight
                    while (Volatile.Read(ref requestsInFlight) > MaxInFlight)
                    {
                        Thread.Sleep(ThrottleTimeout);
                    }
                    Interlocked.Increment(ref requestsInFlight);

                    // one continuation per state page
                    web.LoadFromWebAsync(stateLink).ContinueWith((state_task) =>
                    {
                        // The slot is released as soon as the page has arrived (or failed).
                        // It must not be held during the throttle wait below, and it must be
                        // released on failure too or the outer loop never sees zero.
                        try { state_task.Wait(); }
                        catch { return; }
                        finally { Interlocked.Decrement(ref requestsInFlight); }

                        if (!state_task.IsCompletedSuccessfully)
                        {
                            return;
                        }

                        var stateDoc = state_task.Result;

                        // get list of links to individual reports (same selector)
                        var reportLinkNodes = stateDoc.QuerySelectorAll(Selectors.tableLinkSelector);

                        foreach (var r in reportLinkNodes)
                        {
                            // each report gets its own short-lived web client
                            var webTemp    = new HtmlWeb();
                            var reportLink = BaseUrl + "webreports/" + r.Attributes["href"].Value;

                            // The id is the file name without its first character and ".html".
                            // NOTE(review): the "+ 2" presumably skips a one-letter prefix - confirm.
                            bool parseResult = int.TryParse(reportLink.Substring(reportLink.LastIndexOf('/') + 2).Replace(".html", ""), out int reportId);

                            // check against duplicates and failed lookups
                            if (!parseResult || currentIds.Contains(reportId))
                            {
                                continue;
                            }

                            // throttle requests when too many are in flight
                            while (Volatile.Read(ref requestsInFlight) >= MaxInFlight)
                            {
                                Thread.Sleep(ThrottleTimeout);
                            }
                            Interlocked.Increment(ref requestsInFlight);

                            // another continuation to process the report page
                            webTemp.LoadFromWebAsync(reportLink).ContinueWith((report_task) =>
                            {
                                // The slot is held until the report has been stored, so a zero
                                // counter means no report continuation is still writing.
                                try
                                {
                                    try { report_task.Wait(); }
                                    catch { return; }

                                    // bail out if the request failed
                                    if (!report_task.IsCompletedSuccessfully)
                                    {
                                        return;
                                    }

                                    var reportDoc = report_task.Result;

                                    // get the details and description of the report
                                    var reportTable = reportDoc.QuerySelectorAll("tr td");

                                    // report wasn't found, got a different page
                                    if (reportTable.Count < 2)
                                    {
                                        return;
                                    }

                                    var details     = reportTable[0].InnerHtml;
                                    var description = reportTable[1];

                                    // bail out if we've hit the max batch in any thread
                                    bool batchFull;
                                    lock (reportsGate)
                                    {
                                        batchFull = BatchSize != -1 && newReports.Count > BatchSize;
                                    }
                                    if (Halt.Equals(true) || batchFull)
                                    {
                                        Halt = true;
                                        return;
                                    }

                                    // for this row: the left most portion is the date, the rightmost is the time, the middle is usually useless
                                    var reportDateTimes = Regex.Match(details, RegularExpressions.dateReported).Value.Split(' ');

                                    var report = new Report()
                                    {
                                        ReportId      = reportId,
                                        DateOccurred  = DateTime.Parse(Regex.Match(details, RegularExpressions.dateOccured).Value),
                                        DateSubmitted = DateTime.Parse(reportDateTimes[0] + reportDateTimes[reportDateTimes.Length - 1]),
                                        DatePosted    = DateTime.Parse(Regex.Match(details, RegularExpressions.datePosted).Value),
                                        Location      = Regex.Match(details, RegularExpressions.location).Value,
                                        Shape         = ShapeUtility.ShapeAliases(Regex.Match(details, RegularExpressions.shape).Value),
                                        Duration      = Regex.Match(details, RegularExpressions.duration).Value,
                                        Description   = description.InnerText
                                    };

                                    int collected;
                                    lock (reportsGate)
                                    {
                                        newReports.Add(report);
                                        collected = newReports.Count;
                                    }

                                    Console.WriteLine(collected);
                                }
                                finally
                                {
                                    Interlocked.Decrement(ref requestsInFlight);
                                }
                            });

                            if (Halt.Equals(true))
                            {
                                break;
                            }
                        }
                    });
                    if (Halt.Equals(true))
                    {
                        break;
                    }
                }
            } while (Volatile.Read(ref requestsInFlight) > 0 || Halt.Equals(false));

            // NOTE(review): a state continuation that has released its slot but not yet
            // started its next report request is not counted, so a late report can still
            // be added after this point. Closing that gap needs a different design.

            // reset initial conditions
            Halt      = false;
            IsRunning = false;

            return(newReports);
        }
Пример #11
0
        /// <summary>
        /// Downloads up to 199 listing pages, opens every advert linked from them, and saves the
        /// description of each advert through <c>Zapisz</c> as a numbered .html file.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">The event data.</param>
        private void Button_Click(object sender, RoutedEventArgs e)
        {
            // NOTE(review): every Load call below is synchronous and runs inside this
            // event handler - confirm whether the work should move off the UI thread.
            Przycisk.Content = "Pobieram...Proszę czekać";
            Console.WriteLine("Pobieram...");
            Przycisk.IsEnabled = false;
            try
            {
                HtmlWeb htmlWeb = new HtmlWeb();
                htmlWeb.OverrideEncoding = Encoding.GetEncoding("ISO-8859-2"); // Polish characters

                // Running number used to name the saved files.
                int zmienna = 1;

                for (int i = 1; i < 200; i++)
                {
                    string       strona       = "http://www.oglaszamy24.pl/ogloszenia/?std=1&keyword=szybka+po%BFyczka+w+domu+klienta&results=" + i.ToString();
                    HtmlDocument htmlDocument = htmlWeb.Load(strona);

                    // Advert links. The query is materialized once because the result is
                    // counted and then iterated.
                    List <string> links = (from node in htmlDocument.DocumentNode.Descendants("a")
                                           where node.Attributes.Contains("href") &&
                                           node.Attributes.Contains("class") &&
                                           node.Attributes.Contains("name") &&
                                           node.Attributes["name"].Value == "alnk" &&
                                           node.Attributes["class"].Value == "o_title" &&
                                           node.ParentNode.Name == "div"
                                           select node.Attributes["href"].Value).ToList();

                    Console.WriteLine("links ma {0} elementow", links.Count);

                    // A page without adverts marks the end of the listing.
                    if (links.Count == 0)
                    {
                        break;
                    }

                    foreach (string nowylink in links)
                    {
                        HtmlDocument htmlDocument2 = htmlWeb.Load(nowylink);

                        // Advert body: the styled div with id "adv_desc" nested in another div.
                        var source = from node in htmlDocument2.DocumentNode.Descendants("div")
                                     where node.Attributes.Contains("id") &&
                                     node.Attributes.Contains("style") &&
                                     node.Attributes["id"].Value == "adv_desc" &&
                                     node.ParentNode.Name == "div"
                                     select node.InnerHtml;

                        foreach (string tresc in source)
                        {
                            string nazwa = zmienna.ToString() + ".html";
                            Zapisz(tresc, nazwa);
                            zmienna++;
                        }
                    }
                    LabelWynik.Content = "Pobrano i zapisano pliki \nw  folderze PlikiTekstowe";
                }
            }
            catch (Exception ex)
            {
                // The label always blames the connection, so keep the real cause visible.
                Console.WriteLine(ex);
                LabelWynik.Content = "Brak połączenia z internetem.";
            }
            Przycisk.Content   = "Pobierz nowe dane";
            Przycisk.IsEnabled = true;
        }
Пример #12
0
        /// <summary>
        /// Parses a search result page and builds one <c>SearchResultObject</c> per result row
        /// that has a name link.
        /// </summary>
        /// <param name="content">The HTML of the search result page.</param>
        /// <returns>The parsed results; empty when the result table has no matching rows.</returns>
        /// <remarks>
        /// For each row whose link contains an id of the form <c>r&lt;digits&gt;</c>, the
        /// <c>comic_pop</c> page is downloaded to read the cover and description. Rows without
        /// such an id keep the id -1 and get no cover or description.
        /// </remarks>
        public List <SearchResultObject> ParseSearch(string content)
        {
            List <SearchResultObject> SearchResults = new List <SearchResultObject>();
            Regex        IdMatch = new Regex(@"r\d+");
            HtmlDocument SearchResultDocument = new HtmlDocument();

            SearchResultDocument.LoadHtml(content);
            HtmlWeb            Web               = new HtmlWeb();
            HtmlNodeCollection HtmlSearchResults = SearchResultDocument.DocumentNode.SelectNodes("//table[contains(@class,'ipb_table chapters_list')]/tbody/tr[not(contains(@class,'header'))]");

            if (HtmlSearchResults == null)
            {
                return(SearchResults);
            }

            foreach (HtmlNode SearchResultNode in HtmlSearchResults)
            {
                HtmlNode NameLink = SearchResultNode.SelectSingleNode(".//td[1]/strong/a");
                if (NameLink == null)
                {
                    continue;
                }

                Int32  Id            = -1;
                String Name          = HtmlEntity.DeEntitize(NameLink.InnerText).Trim(),
                       Link          = NameLink.Attributes["href"].Value,
                       Description   = null;
                LocationObject Cover = null;

                // Only strip the leading 'r' when the pattern was found; an empty match
                // value would make Substring(1) throw. The parsed value goes through a
                // temporary so a failed parse leaves the -1 sentinel in place.
                Match IdResult = IdMatch.Match(Link);
                Int32 ParsedId;
                if (IdResult.Success && Int32.TryParse(IdResult.Value.Substring(1), out ParsedId))
                {
                    Id = ParsedId;

                    HtmlDocument PopDocument     = Web.Load(String.Format("{0}/comic_pop?id={1}", ExtensionDescriptionAttribute.RootUrl, Id));
                    HtmlNode     CoverNode       = PopDocument.DocumentNode.SelectSingleNode("//img"),
                                 DescriptionNode = PopDocument.DocumentNode.SelectSingleNode("//table/tbody/tr[6]/td[2]");
                    if (CoverNode != null)
                    {
                        Cover = new LocationObject()
                        {
                            Url               = CoverNode.Attributes["src"].Value,
                            ExtensionName     = ExtensionDescriptionAttribute.Name,
                            ExtensionLanguage = ExtensionDescriptionAttribute.Language
                        };
                    }
                    if (DescriptionNode != null)
                    {
                        Description = DescriptionNode.InnerText.Trim();
                    }
                }

                // The second cell is used for both the artist and the author list.
                String[] Author_Artists = { SearchResultNode.SelectSingleNode(".//td[2]").InnerText.Trim() };

                // NOTE(review): the rating is parsed from the first four characters of the
                // title attribute using the current culture - confirm the expected format.
                SearchResults.Add(new SearchResultObject()
                {
                    Cover             = Cover,
                    Description       = Description,
                    ExtensionName     = ExtensionDescriptionAttribute.Name,
                    ExtensionLanguage = ExtensionDescriptionAttribute.Language,
                    Name    = Name,
                    Url     = Link,
                    Id      = Id.ToString(),
                    Rating  = Double.Parse(SearchResultNode.SelectSingleNode(".//td[3]/div").Attributes["title"].Value.Substring(0, 4)),
                    Artists = Author_Artists.ToList(),
                    Authors = Author_Artists.ToList()
                });
            }
            return(SearchResults);
        }
    }
Пример #13
0
        /// <summary>
        /// Walks every listing page in <c>docUrls</c>, downloads the linked documents that are not
        /// yet known, extracts their text and queries, and saves everything through
        /// <c>SaveMeetingResultsToSQL</c>.
        /// </summary>
        /// <remarks>
        /// Rows whose date is missing, unparsable, or earlier than <c>dtStartFrom</c> are skipped.
        /// A failed download is reported on the console and processing continues.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            List <Documents>   docs    = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries = this.LoadQueriesDoneSQL();
            HtmlWeb            web     = new HtmlWeb();

            // The client is shared by all downloads and released once the crawl is over.
            using (WebClient c = new WebClient())
            {
                foreach (string url in this.docUrls)
                {
                    HtmlDocument       listDoc     = web.Load(url);
                    HtmlNodeCollection docNodeList = listDoc.DocumentNode.SelectNodes("//div[@id='system']//table//tr[@valign='top']");

                    if (docNodeList == null)
                    {
                        continue;
                    }

                    Console.WriteLine("{0} dates...", docNodeList.Count);

                    foreach (HtmlNode docNode in docNodeList)
                    {
                        // A row without a readable date keeps DateTime.MinValue, which the
                        // start-date check below then rejects.
                        DateTime meetingDate = DateTime.MinValue;
                        HtmlNode dateCell    = docNode.SelectSingleNode("./td");
                        if (dateCell != null)
                        {
                            // Trim ordinary spaces (32) and non-breaking spaces (160).
                            string   dateText = dateCell.InnerText.Trim((char)32, (char)160);
                            DateTime parsedDate;
                            if (!string.IsNullOrEmpty(dateText) && DateTime.TryParse(dateText, out parsedDate))
                            {
                                meetingDate = parsedDate;
                            }
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Earlier than {0}...", this.dtStartFrom);
                            continue;
                        }

                        HtmlNodeCollection pdfNodes = docNode.SelectNodes(".//a[@href]");
                        if (pdfNodes == null)
                        {
                            continue;
                        }

                        Console.WriteLine("{0} files at {1}...", pdfNodes.Count, meetingDate);

                        foreach (HtmlNode pdfNode in pdfNodes)
                        {
                            // Relative links are resolved against the site root.
                            string pdfUrl = pdfNode.Attributes["href"].Value;
                            pdfUrl = pdfUrl.StartsWith("http", StringComparison.Ordinal) ? pdfUrl : "http://www.ci.wayne.mi.us" + pdfUrl;

                            // Video links are not documents.
                            if (pdfUrl.Contains("youtu"))
                            {
                                continue;
                            }

                            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == pdfUrl);

                            if (localdoc == null)
                            {
                                localdoc           = new Documents();
                                localdoc.DocId     = Guid.NewGuid().ToString();
                                localdoc.DocType   = "Council";
                                localdoc.CityId    = this.cityEntity.CityId;
                                localdoc.DocSource = pdfUrl;

                                // Local file name: last path segment of the URL without its query string.
                                string localPath = string.Format("{0}\\{1}", this.localDirectory, pdfUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault());
                                localdoc.DocLocalPath = localPath;

                                if (!File.Exists(localPath))
                                {
                                    try
                                    {
                                        c.DownloadFile(pdfUrl, localPath);
                                    }
                                    catch (Exception ex)
                                    {
                                        // Best effort: keep crawling, but do not hide the failure.
                                        Console.WriteLine("Failed to download {0}: {1}", pdfUrl, ex.Message);
                                    }
                                }

                                docs.Add(localdoc);
                            }
                            else
                            {
                                Console.ForegroundColor = ConsoleColor.Yellow;
                                Console.WriteLine("This document already downloaded...");
                                Console.ResetColor();
                            }

                            this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                            if (qr == null)
                            {
                                qr             = new QueryResult();
                                qr.DocId       = localdoc.DocId;
                                qr.CityId      = localdoc.CityId;
                                qr.MeetingDate = meetingDate;
                                qr.SearchTime  = DateTime.Now;
                                queries.Add(qr);
                            }

                            this.ExtractQueriesFromDoc(localdoc, ref qr);
                            Console.WriteLine("{0} documents saved...", docs.Count);
                            Console.WriteLine("{0} query results saved...", queries.Count);
                        }
                    }
                }
            }

            this.SaveMeetingResultsToSQL(docs, queries);
        }
Пример #14
0
 /// <summary>
 /// Fetches the page addressed by <c>_uri</c> and returns it as a parsed HTML document.
 /// </summary>
 /// <returns>The document produced by <c>HtmlWeb.Load</c> for <c>_uri</c>.</returns>
 public HtmlAgilityPack.HtmlDocument GetDocument()
 {
     var loader = new HtmlWeb();
     return loader.Load(this._uri);
 }
Пример #15
0
        /// <summary>
        /// Scrapes the owned-players table for the given player type, page by page, and
        /// appends every parsed row to <c>players</c>.
        /// </summary>
        /// <remarks>
        /// Column headers are read from the first page only and stored in
        /// <c>skaterHeaders</c> or <c>goalieHeaders</c>, keyed by column index.
        /// Paging stops after <c>NUMBER_OF_PAGES_TO_SCRAPE</c> pages or at the first
        /// page that yields no player rows.
        /// </remarks>
        /// <param name="playerType">Selects the skater or goalie URL format and header dictionary.</param>
        public static void ScrapeOwnedPlayers(PlayerType playerType)
        {
            string URLFormat = playerType == PlayerType.Skater ? URL_OWNED_SKATERS : URL_OWNED_GOALIES;
            string baseURL   = string.Format(URLFormat, LeagueIDCurrent, LeagueYearPrevious);

            for (int pages = 0; pages < NUMBER_OF_PAGES_TO_SCRAPE; pages++)
            {
                // The first page takes no paging parameter; later pages are addressed by row offset.
                string completeURL = pages == 0
                    ? baseURL
                    : baseURL + "&count=" + (pages * numberOfPlayersPerPage);

                // Load page
                HtmlWeb      getHtmlWeb = new HtmlWeb();
                HtmlDocument document   = getHtmlWeb.Load(completeURL);

                // Scrape the table headers (first page only)
                if (pages == 0)
                {
                    var headerNodes = document.DocumentNode.SelectNodes(XPATH_PLAYERS_TABLE_HEADERS);

                    // SelectNodes returns null (not an empty collection) when nothing matches,
                    // so guard before enumerating.
                    if (headerNodes != null)
                    {
                        int headerIndex = 0;

                        Dictionary <int, string> headers = playerType == PlayerType.Skater ? skaterHeaders : goalieHeaders;
                        foreach (HtmlNode headerNode in headerNodes)
                        {
                            string header = headerNode.InnerText;

                            // Erase garbage headers
                            header = header.Replace("&nbsp;", "");
                            header = header.Replace("&#xe002;", "");
                            if (header.Contains("Opp:"))
                            {
                                header = "";
                            }

                            headers.Add(headerIndex++, header);
                        }
                    }
                }

                // Go through the stats for each player on this page
                var playerNodes = document.DocumentNode.SelectNodes(XPATH_PLAYERS_TABLE_PLAYER_ROWS);
                if (playerNodes == null)
                {
                    // No rows on this page: nothing further to scrape.
                    break;
                }

                foreach (HtmlNode playerNode in playerNodes)
                {
                    players.Add(ScrapePlayerNode(playerNode, playerType));
                }
            }
        }
        /// <summary>
        /// Launches ngrok with the configured path and arguments, then polls its local
        /// status page until it responds or the attempt limit is reached.
        /// </summary>
        /// <returns>
        /// <c>true</c> if the status page loaded within <c>_config.AttempstLimit</c> attempts;
        /// <c>false</c> if the process could not be started or the page never loaded.
        /// </returns>
        public async Task <bool> StartNgrok()
        {
            // Pause between failed status checks so the attempt budget is not spent
            // before the freshly started process has had time to open its port.
            const int statusRetryDelayMilliseconds = 250;

            string ngrokPath      = _config.NgrokPath;
            string ngrokArguments = _config.NgrokArguments;

            _logHelper.Log("K43233234KK4333",
                           $"Ngrok path: {ngrokPath} Ngrok arguments: {ngrokArguments}", LogLevel.Information);

            var pInfo = new ProcessStartInfo()
            {
                FileName               = ngrokPath,
                Arguments              = ngrokArguments,
                UseShellExecute        = false,
                CreateNoWindow         = true,
                RedirectStandardOutput = true
            };
            var process = new Process()
            {
                StartInfo = pInfo
            };

            try
            {
                process.Start();
            }
            catch (Exception ex)
            {
                _logHelper.Log("LKJ45LK65BB765",
                               $"Exception occured during Ngrok start.\r\n{ex.Message}", LogLevel.Error);

                return(false);
            }

            _logHelper.Log("HJLK6543NMGFFD", "Seems like Ngrok is started", LogLevel.Information);
            var url = "http://127.0.0.1:4040/status";
            var web = new HtmlWeb();

            bool isSuccessfullyLoaded = false;
            int  attempts             = 0;
            int  attemptsLimit        = _config.AttempstLimit;

            while (!isSuccessfullyLoaded && attempts < attemptsLimit)
            {
                _logHelper.Log("LK6LK4CDD5G645", $"About to check status", LogLevel.Information);
                try
                {
                    // Only the fact that the page loads matters; its content is not inspected.
                    await web.LoadFromWebAsync(url);
                    isSuccessfullyLoaded = true;
                }
                catch (Exception)
                {
                    isSuccessfullyLoaded = false;
                }
                ++attempts;

                if (!isSuccessfullyLoaded && attempts < attemptsLimit)
                {
                    await Task.Delay(statusRetryDelayMilliseconds);
                }
            }
            _logHelper.Log("LK6L5555D5G645", $"Status is {isSuccessfullyLoaded.ToString()}", LogLevel.Warning);


            return(isSuccessfullyLoaded);
        }
Пример #17
0
        /// <summary>
        /// Loads the page at <paramref name="searchText"/> and builds one <c>MetaTagInfo</c>
        /// for every &lt;meta&gt; tag whose <c>content</c> attribute is not blank.
        /// </summary>
        /// <param name="searchText">Address of the page to load (passed to <c>HtmlWeb.LoadFromWebAsync</c>).</param>
        /// <param name="isPageFilterStopWords">
        /// When <c>true</c>, each tag's word list is passed through <c>Util.FilterStopWords</c>
        /// before it is counted and grouped.
        /// </param>
        /// <returns>The collected tag infos; an empty list when the page has no &lt;meta&gt; tags.</returns>
        public async Task <List <MetaTagInfo> > GetAllMetaTagsInfo(string searchText, bool isPageFilterStopWords)
        {
            var listofMetaTagInfo = new List <MetaTagInfo>();

            var webGet   = new HtmlWeb();
            var document = await webGet.LoadFromWebAsync(searchText);

            var metaTags = document.DocumentNode.SelectNodes("//meta");

            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (metaTags == null)
            {
                return(listofMetaTagInfo);
            }

            foreach (var tag in metaTags)
            {
                var metaTagInfo = new MetaTagInfo();

                List <string> listofURL   = new List <string>();
                List <string> listofWords = new List <string>();

                // Missing attributes are recorded as empty strings.
                metaTagInfo.Content   = tag.GetAttributeValue("content", "");
                metaTagInfo.Property  = tag.GetAttributeValue("property", "");
                metaTagInfo.Name      = tag.GetAttributeValue("name", "");
                metaTagInfo.ItemProp  = tag.GetAttributeValue("itemprop", "");
                metaTagInfo.HttpEquiv = tag.GetAttributeValue("http-equiv", "");

                var hrefList = Regex.Replace(metaTagInfo.Content, FilterFormat.GetAllLinks, "$1");

                if (hrefList.ToUpper().Contains("HTTP") || hrefList.Contains("://"))
                {
                    // Content looks like a URL
                    listofURL.Add(hrefList);
                }
                else
                {
                    // Content is free text: split it into words
                    var words = await Task.Run(() => { return(Util.SplitSentenceIntoWords(hrefList.ToLower(), 1)); });

                    listofWords.AddRange(words);
                }

                if (isPageFilterStopWords)
                {
                    // Filter THIS tag's words. The previous code filtered a separate,
                    // always-empty list, so the option had no effect on the result.
                    listofWords = await Util.FilterStopWords(listofWords, Path.Combine(_hostingEnvironment.WebRootPath, Constant.StopWordsPath));
                }

                metaTagInfo.TotalWordCount = listofWords.Count;
                metaTagInfo.URLInfoList    = await Util.GroupListOfString(listofURL);

                metaTagInfo.WordsInfoList = await Util.GroupListOfString(listofWords);

                if (!string.IsNullOrWhiteSpace(metaTagInfo.Content))
                {
                    listofMetaTagInfo.Add(metaTagInfo);
                }
            }

            return(listofMetaTagInfo);
        }
 /// <summary>
 /// Creates a translator for one language pair and prepares the request URL template.
 /// </summary>
 /// <param name="langPair">
 /// Language pair in the form "SL|TL" (source language | target language), e.g. "en|pt".
 /// </param>
 public Translator(string langPair)
 {
     this.web     = new HtmlWeb();
     this.htmlDoc = new HtmlDocument();

     // "{0}" is left in place as the placeholder for the text to translate.
     this.url = string.Concat("http://www.google.com/translate_t?hl=en&ie=UTF8&text={0}&langpair=", langPair);
 }
Пример #19
0
        /// <summary>
        /// Returns response-time measurements for the site at <paramref name="url"/>.
        /// </summary>
        /// <remarks>
        /// If stored records whose URL contains <paramref name="url"/> exist, they are returned
        /// as-is. Otherwise the site is crawled by following anchors whose address contains the
        /// root URL, each discovered page is requested three times, and the minimum and maximum
        /// elapsed times are saved and returned, ordered by maximum time descending.
        /// </remarks>
        /// <param name="url">Root address; "http://" is prepended when it does not contain "http".</param>
        /// <returns>A view over the list of <c>UrlResponseTime</c> entries.</returns>
        public ActionResult Determine(string url)
        {
            List <UrlResponseTime> responses;

            using (var db = new UrlResponseContext())
            {
                // Check whether this URL was already measured. Materialize once so the
                // database is queried a single time instead of once for Count() and once for ToList().
                var responsesFromDB = db.UrlResponseTime
                                      .Where(record => record.Url.Contains(url))
                                      .OrderByDescending(record => record.MaxResponseTime)
                                      .ToList();
                if (responsesFromDB.Count > 0)
                {
                    responses = responsesFromDB;
                }
                else
                {
                    responses = new List <UrlResponseTime>();

                    url = url.Contains("http") ? url : "http://" + url;

                    // Work list seeded with the root URL; child URLs are added as they are found.
                    HashSet <string> links = new HashSet <string>();
                    links.Add(url);
                    HashSet <string> linksAlreadySearched = new HashSet <string>();

                    while (links.Count != 0)
                    {
                        // Take the next pending link
                        var link = links.First();
                        linksAlreadySearched.Add(link);
                        var doc = new HtmlWeb().Load(link);
                        links.Remove(link);

                        // SelectNodes returns null when the page has no matching anchors.
                        var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
                        if (anchors != null)
                        {
                            foreach (HtmlNode anchor in anchors)
                            {
                                string href = anchor.Attributes["href"].Value;
                                var l = href.Contains("http") ? href : url + href;

                                // Queue every new, valid link that stays under the root URL
                                if (!linksAlreadySearched.Contains(l) && !links.Contains(l) && l.Contains(url) && UrlIsValid(l))
                                {
                                    links.Add(l);
                                }
                            }
                        }
                    }

                    // Measure responses: three timed requests per discovered page
                    foreach (var link in linksAlreadySearched)
                    {
                        var responsesTimesList = new List <double>();
                        for (int i = 0; i < 3; i++)
                        {
                            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(link);
                            HttpWebResponse res    = null;

                            Stopwatch timer = new Stopwatch();
                            timer.Start();
                            try
                            {
                                res = (HttpWebResponse)request.GetResponse();
                            }
                            catch (WebException)
                            {
                                // A failed request still contributes its elapsed time.
                            }
                            finally
                            {
                                timer.Stop();
                                responsesTimesList.Add(timer.Elapsed.TotalSeconds);

                                // Release the connection; unclosed responses exhaust the per-host
                                // connection limit and stall the following timed requests.
                                if (res != null)
                                {
                                    res.Close();
                                }
                            }
                        }
                        responses.Add(new UrlResponseTime()
                        {
                            Url             = link,
                            MaxResponseTime = responsesTimesList.Max(),
                            MinResponseTime = responsesTimesList.Min()
                        });
                    }

                    responses = responses.OrderByDescending(link => link.MaxResponseTime).ToList();
                    db.UrlResponseTime.AddRange(responses);
                    db.SaveChanges();
                }

                return(View(responses));
            }
        }
        /// <summary>
        /// Collects local and online NVIDIA driver information.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. Query WMI for an NVIDIA video controller and derive <c>OfflineGPUVersion</c>.
        /// 2. Pick desktop or notebook product IDs and detect DCH drivers from the registry.
        /// 3. Ask the NVIDIA driver lookup endpoint for the matching driver page URL.
        /// 4. Parse that page for <c>OnlineGPUVersion</c>, <c>releaseDate</c>, <c>pdfURL</c>,
        ///    <c>releaseDesc</c>, <c>downloadURL</c> and <c>downloadFileSize</c>.
        /// Failures in steps 1, 3 and 4 are reported to the console and the method continues;
        /// a missing NVIDIA GPU (without <c>ignoreMissingGpu</c>) terminates the process.
        /// </remarks>
        private static void GpuInfo()
        {
            Console.Write("Retrieving GPU information . . . ");
            int    error      = 0;
            string processURL = null;
            string confirmURL = null;
            string gpuURL     = null;
            string gpuName    = null;
            bool   foundGpu   = false;

            // query local driver version
            try {
                foreach (ManagementObject obj in new ManagementObjectSearcher("SELECT * FROM Win32_VideoController").Get())
                {
                    if (obj["Description"].ToString().ToLower().Contains("nvidia"))
                    {
                        gpuName           = obj["Description"].ToString().Trim();
                        OfflineGPUVersion = obj["DriverVersion"].ToString().Replace(".", string.Empty).Substring(5);
                        OfflineGPUVersion = OfflineGPUVersion.Substring(0, 3) + "." + OfflineGPUVersion.Substring(3); // add dot
                        foundGpu          = true;
                        break;
                    }
                    else if (obj["PNPDeviceID"].ToString().ToLower().Contains("ven_10de"))
                    {
                        // Vendor ID matched but the description did not: fall back to a generic
                        // name chosen from the chassis type (3 is treated as desktop here).
                        foreach (ManagementObject obj1 in new ManagementClass("Win32_SystemEnclosure").GetInstances())
                        {
                            foreach (int chassisType in (UInt16[])(obj1["ChassisTypes"]))
                            {
                                gpuName = (chassisType == 3) ? "GTX" : "GTX M";
                            }
                        }

                        foundGpu = true;
                        break;
                    }
                    else     // gpu not found
                    {
                        LogManager.Log(obj["Description"].ToString().Trim() + " is not NVIDIA!", LogManager.Level.INFO);
                    }
                }

                if (!foundGpu)
                {
                    if (ignoreMissingGpu)
                    {
                        gpuName = "GTX";
                    }
                    else
                    {
                        throw new InvalidDataException();
                    }
                }
            } catch (InvalidDataException) {
                Console.Write("ERROR!");
                Console.WriteLine();
                Console.WriteLine("No supported nvidia graphics cards were found, and the application will not continue!");
                if (showUI)
                {
                    Console.ReadKey();
                }
                Environment.Exit(1);
            } catch (Exception ex) {
                error++;
                OfflineGPUVersion = "000.00";
                Console.Write("ERROR!");
                LogManager.Log(ex.ToString(), LogManager.Level.ERROR);
                Console.WriteLine();
                Console.WriteLine(ex.ToString());
            }

            // In order to proceed, we must input what GPU we have.
            // Looking at the supported products on NVIDIA website for desktop and mobile GeForce series,
            // we can see that they're sharing drivers with other GPU families, the only thing we have to do is tell the website
            // if we're running a mobile or desktop GPU.

            int psID = 0, pfID = 0;

            // Get correct gpu drivers:
            // you do not have to choose the exact GPU,
            // looking at supported products, we see that the same driver package includes
            // drivers for the majority GPU family.
            //
            // gpuName is still null when the WMI query above failed with a generic exception;
            // treat that case as desktop instead of crashing on a null dereference.
            if (gpuName != null && gpuName.Contains("M")) // mobile | notebook
            {
                psID = 99;             // GeForce 900M-series (M for Mobile)
                pfID = 758;            // GTX 970M
            }
            else                       // desktop
            {
                psID = 98;             // GeForce 900-series
                pfID = 756;            // GTX 970
            }

            // Check if system requires DCH drivers, thanks to https://github.com/Osspial
            try {
                using (var regKey = Registry.LocalMachine.OpenSubKey(@"SYSTEM\CurrentControlSet\Services\nvlddmkm", false)) {
                    if (regKey != null)
                    {
                        if (regKey.GetValue("DCHUVen") != null)
                        {
                            dchID = 1;
                        }
                    }
                }
            } catch { } // best effort: dchID keeps its current value when the registry cannot be read

            // finish request
            try {
                gpuURL = $"https://www.nvidia.com/Download/processDriver.aspx?psid={psID}&pfid={pfID}&rpf=1&osid={osID}&lid={langID}&dtcid={dchID}&ctk=0";

                // using-blocks release the client and streams even when the read throws.
                using (WebClient client = new WebClient())
                using (Stream stream = client.OpenRead(gpuURL))
                using (StreamReader reader = new StreamReader(stream)) {
                    processURL = reader.ReadToEnd();
                }
            } catch (Exception ex) {
                if (error == 0)
                {
                    Console.Write("ERROR!");
                    Console.WriteLine();
                    error++;
                }
                Console.WriteLine(ex.ToString());
            }

            try
            {
                // HTMLAgilityPack
                // thanks to http://www.codeproject.com/Articles/691119/Html-Agility-Pack-Massive-information-extraction-f for a great article

                var htmlWeb = new HtmlWeb();
                HtmlAgilityPack.HtmlDocument htmlDocument = htmlWeb.Load(processURL);

                // get version
                var tdVer = htmlDocument.DocumentNode.Descendants().SingleOrDefault(x => x.Id == "tdVersion");
                OnlineGPUVersion = tdVer.InnerHtml.Trim().Substring(0, 6);

                // get release date
                var tdReleaseDate = htmlDocument.DocumentNode.Descendants().SingleOrDefault(x => x.Id == "tdReleaseDate");
                var dates         = tdReleaseDate.InnerHtml.Trim();

                // get driver release date
                int status = 0, year = 0, month = 0, day = 0;

                foreach (var substring in dates.Split('.'))
                {
                    status++; // goes up starting from 1, being the year, followed by month then day.
                    switch (status)
                    {
                    // year
                    case 1:
                        year = Convert.ToInt32(substring);
                        break;

                    // month
                    case 2:
                        month = Convert.ToInt32(substring);
                        break;

                    // day
                    case 3:
                        day = Convert.ToInt32(substring);
                        break;

                    default:
                        LogManager.Log($"The status: '{status}' is not a recognized status!", LogManager.Level.ERROR);
                        break;
                    }
                }

                releaseDate = new DateTime(year, month, day);
                IEnumerable <HtmlNode> node = htmlDocument.DocumentNode.Descendants("a").Where(x => x.Attributes.Contains("href"));

                // get driver URL
                foreach (var child in node)
                {
                    if (child.Attributes["href"].Value.Contains("/content/DriverDownload-March2009/"))
                    {
                        confirmURL = "https://www.nvidia.com" + child.Attributes["href"].Value.Trim();
                        break;
                    }
                }

                // get release notes URL
                foreach (var child in node)
                {
                    if (child.Attributes["href"].Value.Contains("release-notes.pdf"))
                    {
                        pdfURL = child.Attributes["href"].Value.Trim();
                        break;
                    }
                }

                if (pdfURL == null)
                {
                    if (psID == 98)   // if desktop
                    {
                        pdfURL = $"https://us.download.nvidia.com/Windows/{OnlineGPUVersion}/{OnlineGPUVersion}-win10-win8-win7-desktop-release-notes.pdf";
                    }
                    else
                    {
                        pdfURL = $"https://us.download.nvidia.com/Windows/{OnlineGPUVersion}/{OnlineGPUVersion}-win10-win8-win7-notebook-release-notes.pdf";
                    }
                    LogManager.Log("No release notes found, but a link to the notes has been crafted by following the template Nvidia uses.", LogManager.Level.INFO);
                }

                // get driver description and show it in HTML
                releaseDesc = htmlDocument.DocumentNode.SelectSingleNode("//div[@id='tab1_content']").InnerHtml.Trim();

                // get download link
                htmlDocument = htmlWeb.Load(confirmURL);
                node         = htmlDocument.DocumentNode.Descendants("a").Where(x => x.Attributes.Contains("href"));

                foreach (var child in node)
                {
                    if (child.Attributes["href"].Value.Contains("download.nvidia"))
                    {
                        downloadURL = child.Attributes["href"].Value.Trim();
                        break;
                    }
                }

                // Drop the first four characters of the scraped link and prepend the
                // configured download location prefix.
                var locationPrefix          = SettingManager.ReadSetting("Download location");
                downloadURL = downloadURL.Substring(4);
                downloadURL = $"https://{locationPrefix}{downloadURL}";

                // get file size
                using (var responce = WebRequest.Create(downloadURL).GetResponse()) {
                    downloadFileSize = responce.ContentLength;
                }
            } catch (Exception ex) {
                OnlineGPUVersion = "000.00";
                LogManager.Log(ex.ToString(), LogManager.Level.ERROR);
                if (error == 0)
                {
                    Console.Write("ERROR!");
                    Console.WriteLine();
                    error++;
                }
                Console.WriteLine(ex.ToString());
            }

            if (error == 0)
            {
                Console.Write("OK!");
                Console.WriteLine();
            }

            if (debug)
            {
                Console.WriteLine($"downloadURL: {downloadURL}");
                Console.WriteLine($"pdfURL:      {pdfURL}");
                Console.WriteLine($"releaseDate: {releaseDate.ToShortDateString()}");
                Console.WriteLine($"downloadFileSize:  {Math.Round((downloadFileSize / 1024f) / 1024f)} MB ({downloadFileSize:N} bytes)");
                Console.WriteLine($"OfflineGPUVersion: {OfflineGPUVersion}");
                Console.WriteLine($"OnlineGPUVersion:  {OnlineGPUVersion}");
            }
        }
Пример #21
0
        /// <summary>
        /// Ensures the page at <paramref name="link"/> is loaded and registered, then registers
        /// a child node for every internal anchor found on it.
        /// </summary>
        /// <param name="page">Loader used to fetch pages that are not cached yet.</param>
        /// <param name="links">Registry of known pages; new pages and anchors are appended to it.</param>
        /// <param name="root">Prefix prepended to <paramref name="link"/> to form the address to load.</param>
        /// <param name="link">Relative link of the page to visit.</param>
        /// <param name="parentNode">Node that receives the page node when the page is first seen.</param>
        /// <param name="rootNode">Root node assigned to every node created here.</param>
        /// <returns>The node associated with the visited page.</returns>
        private BaseNode Visit(HtmlWeb page, List <DnpPage> links, string root, string link, BaseNode parentNode, BaseNode rootNode)
        {
            HtmlNode htmlnode = null;
            var      dnp1     = links.FirstOrDefault(m => m.Link == link);

            if (dnp1 == null)
            {
                // First time this link is seen: load it and create its page node.
                var document = page.Load(root + link);
                htmlnode = document.DocumentNode;

                dnp1       = new DnpPage();
                dnp1.Link  = link;
                dnp1.Html  = htmlnode.OuterHtml;
                dnp1.PLink = link;
                links.Add(dnp1);

                BaseNode myNode = new PageNode(htmlnode);
                myNode.LoadedData = true;
                myNode.Url        = link;
                if (!string.IsNullOrEmpty(link))
                {
                    myNode.Title = link;
                }
                else
                {
                    myNode.Title = "Please choose the example:";
                }
                parentNode.Children.Add(myNode);
                myNode.ParentNode = parentNode;
                myNode.RootNode   = rootNode;

                dnp1.Node = myNode;

                GenerateSourceCodeNodes(htmlnode, myNode, rootNode);
            }
            else
            {
                if (string.IsNullOrEmpty(dnp1.Html))
                {
                    // Known link without cached markup: fetch it now.
                    var document = page.Load(root + link);
                    htmlnode           = document.DocumentNode;
                    dnp1.Html          = htmlnode.OuterHtml;
                    dnp1.Node.SPObject = htmlnode;
                }
                else
                {
                    // Known link with cached markup: re-parse instead of downloading again.
                    var document = new HtmlDocument();
                    document.LoadHtml(dnp1.Html);
                    htmlnode           = document.DocumentNode;
                    dnp1.Node.SPObject = htmlnode;
                }
            }

            // SelectNodes returns null when the page has no anchors; OrderBy on null would throw.
            var anchorNodes = htmlnode.SelectNodes("//a");
            if (anchorNodes == null)
            {
                return(dnp1.Node);
            }

            var htmla = anchorNodes.OrderBy(m => m.InnerText).ToList();

            foreach (var a in htmla)
            {
                if (a.Attributes.Contains("href"))
                {
                    var  href   = a.Attributes["href"].Value;
                    bool exists = false;
                    foreach (string s in notcsharp)
                    {
                        if (href.EndsWith(s))
                        {
                            exists = true;
                            break;
                        }
                    }

                    foreach (string s in skipcsharppages)
                    {
                        if (href.Equals(s))
                        {
                            exists = true;
                            break;
                        }
                    }


                    //skip external links, skip "s" link
                    if (href.Contains("https://") || href.Contains("http://") || href == "s")
                    {
                        exists = true;
                    }

                    if (!exists)
                    {
                        // Single lookup: reuse the registered page or register a new one.
                        DnpPage dnp = links.FirstOrDefault(m => m.Link == href);
                        if (dnp == null)
                        {
                            dnp       = new DnpPage();
                            dnp.Link  = href;
                            dnp.PLink = link;
                            links.Add(dnp);
                        }

                        if (dnp.Node == null)
                        {
                            // Placeholder node; its content is loaded when the link itself is visited.
                            BaseNode myNode = new PageNode();
                            dnp.Node     = myNode;
                            myNode.Title = href;
                            myNode.Url   = href;
                            dnp1.Node.Children.Add(myNode);
                            myNode.ParentNode = dnp1.Node;
                            myNode.RootNode   = rootNode;
                        }
                    }
                }
            }
            return(dnp1.Node);
        }
Пример #22
0
        public List <ListTemp> scrap(Shelter_ID.id shelter_id)
        {
            HtmlWeb         web_page     = new HtmlWeb();
            List <ListTemp> listTemp     = new List <ListTemp>();
            DateTime        today        = DateTime.Today;
            string          _name        = "";
            string          _description = "";
            string          _breed       = "";
            string          _gender      = "";
            string          _age         = "";
            string          _weight      = "";
            DateTime        _dateStart   = DateTime.Today;



            string url = @"http://www.napaluchu.waw.pl/czekam_na_ciebie/wszystkie_zwierzeta_do_adopcji:1";

            var  doc     = web_page.Load(url);
            int  number  = 1;
            bool allList = false;

            while (!allList)
            {
                var nextPage = doc.DocumentNode.SelectNodes("//div[@class = 'pagination']/a[@class = 'next']");
                number++;
                url = "http://www.napaluchu.waw.pl/czekam_na_ciebie/wszystkie_zwierzeta_do_adopcji:" + number;

                var animal_link = doc.DocumentNode.SelectNodes("//a[@class = 'animals_btn_list_more']/@href").Select(q => q.GetAttributeValue("href", null)).ToList();

                for (int i = 0; i < animal_link.Count(); i++)
                {
                    List <byte[]> _photo     = new List <byte[]>();
                    var           animal_doc = web_page.Load(@"http://www.napaluchu.waw.pl" + animal_link[i]);

                    //--INFO
                    var nodeInfo = animal_doc.DocumentNode.SelectNodes("//div[@class = 'info']")[0].InnerText.Replace("\r", "").Replace("\n", "").Trim();
                    nodeInfo = HtmlEntity.DeEntitize(nodeInfo).Trim();
                    var tempInfo = nodeInfo.Split(':');

                    for (int t = 0; t < tempInfo.Count(); t++)
                    {
                        if (tempInfo[t].Contains("Gatunek"))
                        {
                            _name = tempInfo[t].Replace("Gatunek", "").Trim();
                        }
                        if (tempInfo[t].Contains("Płeć"))
                        {
                            _breed = tempInfo[t].Replace("Płeć", "").Trim();
                        }
                        if (tempInfo[t].Contains("Wiek"))
                        {
                            _gender = tempInfo[t].Replace("Wiek", "").Trim();
                        }
                        if (tempInfo[t].Contains("Waga"))
                        {
                            _age = tempInfo[t].Replace("Waga", "").Replace("lat", "").Replace("rok", "").Trim();
                        }
                        if (tempInfo[t].Contains("Data przyjęcia"))
                        {
                            _weight = tempInfo[t].Replace("Data przyjęcia", "").Trim();
                        }
                        if (tempInfo[t].Contains("ewidencyjny"))
                        {
                            var year  = int.Parse(tempInfo[t].Replace("Nr ewidencyjny", "").Trim().Split('.')[2]);
                            var month = int.Parse(tempInfo[t].Replace("Nr ewidencyjny", "").Trim().Split('.')[1]);
                            var day   = int.Parse(tempInfo[t].Replace("Nr ewidencyjny", "").Trim().Split('.')[0]);
                            _dateStart = new DateTime(year, month, day);
                        }
                    }//--INFO

                    //--Description
                    _description = "";

                    var nodeDescription = animal_doc.DocumentNode.SelectNodes("//div[@class = 'description']").Select(q => q.InnerText).ToList();

                    for (int d = 0; d < nodeDescription.Count(); d++)
                    {
                        _description += nodeDescription[d];
                    }
                    _description = HtmlEntity.DeEntitize(_description).Replace("\r", " ").Replace("\n", "").Trim();
                    //--Description

                    //--Photo
                    var node_Photo = animal_doc.DocumentNode.SelectNodes("//div[@class = 'ani_images']/div[@class = 'ani_image_bottom']/a");
                    if (node_Photo != null)
                    {
                        var nodePhoto = node_Photo.Select(q => q.GetAttributeValue("href", null)).ToList();
                        var photoLink = @"http://www.napaluchu.waw.pl";

                        for (int p = 0; p < nodePhoto.Count(); p++)
                        {
                            using (var client = new WebClient())
                            {
                                _photo.Add(client.DownloadData(photoLink + nodePhoto[p]));
                            }
                            if (p == 4)
                            {
                                break;
                            }
                        }
                    }

                    //--Photo

                    listTemp.Add(new ListTemp()
                    {
                        name        = _name,
                        breed       = _breed,
                        gender      = _gender,
                        age         = _age,
                        weight      = _weight,
                        description = _description,
                        dateStart   = _dateStart,
                        shelter_ID  = shelter_id.ID,
                        photo       = _photo,
                    });
                }

                doc = web_page.Load(url);
                if (nextPage == null)
                {
                    allList = true;
                }
            }


            return(listTemp);
        }
Пример #23
0
        /// <summary>
        /// Downloads the creonline.com listing page for the given state and parses it into
        /// <see cref="Cre"/> entries.
        /// </summary>
        /// <remarks>
        /// Only the descendants of the <c>p</c> elements inside the first <c>td[valign='top']</c>
        /// cell are inspected. Every <c>img</c> element starts a new entry; the bold name, links and
        /// "Telephone:" / "Where:" / "Contact:" text nodes that follow are attached to that entry
        /// until the next <c>img</c> is reached.
        /// </remarks>
        /// <param name="state">State page name; substituted into the listing URL.</param>
        /// <returns>
        /// The parsed entries (empty when the page contains no entries), or <c>null</c> when the
        /// site answers with its "page not found" text.
        /// </returns>
        private static List<Cre> ParseState(string state)
        {
            string url = String.Format("http://www.creonline.com/{0}.html", state);

            List<HtmlNode> lstNodes = new List<HtmlNode>();
            List<Cre>      lstCre   = new List<Cre>();

            // Ordinal, case-insensitive name test; avoids culture-dependent ToLower() results.
            Func<HtmlNode, string, bool> isNamed =
                (n, name) => String.Equals(n.Name, name, StringComparison.OrdinalIgnoreCase);

            var web      = new HtmlWeb();
            var document = web.Load(url);
            var page     = document.DocumentNode;

            if (page.InnerHtml.Contains("Sorry but we can't find that page!"))
            {
                return null;
            }

            var tdNode = page.QuerySelector("td[valign='top']");
            if (tdNode != null)
            {
                foreach (var pitem in tdNode.QuerySelectorAll("p"))
                {
                    // Keep everything except <br> and whitespace-only text nodes.
                    lstNodes.AddRange(pitem.Descendants()
                                      .Where(e => !isNamed(e, "br"))
                                      .Where(e => !(isNamed(e, "#text") && String.IsNullOrWhiteSpace(e.InnerHtml))));
                }
            }

            // Entries begin at the first image. A page without any image has no entries, so return
            // the empty list instead of letting First() throw.
            int firstImgIndex = lstNodes.FindIndex(e => isNamed(e, "img"));
            if (firstImgIndex < 0)
            {
                return lstCre;
            }

            Cre currentCre = null;
            for (int i = firstImgIndex; i < lstNodes.Count; i++)
            {
                var node = lstNodes[i];

                if (isNamed(node, "img"))
                {
                    // The node at firstImgIndex is an img, so currentCre is set before first use.
                    currentCre = new Cre();
                    lstCre.Add(currentCre);
                    continue;
                }

                if ((isNamed(node, "b") || isNamed(node, "strong")) && node.FirstChild != null)
                {
                    if (isNamed(node.FirstChild, "#text"))
                    {
                        currentCre.Name = node.InnerHtml;
                    }
                    if (isNamed(node.FirstChild, "a"))
                    {
                        currentCre.Name = node.FirstChild.InnerHtml;
                    }
                }

                if (isNamed(node, "a"))
                {
                    // Only absolute hrefs are considered; mailto links carry the contact details,
                    // anything else is treated as the website.
                    var href = node.GetAttributeValue("href", "");
                    Uri result;
                    if (Uri.TryCreate(href, UriKind.Absolute, out result))
                    {
                        if (result.AbsoluteUri.StartsWith("mailto", StringComparison.OrdinalIgnoreCase))
                        {
                            currentCre.EmailAddress  = result.AbsoluteUri;
                            currentCre.ContactPerson = node.InnerHtml;
                        }
                        else
                        {
                            currentCre.Website = node.InnerHtml;
                        }
                    }
                }

                if (isNamed(node, "#text"))
                {
                    string trimmed = node.InnerHtml.Trim();

                    if (trimmed.StartsWith("Telephone:", StringComparison.Ordinal))
                    {
                        currentCre.Phone = node.InnerHtml.Replace("Telephone:", "").Trim();
                    }
                    if (trimmed.StartsWith("Where:", StringComparison.Ordinal))
                    {
                        currentCre.Address = HttpUtility.HtmlDecode(node.InnerHtml.Replace("Where:", "").Trim());
                    }
                    if (trimmed.StartsWith("Contact:", StringComparison.Ordinal))
                    {
                        // A contact name taken from a mailto link wins over the plain-text one.
                        var contact = node.InnerHtml.Replace("Contact:", "");
                        if (!String.IsNullOrWhiteSpace(contact) && String.IsNullOrWhiteSpace(currentCre.ContactPerson))
                        {
                            currentCre.ContactPerson = contact.Trim();
                        }
                    }
                }
            }

            // Drop the first entry whose name is just the "click here" link text.
            var invalidCre = lstCre.FirstOrDefault(e => !String.IsNullOrWhiteSpace(e.Name) &&
                                                        String.Equals(e.Name.Trim(), "click here", StringComparison.OrdinalIgnoreCase));
            if (invalidCre != null)
            {
                lstCre.Remove(invalidCre);
            }

            return lstCre;
        }
Пример #24
0
        /// <summary>
        /// Runs the spidering processing: drains the URI queue, starting an asynchronous download
        /// for every media node with an accepted extension and expanding every page node (up to
        /// the maximum depth) into further queued links, then blocks until all started downloads
        /// have completed.
        /// </summary>
        public void Run()
        {
            // Downloads started but not yet completed. Completion callbacks run on other threads,
            // so every access goes through Interlocked / Volatile.
            int activeDownloads = 0;

            while (uris.Count > 0)
            {
                // TODO: multithread
                Node node = uris.Dequeue();

                if (node.Target != SrcType.Media)
                {
                    if (node.Depth <= maxDepth)
                    {
                        int     nextDepth = node.Depth + 1;
                        HtmlWeb web       = new HtmlWeb();
                        try
                        {
                            HtmlDocument doc = web.Load(node.Uri);
                            foreach (Node link in GetLinks(doc, node.Uri, nextDepth))
                            {
                                Console.WriteLine($"{nextDepth} - {link.Target.ToString()} - {link.Uri}");
                                uris.Enqueue(link);
                            }
                        }
                        catch (Exception)
                        {
                            // Best effort: a page that cannot be loaded or parsed is skipped.
                        }
                    }
                    continue;
                }

                string filename = Path.GetFileName(node.Uri.LocalPath);
                if (string.IsNullOrWhiteSpace(filename))
                {
                    continue;
                }

                string extension = GetFileExtension(filename);
                if (!extensions.Contains(extension))
                {
                    continue;
                }

                // Check the file where it is actually written (under the output directory), not a
                // same-named file in the current working directory.
                string   destination = Path.Combine(output, filename);
                FileInfo info        = new FileInfo(destination);
                if (info.Exists && info.Length > 0)
                {
                    continue;
                }

                // TODO: Setup client with headers from root page load plus browser string etc.
                WebClient client = new WebClient();
                Interlocked.Increment(ref activeDownloads);
                client.DownloadFileCompleted += (sender, completed) =>
                {
                    Interlocked.Decrement(ref activeDownloads);
                    ((WebClient)sender).Dispose();
                };

                bool started = false;
                try
                {
                    client.DownloadFileAsync(node.Uri, destination);
                    started = true;
                }
                catch (Exception)
                {
                    // Retry file: URIs over https, rewriting only the scheme so that "file"
                    // appearing elsewhere in the URI is left intact.
                    string uriText = node.Uri.ToString();
                    if (uriText.StartsWith("file:", StringComparison.Ordinal))
                    {
                        uriText = "https" + uriText.Substring("file".Length);
                        try
                        {
                            client.DownloadFileAsync(new Uri(uriText), destination);
                            started = true;
                        }
                        catch (Exception)
                        {
                            // Best effort: the download is abandoned.
                        }
                    }
                }

                if (!started)
                {
                    // No completion event will fire for a download that never started; undo the
                    // count here, otherwise the wait loop below would never terminate.
                    Interlocked.Decrement(ref activeDownloads);
                    client.Dispose();
                }
            }

            while (Volatile.Read(ref activeDownloads) > 0)
            {
                Thread.Sleep(1000);
            }
        }
Пример #25
0
        /// <summary>
        /// Loads the product page at <paramref name="url"/> and extracts the cover image URL,
        /// title, price and description into a <see cref="BookModels"/> instance.
        /// </summary>
        /// <param name="url">Address of the product page; also stored as <c>AmazonUrl</c>.</param>
        /// <returns>
        /// A model whose fields are filled for every element found on the page; fields whose
        /// element is missing are left at their defaults.
        /// </returns>
        /// <exception cref="System.FormatException">The price text is not a valid number.</exception>
        public static BookModels GetBookDetails(string url)
        {
            var book = new BookModels();

            book.AmazonUrl = url;

            var webGet  = new HtmlWeb();
            var htmlDoc = webGet.Load(url);

            // NOTE(review): this option is set after Load has already parsed the page, so it
            // presumably has no effect on the parsed tree - confirm against HtmlAgilityPack.
            htmlDoc.OptionFixNestedTags = true;

            if (htmlDoc.DocumentNode == null)
            {
                return book;
            }

            HtmlAgilityPack.HtmlNode bodyNode = htmlDoc.DocumentNode.SelectSingleNode("//body");
            if (bodyNode == null)
            {
                return book;
            }

            // Two known page layouts use different ids for the cover image.
            var image = bodyNode.SelectSingleNode("//img[@id='main-image']")
                        ?? bodyNode.SelectSingleNode("//img[@id='imgBlkFront']");

            if (image != null)
            {
                string src = image.GetAttributeValue("src", string.Empty);
                if (!string.IsNullOrEmpty(src))
                {
                    // Cut the URL at the first "._" and append ".jpg"; keep the URL unchanged when
                    // that marker is absent instead of throwing from Substring.
                    int markerIndex = src.IndexOf("._", StringComparison.Ordinal);
                    book.CoverUrl = markerIndex >= 0 ? src.Substring(0, markerIndex) + ".jpg" : src;
                }
            }

            book.Title = bodyNode.Descendants("span")
                         .Where(x => x.Id == "btAsinTitle")
                         .Select(s => s.InnerText)
                         .FirstOrDefault();

            if (string.IsNullOrEmpty(book.Title))
            {
                book.Title = bodyNode.Descendants("h1")
                             .Where(x => x.Id == "title")
                             .Select(s => s.InnerText)
                             .FirstOrDefault();
            }

            var price = bodyNode.SelectSingleNode("//b[@class='priceLarge']");
            if (price != null)
            {
                // The page prints the price with a '.' decimal separator, so parse it with the
                // invariant culture rather than whatever culture the process runs under.
                string priceText = price.InnerText.Trim().Replace("$", string.Empty).Replace("\n", string.Empty);
                book.Price = Convert.ToDecimal(priceText, System.Globalization.CultureInfo.InvariantCulture);
            }

            var descriptionNode = bodyNode.SelectSingleNode("//div[@id='postBodyPS']");
            if (descriptionNode != null)
            {
                book.Description = descriptionNode.InnerText;
            }

            return book;
        }
Пример #26
0
 /// <summary>
 /// Crawls the passed in URL for all html links and adds the eligible ones to the html queue.
 /// For bleacherreport.com / cnn.com pages it also records the URL in the "newest" queue, stores
 /// one table row per word of the page title and increments the stored row count.
 /// Any exception marks the URL as visited and reports it on the broken-URL queue.
 /// </summary>
 /// <param name="htmlurl">The URL that you wish to crawl</param>
 public void crawlUrl(string htmlurl)
 {
     try {
         HtmlWeb      web = new HtmlWeb();
         HtmlDocument doc = web.Load(htmlurl);

         //Check the webpage for any errors (such as 404)
         if (web.StatusCode != HttpStatusCode.OK)
         {
             return;
         }
         if (!htmlurl.StartsWith("http://bleacherreport.com", StringComparison.Ordinal) &&
             !htmlurl.StartsWith("http://www.cnn.com", StringComparison.Ordinal))
         {
             return;
         }

         string pageTitle   = "No page title";
         string pagePubDate = "No pub date found";

         if (doc != null && doc.DocumentNode != null)
         {
             // Keep the defaults above when the page has no <title> or pubdate meta tag.
             HtmlNode titleNode = doc.DocumentNode.SelectSingleNode("//head/title");
             if (titleNode != null)
             {
                 pageTitle = titleNode.InnerText;
             }
             HtmlNode pubDateNode = doc.DocumentNode.SelectSingleNode("//meta[@name='pubdate']");
             if (pubDateNode != null)
             {
                 pagePubDate = pubDateNode.GetAttributeValue("content", string.Empty);
             }

             // SelectNodes returns null (not an empty collection) when nothing matches.
             HtmlNodeCollection links = doc.DocumentNode.SelectNodes("//a[@href]");
             if (links != null)
             {
                 foreach (HtmlNode link in links)
                 {
                     string hrefValue = link.GetAttributeValue("href", string.Empty);
                     string newUrl    = hrefValue.StartsWith("http://bleacherreport.com", StringComparison.Ordinal) && !disallowedLink(hrefValue)
                                        ? hrefValue
                                        : getCrawledUrl(htmlurl, hrefValue);

                     // Short-circuit on null so the later checks never see a null URL.
                     if (newUrl != null && !visitedUrls.Contains(newUrl) &&
                         (newUrl.StartsWith("http://bleacherreport.com/articles", StringComparison.Ordinal) ||
                          newUrl.StartsWith("http://www.cnn.com", StringComparison.Ordinal)))
                     {
                         visitedUrls.Add(newUrl);
                         CloudQueueMessage newhtmlurl = new CloudQueueMessage(newUrl);
                         htmlqueue.AddMessage(newhtmlurl);
                     }
                 }
             }

             //Add the current crawled htmlurl to the queue of 10 recently added urls
             CloudQueueMessage newestUrl = new CloudQueueMessage(htmlurl);
             newestqueue.FetchAttributes();
             int queueLength = (int)newestqueue.ApproximateMessageCount;
             if (queueLength >= 10)
             {
                 // Queue is full: remove one message before adding the new one.
                 CloudQueueMessage lastMessage = newestqueue.GetMessage();
                 if (lastMessage != null)
                 {
                     newestqueue.DeleteMessage(lastMessage);
                 }
             }
             newestqueue.AddMessage(newestUrl);

             //Parse the title and add each filtered word to the table
             foreach (string word in pageTitle.Split(' '))
             {
                 string cleanWord = storage.RemoveSpecialCharacters(word);
                 if (cleanWord != "")
                 {
                     string         urlHash         = storage.calculateHash(htmlurl.ToLower());
                     urlNode        tableUrl        = new urlNode(cleanWord.ToLower(), urlHash, htmlurl, pageTitle, pagePubDate);
                     TableOperation insertOperation = TableOperation.Insert(tableUrl);
                     urltable.Execute(insertOperation);
                 }
             }
         }

         //Update table row count entity
         TableOperation getRowCount       = TableOperation.Retrieve<rowCount>("rowCount", "totalRows");
         TableResult    retrievedRowCount = stattable.Execute(getRowCount);
         rowCount       newRowCount       = (rowCount)retrievedRowCount.Result;
         if (newRowCount != null)
         {
             newRowCount.count = newRowCount.count + 1;
             TableOperation updateRowCount = TableOperation.Replace(newRowCount);
             stattable.Execute(updateRowCount);
         }
     } catch (Exception e) {
         visitedUrls.Add(htmlurl);
         string            errorMessage = "URL: " + htmlurl + " Error: " + e;
         CloudQueueMessage badurl       = new CloudQueueMessage(errorMessage);
         brokenqueue.AddMessage(badurl);
     }
 }
Пример #27
0
        /// <summary>
        /// Loads a single house listing page, copies the labelled values of its definition list
        /// and its feature list into a <see cref="HouseData"/>, adds that record to
        /// <c>houseData</c> and prints a summary to the console.
        /// </summary>
        /// <param name="houseLink">Address of the listing page.</param>
        /// <param name="house">
        /// Record to fill; a new one is created when <c>null</c>.
        /// </param>
        /// <remarks>
        /// Returns without adding anything when the page lacks the content container, the
        /// definition list (or its <c>dt</c>/<c>dd</c> items) or the feature list.
        /// </remarks>
        public void ParseHousePage(string houseLink, HouseData house = null)
        {
            HtmlWeb      web = new HtmlWeb();
            HtmlDocument doc = web.Load(houseLink);

            if (doc == null)
            {
                return;
            }

            HtmlNode content = doc.DocumentNode.SelectSingleNode("//div[@class='content-container']");
            if (content == null)
            {
                return;
            }

            // SelectSingleNode / SelectNodes return null when nothing matches, so each lookup is
            // checked before it is dereferenced.
            HtmlNode dataList = content.SelectSingleNode(".//dl");
            if (dataList == null)
            {
                return;
            }

            HtmlNodeCollection labels = dataList.SelectNodes("dt");
            HtmlNodeCollection values = dataList.SelectNodes("dd");
            if (labels == null || values == null)
            {
                return;
            }

            HtmlNode           featureList = content.SelectSingleNode(".//ul[@class='features']");
            HtmlNodeCollection features    = featureList == null ? null : featureList.SelectNodes("li");
            if (features == null)
            {
                return;
            }

            string neighbourhoodText = "";
            string postalcodeText    = "";
            string streetText        = "";
            string areaText          = "";
            string priceText         = "";
            string availableText     = "";
            string bedroomsText      = "";
            string placedText        = "";

            if (house == null)
            {
                house = new HouseData();
            }
            house.city = city;

            // Labels and values are paired by position; stop at the shorter list so that a
            // missing <dd> cannot cause an out-of-range access.
            int pairCount = Math.Min(labels.Count, values.Count);
            for (int l = 0; l < pairCount; l++)
            {
                string labelText = labels[l].InnerText.Trim();
                string valueText = values[l].InnerText;

                switch (labelText)
                {
                case "Buurt":
                    neighbourhoodText   = valueText;
                    house.neighbourhood = neighbourhoodText;
                    break;

                case "Postcode":
                    postalcodeText   = valueText;
                    house.postalCode = postalcodeText;
                    break;

                case "Straat":
                    streetText   = valueText;
                    house.street = streetText;
                    break;

                case "Oppervlakte (m²)":
                    areaText   = valueText.Replace("&sup2;", "2");  // "²" entity
                    house.area = areaText;
                    break;

                case "Huurprijs per maand":
                    priceText   = valueText.Replace("€", "").Replace(",-", " euro").Replace(".", "").Trim();
                    house.price = priceText;
                    break;

                case "Beschikbaar per":
                    availableText   = valueText;
                    house.available = availableText;
                    break;

                case "Aantal slaapkamers":
                    bedroomsText   = valueText;
                    house.bedrooms = bedroomsText;
                    break;

                case "Aangeboden sinds":
                    placedText   = valueText.Replace("&gt;", "meer dan");   // ">" entity
                    house.placed = placedText;
                    break;
                }
            }

            // Each feature is wrapped in brackets and the results are joined without a separator.
            string extraText = string.Concat(features.Select(f => "[" + f.InnerText.Trim() + "]"));

            house.extra = extraText;
            houseData.Add(house);

            Console.WriteLine(streetText + ", " + postalcodeText + ", " + neighbourhoodText + "\n" +
                              priceText + " voor " + areaText + " met " + bedroomsText + " slaapkamer(s)\n" +
                              "Beschikbaar vanaf " + availableText + ", aangeboden sinds " + placedText + "\n" +
                              "Extra info: " + extraText);
        }
Пример #28
0
        /// <summary>
        /// Walks every configured category page, downloads the agenda (and, when present, the
        /// minutes) PDF of each meeting row dated on or after <c>dtStartFrom</c>, extracts text and
        /// queries from each file, and saves the accumulated results after each category.
        /// </summary>
        /// <remarks>
        /// Each entry of <c>docUrls</c> has the form <c>category*url</c>. A row that cannot be
        /// processed is logged to the console and skipped; it does not stop the run.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            HtmlWeb           web     = new HtmlWeb();
            List<Documents>   docs    = this.LoadDocumentsDoneSQL();
            List<QueryResult> queries = this.LoadQueriesDoneSQL();
            Regex dateReg = new Regex("[A-Za-z]+[\\s]{0,1}[0-9]{1,2},[\\s]{0,1}[0-9]{4}");

            // One client is reused for all downloads and released when the run ends.
            using (WebClient c = new WebClient())
            {
                foreach (string url in this.docUrls)
                {
                    string[]           urlParts    = url.Split('*');
                    string             category    = urlParts[0];
                    string             categoryUrl = urlParts[1];
                    HtmlDocument       listDoc     = web.Load(categoryUrl);
                    HtmlNodeCollection recordNodes = listDoc.DocumentNode.SelectNodes("//table/tbody/tr[@class='catAgendaRow']");

                    if (recordNodes == null || recordNodes.Count == 0)
                    {
                        continue;
                    }

                    foreach (HtmlNode recordNode in recordNodes)
                    {
                        try
                        {
                            // The <strong> element carries the meeting date, which is required for
                            // the cut-off check and the local file name. The original dereferenced
                            // it before its null check, so a row without it was always skipped via
                            // the catch below; the skip is now explicit.
                            HtmlNode dateNode = recordNode.SelectSingleNode(".//strong");
                            if (dateNode == null)
                            {
                                Console.WriteLine("No meeting date found, skip...");
                                Console.WriteLine("DATA: {0}", recordNode.InnerHtml);
                                continue;
                            }

                            // The pattern matches English "Month d, yyyy" text, so parse it
                            // independently of the machine's current culture.
                            string   dateText    = dateReg.Match(dateNode.InnerText).ToString();
                            DateTime meetingDate = DateTime.Parse(dateText, System.Globalization.CultureInfo.InvariantCulture);
                            if (meetingDate < this.dtStartFrom)
                            {
                                Console.WriteLine("Too early, skip...");
                                continue;
                            }

                            // The agenda link is the element that wraps the date.
                            HtmlNode agendaNode = dateNode.ParentNode;
                            string   agendaUrl  = agendaNode.Attributes["href"].Value;
                            agendaUrl = agendaUrl.StartsWith("http") ? agendaUrl : this.cityEntity.CityUrl + agendaUrl;

                            // SelectNodes returns null when the row has no ViewFile links.
                            HtmlNodeCollection viewFileLinks = recordNode.SelectNodes(".//a[contains(@href,'ViewFile')]");
                            HtmlNode minuteNode = viewFileLinks == null
                                                  ? null
                                                  : viewFileLinks.FirstOrDefault(t => t.Attributes["href"].Value.ToLower().Contains("minutes"));
                            string minuteUrl = minuteNode == null ? string.Empty : minuteNode.Attributes["href"].Value;

                            List<string> fileUrls = new List<string>();
                            fileUrls.Add(agendaUrl);
                            if (!string.IsNullOrEmpty(minuteUrl))
                            {
                                minuteUrl = minuteUrl.StartsWith("http") ? minuteUrl : this.cityEntity.CityUrl + minuteUrl;
                                fileUrls.Add(minuteUrl);
                            }

                            foreach (string fileUrl in fileUrls)
                            {
                                Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);
                                string    tag      = fileUrl.ToLower().Contains("minute") ? "minute" : "agenda";

                                if (localdoc == null)
                                {
                                    localdoc           = new Documents();
                                    localdoc.CityId    = this.cityEntity.CityId;
                                    localdoc.Checked   = false;
                                    localdoc.DocId     = Guid.NewGuid().ToString();
                                    localdoc.DocSource = fileUrl;
                                    localdoc.DocType   = category;
                                    string localFileName = string.Format("{0}\\{1}_{2}_{3}.pdf",
                                                                         this.localDirectory,
                                                                         category,
                                                                         meetingDate.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture),
                                                                         tag);
                                    try
                                    {
                                        c.DownloadFile(fileUrl, localFileName);
                                    }
                                    catch (Exception downloadEx)
                                    {
                                        // Best effort: the document is still recorded, but the
                                        // failure is reported instead of being silently dropped.
                                        Console.WriteLine("Download failed for {0}: {1}", fileUrl, downloadEx.Message);
                                    }

                                    localdoc.DocLocalPath = localFileName;
                                    docs.Add(localdoc);
                                }
                                else
                                {
                                    Console.ForegroundColor = ConsoleColor.Yellow;
                                    Console.WriteLine("File already downloaded....");
                                    Console.ResetColor();
                                }

                                this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                                if (qr == null)
                                {
                                    qr             = new QueryResult();
                                    qr.CityId      = this.cityEntity.CityId;
                                    qr.DocId       = localdoc.DocId;
                                    qr.MeetingDate = meetingDate;
                                    qr.SearchTime  = DateTime.Now;

                                    queries.Add(qr);
                                }

                                this.ExtractQueriesFromDoc(localdoc, ref qr);
                                Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
                            }
                        }
                        catch (Exception ex)
                        {
                            Console.WriteLine("DEBUG EXCEPTION:{0}", ex.ToString());
                            Console.WriteLine("DATA: {0}", recordNode.InnerHtml);
                        }
                    }

                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
Пример #29
0
        /// <summary>
        /// Scrapes the specification table of each listed Tissot article, writes one spreadsheet
        /// column per article to <c>D:\file\result.xlsx</c>, then downloads the product images.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string[] arts = { "t1064071605100", "t1166171603700" };

            // One list of "label : value" lines per article.
            List<List<string>> Nodes = new List<List<string>>();

            foreach (string art in arts)
            {
                var html = @"https://www.tissotwatches.com/ru-ru/shop/" + art + ".html";

                HtmlWeb web = new HtmlWeb();

                var htmlDoc = web.Load(html);

                var htmlNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class='specs-table-1024']");
                var Articul   = new List<string>();

                // SelectNodes returns null when the page has no specification table; the article
                // then contributes an empty column instead of crashing the run.
                if (htmlNodes != null)
                {
                    foreach (var node in htmlNodes)
                    {
                        // Strip the table markup and use '#' as the row separator.
                        var rows = node.InnerHtml.ToString().Replace("<table class=\"specs-table\">", "").Replace("\t", "").Replace("\n", "").Replace("\r", "")
                                   .Replace("<tr>", "").Replace("</tr>", "#").Split('#');

                        foreach (var row in rows)
                        {
                            // Same trick for the cells of a row.
                            var cells = row.Replace("<td>", "").Replace("</td>", "#").Split('#');

                            if (cells.Length > 1)
                            {
                                Articul.Add(cells[0] + " : " + cells[1]);
                            }
                            else
                            {
                                // A row without a second cell: record its text (the original
                                // appended the array itself, which printed "System.String[]").
                                Articul.Add("Wrong Entity : " + cells[0]);
                            }
                        }
                    }
                }

                Nodes.Add(Articul);
            }

            using (ExcelPackage excelPackage = new ExcelPackage())
            {
                //create a WorkSheet
                ExcelWorksheet worksheet = excelPackage.Workbook.Worksheets.Add("Sheet 1");

                var column = 1;
                foreach (var articleLines in Nodes)
                {
                    //add all the content from the List collection, starting at row 1 of this column
                    worksheet.Cells[1, column].LoadFromCollection(articleLines);
                    column++;
                }

                FileInfo fi = new FileInfo(@"D:\file\result.xlsx");
                excelPackage.SaveAs(fi);
            }

            string[] articlesBase = { "T106.407.16.051.00", "T106.407.16.031.00" };

            using (WebClient wc = new WebClient())
            {
                foreach (string articleBase in articlesBase)
                {
                    // URL of the image file
                    string path = @"https://www.tissotwatches.com/media/shop/catalog/product/T/1/" + articleBase + ".png";
                    try
                    {
                        // Download the file from the URL into the target folder on disk.
                        wc.DownloadFile(new Uri(path), @"D:\Downloads\tissot\" + System.IO.Path.GetFileName(path));
                    }
                    catch (Exception ex)
                    {
                        // Best effort: report the failure and continue with the next image.
                        Console.WriteLine("Download failed for " + path + " : " + ex.Message);
                    }
                }
            }

            Console.WriteLine("Картинка сохранена");
            Console.Read();
        }
Пример #30
0
        /// <summary>
        /// Scrapes the BBC constituency index, visits every constituency result page
        /// and collects constituency, constituency-result, candidate-result and
        /// headline records. When the JSON / CSV compilation symbols are defined the
        /// collected data is also written to <c>csvDir</c>.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var conID_Sim = 0;

            var constituencies      = new List <Constituency>();
            var constituencyResults = new List <ConstituencyResult>();
            var candidateResults    = new List <CandidateResult>();
            var constituencyHeads   = new List <ConstituencyHead>();

            // set up sql server connection

            DataAccess.connectionString = "server=" + "DESKTOP-7UJF7DE" +
                                          ";Trusted_Connection=yes; database=" + "GER";

            // write csv files to this directory

            const string csvDir = "d:\\temp\\";

            // BBC election data root node

            var web = new HtmlWeb();
            var doc = web.Load("https://www.bbc.co.uk/news/politics/constituencies");

            // get list of what gets added to root node to get each constituency result web page.
            // SelectNodes returns null (not an empty collection) when nothing matches,
            // so guard before enumerating.

            var conNodes = doc.DocumentNode
                           .SelectNodes("//tr[@class='az-table__row']/th/a");

            if (conNodes == null)
            {
                System.Console.Error.WriteLine("No constituency links found on the index page.");
                return;
            }

            // the headline json is identical on every page, so it is only read from the first one

            var headlineSaved = false;

            // for each constituency page

            foreach (var item in conNodes)
            {
                // get the web page for the constituency (the single HtmlWeb instance is reused)

                var docInner1 = web.Load("https://www.bbc.co.uk/" + item.Attributes["href"].Value);

                // first extract and save json data of winner, previous winner and headline for each constituency
                // this is on every page so only save it once

                if (!headlineSaved)
                {
                    headlineSaved = true;

                    var json = GetHeadlineData(constituencyHeads, docInner1);

#if JSON
                    // save json file - we also save as csv later
                    File.WriteAllText(csvDir + "election_results2019.json", json);
#endif
                }

                Constituency c = GetConstituency(ref conID_Sim, constituencies, item, docInner1);

                // get constituency level vote data

                var conR = GetConstituencyResult(constituencyResults, docInner1, c);

                // get a collection of the nodes for each candidate

                const string xpathCandidates =
                    "//ol[@class=\'ge2019-constituency-result__list\']" +
                    "//li[starts-with(@class, 'ge2019-constituency-result__item ge2019__party--border ge2019__party--border')]";

                HtmlNodeCollection candidateListItems =
                    docInner1.DocumentNode.SelectNodes(xpathCandidates);

                // a page without a candidate list yields null; skip it instead of crashing the whole run

                if (candidateListItems == null)
                {
                    System.Console.Error.WriteLine(
                        "No candidate results found for " + item.Attributes["href"].Value);
                    continue;
                }

                // loop through candidate list items

                foreach (var candidateListItem in candidateListItems)
                {
                    GetCandidateResult(candidateResults, c, conR, candidateListItem);
                }
            }

#if CSV
            // create csv files. If they already exist they will be overwritten.
            // using UTF8 to cope with accented characters etc

            const string csvPathC = csvDir + "constituencies.csv";
            using (var writer = new StreamWriter(
                       new FileStream(csvPathC, FileMode.Create, FileAccess.Write),
                       Encoding.UTF8))
                using (var csv = new CsvWriter(writer))
                {
                    csv.WriteRecords(constituencies);
                }

            const string csvPathConH = csvDir + "constituency_headline.csv";
            using (var writer = new StreamWriter(
                       new FileStream(csvPathConH, FileMode.Create, FileAccess.Write),
                       Encoding.UTF8))
                using (var csv = new CsvWriter(writer))
                {
                    csv.WriteRecords(constituencyHeads);
                }

            const string csvPathConR = csvDir + "constituency_results.csv";
            using (var writer = new StreamWriter(
                       new FileStream(csvPathConR, FileMode.Create, FileAccess.Write),
                       Encoding.UTF8))
                using (var csv = new CsvWriter(writer))
                {
                    csv.WriteRecords(constituencyResults);
                }

            const string csvPathCanR = csvDir + "candidate_results.csv";
            using (var writer = new StreamWriter(
                       new FileStream(csvPathCanR, FileMode.Create, FileAccess.Write),
                       Encoding.UTF8))
                using (var csv = new CsvWriter(writer))
                {
                    csv.WriteRecords(candidateResults);
                }
#endif
        }
Пример #31
0
        /// <summary>
        /// Downloads the list of open sheriff-sale properties, scrapes the detail page
        /// of each one, enriches the record through <c>GetZestimateAPI</c> and writes
        /// everything to a <c>results-&lt;timestamp&gt;.csv</c> file in the working directory.
        /// </summary>
        /// <remarks>
        /// Numbers scraped from the pages are US-formatted, so they are parsed and
        /// written with the invariant culture; the output therefore does not depend
        /// on the regional settings of the machine running the tool.
        /// </remarks>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            List <DataModel> records = new List <DataModel>();

            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            // Reads the first <td> of a detail row as a single trimmed line of text.
            // Only invoked for recognised titles, so rows without a <td> are tolerated otherwise.
            Func <HtmlDocument, string> readValueCell = r =>
                r.DocumentNode.SelectSingleNode("/td[1]").InnerText.Replace("\n", "").Replace("\r", "").Trim();

            try
            {
                var client  = new RestClient("https://socms.polkcountyiowa.gov/");
                var request = new RestRequest("sheriffsaleviewer/Home/PropertyListJson", Method.POST);
                request.AddParameter("draw", 1);
                request.AddParameter("start", 0);
                request.AddParameter("length", 100);
                request.AddParameter("isOpenStatus", true);

                var responce = client.Execute <RootObject>(request);

                // A failed or undeserialisable response leaves Data null; report that
                // explicitly instead of surfacing a bare NullReferenceException.
                if (responce.Data == null || responce.Data.data == null)
                {
                    throw new InvalidOperationException(
                              "Property list request returned no data. " + responce.ErrorMessage);
                }

                var data = responce.Data.data;

                Console.WriteLine("Total Records: " + data.Count);
                if (data.Count > 0)
                {
                    HtmlWeb web = new HtmlWeb();
                    int     i   = 1;
                    foreach (var item in data)
                    {
                        Console.WriteLine("===============================PROCESSING RECORD===============================");
                        //Get Details of each property
                        string url = "https://apps.polkcountyiowa.gov/sheriffsaleviewer/Home/Detail/" + item.propertyId;
                        var    doc = web.Load(url);

                        var       table = doc.DocumentNode.SelectSingleNode("/html/body/div/div[3]/div/table/tbody");
                        DataModel model = new DataModel()
                        {
                            Id = i
                        };
                        Console.WriteLine("Getting details...");
                        if (table != null)
                        {
                            foreach (var entry in table.ChildNodes.Where(x => x.Name != "#text"))
                            {
                                try
                                {
                                    // Each row is re-parsed on its own so the th/td XPaths are rooted at the row.
                                    HtmlDocument row = new HtmlDocument();
                                    row.LoadHtml(entry.InnerHtml);

                                    var title = row.DocumentNode.SelectSingleNode("/th[1]").InnerText
                                                .Replace("\n", "").Replace("\r", "").Trim();

                                    switch (title)
                                    {
                                    case "Sheriff Number":
                                        model.SheriffNumber = readValueCell(row);
                                        break;

                                    case "Approximate Judgment":
                                        // Value looks like "$12,345.67": strip currency symbol and
                                        // thousands separators, then parse culture-independently.
                                        model.ApproxJudgment = Convert.ToDouble(
                                            readValueCell(row).Replace("$", "").Replace(",", "").Trim(),
                                            invariant);
                                        break;

                                    case "Sales Date":
                                        model.SalesDate = readValueCell(row);
                                        break;

                                    case "Plaintiff":
                                        model.PlainTiff = readValueCell(row);
                                        break;

                                    case "Defendant":
                                        model.Defendant = readValueCell(row);
                                        break;

                                    case "Address":
                                        model.Address = readValueCell(row);
                                        break;

                                    case "Redemption Period":
                                        model.RedemptionPeriod = readValueCell(row);
                                        break;
                                    }
                                }
                                catch (Exception ex)
                                {
                                    // A malformed row must not abort the remaining rows of this record.
                                    Console.WriteLine("Unable to compile record.\nReason:" + ex.Message);
                                }
                            }
                        }
                        else
                        {
                            // Detail page missing: fall back to the fields delivered by the list request.
                            Console.WriteLine("Record details page not found");
                            model.SheriffNumber = item.referenceNumber.ToString();
                            model.SalesDate     = item.salesDate.ToString("MM/dd/yyyy");
                            model.PlainTiff     = item.plaintiff;
                            model.Defendant     = item.defendant;
                            model.Address       = item.propertyAddress;
                        }

                        if (!string.IsNullOrEmpty(model.Address))
                        {
                            Console.WriteLine("Getting Data from zillow...");
                            ReturnedResult dataModel = GetZestimateAPI(model.Address);
                            model.Zestimate    = (string.IsNullOrEmpty(dataModel.Zestimate)) ? 0 : Math.Round(Convert.ToDouble(dataModel.Zestimate, invariant));
                            model.TaxAssesment = string.IsNullOrEmpty(dataModel.TaxAssestment) ? 0 : Convert.ToDecimal(dataModel.TaxAssestment, invariant);
                            model.IndexValue   = dataModel.ZillowHomeValueIndex;

                            if (model.Zestimate == 0)
                            {
                                // No zestimate available: use the tax assessment as the reference value instead.
                                if (model.TaxAssesment != 0)
                                {
                                    model.Zestimate_Approx_Judgement_Diff = Math.Round((double)model.TaxAssesment - model.ApproxJudgment, 2);
                                    if (model.TaxAssesment > 0)
                                    {
                                        model.Zestimate_Approx_Judgement_Division = model.ApproxJudgment / (double)model.TaxAssesment;
                                    }
                                }
                            }
                            else
                            {
                                model.Zestimate_Approx_Judgement_Diff = Math.Round(model.Zestimate - model.ApproxJudgment, 2);
                                if (model.Zestimate > 0)
                                {
                                    model.Zestimate_Approx_Judgement_Division = model.ApproxJudgment / model.Zestimate;
                                }
                            }
                        }
                        else
                        {
                            Console.WriteLine("Address not found. So we cant get zestimate");
                        }

                        records.Add(model);
                        i += 1;
                    }

                    //Console.WriteLine("Fetching data from realtor.com. This may take few minutes...");
                    //records = GetRealtorEstimate(records);
                    //foreach (var model in records)
                    //{
                    //    model.Zestimate_Approx_Judgement_Diff = Math.Round(model.Zestimate - model.ApproxJudgment, 2);
                    //    if (model.Zestimate > 0)
                    //    {
                    //        model.Zestimate_Approx_Judgement_Division = model.ApproxJudgment / model.Zestimate;
                    //    }
                    //}

                    // Quotes one CSV field; embedded double quotes are doubled so that
                    // names or addresses containing '"' cannot break the column layout.
                    Func <object, string> csvField = v =>
                        "\"" + Convert.ToString(v, invariant).Replace("\"", "\"\"") + "\"";
                    Func <object[], string> csvLine = fields =>
                        string.Join(",", fields.Select(csvField).ToArray());

                    List <string> entries = new List <string>();
                    entries.Add(csvLine(new object[]
                    {
                        "SalesDate", "PlainTiff", "Defendant", "SheriffNumber", "ApproxJudgment", "RedemptionPeriod", "Address",
                        "Zestimate", "Zestimate_Approx_Judgement_Diff", "Zestimate_Approx_Judgement_Division", "Tax Assestment", "Home Index Value"
                    }));
                    foreach (var item in records)
                    {
                        entries.Add(csvLine(new object[]
                        {
                            item.SalesDate, item.PlainTiff, item.Defendant, item.SheriffNumber, item.ApproxJudgment, item.RedemptionPeriod, item.Address,
                            item.Zestimate, item.Zestimate_Approx_Judgement_Diff, item.Zestimate_Approx_Judgement_Division, item.TaxAssesment, item.IndexValue
                        }));
                    }

                    // Zero-padded timestamp: unpadded parts are ambiguous
                    // (e.g. Jan 11 and Nov 1 both start with "...111") and can collide.
                    var time = DateTime.Now.ToString("yyyyMMddHHmmss", invariant);

                    File.WriteAllLines("results-" + time + ".csv", entries);
                }
                else
                {
                    Console.WriteLine("No record found. Please try again another time.");
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("Unable to fetch data.\nReason: " + ex.Message);
            }
            Console.WriteLine("Operation Completed. Press any key to exit.");
            Console.ReadKey();
        }
Пример #32
0
        /// <summary>
        /// Consumes article URLs from the "hello" queue on the local broker, scrapes
        /// title, first lazy-loaded image and body paragraphs from each page and stores
        /// the result as an <c>Article</c>. Runs until the user presses enter.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Getting url");
            var factory = new ConnectionFactory()
            {
                HostName = "localhost"
            };

            using (var connection = factory.CreateConnection())
                using (var channel = connection.CreateModel())
                {
                    channel.QueueDeclare(queue: "hello",
                                         durable: false,
                                         exclusive: false,
                                         autoDelete: false,
                                         arguments: null);

                    var consumer = new EventingBasicConsumer(channel);
                    consumer.Received += (model, ea) =>
                    {
                        var body    = ea.Body.ToArray();
                        var message = Encoding.UTF8.GetString(body);

                        // One context per message, released as soon as the handler is done
                        // so connections do not pile up while the consumer keeps running.
                        using (ArticleDbContext db = new ArticleDbContext())
                        {
                            // NOTE(review): the duplicate lookup is stubbed out, so the
                            // "Existed" branch below is currently never taken.
                            //var existArt = db.Articles.Where(a => a.Link.Equals(message));
                            string existArt = null;
                            if (existArt == null)
                            {
                                Console.WriteLine("Creating article");
                                try
                                {
                                    var doc = new HtmlWeb().Load(message);

                                    // SelectNodes yields null when nothing matches: not an article page.
                                    var listTitle = doc.DocumentNode.SelectNodes("//h1[contains(@class, 'title-detail')]");
                                    if (listTitle == null)
                                    {
                                        return;
                                    }
                                    var title = listTitle.First().InnerText;
                                    Console.WriteLine(title);

                                    // First image that carries a non-empty data-src attribute.
                                    // A LINQ query is never null, so emptiness is detected
                                    // through FirstOrDefault rather than a null check on the query.
                                    var img = doc.DocumentNode.Descendants("img")
                                              .Select(a => a.GetAttributeValue("data-src", null))
                                              .FirstOrDefault(u => !String.IsNullOrEmpty(u));
                                    if (img == null)
                                    {
                                        return;
                                    }

                                    var listContent = doc.DocumentNode.SelectNodes("//p[contains(@class, 'Normal')]");
                                    if (listContent == null)
                                    {
                                        return;
                                    }
                                    Console.WriteLine(listContent.First().InnerText);
                                    StringBuilder content = new StringBuilder();
                                    foreach (var item in listContent)
                                    {
                                        content.AppendLine(item.InnerText);
                                    }
                                    Console.WriteLine(content);

                                    // Create demo: Create a article instance and save it to the database
                                    Article article = new Article
                                    {
                                        Link    = message,
                                        Title   = title,
                                        Images  = img,
                                        Content = content.ToString()
                                    };
                                    db.Articles.Add(article);
                                    db.SaveChanges();
                                    Console.WriteLine("\nCreated article: ");
                                }
                                catch (Exception e)
                                {
                                    // Keep consuming: one bad page must not take the consumer down.
                                    Console.WriteLine(e.Message);
                                }
                            }
                            else
                            {
                                Console.WriteLine(message + "\nExisted");
                            }
                        }
                    };
                    channel.BasicConsume(queue: "hello",
                                         autoAck: true,
                                         consumer: consumer);

                    Console.WriteLine(" Press [enter] to exit.");
                    Console.ReadLine();
                }
        }
 /// <summary>
 /// Downloads the page identified by <paramref name="webPage"/> and returns it
 /// as a parsed HTML document.
 /// </summary>
 /// <param name="webPage">Page whose <c>Uri.OriginalString</c> is fetched.</param>
 /// <returns>The loaded document.</returns>
 public HtmlDocument Convert(IWebPage webPage)
 {
     HtmlWeb loader = new HtmlWeb();
     string address = webPage.Uri.OriginalString;
     var loaded = loader.Load(address);

     return loaded as HtmlDocument;
 }
Пример #34
0
        /// <summary>
        /// Builds a minimal RSS 0.91 document from the article links found on
        /// <paramref name="url"/> and saves it as <c>rss.xml</c>.
        /// Does nothing when the page contains no matching links.
        /// </summary>
        /// <param name="hw">Loader used to fetch the page.</param>
        /// <param name="url">Address of the page to scrape; also written as the channel link.</param>
        static void ManualWay(HtmlWeb hw, string url)
        {
            HtmlDocument page = hw.Load(url);

            // Article anchors are recognised by their target="_new" attribute.
            HtmlNodeCollection articleLinks = page.DocumentNode.SelectNodes("//a[@href and @target='_new']");
            if (articleLinks == null)
            {
                return;
            }

            // Skeleton feed: XML declaration (using the page's encoding name) plus an empty <rss> root.
            XmlDocument feed = new XmlDocument();
            feed.LoadXml("<?xml version=\"1.0\" encoding=\"" + page.Encoding.BodyName + "\"?><rss version=\"0.91\"/>");

            // The <rss> root is the node right after the declaration, i.e. the document element.
            XmlElement channel = feed.CreateElement("channel");
            feed.DocumentElement.AppendChild(channel);

            XmlElement channelTitle = feed.CreateElement("title");
            channelTitle.InnerText = "ASP.Net articles scrap RSS feed";
            channel.AppendChild(channelTitle);

            XmlElement channelLink = feed.CreateElement("link");
            channelLink.InnerText = url;
            channel.AppendChild(channelLink);

            // One <item> per article anchor.
            foreach (HtmlNode anchor in articleLinks)
            {
                // The description, when present, is the text of a sibling <div>.
                HtmlNode descriptionNode = anchor.SelectSingleNode("../div/text()");
                string summary = descriptionNode != null ? descriptionNode.InnerText : null;

                XmlElement entry = feed.CreateElement("item");
                channel.AppendChild(entry);

                XmlElement entryTitle = feed.CreateElement("title");
                entryTitle.InnerText = anchor.InnerText;
                entry.AppendChild(entryTitle);

                XmlElement entryLink = feed.CreateElement("link");
                entryLink.InnerText = anchor.Attributes["href"].Value;
                entry.AppendChild(entryLink);

                // Not every article carries a description.
                if (!string.IsNullOrEmpty(summary))
                {
                    XmlElement entryDescription = feed.CreateElement("description");
                    entryDescription.InnerText = summary;
                    entry.AppendChild(entryDescription);
                }
            }

            feed.Save("rss.xml");
        }
Пример #35
0
        /// <summary>
        /// Downloads and parses the page at <c>SiteUrl</c>.
        /// </summary>
        /// <returns>The loaded HTML document.</returns>
        public HtmlDocument GetDocument()
        {
            var loader = new HtmlWeb();
            HtmlDocument page = loader.Load(SiteUrl);

            return page;
        }
Пример #36
0
 /// <summary>
 /// Creates a parser that keeps the supplied <see cref="HtmlWeb"/> instance
 /// in the <c>_htmlWeb</c> field.
 /// </summary>
 /// <param name="htmlWeb">Loader instance to store.</param>
 public Parser(HtmlWeb htmlWeb)
 {
     this._htmlWeb = htmlWeb;
 }
Пример #37
0
 /// <summary>
 /// Verifies that <c>HtmlWeb.LoadFromWebAsync</c> yields a document for a
 /// reachable URL.
 /// </summary>
 /// <remarks>
 /// Returns a <see cref="System.Threading.Tasks.Task"/> instead of <c>void</c> so the
 /// test runner can await completion and observe assertion failures; with
 /// <c>async void</c> the method returns at the first await and failures are lost.
 /// </remarks>
 public async System.Threading.Tasks.Task TestHtmlWebBasicCall()
 {
     var html = new HtmlWeb();
     var doc = await html.LoadFromWebAsync("http://www.google.com");
     Assert.IsNotNull(doc);
 }
Пример #38
0
        /// <summary>
        /// Downloads and parses the page at the given address.
        /// </summary>
        /// <param name="url">Address of the page to load.</param>
        /// <returns>The loaded HTML document.</returns>
        public HtmlDocument GetDocumentByUrl(string url)
        {
            var loader = new HtmlWeb();
            HtmlDocument page = loader.Load(url);

            return page;
        }