public JobPost(HtmlNode node)
{
    Id = Guid.NewGuid();

    //Get the first valid job link inside the result row
    var link = HTMLHandler.GetValidLinks(node.SelectNodes(".//a[@href]"), IsValid).FirstOrDefault();
    if (link != null)
    {
        JobName = link.InnerText;
        JobUrl = new Uri(System.Configuration.ConfigurationManager.AppSettings["Domain"] + link.GetAttributeValue("href", ""));

        var companyNode = node.SelectSingleNode(".//span[contains(@class,'company')]//span");
        if (companyNode != null)
        {
            Company = companyNode.InnerText.RemoveAllNextLineCharacters();
        }

        var locationNode = node.SelectSingleNode(".//span[contains(@class,'location')]//span");
        if (locationNode != null)
        {
            JobLocation = locationNode.InnerText.RemoveAllNextLineCharacters();
        }

        var salaryNode = node.SelectSingleNode(".//td[contains(@class, 'snip')]//nobr");
        if (salaryNode != null)
        {
            Salary = salaryNode.InnerText.RemoveAllNextLineCharacters();
        }

        var summaryNode = node.SelectSingleNode(".//span[contains(@class, 'summary')]");
        if (summaryNode != null)
        {
            JobDescription = summaryNode.InnerText.RemoveAllNextLineCharacters();
        }

        JobPostDate = DateTime.Now;
        IsIndeed = IndeedHandler.IsIndeedInsideLink(JobUrl);
    }
}
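// The constructor above relies on a RemoveAllNextLineCharacters string extension that is not part of
// these snippets. A minimal sketch, assuming it only strips line breaks/tabs and trims the scraped
// InnerText so each field becomes a single-line value (the exact behaviour is an assumption):
public static class StringExtensions
{
    /// <summary>
    /// Hypothetical stand-in: removes carriage returns, line feeds and tabs and trims the result.
    /// </summary>
    public static string RemoveAllNextLineCharacters(this string value)
    {
        if (string.IsNullOrEmpty(value))
        {
            return value;
        }
        return value.Replace("\r", "").Replace("\n", "").Replace("\t", "").Trim();
    }
}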
/// <summary>
/// Writes a block of text to a file in the local storage folder.
/// </summary>
/// <param name="title">Title of the article; used as the file name</param>
/// <param name="text">Text to write to the file</param>
/// <returns></returns>
public async static Task WriteTextToFile(string title, string text)
{
    title = HTMLHandler.ReplaceColons(title);

    //Build the path of the file and write all the text asynchronously
    string fileName = Path.Combine(dirPath, title + ".wik");
    using (var writer = new StreamWriter(fileName))
    {
        await writer.WriteAsync(text);
    }
}
/// <summary>
/// Reads the text of an article file from local storage.
/// </summary>
/// <param name="title">Title of the article file to read</param>
/// <returns>String that contains all the text from the file</returns>
public async static Task<string> GetHTMLTextFromFile(string title)
{
    title = HTMLHandler.ReplaceColons(title);

    //Build the path of the file and read all the text asynchronously
    string fileName = Path.Combine(dirPath, title + ".wik");
    using (var reader = new StreamReader(fileName))
    {
        return await reader.ReadToEndAsync();
    }
}
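// Both file helpers above call HTMLHandler.ReplaceColons before building the file name. That method is
// not shown in these snippets; the hedged assumption here is that it only swaps colons (illegal in
// Windows file names, but common in wiki titles such as "Category:Science") for a safe character, so
// that Path.Combine(dirPath, title + ".wik") always yields a valid path. A minimal stand-in:
public static class TitleSanitizer
{
    /// <summary>
    /// Hypothetical equivalent of HTMLHandler.ReplaceColons; the replacement character
    /// is an assumption, not taken from the original code.
    /// </summary>
    public static string ReplaceColons(string title)
    {
        return string.IsNullOrEmpty(title) ? title : title.Replace(':', '-');
    }
}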
public void DoTheCrawl()
{
    var doc = HTMLHandler.GetHtml(this.UrlToCrawl.AbsoluteUri);

    //Get all of the job result rows on the page
    HtmlNodeCollection collection = doc.DocumentNode.SelectNodes("//div[contains(@class,'result') and contains(@class, 'row')]");
    if (collection != null && collection.Count > 0)
    {
        foreach (HtmlNode obj in collection)
        {
            JobPost jp = new JobPost(obj)
            {
                Category = this.JobCategory,
                Title = this.JobTitle
            };

            if (!string.IsNullOrEmpty(jp.JobName))
            {
                if (jp.IsIndeed)
                {
                    //Job is hosted on Indeed itself: fetch the summary and save post + detail
                    var jobDoc = HTMLHandler.GetHtml(jp.JobUrl.AbsoluteUri);
                    HtmlNode node = jobDoc.DocumentNode.SelectSingleNode("//span[contains(@id,'job_summary')]");
                    if (node != null && node.InnerHtml.Length > 0)
                    {
                        var jobDtl = new JobDetail(jp.Id, node.InnerHtml);

                        //Save the post and its detail
                        jp.Save();
                        jobDtl.Save();
                    }
                }
                else
                {
                    //External posting: resolve the redirect to the real URL and save the post only
                    jp.JobUrl = new Uri(jp.JobUrl.AbsoluteUri.GetFinalRedirect());
                    jp.Save();
                }

                Console.WriteLine(string.Format("{0} - {1}", jp.JobName, jp.JobLocation));
            }

            Thread.Sleep(sleepingTime);
        }
    }

    //JobLinks = GetValidLinks(doc.DocumentNode.SelectNodes("//a[@href]"));
    foreach (LinkObject link in JobLinks)
    {
        //Process link data
        ProcessLink(link);
        Thread.Sleep(sleepingTime);
    }

    Thread.Sleep(sleepingTime);
}
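// GetFinalRedirect above is a string extension whose implementation is not included in these snippets.
// A minimal sketch, assuming it simply follows HTTP redirects until a non-3xx response comes back
// (the HEAD request and the 10-hop cap are arbitrary choices for this sketch, not the original code):
using System;
using System.Net;

public static class UriExtensions
{
    /// <summary>
    /// Follows HTTP redirects manually and returns the final URL.
    /// Sketch only: no cookie handling, and relative Location headers are resolved
    /// against the current URL.
    /// </summary>
    public static string GetFinalRedirect(this string url)
    {
        string current = url;
        for (int hop = 0; hop < 10; hop++)
        {
            var request = (HttpWebRequest)WebRequest.Create(current);
            request.Method = "HEAD";
            request.AllowAutoRedirect = false;

            using (var response = (HttpWebResponse)request.GetResponse())
            {
                int status = (int)response.StatusCode;
                if (status < 300 || status >= 400)
                {
                    return current; //no further redirect
                }

                string location = response.Headers["Location"];
                if (string.IsNullOrEmpty(location))
                {
                    return current;
                }
                current = new Uri(new Uri(current), location).AbsoluteUri;
            }
        }
        return current;
    }
}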
public AsyncHTTPServer(ushort port)
{
    m_listener = new HttpListener();
    m_listener.Prefixes.Add("http://+:" + port + "/");

    /* Add the HTTPS listener only on the domain's public address */
    if (Config<string>.GetInstance()["PUBLIC_ADDRESS"].Contains("://quickstream.me"))
    {
        m_listener.Prefixes.Add("https://+:443/");
    }

    m_handlers = new Dictionary<string, IServable>();
    m_404Handler = new Error404Handler();
    m_500Handler = new Error500Handler();
    m_HTMLHandler = new HTMLHandler();
}
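// The constructor above only wires up the prefixes and handlers; the accept loop itself is not part of
// this snippet. A typical pattern for an HttpListener-based asynchronous server is sketched below as a
// separate stand-in class (the routing here just echoes the path; the real server presumably dispatches
// to its registered IServable handlers and falls back to the 404/500 handlers):
using System;
using System.Net;
using System.Text;
using System.Threading.Tasks;

public class MinimalAsyncHttpServer
{
    private readonly HttpListener m_listener = new HttpListener();

    public MinimalAsyncHttpServer(ushort port)
    {
        m_listener.Prefixes.Add("http://+:" + port + "/");
    }

    /// <summary>
    /// Starts the listener and serves requests until it is stopped. Each request is
    /// handled on its own task so a slow client cannot block the accept loop.
    /// </summary>
    public async Task RunAsync()
    {
        m_listener.Start();
        while (m_listener.IsListening)
        {
            HttpListenerContext context = await m_listener.GetContextAsync();
            var _ = Task.Run(() => Handle(context));
        }
    }

    private static void Handle(HttpListenerContext context)
    {
        byte[] body = Encoding.UTF8.GetBytes("Requested: " + context.Request.RawUrl);
        context.Response.ContentLength64 = body.Length;
        context.Response.OutputStream.Write(body, 0, body.Length);
        context.Response.OutputStream.Close();
    }
}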
/// <summary>
/// Saves the HTML of an article to a local file.
/// </summary>
/// <param name="title">The title of the article to fetch the HTML for; also used as the file name</param>
/// <returns>Nothing</returns>
public async static Task SaveHTMLFileToStorage(string title)
{
    //Debug.WriteLine("Title: " + title);

    //Call the API service to get the HTML text from Wikipedia
    string HTMLText = await APIServices.GetAllHTMLFromWikipediaArticle(title);

    //Build the path to the file where it will be stored
    title = HTMLHandler.ReplaceColons(title);
    string fileName = Path.Combine(dirPath, title + ".wik");

    //Write the HTML to the file asynchronously
    using (var writer = new StreamWriter(fileName))
    {
        await writer.WriteAsync(HTMLText);
    }
    //Debug.WriteLine("Wrote To file: " + fileName);
}
///////////////
// INITIATOR //
///////////////
#region

/// <summary>
/// Begin the download process
/// </summary>
public static void beginDownload() //rename this after cleanup
{
    if (checkifDownloadCancelled())
    {
        return;
    }
    if (!verifyDownloadDirectory())
    {
        return;
    }

    Program.mainForm.lbOutput.BeginInvoke(new Action(() =>
        Logger.logDownload("DOWNLOAD PROCESS COMMENCED")
    ));

    numMaxDownload = 0;

    // setup
    setExistingFiles();
    resetDownloadLinks();
    trimArtistList();

    Program.mainForm.lblDownloadingFile.BeginInvoke(new Action(() =>
        Program.mainForm.lblDownloadingFile.ForeColor = System.Drawing.Color.Orange
    ));
    Program.mainForm.lbDownloadProgress.BeginInvoke(new Action(() =>
        Program.mainForm.lbDownloadProgress.ForeColor = System.Drawing.Color.Orange
    ));

    // download
    HTMLHandler.dig();

    if (!HTMLHandler.faIsInBeta) // BETA UNSUPPORTED
    {                            // BETA UNSUPPORTED
        if (!downloadAfterEachParse)
        {
            artistSetup();
        }
    }                            // BETA UNSUPPORTED

    downloadCompleteCleanup();

    Program.mainForm.lbOutput.BeginInvoke(new Action(() =>
        Logger.logDownload("YOU'RE DONE SON")
    ));
}
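// The download above runs off the UI thread, which is why every label/list-box update goes through
// Control.BeginInvoke. A stripped-down illustration of that pattern follows; the form and control
// names are placeholders for this sketch, not the ones from Program.mainForm:
using System;
using System.Threading;
using System.Windows.Forms;

public class ProgressForm : Form
{
    private readonly Label lblStatus = new Label { Dock = DockStyle.Top };

    public ProgressForm()
    {
        Controls.Add(lblStatus);
    }

    protected override void OnShown(EventArgs e)
    {
        base.OnShown(e);
        // Do the work on a background thread so the UI stays responsive.
        new Thread(DoWork) { IsBackground = true }.Start();
    }

    private void DoWork()
    {
        // WinForms controls may only be touched on the thread that created them,
        // so the update is posted back with BeginInvoke instead of set directly.
        lblStatus.BeginInvoke(new Action(() =>
            lblStatus.Text = "Download process commenced"
        ));
    }
}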