/// <summary>
/// Fetches the raw bytes for <paramref name="url"/> through <c>LoadData</c> and parses them
/// into an <see cref="HtmlDocument"/>.
/// </summary>
/// <remarks>
/// The encoding used for parsing is chosen in this order: the encoding detected inside the
/// document, the charset announced in the Content-Type response header, and finally UTF-8.
/// When the header charset and the chosen encoding differ, the bytes are converted from the
/// header charset to the chosen encoding before parsing.
/// </remarks>
/// <param name="url">Address of the page to load.</param>
/// <param name="level">Passed through to <c>LoadData</c>.</param>
/// <param name="path">Not used by this implementation.</param>
/// <param name="userData">Not used by this implementation.</param>
/// <param name="tryCount">Passed through to <c>LoadData</c>.</param>
/// <returns>The parsed document, or <c>null</c> when <c>LoadData</c> returned no data.</returns>
public override HtmlDocument Load(string url, uint level, string path, object userData, int tryCount)
{
    var client = new MyWebClient();
    var data = LoadData(client, url, level, tryCount);
    if (data == null)
    {
        return null;
    }

    // The header may be missing entirely; Regex.Match(null) would throw, so guard first.
    Encoding charset = null;
    var headers = client.ResponseHeaders;
    var contentType = headers != null ? headers["Content-Type"] : null;
    if (!string.IsNullOrEmpty(contentType))
    {
        // Stop at ';', quotes or whitespace so values such as
        // text/html; charset="utf-8"; boundary=x yield just "utf-8".
        var match = Regex.Match(contentType, @"charset\s*=\s*[""']?([^\s;""']+)", RegexOptions.IgnoreCase);
        if (match.Success)
        {
            try
            {
                charset = Encoding.GetEncoding(match.Groups[1].Value);
            }
            catch (ArgumentException)
            {
                // Unknown charset name announced by the server: fall back to detection / UTF-8.
                charset = null;
            }
        }
    }

    var web = new HtmlDocument();
    Encoding encoding;
    using (var detectionStream = new MemoryStream(data))
    {
        encoding = web.DetectEncoding(detectionStream) ?? charset ?? Encoding.UTF8;
    }

    // Compare by value: two distinct Encoding instances may describe the same code page.
    if (charset != null && !charset.Equals(encoding))
    {
        data = Encoding.Convert(charset, encoding, data);
    }

    using (var contentStream = new MemoryStream(data))
    {
        web.Load(contentStream, encoding);
    }

    return web;
}
/// <summary>
/// Determines the text encoding for the HTML stream and starts parsing it in stream mode,
/// reporting nodes to <c>HandleNodeEventHead</c>.
/// </summary>
/// <remarks>
/// The encoding is taken from the indexable's "encoding" property when present; otherwise it
/// is detected from the stream. ASCII is used when neither yields a usable encoding.
/// </remarks>
override protected void DoPullProperties ()
{
    enc = null;

    try {
        foreach (Property prop in Indexable.Properties) {
            if (prop.Key != StringFu.UnindexedNamespace + "encoding")
                continue;

            enc = Encoding.GetEncoding ((string) prop.Value);
            break;
        }

        if (enc == null) {
            // No encoding property was supplied, so ask the parser to detect it
            // and rewind the stream for the real parse below.
            HtmlDocument temp_doc = new HtmlDocument ();
            enc = temp_doc.DetectEncoding (Stream);
            Stream.Seek (0, SeekOrigin.Begin);
        }
    } catch (NotSupportedException) {
        // Encoding passed in isn't supported
    } catch (ArgumentException) {
        // Encoding name passed in isn't a valid code page name
    }

    // Default
    if (enc == null)
        enc = Encoding.ASCII;

    doc = new HtmlDocument ();
    doc.ReportNode += HandleNodeEventHead;
    doc.StreamMode = true;
    // we already determined encoding
    doc.OptionReadEncoding = false;

    try {
        // enc is always non-null at this point (ASCII default above).
        doc.Load (Stream, enc);
    } catch (NotSupportedException) {
        enc = Encoding.ASCII;
        doc.Load (Stream, enc);
    } catch (Exception e) {
        Log.Debug (e, "Exception while filtering HTML file " +FileInfo.FullName);
    }
}
/// <summary>
/// Handles one crawled page: optionally stores its HTML on disk, removes the page from the
/// crawl queue, and schedules every link found in the page as a new crawl step.
/// </summary>
/// <remarks>
/// The page is dropped from the queue without further work when it has no crawl-history
/// record, when the response status is not 200 OK, or when the content is not HTML.
/// Any exception raised while processing is logged and the page is removed from the queue.
/// </remarks>
/// <param name="crawler">Crawler that receives the newly discovered steps.</param>
/// <param name="propertyBag">Response data for the step being processed.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    // Upper bound applied to both the history key and discovered links.
    // NOTE(review): presumably mirrors the width of the key column - confirm.
    const int MaxKeyLength = 396;

    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(propertyBag, "propertyBag");

    string stepUri = Uri.UnescapeDataString(propertyBag.Step.Uri.AbsoluteUri);
    if (stepUri.Length > MaxKeyLength)
    {
        stepUri = stepUri.Substring(0, MaxKeyLength);
    }

    var crawlHistory = AspectF.Define.
        Return<CrawlHistory, NCrawlerEntitiesDbServices>(
            e => e.CrawlHistory.Where(m => m.Key == stepUri).FirstOrDefault());
    if (crawlHistory == null)
    {
        RemoveFromCrawlQueue(stepUri);
        return;
    }

    try
    {
        // IsHtmlContent is only consulted for successful responses.
        if (propertyBag.StatusCode != HttpStatusCode.OK || !IsHtmlContent(propertyBag.ContentType))
        {
            RemoveFromCrawlQueue(crawlHistory.Key);
            return;
        }

        HtmlDocument htmlDoc = new HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd = true,
                OptionFixNestedTags = true,
                OptionReadEncoding = true
            };
        using (Stream reader = propertyBag.GetResponse())
        {
            Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
            // Detection consumed the stream; rewind before the real parse.
            reader.Seek(0, SeekOrigin.Begin);
            if (!documentEncoding.IsNull())
            {
                htmlDoc.Load(reader, documentEncoding, true);
            }
            else
            {
                htmlDoc.Load(reader, true);
            }
        }

        string orginalHtmlContent = htmlDoc.DocumentNode.OuterHtml;
        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
        DocumentWithLinks links = htmlDoc.GetLinks();

        if (ShouldStoreContent(propertyBag.Step.Uri.AbsoluteUri))
        {
            var folderHelper = new FolderHelper();
            var path = folderHelper.GetFolderPathToStore(crawlHistory.GroupId) + "/" + string.Format("{0}.txt", crawlHistory.Id);
            StoreContentIfMissing(path, orginalHtmlContent);
        }

        RemoveFromCrawlQueue(crawlHistory.Key);

        foreach (string link in links.Links.Union(links.References))
        {
            if (link.IsNullOrEmpty() || link.Length > MaxKeyLength)
            {
                continue;
            }

            string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
            string normalizedLink;
            try
            {
                normalizedLink = NormalizeLink(baseUrl, decodedLink);
            }
            catch (Exception)
            {
                // A link that cannot be normalized is skipped; the remaining links are still processed.
                continue;
            }

            if (normalizedLink.IsNullOrEmpty())
            {
                continue;
            }

            // Skip a malformed link instead of letting a UriFormatException abort all remaining links.
            Uri linkUri;
            if (!Uri.TryCreate(normalizedLink, UriKind.Absolute, out linkUri))
            {
                continue;
            }

            crawler.AddStep(linkUri, propertyBag.Step.Depth + 1,
                propertyBag.Step, new Dictionary<string, object>
                    {
                        {Resources.PropertyBagKeyOriginalUrl, link},
                        {Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri}
                    });
        }
    }
    catch (Exception ex)
    {
        RemoveFromCrawlQueue(crawlHistory.Key);
        LogProcessingError(ex);
    }
}

/// <summary>Deletes the crawl-queue row whose key equals <paramref name="key"/>.</summary>
private static void RemoveFromCrawlQueue(string key)
{
    AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
    {
        e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", key);
    });
}

/// <summary>
/// Decides whether a page should be stored: true when no recipe patterns are configured in
/// OriginalWebSite.txt, or when <paramref name="absoluteUri"/> matches at least one of them.
/// </summary>
private bool ShouldStoreContent(string absoluteUri)
{
    string cacheKey = AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite";
    var jsonStr = cache.Get(cacheKey) as string;
    if (jsonStr == null)
    {
        // Not cached yet: read the configuration file and keep it in the cache for one day.
        using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
        {
            jsonStr = stream.ReadToEnd();
            var policy = new CacheItemPolicy();
            policy.Priority = CacheItemPriority.NotRemovable;
            policy.AbsoluteExpiration = DateTimeOffset.Now.AddDays(1);
            cache.Set(cacheKey, jsonStr, policy);
            Console.WriteLine("cache --" + AppDomain.CurrentDomain.BaseDirectory + " :" + cache.Get(cacheKey));
        }
    }

    var json = JsonConvert.DeserializeObject<OriginalWebSiteTxt>(jsonStr);
    List<string> recipeRegex = json.RecipeRegex;
    if (recipeRegex == null || recipeRegex.Count == 0)
    {
        return true;
    }

    return recipeRegex.Any(regex => Regex.IsMatch(absoluteUri, regex, RegexOptions.IgnoreCase));
}

/// <summary>Writes <paramref name="html"/> to <paramref name="path"/> unless the file already exists; write errors are logged.</summary>
private static void StoreContentIfMissing(string path, string html)
{
    Console.Write(path);
    if (File.Exists(path))
    {
        return;
    }

    try
    {
        using (StreamWriter sw = File.CreateText(path))
        {
            sw.WriteLine(html);
        }
    }
    catch (Exception ex)
    {
        LogProcessingError(ex);
    }
}

/// <summary>Logs <paramref name="ex"/> as an error through log4net.</summary>
private static void LogProcessingError(Exception ex)
{
    log4net.Config.XmlConfigurator.Configure();
    log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
    log.Error(ex);
}
/// <summary>
/// Parses the HTML response in <paramref name="propertyBag"/>, records the document, title,
/// meta entries and extracted text on the bag, and adds a crawl step for every link found.
/// </summary>
/// <remarks>
/// Returns without doing anything when the status code is not 200 OK or the content type is
/// not HTML.
/// </remarks>
/// <param name="crawler">Crawler that receives the discovered links as new steps.</param>
/// <param name="propertyBag">Response data for the step being processed.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(propertyBag, "propertyBag");

    if (propertyBag.StatusCode != HttpStatusCode.OK || !IsHtmlContent(propertyBag.ContentType))
    {
        return;
    }

    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.OptionAddDebuggingAttributes = false;
    htmlDoc.OptionAutoCloseOnEnd = true;
    htmlDoc.OptionFixNestedTags = true;
    htmlDoc.OptionReadEncoding = true;

    using (Stream responseStream = propertyBag.GetResponse())
    {
        Encoding detected = htmlDoc.DetectEncoding(responseStream);
        // Rewind so the parse below starts from the first byte again.
        responseStream.Seek(0, SeekOrigin.Begin);
        if (detected.IsNull())
        {
            htmlDoc.Load(responseStream, true);
        }
        else
        {
            htmlDoc.Load(responseStream, detected, true);
        }
    }

    string originalContent = htmlDoc.DocumentNode.OuterHtml;

    // Re-parse from the stripped/substituted text when such rules are configured.
    if (HasTextStripRules || HasSubstitutionRules)
    {
        string rewritten = Substitute(StripText(originalContent), propertyBag.Step);
        using (TextReader textReader = new StringReader(rewritten))
        {
            htmlDoc.Load(textReader);
        }
    }

    propertyBag["HtmlDoc"].Value = htmlDoc;

    // Title: all <title> texts joined with ';'
    HtmlNodeCollection titleNodes = htmlDoc.DocumentNode.SelectNodes("//title");
    if (!titleNodes.IsNull())
    {
        string[] titles = titleNodes.Select(n => n.InnerText).ToArray();
        propertyBag.Title = string.Join(";", titles).Trim();
    }

    // Meta data: "name: content" for every <meta> carrying both non-empty attributes
    HtmlNodeCollection metaNodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!metaNodes.IsNull())
    {
        propertyBag["Meta"].Value = metaNodes
            .Select(entry => new { name = entry.Attributes["name"], content = entry.Attributes["content"] })
            .Where(pair => !pair.name.IsNull() && !pair.name.Value.IsNullOrEmpty() &&
                           !pair.content.IsNull() && !pair.content.Value.IsNullOrEmpty())
            .Select(pair => pair.name.Value + ": " + pair.content.Value)
            .ToArray();
    }

    propertyBag.Text = htmlDoc.ExtractText().Trim();

    // Links are taken from the original markup after link stripping, when rules are configured.
    if (HasLinkStripRules || HasTextStripRules)
    {
        string linkSource = StripLinks(originalContent);
        using (TextReader textReader = new StringReader(linkSource))
        {
            htmlDoc.Load(textReader);
        }
    }

    // Extract links
    DocumentWithLinks found = htmlDoc.GetLinks();
    foreach (string link in found.Links.Union(found.References).Where(candidate => !candidate.IsNullOrEmpty()))
    {
        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
        string normalizedLink = NormalizeLink(baseUrl, ExtendedHtmlUtility.HtmlEntityDecode(link));
        if (normalizedLink.IsNullOrEmpty())
        {
            continue;
        }

        Uri target = new Uri(normalizedLink);
        Dictionary<string, object> stepProperties = new Dictionary<string, object>();
        stepProperties.Add(Resources.PropertyBagKeyOriginalUrl, link);
        stepProperties.Add(Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri);

        crawler.AddStep(target, propertyBag.Step.Depth + 1, propertyBag.Step, stepProperties);
    }
}
/// <summary>
/// Downloads the article at <paramref name="url"/>, parses it as HTML and converts it into a
/// <see cref="NewsItem"/> using the manager registered for <paramref name="source"/>.
/// </summary>
/// <remarks>
/// The page is downloaded once and buffered in memory, so encoding detection and parsing see
/// the same bytes. UTF-8 is used when no encoding can be detected.
/// </remarks>
/// <param name="url">Absolute address of the article.</param>
/// <param name="source">Key passed to <c>GetManager</c> to select the parser.</param>
/// <returns>The parsed item, with <c>Href</c> set to the client's response URI.</returns>
private NewsItem ParseArticle(string url, string source)
{
    var uri = new Uri(url);
    var manager = GetManager(source);
    var doc = new HtmlDocument();
    CustomWebClient client = new CustomWebClient();

    // Single request: buffer the response instead of fetching the URL twice.
    byte[] content;
    using (var response = client.OpenRead(url))
    using (var buffer = new MemoryStream())
    {
        response.CopyTo(buffer);
        content = buffer.ToArray();
    }

    Encoding encoding;
    using (var reader = new StreamReader(new MemoryStream(content)))
    {
        encoding = doc.DetectEncoding(reader) ?? Encoding.UTF8;
    }

    using (var contentStream = new MemoryStream(content))
    {
        doc.Load(contentStream, encoding);
    }

    var item = manager.ParseItem(doc, uri.Host);
    item.Href = client.ResponseUri.ToString();
    return item;
}
/// <summary>
/// Parses the HTML response in <paramref name="propertyBag"/>, records the document, title,
/// meta entries and extracted text on the bag, and hands every discovered link to the crawler.
/// </summary>
/// <remarks>
/// Links are normalized against the first usable <c>&lt;base href&gt;</c> in the document head,
/// or against the response URI's path when there is none. Nothing is processed when the
/// status code is not 200 OK or the content type is not HTML.
/// </remarks>
/// <param name="crawler">Crawler that receives the discovered links.</param>
/// <param name="propertyBag">Response data for the step being processed.</param>
/// <returns>A completed task whose result is always <c>true</c>.</returns>
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    AspectF.Define
        .NotNull(crawler, nameof(crawler))
        .NotNull(propertyBag, nameof(propertyBag));

    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return Task.FromResult(true);
    }

    if (!IsHtmlContent(propertyBag.ContentType))
    {
        return Task.FromResult(true);
    }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };

    using (MemoryStream ms = new MemoryStream(propertyBag.Response))
    {
        Encoding documentEncoding = htmlDoc.DetectEncoding(ms);
        // Detection consumed the stream; rewind before the real parse.
        ms.Seek(0, SeekOrigin.Begin);
        if (!documentEncoding.IsNull())
        {
            htmlDoc.Load(ms, documentEncoding, true);
        }
        else
        {
            htmlDoc.Load(ms, true);
        }
    }

    string originalContent = htmlDoc.DocumentNode.OuterHtml;
    if (HasTextStripRules || HasSubstitutionRules)
    {
        string content = StripText(originalContent);
        content = Substitute(content, propertyBag.Step);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    propertyBag["HtmlDoc"].Value = htmlDoc;

    // Extract Title
    HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
    if (!nodes.IsNull())
    {
        propertyBag.Title = string.Join(";", nodes.
            Select(n => n.InnerText).
            ToArray()).Trim();
    }

    // Extract Meta Data
    nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!nodes.IsNull())
    {
        propertyBag["Meta"].Value = (
            from entry in nodes
            let name = entry.Attributes["name"]
            let content = entry.Attributes["content"]
            where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
            select $"{name.Value}: {content.Value}").ToArray();
    }

    // Extract text
    propertyBag.Text = htmlDoc.ExtractText().Trim();
    if (HasLinkStripRules || HasTextStripRules)
    {
        string content = StripLinks(originalContent);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

    // Extract Head Base: the first usable <base href> wins, otherwise keep the default above.
    nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
    if (!nodes.IsNull())
    {
        baseUrl = nodes
            .Select(entry => entry.Attributes["href"])
            .Where(href => !href.IsNull()
                && !href.Value.IsNullOrEmpty()
                && Uri.IsWellFormedUriString(href.Value, UriKind.RelativeOrAbsolute))
            .Select(href => ResolveBaseHref(propertyBag.ResponseUri, href.Value))
            .AddToEnd(baseUrl)
            .FirstOrDefault();
    }

    // Extract Links
    DocumentWithLinks links = htmlDoc.GetLinks();
    foreach (string link in links.Links.Union(links.References))
    {
        if (link.IsNullOrEmpty())
        {
            continue;
        }

        string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
        string normalizedLink = NormalizeLink(baseUrl, decodedLink);
        if (normalizedLink.IsNullOrEmpty())
        {
            continue;
        }

        crawler.Crawl(new Uri(normalizedLink), propertyBag);
    }

    return Task.FromResult(true);
}

/// <summary>
/// Turns a <c>&lt;base href&gt;</c> value into an absolute URL. Absolute values are returned
/// unchanged; relative values are resolved against <paramref name="responseUri"/>.
/// </summary>
private static string ResolveBaseHref(Uri responseUri, string href)
{
    if (!Uri.IsWellFormedUriString(href, UriKind.Relative))
    {
        return href;
    }

    // Proper relative resolution: plain concatenation of scheme+host and href would turn
    // "subdir/" into "http://hostsubdir/".
    Uri resolved;
    if (Uri.TryCreate(responseUri, href, out resolved))
    {
        return resolved.AbsoluteUri;
    }

    // Resolution failed: fall back to the previous concatenation behavior.
    return responseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + href;
}
/// <summary>
/// Parses the saved page Kartaitogov/diff.htm, and for every <c>h3.uik</c> heading looks up
/// the matching region and commission identifiers in the database, printing the result to
/// the console.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The page contains no <c>h3</c> elements with class <c>uik</c>.
/// </exception>
public void ParseKartaitogovWebPage()
{
    // Only constructed here; the download-based assertions that used it are currently disabled.
    var viewModel = new KartaitogovViewModel(new Logger());

    var filePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Kartaitogov", "diff.htm");

    HtmlDocument htmlDoc = new HtmlDocument();
    Encoding encoding = htmlDoc.DetectEncoding(filePath) ?? Encoding.UTF8;
    htmlDoc.Load(filePath, encoding);

    // SelectNodes yields null rather than an empty collection when nothing matches.
    var uikHeaders = htmlDoc.DocumentNode.SelectNodes("//h3[@class='uik']");
    if (uikHeaders == null)
    {
        throw new InvalidOperationException("No h3 nodes with class 'uik' were found in " + filePath);
    }

    var reUikNumber = new System.Text.RegularExpressions.Regex(@"\d+");

    using (var con = new SqlConnection("Data Source=.;Initial Catalog = elect;Integrated Security=True"))
    {
        con.Open();

        using (var cmdRegion = con.CreateCommand())
        using (var cmdComission = con.CreateCommand())
        {
            cmdRegion.CommandText = "select ObjectID from Region where name = @pName";
            var sqlParamRegName = new SqlParameter("pName", SqlDbType.VarChar);
            cmdRegion.Parameters.Add(sqlParamRegName);

            cmdComission.CommandText = "select ObjectID from Comission where Region = @pRegion and [Number] = @pNumber";
            var sqlParamComNum = new SqlParameter("pNumber", SqlDbType.Int);
            cmdComission.Parameters.Add(sqlParamComNum);
            var sqlParamRegId = new SqlParameter("pRegion", SqlDbType.UniqueIdentifier);
            cmdComission.Parameters.Add(sqlParamRegId);

            string regionName = null;
            Guid regionId = Guid.Empty;

            foreach (HtmlNode headUik in uikHeaders)
            {
                var regionNode = headUik.SelectSingleNode("preceding-sibling::h2[@class='oblast']");
                var uikText = headUik.InnerText;

                if (regionNode == null)
                {
                    Console.WriteLine("ERROR: Can't find region node!");
                    continue;
                }

                // \d+ can match digit runs that do not fit an Int32 (or non-ASCII digits),
                // so parse defensively instead of letting Int32.Parse throw.
                var match = reUikNumber.Match(uikText);
                int comissionNum;
                if (!match.Success || !Int32.TryParse(match.Value, out comissionNum))
                {
                    Console.WriteLine("ERROR: Can't parse UIK number: " + uikText);
                    continue;
                }

                // The region lookup is repeated only when the region heading changes.
                if (regionName != regionNode.InnerText)
                {
                    regionName = regionNode.InnerText;
                    sqlParamRegName.Value = regionName;
                    var regionIdRaw = cmdRegion.ExecuteScalar();
                    // ExecuteScalar gives null for "no row" and DBNull for a null column value.
                    if (regionIdRaw is Guid)
                    {
                        regionId = (Guid)regionIdRaw;
                    }
                    else
                    {
                        regionId = Guid.Empty;
                        Console.WriteLine("WARN: Can't find in DB a region with name: " + regionName);
                    }
                }

                sqlParamRegId.Value = regionId;
                sqlParamComNum.Value = comissionNum;
                var comissionIdRaw = cmdComission.ExecuteScalar();
                Guid comissionId = comissionIdRaw is Guid ? (Guid)comissionIdRaw : Guid.Empty;

                Console.WriteLine(regionName + "(" + regionId + ")" + " / " + comissionNum + "(" + comissionId + ")");
            }
        }
    }
}