/// <summary>
/// Console smoke test: tokenizes a small hard-coded HTML document, prints the value of
/// every token, runs the parser over the token list and then waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Sample document. Lines are joined with CRLF and the document ends with a CRLF;
    // the <div> line is indented by a single space.
    string[] sampleLines =
    {
        "<!DOCTYPE html>",
        "<html lang=\"en\">",
        "<body>",
        " <div width=\"100\" height=\"200\">sssssssssss</div>",
        "</body>",
        "</html>",
        string.Empty
    };
    string htmlCode = string.Join("\r\n", sampleLines);

    var tokens = new HtmlParser.HtmlTokenizer(htmlCode).Tokenizer();
    foreach (var token in tokens)
    {
        Console.WriteLine(token.value);
    }

    // The parse result is not inspected here; the call only exercises the parser.
    HtmlNode documentRoot = new HtmlParser.HtmlParser().Parse(tokens, 0);

    Console.ReadKey();
}
/// <summary>
/// Verifies that the parser yields no non-empty page links for the fixture file
/// referenced by <c>noLinksToDetailsAndPagesPath</c>.
/// </summary>
public void TestNoLinksToPages()
{
    var template = new Template(templateFilePath);
    string pageMarkup = File.ReadAllText(this.noLinksToDetailsAndPagesPath);
    var parser = new HtmlParser.HtmlParser();

    int nonEmptyLinks = 0;
    foreach (var link in parser.GetLinks(pageMarkup, "http://dom.ria.ua", template.pages[0]))
    {
        // Every link is echoed, including empty ones, to ease diagnosing a failure.
        Console.WriteLine(link);

        if (string.IsNullOrEmpty(link))
        {
            continue;
        }

        nonEmptyLinks++;
    }

    Assert.AreEqual(0, nonEmptyLinks);
}
/// <summary>
/// Creates the filter with its own parser and binds the delegates used for the
/// URL-parsing worker and its completion callback.
/// </summary>
public ParseFilter()
{
    htmlParser = new HtmlParser();

    // Delegates are bound to this instance's worker and completion handler.
    worker = new ParseURLWorkerHandler(ParseURLWorker);
    callback = new AsyncCallback(OnParseURLCompleted);
}
/// <summary>
/// Downloads and parses one index page of a section, then walks the nodes selected by
/// <c>filter_</c>. Links found under <c>p</c> nodes are treated as entries and handed to
/// <c>SearchEntry</c>; for any other node the <c>href</c> of its first child is treated
/// as the suffix of a further index page and searched recursively.
/// </summary>
/// <param name="base_section_site">Base address of the section.</param>
/// <param name="index_suffix">Suffix appended to the base address; empty for the first page.</param>
/// <param name="IndividualEntries">Not read here; only forwarded to recursive calls.</param>
/// <param name="LastFiveEntriesSearched">
/// Entry addresses seen on a previous run. Reaching one of them stops the search;
/// the list is refreshed through <c>FillLastFive</c>.
/// </param>
private void SearchSection(string base_section_site, string index_suffix, ref List<string> IndividualEntries, ref List<string> LastFiveEntriesSearched)
{
    HtmlParser.HtmlParser parser = new HtmlParser.HtmlParser();
    if (!parser.ParseURL(base_section_site + index_suffix, true, new string[] { "<br>" }))
    {
        return;
    }

    HtmlParser.HtmlTag new_parent = null;
    if (!parser.FilterNodes(filter_, out new_parent))
    {
        return;
    }

    foreach (HtmlTag pa in new_parent.Children)
    {
        if (pa.Name == "p")
        {
            foreach (HtmlTag a in pa.Children)
            {
                string site = null;
                if (!a.Attributes.TryGetValue("href", out site))
                {
                    continue;
                }

                // An already-known entry means everything from here on was searched before.
                if (LastFiveEntriesSearched.Contains(site))
                {
                    FillLastFive(ref LastFiveEntriesSearched, new_parent);
                    return;
                }

                SearchEntry(site, pa);
            }
        }
        else
        {
            // A childless node used to throw from ElementAt(0) and abort the whole scan;
            // such nodes carry no link and are now skipped.
            HtmlTag firstChild = pa.Children.FirstOrDefault();
            string suffix = null;
            if (firstChild != null && firstChild.Attributes.TryGetValue("href", out suffix))
            {
                SearchSection(base_section_site, "/" + suffix, ref IndividualEntries, ref LastFiveEntriesSearched);
            }
        }
    }

    // Only the first index page (empty suffix) refreshes the remembered entries.
    if (index_suffix == "")
    {
        FillLastFive(ref LastFiveEntriesSearched, new_parent);
    }
}
// Expected page layout:
//   title = <html><head><title>
//   body  = <html><body><div id="userbody">
//
// Downloads one entry page and checks that every keyword in Details_.Keywords_ occurs
// in the lower-cased body text or in the title. On success the match counter is
// increased and the parent handler is notified; on the first missing keyword a
// diagnostic file is written and the method returns.
//
// entry_site: address of the entry page; null or empty is ignored.
// parent:     index-page node the entry link came from; its string form is reported
//             to Parent_ on a match.
private void SearchEntry(string entry_site, HtmlTag parent)
{
    // Null is treated like the empty string instead of being passed on to the parser.
    if (string.IsNullOrEmpty(entry_site))
    {
        return;
    }

    HtmlParser.HtmlParser parser = new HtmlParser.HtmlParser();
    if (!parser.ParseURL(entry_site, true, new string[] { "<br>" }))
    {
        return;
    }

    List<HtmlParser.HtmlTag> nodes = parser._nodes;
    HtmlTag new_parent = new HtmlTag();
    new_parent.Name = "Artificial Parent";

    // NOTE(review): the results of these two filter calls are never read below. They are
    // kept because FilterNodes is opaque here and may have side effects; confirm and
    // remove if it does not.
    HtmlParser.ParseFilter title_filter = HtmlParser.ParseFilter.Create("html(head(title[parent]))");
    HtmlParser.ParseFilter body_filter = HtmlParser.ParseFilter.Create("html(head(div[parent]))");
    HtmlParser.HtmlTag title_tag = null;
    parser.FilterNodes(title_filter, out title_tag);
    HtmlParser.HtmlTag body_tag = null;
    parser.FilterNodes(body_filter, out body_tag);

    for (int i = 0; i < nodes.Count; i++)
    {
        if (nodes[i].Name == "html")
        {
            try
            {
                HtmlTag header_tag = null;
                nodes[i].FilterForChildrenByName("title", out header_tag);
                string title = header_tag.Children[0].Value;

                Dictionary<string, KeyValuePair<string, string>> tag_list = new Dictionary<string, KeyValuePair<string, string>>();
                tag_list.Add("div", new KeyValuePair<string, string>("id", "userbody"));
                nodes[i].FilterForChildrenByNameAndAttribute(tag_list, ref new_parent);

                // Deleted by author, expired, etc.
                if (new_parent.Children.Count == 0)
                {
                    continue;
                }

                string body = new_parent.Children[0].Value;
                if (!string.IsNullOrEmpty(body))
                {
                    // Only the body is lower-cased; title and keywords are compared as-is.
                    body = body.ToLower();
                    foreach (string keyword in Details_.Keywords_)
                    {
                        if (!body.Contains(keyword) && !title.Contains(keyword))
                        {
                            // NOTE(review): the file name keeps the leading '/' of the last
                            // URL segment, exactly as before — confirm that is intended.
                            int start = entry_site.LastIndexOf('/');
                            string output_file = entry_site.Substring(start, entry_site.Length - start);
                            output_file += ".xml";

                            // 'using' closes the file even when a write throws.
                            using (System.IO.StreamWriter test_xml = new System.IO.StreamWriter(output_file, false))
                            {
                                test_xml.WriteLine("Couldn't find: '" + keyword + "' in body: '" + body + "'");
                                test_xml.WriteLine(parser.ToString());
                            }

                            return;
                        }
                    }

                    matchingEntriesFound++;
                    Parent_.UpdateEntries(parent.ToString());
                }
            }
            catch (Exception error)
            {
                Logger.Instance.Log(error.ToString(), Details_.City_, LogType.ltError);
            }
        }

        entries_searched_++;
        Parent_.UpdateTotalSearched();
    }
}
/// <summary>
/// Creates the filter together with the parser instance it works with.
/// </summary>
public PreciseParseFilter()
{
    parser = new HtmlParser();
}
/// <summary>
/// Creates a poller for one city and immediately starts polling on a background worker.
/// The underlying wait handle starts non-signaled with manual reset.
/// </summary>
/// <param name="parent">Handler stored in <c>Parent_</c>.</param>
/// <param name="details">City settings stored in <c>Details_</c>.</param>
/// <param name="filter">Parse filter stored in <c>filter_</c>.</param>
public BackgroundPoller(PollHandler parent, ref CityDetails details, HtmlParser.ParseFilter filter)
    : base(false, EventResetMode.ManualReset)
{
    // Collaborators.
    Parent_ = parent;
    Details_ = details;
    filter_ = filter;

    // Counters and timing start from zero.
    matchingEntriesFound = 0;
    entries_searched_ = 0;
    stop_watch_ = new System.Diagnostics.Stopwatch();

    // The worker is started last, once all state and both handlers are in place.
    Worker_ = new BackgroundWorker();
    Worker_.DoWork += this.PollCity;
    Worker_.RunWorkerCompleted += this.PollDone;
    Worker_.RunWorkerAsync();
}
/// <summary>
/// Builds an <see cref="Entity"/> from the markup of one listing page by applying the
/// extraction rules held in <paramref name="template"/>.
/// </summary>
/// <param name="data">Page markup. When null or empty a newly constructed entity is returned untouched.</param>
/// <param name="host">Passed to the parser's image and link lookups together with the markup.</param>
/// <param name="template">
/// Extraction rules, one list of patterns per field. The lists are only read: photo
/// patterns are normalised on local copies, so the same template gives the same result
/// when it is reused for another page.
/// </param>
/// <returns>The populated entity.</returns>
/// <remarks>
/// Numbers are parsed with the current culture. When <c>debugMode</c> is set, the time
/// spent on each field group is written to the console.
/// </remarks>
public Entity Extract(string data, string host, Template template)
{
    Entity ent = new Entity();
    HtmlParser parser = new HtmlParser();

    if (string.IsNullOrEmpty(data))
    {
        return ent;
    }

    // title
    Stopwatch watch = this.BeginTiming();
    ent.title = this.FollowTemplateChain(parser, data, template.title, ent.title);
    ent.title = parser.RemoveTags(ent.title).Replace("&quot;", "\"");
    ent.title = CapitalizeSentenceStarts(ent.title);
    if (string.IsNullOrEmpty(ent.title))
    {
        ent.title = this.ExtractString(data, template.title);
    }
    this.EndTiming(watch, "Title extected in ");

    // description
    watch = this.BeginTiming();
    ent.description = this.FollowTemplateChain(parser, data, template.description, ent.description);
    ent.description = parser.RemoveTags(ent.description).Replace("&quot;", "\"");
    // Capitalise the letters that follow a period.
    ent.description = CapitalizeSentenceStarts(ent.description);
    this.EndTiming(watch, "Title description in ");

    // avatar
    watch = this.BeginTiming();
    for (int i = 0; i < template.avatar.Count(); i++)
    {
        string avatarSource = string.IsNullOrEmpty(ent.userAvatarLink) ? data : ent.userAvatarLink;
        foreach (var item in parser.GetImgUrl(avatarSource, host, template.avatar[i]))
        {
            // The last non-empty URL wins.
            if (!string.IsNullOrEmpty(item))
            {
                ent.userAvatarLink = item;
            }
        }
    }
    if (!string.IsNullOrEmpty(ent.userAvatarLink))
    {
        ent.userAvatarLink = parser.RemoveTags(ent.userAvatarLink);
    }
    this.EndTiming(watch, "Title avatar in ");

    // roomsCount
    watch = this.BeginTiming();
    ent.roomsCount = ParseIntOrMinusOne(this.ExtractClean(parser, data, template.roomsCount));
    this.EndTiming(watch, "Title roomsCount in ");

    // sleepingPlaces
    watch = this.BeginTiming();
    ent.SleepingPlaces = ParseIntOrMinusOne(this.ExtractClean(parser, data, template.sleepingPlaces));
    this.EndTiming(watch, "Title sleepingPlaces in ");

    // allSize
    watch = this.BeginTiming();
    ent.allSize = ParseDoubleOrMinusOne(this.ExtractClean(parser, data, template.allSize));
    this.EndTiming(watch, "Title allSize in ");

    // houseSize
    watch = this.BeginTiming();
    ent.houseSize = ParseDoubleOrMinusOne(
        RemoveUnitLabel(this.ExtractClean(parser, data, template.houseSize), Dicts.KV_M));
    this.EndTiming(watch, "Title houseSize in ");

    // houseGardenSize
    watch = this.BeginTiming();
    ent.houseGardenSize = ParseDoubleOrMinusOne(
        RemoveUnitLabel(this.ExtractClean(parser, data, template.houseGardenSize), Dicts.SOTOK));
    this.EndTiming(watch, "Title houseGardenSize in ");

    // distanceToCity: unlike the other sizes, the unit label is removed BEFORE the tags.
    watch = this.BeginTiming();
    string distanceText = this.FollowTemplateChain(parser, data, template.distanceToCity, string.Empty);
    distanceText = parser.RemoveTags(RemoveUnitLabel(distanceText, Dicts.KM));
    ent.distanceToCity = ParseDoubleOrMinusOne(distanceText);
    this.EndTiming(watch, "Title distanceToCity in ");

    // houseCountFloor
    watch = this.BeginTiming();
    ent.houseCountFloor = ParseDoubleOrMinusOne(this.ExtractClean(parser, data, template.houseCountFloor));
    this.EndTiming(watch, "Title houseCountFloor in ");

    // commercialObjectSize
    watch = this.BeginTiming();
    ent.commercialObjectSize = ParseDoubleOrMinusOne(
        RemoveUnitLabel(this.ExtractClean(parser, data, template.commercialObjectSize), Dicts.KV_M));
    this.EndTiming(watch, "Title commercialObjectSize in ");

    // livingSize
    watch = this.BeginTiming();
    ent.livingSize = ParseDoubleOrMinusOne(this.ExtractClean(parser, data, template.livingSize));
    this.EndTiming(watch, "Title livingSize in ");

    // kitchenSize
    watch = this.BeginTiming();
    ent.kitchenSize = ParseDoubleOrMinusOne(this.ExtractClean(parser, data, template.kitchenSize));
    this.EndTiming(watch, "Title kitchenSize in ");

    // floor / count floors
    watch = this.BeginTiming();
    this.ExtractFloor(parser, data, template.floor, ent);
    this.EndTiming(watch, "Title floor/count in ");

    // houseType
    watch = this.BeginTiming();
    this.ExtractHouseType(parser, data, template.houseType, ent);
    this.EndTiming(watch, "Title houseType in ");

    // wctype
    watch = this.BeginTiming();
    ent.wctype = parser.RemoveTags(this.FollowTemplateChain(parser, data, template.wctype, ent.wctype));
    this.EndTiming(watch, "Title wctype in ");

    // balcony 1
    watch = this.BeginTiming();
    this.ExtractBalcony(parser, data, template.balcony1, ent);
    this.EndTiming(watch, "Title balcony1 in ");

    // balcony 2: only consulted when the first rule set found nothing.
    watch = this.BeginTiming();
    if (!ent.isBalconyGlassed && ent.balconySize <= 0)
    {
        this.ExtractBalcony(parser, data, template.balcony2, ent);
    }
    this.EndTiming(watch, "Title balcony2 in ");

    // price & currency
    watch = this.BeginTiming();
    this.ExtractPrice(parser, data, template.price, ent);
    // The label used to read "price¤cy": a mis-encoded "&curren" inside "price&currency".
    this.EndTiming(watch, "Title price&currency in ");

    // commision
    watch = this.BeginTiming();
    string commissionText = this.FollowTemplateChain(parser, data, template.commision, string.Empty);
    if (commissionText.Contains(Dicts.NO_COMISSION))
    {
        ent.noComission = true;
    }
    this.EndTiming(watch, "Title commision in ");

    // contact
    watch = this.BeginTiming();
    ent.contactName = CultureInfo.CurrentCulture.TextInfo.ToTitleCase(
        this.ExtractClean(parser, data, template.contact));
    this.EndTiming(watch, "Title contact in ");

    // createdDate
    watch = this.BeginTiming();
    ent.CreatedDate = this.ExtractClean(parser, data, template.createddate);
    this.EndTiming(watch, "Title createdDate in ");

    // LinkToPhotos
    watch = this.BeginTiming();
    ent.LinkToPhotos = this.ExtractPhotoLinks(parser, data, host, template.photos);
    this.EndTiming(watch, "Title linkToPhotos in ");

    this.ExtractPlainFields(data, template, ent);

    // Currency and phone fall back to their own templates when nothing was found above.
    if (ent.currency == null || ent.currency.Trim() == "")
    {
        ent.currency = this.ExtractString(data, template.currency);
    }
    if (string.IsNullOrEmpty(ent.phone1))
    {
        ent.phone1 = this.ExtractString(data, template.phone);
    }

    return ent;
}

/// <summary>Starts a stopwatch when <c>debugMode</c> is on; returns null otherwise.</summary>
private Stopwatch BeginTiming()
{
    return this.debugMode ? Stopwatch.StartNew() : null;
}

/// <summary>Stops <paramref name="watch"/> and prints <paramref name="label"/> followed by the elapsed milliseconds (debug mode only).</summary>
private void EndTiming(Stopwatch watch, string label)
{
    if (!this.debugMode || watch == null)
    {
        return;
    }

    watch.Stop();
    long elapsedMs = watch.ElapsedMilliseconds;
    Console.WriteLine(label + elapsedMs.ToString());
}

/// <summary>
/// Applies <paramref name="patterns"/> one after another: each step searches the previous
/// step's result, or the whole page while that result is still null or empty.
/// </summary>
private string FollowTemplateChain(HtmlParser parser, string data, IList<string> patterns, string seed)
{
    string current = seed;
    for (int i = 0; i < patterns.Count; i++)
    {
        string source = string.IsNullOrEmpty(current) ? data : current;
        current = parser.GetSingleContent(source, patterns[i]);
    }

    return current;
}

/// <summary>Runs the pattern chain from an empty seed and passes the result through the parser's tag removal.</summary>
private string ExtractClean(HtmlParser parser, string data, IList<string> patterns)
{
    return parser.RemoveTags(this.FollowTemplateChain(parser, data, patterns, string.Empty));
}

/// <summary>Upper-cases the first character and the first letter after every period; null or empty text is returned unchanged.</summary>
private static string CapitalizeSentenceStarts(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return text;
    }

    char[] chars = text.ToCharArray();
    chars[0] = char.ToUpper(chars[0]);

    bool capitalizeNext = false;
    for (int i = 0; i < chars.Length; i++)
    {
        if (capitalizeNext && char.IsLetter(chars[i]))
        {
            chars[i] = char.ToUpper(chars[i]);
            capitalizeNext = false;
        }

        if (chars[i] == '.')
        {
            capitalizeNext = true;
        }
    }

    return new string(chars);
}

/// <summary>Parses <paramref name="text"/> as an integer (current culture); returns -1 when it is not a number.</summary>
private static int ParseIntOrMinusOne(string text)
{
    int parsed;
    return int.TryParse(text, out parsed) ? parsed : -1;
}

/// <summary>Parses <paramref name="text"/> as a double (current culture); returns -1 when it is not a number.</summary>
private static double ParseDoubleOrMinusOne(string text)
{
    double parsed;
    return double.TryParse(text, out parsed) ? parsed : -1;
}

/// <summary>
/// When <paramref name="text"/> contains <paramref name="unit"/> (compared in lower case),
/// returns the lower-cased text with the unit removed and trimmed; otherwise returns the text as is.
/// </summary>
private static string RemoveUnitLabel(string text, string unit)
{
    string loweredText = text.ToLower();
    string loweredUnit = unit.ToLower();

    return loweredText.Contains(loweredUnit)
        ? loweredText.Replace(loweredUnit, string.Empty).Trim()
        : text;
}

/// <summary>
/// Returns the separator character of a split rule: the first character of the part after
/// the first '=' in the rule, with the vertical-slash placeholder translated to '|'.
/// </summary>
private char ResolveSplitSeparator(string rule)
{
    string[] ruleParts = rule.Split('=');
    string separator = ruleParts[1];

    if (separator.ToLower().Contains(Commands.VERTICALSLASH.ToLower()))
    {
        separator = separator.Replace(Commands.VERTICALSLASH, "|");
    }

    return separator[0];
}

/// <summary>Fills <c>floor</c> and, when a split rule applies, <c>countFloors</c>.</summary>
private void ExtractFloor(HtmlParser parser, string data, IList<string> patterns, Entity ent)
{
    string text = string.Empty;
    int number;

    for (int i = 0; i < patterns.Count; i++)
    {
        string pattern = patterns[i];
        if (pattern.Contains(Commands.SPLIT) && !string.IsNullOrEmpty(text))
        {
            // Split rule: the text found so far holds "<floor><separator><floor count>".
            string[] parts = text.Split(this.ResolveSplitSeparator(pattern));
            if (parts.Length > 1)
            {
                if (int.TryParse(parts[0].Trim(), out number))
                {
                    ent.floor = number;
                }
                if (int.TryParse(parts[1].Trim(), out number))
                {
                    ent.countFloors = number;
                }
            }
        }
        else
        {
            text = parser.GetSingleContent(string.IsNullOrEmpty(text) ? data : text, pattern);
        }
    }

    // Without a split rule the whole text is the floor number.
    text = parser.RemoveTags(text.Trim());
    if (int.TryParse(text, out number))
    {
        ent.floor = number;
    }
}

/// <summary>Fills <c>houseType</c> and <c>isNewBuilding</c>.</summary>
private void ExtractHouseType(HtmlParser parser, string data, IList<string> patterns, Entity ent)
{
    for (int i = 0; i < patterns.Count; i++)
    {
        string pattern = patterns[i];
        if (pattern.Contains(Commands.SPLIT) && !string.IsNullOrEmpty(ent.houseType))
        {
            string[] parts = ent.houseType.Split(this.ResolveSplitSeparator(pattern));
            if (parts.Length > 1)
            {
                ent.houseType = parts[0].Trim();
                ent.isNewBuilding = parts[1].Trim().Contains(Dicts.NEW_BUILDING.ToLower());
            }
        }
        else
        {
            string source = string.IsNullOrEmpty(ent.houseType) ? data : ent.houseType;
            ent.houseType = parser.GetSingleContent(source, pattern);
        }
    }

    // The building type may be missing while the listing still says it is a new building.
    // The emptiness check avoids dereferencing a null houseType when nothing was found.
    if (!string.IsNullOrEmpty(ent.houseType) && ent.houseType.Contains(Dicts.NEW_BUILDING.ToLower()))
    {
        ent.isNewBuilding = true;
        ent.houseType = string.Empty;
    }

    ent.houseType = parser.RemoveTags(ent.houseType);
}

/// <summary>Applies one balcony rule set, filling <c>balconySize</c> and <c>isBalconyGlassed</c>.</summary>
private void ExtractBalcony(HtmlParser parser, string data, IList<string> patterns, Entity ent)
{
    string text = string.Empty;
    string sizeUnit = Dicts.KV_M.ToLower();
    string glassedMarker = Dicts.IS_GLASSED.ToLower();
    int number;

    for (int i = 0; i < patterns.Count; i++)
    {
        string pattern = patterns[i];
        if (pattern.Contains(Commands.SPLIT) && !string.IsNullOrEmpty(text))
        {
            string[] parts = text.Split(this.ResolveSplitSeparator(pattern));
            if (parts.Length > 1)
            {
                if (parts[0].Trim().Contains(sizeUnit))
                {
                    parts[0] = parts[0].Replace(sizeUnit, string.Empty).Trim();
                    if (int.TryParse(parts[0], out number))
                    {
                        ent.balconySize = number;
                    }
                }
                else if (parts[0].Contains(glassedMarker))
                {
                    ent.isBalconyGlassed = true;
                }

                if (!ent.isBalconyGlassed && parts[1].Contains(glassedMarker))
                {
                    ent.isBalconyGlassed = true;
                }

                // The text has been consumed; later patterns start from the page again.
                text = string.Empty;
            }
        }
        else
        {
            text = parser.GetSingleContent(string.IsNullOrEmpty(text) ? data : text, pattern);
        }
    }

    if (string.IsNullOrEmpty(text))
    {
        return;
    }

    // No split rule consumed the text: interpret it as a whole.
    text = parser.RemoveTags(text);
    if (text.Trim().Contains(sizeUnit))
    {
        text = text.Replace(sizeUnit, string.Empty).Trim();
        if (int.TryParse(text, out number))
        {
            ent.balconySize = number;
        }
    }
    else if (text.Contains(glassedMarker))
    {
        ent.isBalconyGlassed = true;
    }
}

/// <summary>Fills <c>currency</c>, <c>priceFor</c> and <c>price</c> from the price text.</summary>
private void ExtractPrice(HtmlParser parser, string data, IList<string> patterns, Entity ent)
{
    string text = this.FollowTemplateChain(parser, data, patterns, string.Empty);

    // The first currency label found wins; it is removed together with all spaces.
    foreach (string currency in new string[] { Dicts.USD, Dicts.UAH, Dicts.EURO })
    {
        if (text.Contains(currency))
        {
            ent.currency = currency;
            text = text.Replace(currency, string.Empty).Replace(" ", string.Empty);
            break;
        }
    }

    // The same for the "price per ..." label, compared in lower case and without spaces.
    foreach (string period in new string[] { Dicts.ZA_KV_M, Dicts.ZA_MONTH, Dicts.ZA_OBJECT, Dicts.ZA_SUTKI })
    {
        string marker = period.ToLower().Replace(" ", string.Empty);
        if (text.ToLower().Contains(marker))
        {
            ent.priceFor = period;
            text = text.ToLower().Replace(marker, string.Empty);
            break;
        }
    }

    text = parser.RemoveTags(text);
    text = text.Replace(" ", string.Empty);

    int price;
    if (int.TryParse(text, out price))
    {
        ent.price = price;
    }
}

/// <summary>
/// Collects photo links. Patterns are normalised on a local copy only; writing the
/// normalised text back into the template (as was done before) made a second call with
/// the same template miss its loop and split rules.
/// </summary>
private List<string> ExtractPhotoLinks(HtmlParser parser, string data, string host, IList<string> patterns)
{
    var links = new List<string>();
    string text = string.Empty;

    for (int i = 0; i < patterns.Count; i++)
    {
        string pattern = patterns[i];

        if (pattern.Contains(Commands.LOOP))
        {
            pattern = parser.ClearContent(pattern.Replace(Commands.LOOP, string.Empty));

            if (!string.IsNullOrEmpty(text))
            {
                // Loop rule over the fragment found by the previous patterns.
                foreach (var item in parser.GetLinks(text, host, pattern))
                {
                    links.Add(item);
                }
            }
            else if (pattern.Contains(Commands.ATTRIBUTE) && pattern.Contains(Commands.SPLIT))
            {
                // Expected shape, e.g.:
                // [attribute]=onclick[split]<div class="preview-gallery">*</div>
                string[] rowData = pattern.Replace(Commands.SPLIT, "|").Split('|');
                if (rowData.Length == 2)
                {
                    // rowData[0] becomes the bare attribute name, rowData[1] is the search pattern.
                    string attribute = rowData[0].Replace(Commands.ATTRIBUTE, string.Empty).Replace("=", string.Empty);
                    foreach (var item in parser.GetAttributeValue(data, rowData[1], attribute))
                    {
                        links.Add(item);
                    }
                }
            }
        }
        else if (links.Count > 0)
        {
            if (pattern.Contains(Commands.REPLACE) && pattern.Contains(Commands.SPLIT))
            {
                string[] rowData = pattern.Replace(Commands.SPLIT, "|").Split('|');
                if (rowData.Length == 2)
                {
                    string oldValue = rowData[0].Replace(Commands.REPLACE, string.Empty).Replace("=", string.Empty);
                    string newValue = rowData[1];
                    for (int j = 0; j < links.Count; j++)
                    {
                        // A link without the expected fragment is most likely garbage and is blanked on purpose.
                        links[j] = links[j].Contains(oldValue)
                            ? links[j].Replace(oldValue, newValue)
                            : string.Empty;
                    }
                }
            }
            else
            {
                for (int j = 0; j < links.Count; j++)
                {
                    string refined = parser.GetSingleContent(links[j], pattern);
                    if (!string.IsNullOrEmpty(refined))
                    {
                        links[j] = refined;
                    }
                }
            }
        }
        else
        {
            text = parser.GetSingleContent(string.IsNullOrEmpty(text) ? data : text, pattern);
        }
    }

    return links;
}

/// <summary>Fills every field that is a plain <c>ExtractString</c> lookup of its own template list.</summary>
private void ExtractPlainFields(string data, Template template, Entity ent)
{
    // house_state and type_of_perekritiya are not extracted: their lookups were already
    // disabled (commented out) in this method.
    ent.wall = this.ExtractString(data, template.wall);
    ent.flat_planirovka = this.ExtractString(data, template.flat_planirovka);
    ent.visota_potolka = this.ExtractString(data, template.visota_potolka);
    ent.state_of_flat = this.ExtractString(data, template.state_of_flat);
    ent.uteplenie = this.ExtractString(data, template.uteplenie);
    ent.state_of_building = this.ExtractString(data, template.state_of_building);
    ent.building_type_of_perekritiya = this.ExtractString(data, template.building_type_of_perekritiya);
    ent.building_type_of_roof = this.ExtractString(data, template.building_type_of_roof);
    ent.characteristic_of_space_planirovka = this.ExtractString(data, template.characteristic_of_space_planirovka);
    ent.characteristic_of_space_some_feature = this.ExtractString(data, template.characteristic_of_space_some_feature);
    ent.characteristic_of_space_height = this.ExtractString(data, template.characteristic_of_space_height);
    ent.characteristic_of_space_state = this.ExtractString(data, template.characteristic_of_space_state);
    ent.characteristic_of_space_gips = this.ExtractString(data, template.characteristic_of_space_gips);
    ent.communication_gas = this.ExtractString(data, template.communication_gas);
    ent.communication_water = this.ExtractString(data, template.communication_water);
    ent.communication_heating = this.ExtractString(data, template.communication_heating);
    ent.communication_water_heating = this.ExtractString(data, template.communication_water_heating);
    ent.communication_tv = this.ExtractString(data, template.communication_tv);
    ent.communication_internet = this.ExtractString(data, template.communication_internet);
    ent.communication_phone = this.ExtractString(data, template.communication_phone);
    ent.communication_conditioner = this.ExtractString(data, template.communication_conditioner);
    ent.owner_object_type = this.ExtractString(data, template.owner_object_type);
    ent.nearest_metro = this.ExtractString(data, template.nearest_metro);
    ent.nearest_metro_distance = this.ExtractString(data, template.nearest_metro_distance);
    ent.nearest_metro_howto_get = this.ExtractString(data, template.nearest_metro_howto_get);
    ent.center_city_distance = this.ExtractString(data, template.center_city_distance);
    ent.center_city_howto_get = this.ExtractString(data, template.center_city_howto_get);
    ent.doors_and_windows_indoor = this.ExtractString(data, template.doors_and_windows_indoor);
    ent.doors_and_windows_window_type = this.ExtractString(data, template.doors_and_windows_window_type);
    ent.doors_and_windows_count_glass = this.ExtractString(data, template.doors_and_windows_count_glass);
    ent.school_distance = this.ExtractString(data, template.school_distance);
    ent.school_howto_get = this.ExtractString(data, template.school_howto_get);
    ent.childrengarden_distance = this.ExtractString(data, template.childrengarden_distance);
    ent.childrengarden_howto_get = this.ExtractString(data, template.childrengarden_howto_get);
    ent.policlinic_distance = this.ExtractString(data, template.policlinic_distance);
    ent.policlinic_howto_get = this.ExtractString(data, template.policlinic_howto_get);
    ent.market_distance = this.ExtractString(data, template.market_distance);
    ent.market_howto_get = this.ExtractString(data, template.market_howto_get);
    ent.relax_zone_type = this.ExtractString(data, template.relax_zone_type);
    ent.relax_zone_distance = this.ExtractString(data, template.relax_zone_distance);
    ent.relax_zone_howto_get = this.ExtractString(data, template.relax_zone_howto_get);
    ent.other_pravo_spbst_na_nedvig = this.ExtractString(data, template.other_pravo_spbst_na_nedvig);
    ent.other_pravo_spbst_na_zemlyu = this.ExtractString(data, template.other_pravo_spbst_na_zemlyu);
    ent.other_docs_na_pravo_sobstv = this.ExtractString(data, template.other_docs_na_pravo_sobstv);
    ent.other_obstoyatelstva = this.ExtractString(data, template.other_obstoyatelstva);
    ent.building_haracter_class_object = this.ExtractString(data, template.building_haracter_class_object);
    ent.building_haracter_building_year = this.ExtractString(data, template.building_haracter_building_year);
    ent.building_haracter_state = this.ExtractString(data, template.building_haracter_state);
    ent.haracter_bussiness_pravovaya_forma = this.ExtractString(data, template.haracter_bussiness_pravovaya_forma);
    ent.haracter_bussiness_srok_okupaemosti = this.ExtractString(data, template.haracter_bussiness_srok_okupaemosti);
    ent.haracter_bussiness_average_income = this.ExtractString(data, template.haracter_bussiness_average_income);
    ent.haracter_bussiness_debit_dolg = this.ExtractString(data, template.haracter_bussiness_debit_dolg);
    ent.haracter_bussiness_credit_dolg = this.ExtractString(data, template.haracter_bussiness_credit_dolg);
    ent.haracter_bussiness_count_empl = this.ExtractString(data, template.haracter_bussiness_count_empl);
    ent.haracter_bussiness_selling_part = this.ExtractString(data, template.haracter_bussiness_selling_part);
    ent.space_poshad_hoz_pomesh = this.ExtractString(data, template.space_poshad_hoz_pomesh);
    ent.space_poshad_torg_zala = this.ExtractString(data, template.space_poshad_torg_zala);
    ent.space_poshad_sklad = this.ExtractString(data, template.space_poshad_sklad);
    ent.space_poshad_rampa = this.ExtractString(data, template.space_poshad_rampa);
    ent.parking_count_place = this.ExtractString(data, template.parking_count_place);
    ent.parking_type = this.ExtractString(data, template.parking_type);
    ent.parking_howto_get = this.ExtractString(data, template.parking_howto_get);
    ent.parking_distance = this.ExtractString(data, template.parking_distance);
    ent.offer_type = this.ExtractString(data, template.offer_type);
}
/// <summary>
/// Runs the patterns in <paramref name="template"/> one after another through
/// <c>HtmlParser.GetSingleContent</c> and returns the final text after
/// <c>HtmlParser.RemoveTags</c> has been applied to it.
/// </summary>
/// <param name="data">Source text handed to the first extraction step.</param>
/// <param name="template">Ordered list of patterns; each step narrows the previous result.</param>
/// <returns>The value produced by <c>RemoveTags</c> on the last extraction result.</returns>
public string ExtractString(string data, List<string> template)
{
    // Timing is only collected when the debug flag is set.
    Stopwatch timer = this.debugMode ? Stopwatch.StartNew() : null;

    HtmlParser htmlParser = new HtmlParser();
    string extracted = string.Empty;

    int steps = template.Count();
    for (int step = 0; step < steps; step++)
    {
        // While nothing has been extracted yet (or a step produced an empty
        // value) the search runs against the full input again.
        string input = string.IsNullOrEmpty(extracted) ? data : extracted;
        extracted = htmlParser.GetSingleContent(input, template[step]);
    }

    extracted = htmlParser.RemoveTags(extracted);

    if (this.debugMode)
    {
        timer.Stop();
        Console.WriteLine("ExtractString in " + timer.ElapsedMilliseconds.ToString());
    }

    return extracted;
}
/// <summary>
/// Fills <c>ent.phone1</c> by applying the patterns in <paramref name="template"/>
/// in order via <c>HtmlParser.GetSingleContent</c>, then passes the result
/// through <c>HtmlParser.RemoveTags</c>.
/// </summary>
/// <param name="content">Text searched while <c>ent.phone1</c> is still null or empty.</param>
/// <param name="ent">Entity whose <c>phone1</c> member is read and overwritten.</param>
/// <param name="template">Ordered patterns; each step narrows the previous result.</param>
public void ExtractPhone(string content, Entity ent, string[] template)
{
    HtmlParser htmlParser = new HtmlParser();

    int steps = template.Count();
    for (int step = 0; step < steps; step++)
    {
        // A non-empty phone1 (including one set before this call) becomes the
        // input of the next step; otherwise the full content is searched.
        string input = string.IsNullOrEmpty(ent.phone1) ? content : ent.phone1;
        ent.phone1 = htmlParser.GetSingleContent(input, template[step]);
    }

    ent.phone1 = htmlParser.RemoveTags(ent.phone1);
}
/// <summary>
/// Collects the links that <c>HtmlParser.GetLinks</c> finds in
/// <paramref name="data"/> using the first link pattern of the template loaded
/// from "template1.txt".
/// </summary>
/// <param name="data">Text to search for links.</param>
/// <param name="host">Host value forwarded unchanged to <c>GetLinks</c>.</param>
/// <returns>
/// The <c>ClearContent</c> form of every link whose cleared, trimmed text is not empty.
/// </returns>
public List<string> ExtractLinksToObjects(string data, string host)
{
    Template linkTemplate = new Template("template1.txt");
    HtmlParser htmlParser = new HtmlParser();
    List<string> links = new List<string>();

    // The first link pattern is normalised in place before it is used.
    linkTemplate.links[0] = htmlParser.ClearContent(linkTemplate.links[0]);

    foreach (var rawLink in htmlParser.GetLinks(data, host, linkTemplate.links[0]))
    {
        if (string.IsNullOrEmpty(htmlParser.ClearContent(rawLink).Trim()))
        {
            continue;
        }

        // NOTE(review): the stored value is the cleared but *untrimmed* link,
        // obtained from a second ClearContent call - confirm this is intended.
        links.Add(htmlParser.ClearContent(rawLink));
    }

    return links;
}
/// <summary>
/// Parses http://www.craigslist.org/about/sites and records what it finds in
/// <c>LocationDictionary</c> as area -> state -> (city -> website).
/// </summary>
/// <remarks>
/// For every top-level "html" node the method:
/// 1. asks the node for its "div" children and filters them with the class
///    names "jump_to_continents" and "colmask";
/// 2. reads each child of a "jump_to_continents" element into an
///    abbreviation -> area-name map;
/// 3. walks the h1 / div / a tags selected from a "colmask" element, treating
///    h1 as an area header, div as a state header and a as a city link, and
///    replaces the "colmask" child with the filtered tag.
/// Returns without doing anything when <c>ParseURL</c> reports failure.
/// A city link whose area or state was never registered is not guarded and
/// still fails in the dictionary lookup, as it did before.
/// </remarks>
public void DownloadLocations()
{
    string site = "http://www.craigslist.org/about/sites";
    HtmlParser.HtmlParser parser = new HtmlParser.HtmlParser();
    if (!parser.ParseURL(site, false, new string[] { }))
    {
        return;
    }

    List<HtmlParser.HtmlTag> nodes = parser._nodes;
    for (int i = 0; i < nodes.Count; i++)
    {
        if (nodes[i].Name != "html")
        {
            continue;
        }

        HtmlParser.HtmlTag new_parent = null;
        nodes[i].FilterForChildrenByName("div", out new_parent);
        if (new_parent == null)
        {
            // The filter produced no tag; previously this was dereferenced
            // unconditionally and raised a NullReferenceException.
            continue;
        }

        List<string> class_names = new List<string> { "jump_to_continents", "colmask" };
        Dictionary<string, List<string>> filter_for = new Dictionary<string, List<string>>();
        filter_for.Add("class", class_names);
        // NOTE(review): presumably this leaves only children carrying one of
        // the listed classes - confirm against HtmlTag.
        new_parent.FilterOutChildrenByAttribute(filter_for);

        Dictionary<string, string> abbr_to_area = new Dictionary<string, string>();

        // Index loop on purpose: the "colmask" branch replaces Children[z].
        for (int z = 0; z < new_parent.Children.Count; z++)
        {
            HtmlParser.HtmlTag child = new_parent.Children[z];

            if (child.Attributes.Contains(new KeyValuePair<string, string>("class", "jump_to_continents")))
            {
                foreach (HtmlParser.HtmlTag link in child.Children)
                {
                    // The first attribute value minus its leading character is
                    // the key (presumably an "#xx" anchor href - confirm).
                    string abbreviation = link.Attributes.ElementAt(0).Value;
                    abbreviation = abbreviation.Substring(1);
                    abbr_to_area.Add(abbreviation, link.Value);
                }
            }
            else if (child.Attributes.Contains(new KeyValuePair<string, string>("class", "colmask")))
            {
                HtmlParser.HtmlTag filtered = new HtmlParser.HtmlTag();
                Dictionary<string, KeyValuePair<string, string>> tag_list = new Dictionary<string, KeyValuePair<string, string>>();
                tag_list.Add("h1", new KeyValuePair<string, string>("class", "continent_header"));
                tag_list.Add("div", new KeyValuePair<string, string>("class", "state_delimiter"));
                tag_list.Add("a", new KeyValuePair<string, string>("href", "*"));
                child.FilterForChildrenByNameAndAttribute(tag_list, ref filtered);

                string current_area = "";
                string current_state = "";

                for (int n = 0; n < filtered.Children.Count; n++)
                {
                    HtmlParser.HtmlTag tag = filtered.Children[n];
                    bool unknown_area = false;

                    switch (tag.Name)
                    {
                        /*area*/
                        case "h1":
                        {
                            string area_key = tag.Children[0].Attributes.ElementAt(0).Value;
                            if (!abbr_to_area.TryGetValue(area_key, out current_area))
                            {
                                unknown_area = true;
                                break;
                            }

                            if (!LocationDictionary.ContainsKey(current_area))
                            {
                                LocationDictionary.Add(current_area, new Dictionary<string, Dictionary<string, string>>());
                            }
                            break;
                        }

                        /*state*/
                        case "div":
                        {
                            current_state = tag.Value ?? "Unspecified";

                            // A state seen before any area header is skipped
                            // explicitly. The old code relied on a catch-all
                            // that swallowed every exception to do this.
                            if (LocationDictionary.ContainsKey(current_area)
                                && !LocationDictionary[current_area].ContainsKey(current_state))
                            {
                                LocationDictionary[current_area].Add(current_state, new Dictionary<string, string>());
                            }
                            break;
                        }

                        /*city*/
                        case "a":
                        {
                            string city = tag.Value;
                            string website = tag.Attributes.ElementAt(0).Value;
                            LocationDictionary[current_area][current_state].Add(city, website);
                            break;
                        }
                    }

                    // An area header missing from the abbreviation map ends
                    // processing of this column block.
                    if (unknown_area)
                    {
                        break;
                    }
                }

                new_parent.Children[z] = filtered;
            }
        }
    }
}