/// <summary>
/// Saves an offline copy (HTML page plus its images) of every advertisement that has a folder
/// inside the target directories listed in <c>textBox2</c> (separated by ';').
/// </summary>
/// <remarks>
/// For each advertisement folder whose name starts with a numeric id inside the
/// <c>fromBox</c>..<c>toBox</c> range and that has no saved page yet, the method looks up the
/// advertisement link in the database, opens it in headless Chrome, reveals the contacts,
/// expands the description, downloads the images into the advertisement folder and writes
/// the page next to that folder. Work stops early when <c>_stopFlag</c> is set or when the
/// site answers with its anti-bot page. The database connection is closed and the form
/// controls are re-enabled exactly once, when all targets are done or the method fails.
/// The method calls <c>Invoke</c> for every UI update, so it is presumably run on a worker
/// thread - confirm against the caller.
/// </remarks>
private void UploadAdvs()
{
    // Puts the form back into its idle state (the same control set the method is expected to unlock).
    void RestoreControls()
    {
        Invoke(new Action(() =>
        {
            statusLabel.Text = STATUS_BAR_DEFAULT;
            uploadButton.Enabled = true;
            exploreButton.Enabled = true;
            parseButton.Enabled = true;
            exportButton.Enabled = true;
            textBox2.Enabled = true;
            stopButton.Enabled = false;
            fromBox.Enabled = true;
            toBox.Enabled = true;
        }));
    }

    if (!_db.Connect())
    {
        MessageBox.Show(General.ERROR_DB_CONNECT);
        _db.Close();
        RestoreControls();
        return;
    }

    var captchaDetected = false;
    try
    {
        // Empty entries (e.g. a trailing ';') are dropped: Directory.GetDirectories("") throws.
        var targets = textBox2.Text.Split(new[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
        var processed = 0;

        foreach (var target in targets)
        {
            if (_stopFlag || captchaDetected)
            {
                break;
            }

            if (!Directory.Exists(target))
            {
                General.WriteLog("Target directory not found, skipped: " + target);
                continue;
            }

            var paths = Directory.GetDirectories(target);
            Invoke(new Action(() => { statusLabel.Text = "Инициализация Selenium..."; }));

            var option = new ChromeOptions();
            option.AddArgument("--headless");
            option.AddArgument("--user-agent=" + General.DEFAULT_USER_AGENT);
            //option.AddArgument("--proxy-server=socks5://109.234.35.41:8888");
            var service = ChromeDriverService.CreateDefaultService();
            service.HideCommandPromptWindow = true;

            IWebDriver browser = null;
            try
            {
                browser = new ChromeDriver(service, option);

                // TODO: verify that the browser really ended up with a Farpost session.
                browser.Navigate().GoToUrl("https://www.farpost.ru/vladivostok/realty/sell_flats/?page=1");

                var total = (int)(toBox.Value - fromBox.Value) + 1;

                foreach (var path in paths)
                {
                    if (_stopFlag)
                    {
                        break;
                    }

                    // The folder name is the last '\'-separated segment: "<id>_<...>".
                    var advName = path.Substring(path.LastIndexOf('\\') + 1);
                    var nlink = target + "\\" + advName;

                    // Already saved on a previous run (as a live or an archived page).
                    if (File.Exists(nlink + ".html") || File.Exists(nlink + "_arch.html"))
                    {
                        processed++;
                        continue;
                    }

                    var idStr = advName.Split('_')[0];
                    if (!int.TryParse(idStr, out int id))
                    {
                        processed++;
                        continue;
                    }

                    if (id < fromBox.Value || id > toBox.Value)
                    {
                        processed++;
                        continue;
                    }

                    Invoke(new Action(() =>
                    {
                        statusLabel.Text = "Загрузка объявления #" + idStr + " [" + processed + "/" + total + "]";
                    }));

                    var link = _db.GetLinkById(id);
                    if (link == null)
                    {
                        // Without this guard the retry loop below would spin forever on a null address.
                        // NOTE(review): assumes GetLinkById reports an unknown id as null - confirm.
                        General.WriteLog("No link stored for advertisement #" + idStr + ", skipped.");
                        processed++;
                        continue;
                    }

                    // Keep retrying until the page loads; only a stop request or the anti-bot page ends the loop.
                    var pageLoaded = false;
                    while (!_stopFlag)
                    {
                        try
                        {
                            browser.Navigate().GoToUrl(link);
                            if (browser.PageSource.Contains("Из вашей подсети наблюдается подозрительная активность"))
                            {
                                captchaDetected = true;
                            }
                            else
                            {
                                pageLoaded = true;
                            }

                            break;
                        }
                        catch (Exception ex)
                        {
                            General.WriteLog(ex.Message);
                            Thread.Sleep(10000);
                        }
                    }

                    if (!pageLoaded)
                    {
                        // Stopped or blocked: the browser does not show this advertisement,
                        // so nothing must be saved under its name.
                        break;
                    }

                    if (browser.PageSource.Contains("Объявление находится в архиве и может быть неактуальным."))
                    {
                        nlink += "_arch";
                    }
                    else
                    {
                        RevealAdvContacts(browser);
                    }

                    ExpandAdvDescription(browser);

                    var content = browser.PageSource;
                    var doc = new HtmlAgilityPack.HtmlDocument();
                    doc.LoadHtml(content);

                    content = DownloadAdvImages(doc, content, advName, path);

                    // The "removed advertisement" threshold below used to be applied to the file
                    // written without link rewriting (UTF-8 with BOM). Classify on that same size
                    // so that making links absolute does not shift which pages count as removed.
                    var sizeBeforeLinkRewrite = Encoding.UTF8.GetPreamble().Length + Encoding.UTF8.GetByteCount(content);

                    content = AbsolutizeAdvLinks(doc, content);

                    var htmled = nlink + ".html";
                    File.WriteAllText(htmled, content, Encoding.UTF8);

                    if (sizeBeforeLinkRewrite <= 25600)
                    {
                        // Suspiciously small page: mark it as removed. Such files are not covered by
                        // the "already saved" check above, so a leftover from a previous run may
                        // exist and File.Move would throw on it.
                        var removedName = nlink + "_udalen.html";
                        if (File.Exists(removedName))
                        {
                            File.Delete(removedName);
                        }

                        File.Move(htmled, removedName);
                    }

                    // NOTE(review): the database receives the ".html" path even when the file was
                    // renamed to "_udalen.html" above; kept as in the original - confirm intent.
                    _db.ChangeAdvNLink(id, htmled);
                    General.Delay();
                    processed++;
                }
            }
            finally
            {
                // Quit the browser first, then the driver service; runs on every exit path.
                if (browser != null)
                {
                    browser.Dispose();
                }

                service.Dispose();
            }
        }
    }
    finally
    {
        try
        {
            _db.Close();
        }
        finally
        {
            RestoreControls();
        }
    }

    if (captchaDetected)
    {
        // Shown after the cleanup, as before: the form is already usable when the box appears.
        MessageBox.Show("Капча! Завершаем работу.");
    }
}

/// <summary>
/// Clicks the "show contacts" link and waits for the contacts marker element; up to 5 attempts.
/// </summary>
/// <param name="browser">Driver that currently shows the advertisement page.</param>
private void RevealAdvContacts(IWebDriver browser)
{
    for (var attempt = 0; attempt < 5 && !_stopFlag; attempt++)
    {
        try
        {
            browser.FindElement(By.PartialLinkText("Показать контакты")).Click();
            if (WaitForAdvElement(browser, By.ClassName("dummy-listener_new-contacts"), 20, 500))
            {
                return;
            }
        }
        catch (Exception ex)
        {
            General.WriteLog(ex.Message);
            Thread.Sleep(1000);
        }
    }
}

/// <summary>
/// Clicks the "expand" button, when the page has one, and waits for the expanded-state marker.
/// Best effort: failures are logged and the page is saved as it is.
/// </summary>
/// <param name="browser">Driver that currently shows the advertisement page.</param>
private void ExpandAdvDescription(IWebDriver browser)
{
    try
    {
        var buttons = browser.FindElements(By.ClassName("expand--button"));
        if (buttons.Count == 0)
        {
            return; // nothing to expand on this page
        }

        buttons[0].Click();
        WaitForAdvElement(browser, By.ClassName("mod__active"), 20, 1000);
    }
    catch (Exception ex)
    {
        General.WriteLog(ex.Message);
    }
}

/// <summary>
/// Polls the page until an element matching <paramref name="locator"/> is present.
/// </summary>
/// <param name="browser">Driver to query.</param>
/// <param name="locator">Element locator to look for.</param>
/// <param name="attempts">Maximum number of polls.</param>
/// <param name="pauseMs">Pause after each unsuccessful poll, in milliseconds.</param>
/// <returns><c>true</c> when the element was found; <c>false</c> on timeout or stop request.</returns>
private bool WaitForAdvElement(IWebDriver browser, By locator, int attempts, int pauseMs)
{
    for (var attempt = 0; attempt < attempts && !_stopFlag; attempt++)
    {
        // FindElements returns an empty collection instead of throwing when nothing matches.
        if (browser.FindElements(locator).Count > 0)
        {
            return true;
        }

        Thread.Sleep(pauseMs);
    }

    return false;
}

/// <summary>
/// Downloads every distinct <c>img</c> source of the page into <paramref name="advFolder"/> as
/// "0.ext", "1.ext", ... and points the HTML at the local copies.
/// </summary>
/// <param name="doc">Parsed page.</param>
/// <param name="content">Page HTML to patch.</param>
/// <param name="advName">Advertisement folder name, used as the relative path prefix in the HTML.</param>
/// <param name="advFolder">Full path of the advertisement folder that receives the files.</param>
/// <returns>The HTML with the sources of successfully downloaded images replaced.</returns>
private string DownloadAdvImages(HtmlAgilityPack.HtmlDocument doc, string content, string advName, string advFolder)
{
    var sources = new List<string>();
    var seen = new HashSet<string>();
    var imgNodes = doc.DocumentNode.SelectNodes("//img");
    if (imgNodes != null)
    {
        foreach (var node in imgNodes)
        {
            // An <img> without src has no "src" attribute object at all.
            var srcAttribute = node.Attributes["src"];
            if (srcAttribute == null || string.IsNullOrEmpty(srcAttribute.Value))
            {
                continue;
            }

            if (seen.Add(srcAttribute.Value))
            {
                sources.Add(srcAttribute.Value);
            }
        }
    }

    var index = 0;
    using (var wc = new WebClient())
    {
        foreach (var src in sources)
        {
            if (_stopFlag)
            {
                break;
            }

            try
            {
                var slash = src.LastIndexOf('/');
                if (slash == -1)
                {
                    continue;
                }

                var fileName = src.Substring(slash + 1);
                var dot = fileName.LastIndexOf('.');
                var ext = dot != -1 ? fileName.Substring(dot) : ".jpg";
                var pictureName = index.ToString() + ext;

                // Download first: the HTML must only reference files that really exist,
                // and a failed download must not consume a file name.
                wc.DownloadFile(src, Path.Combine(advFolder, pictureName));
                content = content.Replace(src, Path.Combine(advName, pictureName));
                index++;
            }
            catch (Exception ex)
            {
                General.WriteLog(ex.Message);
            }
        }
    }

    return content;
}

/// <summary>
/// Rewrites root-relative ("/path") and protocol-relative ("//host/path") <c>href</c> values so
/// that the saved page keeps working outside the site. Other values (absolute URLs, "#",
/// "javascript:", ...) are left untouched.
/// </summary>
/// <param name="doc">Parsed page.</param>
/// <param name="content">Page HTML to patch.</param>
/// <returns>The patched HTML.</returns>
private static string AbsolutizeAdvLinks(HtmlAgilityPack.HtmlDocument doc, string content)
{
    // "@href" selects by attribute; a bare "href" would look for a child element named href.
    var hrefNodes = doc.DocumentNode.SelectNodes("//*[@href]");
    if (hrefNodes == null)
    {
        return content;
    }

    var handled = new HashSet<string>();
    foreach (var node in hrefNodes)
    {
        var href = node.Attributes["href"].Value;
        if (string.IsNullOrEmpty(href) || !handled.Add(href))
        {
            continue;
        }

        string absolute;
        if (href.StartsWith("//", StringComparison.Ordinal))
        {
            absolute = "https:" + href;
        }
        else if (href.StartsWith("/", StringComparison.Ordinal))
        {
            absolute = "https://www.farpost.ru" + href;
        }
        else
        {
            continue;
        }

        // Anchor the replacement to the attribute itself; replacing the bare value
        // (e.g. "/") would corrupt unrelated parts of the document.
        content = content.Replace("href=\"" + href + "\"", "href=\"" + absolute + "\"");
    }

    return content;
}
/// <summary>
/// Unhandled-exception event handler: writes the full text of the reported exception
/// to the log through <c>General.WriteLog</c>.
/// </summary>
/// <param name="sender">Source of the event (not used).</param>
/// <param name="e">Event data; its <c>ExceptionObject</c> is cast to <see cref="Exception"/> and logged.</param>
private static void UnhandledExceptionOccured(object sender, UnhandledExceptionEventArgs e)
{
    General.WriteLog(((Exception)e.ExceptionObject).ToString());
}
/// <summary>
/// Loads a feed (listing) page and returns the advertisements found on it.
/// </summary>
/// <param name="url">Address of the feed page.</param>
/// <param name="saveTo">
/// File to save a copy of the page to (with site-relative links made absolute),
/// or <c>null</c> to skip saving. The value is also stored in each advertisement's <c>CurLink</c>.
/// </param>
/// <returns>
/// Advertisements parsed from both supported markups (table rows and table cells).
/// Items that cannot be parsed are logged and skipped.
/// </returns>
public static List<Advertisement> GetDataFromFeed(string url, string saveTo)
{
    var result = new List<Advertisement>();
    var web = new HtmlWeb
    {
        AutoDetectEncoding = false,
        // NOTE(review): Encoding.Default is machine-dependent; presumably the site's legacy
        // code page is expected here - confirm before running on another platform.
        OverrideEncoding = Encoding.Default,
        UserAgent = General.DEFAULT_USER_AGENT
    };
    var doc = web.Load(url);

    var cityDom = doc.DocumentNode.SelectNodes("//a[contains(@class, 'cityPop')]");
    var pageCity = cityDom != null ? cityDom[0].InnerText.Trim() : string.Empty;

    // Region-wide feeds carry the city inside every item instead of the page header.
    var useAnnotations = url.Contains("primorskii-krai");

    if (saveTo != null)
    {
        var content = doc.DocumentNode.InnerHtml;
        var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
        if (anchors != null)
        {
            var handled = new HashSet<string>();
            foreach (var anchor in anchors)
            {
                var val = anchor.Attributes["href"].Value.Trim();
                if (!handled.Add(val))
                {
                    continue;
                }

                // Only site-relative addresses are rewritten. Prefixing the host to "", "#",
                // "javascript:..." or "//host/..." would produce broken links (and an empty
                // value would rewrite every empty attribute in the document).
                string absolute;
                if (val.StartsWith("//", StringComparison.Ordinal))
                {
                    absolute = "https:" + val;
                }
                else if (val.StartsWith("/", StringComparison.Ordinal))
                {
                    absolute = "https://www.farpost.ru" + val;
                }
                else
                {
                    continue;
                }

                content = content.Replace('"' + val + '"', '"' + absolute + '"');
            }
        }

        File.WriteAllText(saveTo, content, Encoding.UTF8);
    }

    // The feed comes in two markups; each pair is { item XPath, title XPath inside the item }.
    var layouts = new[]
    {
        new[]
        {
            "//tr[contains(@class,'bull-item') and not(@data-accuracy)]",
            ".//td[@class='descriptionCell']/a[@name and @data-stat]"
        },
        new[]
        {
            "//td[contains(@class,'bull-item') and not(@data-accuracy)]",
            ".//div[@class='title']/a"
        }
    };

    foreach (var layout in layouts)
    {
        var items = doc.DocumentNode.SelectNodes(layout[0]);
        if (items == null)
        {
            continue;
        }

        foreach (var item in items)
        {
            try
            {
                var parsed = ParseFeedAdvertisement(item, layout[1], pageCity, useAnnotations, saveTo);
                if (parsed != null)
                {
                    result.Add(parsed);
                }
                else
                {
                    General.WriteLog("Feed item skipped: title, number or link not found (" + url + ")");
                }
            }
            catch (Exception ex)
            {
                // One malformed item must not abort the whole page.
                General.WriteLog(ex.ToString());
            }
        }
    }

    return result;
}

/// <summary>
/// Builds an <see cref="Advertisement"/> from a single feed item node.
/// </summary>
/// <param name="adv">Item node (a table row or cell, depending on the markup).</param>
/// <param name="titleXPath">XPath, relative to <paramref name="adv"/>, of the title link.</param>
/// <param name="pageCity">City taken from the page header; used unless the item provides its own.</param>
/// <param name="useAnnotations">Whether the city should be read from the item itself.</param>
/// <param name="saveTo">Value for <c>CurLink</c> (path of the saved feed page, may be <c>null</c>).</param>
/// <returns>The advertisement, or <c>null</c> when the title, number or link is missing.</returns>
private static Advertisement ParseFeedAdvertisement(HtmlAgilityPack.HtmlNode adv, string titleXPath, string pageCity, bool useAnnotations, string saveTo)
{
    var titleDom = adv.SelectSingleNode(titleXPath);
    var numberDom = adv.SelectSingleNode(".//a[@name!='']");
    var linkDom = adv.SelectSingleNode(".//a[@href!='#']");
    if (titleDom == null || numberDom == null || linkDom == null)
    {
        return null;
    }

    var title = WebUtility.HtmlDecode(titleDom.InnerText).Trim();
    var number = numberDom.Attributes["name"].Value.Trim();
    var link = linkDom.Attributes["href"].Value.Trim();

    var geo = string.Empty;
    var geoDom = adv.SelectSingleNode(".//a[@data-geo]");
    if (geoDom != null)
    {
        geo = geoDom.Attributes["data-geo"].Value.Trim();
    }

    var views = 0;
    var viewsDom = adv.SelectSingleNode(".//*[contains(@class,'views')]");
    if (viewsDom != null)
    {
        int.TryParse(viewsDom.InnerHtml, out views);
    }

    // The price may be absent.
    var price = "n/a";
    var priceDom = adv.SelectSingleNode(".//span[@data-role='price']");
    if (priceDom != null)
    {
        price = WebUtility.HtmlDecode(priceDom.InnerText).Trim().Replace("₽", "");
    }

    // The area may be absent as well.
    string[] annotations = null;
    var square = "n/a";
    var squareDom = adv.SelectSingleNode(".//div[contains(@class, 'annotation')]");
    if (squareDom != null)
    {
        annotations = squareDom.InnerText.Split(new[] { ", " }, StringSplitOptions.RemoveEmptyEntries);
        foreach (var part in annotations)
        {
            if (part.Contains("кв."))
            {
                square = WebUtility.HtmlDecode(part).Trim();
                break;
            }
        }
    }

    var city = pageCity;
    if (useAnnotations)
    {
        var cityDivDom = adv.SelectSingleNode(".//div[contains(@class, 'city')]");
        if (cityDivDom != null)
        {
            city = cityDivDom.InnerText.Trim();
        }
        else if (annotations != null && annotations.Length > 0)
        {
            // Fallback: take the last annotation and cut everything up to and including "м..".
            var last = annotations[annotations.Length - 1];
            var idx = last.IndexOf("м..", StringComparison.Ordinal);
            if (idx != -1)
            {
                last = last.Remove(0, idx + 3);
            }

            city = last.Trim();
        }
        // Otherwise the item has neither a city block nor annotations: keep the page-level city
        // instead of failing on the missing annotations and losing the whole advertisement.
    }

    return new Advertisement()
    {
        Number = number,
        Link = "https://www.farpost.ru" + link,
        Title = title,
        Price = price,
        Square = square,
        Geo = geo,
        Type = GetTypeByLink(link),
        Views = views,
        Annotations = annotations,
        City = city,
        CurLink = saveTo
    };
}