/// <summary>
/// Parses the saved advertisement pages of every folder listed in <c>textBox2</c>
/// (entries separated by ';') and writes the extracted fields to the database.
/// </summary>
/// <remarks>
/// Only sub-folders whose leading numeric id lies within
/// [<c>fromBox.Value</c>, <c>toBox.Value</c>] are processed. The page of a sub-folder is looked up
/// next to it as <c>&lt;name&gt;.html</c>, <c>&lt;name&gt;_arch.html</c> or <c>&lt;name&gt;_udalen.html</c>.
/// Control updates go through <c>Invoke</c>, so this presumably runs on a background thread —
/// confirm against the caller.
/// </remarks>
private void ParseAdvs()
{
    // Pages at or below this size are renamed to *_udalen.html and get status 'u'
    // (presumably the stub served for a removed advertisement — confirm).
    const long StubPageMaxBytes = 25600;
    const string PlainSuffix = ".html";
    const string ArchivedSuffix = "_arch.html";
    const string RemovedSuffix = "_udalen.html";

    // Puts the controls back into their idle state. Called through Invoke only.
    void ResetControls()
    {
        statusLabel.Text = STATUS_BAR_DEFAULT;
        uploadButton.Enabled = true;
        exploreButton.Enabled = true;
        parseButton.Enabled = true;
        exportButton.Enabled = true;
        textBox2.Enabled = true;
        fromBox.Enabled = true;
        toBox.Enabled = true;
        // Previously skipped on the connect-failure path, which left the stop button enabled.
        stopButton.Enabled = false;
    }

    if (!_db.Connect())
    {
        MessageBox.Show(General.ERROR_DB_CONNECT);
        Invoke(new Action(ResetControls));
        _db.Close();
        return;
    }

    try
    {
        var targets = textBox2.Text.Split(';');
        var rangeFrom = fromBox.Value;
        var rangeTo = toBox.Value;
        var def = (int)(rangeTo - rangeFrom) + 1;
        var russianCulture = new CultureInfo("ru-RU");
        var pageSuffixes = new[] { PlainSuffix, ArchivedSuffix, RemovedSuffix };
        uint i = 0;

        foreach (var target in targets)
        {
            // A trailing ';' produces an empty entry; Directory.GetDirectories would throw on it.
            if (string.IsNullOrWhiteSpace(target))
            {
                continue;
            }

            var source = target.Contains("_arc") ? 2 : 1;
            var dirs = Directory.GetDirectories(target);

            // The 3rd and 4th '_'-separated parts of the folder name form the type code
            // compared below (e.g. "КН_ар"). Shorter names yield an empty type, not a crash.
            var pathElements = target.Split('\\');
            var nameParts = pathElements[pathElements.Length - 1].Split('_');
            var type = nameParts.Length > 3 ? nameParts[2] + "_" + nameParts[3] : string.Empty;

            foreach (var dir in dirs)
            {
                if (_stopFlag)
                {
                    break;
                }

                var singleName = Path.GetFileName(dir);
                var idStr = singleName.Split('_')[0];
                if (!int.TryParse(idStr, out int id) || id < rangeFrom || id > rangeTo)
                {
                    i++;
                    continue;
                }

                // Find the saved page: plain first, then archived, then removed.
                string fileName = null;
                string matchedSuffix = null;
                foreach (var suffix in pageSuffixes)
                {
                    var candidate = target + "\\" + singleName + suffix;
                    if (File.Exists(candidate))
                    {
                        fileName = candidate;
                        matchedSuffix = suffix;
                        break;
                    }
                }

                if (fileName == null)
                {
                    i++;
                    continue;
                }

                // The status is derived from the matched suffix, not from a substring search
                // over the whole path, so "arch"/"udalen" in a folder name cannot misclassify.
                if (matchedSuffix == RemovedSuffix)
                {
                    _db.UpdateStatus(id, 'u', source);
                    i++; // keep the progress counter in step with the other skip paths
                    continue;
                }

                if (new FileInfo(fileName).Length <= StubPageMaxBytes)
                {
                    var removedName = target + "\\" + singleName + RemovedSuffix;
                    // File.Move throws if the destination exists; File.Delete is a no-op otherwise.
                    File.Delete(removedName);
                    File.Move(fileName, removedName);
                    _db.UpdateStatus(id, 'u', source);
                    i++;
                    continue;
                }

                var status = matchedSuffix == ArchivedSuffix ? 'a' : '-';
                _db.UpdateStatus(id, status, source);

                var doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(File.ReadAllText(fileName));

                var restrictions = General.ExtractValueFromDoc(SiteType.Farpost, "restrictions", doc);
                var agency = General.ExtractValueFromDoc(SiteType.Farpost, "agency", doc);
                var rooms = General.ExtractValueFromDoc(SiteType.Farpost, "rooms", doc);
                var description = General.ExtractValueFromDoc(SiteType.Farpost, "description", doc);
                _db.UpdateAdvData(id, restrictions, string.Empty, agency, rooms, description);

                // "added" is expected as "<HH:mm>, <date>"; the date is "сегодня", "вчера",
                // or a Russian "d MMMM[ yyyy]" date.
                var added = General.ExtractValueFromDoc(SiteType.Farpost, "added", doc);
                if (!String.IsNullOrEmpty(added))
                {
                    // Relative dates are resolved against the creation time of the saved file.
                    var savedDateTime = File.GetCreationTime(fileName);
                    var hour = 0;
                    var minute = 0;
                    var dateTime = DateTime.MinValue;
                    var addedArr = added.Split(',');
                    if (addedArr.Length >= 2)
                    {
                        var timeArr = addedArr[0].Trim().Split(':');
                        if (timeArr.Length >= 2)
                        {
                            int.TryParse(timeArr[0], out hour);
                            int.TryParse(timeArr[1], out minute);
                        }

                        var date = addedArr[1].Trim().ToLower();
                        if (date == "сегодня")
                        {
                            dateTime = new DateTime(savedDateTime.Year, savedDateTime.Month, savedDateTime.Day, hour, minute, 0);
                            date = "Сегодня";
                        }
                        else if (date == "вчера")
                        {
                            // DateTime is immutable: AddDays returns a new value. The result used
                            // to be discarded, so "yesterday" was stored as today.
                            dateTime = new DateTime(savedDateTime.Year, savedDateTime.Month, savedDateTime.Day, hour, minute, 0).AddDays(-1);
                            date = "Вчера";
                        }
                        else
                        {
                            var format = date.Split(' ').Length == 2 ? "d MMMM" : "d MMMM yyyy";
                            DateTime.TryParseExact(date, format, russianCulture, DateTimeStyles.None, out dateTime);
                        }

                        _db.UpdateDate(id, dateTime, date);
                    }
                }

                var username = General.ExtractValueFromDoc(SiteType.Farpost, "username", doc);
                var phone = General.ExtractValueFromDoc(SiteType.Farpost, "phone", doc);
                var email = General.ExtractValueFromDoc(SiteType.Farpost, "email", doc);
                var advsStr = General.GetNumbers(General.ExtractValueFromDoc(SiteType.Farpost, "advs", doc));
                int.TryParse(advsStr, out int advs);
                _db.UpdateUserInfo(id, username, phone, email, advs);

                var address = General.ExtractValueFromDoc(SiteType.Farpost, "address", doc);
                // NOTE(review): the district is extracted but never passed on — confirm whether
                // it belongs in one of the empty UpdateAddr arguments.
                var hood = General.ExtractValueFromDoc(SiteType.Farpost, "district", doc);
                var cadastre = General.ExtractValueFromDoc(SiteType.Farpost, "cadastreNumber", doc);
                var city = General.ExtractValueFromDoc(SiteType.Farpost, "city", doc);
                _db.UpdateAddr(id, address, string.Empty, string.Empty, string.Empty, string.Empty, cadastre, city);

                // The second '_'-separated part of the file name is the advertisement number.
                var advName = Path.GetFileName(fileName);
                var advNumber = advName.Split('_')[1];
                var existingId = _db.GetIdByNumber(advNumber);
                if (existingId != -1)
                {
                    // An advertisement with this number already exists: clear the "new" flag
                    // when its price or square differs from this record's.
                    var adv = _db.GetAdvPdata(existingId);
                    var advOld = _db.GetAdvPdata(id);
                    var priceDiffers = General.GetNumbers(adv.Price) != General.GetNumbers(advOld.Price);
                    if (priceDiffers || General.GetNumbers(adv.Square) != General.GetNumbers(advOld.Square))
                    {
                        _db.UpdateIsNew(id, 0);
                    }
                }
                else
                {
                    _db.UpdateIsNew(id, 1);
                }

                if (type == "КН_ар")
                {
                    var url = _db.GetUrlById(id);
                    var comType = "n/a";
                    if (url.Contains("/market/"))
                    {
                        comType = "Торговая точка";
                    }
                    else if (url.Contains("/terminal/"))
                    {
                        comType = "Складское помещение";
                    }
                    else if (url.Contains("/workroom/"))
                    {
                        comType = "Производственное помещение";
                    }
                    else if (url.Contains("/office/"))
                    {
                        comType = "Офисное помещенине";
                    }

                    _db.UpdateComTypeSec(advNumber, comType);

                    // NOTE(review): kept as found. This touches a control without Invoke and
                    // swallows any failure; it is unclear why the stop button is disabled
                    // mid-run for this type only — confirm intent.
                    try
                    {
                        stopButton.Enabled = false;
                    }
                    catch
                    {
                    }
                }
                else if (type == "КН_пр" && !string.IsNullOrEmpty(rooms))
                {
                    _db.UpdateComTypeSec(advNumber, char.ToUpper(rooms[0]) + rooms.Substring(1));
                }

                Invoke(new Action(() => { statusLabel.Text = "Обработка объявления #" + id + " [" + i + "/" + def + "]"; }));
                i++;
            }
        }
    }
    finally
    {
        // Runs on failures too, so an exception no longer leaves the controls disabled
        // and the connection open.
        Invoke(new Action(ResetControls));
        _db.Close();
    }
}
/// <summary>
/// Collects the advertisements of a feed page and registers them in the database.
/// </summary>
/// <param name="url">
/// Feed address. Anything from '#' onwards is ignored. For Farpost, a URL without a
/// <c>page=</c> parameter is walked page by page.
/// </param>
/// <remarks>
/// For Farpost a working folder is created under <c>General.FARPOST_FOLDER</c>, one sub-folder is
/// created per kept advertisement, and the working folder is finally renamed to
/// <c>&lt;folder&gt;_&lt;count&gt;_&lt;comment&gt;</c>. For Cian only the links are listed.
/// Control updates go through <c>Invoke</c>, so this presumably runs on a background thread —
/// confirm against the caller.
/// </remarks>
private void ParsePage(string url)
{
    // Page size assumed by the paging arithmetic below.
    const int ItemsPerPage = 50;

    // Puts the start button and status bar back into their idle state. Called through Invoke only.
    void ResetStartControls()
    {
        startButton.Text = "Начать";
        startButton.Enabled = true;
        statusLabel.Text = General.STATUS_DEFAULT;
    }

    // NOTE(review): "hh" is the 12-hour clock and there is no AM/PM designator, so afternoon
    // and morning stamps are indistinguishable. Kept because the stored format may be relied
    // upon elsewhere — confirm.
    string PageStamp() => DateTime.Now.ToString("yyyy.dd.MM hh:mm:ss");

    if (!_db.Connect())
    {
        MessageBox.Show(General.ERROR_DB_CONNECT);
        _db.Close();
        Invoke(new Action(ResetStartControls));
        return;
    }

    try
    {
        // Strip '#' and everything after it.
        var fragmentStart = url.IndexOf('#');
        if (fragmentStart > -1)
        {
            url = url.Remove(fragmentStart);
        }

        var lastId = _db.GetLastID() + 1;
        var siteType = General.GetSiteType(url);
        uint cnt = 0;

        if (siteType == SiteType.Farpost)
        {
            _archiveFlag = url.Contains("status=archive");

            var singleType = Farpost.GetTypeByLink(url);
            var dateTime = DateTime.Now.ToString("yyyyMMdd_HHmmss");
            var archiveSuffix = _archiveFlag ? "_arc" : string.Empty;
            var workingDir = General.FARPOST_FOLDER + dateTime + "_" + singleType + archiveSuffix + "\\";
            var comment = CommentBox.Text.Replace(" ", "_");
            Directory.CreateDirectory(workingDir);

            var feedFilePrefix = workingDir + "L_" + dateTime + "_" + archiveSuffix + singleType;
            var typeInt = _db.GetAdvTypeByString(singleType);

            // A URL that already names a page is fetched as is; otherwise the page count is
            // derived from the item count.
            var pageCount = 1;
            if (!(url.Contains("?page=") || url.Contains("&page=")))
            {
                var itemCount = Farpost.GetItemsCount(url);
                // The old "(float)(items / 50)" divided as integers before the cast, so
                // Math.Ceiling did nothing and exact multiples of the page size requested one
                // page beyond the end of the feed.
                pageCount = Math.Max(1, (int)Math.Ceiling(itemCount / (double)ItemsPerPage));
            }

            List<Advertisement> advsData;
            if (pageCount == 1)
            {
                Invoke(new Action(() => { statusLabel.Text = "Собираем информацию с ленты..."; }));
                var saveTo = feedFilePrefix + ".html";
                advsData = Farpost.GetDataFromFeed(url, saveTo);
                // A null result is reported by the check further down; it must not be
                // dereferenced here first.
                if (advsData != null)
                {
                    _db.InsertPageInfo(saveTo, typeInt, lastId, lastId + advsData.Count - 1, PageStamp());
                }
            }
            else
            {
                Invoke(new Action(() => { statusLabel.Text = "Собираем информацию со всех страниц ленты (0/" + pageCount + ")..."; }));
                advsData = new List<Advertisement>();
                for (var i = 1; i <= pageCount; i++)
                {
                    if (_stopFlag)
                    {
                        break;
                    }

                    var saveTo = feedFilePrefix + "_" + i + ".html";
                    var urlp = url + (url.Contains("?") ? "&" : "?") + "page=" + i;
                    var temp = Farpost.GetDataFromFeed(urlp, saveTo);
                    if (temp != null)
                    {
                        advsData.AddRange(temp);
                    }

                    _db.InsertPageInfo(saveTo, typeInt, lastId, lastId + advsData.Count - 1, PageStamp());
                    Invoke(new Action(() => { statusLabel.Text = "Собираем информацию со всех страниц ленты (" + i + "/" + pageCount + ")..."; }));
                    if (pageCount != i)
                    {
                        General.SmallDelay();
                    }
                }
            }

            if (_stopFlag)
            {
                return;
            }

            if (advsData == null || advsData.Count == 0)
            {
                MessageBox.Show("Не удалось собрать объявления с указаной страницы.");
                return;
            }

            cnt = 0;
            Invoke(new Action(() => { listBox1.Items.Clear(); }));
            Invoke(new Action(() => { statusLabel.Text = "Обработка данных..."; }));

            // First pass: reserve a record for every advertisement that is new or changed,
            // and collect the unchanged ones for removal.
            var todel = new List<Advertisement>();
            var updated = new List<int>();
            var trigger = checkBox1.Checked;
            int newId = lastId;
            for (var i = 0; i < advsData.Count; i++)
            {
                if (_stopFlag)
                {
                    break;
                }

                var current = advsData[i];
                var id = _db.GetIdByNumber(current.Number);
                if (id != -1)
                {
                    // An advertisement with this number already exists: drop it from the
                    // insert list when price and square are unchanged.
                    var adv = _db.GetAdvPdata(id);
                    var unchanged = General.GetNumbers(adv.Price) == General.GetNumbers(current.Price)
                        && General.GetNumbers(adv.Square) == General.GetNumbers(current.Square);

                    // NOTE(review): with the checkbox ticked the town test below can never be
                    // true (a value cannot equal both -1 and 0), so nothing is dropped in that
                    // mode. The intended operator is unclear — behavior preserved, confirm.
                    var drop = unchanged
                        && (!trigger || (_db.GetTownByMainId(id) == -1 && _db.GetTownByMainId(id) == 0));
                    if (drop)
                    {
                        todel.Add(current);
                        continue;
                    }

                    updated.Add(newId);
                }

                _db.InsertRecord();
                newId++;
            }

            foreach (var item in todel)
            {
                advsData.Remove(item);
            }

            // Final folder name. The value stored with each record now matches the name the
            // folder is renamed to (the stored one used to lack the '_' before the comment).
            // TrimEnd, not Trim, so a leading separator in the path is not stripped.
            var newPath = workingDir.TrimEnd('\\') + "_" + advsData.Count + "_" + comment;

            if (advsData.Count > 0)
            {
                foreach (var advData in advsData)
                {
                    if (_stopFlag)
                    {
                        break;
                    }

                    Invoke(new Action(() => { listBox1.Items.Add(advData.Link); countLabel.Text = "Кол-во объявлений: " + ++cnt; }));

                    // Reset per advertisement: one without a district annotation used to
                    // inherit the district of the previous advertisement.
                    var distr = String.Empty;
                    if (advData.Annotations != null)
                    {
                        foreach (string item in advData.Annotations)
                        {
                            if (item.Contains("р-н"))
                            {
                                distr = item;
                                break;
                            }
                        }
                    }

                    var dir = workingDir + lastId + "_" + advData.Number + "_" + DateTime.Now.ToString("yyyyddMM_hhmmss") + archiveSuffix;
                    int is_new = updated.Contains(lastId) ? 0 : 1;
                    int source = _archiveFlag ? 2 : 1;
                    _db.UpdateAdvData(lastId, is_new, source, advData.Number, advData.Link, "www.farpost.ru", newPath, advData.CurLink, advData.Title, advData.Price, advData.Square, advData.Geo, advData.City, distr, advData.Views, typeInt);
                    _db.InsertSecData(lastId);
                    Directory.CreateDirectory(dir);
                    lastId++;
                }

                // For commercial property without an explicit flatType filter, query the feed
                // once per commercial type and tag the advertisements found.
                if (singleType.Contains("КН_") && !(url.Contains("&flatType") || url.Contains("?flatType")))
                {
                    var commercialTypes = new string[][]
                    {
                        new[] { "outlet", "Торговая точка" },
                        new[] { "manufacture", "Производство" },
                        new[] { "office", "Офис" },
                        new[] { "storage", "Склад" },
                        new[] { "etc", "Другое" },
                    };

                    foreach (var commercialType in commercialTypes)
                    {
                        if (_stopFlag)
                        {
                            break;
                        }

                        var urlp = url + (url.Contains("?") ? "&" : "?") + "flatType%5B%5D=" + commercialType[0];
                        var typedItemCount = Farpost.GetItemsCount(urlp);
                        var typedPageCount = Math.Max(1, (int)Math.Ceiling(typedItemCount / (double)ItemsPerPage));

                        // A fresh list per type. The list used to be shared across types, so
                        // advertisements of earlier types were relabelled with every later type.
                        List<Advertisement> advsDataAd;
                        if (typedPageCount != 1)
                        {
                            advsDataAd = new List<Advertisement>();
                            for (var i = 1; i <= typedPageCount; i++)
                            {
                                if (_stopFlag)
                                {
                                    break;
                                }

                                var paged = urlp + (urlp.Contains("?") ? "&" : "?") + "page=" + i;
                                var temp = Farpost.GetDataFromFeed(paged, null);
                                if (temp != null)
                                {
                                    advsDataAd.AddRange(temp);
                                }

                                if (typedPageCount != i)
                                {
                                    General.SmallDelay();
                                }
                            }
                        }
                        else
                        {
                            advsDataAd = Farpost.GetDataFromFeed(urlp, null);
                        }

                        if (advsDataAd == null)
                        {
                            continue;
                        }

                        foreach (var advDataAd in advsDataAd)
                        {
                            _db.UpdateComType(advDataAd.Number, commercialType[1]);
                        }
                    }
                }
            }

            Directory.Move(workingDir, newPath);
            _lastSaveToPath = newPath;
        }
        else if (siteType == SiteType.Cian)
        {
            var links = Cian.GetLinksFromFeed(url);
            if (links == null)
            {
                MessageBox.Show("Не удалось собрать ссылки на объявления с указаной страницы.");
                return;
            }

            cnt = 0;
            Invoke(new Action(() => { listBox1.Items.Clear(); }));
            foreach (var link in links)
            {
                if (string.IsNullOrEmpty(link))
                {
                    continue;
                }

                Invoke(new Action(() => { listBox1.Items.Add(link); countLabel.Text = "Кол-во объявлений: " + ++cnt; }));
                General.Delay();
            }
        }
    }
    finally
    {
        // Single exit path: every return above, and any exception, closes the connection
        // and restores the controls.
        _db.Close();
        Invoke(new Action(ResetStartControls));
    }
}
/// <summary>
/// Downloads the advertisement pages, with their images, for every folder listed in
/// <c>textBox2</c> (entries separated by ';') using a headless Chrome session.
/// </summary>
/// <remarks>
/// Only sub-folders whose leading numeric id lies within
/// [<c>fromBox.Value</c>, <c>toBox.Value</c>] and that have no saved <c>.html</c> or
/// <c>_arch.html</c> page yet are downloaded. The whole run is aborted when the site answers
/// with its "suspicious activity" page. Control updates go through <c>Invoke</c>, so this
/// presumably runs on a background thread — confirm against the caller.
/// </remarks>
private void UploadAdvs()
{
    // Pages at or below this size are renamed to *_udalen.html
    // (presumably the stub served for a removed advertisement — confirm).
    const long StubPageMaxBytes = 25600;

    // Puts the controls back into their idle state. Called through Invoke only.
    void ResetControls()
    {
        statusLabel.Text = STATUS_BAR_DEFAULT;
        uploadButton.Enabled = true;
        exploreButton.Enabled = true;
        parseButton.Enabled = true;
        exportButton.Enabled = true;
        textBox2.Enabled = true;
        stopButton.Enabled = false;
        fromBox.Enabled = true;
        toBox.Enabled = true;
    }

    // Clicks "Показать контакты" and waits up to ~10 s for the contacts marker; 5 attempts.
    void RevealContacts(IWebDriver driver)
    {
        for (var attempt = 0; attempt < 5 && !_stopFlag; attempt++)
        {
            try
            {
                driver.FindElement(By.PartialLinkText("Показать контакты")).Click();
                var shown = false;
                for (var poll = 0; poll < 20 && !_stopFlag; poll++)
                {
                    try
                    {
                        driver.FindElement(By.ClassName("dummy-listener_new-contacts"));
                        shown = true;
                        break;
                    }
                    catch
                    {
                        // Marker not there yet: wait and look again.
                        Thread.Sleep(500);
                    }
                }

                if (shown)
                {
                    return;
                }
            }
            catch (Exception ex)
            {
                General.WriteLog(ex.Message);
                Thread.Sleep(1000);
            }
        }
    }

    // Best effort: clicks the expand button and waits up to ~20 s for the "mod__active"
    // marker. Failures are deliberately ignored; the page is then saved as it is.
    void ClickExpandButton(IWebDriver driver)
    {
        try
        {
            driver.FindElement(By.ClassName("expand--button")).Click();
            for (var poll = 0; poll < 20 && !_stopFlag; poll++)
            {
                try
                {
                    driver.FindElement(By.ClassName("mod__active"));
                    break;
                }
                catch
                {
                    // Marker not there yet.
                }

                Thread.Sleep(1000);
            }
        }
        catch (Exception)
        {
            // Nothing to expand on this page.
        }
    }

    // Downloads every <img> of the page into imageFolder and returns the markup with the
    // downloaded sources pointing at "<relativeFolder>\<n><ext>".
    string SaveImages(string html, HtmlAgilityPack.HtmlDocument parsed, string imageFolder, string relativeFolder)
    {
        var sources = new List<string>();
        var imgNodes = parsed.DocumentNode.SelectNodes("//img");
        if (imgNodes != null)
        {
            foreach (var imgNode in imgNodes)
            {
                // An <img> without a src attribute used to raise NullReferenceException here.
                var src = imgNode.Attributes["src"]?.Value;
                if (!string.IsNullOrEmpty(src))
                {
                    sources.Add(src);
                }
            }
        }

        var pictureIndex = 0;
        using (var wc = new WebClient())
        {
            foreach (var imageSrc in sources)
            {
                if (_stopFlag)
                {
                    break;
                }

                try
                {
                    var slash = imageSrc.LastIndexOf('/');
                    if (slash == -1)
                    {
                        continue;
                    }

                    var lastSegment = imageSrc.Remove(0, slash + 1);
                    var dot = lastSegment.LastIndexOf('.');
                    var ext = dot != -1 ? lastSegment.Remove(0, dot) : ".jpg";
                    var pictureName = pictureIndex.ToString() + ext;

                    // Download first and rewrite only on success. The markup used to be
                    // rewritten before the download, so a failed image pointed at a file
                    // name that the next image then took over.
                    wc.DownloadFile(imageSrc, Path.Combine(imageFolder, pictureName));
                    html = html.Replace(imageSrc, Path.Combine(relativeFolder, pictureName));
                    pictureIndex++;
                }
                catch (Exception ex)
                {
                    General.WriteLog(ex.Message);
                }
            }
        }

        return html;
    }

    // Prefixes relative href values with the site address.
    // NOTE(review): "//*[href]" selects elements that have a child ELEMENT named href, not an
    // href attribute (that would be "//*[@href]"), so on ordinary HTML nothing is selected and
    // this step changes nothing. The selector is kept as found: the replacement below is a
    // blanket string replace over the whole page, so enabling it as is would also rewrite
    // every other occurrence of values like "/" or "#". Needs a scoped rewrite first.
    string RewriteRelativeLinks(string html, HtmlAgilityPack.HtmlDocument parsed)
    {
        var linkNodes = parsed.DocumentNode.SelectNodes("//*[href]");
        if (linkNodes == null)
        {
            return html;
        }

        foreach (var linkNode in linkNodes)
        {
            var value = linkNode.Attributes["href"]?.Value;
            if (string.IsNullOrEmpty(value)
                || value.StartsWith("http://", StringComparison.Ordinal)
                || value.StartsWith("https://", StringComparison.Ordinal))
            {
                continue;
            }

            // string.Remove returns a new string; its result used to be discarded.
            var relative = value.StartsWith("/", StringComparison.Ordinal) ? value.Remove(0, 1) : value;
            html = html.Replace(value, "https://www.farpost.ru/" + relative);
        }

        return html;
    }

    if (!_db.Connect())
    {
        MessageBox.Show(General.ERROR_DB_CONNECT);
        _db.Close();
        Invoke(new Action(ResetControls));
        return;
    }

    var captchaDetected = false;
    try
    {
        var targets = textBox2.Text.Split(';');
        var rangeFrom = fromBox.Value;
        var rangeTo = toBox.Value;
        var def = (int)(rangeTo - rangeFrom) + 1;
        var i = 0;

        foreach (var target in targets)
        {
            if (_stopFlag || captchaDetected)
            {
                break;
            }

            // A trailing ';' produces an empty entry; Directory.GetDirectories would throw on it.
            if (string.IsNullOrWhiteSpace(target))
            {
                continue;
            }

            var paths = Directory.GetDirectories(target);
            Invoke(new Action(() => { statusLabel.Text = "Инициализация Selenium..."; }));

            var option = new ChromeOptions();
            option.AddArgument("--headless");
            option.AddArgument("--user-agent=" + General.DEFAULT_USER_AGENT);

            // One browser per target, released even when an exception escapes.
            using (var service = ChromeDriverService.CreateDefaultService())
            {
                service.HideCommandPromptWindow = true;
                using (IWebDriver browser = new ChromeDriver(service, option))
                {
                    // TODO (from the original): check whether this is a Farpost session.
                    browser.Navigate().GoToUrl("https://www.farpost.ru/vladivostok/realty/sell_flats/?page=1");

                    foreach (var path in paths)
                    {
                        if (_stopFlag)
                        {
                            break;
                        }

                        var advName = Path.GetFileName(path);
                        var nlink = target + "\\" + advName;
                        if (File.Exists(nlink + ".html") || File.Exists(nlink + "_arch.html"))
                        {
                            i++;
                            continue;
                        }

                        var idStr = advName.Split('_')[0];
                        if (!int.TryParse(idStr, out int id) || id < rangeFrom || id > rangeTo)
                        {
                            i++;
                            continue;
                        }

                        Invoke(new Action(() => { statusLabel.Text = "Загрузка объявления #" + idStr + " [" + i + "/" + def + "]"; }));
                        var link = _db.GetLinkById(id);

                        // Retries every 10 s until the page loads or a stop is requested.
                        while (!_stopFlag)
                        {
                            try
                            {
                                browser.Navigate().GoToUrl(link);
                                captchaDetected = browser.PageSource.Contains("Из вашей подсети наблюдается подозрительная активность");
                                break;
                            }
                            catch (Exception ex)
                            {
                                General.WriteLog(ex.Message);
                                Thread.Sleep(10000);
                            }
                        }

                        // On stop the browser may still show the previous page; it must not be
                        // saved under this advertisement.
                        if (captchaDetected || _stopFlag)
                        {
                            break;
                        }

                        if (browser.PageSource.Contains("Объявление находится в архиве и может быть неактуальным."))
                        {
                            nlink += "_arch";
                        }
                        else
                        {
                            RevealContacts(browser);
                        }

                        ClickExpandButton(browser);

                        var content = browser.PageSource;
                        var doc = new HtmlAgilityPack.HtmlDocument();
                        doc.LoadHtml(content);
                        content = SaveImages(content, doc, path, advName);
                        content = RewriteRelativeLinks(content, doc);

                        // A capture interrupted by a stop request is not written: a saved page
                        // is skipped on later runs, so a partial one would never be completed.
                        if (_stopFlag)
                        {
                            break;
                        }

                        var htmled = nlink + ".html";
                        File.WriteAllText(htmled, content, Encoding.UTF8);
                        if (new FileInfo(htmled).Length <= StubPageMaxBytes)
                        {
                            var removedName = nlink + "_udalen.html";
                            // *_udalen.html pages are not part of the skip check above, so on a
                            // re-run the destination may exist and File.Move would throw.
                            File.Delete(removedName);
                            File.Move(htmled, removedName);
                        }

                        // NOTE(review): the stored path is the name before the rename even when
                        // the file was just moved to *_udalen.html — confirm this is intended.
                        _db.ChangeAdvNLink(id, htmled);
                        General.Delay();
                        i++;
                    }
                }
            }
        }
    }
    finally
    {
        // Once, after all targets. This used to sit inside the per-target loop, closing the
        // database and re-enabling the controls after the first folder.
        _db.Close();
        Invoke(new Action(ResetControls));
    }

    if (captchaDetected)
    {
        MessageBox.Show("Капча! Завершаем работу.");
    }
}