/// <summary>
/// Downloads pages 1-4 of the tiki.vn Vietnamese-book bestseller listing and writes one
/// record per book (price, title, author, review, description, category) to DataSave.txt.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    Console.WriteLine("Start Crawler...");
    System.Threading.Thread.Sleep(2000);
    Console.WriteLine("Running");

    TinTrinhLibrary.WebClient client = new TinTrinhLibrary.WebClient();

    // Accumulate all four pages into one buffer so a single regex pass covers every book.
    System.Text.StringBuilder pages = new System.Text.StringBuilder();
    for (int page = 1; page <= 4; page++)
    {
        pages.Append(client.Get(
            "https://tiki.vn/bestsellers/sach-truyen-tieng-viet/c316?p=" + page,
            "https://tiki.vn/bestsellers/sach-truyen-tieng-viet/c316",
            ""));
    }
    string html = pages.ToString();

    // Groups: 1 = data-price, 2 = data-title, 3 = data-brand, 4 = data-category.
    MatchCollection books = Regex.Matches(
        html,
        "data-price=\"(.*?)\" data-title=\"(.*?)\" data-brand=\"(.*?)\" data-category=\"(.*?)\">",
        RegexOptions.Singleline);
    MatchCollection reviews = Regex.Matches(html, "class=\"review\">((.*?))</p>", RegexOptions.Singleline);
    MatchCollection descriptions = Regex.Matches(html, "description\">(.*?)<a", RegexOptions.Singleline);

    // 'using' guarantees the file is flushed and closed even if writing throws.
    using (StreamWriter writer = new StreamWriter("DataSave.txt"))
    {
        for (int index = 0; index < books.Count; index++)
        {
            Match book = books[index];

            // Reviews and descriptions are paired with books purely by position.
            // NOTE(review): assumes the page lists them in the same order as the books - confirm.
            // A book without a matching review/description gets an empty value instead of
            // aborting the whole export with an ArgumentOutOfRangeException.
            string review = index < reviews.Count ? reviews[index].Groups[1].Value.Trim() : "";
            string description = index < descriptions.Count ? descriptions[index].Groups[1].Value.Trim() : "";

            writer.Write("Giá: ");
            writer.WriteLine(book.Groups[1].Value.Trim());
            writer.Write("Tên sách: ");
            writer.WriteLine(book.Groups[2].Value.Trim());
            // The data-brand attribute is what gets written under the author label.
            writer.Write("Tác Giả: ");
            writer.WriteLine(book.Groups[3].Value.Trim());
            writer.Write("Đánh giá: ");
            writer.WriteLine(review);
            writer.Write("Miêu tả: ");
            writer.WriteLine(description);
            writer.Write("Thể loại: ");
            writer.WriteLine(book.Groups[4].Value.Trim());
            writer.WriteLine("___________________________");
        }
    }

    System.Threading.Thread.Sleep(5000);
    Console.WriteLine("Completed. Let's check file DataSave.txt");
    Console.ReadLine();
}
/// <summary>
/// Fetches pages 1-4 of the tiki.vn Vietnamese-book bestseller listing and, for each page,
/// prints every title, author, sale price, review and description match to the console.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    Console.WriteLine("Start crawler...");

    TinTrinhLibrary.WebClient webClient = new TinTrinhLibrary.WebClient();

    // One entry per extracted field, in the order they are printed for each page:
    // book title, author, sale price, review, description.
    string[] fieldPatterns =
    {
        "title=\"(.*?)\" data-brand",
        "author\">(.*?)</p>",
        "price-sale\">(.*?)<span",
        "review\">(.*?)</p>",
        "description\">(.*?)<a"
    };
    RegexOptions[] fieldOptions =
    {
        RegexOptions.Multiline,
        RegexOptions.Singleline,
        RegexOptions.Singleline,
        RegexOptions.Multiline,
        RegexOptions.Singleline
    };

    for (int page = 1; page <= 4; page++)
    {
        string pageHtml = webClient.Get(
            "https://tiki.vn/bestsellers/sach-truyen-tieng-viet/c316?p=" + page,
            "http://tiki.vn/",
            "");

        for (int field = 0; field < fieldPatterns.Length; field++)
        {
            MatchCollection hits = Regex.Matches(pageHtml, fieldPatterns[field], fieldOptions[field]);
            foreach (Match hit in hits)
            {
                // Every '?' is removed from the captured text before it is trimmed and printed.
                string cleaned = hit.Groups[1].Value.Replace("?", "").Trim();
                Console.WriteLine(cleaned);
            }
        }
    }

    System.Threading.Thread.Sleep(5000);
    Console.WriteLine("OK");
    Console.ReadLine();
}
/// <summary>
/// Logs in to dichvuketoantainha.net, then walks the article listing on ketoanbanthoigian.com
/// (offsets 0, 10, ..., 170), downloads every linked article and extracts its title, body and
/// thumbnail URL. The actual re-upload step is currently disabled (commented out).
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    const string sourceSite = "http://ketoanbanthoigian.com";
    const string linkPattern = "<a href=\"(.*?)\" itemprop=\"url\"";
    const string thumbnailPattern = "<img src=\"(.*?)\" alt=\"\" itemprop=\"thumbnailUrl\"/>";

    TinTrinhLibrary.WebClient client = new TinTrinhLibrary.WebClient();

    // NOTE(review): credentials are embedded in source; consider moving them to configuration.
    client.Post(
        "http://dichvuketoantainha.net/login",
        "trankhanhtoan=b303baf471a2a712b2456ed41c822ee8&user_name=admin&user_pass=BUKT&login=Log+in",
        "http://dichvuketoantainha.net",
        "");

    // The original goto-based loop processed offsets 0..170 and then fetched (but discarded)
    // offset 180 before exiting; the same offsets are processed here without the wasted request.
    for (int start = 0; start < 180; start += 10)
    {
        string listingHtml = client.Get(sourceSite + "/kinh-nghiem-ke-toan.html?start=" + start, sourceSite, "");

        // Thumbnails are paired with article links by their position on the listing page.
        List<String> imgurls = new List<String>();
        foreach (Match thumbnail in Regex.Matches(listingHtml, thumbnailPattern))
        {
            imgurls.Add(sourceSite + thumbnail.Groups[1].Value);
        }

        int articleIndex = 0;
        foreach (Match link in Regex.Matches(listingHtml, linkPattern))
        {
            String url = sourceSite + link.Groups[1].Value;
            String articleHtml = client.Get(url, sourceSite, "");

            String tieude = ExtractBetween(articleHtml, "<h2 itemprop=\"name\">", "</h2>");
            String noidung = ExtractBetween(
                articleHtml,
                "<div itemprop=\"articleBody\">",
                "<div class=\"extranews_separator\"></div>");

            Console.WriteLine(articleIndex + 1);

            // A listing with fewer thumbnails than links yields an empty image URL
            // instead of an ArgumentOutOfRangeException.
            String hinhanh = articleIndex < imgurls.Count ? imgurls[articleIndex] : "";
            articleIndex++;

            if (tieude == null || noidung == null)
            {
                // The page does not have the expected markup; skip it rather than crash the whole run.
                Console.Error.WriteLine("Skipped (title/body markers not found): " + url);
            }
            else
            {
                /*
                 * Upload to the website dichvuketoantainha.net (currently disabled).
                 */
                //client.Post("http://dichvuketoantainha.net/admin/new_blog", "blog_name="+tieude+ "&blog_image="+hinhanh+ "&blog_cat_ids[]=4&blog_seo_title="+tieude+ "&blog_seo_keyword=&blog_seo_description=&new_blog=Create&blog_content="+noidung, "http://dichvuketoantainha.net/admin/", "");
            }

            // Pause between article requests.
            System.Threading.Thread.Sleep(1000);
        }
    }

    Console.WriteLine("finish!");
    Console.ReadLine();
}

/// <summary>
/// Returns the trimmed text found between the first <paramref name="openMarker"/> and the
/// first <paramref name="closeMarker"/> that follows it.
/// </summary>
/// <param name="text">Text to search; may be null.</param>
/// <param name="openMarker">Marker that precedes the wanted text.</param>
/// <param name="closeMarker">Marker that terminates the wanted text.</param>
/// <returns>The trimmed text between the markers, or null when either marker is missing.</returns>
private static string ExtractBetween(string text, string openMarker, string closeMarker)
{
    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    int openAt = text.IndexOf(openMarker, StringComparison.Ordinal);
    if (openAt < 0)
    {
        return null;
    }

    int contentStart = openAt + openMarker.Length;
    int closeAt = text.IndexOf(closeMarker, contentStart, StringComparison.Ordinal);
    if (closeAt < 0)
    {
        return null;
    }

    return text.Substring(contentStart, closeAt - contentStart).Trim();
}
/// <summary>
/// Scrapes pages 1-4 of the tiki.vn Vietnamese-book bestseller listing and writes the name,
/// author, price and description of every product block to D:\study\ECDESI\products.txt.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    Console.WriteLine("Scraping....");
    TinTrinhLibrary.WebClient client = new TinTrinhLibrary.WebClient();

    // 'using' closes the output file deterministically; the original never closed it.
    using (StreamWriter file = new StreamWriter(@"D:\study\ECDESI\products.txt"))
    {
        for (int page = 1; page <= 4; page++)
        {
            string pageHtml = client.Get(
                $"https://tiki.vn/bestsellers/sach-truyen-tieng-viet/c316?p={page}",
                "https://tiki.vn/",
                "");

            // Each match is the markup of one product block.
            MatchCollection products = Regex.Matches(
                pageHtml,
                "class=\"infomation\">(.*?)fa-caret-right",
                RegexOptions.Singleline);

            foreach (Match product in products)
            {
                string productHtml = product.Groups[1].Value.Trim();

                // Strip the HTML tags: keep only the text runs that sit in front of a '<'.
                MatchCollection fragments = Regex.Matches(productHtml, "[^>]+?(?=<)", RegexOptions.Singleline);

                // Position of the current non-blank text fragment within this product (1-based).
                int fieldIndex = 0;
                foreach (Match fragment in fragments)
                {
                    string text = fragment.Value.Trim();
                    if (text == "")
                    {
                        continue;
                    }

                    fieldIndex++;

                    // Only fragments 1, 2, 5 and 8 carry data that is exported; fragment 9
                    // marks the end of the record. All other fragments are ignored.
                    switch (fieldIndex)
                    {
                        case 1:
                            file.WriteLine("Name: " + text);
                            break;
                        case 2:
                            file.WriteLine("Author: " + text);
                            break;
                        case 5:
                            file.WriteLine("Price: " + text);
                            break;
                        case 8:
                            // Label spelling kept as-is so existing consumers of the file still match.
                            file.WriteLine("Decription: " + text);
                            break;
                        case 9:
                            file.WriteLine("end!\n\n------------------------------------");
                            break;
                    }
                }

                // Push each finished product to disk so partial results survive an aborted run.
                file.Flush();
            }
        }
    }

    System.Threading.Thread.Sleep(5000);
    Console.WriteLine("Ok");
    Console.ReadLine();
}
/// <summary>
/// Crawls pages 1-4 of the tiki.vn Vietnamese-book bestseller listing and writes one numbered
/// line per book (title, author, review count, price, description) to BestSellingBooks.txt.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // BestSellingBooks.txt stores the information collected from the Tiki.vn website.
    // 'using' guarantees the file is flushed and closed even if the crawl throws.
    using (StreamWriter text = new StreamWriter("BestSellingBooks.txt"))
    {
        Console.WriteLine("Start Crawler...");

        // Running ordinal of the book across all pages.
        int number = 1;
        TinTrinhLibrary.WebClient client = new TinTrinhLibrary.WebClient();

        // Page number within Tiki.vn's best-selling-books category.
        for (int page = 1; page <= 4; page++)
        {
            // The original URL had a stray space after "p=", unlike a well-formed query string.
            string html = client.Get(
                "https://tiki.vn/bestsellers/sach-truyen-tieng-viet/c316?p=" + page,
                "https://tiki.vn/",
                "");

            List<string> titles = CaptureFirstGroup(html, "data-title=\"(.*?)\"", false);
            List<string> brands = CaptureFirstGroup(html, "data-brand=\"(.*?)\"", false);
            List<string> prices = CaptureFirstGroup(html, "data-price=\"(.*?)\"", false);
            List<string> reviews = CaptureFirstGroup(html, "<p class=\"review\">(.*?)</p>", false);
            List<string> descriptions = CaptureFirstGroup(html, "<div class=\"description\">[\r\n]([^><]+)<", true);

            // Fields are paired by position. A field with fewer matches than there are titles
            // contributes an empty value instead of throwing ArgumentOutOfRangeException.
            for (int i = 0; i < titles.Count; i++)
            {
                text.WriteLine(number + ") " + "Tên sách: " + titles[i]
                    + " | Tác giả: " + ValueOrEmpty(brands, i)
                    + " | Số nhận xét: " + ValueOrEmpty(reviews, i)
                    + " | Giá: " + ValueOrEmpty(prices, i)
                    + "đ | Giới thiệu sách: " + ValueOrEmpty(descriptions, i));
                text.WriteLine();
                number++;
            }
        }
    }
}

/// <summary>
/// Collects capture group 1 of every match of <paramref name="pattern"/> in <paramref name="input"/>.
/// </summary>
/// <param name="input">Text to search.</param>
/// <param name="pattern">Regular expression with at least one capture group.</param>
/// <param name="trim">When true, surrounding whitespace is removed from each captured value.</param>
/// <returns>The captured values in match order.</returns>
private static List<string> CaptureFirstGroup(string input, string pattern, bool trim)
{
    List<string> values = new List<string>();
    foreach (Match match in Regex.Matches(input, pattern))
    {
        string value = match.Groups[1].Value;
        values.Add(trim ? value.Trim() : value);
    }
    return values;
}

/// <summary>
/// Returns the element at <paramref name="index"/>, or an empty string when the list is too short.
/// </summary>
/// <param name="values">List to read from.</param>
/// <param name="index">Zero-based position to read.</param>
/// <returns>The element at the position, or "" if the position is out of range.</returns>
private static string ValueOrEmpty(List<string> values, int index)
{
    return index < values.Count ? values[index] : "";
}