Пример #1
0
        /// <summary>
        /// Downloads pages 1-4 of the Tiki.vn Vietnamese-book bestseller listing, extracts
        /// price, title, brand, category, review and description text with regular
        /// expressions, and writes one record per book to <c>DataSave.txt</c>.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Start Crawler...");
            System.Threading.Thread.Sleep(2000);
            Console.WriteLine("Running");

            TinTrinhLibrary.WebClient client = new TinTrinhLibrary.WebClient();
            string html = "";

            // The four pages are concatenated so that every pattern is matched once
            // over the complete listing.
            for (int page = 1; page <= 4; page++)
            {
                html += client.Get("https://tiki.vn/bestsellers/sach-truyen-tieng-viet/c316?p=" + page, "https://tiki.vn/bestsellers/sach-truyen-tieng-viet/c316", "");
            }

            MatchCollection books        = Regex.Matches(html, "data-price=\"(.*?)\" data-title=\"(.*?)\" data-brand=\"(.*?)\" data-category=\"(.*?)\">", RegexOptions.Singleline);
            MatchCollection reviews      = Regex.Matches(html, "class=\"review\">((.*?))</p>", RegexOptions.Singleline);
            MatchCollection descriptions = Regex.Matches(html, "description\">(.*?)<a", RegexOptions.Singleline);

            // The using statement guarantees the file is flushed and closed even when
            // writing fails part-way through.
            using (StreamWriter writer = new StreamWriter("DataSave.txt"))
            {
                int index = 0;

                foreach (Match book in books)
                {
                    // Reviews and descriptions are paired with books purely by position.
                    // When the page yields fewer of them than books, write an empty value
                    // instead of throwing ArgumentOutOfRangeException.
                    // NOTE(review): positional pairing presumably misaligns if a book has
                    // no review/description element - confirm against the page markup.
                    string review      = index < reviews.Count ? reviews[index].Groups[1].Value.Trim() : "";
                    string description = index < descriptions.Count ? descriptions[index].Groups[1].Value.Trim() : "";

                    writer.Write("Giá: ");
                    writer.WriteLine(book.Groups[1].Value.Trim());
                    writer.Write("Tên sách: ");
                    writer.WriteLine(book.Groups[2].Value.Trim());
                    writer.Write("Tác Giả: ");
                    writer.WriteLine(book.Groups[3].Value.Trim());
                    writer.Write("Đánh giá: ");
                    writer.WriteLine(review);
                    writer.Write("Miêu tả: ");
                    writer.WriteLine(description);
                    writer.Write("Thể loại: ");
                    writer.WriteLine(book.Groups[4].Value.Trim());
                    writer.WriteLine("___________________________");

                    index++;
                }
            }

            System.Threading.Thread.Sleep(5000);
            Console.WriteLine("Completed. Let's check file DataSave.txt");
            Console.ReadLine();
        }
Пример #2
0
        /// <summary>
        /// Fetches pages 1-4 of the Tiki.vn Vietnamese-book bestseller listing and prints,
        /// page by page, every title, author, sale price, review and description captured
        /// by the patterns below to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Start crawler...");

            // Patterns are applied to each page in this order; capture group 1 holds the value.
            string[] patterns =
            {
                "title=\"(.*?)\" data-brand",   // book title
                "author\">(.*?)</p>",           // author
                "price-sale\">(.*?)<span",      // price
                "review\">(.*?)</p>",           // review / comment
                "description\">(.*?)<a"         // description
            };

            // Regex options used with the pattern at the same index.
            RegexOptions[] options =
            {
                RegexOptions.Multiline,
                RegexOptions.Singleline,
                RegexOptions.Singleline,
                RegexOptions.Multiline,
                RegexOptions.Singleline
            };

            TinTrinhLibrary.WebClient client = new TinTrinhLibrary.WebClient();

            int pageNumber = 1;
            while (pageNumber <= 4)
            {
                string pageHtml = client.Get("https://tiki.vn/bestsellers/sach-truyen-tieng-viet/c316?p=" + pageNumber + "", "http://tiki.vn/", "");

                for (int k = 0; k < patterns.Length; k++)
                {
                    foreach (Match hit in Regex.Matches(pageHtml, patterns[k], options[k]))
                    {
                        // Question marks are stripped from the captured text before printing.
                        string captured = hit.Groups[1].Value;
                        Console.WriteLine(captured.Replace("?", "").Trim());
                    }
                }

                pageNumber++;
            }

            System.Threading.Thread.Sleep(5000);
            Console.WriteLine("OK");
            Console.ReadLine();
        }
        /// <summary>
        /// Logs in to dichvuketoantainha.net, then walks the paginated article listing on
        /// ketoanbanthoigian.com (the <c>start</c> offset advances by 10 per page),
        /// downloads every linked article and extracts its title, body and thumbnail URL.
        /// The actual upload call is still commented out, so the extracted values are
        /// currently only prepared, not sent anywhere.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            const string sourceSite = "http://ketoanbanthoigian.com";
            const string titleOpen  = "<h2 itemprop=\"name\">";
            const string titleClose = "</h2>";
            const string bodyOpen   = "<div itemprop=\"articleBody\">";
            const string bodyClose  = "<div class=\"extranews_separator\"></div>";

            // Last listing offset that is processed. The previous goto-based loop also
            // downloaded offset 180 but bailed out before processing it; that wasted
            // request is no longer made.
            // NOTE(review): if offset 180 was meant to be processed, raise this to 180.
            const int lastOffset = 170;

            TinTrinhLibrary.WebClient client = new TinTrinhLibrary.WebClient();
            String tieude, noidung, hinhanh;
            String reg  = "<a href=\"(.*?)\" itemprop=\"url\"";
            String imgs = "<img src=\"(.*?)\" alt=\"\" itemprop=\"thumbnailUrl\"/>";

            // NOTE(security): the login request embeds a credential in source code.
            // It should be loaded from configuration or the environment instead.
            client.Post("http://dichvuketoantainha.net/login", "trankhanhtoan=b303baf471a2a712b2456ed41c822ee8&user_name=admin&user_pass=BUKT&login=Log+in", "http://dichvuketoantainha.net", "");

            for (int start = 0; start <= lastOffset; start += 10)
            {
                String html = client.Get(sourceSite + "/kinh-nghiem-ke-toan.html?start=" + start, sourceSite, "");

                // Thumbnail URLs in page order; paired with article links by position below.
                List<String> imgurls = new List<String>();
                foreach (Match rimg in Regex.Matches(html, imgs))
                {
                    imgurls.Add(sourceSite + rimg.Groups[1].Value);
                }

                int dem = -1;
                foreach (Match r in Regex.Matches(html, reg))
                {
                    dem++;
                    String url   = sourceSite + r.Groups[1].Value;
                    String htmli = client.Get(url, sourceSite, "");

                    tieude  = ExtractBetween(htmli, titleOpen, titleClose);
                    noidung = ExtractBetween(htmli, bodyOpen, bodyClose);

                    if (tieude != null && noidung != null)
                    {
                        Console.WriteLine(dem + 1);

                        // Fall back to an empty thumbnail when the page lists fewer
                        // images than article links instead of throwing.
                        hinhanh = dem < imgurls.Count ? imgurls[dem] : "";

                        /*
                         * Perform the upload to the website dichvuketoantainha.net
                         * */
                        //client.Post("http://dichvuketoantainha.net/admin/new_blog", "blog_name="+tieude+ "&blog_image="+hinhanh+ "&blog_cat_ids[]=4&blog_seo_title="+tieude+ "&blog_seo_keyword=&blog_seo_description=&new_blog=Create&blog_content="+noidung, "http://dichvuketoantainha.net/admin/", "");
                    }
                    else
                    {
                        // The article page did not contain the expected markers; skip it
                        // rather than crash inside Substring.
                        Console.Error.WriteLine("Skipped (expected markers not found): " + url);
                    }

                    // Pause between article requests.
                    System.Threading.Thread.Sleep(1000);
                }
            }

            Console.WriteLine("finish!");
            Console.ReadLine();
        }

        /// <summary>
        /// Returns the trimmed text found between the first <paramref name="openMarker"/>
        /// and the next <paramref name="closeMarker"/> after it.
        /// </summary>
        /// <param name="text">Text to search; may be null or empty.</param>
        /// <param name="openMarker">Marker that precedes the wanted content.</param>
        /// <param name="closeMarker">Marker that follows the wanted content.</param>
        /// <returns>The trimmed content, or <c>null</c> when either marker is missing.</returns>
        private static string ExtractBetween(string text, string openMarker, string closeMarker)
        {
            if (string.IsNullOrEmpty(text))
            {
                return null;
            }

            int open = text.IndexOf(openMarker, StringComparison.Ordinal);
            if (open < 0)
            {
                return null;
            }

            int contentStart = open + openMarker.Length;
            int close = text.IndexOf(closeMarker, contentStart, StringComparison.Ordinal);
            if (close < 0)
            {
                return null;
            }

            return text.Substring(contentStart, close - contentStart).Trim();
        }
Пример #4
0
        /// <summary>
        /// Scrapes pages 1-4 of the Tiki.vn Vietnamese-book bestseller listing. For every
        /// product block it strips the HTML tags and writes selected non-empty text
        /// fragments (1st = name, 2nd = author, 5th = price, 8th = description) to
        /// <c>D:\study\ECDESI\products.txt</c>, followed by a record terminator at the 9th.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Scraping....");

            TinTrinhLibrary.WebClient client = new TinTrinhLibrary.WebClient();

            // The using statement closes (and flushes) the output file, which the
            // previous version never did; per-line Flush calls are therefore not needed.
            using (StreamWriter file = new StreamWriter(@"D:\study\ECDESI\products.txt"))
            {
                for (int t = 1; t <= 4; t++) // get multiple pages
                {
                    string html_source = client.Get($"https://tiki.vn/bestsellers/sach-truyen-tieng-viet/c316?p={t}", "https://tiki.vn/", "");

                    // Each match is the markup of one product's information block.
                    MatchCollection products = Regex.Matches(html_source, "class=\"infomation\">(.*?)fa-caret-right", RegexOptions.Singleline);

                    foreach (Match product in products)
                    {
                        string string_product = product.Groups[1].Value.Trim();

                        // Text runs that are followed by '<', i.e. the content between tags.
                        MatchCollection products_detail = Regex.Matches(string_product, "[^>]+?(?=<)", RegexOptions.Singleline);

                        // 1-based position of the current non-empty fragment within this product.
                        int i = 0;

                        foreach (Match product_detail in products_detail)
                        {
                            string fragment = product_detail.Value.Trim();
                            if (fragment == "")
                            {
                                continue;
                            }

                            i++;

                            switch (i)
                            {
                                case 1:
                                    file.WriteLine("Name: " + fragment);
                                    break;
                                case 2:
                                    file.WriteLine("Author: " + fragment);
                                    break;
                                case 5:
                                    file.WriteLine("Price: " + fragment);
                                    break;
                                case 8:
                                    file.WriteLine("Decription: " + fragment);
                                    break;
                                case 9:
                                    file.WriteLine("end!\n\n------------------------------------");
                                    break;
                            }
                        }
                    }
                }
            }

            System.Threading.Thread.Sleep(5000);
            Console.WriteLine("Ok");
            Console.ReadLine();
        }
        /// <summary>
        /// Crawls pages 1-4 of the Tiki.vn Vietnamese-book bestseller listing and writes one
        /// numbered line per book (title, author, review text, price, description) to
        /// <c>BestSellingBooks.txt</c>.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Patterns are built once instead of on every page.
            Regex re_title       = new Regex("data-title=\"(.*?)\"");
            Regex re_brand       = new Regex("data-brand=\"(.*?)\"");
            Regex re_price       = new Regex("data-price=\"(.*?)\"");
            Regex re_review      = new Regex("<p class=\"review\">(.*?)</p>");
            Regex re_description = new Regex("<div class=\"description\">[\r\n]([^><]+)<");

            // BestSellingBooks.txt stores the information taken from Tiki.vn. The using
            // statement closes the file even when a request or a write throws.
            using (StreamWriter text = new StreamWriter("BestSellingBooks.txt"))
            {
                Console.WriteLine("Start Crawler...");

                // Running ordinal of the book across all pages.
                int number = 1;

                TinTrinhLibrary.WebClient client = new TinTrinhLibrary.WebClient();

                // The Best Selling Books category is read from page 1 through page 4.
                for (int page = 1; page <= 4; page++)
                {
                    // The page number is appended directly after "p="; the stray space
                    // that used to follow it produced a malformed query string.
                    string html = client.Get("https://tiki.vn/bestsellers/sach-truyen-tieng-viet/c316?p=" + page, "https://tiki.vn/", "");

                    List<string> titles       = CollectCaptures(re_title, html, false);
                    List<string> brands       = CollectCaptures(re_brand, html, false);
                    List<string> prices       = CollectCaptures(re_price, html, false);
                    List<string> reviews      = CollectCaptures(re_review, html, false);
                    List<string> descriptions = CollectCaptures(re_description, html, true);

                    // Fields are paired with titles by position. A field list shorter than
                    // the title list yields an empty value rather than an exception.
                    for (int i = 0; i < titles.Count; i++)
                    {
                        text.WriteLine(number + ") " + "Tên sách: " + titles[i] + " | Tác giả: " + ValueOrEmpty(brands, i) + " | Số nhận xét: " + ValueOrEmpty(reviews, i) + " | Giá: " + ValueOrEmpty(prices, i) + "đ | Giới thiệu sách: " + ValueOrEmpty(descriptions, i));
                        text.WriteLine();
                        number++;
                    }
                }
            }
        }

        /// <summary>
        /// Collects capture group 1 of every match of <paramref name="pattern"/> in
        /// <paramref name="input"/>, in match order.
        /// </summary>
        /// <param name="pattern">Regular expression with at least one capture group.</param>
        /// <param name="input">Text to search.</param>
        /// <param name="trim">When true, each captured value is trimmed.</param>
        /// <returns>The captured values; empty when nothing matches.</returns>
        private static List<string> CollectCaptures(Regex pattern, string input, bool trim)
        {
            List<string> values = new List<string>();
            foreach (Match match in pattern.Matches(input))
            {
                string value = match.Groups[1].Value;
                values.Add(trim ? value.Trim() : value);
            }
            return values;
        }

        /// <summary>
        /// Returns <paramref name="values"/>[<paramref name="index"/>], or an empty string
        /// when the index is past the end of the list.
        /// </summary>
        private static string ValueOrEmpty(List<string> values, int index)
        {
            return index < values.Count ? values[index] : "";
        }