/// <summary>
/// Downloads the start page and then the pages listed in <c>all_links</c>,
/// storing one <c>kayit</c> record per page in <c>kayitlar</c>.
/// </summary>
/// <remarks>
/// The start page record is always stored, even when its download returned an
/// empty string. Follow-up pages are stored only when their download returned
/// non-empty content.
/// </remarks>
/// <param name="baslangic_url">
/// URL of the start page. The part up to and including its last '/' is written
/// to <c>default_url</c>.
/// </param>
/// <param name="count">
/// Upper bound for the loop counter (the loop runs while <c>i &lt; count</c>,
/// starting at 1). When zero or negative, the number of entries in
/// <c>all_links</c> after the start page has been processed is used instead.
/// </param>
/// <returns>Always 0.</returns>
public int download_given_links(string baslangic_url, int count)
{
    // Base address of the page being processed.
    // NOTE(review): presumably read by Add_links_to_Hashtable to resolve relative links - confirm.
    default_url = baslangic_url.Substring(0, baslangic_url.LastIndexOf('/') + 1);

    kayit _k = new kayit();
    string html_content = download(baslangic_url);
    _k.URL = baslangic_url;
    _k.HTML_Content = html_content;
    if (html_content != "")
    {
        _k.URL_Count = Add_links_to_Hashtable(html_content, baslangic_url);
    }
    kayitlar.Add(_k);

    // count <= 0 means "use however many links are known at this point".
    int cnt = count > 0 ? count : all_links.Count;

    for (int i = 1; i < cnt; i++)
    {
        // A caller-supplied count may exceed the number of collected links;
        // stop instead of indexing past the end of all_links.
        if (i - 1 >= all_links.Count)
        {
            break;
        }

        string link_url = all_links[i - 1].ToString();
        _k = new kayit();
        html_content = download(link_url);
        if (html_content != "")
        {
            _k.URL = link_url;
            _k.HTML_Content = html_content;
            default_url = link_url.Substring(0, link_url.LastIndexOf('/') + 1);
            // The start URL (not link_url) is passed here, as in the original code.
            _k.URL_Count = Add_links_to_Hashtable(html_content, baslangic_url);
            kayitlar.Add(_k);
        }
    }

    return 0;
}
/// <summary>
/// Downloads the start page and the pages listed in <c>all_links</c>, writes each
/// page's HTML and a log file into a time-stamped sub-directory of
/// <paramref name="DirectoryName"/>, and writes an XML file for every page from
/// which a non-empty "main" part could be extracted.
/// </summary>
/// <param name="baslangic_url">
/// URL of the start page. The part up to and including its last '/' is written
/// to <c>default_url</c>.
/// </param>
/// <param name="DirectoryName">
/// Output root. <c>rule.xml</c> is loaded from here when <paramref name="secim"/>
/// is 1 or 2; the directory is created if it does not exist.
/// </param>
/// <param name="count">
/// Upper bound for the loop counter. It is overwritten by the start page's link
/// count when <paramref name="one_page"/> is true (secim 1, 2 or 3 and non-empty
/// HTML) or when <paramref name="one_page"/> is false and
/// <paramref name="allpages"/> is true. A value of zero or less means "use the
/// current size of <c>all_links</c>".
/// </param>
/// <param name="secim">
/// Mode selector: 1 and 3 build a DOM and use <c>ML_Rules</c> / <c>PrepareContent</c>;
/// 1 and 2 load the rule file; 1 and 3 save the rule file at the end.
/// </param>
/// <param name="one_page">
/// When true, links for the start page come from <c>extract_links_by_ML</c>
/// (secim 1/3) or <c>extract_links_by_Rules</c> (secim 2) instead of the raw HTML.
/// </param>
/// <param name="allpages">
/// When true (and <paramref name="one_page"/> is false), <paramref name="count"/>
/// follows the number of links found.
/// </param>
/// <returns>Always 0.</returns>
public int download_given_links_secim(string baslangic_url, string DirectoryName, int count, int secim, bool one_page, bool allpages)
{
    // Take one clock snapshot so the directory name cannot mix two different seconds.
    DateTime now = DateTime.Now;
    string dirinfo = now.Day + "." + now.Month + "." + now.Year + "." + now.Hour + "." + now.Minute + "." + now.Second;
    string output_dir = DirectoryName + "\\" + dirinfo;

    default_url = baslangic_url.Substring(0, baslangic_url.LastIndexOf('/') + 1);

    kayit _k = new kayit();
    string html_content = download(baslangic_url);
    _k.URL = baslangic_url;
    _k.HTML_Content = html_content;

    bool use_ml = secim == 1 || secim == 3;

    ML_Rules _mr = new ML_Rules();
    if (secim == 1 || secim == 2)
    {
        _mr._ht_rules = Rules_Process.Load_Rules(DirectoryName + "\\rule.xml");
    }

    DOM _dom = new DOM();
    if (use_ml)
    {
        CreateDOM(ref _dom, html_content);
        _mr = new ML_Rules(_dom._list, _mr._ht_rules);
    }

    if (html_content != "")
    {
        if (use_ml && one_page)
        {
            _k.URL_Count = Add_links_to_Hashtable(extract_links_by_ML(_dom._list), baslangic_url);
            count = _k.URL_Count;
        }
        else if (secim == 2 && one_page)
        {
            _k.URL_Count = Add_links_to_Hashtable(extract_links_by_Rules(html_content, _mr._ht_rules), baslangic_url);
            count = _k.URL_Count;
        }
        else if (use_ml || secim == 2)
        {
            _k.URL_Count = Add_links_to_Hashtable(html_content, baslangic_url);
        }
    }

    if (!one_page && allpages)
    {
        count = _k.URL_Count;
    }
    kayitlar.Add(_k);

    int i = 1;
    // count <= 0 means "use however many links are known at this point".
    int cnt = all_links.Count;
    if (count > 0)
    {
        cnt = count;
    }

    if (!Directory.Exists(DirectoryName))
    {
        Directory.CreateDirectory(DirectoryName);
    }
    if (!Directory.Exists(output_dir))
    {
        Directory.CreateDirectory(output_dir);
    }
    if (!Directory.Exists(output_dir + "\\XML"))
    {
        Directory.CreateDirectory(output_dir + "\\XML");
    }
    if (!Directory.Exists(output_dir + "\\HTML"))
    {
        Directory.CreateDirectory(output_dir + "\\HTML");
    }

    // Save the start page. The page content (not its URL) is passed, matching
    // the SaveHTMLFile call made for every follow-up page below.
    SaveHTMLFile(output_dir, _k.HTML_Content, "0.html");
    SaveLogFile(output_dir, _k.URL, output_dir + "\\HTML\\" + "0.txt");

    if (all_links.Count > 0)
    {
        while (i < cnt)
        {
            if (i >= all_links.Count)
            {
                break;
            }

            string link_url = all_links[i - 1].ToString();
            _k = new kayit();
            html_content = download(link_url);
            if (html_content != "")
            {
                _k.URL = link_url;
                _k.HTML_Content = html_content;
                default_url = link_url.Substring(0, link_url.LastIndexOf('/') + 1);

                // Save the raw page and its log entry.
                SaveHTMLFile(output_dir, _k.HTML_Content, i.ToString() + ".html");
                SaveLogFile(output_dir, _k.URL, output_dir + "\\HTML\\" + i.ToString() + ".txt");

                string xml_path = output_dir + "\\XML\\" + i.ToString() + ".xml";

                // First try the stored rules; fall back to the DOM-based path only
                // when they yield no "main" part.
                ArrayList _main_list = efficientextraction.extractRules(_mr._ht_rules, _k.HTML_Content, "main");
                if (_main_list.Count > 0)
                {
                    ArrayList _headline_list = efficientextraction.extractRules(_mr._ht_rules, _k.HTML_Content, "headline");
                    ArrayList _summary_list = efficientextraction.extractRules(_mr._ht_rules, _k.HTML_Content, "summary");
                    ArrayList _additional_list = efficientextraction.extractRules(_mr._ht_rules, _k.HTML_Content, "additional");

                    // Remove repetitive parts.
                    // NOTE(review): _additional_list is passed as both the second and the
                    // fourth ref argument and _headline_list is never passed; this may be a
                    // typo for _headline_list - confirm against RemoveRepetativeParts.
                    Rules_Process.RemoveRepetativeParts(ref _main_list, ref _additional_list, ref _summary_list, ref _additional_list, _mr._ht_rules);
                    SaveXMLFile(DirectoryName, _main_list, _headline_list, _additional_list, _summary_list, xml_path);
                }
                else if (use_ml)
                {
                    _dom = new DOM();
                    CreateDOM(ref _dom, html_content);
                    ArrayList _main_list_ML = PrepareContent("main", _dom._list);
                    if (_main_list_ML.Count > 0)
                    {
                        ArrayList _headline_list_ML = PrepareContent("headline", _dom._list);
                        ArrayList _summary_list_ML = PrepareContent("summary", _dom._list);
                        ArrayList _additional_list_ML = PrepareContent("additional", _dom._list);

                        // Remove repetitive parts.
                        // NOTE(review): same duplicated "additional" argument as in the
                        // rule-based branch - confirm.
                        Rules_Process.RemoveRepetativeParts(ref _main_list_ML, ref _additional_list_ML, ref _summary_list_ML, ref _additional_list_ML, _mr._ht_rules);
                        SaveXMLFile(DirectoryName, _main_list_ML, _headline_list_ML, _additional_list_ML, _summary_list_ML, xml_path);
                    }
                    else if (!one_page)
                    {
                        // No "main" part found: add every link of the page instead.
                        _k.URL_Count = Add_links_to_Hashtable(html_content, baslangic_url);
                        if (allpages)
                        {
                            // Only count grows here; cnt (the loop bound) is left unchanged,
                            // as in the original code.
                            count = count + _k.URL_Count;
                        }
                    }

                    // The rules will be used; check against the previous ones.
                    _mr = new ML_Rules(_dom._list, _mr._ht_rules);
                }

                kayitlar.Add(_k);
            }
            else
            {
                // Failed download: extend the bound by one to compensate
                // (the all_links.Count guard above still ends the loop).
                cnt++;
            }

            // count can be 0 here (the "all links" mode), which previously produced
            // Infinity cast to int. Use the loop bound in that case and keep the
            // reported value within 0..100.
            int total = count > 0 ? count : cnt;
            int durum = 0;
            if (total > 0)
            {
                durum = (int)((double)i / total * 100);
            }
            if (durum < 0)
            {
                durum = 0;
            }
            if (durum > 100)
            {
                durum = 100;
            }
            _bg.ReportProgress(durum);

            if (_bg.CancellationPending)
            {
                _e.Cancel = true;
                break;
            }
            i++;
        }

        if (use_ml)
        {
            SaveRuleFile(DirectoryName, _mr._ht_rules);
        }
    }

    _bg.ReportProgress(100);
    return 0;
}