コード例 #1
0
        // Download the start page, then the pages behind the links collected so far
        /// <summary>
        /// Downloads <paramref name="baslangic_url"/>, stores it as the first record in
        /// <c>kayitlar</c>, then downloads entries of <c>all_links</c> in order and stores
        /// every page that comes back with non-empty content.
        /// </summary>
        /// <param name="baslangic_url">Start URL. Its prefix up to and including the last '/' becomes <c>default_url</c>.</param>
        /// <param name="count">
        /// When greater than zero, the exclusive upper bound of the loop counter (the start page
        /// counts as document 0, so at most <c>count - 1</c> links are visited). Otherwise the
        /// number of entries in <c>all_links</c> after the start page was processed is used.
        /// </param>
        /// <returns>Always 0.</returns>
        public int download_given_links(string baslangic_url, int count)
        {
            // Base address: everything up to and including the last '/' of the start URL.
            default_url = baslangic_url.Substring(0, baslangic_url.LastIndexOf('/') + 1);

            kayit _k = new kayit();

            string html_content = download(baslangic_url);

            _k.URL          = baslangic_url;
            _k.HTML_Content = html_content;
            if (html_content != "")
            {
                _k.URL_Count = Add_links_to_Hashtable(html_content, baslangic_url);
            }

            kayitlar.Add(_k);

            // Snapshot of the limit: a positive count wins, otherwise the current size of all_links.
            int cnt = count > 0 ? count : all_links.Count;

            for (int i = 1; i < cnt; i++)
            {
                // A caller-supplied count may be larger than the number of known links;
                // stop instead of indexing past the end of all_links.
                if (i - 1 >= all_links.Count)
                {
                    break;
                }

                string url = all_links[i - 1].ToString();
                html_content = download(url);

                if (html_content == "")
                {
                    // Nothing came back for this link: it is skipped and not recorded.
                    continue;
                }

                _k              = new kayit();
                _k.URL          = url;
                _k.HTML_Content = html_content;
                default_url     = url.Substring(0, url.LastIndexOf('/') + 1);

                // NOTE(review): the start URL (not the current page URL) is passed here,
                // exactly as in the original code — confirm this is intended.
                _k.URL_Count = Add_links_to_Hashtable(html_content, baslangic_url);

                kayitlar.Add(_k);
            }

            return 0;
        }
コード例 #2
0
ファイル: crawl.cs プロジェクト: erdincuzun/iCrawler
        // Crawl from the start URL and save the downloaded pages, logs and extracted XML to disk
        /// <summary>
        /// Downloads <paramref name="baslangic_url"/> and then entries of <c>all_links</c>,
        /// writing each page, a log file and (when content is extracted) an XML file into a
        /// time-stamped sub-folder of <paramref name="DirectoryName"/>. Progress is reported
        /// through <c>_bg</c>, and a pending cancellation on <c>_bg</c> stops the loop.
        /// </summary>
        /// <param name="baslangic_url">Start URL. Its prefix up to and including the last '/' becomes <c>default_url</c>.</param>
        /// <param name="DirectoryName">Root folder; <c>rule.xml</c> is read from it and the output folder is created below it.</param>
        /// <param name="count">
        /// When greater than zero, the initial exclusive upper bound of the loop counter; otherwise the
        /// size of <c>all_links</c> is used. It is overwritten with the link count of the start page
        /// when <paramref name="one_page"/> is true (and the start page has content), or when
        /// <paramref name="one_page"/> is false and <paramref name="allpages"/> is true.
        /// </param>
        /// <param name="secim">
        /// Mode selector. 1 or 2: rules are loaded from <c>rule.xml</c>. 1 or 3: a DOM and
        /// <c>ML_Rules</c> are built from the pages and the rule file is saved at the end.
        /// </param>
        /// <param name="one_page">When true, only links extracted by ML (secim 1/3) or by rules (secim 2) from the start page are queued.</param>
        /// <param name="allpages">See <paramref name="count"/>; also makes pages without main content add their link count to <paramref name="count"/>.</param>
        /// <returns>Always 0.</returns>
        public int download_given_links_secim(string baslangic_url, string DirectoryName, int count, int secim, bool one_page, bool allpages)
        {
            // Read the clock once so the folder name cannot mix parts of two different seconds.
            DateTime now     = DateTime.Now;
            string   dirinfo = now.Day + "." + now.Month + "." + now.Year + "." + now.Hour + "." + now.Minute + "." + now.Second;

            string output_dir = DirectoryName + "\\" + dirinfo;
            string xml_dir    = output_dir + "\\XML";
            string html_dir   = output_dir + "\\HTML";

            // Base address: everything up to and including the last '/' of the start URL.
            default_url = baslangic_url.Substring(0, baslangic_url.LastIndexOf('/') + 1);

            kayit _k = new kayit();

            string html_content = download(baslangic_url);

            _k.URL          = baslangic_url;
            _k.HTML_Content = html_content;

            bool uses_ml = secim == 1 || secim == 3;

            ML_Rules _mr = new ML_Rules(); //rules

            if (secim == 1 || secim == 2)
            {
                _mr._ht_rules = Rules_Process.Load_Rules(DirectoryName + "\\rule.xml");
            }

            DOM _dom = new DOM();

            if (uses_ml)
            {
                // The DOM and ML rules are built even when the download returned nothing.
                CreateDOM(ref _dom, html_content);
                _mr = new ML_Rules(_dom._list, _mr._ht_rules);
            }

            if (html_content != "")
            {
                if (uses_ml)
                {
                    if (one_page)
                    {
                        _k.URL_Count = Add_links_to_Hashtable(extract_links_by_ML(_dom._list), baslangic_url);
                        count        = _k.URL_Count;
                    }
                    else
                    {
                        _k.URL_Count = Add_links_to_Hashtable(html_content, baslangic_url);
                    }
                }
                else if (secim == 2)
                {
                    if (one_page)
                    {
                        _k.URL_Count = Add_links_to_Hashtable(extract_links_by_Rules(html_content, _mr._ht_rules), baslangic_url);
                        count        = _k.URL_Count;
                    }
                    else
                    {
                        _k.URL_Count = Add_links_to_Hashtable(html_content, baslangic_url);
                    }
                }
            }

            if (!one_page && allpages)
            {
                count = _k.URL_Count;
            }

            kayitlar.Add(_k);

            int i = 1;

            // Loop limit: a positive count wins, otherwise the current size of all_links.
            int cnt = count > 0 ? count : all_links.Count;

            // CreateDirectory creates any missing parent folders and does nothing for
            // folders that already exist, so no Exists checks are needed.
            Directory.CreateDirectory(xml_dir);
            Directory.CreateDirectory(html_dir);

            //save files
            // The page content (not its URL) goes into 0.html, matching the call inside the loop.
            SaveHTMLFile(output_dir, _k.HTML_Content, "0.html");
            SaveLogFile(output_dir, _k.URL, html_dir + "\\" + "0.txt");

            if (all_links.Count > 0)
            {
                while (i < cnt)
                {
                    if (i >= all_links.Count)
                    {
                        break;
                    }

                    string url = all_links[i - 1].ToString();

                    _k           = new kayit();
                    html_content = download(url);

                    if (html_content != "")
                    {
                        _k.URL          = url;
                        _k.HTML_Content = html_content;
                        default_url     = url.Substring(0, url.LastIndexOf('/') + 1);

                        string doc_name = i.ToString();
                        string xml_path = xml_dir + "\\" + doc_name + ".xml";

                        //save files
                        SaveHTMLFile(output_dir, _k.HTML_Content, doc_name + ".html");
                        SaveLogFile(output_dir, _k.URL, html_dir + "\\" + doc_name + ".txt");

                        // First try the stored rules; fall back to the DOM/ML path only when
                        // they yield no "main" content.
                        ArrayList _main_list = efficientextraction.extractRules(_mr._ht_rules, _k.HTML_Content, "main");
                        if (_main_list.Count > 0)
                        {
                            ArrayList _headline_list   = efficientextraction.extractRules(_mr._ht_rules, _k.HTML_Content, "headline");
                            ArrayList _summary_list    = efficientextraction.extractRules(_mr._ht_rules, _k.HTML_Content, "summary");
                            ArrayList _additional_list = efficientextraction.extractRules(_mr._ht_rules, _k.HTML_Content, "additional");
                            //Remove Repetative Parts
                            // NOTE(review): _additional_list is passed twice and _headline_list never —
                            // looks like a copy/paste slip; confirm against RemoveRepetativeParts' parameter order.
                            Rules_Process.RemoveRepetativeParts(ref _main_list, ref _additional_list, ref _summary_list, ref _additional_list, _mr._ht_rules);

                            SaveXMLFile(DirectoryName, _main_list, _headline_list, _additional_list, _summary_list, xml_path);
                        }
                        else if (uses_ml)
                        {
                            _dom = new DOM();
                            CreateDOM(ref _dom, html_content);
                            ArrayList _main_list_ML = PrepareContent("main", _dom._list);
                            if (_main_list_ML.Count > 0)
                            {
                                ArrayList _headline_list_ML   = PrepareContent("headline", _dom._list);
                                ArrayList _summary_list_ML    = PrepareContent("summary", _dom._list);
                                ArrayList _additional_list_ML = PrepareContent("additional", _dom._list);
                                //Remove Repetative Parts
                                // NOTE(review): same duplicated ref argument as in the rule-based branch — confirm.
                                Rules_Process.RemoveRepetativeParts(ref _main_list_ML, ref _additional_list_ML, ref _summary_list_ML, ref _additional_list_ML, _mr._ht_rules);

                                SaveXMLFile(DirectoryName, _main_list_ML, _headline_list_ML, _additional_list_ML, _summary_list_ML, xml_path);
                            }
                            else if (!one_page)
                            {
                                // No main content found: queue every link of this page instead.
                                _k.URL_Count = Add_links_to_Hashtable(html_content, baslangic_url);
                                if (allpages)
                                {
                                    // NOTE(review): this changes count (used for progress) but not cnt,
                                    // so the loop limit is not extended — confirm this is intended.
                                    count = count + _k.URL_Count;
                                }
                            }
                            // Rules will be used; check against the previous ones.
                            _mr = new ML_Rules(_dom._list, _mr._ht_rules);
                        }

                        kayitlar.Add(_k);
                    }
                    else
                    {
                        // A failed download does not count towards the limit.
                        cnt++;
                    }

                    // count may be 0 (or smaller than i after cnt grew); fall back to cnt and
                    // clamp so the reported percentage always stays within 0..100.
                    int total = count > 0 ? count : cnt;
                    int durum = total > 0 ? (int)((double)i / total * 100) : 0;
                    durum = Math.Max(0, Math.Min(100, durum));
                    _bg.ReportProgress(durum);

                    if (_bg.CancellationPending)
                    {
                        _e.Cancel = true;
                        break;
                    }

                    i++;
                }//while i

                if (uses_ml)
                {
                    SaveRuleFile(DirectoryName, _mr._ht_rules);
                }
            }

            _bg.ReportProgress(100);

            return 0;
        }