Exemple #1
0
        /// <summary>
        /// Raises <c>DataCollected</c> up to three times for one crawled page, in the
        /// fixed order RO, EN, DE. The RO event uses the extraction supplied by the
        /// caller; the EN and DE events each use a new extraction built from the
        /// corresponding URL.
        /// </summary>
        /// <param name="cid">Crawler identifier copied into every raised event.</param>
        /// <param name="roUrl">URL reported with the RO event.</param>
        /// <param name="enUrl">URL extracted and reported for the EN event.</param>
        /// <param name="deUrl">URL extracted and reported for the DE event.</param>
        /// <param name="wde">Extraction whose text is reported with the RO event.</param>
        private void LaunchAllEventsAndClearMemory(int cid, string roUrl, string enUrl, string deUrl, WikitravelDataExtraction wde)
        {
            RaiseDataCollected(cid, roUrl, LanguageType.RO, wde);

            // NOTE(review): enUrl / deUrl are passed through exactly as received, even
            // when the caller found no translation link (empty string) - confirm how
            // WikitravelDataExtraction behaves for an empty URL.
            WikitravelDataExtraction enExtraction = new WikitravelDataExtraction(enUrl);
            RaiseDataCollected(cid, enUrl, LanguageType.EN, enExtraction);

            WikitravelDataExtraction deExtraction = new WikitravelDataExtraction(deUrl);
            RaiseDataCollected(cid, deUrl, LanguageType.DE, deExtraction);
        }

        /// <summary>
        /// Builds a fresh event-args object for a single page and raises
        /// <c>DataCollected</c> when at least one handler is attached.
        /// </summary>
        /// <param name="cid">Crawler identifier stored in the event args.</param>
        /// <param name="url">URL stored in the event args.</param>
        /// <param name="language">Language stored in the event args.</param>
        /// <param name="extraction">Extraction whose <c>TextContents</c> is stored in the event args.</param>
        private void RaiseDataCollected(int cid, string url, LanguageType language, WikitravelDataExtraction extraction)
        {
            // A new instance per event: handlers that keep or defer the args object
            // must not see it overwritten by the next language's data.
            DataCollectedEventArgs e = new DataCollectedEventArgs();
            e.CrawlerID = cid;
            e.Text      = extraction.TextContents;
            e.URL       = url;
            e.Language  = language;

            if (this.DataCollected != null)
            {
                DataCollected(e);
            }
        }
Exemple #2
0
        /// <summary>
        /// Crawl loop. Starting from <c>baseURL + serverLocation</c>, processes queued
        /// pages in insertion order until the queue is exhausted or <c>needed</c> pages
        /// have been processed. For every page it collects the DE/EN counterpart links
        /// and further RO links from <c>FollowLinks</c>, raises the data events through
        /// <c>LaunchAllEventsAndClearMemory</c>, and finally raises <c>NoMoreData</c>.
        /// </summary>
        private void WorkingThread()
        {
            // Index of the next page to process; also passed on as the crawler id.
            int currentFileProgress = 0;
            // Pages waiting to be processed (consumed by index, in insertion order).
            List<string> crawlingStack = new List<string>();
            // Every URL ever queued, so that pages linking to each other are not
            // crawled repeatedly and do not eat into the "needed" budget.
            HashSet<string> queuedUrls = new HashSet<string>();

            // The first URL is the base one.
            string startUrl = baseURL + serverLocation;
            crawlingStack.Add(startUrl);
            queuedUrls.Add(startUrl);

            // Keep going while there is something left to process and the limit has not been reached.
            while ((currentFileProgress < crawlingStack.Count) && (currentFileProgress < needed))
            {
                string roUrl = crawlingStack[currentFileProgress];
                string enUrl = "";
                string deUrl = "";
                WikitravelDataExtraction wde = new WikitravelDataExtraction(roUrl);

                // Look for the equivalents in the other languages and for links we should follow.
                foreach (string tmp in wde.FollowLinks)
                {
                    if (tmp.Contains("wikitravel.org/de/"))
                    {
                        // When several links match, the last one wins.
                        deUrl = tmp;
                    }
                    else if (tmp.Contains("wikitravel.org/en/"))
                    {
                        enUrl = tmp;
                    }
                    else if (tmp.Contains("href=\"/ro"))
                    {
                        // The raw link has to be unwrapped before it can be queued.
                        string nextUrl = this.baseURL + StripLinkWrapper(tmp);
                        if (queuedUrls.Add(nextUrl))
                        {
                            crawlingStack.Add(nextUrl);
                        }
                    }
                }

                // When no counterpart was found these stay empty; the helper leaves
                // too-short values untouched instead of throwing.
                enUrl = StripLinkWrapper(enUrl);
                deUrl = StripLinkWrapper(deUrl);

                // Raise the events for this page (RO first, then EN and DE).
                LaunchAllEventsAndClearMemory(currentFileProgress, roUrl, enUrl, deUrl, wde);
                wde = null;
                // NOTE(review): forcing a full collection on every page is normally
                // discouraged; kept because the memory behaviour of
                // WikitravelDataExtraction is not visible here - confirm and remove.
                GC.Collect();
                currentFileProgress++;
            }

            if (NoMoreData != null)
            {
                NoMoreData();
            }
        }

        /// <summary>
        /// Removes the first six characters and the last character from a raw link.
        /// Presumably this strips a leading <c>href="</c> and the closing quote -
        /// verify against the format produced by <c>FollowLinks</c>.
        /// </summary>
        /// <param name="link">Raw link text; may be empty.</param>
        /// <returns>
        /// The unwrapped link, or <paramref name="link"/> unchanged when it is null or
        /// shorter than six characters.
        /// </returns>
        private static string StripLinkWrapper(string link)
        {
            if (link == null || link.Length < 6)
            {
                return link;
            }

            string withoutPrefix = link.Substring(6);
            if (withoutPrefix.Length == 0)
            {
                // Nothing left from which a trailing character could be dropped.
                return withoutPrefix;
            }

            return withoutPrefix.Substring(0, withoutPrefix.Length - 1);
        }