コード例 #1
0
        /// <summary>
        /// Registers a starting URL for the crawl by placing it in a new queue of its own
        /// and appending that queue to <c>temp_list</c>.
        /// </summary>
        /// <param name="seed">The URL to start crawling from.</param>
        /// <returns>
        /// <c>true</c> if the seed was queued; <c>false</c> if <c>Database.LinkExists</c> reports
        /// it as already stored or <c>RobotstxtParser.Approved</c> rejects it.
        /// </returns>
        public bool Seed(string seed)
        {
            // The list of per-seed queues is created lazily on the first call.
            if (temp_list == null)
            {
                temp_list = new LinkedList<ConcurrentQueue<string>>();
            }

            // The database is consulted first; the robots.txt check only runs for unknown links.
            bool accepted = !Database.LinkExists(seed) && RobotstxtParser.Approved(seed);
            if (!accepted)
            {
                return false;
            }

            // Every accepted seed gets a dedicated queue holding just that seed.
            var seedQueue = new ConcurrentQueue<string>();
            seedQueue.Enqueue(seed);
            temp_list.AddLast(seedQueue);

            reporter.Invoke(reporter.ReportQueued, seed + Environment.NewLine);
            return true;
        }
コード例 #2
0
        /// <summary>
        /// Worker routine that processes a single link: downloads the page, schedules its
        /// outgoing links, extracts title, plain text, images and keyword vectors, and then
        /// either adds the page to the database or refreshes an existing, outdated entry.
        /// The routine returns early, without reporting, whenever
        /// <c>Controller.OperationCancelled</c> is set at one of its checkpoints.
        /// </summary>
        /// <param name="obj">
        /// A <c>ThreadParameter</c> whose <c>link</c> is the URL to process and whose
        /// <c>queue</c> receives the outgoing links found on the page.
        /// </param>
        /// <exception cref="ArgumentException">
        /// <paramref name="obj"/> is null or is not a <c>ThreadParameter</c>.
        /// </exception>
        public void ThreadProc(object obj)
        {
            // A stored page is processed again only when its stored date is more than this many days old.
            const int RevisitAfterDays = 7;

            if (Controller.OperationCancelled)
            {
                return;
            }

            HttpDownloader downloader = HttpDownloader.GetInstance();

            // Validate the argument once instead of dereferencing the result of an unchecked 'as' cast.
            ThreadParameter parameter = obj as ThreadParameter;
            if (parameter == null)
            {
                throw new ArgumentException("Expected a non-null ThreadParameter instance.", "obj");
            }

            string link = parameter.link;
            ConcurrentQueue<string> scheduledLinks = parameter.queue;

            bool revisited = false;

            // The downloader already knows this link: only record a hit on its redirect target.
            if (downloader.IsVisited(link))
            {
                Database.LinkHit(downloader.GetRedirectOf(link));
                return;
            }

            if (!RobotstxtParser.Approved(link))
            {
                return;
            }

            if (Controller.OperationCancelled)
            {
                return;
            }

            reporter.Invoke(reporter.ReportStartProcessing, link);

            var response = downloader.GetHtml(link);

            if (Controller.OperationCancelled)
            {
                return;
            }

            if (response == null)
            {
                return;
            }

            HtmlParser doc = new HtmlParser(response, link);

            // From here on work with the redirect target of the response; the database
            // lookups below are keyed on that link rather than on the requested one.
            link = downloader.GetRedirectOf(link);

            if (Database.LinkExists(link))
            {
                Database.LinkHit(link);

                // A missing stored date falls back to the default value, which makes the
                // entry look old enough to be revisited.
                double ageInDays = (DateTime.Now - Database.GetLinkDate(link).GetValueOrDefault()).TotalDays;
                if (ageInDays > RevisitAfterDays)
                {
                    revisited = true;
                }
                else
                {
                    return;
                }
            }

            int linksCount = 0;

            // Outgoing links are scheduled only for pages seen for the first time.
            if (!revisited)
            {
                HashSet<string> distinctLinks = new HashSet<string>(doc.GetOutGoingLinks());
                StringBuilder queued = new StringBuilder();

                foreach (string outgoing in distinctLinks)
                {
                    linksCount++;

                    scheduledLinks.Enqueue(outgoing);

                    if (Controller.OperationCancelled)
                    {
                        return;
                    }

                    queued.Append(outgoing);
                    queued.Append(Environment.NewLine);
                }

                reporter.Invoke(reporter.ReportQueued, queued.ToString());
            }

            string title = doc.GetTitle();
            string plainText = doc.PlainText();

            Dictionary<string, string> images = doc.ImagesVectors();
            Dictionary<string, double> imagesDictionary = doc.KeywordsVectorsFromImages(images);
            Dictionary<string, double> textDictionary = doc.KeywordsVectorsFromText();
            Dictionary<string, double> dictionary = doc.MergeDictionaries(textDictionary, imagesDictionary);

            if (dictionary == null)
            {
                return;
            }

            if (Controller.OperationCancelled)
            {
                return;
            }

            // Redundant re-check, but needed: another worker may have stored the same link
            // in the meantime, and adding it again would create a duplicate entry.
            if (Database.LinkExists(link) && !revisited)
            {
                Database.LinkHit(link);
                return;
            }

            if (revisited)
            {
                Database.UpdateLinkDate(link);
                Database.UpdateLinkTitle(link, title);
                Database.UpdatePageVector(link, dictionary);
                Database.UpdatePageContent(link, plainText);
            }
            else
            {
                Database.AddLink(link, title, linksCount);
                Database.AddPageVector(link, dictionary);
                Database.AddPageContent(link, plainText);
            }

            // Images are stored the same way for new and revisited pages.
            if (images.Count > 0)
            {
                Database.AddPageImages(link, images);
            }

            if (Controller.OperationCancelled)
            {
                return;
            }

            reporter.Invoke(reporter.ReportStatistics, dictionary);

            reporter.Invoke(reporter.ReportProcessed, link);
        }