Ejemplo n.º 1
0
 /// <summary>
 /// Creates the thread manager in its initial state: no threads counted,
 /// not busy, and the current queue index set to 0.
 /// </summary>
 /// <param name="RP">GUI instance stored in <c>reporter</c>.</param>
 public ThreadManager(GUI RP)
 {
     // The two GetInstance() calls run first, in this order.
     Database = DBController.GetInstance();
     // The return value is discarded; presumably this just forces the
     // downloader instance to exist up front (confirm in HttpDownloader).
     HttpDownloader.GetInstance();

     // Collaborators owned or referenced by this manager.
     reporter = RP;
     ClosingLock = new object();

     // Bookkeeping starts from a clean slate.
     CurrentQueue = 0;
     ThreadCount = 0;
     busy = false;
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Decides whether <paramref name="link"/> may be fetched according to the
        /// <c>User-agent: *</c> section of its host's robots.txt.
        /// </summary>
        /// <remarks>
        /// Parsed rules are cached per host in <c>DisallowCache</c> and <c>AllowCache</c>.
        /// A <c>null</c> cache entry is stored when <c>GetRobotsTxt</c> returned <c>null</c>,
        /// in which case every link on that host is approved. A matching Allow rule
        /// overrides a matching Disallow rule. Rules are matched anywhere in the full
        /// link text, with <c>*</c> acting as a wildcard.
        /// </remarks>
        /// <param name="link">Absolute URL to check.</param>
        /// <returns>
        /// <c>false</c> when <paramref name="link"/> is not a valid absolute URI, or when it
        /// matches a Disallow rule and no Allow rule; otherwise <c>true</c>.
        /// </returns>
        public static bool Approved(string link)
        {
            Uri uri;

            // TryCreate rejects null and malformed links without throwing.
            if (!Uri.TryCreate(link, UriKind.Absolute, out uri))
            {
                return(false);
            }

            string          domain = uri.Host;
            MatchCollection Disallows;
            MatchCollection Allows;

            // Both caches are read with TryGetValue: another thread may already have
            // added the Disallow entry but not yet the Allow entry, and using the
            // indexer in that window would throw. A half-published host is simply
            // treated as a cache miss and parsed again.
            if (DisallowCache.TryGetValue(domain, out Disallows) && AllowCache.TryGetValue(domain, out Allows))
            {
                if (Allows == null || Disallows == null)
                {
                    return(true);
                }
            }
            else
            {
                string robotstxt = HttpDownloader.GetInstance().GetRobotsTxt(domain);

                if (robotstxt == null)
                {
                    DisallowCache.TryAdd(domain, null);
                    AllowCache.TryAdd(domain, null);
                    return(true);
                }

                // Drop "# ..." comments up to the end of each line.
                robotstxt = Regex.Replace(robotstxt, @"#.*?$", "", RegexOptions.Multiline);

                // Keep only the section that follows "User-agent: *", up to the next
                // User-agent line or the end of the file.
                string robotsmatch = Regex.Match(robotstxt, @"(?<=User\-agent\: \*).+?(?=(User-agent\:)|$)", RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled).Value;

                Disallows = Regex.Matches(robotsmatch, @"(?<=Disallow: ).+?(?=(\u000D|( +#)|$))", RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.Compiled);
                Allows    = Regex.Matches(robotsmatch, @"(?<=[^(Dis)]allow: ).+?(?=(\u000D|( +#)|$))", RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.Compiled);

                DisallowCache.TryAdd(domain, Disallows);
                AllowCache.TryAdd(domain, Allows);
            }

            // Not disallowed at all: nothing for an Allow rule to override.
            if (!MatchesAnyRule(link, Disallows))
            {
                return(true);
            }

            // Disallowed, unless an Allow rule also matches.
            return(MatchesAnyRule(link, Allows));
        }

        /// <summary>
        /// Returns <c>true</c> when at least one rule in <paramref name="rules"/> matches
        /// somewhere in <paramref name="link"/>.
        /// </summary>
        private static bool MatchesAnyRule(string link, MatchCollection rules)
        {
            foreach (Match rule in rules)
            {
                if (Regex.IsMatch(link, RuleToPattern(rule.Value)))
                {
                    return(true);
                }
            }

            return(false);
        }

        /// <summary>
        /// Converts a robots.txt rule value into a regex pattern: every character is
        /// taken literally except <c>*</c>, which becomes a lazy wildcard.
        /// </summary>
        private static string RuleToPattern(string rule)
        {
            // Regex.Escape turns '*' into "\*"; swap that back into ".*?" so the
            // wildcard keeps working while every other metacharacter stays literal.
            return(Regex.Escape(rule).Replace(@"\*", ".*?"));
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Worker entry point for a single link: checks robots.txt, downloads and parses
        /// the page, enqueues its outgoing links and stores the page data in the database,
        /// reporting progress through <c>reporter</c>.
        /// </summary>
        /// <remarks>
        /// The method returns early, without reporting, whenever
        /// <c>Controller.OperationCancelled</c> is set, the link was already visited, the
        /// link is not approved by robots.txt, the download returned <c>null</c>, or the
        /// page is already stored and is not older than 7 days.
        /// </remarks>
        /// <param name="obj">
        /// A <c>ThreadParameter</c> carrying the link to process and the queue that receives
        /// newly discovered links. Any other value (including <c>null</c>) is ignored.
        /// </param>
        public void ThreadProc(object obj)
        {
            // Cast once and validate: dereferencing a failed "as" cast would raise a
            // NullReferenceException on the worker thread.
            ThreadParameter parameter = obj as ThreadParameter;

            if (parameter == null)
            {
                return;
            }

            if (Controller.OperationCancelled)
            {
                return;
            }

            HttpDownloader downloader = HttpDownloader.GetInstance();

            string link = parameter.link;
            ConcurrentQueue<string> scheduledLinks = parameter.queue;

            if (downloader.IsVisited(link))
            {
                // Already fetched during this run: only count the hit on the resolved link.
                Database.LinkHit(downloader.GetRedirectOf(link));
                return;
            }

            if (!RobotstxtParser.Approved(link))
            {
                return;
            }

            if (Controller.OperationCancelled)
            {
                return;
            }

            reporter.Invoke(reporter.ReportStartProcessing, link);

            var response = downloader.GetHtml(link);

            if (Controller.OperationCancelled)
            {
                return;
            }

            if (response == null)
            {
                return;
            }

            HtmlParser doc = new HtmlParser(response, link);

            // From here on work with the link the response was redirected to.
            link = downloader.GetRedirectOf(link);

            bool revisited = false;

            if (Database.LinkExists(link))
            {
                Database.LinkHit(link);

                // Stored pages are refreshed only when they are more than 7 days old.
                if ((DateTime.Now - Database.GetLinkDate(link).GetValueOrDefault()).TotalDays > 7)
                {
                    revisited = true;
                }
                else
                {
                    return;
                }
            }

            int linkscount = 0;

            // Outgoing links are only scheduled the first time a page is stored.
            if (!revisited)
            {
                HashSet<string> distinctLinks = new HashSet<string>(doc.GetOutGoingLinks());
                StringBuilder   queued        = new StringBuilder();

                foreach (string outgoing in distinctLinks)
                {
                    linkscount++;

                    scheduledLinks.Enqueue(outgoing);

                    if (Controller.OperationCancelled)
                    {
                        return;
                    }

                    queued.Append(outgoing);
                    queued.Append(Environment.NewLine);
                }

                reporter.Invoke(reporter.ReportQueued, queued.ToString());
            }

            string title     = doc.GetTitle();
            string PlainText = doc.PlainText();

            Dictionary<string, string> images           = doc.ImagesVectors();
            Dictionary<string, double> imagesDictionary = doc.KeywordsVectorsFromImages(images);
            Dictionary<string, double> textDictionary   = doc.KeywordsVectorsFromText();
            Dictionary<string, double> dictionary       = doc.MergeDictionaries(textDictionary, imagesDictionary);

            // NOTE: storing ordered lists (doc.GetOrderedLists / AddPageStepsList /
            // UpdatePageStepsList) was disabled in the original code and stays disabled.

            if (dictionary == null)
            {
                return;
            }

            if (Controller.OperationCancelled)
            {
                return;
            }

            if (revisited)
            {
                Database.UpdateLinkDate(link);
                Database.UpdateLinkTitle(link, title);
                Database.UpdatePageVector(link, dictionary);
                Database.UpdatePageContent(link, PlainText);
            }
            else if (Database.LinkExists(link))
            {
                // Re-check just before inserting: another worker may have stored the
                // same link in the meantime, which would lead to a duplicate entry.
                Database.LinkHit(link);
                return;
            }
            else
            {
                Database.AddLink(link, title, linkscount);
                Database.AddPageVector(link, dictionary);
                Database.AddPageContent(link, PlainText);
            }

            // Images are stored the same way for new and refreshed pages.
            if (images.Count > 0)
            {
                Database.AddPageImages(link, images);
            }

            if (Controller.OperationCancelled)
            {
                return;
            }

            reporter.Invoke(reporter.ReportStatistics, dictionary);

            reporter.Invoke(reporter.ReportProcessed, link);
        }