Example #1
        public ViewResult Index(string searchingText)
        {
            try
            {
                // Kick off a page load on every configured parser in parallel.
                Task[] tasks = preferences.parsers
                               .Select(p => p.LoadPage(searchingText))
                               .ToArray();

                // WaitAny returns the index of the first task to finish,
                // so the fastest search engine wins.
                int index = Task.WaitAny(tasks);

                // Observe the winning task so that a fault surfaces here as
                // an AggregateException and is handled by the catch below.
                tasks[index].Wait();

                List<string> result = preferences.parsers[index].Parse();

                // Persist every URL the winning parser extracted.
                foreach (var r in result)
                {
                    FoundUrl url = new FoundUrl();
                    url.Url       = r;
                    url.Engine    = preferences.parsers[index].searchEngineUrl;
                    url.DateFound = DateTime.Now;
                    context.Add(url);
                }
                context.SaveChanges();

                return View(result);
            }
            catch (AggregateException ex)
            {
                // Show the error messages in place of the search results.
                List<string> result = new List<string>();
                foreach (var e in ex.InnerExceptions)
                {
                    result.Add(e.Message);
                }
                return View(result);
            }
        }
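
For context, Index assumes a parser abstraction roughly like the following. This is a minimal sketch: the member names LoadPage, Parse, and searchEngineUrl come from the call sites above, while the interface name and exact signatures are assumptions.

        // Hypothetical contract for the parsers in preferences.parsers; only
        // the member names are taken from the example, the rest is assumed.
        public interface ISearchParser
        {
            // Base URL of the search engine this parser targets.
            string searchEngineUrl { get; }

            // Starts downloading the results page for the given query.
            Task LoadPage(string searchingText);

            // Extracts result URLs from the page fetched by LoadPage.
            List<string> Parse();
        }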
Example #2
        /// <summary>
        ///     Crawls a given URL, queueing all found URLs and storing information about
        ///     the given URL for later querying.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper, passed by reference.
        /// </param>
        /// <param name="storage">
        ///     Crawler Azure storage helper, passed by reference.
        /// </param>
        /// <param name="url">
        ///     The given URL to crawl.
        /// </param>
        public static void CrawlUrl(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string url)
        {
            if (data.ChkIfUriAllowed(url))
            {
                // Unsure if this URL normalization is necessary.
                //if (!url.Contains(".htm"))
                //{
                //    if (!url.Contains(".jpg") && !url.Contains(".png"))
                //    {
                //        if (url.EndsWith(@"/"))
                //        {
                //            url += "index.html";
                //        }
                //        else
                //        {
                //            url += @"/index.html";
                //        }
                //    }
                //}
                try
                {
                    // Load the page, then pull out its anchors, its title,
                    // and any <meta name="pubdate"> tag (treated as the
                    // page's last-modified date).
                    var web      = new HtmlWeb();
                    var currDoc  = web.Load(url);
                    var urlNodes = currDoc.DocumentNode.Descendants("a")
                                   .ToList();
                    var urlPageTitle = currDoc.DocumentNode.Descendants("title")
                                       .First()
                                       .InnerText;
                    var urlLastModNode = currDoc.DocumentNode.Descendants("meta")
                                         .Where(y => y.Attributes.Contains("name"))
                                         .Where(y => y.Attributes["name"].Value == "pubdate")
                                         .ToList();

                    DateTime? urlLastMod = null;
                    if (urlLastModNode.Count > 0)
                    {
                        urlLastMod = DateTime.Parse(
                            urlLastModNode.First().Attributes["content"].Value);
                    }

                    List<string> urlsToQueue = new List<string>();

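                    // Collect the href target of every anchor on the page.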
                    foreach (var urlNode in urlNodes)
                    {
                        if (urlNode.Attributes.Contains("href"))
                        {
                            urlsToQueue.Add(urlNode.Attributes["href"].Value);
                        }
                    }

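                    // Check each discovered URL and queue it for crawling.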
                    foreach (string newUrl in urlsToQueue)
                    {
                        ChkAndAddUrl(newUrl, url, urlLastMod, ref data, ref storage);
                    }

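                    // Record this URL as indexed (first visit only), then
                    // bump the overall crawl counter.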
                    if (!data.AddedUrls.Contains(url))
                    {
                        data.AddedUrls.Add(url);
                        data.NumUrlsIndexed++;
                    }
                    data.NumUrlsCrawled++;
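                    // Write the crawled URL and the updated counters to table storage.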
                    FoundUrl       finishedUrl = new FoundUrl(urlPageTitle, (urlLastMod != null ? urlLastMod.ToString() : "NULL"), url);
                    UrlTableCount  newCount    = new UrlTableCount(data.NumUrlsCrawled, data.NumUrlsIndexed);
                    TableOperation insertUrl   = TableOperation.InsertOrReplace(finishedUrl);
                    TableOperation insertCount = TableOperation.InsertOrReplace(newCount);
                    storage.UrlTable.Execute(insertUrl);
                    storage.UrlTable.Execute(insertCount);
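                    // Keep a rolling window of the ten most recently crawled URLs.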
                    if (data.LastTenUrls.Count == 10)
                    {
                        data.LastTenUrls.Dequeue();
                    }
                    data.LastTenUrls.Enqueue(url);
                }
                catch (Exception ex)
                {
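                    // Record the failing URL and the exception in the error table.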
                    ErrorUrl       errorUrl       = new ErrorUrl(url, ex.ToString());
                    TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
                    storage.ErrorTable.Execute(insertErrorUrl);
                }
            }
        }
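
For reference, CrawlUrl appears to target the Microsoft.WindowsAzure.Storage.Table API (TableOperation.InsertOrReplace, Execute on a table). The entity types could look roughly like this minimal sketch: the constructor shapes are inferred from the calls above, while all property names and key choices are assumptions.

        using Microsoft.WindowsAzure.Storage.Table;

        // Hypothetical entity for a successfully crawled URL.
        public class FoundUrl : TableEntity
        {
            public string PageTitle    { get; set; }
            public string LastModified { get; set; }
            public string Url          { get; set; }

            public FoundUrl() { }

            public FoundUrl(string pageTitle, string lastModified, string url)
            {
                // URL-encode the key since row keys may not contain
                // characters such as '/', '#', or '?'.
                PartitionKey = "url";
                RowKey       = System.Net.WebUtility.UrlEncode(url);
                PageTitle    = pageTitle;
                LastModified = lastModified;
                Url          = url;
            }
        }

        // Hypothetical entity for a URL that failed to crawl.
        public class ErrorUrl : TableEntity
        {
            public string Url   { get; set; }
            public string Error { get; set; }

            public ErrorUrl() { }

            public ErrorUrl(string url, string error)
            {
                PartitionKey = "error";
                RowKey       = System.Net.WebUtility.UrlEncode(url);
                Url          = url;
                Error        = error;
            }
        }

UrlTableCount presumably uses a fixed PartitionKey/RowKey pair, so that InsertOrReplace keeps overwriting a single counter row instead of appending a new row on every crawl.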