/// <summary>
/// Shows the "add a hidden service" form and, when the form is posted, validates the
/// submitted URL and queues it as a crawl request.
/// </summary>
/// <returns>
/// A redirect to "/?msg=1" when the URL was queued; otherwise the Add view, with
/// <c>ViewBag.AlertDanger</c> set when the posted URL was empty or rejected.
/// </returns>
public ActionResult Add()
{
    Trace.TraceInformation("HomeController.Add : " + Request.RawUrl);
    ViewBag.TitlePrefix = "Add a hidden service to ";

    // Plain display of the form: nothing was posted.
    if (Request.Form.Count == 0)
    {
        return View();
    }

    string url = Request.Form["url"];
    if (string.IsNullOrWhiteSpace(url))
    {
        ViewBag.AlertDanger = "ERROR : the URL " + url + " is empty, please check and retry.";
        return View();
    }

    DisableCache();
    url = url.Trim();

    // Add the default scheme only when none was typed. The check is ordinal and case-insensitive so that
    // "HTTP://x.onion" is not prefixed a second time, and it looks for the full "http://" / "https://"
    // so that a bare host whose name merely begins with "http" still receives a scheme.
    bool hasScheme = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        url = "http://" + url;
    }

    if (url.Length < SqlManager.MaxSqlIndex
        && Uri.TryCreate(url, UriKind.Absolute, out Uri uriResult)
        && UriManager.IsTorUri(uriResult))
    {
        using (SqlManager sql = new SqlManager())
        {
            UriManager.NormalizeUrlInit(sql);
            url = UriManager.NormalizeUrl(uriResult);
            sql.CrawleRequestEnqueue(url);
        }
        return Redirect("/?msg=1");
    }

    ViewBag.AlertDanger = "ERROR : the URL " + url + " doesn't seems to be a valide hidden service, please check and retry.";
    return View();
}
/// <summary>
/// Crawls a single URL: downloads it through the proxy, extracts its title, text,
/// headings and links, and stores the result through <paramref name="sql"/>.
/// </summary>
/// <param name="proxy">Proxy used to download the page.</param>
/// <param name="url">Absolute URL to crawl.</param>
/// <param name="sql">Database access used to record the outcome of the crawl.</param>
/// <param name="cancellationToken">Token checked between steps and passed to the database calls.</param>
/// <returns>
/// <c>true</c> when the URL has been dealt with (page stored, or URL purged after a failed download);
/// <c>false</c> when the URL could not be processed (bad URL, cancellation, download/parse/database
/// failure) - according to the original inline comments the caller will then retry.
/// </returns>
internal static async Task<bool> CrawleOneAsync(ProxyManager proxy, string url, SqlManager sql, CancellationToken cancellationToken)
{
    try
    {
        Uri uriOrig;
        if (!Uri.TryCreate(url, UriKind.Absolute, out uriOrig)) // TOFIX : how this case can happen ???
        {
            Trace.TraceWarning("CrawleManager.CrawleOneAsync Bad Url Requested : " + url);
            return false;
        }

        // <!> the URL may have changed during the normalization done by PageEntity: use uriOrig only for its host
        PageEntity page = new PageEntity(uriOrig);
        if (uriOrig.ToString() != page.Url)
        {
            await sql.FixUri(uriOrig.ToString(), page.Url, cancellationToken);
        }

        string rawHtml;
        try
        {
            rawHtml = await proxy.DownloadStringTaskAsync(page.HiddenService, page.Url);
        }
        catch (WebException ex)
        {
            if (cancellationToken.IsCancellationRequested)
            {
                return false;
            }

            bool isRetry;
            if (ex.Status != WebExceptionStatus.RequestCanceled)
            {
                isRetry = url == page.HiddenService; // default
                if (ex.Response is HttpWebResponse err)
                {
                    Trace.TraceInformation("CrawleManager.CrawleOneAsync DownloadStringTaskAsync " + url + " : WebException.HttpWebResponse " + ex.GetBaseException().Message);
                    // 301 (HttpStatusCode.Moved is the same value as MovedPermanently) or 302
                    if (err.StatusCode == HttpStatusCode.MovedPermanently || err.StatusCode == HttpStatusCode.Found)
                    {
                        await EnqueueRedirectTargetAsync(err.Headers["Location"], sql, cancellationToken);
                    }
                    else if (err.StatusCode == HttpStatusCode.InternalServerError || err.StatusCode == HttpStatusCode.ServiceUnavailable) // server errors are retried
                    {
                        isRetry = true;
                    }
                }
                else if (ex.Response is SocksHttpWebResponse webResponse)
                {
                    Trace.TraceInformation("CrawleManager.CrawleOneAsync DownloadStringTaskAsync " + url + " : WebException.SocksHttpWebResponse " + ex.GetBaseException().Message);
                    if (webResponse.StatusCode == HttpStatusCode.MovedPermanently || webResponse.StatusCode == HttpStatusCode.Found)
                    {
                        await EnqueueRedirectTargetAsync(webResponse.Headers["Location"], sql, cancellationToken);
                    }
                    else if (webResponse.StatusCode == HttpStatusCode.InternalServerError || webResponse.StatusCode == HttpStatusCode.ServiceUnavailable) // server errors are retried
                    {
                        isRetry = true;
                    }
                }
                else
                {
                    Trace.TraceInformation("CrawleManager.CrawleOneAsync DownloadStringTaskAsync " + url + " : WebException.?Response " + ex.GetBaseException().Message);
                }
            }
            else // ex.Status == WebExceptionStatus.RequestCanceled : raised by ProxyManager_DownloadProgressChanged
            {
                Trace.TraceInformation("CrawleManager.CrawleOneAsync DownloadStringTaskAsync " + url + " : Cancelled");
                isRetry = false;
            }

            if (isRetry)
            {
                // record the failure on the page and report a failure to the caller
                await sql.PageUpdateKo(page, cancellationToken);
                return false;
            }

            await sql.UrlPurge(url, cancellationToken);
            return true; // looks like an OK for the manager : he won't retry
        }
        catch (OperationCanceledException)
        {
            return false; // retry
        }
        catch (Exception ex)
        {
            Trace.TraceInformation("CrawleManager.CrawleOneAsync DownloadStringTaskAsync " + url + " : Exception " + ex.GetBaseException().ToString());
            if (!cancellationToken.IsCancellationRequested)
            {
                await sql.PageUpdateKo(page, cancellationToken);
#if DEBUG
                if (Debugger.IsAttached) { Debugger.Break(); }
#endif
            }
            return false;
        }

        HtmlDocument htmlDoc = new HtmlDocument();
        try
        {
            htmlDoc.LoadHtml(rawHtml);
        }
        catch // original note: LoadHtml may cause a stack overflow on recursive calls of HtmlAgilityPack.HtmlNode.CloseNode(HtmlNode, Int32), not catchable by catch Exception
        {
            Trace.TraceWarning("CrawleManager.CrawleOneAsync LoadHtml " + url + " : Formating Exception");
            if (!cancellationToken.IsCancellationRequested)
            {
                await sql.PageUpdateKo(page, cancellationToken);
#if DEBUG
                if (Debugger.IsAttached) { Debugger.Break(); }
#endif
            }
            return false;
        }
        rawHtml = null; // the raw text is no longer needed once parsed

        // scripts, styles and comments would otherwise appear in InnerText
        RemoveHtmlNodes(htmlDoc.DocumentNode.Descendants("script"));
        RemoveHtmlNodes(htmlDoc.DocumentNode.Descendants("style"));
        RemoveHtmlNodes(htmlDoc.DocumentNode.Descendants().Where(n => n.NodeType == HtmlNodeType.Comment));

        page.Title = ExtractPageTitle(htmlDoc, uriOrig);
        page.InnerText = ExtractPageInnerText(htmlDoc);
        page.Heading = ExtractPageHeading(htmlDoc);

        // <a href> digest
        page.OuterLinks = new HashSet<string>();
        page.OuterHdLinks = new HashSet<string>();
        page.InnerLinks = new HashSet<string>();
        foreach (HtmlNode anchor in htmlDoc.DocumentNode.Descendants("a"))
        {
            if (cancellationToken.IsCancellationRequested)
            {
                break; // once cancelled, no further link would have been processed anyway
            }
            if (!anchor.Attributes.Contains("href"))
            {
                continue;
            }

            string href = anchor.Attributes["href"].Value;
            Uri target = null;
            bool resolved;
            if (href.StartsWith("http", StringComparison.Ordinal))
            {
                // TryCreate also rejects funny links such as href='http://'
                resolved = Uri.TryCreate(href, UriKind.Absolute, out target);
            }
            else if (!href.StartsWith("#", StringComparison.Ordinal))
            {
                resolved = Uri.TryCreate(uriOrig, href, out target);
            }
            else
            {
                resolved = false; // in-page anchor
            }

            if (resolved && UriManager.IsTorUri(target))
            {
                // Every resolved link is classified by its host: an href such as "//other.onion/x" or
                // "HTTP://other.onion" goes through the relative branch above but still points to another host.
                RegisterPageLink(page, uriOrig, target, url);
            }
        }
        htmlDoc = null;

        // a page does not link to itself nor to its own root
        page.InnerLinks.Remove(page.Url);
        page.InnerLinks.Remove(page.HiddenService);

        await sql.PageInsertOrUpdateOk(page, cancellationToken);
        return true;
    }
    catch (OperationCanceledException)
    {
        return false;
    }
    catch (SqlException ex)
    {
        Exception exb = ex.GetBaseException();
        if (!exb.Message.Contains("Operation cancelled by user.")) // that message is handled like an OperationCanceledException
        {
            Trace.TraceWarning("CrawleManager.CrawleOneAsync SqlException : " + ex.GetBaseException().ToString());
#if DEBUG
            if (Debugger.IsAttached) { Debugger.Break(); }
#endif
        }
        return false; // will retry and not save the page in failed
    }
    catch (Exception ex)
    {
        Trace.TraceWarning("CrawleManager.CrawleOneAsync Exception for " + url + " : " + ex.GetBaseException().ToString());
#if DEBUG
        if (Debugger.IsAttached) { Debugger.Break(); }
#endif
        return false; // will retry and not save the page in failed
    }
}

// Queues the target of an HTTP redirect as a crawl request when it is a valid absolute Tor URL.
private static async Task EnqueueRedirectTargetAsync(string redirectUrl, SqlManager sql, CancellationToken cancellationToken)
{
    if (!string.IsNullOrEmpty(redirectUrl) && Uri.TryCreate(redirectUrl, UriKind.Absolute, out Uri uri) && UriManager.IsTorUri(uri))
    {
        redirectUrl = UriManager.NormalizeUrl(uri);
        await sql.CrawleRequestEnqueueAsync(redirectUrl, (short)(UriManager.IsHiddenService(redirectUrl) ? 3 : 4), cancellationToken);
    }
}

// Detaches the given nodes from their document (the sequence is materialized first because removal mutates the tree).
private static void RemoveHtmlNodes(IEnumerable<HtmlNode> nodes)
{
    if (nodes == null)
    {
        return;
    }
    foreach (HtmlNode node in nodes.ToList())
    {
        node.Remove();
    }
}

// Returns the normalized <title> text, looking outside <head> for badly formatted pages, or the host name when there is none.
private static string ExtractPageTitle(HtmlDocument htmlDoc, Uri uriOrig)
{
    HtmlNode titleNode = htmlDoc.DocumentNode.SelectSingleNode("//head/title");
    if (titleNode == null || string.IsNullOrEmpty(titleNode.InnerText))
    {
        // some very badly formatted sites have no <head>: look everywhere
        titleNode = htmlDoc.DocumentNode.SelectSingleNode("//*/title");
    }
    if (titleNode != null && !string.IsNullOrEmpty(titleNode.InnerText))
    {
        return PageEntity.NormalizeText(HttpUtility.HtmlDecode(titleNode.InnerText));
    }
    return uriOrig.Host;
}

// Returns the normalized text of <body>, or of the whole document when <body> is missing or empty; never null.
private static string ExtractPageInnerText(HtmlDocument htmlDoc)
{
    HtmlNode textNode = htmlDoc.DocumentNode.SelectSingleNode("//body");
    if (textNode == null || string.IsNullOrEmpty(textNode.InnerText))
    {
        // some very badly formatted sites have no <body>: take all the web content
        textNode = htmlDoc.DocumentNode;
    }
    if (textNode != null && !string.IsNullOrEmpty(textNode.InnerText))
    {
        return PageEntity.NormalizeText(HttpUtility.HtmlDecode(textNode.InnerText));
    }
    return string.Empty; // a null will raise an exception on the sql proc call
}

// Concatenates the h1, then h2, then h3 texts until SqlManager.MaxSqlIndex characters are reached; never null.
private static string ExtractPageHeading(HtmlDocument htmlDoc)
{
    StringBuilder heading = new StringBuilder();
    foreach (string tag in new[] { "h1", "h2", "h3" })
    {
        if (heading.Length >= SqlManager.MaxSqlIndex)
        {
            break;
        }
        foreach (HtmlNode node in htmlDoc.DocumentNode.Descendants(tag))
        {
            if (heading.Length >= SqlManager.MaxSqlIndex)
            {
                break;
            }
            heading.AppendLine(node.InnerText);
        }
    }
    if (heading.Length > 0)
    {
        return PageEntity.NormalizeText(HttpUtility.HtmlDecode(heading.ToString()));
    }
    return string.Empty; // a null will raise an exception on the sql proc call
}

// Files a resolved Tor link under the inner, outer or outer-hidden-service links of the page, by comparing its host with the page's.
private static void RegisterPageLink(PageEntity page, Uri uriOrig, Uri link, string url)
{
    string str = UriManager.NormalizeUrl(link);
    if (str.Length >= SqlManager.MaxSqlIndex)
    {
        return;
    }

    if (link.DnsSafeHost == uriOrig.DnsSafeHost)
    {
        page.InnerLinks.Add(str); // a set: adding a known link is a no-op
        return;
    }

    if (page.OuterLinks.Contains(str) || page.OuterHdLinks.Contains(str))
    {
        return; // already digested; also avoids tracing the same strange link twice
    }

    string hd = UriManager.GetHiddenService(str); // digest outer HD
    if (hd.Length <= 37 && hd.IndexOfAny(forbiddenInHd) == -1) // some strange url may cause injection
    {
        if (hd == str)
        {
            page.OuterHdLinks.Add(str);
        }
        else
        {
            page.OuterLinks.Add(str);
        }
    }
    else
    {
        Trace.TraceWarning("CrawleManager.CrawleOneAsync Strange HD outer link from " + url + " : " + hd);
#if DEBUG
        if (Debugger.IsAttached) { Debugger.Break(); }
#endif
    }
}
/// <summary>
/// Creates the service and keeps the given <see cref="UriManager"/> for later use.
/// </summary>
/// <param name="uriManager">Manager stored in the <c>m_uriManager</c> field.</param>
public CorsPolicyService(UriManager uriManager) => m_uriManager = uriManager;
/// <summary>
/// Main loop of the worker role: keeps one crawler task running per slot of <c>taskPool</c>
/// until cancellation is requested, and periodically forces a compacting garbage collection.
/// </summary>
/// <param name="cancellationToken">Token that stops the loop and is handed to every crawler task.</param>
/// <returns>A task that completes when the loop has ended; cancellation and other exceptions are handled inside.</returns>
protected override async Task RunAsync(CancellationToken cancellationToken)
{
    Trace.TraceInformation("WorkerRole.RunAsync : Start");
    try
    {
        PerfCounter.Init();
        RotManager.TryKillTorIfRequired();

        // pools init
#if DEBUG
        taskPool = new List<Task>(Settings.Default.NbCrawlersPerInstance / 2);
#else
        taskPool = new List<Task>(Settings.Default.NbCrawlersPerInstance);
#endif
        for (int i = 0; i < taskPool.Capacity; i++)
        {
            taskPool.Add(null); // one empty slot per crawler
        }

        // NormalizeUrl Init
        using (SqlManager sql = new SqlManager())
        {
            UriManager.NormalizeUrlInit(sql);
        }

        int gCFullCollectRemainingMin = Settings.Default.GCFullCollectMin;
        HtmlDocument.MaxDepthLevel = Int16.MaxValue; // the default Int32.MaxValue is far too high: a call stack exception would be raised long before (around 43k)

        // main loop
        while (!cancellationToken.IsCancellationRequested)
        {
            for (int i = 0; !cancellationToken.IsCancellationRequested && i < taskPool.Count; i++)
            {
                Task task = taskPool[i];
                if (task != null && task.IsCompleted) // IsCompleted is also true for cancelled and faulted tasks
                {
                    if (task.IsFaulted)
                    {
                        // do not lose the failure of a crawler silently
                        Trace.TraceError("WorkerRole.RunAsync crawler " + i + " faulted : " + task.Exception.GetBaseException().ToString());
                    }
                    task.Dispose();
                    taskPool[i] = null;
                }

                if (taskPool[i] == null && !cancellationToken.IsCancellationRequested)
                {
                    int slot = i; // <!> copy for the lambda, else i may be changed by the next iteration before the task reads it
                    // Task.Run unwraps the task returned by RunTask, so no thread-pool thread is blocked waiting for the crawler.
                    taskPool[i] = Task.Run(() => RunTask(slot, cancellationToken), cancellationToken);
                    await Task.Delay(1000, cancellationToken); // avoid a violent startup with x tor started at the same instant
                }
            }

            await Task.Delay(60000, cancellationToken); // 1 min

            // We use a lot of large objects for a short time, so the GC default mode needs to be changed
            gCFullCollectRemainingMin--;
            if (!cancellationToken.IsCancellationRequested && gCFullCollectRemainingMin == 0)
            {
                gCFullCollectRemainingMin = Settings.Default.GCFullCollectMin;
                // https://msdn.microsoft.com/en-us/library/system.runtime.gcsettings.largeobjectheapcompactionmode(v=vs.110).aspx
                GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce;
                GC.Collect(); // takes some time, frees RAM and reduces the LargeObjectHeap fragmentation
            }
        }
    }
    catch (OperationCanceledException)
    {
        // expected when the role is stopped while waiting in Task.Delay
    }
    catch (AggregateException)
    {
    }
    catch (Exception ex)
    {
        Trace.TraceError("WorkerRole.RunAsync Exception : " + ex.GetBaseException().ToString());
#if DEBUG
        if (Debugger.IsAttached) { Debugger.Break(); }
#endif
    }
    Trace.TraceInformation("WorkerRole.RunAsync : End");
}
/// <summary>
/// Home page and search entry point. Depending on the query string it shows the home page,
/// runs a search ("q", with optional page "p" and full-result flag "f"), redirects to the
/// address given in "url", or shows the confirmation message selected by "msg".
/// </summary>
/// <returns>
/// The "Result" view when a search was executed, a redirect for the "url" parameter,
/// otherwise the default view (possibly with an alert set in the ViewBag).
/// </returns>
public ActionResult Index()
{
    Trace.TraceInformation("HomeController.Index : " + Request.RawUrl);
    //ViewBag.AlertDanger = "DATABASE MAINTENANCE, please come back tomorrow.";

    if (Request.QueryString.Count == 0)
    {
        ViewBag.FastPageDisplay = true;
        return View();
    }

    string q = Request.QueryString["q"];
    if (string.IsNullOrWhiteSpace(q))
    {
        q = Request.QueryString["url"];
        if (!string.IsNullOrWhiteSpace(q))
        {
            ShorterCache();
            // NOTE(review): the target comes straight from the query string (open redirect) - confirm this is intended or validate it.
            return Redirect(q);
        }

        // normal cache
        q = Request.QueryString["msg"];
        if (!string.IsNullOrWhiteSpace(q))
        {
            switch (q)
            {
                case "1":
                    ViewBag.AlertSuccess = "Thanks, the URL will be checked soon.";
                    break;
                case "2":
                    ViewBag.AlertSuccess = "Thanks for contacting us.";
                    break;
            }
        }
        return View();
    }

    // Requested page: TryParse sets its out value to 0 on failure, so a missing, non-numeric or
    // non-positive "p" must all fall back to the first page.
    short p;
    if (!short.TryParse(Request.QueryString["p"], out p) || p <= 0)
    {
        p = 1;
    }

    ShorterCache();
    using (SqlManager sql = new SqlManager())
    {
        HashSet<string> sw = GetStopWords(sql);
        List<string> keywords = q.Split(splitChar, StringSplitOptions.RemoveEmptyEntries).ToList();
        bool hasSW = keywords.RemoveAll(k => sw.Contains(k.ToLowerInvariant())) > 0;

        if (keywords.Count == 0)
        {
            ViewBag.AlertDanger = "Your keyword(s) have been removed. Please read the About section for more information on usage and policy. You may use the Contact section if you think there is a mistake.";
            return View();
        }

        if (hasSW)
        {
            ViewBag.AlertWarning = "Some of your keyword(s) have been removed. Please read the About section for more information on usage and policy.";
            q = string.Join(" ", keywords);
        }
        else
        {
            q = q.Trim();
        }
        if (q.Length > 64)
        {
            ViewBag.AlertWarning = "Your search keyword(s) have been reduced";
            q = q.Substring(0, 64);
        }

        bool isFull = Request.QueryString["f"] == "1"; // result grouped by domain or not
        ViewBag.IsFull = isFull;
        ViewBag.Research = q;
        ViewBag.ResearchUrlEncoded = Url.Encode(q);
        ViewBag.TitlePrefix = q + " - ";

        SearchResultEntity ret;
        string first = keywords[0];

        // Advanced search?
        if (first.IndexOf(':') < 0)
        {
            ret = sql.GetSearchResult(q, p, isFull);
        }
        else
        {
            Uri url;
            if (first.StartsWith("site:", StringComparison.InvariantCultureIgnoreCase) && first.Length > 5)
            {
                ViewBag.IsFull = null; // removing filtering IHM
                if (Uri.TryCreate(PrependHttpSchemeIfMissing(first.Substring(5)), UriKind.Absolute, out url) && UriManager.IsTorUri(url))
                {
                    ret = sql.GetSearchSiteResult(UriManager.GetHiddenService(url.ToString()), p);
                }
                else
                {
                    ViewBag.AlertDanger = "Error processing the Url. Please check or contact us.";
                    return View();
                }
            }
            else if (first.StartsWith("cache:", StringComparison.InvariantCultureIgnoreCase) && first.Length > 6)
            {
                ViewBag.IsFull = null; // removing filtering IHM
                if (Uri.TryCreate(PrependHttpSchemeIfMissing(first.Substring(6)), UriKind.Absolute, out url) && UriManager.IsTorUri(url))
                {
                    ret = sql.GetSearchCachedResult(url.ToString());
                }
                else
                {
                    ViewBag.AlertDanger = "Error processing the Url. Please check or contact us.";
                    return View();
                }
            }
            else if (q.StartsWith("intitle:", StringComparison.InvariantCultureIgnoreCase) && q.Length > 8)
            {
                ret = sql.GetSearchTitleResult(q.Substring(8), p, isFull);
            }
            else if (q.StartsWith("allintitle:", StringComparison.InvariantCultureIgnoreCase) && q.Length > 11)
            {
                ret = sql.GetSearchTitleResult(q.Substring(11), p, isFull);
            }
            else if (q.StartsWith("intext:", StringComparison.InvariantCultureIgnoreCase) && q.Length > 7)
            {
                q = q.Substring(7);
                if (q.Length > 1)
                {
                    ret = sql.GetSearchInnerTextResult(q, p, isFull);
                }
                else
                {
                    ViewBag.AlertDanger = "Please be more specific than 1 char";
                    return View();
                }
            }
            else if (q.StartsWith("allintext:", StringComparison.InvariantCultureIgnoreCase) && q.Length > 10)
            {
                q = q.Substring(10);
                if (q.Length > 1)
                {
                    ret = sql.GetSearchInnerTextResult(q, p, isFull);
                }
                else
                {
                    ViewBag.AlertDanger = "Please be more specific than 1 char";
                    return View();
                }
            }
            else if (q.StartsWith("inurl:", StringComparison.InvariantCultureIgnoreCase) && q.Length > 6)
            {
                q = q.Substring(6);
                if (q.Length > 3)
                {
                    ret = sql.GetSearchUrlResult(q, p, isFull);
                }
                else
                {
                    ViewBag.AlertDanger = "Please be more specific than " + q.Length + " char(s)";
                    return View();
                }
            }
            else if (q.StartsWith("allinurl:", StringComparison.InvariantCultureIgnoreCase) && q.Length > 9)
            {
                q = q.Substring(9);
                if (q.Length > 3)
                {
                    ret = sql.GetSearchUrlResult(q, p, isFull);
                }
                else
                {
                    ViewBag.AlertDanger = "Please be more specific than " + q.Length + " char(s)";
                    return View();
                }
            }
            else // normal search with a ":"
            {
                ret = sql.GetSearchResult(q, p, isFull);
            }
        }

        ViewBag.Results = ret.Results;
        ViewBag.ResultsTotalNb = ret.ResultsTotalNb;
        ViewBag.Page = p;

        // Pager: a window of five consecutive page numbers, positioned according to the current page.
        if (ret.ResultsTotalNb > 10)
        {
            int firstPage;
            bool hasNext = true;
            if (p < 4)
            {
                firstPage = 1;
            }
            else if ((p + 1) * 10 < ret.ResultsTotalNb)
            {
                firstPage = p - 2; // current page in the middle of the window
            }
            else if (p * 10 < ret.ResultsTotalNb)
            {
                firstPage = p - 3; // one page left after the current one
            }
            else
            {
                firstPage = p - 4; // current page closes the window
                hasNext = false;
            }

            ViewBag.Previous = (p != 1);
            ViewBag.Next = hasNext;
            ViewBag.PageA = firstPage;
            ViewBag.PageB = firstPage + 1;
            ViewBag.PageC = firstPage + 2;
            ViewBag.PageD = firstPage + 3;
            ViewBag.PageE = firstPage + 4;
        }

        return View("Result");
    }
}

// Prefixes "http://" unless the text already starts with an "http:" or "https:" scheme (ordinal, case-insensitive).
private static string PrependHttpSchemeIfMissing(string target)
{
    bool hasScheme = target.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
        || target.StartsWith("https:", StringComparison.OrdinalIgnoreCase);
    return hasScheme ? target : "http://" + target;
}
/// <summary>
/// Creates the controller and stores the three managers it receives in its fields.
/// </summary>
/// <param name="clientManager">Manager stored in <c>m_clientManager</c>.</param>
/// <param name="uriManager">Manager stored in <c>m_uriManager</c>.</param>
/// <param name="secretManager">Manager stored in <c>m_secretManager</c>.</param>
public ClientController(
    ClientManager clientManager,
    UriManager uriManager,
    SecretManager secretManager)
{
    this.m_clientManager = clientManager;
    this.m_uriManager = uriManager;
    this.m_secretManager = secretManager;
}