/// <summary>
/// Fetches the category model registered under the supplied category name, or creates and
/// registers a new one if none exists yet.
/// </summary>
/// <typeparam name="T">
/// The kind of category model requested. When this is
/// <see cref="MappedBypassListCategoryModel"/>, a companion whitelist category named
/// <c>categoryName + "_as_whitelist"</c> is fetched or created first and linked to the new
/// bypass category.
/// </typeparam>
/// <param name="categoryName">
/// The category name for which to fetch or generate a category model.
/// </param>
/// <param name="listType">
/// The list type given to a newly created plain category model. Not used when an existing
/// entry is found or when a bypass category is created.
/// </param>
/// <param name="model">
/// On success, the category model for <paramref name="categoryName"/>; otherwise null.
/// </param>
/// <returns>
/// True if a usable model of type <typeparamref name="T"/> was found or created. False if the
/// category limit has been reached, if the companion whitelist category could not be
/// obtained, or if the existing entry is not of type <typeparamref name="T"/>.
/// </returns>
/// <remarks>
/// Category IDs are generated as (current category count + 1) narrowed to a byte, so this will
/// always fail once 255 categories exist. If the model constructors accept a wider ID type,
/// the cast and the limit below must be widened together.
/// </remarks>
private bool TryFetchOrCreateCategoryMap<T>(string categoryName, PlainTextFilteringListType listType, out T model) where T : MappedFilterListCategoryModel
{
    // IDs are narrowed with a (byte) cast below; allowing more entries than this would make
    // the IDs wrap around, collide with existing categories, and eventually produce ID 0.
    const int maxCategories = byte.MaxValue;

    m_logger.Info("CATEGORY {0}", categoryName);

    MappedFilterListCategoryModel existingCategory = null;
    if (m_generatedCategoriesMap.TryGetValue(categoryName, out existingCategory))
    {
        model = existingCategory as T;
        if (model == null)
        {
            // The name is already registered with a different model type. Reporting success
            // here would hand callers a null model that they immediately dereference.
            m_logger.Error("Category {0} is already registered with an incompatible model type.", categoryName);
            return false;
        }

        return true;
    }

    // We can't generate anymore categories. Sorry, but the rest get ignored.
    if (m_generatedCategoriesMap.Count >= maxCategories)
    {
        m_logger.Error("The maximum number of filtering categories has been exceeded.");
        model = null;
        return false;
    }

    if (typeof(T) == typeof(MappedBypassListCategoryModel))
    {
        MappedFilterListCategoryModel whitelistCategory = null;
        if (!TryFetchOrCreateCategoryMap(categoryName + "_as_whitelist", PlainTextFilteringListType.Whitelist, out whitelistCategory))
        {
            model = null;
            return false;
        }

        // Registering the companion whitelist category may have consumed the last free ID.
        if (m_generatedCategoriesMap.Count >= maxCategories)
        {
            m_logger.Error("The maximum number of filtering categories has been exceeded.");
            model = null;
            return false;
        }

        var bypassModel = (T)(MappedFilterListCategoryModel)new MappedBypassListCategoryModel(
            (byte)(m_generatedCategoriesMap.Count + 1),
            whitelistCategory.CategoryId,
            categoryName,
            whitelistCategory.CategoryName);

        m_generatedCategoriesMap.GetOrAdd(categoryName, bypassModel);
        model = bypassModel;
        return true;
    }

    var plainModel = (T)new MappedFilterListCategoryModel((byte)(m_generatedCategoriesMap.Count + 1), categoryName, listType);
    m_generatedCategoriesMap.GetOrAdd(categoryName, plainModel);
    model = plainModel;
    return true;
}
/// <summary>
/// Builds the template context for the block page shown in place of a blocked request,
/// renders it through the blocked-page template, and returns the result as UTF-8 bytes.
/// </summary>
/// <param name="requestUri">
/// The URI of the request that was blocked.
/// </param>
/// <param name="matchingCategory">
/// The ID of the category that caused the block. It is only reported as the reason when it
/// is greater than zero and <paramref name="blockType"/> is <see cref="BlockType.None"/>.
/// </param>
/// <param name="appliedCategories">
/// All categories that applied to the request; may be null. Those other than the matching
/// category are listed as "other categories" for category-based blocks.
/// </param>
/// <param name="blockType">
/// The kind of block that occurred. Anything other than a category-based block replaces the
/// category name with a fixed description of the block type.
/// </param>
/// <param name="triggerCategory">
/// The category reported for text classification and text trigger blocks.
/// </param>
/// <returns>
/// The rendered block page, encoded as UTF-8.
/// </returns>
public byte[] ResolveBlockedSiteTemplate(Uri requestUri, int matchingCategory, List<MappedFilterListCategoryModel> appliedCategories, BlockType blockType = BlockType.None, string triggerCategory = "")
{
    // Show "www.badsite.com/example?arg=0" rather than "http://www.badsite.com/example?arg=0";
    // it reads a little friendlier to the user than the complete URI.
    string displayUrl = (requestUri.Host + requestUri.PathAndQuery + requestUri.Fragment).TrimEnd('/');
    string fullUrl = requestUri.ToString();

    bool allowUnblockRequest = true;
    string unblockRequestUrl = getUnblockRequestUrl(fullUrl);
    string blockMessage = "was blocked because it was in the following category:";

    // Gather category details: the blocking category, any other applied categories, and
    // whether the blocking category belongs to the relaxed policy.
    MappedFilterListCategoryModel blockingCategory = m_policyConfiguration.GeneratedCategoriesMap.Values
        .FirstOrDefault(candidate => candidate.CategoryId == matchingCategory);

    string reasonText = blockingCategory?.ShortCategoryName;

    List<string> remainingCategories = null;
    if (appliedCategories != null)
    {
        remainingCategories = appliedCategories
            .Where(category => category.CategoryId != matchingCategory)
            .Select(category => category.ShortCategoryName)
            .Distinct()
            .ToList();
    }

    bool relaxedPolicyCategory = blockingCategory is MappedBypassListCategoryModel;
    string urlForTemplate = fullUrl ?? "";

    // For a plain category block the reason text is already the category's short name.
    bool blockedByCategory = matchingCategory > 0 && blockType == BlockType.None;
    if (!blockedByCategory)
    {
        remainingCategories = null;

        switch (blockType)
        {
            case BlockType.TimeRestriction:
                blockMessage = "is blocked because your time restrictions do not allow internet access at this time.";
                allowUnblockRequest = false;
                break;

            case BlockType.TextTrigger:
            case BlockType.TextClassification:
                reasonText = string.Format("offensive text: {0}", triggerCategory);
                break;

            case BlockType.Url:
                reasonText = "bad webpage";
                break;

            case BlockType.ImageClassification:
                reasonText = "naughty image";
                break;

            case BlockType.None:
                reasonText = "unknown reason";
                break;

            default:
                // Covers BlockType.OtherContentClassification and any unlisted value.
                reasonText = "other content classification";
                break;
        }
    }

    var templateContext = new Dictionary<string, object>
    {
        { "url_text", urlForTemplate },
        { "friendly_url_text", displayUrl },
        { "message", blockMessage },
        { "matching_category", reasonText },
        { "other_categories", remainingCategories },
        { "showUnblockRequestButton", allowUnblockRequest },
        { "passcodeSetupUrl", CompileSecrets.ServiceProviderUserRelaxedPolicyPath },
        { "unblockRequest", unblockRequestUrl },
        { "isRelaxedPolicy", relaxedPolicyCategory },
        { "isRelaxedPolicyPasscodeRequired", m_policyConfiguration?.Configuration?.EnableRelaxedPolicyPasscode }
    };

    string renderedPage = m_blockedHtmlPage(templateContext);
    return Encoding.UTF8.GetBytes(renderedPage);
}
/// <summary>
/// Called by the engine when the engine fails to classify a request or response by its
/// metadata. The engine provides the full content of the request or response, along with the
/// declared content type of the data. The content is scanned against the configured text
/// triggers and, when built with WITH_NLP, passed to the document classifiers.
/// </summary>
/// <param name="data">
/// The data to be classified. It is decoded as UTF-8 text before scanning.
/// </param>
/// <param name="contentType">
/// The declared content type of the data. Only content types containing "html" or "json"
/// (compared case-insensitively) are scanned; null is treated as neither.
/// </param>
/// <param name="blockedBecause">
/// Receives <see cref="BlockType.TextTrigger"/> or <see cref="BlockType.TextClassification"/>
/// when the content is matched, and <see cref="BlockType.OtherContentClassification"/> otherwise.
/// </param>
/// <param name="textTrigger">
/// Receives the trigger text that matched, or an empty string if no text trigger matched.
/// </param>
/// <param name="triggerCategory">
/// Receives the name of the category the matched trigger belongs to, or an empty string if
/// no text trigger matched.
/// </param>
/// <returns>
/// A numeric category ID that the content was deemed to belong to. Zero is returned here if
/// the content is not deemed to be part of any known category, which is a general indication
/// to the engine that the content should not be blocked.
/// </returns>
private short OnClassifyContent(Memory<byte> data, string contentType, out BlockType blockedBecause, out string textTrigger, out string triggerCategory)
{
    // Default to "don't block". Assigning up front guarantees every exit path, including the
    // conditionally compiled ones, leaves all out parameters definitely assigned.
    blockedBecause = BlockType.OtherContentClassification;
    textTrigger = "";
    triggerCategory = "";

    // Content types are matched ordinally and case-insensitively ("text/HTML" is still HTML).
    string declaredType = contentType ?? string.Empty;
    bool isHtml = declaredType.IndexOf("html", StringComparison.OrdinalIgnoreCase) >= 0;
    bool isJson = declaredType.IndexOf("json", StringComparison.OrdinalIgnoreCase) >= 0;

    // Acquire the lock before entering the try block so that the finally block can never
    // attempt to release a lock that was not successfully taken.
    m_policyConfiguration.PolicyLock.EnterReadLock();
    try
    {
        var triggers = m_policyConfiguration.TextTriggers;

        if ((isHtml || isJson) && triggers != null && triggers.HasTriggers)
        {
            // HTML is deliberately scanned raw rather than being reduced to its visible text
            // first: offending content can arrive embedded directly in the HTML response
            // instead of in a separate JSON response, so the whole payload must be checked.
            var dataToAnalyzeStr = Encoding.UTF8.GetString(data.ToArray());

            short matchedCategory = -1;
            string trigger = null;
            var cfg = m_policyConfiguration.Configuration;

            if (triggers.ContainsTrigger(
                dataToAnalyzeStr,
                out matchedCategory,
                out trigger,
                m_policyConfiguration.CategoryIndex.GetIsCategoryEnabled,
                cfg != null && cfg.MaxTextTriggerScanningSize > 1,
                cfg != null ? cfg.MaxTextTriggerScanningSize : -1))
            {
                m_logger.Info("Triggers successfully run. matchedCategory = {0}, trigger = '{1}'", matchedCategory, trigger);

                var mappedCategory = m_policyConfiguration.GeneratedCategoriesMap.Values.FirstOrDefault(xx => xx.CategoryId == matchedCategory);

                if (mappedCategory != null)
                {
                    m_logger.Info("Response blocked by text trigger \"{0}\" in category {1}.", trigger, mappedCategory.CategoryName);

                    short blockedCategoryId = mappedCategory.CategoryId;
                    blockedBecause = BlockType.TextTrigger;
                    triggerCategory = mappedCategory.CategoryName;
                    textTrigger = trigger;
                    return blockedCategoryId;
                }
            }
        }
    }
    catch (Exception e)
    {
        LoggerUtil.RecursivelyLogException(m_logger, e);
    }
    finally
    {
        m_policyConfiguration.PolicyLock.ExitReadLock();
    }

#if WITH_NLP
    // Same rule as above: take the lock outside the try so the finally is always balanced.
    m_doccatSlimLock.EnterReadLock();
    try
    {
        // Only attempt text classification if we have a text classifier, silly.
        if (m_documentClassifiers != null && m_documentClassifiers.Count > 0)
        {
            var textToClassifyBuilder = new StringBuilder();

            if (isHtml)
            {
                // This might be plain text, might be HTML. We need to find out.
                var rawText = Encoding.UTF8.GetString(data.ToArray()).ToCharArray();
                var extractor = new FastHtmlTextExtractor();
                var extractedText = extractor.Extract(rawText);
                m_logger.Info("From HTML: Classify this string: {0}", extractedText);
                textToClassifyBuilder.Append(extractedText);
            }
            else if (isJson)
            {
                // This should be JSON. Keep letters, digits and whitespace; turn every other
                // character (punctuation, braces, quotes) into a space.
                var jsonText = Encoding.UTF8.GetString(data.ToArray());
                foreach (char c in jsonText)
                {
                    textToClassifyBuilder.Append(char.IsLetterOrDigit(c) || char.IsWhiteSpace(c) ? c : ' ');
                }

                m_logger.Info("From Json: Classify this string: {0}", m_whitespaceRegex.Replace(textToClassifyBuilder.ToString(), " "));
            }

            var textToClassify = textToClassifyBuilder.ToString();

            if (textToClassify.Length > 0)
            {
                foreach (var classifier in m_documentClassifiers)
                {
                    m_logger.Info("Got text to classify of length {0}.", textToClassify.Length);

                    // Remove all multi-whitespace, newlines etc.
                    textToClassify = m_whitespaceRegex.Replace(textToClassify, " ");

                    var classificationResult = classifier.ClassifyText(textToClassify);

                    MappedFilterListCategoryModel categoryNumber = null;

                    if (m_generatedCategoriesMap.TryGetValue(classificationResult.BestCategoryName, out categoryNumber))
                    {
                        if (categoryNumber.CategoryId > 0 && m_categoryIndex.GetIsCategoryEnabled(categoryNumber.CategoryId))
                        {
                            var cfg = m_policyConfiguration.Configuration;
                            var threshold = cfg != null ? cfg.NlpThreshold : 0.9f;

                            if (classificationResult.BestCategoryScore < threshold)
                            {
                                m_logger.Info("Rejected {0} classification because score was less than threshold of {1}. Returned score was {2}.", classificationResult.BestCategoryName, threshold, classificationResult.BestCategoryScore);

                                // Out parameters still hold their "don't block" defaults.
                                return 0;
                            }

                            m_logger.Info("Classified text content as {0}.", classificationResult.BestCategoryName);
                            blockedBecause = BlockType.TextClassification;
                            return categoryNumber.CategoryId;
                        }
                    }
                    else
                    {
                        m_logger.Info("Did not find category registered: {0}.", classificationResult.BestCategoryName);
                    }
                }
            }
        }
    }
    catch (Exception e)
    {
        LoggerUtil.RecursivelyLogException(m_logger, e);
    }
    finally
    {
        m_doccatSlimLock.ExitReadLock();
    }
#endif

    // Nothing matched. Zero means don't block this content.
    return 0;
}
/// <summary>
/// Only "safe" characters are accepted in user-supplied site entries. The entries are written
/// directly into an Adblock Plus-style rule file, so this prevents a user from smuggling in
/// rule syntax such as a leading "@@".
/// </summary>
private static readonly Regex s_cleanSiteRule = new Regex(@"^[a-zA-Z0-9\-_\:\.\/]+$", RegexOptions.Compiled);

/// <summary>
/// Rebuilds all filtering state under the policy write lock: reinitializes the rule matcher,
/// recreates the text trigger store, clears the generated categories, and then loads every
/// configured list followed by the user-defined trigger blacklist, custom whitelist and
/// self-moderation list. Temporary list files are deleted afterwards in all cases.
/// </summary>
/// <returns>
/// True if loading completed without an exception, including the case where the list folder
/// does not exist and nothing is loaded. False if an exception was caught and logged.
/// </returns>
public bool LoadLists()
{
    // Acquire the lock before entering the try block so that the finally block can never
    // attempt to release a lock that was not successfully taken.
    m_policyLock.EnterWriteLock();
    try
    {
        var listFolderPath = getListFolder();
        if (!Directory.Exists(listFolderPath))
        {
            return true;
        }

        // Recreate our filter collection and reset all categories to be disabled.
        AdBlockMatcherApi.Initialize();

        // Recreate our triggers container.
        if (m_textTriggers != null)
        {
            m_textTriggers.Dispose();
        }

        m_categoryIndex.SetAll(false);

        // XXX TODO - Maybe make it a compiler flag to toggle if this is going to
        // be an in-memory DB or not.
        m_textTriggers = new BagOfTextTriggers(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "t.dat"), true, true, m_logger);

        // Now clear all generated categories. These will be re-generated as needed.
        m_generatedCategoriesMap.Clear();

        // NOTE(review): the rule file parsing calls below do not report counts back here, so
        // these two totals are always logged as zero.
        uint totalFilterRulesLoaded = 0;
        uint totalFilterRulesFailed = 0;
        uint totalTriggersLoaded = 0;

        // Load all configured list files.
        string tempFolder = getTempFolder();
        decryptLists(getListFolder(), tempFolder);

        // NOTE: loading and saving a precompiled rule set ("rules.dat") is currently disabled.

        var configuredLists = Configuration?.ConfiguredLists;
        if (configuredLists != null)
        {
            foreach (var listModel in configuredLists)
            {
                var rulesetPath = getListFilePath(listModel.RelativeListPath, tempFolder);
                if (!File.Exists(rulesetPath))
                {
                    continue;
                }

                // Category name = directory part of the relative path (including the trailing
                // separator) followed by the file name without its extension.
                var relativePath = listModel.RelativeListPath;
                var directoryPrefix = relativePath.Substring(0, relativePath.LastIndexOfAny(new[] { '/', '\\' }) + 1);
                var thisListCategoryName = directoryPrefix + Path.GetFileNameWithoutExtension(relativePath);

                MappedFilterListCategoryModel categoryModel = null;

                // The explicit GC.Collect() calls below are kept exactly where they were.
                // NOTE(review): presumably they relieve memory pressure after parsing large
                // lists - confirm before removing.
                switch (listModel.ListType)
                {
                    case PlainTextFilteringListType.Blacklist:
                    {
                        if (TryFetchOrCreateCategoryMap(thisListCategoryName, listModel.ListType, out categoryModel))
                        {
                            AdBlockMatcherApi.ParseRuleFile(rulesetPath, categoryModel.CategoryId, ListType.Blacklist);
                            m_categoryIndex.SetIsCategoryEnabled(categoryModel.CategoryId, true);
                        }

                        break;
                    }

                    case PlainTextFilteringListType.BypassList:
                    {
                        // Requesting a bypass model makes TryFetchOrCreateCategoryMap register
                        // a companion "_as_whitelist" category alongside the bypass category.
                        MappedBypassListCategoryModel bypassCategoryModel = null;

                        if (TryFetchOrCreateCategoryMap(thisListCategoryName, listModel.ListType, out bypassCategoryModel))
                        {
                            AdBlockMatcherApi.ParseRuleFile(rulesetPath, bypassCategoryModel.CategoryId, ListType.BypassList);
                            m_categoryIndex.SetIsCategoryEnabled(bypassCategoryModel.CategoryId, true);
                            GC.Collect();
                        }

                        break;
                    }

                    case PlainTextFilteringListType.TextTrigger:
                    {
                        // Always load triggers as blacklists.
                        if (TryFetchOrCreateCategoryMap(thisListCategoryName, listModel.ListType, out categoryModel))
                        {
                            using (var listStream = File.OpenRead(rulesetPath))
                            {
                                try
                                {
                                    // Blocking on the task: this method's signature is synchronous.
                                    var triggersLoaded = m_textTriggers.LoadStoreFromStream(listStream, categoryModel.CategoryId).Result;
                                    totalTriggersLoaded += (uint)triggersLoaded;

                                    if (triggersLoaded > 0)
                                    {
                                        m_categoryIndex.SetIsCategoryEnabled(categoryModel.CategoryId, true);
                                    }
                                }
                                catch (Exception ex)
                                {
                                    // A single bad trigger list must not abort the whole load,
                                    // but it is a failure and is logged as one.
                                    m_logger.Error($"Error on LoadStoresFromStream {ex}");
                                }
                            }
                        }

                        GC.Collect();
                        break;
                    }

                    case PlainTextFilteringListType.Whitelist:
                    {
                        if (TryFetchOrCreateCategoryMap(thisListCategoryName, listModel.ListType, out categoryModel))
                        {
                            AdBlockMatcherApi.ParseRuleFile(rulesetPath, categoryModel.CategoryId, ListType.Whitelist);
                            m_categoryIndex.SetIsCategoryEnabled(categoryModel.CategoryId, true);
                        }

                        GC.Collect();
                        break;
                    }
                }
            }
        }

        var customTriggerBlacklist = Configuration?.CustomTriggerBlacklist;
        if (customTriggerBlacklist != null && customTriggerBlacklist.Count > 0)
        {
            MappedFilterListCategoryModel categoryModel = null;

            // Always load triggers as blacklists.
            if (TryFetchOrCreateCategoryMap("/user/trigger_blacklist", PlainTextFilteringListType.TextTrigger, out categoryModel))
            {
                var triggersLoaded = m_textTriggers.LoadStoreFromList(customTriggerBlacklist, categoryModel.CategoryId).Result;
                totalTriggersLoaded += (uint)triggersLoaded;

                if (triggersLoaded > 0)
                {
                    m_categoryIndex.SetIsCategoryEnabled(categoryModel.CategoryId, true);
                }

                m_logger.Info("Number of triggers loaded for CustomTriggerBlacklist {0}", triggersLoaded);
            }
        }

        var customWhitelist = Configuration?.CustomWhitelist;
        if (customWhitelist != null && customWhitelist.Count > 0)
        {
            loadUserSiteRules(
                customWhitelist,
                Path.Combine(tempFolder, ".user.custom_whitelist.rules.txt"),
                "/user/custom_whitelist",
                PlainTextFilteringListType.Whitelist,
                ListType.Whitelist);
        }

        var selfModeration = Configuration?.SelfModeration;
        if (selfModeration != null && selfModeration.Count > 0)
        {
            loadUserSiteRules(
                selfModeration,
                Path.Combine(tempFolder, ".user.self_moderation.rules.txt"),
                "/user/self_moderation",
                PlainTextFilteringListType.Blacklist,
                ListType.Blacklist);
        }

        m_textTriggers.FinalizeForRead();
        m_textTriggers.InitializeBloomFilters();

        ListsReloaded?.Invoke(this, new EventArgs());

        m_logger.Info("Loaded {0} rules, {1} rules failed most likely due to being malformed, and {2} text triggers loaded.", totalFilterRulesLoaded, totalFilterRulesFailed, totalTriggersLoaded);

        return true;
    }
    catch (Exception ex)
    {
        LoggerUtil.RecursivelyLogException(m_logger, ex);
        return false;
    }
    finally
    {
        m_policyLock.ExitWriteLock();
        deleteTemporaryLists();
    }
}

/// <summary>
/// Writes the acceptable entries of a user-supplied site list to a rule file as "||site"
/// rules, then parses that file into the rule matcher under the given category and enables
/// the category.
/// </summary>
/// <param name="sites">
/// The user-supplied site entries. Null entries and entries containing characters outside
/// the safe set are skipped.
/// </param>
/// <param name="rulesetPath">
/// The path of the rule file to (re)create.
/// </param>
/// <param name="categoryName">
/// The name of the category to fetch or create for these rules.
/// </param>
/// <param name="categoryListType">
/// The list type used if the category has to be created.
/// </param>
/// <param name="matcherListType">
/// The list type the rules are parsed as.
/// </param>
private void loadUserSiteRules(IEnumerable<string> sites, string rulesetPath, string categoryName, PlainTextFilteringListType categoryListType, ListType matcherListType)
{
    // File.Create truncates an existing file. File.OpenWrite does not, so a leftover file
    // from an earlier run could keep stale rules past the end of the newly written content.
    using (var rulesetStream = File.Create(rulesetPath))
    using (var writer = new StreamWriter(rulesetStream))
    {
        foreach (string site in sites)
        {
            if (site != null && s_cleanSiteRule.IsMatch(site))
            {
                writer.WriteLine($"||{site}");
            }
        }
    }

    MappedFilterListCategoryModel categoryModel = null;
    if (TryFetchOrCreateCategoryMap(categoryName, categoryListType, out categoryModel))
    {
        AdBlockMatcherApi.ParseRuleFile(rulesetPath, categoryModel.CategoryId, matcherListType);
        m_categoryIndex.SetIsCategoryEnabled(categoryModel.CategoryId, true);
    }
}