/// <summary>
        /// Attempts to fetch the category model registered under the supplied category name, or
        /// to create and register a new one if none exists yet.
        /// </summary>
        /// <typeparam name="T">
        /// The model type the caller expects. When this is exactly
        /// MappedBypassListCategoryModel, a companion whitelist category named
        /// <paramref name="categoryName"/> + "_as_whitelist" is fetched or created first and
        /// linked to the new bypass model.
        /// </typeparam>
        /// <param name="categoryName">
        /// The category name for which to fetch or generate a model.
        /// </param>
        /// <param name="listType">
        /// The list type given to a newly created, non-bypass category model. Not used when an
        /// existing model is found.
        /// </param>
        /// <param name="model">
        /// On success, the existing or newly created model. Null on failure.
        /// </param>
        /// <returns>
        /// True if a model of type <typeparamref name="T"/> was found or created; false if the
        /// category limit was reached, the companion whitelist category could not be obtained, or
        /// the name is already registered with a model that is not a <typeparamref name="T"/>.
        /// </returns>
        /// <remarks>
        /// Category ids are generated as (current map count + 1) narrowed to a byte, so creation
        /// fails once 255 categories exist. Without that limit the narrowing would wrap around and
        /// hand out id 0 or ids that are already in use.
        /// </remarks>
        private bool TryFetchOrCreateCategoryMap <T>(string categoryName, PlainTextFilteringListType listType, out T model) where T : MappedFilterListCategoryModel
        {
            m_logger.Info("CATEGORY {0}", categoryName);

            MappedFilterListCategoryModel existingCategory = null;

            if (m_generatedCategoriesMap.TryGetValue(categoryName, out existingCategory))
            {
                // Only report success when the registered model really is a T; otherwise the
                // caller would receive a null model together with a "true" result.
                model = existingCategory as T;

                if (model == null)
                {
                    m_logger.Error("Category '" + categoryName + "' is already registered with an incompatible model type.");
                }

                return(model != null);
            }

            // We can't generate anymore categories. Sorry, but the rest get ignored.
            if (m_generatedCategoriesMap.Count >= byte.MaxValue)
            {
                m_logger.Error("The maximum number of filtering categories has been exceeded.");
                model = null;
                return(false);
            }

            if (typeof(T) == typeof(MappedBypassListCategoryModel))
            {
                MappedFilterListCategoryModel secondCategory = null;

                if (!TryFetchOrCreateCategoryMap(categoryName + "_as_whitelist", PlainTextFilteringListType.Whitelist, out secondCategory))
                {
                    model = null;
                    return(false);
                }

                // Creating the companion category may have consumed the last available id, so
                // the limit has to be checked again before generating the bypass category's id.
                if (m_generatedCategoriesMap.Count >= byte.MaxValue)
                {
                    m_logger.Error("The maximum number of filtering categories has been exceeded.");
                    model = null;
                    return(false);
                }

                var newBypassModel = (T)(MappedFilterListCategoryModel) new MappedBypassListCategoryModel((byte)((m_generatedCategoriesMap.Count) + 1), secondCategory.CategoryId, categoryName, secondCategory.CategoryName);
                m_generatedCategoriesMap.GetOrAdd(categoryName, newBypassModel);
                model = newBypassModel;
                return(true);
            }

            var newModel = (T) new MappedFilterListCategoryModel((byte)((m_generatedCategoriesMap.Count) + 1), categoryName, listType);
            m_generatedCategoriesMap.GetOrAdd(categoryName, newModel);
            model = newModel;
            return(true);
        }
Example #2
0
        /// <summary>
        /// Builds the template context for the block page shown for a blocked request, renders
        /// it through m_blockedHtmlPage and returns the result encoded as UTF-8.
        /// </summary>
        /// <param name="requestUri">
        /// The URI of the request that was blocked.
        /// </param>
        /// <param name="matchingCategory">
        /// The id of the category that caused the block. It is looked up in the generated
        /// categories map to obtain the category's short name.
        /// </param>
        /// <param name="appliedCategories">
        /// The categories applied to the request; may be null. Those whose id differs from
        /// <paramref name="matchingCategory"/> are listed as "other categories", but only when the
        /// block was caused by a list category.
        /// </param>
        /// <param name="blockType">
        /// The reason for the block. Any value other than BlockType.None replaces the category
        /// name with a fixed description, except BlockType.TimeRestriction, which replaces the
        /// message and hides the unblock request button instead.
        /// </param>
        /// <param name="triggerCategory">
        /// The category name inserted into the description for text classification and text
        /// trigger blocks.
        /// </param>
        /// <returns>
        /// The rendered block page as UTF-8 bytes.
        /// </returns>
        public byte[] ResolveBlockedSiteTemplate(Uri requestUri, int matchingCategory, List <MappedFilterListCategoryModel> appliedCategories, BlockType blockType = BlockType.None, string triggerCategory = "")
        {
            // Scheme-less form of the address, e.g. "www.badsite.com/example?arg=0" rather than
            // "http://www.badsite.com/example?arg=0", which reads a little friendlier.
            string displayUrl = (requestUri.Host + requestUri.PathAndQuery + requestUri.Fragment).TrimEnd('/');
            string fullUrl    = requestUri.ToString();

            string unblockRequestUrl = getUnblockRequestUrl(fullUrl);
            bool   offerUnblock      = true;
            string blockMessage      = "was blocked because it was in the following category:";

            // Resolve the blocking category and collect the names of the remaining categories.
            MappedFilterListCategoryModel blockedCategory = m_policyConfiguration.GeneratedCategoriesMap.Values.FirstOrDefault(m => m.CategoryId == matchingCategory);
            string reasonLabel = blockedCategory?.ShortCategoryName;

            List<string> remainingCategories = null;

            if (appliedCategories != null)
            {
                remainingCategories = appliedCategories
                                      .Where(c => c.CategoryId != matchingCategory)
                                      .Select(c => c.ShortCategoryName)
                                      .Distinct()
                                      .ToList();
            }

            // A bypass-list category is what marks the block as belonging to the relaxed policy.
            bool relaxedPolicy = blockedCategory is MappedBypassListCategoryModel;

            // When the block came from a list category, the category name resolved above is kept.
            // In every other case the block type decides what is displayed.
            bool blockedByListCategory = matchingCategory > 0 && blockType == BlockType.None;

            if (!blockedByListCategory)
            {
                remainingCategories = null;

                if (blockType == BlockType.None)
                {
                    reasonLabel = "unknown reason";
                }
                else if (blockType == BlockType.ImageClassification)
                {
                    reasonLabel = "naughty image";
                }
                else if (blockType == BlockType.Url)
                {
                    reasonLabel = "bad webpage";
                }
                else if (blockType == BlockType.TextClassification || blockType == BlockType.TextTrigger)
                {
                    reasonLabel = $"offensive text: {triggerCategory}";
                }
                else if (blockType == BlockType.TimeRestriction)
                {
                    // The category label is deliberately left as resolved above; only the
                    // message changes and the unblock request button is hidden.
                    blockMessage = "is blocked because your time restrictions do not allow internet access at this time.";
                    offerUnblock = false;
                }
                else
                {
                    // BlockType.OtherContentClassification and any value not handled above.
                    reasonLabel = "other content classification";
                }
            }

            var templateContext = new Dictionary<string, object>
            {
                { "url_text", fullUrl ?? "" },
                { "friendly_url_text", displayUrl },
                { "message", blockMessage },
                { "matching_category", reasonLabel },
                { "other_categories", remainingCategories },
                { "showUnblockRequestButton", offerUnblock },
                { "passcodeSetupUrl", CompileSecrets.ServiceProviderUserRelaxedPolicyPath },
                { "unblockRequest", unblockRequestUrl },
                { "isRelaxedPolicy", relaxedPolicy },
                { "isRelaxedPolicyPasscodeRequired", m_policyConfiguration?.Configuration?.EnableRelaxedPolicyPasscode }
            };

            string renderedPage = m_blockedHtmlPage(templateContext);

            return(Encoding.UTF8.GetBytes(renderedPage));
        }
Example #3
0
        /// <summary>
        /// Called by the engine when the engine fails to classify a request or response by its
        /// metadata. The engine provides the content of the request or response along with its
        /// declared content type. HTML and JSON content is scanned with the configured text
        /// triggers; when compiled with WITH_NLP, the content is additionally run through the
        /// document classifiers.
        /// </summary>
        /// <param name="data">
        /// The data to be classified. It is decoded as UTF-8 text.
        /// </param>
        /// <param name="contentType">
        /// The declared content type of the data. Only content types containing "html" or "json"
        /// (compared case-insensitively) are scanned by the text triggers. A null value is
        /// treated as "nothing to scan".
        /// </param>
        /// <param name="blockedBecause">
        /// Receives BlockType.TextTrigger when a text trigger matched, otherwise
        /// BlockType.OtherContentClassification (or BlockType.TextClassification from the
        /// WITH_NLP path).
        /// </param>
        /// <param name="textTrigger">
        /// Receives the trigger text that matched, or an empty string when no trigger matched.
        /// </param>
        /// <param name="triggerCategory">
        /// Receives the name of the category the matched trigger belongs to, or an empty string
        /// when no trigger matched.
        /// </param>
        /// <returns>
        /// A numeric category ID that the content was deemed to belong to. Zero is returned here if
        /// the content is not deemed to be part of any known category, which is a general indication
        /// to the engine that the content should not be blocked.
        /// </returns>
        private short OnClassifyContent(Memory <byte> data, string contentType, out BlockType blockedBecause, out string textTrigger, out string triggerCategory)
        {
            // Default to "don't block". Assigning every out parameter up front guarantees they
            // are definitely assigned on every return path, including those inside WITH_NLP.
            blockedBecause  = BlockType.OtherContentClassification;
            textTrigger     = "";
            triggerCategory = "";

            // Tracks whether the read lock was really acquired, so that the finally block never
            // tries to release a lock that is not held (which would throw out of the finally).
            bool policyLockTaken = false;

            try
            {
                m_policyConfiguration.PolicyLock.EnterReadLock();
                policyLockTaken = true;

                var triggers = m_policyConfiguration.TextTriggers;

                if (contentType != null && triggers != null && triggers.HasTriggers)
                {
                    // Media type names are case-insensitive, so compare accordingly.
                    var isHtml = contentType.IndexOf("html", StringComparison.OrdinalIgnoreCase) != -1;
                    var isJson = contentType.IndexOf("json", StringComparison.OrdinalIgnoreCase) != -1;

                    if (isHtml || isJson)
                    {
                        // HTML is intentionally NOT reduced to its visible text first: google has
                        // started sending bad stuff directly embedded inside HTML responses instead
                        // of sending JSON as a separate response, so the triggers engine has to
                        // chew through the entire raw HTML.
                        var dataToAnalyzeStr = Encoding.UTF8.GetString(data.ToArray());

                        short  matchedCategory = -1;
                        string trigger         = null;
                        var    cfg             = m_policyConfiguration.Configuration;

                        if (triggers.ContainsTrigger(dataToAnalyzeStr, out matchedCategory, out trigger, m_policyConfiguration.CategoryIndex.GetIsCategoryEnabled, cfg != null && cfg.MaxTextTriggerScanningSize > 1, cfg != null ? cfg.MaxTextTriggerScanningSize : -1))
                        {
                            m_logger.Info("Triggers successfully run. matchedCategory = {0}, trigger = '{1}'", matchedCategory, trigger);

                            var mappedCategory = m_policyConfiguration.GeneratedCategoriesMap.Values.FirstOrDefault(xx => xx.CategoryId == matchedCategory);

                            if (mappedCategory != null)
                            {
                                m_logger.Info("Response blocked by text trigger \"{0}\" in category {1}.", trigger, mappedCategory.CategoryName);
                                blockedBecause  = BlockType.TextTrigger;
                                triggerCategory = mappedCategory.CategoryName;
                                textTrigger     = trigger;
                                return(mappedCategory.CategoryId);
                            }
                        }
                    }
                }
            }
            catch (Exception e)
            {
                LoggerUtil.RecursivelyLogException(m_logger, e);
            }
            finally
            {
                if (policyLockTaken)
                {
                    m_policyConfiguration.PolicyLock.ExitReadLock();
                }
            }

#if WITH_NLP
            // NOTE(review): this section reads m_generatedCategoriesMap and m_categoryIndex
            // directly, whereas the trigger path above goes through m_policyConfiguration.
            // Confirm these members exist in this class before enabling WITH_NLP.
            bool doccatLockTaken = false;

            try
            {
                m_doccatSlimLock.EnterReadLock();
                doccatLockTaken = true;

                contentType = contentType.ToLower();

                // Only attempt text classification if we have a text classifier, silly.
                if (m_documentClassifiers != null && m_documentClassifiers.Count > 0)
                {
                    var textToClassifyBuilder = new StringBuilder();

                    if (contentType.IndexOf("html") != -1)
                    {
                        // This might be plain text, might be HTML. We need to find out.
                        var rawText = Encoding.UTF8.GetString(data.ToArray()).ToCharArray();

                        var extractor = new FastHtmlTextExtractor();

                        var extractedText = extractor.Extract(rawText);
                        m_logger.Info("From HTML: Classify this string: {0}", extractedText);
                        textToClassifyBuilder.Append(extractedText);
                    }
                    else if (contentType.IndexOf("json") != -1)
                    {
                        // This should be JSON. Keep letters, digits and whitespace; replace
                        // everything else with a space.
                        var jsonText = Encoding.UTF8.GetString(data.ToArray());

                        var len = jsonText.Length;
                        for (int i = 0; i < len; ++i)
                        {
                            char c = jsonText[i];
                            if (char.IsLetterOrDigit(c) || char.IsWhiteSpace(c))
                            {
                                textToClassifyBuilder.Append(c);
                            }
                            else
                            {
                                textToClassifyBuilder.Append(' ');
                            }
                        }

                        m_logger.Info("From Json: Classify this string: {0}", m_whitespaceRegex.Replace(textToClassifyBuilder.ToString(), " "));
                    }

                    var textToClassify = textToClassifyBuilder.ToString();

                    if (textToClassify.Length > 0)
                    {
                        foreach (var classifier in m_documentClassifiers)
                        {
                            m_logger.Info("Got text to classify of length {0}.", textToClassify.Length);

                            // Remove all multi-whitespace, newlines etc.
                            textToClassify = m_whitespaceRegex.Replace(textToClassify, " ");

                            var classificationResult = classifier.ClassifyText(textToClassify);

                            MappedFilterListCategoryModel categoryNumber = null;

                            if (m_generatedCategoriesMap.TryGetValue(classificationResult.BestCategoryName, out categoryNumber))
                            {
                                if (categoryNumber.CategoryId > 0 && m_categoryIndex.GetIsCategoryEnabled(categoryNumber.CategoryId))
                                {
                                    var cfg       = m_policyConfiguration.Configuration;
                                    var threshold = cfg != null ? cfg.NlpThreshold : 0.9f;

                                    if (classificationResult.BestCategoryScore < threshold)
                                    {
                                        m_logger.Info("Rejected {0} classification because score was less than threshold of {1}. Returned score was {2}.", classificationResult.BestCategoryName, threshold, classificationResult.BestCategoryScore);
                                        blockedBecause = BlockType.OtherContentClassification;
                                        return(0);
                                    }

                                    m_logger.Info("Classified text content as {0}.", classificationResult.BestCategoryName);
                                    blockedBecause = BlockType.TextClassification;
                                    return(categoryNumber.CategoryId);
                                }
                            }
                            else
                            {
                                m_logger.Info("Did not find category registered: {0}.", classificationResult.BestCategoryName);
                            }
                        }
                    }
                }
            }
            catch (Exception e)
            {
                LoggerUtil.RecursivelyLogException(m_logger, e);
            }
            finally
            {
                if (doccatLockTaken)
                {
                    m_doccatSlimLock.ExitReadLock();
                }
            }
#endif
            // Nothing matched. The out parameters still hold the "don't block" defaults assigned
            // at the top of the method.
            return(0);
        }
        /// <summary>
        /// Rebuilds all filtering state from the list folder: resets the rule matcher, the text
        /// triggers container, the category index and the generated categories map, then loads
        /// every configured list followed by the user's custom trigger blacklist, custom
        /// whitelist and self-moderation list. The whole operation runs under the policy write
        /// lock, and temporary lists are deleted afterwards regardless of the outcome.
        /// </summary>
        /// <returns>
        /// True if loading completed without an exception (this includes the case where the list
        /// folder does not exist and nothing was loaded); false if an exception was thrown, in
        /// which case it is logged.
        /// </returns>
        public bool LoadLists()
        {
            // Tracks whether the write lock was really acquired, so that a failed acquisition
            // does not make the finally block throw and skip the temporary file cleanup.
            bool writeLockTaken = false;

            try
            {
                m_policyLock.EnterWriteLock();
                writeLockTaken = true;

                var listFolderPath = getListFolder();

                if (Directory.Exists(listFolderPath))
                {
                    // Recreate our filter collection and reset all categories to be disabled.
                    AdBlockMatcherApi.Initialize();

                    // Recreate our triggers container.
                    if (m_textTriggers != null)
                    {
                        m_textTriggers.Dispose();
                    }

                    m_categoryIndex.SetAll(false);

                    // XXX TODO - Maybe make it a compiler flag to toggle if this is going to
                    // be an in-memory DB or not.
                    m_textTriggers = new BagOfTextTriggers(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "t.dat"), true, true, m_logger);

                    // Now clear all generated categories. These will be re-generated as needed.
                    m_generatedCategoriesMap.Clear();

                    // NOTE(review): the two rule counters are never updated in this method, so the
                    // summary log line below always reports zero for them.
                    uint totalFilterRulesLoaded = 0;
                    uint totalFilterRulesFailed = 0;
                    uint totalTriggersLoaded    = 0;

                    // Load all configured list files.
                    string tempFolder = getTempFolder();

                    decryptLists(getListFolder(), tempFolder);

                    var rulePath = s_paths.GetPath("rules.dat");

                    if (File.Exists(rulePath))
                    {
                        // Loading the cached rules file is currently disabled.
                        //  AdBlockMatcherApi.Load(rulePath);
                    }

                    // Configuration is null-checked for the user lists further down, so apply
                    // the same guard here instead of failing the whole load halfway through.
                    var configuredLists = Configuration?.ConfiguredLists;

                    if (configuredLists != null)
                    {
                        foreach (var listModel in configuredLists)
                        {
                            var rulesetPath = getListFilePath(listModel.RelativeListPath, tempFolder);

                            if (!File.Exists(rulesetPath))
                            {
                                continue;
                            }

                            // Category name = relative directory (with trailing separator) + file name without extension.
                            var thisListCategoryName = listModel.RelativeListPath.Substring(0, listModel.RelativeListPath.LastIndexOfAny(new[] { '/', '\\' }) + 1) + Path.GetFileNameWithoutExtension(listModel.RelativeListPath);

                            MappedFilterListCategoryModel categoryModel = null;

                            switch (listModel.ListType)
                            {
                                case PlainTextFilteringListType.Blacklist:
                                {
                                    if (TryFetchOrCreateCategoryMap(thisListCategoryName, listModel.ListType, out categoryModel))
                                    {
                                        AdBlockMatcherApi.ParseRuleFile(rulesetPath, categoryModel.CategoryId, ListType.Blacklist);
                                        m_categoryIndex.SetIsCategoryEnabled(categoryModel.CategoryId, true);
                                    }
                                }
                                break;

                                case PlainTextFilteringListType.BypassList:
                                {
                                    MappedBypassListCategoryModel bypassCategoryModel = null;

                                    // Must be loaded twice. Once as a blacklist, once as a whitelist.
                                    if (TryFetchOrCreateCategoryMap(thisListCategoryName, listModel.ListType, out bypassCategoryModel))
                                    {
                                        AdBlockMatcherApi.ParseRuleFile(rulesetPath, bypassCategoryModel.CategoryId, ListType.BypassList);
                                        m_categoryIndex.SetIsCategoryEnabled(bypassCategoryModel.CategoryId, true);
                                        GC.Collect();
                                    }
                                }
                                break;

                                case PlainTextFilteringListType.TextTrigger:
                                {
                                    // Always load triggers as blacklists.
                                    if (TryFetchOrCreateCategoryMap(thisListCategoryName, listModel.ListType, out categoryModel))
                                    {
                                        using (var listStream = File.OpenRead(rulesetPath))
                                        {
                                            try
                                            {
                                                var triggersLoaded = m_textTriggers.LoadStoreFromStream(listStream, categoryModel.CategoryId).Result;

                                                totalTriggersLoaded += (uint)triggersLoaded;

                                                if (triggersLoaded > 0)
                                                {
                                                    m_categoryIndex.SetIsCategoryEnabled(categoryModel.CategoryId, true);
                                                }
                                            }
                                            catch (Exception ex)
                                            {
                                                // A single broken trigger list must not abort the
                                                // whole load, but it is an error, so log it as one.
                                                m_logger.Error($"Error on LoadStoresFromStream {ex}");
                                            }
                                        }
                                    }

                                    GC.Collect();
                                }
                                break;

                                case PlainTextFilteringListType.Whitelist:
                                {
                                    if (TryFetchOrCreateCategoryMap(thisListCategoryName, listModel.ListType, out categoryModel))
                                    {
                                        AdBlockMatcherApi.ParseRuleFile(rulesetPath, categoryModel.CategoryId, ListType.Whitelist);
                                        m_categoryIndex.SetIsCategoryEnabled(categoryModel.CategoryId, true);
                                    }

                                    GC.Collect();
                                }
                                break;
                            }
                        }
                    }

                    if (Configuration != null && Configuration.CustomTriggerBlacklist != null && Configuration.CustomTriggerBlacklist.Count > 0)
                    {
                        MappedFilterListCategoryModel categoryModel = null;

                        // Always load triggers as blacklists.
                        if (TryFetchOrCreateCategoryMap("/user/trigger_blacklist", PlainTextFilteringListType.TextTrigger, out categoryModel))
                        {
                            var triggersLoaded = m_textTriggers.LoadStoreFromList(Configuration.CustomTriggerBlacklist, categoryModel.CategoryId).Result;

                            totalTriggersLoaded += (uint)triggersLoaded;

                            if (triggersLoaded > 0)
                            {
                                m_categoryIndex.SetIsCategoryEnabled(categoryModel.CategoryId, true);
                            }

                            m_logger.Info("Number of triggers loaded for CustomTriggerBlacklist {0}", triggersLoaded);
                        }
                    }

                    if (Configuration != null && Configuration.CustomWhitelist != null && Configuration.CustomWhitelist.Count > 0)
                    {
                        string rulesetPath = Path.Combine(tempFolder, ".user.custom_whitelist.rules.txt");

                        writeSanitizedRuleFile(rulesetPath, Configuration.CustomWhitelist);

                        MappedFilterListCategoryModel categoryModel = null;
                        if (TryFetchOrCreateCategoryMap("/user/custom_whitelist", PlainTextFilteringListType.Whitelist, out categoryModel))
                        {
                            AdBlockMatcherApi.ParseRuleFile(rulesetPath, categoryModel.CategoryId, ListType.Whitelist);
                            m_categoryIndex.SetIsCategoryEnabled(categoryModel.CategoryId, true);
                        }
                    }

                    if (Configuration != null && Configuration.SelfModeration != null && Configuration.SelfModeration.Count > 0)
                    {
                        string rulesetPath = Path.Combine(tempFolder, ".user.self_moderation.rules.txt");

                        writeSanitizedRuleFile(rulesetPath, Configuration.SelfModeration);

                        MappedFilterListCategoryModel categoryModel = null;
                        if (TryFetchOrCreateCategoryMap("/user/self_moderation", PlainTextFilteringListType.Blacklist, out categoryModel))
                        {
                            AdBlockMatcherApi.ParseRuleFile(rulesetPath, categoryModel.CategoryId, ListType.Blacklist);
                            m_categoryIndex.SetIsCategoryEnabled(categoryModel.CategoryId, true);
                        }
                    }

                    //m_filterCollection.FinalizeForRead();
                    //m_filterCollection.InitializeBloomFilters();

                    m_textTriggers.FinalizeForRead();
                    m_textTriggers.InitializeBloomFilters();

                    //     AdBlockMatcherApi.Save(s_paths.GetPath("rules.dat"));

                    ListsReloaded?.Invoke(this, new EventArgs());

                    m_logger.Info("Loaded {0} rules, {1} rules failed most likely due to being malformed, and {2} text triggers loaded.", totalFilterRulesLoaded, totalFilterRulesFailed, totalTriggersLoaded);
                }

                return(true);
            }
            catch (Exception ex)
            {
                LoggerUtil.RecursivelyLogException(m_logger, ex);
                return(false);
            }
            finally
            {
                if (writeLockTaken)
                {
                    m_policyLock.ExitWriteLock();
                }

                deleteTemporaryLists();
            }
        }

        /// <summary>
        /// Matches user-supplied entries made up only of 'safe' characters. As these entries are
        /// imported directly into an Adblock Plus-style rule engine, this keeps the user from
        /// smuggling in rule syntax, such as whitelisting a site by prefixing it with "@@".
        /// </summary>
        private static readonly Regex s_cleanUserRuleRegex = new Regex(@"^[a-zA-Z0-9\-_\:\.\/]+$", RegexOptions.Compiled);

        /// <summary>
        /// Writes each safe entry of <paramref name="sites"/> to the given file as a "||site"
        /// rule, one per line. Null entries and entries containing unsafe characters are skipped.
        /// </summary>
        /// <param name="rulesetPath">
        /// The file to write. Any existing content is discarded.
        /// </param>
        /// <param name="sites">
        /// The user-supplied site entries.
        /// </param>
        private static void writeSanitizedRuleFile(string rulesetPath, IEnumerable<string> sites)
        {
            // File.Create truncates an existing file. File.OpenWrite would not, and would leave
            // stale rules behind whenever the previous file was longer than the new content.
            using (var rulesetStream = File.Create(rulesetPath))
            using (var writer = new StreamWriter(rulesetStream))
            {
                foreach (string site in sites)
                {
                    if (site != null && s_cleanUserRuleRegex.IsMatch(site))
                    {
                        writer.WriteLine($"||{site}");
                    }
                }
            }
        }