// Methods
        /// <summary>
        /// Validates all three arguments and then tokenizes <paramref name="text"/>
        /// by delegating to <c>TokenizeText</c>.
        /// </summary>
        /// <param name="text">Text to tokenize; checked with <c>Validator.ValidateStringNullOrWhiteSpace</c>.</param>
        /// <param name="strategy">Tokenization strategy; checked with <c>Validator.ValidateObject</c>.</param>
        /// <param name="ruleSet">Rule set forwarded to <c>TokenizeText</c>; checked with <c>Validator.ValidateObject</c>.</param>
        /// <returns>The list of n-grams produced by <c>TokenizeText</c>.</returns>
        public List<INGram> Do(
            string text,
            ITokenizationStrategy strategy,
            INGramTokenizerRuleSet ruleSet)
        {
            // Guard clauses run in parameter order before any work is done.
            Validator.ValidateStringNullOrWhiteSpace(text, nameof(text));
            Validator.ValidateObject(strategy, nameof(strategy));
            Validator.ValidateObject(ruleSet, nameof(ruleSet));

            List<INGram> tokenized = TokenizeText(text, strategy, ruleSet);

            return tokenized;
        }
        // Constructor(s)
        /// <summary>
        /// Validates the three arguments and stores them in <c>N</c>, <c>Strategy</c>
        /// and <c>Value</c>.
        /// </summary>
        /// <param name="n">Value assigned to <c>N</c>; checked with <c>Validator.ValidateN</c>.</param>
        /// <param name="strategy">Value assigned to <c>Strategy</c>; checked with <c>Validator.ValidateObject</c>.</param>
        /// <param name="value">Value assigned to <c>Value</c>; checked with <c>Validator.ValidateStringNullOrWhiteSpace</c>.</param>
        protected ANGram(ushort n, ITokenizationStrategy strategy, string value)
        {
            // All validation happens before any member is assigned.
            Validator.ValidateN(n);
            Validator.ValidateObject(strategy, nameof(strategy));
            Validator.ValidateStringNullOrWhiteSpace(value, nameof(value));

            this.N = n;
            this.Strategy = strategy;
            this.Value = value;
        }
示例#3
0
        // Methods (public)
        /// <summary>
        /// Builds a <see cref="LabeledExample"/> from the given id, label and text,
        /// tokenizing the text with <c>_tokenizer</c>.
        /// </summary>
        /// <param name="id">Identifier passed through to the <see cref="LabeledExample"/> constructor.</param>
        /// <param name="label">Label of the example; checked with <c>Validator.ValidateStringNullOrWhiteSpace</c>.</param>
        /// <param name="text">Text of the example; checked with <c>Validator.ValidateStringNullOrWhiteSpace</c>.</param>
        /// <param name="strategy">Tokenization strategy; checked with <c>Validator.ValidateObject</c>.</param>
        /// <param name="ruleSet">Tokenizer rule set; checked with <c>Validator.ValidateObject</c>.</param>
        /// <returns>A new <see cref="LabeledExample"/> holding the id, label, text and the n-grams of the text.</returns>
        public LabeledExample Create(
            ulong id,
            string label,
            string text,
            ITokenizationStrategy strategy,
            INGramTokenizerRuleSet ruleSet)
        {
            Validator.ValidateStringNullOrWhiteSpace(label, nameof(label));
            Validator.ValidateStringNullOrWhiteSpace(text, nameof(text));
            Validator.ValidateObject(strategy, nameof(strategy));
            Validator.ValidateObject(ruleSet, nameof(ruleSet));

            // Tokenize first; the resulting n-grams are stored alongside the raw text.
            List<INGram> tokens = _tokenizer.Do(text, strategy, ruleSet);

            return new LabeledExample(id, label, text, tokens);
        }
        /// <summary>
        /// Produces the n-grams of type <typeparamref name="T"/> for <paramref name="text"/>.
        /// </summary>
        /// <remarks>
        /// The text is split with <c>Regex.Matches</c> using <c>strategy.Pattern</c>; an
        /// exception is thrown when the pattern yields no matches. The n-gram size is
        /// obtained from <c>GetN&lt;T&gt;()</c> and checked against the match count via
        /// <c>Validator.ThrowIfFirstIsGreater</c> before the tokens are built.
        /// </remarks>
        /// <typeparam name="T">The n-gram type to create.</typeparam>
        /// <param name="text">Text to split into words.</param>
        /// <param name="strategy">Supplies the regex pattern and is forwarded to <c>GetTokens</c>.</param>
        /// <returns>The tokens returned by <c>GetTokens&lt;T&gt;</c>.</returns>
        private List<T> DoFor<T>(string text, ITokenizationStrategy strategy) where T : INGram
        {
            // e.g. "This is a sample text." => "This", "is", ..., "text"
            MatchCollection wordMatches = Regex.Matches(text, strategy.Pattern);

            bool nothingMatched = wordMatches.Count == 0;
            if (nothingMatched)
            {
                throw new Exception(
                    MessageCollection.NGramsTokenizer_ProvidedTokenizationStrategyPatternReturnsZeroMatches.Invoke(strategy));
            }

            // The local keeps the name N because nameof(N) is passed on as the argument name.
            ushort N = GetN<T>();
            Validator.ThrowIfFirstIsGreater(N, nameof(N), wordMatches.Count, "matches.Count");

            List<T> tokens = GetTokens<T>(N, wordMatches, strategy);

            return tokens;
        }
示例#5
0
        // Methods
        /// <summary>
        /// Predicts a label for <paramref name="text"/> by tokenizing it and comparing the
        /// resulting n-grams against <paramref name="labeledExamples"/>, logging each step
        /// through <c>_components.LoggingAction</c>.
        /// </summary>
        /// <param name="text">Text to classify; checked with <c>Validator.ValidateStringNullOrWhiteSpace</c>.</param>
        /// <param name="strategy">Tokenization strategy; checked with <c>Validator.ValidateObject</c>.</param>
        /// <param name="ruleSet">Tokenizer rule set; checked with <c>Validator.ValidateObject</c>.</param>
        /// <param name="labeledExamples">Examples to compare against; checked with <c>Validator.ValidateList</c>.</param>
        /// <returns>
        /// A <see cref="TextClassifierResult"/> built from the predicted label (which may be
        /// <c>null</c>), the similarity indexes and the similarity index averages.
        /// </returns>
        public TextClassifierResult PredictLabel(
            string text,
            ITokenizationStrategy strategy,
            INGramTokenizerRuleSet ruleSet,
            List<LabeledExample> labeledExamples)
        {
            Validator.ValidateStringNullOrWhiteSpace(text, nameof(text));
            Validator.ValidateObject(strategy, nameof(strategy));
            Validator.ValidateObject(ruleSet, nameof(ruleSet));
            Validator.ValidateList(labeledExamples, nameof(labeledExamples));

            // Step 1: announce the attempt and log the inputs (the text is truncated for the log).
            _components.LoggingAction.Invoke(MessageCollection.TextClassifier_AttemptingToPredictLabel);
            string shortenedText
                = _components.TextTruncatingFunction.Invoke(text, _settings.TruncateTextInLogMessagesAfter);

            _components.LoggingAction.Invoke(
                MessageCollection.TextClassifier_FollowingTextHasBeenProvided.Invoke(shortenedText));
            _components.LoggingAction.Invoke(
                MessageCollection.TextClassifier_FollowingTokenizationStrategyWillBeUsed.Invoke(strategy));
            _components.LoggingAction.Invoke(
                MessageCollection.TextClassifier_FollowingNGramsTokenizerRuleSetWillBeUsed.Invoke(ruleSet));
            _components.LoggingAction.Invoke(
                MessageCollection.TextClassifier_XLabeledExamplesHaveBeenProvided.Invoke(labeledExamples));

            // Step 2: tokenize the provided text.
            List<INGram> textNGrams = _components.NGramsTokenizer.Do(text, strategy, ruleSet);
            _components.LoggingAction.Invoke(
                MessageCollection.TextClassifier_ProvidedTextHasBeenTokenizedIntoXNGrams.Invoke(textNGrams));

            // Step 3: compare the tokenized text against every labeled example.
            List<SimilarityIndex> similarityIndexes = GetSimilarityIndexes(textNGrams, labeledExamples);
            _components.LoggingAction.Invoke(
                MessageCollection.TextClassifier_TokenizedTextHasBeenComparedAgainstTheProvidedLabeledExamples);
            _components.LoggingAction.Invoke(
                MessageCollection.TextClassifier_XSimilarityIndexObjectsHaveBeenComputed.Invoke(similarityIndexes));

            // Step 4: compute the averages from the similarity indexes.
            List<SimilarityIndexAverage> similarityAverages = GetSimilarityIndexAverages(similarityIndexes);
            _components.LoggingAction.Invoke(
                MessageCollection.TextClassifier_XSimilarityIndexAverageObjectsHaveBeenComputed(similarityAverages));

            // Step 5: pick the label; it is logged before the outcome is checked.
            string predictedLabel = PredictLabel(similarityAverages);
            _components.LoggingAction.Invoke(
                MessageCollection.TextClassifier_PredictedLabelIs.Invoke(predictedLabel));

            if (predictedLabel != null)
            {
                _components.LoggingAction.Invoke(MessageCollection.TextClassifier_PredictionHasBeenSuccessful);
            }
            else
            {
                // A null label is treated as a failed prediction.
                _components.LoggingAction.Invoke(
                    MessageCollection.TextClassifier_PredictionHasFailedTryIncreasingTheAmountOfProvidedLabeledExamples);
            }

            return new TextClassifierResult(predictedLabel, similarityIndexes, similarityAverages);
        }
        // Methods (private)
        /// <summary>
        /// Tokenizes <paramref name="text"/> once for every n-gram size enabled in
        /// <paramref name="ruleSet"/> (monograms through fivegrams, in that order) and
        /// returns all resulting n-grams in a single list.
        /// </summary>
        /// <param name="text">Text to tokenize.</param>
        /// <param name="strategy">Tokenization strategy forwarded to <c>DoFor&lt;T&gt;</c>.</param>
        /// <param name="ruleSet">Flags selecting which n-gram sizes are produced.</param>
        /// <returns>The concatenation of the n-grams produced for each enabled rule.</returns>
        /// <exception cref="Exception">
        /// Thrown when any step fails. The message names the rule being applied (or
        /// <c>null</c> if the failure happened before a rule was selected) and the text;
        /// the original failure is available through <see cref="Exception.InnerException"/>.
        /// </exception>
        private List <INGram> TokenizeText
            (string text, ITokenizationStrategy strategy, INGramTokenizerRuleSet ruleSet)
        {
            List <INGram> nGrams   = new List <INGram>();

            // Tracks the rule currently being applied so a failure can be attributed to it.
            string        ruleName = null;

            try
            {
                if (ruleSet.DoForMonograms)
                {
                    ruleName = nameof(ruleSet.DoForMonograms);
                    nGrams.AddRange(DoFor <Monogram>(text, strategy));
                }

                if (ruleSet.DoForBigrams)
                {
                    ruleName = nameof(ruleSet.DoForBigrams);
                    nGrams.AddRange(DoFor <Bigram>(text, strategy));
                }

                if (ruleSet.DoForTrigrams)
                {
                    ruleName = nameof(ruleSet.DoForTrigrams);
                    nGrams.AddRange(DoFor <Trigram>(text, strategy));
                }

                if (ruleSet.DoForFourgrams)
                {
                    ruleName = nameof(ruleSet.DoForFourgrams);
                    nGrams.AddRange(DoFor <Fourgram>(text, strategy));
                }

                if (ruleSet.DoForFivegrams)
                {
                    ruleName = nameof(ruleSet.DoForFivegrams);
                    nGrams.AddRange(DoFor <Fivegram>(text, strategy));
                }

                return(nGrams);
            }
            catch (Exception ex)
            {
                // Same exception type and message as before, but the root cause is kept
                // as InnerException instead of being silently discarded.
                throw new Exception(MessageCollection.NGramTokenizer_TheRuleCantBeAppliedTo.Invoke(ruleName, text), ex);
            }
        }
        /// <summary>
        /// Slides a window of up to <paramref name="N"/> words over the matched words and
        /// creates one <typeparamref name="T"/> per starting position.
        /// </summary>
        /// <remarks>
        /// Windows near the end of the word list are shorter than <paramref name="N"/>
        /// because fewer words remain. Each window is joined using
        /// <c>strategy.Delimiter</c> and lower-cased when <c>strategy.ToLowercase</c> is set.
        /// </remarks>
        /// <typeparam name="T">The n-gram type instantiated via <c>CreateInstance&lt;T&gt;</c>.</typeparam>
        /// <param name="N">Maximum number of words per token.</param>
        /// <param name="matches">Regex matches converted to words via <c>ConvertToArray</c>.</param>
        /// <param name="strategy">Supplies the delimiter and the lower-casing flag.</param>
        /// <returns>One token per word position, in order.</returns>
        private List<T> GetTokens<T>(ushort N, MatchCollection matches, ITokenizationStrategy strategy)
        {
            string[] words = ConvertToArray(matches);
            List<T> result = new List<T>();

            for (uint start = 0; start < words.Length; start++)
            {
                // Clamp the window so the trailing tokens only cover the words that are left.
                long remaining = words.Length - start;
                ushort windowSize = remaining < N ? (ushort)remaining : N;

                // For N = 3: ["This", "is", "a"], ["is", "a", "sample"], ...
                string[] window = _ArrayManager.GetSubset(words, start, windowSize);

                // ["This", "is", "a"] => ["This", " ", "is", " ", "a"]
                string[] pieces = _ArrayManager.AddDelimiter(window, strategy.Delimiter);

                // ["This", " ", "is", " ", "a"] => "This is a"
                StringBuilder buffer = new StringBuilder();
                foreach (string piece in pieces)
                {
                    buffer.Append(piece);
                }

                string tokenValue = buffer.ToString();

                // "This is a" => "this is a"
                if (strategy.ToLowercase)
                {
                    tokenValue = tokenValue.ToLower();
                }

                result.Add(CreateInstance<T>(strategy, tokenValue));
            }

            return result;
        }
 // Fields
 // Properties
 // Constructors
 /// <summary>
 /// Creates a <see cref="Fourgram"/> by forwarding <paramref name="strategy"/> and
 /// <paramref name="value"/> to the base constructor with the size fixed at 4.
 /// </summary>
 public Fourgram(ITokenizationStrategy strategy, string value) : base(4, strategy, value)
 {
     // Nothing to do here; the base constructor receives all arguments.
 }
 /// <summary>
 /// Tokenizes <paramref name="text"/> using a freshly constructed
 /// <see cref="NGramTokenizerRuleSet"/> as the rule set.
 /// </summary>
 /// <param name="text">Text to tokenize.</param>
 /// <param name="strategy">Tokenization strategy to apply.</param>
 /// <returns>The result of the three-argument <c>Do</c> overload.</returns>
 public List<INGram> Do(string text, ITokenizationStrategy strategy)
 {
     NGramTokenizerRuleSet defaultRuleSet = new NGramTokenizerRuleSet();

     return Do(text, strategy, defaultRuleSet);
 }
 // Fields
 // Properties
 // Constructors
 /// <summary>
 /// Creates a <see cref="Monogram"/> by forwarding <paramref name="strategy"/> and
 /// <paramref name="value"/> to the base constructor with the size fixed at 1.
 /// </summary>
 public Monogram(ITokenizationStrategy strategy, string value) : base(1, strategy, value)
 {
     // Nothing to do here; the base constructor receives all arguments.
 }
 // Fields
 // Properties
 // Constructors
 /// <summary>
 /// Creates a <see cref="Bigram"/> by forwarding <paramref name="strategy"/> and
 /// <paramref name="value"/> to the base constructor with the size fixed at 2.
 /// </summary>
 public Bigram(ITokenizationStrategy strategy, string value) : base(2, strategy, value)
 {
     // Nothing to do here; the base constructor receives all arguments.
 }
 // Fields
 // Properties
 // Constructors
 /// <summary>
 /// Creates a <see cref="FakeGram"/> by passing <paramref name="n"/>,
 /// <paramref name="strategy"/> and <paramref name="value"/> unchanged to the base constructor.
 /// </summary>
 public FakeGram(ushort n, ITokenizationStrategy strategy, string value) : base(n, strategy, value)
 {
     // Nothing to do here; the base constructor receives all arguments.
 }
 // Fields
 // Properties
 // Constructors
 /// <summary>
 /// Creates a <see cref="Fivegram"/> by forwarding <paramref name="strategy"/> and
 /// <paramref name="value"/> to the base constructor with the size fixed at 5.
 /// </summary>
 public Fivegram(ITokenizationStrategy strategy, string value) : base(5, strategy, value)
 {
     // Nothing to do here; the base constructor receives all arguments.
 }
示例#14
0
 /// <summary>
 /// Builds a <see cref="LabeledExample"/> using a freshly constructed
 /// <see cref="NGramTokenizerRuleSet"/> as the rule set.
 /// </summary>
 /// <param name="id">Identifier of the example.</param>
 /// <param name="label">Label of the example.</param>
 /// <param name="text">Text of the example.</param>
 /// <param name="strategy">Tokenization strategy to apply.</param>
 /// <returns>The result of the five-argument <c>Create</c> overload.</returns>
 public LabeledExample Create(ulong id, string label, string text, ITokenizationStrategy strategy)
 {
     NGramTokenizerRuleSet defaultRuleSet = new NGramTokenizerRuleSet();

     return Create(id, label, text, strategy, defaultRuleSet);
 }